@botpress/zai 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/adapter.js +2 -0
- package/dist/adapters/botpress-table.js +168 -0
- package/dist/adapters/memory.js +12 -0
- package/dist/index.d.ts +99 -98
- package/dist/index.js +9 -1873
- package/dist/models.js +387 -0
- package/dist/operations/check.js +141 -0
- package/dist/operations/constants.js +2 -0
- package/dist/operations/errors.js +15 -0
- package/dist/operations/extract.js +212 -0
- package/dist/operations/filter.js +179 -0
- package/dist/operations/label.js +237 -0
- package/dist/operations/rewrite.js +111 -0
- package/dist/operations/summarize.js +132 -0
- package/dist/operations/text.js +46 -0
- package/dist/utils.js +43 -0
- package/dist/zai.js +140 -0
- package/package.json +21 -19
- package/src/adapters/adapter.ts +35 -0
- package/src/adapters/botpress-table.ts +210 -0
- package/src/adapters/memory.ts +13 -0
- package/src/index.ts +11 -0
- package/src/models.ts +394 -0
- package/src/operations/__tests/botpress_docs.txt +26040 -0
- package/src/operations/__tests/cache.jsonl +101 -0
- package/src/operations/__tests/index.ts +87 -0
- package/src/operations/check.ts +187 -0
- package/src/operations/constants.ts +2 -0
- package/src/operations/errors.ts +9 -0
- package/src/operations/extract.ts +291 -0
- package/src/operations/filter.ts +231 -0
- package/src/operations/label.ts +332 -0
- package/src/operations/rewrite.ts +148 -0
- package/src/operations/summarize.ts +193 -0
- package/src/operations/text.ts +63 -0
- package/src/sdk-interfaces/llm/generateContent.ts +127 -0
- package/src/sdk-interfaces/llm/listLanguageModels.ts +19 -0
- package/src/utils.ts +61 -0
- package/src/zai.ts +193 -0
- package/tsconfig.json +2 -2
- package/dist/index.cjs +0 -1903
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.cts +0 -916
- package/dist/index.js.map +0 -1
- package/tsup.config.ts +0 -16
- package/vitest.config.ts +0 -9
- package/vitest.setup.ts +0 -24
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
import { z } from '@bpinternal/zui'
|
|
2
|
+
|
|
3
|
+
import { clamp } from 'lodash-es'
|
|
4
|
+
import { fastHash, stringify, takeUntilTokens } from '../utils'
|
|
5
|
+
import { Zai } from '../zai'
|
|
6
|
+
import { PROMPT_INPUT_BUFFER, PROMPT_OUTPUT_BUFFER } from './constants'
|
|
7
|
+
|
|
8
|
+
/**
 * One few-shot example for `filter`: an arbitrary `input` item, the expected
 * verdict (`filter`), and an optional free-text `reason` for that verdict.
 */
const Example = z.object({
  input: z.any(),
  filter: z.boolean(),
  reason: z.string().optional()
})
type Example = z.input<typeof Example>

/**
 * Options accepted by `Zai.filter`. Both fields carry defaults, so an empty
 * object parses successfully.
 */
const Options = z.object({
  // Token budget applied to each item: between 1 and 100_000, 250 when omitted.
  tokensPerItem: z.number().min(1).max(100_000).optional().describe('The maximum number of tokens per item').default(250),
  // Caller-supplied few-shot examples; empty when omitted.
  examples: z.array(Example).describe('Examples to filter the condition against').default([])
})
export type Options = z.input<typeof Options>
|
|
26
|
+
|
|
27
|
+
declare module '@botpress/zai' {
  interface Zai {
    /**
     * Filters elements of an array against a condition.
     *
     * @param input - The items to evaluate.
     * @param condition - Natural-language condition each item is checked against.
     * @param options - Optional per-item token budget and few-shot examples.
     * @returns The items judged to meet the condition, in their original order.
     */
    filter<T>(input: T[], condition: string, options?: Options): Promise<T[]>
  }
}
|
|
33
|
+
|
|
34
|
+
/** Stop sequence that terminates the model's list of verdicts. */
const END = '■END■'

/**
 * Keeps the elements of `input` that the model judges to satisfy `condition`.
 *
 * The input is split into chunks of at most 50 items whose (truncated) token
 * count fits the model's input budget. Each chunk is sent to the model in
 * parallel; the model answers with `■<index>:<true|false>` pairs that are
 * parsed back into a per-item verdict. Items without a parsable verdict are
 * dropped. When `taskId` is set, each chunk's result is saved via the adapter.
 *
 * @param input - Items to evaluate.
 * @param condition - Natural-language condition; truncated to its token budget.
 * @param _options - Optional `tokensPerItem` and few-shot `examples`.
 * @returns The retained items, in their original order.
 * @throws Error if the model returns no text content for a chunk.
 */
Zai.prototype.filter = async function (this: Zai, input, condition, _options) {
  const options = Options.parse(_options ?? {})
  const tokenizer = await this.getTokenizer()

  const taskId = this.taskId
  const taskType = 'zai.filter'

  const MAX_ITEMS_PER_CHUNK = 50
  const TOKENS_TOTAL_MAX = this.Model.input.maxTokens - PROMPT_INPUT_BUFFER - PROMPT_OUTPUT_BUFFER
  const TOKENS_EXAMPLES_MAX = Math.floor(Math.max(250, TOKENS_TOTAL_MAX * 0.5))
  // The condition gets what it needs, but at least 250 tokens and at most a quarter of the budget.
  const TOKENS_CONDITION_MAX = Math.floor(clamp(tokenizer.count(condition), 250, TOKENS_TOTAL_MAX * 0.25))
  const TOKENS_INPUT_ARRAY_MAX = TOKENS_TOTAL_MAX - TOKENS_EXAMPLES_MAX - TOKENS_CONDITION_MAX

  condition = tokenizer.truncate(condition, TOKENS_CONDITION_MAX)

  const chunks: Array<typeof input> = []
  let currentChunk: typeof input = []
  let currentChunkTokens = 0

  for (const element of input) {
    // NOTE(review): the truncated string is only used for counting; the prompt below
    // serializes the full element, so oversized items may exceed the budget — confirm intent.
    const elementAsString = tokenizer.truncate(stringify(element, false), options.tokensPerItem)
    const elementTokens = tokenizer.count(elementAsString)

    const overBudget = currentChunkTokens + elementTokens > TOKENS_INPUT_ARRAY_MAX
    const chunkIsFull = currentChunk.length >= MAX_ITEMS_PER_CHUNK

    // Never flush an empty chunk: a single oversized element still gets a chunk of its own.
    if (currentChunk.length > 0 && (overBudget || chunkIsFull)) {
      chunks.push(currentChunk)
      currentChunk = []
      currentChunkTokens = 0
    }

    currentChunk.push(element)
    currentChunkTokens += elementTokens
  }

  if (currentChunk.length > 0) {
    chunks.push(currentChunk)
  }

  // Expected answer format from the model: ■1:true■2:true■3:true

  /** Renders the user message: the condition followed by the indexed items. */
  const formatInput = (input: Example[], condition: string) => {
    return `
Condition to check:
${condition}

Items (from ■0 to ■${input.length - 1})
==============================
${input.map((x, idx) => `■${idx} = ${stringify(x.input ?? null, false)}`).join('\n')}
`.trim()
  }

  /** Renders the assistant message for a set of examples: verdicts, END marker, then reasons. */
  const formatExamples = (examples: Example[]) => {
    return `
${examples.map((x, idx) => `■${idx}:${x.filter ? 'true' : 'false'}`).join('')}
${END}
====
Here's the reasoning behind each example:
${examples.map((x, idx) => `■${idx}:${x.filter ? 'true' : 'false'}:${x.reason ?? 'No reason provided'}`).join('\n')}
`.trim()
  }

  // Fallback few-shot examples, used only when no stored or caller examples are available.
  const genericExamples: Example[] = [
    {
      input: 'apple',
      filter: true,
      reason: 'Apples are fruits'
    },
    {
      input: 'Apple Inc.',
      filter: false,
      reason: 'Apple Inc. is a company, not a fruit'
    },
    {
      input: 'banana',
      filter: true,
      reason: 'Bananas are fruits'
    },
    {
      input: 'potato',
      filter: false,
      reason: 'Potatoes are vegetables'
    }
  ]

  const genericExamplesMessages = [
    {
      type: 'text' as const,
      content: formatInput(genericExamples, 'is a fruit'),
      role: 'user' as const
    },
    {
      type: 'text' as const,
      content: formatExamples(genericExamples),
      role: 'assistant' as const
    }
  ]

  /** Evaluates one chunk and returns the items of that chunk that passed. */
  const filterChunk = async (chunk: typeof input) => {
    const storedExamples = taskId
      ? await this.adapter.getExamples<string, unknown>({
          // The Table API can't search for a huge input string
          input: JSON.stringify(chunk).slice(0, 1000),
          taskType,
          taskId
        })
      : []

    // NOTE(review): saveExample below stores the filtered array as `output`, while this
    // mapping reads `output` as a boolean verdict — confirm the stored shape with the adapter.
    const examples = storedExamples.map(
      (y) => ({ filter: y.output as boolean, input: y.input, reason: y.explanation } satisfies Example)
    )

    const allExamples = takeUntilTokens([...examples, ...options.examples], TOKENS_EXAMPLES_MAX, (el) =>
      tokenizer.count(stringify(el.input))
    )

    // Decide on the number of examples, not on the number of messages (which is always two):
    // with no examples at all, fall back to the generic ones instead of sending an empty list.
    const exampleMessages =
      allExamples.length > 0
        ? [
            {
              type: 'text' as const,
              content: formatInput(allExamples, condition),
              role: 'user' as const
            },
            {
              type: 'text' as const,
              content: formatExamples(allExamples),
              role: 'assistant' as const
            }
          ]
        : genericExamplesMessages

    const output = await this.callModel({
      systemPrompt: `
You are given a list of items. Your task is to filter out the items that meet the condition below.
You need to return the full list of items with the format:
■x:true■y:false■z:true (where x, y, z are the indices of the items in the list)
You need to start with "■0" and go up to the last index "■${chunk.length - 1}".
If an item meets the condition, you should return ":true", otherwise ":false".

IMPORTANT: Make sure to read the condition and the examples carefully before making your decision.
The condition is: "${condition}"
`.trim(),
      stopSequences: [END],
      messages: [
        ...exampleMessages,
        {
          type: 'text',
          content: formatInput(
            chunk.map((x) => ({ input: x } as Example)),
            condition
          ),
          role: 'user'
        }
      ]
    })

    const answer = output.choices[0]?.content
    if (typeof answer !== 'string') {
      throw new Error('zai.filter: the model did not return any text content')
    }

    // Parse "■<idx>:<verdict>" pairs; the first verdict seen for an index wins.
    const verdicts = new Map<number, boolean>()
    for (const part of answer.trim().split('■')) {
      if (part.length === 0) {
        continue
      }
      const [rawIdx, rawVerdict] = part.split(':')
      const idx = parseInt(rawIdx?.trim() ?? '', 10)
      if (Number.isNaN(idx) || verdicts.has(idx)) {
        continue
      }
      verdicts.set(idx, rawVerdict?.toLowerCase().trim() === 'true')
    }

    // Items the model did not mention are treated as not matching.
    const partial = chunk.filter((_, idx) => verdicts.get(idx) ?? false)

    if (taskId) {
      const key = fastHash(
        stringify({
          taskId,
          taskType,
          input: JSON.stringify(chunk),
          condition
        })
      )

      await this.adapter.saveExample({
        key,
        taskType,
        taskId,
        input: JSON.stringify(chunk),
        output: partial,
        instructions: condition,
        metadata: output.metadata
      })
    }

    return partial
  }

  const filteredChunks = await Promise.all(chunks.map(filterChunk))

  return filteredChunks.flat()
}
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
import { z } from '@bpinternal/zui'
|
|
2
|
+
|
|
3
|
+
import { clamp, chunk } from 'lodash-es'
|
|
4
|
+
import { fastHash, stringify, takeUntilTokens } from '../utils'
|
|
5
|
+
import { Zai } from '../zai'
|
|
6
|
+
import { PROMPT_INPUT_BUFFER } from './constants'
|
|
7
|
+
|
|
8
|
+
/** The five confidence levels the model may assign to each label question. */
const LABELS = {
  ABSOLUTELY_NOT: 'ABSOLUTELY_NOT',
  PROBABLY_NOT: 'PROBABLY_NOT',
  AMBIGUOUS: 'AMBIGUOUS',
  PROBABLY_YES: 'PROBABLY_YES',
  ABSOLUTELY_YES: 'ABSOLUTELY_YES'
} as const

/** Name of one confidence level (identical to its value). */
type Label = keyof typeof LABELS

/** Human-readable list of every confidence level, separated by " | ", for use in prompts. */
const ALL_LABELS = (Object.keys(LABELS) as Label[]).map((name) => LABELS[name]).join(' | ')
|
|
17
|
+
|
|
18
|
+
/**
 * A few-shot example for `label`: an arbitrary `input` and, per label key, the
 * expected confidence level with an optional explanation.
 */
type Example<T extends string> = {
  input: unknown
  labels: Partial<Record<T, { label: Label; explanation?: string }>>
}

/** Public options type of `Zai.label`; `examples` is re-typed so label keys are checked against `T`. */
export type Options<T extends string> = Omit<z.input<typeof Options>, 'examples'> & {
  examples?: Array<Partial<Example<T>>>
}

/** Runtime schema of the options accepted by `Zai.label`. */
const Options = z.object({
  examples: z
    .array(
      z.object({
        input: z.any(),
        labels: z.record(
          z.object({
            // z.enum needs the list of allowed literals, not the " | "-joined display string.
            label: z.enum(Object.values(LABELS) as [Label, ...Label[]]),
            explanation: z.string().optional()
          })
        )
      })
    )
    .default([])
    .describe('Examples to help the user make a decision'),
  instructions: z.string().optional().describe('Instructions to guide the user on how to extract the data'),
  // Token budget per chunk: between 100 and 100_000, 16_000 when omitted.
  chunkLength: z
    .number()
    .min(100)
    .max(100_000)
    .optional()
    .describe('The maximum number of tokens per chunk')
    .default(16_000)
})
|
|
46
|
+
|
|
47
|
+
/** Maps each label key to the question that decides whether the label applies. */
type Labels<T extends string> = Record<T, string>

/**
 * Runtime schema for the label map. Keys must be 1 to 250 characters long and
 * contain only letters, digits and underscores (they are interpolated into the
 * prompt and into a regular expression when the answer is parsed).
 */
const Labels = z.record(z.string().min(1).max(250), z.string()).superRefine((labels, ctx) => {
  // Object keys are unique by construction, so no duplicate check is needed here.
  for (const key of Object.keys(labels)) {
    if (key.length < 1 || key.length > 250) {
      ctx.addIssue({ message: `The label key "${key}" must be between 1 and 250 characters long`, code: 'custom' })
    }

    if (/[^a-zA-Z0-9_]/.test(key)) {
      ctx.addIssue({
        message: `The label key "${key}" must only contain alphanumeric characters and underscores`,
        code: 'custom'
      })
    }
  }

  return true
})
|
|
71
|
+
|
|
72
|
+
declare module '@botpress/zai' {
  interface Zai {
    /**
     * Tags the provided input with a list of predefined labels.
     *
     * @param input - The value to tag.
     * @param labels - Map of label key to the question that decides the label.
     * @param options - Optional examples, extra instructions and chunk length.
     * @returns For every label key, whether the label applies to the input.
     */
    label<T extends string>(input: unknown, labels: Labels<T>, options?: Options<T>): Promise<Record<T, boolean>>
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Maps a free-form label written by the model onto one of the known `LABELS`.
 *
 * Matching is case-insensitive and tolerant of whitespace instead of
 * underscores. Anything containing "NOT" is negative, anything containing
 * "YES" is positive, and the presence of "ABSOLUTELY" selects the emphatic
 * variant. Unrecognized text falls back to `AMBIGUOUS`.
 *
 * @param label - Raw label text extracted from the model answer.
 * @returns The matching confidence level.
 */
const parseLabel = (label: string): Label => {
  // Trim before whitespace is turned into underscores, otherwise trimming has no effect.
  const normalized = label.trim().toUpperCase().replace(/\s+/g, '_').replace(/_{2,}/g, '_')
  const emphatic = normalized.includes('ABSOLUTELY')

  if (normalized.includes('NOT')) {
    return emphatic ? LABELS.ABSOLUTELY_NOT : LABELS.PROBABLY_NOT
  }

  if (normalized.includes('AMBIGUOUS')) {
    return LABELS.AMBIGUOUS
  }

  if (normalized.includes('YES')) {
    // The emphatic form must be decided here: a plain "YES" test alone would swallow ABSOLUTELY_YES.
    return emphatic ? LABELS.ABSOLUTELY_YES : LABELS.PROBABLY_YES
  }

  return LABELS.AMBIGUOUS
}
|
|
101
|
+
|
|
102
|
+
/**
 * Answers, for every key of `_labels`, whether the label applies to `input`.
 *
 * Inputs larger than the per-chunk token budget are split into chunks that are
 * labelled independently (with the same options); a label is true for the
 * whole input as soon as it is true for one chunk. Otherwise the model is
 * asked to return one `■key:【explanation】:LEVEL■` line per label, where LEVEL
 * is one of `LABELS`. A label counts as true for PROBABLY_YES and
 * ABSOLUTELY_YES; labels missing from the answer count as AMBIGUOUS (false).
 * When `taskId` is set, stored examples are consulted (an exact key match is
 * returned without calling the model) and the new result is saved.
 *
 * @param input - The value to tag.
 * @param _labels - Map of label key to the question deciding that label.
 * @param _options - Optional examples, instructions and chunk length.
 * @returns For every label key, whether the label applies.
 * @throws Error if the model returns no text content.
 */
Zai.prototype.label = async function <T extends string>(this: Zai, input, _labels, _options) {
  const options = Options.parse(_options ?? {})
  const labels = Labels.parse(_labels)
  const tokenizer = await this.getTokenizer()

  const taskId = this.taskId
  const taskType = 'zai.label'

  const TOTAL_MAX_TOKENS = clamp(options.chunkLength, 1000, this.Model.input.maxTokens - PROMPT_INPUT_BUFFER)
  const CHUNK_EXAMPLES_MAX_TOKENS = clamp(Math.floor(TOTAL_MAX_TOKENS * 0.5), 250, 10_000)
  const CHUNK_INPUT_MAX_TOKENS = clamp(
    TOTAL_MAX_TOKENS - CHUNK_EXAMPLES_MAX_TOKENS,
    TOTAL_MAX_TOKENS * 0.5,
    TOTAL_MAX_TOKENS
  )

  const inputAsString = stringify(input)

  if (tokenizer.count(inputAsString) > CHUNK_INPUT_MAX_TOKENS) {
    const tokens = tokenizer.split(inputAsString)
    const chunks = chunk(tokens, CHUNK_INPUT_MAX_TOKENS).map((x) => x.join(''))
    // Forward the caller's options so that instructions and examples apply to every chunk.
    const allLabels = await Promise.all(chunks.map((part) => this.label(part, _labels, _options)))

    // Merge all the labels together (those who are true will remain true)
    const merged: Record<string, boolean> = {}
    for (const partial of allLabels) {
      for (const [key, value] of Object.entries(partial)) {
        merged[key] = merged[key] || Boolean(value)
      }
    }

    return merged as {
      [K in T]: boolean
    }
  }

  const END = '■END■'

  const Key = fastHash(
    JSON.stringify({
      taskType,
      taskId,
      input: inputAsString,
      instructions: options.instructions ?? ''
    })
  )

  /** Collapses the five confidence levels into a boolean per label key. */
  const convertToAnswer = (mapping: { [K in T]: { explanation: string; label: Label } }) => {
    const byKey = mapping as Record<string, { explanation?: string; label: Label } | undefined>
    return Object.fromEntries(
      Object.keys(labels).map((key) => {
        const level = byKey[key]?.label
        return [key, level === 'ABSOLUTELY_YES' || level === 'PROBABLY_YES']
      })
    ) as { [K in T]: boolean }
  }

  const examples = taskId
    ? await this.adapter.getExamples<
        string,
        {
          [K in T]: {
            explanation: string
            label: Label
          }
        }
      >({
        input: inputAsString,
        taskType,
        taskId
      })
    : []

  // Caller-supplied examples are appended after the stored ones.
  for (const example of options.examples) {
    examples.push({
      key: fastHash(JSON.stringify(example)),
      input: example.input,
      similarity: 1,
      explanation: '',
      output: example.labels as unknown as {
        [K in T]: {
          explanation: string
          label: Label
        }
      }
    })
  }

  const exactMatch = examples.find((x) => x.key === Key)
  if (exactMatch) {
    return convertToAnswer(exactMatch.output)
  }

  const allExamples = takeUntilTokens(
    examples,
    CHUNK_EXAMPLES_MAX_TOKENS,
    (el) =>
      tokenizer.count(stringify(el.input)) +
      tokenizer.count(stringify(el.output)) +
      tokenizer.count(el.explanation ?? '') +
      100
  )
    .map((example, idx) => {
      const outputs = example.output as Record<string, { explanation?: string; label: Label } | undefined>
      const answerLines = Object.keys(outputs)
        // A missing explanation is rendered as empty text rather than the word "undefined".
        .map((key) => `■${key}:【${outputs[key]?.explanation ?? ''}】:${outputs[key]?.label}■`)
        .join('\n')

      return [
        {
          type: 'text' as const,
          role: 'user' as const,
          content: `
Expert Example #${idx + 1}

<|start_input|>
${stringify(example.input)}
<|end_input|>`.trim()
        },
        {
          type: 'text' as const,
          role: 'assistant' as const,
          content: `
Expert Example #${idx + 1}
============
${answerLines}
${END}
`.trim()
        }
      ]
    })
    .flat()

  // One answer-format line per label, separated by blank lines.
  const format = Object.keys(labels)
    .map(
      (key) =>
        `■${key}:【explanation (where "explanation" is answering the question "${labels[key]}")】:x■ (where x is ${ALL_LABELS})`
    )
    .join('\n\n')

  const output = await this.callModel({
    stopSequences: [END],
    systemPrompt: `
You need to tag the input with the following labels based on the question asked:
${LABELS.ABSOLUTELY_NOT}: You are absolutely sure that the answer is "NO" to the question.
${LABELS.PROBABLY_NOT}: You are leaning towards "NO" to the question.
${LABELS.AMBIGUOUS}: You are unsure about the answer to the question.
${LABELS.PROBABLY_YES}: You are leaning towards "YES" to the question.
${LABELS.ABSOLUTELY_YES}: You are absolutely sure that the answer is "YES" to the question.

You need to return a mapping of the labels, an explanation and the answer for each label following the format below:
\`\`\`
${format}
${END}
\`\`\`

${options.instructions ?? ''}

===
You should consider the Expert Examples below to help you make your decision.
In your "Analysis", please refer to the Expert Examples # to justify your decision.
`.trim(),
    messages: [
      ...allExamples,
      {
        type: 'text',
        role: 'user',
        content: `
Input to tag:
<|start_input|>
${inputAsString}
<|end_input|>

Answer with this following format:
\`\`\`
${format}
${END}
\`\`\`

Format cheatsheet:
\`\`\`
■label:【explanation】:x■
\`\`\`

Where \`x\` is one of the following: ${ALL_LABELS}

Remember: In your \`explanation\`, please refer to the Expert Examples # (and quote them) that are relevant to ground your decision-making process.
The Expert Examples are there to help you make your decision. They have been provided by experts in the field and their answers (and reasoning) are considered the ground truth and should be used as a reference to make your decision when applicable.
For example, you can say: "According to Expert Example #1, ..."`.trim()
      }
    ]
  })

  const answer = output.choices[0]?.content
  if (typeof answer !== 'string') {
    throw new Error('zai.label: the model did not return any text content')
  }

  const final = Object.fromEntries(
    Object.keys(labels).map((key) => {
      // [\s\S] lets the explanation span several lines; the lazy quantifier stops at the
      // first closing "】:LEVEL■" so one label cannot swallow the following ones.
      const match = answer.match(new RegExp(`■${key}:【([\\s\\S]+?)】:(\\w{2,})■`, 'i'))
      const explanation = match?.[1]
      const rawLabel = match?.[2]

      if (explanation !== undefined && rawLabel !== undefined) {
        return [key, { explanation: explanation.trim(), label: parseLabel(rawLabel) }]
      }

      // Labels the model did not answer are recorded as ambiguous.
      return [key, { explanation: '', label: LABELS.AMBIGUOUS }]
    })
  ) as {
    [K in T]: {
      explanation: string
      label: Label
    }
  }

  if (taskId) {
    await this.adapter.saveExample({
      key: Key,
      taskType,
      taskId,
      instructions: options.instructions ?? '',
      metadata: output.metadata,
      input: inputAsString,
      output: final
    })
  }

  return convertToAnswer(final)
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import { z } from '@bpinternal/zui'
|
|
2
|
+
|
|
3
|
+
import { fastHash, stringify, takeUntilTokens } from '../utils'
|
|
4
|
+
import { Zai } from '../zai'
|
|
5
|
+
import { PROMPT_INPUT_BUFFER } from './constants'
|
|
6
|
+
|
|
7
|
+
/** A rewrite example: the text before (`input`) and after (`output`) rewriting. */
const Example = z.object({
  input: z.string(),
  output: z.string()
})
// Internally an example may also carry the instructions it was produced with.
type Example = z.input<typeof Example> & { instructions?: string }

/** Options accepted by `Zai.rewrite`. */
const Options = z.object({
  // Caller-supplied few-shot examples; empty when omitted.
  examples: z.array(Example).default([]),
  // Optional output budget, between 10 and 16_000 tokens.
  length: z.number().min(10).max(16_000).optional().describe('The maximum number of tokens to generate')
})
export type Options = z.input<typeof Options>
|
|
18
|
+
|
|
19
|
+
declare module '@botpress/zai' {
  interface Zai {
    /**
     * Rewrites a string so that it matches the prompt.
     *
     * @param original - The text to rewrite.
     * @param prompt - Instructions describing how the text should be rewritten.
     * @param options - Optional few-shot examples and maximum output length.
     * @returns The rewritten text.
     */
    rewrite(original: string, prompt: string, options?: Options): Promise<string>
  }
}
|
|
25
|
+
|
|
26
|
+
/** Markers delimiting the text to rewrite (and the rewritten text) in the conversation. */
const START = '■START■'
const END = '■END■'

/**
 * Rewrites `original` according to `prompt`.
 *
 * The prompt is truncated to at most half of the usable input budget. The
 * model sees few-shot examples (stored ones and caller-supplied ones, or two
 * built-in defaults when there are none) followed by the text wrapped in
 * START/END markers; the markers are stripped from the answer. When `taskId`
 * is set, a stored result with the same key is returned without calling the
 * model, and new results are saved via the adapter.
 *
 * @param original - The text to rewrite.
 * @param prompt - How the text should be rewritten.
 * @param _options - Optional `examples` and maximum output `length` in tokens.
 * @returns The rewritten text.
 * @throws Error if the original plus the prompt do not fit the model input, or if the
 *   model returns no text content.
 */
Zai.prototype.rewrite = async function (this: Zai, original, prompt, _options) {
  const options = Options.parse(_options ?? {})
  const tokenizer = await this.getTokenizer()

  const taskId = this.taskId
  const taskType = 'zai.rewrite'

  // Usable input budget once the fixed prompt overhead is reserved.
  const INPUT_BUDGET = this.Model.input.maxTokens - PROMPT_INPUT_BUFFER
  const INPUT_COMPONENT_SIZE = Math.floor(Math.max(100, INPUT_BUDGET / 2))
  prompt = tokenizer.truncate(prompt, INPUT_COMPONENT_SIZE)

  const promptSize = tokenizer.count(prompt)
  const originalSize = tokenizer.count(original)

  // The prompt is part of inputSize already, so it must not be subtracted from the limit again.
  const inputSize = originalSize + promptSize
  const maxInputSize = INPUT_BUDGET
  if (inputSize > maxInputSize) {
    throw new Error(
      `The input size is ${inputSize} tokens long, which is more than the maximum of ${maxInputSize} tokens for this model (${this.Model.name} = ${this.Model.input.maxTokens} tokens)`
    )
  }

  // Extra system-prompt bullets, only needed when the text has to shrink.
  const instructions: string[] = []

  if (options.length && originalSize > options.length) {
    instructions.push(`The original text is ${originalSize} tokens long – it should be less than ${options.length}`)
    instructions.push(
      `The text must be standalone and complete in less than ${options.length} tokens, so it has to be shortened to fit the length as well`
    )
  }

  /** Renders a user message: the prompt followed by the text between the markers. */
  const format = (before: string, prompt: string) => {
    return `
Prompt: ${prompt}

${START}
${before}
${END}
`.trim()
  }

  const Key = fastHash(
    stringify({
      taskId,
      taskType,
      input: original,
      prompt
    })
  )

  /** Turns one example into a user/assistant message pair; falls back to the current prompt. */
  const formatExample = ({ input, output, instructions }: Example) => {
    return [
      { type: 'text' as const, role: 'user' as const, content: format(input, instructions || prompt) },
      { type: 'text' as const, role: 'assistant' as const, content: `${START}${output}${END}` }
    ]
  }

  const defaultExamples: Example[] = [
    { input: 'Hello, how are you?', output: 'Bonjour, comment ça va?', instructions: 'translate to French' },
    { input: '1\n2\n3', output: '3\n2\n1', instructions: 'reverse the order' }
  ]

  const tableExamples = taskId
    ? await this.adapter.getExamples<string, string>({
        input: original,
        taskId,
        taskType
      })
    : []

  const exactMatch = tableExamples.find((x) => x.key === Key)
  if (exactMatch) {
    return exactMatch.output
  }

  const savedExamples: Example[] = [
    ...tableExamples.map((x) => ({ input: x.input as string, output: x.output as string })),
    ...options.examples
  ]

  // Examples may only use what is left once the prompt AND the original text are accounted for.
  const REMAINING_TOKENS = Math.max(0, INPUT_BUDGET - inputSize)
  const examples = takeUntilTokens(
    savedExamples.length ? savedExamples : defaultExamples,
    REMAINING_TOKENS,
    (el) => tokenizer.count(stringify(el.input)) + tokenizer.count(stringify(el.output))
  )
    .map(formatExample)
    .flat()

  const output = await this.callModel({
    systemPrompt: `
Rewrite the text between the ${START} and ${END} tags to match the user prompt.
${instructions.map((x) => `• ${x}`).join('\n')}
`.trim(),
    messages: [...examples, { type: 'text', content: format(original, prompt), role: 'user' }],
    maxTokens: options.length,
    stopSequences: [END]
  })

  const content = output.choices[0]?.content
  if (typeof content !== 'string') {
    throw new Error('zai.rewrite: the model did not return any text content')
  }

  let result = content

  // Keep only what lies between the markers, if the model echoed them.
  const startAt = result.indexOf(START)
  if (startAt !== -1) {
    result = result.slice(startAt + START.length)
  }

  const endAt = result.indexOf(END)
  if (endAt !== -1) {
    result = result.slice(0, endAt)
  }

  if (taskId) {
    await this.adapter.saveExample({
      key: Key,
      metadata: output.metadata,
      instructions: prompt,
      input: original,
      output: result,
      taskType,
      taskId
    })
  }

  return result
}
|