@botpress/zai 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +1 -1
  2. package/build.ts +9 -0
  3. package/dist/adapters/adapter.js +2 -0
  4. package/dist/adapters/botpress-table.js +168 -0
  5. package/dist/adapters/memory.js +12 -0
  6. package/dist/index.d.ts +111 -609
  7. package/dist/index.js +9 -1873
  8. package/dist/operations/check.js +153 -0
  9. package/dist/operations/constants.js +2 -0
  10. package/dist/operations/errors.js +15 -0
  11. package/dist/operations/extract.js +232 -0
  12. package/dist/operations/filter.js +191 -0
  13. package/dist/operations/label.js +249 -0
  14. package/dist/operations/rewrite.js +123 -0
  15. package/dist/operations/summarize.js +133 -0
  16. package/dist/operations/text.js +47 -0
  17. package/dist/utils.js +37 -0
  18. package/dist/zai.js +100 -0
  19. package/e2e/data/botpress_docs.txt +26040 -0
  20. package/e2e/data/cache.jsonl +107 -0
  21. package/e2e/utils.ts +89 -0
  22. package/package.json +33 -29
  23. package/src/adapters/adapter.ts +35 -0
  24. package/src/adapters/botpress-table.ts +210 -0
  25. package/src/adapters/memory.ts +13 -0
  26. package/src/index.ts +11 -0
  27. package/src/operations/check.ts +201 -0
  28. package/src/operations/constants.ts +2 -0
  29. package/src/operations/errors.ts +9 -0
  30. package/src/operations/extract.ts +309 -0
  31. package/src/operations/filter.ts +244 -0
  32. package/src/operations/label.ts +345 -0
  33. package/src/operations/rewrite.ts +161 -0
  34. package/src/operations/summarize.ts +195 -0
  35. package/src/operations/text.ts +65 -0
  36. package/src/utils.ts +52 -0
  37. package/src/zai.ts +147 -0
  38. package/tsconfig.json +3 -23
  39. package/dist/index.cjs +0 -1903
  40. package/dist/index.cjs.map +0 -1
  41. package/dist/index.d.cts +0 -916
  42. package/dist/index.js.map +0 -1
  43. package/tsup.config.ts +0 -16
  44. package/vitest.config.ts +0 -9
  45. package/vitest.setup.ts +0 -24
@@ -0,0 +1,309 @@
1
+ // eslint-disable consistent-type-definitions
2
+ import { z, ZodObject } from '@bpinternal/zui'
3
+
4
+ import JSON5 from 'json5'
5
+ import { jsonrepair } from 'jsonrepair'
6
+
7
+ import { chunk, isArray } from 'lodash-es'
8
+ import { fastHash, stringify, takeUntilTokens } from '../utils'
9
+ import { Zai } from '../zai'
10
+ import { PROMPT_INPUT_BUFFER } from './constants'
11
+ import { JsonParsingError } from './errors'
12
+
13
/**
 * Options accepted by `extract`.
 *
 * This is the *input* side of the schema below, i.e. what a caller may pass before
 * defaults are applied by `Options.parse`.
 */
export type Options = z.input<typeof Options>

// Free-form guidance forwarded to the prompt; may be omitted.
const instructionsOption = z.string().optional().describe('Instructions to guide the user on how to extract the data')

// Per-chunk token budget: between 100 and 100,000, falling back to 16,000 when not provided.
const chunkLengthOption = z
  .number()
  .min(100)
  .max(100_000)
  .optional()
  .describe('The maximum number of tokens per chunk')
  .default(16_000)

const Options = z.object({
  instructions: instructionsOption,
  chunkLength: chunkLengthOption,
})
24
+
25
declare module '@botpress/zai' {
  interface Zai {
    /**
     * Extracts one or many elements from an arbitrary input
     *
     * @param input - the value to extract from
     * @param schema - an object schema, or an array schema, describing what to extract
     * @param options - optional extraction options (see `Options`)
     * @returns a promise of the value typed after `schema`
     */
    extract<S extends z.AnyZodObject | z.ZodArray>(
      input: unknown,
      schema: S,
      options?: Options
    ): Promise<z.TypeOf<S>>
  }
}
31
+
32
// Sentinel markers used in the extraction prompt and when parsing the model's answer.
// START / END wrap each extracted JSON object.
const START = '■json_start■'
const END = '■json_end■'
// Emitted after the last element when extracting a list.
const NO_MORE = '■NO_MORE_ELEMENT■'
35
+
36
/**
 * Extracts structured data matching `schema` from an arbitrary `input`.
 *
 * - `schema` must be an object schema or an array of object schemas; anything else throws.
 * - When the stringified input is larger than the chunk budget, array schemas are processed
 *   chunk by chunk (recursive calls run in parallel, results concatenated) while object
 *   schemas get a truncated input.
 * - When a task id is set, examples are fetched from the adapter; an example whose key matches
 *   exactly is returned without calling the model, and every new result is saved back.
 *
 * @param input - value to extract from; it is stringified before being sent to the model
 * @param schema - object schema, or array-of-object schema
 * @param _options - caller options, validated and defaulted through `Options`
 * @returns the extracted object, or an array of extracted objects for array schemas
 * @throws Error when the schema is unsupported or the model returns no text content
 * @throws JsonParsingError when an extracted element cannot be parsed or validated
 */
Zai.prototype.extract = async function (this: Zai, input, schema, _options) {
  const options = Options.parse(_options ?? {})
  const tokenizer = await this.getTokenizer()
  await this.fetchModelDetails()

  const taskId = this.taskId
  const taskType = 'zai.extract'

  const PROMPT_COMPONENT = Math.max(this.ModelDetails.input.maxTokens - PROMPT_INPUT_BUFFER, 100)

  let isArrayOfObjects = false
  const originalSchema = schema

  const baseType = (schema.naked ? schema.naked() : schema)?.constructor?.name ?? 'unknown'

  if (baseType === 'ZodObject') {
    // Do nothing
  } else if (baseType === 'ZodArray') {
    let elementType = (schema as any).element
    // Optional chaining so a missing element type reaches the descriptive error below
    // instead of failing with a TypeError here.
    if (elementType?.naked) {
      elementType = elementType.naked()
    }

    if (elementType?.constructor?.name === 'ZodObject') {
      isArrayOfObjects = true
      // From here on `schema` describes ONE element; `originalSchema` keeps the array schema.
      schema = elementType
    } else {
      throw new Error('Schema must be a ZodObject or a ZodArray<ZodObject>')
    }
  } else {
    throw new Error('Schema must be either a ZuiObject or a ZuiArray<ZuiObject>')
  }

  const schemaTypescript = schema.toTypescript({ declaration: false })
  const schemaLength = tokenizer.count(schemaTypescript)

  // The schema is part of the prompt, so it eats into the budget available for the input.
  options.chunkLength = Math.min(
    options.chunkLength,
    this.ModelDetails.input.maxTokens - PROMPT_INPUT_BUFFER - schemaLength
  )

  const keys = Object.keys((schema as ZodObject).shape)

  let inputAsString = stringify(input)

  if (tokenizer.count(inputAsString) > options.chunkLength) {
    // If we want to extract an array of objects, we will run this function recursively
    if (isArrayOfObjects) {
      const tokens = tokenizer.split(inputAsString)
      const chunks = chunk(tokens, options.chunkLength).map((x) => x.join(''))
      // Forward the caller's own options so `instructions` and `chunkLength` also apply to
      // every chunk. The unparsed `_options` is passed (not the locally adjusted `options`)
      // so the recursive `Options.parse` sees exactly what the caller provided.
      const all = await Promise.all(
        chunks.map((piece) => this.extract(piece, originalSchema as z.AnyZodObject, _options))
      )

      return all.flat()
    } else {
      // Truncate the input to fit the model's input size
      inputAsString = tokenizer.truncate(inputAsString, options.chunkLength)
    }
  }

  const instructions: string[] = []

  if (options.instructions) {
    instructions.push(options.instructions)
  }

  const shape = `{ ${keys.map((key) => `"${key}": ...`).join(', ')} }`
  const abbv = '{ ... }'

  if (isArrayOfObjects) {
    instructions.push('You may have multiple elements, or zero elements in the input.')
    instructions.push('You must extract each element separately.')
    instructions.push(`Each element must be a JSON object with exactly the format: ${START}${shape}${END}`)
    instructions.push(`When you are done extracting all elements, type "${NO_MORE}" to finish.`)
    instructions.push(`For example, if you have zero elements, the output should look like this: ${NO_MORE}`)
    instructions.push(
      `For example, if you have two elements, the output should look like this: ${START}${abbv}${END}${START}${abbv}${END}${NO_MORE}`
    )
  } else {
    instructions.push('You may have exactly one element in the input.')
    instructions.push(`The element must be a JSON object with exactly the format: ${START}${shape}${END}`)
  }

  // All tokens remaining after the input and condition are accounted can be used for examples
  const EXAMPLES_TOKENS = PROMPT_COMPONENT - tokenizer.count(inputAsString) - tokenizer.count(instructions.join('\n'))

  // Identifies this exact (task, input, instructions) combination among stored examples.
  const key = fastHash(
    JSON.stringify({
      taskType,
      taskId,
      input: inputAsString,
      instructions: options.instructions,
    })
  )

  const examples = taskId
    ? await this.adapter.getExamples<string, unknown>({
        input: inputAsString,
        taskType,
        taskId,
      })
    : []

  // A stored example with the same key short-circuits the model call.
  const exactMatch = examples.find((x) => x.key === key)
  if (exactMatch) {
    return exactMatch.output
  }

  const defaultExample = isArrayOfObjects
    ? {
        input: `The story goes as follow.
Once upon a time, there was a person named Alice who was 30 years old.
Then, there was a person named Bob who was 25 years old.
The end.`,
        schema: 'Array<{ name: string, age: number }>',
        instructions: 'Extract all people',
        extracted: [
          {
            name: 'Alice',
            age: 30,
          },
          {
            name: 'Bob',
            age: 25,
          },
        ],
      }
    : {
        input: `The story goes as follow.
Once upon a time, there was a person named Alice who was 30 years old.
The end.`,
        schema: '{ name: string, age: number }',
        instructions: 'Extract the person',
        extracted: { name: 'Alice', age: 30 },
      }

  const userExamples = examples.map((e) => ({
    input: e.input,
    extracted: e.output,
    schema: schemaTypescript,
    instructions: options.instructions,
  }))

  let exampleId = 1

  // Renders one user turn. Note: the header counter advances on every call, including the
  // call made for the real input at the end of the conversation.
  const formatInput = (input: string, schema: string, instructions?: string) => {
    const header = userExamples.length
      ? `Expert Example #${exampleId++}`
      : "Here's an example to help you understand the format:"

    return `
${header}

<|start_schema|>
${schema}
<|end_schema|>

<|start_instructions|>
${instructions ?? 'No specific instructions, just follow the schema above.'}
<|end_instructions|>

<|start_input|>
${input.trim()}
<|end_input|>
`.trim()
  }

  // Renders one assistant turn: each extracted element wrapped in START/END, then NO_MORE.
  const formatOutput = (extracted: any) => {
    extracted = isArray(extracted) ? extracted : [extracted]

    return (
      extracted
        .map((x: unknown) =>
          `
${START}
${JSON.stringify(x, null, 2)}
${END}`.trim()
        )
        .join('\n') + NO_MORE
    )
  }

  const formatExample = (example: { input?: any; schema: string; instructions?: string; extracted: any }) => [
    {
      type: 'text' as const,
      content: formatInput(stringify(example.input ?? null), example.schema, example.instructions),
      role: 'user' as const,
    },
    {
      type: 'text' as const,
      content: formatOutput(example.extracted),
      role: 'assistant' as const,
    },
  ]

  const allExamples = takeUntilTokens(
    userExamples.length ? userExamples : [defaultExample],
    EXAMPLES_TOKENS,
    (el) => tokenizer.count(stringify(el.input)) + tokenizer.count(stringify(el.extracted))
  )
    .map(formatExample)
    .flat()

  const { output, meta } = await this.callModel({
    systemPrompt: `
Extract the following information from the input:
${schemaTypescript}
====

${instructions.map((x) => `• ${x}`).join('\n')}
`.trim(),
    stopSequences: [isArrayOfObjects ? NO_MORE : END],
    messages: [
      ...allExamples,
      {
        role: 'user',
        type: 'text',
        content: formatInput(inputAsString, schemaTypescript, options.instructions ?? ''),
      },
    ],
  })

  const answer = output.choices[0]?.content
  if (typeof answer !== 'string') {
    // Fail with an explicit message rather than an opaque TypeError on `.split` below.
    throw new Error('The model did not return any text content to extract from')
  }

  const elements = answer
    .split(START)
    .filter((x) => x.trim().length > 0)
    .map((x) => {
      try {
        // END is absent whenever it was used as the stop sequence (single-object mode) or the
        // answer got cut short. `indexOf` then returns -1, and slicing with it would silently
        // drop the last character, so only cut when the marker is really there.
        const endIndex = x.indexOf(END)
        const json = (endIndex === -1 ? x : x.slice(0, endIndex)).trim()
        const repairedJson = jsonrepair(json)
        const parsedJson = JSON5.parse(repairedJson)

        return schema.parse(parsedJson)
      } catch (error) {
        throw new JsonParsingError(x, error instanceof Error ? error : new Error('Unknown error'))
      }
    })
    .filter((x) => x !== null)

  let final: any

  if (isArrayOfObjects) {
    final = elements
  } else if (elements.length === 0) {
    // Nothing was extracted: fall back to whatever the schema accepts for an empty object
    // (this throws if the schema has required fields).
    final = schema.parse({})
  } else {
    final = elements[0]
  }

  if (taskId) {
    await this.adapter.saveExample({
      key,
      // NOTE(review): examples are looked up above with the bare `taskId` but saved under
      // `zai/${taskId}` — confirm the adapter reconciles the two, otherwise saved examples
      // will never be found again.
      taskId: `zai/${taskId}`,
      taskType,
      instructions: options.instructions ?? 'No specific instructions',
      input: inputAsString,
      output: final,
      metadata: {
        cost: {
          input: meta.cost.input,
          output: meta.cost.output,
        },
        latency: meta.latency,
        model: this.Model,
        tokens: {
          input: meta.tokens.input,
          output: meta.tokens.output,
        },
      },
    })
  }

  return final
}
@@ -0,0 +1,244 @@
1
+ // eslint-disable consistent-type-definitions
2
+ import { z } from '@bpinternal/zui'
3
+
4
+ import { clamp } from 'lodash-es'
5
+ import { fastHash, stringify, takeUntilTokens } from '../utils'
6
+ import { Zai } from '../zai'
7
+ import { PROMPT_INPUT_BUFFER, PROMPT_OUTPUT_BUFFER } from './constants'
8
+
9
/** A single few-shot example for `filter` (input side of the schema below). */
type Example = z.input<typeof Example>

const Example = z.object({
  // The item being judged; any value is accepted.
  input: z.any(),
  // Verdict for this item.
  filter: z.boolean(),
  // Optional explanation of the verdict.
  reason: z.string().optional(),
})
15
+
16
/**
 * Options accepted by `filter`.
 *
 * This is the *input* side of the schema below, i.e. what a caller may pass before
 * defaults are applied by `Options.parse`.
 */
export type Options = z.input<typeof Options>

// Per-item token budget: between 1 and 100,000, falling back to 250 when not provided.
const tokensPerItemOption = z
  .number()
  .min(1)
  .max(100_000)
  .optional()
  .describe('The maximum number of tokens per item')
  .default(250)

// Caller-supplied few-shot examples; an empty list when not provided.
const examplesOption = z.array(Example).describe('Examples to filter the condition against').default([])

const Options = z.object({
  tokensPerItem: tokensPerItemOption,
  examples: examplesOption,
})
27
+
28
declare module '@botpress/zai' {
  interface Zai {
    /**
     * Filters elements of an array against a condition
     *
     * @param input - the items to filter
     * @param condition - natural-language condition each item is checked against
     * @param options - optional filtering options (see `Options`)
     * @returns a promise of an array with the same element type as `input`
     */
    filter<T>(
      input: Array<T>,
      condition: string,
      options?: Options
    ): Promise<Array<T>>
  }
}
34
+
35
// Marker that closes the verdict list in example answers; also used as the model's stop sequence.
const END = '■END■'
36
+
37
/**
 * Keeps the items of `input` that the model judges to meet `condition`.
 *
 * Items are packed into chunks (at most 50 items each, and bounded by a token budget
 * derived from the model's input size), every chunk is sent to the model in parallel, and
 * the kept items of all chunks are concatenated in chunk order.
 *
 * @param input - the items to filter
 * @param condition - natural-language condition; truncated when it exceeds its token budget
 * @param _options - caller options, validated and defaulted through `Options`
 * @returns the items for which the model answered `true`
 * @throws Error when the model returns no text content for a chunk
 */
Zai.prototype.filter = async function (this: Zai, input, condition, _options) {
  const options = Options.parse(_options ?? {})
  const tokenizer = await this.getTokenizer()
  await this.fetchModelDetails()

  const taskId = this.taskId
  const taskType = 'zai.filter'

  const MAX_ITEMS_PER_CHUNK = 50
  const TOKENS_TOTAL_MAX = this.ModelDetails.input.maxTokens - PROMPT_INPUT_BUFFER - PROMPT_OUTPUT_BUFFER
  const TOKENS_EXAMPLES_MAX = Math.floor(Math.max(250, TOKENS_TOTAL_MAX * 0.5))
  // Budget for the condition: its own length, but at least 250 tokens and at most a quarter
  // of the total. (Value first, then lower and upper bound, as `clamp` expects; this yields
  // the same number as before without depending on how clamp treats swapped bounds.)
  const TOKENS_CONDITION_MAX = clamp(tokenizer.count(condition), 250, TOKENS_TOTAL_MAX * 0.25)
  const TOKENS_INPUT_ARRAY_MAX = TOKENS_TOTAL_MAX - TOKENS_EXAMPLES_MAX - TOKENS_CONDITION_MAX

  condition = tokenizer.truncate(condition, TOKENS_CONDITION_MAX)

  // Text form of one item; when `maxTokens` is given the text is truncated to that budget.
  const renderItem = (value: unknown, maxTokens?: number): string => {
    const text = stringify(value ?? null, false)
    return maxTokens === undefined ? text : tokenizer.truncate(text, maxTokens)
  }

  let chunks: Array<typeof input> = []
  let currentChunk: typeof input = []
  let currentChunkTokens = 0

  for (const element of input) {
    // Count exactly what will be written into the prompt for this item.
    const elementTokens = tokenizer.count(renderItem(element, options.tokensPerItem))

    const overBudget = currentChunkTokens + elementTokens > TOKENS_INPUT_ARRAY_MAX
    const chunkIsFull = currentChunk.length >= MAX_ITEMS_PER_CHUNK

    if (currentChunk.length > 0 && (overBudget || chunkIsFull)) {
      chunks.push(currentChunk)
      currentChunk = []
      currentChunkTokens = 0
    }

    currentChunk.push(element)
    currentChunkTokens += elementTokens
  }

  if (currentChunk.length > 0) {
    chunks.push(currentChunk)
  }

  chunks = chunks.filter((x) => x.length > 0)

  // ■1:true■2:true■3:true

  // Renders a user turn listing the items to judge, one "■<index> = <item>" line each.
  const formatInput = (items: Example[], condition: string, maxTokensPerItem?: number) => {
    return `
Condition to check:
${condition}

Items (from ■0 to ■${items.length - 1})
==============================
${items.map((x, idx) => `■${idx} = ${renderItem(x.input, maxTokensPerItem)}`).join('\n')}
`.trim()
  }

  // Renders the matching assistant turn: compact verdicts, END, then the reasons.
  const formatExamples = (examples: Example[]) => {
    return `
${examples.map((x, idx) => `■${idx}:${!!x.filter ? 'true' : 'false'}`).join('')}
${END}
====
Here's the reasoning behind each example:
${examples.map((x, idx) => `■${idx}:${!!x.filter ? 'true' : 'false'}:${x.reason ?? 'No reason provided'}`).join('\n')}
`.trim()
  }

  const genericExamples: Example[] = [
    {
      input: 'apple',
      filter: true,
      reason: 'Apples are fruits',
    },
    {
      input: 'Apple Inc.',
      filter: false,
      reason: 'Apple Inc. is a company, not a fruit',
    },
    {
      input: 'banana',
      filter: true,
      reason: 'Bananas are fruits',
    },
    {
      input: 'potato',
      filter: false,
      reason: 'Potatoes are vegetables',
    },
  ]

  const genericExamplesMessages = [
    {
      type: 'text' as const,
      content: formatInput(genericExamples, 'is a fruit'),
      role: 'user' as const,
    },
    {
      type: 'text' as const,
      content: formatExamples(genericExamples),
      role: 'assistant' as const,
    },
  ]

  // Filters one chunk with a single model call and returns the items that were kept.
  const filterChunk = async (chunk: typeof input) => {
    const storedExamples = taskId
      ? await this.adapter.getExamples<string, unknown>({
          // The Table API can't search for a huge input string
          input: JSON.stringify(chunk).slice(0, 1000),
          taskType,
          taskId,
        })
      : []

    // NOTE(review): examples are saved below with `input` = the whole stringified chunk and
    // `output` = the array of kept items, yet they are read back here as one item with a
    // boolean verdict — confirm what the adapter actually returns.
    const examples = storedExamples.map(
      (y) => ({ filter: y.output as boolean, input: y.input, reason: y.explanation }) satisfies Example
    )

    const allExamples = takeUntilTokens([...examples, ...(options.examples ?? [])], TOKENS_EXAMPLES_MAX, (el) =>
      tokenizer.count(stringify(el.input))
    )

    const exampleMessages = [
      {
        type: 'text' as const,
        content: formatInput(allExamples, condition),
        role: 'user' as const,
      },
      {
        type: 'text' as const,
        content: formatExamples(allExamples),
        role: 'assistant' as const,
      },
    ]

    const { output, meta } = await this.callModel({
      systemPrompt: `
You are given a list of items. Your task is to filter out the items that meet the condition below.
You need to return the full list of items with the format:
■x:true■y:false■z:true (where x, y, z are the indices of the items in the list)
You need to start with "■0" and go up to the last index "■${chunk.length - 1}".
If an item meets the condition, you should return ":true", otherwise ":false".

IMPORTANT: Make sure to read the condition and the examples carefully before making your decision.
The condition is: "${condition}"
`.trim(),
      stopSequences: [END],
      messages: [
        // `exampleMessages` always holds two messages, so the fallback has to be decided on
        // the examples themselves; otherwise an empty example exchange would be sent and the
        // generic examples would never be used.
        ...(allExamples.length ? exampleMessages : genericExamplesMessages),
        {
          type: 'text',
          content: formatInput(
            chunk.map((x) => ({ input: x }) as Example),
            condition,
            // Enforce the per-item budget that the chunking above was computed with.
            options.tokensPerItem
          ),
          role: 'user',
        },
      ],
    })

    const answer = output.choices[0]?.content
    if (typeof answer !== 'string') {
      // Fail with an explicit message rather than an opaque TypeError on `.trim` below.
      throw new Error('The model did not return any text content to filter with')
    }

    // index -> verdict; the first verdict given for an index wins.
    const verdicts = new Map<number, boolean>()
    for (const part of answer.trim().split('■')) {
      if (part.length === 0) {
        continue
      }

      const [rawIndex, rawVerdict] = part.split(':')
      const index = Number.parseInt(rawIndex?.trim() ?? '', 10)
      if (Number.isNaN(index) || verdicts.has(index)) {
        continue
      }

      verdicts.set(index, rawVerdict?.toLowerCase().trim() === 'true')
    }

    // Items the model did not mention are dropped.
    const partial = chunk.filter((_, idx) => verdicts.get(idx) ?? false)

    if (taskId) {
      const key = fastHash(
        stringify({
          taskId,
          taskType,
          input: JSON.stringify(chunk),
          condition,
        })
      )

      await this.adapter.saveExample({
        key,
        taskType,
        taskId,
        input: JSON.stringify(chunk),
        output: partial,
        instructions: condition,
        metadata: {
          cost: {
            input: meta.cost.input,
            output: meta.cost.output,
          },
          latency: meta.latency,
          model: this.Model,
          tokens: {
            input: meta.tokens.input,
            output: meta.tokens.output,
          },
        },
      })
    }

    return partial
  }

  const filteredChunks = await Promise.all(chunks.map(filterChunk))

  return filteredChunks.flat()
}