npm - @botpress/zai - Versions diffs - 1.0.1 → 1.2.0 - Mend

@botpress/zai 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/README.md +1 -1
package/build.ts +9 -0
package/dist/adapters/adapter.js +2 -0
package/dist/adapters/botpress-table.js +168 -0
package/dist/adapters/memory.js +12 -0
package/dist/index.d.ts +111 -609
package/dist/index.js +9 -1873
package/dist/operations/check.js +153 -0
package/dist/operations/constants.js +2 -0
package/dist/operations/errors.js +15 -0
package/dist/operations/extract.js +232 -0
package/dist/operations/filter.js +191 -0
package/dist/operations/label.js +249 -0
package/dist/operations/rewrite.js +123 -0
package/dist/operations/summarize.js +133 -0
package/dist/operations/text.js +47 -0
package/dist/utils.js +37 -0
package/dist/zai.js +100 -0
package/e2e/data/botpress_docs.txt +26040 -0
package/e2e/data/cache.jsonl +107 -0
package/e2e/utils.ts +89 -0
package/package.json +33 -29
package/src/adapters/adapter.ts +35 -0
package/src/adapters/botpress-table.ts +210 -0
package/src/adapters/memory.ts +13 -0
package/src/index.ts +11 -0
package/src/operations/check.ts +201 -0
package/src/operations/constants.ts +2 -0
package/src/operations/errors.ts +9 -0
package/src/operations/extract.ts +309 -0
package/src/operations/filter.ts +244 -0
package/src/operations/label.ts +345 -0
package/src/operations/rewrite.ts +161 -0
package/src/operations/summarize.ts +195 -0
package/src/operations/text.ts +65 -0
package/src/utils.ts +52 -0
package/src/zai.ts +147 -0
package/tsconfig.json +3 -23
package/dist/index.cjs +0 -1903
package/dist/index.cjs.map +0 -1
package/dist/index.d.cts +0 -916
package/dist/index.js.map +0 -1
package/tsup.config.ts +0 -16
package/vitest.config.ts +0 -9
package/vitest.setup.ts +0 -24

package/dist/operations/check.js ADDED Viewed

@@ -0,0 +1,153 @@
+import { z } from "@bpinternal/zui";
+import { fastHash, stringify, takeUntilTokens } from "../utils";
+import { Zai } from "../zai";
+import { PROMPT_INPUT_BUFFER } from "./constants";
+const Example = z.object({
+  input: z.any(),
+  check: z.boolean(),
+  reason: z.string().optional()
+});
+const Options = z.object({
+  examples: z.array(Example).describe("Examples to check the condition against").default([])
+});
+const TRUE = "\u25A0TRUE\u25A0";
+const FALSE = "\u25A0FALSE\u25A0";
+const END = "\u25A0END\u25A0";
+Zai.prototype.check = async function(input, condition, _options) {
+  const options = Options.parse(_options ?? {});
+  const tokenizer = await this.getTokenizer();
+  await this.fetchModelDetails();
+  const PROMPT_COMPONENT = Math.max(this.ModelDetails.input.maxTokens - PROMPT_INPUT_BUFFER, 100);
+  const taskId = this.taskId;
+  const taskType = "zai.check";
+  const PROMPT_TOKENS = {
+    INPUT: Math.floor(0.5 * PROMPT_COMPONENT),
+    CONDITION: Math.floor(0.2 * PROMPT_COMPONENT)
+  };
+  const inputAsString = tokenizer.truncate(stringify(input), PROMPT_TOKENS.INPUT);
+  condition = tokenizer.truncate(condition, PROMPT_TOKENS.CONDITION);
+  const EXAMPLES_TOKENS = PROMPT_COMPONENT - tokenizer.count(inputAsString) - tokenizer.count(condition);
+  const Key = fastHash(
+    JSON.stringify({
+      taskType,
+      taskId,
+      input: inputAsString,
+      condition
+    })
+  );
+  const examples = taskId ? await this.adapter.getExamples({
+    input: inputAsString,
+    taskType,
+    taskId
+  }) : [];
+  const exactMatch = examples.find((x) => x.key === Key);
+  if (exactMatch) {
+    return exactMatch.output;
+  }
+  const defaultExamples = [
+    { input: "50 Cent", check: true, reason: "50 Cent is widely recognized as a public personality." },
+    {
+      input: ["apple", "banana", "carrot", "house"],
+      check: false,
+      reason: "The list contains a house, which is not a fruit. Also, the list contains a carrot, which is a vegetable."
+    }
+  ];
+  const userExamples = [
+    ...examples.map((e) => ({ input: e.input, check: e.output, reason: e.explanation })),
+    ...options.examples
+  ];
+  let exampleId = 1;
+  const formatInput = (input2, condition2) => {
+    const header = userExamples.length ? `Expert Example #${exampleId++}` : `Example of condition: "${condition2}"`;
+    return `
+${header}
+<|start_input|>
+${input2.trim()}
+<|end_input|>
+`.trim();
+  };
+  const formatOutput = (answer2, justification) => {
+    return `
+Analysis: ${justification}
+Final Answer: ${answer2 ? TRUE : FALSE}
+${END}
+`.trim();
+  };
+  const formatExample = (example) => [
+    { type: "text", content: formatInput(stringify(example.input ?? null), condition), role: "user" },
+    {
+      type: "text",
+      content: formatOutput(example.check, example.reason ?? ""),
+      role: "assistant"
+    }
+  ];
+  const allExamples = takeUntilTokens(
+    userExamples.length ? userExamples : defaultExamples,
+    EXAMPLES_TOKENS,
+    (el) => tokenizer.count(stringify(el.input)) + tokenizer.count(el.reason ?? "")
+  ).map(formatExample).flat();
+  const specialInstructions = userExamples.length ? `
+- You have been provided with examples from previous experts. Make sure to read them carefully before making your decision.
+- Make sure to refer to the examples provided by the experts to justify your decision (when applicable).
+- When in doubt, ground your decision on the examples provided by the experts instead of your own intuition.
+- When no example is similar to the input, make sure to provide a clear justification for your decision while inferring the decision-making process from the examples provided by the experts.
+`.trim() : "";
+  const { output, meta } = await this.callModel({
+    systemPrompt: `
+Check if the following condition is true or false for the given input. Before answering, make sure to read the input and the condition carefully.
+Justify your answer, then answer with either ${TRUE} or ${FALSE} at the very end, then add ${END} to finish the response.
+IMPORTANT: Make sure to answer with either ${TRUE} or ${FALSE} at the end of your response, but NOT both.
+---
+Expert Examples (#1 to #${exampleId - 1}):
+${specialInstructions}
+`.trim(),
+    stopSequences: [END],
+    messages: [
+      ...allExamples,
+      {
+        type: "text",
+        content: `
+Considering the below input and above examples, is the following condition true or false?
+${formatInput(inputAsString, condition)}
+In your "Analysis", please refer to the Expert Examples # to justify your decision.`.trim(),
+        role: "user"
+      }
+    ]
+  });
+  const answer = output.choices[0]?.content;
+  const hasTrue = answer.includes(TRUE);
+  const hasFalse = answer.includes(FALSE);
+  if (!hasTrue && !hasFalse) {
+    throw new Error(`The model did not return a valid answer. The response was: ${answer}`);
+  }
+  let finalAnswer;
+  if (hasTrue && hasFalse) {
+    finalAnswer = answer.lastIndexOf(TRUE) > answer.lastIndexOf(FALSE);
+  } else {
+    finalAnswer = hasTrue;
+  }
+  if (taskId) {
+    await this.adapter.saveExample({
+      key: Key,
+      taskType,
+      taskId,
+      input: inputAsString,
+      instructions: condition,
+      metadata: {
+        cost: {
+          input: meta.cost.input,
+          output: meta.cost.output
+        },
+        latency: meta.latency,
+        model: this.Model,
+        tokens: {
+          input: meta.tokens.input,
+          output: meta.tokens.output
+        }
+      },
+      output: finalAnswer,
+      explanation: answer.replace(TRUE, "").replace(FALSE, "").replace(END, "").replace("Final Answer:", "").trim()
+    });
+  }
+  return finalAnswer;
+};

package/dist/operations/constants.js ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export const PROMPT_INPUT_BUFFER = 1048; // tokens subtracted from the model's max input tokens when operations size their prompts
2	+ export const PROMPT_OUTPUT_BUFFER = 512; // additional tokens subtracted from the input budget by the filter operation

package/dist/operations/errors.js ADDED Viewed

@@ -0,0 +1,15 @@
+/**
+ * Error raised when a piece of model output cannot be turned into valid JSON.
+ *
+ * The message embeds both the offending text and the underlying error; both are
+ * also exposed as the `json` and `error` properties.
+ */
+export class JsonParsingError extends Error {
+  /**
+   * @param {*} json - The text (or value) that failed to parse.
+   * @param {*} error - The underlying error raised while parsing.
+   */
+  constructor(json, error) {
+    const message = `Error parsing JSON:
+---JSON---
+${json}
+---Error---
+ ${error}`;
+    super(message);
+    // Without an explicit name, logs and String(err) label this as a plain "Error".
+    this.name = "JsonParsingError";
+    this.json = json;
+    this.error = error;
+  }
+}

package/dist/operations/extract.js ADDED Viewed

@@ -0,0 +1,232 @@
+import { z } from "@bpinternal/zui";
+import JSON5 from "json5";
+import { jsonrepair } from "jsonrepair";
+import { chunk, isArray } from "lodash-es";
+import { fastHash, stringify, takeUntilTokens } from "../utils";
+import { Zai } from "../zai";
+import { PROMPT_INPUT_BUFFER } from "./constants";
+import { JsonParsingError } from "./errors";
+const Options = z.object({
+  instructions: z.string().optional().describe("Instructions to guide the user on how to extract the data"),
+  chunkLength: z.number().min(100).max(1e5).optional().describe("The maximum number of tokens per chunk").default(16e3)
+});
+const START = "\u25A0json_start\u25A0";
+const END = "\u25A0json_end\u25A0";
+const NO_MORE = "\u25A0NO_MORE_ELEMENT\u25A0";
+Zai.prototype.extract = async function(input, schema, _options) {
+  const options = Options.parse(_options ?? {});
+  const tokenizer = await this.getTokenizer();
+  await this.fetchModelDetails();
+  const taskId = this.taskId;
+  const taskType = "zai.extract";
+  const PROMPT_COMPONENT = Math.max(this.ModelDetails.input.maxTokens - PROMPT_INPUT_BUFFER, 100);
+  let isArrayOfObjects = false;
+  const originalSchema = schema;
+  const baseType = (schema.naked ? schema.naked() : schema)?.constructor?.name ?? "unknown";
+  if (baseType === "ZodObject") {
+  } else if (baseType === "ZodArray") {
+    let elementType = schema.element;
+    if (elementType.naked) {
+      elementType = elementType.naked();
+    }
+    if (elementType?.constructor?.name === "ZodObject") {
+      isArrayOfObjects = true;
+      schema = elementType;
+    } else {
+      throw new Error("Schema must be a ZodObject or a ZodArray<ZodObject>");
+    }
+  } else {
+    throw new Error("Schema must be either a ZuiObject or a ZuiArray<ZuiObject>");
+  }
+  const schemaTypescript = schema.toTypescript({ declaration: false });
+  const schemaLength = tokenizer.count(schemaTypescript);
+  options.chunkLength = Math.min(
+    options.chunkLength,
+    this.ModelDetails.input.maxTokens - PROMPT_INPUT_BUFFER - schemaLength
+  );
+  const keys = Object.keys(schema.shape);
+  let inputAsString = stringify(input);
+  if (tokenizer.count(inputAsString) > options.chunkLength) {
+    if (isArrayOfObjects) {
+      const tokens = tokenizer.split(inputAsString);
+      const chunks = chunk(tokens, options.chunkLength).map((x) => x.join(""));
+      const all = await Promise.all(chunks.map((chunk2) => this.extract(chunk2, originalSchema)));
+      return all.flat();
+    } else {
+      inputAsString = tokenizer.truncate(stringify(input), options.chunkLength);
+    }
+  }
+  const instructions = [];
+  if (options.instructions) {
+    instructions.push(options.instructions);
+  }
+  const shape = `{ ${keys.map((key) => `"${key}": ...`).join(", ")} }`;
+  const abbv = "{ ... }";
+  if (isArrayOfObjects) {
+    instructions.push("You may have multiple elements, or zero elements in the input.");
+    instructions.push("You must extract each element separately.");
+    instructions.push(`Each element must be a JSON object with exactly the format: ${START}${shape}${END}`);
+    instructions.push(`When you are done extracting all elements, type "${NO_MORE}" to finish.`);
+    instructions.push(`For example, if you have zero elements, the output should look like this: ${NO_MORE}`);
+    instructions.push(
+      `For example, if you have two elements, the output should look like this: ${START}${abbv}${END}${START}${abbv}${END}${NO_MORE}`
+    );
+  } else {
+    instructions.push("You may have exactly one element in the input.");
+    instructions.push(`The element must be a JSON object with exactly the format: ${START}${shape}${END}`);
+  }
+  const EXAMPLES_TOKENS = PROMPT_COMPONENT - tokenizer.count(inputAsString) - tokenizer.count(instructions.join("\n"));
+  const Key = fastHash(
+    JSON.stringify({
+      taskType,
+      taskId,
+      input: inputAsString,
+      instructions: options.instructions
+    })
+  );
+  const examples = taskId ? await this.adapter.getExamples({
+    input: inputAsString,
+    taskType,
+    taskId
+  }) : [];
+  const exactMatch = examples.find((x) => x.key === Key);
+  if (exactMatch) {
+    return exactMatch.output;
+  }
+  const defaultExample = isArrayOfObjects ? {
+    input: `The story goes as follow.
+Once upon a time, there was a person named Alice who was 30 years old.
+Then, there was a person named Bob who was 25 years old.
+The end.`,
+    schema: "Array<{ name: string, age: number }>",
+    instructions: "Extract all people",
+    extracted: [
+      {
+        name: "Alice",
+        age: 30
+      },
+      {
+        name: "Bob",
+        age: 25
+      }
+    ]
+  } : {
+    input: `The story goes as follow.
+Once upon a time, there was a person named Alice who was 30 years old.
+The end.`,
+    schema: "{ name: string, age: number }",
+    instructions: "Extract the person",
+    extracted: { name: "Alice", age: 30 }
+  };
+  const userExamples = examples.map((e) => ({
+    input: e.input,
+    extracted: e.output,
+    schema: schemaTypescript,
+    instructions: options.instructions
+  }));
+  let exampleId = 1;
+  const formatInput = (input2, schema2, instructions2) => {
+    const header = userExamples.length ? `Expert Example #${exampleId++}` : "Here's an example to help you understand the format:";
+    return `
+${header}
+<|start_schema|>
+${schema2}
+<|end_schema|>
+<|start_instructions|>
+${instructions2 ?? "No specific instructions, just follow the schema above."}
+<|end_instructions|>
+<|start_input|>
+${input2.trim()}
+<|end_input|>
+  `.trim();
+  };
+  const formatOutput = (extracted) => {
+    extracted = isArray(extracted) ? extracted : [extracted];
+    return extracted.map(
+      (x) => `
+${START}
+${JSON.stringify(x, null, 2)}
+${END}`.trim()
+    ).join("\n") + NO_MORE;
+  };
+  const formatExample = (example) => [
+    {
+      type: "text",
+      content: formatInput(stringify(example.input ?? null), example.schema, example.instructions),
+      role: "user"
+    },
+    {
+      type: "text",
+      content: formatOutput(example.extracted),
+      role: "assistant"
+    }
+  ];
+  const allExamples = takeUntilTokens(
+    userExamples.length ? userExamples : [defaultExample],
+    EXAMPLES_TOKENS,
+    (el) => tokenizer.count(stringify(el.input)) + tokenizer.count(stringify(el.extracted))
+  ).map(formatExample).flat();
+  const { output, meta } = await this.callModel({
+    systemPrompt: `
+Extract the following information from the input:
+${schemaTypescript}
+====
+${instructions.map((x) => `\u2022 ${x}`).join("\n")}
+`.trim(),
+    stopSequences: [isArrayOfObjects ? NO_MORE : END],
+    messages: [
+      ...allExamples,
+      {
+        role: "user",
+        type: "text",
+        content: formatInput(inputAsString, schemaTypescript, options.instructions ?? "")
+      }
+    ]
+  });
+  const answer = output.choices[0]?.content;
+  const elements = answer.split(START).filter((x) => x.trim().length > 0).map((x) => {
+    try {
+      const json = x.slice(0, x.indexOf(END)).trim();
+      const repairedJson = jsonrepair(json);
+      const parsedJson = JSON5.parse(repairedJson);
+      return schema.parse(parsedJson);
+    } catch (error) {
+      throw new JsonParsingError(x, error instanceof Error ? error : new Error("Unknown error"));
+    }
+  }).filter((x) => x !== null);
+  let final;
+  if (isArrayOfObjects) {
+    final = elements;
+  } else if (elements.length === 0) {
+    final = schema.parse({});
+  } else {
+    final = elements[0];
+  }
+  if (taskId) {
+    await this.adapter.saveExample({
+      key: Key,
+      taskId: `zai/${taskId}`,
+      taskType,
+      instructions: options.instructions ?? "No specific instructions",
+      input: inputAsString,
+      output: final,
+      metadata: {
+        cost: {
+          input: meta.cost.input,
+          output: meta.cost.output
+        },
+        latency: meta.latency,
+        model: this.Model,
+        tokens: {
+          input: meta.tokens.input,
+          output: meta.tokens.output
+        }
+      }
+    });
+  }
+  return final;
+};

package/dist/operations/filter.js ADDED Viewed

@@ -0,0 +1,191 @@
+import { z } from "@bpinternal/zui";
+import { clamp } from "lodash-es";
+import { fastHash, stringify, takeUntilTokens } from "../utils";
+import { Zai } from "../zai";
+import { PROMPT_INPUT_BUFFER, PROMPT_OUTPUT_BUFFER } from "./constants";
+const Example = z.object({
+  input: z.any(),
+  filter: z.boolean(),
+  reason: z.string().optional()
+});
+const Options = z.object({
+  tokensPerItem: z.number().min(1).max(1e5).optional().describe("The maximum number of tokens per item").default(250),
+  examples: z.array(Example).describe("Examples to filter the condition against").default([])
+});
+const END = "\u25A0END\u25A0";
+Zai.prototype.filter = async function(input, condition, _options) {
+  const options = Options.parse(_options ?? {});
+  const tokenizer = await this.getTokenizer();
+  await this.fetchModelDetails();
+  const taskId = this.taskId;
+  const taskType = "zai.filter";
+  const MAX_ITEMS_PER_CHUNK = 50;
+  const TOKENS_TOTAL_MAX = this.ModelDetails.input.maxTokens - PROMPT_INPUT_BUFFER - PROMPT_OUTPUT_BUFFER;
+  const TOKENS_EXAMPLES_MAX = Math.floor(Math.max(250, TOKENS_TOTAL_MAX * 0.5));
+  const TOKENS_CONDITION_MAX = clamp(TOKENS_TOTAL_MAX * 0.25, 250, tokenizer.count(condition));
+  const TOKENS_INPUT_ARRAY_MAX = TOKENS_TOTAL_MAX - TOKENS_EXAMPLES_MAX - TOKENS_CONDITION_MAX;
+  condition = tokenizer.truncate(condition, TOKENS_CONDITION_MAX);
+  let chunks = [];
+  let currentChunk = [];
+  let currentChunkTokens = 0;
+  for (const element of input) {
+    const elementAsString = tokenizer.truncate(stringify(element, false), options.tokensPerItem);
+    const elementTokens = tokenizer.count(elementAsString);
+    if (currentChunkTokens + elementTokens > TOKENS_INPUT_ARRAY_MAX || currentChunk.length >= MAX_ITEMS_PER_CHUNK) {
+      chunks.push(currentChunk);
+      currentChunk = [];
+      currentChunkTokens = 0;
+    }
+    currentChunk.push(element);
+    currentChunkTokens += elementTokens;
+  }
+  if (currentChunk.length > 0) {
+    chunks.push(currentChunk);
+  }
+  chunks = chunks.filter((x) => x.length > 0);
+  const formatInput = (input2, condition2) => {
+    return `
+Condition to check:
+${condition2}
+Items (from \u25A00 to \u25A0${input2.length - 1})
+==============================
+${input2.map((x, idx) => `\u25A0${idx} = ${stringify(x.input ?? null, false)}`).join("\n")}
+`.trim();
+  };
+  const formatExamples = (examples) => {
+    return `
+${examples.map((x, idx) => `\u25A0${idx}:${!!x.filter ? "true" : "false"}`).join("")}
+${END}
+====
+Here's the reasoning behind each example:
+${examples.map((x, idx) => `\u25A0${idx}:${!!x.filter ? "true" : "false"}:${x.reason ?? "No reason provided"}`).join("\n")}
+`.trim();
+  };
+  const genericExamples = [
+    {
+      input: "apple",
+      filter: true,
+      reason: "Apples are fruits"
+    },
+    {
+      input: "Apple Inc.",
+      filter: false,
+      reason: "Apple Inc. is a company, not a fruit"
+    },
+    {
+      input: "banana",
+      filter: true,
+      reason: "Bananas are fruits"
+    },
+    {
+      input: "potato",
+      filter: false,
+      reason: "Potatoes are vegetables"
+    }
+  ];
+  const genericExamplesMessages = [
+    {
+      type: "text",
+      content: formatInput(genericExamples, "is a fruit"),
+      role: "user"
+    },
+    {
+      type: "text",
+      content: formatExamples(genericExamples),
+      role: "assistant"
+    }
+  ];
+  const filterChunk = async (chunk) => {
+    const examples = taskId ? await this.adapter.getExamples({
+      // The Table API can't search for a huge input string
+      input: JSON.stringify(chunk).slice(0, 1e3),
+      taskType,
+      taskId
+    }).then(
+      (x) => x.map((y) => ({ filter: y.output, input: y.input, reason: y.explanation }))
+    ) : [];
+    const allExamples = takeUntilTokens(
+      [...examples, ...options.examples ?? []],
+      TOKENS_EXAMPLES_MAX,
+      (el) => tokenizer.count(stringify(el.input))
+    );
+    const exampleMessages = [
+      {
+        type: "text",
+        content: formatInput(allExamples, condition),
+        role: "user"
+      },
+      {
+        type: "text",
+        content: formatExamples(allExamples),
+        role: "assistant"
+      }
+    ];
+    const { output, meta } = await this.callModel({
+      systemPrompt: `
+You are given a list of items. Your task is to filter out the items that meet the condition below.
+You need to return the full list of items with the format:
+\u25A0x:true\u25A0y:false\u25A0z:true (where x, y, z are the indices of the items in the list)
+You need to start with "\u25A00" and go up to the last index "\u25A0${chunk.length - 1}".
+If an item meets the condition, you should return ":true", otherwise ":false".
+IMPORTANT: Make sure to read the condition and the examples carefully before making your decision.
+The condition is: "${condition}"
+`.trim(),
+      stopSequences: [END],
+      messages: [
+        ...exampleMessages.length ? exampleMessages : genericExamplesMessages,
+        {
+          type: "text",
+          content: formatInput(
+            chunk.map((x) => ({ input: x })),
+            condition
+          ),
+          role: "user"
+        }
+      ]
+    });
+    const answer = output.choices[0]?.content;
+    const indices = answer.trim().split("\u25A0").filter((x) => x.length > 0).map((x) => {
+      const [idx, filter] = x.split(":");
+      return { idx: parseInt(idx?.trim() ?? ""), filter: filter?.toLowerCase().trim() === "true" };
+    });
+    const partial = chunk.filter((_, idx) => {
+      return indices.find((x) => x.idx === idx)?.filter ?? false;
+    });
+    if (taskId) {
+      const key = fastHash(
+        stringify({
+          taskId,
+          taskType,
+          input: JSON.stringify(chunk),
+          condition
+        })
+      );
+      await this.adapter.saveExample({
+        key,
+        taskType,
+        taskId,
+        input: JSON.stringify(chunk),
+        output: partial,
+        instructions: condition,
+        metadata: {
+          cost: {
+            input: meta.cost.input,
+            output: meta.cost.output
+          },
+          latency: meta.latency,
+          model: this.Model,
+          tokens: {
+            input: meta.tokens.input,
+            output: meta.tokens.output
+          }
+        }
+      });
+    }
+    return partial;
+  };
+  const filteredChunks = await Promise.all(chunks.map(filterChunk));
+  return filteredChunks.flat();
+};