npm - @botpress/zai - Versions diffs - 2.1.20 → 2.3.0 - Mend

@botpress/zai 2.1.20 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/CLAUDE.md +696 -0
package/README.md +79 -2
package/dist/index.d.ts +85 -14
package/dist/index.js +3 -0
package/dist/operations/group.js +369 -0
package/dist/operations/rate.js +350 -0
package/dist/operations/sort.js +450 -0
package/e2e/data/cache.jsonl +289 -0
package/package.json +1 -1
package/src/index.ts +3 -0
package/src/operations/group.ts +543 -0
package/src/operations/rate.ts +518 -0
package/src/operations/sort.ts +618 -0

package/src/operations/rate.ts ADDED Viewed

@@ -0,0 +1,518 @@
+// eslint-disable consistent-type-definitions
+import { z } from '@bpinternal/zui'
+import pLimit from 'p-limit'
+import { ZaiContext } from '../context'
+import { Response } from '../response'
+import { getTokenizer } from '../tokenizer'
+import { fastHash, stringify } from '../utils'
+import { Zai } from '../zai'
+import { PROMPT_INPUT_BUFFER, PROMPT_OUTPUT_BUFFER } from './constants'
+/** Numeric score (1-5) behind each label of the rating scale. */
+const RATING_VALUES = { very_bad: 1, bad: 2, average: 3, good: 4, very_good: 5 } as const
+/** Any label of the rating scale, from `very_bad` to `very_good`. */
+type RatingLabel = keyof typeof RATING_VALUES
+/**
+ * Evaluation criteria generated by the LLM: maps each criterion name to a short
+ * description of what every rating label means for that criterion.
+ */
+type EvaluationCriteria = Record<string, Record<RatingLabel, string>>
+/**
+ * What items are rated on: either a free-form description (criteria are then generated by the model)
+ * or a record mapping each criterion name to its description.
+ */
+export type RatingInstructions = string | Record<string, string>
+/** Optional tuning knobs for `rate`; omitted fields fall back to the defaults declared in `_Options`. */
+export type Options = {
+  /** The maximum number of tokens per item */
+  tokensPerItem?: number
+  /** The maximum number of items to rate per chunk */
+  maxItemsPerChunk?: number
+}
+/** Builds an optional numeric field constrained to the inclusive range [1, max]. */
+const _boundedCount = (max: number) => z.number().min(1).max(max).optional()
+/** Runtime schema for `Options`: validates the bounds and applies the defaults (250 tokens, 50 items). */
+const _Options = z.object({
+  tokensPerItem: _boundedCount(100_000).describe('The maximum number of tokens per item').default(250),
+  maxItemsPerChunk: _boundedCount(100).describe('The maximum number of items to rate per chunk').default(50),
+})
+/*
+ * Result types based on instructions type.
+ * RatingResult is the full rating of one item:
+ *   - string instructions: criterion names are not known at compile time, so the result is an open
+ *     map of criterion name to score, plus `total`;
+ *   - record instructions: one numeric score per key of the instructions record, plus `total`.
+ * SimplifiedRatingResult collapses the string-instructions case to a single number and leaves the
+ * record-instructions case as the full RatingResult.
+ */
+export type RatingResult<T extends RatingInstructions> = T extends string
+  ? {
+      [key: string]: number // criteria scores
+      total: number // sum of all criteria
+    }
+  : T extends Record<string, string>
+    ? {
+        [K in keyof T]: number // score for each criterion
+      } & {
+        total: number // sum of all criteria
+      }
+    : never
+export type SimplifiedRatingResult<T extends RatingInstructions> = T extends string ? number : RatingResult<T>
+declare module '@botpress/zai' {
+  interface Zai {
+    /**
+     * Rates an array of items based on provided instructions.
+     *
+     * Every criterion is scored from 1 (very_bad) to 5 (very_good), and each item also gets a `total`
+     * equal to the sum of its criterion scores.
+     * - If instructions is a string, the criteria are generated by the model and the simplified result
+     *   for each item is its `total`, i.e. a sum over all generated criteria (it can exceed 5), not a
+     *   single 1-5 score.
+     * - If instructions is a Record, each item's result holds a score per criterion plus `total`.
+     */
+    rate<T, I extends RatingInstructions>(
+      input: Array<T>,
+      instructions: I,
+      options?: Options
+    ): Response<Array<RatingResult<I>>, Array<SimplifiedRatingResult<I>>>
+  }
+}
+/** Sentinel that terminates the model's rating output; it is also passed as the stop sequence. */
+const END = '■END■'
+/**
+ * Rates every item of `input` against a set of evaluation criteria.
+ *
+ * Phases:
+ * 1. Ask the model to describe each criterion with five labels (very_bad ... very_good). With string
+ *    instructions the model also invents the criteria; with record instructions the caller's keys
+ *    are the criteria.
+ * 2. Split the items into chunks bounded by a token budget and by `maxItemsPerChunk`.
+ * 3. Rate the chunks concurrently (at most 10 at a time), reusing an exactly matching saved example
+ *    when the adapter returns one.
+ * 4. Flatten the per-chunk ratings and, when a task id and an adapter are available and the run was
+ *    not aborted, save the run as an example.
+ *
+ * @param input - Items to rate; each one is stringified and truncated to `tokensPerItem` tokens.
+ * @param instructions - Free-form description, or a record of criterion name to description.
+ * @param _options - Optional limits, validated against `_Options`.
+ * @param ctx - Context providing the model, the abort signal and the optional example adapter.
+ * @returns One rating per input item, in input order: a 1-5 score for every criterion plus `total`
+ *          (the sum of those scores). Criteria the model did not rate default to 3 (average).
+ * @throws Error when record instructions contain no criterion, when no JSON can be extracted from the
+ *         criteria response, or when the model generates no criteria. Also throws if the abort
+ *         signal has fired.
+ */
+const rate = async <T, I extends RatingInstructions>(
+  input: Array<T>,
+  instructions: I,
+  _options: Options | undefined,
+  ctx: ZaiContext
+): Promise<Array<RatingResult<I>>> => {
+  ctx.controller.signal.throwIfAborted()
+  const options = _Options.parse(_options ?? {})
+  // Handle empty array before loading the tokenizer/model: there is nothing to rate
+  if (input.length === 0) {
+    return []
+  }
+  const tokenizer = await getTokenizer()
+  const model = await ctx.getModel()
+  const taskId = ctx.taskId
+  const taskType = 'zai.rate'
+  const TOKENS_TOTAL_MAX = model.input.maxTokens - PROMPT_INPUT_BUFFER - PROMPT_OUTPUT_BUFFER
+  // Score used whenever the model gives no usable label for a criterion
+  const DEFAULT_RATING: number = RATING_VALUES.average
+  // Phase 1: Generate evaluation criteria
+  const isStringInstructions = typeof instructions === 'string'
+  const criteriaKeys: string[] = isStringInstructions
+    ? [] // Will be generated by LLM
+    : Object.keys(instructions as Record<string, string>)
+  if (!isStringInstructions && criteriaKeys.length === 0) {
+    // An empty record would otherwise interpolate "undefined" into the prompt below
+    throw new Error('No evaluation criteria provided')
+  }
+  const generateCriteriaPrompt = isStringInstructions
+    ? `Generate 3-5 evaluation criteria for: "${instructions}"
+For each criterion, provide 5 labels (very_bad, bad, average, good, very_good) with brief descriptions.
+Output format (JSON):
+{
+  "criterion1_name": {
+    "very_bad": "description",
+    "bad": "description",
+    "average": "description",
+    "good": "description",
+    "very_good": "description"
+  },
+  "criterion2_name": { ... }
+}
+Keep criterion names short (1-2 words, lowercase, use underscores).
+Keep descriptions brief (5-10 words each).`
+    : `For these evaluation criteria, provide 5 labels (very_bad, bad, average, good, very_good) with brief descriptions for each:
+${criteriaKeys.map((key) => `- ${key}: ${(instructions as Record<string, string>)[key]}`).join('\n')}
+Output format (JSON):
+{
+  "${criteriaKeys[0]}": {
+    "very_bad": "description",
+    "bad": "description",
+    "average": "description",
+    "good": "description",
+    "very_good": "description"
+  }
+  ${criteriaKeys.length > 1 ? '...' : ''}
+}
+Keep descriptions brief (5-10 words each).`
+  const { extracted: evaluationCriteria } = await ctx.generateContent({
+    systemPrompt: `You are creating evaluation criteria for rating items on a 1-5 scale.
+Each criterion must have exactly 5 labels: very_bad (1), bad (2), average (3), good (4), very_good (5).
+Output valid JSON only.`,
+    messages: [
+      {
+        type: 'text',
+        role: 'user',
+        content: generateCriteriaPrompt,
+      },
+    ],
+    transform: (text) => {
+      // Extract JSON from markdown code blocks if present
+      const jsonMatch = text.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/) || text.match(/(\{[\s\S]*\})/)
+      if (!jsonMatch) {
+        throw new Error('Failed to parse evaluation criteria JSON')
+      }
+      return JSON.parse(jsonMatch[1]) as EvaluationCriteria
+    },
+  })
+  // Final criteria keys: with record instructions the caller's keys are authoritative, so the result
+  // shape matches RatingResult<I> even if the model renamed, added or dropped a criterion
+  const finalCriteriaKeys = isStringInstructions ? Object.keys(evaluationCriteria) : criteriaKeys
+  if (finalCriteriaKeys.length === 0) {
+    throw new Error('No evaluation criteria generated')
+  }
+  // Resolves a criterion name written by the model to its canonical key (exact match first, then
+  // case-insensitive); unknown names resolve to undefined
+  const criteriaLookup = new Map<string, string>()
+  for (const key of finalCriteriaKeys) {
+    criteriaLookup.set(key.toLowerCase(), key)
+  }
+  for (const key of finalCriteriaKeys) {
+    criteriaLookup.set(key, key)
+  }
+  // Produces a full rating: every criterion present (missing ones default to average) and `total`
+  // computed once from the final scores
+  const completeRatings = (partial: Record<string, number>): Record<string, number> => {
+    const ratings: Record<string, number> = {}
+    let total = 0
+    for (const key of finalCriteriaKeys) {
+      const value = partial[key] ?? DEFAULT_RATING
+      ratings[key] = value
+      total += value
+    }
+    ratings.total = total
+    return ratings
+  }
+  // Phase 2: Chunk items
+  const TOKENS_CRITERIA_MAX = Math.floor(TOKENS_TOTAL_MAX * 0.3)
+  const TOKENS_ITEMS_MAX = TOKENS_TOTAL_MAX - TOKENS_CRITERIA_MAX
+  const chunks: Array<typeof input> = []
+  let currentChunk: typeof input = []
+  let currentChunkTokens = 0
+  for (const element of input) {
+    const elementAsString = tokenizer.truncate(stringify(element, false), options.tokensPerItem)
+    const elementTokens = tokenizer.count(elementAsString)
+    // Close the current chunk when adding this element would exceed the token or item budget
+    if (currentChunkTokens + elementTokens > TOKENS_ITEMS_MAX || currentChunk.length >= options.maxItemsPerChunk) {
+      if (currentChunk.length > 0) {
+        chunks.push(currentChunk)
+      }
+      currentChunk = []
+      currentChunkTokens = 0
+    }
+    currentChunk.push(element)
+    currentChunkTokens += elementTokens
+  }
+  if (currentChunk.length > 0) {
+    chunks.push(currentChunk)
+  }
+  // Phase 3: Rate each chunk in parallel
+  type ChunkResult = {
+    ratings: Array<Record<string, number>>
+    meta: { cost: { input: number; output: number }; latency: number; tokens: { input: number; output: number } }
+  }
+  // Numeric score -> label, used to render saved examples in the model's output format
+  const labelMap: Record<number, string> = {
+    1: 'very_bad',
+    2: 'bad',
+    3: 'average',
+    4: 'good',
+    5: 'very_good',
+  }
+  const rateChunk = async (chunk: typeof input): Promise<ChunkResult> => {
+    ctx.controller.signal.throwIfAborted()
+    // Get examples from adapter for active learning
+    const chunkInputStr = JSON.stringify(chunk)
+    const examples =
+      taskId && ctx.adapter
+        ? await ctx.adapter.getExamples<string, Array<Record<string, number>>>({
+            input: chunkInputStr.slice(0, 1000), // Limit search string length
+            taskType,
+            taskId,
+          })
+        : []
+    // Check for exact match (cache hit)
+    const key = fastHash(
+      stringify({
+        taskId,
+        taskType,
+        input: chunkInputStr,
+        instructions: stringify(instructions),
+      })
+    )
+    const exactMatch = examples.find((x) => x.key === key)
+    // Only trust a cached output that has exactly one rating per item of this chunk; anything else
+    // would misalign the flattened results
+    if (exactMatch && Array.isArray(exactMatch.output) && exactMatch.output.length === chunk.length) {
+      // Return cached result with zero cost
+      return {
+        ratings: exactMatch.output,
+        meta: { cost: { input: 0, output: 0 }, latency: 0, tokens: { input: 0, output: 0 } },
+      }
+    }
+    const formatCriteria = () => {
+      return finalCriteriaKeys
+        .map((key) => {
+          const labels = evaluationCriteria[key]
+          if (!labels) {
+            // The model returned no labels for this caller-supplied criterion: fall back to the
+            // caller's own description rather than printing "undefined" five times
+            return `**${key}**: ${(instructions as Record<string, string>)[key]}`
+          }
+          return `**${key}**:
+  - very_bad (1): ${labels?.very_bad}
+  - bad (2): ${labels?.bad}
+  - average (3): ${labels?.average}
+  - good (4): ${labels?.good}
+  - very_good (5): ${labels?.very_good}`
+        })
+        .join('\n\n')
+    }
+    const formatItems = (items: typeof chunk) => {
+      return items
+        .map((item, idx) => {
+          const itemStr = tokenizer.truncate(stringify(item, false), options.tokensPerItem)
+          return `■${idx}: ${itemStr}■`
+        })
+        .join('\n')
+    }
+    // Format examples for few-shot learning
+    const exampleMessages: Array<{ type: 'text'; role: 'user' | 'assistant'; content: string }> = []
+    for (const example of examples.slice(0, 5)) {
+      try {
+        const exampleOutput = example.output
+        // Skip examples without ratings: a user message with no assistant reply would leave a
+        // dangling turn in the few-shot conversation
+        if (!Array.isArray(exampleOutput) || exampleOutput.length === 0) {
+          continue
+        }
+        const exampleInput = JSON.parse(example.input)
+        // Build both sides before pushing anything, so a malformed example contributes no message
+        const formattedItems = formatItems(Array.isArray(exampleInput) ? exampleInput : [exampleInput])
+        const formattedRatings = exampleOutput
+          .map((rating, idx) => {
+            const pairs = finalCriteriaKeys
+              .map((key) => {
+                const value = rating[key]
+                if (typeof value === 'number') {
+                  // Convert number back to label
+                  return `${key}=${labelMap[value] || 'average'}`
+                }
+                return null
+              })
+              .filter(Boolean)
+              .join(';')
+            return `■${idx}:${pairs}■`
+          })
+          .join('\n')
+        // User message with input
+        exampleMessages.push({
+          type: 'text',
+          role: 'user',
+          content: `Expert Example - Items to rate:
+${formattedItems}
+Rate each item on all criteria.`,
+        })
+        // Assistant message with ratings
+        exampleMessages.push({
+          type: 'text',
+          role: 'assistant',
+          content: `${formattedRatings}\n${END}`,
+        })
+        if (example.explanation) {
+          exampleMessages.push({
+            type: 'text',
+            role: 'assistant',
+            content: `Reasoning: ${example.explanation}`,
+          })
+        }
+      } catch {
+        // Skip malformed examples
+      }
+    }
+    const { extracted, meta } = await ctx.generateContent({
+      systemPrompt: `You are rating items based on evaluation criteria.
+Evaluation Criteria:
+${formatCriteria()}
+For each item, rate it on EACH criterion using one of these labels:
+very_bad, bad, average, good, very_good
+Output format:
+■0:criterion1=label;criterion2=label;criterion3=label■
+■1:criterion1=label;criterion2=label;criterion3=label■
+${END}
+IMPORTANT:
+- Rate every item (■0 to ■${chunk.length - 1})
+- Use exact criterion names: ${finalCriteriaKeys.join(', ')}
+- Use exact label names: very_bad, bad, average, good, very_good
+- Use semicolons (;) between criteria
+- Use equals (=) between criterion and label`,
+      stopSequences: [END],
+      messages: [
+        ...exampleMessages,
+        {
+          type: 'text',
+          role: 'user',
+          content: `Items to rate (■0 to ■${chunk.length - 1}):
+${formatItems(chunk)}
+Rate each item on all criteria.
+Output format: ■index:criterion1=label;criterion2=label■
+${END}`,
+        },
+      ],
+      transform: (text) => {
+        // Parsed scores by item index; an entry stays undefined for items the model skipped
+        const parsed: Array<Record<string, number> | undefined> = []
+        // Parse ratings: ■0:affordability=good;quality=very_good■
+        const regex = /■(\d+):([^■]+)■/g
+        let match: RegExpExecArray | null
+        while ((match = regex.exec(text)) !== null) {
+          const idx = parseInt(match[1] ?? '', 10)
+          const ratingsStr = match[2] ?? ''
+          if (isNaN(idx) || idx < 0 || idx >= chunk.length) {
+            continue
+          }
+          const itemRatings: Record<string, number> = {}
+          // Parse criterion=label pairs
+          const pairs = ratingsStr.split(';').filter((x) => x.trim().length > 0)
+          for (const pair of pairs) {
+            const [rawCriterion, label] = pair.split('=').map((x) => x.trim())
+            if (!rawCriterion || !label) continue
+            // Ignore criteria that were never asked for (this also keeps a stray "total" out)
+            const criterion = criteriaLookup.get(rawCriterion) ?? criteriaLookup.get(rawCriterion.toLowerCase())
+            if (!criterion) continue
+            // Convert label to number; the own-property check keeps labels such as "constructor"
+            // from resolving to an inherited Object.prototype member instead of a score
+            const labelLower = label.toLowerCase().replace(/\s+/g, '_')
+            itemRatings[criterion] = Object.prototype.hasOwnProperty.call(RATING_VALUES, labelLower)
+              ? RATING_VALUES[labelLower as RatingLabel]
+              : DEFAULT_RATING // default to average
+          }
+          parsed[idx] = itemRatings
+        }
+        // One complete rating per item: skipped items and skipped criteria default to average
+        return Array.from({ length: chunk.length }, (_, i) => completeRatings(parsed[i] ?? {}))
+      },
+    })
+    return { ratings: extracted, meta }
+  }
+  // Process chunks in parallel with p-limit
+  const limit = pLimit(10)
+  const chunkPromises = chunks.map((chunk) => limit(() => rateChunk(chunk)))
+  const ratedChunks = await Promise.all(chunkPromises)
+  // Phase 4: Flatten results and accumulate metadata
+  const allRatings = ratedChunks.flatMap((result) => result.ratings) as Array<RatingResult<I>>
+  // Accumulate metadata from all chunks
+  const totalMeta = ratedChunks.reduce(
+    (acc, result) => ({
+      cost: {
+        input: acc.cost.input + result.meta.cost.input,
+        output: acc.cost.output + result.meta.cost.output,
+      },
+      latency: Math.max(acc.latency, result.meta.latency), // Use max latency
+      tokens: {
+        input: acc.tokens.input + result.meta.tokens.input,
+        output: acc.tokens.output + result.meta.tokens.output,
+      },
+    }),
+    {
+      cost: { input: 0, output: 0 },
+      latency: 0,
+      tokens: { input: 0, output: 0 },
+    }
+  )
+  // Save example for active learning
+  // NOTE(review): this key hashes the whole input while rateChunk looks up a per-chunk key, so the
+  // exact-match shortcut can only hit when the input fit in a single chunk - confirm this is intended
+  if (taskId && ctx.adapter && !ctx.controller.signal.aborted) {
+    const key = fastHash(
+      stringify({
+        taskId,
+        taskType,
+        input: JSON.stringify(input),
+        instructions: stringify(instructions),
+      })
+    )
+    await ctx.adapter.saveExample({
+      key,
+      taskType,
+      taskId,
+      input: JSON.stringify(input),
+      output: allRatings,
+      instructions: typeof instructions === 'string' ? instructions : JSON.stringify(instructions),
+      metadata: {
+        cost: {
+          input: totalMeta.cost.input,
+          output: totalMeta.cost.output,
+        },
+        latency: totalMeta.latency,
+        model: ctx.modelId,
+        tokens: {
+          input: totalMeta.tokens.input,
+          output: totalMeta.tokens.output,
+        },
+      },
+    })
+  }
+  return allRatings
+}
+/**
+ * Public `rate` operation: builds a fresh context from this Zai instance, starts the rating run and
+ * wraps it in a Response whose simplified value is each item's `total` for string instructions, or
+ * the full per-criterion result (including `total`) for record instructions.
+ */
+Zai.prototype.rate = function <T, I extends RatingInstructions>(
+  this: Zai,
+  input: Array<T>,
+  instructions: I,
+  _options?: Options
+): Response<Array<RatingResult<I>>, Array<SimplifiedRatingResult<I>>> {
+  type FullRatings = Array<RatingResult<I>>
+  type SimplifiedRatings = Array<SimplifiedRatingResult<I>>
+  const context = new ZaiContext({
+    client: this.client,
+    modelId: this.Model,
+    taskId: this.taskId,
+    taskType: 'zai.rate',
+    adapter: this.adapter,
+  })
+  // Maps the full ratings to the simplified shape exposed by the Response
+  const simplify = (ratings: FullRatings): SimplifiedRatings => {
+    if (typeof instructions !== 'string') {
+      // Record instructions: keep the full result (including total)
+      return ratings as SimplifiedRatings
+    }
+    // String instructions: keep only the total number of each item
+    return ratings.map((rating) => rating.total as SimplifiedRatingResult<I>)
+  }
+  const pendingRatings = rate(input, instructions, _options, context)
+  return new Response<FullRatings, SimplifiedRatings>(context, pendingRatings, simplify)
+}