npm - vitest-evals - Versions diffs - 0.2.0 → 0.4.0 - Mend

vitest-evals 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/README.md +211 -172
package/dist/index.d.mts +2 -98
package/dist/index.d.ts +2 -98
package/dist/index.js +270 -11
package/dist/index.js.map +1 -1
package/dist/index.mjs +269 -11
package/dist/index.mjs.map +1 -1
package/dist/scorers/index.d.mts +2 -0
package/dist/scorers/index.d.ts +2 -0
package/dist/scorers/index.js +282 -0
package/dist/scorers/index.js.map +1 -0
package/dist/scorers/index.mjs +256 -0
package/dist/scorers/index.mjs.map +1 -0
package/dist/scorers/toolCallScorer.d.mts +240 -0
package/dist/scorers/toolCallScorer.d.ts +240 -0
package/dist/scorers/toolCallScorer.js +280 -0
package/dist/scorers/toolCallScorer.js.map +1 -0
package/dist/scorers/toolCallScorer.mjs +256 -0
package/dist/scorers/toolCallScorer.mjs.map +1 -0
package/package.json +16 -4
package/dist/compatibility.test.d.mts +0 -2
package/dist/compatibility.test.d.ts +0 -2
package/dist/compatibility.test.js +0 -45009
package/dist/compatibility.test.js.map +0 -1
package/dist/compatibility.test.mjs +0 -45864
package/dist/compatibility.test.mjs.map +0 -1
package/dist/formatScores.test.d.mts +0 -2
package/dist/formatScores.test.d.ts +0 -2
package/dist/formatScores.test.js +0 -195
package/dist/formatScores.test.js.map +0 -1
package/dist/formatScores.test.mjs +0 -194
package/dist/formatScores.test.mjs.map +0 -1
package/dist/wrapText.test.d.mts +0 -2
package/dist/wrapText.test.d.ts +0 -2
package/dist/wrapText.test.js +0 -162
package/dist/wrapText.test.js.map +0 -1
package/dist/wrapText.test.mjs +0 -161
package/dist/wrapText.test.mjs.map +0 -1

package/dist/index.mjs CHANGED

@@ -17,6 +17,18 @@ var __spreadValues = (a, b) => {
   return a;
 };
 var __spreadProps = (a, b) => __defProps(a, __getOwnPropDescs(b));
+var __objRest = (source, exclude) => {
+  var target = {};
+  for (var prop in source)
+    if (__hasOwnProp.call(source, prop) && exclude.indexOf(prop) < 0)
+      target[prop] = source[prop];
+  if (source != null && __getOwnPropSymbols)
+    for (var prop of __getOwnPropSymbols(source)) {
+      if (exclude.indexOf(prop) < 0 && __propIsEnum.call(source, prop))
+        target[prop] = source[prop];
+    }
+  return target;
+};
 var __async = (__this, __arguments, generator) => {
   return new Promise((resolve, reject) => {
     var fulfilled = (value) => {
@@ -41,12 +53,247 @@ var __async = (__this, __arguments, generator) => {
 // src/index.ts
 import { assert, describe, expect, test } from "vitest";
 import "vitest";
+// src/scorers/toolCallScorer.ts
+function fuzzyMatch(expected, actual) {
+  if (expected == null || actual == null) {
+    return expected === actual;
+  }
+  if (typeof expected === "object" && typeof actual === "object" && !Array.isArray(expected)) {
+    return Object.entries(expected).every(
+      ([key, value]) => key in actual && fuzzyMatch(value, actual[key])
+    );
+  }
+  if (typeof expected === "string" && typeof actual === "string") {
+    return actual.toLowerCase().includes(expected.toLowerCase());
+  }
+  if (typeof expected === "number" && typeof actual === "number") {
+    const tolerance = Math.max(Math.abs(expected) * 1e-3, 1e-3);
+    return Math.abs(expected - actual) <= tolerance;
+  }
+  if (Array.isArray(expected) && Array.isArray(actual)) {
+    return expected.every(
+      (expItem) => actual.some((actItem) => fuzzyMatch(expItem, actItem))
+    );
+  }
+  return expected === actual;
+}
+function strictEquals(expected, actual) {
+  if (expected === actual) return true;
+  if (expected == null || actual == null) return false;
+  if (typeof expected !== typeof actual) return false;
+  if (Array.isArray(expected)) {
+    if (!Array.isArray(actual)) return false;
+    if (expected.length !== actual.length) return false;
+    return expected.every((item, i) => strictEquals(item, actual[i]));
+  }
+  if (typeof expected === "object") {
+    const expectedKeys = Object.keys(expected).sort();
+    const actualKeys = Object.keys(actual).sort();
+    if (expectedKeys.length !== actualKeys.length) return false;
+    if (!expectedKeys.every((key, i) => key === actualKeys[i])) return false;
+    return expectedKeys.every(
+      (key) => strictEquals(expected[key], actual[key])
+    );
+  }
+  return expected === actual;
+}
+function ToolCallScorer(config = {}) {
+  const {
+    ordered = false,
+    requireAll = true,
+    allowExtras = true,
+    params = "strict"
+  } = config;
+  const argMatcher = typeof params === "function" ? params : params === "strict" ? strictEquals : fuzzyMatch;
+  return (opts) => __async(null, null, function* () {
+    const expectedTools = opts.expectedTools || [];
+    const actualCalls = opts.toolCalls || [];
+    if (expectedTools.length === 0) {
+      return {
+        score: 1,
+        metadata: {
+          rationale: "No tool calls expected"
+        }
+      };
+    }
+    if (actualCalls.length === 0) {
+      return {
+        score: 0,
+        metadata: {
+          rationale: `Expected ${expectedTools.length} tool(s) but none were called`
+        }
+      };
+    }
+    if (ordered) {
+      return evaluateOrderedTools(expectedTools, actualCalls, {
+        argMatcher,
+        allowExtras,
+        requireAllTools: requireAll
+      });
+    }
+    return evaluateUnorderedTools(expectedTools, actualCalls, {
+      argMatcher,
+      requireAllTools: requireAll,
+      allowExtras
+    });
+  });
+}
+function evaluateOrderedTools(expected, actual, options) {
+  let expectedIndex = 0;
+  let actualIndex = 0;
+  while (expectedIndex < expected.length && actualIndex < actual.length) {
+    const exp = expected[expectedIndex];
+    const act = actual[actualIndex];
+    if (exp.name === act.name) {
+      if (exp.arguments !== void 0) {
+        const argsMatch = options.argMatcher(
+          exp.arguments,
+          act.arguments || {}
+        );
+        if (!argsMatch) {
+          return {
+            score: 0.5,
+            metadata: {
+              rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,
+              expected: exp.arguments,
+              actual: act.arguments
+            }
+          };
+        }
+      }
+      expectedIndex++;
+      actualIndex++;
+    } else if (options.allowExtras) {
+      actualIndex++;
+    } else {
+      return {
+        score: 0,
+        metadata: {
+          rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`
+        }
+      };
+    }
+  }
+  if (expectedIndex < expected.length) {
+    const missing = expected.slice(expectedIndex).map((t) => t.name);
+    if (options.requireAllTools) {
+      return {
+        score: 0,
+        metadata: {
+          rationale: `Missing required tools in sequence: ${missing.join(", ")}`
+        }
+      };
+    }
+    const matchedCount = expectedIndex;
+    const totalCount = expected.length;
+    const score = totalCount > 0 ? matchedCount / totalCount : 1;
+    return {
+      score,
+      metadata: {
+        rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(", ")})`,
+        matched: matchedCount,
+        total: totalCount
+      }
+    };
+  }
+  if (!options.allowExtras && actualIndex < actual.length) {
+    const extra = actual.slice(actualIndex).map((t) => t.name);
+    return {
+      score: 0,
+      metadata: {
+        rationale: `Unexpected extra tools: ${extra.join(", ")}`
+      }
+    };
+  }
+  return {
+    score: 1,
+    metadata: {
+      rationale: "All tools called in expected order with correct arguments"
+    }
+  };
+}
+function evaluateUnorderedTools(expected, actual, options) {
+  const matchedExpected = /* @__PURE__ */ new Set();
+  const matchedActual = /* @__PURE__ */ new Set();
+  const issues = [];
+  for (let i = 0; i < expected.length; i++) {
+    const exp = expected[i];
+    let found = false;
+    for (let j = 0; j < actual.length; j++) {
+      if (matchedActual.has(j)) continue;
+      const act = actual[j];
+      if (exp.name === act.name) {
+        if (exp.arguments !== void 0) {
+          const argsMatch = options.argMatcher(
+            exp.arguments,
+            act.arguments || {}
+          );
+          if (!argsMatch) {
+            continue;
+          }
+        }
+        matchedExpected.add(i);
+        matchedActual.add(j);
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (exp.arguments !== void 0) {
+        const wrongArgsCalls = actual.filter((a) => a.name === exp.name);
+        if (wrongArgsCalls.length > 0) {
+          issues.push(`Tool '${exp.name}' called but with incorrect arguments`);
+        } else {
+          issues.push(`Missing required tool: ${exp.name}`);
+        }
+      } else {
+        issues.push(`Missing required tool: ${exp.name}`);
+      }
+    }
+  }
+  const extraTools = actual.filter((_, i) => !matchedActual.has(i)).map((t) => t.name);
+  if (!options.allowExtras && extraTools.length > 0) {
+    issues.push(`Unexpected extra tools: ${extraTools.join(", ")}`);
+  }
+  const expectedMatched = matchedExpected.size;
+  const expectedTotal = expected.length;
+  if (issues.length > 0 && (options.requireAllTools || !options.allowExtras)) {
+    return {
+      score: 0,
+      metadata: {
+        rationale: issues.join("; ")
+      }
+    };
+  }
+  const score = expectedTotal > 0 ? expectedMatched / expectedTotal : 1;
+  if (score === 1) {
+    const extraInfo = extraTools.length > 0 ? ` (plus extra: ${extraTools.join(", ")})` : "";
+    return {
+      score: 1,
+      metadata: {
+        rationale: `All expected tools were called${extraInfo}`
+      }
+    };
+  }
+  return {
+    score,
+    metadata: {
+      rationale: issues.join("; "),
+      matched: expectedMatched,
+      total: expectedTotal
+    }
+  };
+}
+// src/index.ts
 expect.extend({
   /**
    * Evaluates a language model output against an expected answer using a scoring function.
    *
    * @param expected - The expected (ground truth) answer
    * @param taskFn - Async function that processes the input and returns the model output
+   *                 Can return either a string or TaskResult object with result and optional toolCalls
    * @param scoreFn - Function that evaluates the model output against the expected answer
    * @param threshold - Minimum acceptable score (0-1), defaults to 1.0
    *
@@ -56,8 +303,12 @@ expect.extend({
    *   expect("What is the capital of France?").toEval(
    *     "Paris",
    *     async (input) => {
-   *       // Query LLM here
-   *       return "Paris";
+   *       const response = await queryLLM(input);
+   *       // Recommended: return TaskResult
+   *       return {
+   *         result: response.text,
+   *         toolCalls: response.toolCalls || []
+   *       };
    *     },
    *     checkFactuality,
    *     0.8
@@ -65,12 +316,15 @@ expect.extend({
    * });
    * ```
    */
+  // TODO: this needs to be support true extensibility with Eval scorers
   toEval: function toEval(input, expected, taskFn, scoreFn, threshold = 1) {
     return __async(this, null, function* () {
       var _a;
       const { isNot } = this;
-      const output = yield taskFn(input);
-      let result = scoreFn({ input, expected, output });
+      const taskOutput = yield taskFn(input);
+      const output = typeof taskOutput === "string" ? taskOutput : taskOutput.result;
+      const toolCalls = typeof taskOutput === "object" ? taskOutput.toolCalls : void 0;
+      let result = scoreFn({ input, expected, output, toolCalls });
       if (result instanceof Promise) {
         result = yield result;
       }
@@ -93,17 +347,20 @@ function describeEval(name, {
 }) {
   return describe(name, () => __async(null, null, function* () {
     const testFn = skipIf ? test.skipIf(skipIf()) : test;
-    for (const { input, expected } of yield data()) {
+    for (const _a of yield data()) {
+      const _b = _a, { input } = _b, params = __objRest(_b, ["input"]);
       testFn(
         input,
         {
           timeout
         },
         (_0) => __async(null, [_0], function* ({ task: testTask }) {
-          const output = yield task(input);
+          const taskOutput = yield task(input);
+          const output = typeof taskOutput === "string" ? taskOutput : taskOutput.result;
+          const toolCalls = typeof taskOutput === "object" ? taskOutput.toolCalls : void 0;
           const scores = yield Promise.all(
             scorers.map((scorer) => {
-              const result = scorer({ input, expected, output });
+              const result = scorer(__spreadProps(__spreadValues({ input }, params), { output, toolCalls }));
               if (result instanceof Promise) {
                 return result;
               }
@@ -114,13 +371,13 @@ function describeEval(name, {
             name: scorers[i].name
           }));
           const avgScore = scores.reduce((acc, s) => {
-            var _a;
-            return acc + ((_a = s.score) != null ? _a : 0);
+            var _a2;
+            return acc + ((_a2 = s.score) != null ? _a2 : 0);
           }, 0) / scores.length;
-          testTask.meta.eval = {
+          testTask.meta.eval = __spreadValues({
             scores: scoresWithName,
             avgScore
-          };
+          }, toolCalls && { toolCalls });
           if (threshold) {
             assert(
               avgScore >= threshold,
@@ -181,6 +438,7 @@ function wrapText(text, width = 80) {
   return lines.join("\n");
 }
 export {
+  ToolCallScorer,
   describeEval,
   formatScores,
   wrapText

package/dist/index.mjs.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { assert, describe, expect, test } from \"vitest\";\nimport \"vitest\";\n\nexport type TaskFn = (input: string) => Promise<string>;\n\nexport type Score = {\n score: number \| null;\n metadata?: {\n rationale?: string;\n output?: string;\n };\n};\n\nexport type ScoreFn = (opts: {\n input: string;\n output: string;\n expected?: string;\n}) => Promise<Score> \| Score;\n\nexport type ToEval<R = unknown> = (\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold?: number,\n) => Promise<R>;\n\nexport interface EvalMatchers<R = unknown> {\n toEval: ToEval<R>;\n}\n\ndeclare module \"vitest\" {\n interface Assertion<T = any> extends EvalMatchers<T> {}\n interface AsymmetricMatchersContaining extends EvalMatchers {}\n\n interface TaskMeta {\n eval?: {\n scores: (Score & { name: string })[];\n avgScore: number;\n };\n }\n}\n\nexpect.extend({\n /*\n Evaluates a language model output against an expected answer using a scoring function.\n \n @param expected - The expected (ground truth) answer\n * @param taskFn - Async function that processes the input and returns the model output\n * @param scoreFn - Function that evaluates the model output against the expected answer\n * @param threshold - Minimum acceptable score (0-1), defaults to 1.0\n \n @example\n * ```javascript\n * test(\"checks capital of France\", async () => {\n * expect(\"What is the capital of France?\").toEval(\n * \"Paris\",\n * async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * checkFactuality,\n * 0.8\n * );\n * });\n * ```\n /\n toEval: async function toEval(\n input: string,\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold = 1.0,\n ) {\n const { isNot } = this;\n\n const output = await taskFn(input);\n\n let result = scoreFn({ input, expected, output });\n if (result instanceof Promise) {\n result = await result;\n }\n\n return {\n pass: (result.score ?? 
0) >= threshold,\n message: () => formatScores([{ ...result, name: scoreFn.name }]),\n };\n },\n});\n\n/\n Creates a test suite for evaluating language model outputs.\n \n @param name - The name of the test suite\n * @param options - Configuration options\n * @param options.data - Async function that returns an array of test cases with input and expected values\n * @param options.task - Function that processes the input and returns the model output\n * @param options.skipIf - Optional function that determines if tests should be skipped\n * @param options.scorers - Array of scoring functions that evaluate model outputs\n * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0\n * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)\n \n @example\n * ```javascript\n * describeEval(\"capital cities test\", {\n * data: async () => [{\n * input: \"What is the capital of France?\",\n * expected: \"Paris\"\n * }],\n * task: async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * scorers: [checkFactuality],\n * threshold: 0.8\n * });\n * ```\n /\nexport function describeEval(\n name: string,\n {\n data,\n task,\n skipIf,\n scorers,\n threshold = 1.0,\n // increase default test timeout as 5s is usually not enough for\n // a single factuality check\n timeout = 60000,\n }: {\n data: () => Promise<{ input: string; expected: string }[]>;\n task: TaskFn;\n skipIf?: () => boolean;\n scorers: ScoreFn[];\n threshold?: number \| null;\n timeout?: number;\n },\n) {\n return describe(name, async () => {\n const testFn = skipIf ? 
test.skipIf(skipIf()) : test;\n // TODO: should data just be a generator?\n for (const { input, expected } of await data()) {\n testFn(\n input,\n {\n timeout,\n },\n async ({ task: testTask }) => {\n const output = await task(input);\n\n const scores = await Promise.all(\n scorers.map((scorer) => {\n const result = scorer({ input, expected, output });\n if (result instanceof Promise) {\n return result;\n }\n return new Promise<Score>((resolve) => resolve(result));\n }),\n );\n const scoresWithName = scores.map((s, i) => ({\n ...s,\n name: scorers[i].name,\n }));\n\n const avgScore =\n scores.reduce((acc, s) => acc + (s.score ?? 0), 0) / scores.length;\n\n testTask.meta.eval = {\n scores: scoresWithName,\n avgScore,\n };\n\n if (threshold) {\n assert(\n avgScore >= threshold,\n `Score: ${avgScore} below threshold: ${threshold}\\n\\n## Output:\\n${wrapText(output)}\\n\\n${formatScores(\n scoresWithName,\n )}`,\n );\n }\n },\n );\n }\n });\n}\n\nexport function formatScores(scores: (Score & { name: string })[]) {\n return scores\n .sort((a, b) => (a.score ?? 0) - (b.score ?? 0))\n .map((s) => {\n const scoreLine = `# ${s.name \|\| \"Unknown\"} [${(s.score ?? 0).toFixed(1)}]`;\n if (\n ((s.score ?? 0) < 1.0 && s.metadata?.rationale) \|\|\n s.metadata?.output\n ) {\n return `${scoreLine}${\n s.metadata?.rationale\n ? `\\n\\n## Rationale\\n\\n${wrapText(s.metadata.rationale)}`\n : \"\"\n }${s.metadata?.output ? 
`\\n\\n## Response\\n\\n${wrapText(s.metadata.output)}` : \"\"}`;\n }\n return scoreLine;\n })\n .join(\"\\n\\n\");\n}\n\n/\n Wraps text to fit within a specified width, breaking at word boundaries.\n \n @param text - The text to wrap\n * @param width - The maximum width in characters (default: 80)\n * @returns The wrapped text with line breaks\n \n @example\n * ```javascript\n * const wrapped = wrapText(\"This is a very long text that needs to be wrapped to fit within an 80 character width.\", 20);\n * console.log(wrapped);\n * // Output:\n * // This is a very\n * // long text that\n * // needs to be\n * // wrapped to fit\n * // within an 80\n * // character width.\n * ```\n */\nexport function wrapText(text: string, width = 80): string {\n if (!text \|\| text.length <= width) {\n return text;\n }\n\n const words = text.split(/\\s+/);\n const lines: string[] = [];\n let currentLine = \"\";\n\n for (const word of words) {\n // If adding this word would exceed the width, start a new line\n if (currentLine.length + word.length + 1 > width) {\n lines.push(currentLine.trim());\n currentLine = word;\n } else {\n // Add the word to the current line\n currentLine += (currentLine ? 
\" \" : \"\") + word;\n }\n }\n\n // Add the last line if it's not empty\n if (currentLine) {\n lines.push(currentLine);\n }\n\n return lines.join(\"\\n\");\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,SAAS,QAAQ,UAAU,QAAQ,YAAY;AAC/C,OAAO;AAyCP,OAAO,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAwBZ,QAAQ,SAAe,OACrB,OACA,UACA,QACA,SACA,YAAY,GACZ;AAAA;AAxEJ;AAyEI,YAAM,EAAE,MAAM,IAAI;AAElB,YAAM,SAAS,MAAM,OAAO,KAAK;AAEjC,UAAI,SAAS,QAAQ,EAAE,OAAO,UAAU,OAAO,CAAC;AAChD,UAAI,kBAAkB,SAAS;AAC7B,iBAAS,MAAM;AAAA,MACjB;AAEA,aAAO;AAAA,QACL,QAAO,YAAO,UAAP,YAAgB,MAAM;AAAA,QAC7B,SAAS,MAAM,aAAa,CAAC,iCAAK,SAAL,EAAa,MAAM,QAAQ,KAAK,EAAC,CAAC;AAAA,MACjE;AAAA,IACF;AAAA;AACF,CAAC;AA8BM,SAAS,aACd,MACA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA;AAAA;AAAA,EAGZ,UAAU;AACZ,GAQA;AACA,SAAO,SAAS,MAAM,MAAY;AAChC,UAAM,SAAS,SAAS,KAAK,OAAO,OAAO,CAAC,IAAI;AAEhD,eAAW,EAAE,OAAO,SAAS,KAAK,MAAM,KAAK,GAAG;AAC9C;AAAA,QACE;AAAA,QACA;AAAA,UACE;AAAA,QACF;AAAA,QACA,CAAO,OAAuB,eAAvB,KAAuB,WAAvB,EAAE,MAAM,SAAS,GAAM;AAC5B,gBAAM,SAAS,MAAM,KAAK,KAAK;AAE/B,gBAAM,SAAS,MAAM,QAAQ;AAAA,YAC3B,QAAQ,IAAI,CAAC,WAAW;AACtB,oBAAM,SAAS,OAAO,EAAE,OAAO,UAAU,OAAO,CAAC;AACjD,kBAAI,kBAAkB,SAAS;AAC7B,uBAAO;AAAA,cACT;AACA,qBAAO,IAAI,QAAe,CAAC,YAAY,QAAQ,MAAM,CAAC;AAAA,YACxD,CAAC;AAAA,UACH;AACA,gBAAM,iBAAiB,OAAO,IAAI,CAAC,GAAG,MAAO,iCACxC,IADwC;AAAA,YAE3C,MAAM,QAAQ,CAAC,EAAE;AAAA,UACnB,EAAE;AAEF,gBAAM,WACJ,OAAO,OAAO,CAAC,KAAK,MAAG;AApKnC;AAoKsC,2BAAO,OAAE,UAAF,YAAW;AAAA,aAAI,CAAC,IAAI,OAAO;AAE9D,mBAAS,KAAK,OAAO;AAAA,YACnB,QAAQ;AAAA,YACR;AAAA,UACF;AAEA,cAAI,WAAW;AACb;AAAA,cACE,YAAY;AAAA,cACZ,UAAU,QAAQ,qBAAqB,SAAS;AAAA;AAAA;AAAA,EAAmB,SAAS,MAAM,CAAC;AAAA;AAAA,EAAO;AAAA,gBACxF;AAAA,cACF,CAAC;AAAA,YACH;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF,EAAC;AACH;AAEO,SAAS,aAAa,QAAsC;AACjE,SAAO,OACJ,KAAK,CAAC,GAAG,MAAG;AA3LjB;AA2LqB,oBAAE,UAAF,YAAW,OAAM,OAAE,UAAF,YAAW;AAAA,GAAE,EAC9C,IAAI,CAAC,MAAM;AA5LhB;AA6LM,UAAM,YAAY,KAAK,EAAE
,QAAQ,SAAS,OAAM,OAAE,UAAF,YAAW,GAAG,QAAQ,CAAC,CAAC;AACxE,UACI,OAAE,UAAF,YAAW,KAAK,OAAO,OAAE,aAAF,mBAAY,gBACrC,OAAE,aAAF,mBAAY,SACZ;AACA,aAAO,GAAG,SAAS,KACjB,OAAE,aAAF,mBAAY,aACR;AAAA;AAAA;AAAA;AAAA,EAAuB,SAAS,EAAE,SAAS,SAAS,CAAC,KACrD,EACN,KAAG,OAAE,aAAF,mBAAY,UAAS;AAAA;AAAA;AAAA;AAAA,EAAsB,SAAS,EAAE,SAAS,MAAM,CAAC,KAAK,EAAE;AAAA,IAClF;AACA,WAAO;AAAA,EACT,CAAC,EACA,KAAK,MAAM;AAChB;AAsBO,SAAS,SAAS,MAAc,QAAQ,IAAY;AACzD,MAAI,CAAC,QAAQ,KAAK,UAAU,OAAO;AACjC,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,KAAK,MAAM,KAAK;AAC9B,QAAM,QAAkB,CAAC;AACzB,MAAI,cAAc;AAElB,aAAW,QAAQ,OAAO;AAExB,QAAI,YAAY,SAAS,KAAK,SAAS,IAAI,OAAO;AAChD,YAAM,KAAK,YAAY,KAAK,CAAC;AAC7B,oBAAc;AAAA,IAChB,OAAO;AAEL,sBAAgB,cAAc,MAAM,MAAM;AAAA,IAC5C;AAAA,EACF;AAGA,MAAI,aAAa;AACf,UAAM,KAAK,WAAW;AAAA,EACxB;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;","names":[]}
1	+ {"version":3,"sources":["../src/index.ts","../src/scorers/toolCallScorer.ts"],"sourcesContent":["import { assert, describe, expect, test } from \"vitest\";\nimport \"vitest\";\n\n/*\n Represents a tool/function call made during task execution.\n * Supports various LLM provider formats and use cases.\n /\nexport type ToolCall = {\n // Core fields (required for basic usage)\n name: string;\n arguments: Record<string, any>;\n\n // Result and timing\n result?: any;\n error?: {\n code?: string;\n message: string;\n details?: any;\n };\n timestamp?: number;\n duration_ms?: number;\n\n // Identification and correlation\n id?: string;\n parent_id?: string; // For nested/chained calls\n\n // Status tracking\n status?: \"pending\" \| \"executing\" \| \"completed\" \| \"failed\" \| \"cancelled\";\n\n // Provider-specific fields\n type?: \"function\" \| \"retrieval\" \| \"code_interpreter\" \| \"web_search\" \| string;\n\n // Additional metadata\n [key: string]: any; // Allow provider-specific fields\n};\n\nexport type TaskResult = {\n result: string;\n toolCalls?: ToolCall[];\n};\n\n/\n Task function that processes an input and returns either a string result\n * or a TaskResult object containing the result and any tool calls made.\n \n @param input - The input string to process\n * @returns Promise resolving to either a string or TaskResult object\n \n @example\n * // Simple tasks can just return a string\n * const simpleTask: TaskFn = async (input) => \"The answer is 42\";\n \n // Tasks that use tools should return TaskResult\n * const taskWithTools: TaskFn = async (input) => ({\n * result: \"The answer is 42\",\n * toolCalls: [{ name: \"calculate\", arguments: { expr: \"67\" }, result: 42 }]\n });\n /\nexport type TaskFn = (input: string) => Promise<string \| TaskResult>;\n\nexport type Score = {\n score: number \| null;\n metadata?: {\n rationale?: string;\n output?: string;\n };\n};\n\nexport interface BaseScorerOptions {\n input: string;\n output: string;\n 
toolCalls?: ToolCall[];\n}\n\nexport type ScoreFn<TOptions extends BaseScorerOptions = BaseScorerOptions> = (\n opts: TOptions,\n) => Promise<Score> \| Score;\n\nexport type ToEval<R = unknown> = (\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn<any>,\n threshold?: number,\n) => Promise<R>;\n\nexport interface EvalMatchers<R = unknown> {\n toEval: ToEval<R>;\n}\n\ndeclare module \"vitest\" {\n interface Assertion<T = any> extends EvalMatchers<T> {}\n interface AsymmetricMatchersContaining extends EvalMatchers {}\n\n interface TaskMeta {\n eval?: {\n scores: (Score & { name: string })[];\n avgScore: number;\n toolCalls?: ToolCall[];\n };\n }\n}\n\nexpect.extend({\n /\n Evaluates a language model output against an expected answer using a scoring function.\n \n @param expected - The expected (ground truth) answer\n * @param taskFn - Async function that processes the input and returns the model output\n * Can return either a string or TaskResult object with result and optional toolCalls\n * @param scoreFn - Function that evaluates the model output against the expected answer\n * @param threshold - Minimum acceptable score (0-1), defaults to 1.0\n \n @example\n * ```javascript\n * test(\"checks capital of France\", async () => {\n * expect(\"What is the capital of France?\").toEval(\n * \"Paris\",\n * async (input) => {\n * const response = await queryLLM(input);\n * // Recommended: return TaskResult\n * return {\n * result: response.text,\n * toolCalls: response.toolCalls \|\| []\n * };\n * },\n * checkFactuality,\n * 0.8\n * );\n * });\n * ```\n /\n // TODO: this needs to be support true extensibility with Eval scorers\n toEval: async function toEval(\n input: string,\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn<any>,\n threshold = 1.0,\n ) {\n const { isNot } = this;\n\n const taskOutput = await taskFn(input);\n const output =\n typeof taskOutput === \"string\" ? 
taskOutput : taskOutput.result;\n const toolCalls =\n typeof taskOutput === \"object\" ? taskOutput.toolCalls : undefined;\n\n let result = scoreFn({ input, expected, output, toolCalls });\n if (result instanceof Promise) {\n result = await result;\n }\n\n return {\n pass: (result.score ?? 0) >= threshold,\n message: () => formatScores([{ ...result, name: scoreFn.name }]),\n };\n },\n});\n\n/\n Creates a test suite for evaluating language model outputs.\n \n @param name - The name of the test suite\n * @param options - Configuration options\n * @param options.data - Async function that returns an array of test cases with input and any additional fields\n * @param options.task - Function that processes the input and returns the model output\n * Can return either a string or TaskResult object with result and optional toolCalls\n * @param options.skipIf - Optional function that determines if tests should be skipped\n * @param options.scorers - Array of scoring functions that evaluate model outputs\n * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0\n * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)\n \n @example\n * ```javascript\n * // Recommended: TaskResult format with tool tracking\n * describeEval(\"capital cities test\", {\n * data: async () => [{\n * input: \"What is the capital of France?\",\n * expected: \"Paris\"\n * }],\n * task: async (input) => {\n * const response = await queryLLM(input);\n * return {\n * result: response.text,\n * toolCalls: response.toolCalls \|\| []\n * };\n * },\n * scorers: [checkFactuality],\n * threshold: 0.8\n * });\n \n // Example with tool usage evaluation\n * describeEval(\"tool usage test\", {\n * data: async () => [{\n * input: \"Search for weather in Seattle\",\n * expectedTools: [{ name: \"weather_api\", arguments: { location: \"Seattle\" } }]\n * }],\n * task: async (input) => {\n * return {\n * result: \"The weather in Seattle is 65°F\",\n * toolCalls: 
[{\n * name: \"weather_api\",\n * arguments: { location: \"Seattle\" },\n * result: { temp: 65, condition: \"partly cloudy\" }\n * }]\n * };\n * },\n * scorers: [ToolCallScorer()],\n * threshold: 1.0\n * });\n * ```\n /\nexport function describeEval(\n name: string,\n {\n data,\n task,\n skipIf,\n scorers,\n threshold = 1.0,\n // increase default test timeout as 5s is usually not enough for\n // a single factuality check\n timeout = 60000,\n }: {\n data: () => Promise<Array<{ input: string } & Record<string, any>>>;\n task: TaskFn;\n skipIf?: () => boolean;\n scorers: ScoreFn<any>[];\n threshold?: number \| null;\n timeout?: number;\n },\n) {\n return describe(name, async () => {\n const testFn = skipIf ? test.skipIf(skipIf()) : test;\n // TODO: should data just be a generator?\n for (const { input, ...params } of await data()) {\n testFn(\n input,\n {\n timeout,\n },\n async ({ task: testTask }) => {\n const taskOutput = await task(input);\n const output =\n typeof taskOutput === \"string\" ? taskOutput : taskOutput.result;\n const toolCalls =\n typeof taskOutput === \"object\" ? taskOutput.toolCalls : undefined;\n\n const scores = await Promise.all(\n scorers.map((scorer) => {\n const result = scorer({ input, ...params, output, toolCalls });\n if (result instanceof Promise) {\n return result;\n }\n return new Promise<Score>((resolve) => resolve(result));\n }),\n );\n const scoresWithName = scores.map((s, i) => ({\n ...s,\n name: scorers[i].name,\n }));\n\n const avgScore =\n scores.reduce((acc, s) => acc + (s.score ?? 
0), 0) / scores.length;\n\n testTask.meta.eval = {\n scores: scoresWithName,\n avgScore,\n ...(toolCalls && { toolCalls }),\n };\n\n if (threshold) {\n assert(\n avgScore >= threshold,\n `Score: ${avgScore} below threshold: ${threshold}\\n\\n## Output:\\n${wrapText(output)}\\n\\n${formatScores(\n scoresWithName,\n )}`,\n );\n }\n },\n );\n }\n });\n}\n\nexport function formatScores(scores: (Score & { name: string })[]) {\n return scores\n .sort((a, b) => (a.score ?? 0) - (b.score ?? 0))\n .map((s) => {\n const scoreLine = `# ${s.name \|\| \"Unknown\"} [${(s.score ?? 0).toFixed(1)}]`;\n if (\n ((s.score ?? 0) < 1.0 && s.metadata?.rationale) \|\|\n s.metadata?.output\n ) {\n return `${scoreLine}${\n s.metadata?.rationale\n ? `\\n\\n## Rationale\\n\\n${wrapText(s.metadata.rationale)}`\n : \"\"\n }${s.metadata?.output ? `\\n\\n## Response\\n\\n${wrapText(s.metadata.output)}` : \"\"}`;\n }\n return scoreLine;\n })\n .join(\"\\n\\n\");\n}\n\n/\n Wraps text to fit within a specified width, breaking at word boundaries.\n \n @param text - The text to wrap\n * @param width - The maximum width in characters (default: 80)\n * @returns The wrapped text with line breaks\n \n @example\n * ```javascript\n * const wrapped = wrapText(\"This is a very long text that needs to be wrapped to fit within an 80 character width.\", 20);\n * console.log(wrapped);\n * // Output:\n * // This is a very\n * // long text that\n * // needs to be\n * // wrapped to fit\n * // within an 80\n * // character width.\n * ```\n /\nexport function wrapText(text: string, width = 80): string {\n if (!text \|\| text.length <= width) {\n return text;\n }\n\n const words = text.split(/\\s+/);\n const lines: string[] = [];\n let currentLine = \"\";\n\n for (const word of words) {\n // If adding this word would exceed the width, start a new line\n if (currentLine.length + word.length + 1 > width) {\n lines.push(currentLine.trim());\n currentLine = word;\n } else {\n // Add the word to the current line\n 
currentLine += (currentLine ? \" \" : \"\") + word;\n }\n }\n\n // Add the last line if it's not empty\n if (currentLine) {\n lines.push(currentLine);\n }\n\n return lines.join(\"\\n\");\n}\n\n// Export built-in scorers\nexport { ToolCallScorer, type ToolCallScorerOptions } from \"./scorers\";\n","import type { ScoreFn, BaseScorerOptions, ToolCall } from \"../index\";\n\nexport interface ToolCallScorerOptions extends BaseScorerOptions {\n // Expected tools are now defined in the test data\n expectedTools?: Array<{\n name: string;\n arguments?: any;\n }>;\n}\n\nexport interface ToolCallScorerConfig {\n /\n Whether tools must be called in the exact order specified\n * @default false\n /\n ordered?: boolean;\n\n /\n Whether all expected tools must be called for a passing score\n * When false: gives partial credit based on tools matched\n * @default true\n /\n requireAll?: boolean;\n\n /\n Whether to allow additional tool calls beyond those expected\n * @default true\n /\n allowExtras?: boolean;\n\n /\n How to match tool arguments/parameters\n * - \"strict\": Exact equality required (default)\n * - \"fuzzy\": Case-insensitive, subset matching, numeric tolerance\n * - Custom function: Your own comparison logic\n * @default \"strict\"\n /\n params?: \"strict\" \| \"fuzzy\" \| ((expected: any, actual: any) => boolean);\n}\n\n/\n Default fuzzy matching for arguments\n /\nfunction fuzzyMatch(expected: any, actual: any): boolean {\n // Null/undefined handling\n if (expected == null \|\| actual == null) {\n return expected === actual;\n }\n\n // For objects, check if actual has all expected properties\n if (\n typeof expected === \"object\" &&\n typeof actual === \"object\" &&\n !Array.isArray(expected)\n ) {\n return Object.entries(expected).every(\n ([key, value]) => key in actual && fuzzyMatch(value, actual[key]),\n );\n }\n\n // For strings, case-insensitive substring match\n if (typeof expected === \"string\" && typeof actual === \"string\") {\n return 
actual.toLowerCase().includes(expected.toLowerCase());\n }\n\n // For numbers, allow small differences (0.1% or 0.001, whichever is larger)\n if (typeof expected === \"number\" && typeof actual === \"number\") {\n const tolerance = Math.max(Math.abs(expected) 0.001, 0.001);\n return Math.abs(expected - actual) <= tolerance;\n }\n\n // For arrays, check if all expected items exist in actual (order doesn't matter in fuzzy mode)\n if (Array.isArray(expected) && Array.isArray(actual)) {\n return expected.every((expItem) =>\n actual.some((actItem) => fuzzyMatch(expItem, actItem)),\n );\n }\n\n // Otherwise strict equality\n return expected === actual;\n}\n\n/*\n Strict equality comparison (deep equals)\n /\nfunction strictEquals(expected: any, actual: any): boolean {\n // Handle primitive types and null/undefined\n if (expected === actual) return true;\n if (expected == null \|\| actual == null) return false;\n\n // Must be same type\n if (typeof expected !== typeof actual) return false;\n\n // Handle arrays\n if (Array.isArray(expected)) {\n if (!Array.isArray(actual)) return false;\n if (expected.length !== actual.length) return false;\n return expected.every((item, i) => strictEquals(item, actual[i]));\n }\n\n // Handle objects\n if (typeof expected === \"object\") {\n const expectedKeys = Object.keys(expected).sort();\n const actualKeys = Object.keys(actual).sort();\n\n // Must have same keys\n if (expectedKeys.length !== actualKeys.length) return false;\n if (!expectedKeys.every((key, i) => key === actualKeys[i])) return false;\n\n // All values must match\n return expectedKeys.every((key) =>\n strictEquals(expected[key], actual[key]),\n );\n }\n\n // Primitive types\n return expected === actual;\n}\n\n/\n A configurable scorer for evaluating tool usage in LLM responses.\n \n The test data defines WHAT tools/arguments are expected,\n * while this scorer defines HOW to evaluate them.\n \n @param config - Configuration options for the scorer\n * @param config.ordered 
- Require exact order of tool calls\n * @param config.requireAll - Require all expected tools (vs partial credit)\n * @param config.allowExtras - Allow additional tool calls\n * @param config.params - How to match parameters: \"strict\", \"fuzzy\", or custom function\n \n @example\n * // Default: strict params, any order\n * describeEval(\"search test\", {\n * data: async () => [{\n * input: \"Find restaurants\",\n * expectedTools: [\n * { name: \"search\", arguments: { type: \"restaurant\" } },\n * { name: \"filter\" }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer()]\n * });\n \n @example\n * // Strict order and parameters\n * describeEval(\"payment flow\", {\n * data: async () => [{\n * input: \"Process payment\",\n * expectedTools: [\n * { name: \"validate\", arguments: { amount: 100 } },\n * { name: \"charge\", arguments: { amount: 100, method: \"card\" } }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer({ ordered: true, params: \"strict\" })]\n * });\n /\nexport function ToolCallScorer(\n config: ToolCallScorerConfig = {},\n): ScoreFn<ToolCallScorerOptions> {\n const {\n ordered = false,\n requireAll = true,\n allowExtras = true,\n params = \"strict\",\n } = config;\n\n // Determine the argument matcher\n const argMatcher =\n typeof params === \"function\"\n ? params\n : params === \"strict\"\n ? 
strictEquals\n : fuzzyMatch;\n\n return async (opts) => {\n const expectedTools = opts.expectedTools \|\| [];\n const actualCalls = opts.toolCalls \|\| [];\n\n // No expectations means pass\n if (expectedTools.length === 0) {\n return {\n score: 1.0,\n metadata: {\n rationale: \"No tool calls expected\",\n },\n };\n }\n\n // No actual calls when we expected some\n if (actualCalls.length === 0) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Expected ${expectedTools.length} tool(s) but none were called`,\n },\n };\n }\n\n if (ordered) {\n return evaluateOrderedTools(expectedTools, actualCalls, {\n argMatcher,\n allowExtras,\n requireAllTools: requireAll,\n });\n }\n\n return evaluateUnorderedTools(expectedTools, actualCalls, {\n argMatcher,\n requireAllTools: requireAll,\n allowExtras,\n });\n };\n}\n\n/\n Evaluate tools that must be called in a specific order\n /\nfunction evaluateOrderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n allowExtras: boolean;\n requireAllTools: boolean;\n },\n) {\n let expectedIndex = 0;\n let actualIndex = 0;\n\n // Match expected tools in order\n while (expectedIndex < expected.length && actualIndex < actual.length) {\n const exp = expected[expectedIndex];\n const act = actual[actualIndex];\n\n if (exp.name === act.name) {\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments \|\| {},\n );\n if (!argsMatch) {\n return {\n score: 0.5,\n metadata: {\n rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,\n expected: exp.arguments,\n actual: act.arguments,\n },\n };\n }\n }\n expectedIndex++;\n actualIndex++;\n } else if (options.allowExtras) {\n // Skip extra tool\n actualIndex++;\n } else {\n // Wrong tool in sequence when extra tools not allowed\n return {\n score: 0.0,\n metadata: {\n 
rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`,\n },\n };\n }\n }\n\n // Check if all expected tools were matched\n if (expectedIndex < expected.length) {\n const missing = expected.slice(expectedIndex).map((t) => t.name);\n\n if (options.requireAllTools) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Missing required tools in sequence: ${missing.join(\", \")}`,\n },\n };\n }\n\n // Partial credit when requireAllTools is false\n const matchedCount = expectedIndex;\n const totalCount = expected.length;\n const score = totalCount > 0 ? matchedCount / totalCount : 1.0;\n\n return {\n score,\n metadata: {\n rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(\", \")})`,\n matched: matchedCount,\n total: totalCount,\n },\n };\n }\n\n // Check for extra tools at the end if not allowed\n if (!options.allowExtras && actualIndex < actual.length) {\n const extra = actual.slice(actualIndex).map((t) => t.name);\n return {\n score: 0.0,\n metadata: {\n rationale: `Unexpected extra tools: ${extra.join(\", \")}`,\n },\n };\n }\n\n return {\n score: 1.0,\n metadata: {\n rationale: \"All tools called in expected order with correct arguments\",\n },\n };\n}\n\n/\n Evaluate tools that can be called in any order\n */\nfunction evaluateUnorderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n requireAllTools: boolean;\n allowExtras: boolean;\n },\n) {\n const matchedExpected = new Set<number>();\n const matchedActual = new Set<number>();\n const issues: string[] = [];\n\n // Try to match each expected tool\n for (let i = 0; i < expected.length; i++) {\n const exp = expected[i];\n let found = false;\n\n // Look for a matching actual tool call\n for (let j = 0; j < actual.length; j++) {\n if (matchedActual.has(j)) continue;\n\n const act = actual[j];\n if (exp.name === act.name) 
{\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments \|\| {},\n );\n if (!argsMatch) {\n continue; // Try to find another call with matching args\n }\n }\n\n // Found a match\n matchedExpected.add(i);\n matchedActual.add(j);\n found = true;\n break;\n }\n }\n\n if (!found) {\n if (exp.arguments !== undefined) {\n // Check if tool was called but with wrong args\n const wrongArgsCalls = actual.filter((a) => a.name === exp.name);\n if (wrongArgsCalls.length > 0) {\n issues.push(`Tool '${exp.name}' called but with incorrect arguments`);\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n }\n }\n\n // Check for extra tools\n const extraTools = actual\n .filter((_, i) => !matchedActual.has(i))\n .map((t) => t.name);\n\n if (!options.allowExtras && extraTools.length > 0) {\n issues.push(`Unexpected extra tools: ${extraTools.join(\", \")}`);\n }\n\n // Calculate score\n const expectedMatched = matchedExpected.size;\n const expectedTotal = expected.length;\n\n // If we have any critical issues (wrong tools, missing tools when required, or extra tools when not allowed)\n if (issues.length > 0 && (options.requireAllTools \|\| !options.allowExtras)) {\n return {\n score: 0.0,\n metadata: {\n rationale: issues.join(\"; \"),\n },\n };\n }\n\n // Partial credit when not all required\n const score = expectedTotal > 0 ? expectedMatched / expectedTotal : 1.0;\n\n if (score === 1.0) {\n const extraInfo =\n extraTools.length > 0 ? 
` (plus extra: ${extraTools.join(\", \")})` : \"\";\n return {\n score: 1.0,\n metadata: {\n rationale: `All expected tools were called${extraInfo}`,\n },\n };\n }\n\n return {\n score,\n metadata: {\n rationale: issues.join(\"; \"),\n matched: expectedMatched,\n total: expectedTotal,\n },\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,SAAS,QAAQ,UAAU,QAAQ,YAAY;AAC/C,OAAO;;;AC0CP,SAAS,WAAW,UAAe,QAAsB;AAEvD,MAAI,YAAY,QAAQ,UAAU,MAAM;AACtC,WAAO,aAAa;AAAA,EACtB;AAGA,MACE,OAAO,aAAa,YACpB,OAAO,WAAW,YAClB,CAAC,MAAM,QAAQ,QAAQ,GACvB;AACA,WAAO,OAAO,QAAQ,QAAQ,EAAE;AAAA,MAC9B,CAAC,CAAC,KAAK,KAAK,MAAM,OAAO,UAAU,WAAW,OAAO,OAAO,GAAG,CAAC;AAAA,IAClE;AAAA,EACF;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,OAAO,YAAY,EAAE,SAAS,SAAS,YAAY,CAAC;AAAA,EAC7D;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,UAAM,YAAY,KAAK,IAAI,KAAK,IAAI,QAAQ,IAAI,MAAO,IAAK;AAC5D,WAAO,KAAK,IAAI,WAAW,MAAM,KAAK;AAAA,EACxC;AAGA,MAAI,MAAM,QAAQ,QAAQ,KAAK,MAAM,QAAQ,MAAM,GAAG;AACpD,WAAO,SAAS;AAAA,MAAM,CAAC,YACrB,OAAO,KAAK,CAAC,YAAY,WAAW,SAAS,OAAO,CAAC;AAAA,IACvD;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AAKA,SAAS,aAAa,UAAe,QAAsB;AAEzD,MAAI,aAAa,OAAQ,QAAO;AAChC,MAAI,YAAY,QAAQ,UAAU,KAAM,QAAO;AAG/C,MAAI,OAAO,aAAa,OAAO,OAAQ,QAAO;AAG9C,MAAI,MAAM,QAAQ,QAAQ,GAAG;AAC3B,QAAI,CAAC,MAAM,QAAQ,MAAM,EAAG,QAAO;AACnC,QAAI,SAAS,WAAW,OAAO,OAAQ,QAAO;AAC9C,WAAO,SAAS,MAAM,CAAC,MAAM,MAAM,aAAa,MAAM,OAAO,CAAC,CAAC,CAAC;AAAA,EAClE;AAGA,MAAI,OAAO,aAAa,UAAU;AAChC,UAAM,eAAe,OAAO,KAAK,QAAQ,EAAE,KAAK;AAChD,UAAM,aAAa,OAAO,KAAK,MAAM,EAAE,KAAK;AAG5C,QAAI,aAAa,WAAW,WAAW,OAAQ,QAAO;AACtD,QAAI,CAAC,aAAa,MAAM,CAAC,KAAK,MAAM,QAAQ,WAAW,CAAC,CAAC,EAAG,QAAO;AAGnE,WAAO,aAAa;AAAA,MAAM,CAAC,QACzB,aAAa,SAAS,GAAG,GAAG,OAAO,GAAG,CAAC;AAAA,IACzC;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AA0CO,SAAS,eACd,SAA+B,CAAC,GACA;AAChC,QAAM;AAAA,IACJ,UAAU;AAAA,IACV,aAAa;AAAA,IACb,cAAc;AAAA,IACd,SAAS;AAAA,EACX,IAAI;AAGJ,QAAM,aACJ,OAAO,WAAW,aACd,SACA,WAAW,WACT,eACA;AAER,SAAO,CAAO,SAAS;AACrB,UAAM,gBAAgB,KAAK,iBAAiB,CAAC;AAC7C,UAAM,cAAc,KAAK,aAAa,CAAC;AAGvC,QAAI,cAA
c,WAAW,GAAG;AAC9B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW;AAAA,QACb;AAAA,MACF;AAAA,IACF;AAGA,QAAI,YAAY,WAAW,GAAG;AAC5B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,YAAY,cAAc,MAAM;AAAA,QAC7C;AAAA,MACF;AAAA,IACF;AAEA,QAAI,SAAS;AACX,aAAO,qBAAqB,eAAe,aAAa;AAAA,QACtD;AAAA,QACA;AAAA,QACA,iBAAiB;AAAA,MACnB,CAAC;AAAA,IACH;AAEA,WAAO,uBAAuB,eAAe,aAAa;AAAA,MACxD;AAAA,MACA,iBAAiB;AAAA,MACjB;AAAA,IACF,CAAC;AAAA,EACH;AACF;AAKA,SAAS,qBACP,UACA,QACA,SAKA;AACA,MAAI,gBAAgB;AACpB,MAAI,cAAc;AAGlB,SAAO,gBAAgB,SAAS,UAAU,cAAc,OAAO,QAAQ;AACrE,UAAM,MAAM,SAAS,aAAa;AAClC,UAAM,MAAM,OAAO,WAAW;AAE9B,QAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,UAAI,IAAI,cAAc,QAAW;AAC/B,cAAM,YAAY,QAAQ;AAAA,UACxB,IAAI;AAAA,UACJ,IAAI,aAAa,CAAC;AAAA,QACpB;AACA,YAAI,CAAC,WAAW;AACd,iBAAO;AAAA,YACL,OAAO;AAAA,YACP,UAAU;AAAA,cACR,WAAW,SAAS,IAAI,IAAI,iDAAiD,gBAAgB,CAAC;AAAA,cAC9F,UAAU,IAAI;AAAA,cACd,QAAQ,IAAI;AAAA,YACd;AAAA,UACF;AAAA,QACF;AAAA,MACF;AACA;AACA;AAAA,IACF,WAAW,QAAQ,aAAa;AAE9B;AAAA,IACF,OAAO;AAEL,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,aAAa,IAAI,IAAI,iBAAiB,gBAAgB,CAAC,eAAe,IAAI,IAAI;AAAA,QAC3F;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,MAAI,gBAAgB,SAAS,QAAQ;AACnC,UAAM,UAAU,SAAS,MAAM,aAAa,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AAE/D,QAAI,QAAQ,iBAAiB;AAC3B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,uCAAuC,QAAQ,KAAK,IAAI,CAAC;AAAA,QACtE;AAAA,MACF;AAAA,IACF;AAGA,UAAM,eAAe;AACrB,UAAM,aAAa,SAAS;AAC5B,UAAM,QAAQ,aAAa,IAAI,eAAe,aAAa;AAE3D,WAAO;AAAA,MACL;AAAA,MACA,UAAU;AAAA,QACR,WAAW,kBAAkB,YAAY,IAAI,UAAU,oCAAoC,QAAQ,KAAK,IAAI,CAAC;AAAA,QAC7G,SAAS;AAAA,QACT,OAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAGA,MAAI,CAAC,QAAQ,eAAe,cAAc,OAAO,QAAQ;AACvD,UAAM,QAAQ,OAAO,MAAM,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AACzD,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,2BAA2B,MAAM,KAAK,IAAI,CAAC;AAAA,MACxD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO;AAAA,IACP,UAAU;AAAA,MACR,WAAW;AAAA,IACb;AAAA,EACF;AACF;AAKA,SAAS,uBACP,UACA,QACA,SAKA;AACA,QAAM,kBAAkB,oBAAI,IAAY;AACxC,QAAM,gBAAgB,oBAAI,IAAY;AACtC,QAAM,SAAmB,CAAC;AAG1B,WAAS,IAAI,GAAG,IAAI,SAA
S,QAAQ,KAAK;AACxC,UAAM,MAAM,SAAS,CAAC;AACtB,QAAI,QAAQ;AAGZ,aAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,UAAI,cAAc,IAAI,CAAC,EAAG;AAE1B,YAAM,MAAM,OAAO,CAAC;AACpB,UAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,YAAI,IAAI,cAAc,QAAW;AAC/B,gBAAM,YAAY,QAAQ;AAAA,YACxB,IAAI;AAAA,YACJ,IAAI,aAAa,CAAC;AAAA,UACpB;AACA,cAAI,CAAC,WAAW;AACd;AAAA,UACF;AAAA,QACF;AAGA,wBAAgB,IAAI,CAAC;AACrB,sBAAc,IAAI,CAAC;AACnB,gBAAQ;AACR;AAAA,MACF;AAAA,IACF;AAEA,QAAI,CAAC,OAAO;AACV,UAAI,IAAI,cAAc,QAAW;AAE/B,cAAM,iBAAiB,OAAO,OAAO,CAAC,MAAM,EAAE,SAAS,IAAI,IAAI;AAC/D,YAAI,eAAe,SAAS,GAAG;AAC7B,iBAAO,KAAK,SAAS,IAAI,IAAI,uCAAuC;AAAA,QACtE,OAAO;AACL,iBAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,QAClD;AAAA,MACF,OAAO;AACL,eAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,MAClD;AAAA,IACF;AAAA,EACF;AAGA,QAAM,aAAa,OAChB,OAAO,CAAC,GAAG,MAAM,CAAC,cAAc,IAAI,CAAC,CAAC,EACtC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,MAAI,CAAC,QAAQ,eAAe,WAAW,SAAS,GAAG;AACjD,WAAO,KAAK,2BAA2B,WAAW,KAAK,IAAI,CAAC,EAAE;AAAA,EAChE;AAGA,QAAM,kBAAkB,gBAAgB;AACxC,QAAM,gBAAgB,SAAS;AAG/B,MAAI,OAAO,SAAS,MAAM,QAAQ,mBAAmB,CAAC,QAAQ,cAAc;AAC1E,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAGA,QAAM,QAAQ,gBAAgB,IAAI,kBAAkB,gBAAgB;AAEpE,MAAI,UAAU,GAAK;AACjB,UAAM,YACJ,WAAW,SAAS,IAAI,iBAAiB,WAAW,KAAK,IAAI,CAAC,MAAM;AACtE,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,iCAAiC,SAAS;AAAA,MACvD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,UAAU;AAAA,MACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC3B,SAAS;AAAA,MACT,OAAO;AAAA,IACT;AAAA,EACF;AACF;;;ADlUA,OAAO,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA8BZ,QAAQ,SAAe,OACrB,OACA,UACA,QACA,SACA,YAAY,GACZ;AAAA;AA1IJ;AA2II,YAAM,EAAE,MAAM,IAAI;AAElB,YAAM,aAAa,MAAM,OAAO,KAAK;AACrC,YAAM,SACJ,OAAO,eAAe,WAAW,aAAa,WAAW;AAC3D,YAAM,YACJ,OAAO,eAAe,WAAW,WAAW,YAAY;AAE1D,UAAI,SAAS,QAAQ,EAAE,OAAO,UAAU,QAAQ,UAAU,CAAC;AAC3D,UAAI,kBAAkB,SAAS;AAC7B,iBAAS,MAAM;AAAA,MACjB;AAEA,aAAO;AAAA,QACL,QAAO,YAAO,UAAP,YAAgB,MAAM;AAAA,QAC7B,SAA
S,MAAM,aAAa,CAAC,iCAAK,SAAL,EAAa,MAAM,QAAQ,KAAK,EAAC,CAAC;AAAA,MACjE;AAAA,IACF;AAAA;AACF,CAAC;AAuDM,SAAS,aACd,MACA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA;AAAA;AAAA,EAGZ,UAAU;AACZ,GAQA;AACA,SAAO,SAAS,MAAM,MAAY;AAChC,UAAM,SAAS,SAAS,KAAK,OAAO,OAAO,CAAC,IAAI;AAEhD,eAAW,MAAwB,MAAM,KAAK,GAAG;AAA5C,qBAAQ,QA3OjB,IA2OS,IAAkB,mBAAlB,IAAkB,CAAV;AACX;AAAA,QACE;AAAA,QACA;AAAA,UACE;AAAA,QACF;AAAA,QACA,CAAO,OAAuB,eAAvB,KAAuB,WAAvB,EAAE,MAAM,SAAS,GAAM;AAC5B,gBAAM,aAAa,MAAM,KAAK,KAAK;AACnC,gBAAM,SACJ,OAAO,eAAe,WAAW,aAAa,WAAW;AAC3D,gBAAM,YACJ,OAAO,eAAe,WAAW,WAAW,YAAY;AAE1D,gBAAM,SAAS,MAAM,QAAQ;AAAA,YAC3B,QAAQ,IAAI,CAAC,WAAW;AACtB,oBAAM,SAAS,OAAO,+BAAE,SAAU,SAAZ,EAAoB,QAAQ,UAAU,EAAC;AAC7D,kBAAI,kBAAkB,SAAS;AAC7B,uBAAO;AAAA,cACT;AACA,qBAAO,IAAI,QAAe,CAAC,YAAY,QAAQ,MAAM,CAAC;AAAA,YACxD,CAAC;AAAA,UACH;AACA,gBAAM,iBAAiB,OAAO,IAAI,CAAC,GAAG,MAAO,iCACxC,IADwC;AAAA,YAE3C,MAAM,QAAQ,CAAC,EAAE;AAAA,UACnB,EAAE;AAEF,gBAAM,WACJ,OAAO,OAAO,CAAC,KAAK,MAAG;AAvQnC,gBAAAA;AAuQsC,2BAAOA,MAAA,EAAE,UAAF,OAAAA,MAAW;AAAA,aAAI,CAAC,IAAI,OAAO;AAE9D,mBAAS,KAAK,OAAO;AAAA,YACnB,QAAQ;AAAA,YACR;AAAA,aACI,aAAa,EAAE,UAAU;AAG/B,cAAI,WAAW;AACb;AAAA,cACE,YAAY;AAAA,cACZ,UAAU,QAAQ,qBAAqB,SAAS;AAAA;AAAA;AAAA,EAAmB,SAAS,MAAM,CAAC;AAAA;AAAA,EAAO;AAAA,gBACxF;AAAA,cACF,CAAC;AAAA,YACH;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF,EAAC;AACH;AAEO,SAAS,aAAa,QAAsC;AACjE,SAAO,OACJ,KAAK,CAAC,GAAG,MAAG;AA/RjB;AA+RqB,oBAAE,UAAF,YAAW,OAAM,OAAE,UAAF,YAAW;AAAA,GAAE,EAC9C,IAAI,CAAC,MAAM;AAhShB;AAiSM,UAAM,YAAY,KAAK,EAAE,QAAQ,SAAS,OAAM,OAAE,UAAF,YAAW,GAAG,QAAQ,CAAC,CAAC;AACxE,UACI,OAAE,UAAF,YAAW,KAAK,OAAO,OAAE,aAAF,mBAAY,gBACrC,OAAE,aAAF,mBAAY,SACZ;AACA,aAAO,GAAG,SAAS,KACjB,OAAE,aAAF,mBAAY,aACR;AAAA;AAAA;AAAA;AAAA,EAAuB,SAAS,EAAE,SAAS,SAAS,CAAC,KACrD,EACN,KAAG,OAAE,aAAF,mBAAY,UAAS;AAAA;AAAA;AAAA;AAAA,EAAsB,SAAS,EAAE,SAAS,MAAM,CAAC,KAAK,EAAE;AAAA,IAClF;AACA,WAAO;AAAA,EACT,CAAC,EACA,KAAK,MAAM;AAChB;AAsBO,SAAS,SAAS,MAAc,QAAQ,IAAY;AACzD,MAAI,CAAC,QAAQ,KAAK,UAAU,OAAO;AACjC,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,KAAK,MAAM,KAAK;AAC9
B,QAAM,QAAkB,CAAC;AACzB,MAAI,cAAc;AAElB,aAAW,QAAQ,OAAO;AAExB,QAAI,YAAY,SAAS,KAAK,SAAS,IAAI,OAAO;AAChD,YAAM,KAAK,YAAY,KAAK,CAAC;AAC7B,oBAAc;AAAA,IAChB,OAAO;AAEL,sBAAgB,cAAc,MAAM,MAAM;AAAA,IAC5C;AAAA,EACF;AAGA,MAAI,aAAa;AACf,UAAM,KAAK,WAAW;AAAA,EACxB;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;","names":["_a"]}

package/dist/scorers/index.d.mts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export { ToolCallScorer, ToolCallScorerConfig, ToolCallScorerOptions } from './toolCallScorer.mjs';
2	+ import 'vitest';

package/dist/scorers/index.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export { ToolCallScorer, ToolCallScorerConfig, ToolCallScorerOptions } from './toolCallScorer.js';
2	+ import 'vitest';