npm - vitest-evals - Versions diffs - 0.2.0 → 0.4.0 - Mend

vitest-evals 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

package/README.md +211 -172
package/dist/index.d.mts +2 -98
package/dist/index.d.ts +2 -98
package/dist/index.js +270 -11
package/dist/index.js.map +1 -1
package/dist/index.mjs +269 -11
package/dist/index.mjs.map +1 -1
package/dist/scorers/index.d.mts +2 -0
package/dist/scorers/index.d.ts +2 -0
package/dist/scorers/index.js +282 -0
package/dist/scorers/index.js.map +1 -0
package/dist/scorers/index.mjs +256 -0
package/dist/scorers/index.mjs.map +1 -0
package/dist/scorers/toolCallScorer.d.mts +240 -0
package/dist/scorers/toolCallScorer.d.ts +240 -0
package/dist/scorers/toolCallScorer.js +280 -0
package/dist/scorers/toolCallScorer.js.map +1 -0
package/dist/scorers/toolCallScorer.mjs +256 -0
package/dist/scorers/toolCallScorer.mjs.map +1 -0
package/package.json +16 -4
package/dist/compatibility.test.d.mts +0 -2
package/dist/compatibility.test.d.ts +0 -2
package/dist/compatibility.test.js +0 -45009
package/dist/compatibility.test.js.map +0 -1
package/dist/compatibility.test.mjs +0 -45864
package/dist/compatibility.test.mjs.map +0 -1
package/dist/formatScores.test.d.mts +0 -2
package/dist/formatScores.test.d.ts +0 -2
package/dist/formatScores.test.js +0 -195
package/dist/formatScores.test.js.map +0 -1
package/dist/formatScores.test.mjs +0 -194
package/dist/formatScores.test.mjs.map +0 -1
package/dist/wrapText.test.d.mts +0 -2
package/dist/wrapText.test.d.ts +0 -2
package/dist/wrapText.test.js +0 -162
package/dist/wrapText.test.js.map +0 -1
package/dist/wrapText.test.mjs +0 -161
package/dist/wrapText.test.mjs.map +0 -1

package/dist/scorers/index.js ADDED Viewed

@@ -0,0 +1,282 @@
+"use strict";
+var __defProp = Object.defineProperty;
+var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+var __getOwnPropNames = Object.getOwnPropertyNames;
+var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __export = (target, all) => {
+  for (var name in all)
+    __defProp(target, name, { get: all[name], enumerable: true });
+};
+var __copyProps = (to, from, except, desc) => {
+  if (from && typeof from === "object" || typeof from === "function") {
+    for (let key of __getOwnPropNames(from))
+      if (!__hasOwnProp.call(to, key) && key !== except)
+        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
+  }
+  return to;
+};
+var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
+var __async = (__this, __arguments, generator) => {
+  return new Promise((resolve, reject) => {
+    var fulfilled = (value) => {
+      try {
+        step(generator.next(value));
+      } catch (e) {
+        reject(e);
+      }
+    };
+    var rejected = (value) => {
+      try {
+        step(generator.throw(value));
+      } catch (e) {
+        reject(e);
+      }
+    };
+    var step = (x) => x.done ? resolve(x.value) : Promise.resolve(x.value).then(fulfilled, rejected);
+    step((generator = generator.apply(__this, __arguments)).next());
+  });
+};
+// src/scorers/index.ts
+var scorers_exports = {};
+__export(scorers_exports, {
+  ToolCallScorer: () => ToolCallScorer
+});
+module.exports = __toCommonJS(scorers_exports);
+// src/scorers/toolCallScorer.ts
+function fuzzyMatch(expected, actual) {
+  if (expected == null || actual == null) {
+    return expected === actual;
+  }
+  if (typeof expected === "object" && typeof actual === "object" && !Array.isArray(expected)) {
+    return Object.entries(expected).every(
+      ([key, value]) => key in actual && fuzzyMatch(value, actual[key])
+    );
+  }
+  if (typeof expected === "string" && typeof actual === "string") {
+    return actual.toLowerCase().includes(expected.toLowerCase());
+  }
+  if (typeof expected === "number" && typeof actual === "number") {
+    const tolerance = Math.max(Math.abs(expected) * 1e-3, 1e-3);
+    return Math.abs(expected - actual) <= tolerance;
+  }
+  if (Array.isArray(expected) && Array.isArray(actual)) {
+    return expected.every(
+      (expItem) => actual.some((actItem) => fuzzyMatch(expItem, actItem))
+    );
+  }
+  return expected === actual;
+}
+function strictEquals(expected, actual) {
+  if (expected === actual) return true;
+  if (expected == null || actual == null) return false;
+  if (typeof expected !== typeof actual) return false;
+  if (Array.isArray(expected)) {
+    if (!Array.isArray(actual)) return false;
+    if (expected.length !== actual.length) return false;
+    return expected.every((item, i) => strictEquals(item, actual[i]));
+  }
+  if (typeof expected === "object") {
+    const expectedKeys = Object.keys(expected).sort();
+    const actualKeys = Object.keys(actual).sort();
+    if (expectedKeys.length !== actualKeys.length) return false;
+    if (!expectedKeys.every((key, i) => key === actualKeys[i])) return false;
+    return expectedKeys.every(
+      (key) => strictEquals(expected[key], actual[key])
+    );
+  }
+  return expected === actual;
+}
+function ToolCallScorer(config = {}) {
+  const {
+    ordered = false,
+    requireAll = true,
+    allowExtras = true,
+    params = "strict"
+  } = config;
+  const argMatcher = typeof params === "function" ? params : params === "strict" ? strictEquals : fuzzyMatch;
+  return (opts) => __async(null, null, function* () {
+    const expectedTools = opts.expectedTools || [];
+    const actualCalls = opts.toolCalls || [];
+    if (expectedTools.length === 0) {
+      return {
+        score: 1,
+        metadata: {
+          rationale: "No tool calls expected"
+        }
+      };
+    }
+    if (actualCalls.length === 0) {
+      return {
+        score: 0,
+        metadata: {
+          rationale: `Expected ${expectedTools.length} tool(s) but none were called`
+        }
+      };
+    }
+    if (ordered) {
+      return evaluateOrderedTools(expectedTools, actualCalls, {
+        argMatcher,
+        allowExtras,
+        requireAllTools: requireAll
+      });
+    }
+    return evaluateUnorderedTools(expectedTools, actualCalls, {
+      argMatcher,
+      requireAllTools: requireAll,
+      allowExtras
+    });
+  });
+}
+function evaluateOrderedTools(expected, actual, options) {
+  let expectedIndex = 0;
+  let actualIndex = 0;
+  while (expectedIndex < expected.length && actualIndex < actual.length) {
+    const exp = expected[expectedIndex];
+    const act = actual[actualIndex];
+    if (exp.name === act.name) {
+      if (exp.arguments !== void 0) {
+        const argsMatch = options.argMatcher(
+          exp.arguments,
+          act.arguments || {}
+        );
+        if (!argsMatch) {
+          return {
+            score: 0.5,
+            metadata: {
+              rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,
+              expected: exp.arguments,
+              actual: act.arguments
+            }
+          };
+        }
+      }
+      expectedIndex++;
+      actualIndex++;
+    } else if (options.allowExtras) {
+      actualIndex++;
+    } else {
+      return {
+        score: 0,
+        metadata: {
+          rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`
+        }
+      };
+    }
+  }
+  if (expectedIndex < expected.length) {
+    const missing = expected.slice(expectedIndex).map((t) => t.name);
+    if (options.requireAllTools) {
+      return {
+        score: 0,
+        metadata: {
+          rationale: `Missing required tools in sequence: ${missing.join(", ")}`
+        }
+      };
+    }
+    const matchedCount = expectedIndex;
+    const totalCount = expected.length;
+    const score = totalCount > 0 ? matchedCount / totalCount : 1;
+    return {
+      score,
+      metadata: {
+        rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(", ")})`,
+        matched: matchedCount,
+        total: totalCount
+      }
+    };
+  }
+  if (!options.allowExtras && actualIndex < actual.length) {
+    const extra = actual.slice(actualIndex).map((t) => t.name);
+    return {
+      score: 0,
+      metadata: {
+        rationale: `Unexpected extra tools: ${extra.join(", ")}`
+      }
+    };
+  }
+  return {
+    score: 1,
+    metadata: {
+      rationale: "All tools called in expected order with correct arguments"
+    }
+  };
+}
+function evaluateUnorderedTools(expected, actual, options) {
+  const matchedExpected = /* @__PURE__ */ new Set();
+  const matchedActual = /* @__PURE__ */ new Set();
+  const issues = [];
+  for (let i = 0; i < expected.length; i++) {
+    const exp = expected[i];
+    let found = false;
+    for (let j = 0; j < actual.length; j++) {
+      if (matchedActual.has(j)) continue;
+      const act = actual[j];
+      if (exp.name === act.name) {
+        if (exp.arguments !== void 0) {
+          const argsMatch = options.argMatcher(
+            exp.arguments,
+            act.arguments || {}
+          );
+          if (!argsMatch) {
+            continue;
+          }
+        }
+        matchedExpected.add(i);
+        matchedActual.add(j);
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (exp.arguments !== void 0) {
+        const wrongArgsCalls = actual.filter((a) => a.name === exp.name);
+        if (wrongArgsCalls.length > 0) {
+          issues.push(`Tool '${exp.name}' called but with incorrect arguments`);
+        } else {
+          issues.push(`Missing required tool: ${exp.name}`);
+        }
+      } else {
+        issues.push(`Missing required tool: ${exp.name}`);
+      }
+    }
+  }
+  const extraTools = actual.filter((_, i) => !matchedActual.has(i)).map((t) => t.name);
+  if (!options.allowExtras && extraTools.length > 0) {
+    issues.push(`Unexpected extra tools: ${extraTools.join(", ")}`);
+  }
+  const expectedMatched = matchedExpected.size;
+  const expectedTotal = expected.length;
+  if (issues.length > 0 && (options.requireAllTools || !options.allowExtras)) {
+    return {
+      score: 0,
+      metadata: {
+        rationale: issues.join("; ")
+      }
+    };
+  }
+  const score = expectedTotal > 0 ? expectedMatched / expectedTotal : 1;
+  if (score === 1) {
+    const extraInfo = extraTools.length > 0 ? ` (plus extra: ${extraTools.join(", ")})` : "";
+    return {
+      score: 1,
+      metadata: {
+        rationale: `All expected tools were called${extraInfo}`
+      }
+    };
+  }
+  return {
+    score,
+    metadata: {
+      rationale: issues.join("; "),
+      matched: expectedMatched,
+      total: expectedTotal
+    }
+  };
+}
+// Annotate the CommonJS export names for ESM import in node:
+0 && (module.exports = {
+  ToolCallScorer
+});
+//# sourceMappingURL=index.js.map

package/dist/scorers/index.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../../src/scorers/index.ts","../../src/scorers/toolCallScorer.ts"],"sourcesContent":["export {\n ToolCallScorer,\n type ToolCallScorerOptions,\n type ToolCallScorerConfig,\n} from \"./toolCallScorer\";\n","import type { ScoreFn, BaseScorerOptions, ToolCall } from \"../index\";\n\nexport interface ToolCallScorerOptions extends BaseScorerOptions {\n // Expected tools are now defined in the test data\n expectedTools?: Array<{\n name: string;\n arguments?: any;\n }>;\n}\n\nexport interface ToolCallScorerConfig {\n /**\n * Whether tools must be called in the exact order specified\n * @default false\n */\n ordered?: boolean;\n\n /**\n * Whether all expected tools must be called for a passing score\n * When false: gives partial credit based on tools matched\n * @default true\n */\n requireAll?: boolean;\n\n /**\n * Whether to allow additional tool calls beyond those expected\n * @default true\n */\n allowExtras?: boolean;\n\n /**\n * How to match tool arguments/parameters\n * - \"strict\": Exact equality required (default)\n * - \"fuzzy\": Case-insensitive, subset matching, numeric tolerance\n * - Custom function: Your own comparison logic\n * @default \"strict\"\n */\n params?: \"strict\" | \"fuzzy\" | ((expected: any, actual: any) => boolean);\n}\n\n/**\n * Default fuzzy matching for arguments\n */\nfunction fuzzyMatch(expected: any, actual: any): boolean {\n // Null/undefined handling\n if (expected == null || actual == null) {\n return expected === actual;\n }\n\n // For objects, check if actual has all expected properties\n if (\n typeof expected === \"object\" &&\n typeof actual === \"object\" &&\n !Array.isArray(expected)\n ) {\n return Object.entries(expected).every(\n ([key, value]) => key in actual && fuzzyMatch(value, actual[key]),\n );\n }\n\n // For strings, case-insensitive substring match\n if (typeof expected === \"string\" && typeof actual === \"string\") {\n return actual.toLowerCase().includes(expected.toLowerCase());\n }\n\n 
// For numbers, allow small differences (0.1% or 0.001, whichever is larger)\n if (typeof expected === \"number\" && typeof actual === \"number\") {\n const tolerance = Math.max(Math.abs(expected) * 0.001, 0.001);\n return Math.abs(expected - actual) <= tolerance;\n }\n\n // For arrays, check if all expected items exist in actual (order doesn't matter in fuzzy mode)\n if (Array.isArray(expected) && Array.isArray(actual)) {\n return expected.every((expItem) =>\n actual.some((actItem) => fuzzyMatch(expItem, actItem)),\n );\n }\n\n // Otherwise strict equality\n return expected === actual;\n}\n\n/**\n * Strict equality comparison (deep equals)\n */\nfunction strictEquals(expected: any, actual: any): boolean {\n // Handle primitive types and null/undefined\n if (expected === actual) return true;\n if (expected == null || actual == null) return false;\n\n // Must be same type\n if (typeof expected !== typeof actual) return false;\n\n // Handle arrays\n if (Array.isArray(expected)) {\n if (!Array.isArray(actual)) return false;\n if (expected.length !== actual.length) return false;\n return expected.every((item, i) => strictEquals(item, actual[i]));\n }\n\n // Handle objects\n if (typeof expected === \"object\") {\n const expectedKeys = Object.keys(expected).sort();\n const actualKeys = Object.keys(actual).sort();\n\n // Must have same keys\n if (expectedKeys.length !== actualKeys.length) return false;\n if (!expectedKeys.every((key, i) => key === actualKeys[i])) return false;\n\n // All values must match\n return expectedKeys.every((key) =>\n strictEquals(expected[key], actual[key]),\n );\n }\n\n // Primitive types\n return expected === actual;\n}\n\n/**\n * A configurable scorer for evaluating tool usage in LLM responses.\n *\n * The test data defines WHAT tools/arguments are expected,\n * while this scorer defines HOW to evaluate them.\n *\n * @param config - Configuration options for the scorer\n * @param config.ordered - Require exact order of tool calls\n * @param 
config.requireAll - Require all expected tools (vs partial credit)\n * @param config.allowExtras - Allow additional tool calls\n * @param config.params - How to match parameters: \"strict\", \"fuzzy\", or custom function\n *\n * @example\n * // Default: strict params, any order\n * describeEval(\"search test\", {\n * data: async () => [{\n * input: \"Find restaurants\",\n * expectedTools: [\n * { name: \"search\", arguments: { type: \"restaurant\" } },\n * { name: \"filter\" }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer()]\n * });\n *\n * @example\n * // Strict order and parameters\n * describeEval(\"payment flow\", {\n * data: async () => [{\n * input: \"Process payment\",\n * expectedTools: [\n * { name: \"validate\", arguments: { amount: 100 } },\n * { name: \"charge\", arguments: { amount: 100, method: \"card\" } }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer({ ordered: true, params: \"strict\" })]\n * });\n */\nexport function ToolCallScorer(\n config: ToolCallScorerConfig = {},\n): ScoreFn<ToolCallScorerOptions> {\n const {\n ordered = false,\n requireAll = true,\n allowExtras = true,\n params = \"strict\",\n } = config;\n\n // Determine the argument matcher\n const argMatcher =\n typeof params === \"function\"\n ? params\n : params === \"strict\"\n ? 
strictEquals\n : fuzzyMatch;\n\n return async (opts) => {\n const expectedTools = opts.expectedTools || [];\n const actualCalls = opts.toolCalls || [];\n\n // No expectations means pass\n if (expectedTools.length === 0) {\n return {\n score: 1.0,\n metadata: {\n rationale: \"No tool calls expected\",\n },\n };\n }\n\n // No actual calls when we expected some\n if (actualCalls.length === 0) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Expected ${expectedTools.length} tool(s) but none were called`,\n },\n };\n }\n\n if (ordered) {\n return evaluateOrderedTools(expectedTools, actualCalls, {\n argMatcher,\n allowExtras,\n requireAllTools: requireAll,\n });\n }\n\n return evaluateUnorderedTools(expectedTools, actualCalls, {\n argMatcher,\n requireAllTools: requireAll,\n allowExtras,\n });\n };\n}\n\n/**\n * Evaluate tools that must be called in a specific order\n */\nfunction evaluateOrderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n allowExtras: boolean;\n requireAllTools: boolean;\n },\n) {\n let expectedIndex = 0;\n let actualIndex = 0;\n\n // Match expected tools in order\n while (expectedIndex < expected.length && actualIndex < actual.length) {\n const exp = expected[expectedIndex];\n const act = actual[actualIndex];\n\n if (exp.name === act.name) {\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments || {},\n );\n if (!argsMatch) {\n return {\n score: 0.5,\n metadata: {\n rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,\n expected: exp.arguments,\n actual: act.arguments,\n },\n };\n }\n }\n expectedIndex++;\n actualIndex++;\n } else if (options.allowExtras) {\n // Skip extra tool\n actualIndex++;\n } else {\n // Wrong tool in sequence when extra tools not allowed\n return {\n score: 0.0,\n metadata: {\n 
rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`,\n },\n };\n }\n }\n\n // Check if all expected tools were matched\n if (expectedIndex < expected.length) {\n const missing = expected.slice(expectedIndex).map((t) => t.name);\n\n if (options.requireAllTools) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Missing required tools in sequence: ${missing.join(\", \")}`,\n },\n };\n }\n\n // Partial credit when requireAllTools is false\n const matchedCount = expectedIndex;\n const totalCount = expected.length;\n const score = totalCount > 0 ? matchedCount / totalCount : 1.0;\n\n return {\n score,\n metadata: {\n rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(\", \")})`,\n matched: matchedCount,\n total: totalCount,\n },\n };\n }\n\n // Check for extra tools at the end if not allowed\n if (!options.allowExtras && actualIndex < actual.length) {\n const extra = actual.slice(actualIndex).map((t) => t.name);\n return {\n score: 0.0,\n metadata: {\n rationale: `Unexpected extra tools: ${extra.join(\", \")}`,\n },\n };\n }\n\n return {\n score: 1.0,\n metadata: {\n rationale: \"All tools called in expected order with correct arguments\",\n },\n };\n}\n\n/**\n * Evaluate tools that can be called in any order\n */\nfunction evaluateUnorderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n requireAllTools: boolean;\n allowExtras: boolean;\n },\n) {\n const matchedExpected = new Set<number>();\n const matchedActual = new Set<number>();\n const issues: string[] = [];\n\n // Try to match each expected tool\n for (let i = 0; i < expected.length; i++) {\n const exp = expected[i];\n let found = false;\n\n // Look for a matching actual tool call\n for (let j = 0; j < actual.length; j++) {\n if (matchedActual.has(j)) continue;\n\n const act = actual[j];\n if (exp.name === 
act.name) {\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments || {},\n );\n if (!argsMatch) {\n continue; // Try to find another call with matching args\n }\n }\n\n // Found a match\n matchedExpected.add(i);\n matchedActual.add(j);\n found = true;\n break;\n }\n }\n\n if (!found) {\n if (exp.arguments !== undefined) {\n // Check if tool was called but with wrong args\n const wrongArgsCalls = actual.filter((a) => a.name === exp.name);\n if (wrongArgsCalls.length > 0) {\n issues.push(`Tool '${exp.name}' called but with incorrect arguments`);\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n }\n }\n\n // Check for extra tools\n const extraTools = actual\n .filter((_, i) => !matchedActual.has(i))\n .map((t) => t.name);\n\n if (!options.allowExtras && extraTools.length > 0) {\n issues.push(`Unexpected extra tools: ${extraTools.join(\", \")}`);\n }\n\n // Calculate score\n const expectedMatched = matchedExpected.size;\n const expectedTotal = expected.length;\n\n // If we have any critical issues (wrong tools, missing tools when required, or extra tools when not allowed)\n if (issues.length > 0 && (options.requireAllTools || !options.allowExtras)) {\n return {\n score: 0.0,\n metadata: {\n rationale: issues.join(\"; \"),\n },\n };\n }\n\n // Partial credit when not all required\n const score = expectedTotal > 0 ? expectedMatched / expectedTotal : 1.0;\n\n if (score === 1.0) {\n const extraInfo =\n extraTools.length > 0 ? 
` (plus extra: ${extraTools.join(\", \")})` : \"\";\n return {\n score: 1.0,\n metadata: {\n rationale: `All expected tools were called${extraInfo}`,\n },\n };\n }\n\n return {\n score,\n metadata: {\n rationale: issues.join(\"; \"),\n matched: expectedMatched,\n total: expectedTotal,\n },\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;AC2CA,SAAS,WAAW,UAAe,QAAsB;AAEvD,MAAI,YAAY,QAAQ,UAAU,MAAM;AACtC,WAAO,aAAa;AAAA,EACtB;AAGA,MACE,OAAO,aAAa,YACpB,OAAO,WAAW,YAClB,CAAC,MAAM,QAAQ,QAAQ,GACvB;AACA,WAAO,OAAO,QAAQ,QAAQ,EAAE;AAAA,MAC9B,CAAC,CAAC,KAAK,KAAK,MAAM,OAAO,UAAU,WAAW,OAAO,OAAO,GAAG,CAAC;AAAA,IAClE;AAAA,EACF;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,OAAO,YAAY,EAAE,SAAS,SAAS,YAAY,CAAC;AAAA,EAC7D;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,UAAM,YAAY,KAAK,IAAI,KAAK,IAAI,QAAQ,IAAI,MAAO,IAAK;AAC5D,WAAO,KAAK,IAAI,WAAW,MAAM,KAAK;AAAA,EACxC;AAGA,MAAI,MAAM,QAAQ,QAAQ,KAAK,MAAM,QAAQ,MAAM,GAAG;AACpD,WAAO,SAAS;AAAA,MAAM,CAAC,YACrB,OAAO,KAAK,CAAC,YAAY,WAAW,SAAS,OAAO,CAAC;AAAA,IACvD;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AAKA,SAAS,aAAa,UAAe,QAAsB;AAEzD,MAAI,aAAa,OAAQ,QAAO;AAChC,MAAI,YAAY,QAAQ,UAAU,KAAM,QAAO;AAG/C,MAAI,OAAO,aAAa,OAAO,OAAQ,QAAO;AAG9C,MAAI,MAAM,QAAQ,QAAQ,GAAG;AAC3B,QAAI,CAAC,MAAM,QAAQ,MAAM,EAAG,QAAO;AACnC,QAAI,SAAS,WAAW,OAAO,OAAQ,QAAO;AAC9C,WAAO,SAAS,MAAM,CAAC,MAAM,MAAM,aAAa,MAAM,OAAO,CAAC,CAAC,CAAC;AAAA,EAClE;AAGA,MAAI,OAAO,aAAa,UAAU;AAChC,UAAM,eAAe,OAAO,KAAK,QAAQ,EAAE,KAAK;AAChD,UAAM,aAAa,OAAO,KAAK,MAAM,EAAE,KAAK;AAG5C,QAAI,aAAa,WAAW,WAAW,OAAQ,QAAO;AACtD,QAAI,CAAC,aAAa,MAAM,CAAC,KAAK,MAAM,QAAQ,WAAW,CAAC,CAAC,EAAG,QAAO;AAGnE,WAAO,aAAa;AAAA,MAAM,CAAC,QACzB,aAAa,SAAS,GAAG,GAAG,OAAO,GAAG,CAAC;AAAA,IACzC;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AA0CO,SAAS,eACd,SAA+B,CAAC,GACA;AAChC,QAAM;AAAA,IACJ,UAAU;AAAA,IACV,aAAa;AAAA,IACb,cAAc;AAAA,IACd,SAAS;AAAA,EACX,IAAI;AAGJ,QAAM,aACJ,OAAO,WAAW,aACd,SACA,WAAW,WACT,eACA;AAER,SAAO,CAAO,SAAS;AACrB,UAAM,gBAAgB,KAAK,iBAAiB,CAAC;AAC7C,UAAM,cAAc,KAAK,aAAa,CAAC;AAGvC,QAAI,cAAc,WAAW,GAAG;AAC9B,aAAO;AAAA,Q
ACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW;AAAA,QACb;AAAA,MACF;AAAA,IACF;AAGA,QAAI,YAAY,WAAW,GAAG;AAC5B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,YAAY,cAAc,MAAM;AAAA,QAC7C;AAAA,MACF;AAAA,IACF;AAEA,QAAI,SAAS;AACX,aAAO,qBAAqB,eAAe,aAAa;AAAA,QACtD;AAAA,QACA;AAAA,QACA,iBAAiB;AAAA,MACnB,CAAC;AAAA,IACH;AAEA,WAAO,uBAAuB,eAAe,aAAa;AAAA,MACxD;AAAA,MACA,iBAAiB;AAAA,MACjB;AAAA,IACF,CAAC;AAAA,EACH;AACF;AAKA,SAAS,qBACP,UACA,QACA,SAKA;AACA,MAAI,gBAAgB;AACpB,MAAI,cAAc;AAGlB,SAAO,gBAAgB,SAAS,UAAU,cAAc,OAAO,QAAQ;AACrE,UAAM,MAAM,SAAS,aAAa;AAClC,UAAM,MAAM,OAAO,WAAW;AAE9B,QAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,UAAI,IAAI,cAAc,QAAW;AAC/B,cAAM,YAAY,QAAQ;AAAA,UACxB,IAAI;AAAA,UACJ,IAAI,aAAa,CAAC;AAAA,QACpB;AACA,YAAI,CAAC,WAAW;AACd,iBAAO;AAAA,YACL,OAAO;AAAA,YACP,UAAU;AAAA,cACR,WAAW,SAAS,IAAI,IAAI,iDAAiD,gBAAgB,CAAC;AAAA,cAC9F,UAAU,IAAI;AAAA,cACd,QAAQ,IAAI;AAAA,YACd;AAAA,UACF;AAAA,QACF;AAAA,MACF;AACA;AACA;AAAA,IACF,WAAW,QAAQ,aAAa;AAE9B;AAAA,IACF,OAAO;AAEL,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,aAAa,IAAI,IAAI,iBAAiB,gBAAgB,CAAC,eAAe,IAAI,IAAI;AAAA,QAC3F;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,MAAI,gBAAgB,SAAS,QAAQ;AACnC,UAAM,UAAU,SAAS,MAAM,aAAa,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AAE/D,QAAI,QAAQ,iBAAiB;AAC3B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,uCAAuC,QAAQ,KAAK,IAAI,CAAC;AAAA,QACtE;AAAA,MACF;AAAA,IACF;AAGA,UAAM,eAAe;AACrB,UAAM,aAAa,SAAS;AAC5B,UAAM,QAAQ,aAAa,IAAI,eAAe,aAAa;AAE3D,WAAO;AAAA,MACL;AAAA,MACA,UAAU;AAAA,QACR,WAAW,kBAAkB,YAAY,IAAI,UAAU,oCAAoC,QAAQ,KAAK,IAAI,CAAC;AAAA,QAC7G,SAAS;AAAA,QACT,OAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAGA,MAAI,CAAC,QAAQ,eAAe,cAAc,OAAO,QAAQ;AACvD,UAAM,QAAQ,OAAO,MAAM,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AACzD,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,2BAA2B,MAAM,KAAK,IAAI,CAAC;AAAA,MACxD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO;AAAA,IACP,UAAU;AAAA,MACR,WAAW;AAAA,IACb;AAAA,EACF;AACF;AAKA,SAAS,uBACP,UACA,QACA,SAKA;AACA,QAAM,kBAAkB,oBAAI,IAAY;AACxC,QAAM,gBAAgB,oBAAI,IAAY;AACtC,QAAM,SAAmB,CAAC;AAG1B,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,UAAM,MAAM,S
AAS,CAAC;AACtB,QAAI,QAAQ;AAGZ,aAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,UAAI,cAAc,IAAI,CAAC,EAAG;AAE1B,YAAM,MAAM,OAAO,CAAC;AACpB,UAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,YAAI,IAAI,cAAc,QAAW;AAC/B,gBAAM,YAAY,QAAQ;AAAA,YACxB,IAAI;AAAA,YACJ,IAAI,aAAa,CAAC;AAAA,UACpB;AACA,cAAI,CAAC,WAAW;AACd;AAAA,UACF;AAAA,QACF;AAGA,wBAAgB,IAAI,CAAC;AACrB,sBAAc,IAAI,CAAC;AACnB,gBAAQ;AACR;AAAA,MACF;AAAA,IACF;AAEA,QAAI,CAAC,OAAO;AACV,UAAI,IAAI,cAAc,QAAW;AAE/B,cAAM,iBAAiB,OAAO,OAAO,CAAC,MAAM,EAAE,SAAS,IAAI,IAAI;AAC/D,YAAI,eAAe,SAAS,GAAG;AAC7B,iBAAO,KAAK,SAAS,IAAI,IAAI,uCAAuC;AAAA,QACtE,OAAO;AACL,iBAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,QAClD;AAAA,MACF,OAAO;AACL,eAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,MAClD;AAAA,IACF;AAAA,EACF;AAGA,QAAM,aAAa,OAChB,OAAO,CAAC,GAAG,MAAM,CAAC,cAAc,IAAI,CAAC,CAAC,EACtC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,MAAI,CAAC,QAAQ,eAAe,WAAW,SAAS,GAAG;AACjD,WAAO,KAAK,2BAA2B,WAAW,KAAK,IAAI,CAAC,EAAE;AAAA,EAChE;AAGA,QAAM,kBAAkB,gBAAgB;AACxC,QAAM,gBAAgB,SAAS;AAG/B,MAAI,OAAO,SAAS,MAAM,QAAQ,mBAAmB,CAAC,QAAQ,cAAc;AAC1E,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAGA,QAAM,QAAQ,gBAAgB,IAAI,kBAAkB,gBAAgB;AAEpE,MAAI,UAAU,GAAK;AACjB,UAAM,YACJ,WAAW,SAAS,IAAI,iBAAiB,WAAW,KAAK,IAAI,CAAC,MAAM;AACtE,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,iCAAiC,SAAS;AAAA,MACvD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,UAAU;AAAA,MACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC3B,SAAS;AAAA,MACT,OAAO;AAAA,IACT;AAAA,EACF;AACF;","names":[]}

package/dist/scorers/index.mjs ADDED Viewed

@@ -0,0 +1,256 @@
+var __async = (__this, __arguments, generator) => {
+  return new Promise((resolve, reject) => {
+    var fulfilled = (value) => {
+      try {
+        step(generator.next(value));
+      } catch (e) {
+        reject(e);
+      }
+    };
+    var rejected = (value) => {
+      try {
+        step(generator.throw(value));
+      } catch (e) {
+        reject(e);
+      }
+    };
+    var step = (x) => x.done ? resolve(x.value) : Promise.resolve(x.value).then(fulfilled, rejected);
+    step((generator = generator.apply(__this, __arguments)).next());
+  });
+};
+// src/scorers/toolCallScorer.ts
+function fuzzyMatch(expected, actual) {
+  if (expected == null || actual == null) {
+    return expected === actual;
+  }
+  if (typeof expected === "object" && typeof actual === "object" && !Array.isArray(expected)) {
+    return Object.entries(expected).every(
+      ([key, value]) => key in actual && fuzzyMatch(value, actual[key])
+    );
+  }
+  if (typeof expected === "string" && typeof actual === "string") {
+    return actual.toLowerCase().includes(expected.toLowerCase());
+  }
+  if (typeof expected === "number" && typeof actual === "number") {
+    const tolerance = Math.max(Math.abs(expected) * 1e-3, 1e-3);
+    return Math.abs(expected - actual) <= tolerance;
+  }
+  if (Array.isArray(expected) && Array.isArray(actual)) {
+    return expected.every(
+      (expItem) => actual.some((actItem) => fuzzyMatch(expItem, actItem))
+    );
+  }
+  return expected === actual;
+}
+function strictEquals(expected, actual) {
+  if (expected === actual) return true;
+  if (expected == null || actual == null) return false;
+  if (typeof expected !== typeof actual) return false;
+  if (Array.isArray(expected)) {
+    if (!Array.isArray(actual)) return false;
+    if (expected.length !== actual.length) return false;
+    return expected.every((item, i) => strictEquals(item, actual[i]));
+  }
+  if (typeof expected === "object") {
+    const expectedKeys = Object.keys(expected).sort();
+    const actualKeys = Object.keys(actual).sort();
+    if (expectedKeys.length !== actualKeys.length) return false;
+    if (!expectedKeys.every((key, i) => key === actualKeys[i])) return false;
+    return expectedKeys.every(
+      (key) => strictEquals(expected[key], actual[key])
+    );
+  }
+  return expected === actual;
+}
+function ToolCallScorer(config = {}) {
+  const {
+    ordered = false,
+    requireAll = true,
+    allowExtras = true,
+    params = "strict"
+  } = config;
+  const argMatcher = typeof params === "function" ? params : params === "strict" ? strictEquals : fuzzyMatch;
+  return (opts) => __async(null, null, function* () {
+    const expectedTools = opts.expectedTools || [];
+    const actualCalls = opts.toolCalls || [];
+    if (expectedTools.length === 0) {
+      return {
+        score: 1,
+        metadata: {
+          rationale: "No tool calls expected"
+        }
+      };
+    }
+    if (actualCalls.length === 0) {
+      return {
+        score: 0,
+        metadata: {
+          rationale: `Expected ${expectedTools.length} tool(s) but none were called`
+        }
+      };
+    }
+    if (ordered) {
+      return evaluateOrderedTools(expectedTools, actualCalls, {
+        argMatcher,
+        allowExtras,
+        requireAllTools: requireAll
+      });
+    }
+    return evaluateUnorderedTools(expectedTools, actualCalls, {
+      argMatcher,
+      requireAllTools: requireAll,
+      allowExtras
+    });
+  });
+}
/**
 * Score tool calls that must appear in the same relative order as `expected`.
 *
 * Walks `expected` and `actual` with two cursors. A call whose name differs
 * from the current expectation is skipped when `options.allowExtras` is true
 * and is an immediate failure otherwise.
 *
 * @param {Array<{name: string, arguments?: any}>} expected - Expected tools, in order.
 *   An entry without `arguments` matches on name alone.
 * @param {Array<{name: string, arguments?: any}>} actual - Tool calls that were made.
 * @param {{argMatcher: function(any, any): boolean, allowExtras: boolean, requireAllTools: boolean}} options
 *   `argMatcher(expectedArgs, actualArgs)` decides whether arguments match.
 * @returns {{score: number, metadata: object}} Score of 1 (full match), 0.5
 *   (right tool, wrong arguments), a matched/total fraction (missing tools with
 *   `requireAllTools` false), or 0; `metadata.rationale` explains the verdict.
 */
function evaluateOrderedTools(expected, actual, options) {
  let expectedIndex = 0;
  let actualIndex = 0;
  // Verdict for the first same-name call whose arguments did not satisfy the
  // expectation currently at `expectedIndex`; cleared once it is satisfied.
  let pendingMismatch = null;

  while (expectedIndex < expected.length && actualIndex < actual.length) {
    const exp = expected[expectedIndex];
    const act = actual[actualIndex];

    if (exp.name !== act.name) {
      if (!options.allowExtras) {
        return {
          score: 0,
          metadata: {
            rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`
          }
        };
      }
      // Unrelated extra call: skip it.
      actualIndex++;
      continue;
    }

    const argsMatch = exp.arguments === void 0 || options.argMatcher(exp.arguments, act.arguments || {});
    if (argsMatch) {
      expectedIndex++;
      actualIndex++;
      pendingMismatch = null;
      continue;
    }

    const mismatch = {
      score: 0.5,
      metadata: {
        rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,
        expected: exp.arguments,
        actual: act.arguments
      }
    };
    if (!options.allowExtras) {
      return mismatch;
    }
    // With extras allowed, a wrong-argument call is just another extra call:
    // keep scanning for a later call that does match (the unordered evaluator
    // behaves the same way) and only report the mismatch if none is found.
    if (pendingMismatch === null) {
      pendingMismatch = mismatch;
    }
    actualIndex++;
  }

  if (expectedIndex < expected.length) {
    if (pendingMismatch !== null) {
      return pendingMismatch;
    }
    const missing = expected.slice(expectedIndex).map((t) => t.name);
    if (options.requireAllTools) {
      return {
        score: 0,
        metadata: {
          rationale: `Missing required tools in sequence: ${missing.join(", ")}`
        }
      };
    }
    // Partial credit; totalCount is at least 1 here because
    // expectedIndex < expected.length.
    const matchedCount = expectedIndex;
    const totalCount = expected.length;
    return {
      score: matchedCount / totalCount,
      metadata: {
        rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(", ")})`,
        matched: matchedCount,
        total: totalCount
      }
    };
  }

  // Every expectation matched; calls left over after the last match are extras.
  if (!options.allowExtras && actualIndex < actual.length) {
    const extra = actual.slice(actualIndex).map((t) => t.name);
    return {
      score: 0,
      metadata: {
        rationale: `Unexpected extra tools: ${extra.join(", ")}`
      }
    };
  }

  return {
    score: 1,
    metadata: {
      rationale: "All tools called in expected order with correct arguments"
    }
  };
}
/**
 * Score tool calls when the order of calls does not matter.
 *
 * Each expected tool is paired with the first not-yet-used actual call that has
 * the same name and, when the expectation specifies `arguments`, arguments
 * accepted by `options.argMatcher`. Each actual call can satisfy at most one
 * expectation.
 *
 * @param {Array<{name: string, arguments?: any}>} expected - Expected tools.
 *   An entry without `arguments` matches on name alone.
 * @param {Array<{name: string, arguments?: any}>} actual - Tool calls that were made.
 * @param {{argMatcher: function(any, any): boolean, requireAllTools: boolean, allowExtras: boolean}} options
 *   `argMatcher(expectedArgs, actualArgs)` decides whether arguments match.
 * @returns {{score: number, metadata: object}} Score of 0 when an expectation is
 *   unmet while `requireAllTools` is true, or when unmatched calls exist while
 *   `allowExtras` is false; otherwise the fraction of expectations matched.
 *   `metadata.rationale` explains the verdict.
 */
function evaluateUnorderedTools(expected, actual, options) {
  const usedActual = /* @__PURE__ */ new Set();
  const issues = [];
  let matchedCount = 0;

  for (const exp of expected) {
    const matchIndex = actual.findIndex((act, j) => {
      if (usedActual.has(j) || act.name !== exp.name) {
        return false;
      }
      return exp.arguments === void 0 || options.argMatcher(exp.arguments, act.arguments || {});
    });

    if (matchIndex !== -1) {
      usedActual.add(matchIndex);
      matchedCount++;
      continue;
    }

    // Distinguish "called with the wrong arguments" from "never called".
    const calledWithOtherArgs = exp.arguments !== void 0 && actual.some((a) => a.name === exp.name);
    issues.push(
      calledWithOtherArgs
        ? `Tool '${exp.name}' called but with incorrect arguments`
        : `Missing required tool: ${exp.name}`
    );
  }

  const expectedTotal = expected.length;
  const hasUnmetExpectation = matchedCount < expectedTotal;

  const extraTools = actual.filter((_, i) => !usedActual.has(i)).map((t) => t.name);
  const hasForbiddenExtras = !options.allowExtras && extraTools.length > 0;
  if (hasForbiddenExtras) {
    issues.push(`Unexpected extra tools: ${extraTools.join(", ")}`);
  }

  // Hard failures. The two conditions are checked independently so that
  // forbidding extras does not cancel partial credit for a merely missing
  // tool when requireAllTools is false.
  if ((hasUnmetExpectation && options.requireAllTools) || hasForbiddenExtras) {
    return {
      score: 0,
      metadata: {
        rationale: issues.join("; ")
      }
    };
  }

  const score = expectedTotal > 0 ? matchedCount / expectedTotal : 1;
  if (score === 1) {
    const extraInfo = extraTools.length > 0 ? ` (plus extra: ${extraTools.join(", ")})` : "";
    return {
      score: 1,
      metadata: {
        rationale: `All expected tools were called${extraInfo}`
      }
    };
  }

  // Partial credit: some expectations unmet but not all were required.
  return {
    score,
    metadata: {
      rationale: issues.join("; "),
      matched: matchedCount,
      total: expectedTotal
    }
  };
}
+export {
+  ToolCallScorer
+};
+//# sourceMappingURL=index.mjs.map

package/dist/scorers/index.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../../src/scorers/toolCallScorer.ts"],"sourcesContent":["import type { ScoreFn, BaseScorerOptions, ToolCall } from \"../index\";\n\nexport interface ToolCallScorerOptions extends BaseScorerOptions {\n // Expected tools are now defined in the test data\n expectedTools?: Array<{\n name: string;\n arguments?: any;\n }>;\n}\n\nexport interface ToolCallScorerConfig {\n /**\n * Whether tools must be called in the exact order specified\n * @default false\n */\n ordered?: boolean;\n\n /**\n * Whether all expected tools must be called for a passing score\n * When false: gives partial credit based on tools matched\n * @default true\n */\n requireAll?: boolean;\n\n /**\n * Whether to allow additional tool calls beyond those expected\n * @default true\n */\n allowExtras?: boolean;\n\n /**\n * How to match tool arguments/parameters\n * - \"strict\": Exact equality required (default)\n * - \"fuzzy\": Case-insensitive, subset matching, numeric tolerance\n * - Custom function: Your own comparison logic\n * @default \"strict\"\n */\n params?: \"strict\" | \"fuzzy\" | ((expected: any, actual: any) => boolean);\n}\n\n/**\n * Default fuzzy matching for arguments\n */\nfunction fuzzyMatch(expected: any, actual: any): boolean {\n // Null/undefined handling\n if (expected == null || actual == null) {\n return expected === actual;\n }\n\n // For objects, check if actual has all expected properties\n if (\n typeof expected === \"object\" &&\n typeof actual === \"object\" &&\n !Array.isArray(expected)\n ) {\n return Object.entries(expected).every(\n ([key, value]) => key in actual && fuzzyMatch(value, actual[key]),\n );\n }\n\n // For strings, case-insensitive substring match\n if (typeof expected === \"string\" && typeof actual === \"string\") {\n return actual.toLowerCase().includes(expected.toLowerCase());\n }\n\n // For numbers, allow small differences (0.1% or 0.001, whichever is larger)\n if (typeof expected === \"number\" && typeof actual === \"number\") 
{\n const tolerance = Math.max(Math.abs(expected) * 0.001, 0.001);\n return Math.abs(expected - actual) <= tolerance;\n }\n\n // For arrays, check if all expected items exist in actual (order doesn't matter in fuzzy mode)\n if (Array.isArray(expected) && Array.isArray(actual)) {\n return expected.every((expItem) =>\n actual.some((actItem) => fuzzyMatch(expItem, actItem)),\n );\n }\n\n // Otherwise strict equality\n return expected === actual;\n}\n\n/**\n * Strict equality comparison (deep equals)\n */\nfunction strictEquals(expected: any, actual: any): boolean {\n // Handle primitive types and null/undefined\n if (expected === actual) return true;\n if (expected == null || actual == null) return false;\n\n // Must be same type\n if (typeof expected !== typeof actual) return false;\n\n // Handle arrays\n if (Array.isArray(expected)) {\n if (!Array.isArray(actual)) return false;\n if (expected.length !== actual.length) return false;\n return expected.every((item, i) => strictEquals(item, actual[i]));\n }\n\n // Handle objects\n if (typeof expected === \"object\") {\n const expectedKeys = Object.keys(expected).sort();\n const actualKeys = Object.keys(actual).sort();\n\n // Must have same keys\n if (expectedKeys.length !== actualKeys.length) return false;\n if (!expectedKeys.every((key, i) => key === actualKeys[i])) return false;\n\n // All values must match\n return expectedKeys.every((key) =>\n strictEquals(expected[key], actual[key]),\n );\n }\n\n // Primitive types\n return expected === actual;\n}\n\n/**\n * A configurable scorer for evaluating tool usage in LLM responses.\n *\n * The test data defines WHAT tools/arguments are expected,\n * while this scorer defines HOW to evaluate them.\n *\n * @param config - Configuration options for the scorer\n * @param config.ordered - Require exact order of tool calls\n * @param config.requireAll - Require all expected tools (vs partial credit)\n * @param config.allowExtras - Allow additional tool calls\n * @param 
config.params - How to match parameters: \"strict\", \"fuzzy\", or custom function\n *\n * @example\n * // Default: strict params, any order\n * describeEval(\"search test\", {\n * data: async () => [{\n * input: \"Find restaurants\",\n * expectedTools: [\n * { name: \"search\", arguments: { type: \"restaurant\" } },\n * { name: \"filter\" }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer()]\n * });\n *\n * @example\n * // Strict order and parameters\n * describeEval(\"payment flow\", {\n * data: async () => [{\n * input: \"Process payment\",\n * expectedTools: [\n * { name: \"validate\", arguments: { amount: 100 } },\n * { name: \"charge\", arguments: { amount: 100, method: \"card\" } }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer({ ordered: true, params: \"strict\" })]\n * });\n */\nexport function ToolCallScorer(\n config: ToolCallScorerConfig = {},\n): ScoreFn<ToolCallScorerOptions> {\n const {\n ordered = false,\n requireAll = true,\n allowExtras = true,\n params = \"strict\",\n } = config;\n\n // Determine the argument matcher\n const argMatcher =\n typeof params === \"function\"\n ? params\n : params === \"strict\"\n ? 
strictEquals\n : fuzzyMatch;\n\n return async (opts) => {\n const expectedTools = opts.expectedTools || [];\n const actualCalls = opts.toolCalls || [];\n\n // No expectations means pass\n if (expectedTools.length === 0) {\n return {\n score: 1.0,\n metadata: {\n rationale: \"No tool calls expected\",\n },\n };\n }\n\n // No actual calls when we expected some\n if (actualCalls.length === 0) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Expected ${expectedTools.length} tool(s) but none were called`,\n },\n };\n }\n\n if (ordered) {\n return evaluateOrderedTools(expectedTools, actualCalls, {\n argMatcher,\n allowExtras,\n requireAllTools: requireAll,\n });\n }\n\n return evaluateUnorderedTools(expectedTools, actualCalls, {\n argMatcher,\n requireAllTools: requireAll,\n allowExtras,\n });\n };\n}\n\n/**\n * Evaluate tools that must be called in a specific order\n */\nfunction evaluateOrderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n allowExtras: boolean;\n requireAllTools: boolean;\n },\n) {\n let expectedIndex = 0;\n let actualIndex = 0;\n\n // Match expected tools in order\n while (expectedIndex < expected.length && actualIndex < actual.length) {\n const exp = expected[expectedIndex];\n const act = actual[actualIndex];\n\n if (exp.name === act.name) {\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments || {},\n );\n if (!argsMatch) {\n return {\n score: 0.5,\n metadata: {\n rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,\n expected: exp.arguments,\n actual: act.arguments,\n },\n };\n }\n }\n expectedIndex++;\n actualIndex++;\n } else if (options.allowExtras) {\n // Skip extra tool\n actualIndex++;\n } else {\n // Wrong tool in sequence when extra tools not allowed\n return {\n score: 0.0,\n metadata: {\n 
rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`,\n },\n };\n }\n }\n\n // Check if all expected tools were matched\n if (expectedIndex < expected.length) {\n const missing = expected.slice(expectedIndex).map((t) => t.name);\n\n if (options.requireAllTools) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Missing required tools in sequence: ${missing.join(\", \")}`,\n },\n };\n }\n\n // Partial credit when requireAllTools is false\n const matchedCount = expectedIndex;\n const totalCount = expected.length;\n const score = totalCount > 0 ? matchedCount / totalCount : 1.0;\n\n return {\n score,\n metadata: {\n rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(\", \")})`,\n matched: matchedCount,\n total: totalCount,\n },\n };\n }\n\n // Check for extra tools at the end if not allowed\n if (!options.allowExtras && actualIndex < actual.length) {\n const extra = actual.slice(actualIndex).map((t) => t.name);\n return {\n score: 0.0,\n metadata: {\n rationale: `Unexpected extra tools: ${extra.join(\", \")}`,\n },\n };\n }\n\n return {\n score: 1.0,\n metadata: {\n rationale: \"All tools called in expected order with correct arguments\",\n },\n };\n}\n\n/**\n * Evaluate tools that can be called in any order\n */\nfunction evaluateUnorderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n requireAllTools: boolean;\n allowExtras: boolean;\n },\n) {\n const matchedExpected = new Set<number>();\n const matchedActual = new Set<number>();\n const issues: string[] = [];\n\n // Try to match each expected tool\n for (let i = 0; i < expected.length; i++) {\n const exp = expected[i];\n let found = false;\n\n // Look for a matching actual tool call\n for (let j = 0; j < actual.length; j++) {\n if (matchedActual.has(j)) continue;\n\n const act = actual[j];\n if (exp.name === 
act.name) {\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments || {},\n );\n if (!argsMatch) {\n continue; // Try to find another call with matching args\n }\n }\n\n // Found a match\n matchedExpected.add(i);\n matchedActual.add(j);\n found = true;\n break;\n }\n }\n\n if (!found) {\n if (exp.arguments !== undefined) {\n // Check if tool was called but with wrong args\n const wrongArgsCalls = actual.filter((a) => a.name === exp.name);\n if (wrongArgsCalls.length > 0) {\n issues.push(`Tool '${exp.name}' called but with incorrect arguments`);\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n }\n }\n\n // Check for extra tools\n const extraTools = actual\n .filter((_, i) => !matchedActual.has(i))\n .map((t) => t.name);\n\n if (!options.allowExtras && extraTools.length > 0) {\n issues.push(`Unexpected extra tools: ${extraTools.join(\", \")}`);\n }\n\n // Calculate score\n const expectedMatched = matchedExpected.size;\n const expectedTotal = expected.length;\n\n // If we have any critical issues (wrong tools, missing tools when required, or extra tools when not allowed)\n if (issues.length > 0 && (options.requireAllTools || !options.allowExtras)) {\n return {\n score: 0.0,\n metadata: {\n rationale: issues.join(\"; \"),\n },\n };\n }\n\n // Partial credit when not all required\n const score = expectedTotal > 0 ? expectedMatched / expectedTotal : 1.0;\n\n if (score === 1.0) {\n const extraInfo =\n extraTools.length > 0 ? 
` (plus extra: ${extraTools.join(\", \")})` : \"\";\n return {\n score: 1.0,\n metadata: {\n rationale: `All expected tools were called${extraInfo}`,\n },\n };\n }\n\n return {\n score,\n metadata: {\n rationale: issues.join(\"; \"),\n matched: expectedMatched,\n total: expectedTotal,\n },\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;AA2CA,SAAS,WAAW,UAAe,QAAsB;AAEvD,MAAI,YAAY,QAAQ,UAAU,MAAM;AACtC,WAAO,aAAa;AAAA,EACtB;AAGA,MACE,OAAO,aAAa,YACpB,OAAO,WAAW,YAClB,CAAC,MAAM,QAAQ,QAAQ,GACvB;AACA,WAAO,OAAO,QAAQ,QAAQ,EAAE;AAAA,MAC9B,CAAC,CAAC,KAAK,KAAK,MAAM,OAAO,UAAU,WAAW,OAAO,OAAO,GAAG,CAAC;AAAA,IAClE;AAAA,EACF;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,OAAO,YAAY,EAAE,SAAS,SAAS,YAAY,CAAC;AAAA,EAC7D;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,UAAM,YAAY,KAAK,IAAI,KAAK,IAAI,QAAQ,IAAI,MAAO,IAAK;AAC5D,WAAO,KAAK,IAAI,WAAW,MAAM,KAAK;AAAA,EACxC;AAGA,MAAI,MAAM,QAAQ,QAAQ,KAAK,MAAM,QAAQ,MAAM,GAAG;AACpD,WAAO,SAAS;AAAA,MAAM,CAAC,YACrB,OAAO,KAAK,CAAC,YAAY,WAAW,SAAS,OAAO,CAAC;AAAA,IACvD;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AAKA,SAAS,aAAa,UAAe,QAAsB;AAEzD,MAAI,aAAa,OAAQ,QAAO;AAChC,MAAI,YAAY,QAAQ,UAAU,KAAM,QAAO;AAG/C,MAAI,OAAO,aAAa,OAAO,OAAQ,QAAO;AAG9C,MAAI,MAAM,QAAQ,QAAQ,GAAG;AAC3B,QAAI,CAAC,MAAM,QAAQ,MAAM,EAAG,QAAO;AACnC,QAAI,SAAS,WAAW,OAAO,OAAQ,QAAO;AAC9C,WAAO,SAAS,MAAM,CAAC,MAAM,MAAM,aAAa,MAAM,OAAO,CAAC,CAAC,CAAC;AAAA,EAClE;AAGA,MAAI,OAAO,aAAa,UAAU;AAChC,UAAM,eAAe,OAAO,KAAK,QAAQ,EAAE,KAAK;AAChD,UAAM,aAAa,OAAO,KAAK,MAAM,EAAE,KAAK;AAG5C,QAAI,aAAa,WAAW,WAAW,OAAQ,QAAO;AACtD,QAAI,CAAC,aAAa,MAAM,CAAC,KAAK,MAAM,QAAQ,WAAW,CAAC,CAAC,EAAG,QAAO;AAGnE,WAAO,aAAa;AAAA,MAAM,CAAC,QACzB,aAAa,SAAS,GAAG,GAAG,OAAO,GAAG,CAAC;AAAA,IACzC;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AA0CO,SAAS,eACd,SAA+B,CAAC,GACA;AAChC,QAAM;AAAA,IACJ,UAAU;AAAA,IACV,aAAa;AAAA,IACb,cAAc;AAAA,IACd,SAAS;AAAA,EACX,IAAI;AAGJ,QAAM,aACJ,OAAO,WAAW,aACd,SACA,WAAW,WACT,eACA;AAER,SAAO,CAAO,SAAS;AACrB,UAAM,gBAAgB,KAAK,iBAAiB,CAAC;AAC7C,UAAM,cAAc,KAAK,aAAa,CAAC;AAGvC,QAAI,cAAc,WAAW,GAAG;AAC9B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW;AAAA,Q
ACb;AAAA,MACF;AAAA,IACF;AAGA,QAAI,YAAY,WAAW,GAAG;AAC5B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,YAAY,cAAc,MAAM;AAAA,QAC7C;AAAA,MACF;AAAA,IACF;AAEA,QAAI,SAAS;AACX,aAAO,qBAAqB,eAAe,aAAa;AAAA,QACtD;AAAA,QACA;AAAA,QACA,iBAAiB;AAAA,MACnB,CAAC;AAAA,IACH;AAEA,WAAO,uBAAuB,eAAe,aAAa;AAAA,MACxD;AAAA,MACA,iBAAiB;AAAA,MACjB;AAAA,IACF,CAAC;AAAA,EACH;AACF;AAKA,SAAS,qBACP,UACA,QACA,SAKA;AACA,MAAI,gBAAgB;AACpB,MAAI,cAAc;AAGlB,SAAO,gBAAgB,SAAS,UAAU,cAAc,OAAO,QAAQ;AACrE,UAAM,MAAM,SAAS,aAAa;AAClC,UAAM,MAAM,OAAO,WAAW;AAE9B,QAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,UAAI,IAAI,cAAc,QAAW;AAC/B,cAAM,YAAY,QAAQ;AAAA,UACxB,IAAI;AAAA,UACJ,IAAI,aAAa,CAAC;AAAA,QACpB;AACA,YAAI,CAAC,WAAW;AACd,iBAAO;AAAA,YACL,OAAO;AAAA,YACP,UAAU;AAAA,cACR,WAAW,SAAS,IAAI,IAAI,iDAAiD,gBAAgB,CAAC;AAAA,cAC9F,UAAU,IAAI;AAAA,cACd,QAAQ,IAAI;AAAA,YACd;AAAA,UACF;AAAA,QACF;AAAA,MACF;AACA;AACA;AAAA,IACF,WAAW,QAAQ,aAAa;AAE9B;AAAA,IACF,OAAO;AAEL,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,aAAa,IAAI,IAAI,iBAAiB,gBAAgB,CAAC,eAAe,IAAI,IAAI;AAAA,QAC3F;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,MAAI,gBAAgB,SAAS,QAAQ;AACnC,UAAM,UAAU,SAAS,MAAM,aAAa,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AAE/D,QAAI,QAAQ,iBAAiB;AAC3B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,uCAAuC,QAAQ,KAAK,IAAI,CAAC;AAAA,QACtE;AAAA,MACF;AAAA,IACF;AAGA,UAAM,eAAe;AACrB,UAAM,aAAa,SAAS;AAC5B,UAAM,QAAQ,aAAa,IAAI,eAAe,aAAa;AAE3D,WAAO;AAAA,MACL;AAAA,MACA,UAAU;AAAA,QACR,WAAW,kBAAkB,YAAY,IAAI,UAAU,oCAAoC,QAAQ,KAAK,IAAI,CAAC;AAAA,QAC7G,SAAS;AAAA,QACT,OAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAGA,MAAI,CAAC,QAAQ,eAAe,cAAc,OAAO,QAAQ;AACvD,UAAM,QAAQ,OAAO,MAAM,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AACzD,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,2BAA2B,MAAM,KAAK,IAAI,CAAC;AAAA,MACxD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO;AAAA,IACP,UAAU;AAAA,MACR,WAAW;AAAA,IACb;AAAA,EACF;AACF;AAKA,SAAS,uBACP,UACA,QACA,SAKA;AACA,QAAM,kBAAkB,oBAAI,IAAY;AACxC,QAAM,gBAAgB,oBAAI,IAAY;AACtC,QAAM,SAAmB,CAAC;AAG1B,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,UAAM,MAAM,SAAS,CAAC;AACtB,QAAI,QAAQ;AAGZ,aAAS,IAAI,GAAG,
IAAI,OAAO,QAAQ,KAAK;AACtC,UAAI,cAAc,IAAI,CAAC,EAAG;AAE1B,YAAM,MAAM,OAAO,CAAC;AACpB,UAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,YAAI,IAAI,cAAc,QAAW;AAC/B,gBAAM,YAAY,QAAQ;AAAA,YACxB,IAAI;AAAA,YACJ,IAAI,aAAa,CAAC;AAAA,UACpB;AACA,cAAI,CAAC,WAAW;AACd;AAAA,UACF;AAAA,QACF;AAGA,wBAAgB,IAAI,CAAC;AACrB,sBAAc,IAAI,CAAC;AACnB,gBAAQ;AACR;AAAA,MACF;AAAA,IACF;AAEA,QAAI,CAAC,OAAO;AACV,UAAI,IAAI,cAAc,QAAW;AAE/B,cAAM,iBAAiB,OAAO,OAAO,CAAC,MAAM,EAAE,SAAS,IAAI,IAAI;AAC/D,YAAI,eAAe,SAAS,GAAG;AAC7B,iBAAO,KAAK,SAAS,IAAI,IAAI,uCAAuC;AAAA,QACtE,OAAO;AACL,iBAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,QAClD;AAAA,MACF,OAAO;AACL,eAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,MAClD;AAAA,IACF;AAAA,EACF;AAGA,QAAM,aAAa,OAChB,OAAO,CAAC,GAAG,MAAM,CAAC,cAAc,IAAI,CAAC,CAAC,EACtC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,MAAI,CAAC,QAAQ,eAAe,WAAW,SAAS,GAAG;AACjD,WAAO,KAAK,2BAA2B,WAAW,KAAK,IAAI,CAAC,EAAE;AAAA,EAChE;AAGA,QAAM,kBAAkB,gBAAgB;AACxC,QAAM,gBAAgB,SAAS;AAG/B,MAAI,OAAO,SAAS,MAAM,QAAQ,mBAAmB,CAAC,QAAQ,cAAc;AAC1E,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAGA,QAAM,QAAQ,gBAAgB,IAAI,kBAAkB,gBAAgB;AAEpE,MAAI,UAAU,GAAK;AACjB,UAAM,YACJ,WAAW,SAAS,IAAI,iBAAiB,WAAW,KAAK,IAAI,CAAC,MAAM;AACtE,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,iCAAiC,SAAS;AAAA,MACvD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,UAAU;AAAA,MACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC3B,SAAS;AAAA,MACT,OAAO;AAAA,IACT;AAAA,EACF;AACF;","names":[]}