vitest-evals 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +211 -172
  2. package/dist/index.d.mts +2 -98
  3. package/dist/index.d.ts +2 -98
  4. package/dist/index.js +270 -11
  5. package/dist/index.js.map +1 -1
  6. package/dist/index.mjs +269 -11
  7. package/dist/index.mjs.map +1 -1
  8. package/dist/scorers/index.d.mts +2 -0
  9. package/dist/scorers/index.d.ts +2 -0
  10. package/dist/scorers/index.js +282 -0
  11. package/dist/scorers/index.js.map +1 -0
  12. package/dist/scorers/index.mjs +256 -0
  13. package/dist/scorers/index.mjs.map +1 -0
  14. package/dist/scorers/toolCallScorer.d.mts +240 -0
  15. package/dist/scorers/toolCallScorer.d.ts +240 -0
  16. package/dist/scorers/toolCallScorer.js +280 -0
  17. package/dist/scorers/toolCallScorer.js.map +1 -0
  18. package/dist/scorers/toolCallScorer.mjs +256 -0
  19. package/dist/scorers/toolCallScorer.mjs.map +1 -0
  20. package/package.json +16 -4
  21. package/dist/compatibility.test.d.mts +0 -2
  22. package/dist/compatibility.test.d.ts +0 -2
  23. package/dist/compatibility.test.js +0 -45009
  24. package/dist/compatibility.test.js.map +0 -1
  25. package/dist/compatibility.test.mjs +0 -45864
  26. package/dist/compatibility.test.mjs.map +0 -1
  27. package/dist/formatScores.test.d.mts +0 -2
  28. package/dist/formatScores.test.d.ts +0 -2
  29. package/dist/formatScores.test.js +0 -195
  30. package/dist/formatScores.test.js.map +0 -1
  31. package/dist/formatScores.test.mjs +0 -194
  32. package/dist/formatScores.test.mjs.map +0 -1
  33. package/dist/wrapText.test.d.mts +0 -2
  34. package/dist/wrapText.test.d.ts +0 -2
  35. package/dist/wrapText.test.js +0 -162
  36. package/dist/wrapText.test.js.map +0 -1
  37. package/dist/wrapText.test.mjs +0 -161
  38. package/dist/wrapText.test.mjs.map +0 -1
package/dist/index.mjs CHANGED
@@ -17,6 +17,18 @@ var __spreadValues = (a, b) => {
17
17
  return a;
18
18
  };
19
19
  var __spreadProps = (a, b) => __defProps(a, __getOwnPropDescs(b));
20
/**
 * Bundler helper backing object rest destructuring
 * (`const { input, ...params } = source`).
 *
 * Returns a new object holding every own enumerable property of `source`,
 * string-keyed and symbol-keyed alike, whose key is not listed in `exclude`.
 * Relies on the bundle-level helpers `__hasOwnProp`, `__getOwnPropSymbols`
 * and `__propIsEnum`, which are declared earlier in the bundle.
 *
 * @param {object|null|undefined} source - Object being destructured.
 * @param {Array<string|symbol>} exclude - Keys that were destructured by name.
 * @returns {object} Shallow copy of `source` without the excluded keys.
 */
var __objRest = (source, exclude) => {
  const target = {};
  // `for...in` simply does not iterate when source is null/undefined, and the
  // own-property filter drops anything inherited through the prototype chain.
  for (const key in source) {
    if (__hasOwnProp.call(source, key) && !exclude.includes(key)) {
      target[key] = source[key];
    }
  }
  // Symbol keys are never visited by `for...in`, so copy them separately when
  // the runtime exposes a way to list them.
  if (source != null && __getOwnPropSymbols) {
    for (const symbol of __getOwnPropSymbols(source)) {
      if (!exclude.includes(symbol) && __propIsEnum.call(source, symbol)) {
        target[symbol] = source[symbol];
      }
    }
  }
  return target;
};
20
32
  var __async = (__this, __arguments, generator) => {
21
33
  return new Promise((resolve, reject) => {
22
34
  var fulfilled = (value) => {
@@ -41,12 +53,247 @@ var __async = (__this, __arguments, generator) => {
41
53
  // src/index.ts
42
54
  import { assert, describe, expect, test } from "vitest";
43
55
  import "vitest";
56
+
57
+ // src/scorers/toolCallScorer.ts
58
/**
 * Lenient comparison used for tool-call arguments in "fuzzy" mode.
 *
 * Rules, applied recursively:
 * - null/undefined only match themselves (`null` does not match `undefined`);
 * - arrays match when every expected item fuzzily matches at least one actual
 *   item (order is ignored); an array never matches a non-array;
 * - objects match when every expected entry is an own property of `actual`
 *   with a fuzzily matching value (extra actual keys are allowed);
 * - strings match when `actual` contains `expected`, ignoring case;
 * - numbers match within 0.1% of `expected` or 0.001, whichever is larger;
 * - anything else falls back to `===`.
 *
 * @param {*} expected - Value declared in the test data.
 * @param {*} actual - Value observed in the tool call.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function fuzzyMatch(expected, actual) {
  if (expected == null || actual == null) {
    return expected === actual;
  }

  const expectedIsArray = Array.isArray(expected);
  const actualIsArray = Array.isArray(actual);
  if (expectedIsArray || actualIsArray) {
    // Shapes must agree: comparing an object against an array by key would
    // let `{ length: 2 }` pass for `[1, 2]`.
    if (!expectedIsArray || !actualIsArray) {
      return false;
    }
    return expected.every(
      (expItem) => actual.some((actItem) => fuzzyMatch(expItem, actItem))
    );
  }

  if (typeof expected === "object" && typeof actual === "object") {
    // Own-property check: `in` would also accept members inherited from
    // Object.prototype (e.g. `constructor`) that the call never supplied.
    return Object.entries(expected).every(
      ([key, value]) =>
        Object.prototype.hasOwnProperty.call(actual, key) &&
        fuzzyMatch(value, actual[key])
    );
  }

  if (typeof expected === "string" && typeof actual === "string") {
    return actual.toLowerCase().includes(expected.toLowerCase());
  }

  if (typeof expected === "number" && typeof actual === "number") {
    const tolerance = Math.max(Math.abs(expected) * 1e-3, 1e-3);
    return Math.abs(expected - actual) <= tolerance;
  }

  return expected === actual;
}
81
/**
 * Deep strict equality used for tool-call arguments in "strict" mode.
 *
 * Primitives are compared with `===`. Arrays must have the same length and
 * pairwise-equal items in the same order. Objects must have exactly the same
 * own enumerable keys with deeply equal values (key order is irrelevant).
 * An array never equals a non-array object, and `Date` instances are compared
 * by their timestamp rather than by their (empty) key set.
 *
 * @param {*} expected - Value declared in the test data.
 * @param {*} actual - Value observed in the tool call.
 * @returns {boolean} Whether the two values are deeply equal.
 */
function strictEquals(expected, actual) {
  if (expected === actual) return true;
  if (expected == null || actual == null) return false;
  if (typeof expected !== typeof actual) return false;
  // Same-typed primitives (and functions) that failed `===` above differ.
  if (typeof expected !== "object") return false;

  const expectedIsArray = Array.isArray(expected);
  // `{ 0: "a" }` and `["a"]` share keys and values but are different shapes.
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((item, i) => strictEquals(item, actual[i]));
  }

  // Dates carry no own enumerable keys, so a key-based comparison would treat
  // every pair of dates as equal.
  const expectedIsDate = expected instanceof Date;
  if (expectedIsDate || actual instanceof Date) {
    return (
      expectedIsDate &&
      actual instanceof Date &&
      expected.getTime() === actual.getTime()
    );
  }

  const expectedKeys = Object.keys(expected);
  if (expectedKeys.length !== Object.keys(actual).length) return false;
  // Equal key counts plus "every expected key is an own enumerable key of
  // actual" implies identical key sets, without sorting either list.
  return expectedKeys.every(
    (key) =>
      Object.prototype.propertyIsEnumerable.call(actual, key) &&
      strictEquals(expected[key], actual[key])
  );
}
101
/**
 * Builds a scorer that grades the tool calls made by a task against the
 * `expectedTools` supplied with each test case.
 *
 * @param {object} [config] - Scorer configuration.
 * @param {boolean} [config.ordered=false] - Require the expected tools to
 *   appear in the given order.
 * @param {boolean} [config.requireAll=true] - Forwarded to the evaluators as
 *   `requireAllTools`.
 * @param {boolean} [config.allowExtras=true] - Forwarded to the evaluators as
 *   `allowExtras`.
 * @param {"strict"|"fuzzy"|Function} [config.params="strict"] - Argument
 *   matcher: a custom `(expected, actual) => boolean` function, the string
 *   "strict" for deep equality, or any other value for fuzzy matching.
 * @returns {Function} Scorer taking `{ expectedTools, toolCalls, ... }` and
 *   resolving to `{ score, metadata }`.
 */
function ToolCallScorer(config = {}) {
  // Same semantics as destructuring defaults: only `undefined` falls back.
  const orDefault = (value, fallback) => (value === void 0 ? fallback : value);
  const inOrder = orDefault(config.ordered, false);
  const mustMatchAll = orDefault(config.requireAll, true);
  const extrasPermitted = orDefault(config.allowExtras, true);
  const matchMode = orDefault(config.params, "strict");

  let compareArguments;
  if (typeof matchMode === "function") {
    compareArguments = matchMode;
  } else if (matchMode === "strict") {
    compareArguments = strictEquals;
  } else {
    compareArguments = fuzzyMatch;
  }

  return (opts) => __async(null, null, function* () {
    const wanted = opts.expectedTools || [];
    const observed = opts.toolCalls || [];

    // Nothing to check for this test case.
    if (wanted.length === 0) {
      return {
        score: 1,
        metadata: { rationale: "No tool calls expected" }
      };
    }

    // Tools were expected but the task made no calls at all.
    if (observed.length === 0) {
      return {
        score: 0,
        metadata: {
          rationale: `Expected ${wanted.length} tool(s) but none were called`
        }
      };
    }

    const evaluate = inOrder ? evaluateOrderedTools : evaluateUnorderedTools;
    return evaluate(wanted, observed, {
      argMatcher: compareArguments,
      allowExtras: extrasPermitted,
      requireAllTools: mustMatchAll
    });
  });
}
142
/**
 * Scores tool calls when the expected tools must appear in order.
 *
 * Walks `actual` once, advancing through `expected` each time the next
 * expected tool is found with acceptable arguments.
 *
 * Outcomes:
 * - 1 when every expected tool was found in order (and no disallowed extras);
 * - 0.5 when an expected tool was only ever called with non-matching
 *   arguments;
 * - 0 when an unexpected call appears while extras are disallowed, when
 *   required tools are missing, or when disallowed trailing calls remain;
 * - matched/total when tools are missing and `requireAllTools` is false.
 *
 * When `allowExtras` is true, a same-named call whose arguments do not match
 * is treated like any other extra call and the scan continues, so a later
 * correct call (for example a retry) can still satisfy the expectation. The
 * 0.5 result is only returned if no such call follows.
 *
 * @param {Array<{name: string, arguments?: *}>} expected - Expected tools.
 * @param {Array<{name: string, arguments?: *}>} actual - Observed tool calls.
 * @param {{argMatcher: Function, allowExtras: boolean, requireAllTools: boolean}} options
 * @returns {{score: number, metadata: object}} Score with a rationale.
 */
function evaluateOrderedTools(expected, actual, options) {
  let expectedIndex = 0;
  let actualIndex = 0;
  // First argument mismatch seen for the expected tool currently being
  // searched for; cleared as soon as that tool is matched.
  let pendingMismatch = null;

  while (expectedIndex < expected.length && actualIndex < actual.length) {
    const exp = expected[expectedIndex];
    const act = actual[actualIndex];

    if (exp.name === act.name) {
      const argsMatch =
        exp.arguments === void 0 ||
        options.argMatcher(exp.arguments, act.arguments || {});

      if (argsMatch) {
        pendingMismatch = null;
        expectedIndex++;
        actualIndex++;
        continue;
      }

      const mismatch = {
        score: 0.5,
        metadata: {
          rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,
          expected: exp.arguments,
          actual: act.arguments
        }
      };
      if (!options.allowExtras) {
        return mismatch;
      }
      // Extras are allowed: keep scanning, but remember the first mismatch so
      // it can be reported if the tool is never called correctly.
      if (pendingMismatch === null) {
        pendingMismatch = mismatch;
      }
      actualIndex++;
      continue;
    }

    if (options.allowExtras) {
      actualIndex++;
      continue;
    }

    return {
      score: 0,
      metadata: {
        rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`
      }
    };
  }

  if (expectedIndex < expected.length) {
    if (pendingMismatch !== null) {
      return pendingMismatch;
    }

    const missing = expected.slice(expectedIndex).map((t) => t.name);
    if (options.requireAllTools) {
      return {
        score: 0,
        metadata: {
          rationale: `Missing required tools in sequence: ${missing.join(", ")}`
        }
      };
    }

    const matchedCount = expectedIndex;
    const totalCount = expected.length;
    const score = totalCount > 0 ? matchedCount / totalCount : 1;
    return {
      score,
      metadata: {
        rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(", ")})`,
        matched: matchedCount,
        total: totalCount
      }
    };
  }

  if (!options.allowExtras && actualIndex < actual.length) {
    const extra = actual.slice(actualIndex).map((t) => t.name);
    return {
      score: 0,
      metadata: {
        rationale: `Unexpected extra tools: ${extra.join(", ")}`
      }
    };
  }

  return {
    score: 1,
    metadata: {
      rationale: "All tools called in expected order with correct arguments"
    }
  };
}
216
/**
 * Scores tool calls when the order of calls does not matter.
 *
 * Each expected tool is paired with the first not-yet-used actual call that
 * has the same name and (when the expectation specifies arguments) matching
 * arguments. Same-named calls whose arguments do not match are skipped.
 *
 * Outcomes:
 * - 0 when expected tools are missing and `requireAllTools` is true, or when
 *   unmatched calls exist and `allowExtras` is false;
 * - 1 when every expected tool was matched;
 * - matched/total otherwise (partial credit).
 *
 * Disallowing extras only zeroes the score when extra calls actually exist:
 * with `requireAllTools: false`, a missing tool alone yields partial credit
 * regardless of `allowExtras`, as it does in ordered mode.
 *
 * @param {Array<{name: string, arguments?: *}>} expected - Expected tools.
 * @param {Array<{name: string, arguments?: *}>} actual - Observed tool calls.
 * @param {{argMatcher: Function, allowExtras: boolean, requireAllTools: boolean}} options
 * @returns {{score: number, metadata: object}} Score with a rationale.
 */
function evaluateUnorderedTools(expected, actual, options) {
  const matchedExpected = /* @__PURE__ */ new Set();
  const matchedActual = /* @__PURE__ */ new Set();
  const issues = [];

  for (let i = 0; i < expected.length; i++) {
    const exp = expected[i];
    let found = false;

    for (let j = 0; j < actual.length; j++) {
      if (matchedActual.has(j)) continue; // each call satisfies one expectation
      const act = actual[j];
      if (exp.name !== act.name) continue;
      if (
        exp.arguments !== void 0 &&
        !options.argMatcher(exp.arguments, act.arguments || {})
      ) {
        continue;
      }
      matchedExpected.add(i);
      matchedActual.add(j);
      found = true;
      break;
    }

    if (!found) {
      // Distinguish "called with the wrong arguments" from "never called".
      const calledWithWrongArgs =
        exp.arguments !== void 0 && actual.some((a) => a.name === exp.name);
      issues.push(
        calledWithWrongArgs
          ? `Tool '${exp.name}' called but with incorrect arguments`
          : `Missing required tool: ${exp.name}`
      );
    }
  }

  const extraTools = actual
    .filter((_, i) => !matchedActual.has(i))
    .map((t) => t.name);
  const hasDisallowedExtras = !options.allowExtras && extraTools.length > 0;
  if (hasDisallowedExtras) {
    issues.push(`Unexpected extra tools: ${extraTools.join(", ")}`);
  }

  const expectedMatched = matchedExpected.size;
  const expectedTotal = expected.length;
  const hasMissing = expectedMatched < expectedTotal;

  if ((hasMissing && options.requireAllTools) || hasDisallowedExtras) {
    return {
      score: 0,
      metadata: {
        rationale: issues.join("; ")
      }
    };
  }

  const score = expectedTotal > 0 ? expectedMatched / expectedTotal : 1;
  if (score === 1) {
    const extraInfo =
      extraTools.length > 0 ? ` (plus extra: ${extraTools.join(", ")})` : "";
    return {
      score: 1,
      metadata: {
        rationale: `All expected tools were called${extraInfo}`
      }
    };
  }

  return {
    score,
    metadata: {
      rationale: issues.join("; "),
      matched: expectedMatched,
      total: expectedTotal
    }
  };
}
288
+
289
+ // src/index.ts
44
290
  expect.extend({
45
291
  /**
46
292
  * Evaluates a language model output against an expected answer using a scoring function.
47
293
  *
48
294
  * @param expected - The expected (ground truth) answer
49
295
  * @param taskFn - Async function that processes the input and returns the model output
296
+ * Can return either a string or TaskResult object with result and optional toolCalls
50
297
  * @param scoreFn - Function that evaluates the model output against the expected answer
51
298
  * @param threshold - Minimum acceptable score (0-1), defaults to 1.0
52
299
  *
@@ -56,8 +303,12 @@ expect.extend({
56
303
  * expect("What is the capital of France?").toEval(
57
304
  * "Paris",
58
305
  * async (input) => {
59
- * // Query LLM here
60
- * return "Paris";
306
+ * const response = await queryLLM(input);
307
+ * // Recommended: return TaskResult
308
+ * return {
309
+ * result: response.text,
310
+ * toolCalls: response.toolCalls || []
311
+ * };
61
312
  * },
62
313
  * checkFactuality,
63
314
  * 0.8
@@ -65,12 +316,15 @@ expect.extend({
65
316
  * });
66
317
  * ```
67
318
  */
319
+ // TODO: this needs to support true extensibility with Eval scorers
68
320
  toEval: function toEval(input, expected, taskFn, scoreFn, threshold = 1) {
69
321
  return __async(this, null, function* () {
70
322
  var _a;
71
323
  const { isNot } = this;
72
- const output = yield taskFn(input);
73
- let result = scoreFn({ input, expected, output });
324
+ const taskOutput = yield taskFn(input);
325
+ const output = typeof taskOutput === "string" ? taskOutput : taskOutput.result;
326
+ const toolCalls = typeof taskOutput === "object" ? taskOutput.toolCalls : void 0;
327
+ let result = scoreFn({ input, expected, output, toolCalls });
74
328
  if (result instanceof Promise) {
75
329
  result = yield result;
76
330
  }
@@ -93,17 +347,20 @@ function describeEval(name, {
93
347
  }) {
94
348
  return describe(name, () => __async(null, null, function* () {
95
349
  const testFn = skipIf ? test.skipIf(skipIf()) : test;
96
- for (const { input, expected } of yield data()) {
350
+ for (const _a of yield data()) {
351
+ const _b = _a, { input } = _b, params = __objRest(_b, ["input"]);
97
352
  testFn(
98
353
  input,
99
354
  {
100
355
  timeout
101
356
  },
102
357
  (_0) => __async(null, [_0], function* ({ task: testTask }) {
103
- const output = yield task(input);
358
+ const taskOutput = yield task(input);
359
+ const output = typeof taskOutput === "string" ? taskOutput : taskOutput.result;
360
+ const toolCalls = typeof taskOutput === "object" ? taskOutput.toolCalls : void 0;
104
361
  const scores = yield Promise.all(
105
362
  scorers.map((scorer) => {
106
- const result = scorer({ input, expected, output });
363
+ const result = scorer(__spreadProps(__spreadValues({ input }, params), { output, toolCalls }));
107
364
  if (result instanceof Promise) {
108
365
  return result;
109
366
  }
@@ -114,13 +371,13 @@ function describeEval(name, {
114
371
  name: scorers[i].name
115
372
  }));
116
373
  const avgScore = scores.reduce((acc, s) => {
117
- var _a;
118
- return acc + ((_a = s.score) != null ? _a : 0);
374
+ var _a2;
375
+ return acc + ((_a2 = s.score) != null ? _a2 : 0);
119
376
  }, 0) / scores.length;
120
- testTask.meta.eval = {
377
+ testTask.meta.eval = __spreadValues({
121
378
  scores: scoresWithName,
122
379
  avgScore
123
- };
380
+ }, toolCalls && { toolCalls });
124
381
  if (threshold) {
125
382
  assert(
126
383
  avgScore >= threshold,
@@ -181,6 +438,7 @@ function wrapText(text, width = 80) {
181
438
  return lines.join("\n");
182
439
  }
183
440
  export {
441
+ ToolCallScorer,
184
442
  describeEval,
185
443
  formatScores,
186
444
  wrapText
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { assert, describe, expect, test } from \"vitest\";\nimport \"vitest\";\n\nexport type TaskFn = (input: string) => Promise<string>;\n\nexport type Score = {\n score: number | null;\n metadata?: {\n rationale?: string;\n output?: string;\n };\n};\n\nexport type ScoreFn = (opts: {\n input: string;\n output: string;\n expected?: string;\n}) => Promise<Score> | Score;\n\nexport type ToEval<R = unknown> = (\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold?: number,\n) => Promise<R>;\n\nexport interface EvalMatchers<R = unknown> {\n toEval: ToEval<R>;\n}\n\ndeclare module \"vitest\" {\n interface Assertion<T = any> extends EvalMatchers<T> {}\n interface AsymmetricMatchersContaining extends EvalMatchers {}\n\n interface TaskMeta {\n eval?: {\n scores: (Score & { name: string })[];\n avgScore: number;\n };\n }\n}\n\nexpect.extend({\n /**\n * Evaluates a language model output against an expected answer using a scoring function.\n *\n * @param expected - The expected (ground truth) answer\n * @param taskFn - Async function that processes the input and returns the model output\n * @param scoreFn - Function that evaluates the model output against the expected answer\n * @param threshold - Minimum acceptable score (0-1), defaults to 1.0\n *\n * @example\n * ```javascript\n * test(\"checks capital of France\", async () => {\n * expect(\"What is the capital of France?\").toEval(\n * \"Paris\",\n * async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * checkFactuality,\n * 0.8\n * );\n * });\n * ```\n */\n toEval: async function toEval(\n input: string,\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold = 1.0,\n ) {\n const { isNot } = this;\n\n const output = await taskFn(input);\n\n let result = scoreFn({ input, expected, output });\n if (result instanceof Promise) {\n result = await result;\n }\n\n return {\n pass: (result.score ?? 
0) >= threshold,\n message: () => formatScores([{ ...result, name: scoreFn.name }]),\n };\n },\n});\n\n/**\n * Creates a test suite for evaluating language model outputs.\n *\n * @param name - The name of the test suite\n * @param options - Configuration options\n * @param options.data - Async function that returns an array of test cases with input and expected values\n * @param options.task - Function that processes the input and returns the model output\n * @param options.skipIf - Optional function that determines if tests should be skipped\n * @param options.scorers - Array of scoring functions that evaluate model outputs\n * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0\n * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)\n *\n * @example\n * ```javascript\n * describeEval(\"capital cities test\", {\n * data: async () => [{\n * input: \"What is the capital of France?\",\n * expected: \"Paris\"\n * }],\n * task: async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * scorers: [checkFactuality],\n * threshold: 0.8\n * });\n * ```\n */\nexport function describeEval(\n name: string,\n {\n data,\n task,\n skipIf,\n scorers,\n threshold = 1.0,\n // increase default test timeout as 5s is usually not enough for\n // a single factuality check\n timeout = 60000,\n }: {\n data: () => Promise<{ input: string; expected: string }[]>;\n task: TaskFn;\n skipIf?: () => boolean;\n scorers: ScoreFn[];\n threshold?: number | null;\n timeout?: number;\n },\n) {\n return describe(name, async () => {\n const testFn = skipIf ? 
test.skipIf(skipIf()) : test;\n // TODO: should data just be a generator?\n for (const { input, expected } of await data()) {\n testFn(\n input,\n {\n timeout,\n },\n async ({ task: testTask }) => {\n const output = await task(input);\n\n const scores = await Promise.all(\n scorers.map((scorer) => {\n const result = scorer({ input, expected, output });\n if (result instanceof Promise) {\n return result;\n }\n return new Promise<Score>((resolve) => resolve(result));\n }),\n );\n const scoresWithName = scores.map((s, i) => ({\n ...s,\n name: scorers[i].name,\n }));\n\n const avgScore =\n scores.reduce((acc, s) => acc + (s.score ?? 0), 0) / scores.length;\n\n testTask.meta.eval = {\n scores: scoresWithName,\n avgScore,\n };\n\n if (threshold) {\n assert(\n avgScore >= threshold,\n `Score: ${avgScore} below threshold: ${threshold}\\n\\n## Output:\\n${wrapText(output)}\\n\\n${formatScores(\n scoresWithName,\n )}`,\n );\n }\n },\n );\n }\n });\n}\n\nexport function formatScores(scores: (Score & { name: string })[]) {\n return scores\n .sort((a, b) => (a.score ?? 0) - (b.score ?? 0))\n .map((s) => {\n const scoreLine = `# ${s.name || \"Unknown\"} [${(s.score ?? 0).toFixed(1)}]`;\n if (\n ((s.score ?? 0) < 1.0 && s.metadata?.rationale) ||\n s.metadata?.output\n ) {\n return `${scoreLine}${\n s.metadata?.rationale\n ? `\\n\\n## Rationale\\n\\n${wrapText(s.metadata.rationale)}`\n : \"\"\n }${s.metadata?.output ? 
`\\n\\n## Response\\n\\n${wrapText(s.metadata.output)}` : \"\"}`;\n }\n return scoreLine;\n })\n .join(\"\\n\\n\");\n}\n\n/**\n * Wraps text to fit within a specified width, breaking at word boundaries.\n *\n * @param text - The text to wrap\n * @param width - The maximum width in characters (default: 80)\n * @returns The wrapped text with line breaks\n *\n * @example\n * ```javascript\n * const wrapped = wrapText(\"This is a very long text that needs to be wrapped to fit within an 80 character width.\", 20);\n * console.log(wrapped);\n * // Output:\n * // This is a very\n * // long text that\n * // needs to be\n * // wrapped to fit\n * // within an 80\n * // character width.\n * ```\n */\nexport function wrapText(text: string, width = 80): string {\n if (!text || text.length <= width) {\n return text;\n }\n\n const words = text.split(/\\s+/);\n const lines: string[] = [];\n let currentLine = \"\";\n\n for (const word of words) {\n // If adding this word would exceed the width, start a new line\n if (currentLine.length + word.length + 1 > width) {\n lines.push(currentLine.trim());\n currentLine = word;\n } else {\n // Add the word to the current line\n currentLine += (currentLine ? 
\" \" : \"\") + word;\n }\n }\n\n // Add the last line if it's not empty\n if (currentLine) {\n lines.push(currentLine);\n }\n\n return lines.join(\"\\n\");\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,SAAS,QAAQ,UAAU,QAAQ,YAAY;AAC/C,OAAO;AAyCP,OAAO,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAwBZ,QAAQ,SAAe,OACrB,OACA,UACA,QACA,SACA,YAAY,GACZ;AAAA;AAxEJ;AAyEI,YAAM,EAAE,MAAM,IAAI;AAElB,YAAM,SAAS,MAAM,OAAO,KAAK;AAEjC,UAAI,SAAS,QAAQ,EAAE,OAAO,UAAU,OAAO,CAAC;AAChD,UAAI,kBAAkB,SAAS;AAC7B,iBAAS,MAAM;AAAA,MACjB;AAEA,aAAO;AAAA,QACL,QAAO,YAAO,UAAP,YAAgB,MAAM;AAAA,QAC7B,SAAS,MAAM,aAAa,CAAC,iCAAK,SAAL,EAAa,MAAM,QAAQ,KAAK,EAAC,CAAC;AAAA,MACjE;AAAA,IACF;AAAA;AACF,CAAC;AA8BM,SAAS,aACd,MACA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA;AAAA;AAAA,EAGZ,UAAU;AACZ,GAQA;AACA,SAAO,SAAS,MAAM,MAAY;AAChC,UAAM,SAAS,SAAS,KAAK,OAAO,OAAO,CAAC,IAAI;AAEhD,eAAW,EAAE,OAAO,SAAS,KAAK,MAAM,KAAK,GAAG;AAC9C;AAAA,QACE;AAAA,QACA;AAAA,UACE;AAAA,QACF;AAAA,QACA,CAAO,OAAuB,eAAvB,KAAuB,WAAvB,EAAE,MAAM,SAAS,GAAM;AAC5B,gBAAM,SAAS,MAAM,KAAK,KAAK;AAE/B,gBAAM,SAAS,MAAM,QAAQ;AAAA,YAC3B,QAAQ,IAAI,CAAC,WAAW;AACtB,oBAAM,SAAS,OAAO,EAAE,OAAO,UAAU,OAAO,CAAC;AACjD,kBAAI,kBAAkB,SAAS;AAC7B,uBAAO;AAAA,cACT;AACA,qBAAO,IAAI,QAAe,CAAC,YAAY,QAAQ,MAAM,CAAC;AAAA,YACxD,CAAC;AAAA,UACH;AACA,gBAAM,iBAAiB,OAAO,IAAI,CAAC,GAAG,MAAO,iCACxC,IADwC;AAAA,YAE3C,MAAM,QAAQ,CAAC,EAAE;AAAA,UACnB,EAAE;AAEF,gBAAM,WACJ,OAAO,OAAO,CAAC,KAAK,MAAG;AApKnC;AAoKsC,2BAAO,OAAE,UAAF,YAAW;AAAA,aAAI,CAAC,IAAI,OAAO;AAE9D,mBAAS,KAAK,OAAO;AAAA,YACnB,QAAQ;AAAA,YACR;AAAA,UACF;AAEA,cAAI,WAAW;AACb;AAAA,cACE,YAAY;AAAA,cACZ,UAAU,QAAQ,qBAAqB,SAAS;AAAA;AAAA;AAAA,EAAmB,SAAS,MAAM,CAAC;AAAA;AAAA,EAAO;AAAA,gBACxF;AAAA,cACF,CAAC;AAAA,YACH;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF,EAAC;AACH;AAEO,SAAS,aAAa,QAAsC;AACjE,SAAO,OACJ,KAAK,CAAC,GAAG,MAAG;AA3LjB;AA2LqB,oBAAE,UAAF,YAAW,OAAM,OAAE,UAAF,YAAW;AAAA,GAAE,EAC9C,IAAI,CAAC,MAAM;AA5LhB;AA6LM,UAAM,YAAY,KAAK,EAAE
,QAAQ,SAAS,OAAM,OAAE,UAAF,YAAW,GAAG,QAAQ,CAAC,CAAC;AACxE,UACI,OAAE,UAAF,YAAW,KAAK,OAAO,OAAE,aAAF,mBAAY,gBACrC,OAAE,aAAF,mBAAY,SACZ;AACA,aAAO,GAAG,SAAS,KACjB,OAAE,aAAF,mBAAY,aACR;AAAA;AAAA;AAAA;AAAA,EAAuB,SAAS,EAAE,SAAS,SAAS,CAAC,KACrD,EACN,KAAG,OAAE,aAAF,mBAAY,UAAS;AAAA;AAAA;AAAA;AAAA,EAAsB,SAAS,EAAE,SAAS,MAAM,CAAC,KAAK,EAAE;AAAA,IAClF;AACA,WAAO;AAAA,EACT,CAAC,EACA,KAAK,MAAM;AAChB;AAsBO,SAAS,SAAS,MAAc,QAAQ,IAAY;AACzD,MAAI,CAAC,QAAQ,KAAK,UAAU,OAAO;AACjC,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,KAAK,MAAM,KAAK;AAC9B,QAAM,QAAkB,CAAC;AACzB,MAAI,cAAc;AAElB,aAAW,QAAQ,OAAO;AAExB,QAAI,YAAY,SAAS,KAAK,SAAS,IAAI,OAAO;AAChD,YAAM,KAAK,YAAY,KAAK,CAAC;AAC7B,oBAAc;AAAA,IAChB,OAAO;AAEL,sBAAgB,cAAc,MAAM,MAAM;AAAA,IAC5C;AAAA,EACF;AAGA,MAAI,aAAa;AACf,UAAM,KAAK,WAAW;AAAA,EACxB;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;","names":[]}
1
+ {"version":3,"sources":["../src/index.ts","../src/scorers/toolCallScorer.ts"],"sourcesContent":["import { assert, describe, expect, test } from \"vitest\";\nimport \"vitest\";\n\n/**\n * Represents a tool/function call made during task execution.\n * Supports various LLM provider formats and use cases.\n */\nexport type ToolCall = {\n // Core fields (required for basic usage)\n name: string;\n arguments: Record<string, any>;\n\n // Result and timing\n result?: any;\n error?: {\n code?: string;\n message: string;\n details?: any;\n };\n timestamp?: number;\n duration_ms?: number;\n\n // Identification and correlation\n id?: string;\n parent_id?: string; // For nested/chained calls\n\n // Status tracking\n status?: \"pending\" | \"executing\" | \"completed\" | \"failed\" | \"cancelled\";\n\n // Provider-specific fields\n type?: \"function\" | \"retrieval\" | \"code_interpreter\" | \"web_search\" | string;\n\n // Additional metadata\n [key: string]: any; // Allow provider-specific fields\n};\n\nexport type TaskResult = {\n result: string;\n toolCalls?: ToolCall[];\n};\n\n/**\n * Task function that processes an input and returns either a string result\n * or a TaskResult object containing the result and any tool calls made.\n *\n * @param input - The input string to process\n * @returns Promise resolving to either a string or TaskResult object\n *\n * @example\n * // Simple tasks can just return a string\n * const simpleTask: TaskFn = async (input) => \"The answer is 42\";\n *\n * // Tasks that use tools should return TaskResult\n * const taskWithTools: TaskFn = async (input) => ({\n * result: \"The answer is 42\",\n * toolCalls: [{ name: \"calculate\", arguments: { expr: \"6*7\" }, result: 42 }]\n * });\n */\nexport type TaskFn = (input: string) => Promise<string | TaskResult>;\n\nexport type Score = {\n score: number | null;\n metadata?: {\n rationale?: string;\n output?: string;\n };\n};\n\nexport interface BaseScorerOptions {\n input: string;\n output: string;\n 
toolCalls?: ToolCall[];\n}\n\nexport type ScoreFn<TOptions extends BaseScorerOptions = BaseScorerOptions> = (\n opts: TOptions,\n) => Promise<Score> | Score;\n\nexport type ToEval<R = unknown> = (\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn<any>,\n threshold?: number,\n) => Promise<R>;\n\nexport interface EvalMatchers<R = unknown> {\n toEval: ToEval<R>;\n}\n\ndeclare module \"vitest\" {\n interface Assertion<T = any> extends EvalMatchers<T> {}\n interface AsymmetricMatchersContaining extends EvalMatchers {}\n\n interface TaskMeta {\n eval?: {\n scores: (Score & { name: string })[];\n avgScore: number;\n toolCalls?: ToolCall[];\n };\n }\n}\n\nexpect.extend({\n /**\n * Evaluates a language model output against an expected answer using a scoring function.\n *\n * @param expected - The expected (ground truth) answer\n * @param taskFn - Async function that processes the input and returns the model output\n * Can return either a string or TaskResult object with result and optional toolCalls\n * @param scoreFn - Function that evaluates the model output against the expected answer\n * @param threshold - Minimum acceptable score (0-1), defaults to 1.0\n *\n * @example\n * ```javascript\n * test(\"checks capital of France\", async () => {\n * expect(\"What is the capital of France?\").toEval(\n * \"Paris\",\n * async (input) => {\n * const response = await queryLLM(input);\n * // Recommended: return TaskResult\n * return {\n * result: response.text,\n * toolCalls: response.toolCalls || []\n * };\n * },\n * checkFactuality,\n * 0.8\n * );\n * });\n * ```\n */\n // TODO: this needs to be support true extensibility with Eval scorers\n toEval: async function toEval(\n input: string,\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn<any>,\n threshold = 1.0,\n ) {\n const { isNot } = this;\n\n const taskOutput = await taskFn(input);\n const output =\n typeof taskOutput === \"string\" ? 
taskOutput : taskOutput.result;\n const toolCalls =\n typeof taskOutput === \"object\" ? taskOutput.toolCalls : undefined;\n\n let result = scoreFn({ input, expected, output, toolCalls });\n if (result instanceof Promise) {\n result = await result;\n }\n\n return {\n pass: (result.score ?? 0) >= threshold,\n message: () => formatScores([{ ...result, name: scoreFn.name }]),\n };\n },\n});\n\n/**\n * Creates a test suite for evaluating language model outputs.\n *\n * @param name - The name of the test suite\n * @param options - Configuration options\n * @param options.data - Async function that returns an array of test cases with input and any additional fields\n * @param options.task - Function that processes the input and returns the model output\n * Can return either a string or TaskResult object with result and optional toolCalls\n * @param options.skipIf - Optional function that determines if tests should be skipped\n * @param options.scorers - Array of scoring functions that evaluate model outputs\n * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0\n * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)\n *\n * @example\n * ```javascript\n * // Recommended: TaskResult format with tool tracking\n * describeEval(\"capital cities test\", {\n * data: async () => [{\n * input: \"What is the capital of France?\",\n * expected: \"Paris\"\n * }],\n * task: async (input) => {\n * const response = await queryLLM(input);\n * return {\n * result: response.text,\n * toolCalls: response.toolCalls || []\n * };\n * },\n * scorers: [checkFactuality],\n * threshold: 0.8\n * });\n *\n * // Example with tool usage evaluation\n * describeEval(\"tool usage test\", {\n * data: async () => [{\n * input: \"Search for weather in Seattle\",\n * expectedTools: [{ name: \"weather_api\", arguments: { location: \"Seattle\" } }]\n * }],\n * task: async (input) => {\n * return {\n * result: \"The weather in Seattle is 65°F\",\n * 
toolCalls: [{\n * name: \"weather_api\",\n * arguments: { location: \"Seattle\" },\n * result: { temp: 65, condition: \"partly cloudy\" }\n * }]\n * };\n * },\n * scorers: [ToolCallScorer()],\n * threshold: 1.0\n * });\n * ```\n */\nexport function describeEval(\n name: string,\n {\n data,\n task,\n skipIf,\n scorers,\n threshold = 1.0,\n // increase default test timeout as 5s is usually not enough for\n // a single factuality check\n timeout = 60000,\n }: {\n data: () => Promise<Array<{ input: string } & Record<string, any>>>;\n task: TaskFn;\n skipIf?: () => boolean;\n scorers: ScoreFn<any>[];\n threshold?: number | null;\n timeout?: number;\n },\n) {\n return describe(name, async () => {\n const testFn = skipIf ? test.skipIf(skipIf()) : test;\n // TODO: should data just be a generator?\n for (const { input, ...params } of await data()) {\n testFn(\n input,\n {\n timeout,\n },\n async ({ task: testTask }) => {\n const taskOutput = await task(input);\n const output =\n typeof taskOutput === \"string\" ? taskOutput : taskOutput.result;\n const toolCalls =\n typeof taskOutput === \"object\" ? taskOutput.toolCalls : undefined;\n\n const scores = await Promise.all(\n scorers.map((scorer) => {\n const result = scorer({ input, ...params, output, toolCalls });\n if (result instanceof Promise) {\n return result;\n }\n return new Promise<Score>((resolve) => resolve(result));\n }),\n );\n const scoresWithName = scores.map((s, i) => ({\n ...s,\n name: scorers[i].name,\n }));\n\n const avgScore =\n scores.reduce((acc, s) => acc + (s.score ?? 
0), 0) / scores.length;\n\n testTask.meta.eval = {\n scores: scoresWithName,\n avgScore,\n ...(toolCalls && { toolCalls }),\n };\n\n if (threshold) {\n assert(\n avgScore >= threshold,\n `Score: ${avgScore} below threshold: ${threshold}\\n\\n## Output:\\n${wrapText(output)}\\n\\n${formatScores(\n scoresWithName,\n )}`,\n );\n }\n },\n );\n }\n });\n}\n\nexport function formatScores(scores: (Score & { name: string })[]) {\n return scores\n .sort((a, b) => (a.score ?? 0) - (b.score ?? 0))\n .map((s) => {\n const scoreLine = `# ${s.name || \"Unknown\"} [${(s.score ?? 0).toFixed(1)}]`;\n if (\n ((s.score ?? 0) < 1.0 && s.metadata?.rationale) ||\n s.metadata?.output\n ) {\n return `${scoreLine}${\n s.metadata?.rationale\n ? `\\n\\n## Rationale\\n\\n${wrapText(s.metadata.rationale)}`\n : \"\"\n }${s.metadata?.output ? `\\n\\n## Response\\n\\n${wrapText(s.metadata.output)}` : \"\"}`;\n }\n return scoreLine;\n })\n .join(\"\\n\\n\");\n}\n\n/**\n * Wraps text to fit within a specified width, breaking at word boundaries.\n *\n * @param text - The text to wrap\n * @param width - The maximum width in characters (default: 80)\n * @returns The wrapped text with line breaks\n *\n * @example\n * ```javascript\n * const wrapped = wrapText(\"This is a very long text that needs to be wrapped to fit within an 80 character width.\", 20);\n * console.log(wrapped);\n * // Output:\n * // This is a very\n * // long text that\n * // needs to be\n * // wrapped to fit\n * // within an 80\n * // character width.\n * ```\n */\nexport function wrapText(text: string, width = 80): string {\n if (!text || text.length <= width) {\n return text;\n }\n\n const words = text.split(/\\s+/);\n const lines: string[] = [];\n let currentLine = \"\";\n\n for (const word of words) {\n // If adding this word would exceed the width, start a new line\n if (currentLine.length + word.length + 1 > width) {\n lines.push(currentLine.trim());\n currentLine = word;\n } else {\n // Add the word to the current line\n 
currentLine += (currentLine ? \" \" : \"\") + word;\n }\n }\n\n // Add the last line if it's not empty\n if (currentLine) {\n lines.push(currentLine);\n }\n\n return lines.join(\"\\n\");\n}\n\n// Export built-in scorers\nexport { ToolCallScorer, type ToolCallScorerOptions } from \"./scorers\";\n","import type { ScoreFn, BaseScorerOptions, ToolCall } from \"../index\";\n\nexport interface ToolCallScorerOptions extends BaseScorerOptions {\n // Expected tools are now defined in the test data\n expectedTools?: Array<{\n name: string;\n arguments?: any;\n }>;\n}\n\nexport interface ToolCallScorerConfig {\n /**\n * Whether tools must be called in the exact order specified\n * @default false\n */\n ordered?: boolean;\n\n /**\n * Whether all expected tools must be called for a passing score\n * When false: gives partial credit based on tools matched\n * @default true\n */\n requireAll?: boolean;\n\n /**\n * Whether to allow additional tool calls beyond those expected\n * @default true\n */\n allowExtras?: boolean;\n\n /**\n * How to match tool arguments/parameters\n * - \"strict\": Exact equality required (default)\n * - \"fuzzy\": Case-insensitive, subset matching, numeric tolerance\n * - Custom function: Your own comparison logic\n * @default \"strict\"\n */\n params?: \"strict\" | \"fuzzy\" | ((expected: any, actual: any) => boolean);\n}\n\n/**\n * Default fuzzy matching for arguments\n */\nfunction fuzzyMatch(expected: any, actual: any): boolean {\n // Null/undefined handling\n if (expected == null || actual == null) {\n return expected === actual;\n }\n\n // For objects, check if actual has all expected properties\n if (\n typeof expected === \"object\" &&\n typeof actual === \"object\" &&\n !Array.isArray(expected)\n ) {\n return Object.entries(expected).every(\n ([key, value]) => key in actual && fuzzyMatch(value, actual[key]),\n );\n }\n\n // For strings, case-insensitive substring match\n if (typeof expected === \"string\" && typeof actual === \"string\") {\n 
return actual.toLowerCase().includes(expected.toLowerCase());\n }\n\n // For numbers, allow small differences (0.1% or 0.001, whichever is larger)\n if (typeof expected === \"number\" && typeof actual === \"number\") {\n const tolerance = Math.max(Math.abs(expected) * 0.001, 0.001);\n return Math.abs(expected - actual) <= tolerance;\n }\n\n // For arrays, check if all expected items exist in actual (order doesn't matter in fuzzy mode)\n if (Array.isArray(expected) && Array.isArray(actual)) {\n return expected.every((expItem) =>\n actual.some((actItem) => fuzzyMatch(expItem, actItem)),\n );\n }\n\n // Otherwise strict equality\n return expected === actual;\n}\n\n/**\n * Strict equality comparison (deep equals)\n */\nfunction strictEquals(expected: any, actual: any): boolean {\n // Handle primitive types and null/undefined\n if (expected === actual) return true;\n if (expected == null || actual == null) return false;\n\n // Must be same type\n if (typeof expected !== typeof actual) return false;\n\n // Handle arrays\n if (Array.isArray(expected)) {\n if (!Array.isArray(actual)) return false;\n if (expected.length !== actual.length) return false;\n return expected.every((item, i) => strictEquals(item, actual[i]));\n }\n\n // Handle objects\n if (typeof expected === \"object\") {\n const expectedKeys = Object.keys(expected).sort();\n const actualKeys = Object.keys(actual).sort();\n\n // Must have same keys\n if (expectedKeys.length !== actualKeys.length) return false;\n if (!expectedKeys.every((key, i) => key === actualKeys[i])) return false;\n\n // All values must match\n return expectedKeys.every((key) =>\n strictEquals(expected[key], actual[key]),\n );\n }\n\n // Primitive types\n return expected === actual;\n}\n\n/**\n * A configurable scorer for evaluating tool usage in LLM responses.\n *\n * The test data defines WHAT tools/arguments are expected,\n * while this scorer defines HOW to evaluate them.\n *\n * @param config - Configuration options for the scorer\n * 
@param config.ordered - Require exact order of tool calls\n * @param config.requireAll - Require all expected tools (vs partial credit)\n * @param config.allowExtras - Allow additional tool calls\n * @param config.params - How to match parameters: \"strict\", \"fuzzy\", or custom function\n *\n * @example\n * // Default: strict params, any order\n * describeEval(\"search test\", {\n * data: async () => [{\n * input: \"Find restaurants\",\n * expectedTools: [\n * { name: \"search\", arguments: { type: \"restaurant\" } },\n * { name: \"filter\" }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer()]\n * });\n *\n * @example\n * // Strict order and parameters\n * describeEval(\"payment flow\", {\n * data: async () => [{\n * input: \"Process payment\",\n * expectedTools: [\n * { name: \"validate\", arguments: { amount: 100 } },\n * { name: \"charge\", arguments: { amount: 100, method: \"card\" } }\n * ]\n * }],\n * task: myTask,\n * scorers: [ToolCallScorer({ ordered: true, params: \"strict\" })]\n * });\n */\nexport function ToolCallScorer(\n config: ToolCallScorerConfig = {},\n): ScoreFn<ToolCallScorerOptions> {\n const {\n ordered = false,\n requireAll = true,\n allowExtras = true,\n params = \"strict\",\n } = config;\n\n // Determine the argument matcher\n const argMatcher =\n typeof params === \"function\"\n ? params\n : params === \"strict\"\n ? 
strictEquals\n : fuzzyMatch;\n\n return async (opts) => {\n const expectedTools = opts.expectedTools || [];\n const actualCalls = opts.toolCalls || [];\n\n // No expectations means pass\n if (expectedTools.length === 0) {\n return {\n score: 1.0,\n metadata: {\n rationale: \"No tool calls expected\",\n },\n };\n }\n\n // No actual calls when we expected some\n if (actualCalls.length === 0) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Expected ${expectedTools.length} tool(s) but none were called`,\n },\n };\n }\n\n if (ordered) {\n return evaluateOrderedTools(expectedTools, actualCalls, {\n argMatcher,\n allowExtras,\n requireAllTools: requireAll,\n });\n }\n\n return evaluateUnorderedTools(expectedTools, actualCalls, {\n argMatcher,\n requireAllTools: requireAll,\n allowExtras,\n });\n };\n}\n\n/**\n * Evaluate tools that must be called in a specific order\n */\nfunction evaluateOrderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n allowExtras: boolean;\n requireAllTools: boolean;\n },\n) {\n let expectedIndex = 0;\n let actualIndex = 0;\n\n // Match expected tools in order\n while (expectedIndex < expected.length && actualIndex < actual.length) {\n const exp = expected[expectedIndex];\n const act = actual[actualIndex];\n\n if (exp.name === act.name) {\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments || {},\n );\n if (!argsMatch) {\n return {\n score: 0.5,\n metadata: {\n rationale: `Tool '${exp.name}' called with incorrect arguments at position ${expectedIndex + 1}`,\n expected: exp.arguments,\n actual: act.arguments,\n },\n };\n }\n }\n expectedIndex++;\n actualIndex++;\n } else if (options.allowExtras) {\n // Skip extra tool\n actualIndex++;\n } else {\n // Wrong tool in sequence when extra tools not allowed\n return {\n score: 0.0,\n metadata: {\n 
rationale: `Expected '${exp.name}' at position ${expectedIndex + 1} but found '${act.name}'`,\n },\n };\n }\n }\n\n // Check if all expected tools were matched\n if (expectedIndex < expected.length) {\n const missing = expected.slice(expectedIndex).map((t) => t.name);\n\n if (options.requireAllTools) {\n return {\n score: 0.0,\n metadata: {\n rationale: `Missing required tools in sequence: ${missing.join(\", \")}`,\n },\n };\n }\n\n // Partial credit when requireAllTools is false\n const matchedCount = expectedIndex;\n const totalCount = expected.length;\n const score = totalCount > 0 ? matchedCount / totalCount : 1.0;\n\n return {\n score,\n metadata: {\n rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(\", \")})`,\n matched: matchedCount,\n total: totalCount,\n },\n };\n }\n\n // Check for extra tools at the end if not allowed\n if (!options.allowExtras && actualIndex < actual.length) {\n const extra = actual.slice(actualIndex).map((t) => t.name);\n return {\n score: 0.0,\n metadata: {\n rationale: `Unexpected extra tools: ${extra.join(\", \")}`,\n },\n };\n }\n\n return {\n score: 1.0,\n metadata: {\n rationale: \"All tools called in expected order with correct arguments\",\n },\n };\n}\n\n/**\n * Evaluate tools that can be called in any order\n */\nfunction evaluateUnorderedTools(\n expected: Array<{ name: string; arguments?: any }>,\n actual: ToolCall[],\n options: {\n argMatcher: (expected: any, actual: any) => boolean;\n requireAllTools: boolean;\n allowExtras: boolean;\n },\n) {\n const matchedExpected = new Set<number>();\n const matchedActual = new Set<number>();\n const issues: string[] = [];\n\n // Try to match each expected tool\n for (let i = 0; i < expected.length; i++) {\n const exp = expected[i];\n let found = false;\n\n // Look for a matching actual tool call\n for (let j = 0; j < actual.length; j++) {\n if (matchedActual.has(j)) continue;\n\n const act = actual[j];\n if (exp.name === 
act.name) {\n // Check arguments if specified\n if (exp.arguments !== undefined) {\n const argsMatch = options.argMatcher(\n exp.arguments,\n act.arguments || {},\n );\n if (!argsMatch) {\n continue; // Try to find another call with matching args\n }\n }\n\n // Found a match\n matchedExpected.add(i);\n matchedActual.add(j);\n found = true;\n break;\n }\n }\n\n if (!found) {\n if (exp.arguments !== undefined) {\n // Check if tool was called but with wrong args\n const wrongArgsCalls = actual.filter((a) => a.name === exp.name);\n if (wrongArgsCalls.length > 0) {\n issues.push(`Tool '${exp.name}' called but with incorrect arguments`);\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n } else {\n issues.push(`Missing required tool: ${exp.name}`);\n }\n }\n }\n\n // Check for extra tools\n const extraTools = actual\n .filter((_, i) => !matchedActual.has(i))\n .map((t) => t.name);\n\n if (!options.allowExtras && extraTools.length > 0) {\n issues.push(`Unexpected extra tools: ${extraTools.join(\", \")}`);\n }\n\n // Calculate score\n const expectedMatched = matchedExpected.size;\n const expectedTotal = expected.length;\n\n // If we have any critical issues (wrong tools, missing tools when required, or extra tools when not allowed)\n if (issues.length > 0 && (options.requireAllTools || !options.allowExtras)) {\n return {\n score: 0.0,\n metadata: {\n rationale: issues.join(\"; \"),\n },\n };\n }\n\n // Partial credit when not all required\n const score = expectedTotal > 0 ? expectedMatched / expectedTotal : 1.0;\n\n if (score === 1.0) {\n const extraInfo =\n extraTools.length > 0 ? 
` (plus extra: ${extraTools.join(\", \")})` : \"\";\n return {\n score: 1.0,\n metadata: {\n rationale: `All expected tools were called${extraInfo}`,\n },\n };\n }\n\n return {\n score,\n metadata: {\n rationale: issues.join(\"; \"),\n matched: expectedMatched,\n total: expectedTotal,\n },\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,SAAS,QAAQ,UAAU,QAAQ,YAAY;AAC/C,OAAO;;;AC0CP,SAAS,WAAW,UAAe,QAAsB;AAEvD,MAAI,YAAY,QAAQ,UAAU,MAAM;AACtC,WAAO,aAAa;AAAA,EACtB;AAGA,MACE,OAAO,aAAa,YACpB,OAAO,WAAW,YAClB,CAAC,MAAM,QAAQ,QAAQ,GACvB;AACA,WAAO,OAAO,QAAQ,QAAQ,EAAE;AAAA,MAC9B,CAAC,CAAC,KAAK,KAAK,MAAM,OAAO,UAAU,WAAW,OAAO,OAAO,GAAG,CAAC;AAAA,IAClE;AAAA,EACF;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,OAAO,YAAY,EAAE,SAAS,SAAS,YAAY,CAAC;AAAA,EAC7D;AAGA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,UAAM,YAAY,KAAK,IAAI,KAAK,IAAI,QAAQ,IAAI,MAAO,IAAK;AAC5D,WAAO,KAAK,IAAI,WAAW,MAAM,KAAK;AAAA,EACxC;AAGA,MAAI,MAAM,QAAQ,QAAQ,KAAK,MAAM,QAAQ,MAAM,GAAG;AACpD,WAAO,SAAS;AAAA,MAAM,CAAC,YACrB,OAAO,KAAK,CAAC,YAAY,WAAW,SAAS,OAAO,CAAC;AAAA,IACvD;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AAKA,SAAS,aAAa,UAAe,QAAsB;AAEzD,MAAI,aAAa,OAAQ,QAAO;AAChC,MAAI,YAAY,QAAQ,UAAU,KAAM,QAAO;AAG/C,MAAI,OAAO,aAAa,OAAO,OAAQ,QAAO;AAG9C,MAAI,MAAM,QAAQ,QAAQ,GAAG;AAC3B,QAAI,CAAC,MAAM,QAAQ,MAAM,EAAG,QAAO;AACnC,QAAI,SAAS,WAAW,OAAO,OAAQ,QAAO;AAC9C,WAAO,SAAS,MAAM,CAAC,MAAM,MAAM,aAAa,MAAM,OAAO,CAAC,CAAC,CAAC;AAAA,EAClE;AAGA,MAAI,OAAO,aAAa,UAAU;AAChC,UAAM,eAAe,OAAO,KAAK,QAAQ,EAAE,KAAK;AAChD,UAAM,aAAa,OAAO,KAAK,MAAM,EAAE,KAAK;AAG5C,QAAI,aAAa,WAAW,WAAW,OAAQ,QAAO;AACtD,QAAI,CAAC,aAAa,MAAM,CAAC,KAAK,MAAM,QAAQ,WAAW,CAAC,CAAC,EAAG,QAAO;AAGnE,WAAO,aAAa;AAAA,MAAM,CAAC,QACzB,aAAa,SAAS,GAAG,GAAG,OAAO,GAAG,CAAC;AAAA,IACzC;AAAA,EACF;AAGA,SAAO,aAAa;AACtB;AA0CO,SAAS,eACd,SAA+B,CAAC,GACA;AAChC,QAAM;AAAA,IACJ,UAAU;AAAA,IACV,aAAa;AAAA,IACb,cAAc;AAAA,IACd,SAAS;AAAA,EACX,IAAI;AAGJ,QAAM,aACJ,OAAO,WAAW,aACd,SACA,WAAW,WACT,eACA;AAER,SAAO,CAAO,SAAS;AACrB,UAAM,gBAAgB,KAAK,iBAAiB,CAAC;AAC7C,UAAM,cAAc,KAAK,aAAa,CAAC;AAGvC,QAAI,cAA
c,WAAW,GAAG;AAC9B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW;AAAA,QACb;AAAA,MACF;AAAA,IACF;AAGA,QAAI,YAAY,WAAW,GAAG;AAC5B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,YAAY,cAAc,MAAM;AAAA,QAC7C;AAAA,MACF;AAAA,IACF;AAEA,QAAI,SAAS;AACX,aAAO,qBAAqB,eAAe,aAAa;AAAA,QACtD;AAAA,QACA;AAAA,QACA,iBAAiB;AAAA,MACnB,CAAC;AAAA,IACH;AAEA,WAAO,uBAAuB,eAAe,aAAa;AAAA,MACxD;AAAA,MACA,iBAAiB;AAAA,MACjB;AAAA,IACF,CAAC;AAAA,EACH;AACF;AAKA,SAAS,qBACP,UACA,QACA,SAKA;AACA,MAAI,gBAAgB;AACpB,MAAI,cAAc;AAGlB,SAAO,gBAAgB,SAAS,UAAU,cAAc,OAAO,QAAQ;AACrE,UAAM,MAAM,SAAS,aAAa;AAClC,UAAM,MAAM,OAAO,WAAW;AAE9B,QAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,UAAI,IAAI,cAAc,QAAW;AAC/B,cAAM,YAAY,QAAQ;AAAA,UACxB,IAAI;AAAA,UACJ,IAAI,aAAa,CAAC;AAAA,QACpB;AACA,YAAI,CAAC,WAAW;AACd,iBAAO;AAAA,YACL,OAAO;AAAA,YACP,UAAU;AAAA,cACR,WAAW,SAAS,IAAI,IAAI,iDAAiD,gBAAgB,CAAC;AAAA,cAC9F,UAAU,IAAI;AAAA,cACd,QAAQ,IAAI;AAAA,YACd;AAAA,UACF;AAAA,QACF;AAAA,MACF;AACA;AACA;AAAA,IACF,WAAW,QAAQ,aAAa;AAE9B;AAAA,IACF,OAAO;AAEL,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,aAAa,IAAI,IAAI,iBAAiB,gBAAgB,CAAC,eAAe,IAAI,IAAI;AAAA,QAC3F;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,MAAI,gBAAgB,SAAS,QAAQ;AACnC,UAAM,UAAU,SAAS,MAAM,aAAa,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AAE/D,QAAI,QAAQ,iBAAiB;AAC3B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,uCAAuC,QAAQ,KAAK,IAAI,CAAC;AAAA,QACtE;AAAA,MACF;AAAA,IACF;AAGA,UAAM,eAAe;AACrB,UAAM,aAAa,SAAS;AAC5B,UAAM,QAAQ,aAAa,IAAI,eAAe,aAAa;AAE3D,WAAO;AAAA,MACL;AAAA,MACA,UAAU;AAAA,QACR,WAAW,kBAAkB,YAAY,IAAI,UAAU,oCAAoC,QAAQ,KAAK,IAAI,CAAC;AAAA,QAC7G,SAAS;AAAA,QACT,OAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAGA,MAAI,CAAC,QAAQ,eAAe,cAAc,OAAO,QAAQ;AACvD,UAAM,QAAQ,OAAO,MAAM,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI;AACzD,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,2BAA2B,MAAM,KAAK,IAAI,CAAC;AAAA,MACxD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO;AAAA,IACP,UAAU;AAAA,MACR,WAAW;AAAA,IACb;AAAA,EACF;AACF;AAKA,SAAS,uBACP,UACA,QACA,SAKA;AACA,QAAM,kBAAkB,oBAAI,IAAY;AACxC,QAAM,gBAAgB,oBAAI,IAAY;AACtC,QAAM,SAAmB,CAAC;AAG1B,WAAS,IAAI,GAAG,IAAI,SAA
S,QAAQ,KAAK;AACxC,UAAM,MAAM,SAAS,CAAC;AACtB,QAAI,QAAQ;AAGZ,aAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,UAAI,cAAc,IAAI,CAAC,EAAG;AAE1B,YAAM,MAAM,OAAO,CAAC;AACpB,UAAI,IAAI,SAAS,IAAI,MAAM;AAEzB,YAAI,IAAI,cAAc,QAAW;AAC/B,gBAAM,YAAY,QAAQ;AAAA,YACxB,IAAI;AAAA,YACJ,IAAI,aAAa,CAAC;AAAA,UACpB;AACA,cAAI,CAAC,WAAW;AACd;AAAA,UACF;AAAA,QACF;AAGA,wBAAgB,IAAI,CAAC;AACrB,sBAAc,IAAI,CAAC;AACnB,gBAAQ;AACR;AAAA,MACF;AAAA,IACF;AAEA,QAAI,CAAC,OAAO;AACV,UAAI,IAAI,cAAc,QAAW;AAE/B,cAAM,iBAAiB,OAAO,OAAO,CAAC,MAAM,EAAE,SAAS,IAAI,IAAI;AAC/D,YAAI,eAAe,SAAS,GAAG;AAC7B,iBAAO,KAAK,SAAS,IAAI,IAAI,uCAAuC;AAAA,QACtE,OAAO;AACL,iBAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,QAClD;AAAA,MACF,OAAO;AACL,eAAO,KAAK,0BAA0B,IAAI,IAAI,EAAE;AAAA,MAClD;AAAA,IACF;AAAA,EACF;AAGA,QAAM,aAAa,OAChB,OAAO,CAAC,GAAG,MAAM,CAAC,cAAc,IAAI,CAAC,CAAC,EACtC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,MAAI,CAAC,QAAQ,eAAe,WAAW,SAAS,GAAG;AACjD,WAAO,KAAK,2BAA2B,WAAW,KAAK,IAAI,CAAC,EAAE;AAAA,EAChE;AAGA,QAAM,kBAAkB,gBAAgB;AACxC,QAAM,gBAAgB,SAAS;AAG/B,MAAI,OAAO,SAAS,MAAM,QAAQ,mBAAmB,CAAC,QAAQ,cAAc;AAC1E,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAGA,QAAM,QAAQ,gBAAgB,IAAI,kBAAkB,gBAAgB;AAEpE,MAAI,UAAU,GAAK;AACjB,UAAM,YACJ,WAAW,SAAS,IAAI,iBAAiB,WAAW,KAAK,IAAI,CAAC,MAAM;AACtE,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,iCAAiC,SAAS;AAAA,MACvD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,UAAU;AAAA,MACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC3B,SAAS;AAAA,MACT,OAAO;AAAA,IACT;AAAA,EACF;AACF;;;ADlUA,OAAO,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA8BZ,QAAQ,SAAe,OACrB,OACA,UACA,QACA,SACA,YAAY,GACZ;AAAA;AA1IJ;AA2II,YAAM,EAAE,MAAM,IAAI;AAElB,YAAM,aAAa,MAAM,OAAO,KAAK;AACrC,YAAM,SACJ,OAAO,eAAe,WAAW,aAAa,WAAW;AAC3D,YAAM,YACJ,OAAO,eAAe,WAAW,WAAW,YAAY;AAE1D,UAAI,SAAS,QAAQ,EAAE,OAAO,UAAU,QAAQ,UAAU,CAAC;AAC3D,UAAI,kBAAkB,SAAS;AAC7B,iBAAS,MAAM;AAAA,MACjB;AAEA,aAAO;AAAA,QACL,QAAO,YAAO,UAAP,YAAgB,MAAM;AAAA,QAC7B,SAA
S,MAAM,aAAa,CAAC,iCAAK,SAAL,EAAa,MAAM,QAAQ,KAAK,EAAC,CAAC;AAAA,MACjE;AAAA,IACF;AAAA;AACF,CAAC;AAuDM,SAAS,aACd,MACA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA;AAAA;AAAA,EAGZ,UAAU;AACZ,GAQA;AACA,SAAO,SAAS,MAAM,MAAY;AAChC,UAAM,SAAS,SAAS,KAAK,OAAO,OAAO,CAAC,IAAI;AAEhD,eAAW,MAAwB,MAAM,KAAK,GAAG;AAA5C,qBAAQ,QA3OjB,IA2OS,IAAkB,mBAAlB,IAAkB,CAAV;AACX;AAAA,QACE;AAAA,QACA;AAAA,UACE;AAAA,QACF;AAAA,QACA,CAAO,OAAuB,eAAvB,KAAuB,WAAvB,EAAE,MAAM,SAAS,GAAM;AAC5B,gBAAM,aAAa,MAAM,KAAK,KAAK;AACnC,gBAAM,SACJ,OAAO,eAAe,WAAW,aAAa,WAAW;AAC3D,gBAAM,YACJ,OAAO,eAAe,WAAW,WAAW,YAAY;AAE1D,gBAAM,SAAS,MAAM,QAAQ;AAAA,YAC3B,QAAQ,IAAI,CAAC,WAAW;AACtB,oBAAM,SAAS,OAAO,+BAAE,SAAU,SAAZ,EAAoB,QAAQ,UAAU,EAAC;AAC7D,kBAAI,kBAAkB,SAAS;AAC7B,uBAAO;AAAA,cACT;AACA,qBAAO,IAAI,QAAe,CAAC,YAAY,QAAQ,MAAM,CAAC;AAAA,YACxD,CAAC;AAAA,UACH;AACA,gBAAM,iBAAiB,OAAO,IAAI,CAAC,GAAG,MAAO,iCACxC,IADwC;AAAA,YAE3C,MAAM,QAAQ,CAAC,EAAE;AAAA,UACnB,EAAE;AAEF,gBAAM,WACJ,OAAO,OAAO,CAAC,KAAK,MAAG;AAvQnC,gBAAAA;AAuQsC,2BAAOA,MAAA,EAAE,UAAF,OAAAA,MAAW;AAAA,aAAI,CAAC,IAAI,OAAO;AAE9D,mBAAS,KAAK,OAAO;AAAA,YACnB,QAAQ;AAAA,YACR;AAAA,aACI,aAAa,EAAE,UAAU;AAG/B,cAAI,WAAW;AACb;AAAA,cACE,YAAY;AAAA,cACZ,UAAU,QAAQ,qBAAqB,SAAS;AAAA;AAAA;AAAA,EAAmB,SAAS,MAAM,CAAC;AAAA;AAAA,EAAO;AAAA,gBACxF;AAAA,cACF,CAAC;AAAA,YACH;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF,EAAC;AACH;AAEO,SAAS,aAAa,QAAsC;AACjE,SAAO,OACJ,KAAK,CAAC,GAAG,MAAG;AA/RjB;AA+RqB,oBAAE,UAAF,YAAW,OAAM,OAAE,UAAF,YAAW;AAAA,GAAE,EAC9C,IAAI,CAAC,MAAM;AAhShB;AAiSM,UAAM,YAAY,KAAK,EAAE,QAAQ,SAAS,OAAM,OAAE,UAAF,YAAW,GAAG,QAAQ,CAAC,CAAC;AACxE,UACI,OAAE,UAAF,YAAW,KAAK,OAAO,OAAE,aAAF,mBAAY,gBACrC,OAAE,aAAF,mBAAY,SACZ;AACA,aAAO,GAAG,SAAS,KACjB,OAAE,aAAF,mBAAY,aACR;AAAA;AAAA;AAAA;AAAA,EAAuB,SAAS,EAAE,SAAS,SAAS,CAAC,KACrD,EACN,KAAG,OAAE,aAAF,mBAAY,UAAS;AAAA;AAAA;AAAA;AAAA,EAAsB,SAAS,EAAE,SAAS,MAAM,CAAC,KAAK,EAAE;AAAA,IAClF;AACA,WAAO;AAAA,EACT,CAAC,EACA,KAAK,MAAM;AAChB;AAsBO,SAAS,SAAS,MAAc,QAAQ,IAAY;AACzD,MAAI,CAAC,QAAQ,KAAK,UAAU,OAAO;AACjC,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,KAAK,MAAM,KAAK;AAC9
B,QAAM,QAAkB,CAAC;AACzB,MAAI,cAAc;AAElB,aAAW,QAAQ,OAAO;AAExB,QAAI,YAAY,SAAS,KAAK,SAAS,IAAI,OAAO;AAChD,YAAM,KAAK,YAAY,KAAK,CAAC;AAC7B,oBAAc;AAAA,IAChB,OAAO;AAEL,sBAAgB,cAAc,MAAM,MAAM;AAAA,IAC5C;AAAA,EACF;AAGA,MAAI,aAAa;AACf,UAAM,KAAK,WAAW;AAAA,EACxB;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;","names":["_a"]}
@@ -0,0 +1,2 @@
1
+ export { ToolCallScorer, ToolCallScorerConfig, ToolCallScorerOptions } from './toolCallScorer.mjs';
2
+ import 'vitest';
@@ -0,0 +1,2 @@
1
+ export { ToolCallScorer, ToolCallScorerConfig, ToolCallScorerOptions } from './toolCallScorer.js';
2
+ import 'vitest';