@ai-sdk-tool/eval 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,14 +5,15 @@ var colors = {
5
5
  red: "\x1B[31m",
6
6
  yellow: "\x1B[33m",
7
7
  cyan: "\x1B[36m",
8
- magenta: "\x1B[35m"
8
+ magenta: "\x1B[35m",
9
+ gray: "\x1B[90m"
9
10
  };
10
11
  function printResult(result) {
11
- const { model, benchmark, result: benchmarkResult } = result;
12
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
12
13
  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
13
14
  console.log(
14
15
  `
15
- ${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`
16
+ ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
16
17
  );
17
18
  console.log(
18
19
  ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
@@ -38,6 +39,186 @@ function consoleReporter(results) {
38
39
  console.log("\n---------------------------\n");
39
40
  }
40
41
 
42
+ // src/reporters/console.debug.ts
43
/**
 * ANSI SGR escape sequences used by the debug console reporter
 * (`colorizeDiffLine` and `consoleDebugReporter`).
 *
 * Declared with `const` and frozen: the table is a shared constant that is
 * only ever read, so accidental reassignment or mutation should fail loudly.
 */
const colors2 = Object.freeze({
  reset: "\x1B[0m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  bold: "\x1B[1m",
  underline: "\x1B[4m",
});
54
/**
 * Wrap one diff line in the ANSI color that matches its leading marker.
 *
 * - `+…` → green
 * - `-…` → red
 * - `@…` → bold cyan (hunk/section header)
 * - anything else is returned uncolored
 *
 * @param {unknown} line - Diff line. Non-string values are coerced with
 *   `String()` so a malformed entry cannot throw while a report is printing.
 * @returns {string} The line, colorized when it carries a known marker.
 */
function colorizeDiffLine(line) {
  // Diff entries originate from JSON payloads, so they are not guaranteed to
  // be strings; coerce instead of throwing on `.startsWith`.
  const text = typeof line === "string" ? line : String(line);

  if (text.startsWith("+")) return `${colors2.green}${text}${colors2.reset}`;
  if (text.startsWith("-")) return `${colors2.red}${text}${colors2.reset}`;
  if (text.startsWith("@")) {
    return `${colors2.cyan}${colors2.bold}${text}${colors2.reset}`;
  }
  return text;
}
61
/**
 * Remove duplicate entries while keeping the first occurrence of each, in
 * original order.
 *
 * @param {Iterable<string> | null | undefined} lines - Values to deduplicate.
 * @returns {string[]} A new array with duplicates removed; empty when `lines`
 *   is null or undefined.
 */
function uniqueLines(lines) {
  // A Set iterates in insertion order, so spreading it keeps first occurrences
  // in their original positions.
  return [...new Set(lines ?? [])];
}
71
/**
 * Derive human-readable fix suggestions from a parsed `[DEBUG-FAIL]` payload.
 *
 * Suggestions are built from the marker lines found in `parsed.diff`:
 * - a function-name mismatch or `missing function:` line → name / tool-call hints
 * - `- missing required param: X` → list of parameters to add
 * - `+ unexpected param: X` → list of parameters to remove
 * - `@@ param X` → allowed values for X, read from that same hunk
 *
 * When the diff yields nothing, a generic hint is chosen from `error_type`.
 *
 * @param {{error_type?: unknown, expected?: any, actual?: any, diff?: unknown} | null | undefined} parsed
 *   Parsed payload; a nullish value is treated as an empty payload.
 * @returns {string[]} De-duplicated suggestions in the order they were found.
 */
function suggestFixFromDiff(parsed) {
  const suggestions = [];
  const { error_type, expected, actual, diff } = parsed ?? {};

  // The payload comes from JSON, so `diff` may be absent, not an array, or
  // contain non-strings. Only string entries can carry diff markers.
  const lines = Array.isArray(diff)
    ? diff.filter((entry) => typeof entry === "string")
    : [];

  const mentionsFunction = lines.some(
    (d) => d.includes("function name") || d.includes("missing function:")
  );
  if (mentionsFunction) {
    const expectedName = expected?.function;
    const actualName = actual?.function;
    if (expectedName && actualName && expectedName !== actualName) {
      suggestions.push(
        `Call the function '${expectedName}' instead of '${actualName}'.`
      );
    }
    if (Array.isArray(expected?.functions)) {
      suggestions.push(
        `Ensure tool calls include: ${expected.functions.join(", ")}.`
      );
    }
  }

  const missing = lines
    .filter((d) => d.startsWith("- missing required param:"))
    .map((d) => d.replace("- missing required param: ", ""));
  if (missing.length) {
    suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
  }

  const extras = lines
    .filter((d) => d.startsWith("+ unexpected param:"))
    .map((d) => d.replace("+ unexpected param: ", ""));
  if (extras.length) {
    suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
  }

  lines.forEach((header, index) => {
    if (!header.startsWith("@@ param ")) return;
    const param = header.replace("@@ param ", "");

    // Read the allowed values from THIS parameter's hunk only: scan forward
    // until the next "@@" header. Searching the whole diff would attribute the
    // first parameter's allowed values to every other parameter.
    let allowedLine;
    for (let i = index + 1; i < lines.length; i++) {
      if (lines[i].startsWith("@@")) break;
      if (lines[i].startsWith("- expected one of:")) {
        allowedLine = lines[i];
        break;
      }
    }

    if (allowedLine) {
      const allowed = allowedLine.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  });

  // Fall back to a generic hint only when the diff produced nothing specific.
  if (suggestions.length === 0 && typeof error_type === "string") {
    if (error_type.includes("missing_required")) {
      suggestions.push(
        "Add all required parameters defined by the tool schema."
      );
    } else if (error_type.includes("unexpected_param")) {
      suggestions.push("Remove parameters not present in the tool schema.");
    } else if (error_type.includes("wrong_count")) {
      suggestions.push(
        "Adjust the number of tool calls to match expected count."
      );
    } else if (error_type.includes("wrong_func_name")) {
      suggestions.push("Use the exact expected function name from the schema.");
    } else if (error_type.includes("value_error")) {
      suggestions.push("Choose a value from the allowed options.");
    }
  }

  return uniqueLines(suggestions);
}
133
/**
 * Print a verbose evaluation report to the console.
 *
 * For every result this prints a header (model, optional model key, benchmark),
 * the status and score, and any metrics. If the result's logs contain failure
 * lines (`[FAIL]`, `[ERROR]`, `[FATAL]`, `[STACK]`, `[DEBUG-FAIL]`) they are
 * printed under "Failure details"; otherwise only `[INFO]` / `[PASS]` lines are
 * shown. A `[DEBUG-FAIL]` line carries a JSON payload that is rendered as a
 * diff (or an expected/actual dump) followed by suggested fixes; the plain
 * `[FAIL]` line for the same id is then suppressed to avoid duplication.
 *
 * @param {Array<{model: string, modelKey?: string, benchmark: string, result: object}>} results
 *   Evaluation results; each `result` is read for `success`, `score`,
 *   `metrics` and `logs`.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  const FAILURE_TAGS = ["[FAIL]", "[ERROR]", "[FATAL]", "[STACK]", "[DEBUG-FAIL]"];
  const INFO_TAGS = ["[INFO]", "[PASS]"];
  const DEBUG_FAIL_PREFIX = /^\[DEBUG-FAIL\] /;

  const hasAnyTag = (line, tags) =>
    typeof line === "string" && tags.some((tag) => line.startsWith(tag));

  // Returns the payload object, or undefined when the line is not valid JSON
  // or not a plain object. Malformed payloads are expected and are printed
  // raw by the caller, so the parse error is deliberately not surfaced.
  const parseDebugFail = (line) => {
    try {
      const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX, ""));
      const isPlainObject =
        parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
      return isPlainObject ? parsed : undefined;
    } catch {
      return undefined;
    }
  };

  // JSON.stringify returns undefined for `undefined` input; fall back to
  // String() so a payload without expected/actual still renders.
  const pretty = (value) =>
    (JSON.stringify(value, null, 2) ?? String(value)).split("\n").join("\n ");

  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");

  for (const { model, modelKey, benchmark, result } of results) {
    const status = result.success
      ? `${colors2.green}\u2714 SUCCESS${colors2.reset}`
      : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
    const keyLabel = modelKey
      ? ` ${colors2.gray}(${modelKey})${colors2.reset}`
      : "";
    console.log(
      `\n${colors2.cyan}[${model}]${colors2.reset}${keyLabel} - ${colors2.magenta}${benchmark}${colors2.reset}`
    );
    console.log(
      ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
    );

    // A result may carry no metrics at all; treat that as "nothing to print"
    // rather than letting Object.entries throw and abort the whole report.
    const metrics = Object.entries(result.metrics ?? {});
    if (metrics.length > 0) {
      console.log(" Metrics:");
      for (const [key, value] of metrics) console.log(` - ${key}: ${value}`);
    }

    const logs = Array.isArray(result.logs) ? result.logs : [];
    if (logs.length === 0) continue;

    // Parse every [DEBUG-FAIL] payload once and reuse it for both the id
    // lookup and the rendering pass.
    const failures = logs
      .filter((line) => hasAnyTag(line, FAILURE_TAGS))
      .map((line) => ({
        line,
        parsed: line.startsWith("[DEBUG-FAIL]") ? parseDebugFail(line) : undefined,
      }));

    if (failures.length === 0) {
      for (const line of logs) {
        if (hasAnyTag(line, INFO_TAGS)) {
          console.log(` ${colors2.gray}${line}${colors2.reset}`);
        }
      }
      continue;
    }

    console.log(` ${colors2.bold}Failure details:${colors2.reset}`);

    // Ids that have a detailed payload; their plain [FAIL] line is redundant.
    const debugIds = new Set(
      failures
        .filter(({ parsed }) => parsed?.id)
        .map(({ parsed }) => String(parsed.id))
    );

    for (const { line, parsed } of failures) {
      if (line.startsWith("[FAIL]")) {
        const failId = line.match(/^\[FAIL\]\s+([^:]+):/)?.[1];
        if (failId && debugIds.has(failId)) continue;
        console.log(` ${colors2.red}${line}${colors2.reset}`);
      } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
        console.log(` ${colors2.yellow}${line}${colors2.reset}`);
      } else if (line.startsWith("[STACK]")) {
        console.log(` ${colors2.gray}${line}${colors2.reset}`);
      } else if (parsed === undefined) {
        // [DEBUG-FAIL] whose payload could not be parsed: show it verbatim.
        console.log(` ${line}`);
      } else {
        try {
          const { id, expected, actual, message, diff } = parsed;
          console.log(
            ` ${colors2.underline}${id}${colors2.reset} ${message ? `- ${message}` : ""}`
          );
          if (Array.isArray(diff)) {
            for (const diffLine of diff) {
              console.log(` ${colorizeDiffLine(String(diffLine))}`);
            }
          } else {
            console.log(" expected:");
            console.log(`${colors2.green} ${pretty(expected)}${colors2.reset}`);
            console.log(" actual:");
            console.log(`${colors2.red} ${pretty(actual)}${colors2.reset}`);
          }
          const suggestions = suggestFixFromDiff(parsed);
          if (suggestions.length) {
            console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
            for (const suggestion of suggestions) {
              console.log(` \u2022 ${suggestion}`);
            }
          }
        } catch {
          // Rendering is best-effort: if the payload has an unexpected shape,
          // fall back to the raw log line rather than aborting the report.
          console.log(` ${line}`);
        }
      }
    }
  }

  console.log("\n------------------------------------\n");
}
221
+
41
222
  // src/reporters/json.ts
42
223
  function jsonReporter(results) {
43
224
  const serializableResults = results.map((r) => ({
@@ -53,30 +234,35 @@ function jsonReporter(results) {
53
234
  // src/reporters/index.ts
54
235
  var reporters = {
55
236
  console: consoleReporter,
56
- json: jsonReporter
237
+ json: jsonReporter,
238
+ "console.debug": consoleDebugReporter
57
239
  };
58
240
 
59
241
  // src/evaluate.ts
60
- async function runSingleBenchmark(model, benchmark) {
242
+ async function runSingleBenchmark(model, benchmark, modelKey) {
61
243
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
62
244
  try {
63
- console.log(`[${modelId}] Running benchmark: ${benchmark.name}...`);
245
+ console.log(
246
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
247
+ );
64
248
  const result = await benchmark.run(model);
65
249
  console.log(
66
- `[${modelId}] Finished benchmark: ${benchmark.name}. Score: ${result.score}`
250
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
67
251
  );
68
252
  return {
69
253
  model: modelId,
254
+ modelKey,
70
255
  benchmark: benchmark.name,
71
256
  result
72
257
  };
73
258
  } catch (error) {
74
259
  console.error(
75
- `[${modelId}] Error running benchmark: ${benchmark.name}`,
260
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
76
261
  error
77
262
  );
78
263
  return {
79
264
  model: modelId,
265
+ modelKey,
80
266
  benchmark: benchmark.name,
81
267
  result: {
82
268
  score: 0,
@@ -89,11 +275,26 @@ async function runSingleBenchmark(model, benchmark) {
89
275
  }
90
276
  async function evaluate(options) {
91
277
  const { models, benchmarks, reporter = "console" } = options;
92
- const modelsArray = Array.isArray(models) ? models : [models];
278
+ const modelEntries = [];
279
+ if (Array.isArray(models)) {
280
+ for (const m of models) modelEntries.push([void 0, m]);
281
+ } else if (typeof models === "object" && models !== null && "modelId" in models) {
282
+ modelEntries.push([void 0, models]);
283
+ } else {
284
+ for (const [key, m] of Object.entries(
285
+ models
286
+ )) {
287
+ modelEntries.push([key, m]);
288
+ }
289
+ }
93
290
  const allResults = [];
94
- for (const model of modelsArray) {
291
+ for (const [modelKey, model] of modelEntries) {
95
292
  for (const benchmark of benchmarks) {
96
- const evaluationResult = await runSingleBenchmark(model, benchmark);
293
+ const evaluationResult = await runSingleBenchmark(
294
+ model,
295
+ benchmark,
296
+ modelKey
297
+ );
97
298
  allResults.push(evaluationResult);
98
299
  }
99
300
  }
@@ -436,7 +637,9 @@ function checkStringValue(param, modelValue, possibleAnswers) {
436
637
  if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
437
638
  return {
438
639
  valid: false,
439
- error: `Invalid value for parameter '${param}': '${modelValue}'. Expected one of ${possibleAnswers.join(", ")}.`,
640
+ error: `Invalid value for parameter '${param}': ${JSON.stringify(
641
+ modelValue
642
+ )}. Expected one of ${JSON.stringify(possibleAnswers)}.`,
440
643
  error_type: "value_error:string"
441
644
  };
442
645
  }
@@ -490,15 +693,55 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
490
693
  if (!hasMatch) {
491
694
  return {
492
695
  valid: false,
493
- error: `Invalid value for list parameter '${paramName}'.`,
696
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
697
+ modelValue
698
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
494
699
  error_type: "value_error:list"
495
700
  };
496
701
  }
497
702
  } else {
498
- if (!possibleValues.includes(modelValue)) {
703
+ const hasMatch = possibleValues.some((possibleValue) => {
704
+ if (modelValue === possibleValue) return true;
705
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
706
+ try {
707
+ const normalizeObject = (obj) => {
708
+ if (Array.isArray(obj)) {
709
+ return obj.map(normalizeObject);
710
+ }
711
+ if (obj && typeof obj === "object") {
712
+ const normalized = {};
713
+ for (const [key, value] of Object.entries(obj)) {
714
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
715
+ normalized[key] = value[0];
716
+ } else {
717
+ normalized[key] = normalizeObject(value);
718
+ }
719
+ }
720
+ return normalized;
721
+ }
722
+ return obj;
723
+ };
724
+ const normalizedModel = normalizeObject(modelValue);
725
+ const normalizedPossible = normalizeObject(possibleValue);
726
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
727
+ } catch {
728
+ return false;
729
+ }
730
+ }
731
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
732
+ return modelValue.toString() === possibleValue;
733
+ }
734
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
735
+ return modelValue === possibleValue.toString();
736
+ }
737
+ return false;
738
+ });
739
+ if (!hasMatch) {
499
740
  return {
500
741
  valid: false,
501
- error: `Invalid value for parameter '${paramName}': got '${modelValue}', expected one of '${possibleValues}'.`,
742
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
743
+ modelValue
744
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
502
745
  error_type: "value_error:other"
503
746
  };
504
747
  }
@@ -594,7 +837,8 @@ function check(testCase, modelOutput, possibleAnswer) {
594
837
  if (!modelOutput || modelOutput.length !== 1) {
595
838
  return {
596
839
  valid: false,
597
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`
840
+ error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
841
+ error_type: "simple:wrong_count"
598
842
  };
599
843
  }
600
844
  return simpleFunctionChecker(
@@ -623,7 +867,11 @@ function check(testCase, modelOutput, possibleAnswer) {
623
867
  }
624
868
  return { valid: true };
625
869
  } catch (e) {
626
- return { valid: false, error: `Checker Error: ${e.message}` };
870
+ return {
871
+ valid: false,
872
+ error: `Checker Error: ${e.message}`,
873
+ error_type: "checker_error"
874
+ };
627
875
  }
628
876
  }
629
877
  function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
@@ -675,7 +923,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
675
923
  if (copy.items) copy.items = fixSchema(copy.items);
676
924
  return copy;
677
925
  };
678
- for (const testCase of testCases) {
926
+ const concurrencyEnv = process.env.BFCL_CONCURRENCY;
927
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
928
+ logs.push(
929
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
930
+ );
931
+ const runSingleCase = async (testCase) => {
932
+ const caseLogs = [];
679
933
  const { function: tools, question: messages } = testCase;
680
934
  try {
681
935
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
@@ -708,11 +962,11 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
708
962
  try {
709
963
  const firstTool = transformedTools[0];
710
964
  const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
711
- logs.push(
965
+ caseLogs.push(
712
966
  `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
713
967
  );
714
968
  } catch (e) {
715
- logs.push(
969
+ caseLogs.push(
716
970
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
717
971
  );
718
972
  }
@@ -720,14 +974,22 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
720
974
  model,
721
975
  messages: flatMessages,
722
976
  tools: toolsMap,
723
- toolChoice: "auto"
977
+ toolChoice: "auto",
978
+ // Pass original schema information to middleware
979
+ providerOptions: {
980
+ toolCallMiddleware: {
981
+ originalToolSchemas: Object.fromEntries(
982
+ transformedTools.map((t) => [t.name, t.inputSchema])
983
+ )
984
+ }
985
+ }
724
986
  });
725
987
  try {
726
- logs.push(
988
+ caseLogs.push(
727
989
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
728
990
  );
729
991
  } catch {
730
- logs.push(
992
+ caseLogs.push(
731
993
  `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
732
994
  );
733
995
  }
@@ -760,20 +1022,221 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
760
1022
  possibleAnswer
761
1023
  );
762
1024
  if (checkerResult.valid) {
763
- correctCount++;
764
- logs.push(`[PASS] ${testCase.id}`);
1025
+ caseLogs.push(`[PASS] ${testCase.id}`);
1026
+ return { valid: true, logs: caseLogs };
765
1027
  } else {
766
- logs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1028
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1029
+ try {
1030
+ const category = testCase.id.split("_")[0];
1031
+ const diff = [];
1032
+ const summarizeArgs = (args) => {
1033
+ if (args == null) return args;
1034
+ if (typeof args !== "object") return args;
1035
+ return Object.keys(args).sort().reduce((acc, k) => {
1036
+ acc[k] = args[k];
1037
+ return acc;
1038
+ }, {});
1039
+ };
1040
+ const expected = {};
1041
+ const actual = {};
1042
+ if (category === "simple") {
1043
+ const funcDesc = tools[0];
1044
+ const gt = possibleAnswer.ground_truth?.[0];
1045
+ const expectedFuncName = funcDesc?.name;
1046
+ const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
1047
+ const received = restoredCalls[0];
1048
+ const receivedName = received?.toolName ?? received?.name;
1049
+ const receivedArgs = summarizeArgs(received?.args);
1050
+ expected.function = expectedFuncName;
1051
+ expected.params = expectedParams;
1052
+ actual.function = receivedName;
1053
+ actual.args = receivedArgs;
1054
+ if (expectedFuncName !== receivedName) {
1055
+ diff.push(`@@ function name`);
1056
+ diff.push(`- ${expectedFuncName}`);
1057
+ diff.push(`+ ${receivedName}`);
1058
+ }
1059
+ if (expectedParams && receivedArgs) {
1060
+ const required = funcDesc?.parameters?.required ?? [];
1061
+ for (const req of required) {
1062
+ if (!(req in receivedArgs)) {
1063
+ diff.push(`- missing required param: ${req}`);
1064
+ }
1065
+ }
1066
+ for (const k of Object.keys(receivedArgs)) {
1067
+ if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1068
+ diff.push(`+ unexpected param: ${k}`);
1069
+ }
1070
+ }
1071
+ for (const k of Object.keys(receivedArgs)) {
1072
+ if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1073
+ const allowed = expectedParams[k];
1074
+ const got = receivedArgs[k];
1075
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
1076
+ try {
1077
+ if (Array.isArray(got)) {
1078
+ return JSON.stringify(
1079
+ got.map((x) => String(x)).sort()
1080
+ ) === JSON.stringify(
1081
+ v.map((x) => String(x)).sort()
1082
+ );
1083
+ }
1084
+ } catch {
1085
+ }
1086
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
1087
+ });
1088
+ if (!includes) {
1089
+ diff.push(`@@ param ${k}`);
1090
+ diff.push(
1091
+ `- expected one of: ${JSON.stringify(allowed)}`
1092
+ );
1093
+ diff.push(`+ got: ${JSON.stringify(got)}`);
1094
+ }
1095
+ }
1096
+ }
1097
+ }
1098
+ } else {
1099
+ const gtArr = possibleAnswer.ground_truth ?? [];
1100
+ const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
1101
+ const actualNames = restoredCalls.map(
1102
+ (c) => c.toolName ?? c.name
1103
+ );
1104
+ expected.functions = expectedNames;
1105
+ actual.functions = actualNames;
1106
+ if (expectedNames.length !== actualNames.length) {
1107
+ diff.push(`@@ call count`);
1108
+ diff.push(`- expected ${expectedNames.length}`);
1109
+ diff.push(`+ got ${actualNames.length}`);
1110
+ }
1111
+ const missing = expectedNames.filter(
1112
+ (n) => !actualNames.includes(n)
1113
+ );
1114
+ const extra = actualNames.filter(
1115
+ (n) => !expectedNames.includes(n)
1116
+ );
1117
+ for (const m of missing)
1118
+ diff.push(`- missing function: ${m}`);
1119
+ for (const e of extra)
1120
+ diff.push(`+ unexpected function: ${e}`);
1121
+ const usedActual = /* @__PURE__ */ new Set();
1122
+ for (const expectedObj of gtArr) {
1123
+ const fname = Object.keys(expectedObj)[0];
1124
+ let matchedIndex = -1;
1125
+ for (let i = 0; i < restoredCalls.length; i++) {
1126
+ if (usedActual.has(i)) continue;
1127
+ const rc = restoredCalls[i];
1128
+ const rcName = rc?.toolName ?? rc?.name;
1129
+ if (rcName === fname) {
1130
+ matchedIndex = i;
1131
+ break;
1132
+ }
1133
+ }
1134
+ if (matchedIndex === -1) continue;
1135
+ usedActual.add(matchedIndex);
1136
+ const received = restoredCalls[matchedIndex];
1137
+ const receivedArgs = summarizeArgs(received?.args);
1138
+ const expectedParamsAllowed = expectedObj[fname];
1139
+ const funcDesc = tools.find(
1140
+ (t) => t.name === fname
1141
+ );
1142
+ const requiredParams = funcDesc?.parameters?.required ?? [];
1143
+ diff.push(`@@ function ${fname}`);
1144
+ if (expectedParamsAllowed && receivedArgs) {
1145
+ for (const req of requiredParams) {
1146
+ if (!(req in receivedArgs)) {
1147
+ diff.push(`- missing required param: ${req}`);
1148
+ }
1149
+ }
1150
+ for (const k of Object.keys(receivedArgs)) {
1151
+ if (!Object.prototype.hasOwnProperty.call(
1152
+ expectedParamsAllowed,
1153
+ k
1154
+ )) {
1155
+ diff.push(`+ unexpected param: ${k}`);
1156
+ }
1157
+ }
1158
+ for (const k of Object.keys(receivedArgs)) {
1159
+ if (Object.prototype.hasOwnProperty.call(
1160
+ expectedParamsAllowed,
1161
+ k
1162
+ )) {
1163
+ const allowed = expectedParamsAllowed[k];
1164
+ const got = receivedArgs[k];
1165
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
1166
+ try {
1167
+ if (Array.isArray(got)) {
1168
+ return JSON.stringify(
1169
+ got.map((x) => String(x)).sort()
1170
+ ) === JSON.stringify(
1171
+ v.map((x) => String(x)).sort()
1172
+ );
1173
+ }
1174
+ } catch {
1175
+ }
1176
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
1177
+ });
1178
+ if (!includes) {
1179
+ diff.push(`@@ param ${k}`);
1180
+ diff.push(
1181
+ `- expected one of: ${JSON.stringify(allowed)}`
1182
+ );
1183
+ diff.push(`+ got: ${JSON.stringify(got)}`);
1184
+ }
1185
+ }
1186
+ }
1187
+ }
1188
+ }
1189
+ }
1190
+ caseLogs.push(
1191
+ `[DEBUG-FAIL] ${JSON.stringify({
1192
+ id: testCase.id,
1193
+ message: checkerResult.error,
1194
+ error_type: checkerResult.error_type,
1195
+ expected,
1196
+ actual,
1197
+ diff
1198
+ })}`
1199
+ );
1200
+ } catch {
1201
+ caseLogs.push(
1202
+ `[DEBUG] ${testCase.id}: failed to build debug diff`
1203
+ );
1204
+ }
1205
+ return { valid: false, logs: caseLogs };
767
1206
  }
768
1207
  } catch (e) {
769
- logs.push(
1208
+ caseLogs.push(
770
1209
  `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
771
1210
  );
772
1211
  if (e?.stack) {
773
- logs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1212
+ caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
774
1213
  }
1214
+ return { valid: false, logs: caseLogs };
775
1215
  }
776
- }
1216
+ };
1217
/**
 * Map `items` through an async `mapper`, running at most `limit2` calls at a
 * time. Results are returned in input order regardless of completion order.
 *
 * @param {Array<any>} items - Inputs to process.
 * @param {number} limit2 - Maximum number of concurrent mapper calls. It is
 *   floored to an integer and clamped to at least 1; NaN is treated as 1.
 * @param {(item: any, index: number) => Promise<any>} mapper - Async
 *   transform invoked once per item with the item and its index.
 * @returns {Promise<Array<any>>} Resolves with one result per item; rejects
 *   with the first mapper rejection.
 */
const mapWithConcurrency = async (items, limit2, mapper) => {
  const results = new Array(items.length);

  // The worker count becomes an array length, so it must be a positive
  // integer: a fractional or NaN limit would throw RangeError, and 0 would
  // start no workers and silently leave every result slot empty.
  const requested = Number(limit2);
  const cap = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));
  const workerCount = Math.min(cap, items.length);

  // Shared cursor. Claiming an index is synchronous, so no two workers can
  // take the same item.
  let nextIndex = 0;
  const worker = async () => {
    while (nextIndex < items.length) {
      const current = nextIndex++;
      results[current] = await mapper(items[current], current);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, worker));
  return results;
};
1230
+ const resultsPerCase = await mapWithConcurrency(
1231
+ testCases,
1232
+ concurrency,
1233
+ async (tc) => runSingleCase(tc)
1234
+ );
1235
+ correctCount = resultsPerCase.reduce(
1236
+ (acc, r) => acc + (r.valid ? 1 : 0),
1237
+ 0
1238
+ );
1239
+ for (const r of resultsPerCase) logs.push(...r.logs);
777
1240
  if (testCases.length === 0) {
778
1241
  return {
779
1242
  score: 0,