@ai-sdk-tool/eval 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -47,14 +47,15 @@ var colors = {
47
47
  red: "\x1B[31m",
48
48
  yellow: "\x1B[33m",
49
49
  cyan: "\x1B[36m",
50
- magenta: "\x1B[35m"
50
+ magenta: "\x1B[35m",
51
+ gray: "\x1B[90m"
51
52
  };
52
53
  function printResult(result) {
53
- const { model, benchmark, result: benchmarkResult } = result;
54
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
54
55
  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
55
56
  console.log(
56
57
  `
57
- ${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`
58
+ ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
58
59
  );
59
60
  console.log(
60
61
  ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
@@ -80,6 +81,186 @@ function consoleReporter(results) {
80
81
  console.log("\n---------------------------\n");
81
82
  }
82
83
 
84
+ // src/reporters/console.debug.ts
85
// ANSI SGR escape sequences used by the debug console reporter, keyed by
// style name. Each value is ESC "[" <code> "m".
var colors2 = Object.fromEntries(
  [
    ["reset", 0],
    ["green", 32],
    ["red", 31],
    ["yellow", 33],
    ["cyan", 36],
    ["magenta", 35],
    ["gray", 90],
    ["bold", 1],
    ["underline", 4],
  ].map(([styleName, sgrCode]) => [styleName, `\x1B[${sgrCode}m`])
);
96
/**
 * Colors a single diff-style line for terminal output.
 *
 * Lines starting with "+" are green, "-" red, and "@" bold cyan; any other
 * line is returned unchanged.
 *
 * @param {string} line - One line of diff text.
 * @returns {string} The line, wrapped in ANSI codes when a marker matches.
 */
function colorizeDiffLine(line) {
  // First matching marker wins, mirroring the order of the checks.
  const stylesByMarker = [
    ["+", () => colors2.green],
    ["-", () => colors2.red],
    ["@", () => `${colors2.cyan}${colors2.bold}`],
  ];
  const hit = stylesByMarker.find(([marker]) => line.startsWith(marker));
  if (!hit) {
    return line;
  }
  const openStyle = hit[1]();
  return `${openStyle}${line}${colors2.reset}`;
}
103
/**
 * Removes duplicate entries while keeping the first occurrence of each value
 * in its original position.
 *
 * @param {Iterable<string>} lines - Values to de-duplicate.
 * @returns {string[]} A new array holding each distinct value once.
 */
function uniqueLines(lines) {
  // A Set iterates in insertion order, so first occurrences keep their order.
  const distinct = /* @__PURE__ */ new Set(lines);
  return Array.from(distinct);
}
113
/**
 * Builds human-readable fix suggestions from a parsed `[DEBUG-FAIL]` payload.
 *
 * Suggestions come from marker lines in `parsed.diff`: function-name
 * mismatches, missing or unexpected params, and per-param value mismatches.
 * When the diff yields nothing, one generic hint is picked from
 * `parsed.error_type`.
 *
 * @param {{error_type?: string, expected?: object, actual?: object, diff?: string[]} | null | undefined} parsed
 *   Payload decoded from a `[DEBUG-FAIL]` log line.
 * @returns {string[]} De-duplicated suggestions, in the order they were found.
 */
function suggestFixFromDiff(parsed) {
  const suggestions = [];
  const { error_type, expected, actual, diff } = parsed ?? {};
  // The payload is decoded JSON, so `diff` may be absent or malformed. Only
  // string entries can carry markers; anything else is ignored, not thrown on.
  const lines = Array.isArray(diff)
    ? diff.filter((entry) => typeof entry === "string")
    : [];
  // Collects the text after `<prefix> ` for every line starting with `prefix`.
  const valuesAfter = (prefix) =>
    lines
      .filter((entry) => entry.startsWith(prefix))
      .map((entry) => entry.replace(`${prefix} `, ""));

  const mentionsFunction = lines.some(
    (entry) => entry.includes("function name") || entry.includes("missing function:")
  );
  if (mentionsFunction) {
    const expectedName = expected?.function;
    const actualName = actual?.function;
    if (expectedName && actualName && expectedName !== actualName) {
      suggestions.push(
        `Call the function '${expectedName}' instead of '${actualName}'.`
      );
    }
    if (Array.isArray(expected?.functions)) {
      suggestions.push(
        `Ensure tool calls include: ${expected.functions.join(", ")}.`
      );
    }
  }

  const missing = valuesAfter("- missing required param:");
  if (missing.length) {
    suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
  }

  const extras = valuesAfter("+ unexpected param:");
  if (extras.length) {
    suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
  }

  for (let i = 0; i < lines.length; i++) {
    if (!lines[i].startsWith("@@ param ")) continue;
    const param = lines[i].replace("@@ param ", "");
    // Search only this param's own hunk (up to the next "@@" header), so that
    // each parameter is paired with its own allowed values rather than the
    // first "expected one of" line anywhere in the diff.
    let allowedLine;
    for (let j = i + 1; j < lines.length && !lines[j].startsWith("@@"); j++) {
      if (lines[j].startsWith("- expected one of:")) {
        allowedLine = lines[j];
        break;
      }
    }
    if (allowedLine) {
      const allowed = allowedLine.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  }

  if (suggestions.length === 0 && typeof error_type === "string") {
    // Ordered: the first matching fragment decides the hint.
    const genericHints = [
      ["missing_required", "Add all required parameters defined by the tool schema."],
      ["unexpected_param", "Remove parameters not present in the tool schema."],
      ["wrong_count", "Adjust the number of tool calls to match expected count."],
      ["wrong_func_name", "Use the exact expected function name from the schema."],
      ["value_error", "Choose a value from the allowed options."],
    ];
    const hint = genericHints.find(([fragment]) => error_type.includes(fragment));
    if (hint) {
      suggestions.push(hint[1]);
    }
  }
  return uniqueLines(suggestions);
}
175
/**
 * Prints a verbose evaluation report to stdout via `console.log`.
 *
 * For every result it prints a header (model, optional model key, benchmark),
 * the success status and score, and any metrics. If the result's logs contain
 * failure-class lines ([FAIL], [ERROR], [FATAL], [STACK], [DEBUG-FAIL]) those
 * are printed in detail; otherwise only [INFO] and [PASS] lines are shown.
 * A [FAIL] line is suppressed when a [DEBUG-FAIL] payload with the same id
 * exists, because the payload is rendered with more detail.
 *
 * @param {Array<{model: string, modelKey?: string, benchmark: string,
 *   result: {success: boolean, score: number, metrics?: object, logs?: string[]}}>} results
 *   Evaluation results to report.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  const DEBUG_FAIL_TAG = "[DEBUG-FAIL]";
  const FAILURE_TAGS = ["[FAIL]", "[ERROR]", "[FATAL]", "[STACK]", DEBUG_FAIL_TAG];
  const paint = (color, text) => `${color}${text}${colors2.reset}`;
  // JSON.stringify returns undefined for undefined input; fall back to
  // String() so a payload without expected/actual still renders.
  const indentJson = (value) =>
    (JSON.stringify(value, null, 2) ?? String(value)).split("\n").join("\n ");
  // Decodes a [DEBUG-FAIL] line. Returns undefined when the payload is not a
  // JSON object, in which case the caller prints the raw line instead.
  const parseDebugFail = (line) => {
    try {
      const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
      return parsed !== null && typeof parsed === "object" ? parsed : void 0;
    } catch {
      return void 0;
    }
  };

  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const r of results) {
    const { model, modelKey, benchmark, result } = r;
    const status = result.success
      ? paint(colors2.green, "\u2714 SUCCESS")
      : paint(colors2.red, "\u2716 FAILURE");
    const keySuffix = modelKey ? ` ${paint(colors2.gray, `(${modelKey})`)}` : "";
    console.log(
      `\n${paint(colors2.cyan, `[${model}]`)}${keySuffix} - ${paint(colors2.magenta, benchmark)}`
    );
    console.log(
      ` \u2514 ${status} | Score: ${paint(colors2.yellow, result.score.toFixed(2))}`
    );

    // Metrics are optional; a missing object must not abort the report.
    const metrics = Object.entries(result.metrics ?? {});
    if (metrics.length > 0) {
      console.log(" Metrics:");
      for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
    }

    const logs = Array.isArray(result.logs)
      ? result.logs.filter((l) => typeof l === "string")
      : [];
    if (logs.length === 0) continue;

    const failLogs = logs.filter((l) =>
      FAILURE_TAGS.some((tag) => l.startsWith(tag))
    );
    if (failLogs.length === 0) {
      const info = logs.filter(
        (l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
      );
      for (const line of info) console.log(` ${paint(colors2.gray, line)}`);
      continue;
    }

    console.log(` ${paint(colors2.bold, "Failure details:")}`);
    // Decode each payload once; the ids decide which [FAIL] lines to hide.
    const payloadByLine = /* @__PURE__ */ new Map();
    const debugIds = /* @__PURE__ */ new Set();
    for (const line of failLogs) {
      if (!line.startsWith(DEBUG_FAIL_TAG)) continue;
      const parsed = parseDebugFail(line);
      payloadByLine.set(line, parsed);
      if (parsed?.id) debugIds.add(String(parsed.id));
    }

    for (const line of failLogs) {
      if (line.startsWith("[FAIL]")) {
        const failId = line.match(/^\[FAIL\]\s+([^:]+):/)?.[1];
        if (failId && debugIds.has(failId)) continue;
        console.log(` ${paint(colors2.red, line)}`);
      } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
        console.log(` ${paint(colors2.yellow, line)}`);
      } else if (line.startsWith("[STACK]")) {
        console.log(` ${paint(colors2.gray, line)}`);
      } else if (line.startsWith(DEBUG_FAIL_TAG)) {
        const parsed = payloadByLine.get(line);
        if (!parsed) {
          console.log(` ${line}`);
          continue;
        }
        const { id, expected, actual, message, diff } = parsed;
        console.log(
          ` ${paint(colors2.underline, id)} ${message ? "- " + message : ""}`
        );
        if (Array.isArray(diff)) {
          for (const dLine of diff)
            console.log(" " + colorizeDiffLine(String(dLine)));
        } else {
          console.log(" expected:");
          console.log(paint(colors2.green, ` ${indentJson(expected)}`));
          console.log(" actual:");
          console.log(paint(colors2.red, ` ${indentJson(actual)}`));
        }
        // Suggestions are advisory: if they cannot be derived from an odd
        // payload, the details printed above are still kept.
        let suggestions = [];
        try {
          suggestions = suggestFixFromDiff(parsed);
        } catch {
          suggestions = [];
        }
        if (suggestions.length) {
          console.log(` ${paint(colors2.bold, "Suggested fix:")}`);
          for (const s of suggestions) console.log(` \u2022 ${s}`);
        }
      }
    }
  }
  console.log("\n------------------------------------\n");
}
263
+
83
264
  // src/reporters/json.ts
84
265
  function jsonReporter(results) {
85
266
  const serializableResults = results.map((r) => ({
@@ -95,30 +276,35 @@ function jsonReporter(results) {
95
276
  // src/reporters/index.ts
96
277
  var reporters = {
97
278
  console: consoleReporter,
98
- json: jsonReporter
279
+ json: jsonReporter,
280
+ "console.debug": consoleDebugReporter
99
281
  };
100
282
 
101
283
  // src/evaluate.ts
102
- async function runSingleBenchmark(model, benchmark) {
284
+ async function runSingleBenchmark(model, benchmark, modelKey) {
103
285
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
104
286
  try {
105
- console.log(`[${modelId}] Running benchmark: ${benchmark.name}...`);
287
+ console.log(
288
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
289
+ );
106
290
  const result = await benchmark.run(model);
107
291
  console.log(
108
- `[${modelId}] Finished benchmark: ${benchmark.name}. Score: ${result.score}`
292
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
109
293
  );
110
294
  return {
111
295
  model: modelId,
296
+ modelKey,
112
297
  benchmark: benchmark.name,
113
298
  result
114
299
  };
115
300
  } catch (error) {
116
301
  console.error(
117
- `[${modelId}] Error running benchmark: ${benchmark.name}`,
302
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
118
303
  error
119
304
  );
120
305
  return {
121
306
  model: modelId,
307
+ modelKey,
122
308
  benchmark: benchmark.name,
123
309
  result: {
124
310
  score: 0,
@@ -131,11 +317,26 @@ async function runSingleBenchmark(model, benchmark) {
131
317
  }
132
318
  async function evaluate(options) {
133
319
  const { models, benchmarks, reporter = "console" } = options;
134
- const modelsArray = Array.isArray(models) ? models : [models];
320
+ const modelEntries = [];
321
+ if (Array.isArray(models)) {
322
+ for (const m of models) modelEntries.push([void 0, m]);
323
+ } else if (typeof models === "object" && models !== null && "modelId" in models) {
324
+ modelEntries.push([void 0, models]);
325
+ } else {
326
+ for (const [key, m] of Object.entries(
327
+ models
328
+ )) {
329
+ modelEntries.push([key, m]);
330
+ }
331
+ }
135
332
  const allResults = [];
136
- for (const model of modelsArray) {
333
+ for (const [modelKey, model] of modelEntries) {
137
334
  for (const benchmark of benchmarks) {
138
- const evaluationResult = await runSingleBenchmark(model, benchmark);
335
+ const evaluationResult = await runSingleBenchmark(
336
+ model,
337
+ benchmark,
338
+ modelKey
339
+ );
139
340
  allResults.push(evaluationResult);
140
341
  }
141
342
  }
@@ -478,7 +679,9 @@ function checkStringValue(param, modelValue, possibleAnswers) {
478
679
  if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
479
680
  return {
480
681
  valid: false,
481
- error: `Invalid value for parameter '${param}': '${modelValue}'. Expected one of ${possibleAnswers.join(", ")}.`,
682
+ error: `Invalid value for parameter '${param}': ${JSON.stringify(
683
+ modelValue
684
+ )}. Expected one of ${JSON.stringify(possibleAnswers)}.`,
482
685
  error_type: "value_error:string"
483
686
  };
484
687
  }
@@ -532,15 +735,55 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
532
735
  if (!hasMatch) {
533
736
  return {
534
737
  valid: false,
535
- error: `Invalid value for list parameter '${paramName}'.`,
738
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
739
+ modelValue
740
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
536
741
  error_type: "value_error:list"
537
742
  };
538
743
  }
539
744
  } else {
540
- if (!possibleValues.includes(modelValue)) {
745
+ const hasMatch = possibleValues.some((possibleValue) => {
746
+ if (modelValue === possibleValue) return true;
747
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
748
+ try {
749
+ const normalizeObject = (obj) => {
750
+ if (Array.isArray(obj)) {
751
+ return obj.map(normalizeObject);
752
+ }
753
+ if (obj && typeof obj === "object") {
754
+ const normalized = {};
755
+ for (const [key, value] of Object.entries(obj)) {
756
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
757
+ normalized[key] = value[0];
758
+ } else {
759
+ normalized[key] = normalizeObject(value);
760
+ }
761
+ }
762
+ return normalized;
763
+ }
764
+ return obj;
765
+ };
766
+ const normalizedModel = normalizeObject(modelValue);
767
+ const normalizedPossible = normalizeObject(possibleValue);
768
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
769
+ } catch {
770
+ return false;
771
+ }
772
+ }
773
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
774
+ return modelValue.toString() === possibleValue;
775
+ }
776
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
777
+ return modelValue === possibleValue.toString();
778
+ }
779
+ return false;
780
+ });
781
+ if (!hasMatch) {
541
782
  return {
542
783
  valid: false,
543
- error: `Invalid value for parameter '${paramName}': got '${modelValue}', expected one of '${possibleValues}'.`,
784
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
785
+ modelValue
786
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
544
787
  error_type: "value_error:other"
545
788
  };
546
789
  }
@@ -636,7 +879,8 @@ function check(testCase, modelOutput, possibleAnswer) {
636
879
  if (!modelOutput || modelOutput.length !== 1) {
637
880
  return {
638
881
  valid: false,
639
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`
882
+ error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
883
+ error_type: "simple:wrong_count"
640
884
  };
641
885
  }
642
886
  return simpleFunctionChecker(
@@ -665,7 +909,11 @@ function check(testCase, modelOutput, possibleAnswer) {
665
909
  }
666
910
  return { valid: true };
667
911
  } catch (e) {
668
- return { valid: false, error: `Checker Error: ${e.message}` };
912
+ return {
913
+ valid: false,
914
+ error: `Checker Error: ${e.message}`,
915
+ error_type: "checker_error"
916
+ };
669
917
  }
670
918
  }
671
919
  function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
@@ -717,7 +965,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
717
965
  if (copy.items) copy.items = fixSchema(copy.items);
718
966
  return copy;
719
967
  };
720
- for (const testCase of testCases) {
968
+ const concurrencyEnv = process.env.BFCL_CONCURRENCY;
969
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
970
+ logs.push(
971
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
972
+ );
973
+ const runSingleCase = async (testCase) => {
974
+ const caseLogs = [];
721
975
  const { function: tools, question: messages } = testCase;
722
976
  try {
723
977
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
@@ -735,33 +989,49 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
735
989
  type: "function",
736
990
  name: sanitized,
737
991
  description: t.description,
738
- // Mark as JSON schema explicitly to prevent Zod parsing
739
- inputSchema: (0, import_ai2.jsonSchema)(inputSchema)
992
+ inputSchema
740
993
  };
741
994
  });
995
+ const toolsMap = Object.fromEntries(
996
+ transformedTools.map((t) => [
997
+ t.name,
998
+ (0, import_ai2.tool)({
999
+ description: typeof t.description === "string" ? t.description : void 0,
1000
+ inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
1001
+ })
1002
+ ])
1003
+ );
742
1004
  try {
743
1005
  const firstTool = transformedTools[0];
744
1006
  const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
745
- logs.push(
1007
+ caseLogs.push(
746
1008
  `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
747
1009
  );
748
1010
  } catch (e) {
749
- logs.push(
1011
+ caseLogs.push(
750
1012
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
751
1013
  );
752
1014
  }
753
1015
  const { toolCalls, text, finishReason } = await (0, import_ai2.generateText)({
754
1016
  model,
755
1017
  messages: flatMessages,
756
- tools: transformedTools,
757
- toolChoice: "required"
1018
+ tools: toolsMap,
1019
+ toolChoice: "auto",
1020
+ // Pass original schema information to middleware
1021
+ providerOptions: {
1022
+ toolCallMiddleware: {
1023
+ originalToolSchemas: Object.fromEntries(
1024
+ transformedTools.map((t) => [t.name, t.inputSchema])
1025
+ )
1026
+ }
1027
+ }
758
1028
  });
759
1029
  try {
760
- logs.push(
1030
+ caseLogs.push(
761
1031
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
762
1032
  );
763
1033
  } catch {
764
- logs.push(
1034
+ caseLogs.push(
765
1035
  `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
766
1036
  );
767
1037
  }
@@ -794,20 +1064,221 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
794
1064
  possibleAnswer
795
1065
  );
796
1066
  if (checkerResult.valid) {
797
- correctCount++;
798
- logs.push(`[PASS] ${testCase.id}`);
1067
+ caseLogs.push(`[PASS] ${testCase.id}`);
1068
+ return { valid: true, logs: caseLogs };
799
1069
  } else {
800
- logs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1070
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1071
+ try {
1072
+ const category = testCase.id.split("_")[0];
1073
+ const diff = [];
1074
+ const summarizeArgs = (args) => {
1075
+ if (args == null) return args;
1076
+ if (typeof args !== "object") return args;
1077
+ return Object.keys(args).sort().reduce((acc, k) => {
1078
+ acc[k] = args[k];
1079
+ return acc;
1080
+ }, {});
1081
+ };
1082
+ const expected = {};
1083
+ const actual = {};
1084
+ if (category === "simple") {
1085
+ const funcDesc = tools[0];
1086
+ const gt = possibleAnswer.ground_truth?.[0];
1087
+ const expectedFuncName = funcDesc?.name;
1088
+ const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
1089
+ const received = restoredCalls[0];
1090
+ const receivedName = received?.toolName ?? received?.name;
1091
+ const receivedArgs = summarizeArgs(received?.args);
1092
+ expected.function = expectedFuncName;
1093
+ expected.params = expectedParams;
1094
+ actual.function = receivedName;
1095
+ actual.args = receivedArgs;
1096
+ if (expectedFuncName !== receivedName) {
1097
+ diff.push(`@@ function name`);
1098
+ diff.push(`- ${expectedFuncName}`);
1099
+ diff.push(`+ ${receivedName}`);
1100
+ }
1101
+ if (expectedParams && receivedArgs) {
1102
+ const required = funcDesc?.parameters?.required ?? [];
1103
+ for (const req of required) {
1104
+ if (!(req in receivedArgs)) {
1105
+ diff.push(`- missing required param: ${req}`);
1106
+ }
1107
+ }
1108
+ for (const k of Object.keys(receivedArgs)) {
1109
+ if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1110
+ diff.push(`+ unexpected param: ${k}`);
1111
+ }
1112
+ }
1113
+ for (const k of Object.keys(receivedArgs)) {
1114
+ if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1115
+ const allowed = expectedParams[k];
1116
+ const got = receivedArgs[k];
1117
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
1118
+ try {
1119
+ if (Array.isArray(got)) {
1120
+ return JSON.stringify(
1121
+ got.map((x) => String(x)).sort()
1122
+ ) === JSON.stringify(
1123
+ v.map((x) => String(x)).sort()
1124
+ );
1125
+ }
1126
+ } catch {
1127
+ }
1128
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
1129
+ });
1130
+ if (!includes) {
1131
+ diff.push(`@@ param ${k}`);
1132
+ diff.push(
1133
+ `- expected one of: ${JSON.stringify(allowed)}`
1134
+ );
1135
+ diff.push(`+ got: ${JSON.stringify(got)}`);
1136
+ }
1137
+ }
1138
+ }
1139
+ }
1140
+ } else {
1141
+ const gtArr = possibleAnswer.ground_truth ?? [];
1142
+ const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
1143
+ const actualNames = restoredCalls.map(
1144
+ (c) => c.toolName ?? c.name
1145
+ );
1146
+ expected.functions = expectedNames;
1147
+ actual.functions = actualNames;
1148
+ if (expectedNames.length !== actualNames.length) {
1149
+ diff.push(`@@ call count`);
1150
+ diff.push(`- expected ${expectedNames.length}`);
1151
+ diff.push(`+ got ${actualNames.length}`);
1152
+ }
1153
+ const missing = expectedNames.filter(
1154
+ (n) => !actualNames.includes(n)
1155
+ );
1156
+ const extra = actualNames.filter(
1157
+ (n) => !expectedNames.includes(n)
1158
+ );
1159
+ for (const m of missing)
1160
+ diff.push(`- missing function: ${m}`);
1161
+ for (const e of extra)
1162
+ diff.push(`+ unexpected function: ${e}`);
1163
+ const usedActual = /* @__PURE__ */ new Set();
1164
+ for (const expectedObj of gtArr) {
1165
+ const fname = Object.keys(expectedObj)[0];
1166
+ let matchedIndex = -1;
1167
+ for (let i = 0; i < restoredCalls.length; i++) {
1168
+ if (usedActual.has(i)) continue;
1169
+ const rc = restoredCalls[i];
1170
+ const rcName = rc?.toolName ?? rc?.name;
1171
+ if (rcName === fname) {
1172
+ matchedIndex = i;
1173
+ break;
1174
+ }
1175
+ }
1176
+ if (matchedIndex === -1) continue;
1177
+ usedActual.add(matchedIndex);
1178
+ const received = restoredCalls[matchedIndex];
1179
+ const receivedArgs = summarizeArgs(received?.args);
1180
+ const expectedParamsAllowed = expectedObj[fname];
1181
+ const funcDesc = tools.find(
1182
+ (t) => t.name === fname
1183
+ );
1184
+ const requiredParams = funcDesc?.parameters?.required ?? [];
1185
+ diff.push(`@@ function ${fname}`);
1186
+ if (expectedParamsAllowed && receivedArgs) {
1187
+ for (const req of requiredParams) {
1188
+ if (!(req in receivedArgs)) {
1189
+ diff.push(`- missing required param: ${req}`);
1190
+ }
1191
+ }
1192
+ for (const k of Object.keys(receivedArgs)) {
1193
+ if (!Object.prototype.hasOwnProperty.call(
1194
+ expectedParamsAllowed,
1195
+ k
1196
+ )) {
1197
+ diff.push(`+ unexpected param: ${k}`);
1198
+ }
1199
+ }
1200
+ for (const k of Object.keys(receivedArgs)) {
1201
+ if (Object.prototype.hasOwnProperty.call(
1202
+ expectedParamsAllowed,
1203
+ k
1204
+ )) {
1205
+ const allowed = expectedParamsAllowed[k];
1206
+ const got = receivedArgs[k];
1207
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
1208
+ try {
1209
+ if (Array.isArray(got)) {
1210
+ return JSON.stringify(
1211
+ got.map((x) => String(x)).sort()
1212
+ ) === JSON.stringify(
1213
+ v.map((x) => String(x)).sort()
1214
+ );
1215
+ }
1216
+ } catch {
1217
+ }
1218
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
1219
+ });
1220
+ if (!includes) {
1221
+ diff.push(`@@ param ${k}`);
1222
+ diff.push(
1223
+ `- expected one of: ${JSON.stringify(allowed)}`
1224
+ );
1225
+ diff.push(`+ got: ${JSON.stringify(got)}`);
1226
+ }
1227
+ }
1228
+ }
1229
+ }
1230
+ }
1231
+ }
1232
+ caseLogs.push(
1233
+ `[DEBUG-FAIL] ${JSON.stringify({
1234
+ id: testCase.id,
1235
+ message: checkerResult.error,
1236
+ error_type: checkerResult.error_type,
1237
+ expected,
1238
+ actual,
1239
+ diff
1240
+ })}`
1241
+ );
1242
+ } catch {
1243
+ caseLogs.push(
1244
+ `[DEBUG] ${testCase.id}: failed to build debug diff`
1245
+ );
1246
+ }
1247
+ return { valid: false, logs: caseLogs };
801
1248
  }
802
1249
  } catch (e) {
803
- logs.push(
1250
+ caseLogs.push(
804
1251
  `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
805
1252
  );
806
1253
  if (e?.stack) {
807
- logs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1254
+ caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
808
1255
  }
1256
+ return { valid: false, logs: caseLogs };
809
1257
  }
810
- }
1258
+ };
1259
/**
 * Maps `items` through an async `mapper`, keeping at most `limit2` calls in
 * flight, and returns the results in input order.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} limit2 - Requested concurrency. Fractions are floored, and
 *   values below 1 or not numeric fall back to 1.
 * @param {(item: any, index: number) => Promise<any>} mapper - Async transform.
 * @returns {Promise<Array>} Results positioned at their input's index.
 */
const mapWithConcurrency = async (items, limit2, mapper) => {
  const results = new Array(items.length);
  // An array length must be a non-negative integer: a fractional or NaN limit
  // would throw a RangeError, and a limit of 0 would start no workers and
  // leave every slot unfilled. Normalize to an integer >= 1 first.
  const requested = Math.floor(Number(limit2));
  const safeLimit = Number.isNaN(requested) || requested < 1 ? 1 : requested;
  const workerCount = Math.min(safeLimit, items.length);
  let nextIndex = 0;
  // Each worker claims the next unprocessed index until none remain. Claiming
  // happens synchronously (no await between check and increment), so no index
  // is handed out twice.
  const worker = async () => {
    while (nextIndex < items.length) {
      const current = nextIndex++;
      results[current] = await mapper(items[current], current);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
};
1272
+ const resultsPerCase = await mapWithConcurrency(
1273
+ testCases,
1274
+ concurrency,
1275
+ async (tc) => runSingleCase(tc)
1276
+ );
1277
+ correctCount = resultsPerCase.reduce(
1278
+ (acc, r) => acc + (r.valid ? 1 : 0),
1279
+ 0
1280
+ );
1281
+ for (const r of resultsPerCase) logs.push(...r.logs);
811
1282
  if (testCases.length === 0) {
812
1283
  return {
813
1284
  score: 0,