@ai-sdk-tool/eval 0.1.7 → 1.0.0-canary.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -40,386 +40,98 @@ __export(index_exports, {
40
40
  });
41
41
  module.exports = __toCommonJS(index_exports);
42
42
 
43
- // src/reporters/console.ts
44
- var colors = {
45
- reset: "\x1B[0m",
46
- green: "\x1B[32m",
47
- red: "\x1B[31m",
48
- yellow: "\x1B[33m",
49
- cyan: "\x1B[36m",
50
- magenta: "\x1B[35m",
51
- gray: "\x1B[90m"
52
- };
53
- function printResult(result) {
54
- const { model, modelKey, benchmark, result: benchmarkResult } = result;
55
- const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
56
- console.log(
57
- `
58
- ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
59
- );
60
- console.log(
61
- ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
62
- );
63
- const metrics = Object.entries(benchmarkResult.metrics);
64
- if (metrics.length > 0) {
65
- console.log(" Metrics:");
66
- for (const [key, value] of metrics) {
67
- console.log(` - ${key}: ${value}`);
68
- }
69
- }
70
- if (benchmarkResult.error) {
71
- console.log(
72
- ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
73
- );
74
- }
75
- }
76
- function consoleReporter(results) {
77
- console.log("\n--- \u{1F4CA} Evaluation Report ---");
78
- for (const result of results) {
79
- printResult(result);
80
- }
81
- console.log("\n---------------------------\n");
82
- }
43
+ // src/benchmarks/bfcl.ts
44
+ var import_node_fs2 = require("fs");
45
+ var import_node_path2 = __toESM(require("path"), 1);
46
+ var import_ai = require("ai");
83
47
 
84
- // src/reporters/console.debug.ts
85
- var colors2 = {
86
- reset: "\x1B[0m",
87
- green: "\x1B[32m",
88
- red: "\x1B[31m",
89
- yellow: "\x1B[33m",
90
- cyan: "\x1B[36m",
91
- magenta: "\x1B[35m",
92
- gray: "\x1B[90m",
93
- bold: "\x1B[1m",
94
- underline: "\x1B[4m"
95
- };
96
- function colorizeDiffLine(line) {
97
- if (line.startsWith("+")) return `${colors2.green}${line}${colors2.reset}`;
98
- if (line.startsWith("-")) return `${colors2.red}${line}${colors2.reset}`;
99
- if (line.startsWith("@"))
100
- return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
101
- return line;
102
- }
103
- function uniqueLines(lines) {
104
- const seen = /* @__PURE__ */ new Set();
105
- const out = [];
106
- for (const l of lines) {
107
- if (seen.has(l)) continue;
108
- seen.add(l);
109
- out.push(l);
110
- }
111
- return out;
112
- }
113
- function suggestFixFromDiff(parsed) {
114
- const suggestions = [];
115
- const { error_type, expected, actual, diff } = parsed ?? {};
116
- if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
117
- const expectedName = expected?.function;
118
- const actualName = actual?.function;
119
- if (expectedName && actualName && expectedName !== actualName) {
120
- suggestions.push(
121
- `Call the function '${expectedName}' instead of '${actualName}'.`
122
- );
123
- }
124
- if (Array.isArray(expected?.functions)) {
125
- suggestions.push(
126
- `Ensure tool calls include: ${expected.functions.join(", ")}.`
127
- );
128
- }
129
- }
130
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
131
- const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
132
- if (missing.length) {
133
- suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
134
- }
135
- }
136
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
137
- const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
138
- if (extras.length) {
139
- suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
140
- }
141
- }
142
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
143
- const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
144
- for (const param of targets) {
145
- const allowedLine = diff.find(
146
- (d) => String(d).startsWith("- expected one of:")
147
- );
148
- if (allowedLine) {
149
- const allowed = allowedLine.replace("- expected one of: ", "");
150
- suggestions.push(`Set '${param}' to one of: ${allowed}.`);
151
- } else {
152
- suggestions.push(`Adjust '${param}' to an allowed value.`);
153
- }
154
- }
155
- }
156
- if (suggestions.length === 0 && typeof error_type === "string") {
157
- if (error_type.includes("missing_required")) {
158
- suggestions.push(
159
- "Add all required parameters defined by the tool schema."
160
- );
161
- } else if (error_type.includes("unexpected_param")) {
162
- suggestions.push("Remove parameters not present in the tool schema.");
163
- } else if (error_type.includes("wrong_count")) {
164
- suggestions.push(
165
- "Adjust the number of tool calls to match expected count."
166
- );
167
- } else if (error_type.includes("wrong_func_name")) {
168
- suggestions.push("Use the exact expected function name from the schema.");
169
- } else if (error_type.includes("value_error")) {
170
- suggestions.push("Choose a value from the allowed options.");
48
+ // src/utils/paths.ts
49
+ var import_node_fs = __toESM(require("fs"), 1);
50
+ var import_node_module = require("module");
51
+ var import_node_path = __toESM(require("path"), 1);
52
+ var import_node_url = require("url");
53
+ function tryResolveViaPackageEntry(moduleUrl) {
54
+ try {
55
+ const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || import_node_path.default.join(process.cwd(), "package.json");
56
+ const requireFromEntry = (0, import_node_module.createRequire)(baseForRequireEntry);
57
+ const entryPath = requireFromEntry.resolve("@ai-sdk-tool/eval");
58
+ const entryDir = import_node_path.default.dirname(entryPath);
59
+ const guessPkgRoot = import_node_fs.default.existsSync(import_node_path.default.join(entryDir, "..")) ? import_node_path.default.resolve(entryDir, "..") : entryDir;
60
+ const dataAtRoot = import_node_path.default.join(guessPkgRoot, "data");
61
+ if (import_node_fs.default.existsSync(dataAtRoot)) {
62
+ return dataAtRoot;
171
63
  }
64
+ } catch {
172
65
  }
173
- return uniqueLines(suggestions);
66
+ return null;
174
67
  }
175
- function consoleDebugReporter(results) {
176
- console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
177
- for (const r of results) {
178
- const { model, modelKey, benchmark, result } = r;
179
- const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
180
- console.log(
181
- `
182
- ${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
183
- );
184
- console.log(
185
- ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
186
- );
187
- const metrics = Object.entries(result.metrics);
188
- if (metrics.length > 0) {
189
- console.log(" Metrics:");
190
- for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
191
- }
192
- if (result.logs && result.logs.length) {
193
- const failLogs = result.logs.filter(
194
- (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
195
- );
196
- const hasFails = failLogs.length > 0;
197
- if (hasFails) {
198
- console.log(` ${colors2.bold}Failure details:${colors2.reset}`);
199
- const debugIds = /* @__PURE__ */ new Set();
200
- for (const l of failLogs) {
201
- if (l.startsWith("[DEBUG-FAIL]")) {
202
- try {
203
- const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
204
- if (parsed?.id) debugIds.add(String(parsed.id));
205
- } catch {
206
- }
207
- }
208
- }
209
- for (const line of failLogs) {
210
- if (line.startsWith("[FAIL]")) {
211
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
212
- const failId = m?.[1];
213
- if (failId && debugIds.has(failId)) continue;
214
- console.log(` ${colors2.red}${line}${colors2.reset}`);
215
- } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
216
- console.log(` ${colors2.yellow}${line}${colors2.reset}`);
217
- } else if (line.startsWith("[STACK]")) {
218
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
219
- } else if (line.startsWith("[DEBUG-FAIL]")) {
220
- const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
221
- try {
222
- const parsed = JSON.parse(payload);
223
- const { id, expected, actual, message, diff } = parsed;
224
- console.log(
225
- ` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
226
- );
227
- if (diff && Array.isArray(diff)) {
228
- for (const dLine of diff)
229
- console.log(" " + colorizeDiffLine(dLine));
230
- } else {
231
- console.log(" expected:");
232
- console.log(
233
- colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
234
- );
235
- console.log(" actual:");
236
- console.log(
237
- colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
238
- );
239
- }
240
- const suggestions = suggestFixFromDiff(parsed);
241
- if (suggestions.length) {
242
- console.log(
243
- ` ${colors2.bold}Suggested fix:${colors2.reset}`
244
- );
245
- for (const s of suggestions) console.log(` \u2022 ${s}`);
246
- }
247
- } catch {
248
- console.log(` ${line}`);
249
- }
250
- }
251
- }
252
- } else {
253
- const info = result.logs.filter(
254
- (l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
255
- );
256
- for (const line of info)
257
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
258
- }
68
+ function tryResolveViaPackageJson(moduleUrl) {
69
+ try {
70
+ const baseForRequire = typeof moduleUrl === "string" && moduleUrl || import_node_path.default.join(process.cwd(), "package.json");
71
+ const require2 = (0, import_node_module.createRequire)(baseForRequire);
72
+ const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
73
+ const pkgDir = import_node_path.default.dirname(pkgJsonPath);
74
+ const dataAtPkg = import_node_path.default.join(pkgDir, "data");
75
+ if (import_node_fs.default.existsSync(dataAtPkg)) {
76
+ return dataAtPkg;
259
77
  }
78
+ } catch {
260
79
  }
261
- console.log("\n------------------------------------\n");
80
+ return null;
262
81
  }
263
-
264
- // src/reporters/json.ts
265
- function jsonReporter(results) {
266
- const serializableResults = results.map((r) => ({
267
- ...r,
268
- result: {
269
- ...r.result,
270
- error: r.result.error?.message
82
+ function getStartDir(moduleUrl) {
83
+ if (moduleUrl) {
84
+ try {
85
+ return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
86
+ } catch {
87
+ return process.cwd();
271
88
  }
272
- }));
273
- console.log(JSON.stringify(serializableResults, null, 2));
274
- }
275
-
276
- // src/reporters/index.ts
277
- var reporters = {
278
- console: consoleReporter,
279
- json: jsonReporter,
280
- "console.debug": consoleDebugReporter
281
- };
282
-
283
- // src/evaluate.ts
284
- async function runSingleBenchmark(model, benchmark, modelKey, config) {
285
- const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
286
- try {
287
- console.log(
288
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
289
- );
290
- const result = await benchmark.run(model, config);
291
- console.log(
292
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
293
- );
294
- return {
295
- model: modelId,
296
- modelKey,
297
- benchmark: benchmark.name,
298
- result
299
- };
300
- } catch (error) {
301
- console.error(
302
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
303
- error
304
- );
305
- return {
306
- model: modelId,
307
- modelKey,
308
- benchmark: benchmark.name,
309
- result: {
310
- score: 0,
311
- success: false,
312
- metrics: {},
313
- error: error instanceof Error ? error : new Error(String(error))
314
- }
315
- };
316
89
  }
90
+ return process.cwd();
317
91
  }
318
- async function evaluate(options) {
319
- const {
320
- models,
321
- benchmarks,
322
- reporter = "console",
323
- temperature,
324
- maxTokens
325
- } = options;
326
- const modelEntries = [];
327
- if (Array.isArray(models)) {
328
- for (const m of models) modelEntries.push([void 0, m]);
329
- } else if (typeof models === "object" && models !== null && "modelId" in models) {
330
- modelEntries.push([void 0, models]);
331
- } else {
332
- for (const [key, m] of Object.entries(
333
- models
334
- )) {
335
- modelEntries.push([key, m]);
92
+ function findDataDirByTraversal(startDir) {
93
+ let dir = startDir;
94
+ const MAX_PARENT_TRAVERSAL_DEPTH = 6;
95
+ for (let i = 0; i < MAX_PARENT_TRAVERSAL_DEPTH; i += 1) {
96
+ const dataCandidate = import_node_path.default.join(dir, "data");
97
+ if (import_node_fs.default.existsSync(dataCandidate)) {
98
+ return dataCandidate;
336
99
  }
337
- }
338
- const allResults = [];
339
- for (const [modelKey, model] of modelEntries) {
340
- for (const benchmark of benchmarks) {
341
- const config = {};
342
- if (temperature !== void 0) config.temperature = temperature;
343
- if (maxTokens !== void 0) config.maxTokens = maxTokens;
344
- const evaluationResult = await runSingleBenchmark(
345
- model,
346
- benchmark,
347
- modelKey,
348
- Object.keys(config).length > 0 ? config : void 0
349
- );
350
- allResults.push(evaluationResult);
100
+ const parent = import_node_path.default.resolve(dir, "..");
101
+ if (parent === dir) {
102
+ break;
351
103
  }
104
+ dir = parent;
352
105
  }
353
- const report = reporters[reporter];
354
- if (report) {
355
- report(allResults);
356
- } else {
357
- console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
358
- reporters.console(allResults);
359
- }
360
- return allResults;
106
+ return null;
361
107
  }
362
-
363
- // src/benchmarks/bfcl.ts
364
- var import_ai = require("ai");
365
- var import_fs2 = require("fs");
366
- var import_path2 = __toESM(require("path"), 1);
367
-
368
- // src/utils/paths.ts
369
- var import_fs = __toESM(require("fs"), 1);
370
- var import_module = require("module");
371
- var import_path = __toESM(require("path"), 1);
372
- var import_url = require("url");
373
108
  function resolveDataDir(fromModuleUrl) {
374
- const moduleUrl = fromModuleUrl;
375
109
  const override = process.env.BFCL_DATA_DIR;
376
110
  if (override && override.trim().length > 0) {
377
111
  return override;
378
112
  }
379
- try {
380
- const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || import_path.default.join(process.cwd(), "package.json");
381
- const requireFromEntry = (0, import_module.createRequire)(baseForRequireEntry);
382
- const entryPath = requireFromEntry.resolve("@ai-sdk-tool/eval");
383
- const entryDir = import_path.default.dirname(entryPath);
384
- const guessPkgRoot = import_fs.default.existsSync(import_path.default.join(entryDir, "..")) ? import_path.default.resolve(entryDir, "..") : entryDir;
385
- const dataAtRoot = import_path.default.join(guessPkgRoot, "data");
386
- if (import_fs.default.existsSync(dataAtRoot)) return dataAtRoot;
387
- } catch {
388
- }
389
- try {
390
- const baseForRequire = typeof moduleUrl === "string" && moduleUrl || import_path.default.join(process.cwd(), "package.json");
391
- const require2 = (0, import_module.createRequire)(baseForRequire);
392
- const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
393
- const pkgDir = import_path.default.dirname(pkgJsonPath);
394
- const dataAtPkg = import_path.default.join(pkgDir, "data");
395
- if (import_fs.default.existsSync(dataAtPkg)) return dataAtPkg;
396
- } catch {
113
+ const viaEntry = tryResolveViaPackageEntry(fromModuleUrl);
114
+ if (viaEntry) {
115
+ return viaEntry;
397
116
  }
398
- let startDir;
399
- if (moduleUrl) {
400
- try {
401
- startDir = import_path.default.dirname((0, import_url.fileURLToPath)(moduleUrl));
402
- } catch {
403
- startDir = process.cwd();
404
- }
405
- } else {
406
- startDir = process.cwd();
117
+ const viaPackageJson = tryResolveViaPackageJson(fromModuleUrl);
118
+ if (viaPackageJson) {
119
+ return viaPackageJson;
407
120
  }
408
- let dir = startDir;
409
- for (let i = 0; i < 6; i++) {
410
- const dataCandidate = import_path.default.join(dir, "data");
411
- if (import_fs.default.existsSync(dataCandidate)) return dataCandidate;
412
- const parent = import_path.default.resolve(dir, "..");
413
- if (parent === dir) break;
414
- dir = parent;
121
+ const startDir = getStartDir(fromModuleUrl);
122
+ const viaTraversal = findDataDirByTraversal(startDir);
123
+ if (viaTraversal) {
124
+ return viaTraversal;
415
125
  }
416
- const pkgRoot = import_path.default.resolve(startDir, "..", "..");
417
- return import_path.default.join(pkgRoot, "data");
126
+ const pkgRoot = import_node_path.default.resolve(startDir, "..", "..");
127
+ return import_node_path.default.join(pkgRoot, "data");
418
128
  }
419
129
 
420
130
  // src/benchmarks/bfcl/ast-checker.ts
421
131
  function standardizeString(input) {
422
- if (typeof input !== "string") return input;
132
+ if (typeof input !== "string") {
133
+ return input;
134
+ }
423
135
  const regex = /[ ,./\\-_*^]/g;
424
136
  return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
425
137
  }
@@ -439,23 +151,126 @@ function checkStringValue(param, modelValue, possibleAnswers) {
439
151
  }
440
152
  return { valid: true };
441
153
  }
442
- function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
443
- const modelArgs = modelToolCall.args;
444
- const modelFuncName = modelToolCall.toolName;
445
- const expectedFuncName = funcDescription.name;
446
- const expectedParams = funcDescription.parameters.properties;
447
- const requiredParams = funcDescription.parameters.required;
448
- if (modelFuncName !== expectedFuncName) {
154
+ function normalizeObject(obj) {
155
+ if (Array.isArray(obj)) {
156
+ return obj.map(normalizeObject);
157
+ }
158
+ if (obj && typeof obj === "object") {
159
+ const normalized = {};
160
+ for (const [key, value] of Object.entries(obj)) {
161
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
162
+ normalized[key] = value[0];
163
+ } else {
164
+ normalized[key] = normalizeObject(value);
165
+ }
166
+ }
167
+ return normalized;
168
+ }
169
+ return obj;
170
+ }
171
+ function valuesMatch(modelValue, possibleValue) {
172
+ if (modelValue === possibleValue) {
173
+ return true;
174
+ }
175
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
176
+ try {
177
+ const normalizedModel = normalizeObject(modelValue);
178
+ const normalizedPossible = normalizeObject(possibleValue);
179
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
180
+ } catch {
181
+ return false;
182
+ }
183
+ }
184
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
185
+ return modelValue.toString() === possibleValue;
186
+ }
187
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
188
+ return modelValue === possibleValue.toString();
189
+ }
190
+ return false;
191
+ }
192
+ function checkArrayValue(paramName, modelValue, possibleValues) {
193
+ const modelValueStr = JSON.stringify(
194
+ modelValue.map((v) => standardizeString(String(v))).sort()
195
+ );
196
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
197
+ if (!Array.isArray(p)) {
198
+ return false;
199
+ }
200
+ return JSON.stringify(p.map((v) => standardizeString(String(v))).sort()) === modelValueStr;
201
+ }) : false;
202
+ if (!hasMatch) {
449
203
  return {
450
204
  valid: false,
451
- error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
452
- error_type: "simple_function_checker:wrong_func_name"
205
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
206
+ modelValue
207
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
208
+ error_type: "value_error:list"
453
209
  };
454
210
  }
455
- const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
456
- const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
457
- for (const param of requiredParams) {
458
- if (!(param in argsObj)) {
211
+ return { valid: true };
212
+ }
213
+ function checkObjectValue(paramName, modelValue, possibleValues) {
214
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some(
215
+ (possibleValue) => valuesMatch(modelValue, possibleValue)
216
+ ) : false;
217
+ if (!hasMatch) {
218
+ return {
219
+ valid: false,
220
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
221
+ modelValue
222
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
223
+ error_type: "value_error:other"
224
+ };
225
+ }
226
+ return { valid: true };
227
+ }
228
+ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
229
+ const funcNameCheck = checkFunctionName(
230
+ funcDescription.name,
231
+ modelToolCall.toolName
232
+ );
233
+ if (!funcNameCheck.valid) {
234
+ return funcNameCheck;
235
+ }
236
+ const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
237
+ const argsObj = modelToolCall.args && typeof modelToolCall.args === "object" ? modelToolCall.args : {};
238
+ const context = {
239
+ funcDescription,
240
+ modelToolCall,
241
+ possibleAnswerParams,
242
+ expectedParams: funcDescription.parameters.properties
243
+ };
244
+ const requiredCheck = checkRequiredParams(
245
+ funcDescription.parameters.required,
246
+ argsObj
247
+ );
248
+ if (!requiredCheck.valid) {
249
+ return requiredCheck;
250
+ }
251
+ const paramsCheck = checkAllParameters(argsObj, context);
252
+ if (!paramsCheck.valid) {
253
+ return paramsCheck;
254
+ }
255
+ const optionalCheck = checkOptionalParams(argsObj, possibleAnswerParams);
256
+ if (!optionalCheck.valid) {
257
+ return optionalCheck;
258
+ }
259
+ return { valid: true };
260
+ }
261
+ function checkFunctionName(expected, actual) {
262
+ if (actual !== expected) {
263
+ return {
264
+ valid: false,
265
+ error: `Function name '${actual}' does not match expected '${expected}'.`,
266
+ error_type: "simple_function_checker:wrong_func_name"
267
+ };
268
+ }
269
+ return { valid: true };
270
+ }
271
+ function checkRequiredParams(requiredParams, argsObj) {
272
+ for (const param of requiredParams) {
273
+ if (!(param in argsObj)) {
459
274
  return {
460
275
  valid: false,
461
276
  error: `Missing required parameter: '${param}'.`,
@@ -463,103 +278,54 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
463
278
  };
464
279
  }
465
280
  }
466
- if (modelArgs && typeof modelArgs === "object") {
467
- for (const paramName of Object.keys(argsObj)) {
468
- const modelValue = argsObj[paramName];
469
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
281
+ return { valid: true };
282
+ }
283
+ function checkAllParameters(argsObj, context) {
284
+ for (const paramName of Object.keys(argsObj)) {
285
+ const paramCheck = checkSingleParameter(
286
+ paramName,
287
+ argsObj[paramName],
288
+ context
289
+ );
290
+ if (!paramCheck.valid) {
291
+ return paramCheck;
292
+ }
293
+ }
294
+ return { valid: true };
295
+ }
296
+ function checkSingleParameter(paramName, modelValue, context) {
297
+ if (!(paramName in context.expectedParams && paramName in context.possibleAnswerParams)) {
298
+ return {
299
+ valid: false,
300
+ error: `Unexpected parameter: '${paramName}'.`,
301
+ error_type: "simple_function_checker:unexpected_param"
302
+ };
303
+ }
304
+ const possibleValues = context.possibleAnswerParams[paramName];
305
+ if (typeof modelValue === "string") {
306
+ return checkStringValue(
307
+ paramName,
308
+ modelValue,
309
+ possibleValues ?? []
310
+ );
311
+ }
312
+ if (Array.isArray(modelValue)) {
313
+ return checkArrayValue(paramName, modelValue, possibleValues);
314
+ }
315
+ return checkObjectValue(paramName, modelValue, possibleValues);
316
+ }
317
+ function checkOptionalParams(argsObj, possibleAnswerParams) {
318
+ for (const paramName in possibleAnswerParams) {
319
+ if (Object.hasOwn(possibleAnswerParams, paramName)) {
320
+ const val = possibleAnswerParams[paramName];
321
+ const isOptional = Array.isArray(val) && val.includes("");
322
+ if (!(paramName in argsObj || isOptional)) {
470
323
  return {
471
324
  valid: false,
472
- error: `Unexpected parameter: '${paramName}'.`,
473
- error_type: "simple_function_checker:unexpected_param"
325
+ error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
326
+ error_type: "simple_function_checker:missing_optional"
474
327
  };
475
328
  }
476
- const possibleValues = possibleAnswerParams[paramName];
477
- if (typeof modelValue === "string") {
478
- const result = checkStringValue(
479
- paramName,
480
- modelValue,
481
- possibleValues ?? []
482
- );
483
- if (!result.valid) return result;
484
- } else if (Array.isArray(modelValue)) {
485
- const modelValueStr = JSON.stringify(
486
- modelValue.map((v) => standardizeString(String(v))).sort()
487
- );
488
- const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
489
- if (!Array.isArray(p)) return false;
490
- return JSON.stringify(
491
- p.map((v) => standardizeString(String(v))).sort()
492
- ) === modelValueStr;
493
- }) : false;
494
- if (!hasMatch) {
495
- return {
496
- valid: false,
497
- error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
498
- modelValue
499
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
500
- error_type: "value_error:list"
501
- };
502
- }
503
- } else {
504
- const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
505
- if (modelValue === possibleValue) return true;
506
- if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
507
- try {
508
- const normalizeObject = (obj) => {
509
- if (Array.isArray(obj)) {
510
- return obj.map(normalizeObject);
511
- }
512
- if (obj && typeof obj === "object") {
513
- const normalized = {};
514
- for (const [key, value] of Object.entries(
515
- obj
516
- )) {
517
- if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
518
- normalized[key] = value[0];
519
- } else {
520
- normalized[key] = normalizeObject(value);
521
- }
522
- }
523
- return normalized;
524
- }
525
- return obj;
526
- };
527
- const normalizedModel = normalizeObject(modelValue);
528
- const normalizedPossible = normalizeObject(possibleValue);
529
- return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
530
- } catch {
531
- return false;
532
- }
533
- }
534
- if (typeof modelValue === "number" && typeof possibleValue === "string") {
535
- return modelValue.toString() === possibleValue;
536
- }
537
- if (typeof modelValue === "string" && typeof possibleValue === "number") {
538
- return modelValue === possibleValue.toString();
539
- }
540
- return false;
541
- }) : false;
542
- if (!hasMatch) {
543
- return {
544
- valid: false,
545
- error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
546
- modelValue
547
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
548
- error_type: "value_error:other"
549
- };
550
- }
551
- }
552
- }
553
- }
554
- for (const paramName in possibleAnswerParams) {
555
- const val = possibleAnswerParams[paramName];
556
- const isOptional = Array.isArray(val) && val.includes("");
557
- if (!(paramName in argsObj) && !isOptional) {
558
- return {
559
- valid: false,
560
- error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
561
- error_type: "simple_function_checker:missing_optional"
562
- };
563
329
  }
564
330
  }
565
331
  return { valid: true };
@@ -586,8 +352,10 @@ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possib
586
352
  };
587
353
  }
588
354
  let foundMatch = false;
589
- for (let i = 0; i < modelToolCalls.length; i++) {
590
- if (matchedModelCallIndices.has(i)) continue;
355
+ for (let i = 0; i < modelToolCalls.length; i += 1) {
356
+ if (matchedModelCallIndices.has(i)) {
357
+ continue;
358
+ }
591
359
  const checkerResult = simpleFunctionChecker(
592
360
  funcDescription,
593
361
  modelToolCalls[i],
@@ -636,6 +404,8 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
636
404
  }
637
405
 
638
406
  // src/benchmarks/bfcl.ts
407
+ var LINE_SPLIT_REGEX = /\r?\n/;
408
+ var NUMERIC_STRING_REGEX = /^\d+$/;
639
409
  function check(testCase, modelOutput, possibleAnswer) {
640
410
  const category = testCase.id.split("_")[0];
641
411
  try {
@@ -652,19 +422,22 @@ function check(testCase, modelOutput, possibleAnswer) {
652
422
  modelOutput[0],
653
423
  possibleAnswer.ground_truth[0]
654
424
  );
655
- } else if (category === "parallel") {
425
+ }
426
+ if (category === "parallel") {
656
427
  return parallelFunctionCheckerNoOrder(
657
428
  testCase.function,
658
429
  modelOutput,
659
430
  possibleAnswer.ground_truth
660
431
  );
661
- } else if (category === "multiple") {
432
+ }
433
+ if (category === "multiple") {
662
434
  return multipleFunctionChecker(
663
435
  testCase.function,
664
436
  modelOutput,
665
437
  possibleAnswer.ground_truth
666
438
  );
667
- } else if (category.includes("parallel-multiple")) {
439
+ }
440
+ if (category.includes("parallel-multiple")) {
668
441
  return parallelFunctionCheckerNoOrder(
669
442
  testCase.function,
670
443
  modelOutput,
@@ -692,16 +465,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
692
465
  try {
693
466
  const dataPath = resolveDataDir();
694
467
  logs.push(`[INFO] Using data dir: ${dataPath}`);
695
- const testCasesJson = await import_fs2.promises.readFile(
696
- import_path2.default.join(dataPath, testDataFile),
468
+ const testCasesJson = await import_node_fs2.promises.readFile(
469
+ import_node_path2.default.join(dataPath, testDataFile),
697
470
  "utf-8"
698
471
  );
699
- const possibleAnswersJson = await import_fs2.promises.readFile(
700
- import_path2.default.join(dataPath, answerDataFile),
472
+ const possibleAnswersJson = await import_node_fs2.promises.readFile(
473
+ import_node_path2.default.join(dataPath, answerDataFile),
701
474
  "utf-8"
702
475
  );
703
- testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
704
- const possibleAnswers = possibleAnswersJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
476
+ testCases = testCasesJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
477
+ const possibleAnswers = possibleAnswersJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
705
478
  const possibleAnswersMap = new Map(
706
479
  possibleAnswers.map((ans) => [ans.id, ans])
707
480
  );
@@ -713,319 +486,600 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
713
486
  `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
714
487
  );
715
488
  }
489
+ const fixSchemaType = (copy) => {
490
+ if (!copy.type) {
491
+ return;
492
+ }
493
+ if (copy.type === "dict") {
494
+ copy.type = "object";
495
+ }
496
+ if (copy.type === "tuple") {
497
+ copy.type = "array";
498
+ }
499
+ if (copy.type === "integer" || copy.type === "float") {
500
+ copy.type = "number";
501
+ }
502
+ };
503
+ const fixSchemaProperties = (copy, fixSchemaFn) => {
504
+ if (!copy.properties || typeof copy.properties !== "object") {
505
+ return;
506
+ }
507
+ for (const k of Object.keys(copy.properties)) {
508
+ copy.properties[k] = fixSchemaFn(
509
+ copy.properties[k]
510
+ );
511
+ }
512
+ };
716
513
  const fixSchema = (schema) => {
717
- if (!schema || typeof schema !== "object")
514
+ if (!schema || typeof schema !== "object") {
718
515
  return { type: "object", properties: {} };
516
+ }
719
517
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
720
518
  if (!Array.isArray(copy)) {
721
- if (copy.type) {
722
- if (copy.type === "dict") copy.type = "object";
723
- if (copy.type === "integer" || copy.type === "float")
724
- copy.type = "number";
725
- }
726
- if (copy.properties && typeof copy.properties === "object") {
727
- for (const k of Object.keys(copy.properties)) {
728
- copy.properties[k] = fixSchema(
729
- copy.properties[k]
730
- );
731
- }
519
+ fixSchemaType(copy);
520
+ fixSchemaProperties(copy, fixSchema);
521
+ if (copy.items) {
522
+ copy.items = fixSchema(copy.items);
732
523
  }
733
- if (copy.items) copy.items = fixSchema(copy.items);
734
524
  return copy;
735
525
  }
736
526
  return copy;
737
527
  };
528
+ const flattenMessages = (messages) => Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
529
+ const sanitizeName = (toolName) => {
530
+ const s = toolName.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
531
+ return s.length > 0 ? s : "tool";
532
+ };
533
+ const buildTransformedTools = (tools, fixSchemaFn) => {
534
+ const nameMap = /* @__PURE__ */ new Map();
535
+ const transformedTools = tools.map((t) => {
536
+ const fixed = fixSchemaFn(t.parameters);
537
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
538
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
539
+ const sanitized = sanitizeName(t.name);
540
+ nameMap.set(sanitized, t.name);
541
+ return {
542
+ type: "function",
543
+ name: sanitized,
544
+ description: t.description,
545
+ inputSchema
546
+ };
547
+ });
548
+ return { transformedTools, nameMap };
549
+ };
550
+ const parseDebugToolCalls = (raw) => {
551
+ if (!raw) {
552
+ return [];
553
+ }
554
+ try {
555
+ const arr = JSON.parse(raw);
556
+ return Array.isArray(arr) ? arr : [];
557
+ } catch {
558
+ return [];
559
+ }
560
+ };
561
+ const getSanitizedName = (rawName, transformedTools) => {
562
+ if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
563
+ return transformedTools[Number(rawName)]?.name ?? rawName;
564
+ }
565
+ return rawName;
566
+ };
567
+ const parseToolArgs = (extractedArgs) => {
568
+ if (typeof extractedArgs !== "string") {
569
+ return extractedArgs;
570
+ }
571
+ try {
572
+ return JSON.parse(extractedArgs);
573
+ } catch {
574
+ return extractedArgs;
575
+ }
576
+ };
577
+ const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
578
+ const call = c;
579
+ const rawName = call.toolName ?? call.name;
580
+ const sanitizedFromIndex = getSanitizedName(
581
+ rawName,
582
+ transformedTools
583
+ );
584
+ const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
585
+ const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
586
+ const parsedArgs = parseToolArgs(extractedArgs);
587
+ return {
588
+ ...call,
589
+ toolName: originalName,
590
+ name: originalName,
591
+ args: parsedArgs ?? {}
592
+ };
593
+ });
594
+ const summarizeArgs = (args) => {
595
+ if (args == null) {
596
+ return args;
597
+ }
598
+ if (typeof args !== "object") {
599
+ return args;
600
+ }
601
+ return Object.keys(args).sort().reduce(
602
+ (acc, k) => {
603
+ acc[k] = args[k];
604
+ return acc;
605
+ },
606
+ {}
607
+ );
608
+ };
609
+ const generateParamMismatchDiff = (paramName, allowed, got) => {
610
+ const diffLines = [];
611
+ diffLines.push(`@@ param ${paramName}`);
612
+ const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
613
+ const expectedLine = (() => {
614
+ if (allowedArray.length === 1) {
615
+ return `- expected: ${JSON.stringify(allowedArray[0])}`;
616
+ }
617
+ const formatted = allowedArray.map(
618
+ (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
619
+ ).join(", ");
620
+ return `- expected one of: ${formatted}`;
621
+ })();
622
+ diffLines.push(expectedLine);
623
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
624
+ return diffLines;
625
+ };
626
+ const paramValueMatches = (allowed, got) => {
627
+ if (!Array.isArray(allowed)) {
628
+ return false;
629
+ }
630
+ return allowed.some((v) => {
631
+ try {
632
+ if (Array.isArray(got)) {
633
+ return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
634
+ }
635
+ } catch {
636
+ }
637
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
638
+ });
639
+ };
640
+ const checkFunctionNameMismatch = (expectedName, receivedName, diff) => {
641
+ if (expectedName !== receivedName) {
642
+ diff.push("@@ function name");
643
+ diff.push(`- ${expectedName}`);
644
+ diff.push(`+ ${receivedName}`);
645
+ }
646
+ };
647
+ const checkMissingParams = (required, receivedArgs, diff) => {
648
+ for (const req of required) {
649
+ if (!(req in receivedArgs)) {
650
+ diff.push(`- missing required param: ${req}`);
651
+ }
652
+ }
653
+ };
654
+ const checkUnexpectedParams = (expectedParams, receivedArgs, diff) => {
655
+ for (const k of Object.keys(receivedArgs)) {
656
+ if (!(k in expectedParams)) {
657
+ diff.push(`+ unexpected param: ${k}`);
658
+ }
659
+ }
660
+ };
661
+ const checkParamValueMismatches = (expectedParams, receivedArgs, diff) => {
662
+ for (const k of Object.keys(receivedArgs)) {
663
+ if (k in expectedParams) {
664
+ const allowed = expectedParams[k];
665
+ const got = receivedArgs[k];
666
+ if (!paramValueMatches(allowed, got)) {
667
+ diff.push(...generateParamMismatchDiff(k, allowed, got));
668
+ }
669
+ }
670
+ }
671
+ };
672
+ const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
673
+ const funcDesc = tools[0];
674
+ const gt = possibleAnswer.ground_truth?.[0];
675
+ const expectedFuncName = funcDesc?.name;
676
+ const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
677
+ const received = restoredCalls[0];
678
+ const receivedName = received?.toolName ?? received?.name;
679
+ const receivedArgs = summarizeArgs(received?.args);
680
+ const expected = {
681
+ function: expectedFuncName,
682
+ params: expectedParams
683
+ };
684
+ const actual = {
685
+ function: receivedName,
686
+ args: receivedArgs
687
+ };
688
+ const diff = [];
689
+ checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
690
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
691
+ const required = funcDesc?.parameters?.required ?? [];
692
+ checkMissingParams(
693
+ required,
694
+ receivedArgs,
695
+ diff
696
+ );
697
+ checkUnexpectedParams(
698
+ expectedParams,
699
+ receivedArgs,
700
+ diff
701
+ );
702
+ checkParamValueMismatches(
703
+ expectedParams,
704
+ receivedArgs,
705
+ diff
706
+ );
707
+ }
708
+ return { expected, actual, diff };
709
+ };
710
+ const checkCallCountMismatch = (expectedCount, actualCount, diff) => {
711
+ if (expectedCount !== actualCount) {
712
+ diff.push("@@ call count");
713
+ diff.push(`- expected ${expectedCount}`);
714
+ diff.push(`+ got ${actualCount}`);
715
+ }
716
+ };
717
+ const addMissingAndExtraFunctions = (expectedNames, actualNames, diff) => {
718
+ const missing = expectedNames.filter((n) => !actualNames.includes(n));
719
+ const extra = actualNames.filter((n) => !expectedNames.includes(n));
720
+ for (const m of missing) {
721
+ diff.push(`- missing function: ${m}`);
722
+ }
723
+ for (const e of extra) {
724
+ diff.push(`+ unexpected function: ${e}`);
725
+ }
726
+ };
727
+ const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
728
+ for (let i = 0; i < restoredCalls.length; i += 1) {
729
+ if (usedActual.has(i)) {
730
+ continue;
731
+ }
732
+ const rc = restoredCalls[i];
733
+ const rcName = rc?.toolName ?? rc?.name;
734
+ if (rcName === fname) {
735
+ return i;
736
+ }
737
+ }
738
+ return -1;
739
+ };
740
+ const validateFunctionParams = (options) => {
741
+ const { receivedArgs, expectedParamsAllowed, requiredParams, diff } = options;
742
+ checkMissingParams(requiredParams, receivedArgs, diff);
743
+ checkUnexpectedParams(expectedParamsAllowed, receivedArgs, diff);
744
+ checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
745
+ };
746
+ const processExpectedCall = (options) => {
747
+ const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
748
+ const fname = Object.keys(expectedObj)[0];
749
+ const matchedIndex = findMatchingCallIndex(
750
+ fname,
751
+ restoredCalls,
752
+ usedActual
753
+ );
754
+ if (matchedIndex === -1) {
755
+ return;
756
+ }
757
+ usedActual.add(matchedIndex);
758
+ const received = restoredCalls[matchedIndex];
759
+ const receivedArgs = summarizeArgs(received?.args);
760
+ const expectedParamsAllowed = expectedObj[fname];
761
+ const funcDesc = tools.find((t) => t.name === fname);
762
+ const requiredParams = funcDesc?.parameters?.required ?? [];
763
+ diff.push(`@@ function ${fname}`);
764
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
765
+ validateFunctionParams({
766
+ receivedArgs,
767
+ expectedParamsAllowed,
768
+ requiredParams,
769
+ diff
770
+ });
771
+ }
772
+ };
773
+ const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
774
+ const gtArr = possibleAnswer.ground_truth ?? [];
775
+ const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
776
+ const actualNames = restoredCalls.map(
777
+ (c) => c.toolName ?? c.name
778
+ );
779
+ const expected = {
780
+ functions: expectedNames
781
+ };
782
+ const actual = { functions: actualNames };
783
+ const diff = [];
784
+ checkCallCountMismatch(
785
+ expectedNames.length,
786
+ actualNames.length,
787
+ diff
788
+ );
789
+ addMissingAndExtraFunctions(expectedNames, actualNames, diff);
790
+ const usedActual = /* @__PURE__ */ new Set();
791
+ for (const expectedObj of gtArr) {
792
+ processExpectedCall({
793
+ expectedObj,
794
+ restoredCalls,
795
+ tools,
796
+ usedActual,
797
+ diff
798
+ });
799
+ }
800
+ return { expected, actual, diff };
801
+ };
738
802
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
739
803
  const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
740
804
  logs.push(
741
805
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
742
806
  );
743
- const runSingleCase = async (testCase) => {
744
- const caseLogs = [];
745
- const { function: tools, question: messages } = testCase;
746
- const temp = config?.temperature;
747
- const temperature = typeof temp === "number" ? temp : void 0;
748
- const maxTok = config?.maxTokens;
749
- const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
807
+ const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
750
808
  try {
751
- const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
752
- const nameMap = /* @__PURE__ */ new Map();
753
- const sanitizeName = (name2) => {
754
- const s = name2.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
755
- return s.length > 0 ? s : "tool";
756
- };
757
- const transformedTools = tools.map((t) => {
758
- const fixed = fixSchema(t.parameters);
759
- const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
760
- const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
761
- const sanitized = sanitizeName(t.name);
762
- nameMap.set(sanitized, t.name);
763
- return {
764
- type: "function",
765
- name: sanitized,
766
- description: t.description,
767
- inputSchema
768
- };
769
- });
770
- const toolsMap = Object.fromEntries(
771
- transformedTools.map((t) => [
772
- t.name,
773
- (0, import_ai.tool)({
774
- description: typeof t.description === "string" ? t.description : void 0,
775
- inputSchema: (0, import_ai.jsonSchema)(t.inputSchema)
776
- })
777
- ])
809
+ const firstTool = transformedTools[0];
810
+ const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
811
+ caseLogs.push(
812
+ `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
778
813
  );
779
- try {
780
- const firstTool = transformedTools[0];
781
- const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
782
- caseLogs.push(
783
- `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
784
- );
785
- } catch (e) {
786
- caseLogs.push(
787
- `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
788
- );
814
+ } catch (e) {
815
+ caseLogs.push(
816
+ `[DEBUG] ${testCaseId}: failed to introspect tools: ${e.message}`
817
+ );
818
+ }
819
+ };
820
+ const logRawToolCalls = (options) => {
821
+ const { toolCalls, finishReason, text, testCaseId, caseLogs } = options;
822
+ try {
823
+ caseLogs.push(
824
+ `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
825
+ );
826
+ } catch {
827
+ caseLogs.push(
828
+ `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
829
+ );
830
+ }
831
+ };
832
+ const buildFailureContext = (options) => {
833
+ const {
834
+ testCase,
835
+ tools,
836
+ flatMessages,
837
+ mwOriginalText,
838
+ text,
839
+ finishReason,
840
+ mwParsedToolCalls,
841
+ restoredCalls,
842
+ possibleAnswer
843
+ } = options;
844
+ const lastUser = (() => {
845
+ const reversed = [...flatMessages].reverse();
846
+ const found = reversed.find(
847
+ (m) => m.role === "user"
848
+ );
849
+ return found?.content ?? void 0;
850
+ })();
851
+ const rawModelText = (() => {
852
+ if (mwOriginalText && mwOriginalText.length > 0) {
853
+ return mwOriginalText;
789
854
  }
790
- const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
791
- model,
792
- messages: flatMessages,
793
- tools: toolsMap,
794
- toolChoice: "auto",
795
- ...temperature !== void 0 ? { temperature } : {},
796
- ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
797
- });
855
+ if (typeof text === "string") {
856
+ return text;
857
+ }
858
+ return "";
859
+ })();
860
+ return {
861
+ id: testCase.id,
862
+ tool_schema: tools,
863
+ last_user_query: lastUser,
864
+ raw_model_text: rawModelText,
865
+ finish_reason: finishReason,
866
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
867
+ ground_truth: possibleAnswer.ground_truth
868
+ };
869
+ };
870
+ const logFailureDetails = (options) => {
871
+ const {
872
+ testCase,
873
+ tools,
874
+ possibleAnswer,
875
+ restoredCalls,
876
+ checkerResult,
877
+ flatMessages,
878
+ mwOriginalText,
879
+ text,
880
+ finishReason,
881
+ mwParsedToolCalls,
882
+ caseLogs
883
+ } = options;
884
+ try {
885
+ const category = testCase.id.split("_")[0];
886
+ const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
887
+ tools,
888
+ possibleAnswer,
889
+ restoredCalls
890
+ ) : buildParallelDiff(
891
+ tools,
892
+ possibleAnswer,
893
+ restoredCalls
894
+ );
895
+ caseLogs.push(
896
+ `[DEBUG-FAIL] ${JSON.stringify({
897
+ id: testCase.id,
898
+ message: checkerResult.error,
899
+ error_type: checkerResult.error_type,
900
+ expected,
901
+ actual,
902
+ diff
903
+ })}`
904
+ );
798
905
  try {
906
+ const contextPayload = buildFailureContext({
907
+ testCase,
908
+ tools,
909
+ flatMessages,
910
+ mwOriginalText,
911
+ text,
912
+ finishReason,
913
+ mwParsedToolCalls,
914
+ restoredCalls,
915
+ possibleAnswer
916
+ });
799
917
  caseLogs.push(
800
- `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
918
+ `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
801
919
  );
802
920
  } catch {
803
- caseLogs.push(
804
- `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
805
- );
806
921
  }
807
- const possibleAnswer = possibleAnswersMap.get(testCase.id);
808
- if (!possibleAnswer) {
809
- throw new Error(`No possible answer for id: ${testCase.id}`);
922
+ } catch {
923
+ caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
924
+ }
925
+ };
926
+ const buildToolsMap = (transformedTools) => Object.fromEntries(
927
+ transformedTools.map((t) => [
928
+ t.name,
929
+ (0, import_ai.tool)({
930
+ description: typeof t.description === "string" ? t.description : void 0,
931
+ inputSchema: (0, import_ai.jsonSchema)(
932
+ t.inputSchema
933
+ )
934
+ })
935
+ ])
936
+ );
937
+ const executeModelGeneration = async (options) => {
938
+ const {
939
+ model: modelInstance,
940
+ flatMessages,
941
+ toolsMap,
942
+ temperature,
943
+ maxTokens
944
+ } = options;
945
+ const debugSummaryRef = {};
946
+ const providerOptions = {
947
+ toolCallMiddleware: {
948
+ debugSummary: debugSummaryRef
810
949
  }
811
- const restoredCalls = (toolCalls || []).map((c) => {
812
- const rawName = c.toolName ?? c.name;
813
- const sanitizedFromIndex = typeof rawName === "string" && /^\d+$/.test(rawName) ? transformedTools[Number(rawName)]?.name ?? rawName : rawName;
814
- const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
815
- const extractedArgs = c.args ?? c.arguments ?? c.input ?? c.params ?? c.parameters ?? void 0;
816
- let parsedArgs = extractedArgs;
817
- if (typeof parsedArgs === "string") {
818
- try {
819
- parsedArgs = JSON.parse(parsedArgs);
820
- } catch {
821
- }
822
- }
823
- return {
824
- ...c,
825
- toolName: originalName,
826
- name: originalName,
827
- args: parsedArgs ?? {}
828
- };
950
+ };
951
+ const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
952
+ model: modelInstance,
953
+ messages: flatMessages,
954
+ tools: toolsMap,
955
+ toolChoice: "auto",
956
+ providerOptions,
957
+ ...temperature !== void 0 ? { temperature } : {},
958
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
959
+ });
960
+ return { toolCalls, text, finishReason, debugSummaryRef };
961
+ };
962
+ const processValidationResult = (options) => {
963
+ const {
964
+ checkerResult,
965
+ testCase,
966
+ tools,
967
+ possibleAnswer,
968
+ restoredCalls,
969
+ flatMessages,
970
+ mwOriginalText,
971
+ text,
972
+ finishReason,
973
+ mwParsedToolCalls,
974
+ caseLogs
975
+ } = options;
976
+ if (checkerResult.valid) {
977
+ caseLogs.push(`[PASS] ${testCase.id}`);
978
+ return { valid: true, logs: caseLogs };
979
+ }
980
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
981
+ logFailureDetails({
982
+ testCase,
983
+ tools,
984
+ possibleAnswer,
985
+ restoredCalls,
986
+ checkerResult,
987
+ flatMessages,
988
+ mwOriginalText,
989
+ text,
990
+ finishReason,
991
+ mwParsedToolCalls,
992
+ caseLogs
993
+ });
994
+ return { valid: false, logs: caseLogs };
995
+ };
996
+ const prepareTestCaseData = (testCase) => {
997
+ const { function: tools, question: messages } = testCase;
998
+ const flatMessages = flattenMessages(messages);
999
+ const { transformedTools, nameMap } = buildTransformedTools(
1000
+ tools,
1001
+ fixSchema
1002
+ );
1003
+ const toolsMap = buildToolsMap(transformedTools);
1004
+ return { flatMessages, transformedTools, nameMap, toolsMap };
1005
+ };
1006
+ const processModelResponse = (options) => {
1007
+ const {
1008
+ testCase,
1009
+ toolCalls,
1010
+ text,
1011
+ finishReason,
1012
+ debugSummaryRef,
1013
+ nameMap,
1014
+ transformedTools,
1015
+ flatMessages,
1016
+ tools,
1017
+ caseLogs
1018
+ } = options;
1019
+ const mwOriginalText = debugSummaryRef.originalText;
1020
+ const mwParsedToolCalls = parseDebugToolCalls(
1021
+ debugSummaryRef.toolCalls
1022
+ );
1023
+ logRawToolCalls({
1024
+ toolCalls,
1025
+ finishReason,
1026
+ text,
1027
+ testCaseId: testCase.id,
1028
+ caseLogs
1029
+ });
1030
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1031
+ if (!possibleAnswer) {
1032
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1033
+ }
1034
+ const restoredCalls = restoreToolCalls(
1035
+ toolCalls || [],
1036
+ nameMap,
1037
+ transformedTools
1038
+ );
1039
+ const checkerResult = check(testCase, restoredCalls, possibleAnswer);
1040
+ return processValidationResult({
1041
+ checkerResult,
1042
+ testCase,
1043
+ tools,
1044
+ possibleAnswer,
1045
+ restoredCalls,
1046
+ flatMessages,
1047
+ mwOriginalText,
1048
+ text,
1049
+ finishReason,
1050
+ mwParsedToolCalls,
1051
+ caseLogs
1052
+ });
1053
+ };
1054
+ const runSingleCase = async (testCase) => {
1055
+ const caseLogs = [];
1056
+ const { function: tools } = testCase;
1057
+ const temp = config?.temperature;
1058
+ const temperature = typeof temp === "number" ? temp : void 0;
1059
+ const maxTok = config?.maxTokens;
1060
+ const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
1061
+ try {
1062
+ const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
1063
+ logFirstToolDebug(transformedTools, testCase.id, caseLogs);
1064
+ const { toolCalls, text, finishReason, debugSummaryRef } = await executeModelGeneration({
1065
+ model,
1066
+ flatMessages,
1067
+ toolsMap,
1068
+ temperature,
1069
+ maxTokens
829
1070
  });
830
- const checkerResult = check(
1071
+ return processModelResponse({
831
1072
  testCase,
832
- restoredCalls,
833
- possibleAnswer
834
- );
835
- if (checkerResult.valid) {
836
- caseLogs.push(`[PASS] ${testCase.id}`);
837
- return { valid: true, logs: caseLogs };
838
- } else {
839
- caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
840
- try {
841
- const category = testCase.id.split("_")[0];
842
- const diff = [];
843
- const summarizeArgs = (args) => {
844
- if (args == null) return args;
845
- if (typeof args !== "object") return args;
846
- return Object.keys(args).sort().reduce(
847
- (acc, k) => {
848
- acc[k] = args[k];
849
- return acc;
850
- },
851
- {}
852
- );
853
- };
854
- const expected = {};
855
- const actual = {};
856
- if (category === "simple") {
857
- const funcDesc = tools[0];
858
- const gt = possibleAnswer.ground_truth?.[0];
859
- const expectedFuncName = funcDesc?.name;
860
- const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
861
- const received = restoredCalls[0];
862
- const receivedName = received?.toolName ?? received?.name;
863
- const receivedArgs = summarizeArgs(received?.args);
864
- expected.function = expectedFuncName;
865
- expected.params = expectedParams;
866
- actual.function = receivedName;
867
- actual.args = receivedArgs;
868
- if (expectedFuncName !== receivedName) {
869
- diff.push(`@@ function name`);
870
- diff.push(`- ${expectedFuncName}`);
871
- diff.push(`+ ${receivedName}`);
872
- }
873
- if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
874
- const required = funcDesc?.parameters?.required ?? [];
875
- for (const req of required) {
876
- if (!(req in receivedArgs)) {
877
- diff.push(`- missing required param: ${req}`);
878
- }
879
- }
880
- for (const k of Object.keys(
881
- receivedArgs
882
- )) {
883
- if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
884
- diff.push(`+ unexpected param: ${k}`);
885
- }
886
- }
887
- for (const k of Object.keys(
888
- receivedArgs
889
- )) {
890
- if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
891
- const allowed = expectedParams[k];
892
- const got = receivedArgs[k];
893
- const includes = Array.isArray(allowed) && allowed.some((v) => {
894
- try {
895
- if (Array.isArray(got)) {
896
- return JSON.stringify(
897
- got.map((x) => String(x)).sort()
898
- ) === JSON.stringify(
899
- v.map((x) => String(x)).sort()
900
- );
901
- }
902
- } catch {
903
- }
904
- return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
905
- });
906
- if (!includes) {
907
- diff.push(`@@ param ${k}`);
908
- diff.push(
909
- `- expected one of: ${JSON.stringify(allowed)}`
910
- );
911
- diff.push(`+ got: ${JSON.stringify(got)}`);
912
- }
913
- }
914
- }
915
- }
916
- } else {
917
- const gtArr = possibleAnswer.ground_truth ?? [];
918
- const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
919
- const actualNames = restoredCalls.map(
920
- (c) => c.toolName ?? c.name
921
- );
922
- expected.functions = expectedNames;
923
- actual.functions = actualNames;
924
- if (expectedNames.length !== actualNames.length) {
925
- diff.push(`@@ call count`);
926
- diff.push(`- expected ${expectedNames.length}`);
927
- diff.push(`+ got ${actualNames.length}`);
928
- }
929
- const missing = expectedNames.filter(
930
- (n) => !actualNames.includes(n)
931
- );
932
- const extra = actualNames.filter(
933
- (n) => !expectedNames.includes(n)
934
- );
935
- for (const m of missing)
936
- diff.push(`- missing function: ${m}`);
937
- for (const e of extra)
938
- diff.push(`+ unexpected function: ${e}`);
939
- const usedActual = /* @__PURE__ */ new Set();
940
- for (const expectedObj of gtArr) {
941
- const fname = Object.keys(expectedObj)[0];
942
- let matchedIndex = -1;
943
- for (let i = 0; i < restoredCalls.length; i++) {
944
- if (usedActual.has(i)) continue;
945
- const rc = restoredCalls[i];
946
- const rcName = rc?.toolName ?? rc?.name;
947
- if (rcName === fname) {
948
- matchedIndex = i;
949
- break;
950
- }
951
- }
952
- if (matchedIndex === -1) continue;
953
- usedActual.add(matchedIndex);
954
- const received = restoredCalls[matchedIndex];
955
- const receivedArgs = summarizeArgs(received?.args);
956
- const expectedParamsAllowed = expectedObj[fname];
957
- const funcDesc = tools.find(
958
- (t) => t.name === fname
959
- );
960
- const requiredParams = funcDesc?.parameters?.required ?? [];
961
- diff.push(`@@ function ${fname}`);
962
- if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
963
- for (const req of requiredParams) {
964
- if (!(req in receivedArgs)) {
965
- diff.push(`- missing required param: ${req}`);
966
- }
967
- }
968
- for (const k of Object.keys(
969
- receivedArgs
970
- )) {
971
- if (!Object.prototype.hasOwnProperty.call(
972
- expectedParamsAllowed,
973
- k
974
- )) {
975
- diff.push(`+ unexpected param: ${k}`);
976
- }
977
- }
978
- for (const k of Object.keys(
979
- receivedArgs
980
- )) {
981
- if (Object.prototype.hasOwnProperty.call(
982
- expectedParamsAllowed,
983
- k
984
- )) {
985
- const allowed = expectedParamsAllowed[k];
986
- const got = receivedArgs[k];
987
- const includes = Array.isArray(allowed) && allowed.some((v) => {
988
- try {
989
- if (Array.isArray(got)) {
990
- return JSON.stringify(
991
- got.map((x) => String(x)).sort()
992
- ) === JSON.stringify(
993
- v.map((x) => String(x)).sort()
994
- );
995
- }
996
- } catch {
997
- }
998
- return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
999
- });
1000
- if (!includes) {
1001
- diff.push(`@@ param ${k}`);
1002
- diff.push(
1003
- `- expected one of: ${JSON.stringify(allowed)}`
1004
- );
1005
- diff.push(`+ got: ${JSON.stringify(got)}`);
1006
- }
1007
- }
1008
- }
1009
- }
1010
- }
1011
- }
1012
- caseLogs.push(
1013
- `[DEBUG-FAIL] ${JSON.stringify({
1014
- id: testCase.id,
1015
- message: checkerResult.error,
1016
- error_type: checkerResult.error_type,
1017
- expected,
1018
- actual,
1019
- diff
1020
- })}`
1021
- );
1022
- } catch {
1023
- caseLogs.push(
1024
- `[DEBUG] ${testCase.id}: failed to build debug diff`
1025
- );
1026
- }
1027
- return { valid: false, logs: caseLogs };
1028
- }
1073
+ toolCalls,
1074
+ text,
1075
+ finishReason,
1076
+ debugSummaryRef,
1077
+ nameMap,
1078
+ transformedTools,
1079
+ flatMessages,
1080
+ tools,
1081
+ caseLogs
1082
+ });
1029
1083
  } catch (e) {
1030
1084
  caseLogs.push(
1031
1085
  `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
@@ -1036,13 +1090,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1036
1090
  return { valid: false, logs: caseLogs };
1037
1091
  }
1038
1092
  };
1039
- const mapWithConcurrency = async (items, limit2, mapper) => {
1093
+ const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
1040
1094
  const results = new Array(items.length);
1041
1095
  let idx = 0;
1042
- const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => {
1096
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1043
1097
  while (true) {
1044
- const current = idx++;
1045
- if (current >= items.length) break;
1098
+ const current = idx;
1099
+ idx += 1;
1100
+ if (current >= items.length) {
1101
+ break;
1102
+ }
1046
1103
  results[current] = await mapper(items[current], current);
1047
1104
  }
1048
1105
  });
@@ -1058,7 +1115,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1058
1115
  (acc, r) => acc + (r.valid ? 1 : 0),
1059
1116
  0
1060
1117
  );
1061
- for (const r of resultsPerCase) logs.push(...r.logs);
1118
+ for (const r of resultsPerCase) {
1119
+ logs.push(...r.logs);
1120
+ }
1062
1121
  if (testCases.length === 0) {
1063
1122
  return {
1064
1123
  score: 0,
@@ -1085,7 +1144,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1085
1144
  success: false,
1086
1145
  metrics: {},
1087
1146
  error: e,
1088
- logs: [`[FATAL] Failed to run benchmark ${name}: ${e.message}`]
1147
+ logs: [
1148
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1149
+ ]
1089
1150
  };
1090
1151
  }
1091
1152
  }
@@ -1094,204 +1155,347 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1094
1155
  var bfclSimpleBenchmark = createBfclBenchmark(
1095
1156
  "bfcl-simple",
1096
1157
  "BFCL Simple Function Calling",
1097
- "BFCL_v3_simple.json",
1098
- "BFCL_v3_simple_possible_answer.json"
1158
+ "BFCL_v3_simple.jsonl",
1159
+ "BFCL_v3_simple_possible_answer.jsonl"
1099
1160
  );
1100
1161
  var bfclParallelBenchmark = createBfclBenchmark(
1101
1162
  "bfcl-parallel",
1102
1163
  "BFCL Parallel Function Calling",
1103
- "BFCL_v3_parallel.json",
1104
- "BFCL_v3_parallel_possible_answer.json"
1164
+ "BFCL_v3_parallel.jsonl",
1165
+ "BFCL_v3_parallel_possible_answer.jsonl"
1105
1166
  );
1106
1167
  var bfclMultipleBenchmark = createBfclBenchmark(
1107
1168
  "bfcl-multiple",
1108
1169
  "BFCL Multiple Function Calling",
1109
- "BFCL_v3_multiple.json",
1110
- "BFCL_v3_multiple_possible_answer.json"
1170
+ "BFCL_v3_multiple.jsonl",
1171
+ "BFCL_v3_multiple_possible_answer.jsonl"
1111
1172
  );
1112
1173
  var bfclParallelMultipleBenchmark = createBfclBenchmark(
1113
1174
  "bfcl-parallel-multiple",
1114
1175
  "BFCL Parallel & Multiple Function Calling",
1115
- "BFCL_v3_parallel_multiple.json",
1116
- "BFCL_v3_parallel_multiple_possible_answer.json"
1176
+ "BFCL_v3_parallel_multiple.jsonl",
1177
+ "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1117
1178
  );
1118
1179
 
1119
1180
  // src/benchmarks/json-generation.ts
1181
+ var import_node_fs3 = require("fs");
1182
+ var import_node_path3 = __toESM(require("path"), 1);
1120
1183
  var import_ai2 = require("ai");
1121
1184
  var import_ajv = __toESM(require("ajv"), 1);
1122
- var import_fs3 = require("fs");
1123
- var import_path3 = __toESM(require("path"), 1);
1124
- function extractFirstJsonBlock(text) {
1185
+ var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
1186
+ var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
1187
+ var NEWLINE_REGEX = /\r?\n/;
1188
+ var LINE_SPLIT_REGEX2 = /\r?\n/;
1189
+ function tryDirectParse(text) {
1125
1190
  try {
1126
1191
  return JSON.parse(text);
1127
1192
  } catch {
1193
+ return;
1128
1194
  }
1129
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
1130
- if (fenceMatch) {
1131
- const inner = fenceMatch[1].trim();
1132
- try {
1133
- return JSON.parse(inner);
1134
- } catch {
1135
- }
1195
+ }
1196
+ function tryCodeFenceParse(text) {
1197
+ const fenceMatch = text.match(JSON_FENCE_REGEX) || text.match(CODE_FENCE_REGEX);
1198
+ if (!fenceMatch) {
1199
+ return;
1200
+ }
1201
+ const inner = fenceMatch[1].trim();
1202
+ try {
1203
+ return JSON.parse(inner);
1204
+ } catch {
1205
+ return;
1136
1206
  }
1207
+ }
1208
+ function tryBracketScan(text) {
1137
1209
  const startIdxObj = text.indexOf("{");
1138
1210
  const startIdxArr = text.indexOf("[");
1139
1211
  const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
1140
- if (start === void 0) return void 0;
1212
+ if (start === void 0) {
1213
+ return;
1214
+ }
1141
1215
  const open = text[start] === "{" ? "{" : "[";
1142
1216
  const close = open === "{" ? "}" : "]";
1143
1217
  let depth = 0;
1144
- for (let i = start; i < text.length; i++) {
1218
+ for (let i = start; i < text.length; i += 1) {
1145
1219
  const ch = text[i];
1146
- if (ch === open) depth++;
1147
- else if (ch === close) depth--;
1220
+ if (ch === open) {
1221
+ depth += 1;
1222
+ } else if (ch === close) {
1223
+ depth -= 1;
1224
+ }
1148
1225
  if (depth === 0) {
1149
1226
  const candidate = text.slice(start, i + 1);
1150
1227
  try {
1151
1228
  return JSON.parse(candidate);
1152
1229
  } catch {
1230
+ return;
1153
1231
  }
1154
- break;
1155
1232
  }
1156
1233
  }
1157
- return void 0;
1234
+ return;
1235
+ }
1236
+ function extractFirstJsonBlock(text) {
1237
+ const directResult = tryDirectParse(text);
1238
+ if (directResult !== void 0) {
1239
+ return directResult;
1240
+ }
1241
+ const fenceResult = tryCodeFenceParse(text);
1242
+ if (fenceResult !== void 0) {
1243
+ return fenceResult;
1244
+ }
1245
+ return tryBracketScan(text);
1158
1246
  }
1159
1247
  function subsetMatch(expected, actual) {
1160
1248
  if (expected === null || typeof expected !== "object") {
1161
1249
  return expected === actual;
1162
1250
  }
1163
1251
  if (Array.isArray(expected)) {
1164
- if (!Array.isArray(actual)) return false;
1165
- for (let i = 0; i < expected.length; i++) {
1166
- if (!subsetMatch(expected[i], actual[i])) return false;
1252
+ if (!Array.isArray(actual)) {
1253
+ return false;
1254
+ }
1255
+ for (let i = 0; i < expected.length; i += 1) {
1256
+ if (!subsetMatch(expected[i], actual[i])) {
1257
+ return false;
1258
+ }
1167
1259
  }
1168
1260
  return true;
1169
1261
  }
1170
- if (actual === null || typeof actual !== "object") return false;
1262
+ if (actual === null || typeof actual !== "object") {
1263
+ return false;
1264
+ }
1171
1265
  const eObj = expected;
1172
1266
  const aObj = actual;
1173
1267
  for (const key of Object.keys(eObj)) {
1174
- if (!subsetMatch(eObj[key], aObj[key])) return false;
1268
+ if (!subsetMatch(eObj[key], aObj[key])) {
1269
+ return false;
1270
+ }
1175
1271
  }
1176
1272
  return true;
1177
1273
  }
1178
- var jsonGenerationBenchmark = {
1179
- name: "json-generation",
1180
- version: "2.1.0",
1181
- description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
1182
- async run(model, config) {
1183
- const logs = [];
1184
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
1185
- let schemaValidCount = 0;
1186
- let valueMatchCount = 0;
1187
- let correctCount = 0;
1188
- let tests = [];
1274
+ async function loadDatasets() {
1275
+ try {
1276
+ const dataDir = resolveDataDir();
1277
+ const testsJsonl = await import_node_fs3.promises.readFile(
1278
+ import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1279
+ "utf-8"
1280
+ );
1281
+ const expectedJsonl = await import_node_fs3.promises.readFile(
1282
+ import_node_path3.default.join(dataDir, "json_generation_expected.jsonl"),
1283
+ "utf-8"
1284
+ );
1285
+ const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1286
+ const expecteds = expectedJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1189
1287
  const expectedMap = /* @__PURE__ */ new Map();
1190
- try {
1191
- const dataDir = resolveDataDir();
1192
- const testsJsonl = await import_fs3.promises.readFile(
1193
- import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1194
- "utf-8"
1195
- );
1196
- const expectedJsonl = await import_fs3.promises.readFile(
1197
- import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
1198
- "utf-8"
1199
- );
1200
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1201
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1202
- for (const r of expecteds) expectedMap.set(r.id, r);
1203
- } catch (e) {
1204
- const msg = e instanceof Error ? e.message : String(e);
1288
+ for (const r of expecteds) {
1289
+ expectedMap.set(r.id, r);
1290
+ }
1291
+ return { tests, expectedMap };
1292
+ } catch (e) {
1293
+ return {
1294
+ tests: [],
1295
+ expectedMap: /* @__PURE__ */ new Map(),
1296
+ error: e
1297
+ };
1298
+ }
1299
+ }
1300
+ function buildMessages(tc) {
1301
+ const schemaStr = JSON.stringify(tc.schema, null, 2);
1302
+ return [
1303
+ {
1304
+ role: "system",
1305
+ content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1306
+ },
1307
+ {
1308
+ role: "user",
1309
+ content: [
1310
+ "Generate a JSON object that reflects the following facts.",
1311
+ "JSON Schema:",
1312
+ schemaStr,
1313
+ "Facts:",
1314
+ tc.promptFacts,
1315
+ "Output must be a single JSON only, with no additional text."
1316
+ ].join("\n\n")
1317
+ }
1318
+ ];
1319
+ }
1320
+ function validateTestCase(tc, parsed, context) {
1321
+ const validate = context.ajv.compile(tc.schema);
1322
+ const valid = validate(parsed);
1323
+ if (!valid) {
1324
+ context.logs.push(
1325
+ `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1326
+ );
1327
+ }
1328
+ const expectedRec = context.expectedMap.get(tc.id);
1329
+ if (!expectedRec) {
1330
+ context.logs.push(
1331
+ `[WARN] ${tc.id}: No expected record found. Skipping value match.`
1332
+ );
1333
+ }
1334
+ const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
1335
+ return { valid, valuesOk, parsed };
1336
+ }
1337
+ async function processTestCase(tc, context) {
1338
+ const messages = buildMessages(tc);
1339
+ const temp = context.config?.temperature;
1340
+ const temperature = typeof temp === "number" ? temp : void 0;
1341
+ const { text } = await (0, import_ai2.generateText)({
1342
+ model: context.model,
1343
+ messages,
1344
+ ...temperature !== void 0 ? { temperature } : {}
1345
+ });
1346
+ let parsed;
1347
+ try {
1348
+ parsed = extractFirstJsonBlock(text);
1349
+ } catch {
1350
+ }
1351
+ if (parsed === void 0) {
1352
+ context.validation.logs.push(
1353
+ `[FAIL] ${tc.id}: Unable to parse JSON from model output.`
1354
+ );
1355
+ return { schemaValid: false, valueMatch: false, correct: false };
1356
+ }
1357
+ const {
1358
+ valid,
1359
+ valuesOk,
1360
+ parsed: validatedParsed
1361
+ } = validateTestCase(tc, parsed, context.validation);
1362
+ const correct = valid && valuesOk;
1363
+ if (correct) {
1364
+ context.validation.logs.push(`[PASS] ${tc.id}`);
1365
+ } else {
1366
+ context.validation.logs.push(
1367
+ `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
1368
+ validatedParsed
1369
+ )}`
1370
+ );
1371
+ }
1372
+ return { schemaValid: valid, valueMatch: valuesOk, correct };
1373
+ }
1374
+ var jsonGenerationBenchmark = {
1375
+ name: "json-generation",
1376
+ version: "2.1.0",
1377
+ description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
1378
+ async run(model, config) {
1379
+ const logs = [];
1380
+ const ajv = new import_ajv.default({ allErrors: true, strict: false });
1381
+ const { tests, expectedMap, error } = await loadDatasets();
1382
+ if (error) {
1205
1383
  return {
1206
1384
  score: 0,
1207
1385
  success: false,
1208
1386
  metrics: {},
1209
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
1210
- error: e
1387
+ logs: [
1388
+ `[FATAL] Failed to load json-generation datasets: ${error.message}`
1389
+ ],
1390
+ error
1211
1391
  };
1212
1392
  }
1213
- for (const tc of tests) {
1214
- try {
1215
- const schemaStr = JSON.stringify(tc.schema, null, 2);
1216
- const messages = [
1217
- {
1218
- role: "system",
1219
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1220
- },
1221
- {
1222
- role: "user",
1223
- content: [
1224
- "Generate a JSON object that reflects the following facts.",
1225
- "JSON Schema:",
1226
- schemaStr,
1227
- "Facts:",
1228
- tc.promptFacts,
1229
- "Output must be a single JSON only, with no additional text."
1230
- ].join("\n\n")
1231
- }
1232
- ];
1233
- const temp = config?.temperature;
1234
- const temperature = typeof temp === "number" ? temp : void 0;
1235
- const { text } = await (0, import_ai2.generateText)({
1236
- model,
1237
- messages,
1238
- ...temperature !== void 0 ? { temperature } : {}
1239
- });
1240
- let parsed;
1241
- try {
1242
- parsed = extractFirstJsonBlock(text);
1243
- } catch {
1244
- }
1245
- if (parsed === void 0) {
1246
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
1247
- continue;
1248
- }
1249
- const validate = ajv.compile(tc.schema);
1250
- const valid = validate(parsed);
1251
- if (valid) schemaValidCount++;
1252
- else
1253
- logs.push(
1254
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1255
- );
1256
- const expectedRec = expectedMap.get(tc.id);
1257
- if (!expectedRec) {
1258
- logs.push(
1259
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
1260
- );
1261
- }
1262
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
1263
- if (valuesOk) valueMatchCount++;
1264
- if (valid && valuesOk) {
1265
- correctCount++;
1266
- logs.push(`[PASS] ${tc.id}`);
1267
- } else {
1268
- logs.push(
1269
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
1270
- parsed
1271
- )}`
1272
- );
1273
- }
1274
- } catch (e) {
1275
- const msg = e instanceof Error ? e.message : String(e);
1276
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
1277
- }
1278
- }
1279
- const total = tests.length;
1280
- const score = correctCount / total;
1281
- return {
1282
- score,
1283
- success: score >= 0.8,
1284
- metrics: {
1285
- total_cases: total,
1286
- correct_count: correctCount,
1287
- schema_valid_count: schemaValidCount,
1288
- value_match_count: valueMatchCount,
1289
- accuracy: score
1290
- },
1291
- logs
1393
+ const context = {
1394
+ model,
1395
+ config,
1396
+ validation: { expectedMap, ajv, logs }
1292
1397
  };
1398
+ const counts = await processAllTests(tests, context);
1399
+ return buildBenchmarkResult(tests.length, counts, logs);
1293
1400
  }
1294
1401
  };
1402
+ async function processAllTests(tests, context) {
1403
+ let schemaValidCount = 0;
1404
+ let valueMatchCount = 0;
1405
+ let correctCount = 0;
1406
+ for (const tc of tests) {
1407
+ try {
1408
+ const result = await processTestCase(tc, context);
1409
+ if (result.schemaValid) {
1410
+ schemaValidCount += 1;
1411
+ }
1412
+ if (result.valueMatch) {
1413
+ valueMatchCount += 1;
1414
+ }
1415
+ if (result.correct) {
1416
+ correctCount += 1;
1417
+ }
1418
+ } catch (e) {
1419
+ const msg = e instanceof Error ? e.message : String(e);
1420
+ context.validation.logs.push(`[ERROR] ${tc.id}: ${msg}`);
1421
+ }
1422
+ }
1423
+ return { schemaValidCount, valueMatchCount, correctCount };
1424
+ }
1425
+ function buildBenchmarkResult(total, counts, logs) {
1426
+ const score = counts.correctCount / total;
1427
+ return {
1428
+ score,
1429
+ success: score >= 0.8,
1430
+ metrics: {
1431
+ total_cases: total,
1432
+ correct_count: counts.correctCount,
1433
+ schema_valid_count: counts.schemaValidCount,
1434
+ value_match_count: counts.valueMatchCount,
1435
+ accuracy: score
1436
+ },
1437
+ logs
1438
+ };
1439
+ }
1440
+ async function loadSchemaOnlyTests() {
1441
+ try {
1442
+ const dataDir = resolveDataDir();
1443
+ const testsJsonl = await import_node_fs3.promises.readFile(
1444
+ import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1445
+ "utf-8"
1446
+ );
1447
+ const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1448
+ return { tests };
1449
+ } catch (e) {
1450
+ return { tests: [], error: e };
1451
+ }
1452
+ }
1453
+ async function processSchemaOnlyTestCase(tc, context) {
1454
+ const messages = buildMessages(tc);
1455
+ const temp = context.config?.temperature;
1456
+ const temperature = typeof temp === "number" ? temp : void 0;
1457
+ const { text } = await (0, import_ai2.generateText)({
1458
+ model: context.model,
1459
+ messages,
1460
+ ...temperature !== void 0 ? { temperature } : {}
1461
+ });
1462
+ let parsed;
1463
+ try {
1464
+ parsed = extractFirstJsonBlock(text);
1465
+ } catch {
1466
+ }
1467
+ if (parsed === void 0) {
1468
+ context.logs.push(
1469
+ `[FAIL] ${tc.id}: Could not parse JSON from model output.`
1470
+ );
1471
+ return false;
1472
+ }
1473
+ const validate = context.ajv.compile(tc.schema);
1474
+ const valid = validate(parsed);
1475
+ if (valid) {
1476
+ context.logs.push(`[PASS] ${tc.id}`);
1477
+ return true;
1478
+ }
1479
+ context.logs.push(
1480
+ `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1481
+ );
1482
+ return false;
1483
+ }
1484
+ async function runSchemaOnlyTests(tests, context) {
1485
+ let schemaValidCount = 0;
1486
+ for (const tc of tests) {
1487
+ try {
1488
+ const isValid = await processSchemaOnlyTestCase(tc, context);
1489
+ if (isValid) {
1490
+ schemaValidCount += 1;
1491
+ }
1492
+ } catch (e) {
1493
+ const msg = e instanceof Error ? e.message : String(e);
1494
+ context.logs.push(`[ERROR] ${tc.id}: ${msg}`);
1495
+ }
1496
+ }
1497
+ return schemaValidCount;
1498
+ }
1295
1499
  var jsonGenerationSchemaOnlyBenchmark = {
1296
1500
  name: "json-generation-schema-only",
1297
1501
  version: "1.0.1",
@@ -1299,76 +1503,19 @@ var jsonGenerationSchemaOnlyBenchmark = {
1299
1503
  async run(model, config) {
1300
1504
  const logs = [];
1301
1505
  const ajv = new import_ajv.default({ allErrors: true, strict: false });
1302
- let tests = [];
1303
- try {
1304
- const dataDir = resolveDataDir();
1305
- const testsJsonl = await import_fs3.promises.readFile(
1306
- import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1307
- "utf-8"
1308
- );
1309
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1310
- } catch (e) {
1311
- const msg = e instanceof Error ? e.message : String(e);
1506
+ const { tests, error } = await loadSchemaOnlyTests();
1507
+ if (error) {
1508
+ const msg = error.message;
1312
1509
  return {
1313
1510
  score: 0,
1314
1511
  success: false,
1315
1512
  metrics: {},
1316
1513
  logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
1317
- error: e
1514
+ error
1318
1515
  };
1319
1516
  }
1320
- let schemaValidCount = 0;
1321
- for (const tc of tests) {
1322
- try {
1323
- const schemaStr = JSON.stringify(tc.schema, null, 2);
1324
- const messages = [
1325
- {
1326
- role: "system",
1327
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1328
- },
1329
- {
1330
- role: "user",
1331
- content: [
1332
- "Generate a JSON object that reflects the following facts.",
1333
- "JSON Schema:",
1334
- schemaStr,
1335
- "Facts:",
1336
- tc.promptFacts,
1337
- "Output must be a single JSON only, with no additional text."
1338
- ].join("\n\n")
1339
- }
1340
- ];
1341
- const temp = config?.temperature;
1342
- const temperature = typeof temp === "number" ? temp : void 0;
1343
- const { text } = await (0, import_ai2.generateText)({
1344
- model,
1345
- messages,
1346
- ...temperature !== void 0 ? { temperature } : {}
1347
- });
1348
- let parsed;
1349
- try {
1350
- parsed = extractFirstJsonBlock(text);
1351
- } catch {
1352
- }
1353
- if (parsed === void 0) {
1354
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
1355
- continue;
1356
- }
1357
- const validate = ajv.compile(tc.schema);
1358
- const valid = validate(parsed);
1359
- if (valid) {
1360
- schemaValidCount++;
1361
- logs.push(`[PASS] ${tc.id}`);
1362
- } else {
1363
- logs.push(
1364
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1365
- );
1366
- }
1367
- } catch (e) {
1368
- const msg = e instanceof Error ? e.message : String(e);
1369
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
1370
- }
1371
- }
1517
+ const context = { model, config, ajv, logs };
1518
+ const schemaValidCount = await runSchemaOnlyTests(tests, context);
1372
1519
  const total = tests.length;
1373
1520
  const score = total > 0 ? schemaValidCount / total : 0;
1374
1521
  return {
@@ -1383,6 +1530,505 @@ var jsonGenerationSchemaOnlyBenchmark = {
1383
1530
  };
1384
1531
  }
1385
1532
  };
1533
+
1534
+ // src/reporters/console.ts
1535
+ var colors = {
1536
+ reset: "\x1B[0m",
1537
+ green: "\x1B[32m",
1538
+ red: "\x1B[31m",
1539
+ yellow: "\x1B[33m",
1540
+ cyan: "\x1B[36m",
1541
+ magenta: "\x1B[35m",
1542
+ gray: "\x1B[90m"
1543
+ };
1544
+ function printResult(result) {
1545
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
1546
+ const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
1547
+ console.log(
1548
+ `
1549
+ ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
1550
+ );
1551
+ console.log(
1552
+ ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
1553
+ );
1554
+ const metrics = Object.entries(benchmarkResult.metrics);
1555
+ if (metrics.length > 0) {
1556
+ console.log(" Metrics:");
1557
+ for (const [key, value] of metrics) {
1558
+ console.log(` - ${key}: ${value}`);
1559
+ }
1560
+ }
1561
+ if (benchmarkResult.error) {
1562
+ console.log(
1563
+ ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
1564
+ );
1565
+ }
1566
+ }
1567
+ function consoleReporter(results) {
1568
+ console.log("\n--- \u{1F4CA} Evaluation Report ---");
1569
+ for (const result of results) {
1570
+ printResult(result);
1571
+ }
1572
+ console.log("\n---------------------------\n");
1573
+ }
1574
+
1575
+ // src/reporters/console.debug.ts
1576
+ var FAIL_ID_REGEX = /^\[FAIL\]\s+([^:]+):/;
1577
+ var DEBUG_FAIL_PREFIX_REGEX = /^\[DEBUG-FAIL\] /;
1578
+ var DEBUG_FAIL_CONTEXT_PREFIX_REGEX = /^\[DEBUG-FAIL-CONTEXT\] /;
1579
+ var colors2 = {
1580
+ reset: "\x1B[0m",
1581
+ green: "\x1B[32m",
1582
+ red: "\x1B[31m",
1583
+ yellow: "\x1B[33m",
1584
+ cyan: "\x1B[36m",
1585
+ magenta: "\x1B[35m",
1586
+ gray: "\x1B[90m",
1587
+ bold: "\x1B[1m",
1588
+ underline: "\x1B[4m"
1589
+ };
1590
+ function colorizeDiffLine(line) {
1591
+ if (line.startsWith("+")) {
1592
+ return `${colors2.green}${line}${colors2.reset}`;
1593
+ }
1594
+ if (line.startsWith("-")) {
1595
+ return `${colors2.red}${line}${colors2.reset}`;
1596
+ }
1597
+ if (line.startsWith("@")) {
1598
+ return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
1599
+ }
1600
+ return line;
1601
+ }
1602
+ function uniqueLines(lines) {
1603
+ const seen = /* @__PURE__ */ new Set();
1604
+ const out = [];
1605
+ for (const l of lines) {
1606
+ if (seen.has(l)) {
1607
+ continue;
1608
+ }
1609
+ seen.add(l);
1610
+ out.push(l);
1611
+ }
1612
+ return out;
1613
+ }
1614
+ function hasFunctionNameIssue(diff) {
1615
+ return diff.some(
1616
+ (d) => String(d).includes("function name") || String(d).includes("missing function:")
1617
+ );
1618
+ }
1619
+ function suggestFunctionNameFix(expected, actual, suggestions) {
1620
+ const expectedName = expected?.function;
1621
+ const actualName = actual?.function;
1622
+ if (expectedName && actualName && expectedName !== actualName) {
1623
+ suggestions.push(
1624
+ `Call the function '${expectedName}' instead of '${actualName}'.`
1625
+ );
1626
+ }
1627
+ if (Array.isArray(expected?.functions)) {
1628
+ suggestions.push(
1629
+ `Ensure tool calls include: ${expected.functions.join(", ")}.`
1630
+ );
1631
+ }
1632
+ }
1633
+ function suggestMissingParamFix(diff, suggestions) {
1634
+ const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
1635
+ if (missing.length) {
1636
+ suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
1637
+ }
1638
+ }
1639
+ function suggestUnexpectedParamFix(diff, suggestions) {
1640
+ const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
1641
+ if (extras.length) {
1642
+ suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
1643
+ }
1644
+ }
1645
+ function suggestParamValueFix(diff, suggestions) {
1646
+ const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
1647
+ for (const param of targets) {
1648
+ const allowedOneOfLine = diff.find(
1649
+ (d) => String(d).startsWith("- expected one of:")
1650
+ );
1651
+ const allowedSingleLine = diff.find(
1652
+ (d) => String(d).startsWith("- expected:")
1653
+ );
1654
+ if (allowedSingleLine) {
1655
+ const value = allowedSingleLine.replace("- expected: ", "");
1656
+ suggestions.push(`Set '${param}' to: ${value}.`);
1657
+ } else if (allowedOneOfLine) {
1658
+ const allowed = allowedOneOfLine.replace("- expected one of: ", "");
1659
+ suggestions.push(`Set '${param}' to one of: ${allowed}.`);
1660
+ } else {
1661
+ suggestions.push(`Adjust '${param}' to an allowed value.`);
1662
+ }
1663
+ }
1664
+ }
1665
+ function suggestFromErrorType(error_type, suggestions) {
1666
+ if (error_type.includes("missing_required")) {
1667
+ suggestions.push("Add all required parameters defined by the tool schema.");
1668
+ } else if (error_type.includes("unexpected_param")) {
1669
+ suggestions.push("Remove parameters not present in the tool schema.");
1670
+ } else if (error_type.includes("wrong_count")) {
1671
+ suggestions.push(
1672
+ "Adjust the number of tool calls to match expected count."
1673
+ );
1674
+ } else if (error_type.includes("wrong_func_name")) {
1675
+ suggestions.push("Use the exact expected function name from the schema.");
1676
+ } else if (error_type.includes("value_error")) {
1677
+ suggestions.push("Choose a value from the allowed options.");
1678
+ }
1679
+ }
1680
+ function suggestFixFromDiff(parsed) {
1681
+ const suggestions = [];
1682
+ const { error_type, expected, actual, diff } = parsed ?? {};
1683
+ if (!Array.isArray(diff)) {
1684
+ if (suggestions.length === 0 && typeof error_type === "string") {
1685
+ suggestFromErrorType(error_type, suggestions);
1686
+ }
1687
+ return uniqueLines(suggestions);
1688
+ }
1689
+ if (hasFunctionNameIssue(diff)) {
1690
+ suggestFunctionNameFix(expected, actual, suggestions);
1691
+ }
1692
+ if (diff.some((d) => String(d).startsWith("- missing required param:"))) {
1693
+ suggestMissingParamFix(diff, suggestions);
1694
+ }
1695
+ if (diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
1696
+ suggestUnexpectedParamFix(diff, suggestions);
1697
+ }
1698
+ if (diff.some((d) => String(d).startsWith("@@ param "))) {
1699
+ suggestParamValueFix(diff, suggestions);
1700
+ }
1701
+ if (suggestions.length === 0 && typeof error_type === "string") {
1702
+ suggestFromErrorType(error_type, suggestions);
1703
+ }
1704
+ return uniqueLines(suggestions);
1705
+ }
1706
+ function getTestIdFromLogLine(line) {
1707
+ if (line.startsWith("[FAIL]")) {
1708
+ const m = line.match(FAIL_ID_REGEX);
1709
+ return m?.[1];
1710
+ }
1711
+ if (line.startsWith("[DEBUG-FAIL]")) {
1712
+ try {
1713
+ const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1714
+ return String(parsed?.id ?? "");
1715
+ } catch {
1716
+ }
1717
+ }
1718
+ if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
1719
+ try {
1720
+ const parsed = JSON.parse(
1721
+ line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
1722
+ );
1723
+ return String(parsed?.id ?? "");
1724
+ } catch {
1725
+ }
1726
+ }
1727
+ return;
1728
+ }
1729
+ function groupLogsByTestId(failLogs) {
1730
+ const byId = /* @__PURE__ */ new Map();
1731
+ for (const line of failLogs) {
1732
+ const id = getTestIdFromLogLine(line);
1733
+ const key = id ?? "__general__";
1734
+ const arr = byId.get(key) ?? [];
1735
+ arr.push(line);
1736
+ byId.set(key, arr);
1737
+ }
1738
+ return byId;
1739
+ }
1740
+ function collectDebugIds(lines) {
1741
+ const debugIds = /* @__PURE__ */ new Set();
1742
+ for (const l of lines) {
1743
+ if (l.startsWith("[DEBUG-FAIL]")) {
1744
+ try {
1745
+ const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1746
+ if (parsed?.id) {
1747
+ debugIds.add(String(parsed.id));
1748
+ }
1749
+ } catch {
1750
+ }
1751
+ }
1752
+ }
1753
+ return debugIds;
1754
+ }
1755
+ function printIndentedJson(prefix, data, color) {
1756
+ console.log(
1757
+ color + prefix + JSON.stringify(data, null, 2).split("\n").join("\n ") + colors2.reset
1758
+ );
1759
+ }
1760
+ function displayDebugFailLine(line) {
1761
+ const payload = line.replace(DEBUG_FAIL_PREFIX_REGEX, "");
1762
+ try {
1763
+ const parsed = JSON.parse(payload);
1764
+ const { message, diff, expected, actual } = parsed;
1765
+ if (message) {
1766
+ console.log(` ${colors2.bold}${message}${colors2.reset}`);
1767
+ }
1768
+ if (diff && Array.isArray(diff)) {
1769
+ for (const dLine of diff) {
1770
+ console.log(` ${colorizeDiffLine(dLine)}`);
1771
+ }
1772
+ } else {
1773
+ console.log(" expected:");
1774
+ printIndentedJson(" ", expected, colors2.green);
1775
+ console.log(" actual:");
1776
+ printIndentedJson(" ", actual, colors2.red);
1777
+ }
1778
+ const suggestions = suggestFixFromDiff(parsed);
1779
+ if (suggestions.length) {
1780
+ console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
1781
+ for (const s of suggestions) {
1782
+ console.log(` \u2022 ${s}`);
1783
+ }
1784
+ }
1785
+ } catch {
1786
+ console.log(` ${line}`);
1787
+ }
1788
+ }
1789
+ function displayContextInfo(ctx) {
1790
+ if (ctx.tool_schema) {
1791
+ printIndentedJson(" tool schema: ", ctx.tool_schema, colors2.gray);
1792
+ }
1793
+ if (ctx.last_user_query) {
1794
+ console.log(
1795
+ colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
1796
+ );
1797
+ }
1798
+ if (ctx.raw_model_text) {
1799
+ console.log(
1800
+ colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
1801
+ );
1802
+ }
1803
+ if (ctx.parsed_tool_calls) {
1804
+ printIndentedJson(
1805
+ " parsed tool calls: ",
1806
+ ctx.parsed_tool_calls,
1807
+ colors2.gray
1808
+ );
1809
+ }
1810
+ if (ctx.ground_truth) {
1811
+ printIndentedJson(
1812
+ " ground truth: ",
1813
+ ctx.ground_truth,
1814
+ colors2.gray
1815
+ );
1816
+ }
1817
+ if (ctx.finish_reason) {
1818
+ console.log(
1819
+ colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
1820
+ );
1821
+ }
1822
+ }
1823
+ function displayDebugFailContextLine(line) {
1824
+ const payload = line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "");
1825
+ try {
1826
+ const ctx = JSON.parse(payload);
1827
+ console.log(` ${colors2.gray}context:${colors2.reset}`);
1828
+ displayContextInfo(ctx);
1829
+ } catch {
1830
+ console.log(` ${line}`);
1831
+ }
1832
+ }
1833
+ function displayLogLine(line, debugIds) {
1834
+ if (line.startsWith("[FAIL]")) {
1835
+ const m = line.match(FAIL_ID_REGEX);
1836
+ const failId = m?.[1];
1837
+ if (failId && debugIds.has(failId)) {
1838
+ return;
1839
+ }
1840
+ console.log(` ${colors2.red}${line}${colors2.reset}`);
1841
+ } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
1842
+ console.log(` ${colors2.yellow}${line}${colors2.reset}`);
1843
+ } else if (line.startsWith("[STACK]")) {
1844
+ console.log(` ${colors2.gray}${line}${colors2.reset}`);
1845
+ } else if (line.startsWith("[DEBUG-FAIL]")) {
1846
+ displayDebugFailLine(line);
1847
+ } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
1848
+ displayDebugFailContextLine(line);
1849
+ }
1850
+ }
1851
+ function displayGroupedFailures(byId) {
1852
+ console.log(` ${colors2.bold}Failure details (grouped):${colors2.reset}`);
1853
+ for (const [groupId, lines] of byId) {
1854
+ if (groupId !== "__general__") {
1855
+ console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
1856
+ }
1857
+ const debugIds = collectDebugIds(lines);
1858
+ for (const line of lines) {
1859
+ displayLogLine(line, debugIds);
1860
+ }
1861
+ }
1862
+ }
1863
/**
 * Prints, in gray, every log line tagged `[INFO]` or `[PASS]`.
 *
 * @param {string[]} logs - All log lines of a result.
 */
function displaySuccessLogs(logs) {
  const isInformational = (entry) =>
    entry.startsWith("[INFO]") || entry.startsWith("[PASS]");
  logs.filter(isInformational).forEach((entry) => {
    console.log(` ${colors2.gray}${entry}${colors2.reset}`);
  });
}
1871
/**
 * Returns the log lines that describe a failure, i.e. those starting with
 * one of the failure tags listed below. Order is preserved.
 *
 * @param {string[]} logs - All log lines of a result.
 * @returns {string[]} The failure-related subset.
 */
function filterFailureLogs(logs) {
  const failureTags = [
    "[FAIL]",
    "[ERROR]",
    "[FATAL]",
    "[STACK]",
    "[DEBUG-FAIL]",
    "[DEBUG-FAIL-CONTEXT]"
  ];
  return logs.filter((entry) => failureTags.some((tag) => entry.startsWith(tag)));
}
1876
/**
 * Prints the logs of one result.
 *
 * When at least one failure-related line exists, only the failure lines are
 * shown, grouped by test id; otherwise the informational lines are shown.
 *
 * @param {string[]} logs - All log lines of a result.
 */
function displayResultLogs(logs) {
  const failureLines = filterFailureLogs(logs);
  if (failureLines.length === 0) {
    displaySuccessLogs(logs);
    return;
  }
  displayGroupedFailures(groupLogsByTestId(failureLines));
}
1886
/**
 * Prints a "Metrics:" heading followed by one line per metric.
 * Nothing is printed when there are no metrics.
 *
 * @param {Array<[string, unknown]>} metrics - `[name, value]` pairs.
 */
function displayMetrics(metrics) {
  const hasMetrics = metrics.length > 0;
  if (!hasMetrics) {
    return;
  }
  console.log(" Metrics:");
  metrics.forEach(([name, value]) => {
    console.log(` - ${name}: ${value}`);
  });
}
1894
/**
 * Prints the two header lines of a result: the model (with optional model
 * key) and benchmark name, then the success/failure status with the score
 * formatted to two decimals.
 *
 * @param {{ model: string, modelKey?: string, benchmark: string, result: { success: boolean, score: number } }} r
 */
function displayResultHeader(r) {
  const { model, modelKey, benchmark, result } = r;
  let status;
  if (result.success) {
    status = `${colors2.green}\u2714 SUCCESS${colors2.reset}`;
  } else {
    status = `${colors2.red}\u2716 FAILURE${colors2.reset}`;
  }
  const keySuffix = modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : "";
  console.log(
    `\n${colors2.cyan}[${model}]${colors2.reset}${keySuffix} - ${colors2.magenta}${benchmark}${colors2.reset}`
  );
  const formattedScore = result.score.toFixed(2);
  console.log(
    ` \u2514 ${status} | Score: ${colors2.yellow}${formattedScore}${colors2.reset}`
  );
}
1905
/**
 * Debug console reporter: for every result prints the header, the metrics
 * and, when the result carries a non-empty `logs` array, its logs.
 *
 * @param {Array<{ result: { metrics: object, logs?: string[] } }>} results
 */
function consoleDebugReporter(results) {
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const entry of results) {
    displayResultHeader(entry);
    const metricPairs = Object.entries(entry.result.metrics);
    displayMetrics(metricPairs);
    const entryLogs = entry.result.logs;
    if (entryLogs?.length) {
      displayResultLogs(entryLogs);
    }
  }
  console.log("\n------------------------------------\n");
}
1916
+
1917
+ // src/reporters/json.ts
1918
/**
 * JSON reporter: prints all results as pretty-printed JSON (2-space indent).
 * Each result's `error` is replaced by its `message` so it serializes.
 *
 * @param {Array<{ result: { error?: Error } }>} results
 */
function jsonReporter(results) {
  const toSerializable = (entry) => {
    const sanitizedResult = {
      ...entry.result,
      error: entry.result.error?.message
    };
    return { ...entry, result: sanitizedResult };
  };
  const payload = JSON.stringify(results.map(toSerializable), null, 2);
  console.log(payload);
}
1928
+
1929
+ // src/reporters/index.ts
1930
// Registry of the available reporters, keyed by the name accepted in the
// `reporter` option of evaluate().
var reporters = {
  "console": consoleReporter,
  "json": jsonReporter,
  "console.debug": consoleDebugReporter
};
1935
+
1936
+ // src/evaluate.ts
1937
/**
 * Runs one benchmark against one model and wraps the outcome.
 *
 * Progress is logged before and after the run. If the benchmark throws (or
 * rejects), the error is logged and a failed result with score 0 is returned
 * instead of propagating; non-Error throwables are wrapped in an Error.
 *
 * @param {unknown} model - Model passed to `benchmark.run`; its string `modelId` (if any) labels the output.
 * @param {{ name: string, run: Function }} benchmark - Benchmark to execute.
 * @param {string | undefined} modelKey - Optional key identifying the model.
 * @param {object | undefined} config - Forwarded to `benchmark.run`.
 * @returns {Promise<{ model: string, modelKey: string | undefined, benchmark: string, result: object }>}
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  let modelId = "unknown-model";
  const looksLikeModel = typeof model === "object" && model !== null && "modelId" in model;
  if (looksLikeModel && typeof model.modelId === "string") {
    modelId = model.modelId;
  }
  // Shared "[model] (key)" label used by every message below.
  const label = `[${modelId}]${modelKey ? ` (${modelKey})` : ""}`;
  try {
    console.log(`${label} Running benchmark: ${benchmark.name}...`);
    const result = await benchmark.run(model, config);
    console.log(`${label} Finished benchmark: ${benchmark.name}. Score: ${result.score}`);
    return { model: modelId, modelKey, benchmark: benchmark.name, result };
  } catch (error) {
    console.error(`${label} Error running benchmark: ${benchmark.name}`, error);
    const wrappedError = error instanceof Error ? error : new Error(String(error));
    const failedResult = {
      score: 0,
      success: false,
      metrics: {},
      error: wrappedError
    };
    return {
      model: modelId,
      modelKey,
      benchmark: benchmark.name,
      result: failedResult
    };
  }
}
1971
/**
 * Normalizes the `models` option into a list of `[key, model]` pairs.
 *
 * - An array yields one pair per element with an undefined key.
 * - An object that has a `modelId` property is treated as a single model.
 * - Any other object is treated as a record; its own enumerable keys
 *   become the pair keys.
 *
 * @param {unknown[] | object} models
 * @returns {Array<[string | undefined, unknown]>}
 */
function normalizeModels(models) {
  if (Array.isArray(models)) {
    return Array.from(models, (single) => [undefined, single]);
  }
  const isSingleModel = typeof models === "object" && models !== null && "modelId" in models;
  if (isSingleModel) {
    return [[undefined, models]];
  }
  return Object.entries(models).map(([key, single]) => [key, single]);
}
1988
/**
 * Builds the benchmark config from the optional generation settings.
 * Only settings that are not `undefined` are included.
 *
 * @param {number | undefined} temperature
 * @param {number | undefined} maxTokens
 * @returns {object | undefined} The config, or `undefined` when no setting was provided.
 */
function buildConfig(temperature, maxTokens) {
  const providedSettings = [
    ["temperature", temperature],
    ["maxTokens", maxTokens]
  ].filter(([, value]) => value !== undefined);
  if (providedSettings.length === 0) {
    return undefined;
  }
  return Object.fromEntries(providedSettings);
}
1998
/**
 * Runs the reporter registered under `reporter` on `results`.
 *
 * If no reporter is registered under that name, a warning is printed and
 * the `console` reporter is used instead.
 *
 * @param {string} reporter - Reporter name (a key of `reporters`).
 * @param {Array<object>} results - Evaluation results to report.
 */
function executeReporter(reporter, results) {
  // Look the name up as an OWN property only: a plain `reporters[reporter]`
  // also resolves inherited members such as "constructor" or "toString",
  // which would be called as if they were reporters and print nothing.
  const isRegistered = Object.prototype.hasOwnProperty.call(reporters, reporter);
  const report = isRegistered ? reporters[reporter] : undefined;
  if (typeof report === "function") {
    report(results);
    return;
  }
  console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
  reporters.console(results);
}
2007
/**
 * Runs every benchmark against every model, one after another, then hands
 * the collected results to the selected reporter.
 *
 * @param {object} options
 * @param {unknown[] | object} options.models - Models, in any shape accepted by normalizeModels.
 * @param {Array<{ name: string, run: Function }>} options.benchmarks - Benchmarks to run.
 * @param {string} [options.reporter="console"] - Name of the reporter to use.
 * @param {number} [options.temperature] - Forwarded to benchmarks via the config.
 * @param {number} [options.maxTokens] - Forwarded to benchmarks via the config.
 * @returns {Promise<Array<object>>} One result per (model, benchmark) pair, in execution order.
 */
async function evaluate(options) {
  const { models, benchmarks, reporter = "console", temperature, maxTokens } = options;
  const entries = normalizeModels(models);
  const runConfig = buildConfig(temperature, maxTokens);
  const collected = [];
  for (const [modelKey, model] of entries) {
    for (const benchmark of benchmarks) {
      // Runs are awaited one at a time, so results stay in execution order.
      collected.push(await runSingleBenchmark(model, benchmark, modelKey, runConfig));
    }
  }
  executeReporter(reporter, collected);
  return collected;
}
1386
2032
  // Annotate the CommonJS export names for ESM import in node:
1387
2033
  0 && (module.exports = {
1388
2034
  bfclMultipleBenchmark,