@ai-sdk-tool/eval 0.1.8 → 1.0.0-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -34,477 +34,105 @@ __export(index_exports, {
34
34
  bfclParallelBenchmark: () => bfclParallelBenchmark,
35
35
  bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
36
36
  bfclSimpleBenchmark: () => bfclSimpleBenchmark,
37
+ complexFuncBenchBenchmark: () => complexFuncBenchBenchmark,
37
38
  evaluate: () => evaluate,
38
39
  jsonGenerationBenchmark: () => jsonGenerationBenchmark,
39
40
  jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
40
41
  });
41
42
  module.exports = __toCommonJS(index_exports);
42
43
 
43
- // src/reporters/console.ts
44
- var colors = {
45
- reset: "\x1B[0m",
46
- green: "\x1B[32m",
47
- red: "\x1B[31m",
48
- yellow: "\x1B[33m",
49
- cyan: "\x1B[36m",
50
- magenta: "\x1B[35m",
51
- gray: "\x1B[90m"
52
- };
53
- function printResult(result) {
54
- const { model, modelKey, benchmark, result: benchmarkResult } = result;
55
- const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
56
- console.log(
57
- `
58
- ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
59
- );
60
- console.log(
61
- ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
62
- );
63
- const metrics = Object.entries(benchmarkResult.metrics);
64
- if (metrics.length > 0) {
65
- console.log(" Metrics:");
66
- for (const [key, value] of metrics) {
67
- console.log(` - ${key}: ${value}`);
68
- }
69
- }
70
- if (benchmarkResult.error) {
71
- console.log(
72
- ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
73
- );
74
- }
75
- }
76
- function consoleReporter(results) {
77
- console.log("\n--- \u{1F4CA} Evaluation Report ---");
78
- for (const result of results) {
79
- printResult(result);
80
- }
81
- console.log("\n---------------------------\n");
82
- }
44
+ // src/benchmarks/bfcl.ts
45
+ var import_node_fs2 = require("fs");
46
+ var import_node_path2 = __toESM(require("path"), 1);
47
+ var import_ai = require("ai");
83
48
 
84
- // src/reporters/console.debug.ts
85
- var colors2 = {
86
- reset: "\x1B[0m",
87
- green: "\x1B[32m",
88
- red: "\x1B[31m",
89
- yellow: "\x1B[33m",
90
- cyan: "\x1B[36m",
91
- magenta: "\x1B[35m",
92
- gray: "\x1B[90m",
93
- bold: "\x1B[1m",
94
- underline: "\x1B[4m"
95
- };
96
- function colorizeDiffLine(line) {
97
- if (line.startsWith("+")) return `${colors2.green}${line}${colors2.reset}`;
98
- if (line.startsWith("-")) return `${colors2.red}${line}${colors2.reset}`;
99
- if (line.startsWith("@"))
100
- return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
101
- return line;
102
- }
103
- function uniqueLines(lines) {
104
- const seen = /* @__PURE__ */ new Set();
105
- const out = [];
106
- for (const l of lines) {
107
- if (seen.has(l)) continue;
108
- seen.add(l);
109
- out.push(l);
110
- }
111
- return out;
112
- }
113
- function suggestFixFromDiff(parsed) {
114
- const suggestions = [];
115
- const { error_type, expected, actual, diff } = parsed ?? {};
116
- if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
117
- const expectedName = expected?.function;
118
- const actualName = actual?.function;
119
- if (expectedName && actualName && expectedName !== actualName) {
120
- suggestions.push(
121
- `Call the function '${expectedName}' instead of '${actualName}'.`
122
- );
123
- }
124
- if (Array.isArray(expected?.functions)) {
125
- suggestions.push(
126
- `Ensure tool calls include: ${expected.functions.join(", ")}.`
127
- );
128
- }
129
- }
130
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
131
- const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
132
- if (missing.length) {
133
- suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
134
- }
135
- }
136
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
137
- const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
138
- if (extras.length) {
139
- suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
140
- }
141
- }
142
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
143
- const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
144
- for (const param of targets) {
145
- const allowedOneOfLine = diff.find(
146
- (d) => String(d).startsWith("- expected one of:")
147
- );
148
- const allowedSingleLine = diff.find(
149
- (d) => String(d).startsWith("- expected:")
150
- );
151
- if (allowedSingleLine) {
152
- const value = allowedSingleLine.replace("- expected: ", "");
153
- suggestions.push(`Set '${param}' to: ${value}.`);
154
- } else if (allowedOneOfLine) {
155
- const allowed = allowedOneOfLine.replace("- expected one of: ", "");
156
- suggestions.push(`Set '${param}' to one of: ${allowed}.`);
157
- } else {
158
- suggestions.push(`Adjust '${param}' to an allowed value.`);
159
- }
160
- }
161
- }
162
- if (suggestions.length === 0 && typeof error_type === "string") {
163
- if (error_type.includes("missing_required")) {
164
- suggestions.push(
165
- "Add all required parameters defined by the tool schema."
166
- );
167
- } else if (error_type.includes("unexpected_param")) {
168
- suggestions.push("Remove parameters not present in the tool schema.");
169
- } else if (error_type.includes("wrong_count")) {
170
- suggestions.push(
171
- "Adjust the number of tool calls to match expected count."
172
- );
173
- } else if (error_type.includes("wrong_func_name")) {
174
- suggestions.push("Use the exact expected function name from the schema.");
175
- } else if (error_type.includes("value_error")) {
176
- suggestions.push("Choose a value from the allowed options.");
49
+ // src/utils/paths.ts
50
+ var import_node_fs = __toESM(require("fs"), 1);
51
+ var import_node_module = require("module");
52
+ var import_node_path = __toESM(require("path"), 1);
53
+ var import_node_url = require("url");
54
+ function tryResolveViaPackageEntry(moduleUrl) {
55
+ try {
56
+ const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || import_node_path.default.join(process.cwd(), "package.json");
57
+ const requireFromEntry = (0, import_node_module.createRequire)(baseForRequireEntry);
58
+ const entryPath = requireFromEntry.resolve("@ai-sdk-tool/eval");
59
+ const entryDir = import_node_path.default.dirname(entryPath);
60
+ const guessPkgRoot = import_node_fs.default.existsSync(import_node_path.default.join(entryDir, "..")) ? import_node_path.default.resolve(entryDir, "..") : entryDir;
61
+ const dataAtRoot = import_node_path.default.join(guessPkgRoot, "data");
62
+ if (import_node_fs.default.existsSync(dataAtRoot)) {
63
+ return dataAtRoot;
177
64
  }
65
+ } catch (e) {
178
66
  }
179
- return uniqueLines(suggestions);
67
+ return null;
180
68
  }
181
- function consoleDebugReporter(results) {
182
- console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
183
- for (const r of results) {
184
- const { model, modelKey, benchmark, result } = r;
185
- const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
186
- console.log(
187
- `
188
- ${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
189
- );
190
- console.log(
191
- ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
192
- );
193
- const metrics = Object.entries(result.metrics);
194
- if (metrics.length > 0) {
195
- console.log(" Metrics:");
196
- for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
197
- }
198
- if (result.logs && result.logs.length) {
199
- const failLogs = result.logs.filter(
200
- (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
201
- );
202
- const hasFails = failLogs.length > 0;
203
- if (hasFails) {
204
- let getTestIdFromLogLine2 = function(line) {
205
- if (line.startsWith("[FAIL]")) {
206
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
207
- return m?.[1];
208
- }
209
- if (line.startsWith("[DEBUG-FAIL]")) {
210
- try {
211
- const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
212
- return String(parsed?.id ?? "");
213
- } catch {
214
- }
215
- }
216
- if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
217
- try {
218
- const parsed = JSON.parse(
219
- line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
220
- );
221
- return String(parsed?.id ?? "");
222
- } catch {
223
- }
224
- }
225
- return void 0;
226
- };
227
- var getTestIdFromLogLine = getTestIdFromLogLine2;
228
- const byId = /* @__PURE__ */ new Map();
229
- for (const line of failLogs) {
230
- const id = getTestIdFromLogLine2(line);
231
- const key = id ?? "__general__";
232
- const arr = byId.get(key) ?? [];
233
- arr.push(line);
234
- byId.set(key, arr);
235
- }
236
- console.log(
237
- ` ${colors2.bold}Failure details (grouped):${colors2.reset}`
238
- );
239
- for (const [groupId, lines] of byId) {
240
- if (groupId !== "__general__") {
241
- console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
242
- }
243
- const debugIds = /* @__PURE__ */ new Set();
244
- for (const l of lines) {
245
- if (l.startsWith("[DEBUG-FAIL]")) {
246
- try {
247
- const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
248
- if (parsed?.id) debugIds.add(String(parsed.id));
249
- } catch {
250
- }
251
- }
252
- }
253
- for (const line of lines) {
254
- if (line.startsWith("[FAIL]")) {
255
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
256
- const failId = m?.[1];
257
- if (failId && debugIds.has(failId)) continue;
258
- console.log(` ${colors2.red}${line}${colors2.reset}`);
259
- } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
260
- console.log(` ${colors2.yellow}${line}${colors2.reset}`);
261
- } else if (line.startsWith("[STACK]")) {
262
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
263
- } else if (line.startsWith("[DEBUG-FAIL]")) {
264
- const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
265
- try {
266
- const parsed = JSON.parse(payload);
267
- const { message, diff, expected, actual } = parsed;
268
- if (message)
269
- console.log(
270
- ` ${colors2.bold}${message}${colors2.reset}`
271
- );
272
- if (diff && Array.isArray(diff)) {
273
- for (const dLine of diff)
274
- console.log(" " + colorizeDiffLine(dLine));
275
- } else {
276
- console.log(" expected:");
277
- console.log(
278
- colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
279
- );
280
- console.log(" actual:");
281
- console.log(
282
- colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
283
- );
284
- }
285
- const suggestions = suggestFixFromDiff(parsed);
286
- if (suggestions.length) {
287
- console.log(
288
- ` ${colors2.bold}Suggested fix:${colors2.reset}`
289
- );
290
- for (const s of suggestions)
291
- console.log(` \u2022 ${s}`);
292
- }
293
- } catch {
294
- console.log(` ${line}`);
295
- }
296
- } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
297
- const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
298
- try {
299
- const ctx = JSON.parse(payload);
300
- console.log(` ${colors2.gray}context:${colors2.reset}`);
301
- if (ctx.tool_schema) {
302
- console.log(
303
- colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
304
- );
305
- }
306
- if (ctx.last_user_query) {
307
- console.log(
308
- colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
309
- );
310
- }
311
- if (ctx.raw_model_text) {
312
- console.log(
313
- colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
314
- );
315
- }
316
- if (ctx.parsed_tool_calls) {
317
- console.log(
318
- colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
319
- );
320
- }
321
- if (ctx.ground_truth) {
322
- console.log(
323
- colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
324
- );
325
- }
326
- if (ctx.finish_reason) {
327
- console.log(
328
- colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
329
- );
330
- }
331
- } catch {
332
- console.log(` ${line}`);
333
- }
334
- }
335
- }
336
- }
337
- } else {
338
- const info = result.logs.filter(
339
- (l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
340
- );
341
- for (const line of info)
342
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
343
- }
69
+ function tryResolveViaPackageJson(moduleUrl) {
70
+ try {
71
+ const baseForRequire = typeof moduleUrl === "string" && moduleUrl || import_node_path.default.join(process.cwd(), "package.json");
72
+ const require2 = (0, import_node_module.createRequire)(baseForRequire);
73
+ const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
74
+ const pkgDir = import_node_path.default.dirname(pkgJsonPath);
75
+ const dataAtPkg = import_node_path.default.join(pkgDir, "data");
76
+ if (import_node_fs.default.existsSync(dataAtPkg)) {
77
+ return dataAtPkg;
344
78
  }
79
+ } catch (e) {
345
80
  }
346
- console.log("\n------------------------------------\n");
81
+ return null;
347
82
  }
348
-
349
- // src/reporters/json.ts
350
- function jsonReporter(results) {
351
- const serializableResults = results.map((r) => ({
352
- ...r,
353
- result: {
354
- ...r.result,
355
- error: r.result.error?.message
83
+ function getStartDir(moduleUrl) {
84
+ if (moduleUrl) {
85
+ try {
86
+ return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
87
+ } catch (e) {
88
+ return process.cwd();
356
89
  }
357
- }));
358
- console.log(JSON.stringify(serializableResults, null, 2));
359
- }
360
-
361
- // src/reporters/index.ts
362
- var reporters = {
363
- console: consoleReporter,
364
- json: jsonReporter,
365
- "console.debug": consoleDebugReporter
366
- };
367
-
368
- // src/evaluate.ts
369
- async function runSingleBenchmark(model, benchmark, modelKey, config) {
370
- const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
371
- try {
372
- console.log(
373
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
374
- );
375
- const result = await benchmark.run(model, config);
376
- console.log(
377
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
378
- );
379
- return {
380
- model: modelId,
381
- modelKey,
382
- benchmark: benchmark.name,
383
- result
384
- };
385
- } catch (error) {
386
- console.error(
387
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
388
- error
389
- );
390
- return {
391
- model: modelId,
392
- modelKey,
393
- benchmark: benchmark.name,
394
- result: {
395
- score: 0,
396
- success: false,
397
- metrics: {},
398
- error: error instanceof Error ? error : new Error(String(error))
399
- }
400
- };
401
90
  }
91
+ return process.cwd();
402
92
  }
403
- async function evaluate(options) {
404
- const {
405
- models,
406
- benchmarks,
407
- reporter = "console",
408
- temperature,
409
- maxTokens
410
- } = options;
411
- const modelEntries = [];
412
- if (Array.isArray(models)) {
413
- for (const m of models) modelEntries.push([void 0, m]);
414
- } else if (typeof models === "object" && models !== null && "modelId" in models) {
415
- modelEntries.push([void 0, models]);
416
- } else {
417
- for (const [key, m] of Object.entries(
418
- models
419
- )) {
420
- modelEntries.push([key, m]);
93
+ function findDataDirByTraversal(startDir) {
94
+ let dir = startDir;
95
+ const MAX_PARENT_TRAVERSAL_DEPTH = 6;
96
+ for (let i = 0; i < MAX_PARENT_TRAVERSAL_DEPTH; i += 1) {
97
+ const dataCandidate = import_node_path.default.join(dir, "data");
98
+ if (import_node_fs.default.existsSync(dataCandidate)) {
99
+ return dataCandidate;
421
100
  }
422
- }
423
- const allResults = [];
424
- for (const [modelKey, model] of modelEntries) {
425
- for (const benchmark of benchmarks) {
426
- const config = {};
427
- if (temperature !== void 0) config.temperature = temperature;
428
- if (maxTokens !== void 0) config.maxTokens = maxTokens;
429
- const evaluationResult = await runSingleBenchmark(
430
- model,
431
- benchmark,
432
- modelKey,
433
- Object.keys(config).length > 0 ? config : void 0
434
- );
435
- allResults.push(evaluationResult);
101
+ const parent = import_node_path.default.resolve(dir, "..");
102
+ if (parent === dir) {
103
+ break;
436
104
  }
105
+ dir = parent;
437
106
  }
438
- const report = reporters[reporter];
439
- if (report) {
440
- report(allResults);
441
- } else {
442
- console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
443
- reporters.console(allResults);
444
- }
445
- return allResults;
107
+ return null;
446
108
  }
447
-
448
- // src/benchmarks/bfcl.ts
449
- var import_ai = require("ai");
450
- var import_fs2 = require("fs");
451
- var import_path2 = __toESM(require("path"), 1);
452
-
453
- // src/utils/paths.ts
454
- var import_fs = __toESM(require("fs"), 1);
455
- var import_module = require("module");
456
- var import_path = __toESM(require("path"), 1);
457
- var import_url = require("url");
458
109
  function resolveDataDir(fromModuleUrl) {
459
- const moduleUrl = fromModuleUrl;
460
110
  const override = process.env.BFCL_DATA_DIR;
461
111
  if (override && override.trim().length > 0) {
462
112
  return override;
463
113
  }
464
- try {
465
- const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || import_path.default.join(process.cwd(), "package.json");
466
- const requireFromEntry = (0, import_module.createRequire)(baseForRequireEntry);
467
- const entryPath = requireFromEntry.resolve("@ai-sdk-tool/eval");
468
- const entryDir = import_path.default.dirname(entryPath);
469
- const guessPkgRoot = import_fs.default.existsSync(import_path.default.join(entryDir, "..")) ? import_path.default.resolve(entryDir, "..") : entryDir;
470
- const dataAtRoot = import_path.default.join(guessPkgRoot, "data");
471
- if (import_fs.default.existsSync(dataAtRoot)) return dataAtRoot;
472
- } catch {
473
- }
474
- try {
475
- const baseForRequire = typeof moduleUrl === "string" && moduleUrl || import_path.default.join(process.cwd(), "package.json");
476
- const require2 = (0, import_module.createRequire)(baseForRequire);
477
- const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
478
- const pkgDir = import_path.default.dirname(pkgJsonPath);
479
- const dataAtPkg = import_path.default.join(pkgDir, "data");
480
- if (import_fs.default.existsSync(dataAtPkg)) return dataAtPkg;
481
- } catch {
114
+ const viaEntry = tryResolveViaPackageEntry(fromModuleUrl);
115
+ if (viaEntry) {
116
+ return viaEntry;
482
117
  }
483
- let startDir;
484
- if (moduleUrl) {
485
- try {
486
- startDir = import_path.default.dirname((0, import_url.fileURLToPath)(moduleUrl));
487
- } catch {
488
- startDir = process.cwd();
489
- }
490
- } else {
491
- startDir = process.cwd();
118
+ const viaPackageJson = tryResolveViaPackageJson(fromModuleUrl);
119
+ if (viaPackageJson) {
120
+ return viaPackageJson;
492
121
  }
493
- let dir = startDir;
494
- for (let i = 0; i < 6; i++) {
495
- const dataCandidate = import_path.default.join(dir, "data");
496
- if (import_fs.default.existsSync(dataCandidate)) return dataCandidate;
497
- const parent = import_path.default.resolve(dir, "..");
498
- if (parent === dir) break;
499
- dir = parent;
122
+ const startDir = getStartDir(fromModuleUrl);
123
+ const viaTraversal = findDataDirByTraversal(startDir);
124
+ if (viaTraversal) {
125
+ return viaTraversal;
500
126
  }
501
- const pkgRoot = import_path.default.resolve(startDir, "..", "..");
502
- return import_path.default.join(pkgRoot, "data");
127
+ const pkgRoot = import_node_path.default.resolve(startDir, "..", "..");
128
+ return import_node_path.default.join(pkgRoot, "data");
503
129
  }
504
130
 
505
131
  // src/benchmarks/bfcl/ast-checker.ts
506
132
  function standardizeString(input) {
507
- if (typeof input !== "string") return input;
133
+ if (typeof input !== "string") {
134
+ return input;
135
+ }
508
136
  const regex = /[ ,./\\-_*^]/g;
509
137
  return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
510
138
  }
@@ -524,127 +152,181 @@ function checkStringValue(param, modelValue, possibleAnswers) {
524
152
  }
525
153
  return { valid: true };
526
154
  }
527
- function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
528
- const modelArgs = modelToolCall.args;
529
- const modelFuncName = modelToolCall.toolName;
530
- const expectedFuncName = funcDescription.name;
531
- const expectedParams = funcDescription.parameters.properties;
532
- const requiredParams = funcDescription.parameters.required;
533
- if (modelFuncName !== expectedFuncName) {
155
+ function normalizeObject(obj) {
156
+ if (Array.isArray(obj)) {
157
+ return obj.map(normalizeObject);
158
+ }
159
+ if (obj && typeof obj === "object") {
160
+ const normalized = {};
161
+ for (const [key, value] of Object.entries(obj)) {
162
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
163
+ normalized[key] = value[0];
164
+ } else {
165
+ normalized[key] = normalizeObject(value);
166
+ }
167
+ }
168
+ return normalized;
169
+ }
170
+ return obj;
171
+ }
172
+ function valuesMatch(modelValue, possibleValue) {
173
+ if (modelValue === possibleValue) {
174
+ return true;
175
+ }
176
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
177
+ try {
178
+ const normalizedModel = normalizeObject(modelValue);
179
+ const normalizedPossible = normalizeObject(possibleValue);
180
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
181
+ } catch (e) {
182
+ return false;
183
+ }
184
+ }
185
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
186
+ return modelValue.toString() === possibleValue;
187
+ }
188
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
189
+ return modelValue === possibleValue.toString();
190
+ }
191
+ return false;
192
+ }
193
+ function checkArrayValue(paramName, modelValue, possibleValues) {
194
+ const modelValueStr = JSON.stringify(
195
+ modelValue.map((v) => standardizeString(String(v))).sort()
196
+ );
197
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
198
+ if (!Array.isArray(p)) {
199
+ return false;
200
+ }
201
+ return JSON.stringify(p.map((v) => standardizeString(String(v))).sort()) === modelValueStr;
202
+ }) : false;
203
+ if (!hasMatch) {
534
204
  return {
535
205
  valid: false,
536
- error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
537
- error_type: "simple_function_checker:wrong_func_name"
206
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
207
+ modelValue
208
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
209
+ error_type: "value_error:list"
538
210
  };
539
211
  }
540
- const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
541
- const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
542
- for (const param of requiredParams) {
543
- if (!(param in argsObj)) {
544
- return {
545
- valid: false,
546
- error: `Missing required parameter: '${param}'.`,
212
+ return { valid: true };
213
+ }
214
+ function checkObjectValue(paramName, modelValue, possibleValues) {
215
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some(
216
+ (possibleValue) => valuesMatch(modelValue, possibleValue)
217
+ ) : false;
218
+ if (!hasMatch) {
219
+ return {
220
+ valid: false,
221
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
222
+ modelValue
223
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
224
+ error_type: "value_error:other"
225
+ };
226
+ }
227
+ return { valid: true };
228
+ }
229
+ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
230
+ const funcNameCheck = checkFunctionName(
231
+ funcDescription.name,
232
+ modelToolCall.toolName
233
+ );
234
+ if (!funcNameCheck.valid) {
235
+ return funcNameCheck;
236
+ }
237
+ const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
238
+ const argsObj = modelToolCall.args && typeof modelToolCall.args === "object" ? modelToolCall.args : {};
239
+ const context = {
240
+ funcDescription,
241
+ modelToolCall,
242
+ possibleAnswerParams,
243
+ expectedParams: funcDescription.parameters.properties
244
+ };
245
+ const requiredCheck = checkRequiredParams(
246
+ funcDescription.parameters.required,
247
+ argsObj
248
+ );
249
+ if (!requiredCheck.valid) {
250
+ return requiredCheck;
251
+ }
252
+ const paramsCheck = checkAllParameters(argsObj, context);
253
+ if (!paramsCheck.valid) {
254
+ return paramsCheck;
255
+ }
256
+ const optionalCheck = checkOptionalParams(argsObj, possibleAnswerParams);
257
+ if (!optionalCheck.valid) {
258
+ return optionalCheck;
259
+ }
260
+ return { valid: true };
261
+ }
262
+ function checkFunctionName(expected, actual) {
263
+ if (actual !== expected) {
264
+ return {
265
+ valid: false,
266
+ error: `Function name '${actual}' does not match expected '${expected}'.`,
267
+ error_type: "simple_function_checker:wrong_func_name"
268
+ };
269
+ }
270
+ return { valid: true };
271
+ }
272
+ function checkRequiredParams(requiredParams, argsObj) {
273
+ for (const param of requiredParams) {
274
+ if (!(param in argsObj)) {
275
+ return {
276
+ valid: false,
277
+ error: `Missing required parameter: '${param}'.`,
547
278
  error_type: "simple_function_checker:missing_required"
548
279
  };
549
280
  }
550
281
  }
551
- if (modelArgs && typeof modelArgs === "object") {
552
- for (const paramName of Object.keys(argsObj)) {
553
- const modelValue = argsObj[paramName];
554
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
282
+ return { valid: true };
283
+ }
284
+ function checkAllParameters(argsObj, context) {
285
+ for (const paramName of Object.keys(argsObj)) {
286
+ const paramCheck = checkSingleParameter(
287
+ paramName,
288
+ argsObj[paramName],
289
+ context
290
+ );
291
+ if (!paramCheck.valid) {
292
+ return paramCheck;
293
+ }
294
+ }
295
+ return { valid: true };
296
+ }
297
+ function checkSingleParameter(paramName, modelValue, context) {
298
+ if (!(paramName in context.expectedParams && paramName in context.possibleAnswerParams)) {
299
+ return {
300
+ valid: false,
301
+ error: `Unexpected parameter: '${paramName}'.`,
302
+ error_type: "simple_function_checker:unexpected_param"
303
+ };
304
+ }
305
+ const possibleValues = context.possibleAnswerParams[paramName];
306
+ if (typeof modelValue === "string") {
307
+ return checkStringValue(
308
+ paramName,
309
+ modelValue,
310
+ possibleValues != null ? possibleValues : []
311
+ );
312
+ }
313
+ if (Array.isArray(modelValue)) {
314
+ return checkArrayValue(paramName, modelValue, possibleValues);
315
+ }
316
+ return checkObjectValue(paramName, modelValue, possibleValues);
317
+ }
318
+ function checkOptionalParams(argsObj, possibleAnswerParams) {
319
+ for (const paramName in possibleAnswerParams) {
320
+ if (Object.hasOwn(possibleAnswerParams, paramName)) {
321
+ const val = possibleAnswerParams[paramName];
322
+ const isOptional = Array.isArray(val) && val.includes("");
323
+ if (!(paramName in argsObj || isOptional)) {
555
324
  return {
556
325
  valid: false,
557
- error: `Unexpected parameter: '${paramName}'.`,
558
- error_type: "simple_function_checker:unexpected_param"
326
+ error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
327
+ error_type: "simple_function_checker:missing_optional"
559
328
  };
560
329
  }
561
- const possibleValues = possibleAnswerParams[paramName];
562
- if (typeof modelValue === "string") {
563
- const result = checkStringValue(
564
- paramName,
565
- modelValue,
566
- possibleValues ?? []
567
- );
568
- if (!result.valid) return result;
569
- } else if (Array.isArray(modelValue)) {
570
- const modelValueStr = JSON.stringify(
571
- modelValue.map((v) => standardizeString(String(v))).sort()
572
- );
573
- const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
574
- if (!Array.isArray(p)) return false;
575
- return JSON.stringify(
576
- p.map((v) => standardizeString(String(v))).sort()
577
- ) === modelValueStr;
578
- }) : false;
579
- if (!hasMatch) {
580
- return {
581
- valid: false,
582
- error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
583
- modelValue
584
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
585
- error_type: "value_error:list"
586
- };
587
- }
588
- } else {
589
- const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
590
- if (modelValue === possibleValue) return true;
591
- if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
592
- try {
593
- const normalizeObject = (obj) => {
594
- if (Array.isArray(obj)) {
595
- return obj.map(normalizeObject);
596
- }
597
- if (obj && typeof obj === "object") {
598
- const normalized = {};
599
- for (const [key, value] of Object.entries(
600
- obj
601
- )) {
602
- if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
603
- normalized[key] = value[0];
604
- } else {
605
- normalized[key] = normalizeObject(value);
606
- }
607
- }
608
- return normalized;
609
- }
610
- return obj;
611
- };
612
- const normalizedModel = normalizeObject(modelValue);
613
- const normalizedPossible = normalizeObject(possibleValue);
614
- return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
615
- } catch {
616
- return false;
617
- }
618
- }
619
- if (typeof modelValue === "number" && typeof possibleValue === "string") {
620
- return modelValue.toString() === possibleValue;
621
- }
622
- if (typeof modelValue === "string" && typeof possibleValue === "number") {
623
- return modelValue === possibleValue.toString();
624
- }
625
- return false;
626
- }) : false;
627
- if (!hasMatch) {
628
- return {
629
- valid: false,
630
- error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
631
- modelValue
632
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
633
- error_type: "value_error:other"
634
- };
635
- }
636
- }
637
- }
638
- }
639
- for (const paramName in possibleAnswerParams) {
640
- const val = possibleAnswerParams[paramName];
641
- const isOptional = Array.isArray(val) && val.includes("");
642
- if (!(paramName in argsObj) && !isOptional) {
643
- return {
644
- valid: false,
645
- error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
646
- error_type: "simple_function_checker:missing_optional"
647
- };
648
330
  }
649
331
  }
650
332
  return { valid: true };
@@ -671,8 +353,10 @@ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possib
671
353
  };
672
354
  }
673
355
  let foundMatch = false;
674
- for (let i = 0; i < modelToolCalls.length; i++) {
675
- if (matchedModelCallIndices.has(i)) continue;
356
+ for (let i = 0; i < modelToolCalls.length; i += 1) {
357
+ if (matchedModelCallIndices.has(i)) {
358
+ continue;
359
+ }
676
360
  const checkerResult = simpleFunctionChecker(
677
361
  funcDescription,
678
362
  modelToolCalls[i],
@@ -721,6 +405,39 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
721
405
  }
722
406
 
723
407
  // src/benchmarks/bfcl.ts
408
+ var LINE_SPLIT_REGEX = /\r?\n/;
409
+ var NUMERIC_STRING_REGEX = /^\d+$/;
410
/**
 * Renders one ground-truth call object as an XML-like snippet for debug output.
 *
 * The first key of `call` is taken as the function name and its value as the
 * parameter map. For each parameter an array value is represented by its first
 * element; strings are emitted verbatim, null/undefined become an empty
 * string, and anything else goes through JSON.stringify. Nothing is escaped.
 *
 * @param {object} call - Object whose first key names the function.
 * @returns {string} "<empty_call />" when `call` has no keys,
 *   "<undefined_function />" when the first key is the empty string, a
 *   self-closing tag when the parameter value is falsy or not an object,
 *   otherwise a multi-line element with one child line per parameter.
 */
function convertGroundTruthToXML(call) {
  const [funcName] = Object.keys(call);
  if (funcName === undefined) {
    return "<empty_call />";
  }
  if (funcName === "") {
    return "<undefined_function />";
  }

  const params = call[funcName];
  const hasParamMap = Boolean(params) && typeof params === "object";
  if (!hasParamMap) {
    return `<${funcName} />`;
  }

  // Ground-truth values are usually lists of accepted answers; show the first.
  const renderValue = (value) => {
    const shown = Array.isArray(value) ? value[0] : value;
    if (typeof shown === "string") {
      return shown;
    }
    if (shown == null) {
      return "";
    }
    return JSON.stringify(shown);
  };

  const childLines = Object.entries(params).map(
    ([key, value]) => ` <${key}>${renderValue(value)}</${key}>`
  );
  return [`<${funcName}>`, ...childLines, `</${funcName}>`].join("\n");
}
724
441
  function check(testCase, modelOutput, possibleAnswer) {
725
442
  const category = testCase.id.split("_")[0];
726
443
  try {
@@ -737,19 +454,22 @@ function check(testCase, modelOutput, possibleAnswer) {
737
454
  modelOutput[0],
738
455
  possibleAnswer.ground_truth[0]
739
456
  );
740
- } else if (category === "parallel") {
457
+ }
458
+ if (category === "parallel") {
741
459
  return parallelFunctionCheckerNoOrder(
742
460
  testCase.function,
743
461
  modelOutput,
744
462
  possibleAnswer.ground_truth
745
463
  );
746
- } else if (category === "multiple") {
464
+ }
465
+ if (category === "multiple") {
747
466
  return multipleFunctionChecker(
748
467
  testCase.function,
749
468
  modelOutput,
750
469
  possibleAnswer.ground_truth
751
470
  );
752
- } else if (category.includes("parallel-multiple")) {
471
+ }
472
+ if (category.includes("parallel-multiple")) {
753
473
  return parallelFunctionCheckerNoOrder(
754
474
  testCase.function,
755
475
  modelOutput,
@@ -777,16 +497,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
777
497
  try {
778
498
  const dataPath = resolveDataDir();
779
499
  logs.push(`[INFO] Using data dir: ${dataPath}`);
780
- const testCasesJson = await import_fs2.promises.readFile(
781
- import_path2.default.join(dataPath, testDataFile),
500
+ const testCasesJson = await import_node_fs2.promises.readFile(
501
+ import_node_path2.default.join(dataPath, testDataFile),
782
502
  "utf-8"
783
503
  );
784
- const possibleAnswersJson = await import_fs2.promises.readFile(
785
- import_path2.default.join(dataPath, answerDataFile),
504
+ const possibleAnswersJson = await import_node_fs2.promises.readFile(
505
+ import_node_path2.default.join(dataPath, answerDataFile),
786
506
  "utf-8"
787
507
  );
788
- testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
789
- const possibleAnswers = possibleAnswersJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
508
+ testCases = testCasesJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
509
+ const possibleAnswers = possibleAnswersJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
790
510
  const possibleAnswersMap = new Map(
791
511
  possibleAnswers.map((ans) => [ans.id, ans])
792
512
  );
@@ -798,406 +518,665 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
798
518
  `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
799
519
  );
800
520
  }
801
- const fixSchema = (schema) => {
802
- if (!schema || typeof schema !== "object")
521
/**
 * Rewrites a dataset-style `type` name on `copy` to its JSON Schema
 * counterpart, in place: "dict" -> "object", "tuple" -> "array",
 * "integer"/"float" -> "number". Any other value (or a missing/falsy type)
 * is left untouched.
 *
 * @param {object} copy - Schema node to adjust; mutated directly.
 * @returns {void}
 */
const fixSchemaType2 = (copy) => {
  const typeAliases = new Map([
    ["dict", "object"],
    ["tuple", "array"],
    ["integer", "number"],
    ["float", "number"],
  ]);
  const replacement = typeAliases.get(copy.type);
  if (replacement !== undefined) {
    copy.type = replacement;
  }
};
535
/**
 * Applies `fixSchemaFn` to every entry of `copy.properties`.
 *
 * The fixed entries are stored in a freshly built container that is then
 * assigned to `copy.properties`. The previous implementation wrote them back
 * into the existing `properties` object; because the caller only makes a
 * shallow copy of the schema, that object is still shared with the original
 * tool description, so the original test-case schema was being rewritten as
 * a side effect.
 *
 * @param {object} copy - Shallow copy of a schema node; only its
 *   `properties` slot is reassigned.
 * @param {(schema: unknown) => unknown} fixSchemaFn - Fixer applied to each
 *   property schema.
 * @returns {void} Nothing; does nothing when `properties` is missing or not
 *   an object.
 */
const fixSchemaProperties = (copy, fixSchemaFn) => {
  const sourceProps = copy.properties;
  if (!sourceProps || typeof sourceProps !== "object") {
    return;
  }
  if (Array.isArray(sourceProps)) {
    // Unusual shape; keep it an array while still avoiding in-place writes.
    copy.properties = sourceProps.map((entry) => fixSchemaFn(entry));
    return;
  }
  // Object.fromEntries defines own properties, so a key such as "__proto__"
  // cannot alter the prototype of the rebuilt container.
  copy.properties = Object.fromEntries(
    Object.keys(sourceProps).map((key) => [key, fixSchemaFn(sourceProps[key])])
  );
};
545
+ const fixSchema2 = (schema) => {
546
+ if (!schema || typeof schema !== "object") {
803
547
  return { type: "object", properties: {} };
804
- const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
548
+ }
549
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
805
550
  if (!Array.isArray(copy)) {
806
- if (copy.type) {
807
- if (copy.type === "dict") copy.type = "object";
808
- if (copy.type === "integer" || copy.type === "float")
809
- copy.type = "number";
810
- }
811
- if (copy.properties && typeof copy.properties === "object") {
812
- for (const k of Object.keys(copy.properties)) {
813
- copy.properties[k] = fixSchema(
814
- copy.properties[k]
815
- );
816
- }
551
+ fixSchemaType2(copy);
552
+ fixSchemaProperties(copy, fixSchema2);
553
+ if (copy.items) {
554
+ copy.items = fixSchema2(copy.items);
817
555
  }
818
- if (copy.items) copy.items = fixSchema(copy.items);
819
556
  return copy;
820
557
  }
821
558
  return copy;
822
559
  };
560
/**
 * Flattens one level of nesting when `messages` is an array that contains at
 * least one array element; otherwise returns the input unchanged (same
 * reference, including non-array inputs).
 *
 * @param {unknown} messages - Message list, possibly with nested lists.
 * @returns {unknown} A one-level-flattened copy, or the original value.
 */
const flattenMessages = (messages) => {
  if (!Array.isArray(messages)) {
    return messages;
  }
  const containsNestedList = messages.some((entry) => Array.isArray(entry));
  return containsNestedList ? messages.flat(1) : messages;
};
561
/**
 * Produces a tool name made only of letters, digits, "_" and "-", at most 64
 * characters long. Every other character is replaced by "_"; an empty result
 * falls back to "tool".
 *
 * @param {string} toolName - Original tool name.
 * @returns {string} Sanitized, non-empty name.
 */
const sanitizeName = (toolName) => {
  const replaced = toolName.replace(/[^a-zA-Z0-9_-]/g, "_");
  const truncated = replaced.substring(0, 64);
  return truncated || "tool";
};
565
/**
 * Converts tool descriptions into function-tool definitions with sanitized
 * names and fixed input schemas.
 *
 * Each tool's `parameters` is passed through `fixSchemaFn`; the result is
 * used as `inputSchema` only when it is an object whose `type` is "object",
 * otherwise an empty object schema is substituted. The returned `nameMap`
 * maps each sanitized name back to the original name (a later tool with the
 * same sanitized name overwrites the earlier entry).
 *
 * @param {Array<object>} tools - Tools with `name`, `description`, `parameters`.
 * @param {(schema: unknown) => unknown} fixSchemaFn - Schema normalizer.
 * @returns {{transformedTools: Array<object>, nameMap: Map<string, string>}}
 */
const buildTransformedTools = (tools, fixSchemaFn) => {
  const nameMap = new Map();
  const emptyObjectSchema = () => ({ type: "object", properties: {} });

  const transformedTools = tools.map((t) => {
    const fixed = fixSchemaFn(t.parameters);
    const usable =
      Boolean(fixed) && typeof fixed === "object" && fixed.type === "object";
    const sanitized = sanitizeName(t.name);
    nameMap.set(sanitized, t.name);
    return {
      type: "function",
      name: sanitized,
      description: t.description,
      inputSchema: usable ? fixed : emptyObjectSchema(),
    };
  });

  return { transformedTools, nameMap };
};
582
/**
 * Parses a JSON string expected to hold an array of tool calls.
 *
 * @param {string | undefined | null} raw - JSON text, possibly absent.
 * @returns {Array<unknown>} The parsed array, or an empty array when `raw`
 *   is falsy, is not valid JSON, or does not parse to an array.
 */
const parseDebugToolCalls = (raw) => {
  if (!raw) {
    return [];
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    // Malformed debug payloads are treated as "no tool calls".
    return [];
  }
  return Array.isArray(parsed) ? parsed : [];
};
593
/**
 * Resolves a tool name that was reported as a numeric index.
 *
 * When `rawName` is a string consisting only of digits, it is treated as an
 * index into `transformedTools` and that tool's `name` is returned if
 * present; in every other case `rawName` is returned unchanged.
 *
 * @param {unknown} rawName - Name (or digit string) reported for the call.
 * @param {Array<{name?: string}>} transformedTools - Tools offered to the model.
 * @returns {unknown} The resolved tool name or the original `rawName`.
 */
const getSanitizedName = (rawName, transformedTools) => {
  const looksLikeIndex =
    typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName);
  if (!looksLikeIndex) {
    return rawName;
  }
  const indexedTool = transformedTools[Number(rawName)];
  const indexedName = indexedTool == null ? undefined : indexedTool.name;
  return indexedName == null ? rawName : indexedName;
};
600
/**
 * Decodes tool-call arguments that arrive as a JSON string.
 *
 * @param {unknown} extractedArgs - Arguments as reported for the call.
 * @returns {unknown} The parsed JSON value when `extractedArgs` is a string
 *   containing valid JSON; otherwise `extractedArgs` itself, untouched.
 */
const parseToolArgs = (extractedArgs) => {
  const isText = typeof extractedArgs === "string";
  if (isText) {
    try {
      return JSON.parse(extractedArgs);
    } catch (e) {
      // Not JSON: hand the raw string back to the caller.
    }
  }
  return extractedArgs;
};
610
/**
 * Maps model tool calls back onto the original tool names and normalizes
 * their arguments.
 *
 * For each call the name is read from `toolName` (falling back to `name`),
 * resolved through `getSanitizedName`, then translated via `nameMap` when an
 * entry exists. Arguments are taken from the first non-nullish of `args`,
 * `arguments`, `input`, `params`, `parameters`, run through `parseToolArgs`,
 * and default to `{}` when nullish.
 *
 * @param {Array<object> | null | undefined} toolCalls - Calls from the model.
 * @param {Map<string, string>} nameMap - Sanitized name -> original name.
 * @param {Array<object>} transformedTools - Tools offered to the model.
 * @returns {Array<object>} Copies of the calls with `toolName`, `name` and
 *   `args` overwritten; all other fields are preserved.
 */
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => {
  const argumentFields = ["args", "arguments", "input", "params", "parameters"];

  const restoreOne = (call) => {
    const rawName = call.toolName != null ? call.toolName : call.name;
    const sanitizedFromIndex = getSanitizedName(rawName, transformedTools);
    const mappedName = nameMap.get(sanitizedFromIndex);
    const originalName = mappedName != null ? mappedName : sanitizedFromIndex;

    // Stop at the first field that holds something; if none does, the value
    // of the last field is what gets passed on.
    let extractedArgs;
    for (const field of argumentFields) {
      extractedArgs = call[field];
      if (extractedArgs != null) {
        break;
      }
    }
    const parsedArgs = parseToolArgs(extractedArgs);

    return {
      ...call,
      toolName: originalName,
      name: originalName,
      args: parsedArgs != null ? parsedArgs : {},
    };
  };

  return (toolCalls || []).map((c) => restoreOne(c));
};
628
/**
 * Returns a copy of `args` whose keys are inserted in sorted order, so that
 * serialized output is stable. Nullish and non-object values are returned
 * as-is.
 *
 * @param {unknown} args - Arguments of a tool call.
 * @returns {unknown} A new plain object with sorted keys, or `args` itself
 *   when it is nullish or not an object.
 */
const summarizeArgs = (args) => {
  if (args == null || typeof args !== "object") {
    return args;
  }
  const ordered = {};
  for (const key of Object.keys(args).sort()) {
    ordered[key] = args[key];
  }
  return ordered;
};
643
/**
 * Builds the three diff lines describing a parameter whose value did not
 * match: a header, the expected value(s), and the received value.
 *
 * A single allowed value is shown JSON-encoded after "- expected:"; several
 * are joined with ", " after "- expected one of:", with objects and arrays
 * JSON-encoded and everything else converted with String().
 *
 * @param {string} paramName - Parameter name.
 * @param {unknown} allowed - Allowed value or array of allowed values.
 * @param {unknown} got - Value supplied by the model.
 * @returns {string[]} Header line, expected line, got line.
 */
const generateParamMismatchDiff = (paramName, allowed, got) => {
  const header = `@@ param ${paramName}`;
  const candidates = Array.isArray(allowed) ? allowed : [allowed];

  let expectedLine;
  if (candidates.length === 1) {
    expectedLine = `- expected: ${JSON.stringify(candidates[0])}`;
  } else {
    const rendered = candidates.map((candidate) =>
      candidate !== null && typeof candidate === "object"
        ? JSON.stringify(candidate)
        : String(candidate)
    );
    expectedLine = `- expected one of: ${rendered.join(", ")}`;
  }

  return [header, expectedLine, `+ got: ${JSON.stringify(got)}`];
};
660
/**
 * Lenient check used when building failure diffs: does `got` match any of
 * the allowed values?
 *
 * - When both `got` and the candidate are arrays, they match if their
 *   elements, converted to strings and sorted, are equal.
 * - Otherwise both sides are converted with String(), lower-cased and
 *   stripped of whitespace before comparison.
 *
 * The earlier version reached the string comparison for non-array candidates
 * only by letting `candidate.map` throw inside a try block with an empty
 * catch; the array check is now explicit so no error is raised or swallowed.
 *
 * @param {unknown} allowed - Expected to be an array of acceptable values.
 * @param {unknown} got - Value supplied by the model.
 * @returns {boolean} True when some allowed value matches; false when
 *   `allowed` is not an array or nothing matches.
 */
const paramValueMatches = (allowed, got) => {
  if (!Array.isArray(allowed)) {
    return false;
  }
  const orderInsensitiveKey = (list) =>
    JSON.stringify(list.map((item) => String(item)).sort());
  const looseText = (value) =>
    String(value).toLowerCase().replace(/\s+/g, "");

  return allowed.some((candidate) => {
    if (Array.isArray(got) && Array.isArray(candidate)) {
      return orderInsensitiveKey(got) === orderInsensitiveKey(candidate);
    }
    return looseText(candidate) === looseText(got);
  });
};
674
/**
 * Appends a three-line "function name" section to `diff` when the expected
 * and received names are not strictly equal; otherwise leaves `diff` alone.
 *
 * @param {unknown} expectedName - Name the ground truth expects.
 * @param {unknown} receivedName - Name the model called.
 * @param {string[]} diff - Diff lines collected so far; appended to in place.
 * @returns {void}
 */
const checkFunctionNameMismatch = (expectedName, receivedName, diff) => {
  if (expectedName === receivedName) {
    return;
  }
  diff.push("@@ function name", `- ${expectedName}`, `+ ${receivedName}`);
};
681
/**
 * Appends a "missing required param" line to `diff` for every required
 * parameter that the received arguments do not contain.
 *
 * Presence is tested with an own-property check. The `in` operator used
 * before also matched inherited members, so a required parameter named like
 * an Object.prototype member (e.g. "toString") was never reported missing.
 *
 * @param {Iterable<string>} required - Names of required parameters.
 * @param {object} receivedArgs - Arguments supplied by the model.
 * @param {string[]} diff - Diff lines collected so far; appended to in place.
 * @returns {void}
 */
const checkMissingParams = (required, receivedArgs, diff) => {
  const hasOwn = Object.prototype.hasOwnProperty;
  for (const req of required) {
    if (!hasOwn.call(receivedArgs, req)) {
      diff.push(`- missing required param: ${req}`);
    }
  }
};
688
/**
 * Appends an "unexpected param" line to `diff` for every received argument
 * that the ground truth does not list.
 *
 * Membership is tested with an own-property check on `expectedParams`, as
 * the pre-refactor code did. With the `in` operator, arguments named like
 * Object.prototype members (e.g. "constructor") were treated as expected.
 *
 * @param {object} expectedParams - Ground-truth parameter map.
 * @param {object} receivedArgs - Arguments supplied by the model.
 * @param {string[]} diff - Diff lines collected so far; appended to in place.
 * @returns {void}
 */
const checkUnexpectedParams = (expectedParams, receivedArgs, diff) => {
  const hasOwn = Object.prototype.hasOwnProperty;
  for (const k of Object.keys(receivedArgs)) {
    if (!hasOwn.call(expectedParams, k)) {
      diff.push(`+ unexpected param: ${k}`);
    }
  }
};
695
/**
 * For every received argument that the ground truth lists, appends mismatch
 * lines to `diff` when the value is not among the allowed ones.
 *
 * Only own properties of `expectedParams` are considered, matching the
 * pre-refactor behavior. With the `in` operator an argument named like an
 * Object.prototype member was compared against the inherited method and
 * reported as a value mismatch with a meaningless expected value.
 *
 * @param {object} expectedParams - Ground-truth map: name -> allowed values.
 * @param {object} receivedArgs - Arguments supplied by the model.
 * @param {string[]} diff - Diff lines collected so far; appended to in place.
 * @returns {void}
 */
const checkParamValueMismatches = (expectedParams, receivedArgs, diff) => {
  const hasOwn = Object.prototype.hasOwnProperty;
  for (const k of Object.keys(receivedArgs)) {
    if (!hasOwn.call(expectedParams, k)) {
      continue;
    }
    const allowed = expectedParams[k];
    const got = receivedArgs[k];
    if (!paramValueMatches(allowed, got)) {
      diff.push(...generateParamMismatchDiff(k, allowed, got));
    }
  }
};
706
/**
 * Builds the expected/actual summary and diff lines for a single-call test
 * case.
 *
 * The expected function name comes from the first tool description, the
 * expected parameters from the first ground-truth entry, and the actual call
 * from the first restored call. Parameter checks run only when both expected
 * parameters and received arguments (as an object) are available.
 *
 * @param {Array<object>} tools - Tool descriptions of the test case.
 * @param {{ground_truth?: Array<object>}} possibleAnswer - Ground truth.
 * @param {Array<object>} restoredCalls - Calls made by the model.
 * @returns {{expected: object, actual: object, diff: string[]}}
 */
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
  const funcDesc = tools[0];
  const truthList = possibleAnswer.ground_truth;
  const gt = truthList == null ? undefined : truthList[0];
  const received = restoredCalls[0];

  const expectedFuncName = funcDesc == null ? undefined : funcDesc.name;
  const expectedParams = gt ? gt[Object.keys(gt)[0]] : undefined;

  let receivedName;
  let rawReceivedArgs;
  if (received != null) {
    receivedName = received.toolName != null ? received.toolName : received.name;
    rawReceivedArgs = received.args;
  }
  const receivedArgs = summarizeArgs(rawReceivedArgs);

  const diff = [];
  checkFunctionNameMismatch(expectedFuncName, receivedName, diff);

  const canCompareParams =
    Boolean(expectedParams) &&
    Boolean(receivedArgs) &&
    typeof receivedArgs === "object";
  if (canCompareParams) {
    const paramSchema = funcDesc == null ? undefined : funcDesc.parameters;
    const declaredRequired = paramSchema == null ? undefined : paramSchema.required;
    const required = declaredRequired != null ? declaredRequired : [];
    checkMissingParams(required, receivedArgs, diff);
    checkUnexpectedParams(expectedParams, receivedArgs, diff);
    checkParamValueMismatches(expectedParams, receivedArgs, diff);
  }

  return {
    expected: { function: expectedFuncName, params: expectedParams },
    actual: { function: receivedName, args: receivedArgs },
    diff,
  };
};
745
/**
 * Appends a three-line "call count" section to `diff` when the number of
 * expected and actual calls differ.
 *
 * @param {number} expectedCount - Number of calls in the ground truth.
 * @param {number} actualCount - Number of calls the model made.
 * @param {string[]} diff - Diff lines collected so far; appended to in place.
 * @returns {void}
 */
const checkCallCountMismatch = (expectedCount, actualCount, diff) => {
  if (expectedCount === actualCount) {
    return;
  }
  diff.push("@@ call count", `- expected ${expectedCount}`, `+ got ${actualCount}`);
};
752
/**
 * Appends one line per expected function name that was never called, then
 * one line per called function name that was not expected. Duplicate names
 * produce duplicate lines.
 *
 * @param {Array<string>} expectedNames - Function names in the ground truth.
 * @param {Array<string>} actualNames - Function names the model called.
 * @param {string[]} diff - Diff lines collected so far; appended to in place.
 * @returns {void}
 */
const addMissingAndExtraFunctions = (expectedNames, actualNames, diff) => {
  const calledNames = new Set(actualNames);
  const wantedNames = new Set(expectedNames);

  for (const name of expectedNames) {
    if (!calledNames.has(name)) {
      diff.push(`- missing function: ${name}`);
    }
  }
  for (const name of actualNames) {
    if (!wantedNames.has(name)) {
      diff.push(`+ unexpected function: ${name}`);
    }
  }
};
762
/**
 * Finds the first not-yet-used call whose name equals `fname`.
 *
 * A call's name is its `toolName`, falling back to `name`.
 *
 * @param {unknown} fname - Function name to look for.
 * @param {Array<object>} restoredCalls - Calls made by the model.
 * @param {Set<number>} usedActual - Indices already matched; these are skipped.
 * @returns {number} Index of the matching call, or -1 when there is none.
 */
const findMatchingCallIndex = (fname, restoredCalls, usedActual) =>
  restoredCalls.findIndex((candidate, position) => {
    if (usedActual.has(position)) {
      return false;
    }
    let candidateName;
    if (candidate != null) {
      candidateName =
        candidate.toolName != null ? candidate.toolName : candidate.name;
    }
    return candidateName === fname;
  });
776
/**
 * Runs the three parameter checks (missing required, unexpected, value
 * mismatch) for one matched call, appending their findings to `options.diff`.
 *
 * @param {{receivedArgs: object, expectedParamsAllowed: object,
 *   requiredParams: Array<string>, diff: string[]}} options
 * @returns {void}
 */
const validateFunctionParams = (options) => {
  const received = options.receivedArgs;
  const allowedByParam = options.expectedParamsAllowed;
  const sink = options.diff;

  checkMissingParams(options.requiredParams, received, sink);
  checkUnexpectedParams(allowedByParam, received, sink);
  checkParamValueMismatches(allowedByParam, received, sink);
};
782
/**
 * Diffs one expected call against the model's calls.
 *
 * The first key of `expectedObj` is the function name. The first unused call
 * with that name is claimed (its index is added to `usedActual`), a
 * "@@ function <name>" header is appended to `diff`, and the parameter checks
 * run when both the allowed values and the received arguments (as an object)
 * are available. When no call matches, nothing is recorded.
 *
 * @param {{expectedObj: object, restoredCalls: Array<object>,
 *   tools: Array<object>, usedActual: Set<number>, diff: string[]}} options
 * @returns {void}
 */
const processExpectedCall = (options) => {
  const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
  const fname = Object.keys(expectedObj)[0];

  const matchedIndex = findMatchingCallIndex(fname, restoredCalls, usedActual);
  if (matchedIndex === -1) {
    return;
  }
  usedActual.add(matchedIndex);

  const received = restoredCalls[matchedIndex];
  const receivedArgs = summarizeArgs(received == null ? undefined : received.args);
  const expectedParamsAllowed = expectedObj[fname];

  const funcDesc = tools.find((t) => t.name === fname);
  const paramSchema = funcDesc == null ? undefined : funcDesc.parameters;
  const declaredRequired = paramSchema == null ? undefined : paramSchema.required;
  const requiredParams = declaredRequired != null ? declaredRequired : [];

  diff.push(`@@ function ${fname}`);

  const canCompareParams =
    Boolean(expectedParamsAllowed) &&
    Boolean(receivedArgs) &&
    typeof receivedArgs === "object";
  if (!canCompareParams) {
    return;
  }
  validateFunctionParams({
    receivedArgs,
    expectedParamsAllowed,
    requiredParams,
    diff,
  });
};
810
/**
 * Builds the expected/actual summary and diff lines for a test case with
 * several expected calls.
 *
 * Compares the number of calls, lists missing and unexpected function names,
 * then diffs each ground-truth entry against an unused call of the same
 * name.
 *
 * @param {Array<object>} tools - Tool descriptions of the test case.
 * @param {{ground_truth?: Array<object>}} possibleAnswer - Ground truth; a
 *   nullish `ground_truth` is treated as an empty list.
 * @param {Array<object>} restoredCalls - Calls made by the model.
 * @returns {{expected: {functions: Array}, actual: {functions: Array},
 *   diff: string[]}}
 */
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
  const truthEntries =
    possibleAnswer.ground_truth != null ? possibleAnswer.ground_truth : [];
  const nameOfCall = (c) => (c.toolName != null ? c.toolName : c.name);

  const expectedNames = truthEntries.map((entry) => Object.keys(entry)[0]);
  const actualNames = restoredCalls.map((c) => nameOfCall(c));

  const diff = [];
  checkCallCountMismatch(expectedNames.length, actualNames.length, diff);
  addMissingAndExtraFunctions(expectedNames, actualNames, diff);

  // Shared across entries so each model call is matched at most once.
  const usedActual = new Set();
  for (const expectedObj of truthEntries) {
    processExpectedCall({ expectedObj, restoredCalls, tools, usedActual, diff });
  }

  return {
    expected: { functions: expectedNames },
    actual: { functions: actualNames },
    diff,
  };
};
823
843
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
824
- const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
844
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
825
845
  logs.push(
826
846
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
827
847
  );
828
- const runSingleCase = async (testCase) => {
829
- const caseLogs = [];
830
- const { function: tools, question: messages } = testCase;
831
- const temp = config?.temperature;
832
- const temperature = typeof temp === "number" ? temp : void 0;
833
- const maxTok = config?.maxTokens;
834
- const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
848
+ const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
849
+ var _a, _b, _c, _d;
835
850
  try {
836
- const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
837
- const nameMap = /* @__PURE__ */ new Map();
838
- const sanitizeName = (name2) => {
839
- const s = name2.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
840
- return s.length > 0 ? s : "tool";
841
- };
842
- const transformedTools = tools.map((t) => {
843
- const fixed = fixSchema(t.parameters);
844
- const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
845
- const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
846
- const sanitized = sanitizeName(t.name);
847
- nameMap.set(sanitized, t.name);
848
- return {
849
- type: "function",
850
- name: sanitized,
851
- description: t.description,
852
- inputSchema
853
- };
854
- });
855
- const toolsMap = Object.fromEntries(
856
- transformedTools.map((t) => [
857
- t.name,
858
- (0, import_ai.tool)({
859
- description: typeof t.description === "string" ? t.description : void 0,
860
- inputSchema: (0, import_ai.jsonSchema)(t.inputSchema)
861
- })
862
- ])
851
+ const firstTool = transformedTools[0];
852
+ const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
853
+ caseLogs.push(
854
+ `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
863
855
  );
864
- try {
865
- const firstTool = transformedTools[0];
866
- const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
867
- caseLogs.push(
868
- `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
869
- );
870
- } catch (e) {
871
- caseLogs.push(
872
- `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
873
- );
856
+ } catch (e) {
857
+ caseLogs.push(
858
+ `[DEBUG] ${testCaseId}: failed to introspect tools: ${e.message}`
859
+ );
860
+ }
861
+ };
862
/**
 * Appends a debug line with the raw tool calls, finish reason and text of a
 * model response to `options.caseLogs`. When the values cannot be serialized
 * (JSON.stringify throws), a fallback line is appended instead.
 *
 * @param {{toolCalls: unknown, finishReason: unknown, text: unknown,
 *   testCaseId: string, caseLogs: string[]}} options
 * @returns {void}
 */
const logRawToolCalls = (options) => {
  const { toolCalls, finishReason, text, testCaseId, caseLogs } = options;
  let line;
  try {
    line = `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`;
  } catch (e) {
    line = `[DEBUG] ${testCaseId}: failed to serialize toolCalls`;
  }
  caseLogs.push(line);
};
874
+ const buildFailureContext = (options) => {
875
+ const {
876
+ testCase,
877
+ tools,
878
+ flatMessages,
879
+ mwOriginalText,
880
+ text,
881
+ finishReason,
882
+ mwParsedToolCalls,
883
+ restoredCalls,
884
+ possibleAnswer
885
+ } = options;
886
+ const lastUser = (() => {
887
+ var _a;
888
+ const reversed = [...flatMessages].reverse();
889
+ const found = reversed.find(
890
+ (m) => m.role === "user"
891
+ );
892
+ return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
893
+ })();
894
+ const rawModelText = (() => {
895
+ if (mwOriginalText && mwOriginalText.length > 0) {
896
+ return mwOriginalText;
874
897
  }
875
- const debugSummaryRef = {};
876
- const providerOptions = {
877
- toolCallMiddleware: {
878
- debugSummary: debugSummaryRef
879
- }
880
- };
881
- const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
882
- model,
883
- messages: flatMessages,
884
- tools: toolsMap,
885
- toolChoice: "auto",
886
- providerOptions,
887
- ...temperature !== void 0 ? { temperature } : {},
888
- ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
889
- });
890
- const mwOriginalText = debugSummaryRef.originalText;
891
- const mwParsedToolCalls = (() => {
892
- const raw = debugSummaryRef.toolCalls;
893
- if (!raw) return [];
894
- try {
895
- const arr = JSON.parse(raw);
896
- return Array.isArray(arr) ? arr : [];
897
- } catch {
898
- return [];
899
- }
900
- })();
898
+ if (typeof text === "string") {
899
+ return text;
900
+ }
901
+ return "";
902
+ })();
903
+ return {
904
+ id: testCase.id,
905
+ tool_schema: tools,
906
+ last_user_query: lastUser,
907
+ raw_model_text: rawModelText,
908
+ finish_reason: finishReason,
909
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
910
+ ground_truth: possibleAnswer.ground_truth
911
+ };
912
+ };
913
+ const logFailureDetails = (options) => {
914
+ const {
915
+ testCase,
916
+ tools,
917
+ possibleAnswer,
918
+ restoredCalls,
919
+ checkerResult,
920
+ flatMessages,
921
+ mwOriginalText,
922
+ text,
923
+ finishReason,
924
+ mwParsedToolCalls,
925
+ caseLogs
926
+ } = options;
927
+ try {
928
+ const category = testCase.id.split("_")[0];
929
+ const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
930
+ tools,
931
+ possibleAnswer,
932
+ restoredCalls
933
+ ) : buildParallelDiff(
934
+ tools,
935
+ possibleAnswer,
936
+ restoredCalls
937
+ );
938
+ caseLogs.push(
939
+ `[DEBUG-FAIL] ${JSON.stringify({
940
+ id: testCase.id,
941
+ message: checkerResult.error,
942
+ error_type: checkerResult.error_type,
943
+ expected,
944
+ actual,
945
+ diff
946
+ })}`
947
+ );
901
948
  try {
949
+ const contextPayload = buildFailureContext({
950
+ testCase,
951
+ tools,
952
+ flatMessages,
953
+ mwOriginalText,
954
+ text,
955
+ finishReason,
956
+ mwParsedToolCalls,
957
+ restoredCalls,
958
+ possibleAnswer
959
+ });
902
960
  caseLogs.push(
903
- `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
904
- );
905
- } catch {
906
- caseLogs.push(
907
- `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
961
+ `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
908
962
  );
963
+ } catch (e) {
909
964
  }
910
- const possibleAnswer = possibleAnswersMap.get(testCase.id);
911
- if (!possibleAnswer) {
912
- throw new Error(`No possible answer for id: ${testCase.id}`);
965
+ } catch (e) {
966
+ caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
967
+ }
968
+ };
969
/**
 * Builds the name -> tool object passed to the model call.
 *
 * Each transformed tool becomes an entry keyed by its name whose value is
 * created with the SDK's `tool()` helper; the description is forwarded only
 * when it is a string, and the input schema is wrapped with `jsonSchema()`.
 *
 * @param {Array<{name: string, description: unknown, inputSchema: object}>}
 *   transformedTools - Tools prepared for the model.
 * @returns {object} Plain object mapping tool name to SDK tool definition.
 */
const buildToolsMap = (transformedTools) => {
  const entries = transformedTools.map((t) => {
    const description =
      typeof t.description === "string" ? t.description : undefined;
    const inputSchema = (0, import_ai.jsonSchema)(t.inputSchema);
    return [t.name, (0, import_ai.tool)({ description, inputSchema })];
  });
  return Object.fromEntries(entries);
};
980
+ const executeModelGeneration = async (options) => {
981
+ const {
982
+ model: modelInstance,
983
+ flatMessages,
984
+ toolsMap,
985
+ temperature,
986
+ maxTokens
987
+ } = options;
988
+ const debugSummaryRef = {};
989
+ const providerOptions = {
990
+ toolCallMiddleware: {
991
+ debugSummary: debugSummaryRef
913
992
  }
914
- const restoredCalls = (toolCalls || []).map((c) => {
915
- const rawName = c.toolName ?? c.name;
916
- const sanitizedFromIndex = typeof rawName === "string" && /^\d+$/.test(rawName) ? transformedTools[Number(rawName)]?.name ?? rawName : rawName;
917
- const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
918
- const extractedArgs = c.args ?? c.arguments ?? c.input ?? c.params ?? c.parameters ?? void 0;
919
- let parsedArgs = extractedArgs;
920
- if (typeof parsedArgs === "string") {
921
- try {
922
- parsedArgs = JSON.parse(parsedArgs);
923
- } catch {
924
- }
925
- }
926
- return {
927
- ...c,
928
- toolName: originalName,
929
- name: originalName,
930
- args: parsedArgs ?? {}
931
- };
993
+ };
994
+ const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
995
+ model: modelInstance,
996
+ messages: flatMessages,
997
+ tools: toolsMap,
998
+ toolChoice: "auto",
999
+ providerOptions,
1000
+ ...temperature !== void 0 ? { temperature } : {},
1001
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
1002
+ });
1003
+ return { toolCalls, text, finishReason, debugSummaryRef };
1004
+ };
1005
/**
 * Turns a checker result into the per-case outcome and log lines.
 *
 * A valid result appends "[PASS] <id>". An invalid one appends
 * "[FAIL] <id>: <error>" and hands the case context to `logFailureDetails`
 * for the detailed diff.
 *
 * @param {object} options - `checkerResult`, `testCase`, `caseLogs`, plus the
 *   context forwarded on failure: `tools`, `possibleAnswer`, `restoredCalls`,
 *   `flatMessages`, `mwOriginalText`, `text`, `finishReason`,
 *   `mwParsedToolCalls`.
 * @returns {{valid: boolean, logs: string[]}} Outcome with the same
 *   `caseLogs` array that was passed in.
 */
const processValidationResult = (options) => {
  const { checkerResult, testCase, caseLogs } = options;
  const passed = Boolean(checkerResult.valid);

  if (passed) {
    caseLogs.push(`[PASS] ${testCase.id}`);
  } else {
    caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
    logFailureDetails({
      testCase,
      tools: options.tools,
      possibleAnswer: options.possibleAnswer,
      restoredCalls: options.restoredCalls,
      checkerResult,
      flatMessages: options.flatMessages,
      mwOriginalText: options.mwOriginalText,
      text: options.text,
      finishReason: options.finishReason,
      mwParsedToolCalls: options.mwParsedToolCalls,
      caseLogs,
    });
  }

  return { valid: passed, logs: caseLogs };
};
1039
/**
 * Derives everything needed to run one test case against the model: the
 * flattened message list from `testCase.question`, the transformed tools and
 * name map from `testCase.function`, and the SDK tools map built from the
 * transformed tools.
 *
 * @param {{function: Array<object>, question: unknown}} testCase
 * @returns {{flatMessages: unknown, transformedTools: Array<object>,
 *   nameMap: Map<string, string>, toolsMap: object}}
 */
const prepareTestCaseData = (testCase) => {
  const flatMessages = flattenMessages(testCase.question);
  const built = buildTransformedTools(testCase.function, fixSchema2);
  const toolsMap = buildToolsMap(built.transformedTools);
  return {
    flatMessages,
    transformedTools: built.transformedTools,
    nameMap: built.nameMap,
    toolsMap,
  };
};
1049
/**
 * Evaluates one model response against the ground truth of its test case.
 *
 * Steps: read the middleware debug summary, look up the ground truth (throws
 * when there is none for the test id), optionally print a side-by-side debug
 * dump when the DEBUG_PARSER_OUTPUT environment variable is "true", log the
 * raw tool calls, restore original tool names, run the checker, and convert
 * the checker result into the per-case outcome.
 *
 * @param {object} options - `testCase`, `toolCalls`, `text`, `finishReason`,
 *   `debugSummaryRef`, `nameMap`, `transformedTools`, `flatMessages`,
 *   `tools`, `caseLogs`.
 * @returns {{valid: boolean, logs: string[]}} Result of
 *   `processValidationResult`.
 * @throws {Error} When no ground truth exists for `testCase.id`.
 */
const processModelResponse = (options) => {
  const {
    testCase,
    toolCalls,
    text,
    finishReason,
    debugSummaryRef,
    nameMap,
    transformedTools,
    flatMessages,
    tools,
    caseLogs
  } = options;

  const mwOriginalText = debugSummaryRef.originalText;
  const mwParsedToolCalls = parseDebugToolCalls(debugSummaryRef.toolCalls);

  const possibleAnswer = possibleAnswersMap.get(testCase.id);
  if (!possibleAnswer) {
    throw new Error(`No possible answer for id: ${testCase.id}`);
  }

  // Console dump for comparing expected and actual output by eye.
  const printParserDebug = () => {
    const groundTruth = possibleAnswer.ground_truth;
    const expectedXML = groundTruth
      .map((call) => convertGroundTruthToXML(call))
      .join("\n\n");
    const parsedCount = Array.isArray(toolCalls) ? toolCalls.length : 0;
    console.log("\n========== BFCL CASE DEBUG ==========");
    console.log(`Test Case: ${testCase.id}`);
    console.log(`Expected count: ${groundTruth.length} call(s)`);
    console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
    console.log(expectedXML);
    console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
    console.log(mwOriginalText || text || "(empty)");
    console.log(`\n--- PARSED TOOL CALLS (count: ${parsedCount}) ---`);
    console.log(JSON.stringify(toolCalls, null, 2));
    console.log("======================================\n");
  };
  if (process.env.DEBUG_PARSER_OUTPUT === "true") {
    printParserDebug();
  }

  logRawToolCalls({
    toolCalls,
    finishReason,
    text,
    testCaseId: testCase.id,
    caseLogs
  });

  const restoredCalls = restoreToolCalls(
    toolCalls || [],
    nameMap,
    transformedTools
  );
  const checkerResult = check(testCase, restoredCalls, possibleAnswer);

  return processValidationResult({
    checkerResult,
    testCase,
    tools,
    possibleAnswer,
    restoredCalls,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    caseLogs
  });
};
1113
+ const runSingleCase2 = async (testCase) => {
1114
+ const caseLogs = [];
1115
+ const { function: tools } = testCase;
1116
+ const temp = config == null ? void 0 : config.temperature;
1117
+ const temperature = typeof temp === "number" ? temp : void 0;
1118
+ const maxTok = config == null ? void 0 : config.maxTokens;
1119
+ const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
1120
+ try {
1121
+ const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
1122
+ logFirstToolDebug(transformedTools, testCase.id, caseLogs);
1123
+ const { toolCalls, text, finishReason, debugSummaryRef } = await executeModelGeneration({
1124
+ model,
1125
+ flatMessages,
1126
+ toolsMap,
1127
+ temperature,
1128
+ maxTokens
932
1129
  });
933
- const checkerResult = check(
1130
+ return processModelResponse({
934
1131
  testCase,
935
- restoredCalls,
936
- possibleAnswer
937
- );
938
- if (checkerResult.valid) {
939
- caseLogs.push(`[PASS] ${testCase.id}`);
940
- return { valid: true, logs: caseLogs };
941
- } else {
942
- caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
943
- try {
944
- let generateParamMismatchDiff2 = function(paramName, allowed, got) {
945
- const diffLines = [];
946
- diffLines.push(`@@ param ${paramName}`);
947
- const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
948
- const expectedLine = (() => {
949
- if (allowedArray.length === 1) {
950
- return `- expected: ${JSON.stringify(allowedArray[0])}`;
951
- }
952
- const formatted = allowedArray.map(
953
- (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
954
- ).join(", ");
955
- return `- expected one of: ${formatted}`;
956
- })();
957
- diffLines.push(expectedLine);
958
- diffLines.push(`+ got: ${JSON.stringify(got)}`);
959
- return diffLines;
960
- };
961
- var generateParamMismatchDiff = generateParamMismatchDiff2;
962
- const category = testCase.id.split("_")[0];
963
- const diff = [];
964
- const summarizeArgs = (args) => {
965
- if (args == null) return args;
966
- if (typeof args !== "object") return args;
967
- return Object.keys(args).sort().reduce(
968
- (acc, k) => {
969
- acc[k] = args[k];
970
- return acc;
971
- },
972
- {}
973
- );
974
- };
975
- const expected = {};
976
- const actual = {};
977
- if (category === "simple") {
978
- const funcDesc = tools[0];
979
- const gt = possibleAnswer.ground_truth?.[0];
980
- const expectedFuncName = funcDesc?.name;
981
- const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
982
- const received = restoredCalls[0];
983
- const receivedName = received?.toolName ?? received?.name;
984
- const receivedArgs = summarizeArgs(received?.args);
985
- expected.function = expectedFuncName;
986
- expected.params = expectedParams;
987
- actual.function = receivedName;
988
- actual.args = receivedArgs;
989
- if (expectedFuncName !== receivedName) {
990
- diff.push(`@@ function name`);
991
- diff.push(`- ${expectedFuncName}`);
992
- diff.push(`+ ${receivedName}`);
993
- }
994
- if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
995
- const required = funcDesc?.parameters?.required ?? [];
996
- for (const req of required) {
997
- if (!(req in receivedArgs)) {
998
- diff.push(`- missing required param: ${req}`);
999
- }
1000
- }
1001
- for (const k of Object.keys(
1002
- receivedArgs
1003
- )) {
1004
- if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1005
- diff.push(`+ unexpected param: ${k}`);
1006
- }
1007
- }
1008
- for (const k of Object.keys(
1009
- receivedArgs
1010
- )) {
1011
- if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1012
- const allowed = expectedParams[k];
1013
- const got = receivedArgs[k];
1014
- const includes = Array.isArray(allowed) && allowed.some((v) => {
1015
- try {
1016
- if (Array.isArray(got)) {
1017
- return JSON.stringify(
1018
- got.map((x) => String(x)).sort()
1019
- ) === JSON.stringify(
1020
- v.map((x) => String(x)).sort()
1021
- );
1022
- }
1023
- } catch {
1024
- }
1025
- return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
1026
- });
1027
- if (!includes) {
1028
- diff.push(
1029
- ...generateParamMismatchDiff2(k, allowed, got)
1030
- );
1031
- }
1032
- }
1033
- }
1034
- }
1035
- } else {
1036
- const gtArr = possibleAnswer.ground_truth ?? [];
1037
- const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
1038
- const actualNames = restoredCalls.map(
1039
- (c) => c.toolName ?? c.name
1040
- );
1041
- expected.functions = expectedNames;
1042
- actual.functions = actualNames;
1043
- if (expectedNames.length !== actualNames.length) {
1044
- diff.push(`@@ call count`);
1045
- diff.push(`- expected ${expectedNames.length}`);
1046
- diff.push(`+ got ${actualNames.length}`);
1047
- }
1048
- const missing = expectedNames.filter(
1049
- (n) => !actualNames.includes(n)
1050
- );
1051
- const extra = actualNames.filter(
1052
- (n) => !expectedNames.includes(n)
1053
- );
1054
- for (const m of missing)
1055
- diff.push(`- missing function: ${m}`);
1056
- for (const e of extra)
1057
- diff.push(`+ unexpected function: ${e}`);
1058
- const usedActual = /* @__PURE__ */ new Set();
1059
- for (const expectedObj of gtArr) {
1060
- const fname = Object.keys(expectedObj)[0];
1061
- let matchedIndex = -1;
1062
- for (let i = 0; i < restoredCalls.length; i++) {
1063
- if (usedActual.has(i)) continue;
1064
- const rc = restoredCalls[i];
1065
- const rcName = rc?.toolName ?? rc?.name;
1066
- if (rcName === fname) {
1067
- matchedIndex = i;
1068
- break;
1069
- }
1070
- }
1071
- if (matchedIndex === -1) continue;
1072
- usedActual.add(matchedIndex);
1073
- const received = restoredCalls[matchedIndex];
1074
- const receivedArgs = summarizeArgs(received?.args);
1075
- const expectedParamsAllowed = expectedObj[fname];
1076
- const funcDesc = tools.find(
1077
- (t) => t.name === fname
1078
- );
1079
- const requiredParams = funcDesc?.parameters?.required ?? [];
1080
- diff.push(`@@ function ${fname}`);
1081
- if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1082
- for (const req of requiredParams) {
1083
- if (!(req in receivedArgs)) {
1084
- diff.push(`- missing required param: ${req}`);
1085
- }
1086
- }
1087
- for (const k of Object.keys(
1088
- receivedArgs
1089
- )) {
1090
- if (!Object.prototype.hasOwnProperty.call(
1091
- expectedParamsAllowed,
1092
- k
1093
- )) {
1094
- diff.push(`+ unexpected param: ${k}`);
1095
- }
1096
- }
1097
- for (const k of Object.keys(
1098
- receivedArgs
1099
- )) {
1100
- if (Object.prototype.hasOwnProperty.call(
1101
- expectedParamsAllowed,
1102
- k
1103
- )) {
1104
- const allowed = expectedParamsAllowed[k];
1105
- const got = receivedArgs[k];
1106
- const includes = Array.isArray(allowed) && allowed.some((v) => {
1107
- try {
1108
- if (Array.isArray(got)) {
1109
- return JSON.stringify(
1110
- got.map((x) => String(x)).sort()
1111
- ) === JSON.stringify(
1112
- v.map((x) => String(x)).sort()
1113
- );
1114
- }
1115
- } catch {
1116
- }
1117
- return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
1118
- });
1119
- if (!includes) {
1120
- diff.push(
1121
- ...generateParamMismatchDiff2(k, allowed, got)
1122
- );
1123
- }
1124
- }
1125
- }
1126
- }
1127
- }
1128
- }
1129
- caseLogs.push(
1130
- `[DEBUG-FAIL] ${JSON.stringify({
1131
- id: testCase.id,
1132
- message: checkerResult.error,
1133
- error_type: checkerResult.error_type,
1134
- expected,
1135
- actual,
1136
- diff
1137
- })}`
1138
- );
1139
- try {
1140
- const lastUser = (() => {
1141
- const reversed = [...flatMessages].reverse();
1142
- const found = reversed.find(
1143
- (m) => m.role === "user"
1144
- );
1145
- return found?.content ?? void 0;
1146
- })();
1147
- const contextPayload = {
1148
- id: testCase.id,
1149
- tool_schema: tools,
1150
- last_user_query: lastUser,
1151
- raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
1152
- finish_reason: finishReason,
1153
- parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1154
- ground_truth: possibleAnswer.ground_truth
1155
- };
1156
- caseLogs.push(
1157
- `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
1158
- );
1159
- } catch {
1160
- }
1161
- } catch {
1162
- caseLogs.push(
1163
- `[DEBUG] ${testCase.id}: failed to build debug diff`
1164
- );
1165
- }
1166
- return { valid: false, logs: caseLogs };
1167
- }
1132
+ toolCalls,
1133
+ text,
1134
+ finishReason,
1135
+ debugSummaryRef,
1136
+ nameMap,
1137
+ transformedTools,
1138
+ flatMessages,
1139
+ tools,
1140
+ caseLogs
1141
+ });
1168
1142
  } catch (e) {
1169
1143
  caseLogs.push(
1170
- `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
1144
+ `[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
1171
1145
  );
1172
- if (e?.stack) {
1146
+ if (e == null ? void 0 : e.stack) {
1173
1147
  caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1174
1148
  }
1175
1149
  return { valid: false, logs: caseLogs };
1176
1150
  }
1177
1151
  };
1178
- const mapWithConcurrency = async (items, limit2, mapper) => {
1152
+ const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
1179
1153
  const results = new Array(items.length);
1180
1154
  let idx = 0;
1181
- const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => {
1155
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1182
1156
  while (true) {
1183
- const current = idx++;
1184
- if (current >= items.length) break;
1157
+ const current = idx;
1158
+ idx += 1;
1159
+ if (current >= items.length) {
1160
+ break;
1161
+ }
1185
1162
  results[current] = await mapper(items[current], current);
1186
1163
  }
1187
1164
  });
1188
1165
  await Promise.all(workers);
1189
1166
  return results;
1190
1167
  };
1191
- const resultsPerCase = await mapWithConcurrency(
1168
+ const resultsPerCase = await mapWithConcurrency2(
1192
1169
  testCases,
1193
1170
  concurrency,
1194
- async (tc) => runSingleCase(tc)
1171
+ async (tc) => runSingleCase2(tc)
1195
1172
  );
1196
1173
  correctCount = resultsPerCase.reduce(
1197
1174
  (acc, r) => acc + (r.valid ? 1 : 0),
1198
1175
  0
1199
1176
  );
1200
- for (const r of resultsPerCase) logs.push(...r.logs);
1177
+ for (const r of resultsPerCase) {
1178
+ logs.push(...r.logs);
1179
+ }
1201
1180
  if (testCases.length === 0) {
1202
1181
  return {
1203
1182
  score: 0,
@@ -1224,7 +1203,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1224
1203
  success: false,
1225
1204
  metrics: {},
1226
1205
  error: e,
1227
- logs: [`[FATAL] Failed to run benchmark ${name}: ${e.message}`]
1206
+ logs: [
1207
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1208
+ ]
1228
1209
  };
1229
1210
  }
1230
1211
  }
@@ -1233,87 +1214,591 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1233
1214
  var bfclSimpleBenchmark = createBfclBenchmark(
1234
1215
  "bfcl-simple",
1235
1216
  "BFCL Simple Function Calling",
1236
- "BFCL_v3_simple.json",
1237
- "BFCL_v3_simple_possible_answer.json"
1217
+ "BFCL_v3_simple.jsonl",
1218
+ "BFCL_v3_simple_possible_answer.jsonl"
1238
1219
  );
1239
1220
  var bfclParallelBenchmark = createBfclBenchmark(
1240
1221
  "bfcl-parallel",
1241
1222
  "BFCL Parallel Function Calling",
1242
- "BFCL_v3_parallel.json",
1243
- "BFCL_v3_parallel_possible_answer.json"
1223
+ "BFCL_v3_parallel.jsonl",
1224
+ "BFCL_v3_parallel_possible_answer.jsonl"
1244
1225
  );
1245
1226
  var bfclMultipleBenchmark = createBfclBenchmark(
1246
1227
  "bfcl-multiple",
1247
1228
  "BFCL Multiple Function Calling",
1248
- "BFCL_v3_multiple.json",
1249
- "BFCL_v3_multiple_possible_answer.json"
1229
+ "BFCL_v3_multiple.jsonl",
1230
+ "BFCL_v3_multiple_possible_answer.jsonl"
1250
1231
  );
1251
1232
  var bfclParallelMultipleBenchmark = createBfclBenchmark(
1252
1233
  "bfcl-parallel-multiple",
1253
1234
  "BFCL Parallel & Multiple Function Calling",
1254
- "BFCL_v3_parallel_multiple.json",
1255
- "BFCL_v3_parallel_multiple_possible_answer.json"
1235
+ "BFCL_v3_parallel_multiple.jsonl",
1236
+ "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1256
1237
  );
1257
1238
 
1258
- // src/benchmarks/json-generation.ts
1239
+ // src/benchmarks/complex-func-bench.ts
1240
+ var import_node_fs3 = require("fs");
1241
+ var import_node_path3 = __toESM(require("path"), 1);
1259
1242
  var import_ai2 = require("ai");
1260
- var import_ajv = __toESM(require("ajv"), 1);
1261
- var import_fs3 = require("fs");
1262
- var import_path3 = __toESM(require("path"), 1);
1263
- function extractFirstJsonBlock(text) {
1264
- try {
1265
- return JSON.parse(text);
1266
- } catch {
1243
+ var LINE_SPLIT_REGEX2 = /\r?\n/;
1244
+ function standardizeString2(input) {
1245
+ if (typeof input !== "string") {
1246
+ return input;
1267
1247
  }
1268
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
1269
- if (fenceMatch) {
1270
- const inner = fenceMatch[1].trim();
1271
- try {
1272
- return JSON.parse(inner);
1273
- } catch {
1274
- }
1248
+ return input.toLowerCase().trim();
1249
+ }
1250
+ function valuesMatch2(modelValue, expectedValue) {
1251
+ if (modelValue === expectedValue) {
1252
+ return true;
1275
1253
  }
1276
- const startIdxObj = text.indexOf("{");
1277
- const startIdxArr = text.indexOf("[");
1254
+ if (typeof modelValue === "string" && typeof expectedValue === "string") {
1255
+ return standardizeString2(modelValue) === standardizeString2(expectedValue);
1256
+ }
1257
+ if (typeof modelValue === "number" && typeof expectedValue === "string") {
1258
+ return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
1259
+ }
1260
+ if (typeof modelValue === "string" && typeof expectedValue === "number") {
1261
+ return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
1262
+ }
1263
+ if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
1264
+ try {
1265
+ return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
1266
+ } catch (e) {
1267
+ return false;
1268
+ }
1269
+ }
1270
+ return false;
1271
+ }
1272
+ function validateFunctionName(modelFuncName, expectedFuncName) {
1273
+ if (modelFuncName !== expectedFuncName) {
1274
+ return {
1275
+ valid: false,
1276
+ error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
1277
+ error_type: "function_name_mismatch"
1278
+ };
1279
+ }
1280
+ return { valid: true };
1281
+ }
1282
+ function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
1283
+ for (const param of requiredParams) {
1284
+ if (!(param in modelArgs) && param in expectedArgs) {
1285
+ return {
1286
+ valid: false,
1287
+ error: `Missing required parameter: '${param}'`,
1288
+ error_type: "missing_required_param"
1289
+ };
1290
+ }
1291
+ }
1292
+ return { valid: true };
1293
+ }
1294
+ function validateParamValues(expectedArgs, modelArgs, requiredParams) {
1295
+ for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
1296
+ if (!(paramName in modelArgs)) {
1297
+ if (!requiredParams.includes(paramName)) {
1298
+ continue;
1299
+ }
1300
+ return {
1301
+ valid: false,
1302
+ error: `Missing parameter: '${paramName}'`,
1303
+ error_type: "missing_param"
1304
+ };
1305
+ }
1306
+ const modelValue = modelArgs[paramName];
1307
+ if (!valuesMatch2(modelValue, expectedValue)) {
1308
+ return {
1309
+ valid: false,
1310
+ error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
1311
+ error_type: "value_mismatch"
1312
+ };
1313
+ }
1314
+ }
1315
+ return { valid: true };
1316
+ }
1317
+ function checkFunctionCall(modelCall, expected, toolSpecs) {
1318
+ var _a, _b, _c, _d;
1319
+ const expectedFuncName = Object.keys(expected)[0];
1320
+ const expectedArgs = expected[expectedFuncName];
1321
+ const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
1322
+ const modelArgs = (_b = modelCall.args) != null ? _b : {};
1323
+ const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
1324
+ if (!nameResult.valid) {
1325
+ return nameResult;
1326
+ }
1327
+ const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
1328
+ const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
1329
+ const requiredResult = validateRequiredParams(
1330
+ requiredParams,
1331
+ modelArgs,
1332
+ expectedArgs
1333
+ );
1334
+ if (!requiredResult.valid) {
1335
+ return requiredResult;
1336
+ }
1337
+ return validateParamValues(expectedArgs, modelArgs, requiredParams);
1338
+ }
1339
+ function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
1340
+ if (modelCalls.length !== expectedCalls.length) {
1341
+ return {
1342
+ valid: false,
1343
+ error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
1344
+ error_type: "wrong_call_count"
1345
+ };
1346
+ }
1347
+ if (expectedCalls.length === 1) {
1348
+ return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
1349
+ }
1350
+ const matchedIndices = /* @__PURE__ */ new Set();
1351
+ for (const expected of expectedCalls) {
1352
+ let foundMatch = false;
1353
+ for (let i = 0; i < modelCalls.length; i++) {
1354
+ if (matchedIndices.has(i)) {
1355
+ continue;
1356
+ }
1357
+ const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
1358
+ if (result.valid) {
1359
+ matchedIndices.add(i);
1360
+ foundMatch = true;
1361
+ break;
1362
+ }
1363
+ }
1364
+ if (!foundMatch) {
1365
+ const expectedFuncName = Object.keys(expected)[0];
1366
+ return {
1367
+ valid: false,
1368
+ error: `Could not find matching call for function '${expectedFuncName}'`,
1369
+ error_type: "no_matching_call"
1370
+ };
1371
+ }
1372
+ }
1373
+ return { valid: true };
1374
+ }
1375
+ var fixSchemaType = (copy) => {
1376
+ if (!copy.type) {
1377
+ return;
1378
+ }
1379
+ if (copy.type === "dict") {
1380
+ copy.type = "object";
1381
+ }
1382
+ if (copy.type === "tuple") {
1383
+ copy.type = "array";
1384
+ }
1385
+ if (copy.type === "integer" || copy.type === "float") {
1386
+ copy.type = "number";
1387
+ }
1388
+ };
1389
+ var fixSchema = (schema) => {
1390
+ if (!schema || typeof schema !== "object") {
1391
+ return { type: "object", properties: {} };
1392
+ }
1393
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
1394
+ if (!Array.isArray(copy)) {
1395
+ fixSchemaType(copy);
1396
+ if (copy.properties && typeof copy.properties === "object") {
1397
+ for (const k of Object.keys(copy.properties)) {
1398
+ copy.properties[k] = fixSchema(
1399
+ copy.properties[k]
1400
+ );
1401
+ }
1402
+ }
1403
+ if (copy.items) {
1404
+ copy.items = fixSchema(copy.items);
1405
+ }
1406
+ }
1407
+ return copy;
1408
+ };
1409
+ function buildTools(tools) {
1410
+ const nameMap = /* @__PURE__ */ new Map();
1411
+ const transformedTools = tools.map((t) => {
1412
+ const fixed = fixSchema(t.parameters);
1413
+ const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
1414
+ const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
1415
+ nameMap.set(sanitized, t.name);
1416
+ return {
1417
+ type: "function",
1418
+ name: sanitized,
1419
+ description: t.description,
1420
+ inputSchema
1421
+ };
1422
+ });
1423
+ const toolsMap = Object.fromEntries(
1424
+ transformedTools.map((t) => [
1425
+ t.name,
1426
+ (0, import_ai2.tool)({
1427
+ description: typeof t.description === "string" ? t.description : void 0,
1428
+ inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
1429
+ })
1430
+ ])
1431
+ );
1432
+ return { nameMap, toolsMap };
1433
+ }
1434
+ async function mapWithConcurrency(items, concurrencyLimit, mapper) {
1435
+ const results = new Array(items.length);
1436
+ let idx = 0;
1437
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1438
+ while (true) {
1439
+ const current = idx;
1440
+ idx += 1;
1441
+ if (current >= items.length) {
1442
+ break;
1443
+ }
1444
+ results[current] = await mapper(items[current]);
1445
+ }
1446
+ });
1447
+ await Promise.all(workers);
1448
+ return results;
1449
+ }
1450
+ async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
1451
+ const caseLogs = [];
1452
+ const { function: tools, question: messages } = testCase;
1453
+ try {
1454
+ const { nameMap, toolsMap } = buildTools(tools);
1455
+ const debugSummaryRef = {};
1456
+ const providerOptions = {
1457
+ toolCallMiddleware: { debugSummary: debugSummaryRef }
1458
+ };
1459
+ const { toolCalls, finishReason } = await (0, import_ai2.generateText)({
1460
+ model,
1461
+ messages,
1462
+ tools: toolsMap,
1463
+ toolChoice: "auto",
1464
+ providerOptions,
1465
+ ...temperature !== void 0 ? { temperature } : {},
1466
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
1467
+ });
1468
+ const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
1469
+ var _a, _b, _c, _d;
1470
+ const rawName = (_a = c.toolName) != null ? _a : c.name;
1471
+ const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
1472
+ return {
1473
+ toolName: originalName,
1474
+ name: originalName,
1475
+ args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
1476
+ };
1477
+ });
1478
+ caseLogs.push(
1479
+ `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
1480
+ );
1481
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1482
+ if (!possibleAnswer) {
1483
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1484
+ }
1485
+ const checkerResult = checkAllFunctionCalls(
1486
+ restoredCalls,
1487
+ possibleAnswer.ground_truth,
1488
+ tools
1489
+ );
1490
+ if (checkerResult.valid) {
1491
+ caseLogs.push(`[PASS] ${testCase.id}`);
1492
+ return { valid: true, logs: caseLogs };
1493
+ }
1494
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1495
+ return { valid: false, logs: caseLogs };
1496
+ } catch (e) {
1497
+ caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
1498
+ return { valid: false, logs: caseLogs };
1499
+ }
1500
+ }
1501
+ async function loadTestData(dataPath, testDataFile) {
1502
+ const testCasesJson = await import_node_fs3.promises.readFile(
1503
+ import_node_path3.default.join(dataPath, testDataFile),
1504
+ "utf-8"
1505
+ );
1506
+ return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1507
+ }
1508
+ async function loadAnswerData(dataPath, answerDataFile) {
1509
+ const answersJson = await import_node_fs3.promises.readFile(
1510
+ import_node_path3.default.join(dataPath, answerDataFile),
1511
+ "utf-8"
1512
+ );
1513
+ const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1514
+ return new Map(answers.map((ans) => [ans.id, ans]));
1515
+ }
1516
+ function getConfigValues(config) {
1517
+ const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
1518
+ const limit = limitEnv ? Number(limitEnv) : void 0;
1519
+ const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
1520
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
1521
+ const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
1522
+ const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
1523
+ return { limit, concurrency, temperature, maxTokens };
1524
+ }
1525
+ function aggregateResults(resultsPerCase, testCases) {
1526
+ const logs = [];
1527
+ const correctCount = resultsPerCase.reduce(
1528
+ (acc, r) => acc + (r.valid ? 1 : 0),
1529
+ 0
1530
+ );
1531
+ for (const r of resultsPerCase) {
1532
+ logs.push(...r.logs);
1533
+ }
1534
+ if (testCases.length === 0) {
1535
+ return {
1536
+ score: 0,
1537
+ success: false,
1538
+ metrics: {},
1539
+ logs: ["No test cases found."]
1540
+ };
1541
+ }
1542
+ const score = correctCount / testCases.length;
1543
+ return {
1544
+ score,
1545
+ success: score > 0.5,
1546
+ metrics: {
1547
+ correct_count: correctCount,
1548
+ total_cases: testCases.length,
1549
+ accuracy: score
1550
+ },
1551
+ logs
1552
+ };
1553
+ }
1554
+ function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
1555
+ return {
1556
+ name,
1557
+ version: "1.0.0",
1558
+ description,
1559
+ async run(model, config) {
1560
+ var _a;
1561
+ const logs = [];
1562
+ try {
1563
+ const dataPath = resolveDataDir();
1564
+ logs.push(`[INFO] Using data dir: ${dataPath}`);
1565
+ let testCases = await loadTestData(dataPath, testDataFile);
1566
+ const possibleAnswersMap = await loadAnswerData(
1567
+ dataPath,
1568
+ answerDataFile
1569
+ );
1570
+ const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
1571
+ if (limit && Number.isFinite(limit) && limit > 0) {
1572
+ testCases = testCases.slice(0, limit);
1573
+ logs.push(`[INFO] Limiting test cases to ${limit}`);
1574
+ }
1575
+ logs.push(
1576
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
1577
+ );
1578
+ const resultsPerCase = await mapWithConcurrency(
1579
+ testCases,
1580
+ concurrency,
1581
+ (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
1582
+ );
1583
+ const result = aggregateResults(resultsPerCase, testCases);
1584
+ result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
1585
+ return result;
1586
+ } catch (e) {
1587
+ return {
1588
+ score: 0,
1589
+ success: false,
1590
+ metrics: {},
1591
+ error: e,
1592
+ logs: [
1593
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1594
+ ]
1595
+ };
1596
+ }
1597
+ }
1598
+ };
1599
+ }
1600
+ var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
1601
+ "complex-func-bench",
1602
+ "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
1603
+ "ComplexFuncBench.jsonl",
1604
+ "ComplexFuncBench_possible_answer.jsonl"
1605
+ );
1606
+
1607
+ // src/benchmarks/json-generation.ts
1608
+ var import_node_fs4 = require("fs");
1609
+ var import_node_path4 = __toESM(require("path"), 1);
1610
+ var import_ai3 = require("ai");
1611
+ var import_ajv = __toESM(require("ajv"), 1);
1612
+ var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
1613
+ var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
1614
+ var NEWLINE_REGEX = /\r?\n/;
1615
+ var LINE_SPLIT_REGEX3 = /\r?\n/;
1616
+ function tryDirectParse(text) {
1617
+ try {
1618
+ return JSON.parse(text);
1619
+ } catch (e) {
1620
+ return;
1621
+ }
1622
+ }
1623
+ function tryCodeFenceParse(text) {
1624
+ const fenceMatch = text.match(JSON_FENCE_REGEX) || text.match(CODE_FENCE_REGEX);
1625
+ if (!fenceMatch) {
1626
+ return;
1627
+ }
1628
+ const inner = fenceMatch[1].trim();
1629
+ try {
1630
+ return JSON.parse(inner);
1631
+ } catch (e) {
1632
+ return;
1633
+ }
1634
+ }
1635
+ function tryBracketScan(text) {
1636
+ const startIdxObj = text.indexOf("{");
1637
+ const startIdxArr = text.indexOf("[");
1278
1638
  const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
1279
- if (start === void 0) return void 0;
1639
+ if (start === void 0) {
1640
+ return;
1641
+ }
1280
1642
  const open = text[start] === "{" ? "{" : "[";
1281
1643
  const close = open === "{" ? "}" : "]";
1282
1644
  let depth = 0;
1283
- for (let i = start; i < text.length; i++) {
1645
+ for (let i = start; i < text.length; i += 1) {
1284
1646
  const ch = text[i];
1285
- if (ch === open) depth++;
1286
- else if (ch === close) depth--;
1647
+ if (ch === open) {
1648
+ depth += 1;
1649
+ } else if (ch === close) {
1650
+ depth -= 1;
1651
+ }
1287
1652
  if (depth === 0) {
1288
1653
  const candidate = text.slice(start, i + 1);
1289
1654
  try {
1290
1655
  return JSON.parse(candidate);
1291
- } catch {
1656
+ } catch (e) {
1657
+ return;
1292
1658
  }
1293
- break;
1294
1659
  }
1295
1660
  }
1296
- return void 0;
1661
+ return;
1662
+ }
1663
+ function extractFirstJsonBlock(text) {
1664
+ const directResult = tryDirectParse(text);
1665
+ if (directResult !== void 0) {
1666
+ return directResult;
1667
+ }
1668
+ const fenceResult = tryCodeFenceParse(text);
1669
+ if (fenceResult !== void 0) {
1670
+ return fenceResult;
1671
+ }
1672
+ return tryBracketScan(text);
1297
1673
  }
1298
1674
  function subsetMatch(expected, actual) {
1299
1675
  if (expected === null || typeof expected !== "object") {
1300
1676
  return expected === actual;
1301
1677
  }
1302
1678
  if (Array.isArray(expected)) {
1303
- if (!Array.isArray(actual)) return false;
1304
- for (let i = 0; i < expected.length; i++) {
1305
- if (!subsetMatch(expected[i], actual[i])) return false;
1679
+ if (!Array.isArray(actual)) {
1680
+ return false;
1681
+ }
1682
+ for (let i = 0; i < expected.length; i += 1) {
1683
+ if (!subsetMatch(expected[i], actual[i])) {
1684
+ return false;
1685
+ }
1306
1686
  }
1307
1687
  return true;
1308
1688
  }
1309
- if (actual === null || typeof actual !== "object") return false;
1689
+ if (actual === null || typeof actual !== "object") {
1690
+ return false;
1691
+ }
1310
1692
  const eObj = expected;
1311
1693
  const aObj = actual;
1312
1694
  for (const key of Object.keys(eObj)) {
1313
- if (!subsetMatch(eObj[key], aObj[key])) return false;
1695
+ if (!subsetMatch(eObj[key], aObj[key])) {
1696
+ return false;
1697
+ }
1314
1698
  }
1315
1699
  return true;
1316
1700
  }
1701
+ async function loadDatasets() {
1702
+ try {
1703
+ const dataDir = resolveDataDir();
1704
+ const testsJsonl = await import_node_fs4.promises.readFile(
1705
+ import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
1706
+ "utf-8"
1707
+ );
1708
+ const expectedJsonl = await import_node_fs4.promises.readFile(
1709
+ import_node_path4.default.join(dataDir, "json_generation_expected.jsonl"),
1710
+ "utf-8"
1711
+ );
1712
+ const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1713
+ const expecteds = expectedJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1714
+ const expectedMap = /* @__PURE__ */ new Map();
1715
+ for (const r of expecteds) {
1716
+ expectedMap.set(r.id, r);
1717
+ }
1718
+ return { tests, expectedMap };
1719
+ } catch (e) {
1720
+ return {
1721
+ tests: [],
1722
+ expectedMap: /* @__PURE__ */ new Map(),
1723
+ error: e
1724
+ };
1725
+ }
1726
+ }
1727
+ function buildMessages(tc) {
1728
+ const schemaStr = JSON.stringify(tc.schema, null, 2);
1729
+ return [
1730
+ {
1731
+ role: "system",
1732
+ content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1733
+ },
1734
+ {
1735
+ role: "user",
1736
+ content: [
1737
+ "Generate a JSON object that reflects the following facts.",
1738
+ "JSON Schema:",
1739
+ schemaStr,
1740
+ "Facts:",
1741
+ tc.promptFacts,
1742
+ "Output must be a single JSON only, with no additional text."
1743
+ ].join("\n\n")
1744
+ }
1745
+ ];
1746
+ }
1747
+ function validateTestCase(tc, parsed, context) {
1748
+ const validate = context.ajv.compile(tc.schema);
1749
+ const valid = validate(parsed);
1750
+ if (!valid) {
1751
+ context.logs.push(
1752
+ `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1753
+ );
1754
+ }
1755
+ const expectedRec = context.expectedMap.get(tc.id);
1756
+ if (!expectedRec) {
1757
+ context.logs.push(
1758
+ `[WARN] ${tc.id}: No expected record found. Skipping value match.`
1759
+ );
1760
+ }
1761
+ const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
1762
+ return { valid, valuesOk, parsed };
1763
+ }
1764
+ async function processTestCase(tc, context) {
1765
+ var _a;
1766
+ const messages = buildMessages(tc);
1767
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1768
+ const temperature = typeof temp === "number" ? temp : void 0;
1769
+ const { text } = await (0, import_ai3.generateText)({
1770
+ model: context.model,
1771
+ messages,
1772
+ ...temperature !== void 0 ? { temperature } : {}
1773
+ });
1774
+ let parsed;
1775
+ try {
1776
+ parsed = extractFirstJsonBlock(text);
1777
+ } catch (e) {
1778
+ }
1779
+ if (parsed === void 0) {
1780
+ context.validation.logs.push(
1781
+ `[FAIL] ${tc.id}: Unable to parse JSON from model output.`
1782
+ );
1783
+ return { schemaValid: false, valueMatch: false, correct: false };
1784
+ }
1785
+ const {
1786
+ valid,
1787
+ valuesOk,
1788
+ parsed: validatedParsed
1789
+ } = validateTestCase(tc, parsed, context.validation);
1790
+ const correct = valid && valuesOk;
1791
+ if (correct) {
1792
+ context.validation.logs.push(`[PASS] ${tc.id}`);
1793
+ } else {
1794
+ context.validation.logs.push(
1795
+ `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
1796
+ validatedParsed
1797
+ )}`
1798
+ );
1799
+ }
1800
+ return { schemaValid: valid, valueMatch: valuesOk, correct };
1801
+ }
1317
1802
  var jsonGenerationBenchmark = {
1318
1803
  name: "json-generation",
1319
1804
  version: "2.1.0",
@@ -1321,213 +1806,731 @@ var jsonGenerationBenchmark = {
1321
1806
  async run(model, config) {
1322
1807
  const logs = [];
1323
1808
  const ajv = new import_ajv.default({ allErrors: true, strict: false });
1324
- let schemaValidCount = 0;
1325
- let valueMatchCount = 0;
1326
- let correctCount = 0;
1327
- let tests = [];
1328
- const expectedMap = /* @__PURE__ */ new Map();
1809
+ const { tests, expectedMap, error } = await loadDatasets();
1810
+ if (error) {
1811
+ return {
1812
+ score: 0,
1813
+ success: false,
1814
+ metrics: {},
1815
+ logs: [
1816
+ `[FATAL] Failed to load json-generation datasets: ${error.message}`
1817
+ ],
1818
+ error
1819
+ };
1820
+ }
1821
+ const context = {
1822
+ model,
1823
+ config,
1824
+ validation: { expectedMap, ajv, logs }
1825
+ };
1826
+ const counts = await processAllTests(tests, context);
1827
+ return buildBenchmarkResult(tests.length, counts, logs);
1828
+ }
1829
+ };
1830
+ async function processAllTests(tests, context) {
1831
+ let schemaValidCount = 0;
1832
+ let valueMatchCount = 0;
1833
+ let correctCount = 0;
1834
+ for (const tc of tests) {
1329
1835
  try {
1330
- const dataDir = resolveDataDir();
1331
- const testsJsonl = await import_fs3.promises.readFile(
1332
- import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1333
- "utf-8"
1334
- );
1335
- const expectedJsonl = await import_fs3.promises.readFile(
1336
- import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
1337
- "utf-8"
1338
- );
1339
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1340
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1341
- for (const r of expecteds) expectedMap.set(r.id, r);
1836
+ const result = await processTestCase(tc, context);
1837
+ if (result.schemaValid) {
1838
+ schemaValidCount += 1;
1839
+ }
1840
+ if (result.valueMatch) {
1841
+ valueMatchCount += 1;
1842
+ }
1843
+ if (result.correct) {
1844
+ correctCount += 1;
1845
+ }
1846
+ } catch (e) {
1847
+ const msg = e instanceof Error ? e.message : String(e);
1848
+ context.validation.logs.push(`[ERROR] ${tc.id}: ${msg}`);
1849
+ }
1850
+ }
1851
+ return { schemaValidCount, valueMatchCount, correctCount };
1852
+ }
1853
+ function buildBenchmarkResult(total, counts, logs) {
1854
+ const score = counts.correctCount / total;
1855
+ return {
1856
+ score,
1857
+ success: score >= 0.8,
1858
+ metrics: {
1859
+ total_cases: total,
1860
+ correct_count: counts.correctCount,
1861
+ schema_valid_count: counts.schemaValidCount,
1862
+ value_match_count: counts.valueMatchCount,
1863
+ accuracy: score
1864
+ },
1865
+ logs
1866
+ };
1867
+ }
1868
+ async function loadSchemaOnlyTests() {
1869
+ try {
1870
+ const dataDir = resolveDataDir();
1871
+ const testsJsonl = await import_node_fs4.promises.readFile(
1872
+ import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
1873
+ "utf-8"
1874
+ );
1875
+ const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1876
+ return { tests };
1877
+ } catch (e) {
1878
+ return { tests: [], error: e };
1879
+ }
1880
+ }
1881
+ async function processSchemaOnlyTestCase(tc, context) {
1882
+ var _a;
1883
+ const messages = buildMessages(tc);
1884
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1885
+ const temperature = typeof temp === "number" ? temp : void 0;
1886
+ const { text } = await (0, import_ai3.generateText)({
1887
+ model: context.model,
1888
+ messages,
1889
+ ...temperature !== void 0 ? { temperature } : {}
1890
+ });
1891
+ let parsed;
1892
+ try {
1893
+ parsed = extractFirstJsonBlock(text);
1894
+ } catch (e) {
1895
+ }
1896
+ if (parsed === void 0) {
1897
+ context.logs.push(
1898
+ `[FAIL] ${tc.id}: Could not parse JSON from model output.`
1899
+ );
1900
+ return false;
1901
+ }
1902
+ const validate = context.ajv.compile(tc.schema);
1903
+ const valid = validate(parsed);
1904
+ if (valid) {
1905
+ context.logs.push(`[PASS] ${tc.id}`);
1906
+ return true;
1907
+ }
1908
+ context.logs.push(
1909
+ `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1910
+ );
1911
+ return false;
1912
+ }
1913
+ async function runSchemaOnlyTests(tests, context) {
1914
+ let schemaValidCount = 0;
1915
+ for (const tc of tests) {
1916
+ try {
1917
+ const isValid = await processSchemaOnlyTestCase(tc, context);
1918
+ if (isValid) {
1919
+ schemaValidCount += 1;
1920
+ }
1342
1921
  } catch (e) {
1343
1922
  const msg = e instanceof Error ? e.message : String(e);
1923
+ context.logs.push(`[ERROR] ${tc.id}: ${msg}`);
1924
+ }
1925
+ }
1926
+ return schemaValidCount;
1927
+ }
1928
+ var jsonGenerationSchemaOnlyBenchmark = {
1929
+ name: "json-generation-schema-only",
1930
+ version: "1.0.1",
1931
+ description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
1932
+ async run(model, config) {
1933
+ const logs = [];
1934
+ const ajv = new import_ajv.default({ allErrors: true, strict: false });
1935
+ const { tests, error } = await loadSchemaOnlyTests();
1936
+ if (error) {
1937
+ const msg = error.message;
1344
1938
  return {
1345
1939
  score: 0,
1346
1940
  success: false,
1347
1941
  metrics: {},
1348
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
1349
- error: e
1942
+ logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
1943
+ error
1350
1944
  };
1351
1945
  }
1352
- for (const tc of tests) {
1353
- try {
1354
- const schemaStr = JSON.stringify(tc.schema, null, 2);
1355
- const messages = [
1356
- {
1357
- role: "system",
1358
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1359
- },
1360
- {
1361
- role: "user",
1362
- content: [
1363
- "Generate a JSON object that reflects the following facts.",
1364
- "JSON Schema:",
1365
- schemaStr,
1366
- "Facts:",
1367
- tc.promptFacts,
1368
- "Output must be a single JSON only, with no additional text."
1369
- ].join("\n\n")
1370
- }
1371
- ];
1372
- const temp = config?.temperature;
1373
- const temperature = typeof temp === "number" ? temp : void 0;
1374
- const { text } = await (0, import_ai2.generateText)({
1375
- model,
1376
- messages,
1377
- ...temperature !== void 0 ? { temperature } : {}
1378
- });
1379
- let parsed;
1380
- try {
1381
- parsed = extractFirstJsonBlock(text);
1382
- } catch {
1383
- }
1384
- if (parsed === void 0) {
1385
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
1386
- continue;
1387
- }
1388
- const validate = ajv.compile(tc.schema);
1389
- const valid = validate(parsed);
1390
- if (valid) schemaValidCount++;
1391
- else
1392
- logs.push(
1393
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1394
- );
1395
- const expectedRec = expectedMap.get(tc.id);
1396
- if (!expectedRec) {
1397
- logs.push(
1398
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
1399
- );
1400
- }
1401
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
1402
- if (valuesOk) valueMatchCount++;
1403
- if (valid && valuesOk) {
1404
- correctCount++;
1405
- logs.push(`[PASS] ${tc.id}`);
1406
- } else {
1407
- logs.push(
1408
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
1409
- parsed
1410
- )}`
1411
- );
1412
- }
1413
- } catch (e) {
1414
- const msg = e instanceof Error ? e.message : String(e);
1415
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
1416
- }
1417
- }
1946
+ const context = { model, config, ajv, logs };
1947
+ const schemaValidCount = await runSchemaOnlyTests(tests, context);
1418
1948
  const total = tests.length;
1419
- const score = correctCount / total;
1949
+ const score = total > 0 ? schemaValidCount / total : 0;
1420
1950
  return {
1421
1951
  score,
1422
1952
  success: score >= 0.8,
1423
1953
  metrics: {
1424
1954
  total_cases: total,
1425
- correct_count: correctCount,
1426
1955
  schema_valid_count: schemaValidCount,
1427
- value_match_count: valueMatchCount,
1428
1956
  accuracy: score
1429
1957
  },
1430
1958
  logs
1431
1959
  };
1432
1960
  }
1433
1961
  };
1434
- var jsonGenerationSchemaOnlyBenchmark = {
1435
- name: "json-generation-schema-only",
1436
- version: "1.0.1",
1437
- description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
1438
- async run(model, config) {
1439
- const logs = [];
1440
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
1441
- let tests = [];
1962
+
1963
+ // src/reporters/console.ts
1964
+ var colors = {
1965
+ reset: "\x1B[0m",
1966
+ green: "\x1B[32m",
1967
+ red: "\x1B[31m",
1968
+ yellow: "\x1B[33m",
1969
+ cyan: "\x1B[36m",
1970
+ magenta: "\x1B[35m",
1971
+ gray: "\x1B[90m",
1972
+ white: "\x1B[37m",
1973
+ bgRed: "\x1B[41m"
1974
+ };
1975
+ function formatDiff(diff) {
1976
+ if (!diff || diff.length === 0) {
1977
+ return "";
1978
+ }
1979
+ return diff.map((line) => {
1980
+ if (line.startsWith("-")) {
1981
+ return `${colors.red}${line}${colors.reset}`;
1982
+ }
1983
+ if (line.startsWith("+")) {
1984
+ return `${colors.green}${line}${colors.reset}`;
1985
+ }
1986
+ if (line.startsWith("@@")) {
1987
+ return `${colors.cyan}${line}${colors.reset}`;
1988
+ }
1989
+ return line;
1990
+ }).join("\n ");
1991
+ }
1992
+ function printFailLogs(logs) {
1993
+ const failLogs = logs.filter((l) => l.startsWith("[DEBUG-FAIL]"));
1994
+ for (const log of failLogs) {
1995
+ try {
1996
+ const jsonStr = log.replace("[DEBUG-FAIL] ", "");
1997
+ const data = JSON.parse(jsonStr);
1998
+ console.log(`
1999
+ ${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
2000
+ console.log(
2001
+ ` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
2002
+ );
2003
+ console.log(` Message: ${data.message}`);
2004
+ if (data.diff && Array.isArray(data.diff)) {
2005
+ console.log(` Diff:
2006
+ ${formatDiff(data.diff)}`);
2007
+ }
2008
+ if (data.expected && data.actual) {
2009
+ const expStr = JSON.stringify(data.expected);
2010
+ const actStr = JSON.stringify(data.actual);
2011
+ if (expStr.length < 100 && actStr.length < 100) {
2012
+ console.log(` Expected: ${colors.gray}${expStr}${colors.reset}`);
2013
+ console.log(` Actual: ${colors.gray}${actStr}${colors.reset}`);
2014
+ }
2015
+ }
2016
+ } catch (_e) {
2017
+ console.log(` Raw Log: ${log}`);
2018
+ }
2019
+ }
2020
+ }
2021
+ function printResult(result) {
2022
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
2023
+ const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
2024
+ console.log(
2025
+ `
2026
+ ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
2027
+ );
2028
+ console.log(
2029
+ ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
2030
+ );
2031
+ const metrics = Object.entries(benchmarkResult.metrics);
2032
+ if (metrics.length > 0) {
2033
+ console.log(" Metrics:");
2034
+ for (const [key, value] of metrics) {
2035
+ console.log(` - ${key}: ${value}`);
2036
+ }
2037
+ }
2038
+ if (benchmarkResult.error) {
2039
+ console.log(
2040
+ ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
2041
+ );
2042
+ }
2043
+ if (!benchmarkResult.success && benchmarkResult.logs) {
2044
+ printFailLogs(benchmarkResult.logs);
2045
+ const failLogs = benchmarkResult.logs.filter(
2046
+ (l) => l.startsWith("[DEBUG-FAIL]")
2047
+ );
2048
+ if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
2049
+ console.log(" Raw Logs (Sample):");
2050
+ for (const l of benchmarkResult.logs.slice(0, 10)) {
2051
+ console.log(` ${l}`);
2052
+ }
2053
+ }
2054
+ }
2055
+ }
2056
+ function consoleReporter(results) {
2057
+ console.log("\n--- \u{1F4CA} Evaluation Report ---");
2058
+ for (const result of results) {
2059
+ printResult(result);
2060
+ }
2061
+ console.log("\n---------------------------\n");
2062
+ }
2063
+
2064
+ // src/reporters/console.debug.ts
2065
+ var FAIL_ID_REGEX = /^\[FAIL\]\s+([^:]+):/;
2066
+ var DEBUG_FAIL_PREFIX_REGEX = /^\[DEBUG-FAIL\] /;
2067
+ var DEBUG_FAIL_CONTEXT_PREFIX_REGEX = /^\[DEBUG-FAIL-CONTEXT\] /;
2068
+ var colors2 = {
2069
+ reset: "\x1B[0m",
2070
+ green: "\x1B[32m",
2071
+ red: "\x1B[31m",
2072
+ yellow: "\x1B[33m",
2073
+ cyan: "\x1B[36m",
2074
+ magenta: "\x1B[35m",
2075
+ gray: "\x1B[90m",
2076
+ bold: "\x1B[1m",
2077
+ underline: "\x1B[4m"
2078
+ };
2079
+ function colorizeDiffLine(line) {
2080
+ if (line.startsWith("+")) {
2081
+ return `${colors2.green}${line}${colors2.reset}`;
2082
+ }
2083
+ if (line.startsWith("-")) {
2084
+ return `${colors2.red}${line}${colors2.reset}`;
2085
+ }
2086
+ if (line.startsWith("@")) {
2087
+ return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
2088
+ }
2089
+ return line;
2090
+ }
2091
+ function uniqueLines(lines) {
2092
+ const seen = /* @__PURE__ */ new Set();
2093
+ const out = [];
2094
+ for (const l of lines) {
2095
+ if (seen.has(l)) {
2096
+ continue;
2097
+ }
2098
+ seen.add(l);
2099
+ out.push(l);
2100
+ }
2101
+ return out;
2102
+ }
2103
+ function hasFunctionNameIssue(diff) {
2104
+ return diff.some(
2105
+ (d) => String(d).includes("function name") || String(d).includes("missing function:")
2106
+ );
2107
+ }
2108
+ function suggestFunctionNameFix(expected, actual, suggestions) {
2109
+ const expectedName = expected == null ? void 0 : expected.function;
2110
+ const actualName = actual == null ? void 0 : actual.function;
2111
+ if (expectedName && actualName && expectedName !== actualName) {
2112
+ suggestions.push(
2113
+ `Call the function '${expectedName}' instead of '${actualName}'.`
2114
+ );
2115
+ }
2116
+ if (Array.isArray(expected == null ? void 0 : expected.functions)) {
2117
+ suggestions.push(
2118
+ `Ensure tool calls include: ${expected.functions.join(", ")}.`
2119
+ );
2120
+ }
2121
+ }
2122
+ function suggestMissingParamFix(diff, suggestions) {
2123
+ const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
2124
+ if (missing.length) {
2125
+ suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
2126
+ }
2127
+ }
2128
+ function suggestUnexpectedParamFix(diff, suggestions) {
2129
+ const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
2130
+ if (extras.length) {
2131
+ suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
2132
+ }
2133
+ }
2134
+ function suggestParamValueFix(diff, suggestions) {
2135
+ const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
2136
+ for (const param of targets) {
2137
+ const allowedOneOfLine = diff.find(
2138
+ (d) => String(d).startsWith("- expected one of:")
2139
+ );
2140
+ const allowedSingleLine = diff.find(
2141
+ (d) => String(d).startsWith("- expected:")
2142
+ );
2143
+ if (allowedSingleLine) {
2144
+ const value = allowedSingleLine.replace("- expected: ", "");
2145
+ suggestions.push(`Set '${param}' to: ${value}.`);
2146
+ } else if (allowedOneOfLine) {
2147
+ const allowed = allowedOneOfLine.replace("- expected one of: ", "");
2148
+ suggestions.push(`Set '${param}' to one of: ${allowed}.`);
2149
+ } else {
2150
+ suggestions.push(`Adjust '${param}' to an allowed value.`);
2151
+ }
2152
+ }
2153
+ }
2154
+ function suggestFromErrorType(error_type, suggestions) {
2155
+ if (error_type.includes("missing_required")) {
2156
+ suggestions.push("Add all required parameters defined by the tool schema.");
2157
+ } else if (error_type.includes("unexpected_param")) {
2158
+ suggestions.push("Remove parameters not present in the tool schema.");
2159
+ } else if (error_type.includes("wrong_count")) {
2160
+ suggestions.push(
2161
+ "Adjust the number of tool calls to match expected count."
2162
+ );
2163
+ } else if (error_type.includes("wrong_func_name")) {
2164
+ suggestions.push("Use the exact expected function name from the schema.");
2165
+ } else if (error_type.includes("value_error")) {
2166
+ suggestions.push("Choose a value from the allowed options.");
2167
+ }
2168
+ }
2169
+ function suggestFixFromDiff(parsed) {
2170
+ const suggestions = [];
2171
+ const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
2172
+ if (!Array.isArray(diff)) {
2173
+ if (suggestions.length === 0 && typeof error_type === "string") {
2174
+ suggestFromErrorType(error_type, suggestions);
2175
+ }
2176
+ return uniqueLines(suggestions);
2177
+ }
2178
+ if (hasFunctionNameIssue(diff)) {
2179
+ suggestFunctionNameFix(expected, actual, suggestions);
2180
+ }
2181
+ if (diff.some((d) => String(d).startsWith("- missing required param:"))) {
2182
+ suggestMissingParamFix(diff, suggestions);
2183
+ }
2184
+ if (diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
2185
+ suggestUnexpectedParamFix(diff, suggestions);
2186
+ }
2187
+ if (diff.some((d) => String(d).startsWith("@@ param "))) {
2188
+ suggestParamValueFix(diff, suggestions);
2189
+ }
2190
+ if (suggestions.length === 0 && typeof error_type === "string") {
2191
+ suggestFromErrorType(error_type, suggestions);
2192
+ }
2193
+ return uniqueLines(suggestions);
2194
+ }
2195
+ function getTestIdFromLogLine(line) {
2196
+ var _a, _b;
2197
+ if (line.startsWith("[FAIL]")) {
2198
+ const m = line.match(FAIL_ID_REGEX);
2199
+ return m == null ? void 0 : m[1];
2200
+ }
2201
+ if (line.startsWith("[DEBUG-FAIL]")) {
2202
+ try {
2203
+ const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
2204
+ return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
2205
+ } catch (e) {
2206
+ }
2207
+ }
2208
+ if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
1442
2209
  try {
1443
- const dataDir = resolveDataDir();
1444
- const testsJsonl = await import_fs3.promises.readFile(
1445
- import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1446
- "utf-8"
2210
+ const parsed = JSON.parse(
2211
+ line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
1447
2212
  );
1448
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
2213
+ return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
1449
2214
  } catch (e) {
1450
- const msg = e instanceof Error ? e.message : String(e);
1451
- return {
1452
- score: 0,
1453
- success: false,
1454
- metrics: {},
1455
- logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
1456
- error: e
1457
- };
1458
2215
  }
1459
- let schemaValidCount = 0;
1460
- for (const tc of tests) {
2216
+ }
2217
+ return;
2218
+ }
2219
+ function groupLogsByTestId(failLogs) {
2220
+ var _a;
2221
+ const byId = /* @__PURE__ */ new Map();
2222
+ for (const line of failLogs) {
2223
+ const id = getTestIdFromLogLine(line);
2224
+ const key = id != null ? id : "__general__";
2225
+ const arr = (_a = byId.get(key)) != null ? _a : [];
2226
+ arr.push(line);
2227
+ byId.set(key, arr);
2228
+ }
2229
+ return byId;
2230
+ }
2231
+ function collectDebugIds(lines) {
2232
+ const debugIds = /* @__PURE__ */ new Set();
2233
+ for (const l of lines) {
2234
+ if (l.startsWith("[DEBUG-FAIL]")) {
1461
2235
  try {
1462
- const schemaStr = JSON.stringify(tc.schema, null, 2);
1463
- const messages = [
1464
- {
1465
- role: "system",
1466
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1467
- },
1468
- {
1469
- role: "user",
1470
- content: [
1471
- "Generate a JSON object that reflects the following facts.",
1472
- "JSON Schema:",
1473
- schemaStr,
1474
- "Facts:",
1475
- tc.promptFacts,
1476
- "Output must be a single JSON only, with no additional text."
1477
- ].join("\n\n")
1478
- }
1479
- ];
1480
- const temp = config?.temperature;
1481
- const temperature = typeof temp === "number" ? temp : void 0;
1482
- const { text } = await (0, import_ai2.generateText)({
1483
- model,
1484
- messages,
1485
- ...temperature !== void 0 ? { temperature } : {}
1486
- });
1487
- let parsed;
1488
- try {
1489
- parsed = extractFirstJsonBlock(text);
1490
- } catch {
1491
- }
1492
- if (parsed === void 0) {
1493
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
1494
- continue;
1495
- }
1496
- const validate = ajv.compile(tc.schema);
1497
- const valid = validate(parsed);
1498
- if (valid) {
1499
- schemaValidCount++;
1500
- logs.push(`[PASS] ${tc.id}`);
1501
- } else {
1502
- logs.push(
1503
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1504
- );
2236
+ const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
2237
+ if (parsed == null ? void 0 : parsed.id) {
2238
+ debugIds.add(String(parsed.id));
1505
2239
  }
1506
2240
  } catch (e) {
1507
- const msg = e instanceof Error ? e.message : String(e);
1508
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
1509
2241
  }
1510
2242
  }
1511
- const total = tests.length;
1512
- const score = total > 0 ? schemaValidCount / total : 0;
2243
+ }
2244
+ return debugIds;
2245
+ }
2246
+ function printIndentedJson(prefix, data, color) {
2247
+ console.log(
2248
+ color + prefix + JSON.stringify(data, null, 2).split("\n").join("\n ") + colors2.reset
2249
+ );
2250
+ }
2251
+ function displayDebugFailLine(line) {
2252
+ const payload = line.replace(DEBUG_FAIL_PREFIX_REGEX, "");
2253
+ try {
2254
+ const parsed = JSON.parse(payload);
2255
+ const { message, diff, expected, actual } = parsed;
2256
+ if (message) {
2257
+ console.log(` ${colors2.bold}${message}${colors2.reset}`);
2258
+ }
2259
+ if (diff && Array.isArray(diff)) {
2260
+ for (const dLine of diff) {
2261
+ console.log(` ${colorizeDiffLine(dLine)}`);
2262
+ }
2263
+ } else {
2264
+ console.log(" expected:");
2265
+ printIndentedJson(" ", expected, colors2.green);
2266
+ console.log(" actual:");
2267
+ printIndentedJson(" ", actual, colors2.red);
2268
+ }
2269
+ const suggestions = suggestFixFromDiff(parsed);
2270
+ if (suggestions.length) {
2271
+ console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
2272
+ for (const s of suggestions) {
2273
+ console.log(` \u2022 ${s}`);
2274
+ }
2275
+ }
2276
+ } catch (e) {
2277
+ console.log(` ${line}`);
2278
+ }
2279
+ }
2280
+ function displayContextInfo(ctx) {
2281
+ if (ctx.tool_schema) {
2282
+ printIndentedJson(" tool schema: ", ctx.tool_schema, colors2.gray);
2283
+ }
2284
+ if (ctx.last_user_query) {
2285
+ console.log(
2286
+ colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
2287
+ );
2288
+ }
2289
+ if (ctx.raw_model_text) {
2290
+ console.log(
2291
+ colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
2292
+ );
2293
+ }
2294
+ if (ctx.parsed_tool_calls) {
2295
+ printIndentedJson(
2296
+ " parsed tool calls: ",
2297
+ ctx.parsed_tool_calls,
2298
+ colors2.gray
2299
+ );
2300
+ }
2301
+ if (ctx.ground_truth) {
2302
+ printIndentedJson(
2303
+ " ground truth: ",
2304
+ ctx.ground_truth,
2305
+ colors2.gray
2306
+ );
2307
+ }
2308
+ if (ctx.finish_reason) {
2309
+ console.log(
2310
+ colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
2311
+ );
2312
+ }
2313
+ }
2314
+ function displayDebugFailContextLine(line) {
2315
+ const payload = line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "");
2316
+ try {
2317
+ const ctx = JSON.parse(payload);
2318
+ console.log(` ${colors2.gray}context:${colors2.reset}`);
2319
+ displayContextInfo(ctx);
2320
+ } catch (e) {
2321
+ console.log(` ${line}`);
2322
+ }
2323
+ }
2324
+ function displayLogLine(line, debugIds) {
2325
+ if (line.startsWith("[FAIL]")) {
2326
+ const m = line.match(FAIL_ID_REGEX);
2327
+ const failId = m == null ? void 0 : m[1];
2328
+ if (failId && debugIds.has(failId)) {
2329
+ return;
2330
+ }
2331
+ console.log(` ${colors2.red}${line}${colors2.reset}`);
2332
+ } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
2333
+ console.log(` ${colors2.yellow}${line}${colors2.reset}`);
2334
+ } else if (line.startsWith("[STACK]")) {
2335
+ console.log(` ${colors2.gray}${line}${colors2.reset}`);
2336
+ } else if (line.startsWith("[DEBUG-FAIL]")) {
2337
+ displayDebugFailLine(line);
2338
+ } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
2339
+ displayDebugFailContextLine(line);
2340
+ }
2341
+ }
2342
+ function displayGroupedFailures(byId) {
2343
+ console.log(` ${colors2.bold}Failure details (grouped):${colors2.reset}`);
2344
+ for (const [groupId, lines] of byId) {
2345
+ if (groupId !== "__general__") {
2346
+ console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
2347
+ }
2348
+ const debugIds = collectDebugIds(lines);
2349
+ for (const line of lines) {
2350
+ displayLogLine(line, debugIds);
2351
+ }
2352
+ }
2353
+ }
2354
+ function displaySuccessLogs(logs) {
2355
+ const info = logs.filter(
2356
+ (l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
2357
+ );
2358
+ for (const line of info) {
2359
+ console.log(` ${colors2.gray}${line}${colors2.reset}`);
2360
+ }
2361
+ }
2362
+ function filterFailureLogs(logs) {
2363
+ return logs.filter(
2364
+ (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
2365
+ );
2366
+ }
2367
+ function displayResultLogs(logs) {
2368
+ const failLogs = filterFailureLogs(logs);
2369
+ const hasFails = failLogs.length > 0;
2370
+ if (hasFails) {
2371
+ const byId = groupLogsByTestId(failLogs);
2372
+ displayGroupedFailures(byId);
2373
+ } else {
2374
+ displaySuccessLogs(logs);
2375
+ }
2376
+ }
2377
+ function displayMetrics(metrics) {
2378
+ if (metrics.length > 0) {
2379
+ console.log(" Metrics:");
2380
+ for (const [k, v] of metrics) {
2381
+ console.log(` - ${k}: ${v}`);
2382
+ }
2383
+ }
2384
+ }
2385
+ function displayResultHeader(r) {
2386
+ const { model, modelKey, benchmark, result } = r;
2387
+ const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
2388
+ console.log(
2389
+ `
2390
+ ${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
2391
+ );
2392
+ console.log(
2393
+ ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
2394
+ );
2395
+ }
2396
+ function consoleDebugReporter(results) {
2397
+ var _a;
2398
+ console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
2399
+ for (const r of results) {
2400
+ displayResultHeader(r);
2401
+ displayMetrics(Object.entries(r.result.metrics));
2402
+ if ((_a = r.result.logs) == null ? void 0 : _a.length) {
2403
+ displayResultLogs(r.result.logs);
2404
+ }
2405
+ }
2406
+ console.log("\n------------------------------------\n");
2407
+ }
2408
+
2409
+ // src/reporters/json.ts
2410
+ function jsonReporter(results) {
2411
+ const serializableResults = results.map((r) => {
2412
+ var _a;
1513
2413
  return {
1514
- score,
1515
- success: score >= 0.8,
1516
- metrics: {
1517
- total_cases: total,
1518
- schema_valid_count: schemaValidCount,
1519
- accuracy: score
1520
- },
1521
- logs
2414
+ ...r,
2415
+ result: {
2416
+ ...r.result,
2417
+ error: (_a = r.result.error) == null ? void 0 : _a.message
2418
+ }
1522
2419
  };
1523
- }
2420
+ });
2421
+ console.log(JSON.stringify(serializableResults, null, 2));
2422
+ }
2423
+
2424
+ // src/reporters/index.ts
2425
+ var reporters = {
2426
+ console: consoleReporter,
2427
+ json: jsonReporter,
2428
+ "console.debug": consoleDebugReporter
1524
2429
  };
2430
+
2431
+ // src/evaluate.ts
2432
+ async function runSingleBenchmark(model, benchmark, modelKey, config) {
2433
+ const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
2434
+ try {
2435
+ console.log(
2436
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
2437
+ );
2438
+ const result = await benchmark.run(model, config);
2439
+ console.log(
2440
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
2441
+ );
2442
+ return {
2443
+ model: modelId,
2444
+ modelKey,
2445
+ benchmark: benchmark.name,
2446
+ result
2447
+ };
2448
+ } catch (error) {
2449
+ console.error(
2450
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
2451
+ error
2452
+ );
2453
+ return {
2454
+ model: modelId,
2455
+ modelKey,
2456
+ benchmark: benchmark.name,
2457
+ result: {
2458
+ score: 0,
2459
+ success: false,
2460
+ metrics: {},
2461
+ error: error instanceof Error ? error : new Error(String(error))
2462
+ }
2463
+ };
2464
+ }
2465
+ }
2466
+ function normalizeModels(models) {
2467
+ const modelEntries = [];
2468
+ if (Array.isArray(models)) {
2469
+ for (const m of models) {
2470
+ modelEntries.push([void 0, m]);
2471
+ }
2472
+ } else if (typeof models === "object" && models !== null && "modelId" in models) {
2473
+ modelEntries.push([void 0, models]);
2474
+ } else {
2475
+ for (const [key, m] of Object.entries(
2476
+ models
2477
+ )) {
2478
+ modelEntries.push([key, m]);
2479
+ }
2480
+ }
2481
+ return modelEntries;
2482
+ }
2483
+ function buildConfig(temperature, maxTokens) {
2484
+ const config = {};
2485
+ if (temperature !== void 0) {
2486
+ config.temperature = temperature;
2487
+ }
2488
+ if (maxTokens !== void 0) {
2489
+ config.maxTokens = maxTokens;
2490
+ }
2491
+ return Object.keys(config).length > 0 ? config : void 0;
2492
+ }
2493
+ function executeReporter(reporter, results) {
2494
+ const report = reporters[reporter];
2495
+ if (report) {
2496
+ report(results);
2497
+ } else {
2498
+ console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
2499
+ reporters.console(results);
2500
+ }
2501
+ }
2502
+ async function evaluate(options) {
2503
+ const {
2504
+ models,
2505
+ benchmarks,
2506
+ reporter = "console",
2507
+ temperature,
2508
+ maxTokens
2509
+ } = options;
2510
+ const modelEntries = normalizeModels(models);
2511
+ const config = buildConfig(temperature, maxTokens);
2512
+ const allResults = [];
2513
+ for (const [modelKey, model] of modelEntries) {
2514
+ for (const benchmark of benchmarks) {
2515
+ const evaluationResult = await runSingleBenchmark(
2516
+ model,
2517
+ benchmark,
2518
+ modelKey,
2519
+ config
2520
+ );
2521
+ allResults.push(evaluationResult);
2522
+ }
2523
+ }
2524
+ executeReporter(reporter, allResults);
2525
+ return allResults;
2526
+ }
1525
2527
  // Annotate the CommonJS export names for ESM import in node:
1526
2528
  0 && (module.exports = {
1527
2529
  bfclMultipleBenchmark,
1528
2530
  bfclParallelBenchmark,
1529
2531
  bfclParallelMultipleBenchmark,
1530
2532
  bfclSimpleBenchmark,
2533
+ complexFuncBenchBenchmark,
1531
2534
  evaluate,
1532
2535
  jsonGenerationBenchmark,
1533
2536
  jsonGenerationSchemaOnlyBenchmark