@ai-sdk-tool/eval 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -47,14 +47,15 @@ var colors = {
47
47
  red: "\x1B[31m",
48
48
  yellow: "\x1B[33m",
49
49
  cyan: "\x1B[36m",
50
- magenta: "\x1B[35m"
50
+ magenta: "\x1B[35m",
51
+ gray: "\x1B[90m"
51
52
  };
52
53
  function printResult(result) {
53
- const { model, benchmark, result: benchmarkResult } = result;
54
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
54
55
  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
55
56
  console.log(
56
57
  `
57
- ${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`
58
+ ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
58
59
  );
59
60
  console.log(
60
61
  ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
@@ -80,6 +81,186 @@ function consoleReporter(results) {
80
81
  console.log("\n---------------------------\n");
81
82
  }
82
83
 
84
+ // src/reporters/console.debug.ts
85
// ANSI SGR escape sequences used by the debug console reporter.
var colors2 = {
  reset: "\x1B[0m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  bold: "\x1B[1m",
  underline: "\x1B[4m"
};

/**
 * Colors one diff line according to its leading marker.
 *
 * `+` lines become green, `-` lines red, `@` lines bold cyan; any other line
 * is returned without color codes.
 *
 * @param {unknown} line - Diff entry; coerced with `String()` so that
 *   non-string entries from a parsed JSON payload do not throw.
 * @returns {string} The line, wrapped in ANSI color codes when it is marked.
 */
function colorizeDiffLine(line) {
  const text = String(line);
  if (text.startsWith("+")) return `${colors2.green}${text}${colors2.reset}`;
  if (text.startsWith("-")) return `${colors2.red}${text}${colors2.reset}`;
  if (text.startsWith("@")) {
    return `${colors2.cyan}${colors2.bold}${text}${colors2.reset}`;
  }
  return text;
}
103
/**
 * Removes duplicate entries while keeping the first occurrence of each.
 *
 * A `Set` iterates in insertion order, so spreading it yields the same
 * ordering as a manual "seen" loop.
 *
 * @param {Iterable<string>} lines - Entries to de-duplicate.
 * @returns {string[]} A new array containing each distinct entry once.
 */
function uniqueLines(lines) {
  return [...new Set(lines)];
}
113
/**
 * Derives human-readable fix suggestions from a parsed `[DEBUG-FAIL]` payload.
 *
 * The payload's `diff` array is scanned for the markers this function knows
 * ("function name" / "missing function:", "- missing required param:",
 * "+ unexpected param:", "@@ param "). When none of them yields a suggestion,
 * a generic hint is chosen from `error_type`.
 *
 * @param {{error_type?: unknown, expected?: any, actual?: any, diff?: unknown}|null|undefined} parsed
 *   Parsed payload; `null`/`undefined` yields no suggestions.
 * @returns {string[]} De-duplicated suggestions in the order they were found.
 */
function suggestFixFromDiff(parsed) {
  const { error_type, expected, actual, diff } = parsed ?? {};
  // Coerce once: entries come from JSON and are not guaranteed to be strings.
  const lines = Array.isArray(diff) ? diff.map((d) => String(d)) : [];
  const suggestions = [];

  // Values of every line starting with `prefix` (the "<prefix> " lead removed).
  const valuesAfter = (prefix) =>
    lines
      .filter((l) => l.startsWith(prefix))
      .map((l) => l.replace(`${prefix} `, ""));

  if (lines.some((l) => l.includes("function name") || l.includes("missing function:"))) {
    const expectedName = expected?.function;
    const actualName = actual?.function;
    if (expectedName && actualName && expectedName !== actualName) {
      suggestions.push(
        `Call the function '${expectedName}' instead of '${actualName}'.`
      );
    }
    if (Array.isArray(expected?.functions)) {
      suggestions.push(
        `Ensure tool calls include: ${expected.functions.join(", ")}.`
      );
    }
  }

  const missing = valuesAfter("- missing required param:");
  if (missing.length) {
    suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
  }

  const extras = valuesAfter("+ unexpected param:");
  if (extras.length) {
    suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
  }

  const paramPrefix = "@@ param ";
  const allowedPrefix = "- expected one of:";
  const paramIndices = lines.flatMap((l, i) => (l.startsWith(paramPrefix) ? [i] : []));
  for (const start of paramIndices) {
    const param = lines[start].replace(paramPrefix, "");
    // Look for the allowed-values line inside this parameter's own hunk (up to
    // the next "@@" header) so each parameter reports its own options.
    let allowedLine;
    for (let j = start + 1; j < lines.length && !lines[j].startsWith("@@"); j++) {
      if (lines[j].startsWith(allowedPrefix)) {
        allowedLine = lines[j];
        break;
      }
    }
    // With a single parameter there is no ambiguity, so accept an
    // allowed-values line found anywhere in the diff.
    if (allowedLine === undefined && paramIndices.length === 1) {
      allowedLine = lines.find((l) => l.startsWith(allowedPrefix));
    }
    if (allowedLine) {
      const allowed = allowedLine.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  }

  // Fall back to a generic hint only when the diff gave nothing specific.
  if (suggestions.length === 0 && typeof error_type === "string") {
    if (error_type.includes("missing_required")) {
      suggestions.push(
        "Add all required parameters defined by the tool schema."
      );
    } else if (error_type.includes("unexpected_param")) {
      suggestions.push("Remove parameters not present in the tool schema.");
    } else if (error_type.includes("wrong_count")) {
      suggestions.push(
        "Adjust the number of tool calls to match expected count."
      );
    } else if (error_type.includes("wrong_func_name")) {
      suggestions.push("Use the exact expected function name from the schema.");
    } else if (error_type.includes("value_error")) {
      suggestions.push("Choose a value from the allowed options.");
    }
  }
  return uniqueLines(suggestions);
}
175
/**
 * Prints a verbose evaluation report to the console.
 *
 * For each result it prints the model / benchmark header, the status and
 * score, and any metrics. When the logs contain failure-class lines
 * (`[FAIL]`, `[ERROR]`, `[FATAL]`, `[STACK]`, `[DEBUG-FAIL]`) those are
 * rendered in detail; otherwise only `[INFO]` and `[PASS]` lines are echoed.
 *
 * `[DEBUG-FAIL]` lines carry a JSON payload (`id`, `expected`, `actual`,
 * `message`, `diff`). A `[FAIL]` line whose id also has a `[DEBUG-FAIL]`
 * payload is suppressed so the same failure is not reported twice.
 *
 * @param {Array<{model: string, modelKey?: string, benchmark: string, result: object}>} results
 *   Evaluation results to print.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  const failPrefixes = ["[FAIL]", "[ERROR]", "[FATAL]", "[STACK]", "[DEBUG-FAIL]"];

  // Parses the JSON payload of a "[DEBUG-FAIL] " line; returns undefined when
  // the payload is not valid JSON or is not an object.
  const parseDebugFail = (line) => {
    try {
      const payload = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
      return payload !== null && typeof payload === "object" ? payload : void 0;
    } catch {
      // Malformed payload: the caller falls back to printing the raw line.
      return void 0;
    }
  };

  // Pretty-prints a value for the expected/actual dump. JSON.stringify yields
  // undefined for an absent value, so substitute a printable placeholder.
  const pretty = (value) =>
    (JSON.stringify(value, null, 2) ?? "undefined").split("\n").join("\n ");

  // Prints one parsed [DEBUG-FAIL] payload: header, diff (or the
  // expected/actual dump), then any suggested fixes.
  const printDebugFail = (payload) => {
    const { id, expected, actual, message, diff } = payload;
    console.log(
      ` ${colors2.underline}${id}${colors2.reset} ${message ? `- ${message}` : ""}`
    );
    if (diff && Array.isArray(diff)) {
      // Diff entries come from JSON; coerce so a non-string entry cannot throw.
      for (const dLine of diff) console.log(` ${colorizeDiffLine(String(dLine))}`);
    } else {
      console.log(" expected:");
      console.log(`${colors2.green} ${pretty(expected)}${colors2.reset}`);
      console.log(" actual:");
      console.log(`${colors2.red} ${pretty(actual)}${colors2.reset}`);
    }
    const suggestions = suggestFixFromDiff(payload);
    if (suggestions.length) {
      console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      for (const s of suggestions) console.log(` \u2022 ${s}`);
    }
  };

  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const { model, modelKey, benchmark, result } of results) {
    const status = result.success
      ? `${colors2.green}\u2714 SUCCESS${colors2.reset}`
      : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
    const keyLabel = modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : "";
    console.log(
      `\n${colors2.cyan}[${model}]${colors2.reset}${keyLabel} - ${colors2.magenta}${benchmark}${colors2.reset}`
    );
    console.log(
      ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
    );

    // A result without a metrics object prints no metrics section.
    const metrics = Object.entries(result.metrics ?? {});
    if (metrics.length > 0) {
      console.log(" Metrics:");
      for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
    }

    const logs = result.logs ?? [];
    const failLogs = logs.filter((l) => failPrefixes.some((p) => l.startsWith(p)));
    if (failLogs.length === 0) {
      for (const line of logs) {
        if (line.startsWith("[INFO]") || line.startsWith("[PASS]")) {
          console.log(` ${colors2.gray}${line}${colors2.reset}`);
        }
      }
      continue;
    }

    console.log(` ${colors2.bold}Failure details:${colors2.reset}`);

    // Parse each [DEBUG-FAIL] payload once; collect ids so the matching
    // plain [FAIL] lines can be skipped below.
    const payloadByLine = new Map();
    const debugIds = new Set();
    for (const line of failLogs) {
      if (!line.startsWith("[DEBUG-FAIL]")) continue;
      const payload = parseDebugFail(line);
      payloadByLine.set(line, payload);
      if (payload?.id) debugIds.add(String(payload.id));
    }

    for (const line of failLogs) {
      if (line.startsWith("[FAIL]")) {
        const failId = line.match(/^\[FAIL\]\s+([^:]+):/)?.[1];
        if (failId && debugIds.has(failId)) continue;
        console.log(` ${colors2.red}${line}${colors2.reset}`);
      } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
        console.log(` ${colors2.yellow}${line}${colors2.reset}`);
      } else if (line.startsWith("[STACK]")) {
        console.log(` ${colors2.gray}${line}${colors2.reset}`);
      } else if (line.startsWith("[DEBUG-FAIL]")) {
        const payload = payloadByLine.get(line);
        if (!payload) {
          console.log(` ${line}`);
          continue;
        }
        try {
          printDebugFail(payload);
        } catch {
          // Best effort: if rendering fails, still show the raw log line.
          console.log(` ${line}`);
        }
      }
    }
  }
  console.log("\n------------------------------------\n");
}
263
+
83
264
  // src/reporters/json.ts
84
265
  function jsonReporter(results) {
85
266
  const serializableResults = results.map((r) => ({
@@ -95,30 +276,35 @@ function jsonReporter(results) {
95
276
  // src/reporters/index.ts
96
277
  var reporters = {
97
278
  console: consoleReporter,
98
- json: jsonReporter
279
+ json: jsonReporter,
280
+ "console.debug": consoleDebugReporter
99
281
  };
100
282
 
101
283
  // src/evaluate.ts
102
- async function runSingleBenchmark(model, benchmark) {
284
+ async function runSingleBenchmark(model, benchmark, modelKey) {
103
285
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
104
286
  try {
105
- console.log(`[${modelId}] Running benchmark: ${benchmark.name}...`);
287
+ console.log(
288
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
289
+ );
106
290
  const result = await benchmark.run(model);
107
291
  console.log(
108
- `[${modelId}] Finished benchmark: ${benchmark.name}. Score: ${result.score}`
292
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
109
293
  );
110
294
  return {
111
295
  model: modelId,
296
+ modelKey,
112
297
  benchmark: benchmark.name,
113
298
  result
114
299
  };
115
300
  } catch (error) {
116
301
  console.error(
117
- `[${modelId}] Error running benchmark: ${benchmark.name}`,
302
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
118
303
  error
119
304
  );
120
305
  return {
121
306
  model: modelId,
307
+ modelKey,
122
308
  benchmark: benchmark.name,
123
309
  result: {
124
310
  score: 0,
@@ -131,11 +317,26 @@ async function runSingleBenchmark(model, benchmark) {
131
317
  }
132
318
  async function evaluate(options) {
133
319
  const { models, benchmarks, reporter = "console" } = options;
134
- const modelsArray = Array.isArray(models) ? models : [models];
320
+ const modelEntries = [];
321
+ if (Array.isArray(models)) {
322
+ for (const m of models) modelEntries.push([void 0, m]);
323
+ } else if (typeof models === "object" && models !== null && "modelId" in models) {
324
+ modelEntries.push([void 0, models]);
325
+ } else {
326
+ for (const [key, m] of Object.entries(
327
+ models
328
+ )) {
329
+ modelEntries.push([key, m]);
330
+ }
331
+ }
135
332
  const allResults = [];
136
- for (const model of modelsArray) {
333
+ for (const [modelKey, model] of modelEntries) {
137
334
  for (const benchmark of benchmarks) {
138
- const evaluationResult = await runSingleBenchmark(model, benchmark);
335
+ const evaluationResult = await runSingleBenchmark(
336
+ model,
337
+ benchmark,
338
+ modelKey
339
+ );
139
340
  allResults.push(evaluationResult);
140
341
  }
141
342
  }
@@ -149,17 +350,16 @@ async function evaluate(options) {
149
350
  return allResults;
150
351
  }
151
352
 
152
- // src/benchmarks/json-generation.ts
353
+ // src/benchmarks/bfcl.ts
153
354
  var import_ai = require("ai");
154
- var import_ajv = __toESM(require("ajv"), 1);
155
355
  var import_fs2 = require("fs");
156
356
  var import_path2 = __toESM(require("path"), 1);
157
357
 
158
358
  // src/utils/paths.ts
159
359
  var import_fs = __toESM(require("fs"), 1);
360
+ var import_module = require("module");
160
361
  var import_path = __toESM(require("path"), 1);
161
362
  var import_url = require("url");
162
- var import_module = require("module");
163
363
  function resolveDataDir(fromModuleUrl) {
164
364
  const moduleUrl = fromModuleUrl;
165
365
  const override = process.env.BFCL_DATA_DIR;
@@ -207,396 +407,193 @@ function resolveDataDir(fromModuleUrl) {
207
407
  return import_path.default.join(pkgRoot, "data");
208
408
  }
209
409
 
210
- // src/benchmarks/json-generation.ts
211
- function extractFirstJsonBlock(text) {
212
- try {
213
- return JSON.parse(text);
214
- } catch {
410
+ // src/benchmarks/bfcl/ast-checker.ts
411
/**
 * Normalizes a string for lenient comparison of model values against the
 * accepted answers.
 *
 * Spaces and the punctuation characters `, . / \ - _ * ^` are removed, the
 * result is lower-cased, and single quotes are converted to double quotes.
 * Non-string input is returned unchanged.
 *
 * @param {unknown} input - Value to normalize.
 * @returns {unknown} The normalized string, or `input` itself when it is not
 *   a string.
 */
function standardizeString(input) {
  if (typeof input !== "string") return input;
  // The hyphen is escaped so it is a literal member of the class. Unescaped,
  // `\\-_` is the range "\" through "_", which never matches "-" and
  // unintentionally matches "]".
  const regex = /[ ,./\\\-_*^]/g;
  return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
}
416
+ function checkStringValue(param, modelValue, possibleAnswers) {
417
+ const standardizedModelValue = standardizeString(modelValue);
418
+ const standardizedPossibleAnswers = possibleAnswers.map(
419
+ (ans) => standardizeString(String(ans))
420
+ );
421
+ if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
422
+ return {
423
+ valid: false,
424
+ error: `Invalid value for parameter '${param}': ${JSON.stringify(
425
+ modelValue
426
+ )}. Expected one of ${JSON.stringify(possibleAnswers)}.`,
427
+ error_type: "value_error:string"
428
+ };
215
429
  }
216
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
217
- if (fenceMatch) {
218
- const inner = fenceMatch[1].trim();
219
- try {
220
- return JSON.parse(inner);
221
- } catch {
430
+ return { valid: true };
431
+ }
432
+ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
433
+ const modelArgs = modelToolCall.args;
434
+ const modelFuncName = modelToolCall.toolName;
435
+ const expectedFuncName = funcDescription.name;
436
+ const expectedParams = funcDescription.parameters.properties;
437
+ const requiredParams = funcDescription.parameters.required;
438
+ if (modelFuncName !== expectedFuncName) {
439
+ return {
440
+ valid: false,
441
+ error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
442
+ error_type: "simple_function_checker:wrong_func_name"
443
+ };
444
+ }
445
+ const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
446
+ const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
447
+ for (const param of requiredParams) {
448
+ if (!(param in argsObj)) {
449
+ return {
450
+ valid: false,
451
+ error: `Missing required parameter: '${param}'.`,
452
+ error_type: "simple_function_checker:missing_required"
453
+ };
222
454
  }
223
455
  }
224
- const startIdxObj = text.indexOf("{");
225
- const startIdxArr = text.indexOf("[");
226
- const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
227
- if (start === void 0) return void 0;
228
- const open = text[start] === "{" ? "{" : "[";
229
- const close = open === "{" ? "}" : "]";
230
- let depth = 0;
231
- for (let i = start; i < text.length; i++) {
232
- const ch = text[i];
233
- if (ch === open) depth++;
234
- else if (ch === close) depth--;
235
- if (depth === 0) {
236
- const candidate = text.slice(start, i + 1);
237
- try {
238
- return JSON.parse(candidate);
239
- } catch {
456
+ if (modelArgs && typeof modelArgs === "object") {
457
+ for (const paramName of Object.keys(argsObj)) {
458
+ const modelValue = argsObj[paramName];
459
+ if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
460
+ return {
461
+ valid: false,
462
+ error: `Unexpected parameter: '${paramName}'.`,
463
+ error_type: "simple_function_checker:unexpected_param"
464
+ };
465
+ }
466
+ const possibleValues = possibleAnswerParams[paramName];
467
+ if (typeof modelValue === "string") {
468
+ const result = checkStringValue(
469
+ paramName,
470
+ modelValue,
471
+ possibleValues ?? []
472
+ );
473
+ if (!result.valid) return result;
474
+ } else if (Array.isArray(modelValue)) {
475
+ const modelValueStr = JSON.stringify(
476
+ modelValue.map((v) => standardizeString(String(v))).sort()
477
+ );
478
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
479
+ if (!Array.isArray(p)) return false;
480
+ return JSON.stringify(
481
+ p.map((v) => standardizeString(String(v))).sort()
482
+ ) === modelValueStr;
483
+ }) : false;
484
+ if (!hasMatch) {
485
+ return {
486
+ valid: false,
487
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
488
+ modelValue
489
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
490
+ error_type: "value_error:list"
491
+ };
492
+ }
493
+ } else {
494
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
495
+ if (modelValue === possibleValue) return true;
496
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
497
+ try {
498
+ const normalizeObject = (obj) => {
499
+ if (Array.isArray(obj)) {
500
+ return obj.map(normalizeObject);
501
+ }
502
+ if (obj && typeof obj === "object") {
503
+ const normalized = {};
504
+ for (const [key, value] of Object.entries(
505
+ obj
506
+ )) {
507
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
508
+ normalized[key] = value[0];
509
+ } else {
510
+ normalized[key] = normalizeObject(value);
511
+ }
512
+ }
513
+ return normalized;
514
+ }
515
+ return obj;
516
+ };
517
+ const normalizedModel = normalizeObject(modelValue);
518
+ const normalizedPossible = normalizeObject(possibleValue);
519
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
520
+ } catch {
521
+ return false;
522
+ }
523
+ }
524
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
525
+ return modelValue.toString() === possibleValue;
526
+ }
527
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
528
+ return modelValue === possibleValue.toString();
529
+ }
530
+ return false;
531
+ }) : false;
532
+ if (!hasMatch) {
533
+ return {
534
+ valid: false,
535
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
536
+ modelValue
537
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
538
+ error_type: "value_error:other"
539
+ };
540
+ }
240
541
  }
241
- break;
242
542
  }
243
543
  }
244
- return void 0;
245
- }
246
- function subsetMatch(expected, actual) {
247
- if (expected === null || typeof expected !== "object") {
248
- return expected === actual;
249
- }
250
- if (Array.isArray(expected)) {
251
- if (!Array.isArray(actual)) return false;
252
- for (let i = 0; i < expected.length; i++) {
253
- if (!subsetMatch(expected[i], actual[i])) return false;
544
+ for (const paramName in possibleAnswerParams) {
545
+ const val = possibleAnswerParams[paramName];
546
+ const isOptional = Array.isArray(val) && val.includes("");
547
+ if (!(paramName in argsObj) && !isOptional) {
548
+ return {
549
+ valid: false,
550
+ error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
551
+ error_type: "simple_function_checker:missing_optional"
552
+ };
254
553
  }
255
- return true;
256
- }
257
- if (actual === null || typeof actual !== "object") return false;
258
- const eObj = expected;
259
- const aObj = actual;
260
- for (const key of Object.keys(eObj)) {
261
- if (!subsetMatch(eObj[key], aObj[key])) return false;
262
554
  }
263
- return true;
555
+ return { valid: true };
264
556
  }
265
- var jsonGenerationBenchmark = {
266
- name: "json-generation",
267
- version: "2.1.0",
268
- description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
269
- async run(model) {
270
- const logs = [];
271
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
272
- let schemaValidCount = 0;
273
- let valueMatchCount = 0;
274
- let correctCount = 0;
275
- let tests = [];
276
- const expectedMap = /* @__PURE__ */ new Map();
277
- try {
278
- const dataDir = resolveDataDir();
279
- const testsJsonl = await import_fs2.promises.readFile(
280
- import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
281
- "utf-8"
282
- );
283
- const expectedJsonl = await import_fs2.promises.readFile(
284
- import_path2.default.join(dataDir, "json_generation_expected.jsonl"),
285
- "utf-8"
557
+ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
558
+ if (modelToolCalls.length !== possibleAnswers.length) {
559
+ return {
560
+ valid: false,
561
+ error: `Wrong number of functions. Expected ${possibleAnswers.length}, got ${modelToolCalls.length}.`,
562
+ error_type: "parallel_function_checker_no_order:wrong_count"
563
+ };
564
+ }
565
+ const matchedModelCallIndices = /* @__PURE__ */ new Set();
566
+ for (const possibleAnswer of possibleAnswers) {
567
+ const expectedFuncName = Object.keys(possibleAnswer)[0];
568
+ const funcDescription = funcDescriptions.find(
569
+ (f) => f.name === expectedFuncName
570
+ );
571
+ if (!funcDescription) {
572
+ return {
573
+ valid: false,
574
+ error: `Could not find function description for '${expectedFuncName}'.`,
575
+ error_type: "parallel_function_checker_no_order:missing_func_desc"
576
+ };
577
+ }
578
+ let foundMatch = false;
579
+ for (let i = 0; i < modelToolCalls.length; i++) {
580
+ if (matchedModelCallIndices.has(i)) continue;
581
+ const checkerResult = simpleFunctionChecker(
582
+ funcDescription,
583
+ modelToolCalls[i],
584
+ possibleAnswer
286
585
  );
287
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
288
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
289
- for (const r of expecteds) expectedMap.set(r.id, r);
290
- } catch (e) {
291
- const msg = e instanceof Error ? e.message : String(e);
586
+ if (checkerResult.valid) {
587
+ matchedModelCallIndices.add(i);
588
+ foundMatch = true;
589
+ break;
590
+ }
591
+ }
592
+ if (!foundMatch) {
292
593
  return {
293
- score: 0,
294
- success: false,
295
- metrics: {},
296
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
297
- error: e
298
- };
299
- }
300
- for (const tc of tests) {
301
- try {
302
- const schemaStr = JSON.stringify(tc.schema, null, 2);
303
- const messages = [
304
- {
305
- role: "system",
306
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
307
- },
308
- {
309
- role: "user",
310
- content: [
311
- "Generate a JSON object that reflects the following facts.",
312
- "JSON Schema:",
313
- schemaStr,
314
- "Facts:",
315
- tc.promptFacts,
316
- "Output must be a single JSON only, with no additional text."
317
- ].join("\n\n")
318
- }
319
- ];
320
- const { text } = await (0, import_ai.generateText)({ model, messages });
321
- let parsed;
322
- try {
323
- parsed = extractFirstJsonBlock(text);
324
- } catch {
325
- }
326
- if (parsed === void 0) {
327
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
328
- continue;
329
- }
330
- const validate = ajv.compile(tc.schema);
331
- const valid = validate(parsed);
332
- if (valid) schemaValidCount++;
333
- else
334
- logs.push(
335
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
336
- );
337
- const expectedRec = expectedMap.get(tc.id);
338
- if (!expectedRec) {
339
- logs.push(
340
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
341
- );
342
- }
343
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
344
- if (valuesOk) valueMatchCount++;
345
- if (valid && valuesOk) {
346
- correctCount++;
347
- logs.push(`[PASS] ${tc.id}`);
348
- } else {
349
- logs.push(
350
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
351
- parsed
352
- )}`
353
- );
354
- }
355
- } catch (e) {
356
- const msg = e instanceof Error ? e.message : String(e);
357
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
358
- }
359
- }
360
- const total = tests.length;
361
- const score = correctCount / total;
362
- return {
363
- score,
364
- success: score >= 0.8,
365
- metrics: {
366
- total_cases: total,
367
- correct_count: correctCount,
368
- schema_valid_count: schemaValidCount,
369
- value_match_count: valueMatchCount,
370
- accuracy: score
371
- },
372
- logs
373
- };
374
- }
375
- };
376
- var jsonGenerationSchemaOnlyBenchmark = {
377
- name: "json-generation-schema-only",
378
- version: "1.0.1",
379
- description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
380
- async run(model) {
381
- const logs = [];
382
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
383
- let tests = [];
384
- try {
385
- const dataDir = resolveDataDir();
386
- const testsJsonl = await import_fs2.promises.readFile(
387
- import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
388
- "utf-8"
389
- );
390
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
391
- } catch (e) {
392
- const msg = e instanceof Error ? e.message : String(e);
393
- return {
394
- score: 0,
395
- success: false,
396
- metrics: {},
397
- logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
398
- error: e
399
- };
400
- }
401
- let schemaValidCount = 0;
402
- for (const tc of tests) {
403
- try {
404
- const schemaStr = JSON.stringify(tc.schema, null, 2);
405
- const messages = [
406
- {
407
- role: "system",
408
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
409
- },
410
- {
411
- role: "user",
412
- content: [
413
- "Generate a JSON object that reflects the following facts.",
414
- "JSON Schema:",
415
- schemaStr,
416
- "Facts:",
417
- tc.promptFacts,
418
- "Output must be a single JSON only, with no additional text."
419
- ].join("\n\n")
420
- }
421
- ];
422
- const { text } = await (0, import_ai.generateText)({ model, messages });
423
- let parsed;
424
- try {
425
- parsed = extractFirstJsonBlock(text);
426
- } catch {
427
- }
428
- if (parsed === void 0) {
429
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
430
- continue;
431
- }
432
- const validate = ajv.compile(tc.schema);
433
- const valid = validate(parsed);
434
- if (valid) {
435
- schemaValidCount++;
436
- logs.push(`[PASS] ${tc.id}`);
437
- } else {
438
- logs.push(
439
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
440
- );
441
- }
442
- } catch (e) {
443
- const msg = e instanceof Error ? e.message : String(e);
444
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
445
- }
446
- }
447
- const total = tests.length;
448
- const score = total > 0 ? schemaValidCount / total : 0;
449
- return {
450
- score,
451
- success: score >= 0.8,
452
- metrics: {
453
- total_cases: total,
454
- schema_valid_count: schemaValidCount,
455
- accuracy: score
456
- },
457
- logs
458
- };
459
- }
460
- };
461
-
462
- // src/benchmarks/bfcl.ts
463
- var import_ai2 = require("ai");
464
- var import_fs3 = require("fs");
465
- var import_path3 = __toESM(require("path"), 1);
466
-
467
- // src/benchmarks/bfcl/ast-checker.ts
468
- function standardizeString(input) {
469
- if (typeof input !== "string") return input;
470
- const regex = /[ ,./\\-_*^]/g;
471
- return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
472
- }
473
- function checkStringValue(param, modelValue, possibleAnswers) {
474
- const standardizedModelValue = standardizeString(modelValue);
475
- const standardizedPossibleAnswers = possibleAnswers.map(
476
- (ans) => standardizeString(ans)
477
- );
478
- if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
479
- return {
480
- valid: false,
481
- error: `Invalid value for parameter '${param}': '${modelValue}'. Expected one of ${possibleAnswers.join(", ")}.`,
482
- error_type: "value_error:string"
483
- };
484
- }
485
- return { valid: true };
486
- }
487
- function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
488
- const modelArgs = modelToolCall.args;
489
- const modelFuncName = modelToolCall.toolName;
490
- const expectedFuncName = funcDescription.name;
491
- const expectedParams = funcDescription.parameters.properties;
492
- const requiredParams = funcDescription.parameters.required;
493
- if (modelFuncName !== expectedFuncName) {
494
- return {
495
- valid: false,
496
- error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
497
- error_type: "simple_function_checker:wrong_func_name"
498
- };
499
- }
500
- const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
501
- for (const param of requiredParams) {
502
- if (!(param in modelArgs)) {
503
- return {
504
- valid: false,
505
- error: `Missing required parameter: '${param}'.`,
506
- error_type: "simple_function_checker:missing_required"
507
- };
508
- }
509
- }
510
- for (const paramName in modelArgs) {
511
- const modelValue = modelArgs[paramName];
512
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
513
- return {
514
- valid: false,
515
- error: `Unexpected parameter: '${paramName}'.`,
516
- error_type: "simple_function_checker:unexpected_param"
517
- };
518
- }
519
- const possibleValues = possibleAnswerParams[paramName];
520
- if (typeof modelValue === "string") {
521
- const result = checkStringValue(paramName, modelValue, possibleValues);
522
- if (!result.valid) return result;
523
- } else if (Array.isArray(modelValue)) {
524
- const modelValueStr = JSON.stringify(
525
- modelValue.map((v) => standardizeString(v.toString())).sort()
526
- );
527
- const hasMatch = possibleValues.some(
528
- (p) => JSON.stringify(
529
- p.map((v) => standardizeString(v.toString())).sort()
530
- ) === modelValueStr
531
- );
532
- if (!hasMatch) {
533
- return {
534
- valid: false,
535
- error: `Invalid value for list parameter '${paramName}'.`,
536
- error_type: "value_error:list"
537
- };
538
- }
539
- } else {
540
- if (!possibleValues.includes(modelValue)) {
541
- return {
542
- valid: false,
543
- error: `Invalid value for parameter '${paramName}': got '${modelValue}', expected one of '${possibleValues}'.`,
544
- error_type: "value_error:other"
545
- };
546
- }
547
- }
548
- }
549
- for (const paramName in possibleAnswerParams) {
550
- if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
551
- return {
552
- valid: false,
553
- error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
554
- error_type: "simple_function_checker:missing_optional"
555
- };
556
- }
557
- }
558
- return { valid: true };
559
- }
560
- function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
561
- if (modelToolCalls.length !== possibleAnswers.length) {
562
- return {
563
- valid: false,
564
- error: `Wrong number of functions. Expected ${possibleAnswers.length}, got ${modelToolCalls.length}.`,
565
- error_type: "parallel_function_checker_no_order:wrong_count"
566
- };
567
- }
568
- const matchedModelCallIndices = /* @__PURE__ */ new Set();
569
- for (const possibleAnswer of possibleAnswers) {
570
- const expectedFuncName = Object.keys(possibleAnswer)[0];
571
- const funcDescription = funcDescriptions.find(
572
- (f) => f.name === expectedFuncName
573
- );
574
- if (!funcDescription) {
575
- return {
576
- valid: false,
577
- error: `Could not find function description for '${expectedFuncName}'.`,
578
- error_type: "parallel_function_checker_no_order:missing_func_desc"
579
- };
580
- }
581
- let foundMatch = false;
582
- for (let i = 0; i < modelToolCalls.length; i++) {
583
- if (matchedModelCallIndices.has(i)) continue;
584
- const checkerResult = simpleFunctionChecker(
585
- funcDescription,
586
- modelToolCalls[i],
587
- possibleAnswer
588
- );
589
- if (checkerResult.valid) {
590
- matchedModelCallIndices.add(i);
591
- foundMatch = true;
592
- break;
593
- }
594
- }
595
- if (!foundMatch) {
596
- return {
597
- valid: false,
598
- error: `Could not find a matching function call for '${expectedFuncName}'.`,
599
- error_type: "parallel_function_checker_no_order:cannot_find_match"
594
+ valid: false,
595
+ error: `Could not find a matching function call for '${expectedFuncName}'.`,
596
+ error_type: "parallel_function_checker_no_order:cannot_find_match"
600
597
  };
601
598
  }
602
599
  }
@@ -633,10 +630,11 @@ function check(testCase, modelOutput, possibleAnswer) {
633
630
  const category = testCase.id.split("_")[0];
634
631
  try {
635
632
  if (category === "simple") {
636
- if (!modelOutput || modelOutput.length !== 1) {
633
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
637
634
  return {
638
635
  valid: false,
639
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`
636
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
637
+ error_type: "simple:wrong_count"
640
638
  };
641
639
  }
642
640
  return simpleFunctionChecker(
@@ -665,7 +663,11 @@ function check(testCase, modelOutput, possibleAnswer) {
665
663
  }
666
664
  return { valid: true };
667
665
  } catch (e) {
668
- return { valid: false, error: `Checker Error: ${e.message}` };
666
+ return {
667
+ valid: false,
668
+ error: `Checker Error: ${e.message}`,
669
+ error_type: "checker_error"
670
+ };
669
671
  }
670
672
  }
671
673
  function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
@@ -680,12 +682,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
680
682
  try {
681
683
  const dataPath = resolveDataDir();
682
684
  logs.push(`[INFO] Using data dir: ${dataPath}`);
683
- const testCasesJson = await import_fs3.promises.readFile(
684
- import_path3.default.join(dataPath, testDataFile),
685
+ const testCasesJson = await import_fs2.promises.readFile(
686
+ import_path2.default.join(dataPath, testDataFile),
685
687
  "utf-8"
686
688
  );
687
- const possibleAnswersJson = await import_fs3.promises.readFile(
688
- import_path3.default.join(dataPath, answerDataFile),
689
+ const possibleAnswersJson = await import_fs2.promises.readFile(
690
+ import_path2.default.join(dataPath, answerDataFile),
689
691
  "utf-8"
690
692
  );
691
693
  testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -702,22 +704,34 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
702
704
  );
703
705
  }
704
706
  const fixSchema = (schema) => {
705
- if (!schema || typeof schema !== "object") return schema;
707
+ if (!schema || typeof schema !== "object")
708
+ return { type: "object", properties: {} };
706
709
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
707
- if (copy.type) {
708
- if (copy.type === "dict") copy.type = "object";
709
- if (copy.type === "integer" || copy.type === "float")
710
- copy.type = "number";
711
- }
712
- if (copy.properties && typeof copy.properties === "object") {
713
- for (const k of Object.keys(copy.properties)) {
714
- copy.properties[k] = fixSchema(copy.properties[k]);
710
+ if (!Array.isArray(copy)) {
711
+ if (copy.type) {
712
+ if (copy.type === "dict") copy.type = "object";
713
+ if (copy.type === "integer" || copy.type === "float")
714
+ copy.type = "number";
715
+ }
716
+ if (copy.properties && typeof copy.properties === "object") {
717
+ for (const k of Object.keys(copy.properties)) {
718
+ copy.properties[k] = fixSchema(
719
+ copy.properties[k]
720
+ );
721
+ }
715
722
  }
723
+ if (copy.items) copy.items = fixSchema(copy.items);
724
+ return copy;
716
725
  }
717
- if (copy.items) copy.items = fixSchema(copy.items);
718
726
  return copy;
719
727
  };
720
- for (const testCase of testCases) {
728
+ const concurrencyEnv = process.env.BFCL_CONCURRENCY;
729
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
730
+ logs.push(
731
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
732
+ );
733
+ const runSingleCase = async (testCase) => {
734
+ const caseLogs = [];
721
735
  const { function: tools, question: messages } = testCase;
722
736
  try {
723
737
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
@@ -728,7 +742,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
728
742
  };
729
743
  const transformedTools = tools.map((t) => {
730
744
  const fixed = fixSchema(t.parameters);
731
- const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
745
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
746
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
732
747
  const sanitized = sanitizeName(t.name);
733
748
  nameMap.set(sanitized, t.name);
734
749
  return {
@@ -741,35 +756,46 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
741
756
  const toolsMap = Object.fromEntries(
742
757
  transformedTools.map((t) => [
743
758
  t.name,
744
- (0, import_ai2.tool)({
759
+ (0, import_ai.tool)({
745
760
  description: typeof t.description === "string" ? t.description : void 0,
746
- inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
761
+ inputSchema: (0, import_ai.jsonSchema)(t.inputSchema)
747
762
  })
748
763
  ])
749
764
  );
750
765
  try {
751
766
  const firstTool = transformedTools[0];
752
767
  const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
753
- logs.push(
768
+ caseLogs.push(
754
769
  `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
755
770
  );
756
771
  } catch (e) {
757
- logs.push(
772
+ caseLogs.push(
758
773
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
759
774
  );
760
775
  }
761
- const { toolCalls, text, finishReason } = await (0, import_ai2.generateText)({
776
+ const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
762
777
  model,
763
778
  messages: flatMessages,
764
779
  tools: toolsMap,
765
- toolChoice: "auto"
780
+ toolChoice: "auto",
781
+ // Pass original schema information to middleware
782
+ providerOptions: {
783
+ toolCallMiddleware: {
784
+ originalToolSchemas: Object.fromEntries(
785
+ transformedTools.map((t) => [
786
+ t.name,
787
+ t.inputSchema
788
+ ])
789
+ )
790
+ }
791
+ }
766
792
  });
767
793
  try {
768
- logs.push(
794
+ caseLogs.push(
769
795
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
770
796
  );
771
797
  } catch {
772
- logs.push(
798
+ caseLogs.push(
773
799
  `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
774
800
  );
775
801
  }
@@ -802,20 +828,232 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
802
828
  possibleAnswer
803
829
  );
804
830
  if (checkerResult.valid) {
805
- correctCount++;
806
- logs.push(`[PASS] ${testCase.id}`);
831
+ caseLogs.push(`[PASS] ${testCase.id}`);
832
+ return { valid: true, logs: caseLogs };
807
833
  } else {
808
- logs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
834
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
835
+ try {
836
+ const category = testCase.id.split("_")[0];
837
+ const diff = [];
838
+ const summarizeArgs = (args) => {
839
+ if (args == null) return args;
840
+ if (typeof args !== "object") return args;
841
+ return Object.keys(args).sort().reduce(
842
+ (acc, k) => {
843
+ acc[k] = args[k];
844
+ return acc;
845
+ },
846
+ {}
847
+ );
848
+ };
849
+ const expected = {};
850
+ const actual = {};
851
+ if (category === "simple") {
852
+ const funcDesc = tools[0];
853
+ const gt = possibleAnswer.ground_truth?.[0];
854
+ const expectedFuncName = funcDesc?.name;
855
+ const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
856
+ const received = restoredCalls[0];
857
+ const receivedName = received?.toolName ?? received?.name;
858
+ const receivedArgs = summarizeArgs(received?.args);
859
+ expected.function = expectedFuncName;
860
+ expected.params = expectedParams;
861
+ actual.function = receivedName;
862
+ actual.args = receivedArgs;
863
+ if (expectedFuncName !== receivedName) {
864
+ diff.push(`@@ function name`);
865
+ diff.push(`- ${expectedFuncName}`);
866
+ diff.push(`+ ${receivedName}`);
867
+ }
868
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
869
+ const required = funcDesc?.parameters?.required ?? [];
870
+ for (const req of required) {
871
+ if (!(req in receivedArgs)) {
872
+ diff.push(`- missing required param: ${req}`);
873
+ }
874
+ }
875
+ for (const k of Object.keys(
876
+ receivedArgs
877
+ )) {
878
+ if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
879
+ diff.push(`+ unexpected param: ${k}`);
880
+ }
881
+ }
882
+ for (const k of Object.keys(
883
+ receivedArgs
884
+ )) {
885
+ if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
886
+ const allowed = expectedParams[k];
887
+ const got = receivedArgs[k];
888
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
889
+ try {
890
+ if (Array.isArray(got)) {
891
+ return JSON.stringify(
892
+ got.map((x) => String(x)).sort()
893
+ ) === JSON.stringify(
894
+ v.map((x) => String(x)).sort()
895
+ );
896
+ }
897
+ } catch {
898
+ }
899
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
900
+ });
901
+ if (!includes) {
902
+ diff.push(`@@ param ${k}`);
903
+ diff.push(
904
+ `- expected one of: ${JSON.stringify(allowed)}`
905
+ );
906
+ diff.push(`+ got: ${JSON.stringify(got)}`);
907
+ }
908
+ }
909
+ }
910
+ }
911
+ } else {
912
+ const gtArr = possibleAnswer.ground_truth ?? [];
913
+ const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
914
+ const actualNames = restoredCalls.map(
915
+ (c) => c.toolName ?? c.name
916
+ );
917
+ expected.functions = expectedNames;
918
+ actual.functions = actualNames;
919
+ if (expectedNames.length !== actualNames.length) {
920
+ diff.push(`@@ call count`);
921
+ diff.push(`- expected ${expectedNames.length}`);
922
+ diff.push(`+ got ${actualNames.length}`);
923
+ }
924
+ const missing = expectedNames.filter(
925
+ (n) => !actualNames.includes(n)
926
+ );
927
+ const extra = actualNames.filter(
928
+ (n) => !expectedNames.includes(n)
929
+ );
930
+ for (const m of missing)
931
+ diff.push(`- missing function: ${m}`);
932
+ for (const e of extra)
933
+ diff.push(`+ unexpected function: ${e}`);
934
+ const usedActual = /* @__PURE__ */ new Set();
935
+ for (const expectedObj of gtArr) {
936
+ const fname = Object.keys(expectedObj)[0];
937
+ let matchedIndex = -1;
938
+ for (let i = 0; i < restoredCalls.length; i++) {
939
+ if (usedActual.has(i)) continue;
940
+ const rc = restoredCalls[i];
941
+ const rcName = rc?.toolName ?? rc?.name;
942
+ if (rcName === fname) {
943
+ matchedIndex = i;
944
+ break;
945
+ }
946
+ }
947
+ if (matchedIndex === -1) continue;
948
+ usedActual.add(matchedIndex);
949
+ const received = restoredCalls[matchedIndex];
950
+ const receivedArgs = summarizeArgs(received?.args);
951
+ const expectedParamsAllowed = expectedObj[fname];
952
+ const funcDesc = tools.find(
953
+ (t) => t.name === fname
954
+ );
955
+ const requiredParams = funcDesc?.parameters?.required ?? [];
956
+ diff.push(`@@ function ${fname}`);
957
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
958
+ for (const req of requiredParams) {
959
+ if (!(req in receivedArgs)) {
960
+ diff.push(`- missing required param: ${req}`);
961
+ }
962
+ }
963
+ for (const k of Object.keys(
964
+ receivedArgs
965
+ )) {
966
+ if (!Object.prototype.hasOwnProperty.call(
967
+ expectedParamsAllowed,
968
+ k
969
+ )) {
970
+ diff.push(`+ unexpected param: ${k}`);
971
+ }
972
+ }
973
+ for (const k of Object.keys(
974
+ receivedArgs
975
+ )) {
976
+ if (Object.prototype.hasOwnProperty.call(
977
+ expectedParamsAllowed,
978
+ k
979
+ )) {
980
+ const allowed = expectedParamsAllowed[k];
981
+ const got = receivedArgs[k];
982
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
983
+ try {
984
+ if (Array.isArray(got)) {
985
+ return JSON.stringify(
986
+ got.map((x) => String(x)).sort()
987
+ ) === JSON.stringify(
988
+ v.map((x) => String(x)).sort()
989
+ );
990
+ }
991
+ } catch {
992
+ }
993
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
994
+ });
995
+ if (!includes) {
996
+ diff.push(`@@ param ${k}`);
997
+ diff.push(
998
+ `- expected one of: ${JSON.stringify(allowed)}`
999
+ );
1000
+ diff.push(`+ got: ${JSON.stringify(got)}`);
1001
+ }
1002
+ }
1003
+ }
1004
+ }
1005
+ }
1006
+ }
1007
+ caseLogs.push(
1008
+ `[DEBUG-FAIL] ${JSON.stringify({
1009
+ id: testCase.id,
1010
+ message: checkerResult.error,
1011
+ error_type: checkerResult.error_type,
1012
+ expected,
1013
+ actual,
1014
+ diff
1015
+ })}`
1016
+ );
1017
+ } catch {
1018
+ caseLogs.push(
1019
+ `[DEBUG] ${testCase.id}: failed to build debug diff`
1020
+ );
1021
+ }
1022
+ return { valid: false, logs: caseLogs };
809
1023
  }
810
1024
  } catch (e) {
811
- logs.push(
1025
+ caseLogs.push(
812
1026
  `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
813
1027
  );
814
1028
  if (e?.stack) {
815
- logs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1029
+ caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
816
1030
  }
1031
+ return { valid: false, logs: caseLogs };
817
1032
  }
818
- }
1033
+ };
1034
+ const mapWithConcurrency = async (items, limit2, mapper) => {
1035
+ const results = new Array(items.length);
1036
+ let idx = 0;
1037
+ const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => {
1038
+ while (true) {
1039
+ const current = idx++;
1040
+ if (current >= items.length) break;
1041
+ results[current] = await mapper(items[current], current);
1042
+ }
1043
+ });
1044
+ await Promise.all(workers);
1045
+ return results;
1046
+ };
1047
+ const resultsPerCase = await mapWithConcurrency(
1048
+ testCases,
1049
+ concurrency,
1050
+ async (tc) => runSingleCase(tc)
1051
+ );
1052
+ correctCount = resultsPerCase.reduce(
1053
+ (acc, r) => acc + (r.valid ? 1 : 0),
1054
+ 0
1055
+ );
1056
+ for (const r of resultsPerCase) logs.push(...r.logs);
819
1057
  if (testCases.length === 0) {
820
1058
  return {
821
1059
  score: 0,
@@ -872,6 +1110,262 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
872
1110
  "BFCL_v3_parallel_multiple.json",
873
1111
  "BFCL_v3_parallel_multiple_possible_answer.json"
874
1112
  );
1113
+
1114
+ // src/benchmarks/json-generation.ts
1115
+ var import_ai2 = require("ai");
1116
+ var import_ajv = __toESM(require("ajv"), 1);
1117
+ var import_fs3 = require("fs");
1118
+ var import_path3 = __toESM(require("path"), 1);
1119
/**
 * Locate the index of the bracket that closes the one at `start`.
 *
 * Only brackets of the same kind as the opener are counted, and anything
 * inside a double-quoted JSON string (including escaped quotes) is ignored so
 * that values such as "}" do not terminate the block early.
 *
 * @param {string} text - Text being scanned.
 * @param {number} start - Index of a `{` or `[` character in `text`.
 * @returns {number} Index of the matching closer, or -1 if it is never closed.
 */
function findBalancedJsonEnd(text, start) {
  const open = text[start];
  const close = open === "{" ? "}" : "]";
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      // Inside a string only an unescaped quote is significant.
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === open) depth++;
    else if (ch === close && --depth === 0) return i;
  }
  return -1;
}

/**
 * Extract the first parseable JSON value from free-form text.
 *
 * Strategies are tried in order:
 *   1. Parse the whole text as JSON.
 *   2. Parse the contents of the first fenced code block (```json or bare ```).
 *   3. Scan for balanced `{...}` / `[...]` spans and parse each top-level
 *      candidate in turn until one succeeds.
 *
 * @param {string} text - Raw text that may contain JSON.
 * @returns {*} The parsed value, or `undefined` when nothing parseable is found.
 */
function extractFirstJsonBlock(text) {
  try {
    return JSON.parse(text);
  } catch {
    // Not pure JSON; fall through to the more lenient strategies.
  }
  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
  if (fenceMatch) {
    try {
      return JSON.parse(fenceMatch[1].trim());
    } catch {
      // Fenced content was not valid JSON; try bracket scanning instead.
    }
  }
  let start = 0;
  while (start < text.length) {
    const ch = text[start];
    if (ch !== "{" && ch !== "[") {
      start++;
      continue;
    }
    const end = findBalancedJsonEnd(text, start);
    // An opener that never closes means the remaining text is truncated.
    if (end === -1) break;
    try {
      return JSON.parse(text.slice(start, end + 1));
    } catch {
      // Balanced but not JSON (e.g. "{placeholder}"); skip past this span.
    }
    start = end + 1;
  }
  return void 0;
}
1154
/**
 * Check whether `actual` contains everything described by `expected`.
 *
 * - Primitives (and `null`) must be strictly equal.
 * - Arrays are compared index by index for the length of `expected`; extra
 *   trailing elements in `actual` are allowed.
 * - Objects must contain every key of `expected` as an own property with a
 *   recursively matching value; extra keys in `actual` are allowed.
 *
 * @param {*} expected - The reference value (typically parsed JSON).
 * @param {*} actual - The value to test against the reference.
 * @returns {boolean} `true` when `actual` satisfies `expected`.
 */
function subsetMatch(expected, actual) {
  if (expected === null || typeof expected !== "object") {
    return expected === actual;
  }
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) return false;
    for (let i = 0; i < expected.length; i++) {
      if (!subsetMatch(expected[i], actual[i])) return false;
    }
    return true;
  }
  // An expected plain object must not be satisfied by an array (or a
  // non-object); otherwise `{}` would match `[]`.
  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    // Only own properties count, so inherited members such as `__proto__`
    // or `constructor` can never satisfy an expected key.
    if (!Object.prototype.hasOwnProperty.call(actual, key)) return false;
    if (!subsetMatch(expected[key], actual[key])) return false;
  }
  return true;
}
1173
/**
 * Benchmark: schema-compliant JSON generation with value checking.
 *
 * For every test case the model is prompted with a JSON Schema and a set of
 * facts. The output is parsed with `extractFirstJsonBlock`, validated against
 * the schema with Ajv, and compared to the expected record with
 * `subsetMatch`. A case counts as correct only when both checks pass.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Run the benchmark against `model`.
   *
   * @param model - Language model handed to `generateText`.
   * @returns Result object with `score`, `success` (score >= 0.8), `metrics`
   *   and `logs`; includes `error` when the datasets cannot be loaded.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();
    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // Both datasets are JSON Lines: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        const { text } = await (0, import_ai2.generateText)({ model, messages });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Extraction failure is reported below via the `undefined` check.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) schemaValidCount++;
        else
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // A missing expected record can never count as a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;
        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // Guard against an empty dataset: 0 / 0 would yield NaN for the score and
    // accuracy metric. Matches the schema-only benchmark's handling.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
1284
/**
 * Benchmark: structure-only JSON generation.
 *
 * Each test case's schema and facts are sent to the model; the reply is
 * parsed with `extractFirstJsonBlock` and validated against the schema with
 * Ajv. Values are not compared, only schema conformance is scored.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Run the benchmark against `model`.
   *
   * @param model - Language model handed to `generateText`.
   * @returns Result object with `score`, `success` (score >= 0.8), `metrics`
   *   and `logs`; includes `error` when the test file cannot be loaded.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let tests = [];
    try {
      const testsPath = import_path3.default.join(
        resolveDataDir(),
        "json_generation_tests.jsonl"
      );
      const rawTests = await import_fs3.promises.readFile(testsPath, "utf-8");
      // JSON Lines: parse every non-blank line as its own document.
      const parsedTests = [];
      for (const row of rawTests.split(/\r?\n/)) {
        if (row.trim().length > 0) parsedTests.push(JSON.parse(row));
      }
      tests = parsedTests;
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
        error: e
      };
    }
    let schemaValidCount = 0;
    for (const tc of tests) {
      try {
        const userPrompt = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          JSON.stringify(tc.schema, null, 2),
          "Facts:",
          tc.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ].join("\n\n");
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          { role: "user", content: userPrompt }
        ];
        const { text } = await (0, import_ai2.generateText)({ model, messages });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Leave `parsed` undefined; handled just below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        if (validate(parsed)) {
          schemaValidCount++;
          logs.push(`[PASS] ${tc.id}`);
          continue;
        }
        const problems = (validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown";
        logs.push(`[FAIL] ${tc.id}: Schema validation errors: ${problems}`);
      } catch (e) {
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // An empty dataset scores 0 rather than dividing by zero.
    const score = total === 0 ? 0 : schemaValidCount / total;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        schema_valid_count: schemaValidCount,
        accuracy: score
      },
      logs
    };
  }
};
875
1369
  // Annotate the CommonJS export names for ESM import in node:
876
1370
  0 && (module.exports = {
877
1371
  bfclMultipleBenchmark,