@ai-sdk-tool/eval 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,14 +5,15 @@ var colors = {
5
5
  red: "\x1B[31m",
6
6
  yellow: "\x1B[33m",
7
7
  cyan: "\x1B[36m",
8
- magenta: "\x1B[35m"
8
+ magenta: "\x1B[35m",
9
+ gray: "\x1B[90m"
9
10
  };
10
11
  function printResult(result) {
11
- const { model, benchmark, result: benchmarkResult } = result;
12
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
12
13
  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
13
14
  console.log(
14
15
  `
15
- ${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`
16
+ ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
16
17
  );
17
18
  console.log(
18
19
  ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
@@ -38,6 +39,186 @@ function consoleReporter(results) {
38
39
  console.log("\n---------------------------\n");
39
40
  }
40
41
 
42
+ // src/reporters/console.debug.ts
43
// ANSI SGR escape sequences used by the debug console reporter.
// `reset` clears every attribute set by the other entries; `bold` and
// `underline` are text attributes rather than colours.
// Declared with `const` and frozen so the shared palette cannot be reassigned
// or mutated by any function that reads it.
const colors2 = Object.freeze({
  reset: "\x1B[0m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  bold: "\x1B[1m",
  underline: "\x1B[4m"
});
54
/**
 * Wraps a single diff line in ANSI colour codes based on its first character.
 *
 * - `+` lines are printed green,
 * - `-` lines are printed red,
 * - `@` lines (section headers) are printed bold cyan,
 * - anything else is returned without colour.
 *
 * @param {unknown} line - One diff entry. Non-string values are coerced with
 *   `String()` instead of throwing on `startsWith`.
 * @returns {string} The (possibly colourised) line.
 */
function colorizeDiffLine(line) {
  const text = String(line);
  if (text.startsWith("+")) return `${colors2.green}${text}${colors2.reset}`;
  if (text.startsWith("-")) return `${colors2.red}${text}${colors2.reset}`;
  if (text.startsWith("@")) {
    return `${colors2.cyan}${colors2.bold}${text}${colors2.reset}`;
  }
  return text;
}
61
/**
 * Removes duplicate entries while keeping the first occurrence of each value
 * in its original position.
 *
 * @param {Iterable<string> | null | undefined} lines - Values to de-duplicate.
 *   A nullish argument is treated as an empty list.
 * @returns {string[]} A new array; the input is never mutated.
 */
function uniqueLines(lines) {
  // A Set iterates in insertion order, so spreading it keeps first occurrences.
  return [...new Set(lines ?? [])];
}
71
/**
 * Derives human-readable remediation hints from a parsed failure payload.
 *
 * The payload's `diff` array is scanned for these line shapes:
 * - lines containing "function name" or "missing function:",
 * - lines starting with "- missing required param:",
 * - lines starting with "+ unexpected param:",
 * - "@@ param <name>" headers, optionally followed by a
 *   "- expected one of: <values>" line.
 *
 * When no diff rule produces a hint, a generic hint is chosen from
 * `error_type`.
 *
 * NOTE(review): the diff producer is not visible in this file section. This
 * function assumes the "- expected one of:" line for a parameter appears after
 * its "@@ param" header and before the next "@@" header. Confirm against the
 * code that emits the diff.
 *
 * @param {object | null | undefined} parsed - Payload with optional
 *   `error_type`, `expected`, `actual` and `diff` fields.
 * @returns {string[]} De-duplicated suggestions, in the order they were found.
 */
function suggestFixFromDiff(parsed) {
  const { error_type, expected, actual, diff } = parsed ?? {};
  // Coerce once: the payload is untyped JSON, so entries may not be strings.
  const lines = Array.isArray(diff) ? diff.map((d) => String(d)) : [];
  const suggestions = [];

  const mentionsFunction = lines.some(
    (l) => l.includes("function name") || l.includes("missing function:")
  );
  if (mentionsFunction) {
    const expectedName = expected?.function;
    const actualName = actual?.function;
    if (expectedName && actualName && expectedName !== actualName) {
      suggestions.push(
        `Call the function '${expectedName}' instead of '${actualName}'.`
      );
    }
    if (Array.isArray(expected?.functions)) {
      suggestions.push(
        `Ensure tool calls include: ${expected.functions.join(", ")}.`
      );
    }
  }

  const missing = lines
    .filter((l) => l.startsWith("- missing required param:"))
    .map((l) => l.replace("- missing required param: ", ""));
  if (missing.length) {
    suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
  }

  const extras = lines
    .filter((l) => l.startsWith("+ unexpected param:"))
    .map((l) => l.replace("+ unexpected param: ", ""));
  if (extras.length) {
    suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
  }

  lines.forEach((line, index) => {
    if (!line.startsWith("@@ param ")) return;
    const param = line.replace("@@ param ", "");
    // Look only inside this parameter's own section, so each parameter is
    // paired with its own allowed values rather than the first ones in the diff.
    let allowedLine;
    for (let i = index + 1; i < lines.length; i++) {
      if (lines[i].startsWith("@@")) break;
      if (lines[i].startsWith("- expected one of:")) {
        allowedLine = lines[i];
        break;
      }
    }
    if (allowedLine) {
      const allowed = allowedLine.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  });

  // Generic fallback, used only when the diff itself yielded nothing.
  if (suggestions.length === 0 && typeof error_type === "string") {
    if (error_type.includes("missing_required")) {
      suggestions.push(
        "Add all required parameters defined by the tool schema."
      );
    } else if (error_type.includes("unexpected_param")) {
      suggestions.push("Remove parameters not present in the tool schema.");
    } else if (error_type.includes("wrong_count")) {
      suggestions.push(
        "Adjust the number of tool calls to match expected count."
      );
    } else if (error_type.includes("wrong_func_name")) {
      suggestions.push("Use the exact expected function name from the schema.");
    } else if (error_type.includes("value_error")) {
      suggestions.push("Choose a value from the allowed options.");
    }
  }
  return uniqueLines(suggestions);
}
133
/**
 * Prints a verbose evaluation report to stdout.
 *
 * For every result it prints a header (model, optional model key, benchmark),
 * the success status and score, and any metrics. If the logs contain failure
 * entries ([FAIL], [ERROR], [FATAL], [STACK], [DEBUG-FAIL]) those are printed
 * as "Failure details"; otherwise only [INFO] and [PASS] lines are printed.
 *
 * A "[DEBUG-FAIL] <json>" line is expanded into its id/message, then either
 * its `diff` lines or pretty-printed `expected`/`actual`, then suggested
 * fixes. A plain "[FAIL] <id>: ..." line is suppressed when a DEBUG-FAIL entry
 * with the same id exists, so the failure is reported once.
 *
 * @param {Array<{model: string, modelKey?: string, benchmark: string,
 *   result: {success: boolean, score: number, metrics?: object,
 *   logs?: unknown[]}}>} results - Evaluation results to print.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  const failureTags = ["[FAIL]", "[ERROR]", "[FATAL]", "[STACK]", "[DEBUG-FAIL]"];
  const debugTag = /^\[DEBUG-FAIL\] /;

  // JSON.stringify returns undefined (not a string) for an undefined value, so
  // fall back to a literal marker instead of throwing on `.split`.
  const prettyJson = (value) =>
    (JSON.stringify(value, null, 2) ?? "undefined").split("\n").join("\n ");

  // Builds all output lines for one DEBUG-FAIL payload without printing, so a
  // failure while rendering cannot leave a half-printed entry on the console.
  const renderDebugFailure = (parsed) => {
    const { id, expected, actual, message, diff } = parsed;
    const out = [
      ` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
    ];
    if (Array.isArray(diff)) {
      for (const dLine of diff) out.push(" " + colorizeDiffLine(dLine));
    } else {
      out.push(" expected:");
      out.push(`${colors2.green} ${prettyJson(expected)}${colors2.reset}`);
      out.push(" actual:");
      out.push(`${colors2.red} ${prettyJson(actual)}${colors2.reset}`);
    }
    const suggestions = suggestFixFromDiff(parsed);
    if (suggestions.length) {
      out.push(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      for (const s of suggestions) out.push(` \u2022 ${s}`);
    }
    return out;
  };

  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const r of results) {
    const { model, modelKey, benchmark, result } = r;
    const status = result.success
      ? `${colors2.green}\u2714 SUCCESS${colors2.reset}`
      : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
    const keySuffix = modelKey
      ? ` ${colors2.gray}(${modelKey})${colors2.reset}`
      : "";
    console.log(
      `\n${colors2.cyan}[${model}]${colors2.reset}${keySuffix} - ${colors2.magenta}${benchmark}${colors2.reset}`
    );
    console.log(
      ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
    );

    const metrics = Object.entries(result.metrics ?? {});
    if (metrics.length > 0) {
      console.log(" Metrics:");
      for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
    }

    // Only string entries can carry a level tag; anything else is ignored.
    const logs = Array.isArray(result.logs)
      ? result.logs.filter((l) => typeof l === "string")
      : [];
    if (logs.length === 0) continue;

    const failLogs = logs.filter((l) =>
      failureTags.some((tag) => l.startsWith(tag))
    );
    if (failLogs.length === 0) {
      for (const line of logs) {
        if (line.startsWith("[INFO]") || line.startsWith("[PASS]")) {
          console.log(` ${colors2.gray}${line}${colors2.reset}`);
        }
      }
      continue;
    }

    console.log(` ${colors2.bold}Failure details:${colors2.reset}`);

    // Parse every DEBUG-FAIL payload once, and collect the ids they cover so
    // the matching plain [FAIL] lines can be skipped below.
    const parsedByLine = new Map();
    const debugIds = new Set();
    for (const line of failLogs) {
      if (!line.startsWith("[DEBUG-FAIL]") || parsedByLine.has(line)) continue;
      let parsed;
      try {
        parsed = JSON.parse(line.replace(debugTag, ""));
      } catch {
        // Malformed payload: left undefined so the raw line is echoed below.
        parsed = undefined;
      }
      parsedByLine.set(line, parsed);
      if (parsed?.id) debugIds.add(String(parsed.id));
    }

    for (const line of failLogs) {
      if (line.startsWith("[DEBUG-FAIL]")) {
        const parsed = parsedByLine.get(line);
        let rendered;
        if (parsed !== null && typeof parsed === "object") {
          try {
            rendered = renderDebugFailure(parsed);
          } catch {
            // Rendering failed: fall back to echoing the raw log line.
            rendered = undefined;
          }
        }
        for (const out of rendered ?? [` ${line}`]) console.log(out);
      } else if (line.startsWith("[FAIL]")) {
        const failId = line.match(/^\[FAIL\]\s+([^:]+):/)?.[1];
        if (failId && debugIds.has(failId)) continue;
        console.log(` ${colors2.red}${line}${colors2.reset}`);
      } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
        console.log(` ${colors2.yellow}${line}${colors2.reset}`);
      } else if (line.startsWith("[STACK]")) {
        console.log(` ${colors2.gray}${line}${colors2.reset}`);
      }
    }
  }
  console.log("\n------------------------------------\n");
}
221
+
41
222
  // src/reporters/json.ts
42
223
  function jsonReporter(results) {
43
224
  const serializableResults = results.map((r) => ({
@@ -53,30 +234,35 @@ function jsonReporter(results) {
53
234
  // src/reporters/index.ts
54
235
  var reporters = {
55
236
  console: consoleReporter,
56
- json: jsonReporter
237
+ json: jsonReporter,
238
+ "console.debug": consoleDebugReporter
57
239
  };
58
240
 
59
241
  // src/evaluate.ts
60
- async function runSingleBenchmark(model, benchmark) {
242
+ async function runSingleBenchmark(model, benchmark, modelKey) {
61
243
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
62
244
  try {
63
- console.log(`[${modelId}] Running benchmark: ${benchmark.name}...`);
245
+ console.log(
246
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
247
+ );
64
248
  const result = await benchmark.run(model);
65
249
  console.log(
66
- `[${modelId}] Finished benchmark: ${benchmark.name}. Score: ${result.score}`
250
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
67
251
  );
68
252
  return {
69
253
  model: modelId,
254
+ modelKey,
70
255
  benchmark: benchmark.name,
71
256
  result
72
257
  };
73
258
  } catch (error) {
74
259
  console.error(
75
- `[${modelId}] Error running benchmark: ${benchmark.name}`,
260
+ `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
76
261
  error
77
262
  );
78
263
  return {
79
264
  model: modelId,
265
+ modelKey,
80
266
  benchmark: benchmark.name,
81
267
  result: {
82
268
  score: 0,
@@ -89,11 +275,26 @@ async function runSingleBenchmark(model, benchmark) {
89
275
  }
90
276
  async function evaluate(options) {
91
277
  const { models, benchmarks, reporter = "console" } = options;
92
- const modelsArray = Array.isArray(models) ? models : [models];
278
+ const modelEntries = [];
279
+ if (Array.isArray(models)) {
280
+ for (const m of models) modelEntries.push([void 0, m]);
281
+ } else if (typeof models === "object" && models !== null && "modelId" in models) {
282
+ modelEntries.push([void 0, models]);
283
+ } else {
284
+ for (const [key, m] of Object.entries(
285
+ models
286
+ )) {
287
+ modelEntries.push([key, m]);
288
+ }
289
+ }
93
290
  const allResults = [];
94
- for (const model of modelsArray) {
291
+ for (const [modelKey, model] of modelEntries) {
95
292
  for (const benchmark of benchmarks) {
96
- const evaluationResult = await runSingleBenchmark(model, benchmark);
293
+ const evaluationResult = await runSingleBenchmark(
294
+ model,
295
+ benchmark,
296
+ modelKey
297
+ );
97
298
  allResults.push(evaluationResult);
98
299
  }
99
300
  }
@@ -107,17 +308,16 @@ async function evaluate(options) {
107
308
  return allResults;
108
309
  }
109
310
 
110
- // src/benchmarks/json-generation.ts
111
- import { generateText } from "ai";
112
- import Ajv from "ajv";
311
+ // src/benchmarks/bfcl.ts
312
+ import { generateText, jsonSchema, tool } from "ai";
113
313
  import { promises as fs2 } from "fs";
114
314
  import path2 from "path";
115
315
 
116
316
  // src/utils/paths.ts
117
317
  import fs from "fs";
318
+ import { createRequire } from "module";
118
319
  import path from "path";
119
320
  import { fileURLToPath } from "url";
120
- import { createRequire } from "module";
121
321
  function resolveDataDir(fromModuleUrl) {
122
322
  const moduleUrl = fromModuleUrl;
123
323
  const override = process.env.BFCL_DATA_DIR;
@@ -165,396 +365,193 @@ function resolveDataDir(fromModuleUrl) {
165
365
  return path.join(pkgRoot, "data");
166
366
  }
167
367
 
168
- // src/benchmarks/json-generation.ts
169
- function extractFirstJsonBlock(text) {
170
- try {
171
- return JSON.parse(text);
172
- } catch {
368
+ // src/benchmarks/bfcl/ast-checker.ts
369
/**
 * Normalises a string for lenient comparison: strips spaces and the
 * punctuation characters , . / \ - _ * ^, lower-cases the rest, and turns
 * single quotes into double quotes.
 *
 * @param {unknown} input - Value to normalise.
 * @returns {unknown} The normalised string, or `input` unchanged when it is
 *   not a string.
 */
function standardizeString(input) {
  if (typeof input !== "string") return input;
  // The hyphen is escaped on purpose. Unescaped, `\\-_` is a range from "\"
  // to "_", which leaves "-" in place and strips "]" by accident.
  const regex = /[ ,./\\\-_*^]/g;
  return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
}
374
+ function checkStringValue(param, modelValue, possibleAnswers) {
375
+ const standardizedModelValue = standardizeString(modelValue);
376
+ const standardizedPossibleAnswers = possibleAnswers.map(
377
+ (ans) => standardizeString(String(ans))
378
+ );
379
+ if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
380
+ return {
381
+ valid: false,
382
+ error: `Invalid value for parameter '${param}': ${JSON.stringify(
383
+ modelValue
384
+ )}. Expected one of ${JSON.stringify(possibleAnswers)}.`,
385
+ error_type: "value_error:string"
386
+ };
173
387
  }
174
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
175
- if (fenceMatch) {
176
- const inner = fenceMatch[1].trim();
177
- try {
178
- return JSON.parse(inner);
179
- } catch {
388
+ return { valid: true };
389
+ }
390
+ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
391
+ const modelArgs = modelToolCall.args;
392
+ const modelFuncName = modelToolCall.toolName;
393
+ const expectedFuncName = funcDescription.name;
394
+ const expectedParams = funcDescription.parameters.properties;
395
+ const requiredParams = funcDescription.parameters.required;
396
+ if (modelFuncName !== expectedFuncName) {
397
+ return {
398
+ valid: false,
399
+ error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
400
+ error_type: "simple_function_checker:wrong_func_name"
401
+ };
402
+ }
403
+ const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
404
+ const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
405
+ for (const param of requiredParams) {
406
+ if (!(param in argsObj)) {
407
+ return {
408
+ valid: false,
409
+ error: `Missing required parameter: '${param}'.`,
410
+ error_type: "simple_function_checker:missing_required"
411
+ };
180
412
  }
181
413
  }
182
- const startIdxObj = text.indexOf("{");
183
- const startIdxArr = text.indexOf("[");
184
- const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
185
- if (start === void 0) return void 0;
186
- const open = text[start] === "{" ? "{" : "[";
187
- const close = open === "{" ? "}" : "]";
188
- let depth = 0;
189
- for (let i = start; i < text.length; i++) {
190
- const ch = text[i];
191
- if (ch === open) depth++;
192
- else if (ch === close) depth--;
193
- if (depth === 0) {
194
- const candidate = text.slice(start, i + 1);
195
- try {
196
- return JSON.parse(candidate);
197
- } catch {
414
+ if (modelArgs && typeof modelArgs === "object") {
415
+ for (const paramName of Object.keys(argsObj)) {
416
+ const modelValue = argsObj[paramName];
417
+ if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
418
+ return {
419
+ valid: false,
420
+ error: `Unexpected parameter: '${paramName}'.`,
421
+ error_type: "simple_function_checker:unexpected_param"
422
+ };
423
+ }
424
+ const possibleValues = possibleAnswerParams[paramName];
425
+ if (typeof modelValue === "string") {
426
+ const result = checkStringValue(
427
+ paramName,
428
+ modelValue,
429
+ possibleValues ?? []
430
+ );
431
+ if (!result.valid) return result;
432
+ } else if (Array.isArray(modelValue)) {
433
+ const modelValueStr = JSON.stringify(
434
+ modelValue.map((v) => standardizeString(String(v))).sort()
435
+ );
436
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
437
+ if (!Array.isArray(p)) return false;
438
+ return JSON.stringify(
439
+ p.map((v) => standardizeString(String(v))).sort()
440
+ ) === modelValueStr;
441
+ }) : false;
442
+ if (!hasMatch) {
443
+ return {
444
+ valid: false,
445
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
446
+ modelValue
447
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
448
+ error_type: "value_error:list"
449
+ };
450
+ }
451
+ } else {
452
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
453
+ if (modelValue === possibleValue) return true;
454
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
455
+ try {
456
+ const normalizeObject = (obj) => {
457
+ if (Array.isArray(obj)) {
458
+ return obj.map(normalizeObject);
459
+ }
460
+ if (obj && typeof obj === "object") {
461
+ const normalized = {};
462
+ for (const [key, value] of Object.entries(
463
+ obj
464
+ )) {
465
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
466
+ normalized[key] = value[0];
467
+ } else {
468
+ normalized[key] = normalizeObject(value);
469
+ }
470
+ }
471
+ return normalized;
472
+ }
473
+ return obj;
474
+ };
475
+ const normalizedModel = normalizeObject(modelValue);
476
+ const normalizedPossible = normalizeObject(possibleValue);
477
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
478
+ } catch {
479
+ return false;
480
+ }
481
+ }
482
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
483
+ return modelValue.toString() === possibleValue;
484
+ }
485
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
486
+ return modelValue === possibleValue.toString();
487
+ }
488
+ return false;
489
+ }) : false;
490
+ if (!hasMatch) {
491
+ return {
492
+ valid: false,
493
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
494
+ modelValue
495
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
496
+ error_type: "value_error:other"
497
+ };
498
+ }
198
499
  }
199
- break;
200
500
  }
201
501
  }
202
- return void 0;
203
- }
204
- function subsetMatch(expected, actual) {
205
- if (expected === null || typeof expected !== "object") {
206
- return expected === actual;
207
- }
208
- if (Array.isArray(expected)) {
209
- if (!Array.isArray(actual)) return false;
210
- for (let i = 0; i < expected.length; i++) {
211
- if (!subsetMatch(expected[i], actual[i])) return false;
502
+ for (const paramName in possibleAnswerParams) {
503
+ const val = possibleAnswerParams[paramName];
504
+ const isOptional = Array.isArray(val) && val.includes("");
505
+ if (!(paramName in argsObj) && !isOptional) {
506
+ return {
507
+ valid: false,
508
+ error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
509
+ error_type: "simple_function_checker:missing_optional"
510
+ };
212
511
  }
213
- return true;
214
- }
215
- if (actual === null || typeof actual !== "object") return false;
216
- const eObj = expected;
217
- const aObj = actual;
218
- for (const key of Object.keys(eObj)) {
219
- if (!subsetMatch(eObj[key], aObj[key])) return false;
220
512
  }
221
- return true;
513
+ return { valid: true };
222
514
  }
223
- var jsonGenerationBenchmark = {
224
- name: "json-generation",
225
- version: "2.1.0",
226
- description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
227
- async run(model) {
228
- const logs = [];
229
- const ajv = new Ajv({ allErrors: true, strict: false });
230
- let schemaValidCount = 0;
231
- let valueMatchCount = 0;
232
- let correctCount = 0;
233
- let tests = [];
234
- const expectedMap = /* @__PURE__ */ new Map();
235
- try {
236
- const dataDir = resolveDataDir();
237
- const testsJsonl = await fs2.readFile(
238
- path2.join(dataDir, "json_generation_tests.jsonl"),
239
- "utf-8"
240
- );
241
- const expectedJsonl = await fs2.readFile(
242
- path2.join(dataDir, "json_generation_expected.jsonl"),
243
- "utf-8"
515
+ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
516
+ if (modelToolCalls.length !== possibleAnswers.length) {
517
+ return {
518
+ valid: false,
519
+ error: `Wrong number of functions. Expected ${possibleAnswers.length}, got ${modelToolCalls.length}.`,
520
+ error_type: "parallel_function_checker_no_order:wrong_count"
521
+ };
522
+ }
523
+ const matchedModelCallIndices = /* @__PURE__ */ new Set();
524
+ for (const possibleAnswer of possibleAnswers) {
525
+ const expectedFuncName = Object.keys(possibleAnswer)[0];
526
+ const funcDescription = funcDescriptions.find(
527
+ (f) => f.name === expectedFuncName
528
+ );
529
+ if (!funcDescription) {
530
+ return {
531
+ valid: false,
532
+ error: `Could not find function description for '${expectedFuncName}'.`,
533
+ error_type: "parallel_function_checker_no_order:missing_func_desc"
534
+ };
535
+ }
536
+ let foundMatch = false;
537
+ for (let i = 0; i < modelToolCalls.length; i++) {
538
+ if (matchedModelCallIndices.has(i)) continue;
539
+ const checkerResult = simpleFunctionChecker(
540
+ funcDescription,
541
+ modelToolCalls[i],
542
+ possibleAnswer
244
543
  );
245
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
246
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
247
- for (const r of expecteds) expectedMap.set(r.id, r);
248
- } catch (e) {
249
- const msg = e instanceof Error ? e.message : String(e);
544
+ if (checkerResult.valid) {
545
+ matchedModelCallIndices.add(i);
546
+ foundMatch = true;
547
+ break;
548
+ }
549
+ }
550
+ if (!foundMatch) {
250
551
  return {
251
- score: 0,
252
- success: false,
253
- metrics: {},
254
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
255
- error: e
256
- };
257
- }
258
- for (const tc of tests) {
259
- try {
260
- const schemaStr = JSON.stringify(tc.schema, null, 2);
261
- const messages = [
262
- {
263
- role: "system",
264
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
265
- },
266
- {
267
- role: "user",
268
- content: [
269
- "Generate a JSON object that reflects the following facts.",
270
- "JSON Schema:",
271
- schemaStr,
272
- "Facts:",
273
- tc.promptFacts,
274
- "Output must be a single JSON only, with no additional text."
275
- ].join("\n\n")
276
- }
277
- ];
278
- const { text } = await generateText({ model, messages });
279
- let parsed;
280
- try {
281
- parsed = extractFirstJsonBlock(text);
282
- } catch {
283
- }
284
- if (parsed === void 0) {
285
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
286
- continue;
287
- }
288
- const validate = ajv.compile(tc.schema);
289
- const valid = validate(parsed);
290
- if (valid) schemaValidCount++;
291
- else
292
- logs.push(
293
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
294
- );
295
- const expectedRec = expectedMap.get(tc.id);
296
- if (!expectedRec) {
297
- logs.push(
298
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
299
- );
300
- }
301
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
302
- if (valuesOk) valueMatchCount++;
303
- if (valid && valuesOk) {
304
- correctCount++;
305
- logs.push(`[PASS] ${tc.id}`);
306
- } else {
307
- logs.push(
308
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
309
- parsed
310
- )}`
311
- );
312
- }
313
- } catch (e) {
314
- const msg = e instanceof Error ? e.message : String(e);
315
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
316
- }
317
- }
318
- const total = tests.length;
319
- const score = correctCount / total;
320
- return {
321
- score,
322
- success: score >= 0.8,
323
- metrics: {
324
- total_cases: total,
325
- correct_count: correctCount,
326
- schema_valid_count: schemaValidCount,
327
- value_match_count: valueMatchCount,
328
- accuracy: score
329
- },
330
- logs
331
- };
332
- }
333
- };
334
- var jsonGenerationSchemaOnlyBenchmark = {
335
- name: "json-generation-schema-only",
336
- version: "1.0.1",
337
- description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
338
- async run(model) {
339
- const logs = [];
340
- const ajv = new Ajv({ allErrors: true, strict: false });
341
- let tests = [];
342
- try {
343
- const dataDir = resolveDataDir();
344
- const testsJsonl = await fs2.readFile(
345
- path2.join(dataDir, "json_generation_tests.jsonl"),
346
- "utf-8"
347
- );
348
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
349
- } catch (e) {
350
- const msg = e instanceof Error ? e.message : String(e);
351
- return {
352
- score: 0,
353
- success: false,
354
- metrics: {},
355
- logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
356
- error: e
357
- };
358
- }
359
- let schemaValidCount = 0;
360
- for (const tc of tests) {
361
- try {
362
- const schemaStr = JSON.stringify(tc.schema, null, 2);
363
- const messages = [
364
- {
365
- role: "system",
366
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
367
- },
368
- {
369
- role: "user",
370
- content: [
371
- "Generate a JSON object that reflects the following facts.",
372
- "JSON Schema:",
373
- schemaStr,
374
- "Facts:",
375
- tc.promptFacts,
376
- "Output must be a single JSON only, with no additional text."
377
- ].join("\n\n")
378
- }
379
- ];
380
- const { text } = await generateText({ model, messages });
381
- let parsed;
382
- try {
383
- parsed = extractFirstJsonBlock(text);
384
- } catch {
385
- }
386
- if (parsed === void 0) {
387
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
388
- continue;
389
- }
390
- const validate = ajv.compile(tc.schema);
391
- const valid = validate(parsed);
392
- if (valid) {
393
- schemaValidCount++;
394
- logs.push(`[PASS] ${tc.id}`);
395
- } else {
396
- logs.push(
397
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
398
- );
399
- }
400
- } catch (e) {
401
- const msg = e instanceof Error ? e.message : String(e);
402
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
403
- }
404
- }
405
- const total = tests.length;
406
- const score = total > 0 ? schemaValidCount / total : 0;
407
- return {
408
- score,
409
- success: score >= 0.8,
410
- metrics: {
411
- total_cases: total,
412
- schema_valid_count: schemaValidCount,
413
- accuracy: score
414
- },
415
- logs
416
- };
417
- }
418
- };
419
-
420
- // src/benchmarks/bfcl.ts
421
- import { generateText as generateText2, jsonSchema, tool } from "ai";
422
- import { promises as fs3 } from "fs";
423
- import path3 from "path";
424
-
425
- // src/benchmarks/bfcl/ast-checker.ts
426
- function standardizeString(input) {
427
- if (typeof input !== "string") return input;
428
- const regex = /[ ,./\\-_*^]/g;
429
- return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
430
- }
431
- function checkStringValue(param, modelValue, possibleAnswers) {
432
- const standardizedModelValue = standardizeString(modelValue);
433
- const standardizedPossibleAnswers = possibleAnswers.map(
434
- (ans) => standardizeString(ans)
435
- );
436
- if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
437
- return {
438
- valid: false,
439
- error: `Invalid value for parameter '${param}': '${modelValue}'. Expected one of ${possibleAnswers.join(", ")}.`,
440
- error_type: "value_error:string"
441
- };
442
- }
443
- return { valid: true };
444
- }
445
- function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
446
- const modelArgs = modelToolCall.args;
447
- const modelFuncName = modelToolCall.toolName;
448
- const expectedFuncName = funcDescription.name;
449
- const expectedParams = funcDescription.parameters.properties;
450
- const requiredParams = funcDescription.parameters.required;
451
- if (modelFuncName !== expectedFuncName) {
452
- return {
453
- valid: false,
454
- error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
455
- error_type: "simple_function_checker:wrong_func_name"
456
- };
457
- }
458
- const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
459
- for (const param of requiredParams) {
460
- if (!(param in modelArgs)) {
461
- return {
462
- valid: false,
463
- error: `Missing required parameter: '${param}'.`,
464
- error_type: "simple_function_checker:missing_required"
465
- };
466
- }
467
- }
468
- for (const paramName in modelArgs) {
469
- const modelValue = modelArgs[paramName];
470
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
471
- return {
472
- valid: false,
473
- error: `Unexpected parameter: '${paramName}'.`,
474
- error_type: "simple_function_checker:unexpected_param"
475
- };
476
- }
477
- const possibleValues = possibleAnswerParams[paramName];
478
- if (typeof modelValue === "string") {
479
- const result = checkStringValue(paramName, modelValue, possibleValues);
480
- if (!result.valid) return result;
481
- } else if (Array.isArray(modelValue)) {
482
- const modelValueStr = JSON.stringify(
483
- modelValue.map((v) => standardizeString(v.toString())).sort()
484
- );
485
- const hasMatch = possibleValues.some(
486
- (p) => JSON.stringify(
487
- p.map((v) => standardizeString(v.toString())).sort()
488
- ) === modelValueStr
489
- );
490
- if (!hasMatch) {
491
- return {
492
- valid: false,
493
- error: `Invalid value for list parameter '${paramName}'.`,
494
- error_type: "value_error:list"
495
- };
496
- }
497
- } else {
498
- if (!possibleValues.includes(modelValue)) {
499
- return {
500
- valid: false,
501
- error: `Invalid value for parameter '${paramName}': got '${modelValue}', expected one of '${possibleValues}'.`,
502
- error_type: "value_error:other"
503
- };
504
- }
505
- }
506
- }
507
- for (const paramName in possibleAnswerParams) {
508
- if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
509
- return {
510
- valid: false,
511
- error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
512
- error_type: "simple_function_checker:missing_optional"
513
- };
514
- }
515
- }
516
- return { valid: true };
517
- }
518
- function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
519
- if (modelToolCalls.length !== possibleAnswers.length) {
520
- return {
521
- valid: false,
522
- error: `Wrong number of functions. Expected ${possibleAnswers.length}, got ${modelToolCalls.length}.`,
523
- error_type: "parallel_function_checker_no_order:wrong_count"
524
- };
525
- }
526
- const matchedModelCallIndices = /* @__PURE__ */ new Set();
527
- for (const possibleAnswer of possibleAnswers) {
528
- const expectedFuncName = Object.keys(possibleAnswer)[0];
529
- const funcDescription = funcDescriptions.find(
530
- (f) => f.name === expectedFuncName
531
- );
532
- if (!funcDescription) {
533
- return {
534
- valid: false,
535
- error: `Could not find function description for '${expectedFuncName}'.`,
536
- error_type: "parallel_function_checker_no_order:missing_func_desc"
537
- };
538
- }
539
- let foundMatch = false;
540
- for (let i = 0; i < modelToolCalls.length; i++) {
541
- if (matchedModelCallIndices.has(i)) continue;
542
- const checkerResult = simpleFunctionChecker(
543
- funcDescription,
544
- modelToolCalls[i],
545
- possibleAnswer
546
- );
547
- if (checkerResult.valid) {
548
- matchedModelCallIndices.add(i);
549
- foundMatch = true;
550
- break;
551
- }
552
- }
553
- if (!foundMatch) {
554
- return {
555
- valid: false,
556
- error: `Could not find a matching function call for '${expectedFuncName}'.`,
557
- error_type: "parallel_function_checker_no_order:cannot_find_match"
552
+ valid: false,
553
+ error: `Could not find a matching function call for '${expectedFuncName}'.`,
554
+ error_type: "parallel_function_checker_no_order:cannot_find_match"
558
555
  };
559
556
  }
560
557
  }
@@ -591,10 +588,11 @@ function check(testCase, modelOutput, possibleAnswer) {
591
588
  const category = testCase.id.split("_")[0];
592
589
  try {
593
590
  if (category === "simple") {
594
- if (!modelOutput || modelOutput.length !== 1) {
591
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
595
592
  return {
596
593
  valid: false,
597
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`
594
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
595
+ error_type: "simple:wrong_count"
598
596
  };
599
597
  }
600
598
  return simpleFunctionChecker(
@@ -623,7 +621,11 @@ function check(testCase, modelOutput, possibleAnswer) {
623
621
  }
624
622
  return { valid: true };
625
623
  } catch (e) {
626
- return { valid: false, error: `Checker Error: ${e.message}` };
624
+ return {
625
+ valid: false,
626
+ error: `Checker Error: ${e.message}`,
627
+ error_type: "checker_error"
628
+ };
627
629
  }
628
630
  }
629
631
  function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
@@ -638,12 +640,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
638
640
  try {
639
641
  const dataPath = resolveDataDir();
640
642
  logs.push(`[INFO] Using data dir: ${dataPath}`);
641
- const testCasesJson = await fs3.readFile(
642
- path3.join(dataPath, testDataFile),
643
+ const testCasesJson = await fs2.readFile(
644
+ path2.join(dataPath, testDataFile),
643
645
  "utf-8"
644
646
  );
645
- const possibleAnswersJson = await fs3.readFile(
646
- path3.join(dataPath, answerDataFile),
647
+ const possibleAnswersJson = await fs2.readFile(
648
+ path2.join(dataPath, answerDataFile),
647
649
  "utf-8"
648
650
  );
649
651
  testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -660,22 +662,34 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
660
662
  );
661
663
  }
662
664
  const fixSchema = (schema) => {
663
- if (!schema || typeof schema !== "object") return schema;
665
+ if (!schema || typeof schema !== "object")
666
+ return { type: "object", properties: {} };
664
667
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
665
- if (copy.type) {
666
- if (copy.type === "dict") copy.type = "object";
667
- if (copy.type === "integer" || copy.type === "float")
668
- copy.type = "number";
669
- }
670
- if (copy.properties && typeof copy.properties === "object") {
671
- for (const k of Object.keys(copy.properties)) {
672
- copy.properties[k] = fixSchema(copy.properties[k]);
668
+ if (!Array.isArray(copy)) {
669
+ if (copy.type) {
670
+ if (copy.type === "dict") copy.type = "object";
671
+ if (copy.type === "integer" || copy.type === "float")
672
+ copy.type = "number";
673
673
  }
674
+ if (copy.properties && typeof copy.properties === "object") {
675
+ for (const k of Object.keys(copy.properties)) {
676
+ copy.properties[k] = fixSchema(
677
+ copy.properties[k]
678
+ );
679
+ }
680
+ }
681
+ if (copy.items) copy.items = fixSchema(copy.items);
682
+ return copy;
674
683
  }
675
- if (copy.items) copy.items = fixSchema(copy.items);
676
684
  return copy;
677
685
  };
678
- for (const testCase of testCases) {
686
+ const concurrencyEnv = process.env.BFCL_CONCURRENCY;
687
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
688
+ logs.push(
689
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
690
+ );
691
+ const runSingleCase = async (testCase) => {
692
+ const caseLogs = [];
679
693
  const { function: tools, question: messages } = testCase;
680
694
  try {
681
695
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
@@ -686,7 +700,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
686
700
  };
687
701
  const transformedTools = tools.map((t) => {
688
702
  const fixed = fixSchema(t.parameters);
689
- const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
703
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
704
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
690
705
  const sanitized = sanitizeName(t.name);
691
706
  nameMap.set(sanitized, t.name);
692
707
  return {
@@ -708,26 +723,37 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
708
723
  try {
709
724
  const firstTool = transformedTools[0];
710
725
  const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
711
- logs.push(
726
+ caseLogs.push(
712
727
  `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
713
728
  );
714
729
  } catch (e) {
715
- logs.push(
730
+ caseLogs.push(
716
731
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
717
732
  );
718
733
  }
719
- const { toolCalls, text, finishReason } = await generateText2({
734
+ const { toolCalls, text, finishReason } = await generateText({
720
735
  model,
721
736
  messages: flatMessages,
722
737
  tools: toolsMap,
723
- toolChoice: "auto"
738
+ toolChoice: "auto",
739
+ // Pass original schema information to middleware
740
+ providerOptions: {
741
+ toolCallMiddleware: {
742
+ originalToolSchemas: Object.fromEntries(
743
+ transformedTools.map((t) => [
744
+ t.name,
745
+ t.inputSchema
746
+ ])
747
+ )
748
+ }
749
+ }
724
750
  });
725
751
  try {
726
- logs.push(
752
+ caseLogs.push(
727
753
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
728
754
  );
729
755
  } catch {
730
- logs.push(
756
+ caseLogs.push(
731
757
  `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
732
758
  );
733
759
  }
@@ -760,20 +786,232 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
760
786
  possibleAnswer
761
787
  );
762
788
  if (checkerResult.valid) {
763
- correctCount++;
764
- logs.push(`[PASS] ${testCase.id}`);
789
+ caseLogs.push(`[PASS] ${testCase.id}`);
790
+ return { valid: true, logs: caseLogs };
765
791
  } else {
766
- logs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
792
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
793
+ try {
794
+ const category = testCase.id.split("_")[0];
795
+ const diff = [];
796
+ const summarizeArgs = (args) => {
797
+ if (args == null) return args;
798
+ if (typeof args !== "object") return args;
799
+ return Object.keys(args).sort().reduce(
800
+ (acc, k) => {
801
+ acc[k] = args[k];
802
+ return acc;
803
+ },
804
+ {}
805
+ );
806
+ };
807
+ const expected = {};
808
+ const actual = {};
809
+ if (category === "simple") {
810
+ const funcDesc = tools[0];
811
+ const gt = possibleAnswer.ground_truth?.[0];
812
+ const expectedFuncName = funcDesc?.name;
813
+ const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
814
+ const received = restoredCalls[0];
815
+ const receivedName = received?.toolName ?? received?.name;
816
+ const receivedArgs = summarizeArgs(received?.args);
817
+ expected.function = expectedFuncName;
818
+ expected.params = expectedParams;
819
+ actual.function = receivedName;
820
+ actual.args = receivedArgs;
821
+ if (expectedFuncName !== receivedName) {
822
+ diff.push(`@@ function name`);
823
+ diff.push(`- ${expectedFuncName}`);
824
+ diff.push(`+ ${receivedName}`);
825
+ }
826
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
827
+ const required = funcDesc?.parameters?.required ?? [];
828
+ for (const req of required) {
829
+ if (!(req in receivedArgs)) {
830
+ diff.push(`- missing required param: ${req}`);
831
+ }
832
+ }
833
+ for (const k of Object.keys(
834
+ receivedArgs
835
+ )) {
836
+ if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
837
+ diff.push(`+ unexpected param: ${k}`);
838
+ }
839
+ }
840
+ for (const k of Object.keys(
841
+ receivedArgs
842
+ )) {
843
+ if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
844
+ const allowed = expectedParams[k];
845
+ const got = receivedArgs[k];
846
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
847
+ try {
848
+ if (Array.isArray(got)) {
849
+ return JSON.stringify(
850
+ got.map((x) => String(x)).sort()
851
+ ) === JSON.stringify(
852
+ v.map((x) => String(x)).sort()
853
+ );
854
+ }
855
+ } catch {
856
+ }
857
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
858
+ });
859
+ if (!includes) {
860
+ diff.push(`@@ param ${k}`);
861
+ diff.push(
862
+ `- expected one of: ${JSON.stringify(allowed)}`
863
+ );
864
+ diff.push(`+ got: ${JSON.stringify(got)}`);
865
+ }
866
+ }
867
+ }
868
+ }
869
+ } else {
870
+ const gtArr = possibleAnswer.ground_truth ?? [];
871
+ const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
872
+ const actualNames = restoredCalls.map(
873
+ (c) => c.toolName ?? c.name
874
+ );
875
+ expected.functions = expectedNames;
876
+ actual.functions = actualNames;
877
+ if (expectedNames.length !== actualNames.length) {
878
+ diff.push(`@@ call count`);
879
+ diff.push(`- expected ${expectedNames.length}`);
880
+ diff.push(`+ got ${actualNames.length}`);
881
+ }
882
+ const missing = expectedNames.filter(
883
+ (n) => !actualNames.includes(n)
884
+ );
885
+ const extra = actualNames.filter(
886
+ (n) => !expectedNames.includes(n)
887
+ );
888
+ for (const m of missing)
889
+ diff.push(`- missing function: ${m}`);
890
+ for (const e of extra)
891
+ diff.push(`+ unexpected function: ${e}`);
892
+ const usedActual = /* @__PURE__ */ new Set();
893
+ for (const expectedObj of gtArr) {
894
+ const fname = Object.keys(expectedObj)[0];
895
+ let matchedIndex = -1;
896
+ for (let i = 0; i < restoredCalls.length; i++) {
897
+ if (usedActual.has(i)) continue;
898
+ const rc = restoredCalls[i];
899
+ const rcName = rc?.toolName ?? rc?.name;
900
+ if (rcName === fname) {
901
+ matchedIndex = i;
902
+ break;
903
+ }
904
+ }
905
+ if (matchedIndex === -1) continue;
906
+ usedActual.add(matchedIndex);
907
+ const received = restoredCalls[matchedIndex];
908
+ const receivedArgs = summarizeArgs(received?.args);
909
+ const expectedParamsAllowed = expectedObj[fname];
910
+ const funcDesc = tools.find(
911
+ (t) => t.name === fname
912
+ );
913
+ const requiredParams = funcDesc?.parameters?.required ?? [];
914
+ diff.push(`@@ function ${fname}`);
915
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
916
+ for (const req of requiredParams) {
917
+ if (!(req in receivedArgs)) {
918
+ diff.push(`- missing required param: ${req}`);
919
+ }
920
+ }
921
+ for (const k of Object.keys(
922
+ receivedArgs
923
+ )) {
924
+ if (!Object.prototype.hasOwnProperty.call(
925
+ expectedParamsAllowed,
926
+ k
927
+ )) {
928
+ diff.push(`+ unexpected param: ${k}`);
929
+ }
930
+ }
931
+ for (const k of Object.keys(
932
+ receivedArgs
933
+ )) {
934
+ if (Object.prototype.hasOwnProperty.call(
935
+ expectedParamsAllowed,
936
+ k
937
+ )) {
938
+ const allowed = expectedParamsAllowed[k];
939
+ const got = receivedArgs[k];
940
+ const includes = Array.isArray(allowed) && allowed.some((v) => {
941
+ try {
942
+ if (Array.isArray(got)) {
943
+ return JSON.stringify(
944
+ got.map((x) => String(x)).sort()
945
+ ) === JSON.stringify(
946
+ v.map((x) => String(x)).sort()
947
+ );
948
+ }
949
+ } catch {
950
+ }
951
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
952
+ });
953
+ if (!includes) {
954
+ diff.push(`@@ param ${k}`);
955
+ diff.push(
956
+ `- expected one of: ${JSON.stringify(allowed)}`
957
+ );
958
+ diff.push(`+ got: ${JSON.stringify(got)}`);
959
+ }
960
+ }
961
+ }
962
+ }
963
+ }
964
+ }
965
+ caseLogs.push(
966
+ `[DEBUG-FAIL] ${JSON.stringify({
967
+ id: testCase.id,
968
+ message: checkerResult.error,
969
+ error_type: checkerResult.error_type,
970
+ expected,
971
+ actual,
972
+ diff
973
+ })}`
974
+ );
975
+ } catch {
976
+ caseLogs.push(
977
+ `[DEBUG] ${testCase.id}: failed to build debug diff`
978
+ );
979
+ }
980
+ return { valid: false, logs: caseLogs };
767
981
  }
768
982
  } catch (e) {
769
- logs.push(
983
+ caseLogs.push(
770
984
  `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
771
985
  );
772
986
  if (e?.stack) {
773
- logs.push(`[STACK] ${testCase.id}: ${e.stack}`);
987
+ caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
774
988
  }
989
+ return { valid: false, logs: caseLogs };
775
990
  }
776
- }
991
+ };
992
+ const mapWithConcurrency = async (items, limit2, mapper) => {
993
+ const results = new Array(items.length);
994
+ let idx = 0;
995
+ const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => {
996
+ while (true) {
997
+ const current = idx++;
998
+ if (current >= items.length) break;
999
+ results[current] = await mapper(items[current], current);
1000
+ }
1001
+ });
1002
+ await Promise.all(workers);
1003
+ return results;
1004
+ };
1005
+ const resultsPerCase = await mapWithConcurrency(
1006
+ testCases,
1007
+ concurrency,
1008
+ async (tc) => runSingleCase(tc)
1009
+ );
1010
+ correctCount = resultsPerCase.reduce(
1011
+ (acc, r) => acc + (r.valid ? 1 : 0),
1012
+ 0
1013
+ );
1014
+ for (const r of resultsPerCase) logs.push(...r.logs);
777
1015
  if (testCases.length === 0) {
778
1016
  return {
779
1017
  score: 0,
@@ -830,6 +1068,262 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
830
1068
  "BFCL_v3_parallel_multiple.json",
831
1069
  "BFCL_v3_parallel_multiple_possible_answer.json"
832
1070
  );
1071
+
1072
+ // src/benchmarks/json-generation.ts
1073
+ import { generateText as generateText2 } from "ai";
1074
+ import Ajv from "ajv";
1075
+ import { promises as fs3 } from "fs";
1076
+ import path3 from "path";
1077
+ function extractFirstJsonBlock(text) {
1078
+ try {
1079
+ return JSON.parse(text);
1080
+ } catch {
1081
+ }
1082
+ const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
1083
+ if (fenceMatch) {
1084
+ const inner = fenceMatch[1].trim();
1085
+ try {
1086
+ return JSON.parse(inner);
1087
+ } catch {
1088
+ }
1089
+ }
1090
+ const startIdxObj = text.indexOf("{");
1091
+ const startIdxArr = text.indexOf("[");
1092
+ const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
1093
+ if (start === void 0) return void 0;
1094
+ const open = text[start] === "{" ? "{" : "[";
1095
+ const close = open === "{" ? "}" : "]";
1096
+ let depth = 0;
1097
+ for (let i = start; i < text.length; i++) {
1098
+ const ch = text[i];
1099
+ if (ch === open) depth++;
1100
+ else if (ch === close) depth--;
1101
+ if (depth === 0) {
1102
+ const candidate = text.slice(start, i + 1);
1103
+ try {
1104
+ return JSON.parse(candidate);
1105
+ } catch {
1106
+ }
1107
+ break;
1108
+ }
1109
+ }
1110
+ return void 0;
1111
+ }
1112
+ function subsetMatch(expected, actual) {
1113
+ if (expected === null || typeof expected !== "object") {
1114
+ return expected === actual;
1115
+ }
1116
+ if (Array.isArray(expected)) {
1117
+ if (!Array.isArray(actual)) return false;
1118
+ for (let i = 0; i < expected.length; i++) {
1119
+ if (!subsetMatch(expected[i], actual[i])) return false;
1120
+ }
1121
+ return true;
1122
+ }
1123
+ if (actual === null || typeof actual !== "object") return false;
1124
+ const eObj = expected;
1125
+ const aObj = actual;
1126
+ for (const key of Object.keys(eObj)) {
1127
+ if (!subsetMatch(eObj[key], aObj[key])) return false;
1128
+ }
1129
+ return true;
1130
+ }
1131
+ var jsonGenerationBenchmark = {
1132
+ name: "json-generation",
1133
+ version: "2.1.0",
1134
+ description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
1135
+ async run(model) {
1136
+ const logs = [];
1137
+ const ajv = new Ajv({ allErrors: true, strict: false });
1138
+ let schemaValidCount = 0;
1139
+ let valueMatchCount = 0;
1140
+ let correctCount = 0;
1141
+ let tests = [];
1142
+ const expectedMap = /* @__PURE__ */ new Map();
1143
+ try {
1144
+ const dataDir = resolveDataDir();
1145
+ const testsJsonl = await fs3.readFile(
1146
+ path3.join(dataDir, "json_generation_tests.jsonl"),
1147
+ "utf-8"
1148
+ );
1149
+ const expectedJsonl = await fs3.readFile(
1150
+ path3.join(dataDir, "json_generation_expected.jsonl"),
1151
+ "utf-8"
1152
+ );
1153
+ tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1154
+ const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1155
+ for (const r of expecteds) expectedMap.set(r.id, r);
1156
+ } catch (e) {
1157
+ const msg = e instanceof Error ? e.message : String(e);
1158
+ return {
1159
+ score: 0,
1160
+ success: false,
1161
+ metrics: {},
1162
+ logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
1163
+ error: e
1164
+ };
1165
+ }
1166
+ for (const tc of tests) {
1167
+ try {
1168
+ const schemaStr = JSON.stringify(tc.schema, null, 2);
1169
+ const messages = [
1170
+ {
1171
+ role: "system",
1172
+ content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1173
+ },
1174
+ {
1175
+ role: "user",
1176
+ content: [
1177
+ "Generate a JSON object that reflects the following facts.",
1178
+ "JSON Schema:",
1179
+ schemaStr,
1180
+ "Facts:",
1181
+ tc.promptFacts,
1182
+ "Output must be a single JSON only, with no additional text."
1183
+ ].join("\n\n")
1184
+ }
1185
+ ];
1186
+ const { text } = await generateText2({ model, messages });
1187
+ let parsed;
1188
+ try {
1189
+ parsed = extractFirstJsonBlock(text);
1190
+ } catch {
1191
+ }
1192
+ if (parsed === void 0) {
1193
+ logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
1194
+ continue;
1195
+ }
1196
+ const validate = ajv.compile(tc.schema);
1197
+ const valid = validate(parsed);
1198
+ if (valid) schemaValidCount++;
1199
+ else
1200
+ logs.push(
1201
+ `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1202
+ );
1203
+ const expectedRec = expectedMap.get(tc.id);
1204
+ if (!expectedRec) {
1205
+ logs.push(
1206
+ `[WARN] ${tc.id}: No expected record found. Skipping value match.`
1207
+ );
1208
+ }
1209
+ const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
1210
+ if (valuesOk) valueMatchCount++;
1211
+ if (valid && valuesOk) {
1212
+ correctCount++;
1213
+ logs.push(`[PASS] ${tc.id}`);
1214
+ } else {
1215
+ logs.push(
1216
+ `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
1217
+ parsed
1218
+ )}`
1219
+ );
1220
+ }
1221
+ } catch (e) {
1222
+ const msg = e instanceof Error ? e.message : String(e);
1223
+ logs.push(`[ERROR] ${tc.id}: ${msg}`);
1224
+ }
1225
+ }
1226
+ const total = tests.length;
1227
+ const score = correctCount / total;
1228
+ return {
1229
+ score,
1230
+ success: score >= 0.8,
1231
+ metrics: {
1232
+ total_cases: total,
1233
+ correct_count: correctCount,
1234
+ schema_valid_count: schemaValidCount,
1235
+ value_match_count: valueMatchCount,
1236
+ accuracy: score
1237
+ },
1238
+ logs
1239
+ };
1240
+ }
1241
+ };
1242
+ var jsonGenerationSchemaOnlyBenchmark = {
1243
+ name: "json-generation-schema-only",
1244
+ version: "1.0.1",
1245
+ description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
1246
+ async run(model) {
1247
+ const logs = [];
1248
+ const ajv = new Ajv({ allErrors: true, strict: false });
1249
+ let tests = [];
1250
+ try {
1251
+ const dataDir = resolveDataDir();
1252
+ const testsJsonl = await fs3.readFile(
1253
+ path3.join(dataDir, "json_generation_tests.jsonl"),
1254
+ "utf-8"
1255
+ );
1256
+ tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1257
+ } catch (e) {
1258
+ const msg = e instanceof Error ? e.message : String(e);
1259
+ return {
1260
+ score: 0,
1261
+ success: false,
1262
+ metrics: {},
1263
+ logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
1264
+ error: e
1265
+ };
1266
+ }
1267
+ let schemaValidCount = 0;
1268
+ for (const tc of tests) {
1269
+ try {
1270
+ const schemaStr = JSON.stringify(tc.schema, null, 2);
1271
+ const messages = [
1272
+ {
1273
+ role: "system",
1274
+ content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1275
+ },
1276
+ {
1277
+ role: "user",
1278
+ content: [
1279
+ "Generate a JSON object that reflects the following facts.",
1280
+ "JSON Schema:",
1281
+ schemaStr,
1282
+ "Facts:",
1283
+ tc.promptFacts,
1284
+ "Output must be a single JSON only, with no additional text."
1285
+ ].join("\n\n")
1286
+ }
1287
+ ];
1288
+ const { text } = await generateText2({ model, messages });
1289
+ let parsed;
1290
+ try {
1291
+ parsed = extractFirstJsonBlock(text);
1292
+ } catch {
1293
+ }
1294
+ if (parsed === void 0) {
1295
+ logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
1296
+ continue;
1297
+ }
1298
+ const validate = ajv.compile(tc.schema);
1299
+ const valid = validate(parsed);
1300
+ if (valid) {
1301
+ schemaValidCount++;
1302
+ logs.push(`[PASS] ${tc.id}`);
1303
+ } else {
1304
+ logs.push(
1305
+ `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1306
+ );
1307
+ }
1308
+ } catch (e) {
1309
+ const msg = e instanceof Error ? e.message : String(e);
1310
+ logs.push(`[ERROR] ${tc.id}: ${msg}`);
1311
+ }
1312
+ }
1313
+ const total = tests.length;
1314
+ const score = total > 0 ? schemaValidCount / total : 0;
1315
+ return {
1316
+ score,
1317
+ success: score >= 0.8,
1318
+ metrics: {
1319
+ total_cases: total,
1320
+ schema_valid_count: schemaValidCount,
1321
+ accuracy: score
1322
+ },
1323
+ logs
1324
+ };
1325
+ }
1326
+ };
833
1327
  export {
834
1328
  bfclMultipleBenchmark,
835
1329
  bfclParallelBenchmark,