@ai-sdk-tool/eval 0.1.8 → 1.0.0-canary.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,424 +1,18 @@
1
- // src/reporters/console.ts
2
- var colors = {
3
- reset: "\x1B[0m",
4
- green: "\x1B[32m",
5
- red: "\x1B[31m",
6
- yellow: "\x1B[33m",
7
- cyan: "\x1B[36m",
8
- magenta: "\x1B[35m",
9
- gray: "\x1B[90m"
10
- };
11
- function printResult(result) {
12
- const { model, modelKey, benchmark, result: benchmarkResult } = result;
13
- const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
14
- console.log(
15
- `
16
- ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
17
- );
18
- console.log(
19
- ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
20
- );
21
- const metrics = Object.entries(benchmarkResult.metrics);
22
- if (metrics.length > 0) {
23
- console.log(" Metrics:");
24
- for (const [key, value] of metrics) {
25
- console.log(` - ${key}: ${value}`);
26
- }
27
- }
28
- if (benchmarkResult.error) {
29
- console.log(
30
- ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
31
- );
32
- }
33
- }
34
- function consoleReporter(results) {
35
- console.log("\n--- \u{1F4CA} Evaluation Report ---");
36
- for (const result of results) {
37
- printResult(result);
38
- }
39
- console.log("\n---------------------------\n");
40
- }
41
-
42
- // src/reporters/console.debug.ts
43
- var colors2 = {
44
- reset: "\x1B[0m",
45
- green: "\x1B[32m",
46
- red: "\x1B[31m",
47
- yellow: "\x1B[33m",
48
- cyan: "\x1B[36m",
49
- magenta: "\x1B[35m",
50
- gray: "\x1B[90m",
51
- bold: "\x1B[1m",
52
- underline: "\x1B[4m"
53
- };
54
- function colorizeDiffLine(line) {
55
- if (line.startsWith("+")) return `${colors2.green}${line}${colors2.reset}`;
56
- if (line.startsWith("-")) return `${colors2.red}${line}${colors2.reset}`;
57
- if (line.startsWith("@"))
58
- return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
59
- return line;
60
- }
61
- function uniqueLines(lines) {
62
- const seen = /* @__PURE__ */ new Set();
63
- const out = [];
64
- for (const l of lines) {
65
- if (seen.has(l)) continue;
66
- seen.add(l);
67
- out.push(l);
68
- }
69
- return out;
70
- }
71
- function suggestFixFromDiff(parsed) {
72
- const suggestions = [];
73
- const { error_type, expected, actual, diff } = parsed ?? {};
74
- if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
75
- const expectedName = expected?.function;
76
- const actualName = actual?.function;
77
- if (expectedName && actualName && expectedName !== actualName) {
78
- suggestions.push(
79
- `Call the function '${expectedName}' instead of '${actualName}'.`
80
- );
81
- }
82
- if (Array.isArray(expected?.functions)) {
83
- suggestions.push(
84
- `Ensure tool calls include: ${expected.functions.join(", ")}.`
85
- );
86
- }
87
- }
88
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
89
- const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
90
- if (missing.length) {
91
- suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
92
- }
93
- }
94
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
95
- const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
96
- if (extras.length) {
97
- suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
98
- }
99
- }
100
- if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
101
- const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
102
- for (const param of targets) {
103
- const allowedOneOfLine = diff.find(
104
- (d) => String(d).startsWith("- expected one of:")
105
- );
106
- const allowedSingleLine = diff.find(
107
- (d) => String(d).startsWith("- expected:")
108
- );
109
- if (allowedSingleLine) {
110
- const value = allowedSingleLine.replace("- expected: ", "");
111
- suggestions.push(`Set '${param}' to: ${value}.`);
112
- } else if (allowedOneOfLine) {
113
- const allowed = allowedOneOfLine.replace("- expected one of: ", "");
114
- suggestions.push(`Set '${param}' to one of: ${allowed}.`);
115
- } else {
116
- suggestions.push(`Adjust '${param}' to an allowed value.`);
117
- }
118
- }
119
- }
120
- if (suggestions.length === 0 && typeof error_type === "string") {
121
- if (error_type.includes("missing_required")) {
122
- suggestions.push(
123
- "Add all required parameters defined by the tool schema."
124
- );
125
- } else if (error_type.includes("unexpected_param")) {
126
- suggestions.push("Remove parameters not present in the tool schema.");
127
- } else if (error_type.includes("wrong_count")) {
128
- suggestions.push(
129
- "Adjust the number of tool calls to match expected count."
130
- );
131
- } else if (error_type.includes("wrong_func_name")) {
132
- suggestions.push("Use the exact expected function name from the schema.");
133
- } else if (error_type.includes("value_error")) {
134
- suggestions.push("Choose a value from the allowed options.");
135
- }
136
- }
137
- return uniqueLines(suggestions);
138
- }
139
- function consoleDebugReporter(results) {
140
- console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
141
- for (const r of results) {
142
- const { model, modelKey, benchmark, result } = r;
143
- const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
144
- console.log(
145
- `
146
- ${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
147
- );
148
- console.log(
149
- ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
150
- );
151
- const metrics = Object.entries(result.metrics);
152
- if (metrics.length > 0) {
153
- console.log(" Metrics:");
154
- for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
155
- }
156
- if (result.logs && result.logs.length) {
157
- const failLogs = result.logs.filter(
158
- (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
159
- );
160
- const hasFails = failLogs.length > 0;
161
- if (hasFails) {
162
- let getTestIdFromLogLine2 = function(line) {
163
- if (line.startsWith("[FAIL]")) {
164
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
165
- return m?.[1];
166
- }
167
- if (line.startsWith("[DEBUG-FAIL]")) {
168
- try {
169
- const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
170
- return String(parsed?.id ?? "");
171
- } catch {
172
- }
173
- }
174
- if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
175
- try {
176
- const parsed = JSON.parse(
177
- line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
178
- );
179
- return String(parsed?.id ?? "");
180
- } catch {
181
- }
182
- }
183
- return void 0;
184
- };
185
- var getTestIdFromLogLine = getTestIdFromLogLine2;
186
- const byId = /* @__PURE__ */ new Map();
187
- for (const line of failLogs) {
188
- const id = getTestIdFromLogLine2(line);
189
- const key = id ?? "__general__";
190
- const arr = byId.get(key) ?? [];
191
- arr.push(line);
192
- byId.set(key, arr);
193
- }
194
- console.log(
195
- ` ${colors2.bold}Failure details (grouped):${colors2.reset}`
196
- );
197
- for (const [groupId, lines] of byId) {
198
- if (groupId !== "__general__") {
199
- console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
200
- }
201
- const debugIds = /* @__PURE__ */ new Set();
202
- for (const l of lines) {
203
- if (l.startsWith("[DEBUG-FAIL]")) {
204
- try {
205
- const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
206
- if (parsed?.id) debugIds.add(String(parsed.id));
207
- } catch {
208
- }
209
- }
210
- }
211
- for (const line of lines) {
212
- if (line.startsWith("[FAIL]")) {
213
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
214
- const failId = m?.[1];
215
- if (failId && debugIds.has(failId)) continue;
216
- console.log(` ${colors2.red}${line}${colors2.reset}`);
217
- } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
218
- console.log(` ${colors2.yellow}${line}${colors2.reset}`);
219
- } else if (line.startsWith("[STACK]")) {
220
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
221
- } else if (line.startsWith("[DEBUG-FAIL]")) {
222
- const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
223
- try {
224
- const parsed = JSON.parse(payload);
225
- const { message, diff, expected, actual } = parsed;
226
- if (message)
227
- console.log(
228
- ` ${colors2.bold}${message}${colors2.reset}`
229
- );
230
- if (diff && Array.isArray(diff)) {
231
- for (const dLine of diff)
232
- console.log(" " + colorizeDiffLine(dLine));
233
- } else {
234
- console.log(" expected:");
235
- console.log(
236
- colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
237
- );
238
- console.log(" actual:");
239
- console.log(
240
- colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
241
- );
242
- }
243
- const suggestions = suggestFixFromDiff(parsed);
244
- if (suggestions.length) {
245
- console.log(
246
- ` ${colors2.bold}Suggested fix:${colors2.reset}`
247
- );
248
- for (const s of suggestions)
249
- console.log(` \u2022 ${s}`);
250
- }
251
- } catch {
252
- console.log(` ${line}`);
253
- }
254
- } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
255
- const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
256
- try {
257
- const ctx = JSON.parse(payload);
258
- console.log(` ${colors2.gray}context:${colors2.reset}`);
259
- if (ctx.tool_schema) {
260
- console.log(
261
- colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
262
- );
263
- }
264
- if (ctx.last_user_query) {
265
- console.log(
266
- colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
267
- );
268
- }
269
- if (ctx.raw_model_text) {
270
- console.log(
271
- colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
272
- );
273
- }
274
- if (ctx.parsed_tool_calls) {
275
- console.log(
276
- colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
277
- );
278
- }
279
- if (ctx.ground_truth) {
280
- console.log(
281
- colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
282
- );
283
- }
284
- if (ctx.finish_reason) {
285
- console.log(
286
- colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
287
- );
288
- }
289
- } catch {
290
- console.log(` ${line}`);
291
- }
292
- }
293
- }
294
- }
295
- } else {
296
- const info = result.logs.filter(
297
- (l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
298
- );
299
- for (const line of info)
300
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
301
- }
302
- }
303
- }
304
- console.log("\n------------------------------------\n");
305
- }
306
-
307
- // src/reporters/json.ts
308
- function jsonReporter(results) {
309
- const serializableResults = results.map((r) => ({
310
- ...r,
311
- result: {
312
- ...r.result,
313
- error: r.result.error?.message
314
- }
315
- }));
316
- console.log(JSON.stringify(serializableResults, null, 2));
317
- }
318
-
319
- // src/reporters/index.ts
320
- var reporters = {
321
- console: consoleReporter,
322
- json: jsonReporter,
323
- "console.debug": consoleDebugReporter
324
- };
325
-
326
- // src/evaluate.ts
327
- async function runSingleBenchmark(model, benchmark, modelKey, config) {
328
- const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
329
- try {
330
- console.log(
331
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
332
- );
333
- const result = await benchmark.run(model, config);
334
- console.log(
335
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
336
- );
337
- return {
338
- model: modelId,
339
- modelKey,
340
- benchmark: benchmark.name,
341
- result
342
- };
343
- } catch (error) {
344
- console.error(
345
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
346
- error
347
- );
348
- return {
349
- model: modelId,
350
- modelKey,
351
- benchmark: benchmark.name,
352
- result: {
353
- score: 0,
354
- success: false,
355
- metrics: {},
356
- error: error instanceof Error ? error : new Error(String(error))
357
- }
358
- };
359
- }
360
- }
361
- async function evaluate(options) {
362
- const {
363
- models,
364
- benchmarks,
365
- reporter = "console",
366
- temperature,
367
- maxTokens
368
- } = options;
369
- const modelEntries = [];
370
- if (Array.isArray(models)) {
371
- for (const m of models) modelEntries.push([void 0, m]);
372
- } else if (typeof models === "object" && models !== null && "modelId" in models) {
373
- modelEntries.push([void 0, models]);
374
- } else {
375
- for (const [key, m] of Object.entries(
376
- models
377
- )) {
378
- modelEntries.push([key, m]);
379
- }
380
- }
381
- const allResults = [];
382
- for (const [modelKey, model] of modelEntries) {
383
- for (const benchmark of benchmarks) {
384
- const config = {};
385
- if (temperature !== void 0) config.temperature = temperature;
386
- if (maxTokens !== void 0) config.maxTokens = maxTokens;
387
- const evaluationResult = await runSingleBenchmark(
388
- model,
389
- benchmark,
390
- modelKey,
391
- Object.keys(config).length > 0 ? config : void 0
392
- );
393
- allResults.push(evaluationResult);
394
- }
395
- }
396
- const report = reporters[reporter];
397
- if (report) {
398
- report(allResults);
399
- } else {
400
- console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
401
- reporters.console(allResults);
402
- }
403
- return allResults;
404
- }
405
-
406
1
  // src/benchmarks/bfcl.ts
407
- import { generateText, jsonSchema, tool } from "ai";
408
2
  import { promises as fs2 } from "fs";
409
3
  import path2 from "path";
4
+ import {
5
+ generateText,
6
+ jsonSchema,
7
+ tool
8
+ } from "ai";
410
9
 
411
10
  // src/utils/paths.ts
412
11
  import fs from "fs";
413
12
  import { createRequire } from "module";
414
13
  import path from "path";
415
14
  import { fileURLToPath } from "url";
416
- function resolveDataDir(fromModuleUrl) {
417
- const moduleUrl = fromModuleUrl;
418
- const override = process.env.BFCL_DATA_DIR;
419
- if (override && override.trim().length > 0) {
420
- return override;
421
- }
15
+ function tryResolveViaPackageEntry(moduleUrl) {
422
16
  try {
423
17
  const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
424
18
  const requireFromEntry = createRequire(baseForRequireEntry);
@@ -426,43 +20,80 @@ function resolveDataDir(fromModuleUrl) {
426
20
  const entryDir = path.dirname(entryPath);
427
21
  const guessPkgRoot = fs.existsSync(path.join(entryDir, "..")) ? path.resolve(entryDir, "..") : entryDir;
428
22
  const dataAtRoot = path.join(guessPkgRoot, "data");
429
- if (fs.existsSync(dataAtRoot)) return dataAtRoot;
23
+ if (fs.existsSync(dataAtRoot)) {
24
+ return dataAtRoot;
25
+ }
430
26
  } catch {
431
27
  }
28
+ return null;
29
+ }
30
+ function tryResolveViaPackageJson(moduleUrl) {
432
31
  try {
433
32
  const baseForRequire = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
434
33
  const require2 = createRequire(baseForRequire);
435
34
  const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
436
35
  const pkgDir = path.dirname(pkgJsonPath);
437
36
  const dataAtPkg = path.join(pkgDir, "data");
438
- if (fs.existsSync(dataAtPkg)) return dataAtPkg;
37
+ if (fs.existsSync(dataAtPkg)) {
38
+ return dataAtPkg;
39
+ }
439
40
  } catch {
440
41
  }
441
- let startDir;
42
+ return null;
43
+ }
44
+ function getStartDir(moduleUrl) {
442
45
  if (moduleUrl) {
443
46
  try {
444
- startDir = path.dirname(fileURLToPath(moduleUrl));
47
+ return path.dirname(fileURLToPath(moduleUrl));
445
48
  } catch {
446
- startDir = process.cwd();
49
+ return process.cwd();
447
50
  }
448
- } else {
449
- startDir = process.cwd();
450
51
  }
52
+ return process.cwd();
53
+ }
54
+ function findDataDirByTraversal(startDir) {
451
55
  let dir = startDir;
452
- for (let i = 0; i < 6; i++) {
56
+ const MAX_PARENT_TRAVERSAL_DEPTH = 6;
57
+ for (let i = 0; i < MAX_PARENT_TRAVERSAL_DEPTH; i += 1) {
453
58
  const dataCandidate = path.join(dir, "data");
454
- if (fs.existsSync(dataCandidate)) return dataCandidate;
59
+ if (fs.existsSync(dataCandidate)) {
60
+ return dataCandidate;
61
+ }
455
62
  const parent = path.resolve(dir, "..");
456
- if (parent === dir) break;
63
+ if (parent === dir) {
64
+ break;
65
+ }
457
66
  dir = parent;
458
67
  }
68
+ return null;
69
+ }
70
+ function resolveDataDir(fromModuleUrl) {
71
+ const override = process.env.BFCL_DATA_DIR;
72
+ if (override && override.trim().length > 0) {
73
+ return override;
74
+ }
75
+ const viaEntry = tryResolveViaPackageEntry(fromModuleUrl);
76
+ if (viaEntry) {
77
+ return viaEntry;
78
+ }
79
+ const viaPackageJson = tryResolveViaPackageJson(fromModuleUrl);
80
+ if (viaPackageJson) {
81
+ return viaPackageJson;
82
+ }
83
+ const startDir = getStartDir(fromModuleUrl);
84
+ const viaTraversal = findDataDirByTraversal(startDir);
85
+ if (viaTraversal) {
86
+ return viaTraversal;
87
+ }
459
88
  const pkgRoot = path.resolve(startDir, "..", "..");
460
89
  return path.join(pkgRoot, "data");
461
90
  }
462
91
 
463
92
  // src/benchmarks/bfcl/ast-checker.ts
464
93
  function standardizeString(input) {
465
- if (typeof input !== "string") return input;
94
+ if (typeof input !== "string") {
95
+ return input;
96
+ }
466
97
  const regex = /[ ,./\\-_*^]/g;
467
98
  return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
468
99
  }
@@ -482,127 +113,181 @@ function checkStringValue(param, modelValue, possibleAnswers) {
482
113
  }
483
114
  return { valid: true };
484
115
  }
485
- function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
486
- const modelArgs = modelToolCall.args;
487
- const modelFuncName = modelToolCall.toolName;
488
- const expectedFuncName = funcDescription.name;
489
- const expectedParams = funcDescription.parameters.properties;
490
- const requiredParams = funcDescription.parameters.required;
491
- if (modelFuncName !== expectedFuncName) {
492
- return {
493
- valid: false,
494
- error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
495
- error_type: "simple_function_checker:wrong_func_name"
496
- };
116
+ function normalizeObject(obj) {
117
+ if (Array.isArray(obj)) {
118
+ return obj.map(normalizeObject);
497
119
  }
498
- const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
499
- const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
500
- for (const param of requiredParams) {
501
- if (!(param in argsObj)) {
502
- return {
120
+ if (obj && typeof obj === "object") {
121
+ const normalized = {};
122
+ for (const [key, value] of Object.entries(obj)) {
123
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
124
+ normalized[key] = value[0];
125
+ } else {
126
+ normalized[key] = normalizeObject(value);
127
+ }
128
+ }
129
+ return normalized;
130
+ }
131
+ return obj;
132
+ }
133
+ function valuesMatch(modelValue, possibleValue) {
134
+ if (modelValue === possibleValue) {
135
+ return true;
136
+ }
137
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
138
+ try {
139
+ const normalizedModel = normalizeObject(modelValue);
140
+ const normalizedPossible = normalizeObject(possibleValue);
141
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
142
+ } catch {
143
+ return false;
144
+ }
145
+ }
146
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
147
+ return modelValue.toString() === possibleValue;
148
+ }
149
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
150
+ return modelValue === possibleValue.toString();
151
+ }
152
+ return false;
153
+ }
154
+ function checkArrayValue(paramName, modelValue, possibleValues) {
155
+ const modelValueStr = JSON.stringify(
156
+ modelValue.map((v) => standardizeString(String(v))).sort()
157
+ );
158
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
159
+ if (!Array.isArray(p)) {
160
+ return false;
161
+ }
162
+ return JSON.stringify(p.map((v) => standardizeString(String(v))).sort()) === modelValueStr;
163
+ }) : false;
164
+ if (!hasMatch) {
165
+ return {
166
+ valid: false,
167
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
168
+ modelValue
169
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
170
+ error_type: "value_error:list"
171
+ };
172
+ }
173
+ return { valid: true };
174
+ }
175
+ function checkObjectValue(paramName, modelValue, possibleValues) {
176
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some(
177
+ (possibleValue) => valuesMatch(modelValue, possibleValue)
178
+ ) : false;
179
+ if (!hasMatch) {
180
+ return {
181
+ valid: false,
182
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
183
+ modelValue
184
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
185
+ error_type: "value_error:other"
186
+ };
187
+ }
188
+ return { valid: true };
189
+ }
190
+ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
191
+ const funcNameCheck = checkFunctionName(
192
+ funcDescription.name,
193
+ modelToolCall.toolName
194
+ );
195
+ if (!funcNameCheck.valid) {
196
+ return funcNameCheck;
197
+ }
198
+ const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
199
+ const argsObj = modelToolCall.args && typeof modelToolCall.args === "object" ? modelToolCall.args : {};
200
+ const context = {
201
+ funcDescription,
202
+ modelToolCall,
203
+ possibleAnswerParams,
204
+ expectedParams: funcDescription.parameters.properties
205
+ };
206
+ const requiredCheck = checkRequiredParams(
207
+ funcDescription.parameters.required,
208
+ argsObj
209
+ );
210
+ if (!requiredCheck.valid) {
211
+ return requiredCheck;
212
+ }
213
+ const paramsCheck = checkAllParameters(argsObj, context);
214
+ if (!paramsCheck.valid) {
215
+ return paramsCheck;
216
+ }
217
+ const optionalCheck = checkOptionalParams(argsObj, possibleAnswerParams);
218
+ if (!optionalCheck.valid) {
219
+ return optionalCheck;
220
+ }
221
+ return { valid: true };
222
+ }
223
+ function checkFunctionName(expected, actual) {
224
+ if (actual !== expected) {
225
+ return {
226
+ valid: false,
227
+ error: `Function name '${actual}' does not match expected '${expected}'.`,
228
+ error_type: "simple_function_checker:wrong_func_name"
229
+ };
230
+ }
231
+ return { valid: true };
232
+ }
233
+ function checkRequiredParams(requiredParams, argsObj) {
234
+ for (const param of requiredParams) {
235
+ if (!(param in argsObj)) {
236
+ return {
503
237
  valid: false,
504
238
  error: `Missing required parameter: '${param}'.`,
505
239
  error_type: "simple_function_checker:missing_required"
506
240
  };
507
241
  }
508
242
  }
509
- if (modelArgs && typeof modelArgs === "object") {
510
- for (const paramName of Object.keys(argsObj)) {
511
- const modelValue = argsObj[paramName];
512
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
243
+ return { valid: true };
244
+ }
245
+ function checkAllParameters(argsObj, context) {
246
+ for (const paramName of Object.keys(argsObj)) {
247
+ const paramCheck = checkSingleParameter(
248
+ paramName,
249
+ argsObj[paramName],
250
+ context
251
+ );
252
+ if (!paramCheck.valid) {
253
+ return paramCheck;
254
+ }
255
+ }
256
+ return { valid: true };
257
+ }
258
+ function checkSingleParameter(paramName, modelValue, context) {
259
+ if (!(paramName in context.expectedParams && paramName in context.possibleAnswerParams)) {
260
+ return {
261
+ valid: false,
262
+ error: `Unexpected parameter: '${paramName}'.`,
263
+ error_type: "simple_function_checker:unexpected_param"
264
+ };
265
+ }
266
+ const possibleValues = context.possibleAnswerParams[paramName];
267
+ if (typeof modelValue === "string") {
268
+ return checkStringValue(
269
+ paramName,
270
+ modelValue,
271
+ possibleValues ?? []
272
+ );
273
+ }
274
+ if (Array.isArray(modelValue)) {
275
+ return checkArrayValue(paramName, modelValue, possibleValues);
276
+ }
277
+ return checkObjectValue(paramName, modelValue, possibleValues);
278
+ }
279
+ function checkOptionalParams(argsObj, possibleAnswerParams) {
280
+ for (const paramName in possibleAnswerParams) {
281
+ if (Object.hasOwn(possibleAnswerParams, paramName)) {
282
+ const val = possibleAnswerParams[paramName];
283
+ const isOptional = Array.isArray(val) && val.includes("");
284
+ if (!(paramName in argsObj || isOptional)) {
513
285
  return {
514
286
  valid: false,
515
- error: `Unexpected parameter: '${paramName}'.`,
516
- error_type: "simple_function_checker:unexpected_param"
287
+ error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
288
+ error_type: "simple_function_checker:missing_optional"
517
289
  };
518
290
  }
519
- const possibleValues = possibleAnswerParams[paramName];
520
- if (typeof modelValue === "string") {
521
- const result = checkStringValue(
522
- paramName,
523
- modelValue,
524
- possibleValues ?? []
525
- );
526
- if (!result.valid) return result;
527
- } else if (Array.isArray(modelValue)) {
528
- const modelValueStr = JSON.stringify(
529
- modelValue.map((v) => standardizeString(String(v))).sort()
530
- );
531
- const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
532
- if (!Array.isArray(p)) return false;
533
- return JSON.stringify(
534
- p.map((v) => standardizeString(String(v))).sort()
535
- ) === modelValueStr;
536
- }) : false;
537
- if (!hasMatch) {
538
- return {
539
- valid: false,
540
- error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
541
- modelValue
542
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
543
- error_type: "value_error:list"
544
- };
545
- }
546
- } else {
547
- const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
548
- if (modelValue === possibleValue) return true;
549
- if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
550
- try {
551
- const normalizeObject = (obj) => {
552
- if (Array.isArray(obj)) {
553
- return obj.map(normalizeObject);
554
- }
555
- if (obj && typeof obj === "object") {
556
- const normalized = {};
557
- for (const [key, value] of Object.entries(
558
- obj
559
- )) {
560
- if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
561
- normalized[key] = value[0];
562
- } else {
563
- normalized[key] = normalizeObject(value);
564
- }
565
- }
566
- return normalized;
567
- }
568
- return obj;
569
- };
570
- const normalizedModel = normalizeObject(modelValue);
571
- const normalizedPossible = normalizeObject(possibleValue);
572
- return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
573
- } catch {
574
- return false;
575
- }
576
- }
577
- if (typeof modelValue === "number" && typeof possibleValue === "string") {
578
- return modelValue.toString() === possibleValue;
579
- }
580
- if (typeof modelValue === "string" && typeof possibleValue === "number") {
581
- return modelValue === possibleValue.toString();
582
- }
583
- return false;
584
- }) : false;
585
- if (!hasMatch) {
586
- return {
587
- valid: false,
588
- error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
589
- modelValue
590
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
591
- error_type: "value_error:other"
592
- };
593
- }
594
- }
595
- }
596
- }
597
- for (const paramName in possibleAnswerParams) {
598
- const val = possibleAnswerParams[paramName];
599
- const isOptional = Array.isArray(val) && val.includes("");
600
- if (!(paramName in argsObj) && !isOptional) {
601
- return {
602
- valid: false,
603
- error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
604
- error_type: "simple_function_checker:missing_optional"
605
- };
606
291
  }
607
292
  }
608
293
  return { valid: true };
@@ -629,8 +314,10 @@ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possib
629
314
  };
630
315
  }
631
316
  let foundMatch = false;
632
- for (let i = 0; i < modelToolCalls.length; i++) {
633
- if (matchedModelCallIndices.has(i)) continue;
317
+ for (let i = 0; i < modelToolCalls.length; i += 1) {
318
+ if (matchedModelCallIndices.has(i)) {
319
+ continue;
320
+ }
634
321
  const checkerResult = simpleFunctionChecker(
635
322
  funcDescription,
636
323
  modelToolCalls[i],
@@ -679,6 +366,8 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
679
366
  }
680
367
 
681
368
  // src/benchmarks/bfcl.ts
369
+ var LINE_SPLIT_REGEX = /\r?\n/;
370
+ var NUMERIC_STRING_REGEX = /^\d+$/;
682
371
  function check(testCase, modelOutput, possibleAnswer) {
683
372
  const category = testCase.id.split("_")[0];
684
373
  try {
@@ -695,19 +384,22 @@ function check(testCase, modelOutput, possibleAnswer) {
695
384
  modelOutput[0],
696
385
  possibleAnswer.ground_truth[0]
697
386
  );
698
- } else if (category === "parallel") {
387
+ }
388
+ if (category === "parallel") {
699
389
  return parallelFunctionCheckerNoOrder(
700
390
  testCase.function,
701
391
  modelOutput,
702
392
  possibleAnswer.ground_truth
703
393
  );
704
- } else if (category === "multiple") {
394
+ }
395
+ if (category === "multiple") {
705
396
  return multipleFunctionChecker(
706
397
  testCase.function,
707
398
  modelOutput,
708
399
  possibleAnswer.ground_truth
709
400
  );
710
- } else if (category.includes("parallel-multiple")) {
401
+ }
402
+ if (category.includes("parallel-multiple")) {
711
403
  return parallelFunctionCheckerNoOrder(
712
404
  testCase.function,
713
405
  modelOutput,
@@ -743,8 +435,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
743
435
  path2.join(dataPath, answerDataFile),
744
436
  "utf-8"
745
437
  );
746
- testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
747
- const possibleAnswers = possibleAnswersJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
438
+ testCases = testCasesJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
439
+ const possibleAnswers = possibleAnswersJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
748
440
  const possibleAnswersMap = new Map(
749
441
  possibleAnswers.map((ans) => [ans.id, ans])
750
442
  );
@@ -756,373 +448,600 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
756
448
  `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
757
449
  );
758
450
  }
451
+ const fixSchemaType = (copy) => {
452
+ if (!copy.type) {
453
+ return;
454
+ }
455
+ if (copy.type === "dict") {
456
+ copy.type = "object";
457
+ }
458
+ if (copy.type === "tuple") {
459
+ copy.type = "array";
460
+ }
461
+ if (copy.type === "integer" || copy.type === "float") {
462
+ copy.type = "number";
463
+ }
464
+ };
465
+ const fixSchemaProperties = (copy, fixSchemaFn) => {
466
+ if (!copy.properties || typeof copy.properties !== "object") {
467
+ return;
468
+ }
469
+ for (const k of Object.keys(copy.properties)) {
470
+ copy.properties[k] = fixSchemaFn(
471
+ copy.properties[k]
472
+ );
473
+ }
474
+ };
759
475
  const fixSchema = (schema) => {
760
- if (!schema || typeof schema !== "object")
476
+ if (!schema || typeof schema !== "object") {
761
477
  return { type: "object", properties: {} };
478
+ }
762
479
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
763
480
  if (!Array.isArray(copy)) {
764
- if (copy.type) {
765
- if (copy.type === "dict") copy.type = "object";
766
- if (copy.type === "integer" || copy.type === "float")
767
- copy.type = "number";
481
+ fixSchemaType(copy);
482
+ fixSchemaProperties(copy, fixSchema);
483
+ if (copy.items) {
484
+ copy.items = fixSchema(copy.items);
768
485
  }
769
- if (copy.properties && typeof copy.properties === "object") {
770
- for (const k of Object.keys(copy.properties)) {
771
- copy.properties[k] = fixSchema(
772
- copy.properties[k]
773
- );
774
- }
775
- }
776
- if (copy.items) copy.items = fixSchema(copy.items);
777
486
  return copy;
778
487
  }
779
488
  return copy;
780
489
  };
490
+ const flattenMessages = (messages) => Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
491
+ const sanitizeName = (toolName) => {
492
+ const s = toolName.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
493
+ return s.length > 0 ? s : "tool";
494
+ };
495
+ const buildTransformedTools = (tools, fixSchemaFn) => {
496
+ const nameMap = /* @__PURE__ */ new Map();
497
+ const transformedTools = tools.map((t) => {
498
+ const fixed = fixSchemaFn(t.parameters);
499
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
500
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
501
+ const sanitized = sanitizeName(t.name);
502
+ nameMap.set(sanitized, t.name);
503
+ return {
504
+ type: "function",
505
+ name: sanitized,
506
+ description: t.description,
507
+ inputSchema
508
+ };
509
+ });
510
+ return { transformedTools, nameMap };
511
+ };
512
+ const parseDebugToolCalls = (raw) => {
513
+ if (!raw) {
514
+ return [];
515
+ }
516
+ try {
517
+ const arr = JSON.parse(raw);
518
+ return Array.isArray(arr) ? arr : [];
519
+ } catch {
520
+ return [];
521
+ }
522
+ };
523
+ const getSanitizedName = (rawName, transformedTools) => {
524
+ if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
525
+ return transformedTools[Number(rawName)]?.name ?? rawName;
526
+ }
527
+ return rawName;
528
+ };
529
+ const parseToolArgs = (extractedArgs) => {
530
+ if (typeof extractedArgs !== "string") {
531
+ return extractedArgs;
532
+ }
533
+ try {
534
+ return JSON.parse(extractedArgs);
535
+ } catch {
536
+ return extractedArgs;
537
+ }
538
+ };
539
+ const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
540
+ const call = c;
541
+ const rawName = call.toolName ?? call.name;
542
+ const sanitizedFromIndex = getSanitizedName(
543
+ rawName,
544
+ transformedTools
545
+ );
546
+ const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
547
+ const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
548
+ const parsedArgs = parseToolArgs(extractedArgs);
549
+ return {
550
+ ...call,
551
+ toolName: originalName,
552
+ name: originalName,
553
+ args: parsedArgs ?? {}
554
+ };
555
+ });
556
+ const summarizeArgs = (args) => {
557
+ if (args == null) {
558
+ return args;
559
+ }
560
+ if (typeof args !== "object") {
561
+ return args;
562
+ }
563
+ return Object.keys(args).sort().reduce(
564
+ (acc, k) => {
565
+ acc[k] = args[k];
566
+ return acc;
567
+ },
568
+ {}
569
+ );
570
+ };
571
+ const generateParamMismatchDiff = (paramName, allowed, got) => {
572
+ const diffLines = [];
573
+ diffLines.push(`@@ param ${paramName}`);
574
+ const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
575
+ const expectedLine = (() => {
576
+ if (allowedArray.length === 1) {
577
+ return `- expected: ${JSON.stringify(allowedArray[0])}`;
578
+ }
579
+ const formatted = allowedArray.map(
580
+ (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
581
+ ).join(", ");
582
+ return `- expected one of: ${formatted}`;
583
+ })();
584
+ diffLines.push(expectedLine);
585
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
586
+ return diffLines;
587
+ };
588
+ const paramValueMatches = (allowed, got) => {
589
+ if (!Array.isArray(allowed)) {
590
+ return false;
591
+ }
592
+ return allowed.some((v) => {
593
+ try {
594
+ if (Array.isArray(got)) {
595
+ return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
596
+ }
597
+ } catch {
598
+ }
599
+ return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
600
+ });
601
+ };
602
+ const checkFunctionNameMismatch = (expectedName, receivedName, diff) => {
603
+ if (expectedName !== receivedName) {
604
+ diff.push("@@ function name");
605
+ diff.push(`- ${expectedName}`);
606
+ diff.push(`+ ${receivedName}`);
607
+ }
608
+ };
609
+ const checkMissingParams = (required, receivedArgs, diff) => {
610
+ for (const req of required) {
611
+ if (!(req in receivedArgs)) {
612
+ diff.push(`- missing required param: ${req}`);
613
+ }
614
+ }
615
+ };
616
+ const checkUnexpectedParams = (expectedParams, receivedArgs, diff) => {
617
+ for (const k of Object.keys(receivedArgs)) {
618
+ if (!(k in expectedParams)) {
619
+ diff.push(`+ unexpected param: ${k}`);
620
+ }
621
+ }
622
+ };
623
+ const checkParamValueMismatches = (expectedParams, receivedArgs, diff) => {
624
+ for (const k of Object.keys(receivedArgs)) {
625
+ if (k in expectedParams) {
626
+ const allowed = expectedParams[k];
627
+ const got = receivedArgs[k];
628
+ if (!paramValueMatches(allowed, got)) {
629
+ diff.push(...generateParamMismatchDiff(k, allowed, got));
630
+ }
631
+ }
632
+ }
633
+ };
634
+ const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
635
+ const funcDesc = tools[0];
636
+ const gt = possibleAnswer.ground_truth?.[0];
637
+ const expectedFuncName = funcDesc?.name;
638
+ const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
639
+ const received = restoredCalls[0];
640
+ const receivedName = received?.toolName ?? received?.name;
641
+ const receivedArgs = summarizeArgs(received?.args);
642
+ const expected = {
643
+ function: expectedFuncName,
644
+ params: expectedParams
645
+ };
646
+ const actual = {
647
+ function: receivedName,
648
+ args: receivedArgs
649
+ };
650
+ const diff = [];
651
+ checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
652
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
653
+ const required = funcDesc?.parameters?.required ?? [];
654
+ checkMissingParams(
655
+ required,
656
+ receivedArgs,
657
+ diff
658
+ );
659
+ checkUnexpectedParams(
660
+ expectedParams,
661
+ receivedArgs,
662
+ diff
663
+ );
664
+ checkParamValueMismatches(
665
+ expectedParams,
666
+ receivedArgs,
667
+ diff
668
+ );
669
+ }
670
+ return { expected, actual, diff };
671
+ };
672
+ const checkCallCountMismatch = (expectedCount, actualCount, diff) => {
673
+ if (expectedCount !== actualCount) {
674
+ diff.push("@@ call count");
675
+ diff.push(`- expected ${expectedCount}`);
676
+ diff.push(`+ got ${actualCount}`);
677
+ }
678
+ };
679
+ const addMissingAndExtraFunctions = (expectedNames, actualNames, diff) => {
680
+ const missing = expectedNames.filter((n) => !actualNames.includes(n));
681
+ const extra = actualNames.filter((n) => !expectedNames.includes(n));
682
+ for (const m of missing) {
683
+ diff.push(`- missing function: ${m}`);
684
+ }
685
+ for (const e of extra) {
686
+ diff.push(`+ unexpected function: ${e}`);
687
+ }
688
+ };
689
+ const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
690
+ for (let i = 0; i < restoredCalls.length; i += 1) {
691
+ if (usedActual.has(i)) {
692
+ continue;
693
+ }
694
+ const rc = restoredCalls[i];
695
+ const rcName = rc?.toolName ?? rc?.name;
696
+ if (rcName === fname) {
697
+ return i;
698
+ }
699
+ }
700
+ return -1;
701
+ };
702
+ const validateFunctionParams = (options) => {
703
+ const { receivedArgs, expectedParamsAllowed, requiredParams, diff } = options;
704
+ checkMissingParams(requiredParams, receivedArgs, diff);
705
+ checkUnexpectedParams(expectedParamsAllowed, receivedArgs, diff);
706
+ checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
707
+ };
708
+ const processExpectedCall = (options) => {
709
+ const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
710
+ const fname = Object.keys(expectedObj)[0];
711
+ const matchedIndex = findMatchingCallIndex(
712
+ fname,
713
+ restoredCalls,
714
+ usedActual
715
+ );
716
+ if (matchedIndex === -1) {
717
+ return;
718
+ }
719
+ usedActual.add(matchedIndex);
720
+ const received = restoredCalls[matchedIndex];
721
+ const receivedArgs = summarizeArgs(received?.args);
722
+ const expectedParamsAllowed = expectedObj[fname];
723
+ const funcDesc = tools.find((t) => t.name === fname);
724
+ const requiredParams = funcDesc?.parameters?.required ?? [];
725
+ diff.push(`@@ function ${fname}`);
726
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
727
+ validateFunctionParams({
728
+ receivedArgs,
729
+ expectedParamsAllowed,
730
+ requiredParams,
731
+ diff
732
+ });
733
+ }
734
+ };
735
+ const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
736
+ const gtArr = possibleAnswer.ground_truth ?? [];
737
+ const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
738
+ const actualNames = restoredCalls.map(
739
+ (c) => c.toolName ?? c.name
740
+ );
741
+ const expected = {
742
+ functions: expectedNames
743
+ };
744
+ const actual = { functions: actualNames };
745
+ const diff = [];
746
+ checkCallCountMismatch(
747
+ expectedNames.length,
748
+ actualNames.length,
749
+ diff
750
+ );
751
+ addMissingAndExtraFunctions(expectedNames, actualNames, diff);
752
+ const usedActual = /* @__PURE__ */ new Set();
753
+ for (const expectedObj of gtArr) {
754
+ processExpectedCall({
755
+ expectedObj,
756
+ restoredCalls,
757
+ tools,
758
+ usedActual,
759
+ diff
760
+ });
761
+ }
762
+ return { expected, actual, diff };
763
+ };
781
764
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
782
765
  const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
783
766
  logs.push(
784
767
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
785
768
  );
786
- const runSingleCase = async (testCase) => {
787
- const caseLogs = [];
788
- const { function: tools, question: messages } = testCase;
789
- const temp = config?.temperature;
790
- const temperature = typeof temp === "number" ? temp : void 0;
791
- const maxTok = config?.maxTokens;
792
- const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
769
+ const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
793
770
  try {
794
- const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
795
- const nameMap = /* @__PURE__ */ new Map();
796
- const sanitizeName = (name2) => {
797
- const s = name2.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
798
- return s.length > 0 ? s : "tool";
799
- };
800
- const transformedTools = tools.map((t) => {
801
- const fixed = fixSchema(t.parameters);
802
- const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
803
- const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
804
- const sanitized = sanitizeName(t.name);
805
- nameMap.set(sanitized, t.name);
806
- return {
807
- type: "function",
808
- name: sanitized,
809
- description: t.description,
810
- inputSchema
811
- };
812
- });
813
- const toolsMap = Object.fromEntries(
814
- transformedTools.map((t) => [
815
- t.name,
816
- tool({
817
- description: typeof t.description === "string" ? t.description : void 0,
818
- inputSchema: jsonSchema(t.inputSchema)
819
- })
820
- ])
771
+ const firstTool = transformedTools[0];
772
+ const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
773
+ caseLogs.push(
774
+ `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
821
775
  );
822
- try {
823
- const firstTool = transformedTools[0];
824
- const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
825
- caseLogs.push(
826
- `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
827
- );
828
- } catch (e) {
829
- caseLogs.push(
830
- `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
831
- );
776
+ } catch (e) {
777
+ caseLogs.push(
778
+ `[DEBUG] ${testCaseId}: failed to introspect tools: ${e.message}`
779
+ );
780
+ }
781
+ };
782
+ const logRawToolCalls = (options) => {
783
+ const { toolCalls, finishReason, text, testCaseId, caseLogs } = options;
784
+ try {
785
+ caseLogs.push(
786
+ `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
787
+ );
788
+ } catch {
789
+ caseLogs.push(
790
+ `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
791
+ );
792
+ }
793
+ };
794
+ const buildFailureContext = (options) => {
795
+ const {
796
+ testCase,
797
+ tools,
798
+ flatMessages,
799
+ mwOriginalText,
800
+ text,
801
+ finishReason,
802
+ mwParsedToolCalls,
803
+ restoredCalls,
804
+ possibleAnswer
805
+ } = options;
806
+ const lastUser = (() => {
807
+ const reversed = [...flatMessages].reverse();
808
+ const found = reversed.find(
809
+ (m) => m.role === "user"
810
+ );
811
+ return found?.content ?? void 0;
812
+ })();
813
+ const rawModelText = (() => {
814
+ if (mwOriginalText && mwOriginalText.length > 0) {
815
+ return mwOriginalText;
832
816
  }
833
- const debugSummaryRef = {};
834
- const providerOptions = {
835
- toolCallMiddleware: {
836
- debugSummary: debugSummaryRef
837
- }
838
- };
839
- const { toolCalls, text, finishReason } = await generateText({
840
- model,
841
- messages: flatMessages,
842
- tools: toolsMap,
843
- toolChoice: "auto",
844
- providerOptions,
845
- ...temperature !== void 0 ? { temperature } : {},
846
- ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
847
- });
848
- const mwOriginalText = debugSummaryRef.originalText;
849
- const mwParsedToolCalls = (() => {
850
- const raw = debugSummaryRef.toolCalls;
851
- if (!raw) return [];
852
- try {
853
- const arr = JSON.parse(raw);
854
- return Array.isArray(arr) ? arr : [];
855
- } catch {
856
- return [];
857
- }
858
- })();
817
+ if (typeof text === "string") {
818
+ return text;
819
+ }
820
+ return "";
821
+ })();
822
+ return {
823
+ id: testCase.id,
824
+ tool_schema: tools,
825
+ last_user_query: lastUser,
826
+ raw_model_text: rawModelText,
827
+ finish_reason: finishReason,
828
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
829
+ ground_truth: possibleAnswer.ground_truth
830
+ };
831
+ };
832
+ const logFailureDetails = (options) => {
833
+ const {
834
+ testCase,
835
+ tools,
836
+ possibleAnswer,
837
+ restoredCalls,
838
+ checkerResult,
839
+ flatMessages,
840
+ mwOriginalText,
841
+ text,
842
+ finishReason,
843
+ mwParsedToolCalls,
844
+ caseLogs
845
+ } = options;
846
+ try {
847
+ const category = testCase.id.split("_")[0];
848
+ const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
849
+ tools,
850
+ possibleAnswer,
851
+ restoredCalls
852
+ ) : buildParallelDiff(
853
+ tools,
854
+ possibleAnswer,
855
+ restoredCalls
856
+ );
857
+ caseLogs.push(
858
+ `[DEBUG-FAIL] ${JSON.stringify({
859
+ id: testCase.id,
860
+ message: checkerResult.error,
861
+ error_type: checkerResult.error_type,
862
+ expected,
863
+ actual,
864
+ diff
865
+ })}`
866
+ );
859
867
  try {
868
+ const contextPayload = buildFailureContext({
869
+ testCase,
870
+ tools,
871
+ flatMessages,
872
+ mwOriginalText,
873
+ text,
874
+ finishReason,
875
+ mwParsedToolCalls,
876
+ restoredCalls,
877
+ possibleAnswer
878
+ });
860
879
  caseLogs.push(
861
- `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
880
+ `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
862
881
  );
863
882
  } catch {
864
- caseLogs.push(
865
- `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
866
- );
867
883
  }
868
- const possibleAnswer = possibleAnswersMap.get(testCase.id);
869
- if (!possibleAnswer) {
870
- throw new Error(`No possible answer for id: ${testCase.id}`);
884
+ } catch {
885
+ caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
886
+ }
887
+ };
888
+ const buildToolsMap = (transformedTools) => Object.fromEntries(
889
+ transformedTools.map((t) => [
890
+ t.name,
891
+ tool({
892
+ description: typeof t.description === "string" ? t.description : void 0,
893
+ inputSchema: jsonSchema(
894
+ t.inputSchema
895
+ )
896
+ })
897
+ ])
898
+ );
899
+ const executeModelGeneration = async (options) => {
900
+ const {
901
+ model: modelInstance,
902
+ flatMessages,
903
+ toolsMap,
904
+ temperature,
905
+ maxTokens
906
+ } = options;
907
+ const debugSummaryRef = {};
908
+ const providerOptions = {
909
+ toolCallMiddleware: {
910
+ debugSummary: debugSummaryRef
871
911
  }
872
- const restoredCalls = (toolCalls || []).map((c) => {
873
- const rawName = c.toolName ?? c.name;
874
- const sanitizedFromIndex = typeof rawName === "string" && /^\d+$/.test(rawName) ? transformedTools[Number(rawName)]?.name ?? rawName : rawName;
875
- const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
876
- const extractedArgs = c.args ?? c.arguments ?? c.input ?? c.params ?? c.parameters ?? void 0;
877
- let parsedArgs = extractedArgs;
878
- if (typeof parsedArgs === "string") {
879
- try {
880
- parsedArgs = JSON.parse(parsedArgs);
881
- } catch {
882
- }
883
- }
884
- return {
885
- ...c,
886
- toolName: originalName,
887
- name: originalName,
888
- args: parsedArgs ?? {}
889
- };
912
+ };
913
+ const { toolCalls, text, finishReason } = await generateText({
914
+ model: modelInstance,
915
+ messages: flatMessages,
916
+ tools: toolsMap,
917
+ toolChoice: "auto",
918
+ providerOptions,
919
+ ...temperature !== void 0 ? { temperature } : {},
920
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
921
+ });
922
+ return { toolCalls, text, finishReason, debugSummaryRef };
923
+ };
924
+ const processValidationResult = (options) => {
925
+ const {
926
+ checkerResult,
927
+ testCase,
928
+ tools,
929
+ possibleAnswer,
930
+ restoredCalls,
931
+ flatMessages,
932
+ mwOriginalText,
933
+ text,
934
+ finishReason,
935
+ mwParsedToolCalls,
936
+ caseLogs
937
+ } = options;
938
+ if (checkerResult.valid) {
939
+ caseLogs.push(`[PASS] ${testCase.id}`);
940
+ return { valid: true, logs: caseLogs };
941
+ }
942
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
943
+ logFailureDetails({
944
+ testCase,
945
+ tools,
946
+ possibleAnswer,
947
+ restoredCalls,
948
+ checkerResult,
949
+ flatMessages,
950
+ mwOriginalText,
951
+ text,
952
+ finishReason,
953
+ mwParsedToolCalls,
954
+ caseLogs
955
+ });
956
+ return { valid: false, logs: caseLogs };
957
+ };
958
+ const prepareTestCaseData = (testCase) => {
959
+ const { function: tools, question: messages } = testCase;
960
+ const flatMessages = flattenMessages(messages);
961
+ const { transformedTools, nameMap } = buildTransformedTools(
962
+ tools,
963
+ fixSchema
964
+ );
965
+ const toolsMap = buildToolsMap(transformedTools);
966
+ return { flatMessages, transformedTools, nameMap, toolsMap };
967
+ };
968
+ const processModelResponse = (options) => {
969
+ const {
970
+ testCase,
971
+ toolCalls,
972
+ text,
973
+ finishReason,
974
+ debugSummaryRef,
975
+ nameMap,
976
+ transformedTools,
977
+ flatMessages,
978
+ tools,
979
+ caseLogs
980
+ } = options;
981
+ const mwOriginalText = debugSummaryRef.originalText;
982
+ const mwParsedToolCalls = parseDebugToolCalls(
983
+ debugSummaryRef.toolCalls
984
+ );
985
+ logRawToolCalls({
986
+ toolCalls,
987
+ finishReason,
988
+ text,
989
+ testCaseId: testCase.id,
990
+ caseLogs
991
+ });
992
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
993
+ if (!possibleAnswer) {
994
+ throw new Error(`No possible answer for id: ${testCase.id}`);
995
+ }
996
+ const restoredCalls = restoreToolCalls(
997
+ toolCalls || [],
998
+ nameMap,
999
+ transformedTools
1000
+ );
1001
+ const checkerResult = check(testCase, restoredCalls, possibleAnswer);
1002
+ return processValidationResult({
1003
+ checkerResult,
1004
+ testCase,
1005
+ tools,
1006
+ possibleAnswer,
1007
+ restoredCalls,
1008
+ flatMessages,
1009
+ mwOriginalText,
1010
+ text,
1011
+ finishReason,
1012
+ mwParsedToolCalls,
1013
+ caseLogs
1014
+ });
1015
+ };
1016
+ const runSingleCase = async (testCase) => {
1017
+ const caseLogs = [];
1018
+ const { function: tools } = testCase;
1019
+ const temp = config?.temperature;
1020
+ const temperature = typeof temp === "number" ? temp : void 0;
1021
+ const maxTok = config?.maxTokens;
1022
+ const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
1023
+ try {
1024
+ const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
1025
+ logFirstToolDebug(transformedTools, testCase.id, caseLogs);
1026
+ const { toolCalls, text, finishReason, debugSummaryRef } = await executeModelGeneration({
1027
+ model,
1028
+ flatMessages,
1029
+ toolsMap,
1030
+ temperature,
1031
+ maxTokens
890
1032
  });
891
- const checkerResult = check(
1033
+ return processModelResponse({
892
1034
  testCase,
893
- restoredCalls,
894
- possibleAnswer
895
- );
896
- if (checkerResult.valid) {
897
- caseLogs.push(`[PASS] ${testCase.id}`);
898
- return { valid: true, logs: caseLogs };
899
- } else {
900
- caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
901
- try {
902
- let generateParamMismatchDiff2 = function(paramName, allowed, got) {
903
- const diffLines = [];
904
- diffLines.push(`@@ param ${paramName}`);
905
- const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
906
- const expectedLine = (() => {
907
- if (allowedArray.length === 1) {
908
- return `- expected: ${JSON.stringify(allowedArray[0])}`;
909
- }
910
- const formatted = allowedArray.map(
911
- (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
912
- ).join(", ");
913
- return `- expected one of: ${formatted}`;
914
- })();
915
- diffLines.push(expectedLine);
916
- diffLines.push(`+ got: ${JSON.stringify(got)}`);
917
- return diffLines;
918
- };
919
- var generateParamMismatchDiff = generateParamMismatchDiff2;
920
- const category = testCase.id.split("_")[0];
921
- const diff = [];
922
- const summarizeArgs = (args) => {
923
- if (args == null) return args;
924
- if (typeof args !== "object") return args;
925
- return Object.keys(args).sort().reduce(
926
- (acc, k) => {
927
- acc[k] = args[k];
928
- return acc;
929
- },
930
- {}
931
- );
932
- };
933
- const expected = {};
934
- const actual = {};
935
- if (category === "simple") {
936
- const funcDesc = tools[0];
937
- const gt = possibleAnswer.ground_truth?.[0];
938
- const expectedFuncName = funcDesc?.name;
939
- const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
940
- const received = restoredCalls[0];
941
- const receivedName = received?.toolName ?? received?.name;
942
- const receivedArgs = summarizeArgs(received?.args);
943
- expected.function = expectedFuncName;
944
- expected.params = expectedParams;
945
- actual.function = receivedName;
946
- actual.args = receivedArgs;
947
- if (expectedFuncName !== receivedName) {
948
- diff.push(`@@ function name`);
949
- diff.push(`- ${expectedFuncName}`);
950
- diff.push(`+ ${receivedName}`);
951
- }
952
- if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
953
- const required = funcDesc?.parameters?.required ?? [];
954
- for (const req of required) {
955
- if (!(req in receivedArgs)) {
956
- diff.push(`- missing required param: ${req}`);
957
- }
958
- }
959
- for (const k of Object.keys(
960
- receivedArgs
961
- )) {
962
- if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
963
- diff.push(`+ unexpected param: ${k}`);
964
- }
965
- }
966
- for (const k of Object.keys(
967
- receivedArgs
968
- )) {
969
- if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
970
- const allowed = expectedParams[k];
971
- const got = receivedArgs[k];
972
- const includes = Array.isArray(allowed) && allowed.some((v) => {
973
- try {
974
- if (Array.isArray(got)) {
975
- return JSON.stringify(
976
- got.map((x) => String(x)).sort()
977
- ) === JSON.stringify(
978
- v.map((x) => String(x)).sort()
979
- );
980
- }
981
- } catch {
982
- }
983
- return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
984
- });
985
- if (!includes) {
986
- diff.push(
987
- ...generateParamMismatchDiff2(k, allowed, got)
988
- );
989
- }
990
- }
991
- }
992
- }
993
- } else {
994
- const gtArr = possibleAnswer.ground_truth ?? [];
995
- const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
996
- const actualNames = restoredCalls.map(
997
- (c) => c.toolName ?? c.name
998
- );
999
- expected.functions = expectedNames;
1000
- actual.functions = actualNames;
1001
- if (expectedNames.length !== actualNames.length) {
1002
- diff.push(`@@ call count`);
1003
- diff.push(`- expected ${expectedNames.length}`);
1004
- diff.push(`+ got ${actualNames.length}`);
1005
- }
1006
- const missing = expectedNames.filter(
1007
- (n) => !actualNames.includes(n)
1008
- );
1009
- const extra = actualNames.filter(
1010
- (n) => !expectedNames.includes(n)
1011
- );
1012
- for (const m of missing)
1013
- diff.push(`- missing function: ${m}`);
1014
- for (const e of extra)
1015
- diff.push(`+ unexpected function: ${e}`);
1016
- const usedActual = /* @__PURE__ */ new Set();
1017
- for (const expectedObj of gtArr) {
1018
- const fname = Object.keys(expectedObj)[0];
1019
- let matchedIndex = -1;
1020
- for (let i = 0; i < restoredCalls.length; i++) {
1021
- if (usedActual.has(i)) continue;
1022
- const rc = restoredCalls[i];
1023
- const rcName = rc?.toolName ?? rc?.name;
1024
- if (rcName === fname) {
1025
- matchedIndex = i;
1026
- break;
1027
- }
1028
- }
1029
- if (matchedIndex === -1) continue;
1030
- usedActual.add(matchedIndex);
1031
- const received = restoredCalls[matchedIndex];
1032
- const receivedArgs = summarizeArgs(received?.args);
1033
- const expectedParamsAllowed = expectedObj[fname];
1034
- const funcDesc = tools.find(
1035
- (t) => t.name === fname
1036
- );
1037
- const requiredParams = funcDesc?.parameters?.required ?? [];
1038
- diff.push(`@@ function ${fname}`);
1039
- if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1040
- for (const req of requiredParams) {
1041
- if (!(req in receivedArgs)) {
1042
- diff.push(`- missing required param: ${req}`);
1043
- }
1044
- }
1045
- for (const k of Object.keys(
1046
- receivedArgs
1047
- )) {
1048
- if (!Object.prototype.hasOwnProperty.call(
1049
- expectedParamsAllowed,
1050
- k
1051
- )) {
1052
- diff.push(`+ unexpected param: ${k}`);
1053
- }
1054
- }
1055
- for (const k of Object.keys(
1056
- receivedArgs
1057
- )) {
1058
- if (Object.prototype.hasOwnProperty.call(
1059
- expectedParamsAllowed,
1060
- k
1061
- )) {
1062
- const allowed = expectedParamsAllowed[k];
1063
- const got = receivedArgs[k];
1064
- const includes = Array.isArray(allowed) && allowed.some((v) => {
1065
- try {
1066
- if (Array.isArray(got)) {
1067
- return JSON.stringify(
1068
- got.map((x) => String(x)).sort()
1069
- ) === JSON.stringify(
1070
- v.map((x) => String(x)).sort()
1071
- );
1072
- }
1073
- } catch {
1074
- }
1075
- return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
1076
- });
1077
- if (!includes) {
1078
- diff.push(
1079
- ...generateParamMismatchDiff2(k, allowed, got)
1080
- );
1081
- }
1082
- }
1083
- }
1084
- }
1085
- }
1086
- }
1087
- caseLogs.push(
1088
- `[DEBUG-FAIL] ${JSON.stringify({
1089
- id: testCase.id,
1090
- message: checkerResult.error,
1091
- error_type: checkerResult.error_type,
1092
- expected,
1093
- actual,
1094
- diff
1095
- })}`
1096
- );
1097
- try {
1098
- const lastUser = (() => {
1099
- const reversed = [...flatMessages].reverse();
1100
- const found = reversed.find(
1101
- (m) => m.role === "user"
1102
- );
1103
- return found?.content ?? void 0;
1104
- })();
1105
- const contextPayload = {
1106
- id: testCase.id,
1107
- tool_schema: tools,
1108
- last_user_query: lastUser,
1109
- raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
1110
- finish_reason: finishReason,
1111
- parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1112
- ground_truth: possibleAnswer.ground_truth
1113
- };
1114
- caseLogs.push(
1115
- `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
1116
- );
1117
- } catch {
1118
- }
1119
- } catch {
1120
- caseLogs.push(
1121
- `[DEBUG] ${testCase.id}: failed to build debug diff`
1122
- );
1123
- }
1124
- return { valid: false, logs: caseLogs };
1125
- }
1035
+ toolCalls,
1036
+ text,
1037
+ finishReason,
1038
+ debugSummaryRef,
1039
+ nameMap,
1040
+ transformedTools,
1041
+ flatMessages,
1042
+ tools,
1043
+ caseLogs
1044
+ });
1126
1045
  } catch (e) {
1127
1046
  caseLogs.push(
1128
1047
  `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
@@ -1133,13 +1052,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1133
1052
  return { valid: false, logs: caseLogs };
1134
1053
  }
1135
1054
  };
1136
- const mapWithConcurrency = async (items, limit2, mapper) => {
1055
+ const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
1137
1056
  const results = new Array(items.length);
1138
1057
  let idx = 0;
1139
- const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => {
1058
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1140
1059
  while (true) {
1141
- const current = idx++;
1142
- if (current >= items.length) break;
1060
+ const current = idx;
1061
+ idx += 1;
1062
+ if (current >= items.length) {
1063
+ break;
1064
+ }
1143
1065
  results[current] = await mapper(items[current], current);
1144
1066
  }
1145
1067
  });
@@ -1155,7 +1077,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1155
1077
  (acc, r) => acc + (r.valid ? 1 : 0),
1156
1078
  0
1157
1079
  );
1158
- for (const r of resultsPerCase) logs.push(...r.logs);
1080
+ for (const r of resultsPerCase) {
1081
+ logs.push(...r.logs);
1082
+ }
1159
1083
  if (testCases.length === 0) {
1160
1084
  return {
1161
1085
  score: 0,
@@ -1182,7 +1106,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1182
1106
  success: false,
1183
1107
  metrics: {},
1184
1108
  error: e,
1185
- logs: [`[FATAL] Failed to run benchmark ${name}: ${e.message}`]
1109
+ logs: [
1110
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1111
+ ]
1186
1112
  };
1187
1113
  }
1188
1114
  }
@@ -1191,87 +1117,222 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1191
1117
  var bfclSimpleBenchmark = createBfclBenchmark(
1192
1118
  "bfcl-simple",
1193
1119
  "BFCL Simple Function Calling",
1194
- "BFCL_v3_simple.json",
1195
- "BFCL_v3_simple_possible_answer.json"
1120
+ "BFCL_v3_simple.jsonl",
1121
+ "BFCL_v3_simple_possible_answer.jsonl"
1196
1122
  );
1197
1123
  var bfclParallelBenchmark = createBfclBenchmark(
1198
1124
  "bfcl-parallel",
1199
1125
  "BFCL Parallel Function Calling",
1200
- "BFCL_v3_parallel.json",
1201
- "BFCL_v3_parallel_possible_answer.json"
1126
+ "BFCL_v3_parallel.jsonl",
1127
+ "BFCL_v3_parallel_possible_answer.jsonl"
1202
1128
  );
1203
1129
  var bfclMultipleBenchmark = createBfclBenchmark(
1204
1130
  "bfcl-multiple",
1205
1131
  "BFCL Multiple Function Calling",
1206
- "BFCL_v3_multiple.json",
1207
- "BFCL_v3_multiple_possible_answer.json"
1132
+ "BFCL_v3_multiple.jsonl",
1133
+ "BFCL_v3_multiple_possible_answer.jsonl"
1208
1134
  );
1209
1135
  var bfclParallelMultipleBenchmark = createBfclBenchmark(
1210
1136
  "bfcl-parallel-multiple",
1211
1137
  "BFCL Parallel & Multiple Function Calling",
1212
- "BFCL_v3_parallel_multiple.json",
1213
- "BFCL_v3_parallel_multiple_possible_answer.json"
1138
+ "BFCL_v3_parallel_multiple.jsonl",
1139
+ "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1214
1140
  );
1215
1141
 
1216
1142
  // src/benchmarks/json-generation.ts
1217
- import { generateText as generateText2 } from "ai";
1218
- import Ajv from "ajv";
1219
1143
  import { promises as fs3 } from "fs";
1220
1144
  import path3 from "path";
1221
- function extractFirstJsonBlock(text) {
1145
+ import { generateText as generateText2 } from "ai";
1146
+ import Ajv from "ajv";
1147
+ var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
1148
+ var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
1149
+ var NEWLINE_REGEX = /\r?\n/;
1150
+ var LINE_SPLIT_REGEX2 = /\r?\n/;
1151
/**
 * Attempts to parse the entire input as a JSON document.
 *
 * @param {string} text - Candidate JSON text.
 * @returns {*} The parsed value, or undefined when JSON.parse rejects the text.
 */
function tryDirectParse(text) {
  let value;
  try {
    value = JSON.parse(text);
  } catch {
    // Not valid JSON as a whole; callers treat undefined as "no result".
    value = void 0;
  }
  return value;
}
1158
/**
 * Attempts to parse JSON found inside the first fenced code block of the text.
 * A fence matched by JSON_FENCE_REGEX is preferred over one matched by
 * CODE_FENCE_REGEX.
 *
 * @param {string} text - Model output that may contain a fenced block.
 * @returns {*} The parsed value, or undefined when there is no fence or its
 *   trimmed content is not valid JSON.
 */
function tryCodeFenceParse(text) {
  const match = text.match(JSON_FENCE_REGEX) || text.match(CODE_FENCE_REGEX);
  if (!match) {
    return void 0;
  }
  const body = match[1].trim();
  let parsed;
  try {
    parsed = JSON.parse(body);
  } catch {
    parsed = void 0;
  }
  return parsed;
}
1170
/**
 * Scans the text for the first `{` or `[`, finds the matching closing
 * bracket, and parses that slice as JSON.
 *
 * Brackets that appear inside double-quoted JSON strings (including after
 * backslash escapes) are ignored while counting depth, so a value such as
 * `{"a": "}"}` embedded in prose is extracted in full.
 *
 * @param {string} text - Text that may embed a JSON object or array.
 * @returns {*} The parsed value, or undefined when no bracket is found, the
 *   brackets never balance, or the balanced slice is not valid JSON.
 */
function tryBracketScan(text) {
  const objStart = text.indexOf("{");
  const arrStart = text.indexOf("[");
  const start = [objStart, arrStart].filter((i) => i >= 0).sort((a, b) => a - b)[0];
  if (start === void 0) {
    return;
  }
  const open = text[start];
  const close = open === "{" ? "}" : "]";
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i += 1) {
    const ch = text[i];
    if (inString) {
      // Inside a string literal only an unescaped quote is significant.
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
      continue;
    }
    if (ch === open) {
      depth += 1;
    } else if (ch === close) {
      depth -= 1;
    }
    if (depth === 0) {
      // Only the first balanced candidate is tried.
      try {
        return JSON.parse(text.slice(start, i + 1));
      } catch {
        return;
      }
    }
  }
  return;
}
1198
/**
 * Extracts a JSON value from model output by trying, in order: parsing the
 * whole text, parsing a fenced code block, and scanning for a bracketed span.
 *
 * @param {string} text - Raw model output.
 * @returns {*} The first value a strategy produces, or undefined when none do.
 */
function extractFirstJsonBlock(text) {
  for (const attempt of [tryDirectParse, tryCodeFenceParse]) {
    const value = attempt(text);
    if (value !== void 0) {
      return value;
    }
  }
  // Last resort: its result is returned as-is, undefined included.
  return tryBracketScan(text);
}
1256
1209
  function subsetMatch(expected, actual) {
1257
1210
  if (expected === null || typeof expected !== "object") {
1258
1211
  return expected === actual;
1259
1212
  }
1260
1213
  if (Array.isArray(expected)) {
1261
- if (!Array.isArray(actual)) return false;
1262
- for (let i = 0; i < expected.length; i++) {
1263
- if (!subsetMatch(expected[i], actual[i])) return false;
1214
+ if (!Array.isArray(actual)) {
1215
+ return false;
1216
+ }
1217
+ for (let i = 0; i < expected.length; i += 1) {
1218
+ if (!subsetMatch(expected[i], actual[i])) {
1219
+ return false;
1220
+ }
1264
1221
  }
1265
1222
  return true;
1266
1223
  }
1267
- if (actual === null || typeof actual !== "object") return false;
1224
+ if (actual === null || typeof actual !== "object") {
1225
+ return false;
1226
+ }
1268
1227
  const eObj = expected;
1269
1228
  const aObj = actual;
1270
1229
  for (const key of Object.keys(eObj)) {
1271
- if (!subsetMatch(eObj[key], aObj[key])) return false;
1230
+ if (!subsetMatch(eObj[key], aObj[key])) {
1231
+ return false;
1232
+ }
1272
1233
  }
1273
1234
  return true;
1274
1235
  }
1236
/**
 * Loads the json-generation test cases and their expected records from the
 * data directory returned by resolveDataDir().
 *
 * Never throws: any failure is returned as `error` alongside an empty test
 * list and an empty map.
 *
 * @returns {Promise<{tests: Array, expectedMap: Map, error?: *}>} Parsed test
 *   cases plus expected records keyed by their `id`.
 */
async function loadDatasets() {
  // Parses JSONL text into records, ignoring blank lines.
  const parseJsonl = (raw) => {
    const records = [];
    for (const line of raw.split(NEWLINE_REGEX)) {
      if (line.trim().length > 0) {
        records.push(JSON.parse(line));
      }
    }
    return records;
  };
  try {
    const dataDir = resolveDataDir();
    const testsPath = path3.join(dataDir, "json_generation_tests.jsonl");
    const testsJsonl = await fs3.readFile(testsPath, "utf-8");
    const expectedPath = path3.join(dataDir, "json_generation_expected.jsonl");
    const expectedJsonl = await fs3.readFile(expectedPath, "utf-8");
    const tests = parseJsonl(testsJsonl);
    const expectedMap = /* @__PURE__ */ new Map();
    for (const record of parseJsonl(expectedJsonl)) {
      expectedMap.set(record.id, record);
    }
    return { tests, expectedMap };
  } catch (e) {
    return { tests: [], expectedMap: /* @__PURE__ */ new Map(), error: e };
  }
}
1262
/**
 * Builds the two-message prompt (system + user) for one test case.
 *
 * @param {{schema: *, promptFacts: *}} tc - Test case; `schema` is serialized
 *   with two-space indentation and `promptFacts` is inserted as-is.
 * @returns {Array<{role: string, content: string}>} System and user messages.
 */
function buildMessages(tc) {
  const systemMessage = {
    role: "system",
    content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
  };
  // Sections of the user prompt, joined below with blank lines.
  const sections = [
    "Generate a JSON object that reflects the following facts.",
    "JSON Schema:",
    JSON.stringify(tc.schema, null, 2),
    "Facts:",
    tc.promptFacts,
    "Output must be a single JSON only, with no additional text."
  ];
  const userMessage = { role: "user", content: sections.join("\n\n") };
  return [systemMessage, userMessage];
}
1282
/**
 * Validates parsed model output against the test case schema and compares it
 * with the expected record for the same id.
 *
 * @param {{id: *, schema: *}} tc - Test case.
 * @param {*} parsed - JSON value extracted from the model output.
 * @param {{ajv: *, logs: string[], expectedMap: Map}} context - Validator,
 *   log sink, and expected records keyed by test id.
 * @returns {{valid: *, valuesOk: boolean, parsed: *}} Schema result, value
 *   match result (false when no expected record exists), and the input value.
 */
function validateTestCase(tc, parsed, context) {
  const { ajv, logs, expectedMap } = context;
  const validate = ajv.compile(tc.schema);
  const valid = validate(parsed);
  if (!valid) {
    const details = (validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown";
    logs.push(`[INFO] ${tc.id}: Schema validation errors: ${details}`);
  }
  const expectedRec = expectedMap.get(tc.id);
  let valuesOk = false;
  if (expectedRec) {
    valuesOk = subsetMatch(expectedRec.expected, parsed);
  } else {
    logs.push(
      `[WARN] ${tc.id}: No expected record found. Skipping value match.`
    );
  }
  return { valid, valuesOk, parsed };
}
1299
/**
 * Runs one json-generation test case: prompts the model, extracts JSON from
 * the reply, validates it, and logs a PASS/FAIL line.
 *
 * @param {{id: *, schema: *, promptFacts: *}} tc - Test case.
 * @param {{model: *, config: *, validation: {logs: string[]}}} context -
 *   Model, optional config (a numeric `temperature` is forwarded), and the
 *   validation context passed on to validateTestCase.
 * @returns {Promise<{schemaValid: *, valueMatch: boolean, correct: *}>}
 */
async function processTestCase(tc, context) {
  const request = { model: context.model, messages: buildMessages(tc) };
  const temp = context.config?.temperature;
  if (typeof temp === "number") {
    // Only forward temperature when it was explicitly given as a number.
    request.temperature = temp;
  }
  const { text } = await generateText2(request);
  let parsed;
  try {
    parsed = extractFirstJsonBlock(text);
  } catch {
  }
  if (parsed === void 0) {
    context.validation.logs.push(
      `[FAIL] ${tc.id}: Unable to parse JSON from model output.`
    );
    return { schemaValid: false, valueMatch: false, correct: false };
  }
  const outcome = validateTestCase(tc, parsed, context.validation);
  const { valid, valuesOk } = outcome;
  const correct = valid && valuesOk;
  const message = correct
    ? `[PASS] ${tc.id}`
    : `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
        outcome.parsed
      )}`;
  context.validation.logs.push(message);
  return { schemaValid: valid, valueMatch: valuesOk, correct };
}
1275
1336
  var jsonGenerationBenchmark = {
1276
1337
  name: "json-generation",
1277
1338
  version: "2.1.0",
@@ -1279,116 +1340,124 @@ var jsonGenerationBenchmark = {
1279
1340
  async run(model, config) {
1280
1341
  const logs = [];
1281
1342
  const ajv = new Ajv({ allErrors: true, strict: false });
1282
- let schemaValidCount = 0;
1283
- let valueMatchCount = 0;
1284
- let correctCount = 0;
1285
- let tests = [];
1286
- const expectedMap = /* @__PURE__ */ new Map();
1287
- try {
1288
- const dataDir = resolveDataDir();
1289
- const testsJsonl = await fs3.readFile(
1290
- path3.join(dataDir, "json_generation_tests.jsonl"),
1291
- "utf-8"
1292
- );
1293
- const expectedJsonl = await fs3.readFile(
1294
- path3.join(dataDir, "json_generation_expected.jsonl"),
1295
- "utf-8"
1296
- );
1297
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1298
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1299
- for (const r of expecteds) expectedMap.set(r.id, r);
1300
- } catch (e) {
1301
- const msg = e instanceof Error ? e.message : String(e);
1343
+ const { tests, expectedMap, error } = await loadDatasets();
1344
+ if (error) {
1302
1345
  return {
1303
1346
  score: 0,
1304
1347
  success: false,
1305
1348
  metrics: {},
1306
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
1307
- error: e
1349
+ logs: [
1350
+ `[FATAL] Failed to load json-generation datasets: ${error.message}`
1351
+ ],
1352
+ error
1308
1353
  };
1309
1354
  }
1310
- for (const tc of tests) {
1311
- try {
1312
- const schemaStr = JSON.stringify(tc.schema, null, 2);
1313
- const messages = [
1314
- {
1315
- role: "system",
1316
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1317
- },
1318
- {
1319
- role: "user",
1320
- content: [
1321
- "Generate a JSON object that reflects the following facts.",
1322
- "JSON Schema:",
1323
- schemaStr,
1324
- "Facts:",
1325
- tc.promptFacts,
1326
- "Output must be a single JSON only, with no additional text."
1327
- ].join("\n\n")
1328
- }
1329
- ];
1330
- const temp = config?.temperature;
1331
- const temperature = typeof temp === "number" ? temp : void 0;
1332
- const { text } = await generateText2({
1333
- model,
1334
- messages,
1335
- ...temperature !== void 0 ? { temperature } : {}
1336
- });
1337
- let parsed;
1338
- try {
1339
- parsed = extractFirstJsonBlock(text);
1340
- } catch {
1341
- }
1342
- if (parsed === void 0) {
1343
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
1344
- continue;
1345
- }
1346
- const validate = ajv.compile(tc.schema);
1347
- const valid = validate(parsed);
1348
- if (valid) schemaValidCount++;
1349
- else
1350
- logs.push(
1351
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1352
- );
1353
- const expectedRec = expectedMap.get(tc.id);
1354
- if (!expectedRec) {
1355
- logs.push(
1356
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
1357
- );
1358
- }
1359
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
1360
- if (valuesOk) valueMatchCount++;
1361
- if (valid && valuesOk) {
1362
- correctCount++;
1363
- logs.push(`[PASS] ${tc.id}`);
1364
- } else {
1365
- logs.push(
1366
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
1367
- parsed
1368
- )}`
1369
- );
1370
- }
1371
- } catch (e) {
1372
- const msg = e instanceof Error ? e.message : String(e);
1373
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
1374
- }
1375
- }
1376
- const total = tests.length;
1377
- const score = correctCount / total;
1378
- return {
1379
- score,
1380
- success: score >= 0.8,
1381
- metrics: {
1382
- total_cases: total,
1383
- correct_count: correctCount,
1384
- schema_valid_count: schemaValidCount,
1385
- value_match_count: valueMatchCount,
1386
- accuracy: score
1387
- },
1388
- logs
1355
+ const context = {
1356
+ model,
1357
+ config,
1358
+ validation: { expectedMap, ajv, logs }
1389
1359
  };
1360
+ const counts = await processAllTests(tests, context);
1361
+ return buildBenchmarkResult(tests.length, counts, logs);
1390
1362
  }
1391
1363
  };
1364
/**
 * Runs every test case sequentially and tallies the outcomes. A case that
 * throws is logged as [ERROR] and counts toward none of the tallies.
 *
 * @param {Array} tests - Test cases to run.
 * @param {{validation: {logs: string[]}}} context - Shared run context.
 * @returns {Promise<{schemaValidCount: number, valueMatchCount: number, correctCount: number}>}
 */
async function processAllTests(tests, context) {
  const tally = { schemaValidCount: 0, valueMatchCount: 0, correctCount: 0 };
  for (const tc of tests) {
    try {
      const { schemaValid, valueMatch, correct } = await processTestCase(tc, context);
      tally.schemaValidCount += schemaValid ? 1 : 0;
      tally.valueMatchCount += valueMatch ? 1 : 0;
      tally.correctCount += correct ? 1 : 0;
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      context.validation.logs.push(`[ERROR] ${tc.id}: ${msg}`);
    }
  }
  return tally;
}
1387
/**
 * Builds the final benchmark result from the per-case tallies.
 *
 * @param {number} total - Number of test cases that were run.
 * @param {{correctCount: number, schemaValidCount: number, valueMatchCount: number}} counts
 *   Tallies produced by the test loop.
 * @param {string[]} logs - Log lines to attach to the result.
 * @returns {{score: number, success: boolean, metrics: object, logs: string[]}}
 *   Result whose score is correctCount / total, or 0 when there were no cases.
 */
function buildBenchmarkResult(total, counts, logs) {
  // Guard the empty dataset: 0 / 0 would otherwise yield a NaN score.
  const score = total > 0 ? counts.correctCount / total : 0;
  return {
    score,
    success: score >= 0.8,
    metrics: {
      total_cases: total,
      correct_count: counts.correctCount,
      schema_valid_count: counts.schemaValidCount,
      value_match_count: counts.valueMatchCount,
      accuracy: score
    },
    logs
  };
}
1402
/**
 * Loads the json-generation test cases for the schema-only benchmark.
 *
 * Never throws: any failure is returned as `error` with an empty test list.
 *
 * @returns {Promise<{tests: Array, error?: *}>}
 */
async function loadSchemaOnlyTests() {
  try {
    const filePath = path3.join(resolveDataDir(), "json_generation_tests.jsonl");
    const raw = await fs3.readFile(filePath, "utf-8");
    const tests = [];
    for (const line of raw.split(LINE_SPLIT_REGEX2)) {
      // Blank lines are skipped rather than treated as parse errors.
      if (line.trim().length > 0) {
        tests.push(JSON.parse(line));
      }
    }
    return { tests };
  } catch (e) {
    return { tests: [], error: e };
  }
}
1415
/**
 * Runs one schema-only test case: prompts the model, extracts JSON from the
 * reply, and validates it against the case schema.
 *
 * @param {{id: *, schema: *, promptFacts: *}} tc - Test case.
 * @param {{model: *, config: *, ajv: *, logs: string[]}} context - Model,
 *   optional config (a numeric `temperature` is forwarded), validator, logs.
 * @returns {Promise<boolean>} True when the output parsed and passed the schema.
 */
async function processSchemaOnlyTestCase(tc, context) {
  const request = { model: context.model, messages: buildMessages(tc) };
  const temp = context.config?.temperature;
  if (typeof temp === "number") {
    request.temperature = temp;
  }
  const { text } = await generateText2(request);
  let parsed;
  try {
    parsed = extractFirstJsonBlock(text);
  } catch {
  }
  if (parsed === void 0) {
    context.logs.push(
      `[FAIL] ${tc.id}: Could not parse JSON from model output.`
    );
    return false;
  }
  const validate = context.ajv.compile(tc.schema);
  if (!validate(parsed)) {
    const details = (validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown";
    context.logs.push(`[FAIL] ${tc.id}: Schema validation errors: ${details}`);
    return false;
  }
  context.logs.push(`[PASS] ${tc.id}`);
  return true;
}
1446
/**
 * Runs all schema-only test cases sequentially. A case that throws is logged
 * as [ERROR] and not counted.
 *
 * @param {Array} tests - Test cases to run.
 * @param {{logs: string[]}} context - Shared run context.
 * @returns {Promise<number>} Number of cases whose output passed its schema.
 */
async function runSchemaOnlyTests(tests, context) {
  let passed = 0;
  for (const tc of tests) {
    try {
      const ok = await processSchemaOnlyTestCase(tc, context);
      passed += ok ? 1 : 0;
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      context.logs.push(`[ERROR] ${tc.id}: ${msg}`);
    }
  }
  return passed;
}
1392
1461
  var jsonGenerationSchemaOnlyBenchmark = {
1393
1462
  name: "json-generation-schema-only",
1394
1463
  version: "1.0.1",
@@ -1396,76 +1465,19 @@ var jsonGenerationSchemaOnlyBenchmark = {
1396
1465
  async run(model, config) {
1397
1466
  const logs = [];
1398
1467
  const ajv = new Ajv({ allErrors: true, strict: false });
1399
- let tests = [];
1400
- try {
1401
- const dataDir = resolveDataDir();
1402
- const testsJsonl = await fs3.readFile(
1403
- path3.join(dataDir, "json_generation_tests.jsonl"),
1404
- "utf-8"
1405
- );
1406
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1407
- } catch (e) {
1408
- const msg = e instanceof Error ? e.message : String(e);
1468
+ const { tests, error } = await loadSchemaOnlyTests();
1469
+ if (error) {
1470
+ const msg = error.message;
1409
1471
  return {
1410
1472
  score: 0,
1411
1473
  success: false,
1412
1474
  metrics: {},
1413
1475
  logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
1414
- error: e
1476
+ error
1415
1477
  };
1416
1478
  }
1417
- let schemaValidCount = 0;
1418
- for (const tc of tests) {
1419
- try {
1420
- const schemaStr = JSON.stringify(tc.schema, null, 2);
1421
- const messages = [
1422
- {
1423
- role: "system",
1424
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1425
- },
1426
- {
1427
- role: "user",
1428
- content: [
1429
- "Generate a JSON object that reflects the following facts.",
1430
- "JSON Schema:",
1431
- schemaStr,
1432
- "Facts:",
1433
- tc.promptFacts,
1434
- "Output must be a single JSON only, with no additional text."
1435
- ].join("\n\n")
1436
- }
1437
- ];
1438
- const temp = config?.temperature;
1439
- const temperature = typeof temp === "number" ? temp : void 0;
1440
- const { text } = await generateText2({
1441
- model,
1442
- messages,
1443
- ...temperature !== void 0 ? { temperature } : {}
1444
- });
1445
- let parsed;
1446
- try {
1447
- parsed = extractFirstJsonBlock(text);
1448
- } catch {
1449
- }
1450
- if (parsed === void 0) {
1451
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
1452
- continue;
1453
- }
1454
- const validate = ajv.compile(tc.schema);
1455
- const valid = validate(parsed);
1456
- if (valid) {
1457
- schemaValidCount++;
1458
- logs.push(`[PASS] ${tc.id}`);
1459
- } else {
1460
- logs.push(
1461
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1462
- );
1463
- }
1464
- } catch (e) {
1465
- const msg = e instanceof Error ? e.message : String(e);
1466
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
1467
- }
1468
- }
1479
+ const context = { model, config, ajv, logs };
1480
+ const schemaValidCount = await runSchemaOnlyTests(tests, context);
1469
1481
  const total = tests.length;
1470
1482
  const score = total > 0 ? schemaValidCount / total : 0;
1471
1483
  return {
@@ -1480,6 +1492,505 @@ var jsonGenerationSchemaOnlyBenchmark = {
1480
1492
  };
1481
1493
  }
1482
1494
  };
1495
+
1496
+ // src/reporters/console.ts
1497
// ANSI escape sequences used by the console reporter.
var colors = (() => {
  // Builds the escape sequence "ESC [ <code> m" for a numeric code.
  const sgr = (code) => `\x1B[${code}m`;
  return {
    reset: sgr(0),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    magenta: sgr(35),
    gray: sgr(90)
  };
})();
1506
/**
 * Prints one evaluation result: a header line, status with score, the
 * metrics (if any), and the error message (if any).
 *
 * @param {{model: *, modelKey: *, benchmark: *, result: object}} result -
 *   Evaluation record; `result.result` holds success, score, metrics, error.
 */
function printResult(result) {
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
  let status;
  if (benchmarkResult.success) {
    status = `${colors.green}\u2714 SUCCESS${colors.reset}`;
  } else {
    status = `${colors.red}\u2716 FAILURE${colors.reset}`;
  }
  // The model key is optional; when absent the header omits it.
  const keyLabel = modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : "";
  console.log(
    `\n${colors.cyan}[${model}]${colors.reset}${keyLabel} - ${colors.magenta}${benchmark}${colors.reset}`
  );
  console.log(
    ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
  );
  const metricEntries = Object.entries(benchmarkResult.metrics);
  if (metricEntries.length > 0) {
    console.log(" Metrics:");
    metricEntries.forEach(([key, value]) => {
      console.log(` - ${key}: ${value}`);
    });
  }
  const failure = benchmarkResult.error;
  if (failure) {
    console.log(
      ` ${colors.red}Error: ${failure.message}${colors.reset}`
    );
  }
}
1529
/**
 * Prints the evaluation report: a banner, each result, and a closing rule.
 *
 * @param {Iterable<object>} results - Evaluation records to print.
 */
function consoleReporter(results) {
  const banner = "\n--- \u{1F4CA} Evaluation Report ---";
  const footer = "\n---------------------------\n";
  console.log(banner);
  for (const entry of results) {
    printResult(entry);
  }
  console.log(footer);
}
1536
+
1537
+ // src/reporters/console.debug.ts
1538
// Captures the text between "[FAIL] " and the first colon of a log line.
var FAIL_ID_REGEX = /^\[FAIL\]\s+([^:]+):/;
// Leading "[DEBUG-FAIL] " tag, removed to get at the payload that follows.
var DEBUG_FAIL_PREFIX_REGEX = /^\[DEBUG-FAIL\] /;
// Leading "[DEBUG-FAIL-CONTEXT] " tag, removed to get at the payload.
var DEBUG_FAIL_CONTEXT_PREFIX_REGEX = /^\[DEBUG-FAIL-CONTEXT\] /;
1541
// ANSI escape sequences used by the debug console reporter.
var colors2 = (() => {
  // Builds the escape sequence "ESC [ <code> m" for a numeric code.
  const sgr = (code) => `\x1B[${code}m`;
  return {
    reset: sgr(0),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    magenta: sgr(35),
    gray: sgr(90),
    bold: sgr(1),
    underline: sgr(4)
  };
})();
1552
/**
 * Wraps a diff line in a color chosen by its first character: "+" green,
 * "-" red, "@" bold cyan. Any other line is returned unchanged.
 *
 * @param {string} line - One diff line.
 * @returns {string} The line, colorized when it has a recognized prefix.
 */
function colorizeDiffLine(line) {
  const rules = [
    ["+", colors2.green],
    ["-", colors2.red],
    ["@", `${colors2.cyan}${colors2.bold}`]
  ];
  const rule = rules.find(([marker]) => line.startsWith(marker));
  return rule ? `${rule[1]}${line}${colors2.reset}` : line;
}
1564
/**
 * Removes duplicate entries, keeping the first occurrence of each in order.
 *
 * @param {Iterable<string>} lines - Lines that may contain repeats.
 * @returns {string[]} A new array without duplicates.
 */
function uniqueLines(lines) {
  // A Set iterates in insertion order, so first occurrences keep their place.
  return [.../* @__PURE__ */ new Set(lines)];
}
1576
/**
 * Reports whether any diff entry mentions a function-name problem, i.e.
 * contains "function name" or "missing function:".
 *
 * @param {Array} diff - Diff entries; each is stringified before the check.
 * @returns {boolean}
 */
function hasFunctionNameIssue(diff) {
  for (const entry of diff) {
    const text = String(entry);
    if (text.includes("function name") || text.includes("missing function:")) {
      return true;
    }
  }
  return false;
}
1581
/**
 * Appends suggestions about which function(s) should have been called.
 *
 * @param {*} expected - Expected payload; reads `function` and `functions`.
 * @param {*} actual - Actual payload; reads `function`.
 * @param {string[]} suggestions - Output array, appended to in place.
 */
function suggestFunctionNameFix(expected, actual, suggestions) {
  const wanted = expected?.function;
  const got = actual?.function;
  const namesDiffer = Boolean(wanted) && Boolean(got) && wanted !== got;
  if (namesDiffer) {
    suggestions.push(
      `Call the function '${wanted}' instead of '${got}'.`
    );
  }
  const wantedList = expected?.functions;
  if (Array.isArray(wantedList)) {
    suggestions.push(
      `Ensure tool calls include: ${wantedList.join(", ")}.`
    );
  }
}
1595
/**
 * Appends one suggestion naming every parameter the diff reports as missing.
 *
 * @param {Array} diff - Diff entries; each is stringified before the check.
 * @param {string[]} suggestions - Output array, appended to in place.
 */
function suggestMissingParamFix(diff, suggestions) {
  const names = [];
  for (const entry of diff) {
    const text = String(entry);
    if (text.startsWith("- missing required param:")) {
      names.push(text.replace("- missing required param: ", ""));
    }
  }
  if (names.length) {
    suggestions.push(`Add required parameter(s): ${names.join(", ")}.`);
  }
}
1601
/**
 * Appends one suggestion naming every parameter the diff reports as
 * unexpected.
 *
 * @param {Array} diff - Diff entries; each is stringified before the check.
 * @param {string[]} suggestions - Output array, appended to in place.
 */
function suggestUnexpectedParamFix(diff, suggestions) {
  const names = [];
  for (const entry of diff) {
    const text = String(entry);
    if (text.startsWith("+ unexpected param:")) {
      names.push(text.replace("+ unexpected param: ", ""));
    }
  }
  if (names.length) {
    suggestions.push(`Remove unexpected parameter(s): ${names.join(", ")}.`);
  }
}
1607
/**
 * Appends one suggestion per "@@ param <name>" header found in the diff.
 *
 * For each header the "- expected:" / "- expected one of:" line is taken from
 * the entries that follow it, up to the next "@@" header, so each parameter
 * gets its own expected value. If that section has no such line, the first
 * matching line anywhere in the diff is used instead. With no expected line
 * at all, a generic suggestion is added.
 *
 * @param {Array} diff - Diff entries; each is stringified before inspection.
 * @param {string[]} suggestions - Output array, appended to in place.
 */
function suggestParamValueFix(diff, suggestions) {
  const SINGLE = "- expected:";
  const ONE_OF = "- expected one of:";
  const lines = Array.from(diff, (d) => String(d));
  const isSingle = (l) => l.startsWith(SINGLE);
  const isOneOf = (l) => l.startsWith(ONE_OF);
  // Whole-diff matches, used only when a param's own section has none.
  const fallbackSingle = lines.find(isSingle);
  const fallbackOneOf = lines.find(isOneOf);
  lines.forEach((header, index) => {
    if (!header.startsWith("@@ param ")) {
      return;
    }
    const param = header.replace("@@ param ", "");
    const nextHeader = lines.findIndex((l, i) => i > index && l.startsWith("@@"));
    const section = lines.slice(index + 1, nextHeader === -1 ? lines.length : nextHeader);
    const sectionSingle = section.find(isSingle);
    const sectionOneOf = section.find(isOneOf);
    const hasLocal = sectionSingle !== void 0 || sectionOneOf !== void 0;
    const single = hasLocal ? sectionSingle : fallbackSingle;
    const oneOf = hasLocal ? sectionOneOf : fallbackOneOf;
    if (single) {
      const value = single.replace("- expected: ", "");
      suggestions.push(`Set '${param}' to: ${value}.`);
    } else if (oneOf) {
      const allowed = oneOf.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  });
}
1627
/**
 * Appends a generic suggestion for the first known keyword contained in
 * `error_type`. Keywords are checked in a fixed order; at most one suggestion
 * is added.
 *
 * @param {string} error_type - Error type identifier to inspect.
 * @param {string[]} suggestions - Output array, appended to in place.
 */
function suggestFromErrorType(error_type, suggestions) {
  const hints = [
    ["missing_required", "Add all required parameters defined by the tool schema."],
    ["unexpected_param", "Remove parameters not present in the tool schema."],
    ["wrong_count", "Adjust the number of tool calls to match expected count."],
    ["wrong_func_name", "Use the exact expected function name from the schema."],
    ["value_error", "Choose a value from the allowed options."]
  ];
  const hit = hints.find(([keyword]) => error_type.includes(keyword));
  if (hit) {
    suggestions.push(hit[1]);
  }
}
1642
/**
 * Derives human-readable fix suggestions from a parsed debug-fail payload.
 *
 * When `diff` is an array, suggestions come from its entries. If that yields
 * nothing (or `diff` is not an array) and `error_type` is a string, a generic
 * suggestion based on the error type is used instead.
 *
 * @param {*} parsed - Payload with optional error_type, expected, actual, diff.
 * @returns {string[]} De-duplicated suggestions, possibly empty.
 */
function suggestFixFromDiff(parsed) {
  const suggestions = [];
  const { error_type, expected, actual, diff } = parsed ?? {};
  if (Array.isArray(diff)) {
    if (hasFunctionNameIssue(diff)) {
      suggestFunctionNameFix(expected, actual, suggestions);
    }
    // Each helper below filters the diff itself and adds nothing when no
    // entry matches.
    suggestMissingParamFix(diff, suggestions);
    suggestUnexpectedParamFix(diff, suggestions);
    suggestParamValueFix(diff, suggestions);
  }
  const needsFallback = suggestions.length === 0 && typeof error_type === "string";
  if (needsFallback) {
    suggestFromErrorType(error_type, suggestions);
  }
  return uniqueLines(suggestions);
}
1668
/**
 * Extracts the test id a log line refers to.
 *
 * "[FAIL]" lines carry the id as text before the first colon. "[DEBUG-FAIL]"
 * and "[DEBUG-FAIL-CONTEXT]" lines carry a JSON payload whose `id` is used.
 *
 * @param {string} line - One log line.
 * @returns {string|undefined} The id, or undefined when the line has another
 *   tag, the payload is not valid JSON, or the payload has no id. A missing
 *   id yields undefined rather than an empty string, so callers' `??`
 *   fallbacks apply.
 */
function getTestIdFromLogLine(line) {
  if (line.startsWith("[FAIL]")) {
    return line.match(FAIL_ID_REGEX)?.[1];
  }
  let prefixRegex;
  if (line.startsWith("[DEBUG-FAIL]")) {
    prefixRegex = DEBUG_FAIL_PREFIX_REGEX;
  } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    prefixRegex = DEBUG_FAIL_CONTEXT_PREFIX_REGEX;
  } else {
    return;
  }
  try {
    const id = JSON.parse(line.replace(prefixRegex, ""))?.id;
    if (id === void 0 || id === null || id === "") {
      return;
    }
    return String(id);
  } catch {
    // Malformed payload: treat the line as not tied to any test.
    return;
  }
}
1691
/**
 * Groups log lines by the test id each refers to, preserving line order.
 * Lines without an id go under the "__general__" key.
 *
 * @param {Iterable<string>} failLogs - Log lines to group.
 * @returns {Map<string, string[]>} Lines keyed by test id, in first-seen order.
 */
function groupLogsByTestId(failLogs) {
  const groups = /* @__PURE__ */ new Map();
  for (const line of failLogs) {
    const key = getTestIdFromLogLine(line) ?? "__general__";
    if (!groups.has(key)) {
      groups.set(key, []);
    }
    groups.get(key).push(line);
  }
  return groups;
}
1702
/**
 * Collects the ids of all "[DEBUG-FAIL]" lines whose JSON payload has a
 * truthy `id`. Lines with malformed payloads are ignored.
 *
 * @param {Iterable<string>} lines - Log lines to inspect.
 * @returns {Set<string>} Stringified ids found.
 */
function collectDebugIds(lines) {
  const ids = /* @__PURE__ */ new Set();
  for (const entry of lines) {
    if (!entry.startsWith("[DEBUG-FAIL]")) {
      continue;
    }
    let payload;
    try {
      payload = JSON.parse(entry.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
    } catch {
      continue;
    }
    if (payload?.id) {
      ids.add(String(payload.id));
    }
  }
  return ids;
}
1717
/**
 * Logs `data` as pretty-printed JSON, with `prefix` before the first line and
 * the given color around the whole block.
 *
 * JSON.stringify returns undefined for values it cannot represent (such as
 * undefined itself); those are printed via String(data) instead of throwing.
 *
 * @param {string} prefix - Text placed before the JSON.
 * @param {*} data - Value to print.
 * @param {string} color - Escape sequence written before the prefix.
 */
function printIndentedJson(prefix, data, color) {
  const json = JSON.stringify(data, null, 2) ?? String(data);
  console.log(
    color + prefix + json.split("\n").join("\n ") + colors2.reset
  );
}
1722
/**
 * Prints a "[DEBUG-FAIL]" log line: its message, then either the colorized
 * diff or the expected/actual values, then any suggested fixes. If the
 * payload cannot be processed, the raw line is printed instead.
 *
 * @param {string} line - Log line starting with the "[DEBUG-FAIL] " tag.
 */
function displayDebugFailLine(line) {
  const payload = line.replace(DEBUG_FAIL_PREFIX_REGEX, "");
  try {
    const parsed = JSON.parse(payload);
    const { message, diff, expected, actual } = parsed;
    if (message) {
      console.log(` ${colors2.bold}${message}${colors2.reset}`);
    }
    if (Array.isArray(diff)) {
      for (const entry of diff) {
        console.log(` ${colorizeDiffLine(entry)}`);
      }
    } else {
      console.log(" expected:");
      printIndentedJson(" ", expected, colors2.green);
      console.log(" actual:");
      printIndentedJson(" ", actual, colors2.red);
    }
    const fixes = suggestFixFromDiff(parsed);
    if (fixes.length) {
      console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      for (const fix of fixes) {
        console.log(` \u2022 ${fix}`);
      }
    }
  } catch {
    // Anything that fails above falls back to echoing the raw line.
    console.log(` ${line}`);
  }
}
1751
/**
 * Prints the fields present on a debug-fail context payload, in gray. Only
 * truthy fields are printed.
 *
 * @param {object} ctx - Context payload; reads tool_schema, last_user_query,
 *   raw_model_text, parsed_tool_calls, ground_truth and finish_reason.
 */
function displayContextInfo(ctx) {
  const { gray, reset } = colors2;
  if (ctx.tool_schema) {
    printIndentedJson(" tool schema: ", ctx.tool_schema, gray);
  }
  if (ctx.last_user_query) {
    console.log(`${gray} last user: ${JSON.stringify(ctx.last_user_query)}${reset}`);
  }
  if (ctx.raw_model_text) {
    // Re-join so every continuation line gets the same one-space lead.
    const rawText = String(ctx.raw_model_text).split("\n").join("\n ");
    console.log(`${gray} raw model text (middleware parsed):\n ${rawText}${reset}`);
  }
  if (ctx.parsed_tool_calls) {
    printIndentedJson(" parsed tool calls: ", ctx.parsed_tool_calls, gray);
  }
  if (ctx.ground_truth) {
    printIndentedJson(" ground truth: ", ctx.ground_truth, gray);
  }
  if (ctx.finish_reason) {
    console.log(`${gray} finish reason: ${JSON.stringify(ctx.finish_reason)}${reset}`);
  }
}
1785
/**
 * Prints a "[DEBUG-FAIL-CONTEXT]" log line as a "context:" heading followed
 * by its fields. If the payload cannot be processed, the raw line is printed.
 *
 * @param {string} line - Log line starting with the context tag.
 */
function displayDebugFailContextLine(line) {
  const rawPayload = line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "");
  try {
    const context = JSON.parse(rawPayload);
    const heading = ` ${colors2.gray}context:${colors2.reset}`;
    console.log(heading);
    displayContextInfo(context);
  } catch {
    console.log(` ${line}`);
  }
}
1795
/**
 * Prints one failure-related log line in the style that matches its tag.
 * Lines with an unrecognized tag are ignored.
 *
 * @param {string} line - Log line to print.
 * @param {Set<string>} debugIds - Ids that have a "[DEBUG-FAIL]" line; the
 *   plain "[FAIL]" line for such an id is suppressed.
 */
function displayLogLine(line, debugIds) {
  if (line.startsWith("[FAIL]")) {
    const failId = line.match(FAIL_ID_REGEX)?.[1];
    const coveredByDebug = Boolean(failId) && debugIds.has(failId);
    if (!coveredByDebug) {
      console.log(` ${colors2.red}${line}${colors2.reset}`);
    }
    return;
  }
  if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
    console.log(` ${colors2.yellow}${line}${colors2.reset}`);
    return;
  }
  if (line.startsWith("[STACK]")) {
    console.log(` ${colors2.gray}${line}${colors2.reset}`);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL]")) {
    displayDebugFailLine(line);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    displayDebugFailContextLine(line);
  }
}
1813
/**
 * Prints failure logs group by group. Each group other than "__general__"
 * gets an underlined heading with its id.
 *
 * @param {Iterable<[string, string[]]>} byId - Log lines keyed by test id.
 */
function displayGroupedFailures(byId) {
  console.log(` ${colors2.bold}Failure details (grouped):${colors2.reset}`);
  for (const group of byId) {
    const [groupId, lines] = group;
    const isGeneral = groupId === "__general__";
    if (!isGeneral) {
      console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
    }
    // Debug ids are computed per group, then used to suppress plain FAIL lines.
    const debugIds = collectDebugIds(lines);
    for (const entry of lines) {
      displayLogLine(entry, debugIds);
    }
  }
}
1825
/**
 * Prints the "[INFO]" and "[PASS]" log lines in gray.
 *
 * @param {string[]} logs - All log lines of a result.
 */
function displaySuccessLogs(logs) {
  const shownTags = ["[INFO]", "[PASS]"];
  const selected = logs.filter(
    (entry) => shownTags.some((tag) => entry.startsWith(tag))
  );
  selected.forEach((entry) => {
    console.log(` ${colors2.gray}${entry}${colors2.reset}`);
  });
}
1833
/**
 * Selects the log lines that describe failures, by tag.
 *
 * @param {string[]} logs - All log lines of a result.
 * @returns {string[]} Lines starting with one of the failure-related tags.
 */
function filterFailureLogs(logs) {
  const failureTags = [
    "[FAIL]",
    "[ERROR]",
    "[FATAL]",
    "[STACK]",
    "[DEBUG-FAIL]",
    "[DEBUG-FAIL-CONTEXT]"
  ];
  return logs.filter(
    (entry) => failureTags.some((tag) => entry.startsWith(tag))
  );
}
1838
/**
 * Display the logs of one result.
 *
 * When the logs contain at least one failure-related line, only those lines
 * are shown, grouped via `groupLogsByTestId`. Otherwise the success view
 * (`displaySuccessLogs`) is shown for the full log list.
 *
 * @param {string[]} logs - All log lines recorded for a result.
 * @returns {void}
 */
function displayResultLogs(logs) {
  const failureLines = filterFailureLogs(logs);
  if (failureLines.length === 0) {
    displaySuccessLogs(logs);
    return;
  }
  displayGroupedFailures(groupLogsByTestId(failureLines));
}
1848
/**
 * Print a "Metrics:" heading followed by one `- key: value` line per entry.
 * Prints nothing when there are no entries.
 *
 * @param {Array<[string, unknown]>} metrics - `[key, value]` pairs.
 * @returns {void}
 */
function displayMetrics(metrics) {
  const hasEntries = metrics.length > 0;
  if (!hasEntries) {
    return;
  }
  console.log(" Metrics:");
  for (const pair of metrics) {
    const [label, amount] = pair;
    console.log(` - ${label}: ${amount}`);
  }
}
1856
/**
 * Print the two-line header for one evaluation result: a line with the model,
 * the optional model key and the benchmark name, then a line with the
 * success/failure status and the score formatted to two decimals.
 *
 * @param {{ model: string, modelKey?: string, benchmark: string,
 *   result: { success: boolean, score: number } }} r - Evaluation result.
 * @returns {void}
 */
function displayResultHeader(r) {
  const { model, modelKey, benchmark, result } = r;
  const { green, red, cyan, gray, magenta, yellow, reset } = colors2;

  let status;
  if (result.success) {
    status = `${green}\u2714 SUCCESS${reset}`;
  } else {
    status = `${red}\u2716 FAILURE${reset}`;
  }

  // The model key is only shown when one was provided.
  const keySuffix = modelKey ? ` ${gray}(${modelKey})${reset}` : "";

  console.log(
    `\n${cyan}[${model}]${reset}${keySuffix} - ${magenta}${benchmark}${reset}`
  );
  console.log(
    ` \u2514 ${status} | Score: ${yellow}${result.score.toFixed(2)}${reset}`
  );
}
1867
/**
 * Debug console reporter: prints a banner, then for every result its header,
 * its metrics and, when the result carries a non-empty `logs` list, its logs,
 * and finally a closing rule.
 *
 * @param {Array<{ result: { metrics: object, logs?: string[] } }>} results -
 *   Evaluation results to report.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const entry of results) {
    displayResultHeader(entry);
    displayMetrics(Object.entries(entry.result.metrics));
    const { logs } = entry.result;
    if (logs?.length) {
      displayResultLogs(logs);
    }
  }
  console.log("\n------------------------------------\n");
}
1878
+
1879
+ // src/reporters/json.ts
1880
/**
 * JSON reporter: prints all results as pretty-printed JSON (2-space indent).
 *
 * Each result's `error` is replaced by its `message` (or left undefined when
 * there is no error) before serialisation.
 *
 * @param {Array<{ result: { error?: Error } }>} results - Evaluation results.
 * @returns {void}
 */
function jsonReporter(results) {
  const serializableResults = results.map((entry) => {
    const result = {
      ...entry.result,
      error: entry.result.error?.message
    };
    return { ...entry, result };
  });
  console.log(JSON.stringify(serializableResults, null, 2));
}
1890
+
1891
+ // src/reporters/index.ts
1892
// Registry of available reporters, keyed by the reporter name.
var reporters = {
  "console": consoleReporter,
  "json": jsonReporter,
  "console.debug": consoleDebugReporter
};
1897
+
1898
+ // src/evaluate.ts
1899
/**
 * Run one benchmark against one model and wrap the outcome in an evaluation
 * record. Progress is logged before and after the run.
 *
 * Any error thrown or rejected by `benchmark.run` is logged with
 * `console.error` and converted into a failed result (`score: 0`,
 * `success: false`, empty metrics), so this function does not reject because
 * of a failing benchmark.
 *
 * @param {unknown} model - Model handed to the benchmark; its string
 *   `modelId` property, when present, is used as the display name, otherwise
 *   `"unknown-model"`.
 * @param {{ name: string, run: Function }} benchmark - Benchmark to execute.
 * @param {string | undefined} modelKey - Optional key shown next to the model.
 * @param {object | undefined} config - Passed through to `benchmark.run`.
 * @returns {Promise<{ model: string, modelKey: (string|undefined),
 *   benchmark: string, result: object }>} The evaluation record.
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  const exposesModelId =
    typeof model === "object" &&
    model !== null &&
    "modelId" in model &&
    typeof model.modelId === "string";
  const modelId = exposesModelId ? model.modelId : "unknown-model";
  // Shared "[model] (key)" prefix used by every progress message below.
  const tag = `[${modelId}]${modelKey ? ` (${modelKey})` : ""}`;

  let result;
  try {
    console.log(`${tag} Running benchmark: ${benchmark.name}...`);
    result = await benchmark.run(model, config);
    console.log(
      `${tag} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
    );
  } catch (error) {
    console.error(`${tag} Error running benchmark: ${benchmark.name}`, error);
    const failure = error instanceof Error ? error : new Error(String(error));
    result = {
      score: 0,
      success: false,
      metrics: {},
      error: failure
    };
  }

  return {
    model: modelId,
    modelKey,
    benchmark: benchmark.name,
    result
  };
}
1933
/**
 * Normalise the `models` option into a list of `[key, model]` pairs.
 *
 * - An array yields one pair per element with an undefined key.
 * - A single object that has a `modelId` property yields one pair with an
 *   undefined key.
 * - Anything else is treated as a keyed record and yields one pair per own
 *   enumerable entry, using the property name as the key.
 *
 * @param {unknown[] | object} models - Models in any of the accepted shapes.
 * @returns {Array<[string | undefined, unknown]>} The normalised pairs.
 */
function normalizeModels(models) {
  if (Array.isArray(models)) {
    return Array.from(models, (single) => [undefined, single]);
  }

  const isSingleModel =
    typeof models === "object" && models !== null && "modelId" in models;
  if (isSingleModel) {
    return [[undefined, models]];
  }

  return Object.entries(models).map(([key, single]) => [key, single]);
}
1950
/**
 * Build the optional benchmark config from the provided settings.
 *
 * Only settings that are not `undefined` are included. When neither setting
 * is provided the function returns `undefined` rather than an empty object.
 *
 * @param {number | undefined} temperature - Optional temperature setting.
 * @param {number | undefined} maxTokens - Optional max-tokens setting.
 * @returns {{ temperature?: number, maxTokens?: number } | undefined}
 */
function buildConfig(temperature, maxTokens) {
  const provided = [
    ["temperature", temperature],
    ["maxTokens", maxTokens]
  ].filter(([, value]) => value !== undefined);

  return provided.length > 0 ? Object.fromEntries(provided) : undefined;
}
1960
/**
 * Run the reporter registered under `reporter` on the collected results.
 *
 * If no reporter function is registered under that name, a warning is logged
 * and the `console` reporter is used instead.
 *
 * @param {string} reporter - Name of the reporter to run.
 * @param {Array<object>} results - Evaluation results passed to the reporter.
 * @returns {void}
 */
function executeReporter(reporter, results) {
  // Only accept the registry's own entries: a plain `reporters[reporter]`
  // lookup would also resolve inherited Object.prototype members such as
  // "toString" or "constructor" and call them as if they were reporters,
  // silently producing no report.
  const isRegistered =
    Object.prototype.hasOwnProperty.call(reporters, reporter) &&
    typeof reporters[reporter] === "function";

  if (isRegistered) {
    reporters[reporter](results);
    return;
  }

  console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
  reporters.console(results);
}
1969
/**
 * Run every benchmark against every model, report the results and return them.
 *
 * Runs are awaited one at a time: models are iterated in normalised order and
 * all benchmarks are run for a model before moving to the next one.
 *
 * @param {object} options
 * @param {unknown[] | object} options.models - Models, in any shape accepted
 *   by `normalizeModels`.
 * @param {Iterable<object>} options.benchmarks - Benchmarks to run.
 * @param {string} [options.reporter="console"] - Name of the reporter to use.
 * @param {number} [options.temperature] - Optional temperature forwarded via
 *   the benchmark config.
 * @param {number} [options.maxTokens] - Optional max-tokens value forwarded
 *   via the benchmark config.
 * @returns {Promise<Array<object>>} All evaluation records, in run order.
 */
async function evaluate(options) {
  const { models, benchmarks, temperature, maxTokens } = options;
  const { reporter = "console" } = options;

  const modelEntries = normalizeModels(models);
  const sharedConfig = buildConfig(temperature, maxTokens);

  const allResults = [];
  for (const [modelKey, model] of modelEntries) {
    for (const benchmark of benchmarks) {
      allResults.push(
        await runSingleBenchmark(model, benchmark, modelKey, sharedConfig)
      );
    }
  }

  executeReporter(reporter, allResults);
  return allResults;
}
1483
1994
  export {
1484
1995
  bfclMultipleBenchmark,
1485
1996
  bfclParallelBenchmark,