@ai-sdk-tool/eval 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,877 @@
1
+ "use strict";
2
// Cached references to the Object built-ins used by the bundler interop helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable accessor on `target` for every enumerable key of `all`,
 * using the function stored under that key as the getter.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Copies the own properties of `from` onto `to` as getters that read through to `from`.
 * Keys already owned by `to` and the key named by `except` are skipped.
 * Returns `to`; when `from` is neither an object nor a function nothing is copied.
 */
var __copyProps = (to, from, except, desc) => {
  const copyable = from && typeof from === "object" || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (let key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    // Mirror the source property's enumerability (enumerable when no descriptor is found).
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/**
 * Wraps a required module in a namespace-like object whose prototype is the module's prototype.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

/** Builds a CommonJS exports object flagged with `__esModule` that exposes the properties of `mod`. */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ bfclMultipleBenchmark: () => bfclMultipleBenchmark,
34
+ bfclParallelBenchmark: () => bfclParallelBenchmark,
35
+ bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
36
+ bfclSimpleBenchmark: () => bfclSimpleBenchmark,
37
+ evaluate: () => evaluate,
38
+ jsonGenerationBenchmark: () => jsonGenerationBenchmark,
39
+ jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
40
+ });
41
+ module.exports = __toCommonJS(index_exports);
42
+
43
+ // src/reporters/console.ts
44
// ANSI SGR escape sequences keyed by color name; `reset` restores the default style.
var colors = Object.fromEntries(
  [
    ["reset", 0],
    ["green", 32],
    ["red", 31],
    ["yellow", 33],
    ["cyan", 36],
    ["magenta", 35]
  ].map(([colorName, sgrCode]) => [colorName, `\x1B[${sgrCode}m`])
);
52
/**
 * Prints one evaluation result to the console: a header with model and benchmark,
 * a status/score line, every metric, and the error message when one is present.
 *
 * @param {{model: string, benchmark: string, result: object}} result - entry to print;
 *   `result.result` is read for `success`, `score`, `metrics` and `error`.
 */
function printResult(result) {
  const { model, benchmark } = result;
  const outcome = result.result;

  let status;
  if (outcome.success) {
    status = `${colors.green}\u2714 SUCCESS${colors.reset}`;
  } else {
    status = `${colors.red}\u2716 FAILURE${colors.reset}`;
  }

  const header = `\n${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`;
  console.log(header);

  const scoreLine = ` \u2514 ${status} | Score: ${colors.yellow}${outcome.score.toFixed(2)}${colors.reset}`;
  console.log(scoreLine);

  const metricEntries = Object.entries(outcome.metrics);
  if (metricEntries.length > 0) {
    console.log(" Metrics:");
    metricEntries.forEach(([metricName, metricValue]) => {
      console.log(` - ${metricName}: ${metricValue}`);
    });
  }

  if (!outcome.error) {
    return;
  }
  console.log(
    ` ${colors.red}Error: ${outcome.error.message}${colors.reset}`
  );
}
75
/**
 * Console reporter: writes a banner, then one `printResult` section per entry
 * of `results` (in iteration order), then a closing rule.
 *
 * @param {Iterable<object>} results - evaluation results to print.
 */
function consoleReporter(results) {
  const banner = "\n--- \u{1F4CA} Evaluation Report ---";
  const footer = "\n---------------------------\n";

  console.log(banner);
  for (const entry of results) {
    printResult(entry);
  }
  console.log(footer);
}
82
+
83
+ // src/reporters/json.ts
84
/**
 * JSON reporter: prints all results as a 2-space-indented JSON array.
 * Each entry's `result.error` is replaced by its `message` so it survives
 * serialization; entries without an error lose the key in the output.
 *
 * @param {Array<object>} results - evaluation results to serialize.
 */
function jsonReporter(results) {
  const toSerializable = (entry) => ({
    ...entry,
    result: {
      ...entry.result,
      error: entry.result.error?.message
    }
  });
  const payload = JSON.stringify(results.map((entry) => toSerializable(entry)), null, 2);
  console.log(payload);
}
94
+
95
+ // src/reporters/index.ts
96
// Registry mapping a reporter name (as passed to `evaluate`) to its implementation.
var reporters = {};
reporters.console = consoleReporter;
reporters.json = jsonReporter;
100
+
101
+ // src/evaluate.ts
102
/**
 * Runs one benchmark against one model and never rejects because of the benchmark itself:
 * a throw/rejection from `benchmark.run` is logged and converted into a failed result.
 *
 * @param {unknown} model - passed through to `benchmark.run`; its string `modelId`
 *   (when present) labels the result, otherwise "unknown-model" is used.
 * @param {{name: string, run: Function}} benchmark - benchmark to execute.
 * @returns {Promise<{model: string, benchmark: string, result: object}>}
 */
async function runSingleBenchmark(model, benchmark) {
  let modelId = "unknown-model";
  const hasStringId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string";
  if (hasStringId) {
    modelId = model.modelId;
  }

  // Wraps a benchmark outcome in the common result envelope.
  const envelope = (outcome) => ({
    model: modelId,
    benchmark: benchmark.name,
    result: outcome
  });

  try {
    console.log(`[${modelId}] Running benchmark: ${benchmark.name}...`);
    const outcome = await benchmark.run(model);
    console.log(
      `[${modelId}] Finished benchmark: ${outcome && benchmark.name}. Score: ${outcome.score}`.replace(
        `${outcome && benchmark.name}`,
        () => `${benchmark.name}`
      )
    );
    return envelope(outcome);
  } catch (failure) {
    console.error(
      `[${modelId}] Error running benchmark: ${benchmark.name}`,
      failure
    );
    const asError = failure instanceof Error ? failure : new Error(String(failure));
    return envelope({
      score: 0,
      success: false,
      metrics: {},
      error: asError
    });
  }
}
132
/**
 * Runs every benchmark against every model and reports the collected results.
 *
 * Benchmarks are awaited one at a time, model by model, so results appear in
 * `models` x `benchmarks` order.
 *
 * @param {object} options
 * @param {unknown|unknown[]} options.models - a single model or an array of models.
 * @param {Iterable<object>} options.benchmarks - benchmarks to run for each model.
 * @param {string} [options.reporter="console"] - key of the reporter in `reporters`;
 *   an unknown key produces a warning and falls back to the console reporter.
 * @returns {Promise<object[]>} one result entry per (model, benchmark) pair.
 */
async function evaluate(options) {
  const { models, benchmarks, reporter = "console" } = options;
  const modelsArray = Array.isArray(models) ? models : [models];
  const allResults = [];
  for (const model of modelsArray) {
    for (const benchmark of benchmarks) {
      const evaluationResult = await runSingleBenchmark(model, benchmark);
      allResults.push(evaluationResult);
    }
  }
  // Only accept reporters registered directly on the table. A plain `reporters[reporter]`
  // lookup would also resolve inherited members such as "constructor" or "toString"
  // and invoke them as if they were reporters.
  const isRegistered = typeof reporter === "string" && Object.prototype.hasOwnProperty.call(reporters, reporter) && typeof reporters[reporter] === "function";
  if (isRegistered) {
    reporters[reporter](allResults);
  } else {
    console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
    reporters.console(allResults);
  }
  return allResults;
}
151
+
152
+ // src/benchmarks/json-generation.ts
153
+ var import_ai = require("ai");
154
+ var import_ajv = __toESM(require("ajv"), 1);
155
+ var import_fs2 = require("fs");
156
+ var import_path2 = __toESM(require("path"), 1);
157
+
158
+ // src/utils/paths.ts
159
+ var import_fs = __toESM(require("fs"), 1);
160
+ var import_path = __toESM(require("path"), 1);
161
+ var import_url = require("url");
162
+ var import_module = require("module");
163
/**
 * Locates the directory holding the benchmark data files.
 *
 * Resolution order, first hit wins:
 *  1. `BFCL_DATA_DIR` when it contains non-whitespace characters (returned as-is).
 *  2. `<parent of the resolved "@ai-sdk-tool/eval" entry file's directory>/data`, if it exists.
 *  3. `<directory of "@ai-sdk-tool/eval/package.json">/data`, if it exists.
 *  4. The first existing `data` directory found walking up at most six levels from
 *     the start directory (the directory of `fromModuleUrl`, or the current working directory).
 *  5. `<start directory>/../../data`, returned without checking that it exists.
 *
 * @param {string} [fromModuleUrl] - module URL used as the resolution base
 *   (presumably an `import.meta.url`-style file URL — confirm with callers).
 * @returns {string} path of the data directory.
 */
function resolveDataDir(fromModuleUrl) {
  const envDir = process.env.BFCL_DATA_DIR;
  if (envDir && envDir.trim().length > 0) {
    return envDir;
  }

  // Base handed to createRequire; evaluated inside each try so failures stay swallowed.
  const requireBase = () => typeof fromModuleUrl === "string" && fromModuleUrl || import_path.default.join(process.cwd(), "package.json");

  try {
    const requireFromEntry = (0, import_module.createRequire)(requireBase());
    const entryFile = requireFromEntry.resolve("@ai-sdk-tool/eval");
    const entryDir = import_path.default.dirname(entryFile);
    let packageRootGuess = entryDir;
    if (import_fs.default.existsSync(import_path.default.join(entryDir, ".."))) {
      packageRootGuess = import_path.default.resolve(entryDir, "..");
    }
    const candidate = import_path.default.join(packageRootGuess, "data");
    if (import_fs.default.existsSync(candidate)) {
      return candidate;
    }
  } catch {
    // Best effort: fall through to the next strategy.
  }

  try {
    const requireFromBase = (0, import_module.createRequire)(requireBase());
    const manifestPath = requireFromBase.resolve("@ai-sdk-tool/eval/package.json");
    const candidate = import_path.default.join(import_path.default.dirname(manifestPath), "data");
    if (import_fs.default.existsSync(candidate)) {
      return candidate;
    }
  } catch {
    // Best effort: fall through to the directory walk.
  }

  let startDir;
  if (!fromModuleUrl) {
    startDir = process.cwd();
  } else {
    try {
      startDir = import_path.default.dirname((0, import_url.fileURLToPath)(fromModuleUrl));
    } catch {
      startDir = process.cwd();
    }
  }

  let current = startDir;
  let levelsLeft = 6;
  while (levelsLeft-- > 0) {
    const candidate = import_path.default.join(current, "data");
    if (import_fs.default.existsSync(candidate)) {
      return candidate;
    }
    const parent = import_path.default.resolve(current, "..");
    if (parent === current) {
      break;
    }
    current = parent;
  }

  return import_path.default.join(import_path.default.resolve(startDir, "..", ".."), "data");
}
209
+
210
+ // src/benchmarks/json-generation.ts
211
/**
 * Extracts the first parseable JSON value from free-form model output.
 *
 * Strategies, in order:
 *  1. Parse the whole input.
 *  2. Parse the body of the first ```json fence (or, failing that, the first bare ``` fence).
 *  3. Scan for a balanced `{...}` / `[...]` span and parse it. The scan skips brackets
 *     inside JSON string literals (honouring backslash escapes), and if a candidate
 *     span is not valid JSON it moves on to the next opening bracket instead of giving up.
 *
 * @param {string} text - raw model output.
 * @returns {unknown} the parsed value, or `undefined` when nothing parseable is found
 *   (including when `text` is not a string and cannot be parsed directly).
 */
function extractFirstJsonBlock(text) {
  try {
    return JSON.parse(text);
  } catch {
    // Not pure JSON; try the more lenient strategies below.
  }
  if (typeof text !== "string") return void 0;

  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
  if (fenceMatch) {
    try {
      return JSON.parse(fenceMatch[1].trim());
    } catch {
      // Fence content was not JSON; fall back to bracket scanning.
    }
  }

  // Returns the index of the bracket that closes the one at `start`, or -1 if it never closes.
  const findBalancedEnd = (start) => {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === "{" || ch === "[") {
        depth++;
      } else if (ch === "}" || ch === "]") {
        depth--;
        if (depth === 0) return i;
      }
    }
    return -1;
  };

  for (let start = 0; start < text.length; start++) {
    const ch = text[start];
    if (ch !== "{" && ch !== "[") continue;
    const end = findBalancedEnd(start);
    if (end === -1) continue;
    try {
      return JSON.parse(text.slice(start, end + 1));
    } catch {
      // This span was not JSON (e.g. a "{placeholder}" in prose); try the next opener.
    }
  }
  return void 0;
}
246
/**
 * Recursively checks that `actual` contains everything `expected` specifies.
 *
 * - Primitives and `null` in `expected` must be strictly equal to `actual`.
 * - An expected array requires `actual` to be an array whose element at each
 *   expected index matches; extra trailing elements in `actual` are allowed.
 * - An expected object requires `actual` to be a non-null object whose value for
 *   each expected key matches; extra keys in `actual` are allowed.
 *
 * @param {unknown} expected - the required shape/values.
 * @param {unknown} actual - the value under test.
 * @returns {boolean}
 */
function subsetMatch(expected, actual) {
  const isComposite = (value) => value !== null && typeof value === "object";

  if (!isComposite(expected)) {
    return expected === actual;
  }

  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) {
      return false;
    }
    // keys() visits every index, including holes, like a plain index loop.
    for (const index of expected.keys()) {
      if (!subsetMatch(expected[index], actual[index])) {
        return false;
      }
    }
    return true;
  }

  if (!isComposite(actual)) {
    return false;
  }
  return Object.keys(expected).every((key) => subsetMatch(expected[key], actual[key]));
}
265
/**
 * Benchmark: schema-compliant JSON generation.
 *
 * For every test case in `json_generation_tests.jsonl` the model is prompted with the
 * case's JSON Schema and facts. A case counts as correct when the parsed output both
 * validates against the schema (Ajv) and subset-matches the expected record with the
 * same `id` from `json_generation_expected.jsonl`.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * @param {unknown} model - language model handed to `generateText`.
   * @returns {Promise<object>} `{ score, success, metrics, logs }`, plus `error` when the
   *   datasets cannot be loaded. `score` is the fraction of correct cases (0 when there
   *   are no cases) and `success` is `score >= 0.8`.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();

    // One JSON document per non-blank line.
    const parseJsonl = (raw) => raw.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));

    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await import_fs2.promises.readFile(
        import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await import_fs2.promises.readFile(
        import_path2.default.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      tests = parseJsonl(testsJsonl);
      for (const record of parseJsonl(expectedJsonl)) expectedMap.set(record.id, record);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        // Reporters read `error.message`, so always hand them a real Error.
        error: e instanceof Error ? e : new Error(msg)
      };
    }

    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        const { text } = await (0, import_ai.generateText)({ model, messages });

        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Treated as "nothing parseable" below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) {
          schemaValidCount++;
        } else {
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((err) => `${err.instancePath} ${err.message}`).join(", ") || "unknown"}`
          );
        }

        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // A case without an expected record can never count as a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;

        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }

    const total = tests.length;
    // Guard the empty dataset: 0 / 0 would otherwise produce a NaN score.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
376
/**
 * Benchmark: structure-only JSON generation.
 *
 * Prompts the model with each case from `json_generation_tests.jsonl` and counts a case
 * as passed when the parsed output validates against the case's JSON Schema (Ajv).
 * Expected values are not compared.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * @param {unknown} model - language model handed to `generateText`.
   * @returns {Promise<object>} `{ score, success, metrics, logs }`, plus `error` when the
   *   test file cannot be loaded. `success` is `score >= 0.8`.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });

    const describeFailure = (thrown) => thrown instanceof Error ? thrown.message : String(thrown);

    // Builds the two-message prompt (system rules + schema and facts) for one case.
    const buildMessages = (testCase) => {
      const schemaStr = JSON.stringify(testCase.schema, null, 2);
      const userSections = [
        "Generate a JSON object that reflects the following facts.",
        "JSON Schema:",
        schemaStr,
        "Facts:",
        testCase.promptFacts,
        "Output must be a single JSON only, with no additional text."
      ];
      return [
        {
          role: "system",
          content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
        },
        {
          role: "user",
          content: userSections.join("\n\n")
        }
      ];
    };

    let tests = [];
    try {
      const testsFile = import_path2.default.join(resolveDataDir(), "json_generation_tests.jsonl");
      const testsJsonl = await import_fs2.promises.readFile(testsFile, "utf-8");
      const nonBlankLines = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0);
      tests = nonBlankLines.map((line) => JSON.parse(line));
    } catch (e) {
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${describeFailure(e)}`],
        error: e
      };
    }

    let schemaValidCount = 0;
    for (const tc of tests) {
      try {
        const messages = buildMessages(tc);
        const { text } = await (0, import_ai.generateText)({ model, messages });

        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Leave `parsed` undefined; reported as a parse failure below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(tc.schema);
        if (validate(parsed)) {
          schemaValidCount++;
          logs.push(`[PASS] ${tc.id}`);
          continue;
        }
        const details = (validate.errors || []).map((issue) => `${issue.instancePath} ${issue.message}`).join(", ") || "unknown";
        logs.push(
          `[FAIL] ${tc.id}: Schema validation errors: ${details}`
        );
      } catch (e) {
        logs.push(`[ERROR] ${tc.id}: ${describeFailure(e)}`);
      }
    }

    const total = tests.length;
    const score = total > 0 ? schemaValidCount / total : 0;
    const metrics = {
      total_cases: total,
      schema_valid_count: schemaValidCount,
      accuracy: score
    };
    return { score, success: score >= 0.8, metrics, logs };
  }
};
461
+
462
+ // src/benchmarks/bfcl.ts
463
+ var import_ai2 = require("ai");
464
+ var import_fs3 = require("fs");
465
+ var import_path3 = __toESM(require("path"), 1);
466
+
467
+ // src/benchmarks/bfcl/ast-checker.ts
468
/**
 * Normalizes a string for lenient answer comparison: removes spaces and the
 * punctuation characters `, . / \ - _ * ^`, lower-cases the rest, and turns
 * single quotes into double quotes. Non-string input is returned unchanged.
 *
 * @param {unknown} input - value to normalize.
 * @returns {unknown} the normalized string, or `input` itself when it is not a string.
 */
function standardizeString(input) {
  if (typeof input !== "string") return input;
  // The hyphen is escaped on purpose: written as `\\-_` it formed a character range
  // from "\" to "_" that never matched "-" and matched "]" by accident.
  const regex = /[ ,./\\\-_*^]/g;
  return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
}
473
/**
 * Checks a string argument against the accepted answers after normalizing both
 * sides with `standardizeString`.
 *
 * @param {string} param - parameter name, used only in the error message.
 * @param {string} modelValue - value produced by the model.
 * @param {Array} possibleAnswers - accepted values for this parameter.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkStringValue(param, modelValue, possibleAnswers) {
  const normalizedValue = standardizeString(modelValue);
  const normalizedAnswers = possibleAnswers.map((candidate) => standardizeString(candidate));

  if (normalizedAnswers.includes(normalizedValue)) {
    return { valid: true };
  }

  const acceptedList = possibleAnswers.join(", ");
  return {
    valid: false,
    error: `Invalid value for parameter '${param}': '${modelValue}'. Expected one of ${acceptedList}.`,
    error_type: "value_error:string"
  };
}
487
/**
 * Validates a single model tool call against a function description and its ground truth.
 *
 * Checks, in order: function name, argument container, presence of required parameters,
 * each supplied argument (must be declared and have an accepted value), and finally that
 * every ground-truth parameter the model omitted lists `""` among its accepted values
 * (the marker for "may be omitted").
 *
 * @param {{name: string, parameters: {properties?: object, required?: string[]}}} funcDescription
 * @param {{toolName: string, args: object}} modelToolCall
 * @param {object} possibleAnswer - single-key object `{ funcName: { param: acceptedValues[] } }`;
 *   only the first key's value is used.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
  const modelArgs = modelToolCall.args;
  const modelFuncName = modelToolCall.toolName;
  const expectedFuncName = funcDescription.name;
  // Schemas may omit `properties` / `required`; treat both as empty instead of throwing.
  const expectedParams = funcDescription.parameters.properties ?? {};
  const requiredParams = funcDescription.parameters.required ?? [];
  if (modelFuncName !== expectedFuncName) {
    return {
      valid: false,
      error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
      error_type: "simple_function_checker:wrong_func_name"
    };
  }
  // The `in` operator below throws on primitives (e.g. an argument string that failed
  // to parse), so reject those explicitly.
  if (modelArgs === null || typeof modelArgs !== "object") {
    return {
      valid: false,
      error: `Arguments for '${modelFuncName}' are not an object.`,
      error_type: "simple_function_checker:invalid_args"
    };
  }
  const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
  for (const param of requiredParams) {
    if (!(param in modelArgs)) {
      return {
        valid: false,
        error: `Missing required parameter: '${param}'.`,
        error_type: "simple_function_checker:missing_required"
      };
    }
  }

  // Order-insensitive fingerprint of a list: elements stringified, normalized, sorted.
  // String(v) (rather than v.toString()) keeps null/undefined elements from throwing.
  const listFingerprint = (list) => JSON.stringify(
    list.map((v) => standardizeString(String(v))).sort()
  );

  for (const paramName in modelArgs) {
    const modelValue = modelArgs[paramName];
    if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
      return {
        valid: false,
        error: `Unexpected parameter: '${paramName}'.`,
        error_type: "simple_function_checker:unexpected_param"
      };
    }
    const possibleValues = possibleAnswerParams[paramName];
    if (typeof modelValue === "string") {
      const result = checkStringValue(paramName, modelValue, possibleValues);
      if (!result.valid) return result;
    } else if (Array.isArray(modelValue)) {
      const modelValueStr = listFingerprint(modelValue);
      // Accepted values can mix lists with the "" optional marker; only lists can match.
      const hasMatch = possibleValues.some(
        (p) => Array.isArray(p) && listFingerprint(p) === modelValueStr
      );
      if (!hasMatch) {
        return {
          valid: false,
          error: `Invalid value for list parameter '${paramName}'.`,
          error_type: "value_error:list"
        };
      }
    } else {
      // NOTE(review): object-valued arguments are compared by identity here, so they
      // only match when the very same reference appears in possibleValues.
      if (!possibleValues.includes(modelValue)) {
        return {
          valid: false,
          error: `Invalid value for parameter '${paramName}': got '${modelValue}', expected one of '${possibleValues}'.`,
          error_type: "value_error:other"
        };
      }
    }
  }
  for (const paramName in possibleAnswerParams) {
    if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
      return {
        valid: false,
        error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
        error_type: "simple_function_checker:missing_optional"
      };
    }
  }
  return { valid: true };
}
560
/**
 * Validates a set of parallel tool calls irrespective of their order.
 *
 * The call count must equal the number of ground-truth answers. Each answer, taken in
 * order, claims the first not-yet-claimed model call that `simpleFunctionChecker` accepts;
 * the check fails as soon as an answer has no description or no acceptable call.
 *
 * @param {Array<{name: string}>} funcDescriptions - available function descriptions.
 * @param {Array<object>} modelToolCalls - calls produced by the model.
 * @param {Array<object>} possibleAnswers - single-key ground-truth objects.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
  const expectedCount = possibleAnswers.length;
  const actualCount = modelToolCalls.length;
  if (actualCount !== expectedCount) {
    return {
      valid: false,
      error: `Wrong number of functions. Expected ${possibleAnswers.length}, got ${modelToolCalls.length}.`,
      error_type: "parallel_function_checker_no_order:wrong_count"
    };
  }

  const claimed = /* @__PURE__ */ new Set();
  for (const answer of possibleAnswers) {
    const wantedName = Object.keys(answer)[0];
    const description = funcDescriptions.find((candidate) => candidate.name === wantedName);
    if (!description) {
      return {
        valid: false,
        error: `Could not find function description for '${wantedName}'.`,
        error_type: "parallel_function_checker_no_order:missing_func_desc"
      };
    }

    // First unclaimed call that satisfies this answer, or -1.
    const matchIndex = modelToolCalls.findIndex(
      (call, index) => !claimed.has(index) && simpleFunctionChecker(description, call, answer).valid
    );
    if (matchIndex === -1) {
      return {
        valid: false,
        error: `Could not find a matching function call for '${wantedName}'.`,
        error_type: "parallel_function_checker_no_order:cannot_find_match"
      };
    }
    claimed.add(matchIndex);
  }
  return { valid: true };
}
605
/**
 * Validates the "multiple" category: several functions are offered, and the first
 * model call is checked against the first ground-truth answer.
 *
 * The call count must equal the answer count, and the function named by the first
 * answer must exist in `funcDescriptions`.
 *
 * @param {Array<{name: string}>} funcDescriptions - available function descriptions.
 * @param {Array<object>} modelToolCalls - calls produced by the model.
 * @param {Array<object>} possibleAnswers - single-key ground-truth objects.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswers) {
  const countsAgree = modelToolCalls.length === possibleAnswers.length;
  if (!countsAgree) {
    return {
      valid: false,
      error: `Wrong number of functions. Expected ${possibleAnswers.length}, got ${modelToolCalls.length}.`,
      error_type: "multiple_function_checker:wrong_count"
    };
  }

  const firstAnswer = possibleAnswers[0];
  const wantedName = Object.keys(firstAnswer)[0];
  const description = funcDescriptions.find((candidate) => candidate.name === wantedName);
  if (description) {
    return simpleFunctionChecker(description, modelToolCalls[0], possibleAnswers[0]);
  }
  return {
    valid: false,
    error: `Could not find function description for '${wantedName}'.`,
    error_type: "multiple_function_checker:missing_func_desc"
  };
}
630
+
631
+ // src/benchmarks/bfcl.ts
632
/**
 * Dispatches a test case to the checker for its category, where the category is the
 * part of `testCase.id` before the first underscore.
 *
 * - "simple": exactly one call, checked with `simpleFunctionChecker`.
 * - "parallel", or any category containing "parallel-multiple": `parallelFunctionCheckerNoOrder`.
 * - "multiple": `multipleFunctionChecker`.
 * - anything else: reported as valid without checking.
 *
 * Exceptions thrown by a checker are converted into an invalid result.
 *
 * @param {{id: string, function: Array<object>}} testCase
 * @param {Array<object>} modelOutput - tool calls produced by the model.
 * @param {{ground_truth: Array<object>}} possibleAnswer
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function check(testCase, modelOutput, possibleAnswer) {
  const [category] = testCase.id.split("_");
  try {
    switch (category) {
      case "simple": {
        const hasSingleCall = Boolean(modelOutput) && modelOutput.length === 1;
        if (!hasSingleCall) {
          return {
            valid: false,
            error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`
          };
        }
        return simpleFunctionChecker(
          testCase.function[0],
          modelOutput[0],
          possibleAnswer.ground_truth[0]
        );
      }
      case "parallel":
        return parallelFunctionCheckerNoOrder(testCase.function, modelOutput, possibleAnswer.ground_truth);
      case "multiple":
        return multipleFunctionChecker(testCase.function, modelOutput, possibleAnswer.ground_truth);
      default:
        if (category.includes("parallel-multiple")) {
          return parallelFunctionCheckerNoOrder(testCase.function, modelOutput, possibleAnswer.ground_truth);
        }
        return { valid: true };
    }
  } catch (e) {
    return { valid: false, error: `Checker Error: ${e.message}` };
  }
}
671
/**
 * Returns a JSON-Schema-flavoured copy of a BFCL parameter schema without touching the input:
 * `type: "dict"` becomes `"object"`, `type: "integer"` / `"float"` become `"number"`, and the
 * conversion recurses into `properties` and `items`. Non-object input is returned unchanged.
 */
function fixBfclSchema(schema) {
  if (!schema || typeof schema !== "object") return schema;
  if (Array.isArray(schema)) return schema.map((v) => fixBfclSchema(v));
  const copy = { ...schema };
  if (copy.type === "dict") {
    copy.type = "object";
  } else if (copy.type === "integer" || copy.type === "float") {
    copy.type = "number";
  }
  if (copy.properties && typeof copy.properties === "object") {
    // Build a fresh container: the spread above is shallow, so writing into
    // `copy.properties` would modify the caller's (test case's) schema.
    copy.properties = Array.isArray(copy.properties) ? copy.properties.map((v) => fixBfclSchema(v)) : Object.fromEntries(
      Object.entries(copy.properties).map(([k, v]) => [k, fixBfclSchema(v)])
    );
  }
  if (copy.items) copy.items = fixBfclSchema(copy.items);
  return copy;
}

/**
 * Creates a BFCL function-calling benchmark backed by two JSONL files in the data directory.
 *
 * @param {string} name - benchmark name, also used in the fatal log message.
 * @param {string} description - human-readable description.
 * @param {string} testDataFile - file with one test case per line (`id`, `function`, `question`).
 * @param {string} answerDataFile - file with one ground-truth record per line, matched by `id`.
 * @returns {{name: string, version: string, description: string, run: Function}}
 *   `run(model)` resolves to `{ score, success, metrics, logs }` (plus `error` on a fatal
 *   failure); `score` is the fraction of passing cases and `success` is `score > 0.95`.
 *   Setting `BFCL_LIMIT` to a positive number restricts the run to the first N cases.
 */
function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
  return {
    name,
    version: "1.0.0",
    description,
    async run(model) {
      const logs = [];
      let correctCount = 0;
      let testCases = [];
      // One JSON document per non-blank line.
      const parseJsonl = (raw) => raw.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      try {
        const dataPath = resolveDataDir();
        logs.push(`[INFO] Using data dir: ${dataPath}`);
        const testCasesJson = await import_fs3.promises.readFile(
          import_path3.default.join(dataPath, testDataFile),
          "utf-8"
        );
        const possibleAnswersJson = await import_fs3.promises.readFile(
          import_path3.default.join(dataPath, answerDataFile),
          "utf-8"
        );
        testCases = parseJsonl(testCasesJson);
        const possibleAnswers = parseJsonl(possibleAnswersJson);
        const possibleAnswersMap = new Map(
          possibleAnswers.map((ans) => [ans.id, ans])
        );
        const limitEnv = process.env.BFCL_LIMIT;
        const limit = limitEnv ? Number(limitEnv) : void 0;
        if (limit && Number.isFinite(limit) && limit > 0) {
          testCases = testCases.slice(0, limit);
          logs.push(
            `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
          );
        }
        for (const testCase of testCases) {
          const { function: tools, question: messages } = testCase;
          try {
            // Questions may arrive as a list of message lists; flatten one level.
            const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
            // Maps the sanitized tool name sent to the model back to the dataset's name.
            const nameMap = /* @__PURE__ */ new Map();
            const sanitizeName = (name2) => {
              const s = name2.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
              return s.length > 0 ? s : "tool";
            };
            const transformedTools = tools.map((t) => {
              const fixed = fixBfclSchema(t.parameters);
              const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
              const sanitized = sanitizeName(t.name);
              nameMap.set(sanitized, t.name);
              return {
                type: "function",
                name: sanitized,
                description: t.description,
                // Mark as JSON schema explicitly to prevent Zod parsing
                inputSchema: (0, import_ai2.jsonSchema)(inputSchema)
              };
            });
            try {
              const firstTool = transformedTools[0];
              const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
              logs.push(
                `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
              );
            } catch (e) {
              logs.push(
                `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
              );
            }
            const { toolCalls, text, finishReason } = await (0, import_ai2.generateText)({
              model,
              messages: flatMessages,
              tools: transformedTools,
              toolChoice: "required"
            });
            try {
              logs.push(
                `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
              );
            } catch {
              logs.push(
                `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
              );
            }
            const possibleAnswer = possibleAnswersMap.get(testCase.id);
            if (!possibleAnswer) {
              throw new Error(`No possible answer for id: ${testCase.id}`);
            }
            const restoredCalls = (toolCalls || []).map((c) => {
              const rawName = c.toolName ?? c.name;
              // A purely numeric tool name is treated as an index into transformedTools.
              const sanitizedFromIndex = typeof rawName === "string" && /^\d+$/.test(rawName) ? transformedTools[Number(rawName)]?.name ?? rawName : rawName;
              const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
              const extractedArgs = c.args ?? c.arguments ?? c.input ?? c.params ?? c.parameters ?? void 0;
              let parsedArgs = extractedArgs;
              if (typeof parsedArgs === "string") {
                try {
                  parsedArgs = JSON.parse(parsedArgs);
                } catch {
                  // Keep the raw string; the checker will reject it.
                }
              }
              return {
                ...c,
                toolName: originalName,
                name: originalName,
                args: parsedArgs ?? {}
              };
            });
            const checkerResult = check(
              testCase,
              restoredCalls,
              possibleAnswer
            );
            if (checkerResult.valid) {
              correctCount++;
              logs.push(`[PASS] ${testCase.id}`);
            } else {
              logs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
            }
          } catch (e) {
            logs.push(
              `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
            );
            if (e?.stack) {
              logs.push(`[STACK] ${testCase.id}: ${e.stack}`);
            }
          }
        }
        if (testCases.length === 0) {
          return {
            score: 0,
            success: false,
            metrics: {},
            logs: ["No test cases found."]
          };
        }
        const score = correctCount / testCases.length;
        return {
          score,
          success: score > 0.95,
          // High success threshold as requested
          metrics: {
            correct_count: correctCount,
            total_cases: testCases.length,
            accuracy: score
          },
          logs
        };
      } catch (e) {
        // Normalize non-Error throws so this handler cannot itself throw on `e.message`
        // and reporters always receive an Error.
        const msg = e instanceof Error ? e.message : String(e);
        return {
          score: 0,
          success: false,
          metrics: {},
          error: e instanceof Error ? e : new Error(msg),
          logs: [`[FATAL] Failed to run benchmark ${name}: ${msg}`]
        };
      }
    }
  };
}
843
// BFCL v3 benchmark instances. Arguments: name, description, test-case file, ground-truth file.

// Single function, single call.
var bfclSimpleBenchmark = createBfclBenchmark("bfcl-simple", "BFCL Simple Function Calling", "BFCL_v3_simple.json", "BFCL_v3_simple_possible_answer.json");

// Several calls in one turn.
var bfclParallelBenchmark = createBfclBenchmark("bfcl-parallel", "BFCL Parallel Function Calling", "BFCL_v3_parallel.json", "BFCL_v3_parallel_possible_answer.json");

// Several candidate functions offered.
var bfclMultipleBenchmark = createBfclBenchmark("bfcl-multiple", "BFCL Multiple Function Calling", "BFCL_v3_multiple.json", "BFCL_v3_multiple_possible_answer.json");

// Several candidate functions and several calls.
var bfclParallelMultipleBenchmark = createBfclBenchmark("bfcl-parallel-multiple", "BFCL Parallel & Multiple Function Calling", "BFCL_v3_parallel_multiple.json", "BFCL_v3_parallel_multiple_possible_answer.json");
867
+ // Annotate the CommonJS export names for ESM import in node:
868
+ 0 && (module.exports = {
869
+ bfclMultipleBenchmark,
870
+ bfclParallelBenchmark,
871
+ bfclParallelMultipleBenchmark,
872
+ bfclSimpleBenchmark,
873
+ evaluate,
874
+ jsonGenerationBenchmark,
875
+ jsonGenerationSchemaOnlyBenchmark
876
+ });
877
+ //# sourceMappingURL=index.cjs.map