@ai-sdk-tool/eval 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -71,7 +71,7 @@ function uniqueLines(lines) {
71
71
  function suggestFixFromDiff(parsed) {
72
72
  const suggestions = [];
73
73
  const { error_type, expected, actual, diff } = parsed ?? {};
74
- if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
74
+ if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
75
75
  const expectedName = expected?.function;
76
76
  const actualName = actual?.function;
77
77
  if (expectedName && actualName && expectedName !== actualName) {
@@ -85,23 +85,23 @@ function suggestFixFromDiff(parsed) {
85
85
  );
86
86
  }
87
87
  }
88
- if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
89
- const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
88
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
89
+ const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
90
90
  if (missing.length) {
91
91
  suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
92
92
  }
93
93
  }
94
- if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
95
- const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
94
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
95
+ const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
96
96
  if (extras.length) {
97
97
  suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
98
98
  }
99
99
  }
100
- if (diff && diff.some((d) => d.startsWith("@@ param "))) {
101
- const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
100
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
101
+ const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
102
102
  for (const param of targets) {
103
103
  const allowedLine = diff.find(
104
- (d) => d.startsWith("- expected one of:")
104
+ (d) => String(d).startsWith("- expected one of:")
105
105
  );
106
106
  if (allowedLine) {
107
107
  const allowed = allowedLine.replace("- expected one of: ", "");
@@ -239,13 +239,13 @@ var reporters = {
239
239
  };
240
240
 
241
241
  // src/evaluate.ts
242
- async function runSingleBenchmark(model, benchmark, modelKey) {
242
+ async function runSingleBenchmark(model, benchmark, modelKey, config) {
243
243
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
244
244
  try {
245
245
  console.log(
246
246
  `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
247
247
  );
248
- const result = await benchmark.run(model);
248
+ const result = await benchmark.run(model, config);
249
249
  console.log(
250
250
  `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
251
251
  );
@@ -274,7 +274,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
274
274
  }
275
275
  }
276
276
  async function evaluate(options) {
277
- const { models, benchmarks, reporter = "console" } = options;
277
+ const { models, benchmarks, reporter = "console", temperature } = options;
278
278
  const modelEntries = [];
279
279
  if (Array.isArray(models)) {
280
280
  for (const m of models) modelEntries.push([void 0, m]);
@@ -293,7 +293,8 @@ async function evaluate(options) {
293
293
  const evaluationResult = await runSingleBenchmark(
294
294
  model,
295
295
  benchmark,
296
- modelKey
296
+ modelKey,
297
+ temperature !== void 0 ? { temperature } : void 0
297
298
  );
298
299
  allResults.push(evaluationResult);
299
300
  }
@@ -308,17 +309,16 @@ async function evaluate(options) {
308
309
  return allResults;
309
310
  }
310
311
 
311
- // src/benchmarks/json-generation.ts
312
- import { generateText } from "ai";
313
- import Ajv from "ajv";
312
+ // src/benchmarks/bfcl.ts
313
+ import { generateText, jsonSchema, tool } from "ai";
314
314
  import { promises as fs2 } from "fs";
315
315
  import path2 from "path";
316
316
 
317
317
  // src/utils/paths.ts
318
318
  import fs from "fs";
319
+ import { createRequire } from "module";
319
320
  import path from "path";
320
321
  import { fileURLToPath } from "url";
321
- import { createRequire } from "module";
322
322
  function resolveDataDir(fromModuleUrl) {
323
323
  const moduleUrl = fromModuleUrl;
324
324
  const override = process.env.BFCL_DATA_DIR;
@@ -366,263 +366,6 @@ function resolveDataDir(fromModuleUrl) {
366
366
  return path.join(pkgRoot, "data");
367
367
  }
368
368
 
369
- // src/benchmarks/json-generation.ts
370
- function extractFirstJsonBlock(text) {
371
- try {
372
- return JSON.parse(text);
373
- } catch {
374
- }
375
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
376
- if (fenceMatch) {
377
- const inner = fenceMatch[1].trim();
378
- try {
379
- return JSON.parse(inner);
380
- } catch {
381
- }
382
- }
383
- const startIdxObj = text.indexOf("{");
384
- const startIdxArr = text.indexOf("[");
385
- const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
386
- if (start === void 0) return void 0;
387
- const open = text[start] === "{" ? "{" : "[";
388
- const close = open === "{" ? "}" : "]";
389
- let depth = 0;
390
- for (let i = start; i < text.length; i++) {
391
- const ch = text[i];
392
- if (ch === open) depth++;
393
- else if (ch === close) depth--;
394
- if (depth === 0) {
395
- const candidate = text.slice(start, i + 1);
396
- try {
397
- return JSON.parse(candidate);
398
- } catch {
399
- }
400
- break;
401
- }
402
- }
403
- return void 0;
404
- }
405
- function subsetMatch(expected, actual) {
406
- if (expected === null || typeof expected !== "object") {
407
- return expected === actual;
408
- }
409
- if (Array.isArray(expected)) {
410
- if (!Array.isArray(actual)) return false;
411
- for (let i = 0; i < expected.length; i++) {
412
- if (!subsetMatch(expected[i], actual[i])) return false;
413
- }
414
- return true;
415
- }
416
- if (actual === null || typeof actual !== "object") return false;
417
- const eObj = expected;
418
- const aObj = actual;
419
- for (const key of Object.keys(eObj)) {
420
- if (!subsetMatch(eObj[key], aObj[key])) return false;
421
- }
422
- return true;
423
- }
424
- var jsonGenerationBenchmark = {
425
- name: "json-generation",
426
- version: "2.1.0",
427
- description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
428
- async run(model) {
429
- const logs = [];
430
- const ajv = new Ajv({ allErrors: true, strict: false });
431
- let schemaValidCount = 0;
432
- let valueMatchCount = 0;
433
- let correctCount = 0;
434
- let tests = [];
435
- const expectedMap = /* @__PURE__ */ new Map();
436
- try {
437
- const dataDir = resolveDataDir();
438
- const testsJsonl = await fs2.readFile(
439
- path2.join(dataDir, "json_generation_tests.jsonl"),
440
- "utf-8"
441
- );
442
- const expectedJsonl = await fs2.readFile(
443
- path2.join(dataDir, "json_generation_expected.jsonl"),
444
- "utf-8"
445
- );
446
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
447
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
448
- for (const r of expecteds) expectedMap.set(r.id, r);
449
- } catch (e) {
450
- const msg = e instanceof Error ? e.message : String(e);
451
- return {
452
- score: 0,
453
- success: false,
454
- metrics: {},
455
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
456
- error: e
457
- };
458
- }
459
- for (const tc of tests) {
460
- try {
461
- const schemaStr = JSON.stringify(tc.schema, null, 2);
462
- const messages = [
463
- {
464
- role: "system",
465
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
466
- },
467
- {
468
- role: "user",
469
- content: [
470
- "Generate a JSON object that reflects the following facts.",
471
- "JSON Schema:",
472
- schemaStr,
473
- "Facts:",
474
- tc.promptFacts,
475
- "Output must be a single JSON only, with no additional text."
476
- ].join("\n\n")
477
- }
478
- ];
479
- const { text } = await generateText({ model, messages });
480
- let parsed;
481
- try {
482
- parsed = extractFirstJsonBlock(text);
483
- } catch {
484
- }
485
- if (parsed === void 0) {
486
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
487
- continue;
488
- }
489
- const validate = ajv.compile(tc.schema);
490
- const valid = validate(parsed);
491
- if (valid) schemaValidCount++;
492
- else
493
- logs.push(
494
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
495
- );
496
- const expectedRec = expectedMap.get(tc.id);
497
- if (!expectedRec) {
498
- logs.push(
499
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
500
- );
501
- }
502
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
503
- if (valuesOk) valueMatchCount++;
504
- if (valid && valuesOk) {
505
- correctCount++;
506
- logs.push(`[PASS] ${tc.id}`);
507
- } else {
508
- logs.push(
509
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
510
- parsed
511
- )}`
512
- );
513
- }
514
- } catch (e) {
515
- const msg = e instanceof Error ? e.message : String(e);
516
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
517
- }
518
- }
519
- const total = tests.length;
520
- const score = correctCount / total;
521
- return {
522
- score,
523
- success: score >= 0.8,
524
- metrics: {
525
- total_cases: total,
526
- correct_count: correctCount,
527
- schema_valid_count: schemaValidCount,
528
- value_match_count: valueMatchCount,
529
- accuracy: score
530
- },
531
- logs
532
- };
533
- }
534
- };
535
- var jsonGenerationSchemaOnlyBenchmark = {
536
- name: "json-generation-schema-only",
537
- version: "1.0.1",
538
- description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
539
- async run(model) {
540
- const logs = [];
541
- const ajv = new Ajv({ allErrors: true, strict: false });
542
- let tests = [];
543
- try {
544
- const dataDir = resolveDataDir();
545
- const testsJsonl = await fs2.readFile(
546
- path2.join(dataDir, "json_generation_tests.jsonl"),
547
- "utf-8"
548
- );
549
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
550
- } catch (e) {
551
- const msg = e instanceof Error ? e.message : String(e);
552
- return {
553
- score: 0,
554
- success: false,
555
- metrics: {},
556
- logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
557
- error: e
558
- };
559
- }
560
- let schemaValidCount = 0;
561
- for (const tc of tests) {
562
- try {
563
- const schemaStr = JSON.stringify(tc.schema, null, 2);
564
- const messages = [
565
- {
566
- role: "system",
567
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
568
- },
569
- {
570
- role: "user",
571
- content: [
572
- "Generate a JSON object that reflects the following facts.",
573
- "JSON Schema:",
574
- schemaStr,
575
- "Facts:",
576
- tc.promptFacts,
577
- "Output must be a single JSON only, with no additional text."
578
- ].join("\n\n")
579
- }
580
- ];
581
- const { text } = await generateText({ model, messages });
582
- let parsed;
583
- try {
584
- parsed = extractFirstJsonBlock(text);
585
- } catch {
586
- }
587
- if (parsed === void 0) {
588
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
589
- continue;
590
- }
591
- const validate = ajv.compile(tc.schema);
592
- const valid = validate(parsed);
593
- if (valid) {
594
- schemaValidCount++;
595
- logs.push(`[PASS] ${tc.id}`);
596
- } else {
597
- logs.push(
598
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
599
- );
600
- }
601
- } catch (e) {
602
- const msg = e instanceof Error ? e.message : String(e);
603
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
604
- }
605
- }
606
- const total = tests.length;
607
- const score = total > 0 ? schemaValidCount / total : 0;
608
- return {
609
- score,
610
- success: score >= 0.8,
611
- metrics: {
612
- total_cases: total,
613
- schema_valid_count: schemaValidCount,
614
- accuracy: score
615
- },
616
- logs
617
- };
618
- }
619
- };
620
-
621
- // src/benchmarks/bfcl.ts
622
- import { generateText as generateText2, jsonSchema, tool } from "ai";
623
- import { promises as fs3 } from "fs";
624
- import path3 from "path";
625
-
626
369
  // src/benchmarks/bfcl/ast-checker.ts
627
370
  function standardizeString(input) {
628
371
  if (typeof input !== "string") return input;
@@ -632,7 +375,7 @@ function standardizeString(input) {
632
375
  function checkStringValue(param, modelValue, possibleAnswers) {
633
376
  const standardizedModelValue = standardizeString(modelValue);
634
377
  const standardizedPossibleAnswers = possibleAnswers.map(
635
- (ans) => standardizeString(ans)
378
+ (ans) => standardizeString(String(ans))
636
379
  );
637
380
  if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
638
381
  return {
@@ -659,8 +402,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
659
402
  };
660
403
  }
661
404
  const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
405
+ const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
662
406
  for (const param of requiredParams) {
663
- if (!(param in modelArgs)) {
407
+ if (!(param in argsObj)) {
664
408
  return {
665
409
  valid: false,
666
410
  error: `Missing required parameter: '${param}'.`,
@@ -668,87 +412,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
668
412
  };
669
413
  }
670
414
  }
671
- for (const paramName in modelArgs) {
672
- const modelValue = modelArgs[paramName];
673
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
674
- return {
675
- valid: false,
676
- error: `Unexpected parameter: '${paramName}'.`,
677
- error_type: "simple_function_checker:unexpected_param"
678
- };
679
- }
680
- const possibleValues = possibleAnswerParams[paramName];
681
- if (typeof modelValue === "string") {
682
- const result = checkStringValue(paramName, modelValue, possibleValues);
683
- if (!result.valid) return result;
684
- } else if (Array.isArray(modelValue)) {
685
- const modelValueStr = JSON.stringify(
686
- modelValue.map((v) => standardizeString(v.toString())).sort()
687
- );
688
- const hasMatch = possibleValues.some(
689
- (p) => JSON.stringify(
690
- p.map((v) => standardizeString(v.toString())).sort()
691
- ) === modelValueStr
692
- );
693
- if (!hasMatch) {
415
+ if (modelArgs && typeof modelArgs === "object") {
416
+ for (const paramName of Object.keys(argsObj)) {
417
+ const modelValue = argsObj[paramName];
418
+ if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
694
419
  return {
695
420
  valid: false,
696
- error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
697
- modelValue
698
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
699
- error_type: "value_error:list"
421
+ error: `Unexpected parameter: '${paramName}'.`,
422
+ error_type: "simple_function_checker:unexpected_param"
700
423
  };
701
424
  }
702
- } else {
703
- const hasMatch = possibleValues.some((possibleValue) => {
704
- if (modelValue === possibleValue) return true;
705
- if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
706
- try {
707
- const normalizeObject = (obj) => {
708
- if (Array.isArray(obj)) {
709
- return obj.map(normalizeObject);
710
- }
711
- if (obj && typeof obj === "object") {
712
- const normalized = {};
713
- for (const [key, value] of Object.entries(obj)) {
714
- if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
715
- normalized[key] = value[0];
716
- } else {
717
- normalized[key] = normalizeObject(value);
425
+ const possibleValues = possibleAnswerParams[paramName];
426
+ if (typeof modelValue === "string") {
427
+ const result = checkStringValue(
428
+ paramName,
429
+ modelValue,
430
+ possibleValues ?? []
431
+ );
432
+ if (!result.valid) return result;
433
+ } else if (Array.isArray(modelValue)) {
434
+ const modelValueStr = JSON.stringify(
435
+ modelValue.map((v) => standardizeString(String(v))).sort()
436
+ );
437
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
438
+ if (!Array.isArray(p)) return false;
439
+ return JSON.stringify(
440
+ p.map((v) => standardizeString(String(v))).sort()
441
+ ) === modelValueStr;
442
+ }) : false;
443
+ if (!hasMatch) {
444
+ return {
445
+ valid: false,
446
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
447
+ modelValue
448
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
449
+ error_type: "value_error:list"
450
+ };
451
+ }
452
+ } else {
453
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
454
+ if (modelValue === possibleValue) return true;
455
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
456
+ try {
457
+ const normalizeObject = (obj) => {
458
+ if (Array.isArray(obj)) {
459
+ return obj.map(normalizeObject);
460
+ }
461
+ if (obj && typeof obj === "object") {
462
+ const normalized = {};
463
+ for (const [key, value] of Object.entries(
464
+ obj
465
+ )) {
466
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
467
+ normalized[key] = value[0];
468
+ } else {
469
+ normalized[key] = normalizeObject(value);
470
+ }
718
471
  }
472
+ return normalized;
719
473
  }
720
- return normalized;
721
- }
722
- return obj;
723
- };
724
- const normalizedModel = normalizeObject(modelValue);
725
- const normalizedPossible = normalizeObject(possibleValue);
726
- return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
727
- } catch {
728
- return false;
474
+ return obj;
475
+ };
476
+ const normalizedModel = normalizeObject(modelValue);
477
+ const normalizedPossible = normalizeObject(possibleValue);
478
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
479
+ } catch {
480
+ return false;
481
+ }
729
482
  }
483
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
484
+ return modelValue.toString() === possibleValue;
485
+ }
486
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
487
+ return modelValue === possibleValue.toString();
488
+ }
489
+ return false;
490
+ }) : false;
491
+ if (!hasMatch) {
492
+ return {
493
+ valid: false,
494
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
495
+ modelValue
496
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
497
+ error_type: "value_error:other"
498
+ };
730
499
  }
731
- if (typeof modelValue === "number" && typeof possibleValue === "string") {
732
- return modelValue.toString() === possibleValue;
733
- }
734
- if (typeof modelValue === "string" && typeof possibleValue === "number") {
735
- return modelValue === possibleValue.toString();
736
- }
737
- return false;
738
- });
739
- if (!hasMatch) {
740
- return {
741
- valid: false,
742
- error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
743
- modelValue
744
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
745
- error_type: "value_error:other"
746
- };
747
500
  }
748
501
  }
749
502
  }
750
503
  for (const paramName in possibleAnswerParams) {
751
- if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
504
+ const val = possibleAnswerParams[paramName];
505
+ const isOptional = Array.isArray(val) && val.includes("");
506
+ if (!(paramName in argsObj) && !isOptional) {
752
507
  return {
753
508
  valid: false,
754
509
  error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
@@ -834,10 +589,10 @@ function check(testCase, modelOutput, possibleAnswer) {
834
589
  const category = testCase.id.split("_")[0];
835
590
  try {
836
591
  if (category === "simple") {
837
- if (!modelOutput || modelOutput.length !== 1) {
592
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
838
593
  return {
839
594
  valid: false,
840
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
595
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
841
596
  error_type: "simple:wrong_count"
842
597
  };
843
598
  }
@@ -879,19 +634,19 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
879
634
  name,
880
635
  version: "1.0.0",
881
636
  description,
882
- async run(model) {
637
+ async run(model, config) {
883
638
  const logs = [];
884
639
  let correctCount = 0;
885
640
  let testCases = [];
886
641
  try {
887
642
  const dataPath = resolveDataDir();
888
643
  logs.push(`[INFO] Using data dir: ${dataPath}`);
889
- const testCasesJson = await fs3.readFile(
890
- path3.join(dataPath, testDataFile),
644
+ const testCasesJson = await fs2.readFile(
645
+ path2.join(dataPath, testDataFile),
891
646
  "utf-8"
892
647
  );
893
- const possibleAnswersJson = await fs3.readFile(
894
- path3.join(dataPath, answerDataFile),
648
+ const possibleAnswersJson = await fs2.readFile(
649
+ path2.join(dataPath, answerDataFile),
895
650
  "utf-8"
896
651
  );
897
652
  testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -908,19 +663,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
908
663
  );
909
664
  }
910
665
  const fixSchema = (schema) => {
911
- if (!schema || typeof schema !== "object") return schema;
666
+ if (!schema || typeof schema !== "object")
667
+ return { type: "object", properties: {} };
912
668
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
913
- if (copy.type) {
914
- if (copy.type === "dict") copy.type = "object";
915
- if (copy.type === "integer" || copy.type === "float")
916
- copy.type = "number";
917
- }
918
- if (copy.properties && typeof copy.properties === "object") {
919
- for (const k of Object.keys(copy.properties)) {
920
- copy.properties[k] = fixSchema(copy.properties[k]);
669
+ if (!Array.isArray(copy)) {
670
+ if (copy.type) {
671
+ if (copy.type === "dict") copy.type = "object";
672
+ if (copy.type === "integer" || copy.type === "float")
673
+ copy.type = "number";
674
+ }
675
+ if (copy.properties && typeof copy.properties === "object") {
676
+ for (const k of Object.keys(copy.properties)) {
677
+ copy.properties[k] = fixSchema(
678
+ copy.properties[k]
679
+ );
680
+ }
921
681
  }
682
+ if (copy.items) copy.items = fixSchema(copy.items);
683
+ return copy;
922
684
  }
923
- if (copy.items) copy.items = fixSchema(copy.items);
924
685
  return copy;
925
686
  };
926
687
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
@@ -931,6 +692,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
931
692
  const runSingleCase = async (testCase) => {
932
693
  const caseLogs = [];
933
694
  const { function: tools, question: messages } = testCase;
695
+ const temp = config?.temperature;
696
+ const temperature = typeof temp === "number" ? temp : void 0;
934
697
  try {
935
698
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
936
699
  const nameMap = /* @__PURE__ */ new Map();
@@ -940,7 +703,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
940
703
  };
941
704
  const transformedTools = tools.map((t) => {
942
705
  const fixed = fixSchema(t.parameters);
943
- const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
706
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
707
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
944
708
  const sanitized = sanitizeName(t.name);
945
709
  nameMap.set(sanitized, t.name);
946
710
  return {
@@ -970,16 +734,20 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
970
734
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
971
735
  );
972
736
  }
973
- const { toolCalls, text, finishReason } = await generateText2({
737
+ const { toolCalls, text, finishReason } = await generateText({
974
738
  model,
975
739
  messages: flatMessages,
976
740
  tools: toolsMap,
977
741
  toolChoice: "auto",
742
+ ...temperature !== void 0 ? { temperature } : {},
978
743
  // Pass original schema information to middleware
979
744
  providerOptions: {
980
745
  toolCallMiddleware: {
981
746
  originalToolSchemas: Object.fromEntries(
982
- transformedTools.map((t) => [t.name, t.inputSchema])
747
+ transformedTools.map((t) => [
748
+ t.name,
749
+ t.inputSchema
750
+ ])
983
751
  )
984
752
  }
985
753
  }
@@ -1032,10 +800,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1032
800
  const summarizeArgs = (args) => {
1033
801
  if (args == null) return args;
1034
802
  if (typeof args !== "object") return args;
1035
- return Object.keys(args).sort().reduce((acc, k) => {
1036
- acc[k] = args[k];
1037
- return acc;
1038
- }, {});
803
+ return Object.keys(args).sort().reduce(
804
+ (acc, k) => {
805
+ acc[k] = args[k];
806
+ return acc;
807
+ },
808
+ {}
809
+ );
1039
810
  };
1040
811
  const expected = {};
1041
812
  const actual = {};
@@ -1056,19 +827,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1056
827
  diff.push(`- ${expectedFuncName}`);
1057
828
  diff.push(`+ ${receivedName}`);
1058
829
  }
1059
- if (expectedParams && receivedArgs) {
830
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1060
831
  const required = funcDesc?.parameters?.required ?? [];
1061
832
  for (const req of required) {
1062
833
  if (!(req in receivedArgs)) {
1063
834
  diff.push(`- missing required param: ${req}`);
1064
835
  }
1065
836
  }
1066
- for (const k of Object.keys(receivedArgs)) {
837
+ for (const k of Object.keys(
838
+ receivedArgs
839
+ )) {
1067
840
  if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1068
841
  diff.push(`+ unexpected param: ${k}`);
1069
842
  }
1070
843
  }
1071
- for (const k of Object.keys(receivedArgs)) {
844
+ for (const k of Object.keys(
845
+ receivedArgs
846
+ )) {
1072
847
  if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1073
848
  const allowed = expectedParams[k];
1074
849
  const got = receivedArgs[k];
@@ -1141,13 +916,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1141
916
  );
1142
917
  const requiredParams = funcDesc?.parameters?.required ?? [];
1143
918
  diff.push(`@@ function ${fname}`);
1144
- if (expectedParamsAllowed && receivedArgs) {
919
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1145
920
  for (const req of requiredParams) {
1146
921
  if (!(req in receivedArgs)) {
1147
922
  diff.push(`- missing required param: ${req}`);
1148
923
  }
1149
924
  }
1150
- for (const k of Object.keys(receivedArgs)) {
925
+ for (const k of Object.keys(
926
+ receivedArgs
927
+ )) {
1151
928
  if (!Object.prototype.hasOwnProperty.call(
1152
929
  expectedParamsAllowed,
1153
930
  k
@@ -1155,7 +932,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1155
932
  diff.push(`+ unexpected param: ${k}`);
1156
933
  }
1157
934
  }
1158
- for (const k of Object.keys(receivedArgs)) {
935
+ for (const k of Object.keys(
936
+ receivedArgs
937
+ )) {
1159
938
  if (Object.prototype.hasOwnProperty.call(
1160
939
  expectedParamsAllowed,
1161
940
  k
@@ -1293,6 +1072,274 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
1293
1072
  "BFCL_v3_parallel_multiple.json",
1294
1073
  "BFCL_v3_parallel_multiple_possible_answer.json"
1295
1074
  );
1075
+
1076
+ // src/benchmarks/json-generation.ts
1077
+ import { generateText as generateText2 } from "ai";
1078
+ import Ajv from "ajv";
1079
+ import { promises as fs3 } from "fs";
1080
+ import path3 from "path";
1081
/**
 * Extracts the first JSON value embedded in a piece of text.
 *
 * Strategies are tried in order; the first one that parses wins:
 *   1. Parse the whole text as JSON.
 *   2. Parse the contents of the first ```json fenced block (or, failing
 *      that, the first bare ``` fenced block).
 *   3. Find the first `{` or `[` and parse up to its balancing closer.
 *
 * @param {string} text - Text that may contain a JSON document.
 * @returns {unknown} The parsed JSON value, or `undefined` when no strategy
 *   yields parseable JSON.
 */
function extractFirstJsonBlock(text) {
  try {
    return JSON.parse(text);
  } catch {
    // Not pure JSON; fall through to the extraction heuristics below.
  }
  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
  if (fenceMatch) {
    try {
      return JSON.parse(fenceMatch[1].trim());
    } catch {
      // Fenced content is not valid JSON; fall through to bracket scanning.
    }
  }
  const start = text.search(/[{\[]/);
  if (start === -1) return void 0;
  const open = text[start];
  const close = open === "{" ? "}" : "]";
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      // Brackets inside a JSON string literal are data, not structure, so
      // only track where the literal ends (honouring backslash escapes).
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
      continue;
    }
    if (ch === open) depth++;
    else if (ch === close) depth--;
    if (depth === 0) {
      // Only the first balanced candidate is attempted.
      try {
        return JSON.parse(text.slice(start, i + 1));
      } catch {
        return void 0;
      }
    }
  }
  return void 0;
}
1116
/**
 * Recursively checks that `actual` contains everything `expected` specifies.
 *
 * - Primitives and `null` must be strictly equal.
 * - Arrays are compared index by index for the length of `expected`; extra
 *   trailing elements in `actual` are ignored.
 * - Plain objects are compared key by key over the keys of `expected`; extra
 *   keys in `actual` are ignored. `actual` must be a non-array object, and
 *   each expected key must be an own property of it.
 *
 * @param {unknown} expected - The reference value.
 * @param {unknown} actual - The value under test.
 * @returns {boolean} `true` when `actual` satisfies `expected`.
 */
function subsetMatch(expected, actual) {
  if (expected === null || typeof expected !== "object") {
    return expected === actual;
  }
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) return false;
    for (let i = 0; i < expected.length; i++) {
      if (!subsetMatch(expected[i], actual[i])) return false;
    }
    return true;
  }
  // An object expectation must not be satisfied by an array: without this
  // check `{}` matches `[]` and `{ length: 0 }` matches any empty array.
  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    // Only own properties count; inherited members must not satisfy a key.
    if (!Object.prototype.hasOwnProperty.call(actual, key)) return false;
    if (!subsetMatch(expected[key], actual[key])) return false;
  }
  return true;
}
1135
/**
 * Benchmark that asks a model to produce JSON for a given JSON Schema and set
 * of facts, then checks both schema validity (via Ajv) and that the output
 * contains the expected values (via `subsetMatch`).
 *
 * Test cases are read from `json_generation_tests.jsonl` and expected records
 * from `json_generation_expected.jsonl`, both inside the directory returned by
 * `resolveDataDir()`. Records are joined on their `id` field.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Runs every test case sequentially against `model`.
   *
   * @param model - Model handle forwarded to `generateText2`.
   * @param config - Optional settings; a numeric `temperature` is forwarded
   *   to the generation call, any other value is ignored.
   * @returns A result with `score` (correct / total, 0 when there are no
   *   test cases), `success` (score >= 0.8), `metrics` and `logs`. If the
   *   datasets cannot be loaded the result has score 0, `success: false`,
   *   empty metrics, a `[FATAL]` log line and the caught `error`.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new Ajv({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();
    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await fs3.readFile(
        path3.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await fs3.readFile(
        path3.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // JSONL: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        const temp = config?.temperature;
        const temperature = typeof temp === "number" ? temp : void 0;
        const { text } = await generateText2({
          model,
          messages,
          ...temperature !== void 0 ? { temperature } : {}
        });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Treated the same as "no JSON found"; reported just below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) {
          schemaValidCount++;
        } else {
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        }
        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // A case without an expected record can never count as a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;
        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        // A failing case (generation error, schema compile error, ...) is
        // logged and still counted in the total.
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // Guard the empty dataset: 0 / 0 would yield NaN for score and accuracy.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
1252
/**
 * Benchmark that asks a model to produce JSON for a given JSON Schema and set
 * of facts and scores only whether the output validates against that schema
 * (via Ajv). Values are not compared.
 *
 * Test cases are read from `json_generation_tests.jsonl` inside the directory
 * returned by `resolveDataDir()`.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Runs every test case sequentially against `model`.
   *
   * @param model - Model handle forwarded to `generateText2`.
   * @param config - Optional settings; a numeric `temperature` is forwarded
   *   to the generation call, any other value is ignored.
   * @returns A result with `score` (schema-valid / total, 0 when there are
   *   no test cases), `success` (score >= 0.8), `metrics` and `logs`. If the
   *   test file cannot be loaded the result has score 0, `success: false`,
   *   empty metrics, a `[FATAL]` log line and the caught `error`.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new Ajv({ allErrors: true, strict: false });
    let tests = [];
    try {
      const dataDir = resolveDataDir();
      const testsFile = path3.join(dataDir, "json_generation_tests.jsonl");
      const rawTests = await fs3.readFile(testsFile, "utf-8");
      // JSONL: keep non-blank lines and parse each one as its own document.
      const nonBlankLines = rawTests.split(/\r?\n/).filter((line) => line.trim().length > 0);
      tests = nonBlankLines.map((line) => JSON.parse(line));
    } catch (loadError) {
      const reason = loadError instanceof Error ? loadError.message : String(loadError);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${reason}`],
        error: loadError
      };
    }
    let schemaValidCount = 0;
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const systemMessage = {
          role: "system",
          content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
        };
        const userSections = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          schemaStr,
          "Facts:",
          tc.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ];
        const userMessage = { role: "user", content: userSections.join("\n\n") };
        const messages = [systemMessage, userMessage];
        // Only a numeric temperature is passed through to the model call.
        const temp = config?.temperature;
        const sampling = typeof temp === "number" ? { temperature: temp } : {};
        const { text } = await generateText2({ model, messages, ...sampling });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Handled below as "could not parse".
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        if (validate(parsed)) {
          schemaValidCount++;
          logs.push(`[PASS] ${tc.id}`);
          continue;
        }
        const errorSummary = (validate.errors || []).map((err) => `${err.instancePath} ${err.message}`).join(", ");
        logs.push(
          `[FAIL] ${tc.id}: Schema validation errors: ${errorSummary || "unknown"}`
        );
      } catch (caseError) {
        // A failing case is logged and still counted in the total.
        const reason = caseError instanceof Error ? caseError.message : String(caseError);
        logs.push(`[ERROR] ${tc.id}: ${reason}`);
      }
    }
    const total = tests.length;
    const score = total === 0 ? 0 : schemaValidCount / total;
    const metrics = {
      total_cases: total,
      schema_valid_count: schemaValidCount,
      accuracy: score
    };
    return { score, success: score >= 0.8, metrics, logs };
  }
};
1296
1343
  export {
1297
1344
  bfclMultipleBenchmark,
1298
1345
  bfclParallelBenchmark,