@ai-sdk-tool/eval 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -113,7 +113,7 @@ function uniqueLines(lines) {
113
113
  function suggestFixFromDiff(parsed) {
114
114
  const suggestions = [];
115
115
  const { error_type, expected, actual, diff } = parsed ?? {};
116
- if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
116
+ if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
117
117
  const expectedName = expected?.function;
118
118
  const actualName = actual?.function;
119
119
  if (expectedName && actualName && expectedName !== actualName) {
@@ -127,23 +127,23 @@ function suggestFixFromDiff(parsed) {
127
127
  );
128
128
  }
129
129
  }
130
- if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
131
- const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
130
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
131
+ const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
132
132
  if (missing.length) {
133
133
  suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
134
134
  }
135
135
  }
136
- if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
137
- const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
136
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
137
+ const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
138
138
  if (extras.length) {
139
139
  suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
140
140
  }
141
141
  }
142
- if (diff && diff.some((d) => d.startsWith("@@ param "))) {
143
- const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
142
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
143
+ const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
144
144
  for (const param of targets) {
145
145
  const allowedLine = diff.find(
146
- (d) => d.startsWith("- expected one of:")
146
+ (d) => String(d).startsWith("- expected one of:")
147
147
  );
148
148
  if (allowedLine) {
149
149
  const allowed = allowedLine.replace("- expected one of: ", "");
@@ -350,17 +350,16 @@ async function evaluate(options) {
350
350
  return allResults;
351
351
  }
352
352
 
353
- // src/benchmarks/json-generation.ts
353
+ // src/benchmarks/bfcl.ts
354
354
  var import_ai = require("ai");
355
- var import_ajv = __toESM(require("ajv"), 1);
356
355
  var import_fs2 = require("fs");
357
356
  var import_path2 = __toESM(require("path"), 1);
358
357
 
359
358
  // src/utils/paths.ts
360
359
  var import_fs = __toESM(require("fs"), 1);
360
+ var import_module = require("module");
361
361
  var import_path = __toESM(require("path"), 1);
362
362
  var import_url = require("url");
363
- var import_module = require("module");
364
363
  function resolveDataDir(fromModuleUrl) {
365
364
  const moduleUrl = fromModuleUrl;
366
365
  const override = process.env.BFCL_DATA_DIR;
@@ -408,263 +407,6 @@ function resolveDataDir(fromModuleUrl) {
408
407
  return import_path.default.join(pkgRoot, "data");
409
408
  }
410
409
 
411
- // src/benchmarks/json-generation.ts
412
- function extractFirstJsonBlock(text) {
413
- try {
414
- return JSON.parse(text);
415
- } catch {
416
- }
417
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
418
- if (fenceMatch) {
419
- const inner = fenceMatch[1].trim();
420
- try {
421
- return JSON.parse(inner);
422
- } catch {
423
- }
424
- }
425
- const startIdxObj = text.indexOf("{");
426
- const startIdxArr = text.indexOf("[");
427
- const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
428
- if (start === void 0) return void 0;
429
- const open = text[start] === "{" ? "{" : "[";
430
- const close = open === "{" ? "}" : "]";
431
- let depth = 0;
432
- for (let i = start; i < text.length; i++) {
433
- const ch = text[i];
434
- if (ch === open) depth++;
435
- else if (ch === close) depth--;
436
- if (depth === 0) {
437
- const candidate = text.slice(start, i + 1);
438
- try {
439
- return JSON.parse(candidate);
440
- } catch {
441
- }
442
- break;
443
- }
444
- }
445
- return void 0;
446
- }
447
- function subsetMatch(expected, actual) {
448
- if (expected === null || typeof expected !== "object") {
449
- return expected === actual;
450
- }
451
- if (Array.isArray(expected)) {
452
- if (!Array.isArray(actual)) return false;
453
- for (let i = 0; i < expected.length; i++) {
454
- if (!subsetMatch(expected[i], actual[i])) return false;
455
- }
456
- return true;
457
- }
458
- if (actual === null || typeof actual !== "object") return false;
459
- const eObj = expected;
460
- const aObj = actual;
461
- for (const key of Object.keys(eObj)) {
462
- if (!subsetMatch(eObj[key], aObj[key])) return false;
463
- }
464
- return true;
465
- }
466
- var jsonGenerationBenchmark = {
467
- name: "json-generation",
468
- version: "2.1.0",
469
- description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
470
- async run(model) {
471
- const logs = [];
472
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
473
- let schemaValidCount = 0;
474
- let valueMatchCount = 0;
475
- let correctCount = 0;
476
- let tests = [];
477
- const expectedMap = /* @__PURE__ */ new Map();
478
- try {
479
- const dataDir = resolveDataDir();
480
- const testsJsonl = await import_fs2.promises.readFile(
481
- import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
482
- "utf-8"
483
- );
484
- const expectedJsonl = await import_fs2.promises.readFile(
485
- import_path2.default.join(dataDir, "json_generation_expected.jsonl"),
486
- "utf-8"
487
- );
488
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
489
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
490
- for (const r of expecteds) expectedMap.set(r.id, r);
491
- } catch (e) {
492
- const msg = e instanceof Error ? e.message : String(e);
493
- return {
494
- score: 0,
495
- success: false,
496
- metrics: {},
497
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
498
- error: e
499
- };
500
- }
501
- for (const tc of tests) {
502
- try {
503
- const schemaStr = JSON.stringify(tc.schema, null, 2);
504
- const messages = [
505
- {
506
- role: "system",
507
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
508
- },
509
- {
510
- role: "user",
511
- content: [
512
- "Generate a JSON object that reflects the following facts.",
513
- "JSON Schema:",
514
- schemaStr,
515
- "Facts:",
516
- tc.promptFacts,
517
- "Output must be a single JSON only, with no additional text."
518
- ].join("\n\n")
519
- }
520
- ];
521
- const { text } = await (0, import_ai.generateText)({ model, messages });
522
- let parsed;
523
- try {
524
- parsed = extractFirstJsonBlock(text);
525
- } catch {
526
- }
527
- if (parsed === void 0) {
528
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
529
- continue;
530
- }
531
- const validate = ajv.compile(tc.schema);
532
- const valid = validate(parsed);
533
- if (valid) schemaValidCount++;
534
- else
535
- logs.push(
536
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
537
- );
538
- const expectedRec = expectedMap.get(tc.id);
539
- if (!expectedRec) {
540
- logs.push(
541
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
542
- );
543
- }
544
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
545
- if (valuesOk) valueMatchCount++;
546
- if (valid && valuesOk) {
547
- correctCount++;
548
- logs.push(`[PASS] ${tc.id}`);
549
- } else {
550
- logs.push(
551
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
552
- parsed
553
- )}`
554
- );
555
- }
556
- } catch (e) {
557
- const msg = e instanceof Error ? e.message : String(e);
558
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
559
- }
560
- }
561
- const total = tests.length;
562
- const score = correctCount / total;
563
- return {
564
- score,
565
- success: score >= 0.8,
566
- metrics: {
567
- total_cases: total,
568
- correct_count: correctCount,
569
- schema_valid_count: schemaValidCount,
570
- value_match_count: valueMatchCount,
571
- accuracy: score
572
- },
573
- logs
574
- };
575
- }
576
- };
577
- var jsonGenerationSchemaOnlyBenchmark = {
578
- name: "json-generation-schema-only",
579
- version: "1.0.1",
580
- description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
581
- async run(model) {
582
- const logs = [];
583
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
584
- let tests = [];
585
- try {
586
- const dataDir = resolveDataDir();
587
- const testsJsonl = await import_fs2.promises.readFile(
588
- import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
589
- "utf-8"
590
- );
591
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
592
- } catch (e) {
593
- const msg = e instanceof Error ? e.message : String(e);
594
- return {
595
- score: 0,
596
- success: false,
597
- metrics: {},
598
- logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
599
- error: e
600
- };
601
- }
602
- let schemaValidCount = 0;
603
- for (const tc of tests) {
604
- try {
605
- const schemaStr = JSON.stringify(tc.schema, null, 2);
606
- const messages = [
607
- {
608
- role: "system",
609
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
610
- },
611
- {
612
- role: "user",
613
- content: [
614
- "Generate a JSON object that reflects the following facts.",
615
- "JSON Schema:",
616
- schemaStr,
617
- "Facts:",
618
- tc.promptFacts,
619
- "Output must be a single JSON only, with no additional text."
620
- ].join("\n\n")
621
- }
622
- ];
623
- const { text } = await (0, import_ai.generateText)({ model, messages });
624
- let parsed;
625
- try {
626
- parsed = extractFirstJsonBlock(text);
627
- } catch {
628
- }
629
- if (parsed === void 0) {
630
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
631
- continue;
632
- }
633
- const validate = ajv.compile(tc.schema);
634
- const valid = validate(parsed);
635
- if (valid) {
636
- schemaValidCount++;
637
- logs.push(`[PASS] ${tc.id}`);
638
- } else {
639
- logs.push(
640
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
641
- );
642
- }
643
- } catch (e) {
644
- const msg = e instanceof Error ? e.message : String(e);
645
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
646
- }
647
- }
648
- const total = tests.length;
649
- const score = total > 0 ? schemaValidCount / total : 0;
650
- return {
651
- score,
652
- success: score >= 0.8,
653
- metrics: {
654
- total_cases: total,
655
- schema_valid_count: schemaValidCount,
656
- accuracy: score
657
- },
658
- logs
659
- };
660
- }
661
- };
662
-
663
- // src/benchmarks/bfcl.ts
664
- var import_ai2 = require("ai");
665
- var import_fs3 = require("fs");
666
- var import_path3 = __toESM(require("path"), 1);
667
-
668
410
  // src/benchmarks/bfcl/ast-checker.ts
669
411
  function standardizeString(input) {
670
412
  if (typeof input !== "string") return input;
@@ -674,7 +416,7 @@ function standardizeString(input) {
674
416
  function checkStringValue(param, modelValue, possibleAnswers) {
675
417
  const standardizedModelValue = standardizeString(modelValue);
676
418
  const standardizedPossibleAnswers = possibleAnswers.map(
677
- (ans) => standardizeString(ans)
419
+ (ans) => standardizeString(String(ans))
678
420
  );
679
421
  if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
680
422
  return {
@@ -701,8 +443,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
701
443
  };
702
444
  }
703
445
  const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
446
+ const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
704
447
  for (const param of requiredParams) {
705
- if (!(param in modelArgs)) {
448
+ if (!(param in argsObj)) {
706
449
  return {
707
450
  valid: false,
708
451
  error: `Missing required parameter: '${param}'.`,
@@ -710,87 +453,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
710
453
  };
711
454
  }
712
455
  }
713
- for (const paramName in modelArgs) {
714
- const modelValue = modelArgs[paramName];
715
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
716
- return {
717
- valid: false,
718
- error: `Unexpected parameter: '${paramName}'.`,
719
- error_type: "simple_function_checker:unexpected_param"
720
- };
721
- }
722
- const possibleValues = possibleAnswerParams[paramName];
723
- if (typeof modelValue === "string") {
724
- const result = checkStringValue(paramName, modelValue, possibleValues);
725
- if (!result.valid) return result;
726
- } else if (Array.isArray(modelValue)) {
727
- const modelValueStr = JSON.stringify(
728
- modelValue.map((v) => standardizeString(v.toString())).sort()
729
- );
730
- const hasMatch = possibleValues.some(
731
- (p) => JSON.stringify(
732
- p.map((v) => standardizeString(v.toString())).sort()
733
- ) === modelValueStr
734
- );
735
- if (!hasMatch) {
456
+ if (modelArgs && typeof modelArgs === "object") {
457
+ for (const paramName of Object.keys(argsObj)) {
458
+ const modelValue = argsObj[paramName];
459
+ if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
736
460
  return {
737
461
  valid: false,
738
- error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
739
- modelValue
740
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
741
- error_type: "value_error:list"
462
+ error: `Unexpected parameter: '${paramName}'.`,
463
+ error_type: "simple_function_checker:unexpected_param"
742
464
  };
743
465
  }
744
- } else {
745
- const hasMatch = possibleValues.some((possibleValue) => {
746
- if (modelValue === possibleValue) return true;
747
- if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
748
- try {
749
- const normalizeObject = (obj) => {
750
- if (Array.isArray(obj)) {
751
- return obj.map(normalizeObject);
752
- }
753
- if (obj && typeof obj === "object") {
754
- const normalized = {};
755
- for (const [key, value] of Object.entries(obj)) {
756
- if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
757
- normalized[key] = value[0];
758
- } else {
759
- normalized[key] = normalizeObject(value);
466
+ const possibleValues = possibleAnswerParams[paramName];
467
+ if (typeof modelValue === "string") {
468
+ const result = checkStringValue(
469
+ paramName,
470
+ modelValue,
471
+ possibleValues ?? []
472
+ );
473
+ if (!result.valid) return result;
474
+ } else if (Array.isArray(modelValue)) {
475
+ const modelValueStr = JSON.stringify(
476
+ modelValue.map((v) => standardizeString(String(v))).sort()
477
+ );
478
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
479
+ if (!Array.isArray(p)) return false;
480
+ return JSON.stringify(
481
+ p.map((v) => standardizeString(String(v))).sort()
482
+ ) === modelValueStr;
483
+ }) : false;
484
+ if (!hasMatch) {
485
+ return {
486
+ valid: false,
487
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
488
+ modelValue
489
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
490
+ error_type: "value_error:list"
491
+ };
492
+ }
493
+ } else {
494
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
495
+ if (modelValue === possibleValue) return true;
496
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
497
+ try {
498
+ const normalizeObject = (obj) => {
499
+ if (Array.isArray(obj)) {
500
+ return obj.map(normalizeObject);
501
+ }
502
+ if (obj && typeof obj === "object") {
503
+ const normalized = {};
504
+ for (const [key, value] of Object.entries(
505
+ obj
506
+ )) {
507
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
508
+ normalized[key] = value[0];
509
+ } else {
510
+ normalized[key] = normalizeObject(value);
511
+ }
760
512
  }
513
+ return normalized;
761
514
  }
762
- return normalized;
763
- }
764
- return obj;
765
- };
766
- const normalizedModel = normalizeObject(modelValue);
767
- const normalizedPossible = normalizeObject(possibleValue);
768
- return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
769
- } catch {
770
- return false;
515
+ return obj;
516
+ };
517
+ const normalizedModel = normalizeObject(modelValue);
518
+ const normalizedPossible = normalizeObject(possibleValue);
519
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
520
+ } catch {
521
+ return false;
522
+ }
771
523
  }
524
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
525
+ return modelValue.toString() === possibleValue;
526
+ }
527
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
528
+ return modelValue === possibleValue.toString();
529
+ }
530
+ return false;
531
+ }) : false;
532
+ if (!hasMatch) {
533
+ return {
534
+ valid: false,
535
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
536
+ modelValue
537
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
538
+ error_type: "value_error:other"
539
+ };
772
540
  }
773
- if (typeof modelValue === "number" && typeof possibleValue === "string") {
774
- return modelValue.toString() === possibleValue;
775
- }
776
- if (typeof modelValue === "string" && typeof possibleValue === "number") {
777
- return modelValue === possibleValue.toString();
778
- }
779
- return false;
780
- });
781
- if (!hasMatch) {
782
- return {
783
- valid: false,
784
- error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
785
- modelValue
786
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
787
- error_type: "value_error:other"
788
- };
789
541
  }
790
542
  }
791
543
  }
792
544
  for (const paramName in possibleAnswerParams) {
793
- if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
545
+ const val = possibleAnswerParams[paramName];
546
+ const isOptional = Array.isArray(val) && val.includes("");
547
+ if (!(paramName in argsObj) && !isOptional) {
794
548
  return {
795
549
  valid: false,
796
550
  error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
@@ -876,10 +630,10 @@ function check(testCase, modelOutput, possibleAnswer) {
876
630
  const category = testCase.id.split("_")[0];
877
631
  try {
878
632
  if (category === "simple") {
879
- if (!modelOutput || modelOutput.length !== 1) {
633
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
880
634
  return {
881
635
  valid: false,
882
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
636
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
883
637
  error_type: "simple:wrong_count"
884
638
  };
885
639
  }
@@ -928,12 +682,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
928
682
  try {
929
683
  const dataPath = resolveDataDir();
930
684
  logs.push(`[INFO] Using data dir: ${dataPath}`);
931
- const testCasesJson = await import_fs3.promises.readFile(
932
- import_path3.default.join(dataPath, testDataFile),
685
+ const testCasesJson = await import_fs2.promises.readFile(
686
+ import_path2.default.join(dataPath, testDataFile),
933
687
  "utf-8"
934
688
  );
935
- const possibleAnswersJson = await import_fs3.promises.readFile(
936
- import_path3.default.join(dataPath, answerDataFile),
689
+ const possibleAnswersJson = await import_fs2.promises.readFile(
690
+ import_path2.default.join(dataPath, answerDataFile),
937
691
  "utf-8"
938
692
  );
939
693
  testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -950,19 +704,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
950
704
  );
951
705
  }
952
706
  const fixSchema = (schema) => {
953
- if (!schema || typeof schema !== "object") return schema;
707
+ if (!schema || typeof schema !== "object")
708
+ return { type: "object", properties: {} };
954
709
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
955
- if (copy.type) {
956
- if (copy.type === "dict") copy.type = "object";
957
- if (copy.type === "integer" || copy.type === "float")
958
- copy.type = "number";
959
- }
960
- if (copy.properties && typeof copy.properties === "object") {
961
- for (const k of Object.keys(copy.properties)) {
962
- copy.properties[k] = fixSchema(copy.properties[k]);
710
+ if (!Array.isArray(copy)) {
711
+ if (copy.type) {
712
+ if (copy.type === "dict") copy.type = "object";
713
+ if (copy.type === "integer" || copy.type === "float")
714
+ copy.type = "number";
963
715
  }
716
+ if (copy.properties && typeof copy.properties === "object") {
717
+ for (const k of Object.keys(copy.properties)) {
718
+ copy.properties[k] = fixSchema(
719
+ copy.properties[k]
720
+ );
721
+ }
722
+ }
723
+ if (copy.items) copy.items = fixSchema(copy.items);
724
+ return copy;
964
725
  }
965
- if (copy.items) copy.items = fixSchema(copy.items);
966
726
  return copy;
967
727
  };
968
728
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
@@ -982,7 +742,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
982
742
  };
983
743
  const transformedTools = tools.map((t) => {
984
744
  const fixed = fixSchema(t.parameters);
985
- const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
745
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
746
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
986
747
  const sanitized = sanitizeName(t.name);
987
748
  nameMap.set(sanitized, t.name);
988
749
  return {
@@ -995,9 +756,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
995
756
  const toolsMap = Object.fromEntries(
996
757
  transformedTools.map((t) => [
997
758
  t.name,
998
- (0, import_ai2.tool)({
759
+ (0, import_ai.tool)({
999
760
  description: typeof t.description === "string" ? t.description : void 0,
1000
- inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
761
+ inputSchema: (0, import_ai.jsonSchema)(t.inputSchema)
1001
762
  })
1002
763
  ])
1003
764
  );
@@ -1012,7 +773,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1012
773
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
1013
774
  );
1014
775
  }
1015
- const { toolCalls, text, finishReason } = await (0, import_ai2.generateText)({
776
+ const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
1016
777
  model,
1017
778
  messages: flatMessages,
1018
779
  tools: toolsMap,
@@ -1021,7 +782,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1021
782
  providerOptions: {
1022
783
  toolCallMiddleware: {
1023
784
  originalToolSchemas: Object.fromEntries(
1024
- transformedTools.map((t) => [t.name, t.inputSchema])
785
+ transformedTools.map((t) => [
786
+ t.name,
787
+ t.inputSchema
788
+ ])
1025
789
  )
1026
790
  }
1027
791
  }
@@ -1074,10 +838,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1074
838
  const summarizeArgs = (args) => {
1075
839
  if (args == null) return args;
1076
840
  if (typeof args !== "object") return args;
1077
- return Object.keys(args).sort().reduce((acc, k) => {
1078
- acc[k] = args[k];
1079
- return acc;
1080
- }, {});
841
+ return Object.keys(args).sort().reduce(
842
+ (acc, k) => {
843
+ acc[k] = args[k];
844
+ return acc;
845
+ },
846
+ {}
847
+ );
1081
848
  };
1082
849
  const expected = {};
1083
850
  const actual = {};
@@ -1098,19 +865,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1098
865
  diff.push(`- ${expectedFuncName}`);
1099
866
  diff.push(`+ ${receivedName}`);
1100
867
  }
1101
- if (expectedParams && receivedArgs) {
868
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1102
869
  const required = funcDesc?.parameters?.required ?? [];
1103
870
  for (const req of required) {
1104
871
  if (!(req in receivedArgs)) {
1105
872
  diff.push(`- missing required param: ${req}`);
1106
873
  }
1107
874
  }
1108
- for (const k of Object.keys(receivedArgs)) {
875
+ for (const k of Object.keys(
876
+ receivedArgs
877
+ )) {
1109
878
  if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1110
879
  diff.push(`+ unexpected param: ${k}`);
1111
880
  }
1112
881
  }
1113
- for (const k of Object.keys(receivedArgs)) {
882
+ for (const k of Object.keys(
883
+ receivedArgs
884
+ )) {
1114
885
  if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1115
886
  const allowed = expectedParams[k];
1116
887
  const got = receivedArgs[k];
@@ -1183,13 +954,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1183
954
  );
1184
955
  const requiredParams = funcDesc?.parameters?.required ?? [];
1185
956
  diff.push(`@@ function ${fname}`);
1186
- if (expectedParamsAllowed && receivedArgs) {
957
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1187
958
  for (const req of requiredParams) {
1188
959
  if (!(req in receivedArgs)) {
1189
960
  diff.push(`- missing required param: ${req}`);
1190
961
  }
1191
962
  }
1192
- for (const k of Object.keys(receivedArgs)) {
963
+ for (const k of Object.keys(
964
+ receivedArgs
965
+ )) {
1193
966
  if (!Object.prototype.hasOwnProperty.call(
1194
967
  expectedParamsAllowed,
1195
968
  k
@@ -1197,7 +970,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1197
970
  diff.push(`+ unexpected param: ${k}`);
1198
971
  }
1199
972
  }
1200
- for (const k of Object.keys(receivedArgs)) {
973
+ for (const k of Object.keys(
974
+ receivedArgs
975
+ )) {
1201
976
  if (Object.prototype.hasOwnProperty.call(
1202
977
  expectedParamsAllowed,
1203
978
  k
@@ -1335,6 +1110,262 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
1335
1110
  "BFCL_v3_parallel_multiple.json",
1336
1111
  "BFCL_v3_parallel_multiple_possible_answer.json"
1337
1112
  );
1113
+
1114
+ // src/benchmarks/json-generation.ts
1115
+ var import_ai2 = require("ai");
1116
+ var import_ajv = __toESM(require("ajv"), 1);
1117
+ var import_fs3 = require("fs");
1118
+ var import_path3 = __toESM(require("path"), 1);
1119
/**
 * Extracts the first parseable JSON value from free-form model output.
 *
 * Strategies are tried in order:
 *   1. Parse the whole text as JSON.
 *   2. Parse the contents of the first ```json fence, or failing that the
 *      first bare ``` fence.
 *   3. Scan for balanced `{...}` / `[...]` spans and parse each candidate,
 *      moving on to the next opener when a candidate is not valid JSON.
 *
 * The scanner ignores brackets that appear inside double-quoted strings
 * (honouring backslash escapes), so values such as {"a": "}"} are extracted
 * correctly.
 *
 * @param {string} text - Raw model output.
 * @returns {unknown} The parsed JSON value, or `undefined` when nothing parses.
 */
function extractFirstJsonBlock(text) {
  try {
    return JSON.parse(text);
  } catch {
    // Not a bare JSON document; fall through to the looser strategies.
  }
  // The strategies below need string methods; anything else cannot contain JSON.
  if (typeof text !== "string") return void 0;

  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
  if (fenceMatch) {
    try {
      return JSON.parse(fenceMatch[1].trim());
    } catch {
      // Fence content is not valid JSON; fall back to bracket scanning.
    }
  }

  // Returns the index of the bracket closing the one at `start`, or -1.
  const findBalancedEnd = (start) => {
    const open = text[start];
    const close = open === "{" ? "}" : "]";
    let depth = 0;
    let inString = false;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (ch === "\\") {
          i++; // Skip the escaped character (e.g. \" must not end the string).
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === open) {
        depth++;
      } else if (ch === close) {
        depth--;
        if (depth === 0) return i;
      }
    }
    return -1;
  };

  for (let start = 0; start < text.length; start++) {
    const ch = text[start];
    if (ch !== "{" && ch !== "[") continue;
    const end = findBalancedEnd(start);
    if (end === -1) continue; // Unbalanced opener; try the next one.
    try {
      return JSON.parse(text.slice(start, end + 1));
    } catch {
      // Balanced but not JSON (e.g. "[see below]"); resume after this span.
      start = end;
    }
  }
  return void 0;
}
1154
/**
 * Checks whether `actual` contains everything described by `expected`.
 *
 * - Primitives and `null` are compared with strict equality.
 * - Arrays are compared index by index over the length of `expected`, so
 *   `actual` may have additional trailing elements.
 * - Plain objects are compared key by key over the own keys of `expected`,
 *   so `actual` may have additional keys. Only own properties of `actual`
 *   are considered, and an expected object never matches an actual array.
 *
 * @param {unknown} expected - The reference value (subset).
 * @param {unknown} actual - The value under test (superset).
 * @returns {boolean} `true` when every expected value is present in `actual`.
 */
function subsetMatch(expected, actual) {
  if (expected === null || typeof expected !== "object") {
    return expected === actual;
  }
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) return false;
    for (let i = 0; i < expected.length; i++) {
      if (!subsetMatch(expected[i], actual[i])) return false;
    }
    return true;
  }
  // An expected object must be matched by an object, not by null/primitive/array.
  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    // Ignore inherited members (e.g. "__proto__", "constructor") on `actual`.
    const hasOwn = Object.prototype.hasOwnProperty.call(actual, key);
    if (!subsetMatch(expected[key], hasOwn ? actual[key] : void 0)) return false;
  }
  return true;
}
1173
/**
 * Benchmark: schema-compliant JSON generation with value checking.
 *
 * Loads test cases from `json_generation_tests.jsonl` and expected records
 * from `json_generation_expected.jsonl` (both under `resolveDataDir()`), asks
 * the model to emit a JSON document for each case, and counts a case as
 * correct when the parsed output both validates against the case's JSON
 * Schema and contains the expected values (per `subsetMatch`).
 *
 * Each test record is read for `id`, `schema` and `promptFacts`; each
 * expected record is read for `id` and `expected`.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Runs the benchmark against `model`.
   *
   * @param model - Passed through unchanged to `generateText`.
   * @returns {Promise<object>} `{ score, success, metrics, logs }`, plus
   *   `error` when the datasets could not be loaded. `score` is the fraction
   *   of correct cases (0 when there are no cases); `success` is
   *   `score >= 0.8`.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();
    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // JSONL: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        const { text } = await (0, import_ai2.generateText)({ model, messages });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Best effort: an extraction failure is reported as unparseable below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) {
          schemaValidCount++;
        } else {
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        }
        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // Without an expected record the case can never count as a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;
        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        // A failure in one case (model call, schema compilation, ...) is
        // logged and counted as incorrect; the run continues.
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // Guard against 0/0 (NaN) when the dataset file contains no cases.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
1284
/**
 * Benchmark: structure-only JSON generation.
 *
 * Reads test cases from `json_generation_tests.jsonl` under
 * `resolveDataDir()`, prompts the model once per case, and counts a case as
 * passed when the JSON extracted from the reply validates against the case's
 * schema. Values are not compared.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Runs the benchmark against `model`.
   *
   * @param model - Passed through unchanged to `generateText`.
   * @returns {Promise<object>} `{ score, success, metrics, logs }`, plus
   *   `error` when the test file could not be loaded. `success` is
   *   `score >= 0.8`.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });

    // Load the JSONL test file: one JSON document per non-blank line.
    let tests = [];
    try {
      const dataDir = resolveDataDir();
      const filePath = import_path3.default.join(dataDir, "json_generation_tests.jsonl");
      const raw = await import_fs3.promises.readFile(filePath, "utf-8");
      const loaded = [];
      for (const line of raw.split(/\r?\n/)) {
        if (line.trim().length > 0) {
          loaded.push(JSON.parse(line));
        }
      }
      tests = loaded;
    } catch (loadError) {
      const reason = loadError instanceof Error ? loadError.message : String(loadError);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${reason}`],
        error: loadError
      };
    }

    let schemaValidCount = 0;
    for (const tc of tests) {
      try {
        const userPrompt = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          JSON.stringify(tc.schema, null, 2),
          "Facts:",
          tc.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ].join("\n\n");
        const systemPrompt = "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences.";
        const messages = [
          { role: "system", content: systemPrompt },
          { role: "user", content: userPrompt }
        ];
        const reply = await (0, import_ai2.generateText)({ model, messages });

        let parsed;
        try {
          parsed = extractFirstJsonBlock(reply.text);
        } catch {
          // Extraction errors are treated the same as "nothing found".
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(tc.schema);
        if (!validate(parsed)) {
          const details = (validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ");
          logs.push(
            `[FAIL] ${tc.id}: Schema validation errors: ${details || "unknown"}`
          );
          continue;
        }
        schemaValidCount += 1;
        logs.push(`[PASS] ${tc.id}`);
      } catch (caseError) {
        // One failing case is logged and counted as not passed; the run continues.
        const reason = caseError instanceof Error ? caseError.message : String(caseError);
        logs.push(`[ERROR] ${tc.id}: ${reason}`);
      }
    }

    const total = tests.length;
    // An empty test file scores 0 rather than dividing by zero.
    const score = total === 0 ? 0 : schemaValidCount / total;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        schema_valid_count: schemaValidCount,
        accuracy: score
      },
      logs
    };
  }
};
1338
1369
  // Annotate the CommonJS export names for ESM import in node:
1339
1370
  0 && (module.exports = {
1340
1371
  bfclMultipleBenchmark,