@ai-sdk-tool/eval 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -71,7 +71,7 @@ function uniqueLines(lines) {
71
71
  function suggestFixFromDiff(parsed) {
72
72
  const suggestions = [];
73
73
  const { error_type, expected, actual, diff } = parsed ?? {};
74
- if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
74
+ if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
75
75
  const expectedName = expected?.function;
76
76
  const actualName = actual?.function;
77
77
  if (expectedName && actualName && expectedName !== actualName) {
@@ -85,23 +85,23 @@ function suggestFixFromDiff(parsed) {
85
85
  );
86
86
  }
87
87
  }
88
- if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
89
- const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
88
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
89
+ const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
90
90
  if (missing.length) {
91
91
  suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
92
92
  }
93
93
  }
94
- if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
95
- const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
94
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
95
+ const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
96
96
  if (extras.length) {
97
97
  suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
98
98
  }
99
99
  }
100
- if (diff && diff.some((d) => d.startsWith("@@ param "))) {
101
- const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
100
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
101
+ const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
102
102
  for (const param of targets) {
103
103
  const allowedLine = diff.find(
104
- (d) => d.startsWith("- expected one of:")
104
+ (d) => String(d).startsWith("- expected one of:")
105
105
  );
106
106
  if (allowedLine) {
107
107
  const allowed = allowedLine.replace("- expected one of: ", "");
@@ -308,17 +308,16 @@ async function evaluate(options) {
308
308
  return allResults;
309
309
  }
310
310
 
311
- // src/benchmarks/json-generation.ts
312
- import { generateText } from "ai";
313
- import Ajv from "ajv";
311
+ // src/benchmarks/bfcl.ts
312
+ import { generateText, jsonSchema, tool } from "ai";
314
313
  import { promises as fs2 } from "fs";
315
314
  import path2 from "path";
316
315
 
317
316
  // src/utils/paths.ts
318
317
  import fs from "fs";
318
+ import { createRequire } from "module";
319
319
  import path from "path";
320
320
  import { fileURLToPath } from "url";
321
- import { createRequire } from "module";
322
321
  function resolveDataDir(fromModuleUrl) {
323
322
  const moduleUrl = fromModuleUrl;
324
323
  const override = process.env.BFCL_DATA_DIR;
@@ -366,263 +365,6 @@ function resolveDataDir(fromModuleUrl) {
366
365
  return path.join(pkgRoot, "data");
367
366
  }
368
367
 
369
- // src/benchmarks/json-generation.ts
370
- function extractFirstJsonBlock(text) {
371
- try {
372
- return JSON.parse(text);
373
- } catch {
374
- }
375
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
376
- if (fenceMatch) {
377
- const inner = fenceMatch[1].trim();
378
- try {
379
- return JSON.parse(inner);
380
- } catch {
381
- }
382
- }
383
- const startIdxObj = text.indexOf("{");
384
- const startIdxArr = text.indexOf("[");
385
- const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
386
- if (start === void 0) return void 0;
387
- const open = text[start] === "{" ? "{" : "[";
388
- const close = open === "{" ? "}" : "]";
389
- let depth = 0;
390
- for (let i = start; i < text.length; i++) {
391
- const ch = text[i];
392
- if (ch === open) depth++;
393
- else if (ch === close) depth--;
394
- if (depth === 0) {
395
- const candidate = text.slice(start, i + 1);
396
- try {
397
- return JSON.parse(candidate);
398
- } catch {
399
- }
400
- break;
401
- }
402
- }
403
- return void 0;
404
- }
405
- function subsetMatch(expected, actual) {
406
- if (expected === null || typeof expected !== "object") {
407
- return expected === actual;
408
- }
409
- if (Array.isArray(expected)) {
410
- if (!Array.isArray(actual)) return false;
411
- for (let i = 0; i < expected.length; i++) {
412
- if (!subsetMatch(expected[i], actual[i])) return false;
413
- }
414
- return true;
415
- }
416
- if (actual === null || typeof actual !== "object") return false;
417
- const eObj = expected;
418
- const aObj = actual;
419
- for (const key of Object.keys(eObj)) {
420
- if (!subsetMatch(eObj[key], aObj[key])) return false;
421
- }
422
- return true;
423
- }
424
- var jsonGenerationBenchmark = {
425
- name: "json-generation",
426
- version: "2.1.0",
427
- description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
428
- async run(model) {
429
- const logs = [];
430
- const ajv = new Ajv({ allErrors: true, strict: false });
431
- let schemaValidCount = 0;
432
- let valueMatchCount = 0;
433
- let correctCount = 0;
434
- let tests = [];
435
- const expectedMap = /* @__PURE__ */ new Map();
436
- try {
437
- const dataDir = resolveDataDir();
438
- const testsJsonl = await fs2.readFile(
439
- path2.join(dataDir, "json_generation_tests.jsonl"),
440
- "utf-8"
441
- );
442
- const expectedJsonl = await fs2.readFile(
443
- path2.join(dataDir, "json_generation_expected.jsonl"),
444
- "utf-8"
445
- );
446
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
447
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
448
- for (const r of expecteds) expectedMap.set(r.id, r);
449
- } catch (e) {
450
- const msg = e instanceof Error ? e.message : String(e);
451
- return {
452
- score: 0,
453
- success: false,
454
- metrics: {},
455
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
456
- error: e
457
- };
458
- }
459
- for (const tc of tests) {
460
- try {
461
- const schemaStr = JSON.stringify(tc.schema, null, 2);
462
- const messages = [
463
- {
464
- role: "system",
465
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
466
- },
467
- {
468
- role: "user",
469
- content: [
470
- "Generate a JSON object that reflects the following facts.",
471
- "JSON Schema:",
472
- schemaStr,
473
- "Facts:",
474
- tc.promptFacts,
475
- "Output must be a single JSON only, with no additional text."
476
- ].join("\n\n")
477
- }
478
- ];
479
- const { text } = await generateText({ model, messages });
480
- let parsed;
481
- try {
482
- parsed = extractFirstJsonBlock(text);
483
- } catch {
484
- }
485
- if (parsed === void 0) {
486
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
487
- continue;
488
- }
489
- const validate = ajv.compile(tc.schema);
490
- const valid = validate(parsed);
491
- if (valid) schemaValidCount++;
492
- else
493
- logs.push(
494
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
495
- );
496
- const expectedRec = expectedMap.get(tc.id);
497
- if (!expectedRec) {
498
- logs.push(
499
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
500
- );
501
- }
502
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
503
- if (valuesOk) valueMatchCount++;
504
- if (valid && valuesOk) {
505
- correctCount++;
506
- logs.push(`[PASS] ${tc.id}`);
507
- } else {
508
- logs.push(
509
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
510
- parsed
511
- )}`
512
- );
513
- }
514
- } catch (e) {
515
- const msg = e instanceof Error ? e.message : String(e);
516
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
517
- }
518
- }
519
- const total = tests.length;
520
- const score = correctCount / total;
521
- return {
522
- score,
523
- success: score >= 0.8,
524
- metrics: {
525
- total_cases: total,
526
- correct_count: correctCount,
527
- schema_valid_count: schemaValidCount,
528
- value_match_count: valueMatchCount,
529
- accuracy: score
530
- },
531
- logs
532
- };
533
- }
534
- };
535
- var jsonGenerationSchemaOnlyBenchmark = {
536
- name: "json-generation-schema-only",
537
- version: "1.0.1",
538
- description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
539
- async run(model) {
540
- const logs = [];
541
- const ajv = new Ajv({ allErrors: true, strict: false });
542
- let tests = [];
543
- try {
544
- const dataDir = resolveDataDir();
545
- const testsJsonl = await fs2.readFile(
546
- path2.join(dataDir, "json_generation_tests.jsonl"),
547
- "utf-8"
548
- );
549
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
550
- } catch (e) {
551
- const msg = e instanceof Error ? e.message : String(e);
552
- return {
553
- score: 0,
554
- success: false,
555
- metrics: {},
556
- logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
557
- error: e
558
- };
559
- }
560
- let schemaValidCount = 0;
561
- for (const tc of tests) {
562
- try {
563
- const schemaStr = JSON.stringify(tc.schema, null, 2);
564
- const messages = [
565
- {
566
- role: "system",
567
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
568
- },
569
- {
570
- role: "user",
571
- content: [
572
- "Generate a JSON object that reflects the following facts.",
573
- "JSON Schema:",
574
- schemaStr,
575
- "Facts:",
576
- tc.promptFacts,
577
- "Output must be a single JSON only, with no additional text."
578
- ].join("\n\n")
579
- }
580
- ];
581
- const { text } = await generateText({ model, messages });
582
- let parsed;
583
- try {
584
- parsed = extractFirstJsonBlock(text);
585
- } catch {
586
- }
587
- if (parsed === void 0) {
588
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
589
- continue;
590
- }
591
- const validate = ajv.compile(tc.schema);
592
- const valid = validate(parsed);
593
- if (valid) {
594
- schemaValidCount++;
595
- logs.push(`[PASS] ${tc.id}`);
596
- } else {
597
- logs.push(
598
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
599
- );
600
- }
601
- } catch (e) {
602
- const msg = e instanceof Error ? e.message : String(e);
603
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
604
- }
605
- }
606
- const total = tests.length;
607
- const score = total > 0 ? schemaValidCount / total : 0;
608
- return {
609
- score,
610
- success: score >= 0.8,
611
- metrics: {
612
- total_cases: total,
613
- schema_valid_count: schemaValidCount,
614
- accuracy: score
615
- },
616
- logs
617
- };
618
- }
619
- };
620
-
621
- // src/benchmarks/bfcl.ts
622
- import { generateText as generateText2, jsonSchema, tool } from "ai";
623
- import { promises as fs3 } from "fs";
624
- import path3 from "path";
625
-
626
368
  // src/benchmarks/bfcl/ast-checker.ts
627
369
  function standardizeString(input) {
628
370
  if (typeof input !== "string") return input;
@@ -632,7 +374,7 @@ function standardizeString(input) {
632
374
  function checkStringValue(param, modelValue, possibleAnswers) {
633
375
  const standardizedModelValue = standardizeString(modelValue);
634
376
  const standardizedPossibleAnswers = possibleAnswers.map(
635
- (ans) => standardizeString(ans)
377
+ (ans) => standardizeString(String(ans))
636
378
  );
637
379
  if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
638
380
  return {
@@ -659,8 +401,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
659
401
  };
660
402
  }
661
403
  const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
404
+ const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
662
405
  for (const param of requiredParams) {
663
- if (!(param in modelArgs)) {
406
+ if (!(param in argsObj)) {
664
407
  return {
665
408
  valid: false,
666
409
  error: `Missing required parameter: '${param}'.`,
@@ -668,87 +411,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
668
411
  };
669
412
  }
670
413
  }
671
- for (const paramName in modelArgs) {
672
- const modelValue = modelArgs[paramName];
673
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
674
- return {
675
- valid: false,
676
- error: `Unexpected parameter: '${paramName}'.`,
677
- error_type: "simple_function_checker:unexpected_param"
678
- };
679
- }
680
- const possibleValues = possibleAnswerParams[paramName];
681
- if (typeof modelValue === "string") {
682
- const result = checkStringValue(paramName, modelValue, possibleValues);
683
- if (!result.valid) return result;
684
- } else if (Array.isArray(modelValue)) {
685
- const modelValueStr = JSON.stringify(
686
- modelValue.map((v) => standardizeString(v.toString())).sort()
687
- );
688
- const hasMatch = possibleValues.some(
689
- (p) => JSON.stringify(
690
- p.map((v) => standardizeString(v.toString())).sort()
691
- ) === modelValueStr
692
- );
693
- if (!hasMatch) {
414
+ if (modelArgs && typeof modelArgs === "object") {
415
+ for (const paramName of Object.keys(argsObj)) {
416
+ const modelValue = argsObj[paramName];
417
+ if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
694
418
  return {
695
419
  valid: false,
696
- error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
697
- modelValue
698
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
699
- error_type: "value_error:list"
420
+ error: `Unexpected parameter: '${paramName}'.`,
421
+ error_type: "simple_function_checker:unexpected_param"
700
422
  };
701
423
  }
702
- } else {
703
- const hasMatch = possibleValues.some((possibleValue) => {
704
- if (modelValue === possibleValue) return true;
705
- if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
706
- try {
707
- const normalizeObject = (obj) => {
708
- if (Array.isArray(obj)) {
709
- return obj.map(normalizeObject);
710
- }
711
- if (obj && typeof obj === "object") {
712
- const normalized = {};
713
- for (const [key, value] of Object.entries(obj)) {
714
- if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
715
- normalized[key] = value[0];
716
- } else {
717
- normalized[key] = normalizeObject(value);
424
+ const possibleValues = possibleAnswerParams[paramName];
425
+ if (typeof modelValue === "string") {
426
+ const result = checkStringValue(
427
+ paramName,
428
+ modelValue,
429
+ possibleValues ?? []
430
+ );
431
+ if (!result.valid) return result;
432
+ } else if (Array.isArray(modelValue)) {
433
+ const modelValueStr = JSON.stringify(
434
+ modelValue.map((v) => standardizeString(String(v))).sort()
435
+ );
436
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
437
+ if (!Array.isArray(p)) return false;
438
+ return JSON.stringify(
439
+ p.map((v) => standardizeString(String(v))).sort()
440
+ ) === modelValueStr;
441
+ }) : false;
442
+ if (!hasMatch) {
443
+ return {
444
+ valid: false,
445
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
446
+ modelValue
447
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
448
+ error_type: "value_error:list"
449
+ };
450
+ }
451
+ } else {
452
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
453
+ if (modelValue === possibleValue) return true;
454
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
455
+ try {
456
+ const normalizeObject = (obj) => {
457
+ if (Array.isArray(obj)) {
458
+ return obj.map(normalizeObject);
459
+ }
460
+ if (obj && typeof obj === "object") {
461
+ const normalized = {};
462
+ for (const [key, value] of Object.entries(
463
+ obj
464
+ )) {
465
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
466
+ normalized[key] = value[0];
467
+ } else {
468
+ normalized[key] = normalizeObject(value);
469
+ }
718
470
  }
471
+ return normalized;
719
472
  }
720
- return normalized;
721
- }
722
- return obj;
723
- };
724
- const normalizedModel = normalizeObject(modelValue);
725
- const normalizedPossible = normalizeObject(possibleValue);
726
- return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
727
- } catch {
728
- return false;
473
+ return obj;
474
+ };
475
+ const normalizedModel = normalizeObject(modelValue);
476
+ const normalizedPossible = normalizeObject(possibleValue);
477
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
478
+ } catch {
479
+ return false;
480
+ }
729
481
  }
482
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
483
+ return modelValue.toString() === possibleValue;
484
+ }
485
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
486
+ return modelValue === possibleValue.toString();
487
+ }
488
+ return false;
489
+ }) : false;
490
+ if (!hasMatch) {
491
+ return {
492
+ valid: false,
493
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
494
+ modelValue
495
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
496
+ error_type: "value_error:other"
497
+ };
730
498
  }
731
- if (typeof modelValue === "number" && typeof possibleValue === "string") {
732
- return modelValue.toString() === possibleValue;
733
- }
734
- if (typeof modelValue === "string" && typeof possibleValue === "number") {
735
- return modelValue === possibleValue.toString();
736
- }
737
- return false;
738
- });
739
- if (!hasMatch) {
740
- return {
741
- valid: false,
742
- error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
743
- modelValue
744
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
745
- error_type: "value_error:other"
746
- };
747
499
  }
748
500
  }
749
501
  }
750
502
  for (const paramName in possibleAnswerParams) {
751
- if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
503
+ const val = possibleAnswerParams[paramName];
504
+ const isOptional = Array.isArray(val) && val.includes("");
505
+ if (!(paramName in argsObj) && !isOptional) {
752
506
  return {
753
507
  valid: false,
754
508
  error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
@@ -834,10 +588,10 @@ function check(testCase, modelOutput, possibleAnswer) {
834
588
  const category = testCase.id.split("_")[0];
835
589
  try {
836
590
  if (category === "simple") {
837
- if (!modelOutput || modelOutput.length !== 1) {
591
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
838
592
  return {
839
593
  valid: false,
840
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
594
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
841
595
  error_type: "simple:wrong_count"
842
596
  };
843
597
  }
@@ -886,12 +640,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
886
640
  try {
887
641
  const dataPath = resolveDataDir();
888
642
  logs.push(`[INFO] Using data dir: ${dataPath}`);
889
- const testCasesJson = await fs3.readFile(
890
- path3.join(dataPath, testDataFile),
643
+ const testCasesJson = await fs2.readFile(
644
+ path2.join(dataPath, testDataFile),
891
645
  "utf-8"
892
646
  );
893
- const possibleAnswersJson = await fs3.readFile(
894
- path3.join(dataPath, answerDataFile),
647
+ const possibleAnswersJson = await fs2.readFile(
648
+ path2.join(dataPath, answerDataFile),
895
649
  "utf-8"
896
650
  );
897
651
  testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -908,19 +662,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
908
662
  );
909
663
  }
910
664
  const fixSchema = (schema) => {
911
- if (!schema || typeof schema !== "object") return schema;
665
+ if (!schema || typeof schema !== "object")
666
+ return { type: "object", properties: {} };
912
667
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
913
- if (copy.type) {
914
- if (copy.type === "dict") copy.type = "object";
915
- if (copy.type === "integer" || copy.type === "float")
916
- copy.type = "number";
917
- }
918
- if (copy.properties && typeof copy.properties === "object") {
919
- for (const k of Object.keys(copy.properties)) {
920
- copy.properties[k] = fixSchema(copy.properties[k]);
668
+ if (!Array.isArray(copy)) {
669
+ if (copy.type) {
670
+ if (copy.type === "dict") copy.type = "object";
671
+ if (copy.type === "integer" || copy.type === "float")
672
+ copy.type = "number";
673
+ }
674
+ if (copy.properties && typeof copy.properties === "object") {
675
+ for (const k of Object.keys(copy.properties)) {
676
+ copy.properties[k] = fixSchema(
677
+ copy.properties[k]
678
+ );
679
+ }
921
680
  }
681
+ if (copy.items) copy.items = fixSchema(copy.items);
682
+ return copy;
922
683
  }
923
- if (copy.items) copy.items = fixSchema(copy.items);
924
684
  return copy;
925
685
  };
926
686
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
@@ -940,7 +700,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
940
700
  };
941
701
  const transformedTools = tools.map((t) => {
942
702
  const fixed = fixSchema(t.parameters);
943
- const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
703
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
704
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
944
705
  const sanitized = sanitizeName(t.name);
945
706
  nameMap.set(sanitized, t.name);
946
707
  return {
@@ -970,7 +731,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
970
731
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
971
732
  );
972
733
  }
973
- const { toolCalls, text, finishReason } = await generateText2({
734
+ const { toolCalls, text, finishReason } = await generateText({
974
735
  model,
975
736
  messages: flatMessages,
976
737
  tools: toolsMap,
@@ -979,7 +740,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
979
740
  providerOptions: {
980
741
  toolCallMiddleware: {
981
742
  originalToolSchemas: Object.fromEntries(
982
- transformedTools.map((t) => [t.name, t.inputSchema])
743
+ transformedTools.map((t) => [
744
+ t.name,
745
+ t.inputSchema
746
+ ])
983
747
  )
984
748
  }
985
749
  }
@@ -1032,10 +796,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1032
796
  const summarizeArgs = (args) => {
1033
797
  if (args == null) return args;
1034
798
  if (typeof args !== "object") return args;
1035
- return Object.keys(args).sort().reduce((acc, k) => {
1036
- acc[k] = args[k];
1037
- return acc;
1038
- }, {});
799
+ return Object.keys(args).sort().reduce(
800
+ (acc, k) => {
801
+ acc[k] = args[k];
802
+ return acc;
803
+ },
804
+ {}
805
+ );
1039
806
  };
1040
807
  const expected = {};
1041
808
  const actual = {};
@@ -1056,19 +823,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1056
823
  diff.push(`- ${expectedFuncName}`);
1057
824
  diff.push(`+ ${receivedName}`);
1058
825
  }
1059
- if (expectedParams && receivedArgs) {
826
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1060
827
  const required = funcDesc?.parameters?.required ?? [];
1061
828
  for (const req of required) {
1062
829
  if (!(req in receivedArgs)) {
1063
830
  diff.push(`- missing required param: ${req}`);
1064
831
  }
1065
832
  }
1066
- for (const k of Object.keys(receivedArgs)) {
833
+ for (const k of Object.keys(
834
+ receivedArgs
835
+ )) {
1067
836
  if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1068
837
  diff.push(`+ unexpected param: ${k}`);
1069
838
  }
1070
839
  }
1071
- for (const k of Object.keys(receivedArgs)) {
840
+ for (const k of Object.keys(
841
+ receivedArgs
842
+ )) {
1072
843
  if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1073
844
  const allowed = expectedParams[k];
1074
845
  const got = receivedArgs[k];
@@ -1141,13 +912,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1141
912
  );
1142
913
  const requiredParams = funcDesc?.parameters?.required ?? [];
1143
914
  diff.push(`@@ function ${fname}`);
1144
- if (expectedParamsAllowed && receivedArgs) {
915
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1145
916
  for (const req of requiredParams) {
1146
917
  if (!(req in receivedArgs)) {
1147
918
  diff.push(`- missing required param: ${req}`);
1148
919
  }
1149
920
  }
1150
- for (const k of Object.keys(receivedArgs)) {
921
+ for (const k of Object.keys(
922
+ receivedArgs
923
+ )) {
1151
924
  if (!Object.prototype.hasOwnProperty.call(
1152
925
  expectedParamsAllowed,
1153
926
  k
@@ -1155,7 +928,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1155
928
  diff.push(`+ unexpected param: ${k}`);
1156
929
  }
1157
930
  }
1158
- for (const k of Object.keys(receivedArgs)) {
931
+ for (const k of Object.keys(
932
+ receivedArgs
933
+ )) {
1159
934
  if (Object.prototype.hasOwnProperty.call(
1160
935
  expectedParamsAllowed,
1161
936
  k
@@ -1293,6 +1068,262 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
1293
1068
  "BFCL_v3_parallel_multiple.json",
1294
1069
  "BFCL_v3_parallel_multiple_possible_answer.json"
1295
1070
  );
1071
+
1072
+ // src/benchmarks/json-generation.ts
1073
+ import { generateText as generateText2 } from "ai";
1074
+ import Ajv from "ajv";
1075
+ import { promises as fs3 } from "fs";
1076
+ import path3 from "path";
1077
+ function extractFirstJsonBlock(text) {
1078
+ try {
1079
+ return JSON.parse(text);
1080
+ } catch {
1081
+ }
1082
+ const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
1083
+ if (fenceMatch) {
1084
+ const inner = fenceMatch[1].trim();
1085
+ try {
1086
+ return JSON.parse(inner);
1087
+ } catch {
1088
+ }
1089
+ }
1090
+ const startIdxObj = text.indexOf("{");
1091
+ const startIdxArr = text.indexOf("[");
1092
+ const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
1093
+ if (start === void 0) return void 0;
1094
+ const open = text[start] === "{" ? "{" : "[";
1095
+ const close = open === "{" ? "}" : "]";
1096
+ let depth = 0;
1097
+ for (let i = start; i < text.length; i++) {
1098
+ const ch = text[i];
1099
+ if (ch === open) depth++;
1100
+ else if (ch === close) depth--;
1101
+ if (depth === 0) {
1102
+ const candidate = text.slice(start, i + 1);
1103
+ try {
1104
+ return JSON.parse(candidate);
1105
+ } catch {
1106
+ }
1107
+ break;
1108
+ }
1109
+ }
1110
+ return void 0;
1111
+ }
1112
+ function subsetMatch(expected, actual) {
1113
+ if (expected === null || typeof expected !== "object") {
1114
+ return expected === actual;
1115
+ }
1116
+ if (Array.isArray(expected)) {
1117
+ if (!Array.isArray(actual)) return false;
1118
+ for (let i = 0; i < expected.length; i++) {
1119
+ if (!subsetMatch(expected[i], actual[i])) return false;
1120
+ }
1121
+ return true;
1122
+ }
1123
+ if (actual === null || typeof actual !== "object") return false;
1124
+ const eObj = expected;
1125
+ const aObj = actual;
1126
+ for (const key of Object.keys(eObj)) {
1127
+ if (!subsetMatch(eObj[key], aObj[key])) return false;
1128
+ }
1129
+ return true;
1130
+ }
1131
+ var jsonGenerationBenchmark = {
1132
+ name: "json-generation",
1133
+ version: "2.1.0",
1134
+ description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
1135
+ async run(model) {
1136
+ const logs = [];
1137
+ const ajv = new Ajv({ allErrors: true, strict: false });
1138
+ let schemaValidCount = 0;
1139
+ let valueMatchCount = 0;
1140
+ let correctCount = 0;
1141
+ let tests = [];
1142
+ const expectedMap = /* @__PURE__ */ new Map();
1143
+ try {
1144
+ const dataDir = resolveDataDir();
1145
+ const testsJsonl = await fs3.readFile(
1146
+ path3.join(dataDir, "json_generation_tests.jsonl"),
1147
+ "utf-8"
1148
+ );
1149
+ const expectedJsonl = await fs3.readFile(
1150
+ path3.join(dataDir, "json_generation_expected.jsonl"),
1151
+ "utf-8"
1152
+ );
1153
+ tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1154
+ const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1155
+ for (const r of expecteds) expectedMap.set(r.id, r);
1156
+ } catch (e) {
1157
+ const msg = e instanceof Error ? e.message : String(e);
1158
+ return {
1159
+ score: 0,
1160
+ success: false,
1161
+ metrics: {},
1162
+ logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
1163
+ error: e
1164
+ };
1165
+ }
1166
+ for (const tc of tests) {
1167
+ try {
1168
+ const schemaStr = JSON.stringify(tc.schema, null, 2);
1169
+ const messages = [
1170
+ {
1171
+ role: "system",
1172
+ content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1173
+ },
1174
+ {
1175
+ role: "user",
1176
+ content: [
1177
+ "Generate a JSON object that reflects the following facts.",
1178
+ "JSON Schema:",
1179
+ schemaStr,
1180
+ "Facts:",
1181
+ tc.promptFacts,
1182
+ "Output must be a single JSON only, with no additional text."
1183
+ ].join("\n\n")
1184
+ }
1185
+ ];
1186
+ const { text } = await generateText2({ model, messages });
1187
+ let parsed;
1188
+ try {
1189
+ parsed = extractFirstJsonBlock(text);
1190
+ } catch {
1191
+ }
1192
+ if (parsed === void 0) {
1193
+ logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
1194
+ continue;
1195
+ }
1196
+ const validate = ajv.compile(tc.schema);
1197
+ const valid = validate(parsed);
1198
+ if (valid) schemaValidCount++;
1199
+ else
1200
+ logs.push(
1201
+ `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1202
+ );
1203
+ const expectedRec = expectedMap.get(tc.id);
1204
+ if (!expectedRec) {
1205
+ logs.push(
1206
+ `[WARN] ${tc.id}: No expected record found. Skipping value match.`
1207
+ );
1208
+ }
1209
+ const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
1210
+ if (valuesOk) valueMatchCount++;
1211
+ if (valid && valuesOk) {
1212
+ correctCount++;
1213
+ logs.push(`[PASS] ${tc.id}`);
1214
+ } else {
1215
+ logs.push(
1216
+ `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
1217
+ parsed
1218
+ )}`
1219
+ );
1220
+ }
1221
+ } catch (e) {
1222
+ const msg = e instanceof Error ? e.message : String(e);
1223
+ logs.push(`[ERROR] ${tc.id}: ${msg}`);
1224
+ }
1225
+ }
1226
+ const total = tests.length;
1227
+ const score = correctCount / total;
1228
+ return {
1229
+ score,
1230
+ success: score >= 0.8,
1231
+ metrics: {
1232
+ total_cases: total,
1233
+ correct_count: correctCount,
1234
+ schema_valid_count: schemaValidCount,
1235
+ value_match_count: valueMatchCount,
1236
+ accuracy: score
1237
+ },
1238
+ logs
1239
+ };
1240
+ }
1241
+ };
1242
+ var jsonGenerationSchemaOnlyBenchmark = {
1243
+ name: "json-generation-schema-only",
1244
+ version: "1.0.1",
1245
+ description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
1246
+ async run(model) {
1247
+ const logs = [];
1248
+ const ajv = new Ajv({ allErrors: true, strict: false });
1249
+ let tests = [];
1250
+ try {
1251
+ const dataDir = resolveDataDir();
1252
+ const testsJsonl = await fs3.readFile(
1253
+ path3.join(dataDir, "json_generation_tests.jsonl"),
1254
+ "utf-8"
1255
+ );
1256
+ tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1257
+ } catch (e) {
1258
+ const msg = e instanceof Error ? e.message : String(e);
1259
+ return {
1260
+ score: 0,
1261
+ success: false,
1262
+ metrics: {},
1263
+ logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
1264
+ error: e
1265
+ };
1266
+ }
1267
+ let schemaValidCount = 0;
1268
+ for (const tc of tests) {
1269
+ try {
1270
+ const schemaStr = JSON.stringify(tc.schema, null, 2);
1271
+ const messages = [
1272
+ {
1273
+ role: "system",
1274
+ content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
1275
+ },
1276
+ {
1277
+ role: "user",
1278
+ content: [
1279
+ "Generate a JSON object that reflects the following facts.",
1280
+ "JSON Schema:",
1281
+ schemaStr,
1282
+ "Facts:",
1283
+ tc.promptFacts,
1284
+ "Output must be a single JSON only, with no additional text."
1285
+ ].join("\n\n")
1286
+ }
1287
+ ];
1288
+ const { text } = await generateText2({ model, messages });
1289
+ let parsed;
1290
+ try {
1291
+ parsed = extractFirstJsonBlock(text);
1292
+ } catch {
1293
+ }
1294
+ if (parsed === void 0) {
1295
+ logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
1296
+ continue;
1297
+ }
1298
+ const validate = ajv.compile(tc.schema);
1299
+ const valid = validate(parsed);
1300
+ if (valid) {
1301
+ schemaValidCount++;
1302
+ logs.push(`[PASS] ${tc.id}`);
1303
+ } else {
1304
+ logs.push(
1305
+ `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
1306
+ );
1307
+ }
1308
+ } catch (e) {
1309
+ const msg = e instanceof Error ? e.message : String(e);
1310
+ logs.push(`[ERROR] ${tc.id}: ${msg}`);
1311
+ }
1312
+ }
1313
+ const total = tests.length;
1314
+ const score = total > 0 ? schemaValidCount / total : 0;
1315
+ return {
1316
+ score,
1317
+ success: score >= 0.8,
1318
+ metrics: {
1319
+ total_cases: total,
1320
+ schema_valid_count: schemaValidCount,
1321
+ accuracy: score
1322
+ },
1323
+ logs
1324
+ };
1325
+ }
1326
+ };
1296
1327
  export {
1297
1328
  bfclMultipleBenchmark,
1298
1329
  bfclParallelBenchmark,