@ai-sdk-tool/eval 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -113,7 +113,7 @@ function uniqueLines(lines) {
113
113
  function suggestFixFromDiff(parsed) {
114
114
  const suggestions = [];
115
115
  const { error_type, expected, actual, diff } = parsed ?? {};
116
- if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
116
+ if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
117
117
  const expectedName = expected?.function;
118
118
  const actualName = actual?.function;
119
119
  if (expectedName && actualName && expectedName !== actualName) {
@@ -127,23 +127,23 @@ function suggestFixFromDiff(parsed) {
127
127
  );
128
128
  }
129
129
  }
130
- if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
131
- const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
130
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
131
+ const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
132
132
  if (missing.length) {
133
133
  suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
134
134
  }
135
135
  }
136
- if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
137
- const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
136
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
137
+ const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
138
138
  if (extras.length) {
139
139
  suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
140
140
  }
141
141
  }
142
- if (diff && diff.some((d) => d.startsWith("@@ param "))) {
143
- const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
142
+ if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
143
+ const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
144
144
  for (const param of targets) {
145
145
  const allowedLine = diff.find(
146
- (d) => d.startsWith("- expected one of:")
146
+ (d) => String(d).startsWith("- expected one of:")
147
147
  );
148
148
  if (allowedLine) {
149
149
  const allowed = allowedLine.replace("- expected one of: ", "");
@@ -281,13 +281,13 @@ var reporters = {
281
281
  };
282
282
 
283
283
  // src/evaluate.ts
284
- async function runSingleBenchmark(model, benchmark, modelKey) {
284
+ async function runSingleBenchmark(model, benchmark, modelKey, config) {
285
285
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
286
286
  try {
287
287
  console.log(
288
288
  `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
289
289
  );
290
- const result = await benchmark.run(model);
290
+ const result = await benchmark.run(model, config);
291
291
  console.log(
292
292
  `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
293
293
  );
@@ -316,7 +316,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
316
316
  }
317
317
  }
318
318
  async function evaluate(options) {
319
- const { models, benchmarks, reporter = "console" } = options;
319
+ const { models, benchmarks, reporter = "console", temperature } = options;
320
320
  const modelEntries = [];
321
321
  if (Array.isArray(models)) {
322
322
  for (const m of models) modelEntries.push([void 0, m]);
@@ -335,7 +335,8 @@ async function evaluate(options) {
335
335
  const evaluationResult = await runSingleBenchmark(
336
336
  model,
337
337
  benchmark,
338
- modelKey
338
+ modelKey,
339
+ temperature !== void 0 ? { temperature } : void 0
339
340
  );
340
341
  allResults.push(evaluationResult);
341
342
  }
@@ -350,17 +351,16 @@ async function evaluate(options) {
350
351
  return allResults;
351
352
  }
352
353
 
353
- // src/benchmarks/json-generation.ts
354
+ // src/benchmarks/bfcl.ts
354
355
  var import_ai = require("ai");
355
- var import_ajv = __toESM(require("ajv"), 1);
356
356
  var import_fs2 = require("fs");
357
357
  var import_path2 = __toESM(require("path"), 1);
358
358
 
359
359
  // src/utils/paths.ts
360
360
  var import_fs = __toESM(require("fs"), 1);
361
+ var import_module = require("module");
361
362
  var import_path = __toESM(require("path"), 1);
362
363
  var import_url = require("url");
363
- var import_module = require("module");
364
364
  function resolveDataDir(fromModuleUrl) {
365
365
  const moduleUrl = fromModuleUrl;
366
366
  const override = process.env.BFCL_DATA_DIR;
@@ -408,263 +408,6 @@ function resolveDataDir(fromModuleUrl) {
408
408
  return import_path.default.join(pkgRoot, "data");
409
409
  }
410
410
 
411
- // src/benchmarks/json-generation.ts
412
- function extractFirstJsonBlock(text) {
413
- try {
414
- return JSON.parse(text);
415
- } catch {
416
- }
417
- const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
418
- if (fenceMatch) {
419
- const inner = fenceMatch[1].trim();
420
- try {
421
- return JSON.parse(inner);
422
- } catch {
423
- }
424
- }
425
- const startIdxObj = text.indexOf("{");
426
- const startIdxArr = text.indexOf("[");
427
- const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
428
- if (start === void 0) return void 0;
429
- const open = text[start] === "{" ? "{" : "[";
430
- const close = open === "{" ? "}" : "]";
431
- let depth = 0;
432
- for (let i = start; i < text.length; i++) {
433
- const ch = text[i];
434
- if (ch === open) depth++;
435
- else if (ch === close) depth--;
436
- if (depth === 0) {
437
- const candidate = text.slice(start, i + 1);
438
- try {
439
- return JSON.parse(candidate);
440
- } catch {
441
- }
442
- break;
443
- }
444
- }
445
- return void 0;
446
- }
447
- function subsetMatch(expected, actual) {
448
- if (expected === null || typeof expected !== "object") {
449
- return expected === actual;
450
- }
451
- if (Array.isArray(expected)) {
452
- if (!Array.isArray(actual)) return false;
453
- for (let i = 0; i < expected.length; i++) {
454
- if (!subsetMatch(expected[i], actual[i])) return false;
455
- }
456
- return true;
457
- }
458
- if (actual === null || typeof actual !== "object") return false;
459
- const eObj = expected;
460
- const aObj = actual;
461
- for (const key of Object.keys(eObj)) {
462
- if (!subsetMatch(eObj[key], aObj[key])) return false;
463
- }
464
- return true;
465
- }
466
- var jsonGenerationBenchmark = {
467
- name: "json-generation",
468
- version: "2.1.0",
469
- description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
470
- async run(model) {
471
- const logs = [];
472
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
473
- let schemaValidCount = 0;
474
- let valueMatchCount = 0;
475
- let correctCount = 0;
476
- let tests = [];
477
- const expectedMap = /* @__PURE__ */ new Map();
478
- try {
479
- const dataDir = resolveDataDir();
480
- const testsJsonl = await import_fs2.promises.readFile(
481
- import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
482
- "utf-8"
483
- );
484
- const expectedJsonl = await import_fs2.promises.readFile(
485
- import_path2.default.join(dataDir, "json_generation_expected.jsonl"),
486
- "utf-8"
487
- );
488
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
489
- const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
490
- for (const r of expecteds) expectedMap.set(r.id, r);
491
- } catch (e) {
492
- const msg = e instanceof Error ? e.message : String(e);
493
- return {
494
- score: 0,
495
- success: false,
496
- metrics: {},
497
- logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
498
- error: e
499
- };
500
- }
501
- for (const tc of tests) {
502
- try {
503
- const schemaStr = JSON.stringify(tc.schema, null, 2);
504
- const messages = [
505
- {
506
- role: "system",
507
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
508
- },
509
- {
510
- role: "user",
511
- content: [
512
- "Generate a JSON object that reflects the following facts.",
513
- "JSON Schema:",
514
- schemaStr,
515
- "Facts:",
516
- tc.promptFacts,
517
- "Output must be a single JSON only, with no additional text."
518
- ].join("\n\n")
519
- }
520
- ];
521
- const { text } = await (0, import_ai.generateText)({ model, messages });
522
- let parsed;
523
- try {
524
- parsed = extractFirstJsonBlock(text);
525
- } catch {
526
- }
527
- if (parsed === void 0) {
528
- logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
529
- continue;
530
- }
531
- const validate = ajv.compile(tc.schema);
532
- const valid = validate(parsed);
533
- if (valid) schemaValidCount++;
534
- else
535
- logs.push(
536
- `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
537
- );
538
- const expectedRec = expectedMap.get(tc.id);
539
- if (!expectedRec) {
540
- logs.push(
541
- `[WARN] ${tc.id}: No expected record found. Skipping value match.`
542
- );
543
- }
544
- const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
545
- if (valuesOk) valueMatchCount++;
546
- if (valid && valuesOk) {
547
- correctCount++;
548
- logs.push(`[PASS] ${tc.id}`);
549
- } else {
550
- logs.push(
551
- `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
552
- parsed
553
- )}`
554
- );
555
- }
556
- } catch (e) {
557
- const msg = e instanceof Error ? e.message : String(e);
558
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
559
- }
560
- }
561
- const total = tests.length;
562
- const score = correctCount / total;
563
- return {
564
- score,
565
- success: score >= 0.8,
566
- metrics: {
567
- total_cases: total,
568
- correct_count: correctCount,
569
- schema_valid_count: schemaValidCount,
570
- value_match_count: valueMatchCount,
571
- accuracy: score
572
- },
573
- logs
574
- };
575
- }
576
- };
577
- var jsonGenerationSchemaOnlyBenchmark = {
578
- name: "json-generation-schema-only",
579
- version: "1.0.1",
580
- description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
581
- async run(model) {
582
- const logs = [];
583
- const ajv = new import_ajv.default({ allErrors: true, strict: false });
584
- let tests = [];
585
- try {
586
- const dataDir = resolveDataDir();
587
- const testsJsonl = await import_fs2.promises.readFile(
588
- import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
589
- "utf-8"
590
- );
591
- tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
592
- } catch (e) {
593
- const msg = e instanceof Error ? e.message : String(e);
594
- return {
595
- score: 0,
596
- success: false,
597
- metrics: {},
598
- logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
599
- error: e
600
- };
601
- }
602
- let schemaValidCount = 0;
603
- for (const tc of tests) {
604
- try {
605
- const schemaStr = JSON.stringify(tc.schema, null, 2);
606
- const messages = [
607
- {
608
- role: "system",
609
- content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
610
- },
611
- {
612
- role: "user",
613
- content: [
614
- "Generate a JSON object that reflects the following facts.",
615
- "JSON Schema:",
616
- schemaStr,
617
- "Facts:",
618
- tc.promptFacts,
619
- "Output must be a single JSON only, with no additional text."
620
- ].join("\n\n")
621
- }
622
- ];
623
- const { text } = await (0, import_ai.generateText)({ model, messages });
624
- let parsed;
625
- try {
626
- parsed = extractFirstJsonBlock(text);
627
- } catch {
628
- }
629
- if (parsed === void 0) {
630
- logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
631
- continue;
632
- }
633
- const validate = ajv.compile(tc.schema);
634
- const valid = validate(parsed);
635
- if (valid) {
636
- schemaValidCount++;
637
- logs.push(`[PASS] ${tc.id}`);
638
- } else {
639
- logs.push(
640
- `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
641
- );
642
- }
643
- } catch (e) {
644
- const msg = e instanceof Error ? e.message : String(e);
645
- logs.push(`[ERROR] ${tc.id}: ${msg}`);
646
- }
647
- }
648
- const total = tests.length;
649
- const score = total > 0 ? schemaValidCount / total : 0;
650
- return {
651
- score,
652
- success: score >= 0.8,
653
- metrics: {
654
- total_cases: total,
655
- schema_valid_count: schemaValidCount,
656
- accuracy: score
657
- },
658
- logs
659
- };
660
- }
661
- };
662
-
663
- // src/benchmarks/bfcl.ts
664
- var import_ai2 = require("ai");
665
- var import_fs3 = require("fs");
666
- var import_path3 = __toESM(require("path"), 1);
667
-
668
411
  // src/benchmarks/bfcl/ast-checker.ts
669
412
  function standardizeString(input) {
670
413
  if (typeof input !== "string") return input;
@@ -674,7 +417,7 @@ function standardizeString(input) {
674
417
  function checkStringValue(param, modelValue, possibleAnswers) {
675
418
  const standardizedModelValue = standardizeString(modelValue);
676
419
  const standardizedPossibleAnswers = possibleAnswers.map(
677
- (ans) => standardizeString(ans)
420
+ (ans) => standardizeString(String(ans))
678
421
  );
679
422
  if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
680
423
  return {
@@ -701,8 +444,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
701
444
  };
702
445
  }
703
446
  const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
447
+ const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
704
448
  for (const param of requiredParams) {
705
- if (!(param in modelArgs)) {
449
+ if (!(param in argsObj)) {
706
450
  return {
707
451
  valid: false,
708
452
  error: `Missing required parameter: '${param}'.`,
@@ -710,87 +454,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
710
454
  };
711
455
  }
712
456
  }
713
- for (const paramName in modelArgs) {
714
- const modelValue = modelArgs[paramName];
715
- if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
716
- return {
717
- valid: false,
718
- error: `Unexpected parameter: '${paramName}'.`,
719
- error_type: "simple_function_checker:unexpected_param"
720
- };
721
- }
722
- const possibleValues = possibleAnswerParams[paramName];
723
- if (typeof modelValue === "string") {
724
- const result = checkStringValue(paramName, modelValue, possibleValues);
725
- if (!result.valid) return result;
726
- } else if (Array.isArray(modelValue)) {
727
- const modelValueStr = JSON.stringify(
728
- modelValue.map((v) => standardizeString(v.toString())).sort()
729
- );
730
- const hasMatch = possibleValues.some(
731
- (p) => JSON.stringify(
732
- p.map((v) => standardizeString(v.toString())).sort()
733
- ) === modelValueStr
734
- );
735
- if (!hasMatch) {
457
+ if (modelArgs && typeof modelArgs === "object") {
458
+ for (const paramName of Object.keys(argsObj)) {
459
+ const modelValue = argsObj[paramName];
460
+ if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
736
461
  return {
737
462
  valid: false,
738
- error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
739
- modelValue
740
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
741
- error_type: "value_error:list"
463
+ error: `Unexpected parameter: '${paramName}'.`,
464
+ error_type: "simple_function_checker:unexpected_param"
742
465
  };
743
466
  }
744
- } else {
745
- const hasMatch = possibleValues.some((possibleValue) => {
746
- if (modelValue === possibleValue) return true;
747
- if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
748
- try {
749
- const normalizeObject = (obj) => {
750
- if (Array.isArray(obj)) {
751
- return obj.map(normalizeObject);
752
- }
753
- if (obj && typeof obj === "object") {
754
- const normalized = {};
755
- for (const [key, value] of Object.entries(obj)) {
756
- if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
757
- normalized[key] = value[0];
758
- } else {
759
- normalized[key] = normalizeObject(value);
467
+ const possibleValues = possibleAnswerParams[paramName];
468
+ if (typeof modelValue === "string") {
469
+ const result = checkStringValue(
470
+ paramName,
471
+ modelValue,
472
+ possibleValues ?? []
473
+ );
474
+ if (!result.valid) return result;
475
+ } else if (Array.isArray(modelValue)) {
476
+ const modelValueStr = JSON.stringify(
477
+ modelValue.map((v) => standardizeString(String(v))).sort()
478
+ );
479
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
480
+ if (!Array.isArray(p)) return false;
481
+ return JSON.stringify(
482
+ p.map((v) => standardizeString(String(v))).sort()
483
+ ) === modelValueStr;
484
+ }) : false;
485
+ if (!hasMatch) {
486
+ return {
487
+ valid: false,
488
+ error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
489
+ modelValue
490
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
491
+ error_type: "value_error:list"
492
+ };
493
+ }
494
+ } else {
495
+ const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
496
+ if (modelValue === possibleValue) return true;
497
+ if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
498
+ try {
499
+ const normalizeObject = (obj) => {
500
+ if (Array.isArray(obj)) {
501
+ return obj.map(normalizeObject);
502
+ }
503
+ if (obj && typeof obj === "object") {
504
+ const normalized = {};
505
+ for (const [key, value] of Object.entries(
506
+ obj
507
+ )) {
508
+ if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
509
+ normalized[key] = value[0];
510
+ } else {
511
+ normalized[key] = normalizeObject(value);
512
+ }
760
513
  }
514
+ return normalized;
761
515
  }
762
- return normalized;
763
- }
764
- return obj;
765
- };
766
- const normalizedModel = normalizeObject(modelValue);
767
- const normalizedPossible = normalizeObject(possibleValue);
768
- return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
769
- } catch {
770
- return false;
516
+ return obj;
517
+ };
518
+ const normalizedModel = normalizeObject(modelValue);
519
+ const normalizedPossible = normalizeObject(possibleValue);
520
+ return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
521
+ } catch {
522
+ return false;
523
+ }
771
524
  }
525
+ if (typeof modelValue === "number" && typeof possibleValue === "string") {
526
+ return modelValue.toString() === possibleValue;
527
+ }
528
+ if (typeof modelValue === "string" && typeof possibleValue === "number") {
529
+ return modelValue === possibleValue.toString();
530
+ }
531
+ return false;
532
+ }) : false;
533
+ if (!hasMatch) {
534
+ return {
535
+ valid: false,
536
+ error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
537
+ modelValue
538
+ )}. Expected one of ${JSON.stringify(possibleValues)}.`,
539
+ error_type: "value_error:other"
540
+ };
772
541
  }
773
- if (typeof modelValue === "number" && typeof possibleValue === "string") {
774
- return modelValue.toString() === possibleValue;
775
- }
776
- if (typeof modelValue === "string" && typeof possibleValue === "number") {
777
- return modelValue === possibleValue.toString();
778
- }
779
- return false;
780
- });
781
- if (!hasMatch) {
782
- return {
783
- valid: false,
784
- error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
785
- modelValue
786
- )}. Expected one of ${JSON.stringify(possibleValues)}.`,
787
- error_type: "value_error:other"
788
- };
789
542
  }
790
543
  }
791
544
  }
792
545
  for (const paramName in possibleAnswerParams) {
793
- if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
546
+ const val = possibleAnswerParams[paramName];
547
+ const isOptional = Array.isArray(val) && val.includes("");
548
+ if (!(paramName in argsObj) && !isOptional) {
794
549
  return {
795
550
  valid: false,
796
551
  error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
@@ -876,10 +631,10 @@ function check(testCase, modelOutput, possibleAnswer) {
876
631
  const category = testCase.id.split("_")[0];
877
632
  try {
878
633
  if (category === "simple") {
879
- if (!modelOutput || modelOutput.length !== 1) {
634
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
880
635
  return {
881
636
  valid: false,
882
- error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
637
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
883
638
  error_type: "simple:wrong_count"
884
639
  };
885
640
  }
@@ -921,19 +676,19 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
921
676
  name,
922
677
  version: "1.0.0",
923
678
  description,
924
- async run(model) {
679
+ async run(model, config) {
925
680
  const logs = [];
926
681
  let correctCount = 0;
927
682
  let testCases = [];
928
683
  try {
929
684
  const dataPath = resolveDataDir();
930
685
  logs.push(`[INFO] Using data dir: ${dataPath}`);
931
- const testCasesJson = await import_fs3.promises.readFile(
932
- import_path3.default.join(dataPath, testDataFile),
686
+ const testCasesJson = await import_fs2.promises.readFile(
687
+ import_path2.default.join(dataPath, testDataFile),
933
688
  "utf-8"
934
689
  );
935
- const possibleAnswersJson = await import_fs3.promises.readFile(
936
- import_path3.default.join(dataPath, answerDataFile),
690
+ const possibleAnswersJson = await import_fs2.promises.readFile(
691
+ import_path2.default.join(dataPath, answerDataFile),
937
692
  "utf-8"
938
693
  );
939
694
  testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -950,19 +705,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
950
705
  );
951
706
  }
952
707
  const fixSchema = (schema) => {
953
- if (!schema || typeof schema !== "object") return schema;
708
+ if (!schema || typeof schema !== "object")
709
+ return { type: "object", properties: {} };
954
710
  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
955
- if (copy.type) {
956
- if (copy.type === "dict") copy.type = "object";
957
- if (copy.type === "integer" || copy.type === "float")
958
- copy.type = "number";
959
- }
960
- if (copy.properties && typeof copy.properties === "object") {
961
- for (const k of Object.keys(copy.properties)) {
962
- copy.properties[k] = fixSchema(copy.properties[k]);
711
+ if (!Array.isArray(copy)) {
712
+ if (copy.type) {
713
+ if (copy.type === "dict") copy.type = "object";
714
+ if (copy.type === "integer" || copy.type === "float")
715
+ copy.type = "number";
716
+ }
717
+ if (copy.properties && typeof copy.properties === "object") {
718
+ for (const k of Object.keys(copy.properties)) {
719
+ copy.properties[k] = fixSchema(
720
+ copy.properties[k]
721
+ );
722
+ }
963
723
  }
724
+ if (copy.items) copy.items = fixSchema(copy.items);
725
+ return copy;
964
726
  }
965
- if (copy.items) copy.items = fixSchema(copy.items);
966
727
  return copy;
967
728
  };
968
729
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
@@ -973,6 +734,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
973
734
  const runSingleCase = async (testCase) => {
974
735
  const caseLogs = [];
975
736
  const { function: tools, question: messages } = testCase;
737
+ const temp = config?.temperature;
738
+ const temperature = typeof temp === "number" ? temp : void 0;
976
739
  try {
977
740
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
978
741
  const nameMap = /* @__PURE__ */ new Map();
@@ -982,7 +745,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
982
745
  };
983
746
  const transformedTools = tools.map((t) => {
984
747
  const fixed = fixSchema(t.parameters);
985
- const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
748
+ const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
749
+ const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
986
750
  const sanitized = sanitizeName(t.name);
987
751
  nameMap.set(sanitized, t.name);
988
752
  return {
@@ -995,9 +759,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
995
759
  const toolsMap = Object.fromEntries(
996
760
  transformedTools.map((t) => [
997
761
  t.name,
998
- (0, import_ai2.tool)({
762
+ (0, import_ai.tool)({
999
763
  description: typeof t.description === "string" ? t.description : void 0,
1000
- inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
764
+ inputSchema: (0, import_ai.jsonSchema)(t.inputSchema)
1001
765
  })
1002
766
  ])
1003
767
  );
@@ -1012,16 +776,20 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1012
776
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
1013
777
  );
1014
778
  }
1015
- const { toolCalls, text, finishReason } = await (0, import_ai2.generateText)({
779
+ const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
1016
780
  model,
1017
781
  messages: flatMessages,
1018
782
  tools: toolsMap,
1019
783
  toolChoice: "auto",
784
+ ...temperature !== void 0 ? { temperature } : {},
1020
785
  // Pass original schema information to middleware
1021
786
  providerOptions: {
1022
787
  toolCallMiddleware: {
1023
788
  originalToolSchemas: Object.fromEntries(
1024
- transformedTools.map((t) => [t.name, t.inputSchema])
789
+ transformedTools.map((t) => [
790
+ t.name,
791
+ t.inputSchema
792
+ ])
1025
793
  )
1026
794
  }
1027
795
  }
@@ -1074,10 +842,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1074
842
  const summarizeArgs = (args) => {
1075
843
  if (args == null) return args;
1076
844
  if (typeof args !== "object") return args;
1077
- return Object.keys(args).sort().reduce((acc, k) => {
1078
- acc[k] = args[k];
1079
- return acc;
1080
- }, {});
845
+ return Object.keys(args).sort().reduce(
846
+ (acc, k) => {
847
+ acc[k] = args[k];
848
+ return acc;
849
+ },
850
+ {}
851
+ );
1081
852
  };
1082
853
  const expected = {};
1083
854
  const actual = {};
@@ -1098,19 +869,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1098
869
  diff.push(`- ${expectedFuncName}`);
1099
870
  diff.push(`+ ${receivedName}`);
1100
871
  }
1101
- if (expectedParams && receivedArgs) {
872
+ if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1102
873
  const required = funcDesc?.parameters?.required ?? [];
1103
874
  for (const req of required) {
1104
875
  if (!(req in receivedArgs)) {
1105
876
  diff.push(`- missing required param: ${req}`);
1106
877
  }
1107
878
  }
1108
- for (const k of Object.keys(receivedArgs)) {
879
+ for (const k of Object.keys(
880
+ receivedArgs
881
+ )) {
1109
882
  if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1110
883
  diff.push(`+ unexpected param: ${k}`);
1111
884
  }
1112
885
  }
1113
- for (const k of Object.keys(receivedArgs)) {
886
+ for (const k of Object.keys(
887
+ receivedArgs
888
+ )) {
1114
889
  if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
1115
890
  const allowed = expectedParams[k];
1116
891
  const got = receivedArgs[k];
@@ -1183,13 +958,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1183
958
  );
1184
959
  const requiredParams = funcDesc?.parameters?.required ?? [];
1185
960
  diff.push(`@@ function ${fname}`);
1186
- if (expectedParamsAllowed && receivedArgs) {
961
+ if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
1187
962
  for (const req of requiredParams) {
1188
963
  if (!(req in receivedArgs)) {
1189
964
  diff.push(`- missing required param: ${req}`);
1190
965
  }
1191
966
  }
1192
- for (const k of Object.keys(receivedArgs)) {
967
+ for (const k of Object.keys(
968
+ receivedArgs
969
+ )) {
1193
970
  if (!Object.prototype.hasOwnProperty.call(
1194
971
  expectedParamsAllowed,
1195
972
  k
@@ -1197,7 +974,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1197
974
  diff.push(`+ unexpected param: ${k}`);
1198
975
  }
1199
976
  }
1200
- for (const k of Object.keys(receivedArgs)) {
977
+ for (const k of Object.keys(
978
+ receivedArgs
979
+ )) {
1201
980
  if (Object.prototype.hasOwnProperty.call(
1202
981
  expectedParamsAllowed,
1203
982
  k
@@ -1335,6 +1114,274 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
1335
1114
  "BFCL_v3_parallel_multiple.json",
1336
1115
  "BFCL_v3_parallel_multiple_possible_answer.json"
1337
1116
  );
1117
+
1118
+ // src/benchmarks/json-generation.ts
1119
+ var import_ai2 = require("ai");
1120
+ var import_ajv = __toESM(require("ajv"), 1);
1121
+ var import_fs3 = require("fs");
1122
+ var import_path3 = __toESM(require("path"), 1);
1123
/**
 * Extracts the first JSON value from free-form model output.
 *
 * Three strategies are tried in order:
 *   1. Parse the whole text as JSON.
 *   2. Parse the contents of the first ```json fence, or failing that the
 *      first generic ``` fence.
 *   3. Starting at the first `{` or `[`, scan for the matching closing
 *      bracket and parse that slice.
 *
 * The scan in step 3 skips over JSON string literals (including backslash
 * escapes), so a bracket character inside a string value does not end the
 * candidate early.
 *
 * @param {string} text - Raw model output.
 * @returns {*} The parsed JSON value, or `undefined` when nothing parseable
 *   was found.
 */
function extractFirstJsonBlock(text) {
  try {
    return JSON.parse(text);
  } catch {
    // Not a bare JSON document; try the more lenient strategies below.
  }

  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
  if (fenceMatch) {
    const inner = fenceMatch[1].trim();
    try {
      return JSON.parse(inner);
    } catch {
      // The fenced content is not valid JSON; fall back to bracket scanning.
    }
  }

  const startIdxObj = text.indexOf("{");
  const startIdxArr = text.indexOf("[");
  const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
  if (start === undefined) return undefined;

  const open = text[start] === "{" ? "{" : "[";
  const close = open === "{" ? "}" : "]";
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];

    if (inString) {
      // Inside a string literal only an unescaped quote is significant.
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }

    if (ch === '"') {
      inString = true;
      continue;
    }

    if (ch === open) depth++;
    else if (ch === close) depth--;

    if (depth === 0) {
      const candidate = text.slice(start, i + 1);
      try {
        return JSON.parse(candidate);
      } catch {
        // The balanced slice is still not valid JSON; give up below.
      }
      // Only the first balanced candidate is considered.
      break;
    }
  }
  return undefined;
}
1158
/**
 * Recursively checks that `expected` is contained in `actual`.
 *
 * - Primitives and `null` must be strictly equal.
 * - An expected array requires an actual array; every expected element must
 *   match the actual element at the same index (extra trailing elements in
 *   `actual` are allowed).
 * - An expected plain object requires a non-array actual object; every key of
 *   `expected` must match the corresponding own property of `actual` (extra
 *   keys in `actual` are allowed).
 *
 * @param {*} expected - The reference value that must be present.
 * @param {*} actual - The value being checked.
 * @returns {boolean} `true` when `expected` is a structural subset of `actual`.
 */
function subsetMatch(expected, actual) {
  if (expected === null || typeof expected !== "object") {
    return expected === actual;
  }

  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) return false;
    for (let i = 0; i < expected.length; i++) {
      if (!subsetMatch(expected[i], actual[i])) return false;
    }
    return true;
  }

  // An expected plain object must not be satisfied by an array: without this
  // guard `{}` would match `[]` and `{ length: 2 }` would match `[1, 2]`.
  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) {
    return false;
  }

  for (const key of Object.keys(expected)) {
    // Read own properties only, so inherited members such as `__proto__` or
    // `constructor` can never satisfy an expected key.
    const actualValue = Object.prototype.hasOwnProperty.call(actual, key) ? actual[key] : undefined;
    if (!subsetMatch(expected[key], actualValue)) return false;
  }
  return true;
}
1177
/**
 * Benchmark that prompts the model with a JSON Schema plus a list of facts,
 * then scores each case on two criteria: the output validates against the
 * schema (Ajv) and the expected values are a subset of the output.
 * A case counts as correct only when both criteria hold.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Runs every test case from `json_generation_tests.jsonl` against `model`.
   *
   * @param {*} model - Language model handed to `generateText`.
   * @param {{ temperature?: number }} [config] - Optional settings; only a
   *   numeric `temperature` is forwarded to `generateText`.
   * @returns {Promise<object>} Result with `score`, `success` (score >= 0.8),
   *   `metrics` and `logs`; when the datasets cannot be loaded, a failed
   *   result carrying the original `error`.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = new Map();

    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // Both files are JSON Lines: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }

    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];

        // Forward the temperature only when the caller supplied a number.
        const temp = config?.temperature;
        const temperature = typeof temp === "number" ? temp : undefined;
        const { text } = await (0, import_ai2.generateText)({
          model,
          messages,
          ...(temperature !== undefined ? { temperature } : {})
        });

        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Extraction failure is reported through the `undefined` check below.
        }
        if (parsed === undefined) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) {
          schemaValidCount++;
        } else {
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        }

        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // Without an expected record the value check counts as failed.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;

        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }

    const total = tests.length;
    // Guard against an empty dataset: 0 / 0 would otherwise report NaN as the
    // score and accuracy.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
1294
/**
 * Structure-only variant of the JSON generation benchmark: a case passes as
 * soon as the model output validates against the case's JSON Schema; the
 * generated values themselves are not compared with anything.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Runs every case from `json_generation_tests.jsonl` against `model`.
   *
   * @param {*} model - Language model handed to `generateText`.
   * @param {{ temperature?: number }} [config] - Optional settings; only a
   *   numeric `temperature` is forwarded to `generateText`.
   * @returns {Promise<object>} Result with `score`, `success` (score >= 0.8),
   *   `metrics` and `logs`; when the test file cannot be loaded, a failed
   *   result carrying the original `error`.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });

    let cases = [];
    try {
      const dataDir = resolveDataDir();
      const testsPath = import_path3.default.join(dataDir, "json_generation_tests.jsonl");
      const raw = await import_fs3.promises.readFile(testsPath, "utf-8");
      // JSON Lines: keep non-blank lines and parse each one on its own.
      const nonBlankLines = raw.split(/\r?\n/).filter((line) => line.trim().length > 0);
      cases = nonBlankLines.map((line) => JSON.parse(line));
    } catch (loadError) {
      const reason = loadError instanceof Error ? loadError.message : String(loadError);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${reason}`],
        error: loadError
      };
    }

    let passed = 0;
    for (const tc of cases) {
      try {
        const schemaText = JSON.stringify(tc.schema, null, 2);
        const userPrompt = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          schemaText,
          "Facts:",
          tc.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ].join("\n\n");
        const request = {
          model,
          messages: [
            {
              role: "system",
              content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
            },
            { role: "user", content: userPrompt }
          ]
        };
        // The temperature key is added only when the caller gave a number.
        const requestedTemperature = config?.temperature;
        if (typeof requestedTemperature === "number") {
          request.temperature = requestedTemperature;
        }
        const { text } = await (0, import_ai2.generateText)(request);

        let output;
        try {
          output = extractFirstJsonBlock(text);
        } catch {
          // Treated the same as "nothing extracted" by the check below.
        }
        if (output === undefined) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(tc.schema);
        if (validate(output)) {
          passed++;
          logs.push(`[PASS] ${tc.id}`);
          continue;
        }
        const details = (validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ");
        logs.push(`[FAIL] ${tc.id}: Schema validation errors: ${details || "unknown"}`);
      } catch (caseError) {
        const reason = caseError instanceof Error ? caseError.message : String(caseError);
        logs.push(`[ERROR] ${tc.id}: ${reason}`);
      }
    }

    const total = cases.length;
    // An empty dataset scores 0 rather than dividing by zero.
    const score = total > 0 ? passed / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        schema_valid_count: passed,
        accuracy: score
      },
      logs
    };
  }
};
1338
1385
  // Annotate the CommonJS export names for ESM import in node:
1339
1386
  0 && (module.exports = {
1340
1387
  bfclMultipleBenchmark,