@ai-sdk-tool/eval 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -5
- package/dist/index.cjs +401 -370
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -4
- package/dist/index.d.ts +4 -4
- package/dist/index.js +400 -369
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
package/dist/index.cjs
CHANGED
|
@@ -113,7 +113,7 @@ function uniqueLines(lines) {
|
|
|
113
113
|
function suggestFixFromDiff(parsed) {
|
|
114
114
|
const suggestions = [];
|
|
115
115
|
const { error_type, expected, actual, diff } = parsed ?? {};
|
|
116
|
-
if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
|
|
116
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
|
|
117
117
|
const expectedName = expected?.function;
|
|
118
118
|
const actualName = actual?.function;
|
|
119
119
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
@@ -127,23 +127,23 @@ function suggestFixFromDiff(parsed) {
|
|
|
127
127
|
);
|
|
128
128
|
}
|
|
129
129
|
}
|
|
130
|
-
if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
|
|
131
|
-
const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
|
|
130
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
|
|
131
|
+
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
132
132
|
if (missing.length) {
|
|
133
133
|
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
134
134
|
}
|
|
135
135
|
}
|
|
136
|
-
if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
|
|
137
|
-
const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
|
|
136
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
|
|
137
|
+
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
138
138
|
if (extras.length) {
|
|
139
139
|
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
140
140
|
}
|
|
141
141
|
}
|
|
142
|
-
if (diff && diff.some((d) => d.startsWith("@@ param "))) {
|
|
143
|
-
const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
|
|
142
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
143
|
+
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
144
144
|
for (const param of targets) {
|
|
145
145
|
const allowedLine = diff.find(
|
|
146
|
-
(d) => d.startsWith("- expected one of:")
|
|
146
|
+
(d) => String(d).startsWith("- expected one of:")
|
|
147
147
|
);
|
|
148
148
|
if (allowedLine) {
|
|
149
149
|
const allowed = allowedLine.replace("- expected one of: ", "");
|
|
@@ -350,17 +350,16 @@ async function evaluate(options) {
|
|
|
350
350
|
return allResults;
|
|
351
351
|
}
|
|
352
352
|
|
|
353
|
-
// src/benchmarks/
|
|
353
|
+
// src/benchmarks/bfcl.ts
|
|
354
354
|
var import_ai = require("ai");
|
|
355
|
-
var import_ajv = __toESM(require("ajv"), 1);
|
|
356
355
|
var import_fs2 = require("fs");
|
|
357
356
|
var import_path2 = __toESM(require("path"), 1);
|
|
358
357
|
|
|
359
358
|
// src/utils/paths.ts
|
|
360
359
|
var import_fs = __toESM(require("fs"), 1);
|
|
360
|
+
var import_module = require("module");
|
|
361
361
|
var import_path = __toESM(require("path"), 1);
|
|
362
362
|
var import_url = require("url");
|
|
363
|
-
var import_module = require("module");
|
|
364
363
|
function resolveDataDir(fromModuleUrl) {
|
|
365
364
|
const moduleUrl = fromModuleUrl;
|
|
366
365
|
const override = process.env.BFCL_DATA_DIR;
|
|
@@ -408,263 +407,6 @@ function resolveDataDir(fromModuleUrl) {
|
|
|
408
407
|
return import_path.default.join(pkgRoot, "data");
|
|
409
408
|
}
|
|
410
409
|
|
|
411
|
-
// src/benchmarks/json-generation.ts
|
|
412
|
-
function extractFirstJsonBlock(text) {
|
|
413
|
-
try {
|
|
414
|
-
return JSON.parse(text);
|
|
415
|
-
} catch {
|
|
416
|
-
}
|
|
417
|
-
const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
|
|
418
|
-
if (fenceMatch) {
|
|
419
|
-
const inner = fenceMatch[1].trim();
|
|
420
|
-
try {
|
|
421
|
-
return JSON.parse(inner);
|
|
422
|
-
} catch {
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
const startIdxObj = text.indexOf("{");
|
|
426
|
-
const startIdxArr = text.indexOf("[");
|
|
427
|
-
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
428
|
-
if (start === void 0) return void 0;
|
|
429
|
-
const open = text[start] === "{" ? "{" : "[";
|
|
430
|
-
const close = open === "{" ? "}" : "]";
|
|
431
|
-
let depth = 0;
|
|
432
|
-
for (let i = start; i < text.length; i++) {
|
|
433
|
-
const ch = text[i];
|
|
434
|
-
if (ch === open) depth++;
|
|
435
|
-
else if (ch === close) depth--;
|
|
436
|
-
if (depth === 0) {
|
|
437
|
-
const candidate = text.slice(start, i + 1);
|
|
438
|
-
try {
|
|
439
|
-
return JSON.parse(candidate);
|
|
440
|
-
} catch {
|
|
441
|
-
}
|
|
442
|
-
break;
|
|
443
|
-
}
|
|
444
|
-
}
|
|
445
|
-
return void 0;
|
|
446
|
-
}
|
|
447
|
-
function subsetMatch(expected, actual) {
|
|
448
|
-
if (expected === null || typeof expected !== "object") {
|
|
449
|
-
return expected === actual;
|
|
450
|
-
}
|
|
451
|
-
if (Array.isArray(expected)) {
|
|
452
|
-
if (!Array.isArray(actual)) return false;
|
|
453
|
-
for (let i = 0; i < expected.length; i++) {
|
|
454
|
-
if (!subsetMatch(expected[i], actual[i])) return false;
|
|
455
|
-
}
|
|
456
|
-
return true;
|
|
457
|
-
}
|
|
458
|
-
if (actual === null || typeof actual !== "object") return false;
|
|
459
|
-
const eObj = expected;
|
|
460
|
-
const aObj = actual;
|
|
461
|
-
for (const key of Object.keys(eObj)) {
|
|
462
|
-
if (!subsetMatch(eObj[key], aObj[key])) return false;
|
|
463
|
-
}
|
|
464
|
-
return true;
|
|
465
|
-
}
|
|
466
|
-
var jsonGenerationBenchmark = {
|
|
467
|
-
name: "json-generation",
|
|
468
|
-
version: "2.1.0",
|
|
469
|
-
description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
|
|
470
|
-
async run(model) {
|
|
471
|
-
const logs = [];
|
|
472
|
-
const ajv = new import_ajv.default({ allErrors: true, strict: false });
|
|
473
|
-
let schemaValidCount = 0;
|
|
474
|
-
let valueMatchCount = 0;
|
|
475
|
-
let correctCount = 0;
|
|
476
|
-
let tests = [];
|
|
477
|
-
const expectedMap = /* @__PURE__ */ new Map();
|
|
478
|
-
try {
|
|
479
|
-
const dataDir = resolveDataDir();
|
|
480
|
-
const testsJsonl = await import_fs2.promises.readFile(
|
|
481
|
-
import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
482
|
-
"utf-8"
|
|
483
|
-
);
|
|
484
|
-
const expectedJsonl = await import_fs2.promises.readFile(
|
|
485
|
-
import_path2.default.join(dataDir, "json_generation_expected.jsonl"),
|
|
486
|
-
"utf-8"
|
|
487
|
-
);
|
|
488
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
489
|
-
const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
490
|
-
for (const r of expecteds) expectedMap.set(r.id, r);
|
|
491
|
-
} catch (e) {
|
|
492
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
493
|
-
return {
|
|
494
|
-
score: 0,
|
|
495
|
-
success: false,
|
|
496
|
-
metrics: {},
|
|
497
|
-
logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
|
|
498
|
-
error: e
|
|
499
|
-
};
|
|
500
|
-
}
|
|
501
|
-
for (const tc of tests) {
|
|
502
|
-
try {
|
|
503
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
504
|
-
const messages = [
|
|
505
|
-
{
|
|
506
|
-
role: "system",
|
|
507
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
508
|
-
},
|
|
509
|
-
{
|
|
510
|
-
role: "user",
|
|
511
|
-
content: [
|
|
512
|
-
"Generate a JSON object that reflects the following facts.",
|
|
513
|
-
"JSON Schema:",
|
|
514
|
-
schemaStr,
|
|
515
|
-
"Facts:",
|
|
516
|
-
tc.promptFacts,
|
|
517
|
-
"Output must be a single JSON only, with no additional text."
|
|
518
|
-
].join("\n\n")
|
|
519
|
-
}
|
|
520
|
-
];
|
|
521
|
-
const { text } = await (0, import_ai.generateText)({ model, messages });
|
|
522
|
-
let parsed;
|
|
523
|
-
try {
|
|
524
|
-
parsed = extractFirstJsonBlock(text);
|
|
525
|
-
} catch {
|
|
526
|
-
}
|
|
527
|
-
if (parsed === void 0) {
|
|
528
|
-
logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
|
|
529
|
-
continue;
|
|
530
|
-
}
|
|
531
|
-
const validate = ajv.compile(tc.schema);
|
|
532
|
-
const valid = validate(parsed);
|
|
533
|
-
if (valid) schemaValidCount++;
|
|
534
|
-
else
|
|
535
|
-
logs.push(
|
|
536
|
-
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
537
|
-
);
|
|
538
|
-
const expectedRec = expectedMap.get(tc.id);
|
|
539
|
-
if (!expectedRec) {
|
|
540
|
-
logs.push(
|
|
541
|
-
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
542
|
-
);
|
|
543
|
-
}
|
|
544
|
-
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
545
|
-
if (valuesOk) valueMatchCount++;
|
|
546
|
-
if (valid && valuesOk) {
|
|
547
|
-
correctCount++;
|
|
548
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
549
|
-
} else {
|
|
550
|
-
logs.push(
|
|
551
|
-
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
552
|
-
parsed
|
|
553
|
-
)}`
|
|
554
|
-
);
|
|
555
|
-
}
|
|
556
|
-
} catch (e) {
|
|
557
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
558
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
559
|
-
}
|
|
560
|
-
}
|
|
561
|
-
const total = tests.length;
|
|
562
|
-
const score = correctCount / total;
|
|
563
|
-
return {
|
|
564
|
-
score,
|
|
565
|
-
success: score >= 0.8,
|
|
566
|
-
metrics: {
|
|
567
|
-
total_cases: total,
|
|
568
|
-
correct_count: correctCount,
|
|
569
|
-
schema_valid_count: schemaValidCount,
|
|
570
|
-
value_match_count: valueMatchCount,
|
|
571
|
-
accuracy: score
|
|
572
|
-
},
|
|
573
|
-
logs
|
|
574
|
-
};
|
|
575
|
-
}
|
|
576
|
-
};
|
|
577
|
-
var jsonGenerationSchemaOnlyBenchmark = {
|
|
578
|
-
name: "json-generation-schema-only",
|
|
579
|
-
version: "1.0.1",
|
|
580
|
-
description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
|
|
581
|
-
async run(model) {
|
|
582
|
-
const logs = [];
|
|
583
|
-
const ajv = new import_ajv.default({ allErrors: true, strict: false });
|
|
584
|
-
let tests = [];
|
|
585
|
-
try {
|
|
586
|
-
const dataDir = resolveDataDir();
|
|
587
|
-
const testsJsonl = await import_fs2.promises.readFile(
|
|
588
|
-
import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
589
|
-
"utf-8"
|
|
590
|
-
);
|
|
591
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
592
|
-
} catch (e) {
|
|
593
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
594
|
-
return {
|
|
595
|
-
score: 0,
|
|
596
|
-
success: false,
|
|
597
|
-
metrics: {},
|
|
598
|
-
logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
|
|
599
|
-
error: e
|
|
600
|
-
};
|
|
601
|
-
}
|
|
602
|
-
let schemaValidCount = 0;
|
|
603
|
-
for (const tc of tests) {
|
|
604
|
-
try {
|
|
605
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
606
|
-
const messages = [
|
|
607
|
-
{
|
|
608
|
-
role: "system",
|
|
609
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
610
|
-
},
|
|
611
|
-
{
|
|
612
|
-
role: "user",
|
|
613
|
-
content: [
|
|
614
|
-
"Generate a JSON object that reflects the following facts.",
|
|
615
|
-
"JSON Schema:",
|
|
616
|
-
schemaStr,
|
|
617
|
-
"Facts:",
|
|
618
|
-
tc.promptFacts,
|
|
619
|
-
"Output must be a single JSON only, with no additional text."
|
|
620
|
-
].join("\n\n")
|
|
621
|
-
}
|
|
622
|
-
];
|
|
623
|
-
const { text } = await (0, import_ai.generateText)({ model, messages });
|
|
624
|
-
let parsed;
|
|
625
|
-
try {
|
|
626
|
-
parsed = extractFirstJsonBlock(text);
|
|
627
|
-
} catch {
|
|
628
|
-
}
|
|
629
|
-
if (parsed === void 0) {
|
|
630
|
-
logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
|
|
631
|
-
continue;
|
|
632
|
-
}
|
|
633
|
-
const validate = ajv.compile(tc.schema);
|
|
634
|
-
const valid = validate(parsed);
|
|
635
|
-
if (valid) {
|
|
636
|
-
schemaValidCount++;
|
|
637
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
638
|
-
} else {
|
|
639
|
-
logs.push(
|
|
640
|
-
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
641
|
-
);
|
|
642
|
-
}
|
|
643
|
-
} catch (e) {
|
|
644
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
645
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
646
|
-
}
|
|
647
|
-
}
|
|
648
|
-
const total = tests.length;
|
|
649
|
-
const score = total > 0 ? schemaValidCount / total : 0;
|
|
650
|
-
return {
|
|
651
|
-
score,
|
|
652
|
-
success: score >= 0.8,
|
|
653
|
-
metrics: {
|
|
654
|
-
total_cases: total,
|
|
655
|
-
schema_valid_count: schemaValidCount,
|
|
656
|
-
accuracy: score
|
|
657
|
-
},
|
|
658
|
-
logs
|
|
659
|
-
};
|
|
660
|
-
}
|
|
661
|
-
};
|
|
662
|
-
|
|
663
|
-
// src/benchmarks/bfcl.ts
|
|
664
|
-
var import_ai2 = require("ai");
|
|
665
|
-
var import_fs3 = require("fs");
|
|
666
|
-
var import_path3 = __toESM(require("path"), 1);
|
|
667
|
-
|
|
668
410
|
// src/benchmarks/bfcl/ast-checker.ts
|
|
669
411
|
function standardizeString(input) {
|
|
670
412
|
if (typeof input !== "string") return input;
|
|
@@ -674,7 +416,7 @@ function standardizeString(input) {
|
|
|
674
416
|
function checkStringValue(param, modelValue, possibleAnswers) {
|
|
675
417
|
const standardizedModelValue = standardizeString(modelValue);
|
|
676
418
|
const standardizedPossibleAnswers = possibleAnswers.map(
|
|
677
|
-
(ans) => standardizeString(ans)
|
|
419
|
+
(ans) => standardizeString(String(ans))
|
|
678
420
|
);
|
|
679
421
|
if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
|
|
680
422
|
return {
|
|
@@ -701,8 +443,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
701
443
|
};
|
|
702
444
|
}
|
|
703
445
|
const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
|
|
446
|
+
const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
|
|
704
447
|
for (const param of requiredParams) {
|
|
705
|
-
if (!(param in
|
|
448
|
+
if (!(param in argsObj)) {
|
|
706
449
|
return {
|
|
707
450
|
valid: false,
|
|
708
451
|
error: `Missing required parameter: '${param}'.`,
|
|
@@ -710,87 +453,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
710
453
|
};
|
|
711
454
|
}
|
|
712
455
|
}
|
|
713
|
-
|
|
714
|
-
const
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
valid: false,
|
|
718
|
-
error: `Unexpected parameter: '${paramName}'.`,
|
|
719
|
-
error_type: "simple_function_checker:unexpected_param"
|
|
720
|
-
};
|
|
721
|
-
}
|
|
722
|
-
const possibleValues = possibleAnswerParams[paramName];
|
|
723
|
-
if (typeof modelValue === "string") {
|
|
724
|
-
const result = checkStringValue(paramName, modelValue, possibleValues);
|
|
725
|
-
if (!result.valid) return result;
|
|
726
|
-
} else if (Array.isArray(modelValue)) {
|
|
727
|
-
const modelValueStr = JSON.stringify(
|
|
728
|
-
modelValue.map((v) => standardizeString(v.toString())).sort()
|
|
729
|
-
);
|
|
730
|
-
const hasMatch = possibleValues.some(
|
|
731
|
-
(p) => JSON.stringify(
|
|
732
|
-
p.map((v) => standardizeString(v.toString())).sort()
|
|
733
|
-
) === modelValueStr
|
|
734
|
-
);
|
|
735
|
-
if (!hasMatch) {
|
|
456
|
+
if (modelArgs && typeof modelArgs === "object") {
|
|
457
|
+
for (const paramName of Object.keys(argsObj)) {
|
|
458
|
+
const modelValue = argsObj[paramName];
|
|
459
|
+
if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
|
|
736
460
|
return {
|
|
737
461
|
valid: false,
|
|
738
|
-
error: `
|
|
739
|
-
|
|
740
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
741
|
-
error_type: "value_error:list"
|
|
462
|
+
error: `Unexpected parameter: '${paramName}'.`,
|
|
463
|
+
error_type: "simple_function_checker:unexpected_param"
|
|
742
464
|
};
|
|
743
465
|
}
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
466
|
+
const possibleValues = possibleAnswerParams[paramName];
|
|
467
|
+
if (typeof modelValue === "string") {
|
|
468
|
+
const result = checkStringValue(
|
|
469
|
+
paramName,
|
|
470
|
+
modelValue,
|
|
471
|
+
possibleValues ?? []
|
|
472
|
+
);
|
|
473
|
+
if (!result.valid) return result;
|
|
474
|
+
} else if (Array.isArray(modelValue)) {
|
|
475
|
+
const modelValueStr = JSON.stringify(
|
|
476
|
+
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
477
|
+
);
|
|
478
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
479
|
+
if (!Array.isArray(p)) return false;
|
|
480
|
+
return JSON.stringify(
|
|
481
|
+
p.map((v) => standardizeString(String(v))).sort()
|
|
482
|
+
) === modelValueStr;
|
|
483
|
+
}) : false;
|
|
484
|
+
if (!hasMatch) {
|
|
485
|
+
return {
|
|
486
|
+
valid: false,
|
|
487
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
488
|
+
modelValue
|
|
489
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
490
|
+
error_type: "value_error:list"
|
|
491
|
+
};
|
|
492
|
+
}
|
|
493
|
+
} else {
|
|
494
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
|
|
495
|
+
if (modelValue === possibleValue) return true;
|
|
496
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
497
|
+
try {
|
|
498
|
+
const normalizeObject = (obj) => {
|
|
499
|
+
if (Array.isArray(obj)) {
|
|
500
|
+
return obj.map(normalizeObject);
|
|
501
|
+
}
|
|
502
|
+
if (obj && typeof obj === "object") {
|
|
503
|
+
const normalized = {};
|
|
504
|
+
for (const [key, value] of Object.entries(
|
|
505
|
+
obj
|
|
506
|
+
)) {
|
|
507
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
508
|
+
normalized[key] = value[0];
|
|
509
|
+
} else {
|
|
510
|
+
normalized[key] = normalizeObject(value);
|
|
511
|
+
}
|
|
760
512
|
}
|
|
513
|
+
return normalized;
|
|
761
514
|
}
|
|
762
|
-
return
|
|
763
|
-
}
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
return false;
|
|
515
|
+
return obj;
|
|
516
|
+
};
|
|
517
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
518
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
519
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
520
|
+
} catch {
|
|
521
|
+
return false;
|
|
522
|
+
}
|
|
771
523
|
}
|
|
524
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
525
|
+
return modelValue.toString() === possibleValue;
|
|
526
|
+
}
|
|
527
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
528
|
+
return modelValue === possibleValue.toString();
|
|
529
|
+
}
|
|
530
|
+
return false;
|
|
531
|
+
}) : false;
|
|
532
|
+
if (!hasMatch) {
|
|
533
|
+
return {
|
|
534
|
+
valid: false,
|
|
535
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
536
|
+
modelValue
|
|
537
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
538
|
+
error_type: "value_error:other"
|
|
539
|
+
};
|
|
772
540
|
}
|
|
773
|
-
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
774
|
-
return modelValue.toString() === possibleValue;
|
|
775
|
-
}
|
|
776
|
-
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
777
|
-
return modelValue === possibleValue.toString();
|
|
778
|
-
}
|
|
779
|
-
return false;
|
|
780
|
-
});
|
|
781
|
-
if (!hasMatch) {
|
|
782
|
-
return {
|
|
783
|
-
valid: false,
|
|
784
|
-
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
785
|
-
modelValue
|
|
786
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
787
|
-
error_type: "value_error:other"
|
|
788
|
-
};
|
|
789
541
|
}
|
|
790
542
|
}
|
|
791
543
|
}
|
|
792
544
|
for (const paramName in possibleAnswerParams) {
|
|
793
|
-
|
|
545
|
+
const val = possibleAnswerParams[paramName];
|
|
546
|
+
const isOptional = Array.isArray(val) && val.includes("");
|
|
547
|
+
if (!(paramName in argsObj) && !isOptional) {
|
|
794
548
|
return {
|
|
795
549
|
valid: false,
|
|
796
550
|
error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
|
|
@@ -876,10 +630,10 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
876
630
|
const category = testCase.id.split("_")[0];
|
|
877
631
|
try {
|
|
878
632
|
if (category === "simple") {
|
|
879
|
-
if (!modelOutput || modelOutput.length !== 1) {
|
|
633
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
880
634
|
return {
|
|
881
635
|
valid: false,
|
|
882
|
-
error: `Expected 1 function call, but got ${modelOutput
|
|
636
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
883
637
|
error_type: "simple:wrong_count"
|
|
884
638
|
};
|
|
885
639
|
}
|
|
@@ -928,12 +682,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
928
682
|
try {
|
|
929
683
|
const dataPath = resolveDataDir();
|
|
930
684
|
logs.push(`[INFO] Using data dir: ${dataPath}`);
|
|
931
|
-
const testCasesJson = await
|
|
932
|
-
|
|
685
|
+
const testCasesJson = await import_fs2.promises.readFile(
|
|
686
|
+
import_path2.default.join(dataPath, testDataFile),
|
|
933
687
|
"utf-8"
|
|
934
688
|
);
|
|
935
|
-
const possibleAnswersJson = await
|
|
936
|
-
|
|
689
|
+
const possibleAnswersJson = await import_fs2.promises.readFile(
|
|
690
|
+
import_path2.default.join(dataPath, answerDataFile),
|
|
937
691
|
"utf-8"
|
|
938
692
|
);
|
|
939
693
|
testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -950,19 +704,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
950
704
|
);
|
|
951
705
|
}
|
|
952
706
|
const fixSchema = (schema) => {
|
|
953
|
-
if (!schema || typeof schema !== "object")
|
|
707
|
+
if (!schema || typeof schema !== "object")
|
|
708
|
+
return { type: "object", properties: {} };
|
|
954
709
|
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
|
|
955
|
-
if (copy
|
|
956
|
-
if (copy.type
|
|
957
|
-
|
|
958
|
-
copy.type
|
|
959
|
-
|
|
960
|
-
if (copy.properties && typeof copy.properties === "object") {
|
|
961
|
-
for (const k of Object.keys(copy.properties)) {
|
|
962
|
-
copy.properties[k] = fixSchema(copy.properties[k]);
|
|
710
|
+
if (!Array.isArray(copy)) {
|
|
711
|
+
if (copy.type) {
|
|
712
|
+
if (copy.type === "dict") copy.type = "object";
|
|
713
|
+
if (copy.type === "integer" || copy.type === "float")
|
|
714
|
+
copy.type = "number";
|
|
963
715
|
}
|
|
716
|
+
if (copy.properties && typeof copy.properties === "object") {
|
|
717
|
+
for (const k of Object.keys(copy.properties)) {
|
|
718
|
+
copy.properties[k] = fixSchema(
|
|
719
|
+
copy.properties[k]
|
|
720
|
+
);
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
if (copy.items) copy.items = fixSchema(copy.items);
|
|
724
|
+
return copy;
|
|
964
725
|
}
|
|
965
|
-
if (copy.items) copy.items = fixSchema(copy.items);
|
|
966
726
|
return copy;
|
|
967
727
|
};
|
|
968
728
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
@@ -982,7 +742,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
982
742
|
};
|
|
983
743
|
const transformedTools = tools.map((t) => {
|
|
984
744
|
const fixed = fixSchema(t.parameters);
|
|
985
|
-
const
|
|
745
|
+
const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
|
|
746
|
+
const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
|
|
986
747
|
const sanitized = sanitizeName(t.name);
|
|
987
748
|
nameMap.set(sanitized, t.name);
|
|
988
749
|
return {
|
|
@@ -995,9 +756,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
995
756
|
const toolsMap = Object.fromEntries(
|
|
996
757
|
transformedTools.map((t) => [
|
|
997
758
|
t.name,
|
|
998
|
-
(0,
|
|
759
|
+
(0, import_ai.tool)({
|
|
999
760
|
description: typeof t.description === "string" ? t.description : void 0,
|
|
1000
|
-
inputSchema: (0,
|
|
761
|
+
inputSchema: (0, import_ai.jsonSchema)(t.inputSchema)
|
|
1001
762
|
})
|
|
1002
763
|
])
|
|
1003
764
|
);
|
|
@@ -1012,7 +773,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1012
773
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
1013
774
|
);
|
|
1014
775
|
}
|
|
1015
|
-
const { toolCalls, text, finishReason } = await (0,
|
|
776
|
+
const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
|
|
1016
777
|
model,
|
|
1017
778
|
messages: flatMessages,
|
|
1018
779
|
tools: toolsMap,
|
|
@@ -1021,7 +782,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1021
782
|
providerOptions: {
|
|
1022
783
|
toolCallMiddleware: {
|
|
1023
784
|
originalToolSchemas: Object.fromEntries(
|
|
1024
|
-
transformedTools.map((t) => [
|
|
785
|
+
transformedTools.map((t) => [
|
|
786
|
+
t.name,
|
|
787
|
+
t.inputSchema
|
|
788
|
+
])
|
|
1025
789
|
)
|
|
1026
790
|
}
|
|
1027
791
|
}
|
|
@@ -1074,10 +838,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1074
838
|
const summarizeArgs = (args) => {
|
|
1075
839
|
if (args == null) return args;
|
|
1076
840
|
if (typeof args !== "object") return args;
|
|
1077
|
-
return Object.keys(args).sort().reduce(
|
|
1078
|
-
acc
|
|
1079
|
-
|
|
1080
|
-
|
|
841
|
+
return Object.keys(args).sort().reduce(
|
|
842
|
+
(acc, k) => {
|
|
843
|
+
acc[k] = args[k];
|
|
844
|
+
return acc;
|
|
845
|
+
},
|
|
846
|
+
{}
|
|
847
|
+
);
|
|
1081
848
|
};
|
|
1082
849
|
const expected = {};
|
|
1083
850
|
const actual = {};
|
|
@@ -1098,19 +865,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1098
865
|
diff.push(`- ${expectedFuncName}`);
|
|
1099
866
|
diff.push(`+ ${receivedName}`);
|
|
1100
867
|
}
|
|
1101
|
-
if (expectedParams && receivedArgs) {
|
|
868
|
+
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1102
869
|
const required = funcDesc?.parameters?.required ?? [];
|
|
1103
870
|
for (const req of required) {
|
|
1104
871
|
if (!(req in receivedArgs)) {
|
|
1105
872
|
diff.push(`- missing required param: ${req}`);
|
|
1106
873
|
}
|
|
1107
874
|
}
|
|
1108
|
-
for (const k of Object.keys(
|
|
875
|
+
for (const k of Object.keys(
|
|
876
|
+
receivedArgs
|
|
877
|
+
)) {
|
|
1109
878
|
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1110
879
|
diff.push(`+ unexpected param: ${k}`);
|
|
1111
880
|
}
|
|
1112
881
|
}
|
|
1113
|
-
for (const k of Object.keys(
|
|
882
|
+
for (const k of Object.keys(
|
|
883
|
+
receivedArgs
|
|
884
|
+
)) {
|
|
1114
885
|
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1115
886
|
const allowed = expectedParams[k];
|
|
1116
887
|
const got = receivedArgs[k];
|
|
@@ -1183,13 +954,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1183
954
|
);
|
|
1184
955
|
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1185
956
|
diff.push(`@@ function ${fname}`);
|
|
1186
|
-
if (expectedParamsAllowed && receivedArgs) {
|
|
957
|
+
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1187
958
|
for (const req of requiredParams) {
|
|
1188
959
|
if (!(req in receivedArgs)) {
|
|
1189
960
|
diff.push(`- missing required param: ${req}`);
|
|
1190
961
|
}
|
|
1191
962
|
}
|
|
1192
|
-
for (const k of Object.keys(
|
|
963
|
+
for (const k of Object.keys(
|
|
964
|
+
receivedArgs
|
|
965
|
+
)) {
|
|
1193
966
|
if (!Object.prototype.hasOwnProperty.call(
|
|
1194
967
|
expectedParamsAllowed,
|
|
1195
968
|
k
|
|
@@ -1197,7 +970,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1197
970
|
diff.push(`+ unexpected param: ${k}`);
|
|
1198
971
|
}
|
|
1199
972
|
}
|
|
1200
|
-
for (const k of Object.keys(
|
|
973
|
+
for (const k of Object.keys(
|
|
974
|
+
receivedArgs
|
|
975
|
+
)) {
|
|
1201
976
|
if (Object.prototype.hasOwnProperty.call(
|
|
1202
977
|
expectedParamsAllowed,
|
|
1203
978
|
k
|
|
@@ -1335,6 +1110,262 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
|
1335
1110
|
"BFCL_v3_parallel_multiple.json",
|
|
1336
1111
|
"BFCL_v3_parallel_multiple_possible_answer.json"
|
|
1337
1112
|
);
|
|
1113
|
+
|
|
1114
|
+
// src/benchmarks/json-generation.ts
|
|
1115
|
+
var import_ai2 = require("ai");
|
|
1116
|
+
var import_ajv = __toESM(require("ajv"), 1);
|
|
1117
|
+
var import_fs3 = require("fs");
|
|
1118
|
+
var import_path3 = __toESM(require("path"), 1);
|
|
1119
|
+
function extractFirstJsonBlock(text) {
|
|
1120
|
+
try {
|
|
1121
|
+
return JSON.parse(text);
|
|
1122
|
+
} catch {
|
|
1123
|
+
}
|
|
1124
|
+
const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
|
|
1125
|
+
if (fenceMatch) {
|
|
1126
|
+
const inner = fenceMatch[1].trim();
|
|
1127
|
+
try {
|
|
1128
|
+
return JSON.parse(inner);
|
|
1129
|
+
} catch {
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
1132
|
+
const startIdxObj = text.indexOf("{");
|
|
1133
|
+
const startIdxArr = text.indexOf("[");
|
|
1134
|
+
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
1135
|
+
if (start === void 0) return void 0;
|
|
1136
|
+
const open = text[start] === "{" ? "{" : "[";
|
|
1137
|
+
const close = open === "{" ? "}" : "]";
|
|
1138
|
+
let depth = 0;
|
|
1139
|
+
for (let i = start; i < text.length; i++) {
|
|
1140
|
+
const ch = text[i];
|
|
1141
|
+
if (ch === open) depth++;
|
|
1142
|
+
else if (ch === close) depth--;
|
|
1143
|
+
if (depth === 0) {
|
|
1144
|
+
const candidate = text.slice(start, i + 1);
|
|
1145
|
+
try {
|
|
1146
|
+
return JSON.parse(candidate);
|
|
1147
|
+
} catch {
|
|
1148
|
+
}
|
|
1149
|
+
break;
|
|
1150
|
+
}
|
|
1151
|
+
}
|
|
1152
|
+
return void 0;
|
|
1153
|
+
}
|
|
1154
|
+
/**
 * Checks whether `actual` contains everything described by `expected`.
 *
 * - Primitives and `null` must be strictly equal.
 * - Arrays are compared index by index for the length of `expected`;
 *   `actual` may have extra trailing elements.
 * - Plain objects must contain every own key of `expected` with a matching
 *   value; `actual` may have additional keys.
 *
 * @param {*} expected - The reference value.
 * @param {*} actual - The value to check against the reference.
 * @returns {boolean} `true` when `actual` satisfies `expected`.
 */
function subsetMatch(expected, actual) {
  if (expected === null || typeof expected !== "object") {
    return expected === actual;
  }
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) return false;
    for (let i = 0; i < expected.length; i++) {
      if (!subsetMatch(expected[i], actual[i])) return false;
    }
    return true;
  }
  // An expected object must be matched by an object, never by an array:
  // otherwise {} would match [] and {length: 2} would match [1, 2].
  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    // Only own properties count, so inherited members such as `__proto__`
    // or `constructor` cannot satisfy an expected key.
    if (!Object.prototype.hasOwnProperty.call(actual, key)) return false;
    if (!subsetMatch(expected[key], actual[key])) return false;
  }
  return true;
}
|
|
1173
|
+
/**
 * Benchmark that asks the model to produce JSON for each test case and
 * counts a case as correct only when the output both validates against the
 * case's JSON Schema and subset-matches the expected record with the same id.
 *
 * Test cases are read from `json_generation_tests.jsonl` and expected values
 * from `json_generation_expected.jsonl`, both in the directory returned by
 * `resolveDataDir()`.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Runs every test case against `model`.
   *
   * @param model - Language model passed through to `generateText`.
   * @returns Result object with `score` (correct / total, 0 when there are no
   *   cases), `success` (score >= 0.8), `metrics` and `logs`; when the
   *   datasets cannot be loaded, a failed result carrying the `error`.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();
    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // JSONL: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        const { text } = await (0, import_ai2.generateText)({ model, messages });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Best effort: an extraction failure is reported below as unparseable output.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) schemaValidCount++;
        else
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // A case without an expected record can never count as a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;
        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        // A failure in one case (model call, schema compilation, ...) must not abort the run.
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // Guard against 0 / 0 = NaN when the test file contains no cases.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1284
|
+
/**
 * Benchmark that checks structure only: each case passes when the JSON
 * extracted from the model output validates against the case's JSON Schema.
 * Cases are read from `json_generation_tests.jsonl` in the directory
 * returned by `resolveDataDir()`.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Runs every test case against `model`.
   *
   * @param model - Language model passed through to `generateText`.
   * @returns Result object with `score` (schema-valid / total, 0 when there
   *   are no cases), `success` (score >= 0.8), `metrics` and `logs`; when the
   *   tests cannot be loaded, a failed result carrying the `error`.
   */
  async run(model) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });

    // Load the JSONL test file: one JSON document per non-blank line.
    let cases = [];
    try {
      const testsPath = import_path3.default.join(resolveDataDir(), "json_generation_tests.jsonl");
      const raw = await import_fs3.promises.readFile(testsPath, "utf-8");
      const nonBlankLines = raw.split(/\r?\n/).filter((line) => line.trim().length > 0);
      cases = nonBlankLines.map((line) => JSON.parse(line));
    } catch (loadError) {
      const reason = loadError instanceof Error ? loadError.message : String(loadError);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${reason}`],
        error: loadError
      };
    }

    let schemaValidCount = 0;
    for (const testCase of cases) {
      try {
        const userPrompt = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          JSON.stringify(testCase.schema, null, 2),
          "Facts:",
          testCase.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ].join("\n\n");
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          { role: "user", content: userPrompt }
        ];
        const { text } = await (0, import_ai2.generateText)({ model, messages });

        // Extraction is best effort; a throw is treated like "nothing found".
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${testCase.id}: Could not parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(testCase.schema);
        if (validate(parsed)) {
          schemaValidCount++;
          logs.push(`[PASS] ${testCase.id}`);
          continue;
        }
        const details = (validate.errors || []).map((err) => `${err.instancePath} ${err.message}`).join(", ") || "unknown";
        logs.push(`[FAIL] ${testCase.id}: Schema validation errors: ${details}`);
      } catch (caseError) {
        // One failing case is logged and the run continues with the next one.
        const reason = caseError instanceof Error ? caseError.message : String(caseError);
        logs.push(`[ERROR] ${testCase.id}: ${reason}`);
      }
    }

    const total = cases.length;
    const score = total > 0 ? schemaValidCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        schema_valid_count: schemaValidCount,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1338
1369
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1339
1370
|
0 && (module.exports = {
|
|
1340
1371
|
bfclMultipleBenchmark,
|