@ai-sdk-tool/eval 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -5
- package/dist/index.cjs +401 -370
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -4
- package/dist/index.d.ts +4 -4
- package/dist/index.js +400 -369
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
package/dist/index.js
CHANGED
|
@@ -71,7 +71,7 @@ function uniqueLines(lines) {
|
|
|
71
71
|
function suggestFixFromDiff(parsed) {
|
|
72
72
|
const suggestions = [];
|
|
73
73
|
const { error_type, expected, actual, diff } = parsed ?? {};
|
|
74
|
-
if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
|
|
74
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
|
|
75
75
|
const expectedName = expected?.function;
|
|
76
76
|
const actualName = actual?.function;
|
|
77
77
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
@@ -85,23 +85,23 @@ function suggestFixFromDiff(parsed) {
|
|
|
85
85
|
);
|
|
86
86
|
}
|
|
87
87
|
}
|
|
88
|
-
if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
|
|
89
|
-
const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
|
|
88
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
|
|
89
|
+
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
90
90
|
if (missing.length) {
|
|
91
91
|
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
92
92
|
}
|
|
93
93
|
}
|
|
94
|
-
if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
|
|
95
|
-
const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
|
|
94
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
|
|
95
|
+
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
96
96
|
if (extras.length) {
|
|
97
97
|
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
98
98
|
}
|
|
99
99
|
}
|
|
100
|
-
if (diff && diff.some((d) => d.startsWith("@@ param "))) {
|
|
101
|
-
const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
|
|
100
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
101
|
+
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
102
102
|
for (const param of targets) {
|
|
103
103
|
const allowedLine = diff.find(
|
|
104
|
-
(d) => d.startsWith("- expected one of:")
|
|
104
|
+
(d) => String(d).startsWith("- expected one of:")
|
|
105
105
|
);
|
|
106
106
|
if (allowedLine) {
|
|
107
107
|
const allowed = allowedLine.replace("- expected one of: ", "");
|
|
@@ -308,17 +308,16 @@ async function evaluate(options) {
|
|
|
308
308
|
return allResults;
|
|
309
309
|
}
|
|
310
310
|
|
|
311
|
-
// src/benchmarks/
|
|
312
|
-
import { generateText } from "ai";
|
|
313
|
-
import Ajv from "ajv";
|
|
311
|
+
// src/benchmarks/bfcl.ts
|
|
312
|
+
import { generateText, jsonSchema, tool } from "ai";
|
|
314
313
|
import { promises as fs2 } from "fs";
|
|
315
314
|
import path2 from "path";
|
|
316
315
|
|
|
317
316
|
// src/utils/paths.ts
|
|
318
317
|
import fs from "fs";
|
|
318
|
+
import { createRequire } from "module";
|
|
319
319
|
import path from "path";
|
|
320
320
|
import { fileURLToPath } from "url";
|
|
321
|
-
import { createRequire } from "module";
|
|
322
321
|
function resolveDataDir(fromModuleUrl) {
|
|
323
322
|
const moduleUrl = fromModuleUrl;
|
|
324
323
|
const override = process.env.BFCL_DATA_DIR;
|
|
@@ -366,263 +365,6 @@ function resolveDataDir(fromModuleUrl) {
|
|
|
366
365
|
return path.join(pkgRoot, "data");
|
|
367
366
|
}
|
|
368
367
|
|
|
369
|
-
// src/benchmarks/json-generation.ts
|
|
370
|
-
function extractFirstJsonBlock(text) {
|
|
371
|
-
try {
|
|
372
|
-
return JSON.parse(text);
|
|
373
|
-
} catch {
|
|
374
|
-
}
|
|
375
|
-
const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
|
|
376
|
-
if (fenceMatch) {
|
|
377
|
-
const inner = fenceMatch[1].trim();
|
|
378
|
-
try {
|
|
379
|
-
return JSON.parse(inner);
|
|
380
|
-
} catch {
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
const startIdxObj = text.indexOf("{");
|
|
384
|
-
const startIdxArr = text.indexOf("[");
|
|
385
|
-
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
386
|
-
if (start === void 0) return void 0;
|
|
387
|
-
const open = text[start] === "{" ? "{" : "[";
|
|
388
|
-
const close = open === "{" ? "}" : "]";
|
|
389
|
-
let depth = 0;
|
|
390
|
-
for (let i = start; i < text.length; i++) {
|
|
391
|
-
const ch = text[i];
|
|
392
|
-
if (ch === open) depth++;
|
|
393
|
-
else if (ch === close) depth--;
|
|
394
|
-
if (depth === 0) {
|
|
395
|
-
const candidate = text.slice(start, i + 1);
|
|
396
|
-
try {
|
|
397
|
-
return JSON.parse(candidate);
|
|
398
|
-
} catch {
|
|
399
|
-
}
|
|
400
|
-
break;
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
return void 0;
|
|
404
|
-
}
|
|
405
|
-
function subsetMatch(expected, actual) {
|
|
406
|
-
if (expected === null || typeof expected !== "object") {
|
|
407
|
-
return expected === actual;
|
|
408
|
-
}
|
|
409
|
-
if (Array.isArray(expected)) {
|
|
410
|
-
if (!Array.isArray(actual)) return false;
|
|
411
|
-
for (let i = 0; i < expected.length; i++) {
|
|
412
|
-
if (!subsetMatch(expected[i], actual[i])) return false;
|
|
413
|
-
}
|
|
414
|
-
return true;
|
|
415
|
-
}
|
|
416
|
-
if (actual === null || typeof actual !== "object") return false;
|
|
417
|
-
const eObj = expected;
|
|
418
|
-
const aObj = actual;
|
|
419
|
-
for (const key of Object.keys(eObj)) {
|
|
420
|
-
if (!subsetMatch(eObj[key], aObj[key])) return false;
|
|
421
|
-
}
|
|
422
|
-
return true;
|
|
423
|
-
}
|
|
424
|
-
var jsonGenerationBenchmark = {
|
|
425
|
-
name: "json-generation",
|
|
426
|
-
version: "2.1.0",
|
|
427
|
-
description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
|
|
428
|
-
async run(model) {
|
|
429
|
-
const logs = [];
|
|
430
|
-
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
431
|
-
let schemaValidCount = 0;
|
|
432
|
-
let valueMatchCount = 0;
|
|
433
|
-
let correctCount = 0;
|
|
434
|
-
let tests = [];
|
|
435
|
-
const expectedMap = /* @__PURE__ */ new Map();
|
|
436
|
-
try {
|
|
437
|
-
const dataDir = resolveDataDir();
|
|
438
|
-
const testsJsonl = await fs2.readFile(
|
|
439
|
-
path2.join(dataDir, "json_generation_tests.jsonl"),
|
|
440
|
-
"utf-8"
|
|
441
|
-
);
|
|
442
|
-
const expectedJsonl = await fs2.readFile(
|
|
443
|
-
path2.join(dataDir, "json_generation_expected.jsonl"),
|
|
444
|
-
"utf-8"
|
|
445
|
-
);
|
|
446
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
447
|
-
const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
448
|
-
for (const r of expecteds) expectedMap.set(r.id, r);
|
|
449
|
-
} catch (e) {
|
|
450
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
451
|
-
return {
|
|
452
|
-
score: 0,
|
|
453
|
-
success: false,
|
|
454
|
-
metrics: {},
|
|
455
|
-
logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
|
|
456
|
-
error: e
|
|
457
|
-
};
|
|
458
|
-
}
|
|
459
|
-
for (const tc of tests) {
|
|
460
|
-
try {
|
|
461
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
462
|
-
const messages = [
|
|
463
|
-
{
|
|
464
|
-
role: "system",
|
|
465
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
466
|
-
},
|
|
467
|
-
{
|
|
468
|
-
role: "user",
|
|
469
|
-
content: [
|
|
470
|
-
"Generate a JSON object that reflects the following facts.",
|
|
471
|
-
"JSON Schema:",
|
|
472
|
-
schemaStr,
|
|
473
|
-
"Facts:",
|
|
474
|
-
tc.promptFacts,
|
|
475
|
-
"Output must be a single JSON only, with no additional text."
|
|
476
|
-
].join("\n\n")
|
|
477
|
-
}
|
|
478
|
-
];
|
|
479
|
-
const { text } = await generateText({ model, messages });
|
|
480
|
-
let parsed;
|
|
481
|
-
try {
|
|
482
|
-
parsed = extractFirstJsonBlock(text);
|
|
483
|
-
} catch {
|
|
484
|
-
}
|
|
485
|
-
if (parsed === void 0) {
|
|
486
|
-
logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
|
|
487
|
-
continue;
|
|
488
|
-
}
|
|
489
|
-
const validate = ajv.compile(tc.schema);
|
|
490
|
-
const valid = validate(parsed);
|
|
491
|
-
if (valid) schemaValidCount++;
|
|
492
|
-
else
|
|
493
|
-
logs.push(
|
|
494
|
-
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
495
|
-
);
|
|
496
|
-
const expectedRec = expectedMap.get(tc.id);
|
|
497
|
-
if (!expectedRec) {
|
|
498
|
-
logs.push(
|
|
499
|
-
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
500
|
-
);
|
|
501
|
-
}
|
|
502
|
-
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
503
|
-
if (valuesOk) valueMatchCount++;
|
|
504
|
-
if (valid && valuesOk) {
|
|
505
|
-
correctCount++;
|
|
506
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
507
|
-
} else {
|
|
508
|
-
logs.push(
|
|
509
|
-
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
510
|
-
parsed
|
|
511
|
-
)}`
|
|
512
|
-
);
|
|
513
|
-
}
|
|
514
|
-
} catch (e) {
|
|
515
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
516
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
517
|
-
}
|
|
518
|
-
}
|
|
519
|
-
const total = tests.length;
|
|
520
|
-
const score = correctCount / total;
|
|
521
|
-
return {
|
|
522
|
-
score,
|
|
523
|
-
success: score >= 0.8,
|
|
524
|
-
metrics: {
|
|
525
|
-
total_cases: total,
|
|
526
|
-
correct_count: correctCount,
|
|
527
|
-
schema_valid_count: schemaValidCount,
|
|
528
|
-
value_match_count: valueMatchCount,
|
|
529
|
-
accuracy: score
|
|
530
|
-
},
|
|
531
|
-
logs
|
|
532
|
-
};
|
|
533
|
-
}
|
|
534
|
-
};
|
|
535
|
-
var jsonGenerationSchemaOnlyBenchmark = {
|
|
536
|
-
name: "json-generation-schema-only",
|
|
537
|
-
version: "1.0.1",
|
|
538
|
-
description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
|
|
539
|
-
async run(model) {
|
|
540
|
-
const logs = [];
|
|
541
|
-
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
542
|
-
let tests = [];
|
|
543
|
-
try {
|
|
544
|
-
const dataDir = resolveDataDir();
|
|
545
|
-
const testsJsonl = await fs2.readFile(
|
|
546
|
-
path2.join(dataDir, "json_generation_tests.jsonl"),
|
|
547
|
-
"utf-8"
|
|
548
|
-
);
|
|
549
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
550
|
-
} catch (e) {
|
|
551
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
552
|
-
return {
|
|
553
|
-
score: 0,
|
|
554
|
-
success: false,
|
|
555
|
-
metrics: {},
|
|
556
|
-
logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
|
|
557
|
-
error: e
|
|
558
|
-
};
|
|
559
|
-
}
|
|
560
|
-
let schemaValidCount = 0;
|
|
561
|
-
for (const tc of tests) {
|
|
562
|
-
try {
|
|
563
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
564
|
-
const messages = [
|
|
565
|
-
{
|
|
566
|
-
role: "system",
|
|
567
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
568
|
-
},
|
|
569
|
-
{
|
|
570
|
-
role: "user",
|
|
571
|
-
content: [
|
|
572
|
-
"Generate a JSON object that reflects the following facts.",
|
|
573
|
-
"JSON Schema:",
|
|
574
|
-
schemaStr,
|
|
575
|
-
"Facts:",
|
|
576
|
-
tc.promptFacts,
|
|
577
|
-
"Output must be a single JSON only, with no additional text."
|
|
578
|
-
].join("\n\n")
|
|
579
|
-
}
|
|
580
|
-
];
|
|
581
|
-
const { text } = await generateText({ model, messages });
|
|
582
|
-
let parsed;
|
|
583
|
-
try {
|
|
584
|
-
parsed = extractFirstJsonBlock(text);
|
|
585
|
-
} catch {
|
|
586
|
-
}
|
|
587
|
-
if (parsed === void 0) {
|
|
588
|
-
logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
|
|
589
|
-
continue;
|
|
590
|
-
}
|
|
591
|
-
const validate = ajv.compile(tc.schema);
|
|
592
|
-
const valid = validate(parsed);
|
|
593
|
-
if (valid) {
|
|
594
|
-
schemaValidCount++;
|
|
595
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
596
|
-
} else {
|
|
597
|
-
logs.push(
|
|
598
|
-
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
599
|
-
);
|
|
600
|
-
}
|
|
601
|
-
} catch (e) {
|
|
602
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
603
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
const total = tests.length;
|
|
607
|
-
const score = total > 0 ? schemaValidCount / total : 0;
|
|
608
|
-
return {
|
|
609
|
-
score,
|
|
610
|
-
success: score >= 0.8,
|
|
611
|
-
metrics: {
|
|
612
|
-
total_cases: total,
|
|
613
|
-
schema_valid_count: schemaValidCount,
|
|
614
|
-
accuracy: score
|
|
615
|
-
},
|
|
616
|
-
logs
|
|
617
|
-
};
|
|
618
|
-
}
|
|
619
|
-
};
|
|
620
|
-
|
|
621
|
-
// src/benchmarks/bfcl.ts
|
|
622
|
-
import { generateText as generateText2, jsonSchema, tool } from "ai";
|
|
623
|
-
import { promises as fs3 } from "fs";
|
|
624
|
-
import path3 from "path";
|
|
625
|
-
|
|
626
368
|
// src/benchmarks/bfcl/ast-checker.ts
|
|
627
369
|
function standardizeString(input) {
|
|
628
370
|
if (typeof input !== "string") return input;
|
|
@@ -632,7 +374,7 @@ function standardizeString(input) {
|
|
|
632
374
|
function checkStringValue(param, modelValue, possibleAnswers) {
|
|
633
375
|
const standardizedModelValue = standardizeString(modelValue);
|
|
634
376
|
const standardizedPossibleAnswers = possibleAnswers.map(
|
|
635
|
-
(ans) => standardizeString(ans)
|
|
377
|
+
(ans) => standardizeString(String(ans))
|
|
636
378
|
);
|
|
637
379
|
if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
|
|
638
380
|
return {
|
|
@@ -659,8 +401,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
659
401
|
};
|
|
660
402
|
}
|
|
661
403
|
const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
|
|
404
|
+
const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
|
|
662
405
|
for (const param of requiredParams) {
|
|
663
|
-
if (!(param in
|
|
406
|
+
if (!(param in argsObj)) {
|
|
664
407
|
return {
|
|
665
408
|
valid: false,
|
|
666
409
|
error: `Missing required parameter: '${param}'.`,
|
|
@@ -668,87 +411,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
668
411
|
};
|
|
669
412
|
}
|
|
670
413
|
}
|
|
671
|
-
|
|
672
|
-
const
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
valid: false,
|
|
676
|
-
error: `Unexpected parameter: '${paramName}'.`,
|
|
677
|
-
error_type: "simple_function_checker:unexpected_param"
|
|
678
|
-
};
|
|
679
|
-
}
|
|
680
|
-
const possibleValues = possibleAnswerParams[paramName];
|
|
681
|
-
if (typeof modelValue === "string") {
|
|
682
|
-
const result = checkStringValue(paramName, modelValue, possibleValues);
|
|
683
|
-
if (!result.valid) return result;
|
|
684
|
-
} else if (Array.isArray(modelValue)) {
|
|
685
|
-
const modelValueStr = JSON.stringify(
|
|
686
|
-
modelValue.map((v) => standardizeString(v.toString())).sort()
|
|
687
|
-
);
|
|
688
|
-
const hasMatch = possibleValues.some(
|
|
689
|
-
(p) => JSON.stringify(
|
|
690
|
-
p.map((v) => standardizeString(v.toString())).sort()
|
|
691
|
-
) === modelValueStr
|
|
692
|
-
);
|
|
693
|
-
if (!hasMatch) {
|
|
414
|
+
if (modelArgs && typeof modelArgs === "object") {
|
|
415
|
+
for (const paramName of Object.keys(argsObj)) {
|
|
416
|
+
const modelValue = argsObj[paramName];
|
|
417
|
+
if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
|
|
694
418
|
return {
|
|
695
419
|
valid: false,
|
|
696
|
-
error: `
|
|
697
|
-
|
|
698
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
699
|
-
error_type: "value_error:list"
|
|
420
|
+
error: `Unexpected parameter: '${paramName}'.`,
|
|
421
|
+
error_type: "simple_function_checker:unexpected_param"
|
|
700
422
|
};
|
|
701
423
|
}
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
424
|
+
const possibleValues = possibleAnswerParams[paramName];
|
|
425
|
+
if (typeof modelValue === "string") {
|
|
426
|
+
const result = checkStringValue(
|
|
427
|
+
paramName,
|
|
428
|
+
modelValue,
|
|
429
|
+
possibleValues ?? []
|
|
430
|
+
);
|
|
431
|
+
if (!result.valid) return result;
|
|
432
|
+
} else if (Array.isArray(modelValue)) {
|
|
433
|
+
const modelValueStr = JSON.stringify(
|
|
434
|
+
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
435
|
+
);
|
|
436
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
437
|
+
if (!Array.isArray(p)) return false;
|
|
438
|
+
return JSON.stringify(
|
|
439
|
+
p.map((v) => standardizeString(String(v))).sort()
|
|
440
|
+
) === modelValueStr;
|
|
441
|
+
}) : false;
|
|
442
|
+
if (!hasMatch) {
|
|
443
|
+
return {
|
|
444
|
+
valid: false,
|
|
445
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
446
|
+
modelValue
|
|
447
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
448
|
+
error_type: "value_error:list"
|
|
449
|
+
};
|
|
450
|
+
}
|
|
451
|
+
} else {
|
|
452
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
|
|
453
|
+
if (modelValue === possibleValue) return true;
|
|
454
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
455
|
+
try {
|
|
456
|
+
const normalizeObject = (obj) => {
|
|
457
|
+
if (Array.isArray(obj)) {
|
|
458
|
+
return obj.map(normalizeObject);
|
|
459
|
+
}
|
|
460
|
+
if (obj && typeof obj === "object") {
|
|
461
|
+
const normalized = {};
|
|
462
|
+
for (const [key, value] of Object.entries(
|
|
463
|
+
obj
|
|
464
|
+
)) {
|
|
465
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
466
|
+
normalized[key] = value[0];
|
|
467
|
+
} else {
|
|
468
|
+
normalized[key] = normalizeObject(value);
|
|
469
|
+
}
|
|
718
470
|
}
|
|
471
|
+
return normalized;
|
|
719
472
|
}
|
|
720
|
-
return
|
|
721
|
-
}
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
return false;
|
|
473
|
+
return obj;
|
|
474
|
+
};
|
|
475
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
476
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
477
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
478
|
+
} catch {
|
|
479
|
+
return false;
|
|
480
|
+
}
|
|
729
481
|
}
|
|
482
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
483
|
+
return modelValue.toString() === possibleValue;
|
|
484
|
+
}
|
|
485
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
486
|
+
return modelValue === possibleValue.toString();
|
|
487
|
+
}
|
|
488
|
+
return false;
|
|
489
|
+
}) : false;
|
|
490
|
+
if (!hasMatch) {
|
|
491
|
+
return {
|
|
492
|
+
valid: false,
|
|
493
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
494
|
+
modelValue
|
|
495
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
496
|
+
error_type: "value_error:other"
|
|
497
|
+
};
|
|
730
498
|
}
|
|
731
|
-
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
732
|
-
return modelValue.toString() === possibleValue;
|
|
733
|
-
}
|
|
734
|
-
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
735
|
-
return modelValue === possibleValue.toString();
|
|
736
|
-
}
|
|
737
|
-
return false;
|
|
738
|
-
});
|
|
739
|
-
if (!hasMatch) {
|
|
740
|
-
return {
|
|
741
|
-
valid: false,
|
|
742
|
-
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
743
|
-
modelValue
|
|
744
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
745
|
-
error_type: "value_error:other"
|
|
746
|
-
};
|
|
747
499
|
}
|
|
748
500
|
}
|
|
749
501
|
}
|
|
750
502
|
for (const paramName in possibleAnswerParams) {
|
|
751
|
-
|
|
503
|
+
const val = possibleAnswerParams[paramName];
|
|
504
|
+
const isOptional = Array.isArray(val) && val.includes("");
|
|
505
|
+
if (!(paramName in argsObj) && !isOptional) {
|
|
752
506
|
return {
|
|
753
507
|
valid: false,
|
|
754
508
|
error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
|
|
@@ -834,10 +588,10 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
834
588
|
const category = testCase.id.split("_")[0];
|
|
835
589
|
try {
|
|
836
590
|
if (category === "simple") {
|
|
837
|
-
if (!modelOutput || modelOutput.length !== 1) {
|
|
591
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
838
592
|
return {
|
|
839
593
|
valid: false,
|
|
840
|
-
error: `Expected 1 function call, but got ${modelOutput
|
|
594
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
841
595
|
error_type: "simple:wrong_count"
|
|
842
596
|
};
|
|
843
597
|
}
|
|
@@ -886,12 +640,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
886
640
|
try {
|
|
887
641
|
const dataPath = resolveDataDir();
|
|
888
642
|
logs.push(`[INFO] Using data dir: ${dataPath}`);
|
|
889
|
-
const testCasesJson = await
|
|
890
|
-
|
|
643
|
+
const testCasesJson = await fs2.readFile(
|
|
644
|
+
path2.join(dataPath, testDataFile),
|
|
891
645
|
"utf-8"
|
|
892
646
|
);
|
|
893
|
-
const possibleAnswersJson = await
|
|
894
|
-
|
|
647
|
+
const possibleAnswersJson = await fs2.readFile(
|
|
648
|
+
path2.join(dataPath, answerDataFile),
|
|
895
649
|
"utf-8"
|
|
896
650
|
);
|
|
897
651
|
testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -908,19 +662,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
908
662
|
);
|
|
909
663
|
}
|
|
910
664
|
const fixSchema = (schema) => {
|
|
911
|
-
if (!schema || typeof schema !== "object")
|
|
665
|
+
if (!schema || typeof schema !== "object")
|
|
666
|
+
return { type: "object", properties: {} };
|
|
912
667
|
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
|
|
913
|
-
if (copy
|
|
914
|
-
if (copy.type
|
|
915
|
-
|
|
916
|
-
copy.type
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
668
|
+
if (!Array.isArray(copy)) {
|
|
669
|
+
if (copy.type) {
|
|
670
|
+
if (copy.type === "dict") copy.type = "object";
|
|
671
|
+
if (copy.type === "integer" || copy.type === "float")
|
|
672
|
+
copy.type = "number";
|
|
673
|
+
}
|
|
674
|
+
if (copy.properties && typeof copy.properties === "object") {
|
|
675
|
+
for (const k of Object.keys(copy.properties)) {
|
|
676
|
+
copy.properties[k] = fixSchema(
|
|
677
|
+
copy.properties[k]
|
|
678
|
+
);
|
|
679
|
+
}
|
|
921
680
|
}
|
|
681
|
+
if (copy.items) copy.items = fixSchema(copy.items);
|
|
682
|
+
return copy;
|
|
922
683
|
}
|
|
923
|
-
if (copy.items) copy.items = fixSchema(copy.items);
|
|
924
684
|
return copy;
|
|
925
685
|
};
|
|
926
686
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
@@ -940,7 +700,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
940
700
|
};
|
|
941
701
|
const transformedTools = tools.map((t) => {
|
|
942
702
|
const fixed = fixSchema(t.parameters);
|
|
943
|
-
const
|
|
703
|
+
const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
|
|
704
|
+
const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
|
|
944
705
|
const sanitized = sanitizeName(t.name);
|
|
945
706
|
nameMap.set(sanitized, t.name);
|
|
946
707
|
return {
|
|
@@ -970,7 +731,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
970
731
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
971
732
|
);
|
|
972
733
|
}
|
|
973
|
-
const { toolCalls, text, finishReason } = await
|
|
734
|
+
const { toolCalls, text, finishReason } = await generateText({
|
|
974
735
|
model,
|
|
975
736
|
messages: flatMessages,
|
|
976
737
|
tools: toolsMap,
|
|
@@ -979,7 +740,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
979
740
|
providerOptions: {
|
|
980
741
|
toolCallMiddleware: {
|
|
981
742
|
originalToolSchemas: Object.fromEntries(
|
|
982
|
-
transformedTools.map((t) => [
|
|
743
|
+
transformedTools.map((t) => [
|
|
744
|
+
t.name,
|
|
745
|
+
t.inputSchema
|
|
746
|
+
])
|
|
983
747
|
)
|
|
984
748
|
}
|
|
985
749
|
}
|
|
@@ -1032,10 +796,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1032
796
|
const summarizeArgs = (args) => {
|
|
1033
797
|
if (args == null) return args;
|
|
1034
798
|
if (typeof args !== "object") return args;
|
|
1035
|
-
return Object.keys(args).sort().reduce(
|
|
1036
|
-
acc
|
|
1037
|
-
|
|
1038
|
-
|
|
799
|
+
return Object.keys(args).sort().reduce(
|
|
800
|
+
(acc, k) => {
|
|
801
|
+
acc[k] = args[k];
|
|
802
|
+
return acc;
|
|
803
|
+
},
|
|
804
|
+
{}
|
|
805
|
+
);
|
|
1039
806
|
};
|
|
1040
807
|
const expected = {};
|
|
1041
808
|
const actual = {};
|
|
@@ -1056,19 +823,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1056
823
|
diff.push(`- ${expectedFuncName}`);
|
|
1057
824
|
diff.push(`+ ${receivedName}`);
|
|
1058
825
|
}
|
|
1059
|
-
if (expectedParams && receivedArgs) {
|
|
826
|
+
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1060
827
|
const required = funcDesc?.parameters?.required ?? [];
|
|
1061
828
|
for (const req of required) {
|
|
1062
829
|
if (!(req in receivedArgs)) {
|
|
1063
830
|
diff.push(`- missing required param: ${req}`);
|
|
1064
831
|
}
|
|
1065
832
|
}
|
|
1066
|
-
for (const k of Object.keys(
|
|
833
|
+
for (const k of Object.keys(
|
|
834
|
+
receivedArgs
|
|
835
|
+
)) {
|
|
1067
836
|
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1068
837
|
diff.push(`+ unexpected param: ${k}`);
|
|
1069
838
|
}
|
|
1070
839
|
}
|
|
1071
|
-
for (const k of Object.keys(
|
|
840
|
+
for (const k of Object.keys(
|
|
841
|
+
receivedArgs
|
|
842
|
+
)) {
|
|
1072
843
|
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1073
844
|
const allowed = expectedParams[k];
|
|
1074
845
|
const got = receivedArgs[k];
|
|
@@ -1141,13 +912,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1141
912
|
);
|
|
1142
913
|
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1143
914
|
diff.push(`@@ function ${fname}`);
|
|
1144
|
-
if (expectedParamsAllowed && receivedArgs) {
|
|
915
|
+
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1145
916
|
for (const req of requiredParams) {
|
|
1146
917
|
if (!(req in receivedArgs)) {
|
|
1147
918
|
diff.push(`- missing required param: ${req}`);
|
|
1148
919
|
}
|
|
1149
920
|
}
|
|
1150
|
-
for (const k of Object.keys(
|
|
921
|
+
for (const k of Object.keys(
|
|
922
|
+
receivedArgs
|
|
923
|
+
)) {
|
|
1151
924
|
if (!Object.prototype.hasOwnProperty.call(
|
|
1152
925
|
expectedParamsAllowed,
|
|
1153
926
|
k
|
|
@@ -1155,7 +928,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1155
928
|
diff.push(`+ unexpected param: ${k}`);
|
|
1156
929
|
}
|
|
1157
930
|
}
|
|
1158
|
-
for (const k of Object.keys(
|
|
931
|
+
for (const k of Object.keys(
|
|
932
|
+
receivedArgs
|
|
933
|
+
)) {
|
|
1159
934
|
if (Object.prototype.hasOwnProperty.call(
|
|
1160
935
|
expectedParamsAllowed,
|
|
1161
936
|
k
|
|
@@ -1293,6 +1068,262 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
|
1293
1068
|
"BFCL_v3_parallel_multiple.json",
|
|
1294
1069
|
"BFCL_v3_parallel_multiple_possible_answer.json"
|
|
1295
1070
|
);
|
|
1071
|
+
|
|
1072
|
+
// src/benchmarks/json-generation.ts
|
|
1073
|
+
import { generateText as generateText2 } from "ai";
|
|
1074
|
+
import Ajv from "ajv";
|
|
1075
|
+
import { promises as fs3 } from "fs";
|
|
1076
|
+
import path3 from "path";
|
|
1077
|
+
function extractFirstJsonBlock(text) {
|
|
1078
|
+
try {
|
|
1079
|
+
return JSON.parse(text);
|
|
1080
|
+
} catch {
|
|
1081
|
+
}
|
|
1082
|
+
const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
|
|
1083
|
+
if (fenceMatch) {
|
|
1084
|
+
const inner = fenceMatch[1].trim();
|
|
1085
|
+
try {
|
|
1086
|
+
return JSON.parse(inner);
|
|
1087
|
+
} catch {
|
|
1088
|
+
}
|
|
1089
|
+
}
|
|
1090
|
+
const startIdxObj = text.indexOf("{");
|
|
1091
|
+
const startIdxArr = text.indexOf("[");
|
|
1092
|
+
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
1093
|
+
if (start === void 0) return void 0;
|
|
1094
|
+
const open = text[start] === "{" ? "{" : "[";
|
|
1095
|
+
const close = open === "{" ? "}" : "]";
|
|
1096
|
+
let depth = 0;
|
|
1097
|
+
for (let i = start; i < text.length; i++) {
|
|
1098
|
+
const ch = text[i];
|
|
1099
|
+
if (ch === open) depth++;
|
|
1100
|
+
else if (ch === close) depth--;
|
|
1101
|
+
if (depth === 0) {
|
|
1102
|
+
const candidate = text.slice(start, i + 1);
|
|
1103
|
+
try {
|
|
1104
|
+
return JSON.parse(candidate);
|
|
1105
|
+
} catch {
|
|
1106
|
+
}
|
|
1107
|
+
break;
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
1110
|
+
return void 0;
|
|
1111
|
+
}
|
|
1112
|
+
function subsetMatch(expected, actual) {
|
|
1113
|
+
if (expected === null || typeof expected !== "object") {
|
|
1114
|
+
return expected === actual;
|
|
1115
|
+
}
|
|
1116
|
+
if (Array.isArray(expected)) {
|
|
1117
|
+
if (!Array.isArray(actual)) return false;
|
|
1118
|
+
for (let i = 0; i < expected.length; i++) {
|
|
1119
|
+
if (!subsetMatch(expected[i], actual[i])) return false;
|
|
1120
|
+
}
|
|
1121
|
+
return true;
|
|
1122
|
+
}
|
|
1123
|
+
if (actual === null || typeof actual !== "object") return false;
|
|
1124
|
+
const eObj = expected;
|
|
1125
|
+
const aObj = actual;
|
|
1126
|
+
for (const key of Object.keys(eObj)) {
|
|
1127
|
+
if (!subsetMatch(eObj[key], aObj[key])) return false;
|
|
1128
|
+
}
|
|
1129
|
+
return true;
|
|
1130
|
+
}
|
|
1131
|
+
/**
 * Benchmark that asks the model to produce a JSON document for each dataset
 * case and checks it two ways: validity against the case's JSON Schema (Ajv)
 * and agreement with the expected values (subsetMatch). A case counts as
 * correct only when both checks pass.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Runs every case from json_generation_tests.jsonl against `model`.
   *
   * @param model - Model handle, forwarded as-is to generateText2.
   * @returns {Promise<object>} Result with `score` (correct / total, or 0 when
   *   the dataset has no cases), `success` (score >= 0.8), `metrics` and
   *   `logs`. If the datasets cannot be loaded, the result has score 0, empty
   *   metrics, a single [FATAL] log line and the caught value under `error`.
   */
  async run(model) {
    const logs = [];
    const ajv = new Ajv({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();
    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await fs3.readFile(
        path3.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await fs3.readFile(
        path3.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // JSONL: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        const { text } = await generateText2({ model, messages });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Deliberately ignored: `parsed` stays undefined and the case is
          // reported as a parse failure just below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) {
          schemaValidCount++;
        } else {
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        }
        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // Without an expected record the case can never count as a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;
        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        // One failing case (model error, bad schema, ...) must not abort the run.
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // Guard against 0 / 0: an empty dataset must yield a score of 0, not NaN,
    // matching the schema-only benchmark.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1242
|
+
/**
 * Structure-only variant of the JSON generation benchmark: each model output
 * is parsed and validated against the case's JSON Schema with Ajv; expected
 * values are not compared.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Runs every case from json_generation_tests.jsonl against `model`.
   *
   * @param model - Model handle, forwarded as-is to generateText2.
   * @returns {Promise<object>} Result with `score` (schema-valid / total, 0 for
   *   an empty dataset), `success` (score >= 0.8), `metrics` and `logs`. If the
   *   dataset cannot be loaded, the result has score 0, empty metrics, a single
   *   [FATAL] log line and the caught value under `error`.
   */
  async run(model) {
    const logs = [];
    const ajv = new Ajv({ allErrors: true, strict: false });

    // Renders a caught value as a message for the log lines.
    const describe = (err) => {
      if (err instanceof Error) return err.message;
      return String(err);
    };

    let tests = [];
    try {
      const datasetPath = path3.join(resolveDataDir(), "json_generation_tests.jsonl");
      const raw = await fs3.readFile(datasetPath, "utf-8");
      // JSONL: keep non-blank lines and parse each as its own document.
      const rows = [];
      for (const line of raw.split(/\r?\n/)) {
        if (line.trim().length > 0) rows.push(JSON.parse(line));
      }
      tests = rows;
    } catch (e) {
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${describe(e)}`],
        error: e
      };
    }

    let schemaValidCount = 0;
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const systemMessage = {
          role: "system",
          content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
        };
        const userSections = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          schemaStr,
          "Facts:",
          tc.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ];
        const userMessage = { role: "user", content: userSections.join("\n\n") };
        const messages = [systemMessage, userMessage];
        const { text } = await generateText2({ model, messages });

        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Extraction errors are treated the same as "nothing found".
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(tc.schema);
        if (!validate(parsed)) {
          const details = (validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown";
          logs.push(`[FAIL] ${tc.id}: Schema validation errors: ${details}`);
        } else {
          schemaValidCount++;
          logs.push(`[PASS] ${tc.id}`);
        }
      } catch (e) {
        // A failure in one case is logged and the run moves on to the next.
        logs.push(`[ERROR] ${tc.id}: ${describe(e)}`);
      }
    }

    const total = tests.length;
    let score = 0;
    if (total > 0) {
      score = schemaValidCount / total;
    }
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        schema_valid_count: schemaValidCount,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1296
1327
|
export {
|
|
1297
1328
|
bfclMultipleBenchmark,
|
|
1298
1329
|
bfclParallelBenchmark,
|