@ai-sdk-tool/eval 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -5
- package/dist/index.cjs +422 -375
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -4
- package/dist/index.d.ts +8 -4
- package/dist/index.js +421 -374
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
package/dist/index.cjs
CHANGED
|
@@ -113,7 +113,7 @@ function uniqueLines(lines) {
|
|
|
113
113
|
function suggestFixFromDiff(parsed) {
|
|
114
114
|
const suggestions = [];
|
|
115
115
|
const { error_type, expected, actual, diff } = parsed ?? {};
|
|
116
|
-
if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
|
|
116
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
|
|
117
117
|
const expectedName = expected?.function;
|
|
118
118
|
const actualName = actual?.function;
|
|
119
119
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
@@ -127,23 +127,23 @@ function suggestFixFromDiff(parsed) {
|
|
|
127
127
|
);
|
|
128
128
|
}
|
|
129
129
|
}
|
|
130
|
-
if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
|
|
131
|
-
const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
|
|
130
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
|
|
131
|
+
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
132
132
|
if (missing.length) {
|
|
133
133
|
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
134
134
|
}
|
|
135
135
|
}
|
|
136
|
-
if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
|
|
137
|
-
const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
|
|
136
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
|
|
137
|
+
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
138
138
|
if (extras.length) {
|
|
139
139
|
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
140
140
|
}
|
|
141
141
|
}
|
|
142
|
-
if (diff && diff.some((d) => d.startsWith("@@ param "))) {
|
|
143
|
-
const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
|
|
142
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
143
|
+
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
144
144
|
for (const param of targets) {
|
|
145
145
|
const allowedLine = diff.find(
|
|
146
|
-
(d) => d.startsWith("- expected one of:")
|
|
146
|
+
(d) => String(d).startsWith("- expected one of:")
|
|
147
147
|
);
|
|
148
148
|
if (allowedLine) {
|
|
149
149
|
const allowed = allowedLine.replace("- expected one of: ", "");
|
|
@@ -281,13 +281,13 @@ var reporters = {
|
|
|
281
281
|
};
|
|
282
282
|
|
|
283
283
|
// src/evaluate.ts
|
|
284
|
-
async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
284
|
+
async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
285
285
|
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
286
286
|
try {
|
|
287
287
|
console.log(
|
|
288
288
|
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
289
289
|
);
|
|
290
|
-
const result = await benchmark.run(model);
|
|
290
|
+
const result = await benchmark.run(model, config);
|
|
291
291
|
console.log(
|
|
292
292
|
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
293
293
|
);
|
|
@@ -316,7 +316,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
|
316
316
|
}
|
|
317
317
|
}
|
|
318
318
|
async function evaluate(options) {
|
|
319
|
-
const { models, benchmarks, reporter = "console" } = options;
|
|
319
|
+
const { models, benchmarks, reporter = "console", temperature } = options;
|
|
320
320
|
const modelEntries = [];
|
|
321
321
|
if (Array.isArray(models)) {
|
|
322
322
|
for (const m of models) modelEntries.push([void 0, m]);
|
|
@@ -335,7 +335,8 @@ async function evaluate(options) {
|
|
|
335
335
|
const evaluationResult = await runSingleBenchmark(
|
|
336
336
|
model,
|
|
337
337
|
benchmark,
|
|
338
|
-
modelKey
|
|
338
|
+
modelKey,
|
|
339
|
+
temperature !== void 0 ? { temperature } : void 0
|
|
339
340
|
);
|
|
340
341
|
allResults.push(evaluationResult);
|
|
341
342
|
}
|
|
@@ -350,17 +351,16 @@ async function evaluate(options) {
|
|
|
350
351
|
return allResults;
|
|
351
352
|
}
|
|
352
353
|
|
|
353
|
-
// src/benchmarks/
|
|
354
|
+
// src/benchmarks/bfcl.ts
|
|
354
355
|
var import_ai = require("ai");
|
|
355
|
-
var import_ajv = __toESM(require("ajv"), 1);
|
|
356
356
|
var import_fs2 = require("fs");
|
|
357
357
|
var import_path2 = __toESM(require("path"), 1);
|
|
358
358
|
|
|
359
359
|
// src/utils/paths.ts
|
|
360
360
|
var import_fs = __toESM(require("fs"), 1);
|
|
361
|
+
var import_module = require("module");
|
|
361
362
|
var import_path = __toESM(require("path"), 1);
|
|
362
363
|
var import_url = require("url");
|
|
363
|
-
var import_module = require("module");
|
|
364
364
|
function resolveDataDir(fromModuleUrl) {
|
|
365
365
|
const moduleUrl = fromModuleUrl;
|
|
366
366
|
const override = process.env.BFCL_DATA_DIR;
|
|
@@ -408,263 +408,6 @@ function resolveDataDir(fromModuleUrl) {
|
|
|
408
408
|
return import_path.default.join(pkgRoot, "data");
|
|
409
409
|
}
|
|
410
410
|
|
|
411
|
-
// src/benchmarks/json-generation.ts
|
|
412
|
-
function extractFirstJsonBlock(text) {
|
|
413
|
-
try {
|
|
414
|
-
return JSON.parse(text);
|
|
415
|
-
} catch {
|
|
416
|
-
}
|
|
417
|
-
const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
|
|
418
|
-
if (fenceMatch) {
|
|
419
|
-
const inner = fenceMatch[1].trim();
|
|
420
|
-
try {
|
|
421
|
-
return JSON.parse(inner);
|
|
422
|
-
} catch {
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
const startIdxObj = text.indexOf("{");
|
|
426
|
-
const startIdxArr = text.indexOf("[");
|
|
427
|
-
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
428
|
-
if (start === void 0) return void 0;
|
|
429
|
-
const open = text[start] === "{" ? "{" : "[";
|
|
430
|
-
const close = open === "{" ? "}" : "]";
|
|
431
|
-
let depth = 0;
|
|
432
|
-
for (let i = start; i < text.length; i++) {
|
|
433
|
-
const ch = text[i];
|
|
434
|
-
if (ch === open) depth++;
|
|
435
|
-
else if (ch === close) depth--;
|
|
436
|
-
if (depth === 0) {
|
|
437
|
-
const candidate = text.slice(start, i + 1);
|
|
438
|
-
try {
|
|
439
|
-
return JSON.parse(candidate);
|
|
440
|
-
} catch {
|
|
441
|
-
}
|
|
442
|
-
break;
|
|
443
|
-
}
|
|
444
|
-
}
|
|
445
|
-
return void 0;
|
|
446
|
-
}
|
|
447
|
-
function subsetMatch(expected, actual) {
|
|
448
|
-
if (expected === null || typeof expected !== "object") {
|
|
449
|
-
return expected === actual;
|
|
450
|
-
}
|
|
451
|
-
if (Array.isArray(expected)) {
|
|
452
|
-
if (!Array.isArray(actual)) return false;
|
|
453
|
-
for (let i = 0; i < expected.length; i++) {
|
|
454
|
-
if (!subsetMatch(expected[i], actual[i])) return false;
|
|
455
|
-
}
|
|
456
|
-
return true;
|
|
457
|
-
}
|
|
458
|
-
if (actual === null || typeof actual !== "object") return false;
|
|
459
|
-
const eObj = expected;
|
|
460
|
-
const aObj = actual;
|
|
461
|
-
for (const key of Object.keys(eObj)) {
|
|
462
|
-
if (!subsetMatch(eObj[key], aObj[key])) return false;
|
|
463
|
-
}
|
|
464
|
-
return true;
|
|
465
|
-
}
|
|
466
|
-
var jsonGenerationBenchmark = {
|
|
467
|
-
name: "json-generation",
|
|
468
|
-
version: "2.1.0",
|
|
469
|
-
description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
|
|
470
|
-
async run(model) {
|
|
471
|
-
const logs = [];
|
|
472
|
-
const ajv = new import_ajv.default({ allErrors: true, strict: false });
|
|
473
|
-
let schemaValidCount = 0;
|
|
474
|
-
let valueMatchCount = 0;
|
|
475
|
-
let correctCount = 0;
|
|
476
|
-
let tests = [];
|
|
477
|
-
const expectedMap = /* @__PURE__ */ new Map();
|
|
478
|
-
try {
|
|
479
|
-
const dataDir = resolveDataDir();
|
|
480
|
-
const testsJsonl = await import_fs2.promises.readFile(
|
|
481
|
-
import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
482
|
-
"utf-8"
|
|
483
|
-
);
|
|
484
|
-
const expectedJsonl = await import_fs2.promises.readFile(
|
|
485
|
-
import_path2.default.join(dataDir, "json_generation_expected.jsonl"),
|
|
486
|
-
"utf-8"
|
|
487
|
-
);
|
|
488
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
489
|
-
const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
490
|
-
for (const r of expecteds) expectedMap.set(r.id, r);
|
|
491
|
-
} catch (e) {
|
|
492
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
493
|
-
return {
|
|
494
|
-
score: 0,
|
|
495
|
-
success: false,
|
|
496
|
-
metrics: {},
|
|
497
|
-
logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
|
|
498
|
-
error: e
|
|
499
|
-
};
|
|
500
|
-
}
|
|
501
|
-
for (const tc of tests) {
|
|
502
|
-
try {
|
|
503
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
504
|
-
const messages = [
|
|
505
|
-
{
|
|
506
|
-
role: "system",
|
|
507
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
508
|
-
},
|
|
509
|
-
{
|
|
510
|
-
role: "user",
|
|
511
|
-
content: [
|
|
512
|
-
"Generate a JSON object that reflects the following facts.",
|
|
513
|
-
"JSON Schema:",
|
|
514
|
-
schemaStr,
|
|
515
|
-
"Facts:",
|
|
516
|
-
tc.promptFacts,
|
|
517
|
-
"Output must be a single JSON only, with no additional text."
|
|
518
|
-
].join("\n\n")
|
|
519
|
-
}
|
|
520
|
-
];
|
|
521
|
-
const { text } = await (0, import_ai.generateText)({ model, messages });
|
|
522
|
-
let parsed;
|
|
523
|
-
try {
|
|
524
|
-
parsed = extractFirstJsonBlock(text);
|
|
525
|
-
} catch {
|
|
526
|
-
}
|
|
527
|
-
if (parsed === void 0) {
|
|
528
|
-
logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
|
|
529
|
-
continue;
|
|
530
|
-
}
|
|
531
|
-
const validate = ajv.compile(tc.schema);
|
|
532
|
-
const valid = validate(parsed);
|
|
533
|
-
if (valid) schemaValidCount++;
|
|
534
|
-
else
|
|
535
|
-
logs.push(
|
|
536
|
-
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
537
|
-
);
|
|
538
|
-
const expectedRec = expectedMap.get(tc.id);
|
|
539
|
-
if (!expectedRec) {
|
|
540
|
-
logs.push(
|
|
541
|
-
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
542
|
-
);
|
|
543
|
-
}
|
|
544
|
-
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
545
|
-
if (valuesOk) valueMatchCount++;
|
|
546
|
-
if (valid && valuesOk) {
|
|
547
|
-
correctCount++;
|
|
548
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
549
|
-
} else {
|
|
550
|
-
logs.push(
|
|
551
|
-
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
552
|
-
parsed
|
|
553
|
-
)}`
|
|
554
|
-
);
|
|
555
|
-
}
|
|
556
|
-
} catch (e) {
|
|
557
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
558
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
559
|
-
}
|
|
560
|
-
}
|
|
561
|
-
const total = tests.length;
|
|
562
|
-
const score = correctCount / total;
|
|
563
|
-
return {
|
|
564
|
-
score,
|
|
565
|
-
success: score >= 0.8,
|
|
566
|
-
metrics: {
|
|
567
|
-
total_cases: total,
|
|
568
|
-
correct_count: correctCount,
|
|
569
|
-
schema_valid_count: schemaValidCount,
|
|
570
|
-
value_match_count: valueMatchCount,
|
|
571
|
-
accuracy: score
|
|
572
|
-
},
|
|
573
|
-
logs
|
|
574
|
-
};
|
|
575
|
-
}
|
|
576
|
-
};
|
|
577
|
-
var jsonGenerationSchemaOnlyBenchmark = {
|
|
578
|
-
name: "json-generation-schema-only",
|
|
579
|
-
version: "1.0.1",
|
|
580
|
-
description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
|
|
581
|
-
async run(model) {
|
|
582
|
-
const logs = [];
|
|
583
|
-
const ajv = new import_ajv.default({ allErrors: true, strict: false });
|
|
584
|
-
let tests = [];
|
|
585
|
-
try {
|
|
586
|
-
const dataDir = resolveDataDir();
|
|
587
|
-
const testsJsonl = await import_fs2.promises.readFile(
|
|
588
|
-
import_path2.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
589
|
-
"utf-8"
|
|
590
|
-
);
|
|
591
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
592
|
-
} catch (e) {
|
|
593
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
594
|
-
return {
|
|
595
|
-
score: 0,
|
|
596
|
-
success: false,
|
|
597
|
-
metrics: {},
|
|
598
|
-
logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
|
|
599
|
-
error: e
|
|
600
|
-
};
|
|
601
|
-
}
|
|
602
|
-
let schemaValidCount = 0;
|
|
603
|
-
for (const tc of tests) {
|
|
604
|
-
try {
|
|
605
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
606
|
-
const messages = [
|
|
607
|
-
{
|
|
608
|
-
role: "system",
|
|
609
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
610
|
-
},
|
|
611
|
-
{
|
|
612
|
-
role: "user",
|
|
613
|
-
content: [
|
|
614
|
-
"Generate a JSON object that reflects the following facts.",
|
|
615
|
-
"JSON Schema:",
|
|
616
|
-
schemaStr,
|
|
617
|
-
"Facts:",
|
|
618
|
-
tc.promptFacts,
|
|
619
|
-
"Output must be a single JSON only, with no additional text."
|
|
620
|
-
].join("\n\n")
|
|
621
|
-
}
|
|
622
|
-
];
|
|
623
|
-
const { text } = await (0, import_ai.generateText)({ model, messages });
|
|
624
|
-
let parsed;
|
|
625
|
-
try {
|
|
626
|
-
parsed = extractFirstJsonBlock(text);
|
|
627
|
-
} catch {
|
|
628
|
-
}
|
|
629
|
-
if (parsed === void 0) {
|
|
630
|
-
logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
|
|
631
|
-
continue;
|
|
632
|
-
}
|
|
633
|
-
const validate = ajv.compile(tc.schema);
|
|
634
|
-
const valid = validate(parsed);
|
|
635
|
-
if (valid) {
|
|
636
|
-
schemaValidCount++;
|
|
637
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
638
|
-
} else {
|
|
639
|
-
logs.push(
|
|
640
|
-
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
641
|
-
);
|
|
642
|
-
}
|
|
643
|
-
} catch (e) {
|
|
644
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
645
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
646
|
-
}
|
|
647
|
-
}
|
|
648
|
-
const total = tests.length;
|
|
649
|
-
const score = total > 0 ? schemaValidCount / total : 0;
|
|
650
|
-
return {
|
|
651
|
-
score,
|
|
652
|
-
success: score >= 0.8,
|
|
653
|
-
metrics: {
|
|
654
|
-
total_cases: total,
|
|
655
|
-
schema_valid_count: schemaValidCount,
|
|
656
|
-
accuracy: score
|
|
657
|
-
},
|
|
658
|
-
logs
|
|
659
|
-
};
|
|
660
|
-
}
|
|
661
|
-
};
|
|
662
|
-
|
|
663
|
-
// src/benchmarks/bfcl.ts
|
|
664
|
-
var import_ai2 = require("ai");
|
|
665
|
-
var import_fs3 = require("fs");
|
|
666
|
-
var import_path3 = __toESM(require("path"), 1);
|
|
667
|
-
|
|
668
411
|
// src/benchmarks/bfcl/ast-checker.ts
|
|
669
412
|
function standardizeString(input) {
|
|
670
413
|
if (typeof input !== "string") return input;
|
|
@@ -674,7 +417,7 @@ function standardizeString(input) {
|
|
|
674
417
|
function checkStringValue(param, modelValue, possibleAnswers) {
|
|
675
418
|
const standardizedModelValue = standardizeString(modelValue);
|
|
676
419
|
const standardizedPossibleAnswers = possibleAnswers.map(
|
|
677
|
-
(ans) => standardizeString(ans)
|
|
420
|
+
(ans) => standardizeString(String(ans))
|
|
678
421
|
);
|
|
679
422
|
if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
|
|
680
423
|
return {
|
|
@@ -701,8 +444,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
701
444
|
};
|
|
702
445
|
}
|
|
703
446
|
const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
|
|
447
|
+
const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
|
|
704
448
|
for (const param of requiredParams) {
|
|
705
|
-
if (!(param in
|
|
449
|
+
if (!(param in argsObj)) {
|
|
706
450
|
return {
|
|
707
451
|
valid: false,
|
|
708
452
|
error: `Missing required parameter: '${param}'.`,
|
|
@@ -710,87 +454,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
710
454
|
};
|
|
711
455
|
}
|
|
712
456
|
}
|
|
713
|
-
|
|
714
|
-
const
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
valid: false,
|
|
718
|
-
error: `Unexpected parameter: '${paramName}'.`,
|
|
719
|
-
error_type: "simple_function_checker:unexpected_param"
|
|
720
|
-
};
|
|
721
|
-
}
|
|
722
|
-
const possibleValues = possibleAnswerParams[paramName];
|
|
723
|
-
if (typeof modelValue === "string") {
|
|
724
|
-
const result = checkStringValue(paramName, modelValue, possibleValues);
|
|
725
|
-
if (!result.valid) return result;
|
|
726
|
-
} else if (Array.isArray(modelValue)) {
|
|
727
|
-
const modelValueStr = JSON.stringify(
|
|
728
|
-
modelValue.map((v) => standardizeString(v.toString())).sort()
|
|
729
|
-
);
|
|
730
|
-
const hasMatch = possibleValues.some(
|
|
731
|
-
(p) => JSON.stringify(
|
|
732
|
-
p.map((v) => standardizeString(v.toString())).sort()
|
|
733
|
-
) === modelValueStr
|
|
734
|
-
);
|
|
735
|
-
if (!hasMatch) {
|
|
457
|
+
if (modelArgs && typeof modelArgs === "object") {
|
|
458
|
+
for (const paramName of Object.keys(argsObj)) {
|
|
459
|
+
const modelValue = argsObj[paramName];
|
|
460
|
+
if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
|
|
736
461
|
return {
|
|
737
462
|
valid: false,
|
|
738
|
-
error: `
|
|
739
|
-
|
|
740
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
741
|
-
error_type: "value_error:list"
|
|
463
|
+
error: `Unexpected parameter: '${paramName}'.`,
|
|
464
|
+
error_type: "simple_function_checker:unexpected_param"
|
|
742
465
|
};
|
|
743
466
|
}
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
467
|
+
const possibleValues = possibleAnswerParams[paramName];
|
|
468
|
+
if (typeof modelValue === "string") {
|
|
469
|
+
const result = checkStringValue(
|
|
470
|
+
paramName,
|
|
471
|
+
modelValue,
|
|
472
|
+
possibleValues ?? []
|
|
473
|
+
);
|
|
474
|
+
if (!result.valid) return result;
|
|
475
|
+
} else if (Array.isArray(modelValue)) {
|
|
476
|
+
const modelValueStr = JSON.stringify(
|
|
477
|
+
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
478
|
+
);
|
|
479
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
480
|
+
if (!Array.isArray(p)) return false;
|
|
481
|
+
return JSON.stringify(
|
|
482
|
+
p.map((v) => standardizeString(String(v))).sort()
|
|
483
|
+
) === modelValueStr;
|
|
484
|
+
}) : false;
|
|
485
|
+
if (!hasMatch) {
|
|
486
|
+
return {
|
|
487
|
+
valid: false,
|
|
488
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
489
|
+
modelValue
|
|
490
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
491
|
+
error_type: "value_error:list"
|
|
492
|
+
};
|
|
493
|
+
}
|
|
494
|
+
} else {
|
|
495
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
|
|
496
|
+
if (modelValue === possibleValue) return true;
|
|
497
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
498
|
+
try {
|
|
499
|
+
const normalizeObject = (obj) => {
|
|
500
|
+
if (Array.isArray(obj)) {
|
|
501
|
+
return obj.map(normalizeObject);
|
|
502
|
+
}
|
|
503
|
+
if (obj && typeof obj === "object") {
|
|
504
|
+
const normalized = {};
|
|
505
|
+
for (const [key, value] of Object.entries(
|
|
506
|
+
obj
|
|
507
|
+
)) {
|
|
508
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
509
|
+
normalized[key] = value[0];
|
|
510
|
+
} else {
|
|
511
|
+
normalized[key] = normalizeObject(value);
|
|
512
|
+
}
|
|
760
513
|
}
|
|
514
|
+
return normalized;
|
|
761
515
|
}
|
|
762
|
-
return
|
|
763
|
-
}
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
return false;
|
|
516
|
+
return obj;
|
|
517
|
+
};
|
|
518
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
519
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
520
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
521
|
+
} catch {
|
|
522
|
+
return false;
|
|
523
|
+
}
|
|
771
524
|
}
|
|
525
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
526
|
+
return modelValue.toString() === possibleValue;
|
|
527
|
+
}
|
|
528
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
529
|
+
return modelValue === possibleValue.toString();
|
|
530
|
+
}
|
|
531
|
+
return false;
|
|
532
|
+
}) : false;
|
|
533
|
+
if (!hasMatch) {
|
|
534
|
+
return {
|
|
535
|
+
valid: false,
|
|
536
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
537
|
+
modelValue
|
|
538
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
539
|
+
error_type: "value_error:other"
|
|
540
|
+
};
|
|
772
541
|
}
|
|
773
|
-
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
774
|
-
return modelValue.toString() === possibleValue;
|
|
775
|
-
}
|
|
776
|
-
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
777
|
-
return modelValue === possibleValue.toString();
|
|
778
|
-
}
|
|
779
|
-
return false;
|
|
780
|
-
});
|
|
781
|
-
if (!hasMatch) {
|
|
782
|
-
return {
|
|
783
|
-
valid: false,
|
|
784
|
-
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
785
|
-
modelValue
|
|
786
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
787
|
-
error_type: "value_error:other"
|
|
788
|
-
};
|
|
789
542
|
}
|
|
790
543
|
}
|
|
791
544
|
}
|
|
792
545
|
for (const paramName in possibleAnswerParams) {
|
|
793
|
-
|
|
546
|
+
const val = possibleAnswerParams[paramName];
|
|
547
|
+
const isOptional = Array.isArray(val) && val.includes("");
|
|
548
|
+
if (!(paramName in argsObj) && !isOptional) {
|
|
794
549
|
return {
|
|
795
550
|
valid: false,
|
|
796
551
|
error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
|
|
@@ -876,10 +631,10 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
876
631
|
const category = testCase.id.split("_")[0];
|
|
877
632
|
try {
|
|
878
633
|
if (category === "simple") {
|
|
879
|
-
if (!modelOutput || modelOutput.length !== 1) {
|
|
634
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
880
635
|
return {
|
|
881
636
|
valid: false,
|
|
882
|
-
error: `Expected 1 function call, but got ${modelOutput
|
|
637
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
883
638
|
error_type: "simple:wrong_count"
|
|
884
639
|
};
|
|
885
640
|
}
|
|
@@ -921,19 +676,19 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
921
676
|
name,
|
|
922
677
|
version: "1.0.0",
|
|
923
678
|
description,
|
|
924
|
-
async run(model) {
|
|
679
|
+
async run(model, config) {
|
|
925
680
|
const logs = [];
|
|
926
681
|
let correctCount = 0;
|
|
927
682
|
let testCases = [];
|
|
928
683
|
try {
|
|
929
684
|
const dataPath = resolveDataDir();
|
|
930
685
|
logs.push(`[INFO] Using data dir: ${dataPath}`);
|
|
931
|
-
const testCasesJson = await
|
|
932
|
-
|
|
686
|
+
const testCasesJson = await import_fs2.promises.readFile(
|
|
687
|
+
import_path2.default.join(dataPath, testDataFile),
|
|
933
688
|
"utf-8"
|
|
934
689
|
);
|
|
935
|
-
const possibleAnswersJson = await
|
|
936
|
-
|
|
690
|
+
const possibleAnswersJson = await import_fs2.promises.readFile(
|
|
691
|
+
import_path2.default.join(dataPath, answerDataFile),
|
|
937
692
|
"utf-8"
|
|
938
693
|
);
|
|
939
694
|
testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -950,19 +705,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
950
705
|
);
|
|
951
706
|
}
|
|
952
707
|
const fixSchema = (schema) => {
|
|
953
|
-
if (!schema || typeof schema !== "object")
|
|
708
|
+
if (!schema || typeof schema !== "object")
|
|
709
|
+
return { type: "object", properties: {} };
|
|
954
710
|
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
|
|
955
|
-
if (copy
|
|
956
|
-
if (copy.type
|
|
957
|
-
|
|
958
|
-
copy.type
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
711
|
+
if (!Array.isArray(copy)) {
|
|
712
|
+
if (copy.type) {
|
|
713
|
+
if (copy.type === "dict") copy.type = "object";
|
|
714
|
+
if (copy.type === "integer" || copy.type === "float")
|
|
715
|
+
copy.type = "number";
|
|
716
|
+
}
|
|
717
|
+
if (copy.properties && typeof copy.properties === "object") {
|
|
718
|
+
for (const k of Object.keys(copy.properties)) {
|
|
719
|
+
copy.properties[k] = fixSchema(
|
|
720
|
+
copy.properties[k]
|
|
721
|
+
);
|
|
722
|
+
}
|
|
963
723
|
}
|
|
724
|
+
if (copy.items) copy.items = fixSchema(copy.items);
|
|
725
|
+
return copy;
|
|
964
726
|
}
|
|
965
|
-
if (copy.items) copy.items = fixSchema(copy.items);
|
|
966
727
|
return copy;
|
|
967
728
|
};
|
|
968
729
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
@@ -973,6 +734,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
973
734
|
const runSingleCase = async (testCase) => {
|
|
974
735
|
const caseLogs = [];
|
|
975
736
|
const { function: tools, question: messages } = testCase;
|
|
737
|
+
const temp = config?.temperature;
|
|
738
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
976
739
|
try {
|
|
977
740
|
const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
978
741
|
const nameMap = /* @__PURE__ */ new Map();
|
|
@@ -982,7 +745,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
982
745
|
};
|
|
983
746
|
const transformedTools = tools.map((t) => {
|
|
984
747
|
const fixed = fixSchema(t.parameters);
|
|
985
|
-
const
|
|
748
|
+
const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
|
|
749
|
+
const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
|
|
986
750
|
const sanitized = sanitizeName(t.name);
|
|
987
751
|
nameMap.set(sanitized, t.name);
|
|
988
752
|
return {
|
|
@@ -995,9 +759,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
995
759
|
const toolsMap = Object.fromEntries(
|
|
996
760
|
transformedTools.map((t) => [
|
|
997
761
|
t.name,
|
|
998
|
-
(0,
|
|
762
|
+
(0, import_ai.tool)({
|
|
999
763
|
description: typeof t.description === "string" ? t.description : void 0,
|
|
1000
|
-
inputSchema: (0,
|
|
764
|
+
inputSchema: (0, import_ai.jsonSchema)(t.inputSchema)
|
|
1001
765
|
})
|
|
1002
766
|
])
|
|
1003
767
|
);
|
|
@@ -1012,16 +776,20 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1012
776
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
1013
777
|
);
|
|
1014
778
|
}
|
|
1015
|
-
const { toolCalls, text, finishReason } = await (0,
|
|
779
|
+
const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
|
|
1016
780
|
model,
|
|
1017
781
|
messages: flatMessages,
|
|
1018
782
|
tools: toolsMap,
|
|
1019
783
|
toolChoice: "auto",
|
|
784
|
+
...temperature !== void 0 ? { temperature } : {},
|
|
1020
785
|
// Pass original schema information to middleware
|
|
1021
786
|
providerOptions: {
|
|
1022
787
|
toolCallMiddleware: {
|
|
1023
788
|
originalToolSchemas: Object.fromEntries(
|
|
1024
|
-
transformedTools.map((t) => [
|
|
789
|
+
transformedTools.map((t) => [
|
|
790
|
+
t.name,
|
|
791
|
+
t.inputSchema
|
|
792
|
+
])
|
|
1025
793
|
)
|
|
1026
794
|
}
|
|
1027
795
|
}
|
|
@@ -1074,10 +842,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1074
842
|
const summarizeArgs = (args) => {
|
|
1075
843
|
if (args == null) return args;
|
|
1076
844
|
if (typeof args !== "object") return args;
|
|
1077
|
-
return Object.keys(args).sort().reduce(
|
|
1078
|
-
acc
|
|
1079
|
-
|
|
1080
|
-
|
|
845
|
+
return Object.keys(args).sort().reduce(
|
|
846
|
+
(acc, k) => {
|
|
847
|
+
acc[k] = args[k];
|
|
848
|
+
return acc;
|
|
849
|
+
},
|
|
850
|
+
{}
|
|
851
|
+
);
|
|
1081
852
|
};
|
|
1082
853
|
const expected = {};
|
|
1083
854
|
const actual = {};
|
|
@@ -1098,19 +869,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1098
869
|
diff.push(`- ${expectedFuncName}`);
|
|
1099
870
|
diff.push(`+ ${receivedName}`);
|
|
1100
871
|
}
|
|
1101
|
-
if (expectedParams && receivedArgs) {
|
|
872
|
+
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1102
873
|
const required = funcDesc?.parameters?.required ?? [];
|
|
1103
874
|
for (const req of required) {
|
|
1104
875
|
if (!(req in receivedArgs)) {
|
|
1105
876
|
diff.push(`- missing required param: ${req}`);
|
|
1106
877
|
}
|
|
1107
878
|
}
|
|
1108
|
-
for (const k of Object.keys(
|
|
879
|
+
for (const k of Object.keys(
|
|
880
|
+
receivedArgs
|
|
881
|
+
)) {
|
|
1109
882
|
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1110
883
|
diff.push(`+ unexpected param: ${k}`);
|
|
1111
884
|
}
|
|
1112
885
|
}
|
|
1113
|
-
for (const k of Object.keys(
|
|
886
|
+
for (const k of Object.keys(
|
|
887
|
+
receivedArgs
|
|
888
|
+
)) {
|
|
1114
889
|
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1115
890
|
const allowed = expectedParams[k];
|
|
1116
891
|
const got = receivedArgs[k];
|
|
@@ -1183,13 +958,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1183
958
|
);
|
|
1184
959
|
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1185
960
|
diff.push(`@@ function ${fname}`);
|
|
1186
|
-
if (expectedParamsAllowed && receivedArgs) {
|
|
961
|
+
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1187
962
|
for (const req of requiredParams) {
|
|
1188
963
|
if (!(req in receivedArgs)) {
|
|
1189
964
|
diff.push(`- missing required param: ${req}`);
|
|
1190
965
|
}
|
|
1191
966
|
}
|
|
1192
|
-
for (const k of Object.keys(
|
|
967
|
+
for (const k of Object.keys(
|
|
968
|
+
receivedArgs
|
|
969
|
+
)) {
|
|
1193
970
|
if (!Object.prototype.hasOwnProperty.call(
|
|
1194
971
|
expectedParamsAllowed,
|
|
1195
972
|
k
|
|
@@ -1197,7 +974,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1197
974
|
diff.push(`+ unexpected param: ${k}`);
|
|
1198
975
|
}
|
|
1199
976
|
}
|
|
1200
|
-
for (const k of Object.keys(
|
|
977
|
+
for (const k of Object.keys(
|
|
978
|
+
receivedArgs
|
|
979
|
+
)) {
|
|
1201
980
|
if (Object.prototype.hasOwnProperty.call(
|
|
1202
981
|
expectedParamsAllowed,
|
|
1203
982
|
k
|
|
@@ -1335,6 +1114,274 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
|
1335
1114
|
"BFCL_v3_parallel_multiple.json",
|
|
1336
1115
|
"BFCL_v3_parallel_multiple_possible_answer.json"
|
|
1337
1116
|
);
|
|
1117
|
+
|
|
1118
|
+
// src/benchmarks/json-generation.ts
|
|
1119
|
+
var import_ai2 = require("ai");
|
|
1120
|
+
var import_ajv = __toESM(require("ajv"), 1);
|
|
1121
|
+
var import_fs3 = require("fs");
|
|
1122
|
+
var import_path3 = __toESM(require("path"), 1);
|
|
1123
|
+
/**
 * Best-effort extraction of the first JSON value embedded in a model response.
 *
 * Strategies are tried in order and the first successful parse wins:
 *   1. the whole text parsed as JSON;
 *   2. the contents of the first Markdown code fence (```json ... ``` preferred,
 *      otherwise any ``` ... ``` fence);
 *   3. the first balanced `{...}` or `[...]` span, whichever opener appears
 *      earliest in the text.
 *
 * @param {string} text - Raw model output.
 * @returns {*} The parsed JSON value, or `undefined` when nothing parseable is found.
 */
function extractFirstJsonBlock(text) {
  try {
    return JSON.parse(text);
  } catch {
    // Not pure JSON; fall through to the more lenient strategies.
  }
  // The remaining strategies need string methods; treat anything else as "no JSON".
  if (typeof text !== "string") return void 0;
  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
  if (fenceMatch) {
    const inner = fenceMatch[1].trim();
    try {
      return JSON.parse(inner);
    } catch {
      // Fence content is not valid JSON; fall back to delimiter scanning.
    }
  }
  const startIdxObj = text.indexOf("{");
  const startIdxArr = text.indexOf("[");
  const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
  if (start === void 0) return void 0;
  const open = text[start] === "{" ? "{" : "[";
  const close = open === "{" ? "}" : "]";
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      // Delimiters inside a JSON string literal must not affect nesting depth.
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
      continue;
    }
    if (ch === open) depth++;
    else if (ch === close) depth--;
    if (depth === 0) {
      const candidate = text.slice(start, i + 1);
      try {
        return JSON.parse(candidate);
      } catch {
        // The first balanced span is not valid JSON; give up rather than guess.
      }
      break;
    }
  }
  return void 0;
}
|
|
1158
|
+
/**
 * Checks whether `actual` contains everything described by `expected`.
 *
 * - Primitives and `null` must be strictly equal.
 * - Arrays are compared position by position; `actual` may carry additional
 *   trailing elements beyond `expected.length`.
 * - Plain objects are compared key by key over the keys of `expected`;
 *   additional keys on `actual` are ignored.
 *
 * @param {*} expected - The reference value (the required subset).
 * @param {*} actual - The value under test.
 * @returns {boolean} `true` when every part of `expected` is matched by `actual`.
 */
function subsetMatch(expected, actual) {
  if (expected === null || typeof expected !== "object") {
    return expected === actual;
  }
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) return false;
    for (let i = 0; i < expected.length; i++) {
      if (!subsetMatch(expected[i], actual[i])) return false;
    }
    return true;
  }
  // An object is expected here: reject primitives, null, and arrays (an array
  // is typeof "object" too and would otherwise match e.g. an empty object).
  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    if (!subsetMatch(expected[key], actual[key])) return false;
  }
  return true;
}
|
|
1177
|
+
/**
 * Benchmark that prompts the model with a JSON Schema plus a list of facts and
 * grades each response on two axes:
 *   - schema validity, checked with Ajv against the test case's schema;
 *   - value agreement, checked with `subsetMatch` against the expected record
 *     that shares the test case's `id`.
 * A case counts as correct only when both checks pass.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Runs every test case sequentially against `model`.
   *
   * @param model - Model handle forwarded unchanged to `generateText`.
   * @param config - Optional settings; only a numeric `temperature` is read.
   * @returns An object with `score` (correct / total, 0 when there are no
   *   cases), `success` (score >= 0.8), `metrics`, and `logs`. When the
   *   datasets cannot be loaded, returns a failed result carrying `error`.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();
    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await import_fs3.promises.readFile(
        import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // Both files are JSON Lines: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        // Only forward temperature when the caller supplied a number.
        const temp = config?.temperature;
        const temperature = typeof temp === "number" ? temp : void 0;
        const { text } = await (0, import_ai2.generateText)({
          model,
          messages,
          ...temperature !== void 0 ? { temperature } : {}
        });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Extraction failure is reported below as an unparseable output.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) {
          schemaValidCount++;
        } else {
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        }
        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // Without an expected record the case can never count as a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;
        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        // A failure in one case (model call, schema compile, ...) must not abort the run.
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // Guard against an empty dataset: 0 / 0 would otherwise report NaN.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1294
|
+
/**
 * Structure-only variant of the JSON generation benchmark: each model response
 * is parsed and validated against the test case's JSON Schema with Ajv, and
 * the score is the fraction of cases whose output validates.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Runs every test case sequentially against `model`.
   *
   * @param model - Model handle forwarded unchanged to `generateText`.
   * @param config - Optional settings; only a numeric `temperature` is read.
   * @returns An object with `score`, `success` (score >= 0.8), `metrics`, and
   *   `logs`; a failed result carrying `error` when the tests cannot be loaded.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new import_ajv.default({ allErrors: true, strict: false });
    let tests = [];
    try {
      const dataDir = resolveDataDir();
      const testsFile = import_path3.default.join(dataDir, "json_generation_tests.jsonl");
      const rawTests = await import_fs3.promises.readFile(testsFile, "utf-8");
      // JSON Lines: one test case per non-blank line.
      const nonBlankLines = rawTests.split(/\r?\n/).filter((row) => row.trim().length > 0);
      tests = nonBlankLines.map((row) => JSON.parse(row));
    } catch (loadError) {
      const reason = loadError instanceof Error ? loadError.message : String(loadError);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${reason}`],
        error: loadError
      };
    }
    let schemaValidCount = 0;
    for (const tc of tests) {
      try {
        const systemMessage = {
          role: "system",
          content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
        };
        const userSections = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          JSON.stringify(tc.schema, null, 2),
          "Facts:",
          tc.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ];
        const userMessage = { role: "user", content: userSections.join("\n\n") };
        const request = { model, messages: [systemMessage, userMessage] };
        // Temperature is forwarded only when the caller supplied a number.
        const requestedTemperature = config?.temperature;
        if (typeof requestedTemperature === "number") {
          request.temperature = requestedTemperature;
        }
        const { text } = await (0, import_ai2.generateText)(request);
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Leave `parsed` undefined; reported as a parse failure below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
        } else {
          const validate = ajv.compile(tc.schema);
          if (validate(parsed)) {
            schemaValidCount++;
            logs.push(`[PASS] ${tc.id}`);
          } else {
            const details = (validate.errors || []).map((err) => `${err.instancePath} ${err.message}`).join(", ");
            logs.push(
              `[FAIL] ${tc.id}: Schema validation errors: ${details || "unknown"}`
            );
          }
        }
      } catch (caseError) {
        // One failing case is logged and the run continues with the next.
        const reason = caseError instanceof Error ? caseError.message : String(caseError);
        logs.push(`[ERROR] ${tc.id}: ${reason}`);
      }
    }
    const total = tests.length;
    let score = 0;
    if (total > 0) {
      score = schemaValidCount / total;
    }
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        schema_valid_count: schemaValidCount,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1338
1385
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1339
1386
|
0 && (module.exports = {
|
|
1340
1387
|
bfclMultipleBenchmark,
|