@ai-sdk-tool/eval 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -5
- package/dist/index.cjs +422 -375
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -4
- package/dist/index.d.ts +8 -4
- package/dist/index.js +421 -374
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
package/dist/index.js
CHANGED
|
@@ -71,7 +71,7 @@ function uniqueLines(lines) {
|
|
|
71
71
|
function suggestFixFromDiff(parsed) {
|
|
72
72
|
const suggestions = [];
|
|
73
73
|
const { error_type, expected, actual, diff } = parsed ?? {};
|
|
74
|
-
if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
|
|
74
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
|
|
75
75
|
const expectedName = expected?.function;
|
|
76
76
|
const actualName = actual?.function;
|
|
77
77
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
@@ -85,23 +85,23 @@ function suggestFixFromDiff(parsed) {
|
|
|
85
85
|
);
|
|
86
86
|
}
|
|
87
87
|
}
|
|
88
|
-
if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
|
|
89
|
-
const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
|
|
88
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
|
|
89
|
+
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
90
90
|
if (missing.length) {
|
|
91
91
|
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
92
92
|
}
|
|
93
93
|
}
|
|
94
|
-
if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
|
|
95
|
-
const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
|
|
94
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
|
|
95
|
+
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
96
96
|
if (extras.length) {
|
|
97
97
|
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
98
98
|
}
|
|
99
99
|
}
|
|
100
|
-
if (diff && diff.some((d) => d.startsWith("@@ param "))) {
|
|
101
|
-
const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
|
|
100
|
+
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
101
|
+
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
102
102
|
for (const param of targets) {
|
|
103
103
|
const allowedLine = diff.find(
|
|
104
|
-
(d) => d.startsWith("- expected one of:")
|
|
104
|
+
(d) => String(d).startsWith("- expected one of:")
|
|
105
105
|
);
|
|
106
106
|
if (allowedLine) {
|
|
107
107
|
const allowed = allowedLine.replace("- expected one of: ", "");
|
|
@@ -239,13 +239,13 @@ var reporters = {
|
|
|
239
239
|
};
|
|
240
240
|
|
|
241
241
|
// src/evaluate.ts
|
|
242
|
-
async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
242
|
+
async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
243
243
|
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
244
244
|
try {
|
|
245
245
|
console.log(
|
|
246
246
|
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
247
247
|
);
|
|
248
|
-
const result = await benchmark.run(model);
|
|
248
|
+
const result = await benchmark.run(model, config);
|
|
249
249
|
console.log(
|
|
250
250
|
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
251
251
|
);
|
|
@@ -274,7 +274,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
|
274
274
|
}
|
|
275
275
|
}
|
|
276
276
|
async function evaluate(options) {
|
|
277
|
-
const { models, benchmarks, reporter = "console" } = options;
|
|
277
|
+
const { models, benchmarks, reporter = "console", temperature } = options;
|
|
278
278
|
const modelEntries = [];
|
|
279
279
|
if (Array.isArray(models)) {
|
|
280
280
|
for (const m of models) modelEntries.push([void 0, m]);
|
|
@@ -293,7 +293,8 @@ async function evaluate(options) {
|
|
|
293
293
|
const evaluationResult = await runSingleBenchmark(
|
|
294
294
|
model,
|
|
295
295
|
benchmark,
|
|
296
|
-
modelKey
|
|
296
|
+
modelKey,
|
|
297
|
+
temperature !== void 0 ? { temperature } : void 0
|
|
297
298
|
);
|
|
298
299
|
allResults.push(evaluationResult);
|
|
299
300
|
}
|
|
@@ -308,17 +309,16 @@ async function evaluate(options) {
|
|
|
308
309
|
return allResults;
|
|
309
310
|
}
|
|
310
311
|
|
|
311
|
-
// src/benchmarks/
|
|
312
|
-
import { generateText } from "ai";
|
|
313
|
-
import Ajv from "ajv";
|
|
312
|
+
// src/benchmarks/bfcl.ts
|
|
313
|
+
import { generateText, jsonSchema, tool } from "ai";
|
|
314
314
|
import { promises as fs2 } from "fs";
|
|
315
315
|
import path2 from "path";
|
|
316
316
|
|
|
317
317
|
// src/utils/paths.ts
|
|
318
318
|
import fs from "fs";
|
|
319
|
+
import { createRequire } from "module";
|
|
319
320
|
import path from "path";
|
|
320
321
|
import { fileURLToPath } from "url";
|
|
321
|
-
import { createRequire } from "module";
|
|
322
322
|
function resolveDataDir(fromModuleUrl) {
|
|
323
323
|
const moduleUrl = fromModuleUrl;
|
|
324
324
|
const override = process.env.BFCL_DATA_DIR;
|
|
@@ -366,263 +366,6 @@ function resolveDataDir(fromModuleUrl) {
|
|
|
366
366
|
return path.join(pkgRoot, "data");
|
|
367
367
|
}
|
|
368
368
|
|
|
369
|
-
// src/benchmarks/json-generation.ts
|
|
370
|
-
function extractFirstJsonBlock(text) {
|
|
371
|
-
try {
|
|
372
|
-
return JSON.parse(text);
|
|
373
|
-
} catch {
|
|
374
|
-
}
|
|
375
|
-
const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
|
|
376
|
-
if (fenceMatch) {
|
|
377
|
-
const inner = fenceMatch[1].trim();
|
|
378
|
-
try {
|
|
379
|
-
return JSON.parse(inner);
|
|
380
|
-
} catch {
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
const startIdxObj = text.indexOf("{");
|
|
384
|
-
const startIdxArr = text.indexOf("[");
|
|
385
|
-
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
386
|
-
if (start === void 0) return void 0;
|
|
387
|
-
const open = text[start] === "{" ? "{" : "[";
|
|
388
|
-
const close = open === "{" ? "}" : "]";
|
|
389
|
-
let depth = 0;
|
|
390
|
-
for (let i = start; i < text.length; i++) {
|
|
391
|
-
const ch = text[i];
|
|
392
|
-
if (ch === open) depth++;
|
|
393
|
-
else if (ch === close) depth--;
|
|
394
|
-
if (depth === 0) {
|
|
395
|
-
const candidate = text.slice(start, i + 1);
|
|
396
|
-
try {
|
|
397
|
-
return JSON.parse(candidate);
|
|
398
|
-
} catch {
|
|
399
|
-
}
|
|
400
|
-
break;
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
return void 0;
|
|
404
|
-
}
|
|
405
|
-
function subsetMatch(expected, actual) {
|
|
406
|
-
if (expected === null || typeof expected !== "object") {
|
|
407
|
-
return expected === actual;
|
|
408
|
-
}
|
|
409
|
-
if (Array.isArray(expected)) {
|
|
410
|
-
if (!Array.isArray(actual)) return false;
|
|
411
|
-
for (let i = 0; i < expected.length; i++) {
|
|
412
|
-
if (!subsetMatch(expected[i], actual[i])) return false;
|
|
413
|
-
}
|
|
414
|
-
return true;
|
|
415
|
-
}
|
|
416
|
-
if (actual === null || typeof actual !== "object") return false;
|
|
417
|
-
const eObj = expected;
|
|
418
|
-
const aObj = actual;
|
|
419
|
-
for (const key of Object.keys(eObj)) {
|
|
420
|
-
if (!subsetMatch(eObj[key], aObj[key])) return false;
|
|
421
|
-
}
|
|
422
|
-
return true;
|
|
423
|
-
}
|
|
424
|
-
var jsonGenerationBenchmark = {
|
|
425
|
-
name: "json-generation",
|
|
426
|
-
version: "2.1.0",
|
|
427
|
-
description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
|
|
428
|
-
async run(model) {
|
|
429
|
-
const logs = [];
|
|
430
|
-
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
431
|
-
let schemaValidCount = 0;
|
|
432
|
-
let valueMatchCount = 0;
|
|
433
|
-
let correctCount = 0;
|
|
434
|
-
let tests = [];
|
|
435
|
-
const expectedMap = /* @__PURE__ */ new Map();
|
|
436
|
-
try {
|
|
437
|
-
const dataDir = resolveDataDir();
|
|
438
|
-
const testsJsonl = await fs2.readFile(
|
|
439
|
-
path2.join(dataDir, "json_generation_tests.jsonl"),
|
|
440
|
-
"utf-8"
|
|
441
|
-
);
|
|
442
|
-
const expectedJsonl = await fs2.readFile(
|
|
443
|
-
path2.join(dataDir, "json_generation_expected.jsonl"),
|
|
444
|
-
"utf-8"
|
|
445
|
-
);
|
|
446
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
447
|
-
const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
448
|
-
for (const r of expecteds) expectedMap.set(r.id, r);
|
|
449
|
-
} catch (e) {
|
|
450
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
451
|
-
return {
|
|
452
|
-
score: 0,
|
|
453
|
-
success: false,
|
|
454
|
-
metrics: {},
|
|
455
|
-
logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
|
|
456
|
-
error: e
|
|
457
|
-
};
|
|
458
|
-
}
|
|
459
|
-
for (const tc of tests) {
|
|
460
|
-
try {
|
|
461
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
462
|
-
const messages = [
|
|
463
|
-
{
|
|
464
|
-
role: "system",
|
|
465
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
466
|
-
},
|
|
467
|
-
{
|
|
468
|
-
role: "user",
|
|
469
|
-
content: [
|
|
470
|
-
"Generate a JSON object that reflects the following facts.",
|
|
471
|
-
"JSON Schema:",
|
|
472
|
-
schemaStr,
|
|
473
|
-
"Facts:",
|
|
474
|
-
tc.promptFacts,
|
|
475
|
-
"Output must be a single JSON only, with no additional text."
|
|
476
|
-
].join("\n\n")
|
|
477
|
-
}
|
|
478
|
-
];
|
|
479
|
-
const { text } = await generateText({ model, messages });
|
|
480
|
-
let parsed;
|
|
481
|
-
try {
|
|
482
|
-
parsed = extractFirstJsonBlock(text);
|
|
483
|
-
} catch {
|
|
484
|
-
}
|
|
485
|
-
if (parsed === void 0) {
|
|
486
|
-
logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
|
|
487
|
-
continue;
|
|
488
|
-
}
|
|
489
|
-
const validate = ajv.compile(tc.schema);
|
|
490
|
-
const valid = validate(parsed);
|
|
491
|
-
if (valid) schemaValidCount++;
|
|
492
|
-
else
|
|
493
|
-
logs.push(
|
|
494
|
-
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
495
|
-
);
|
|
496
|
-
const expectedRec = expectedMap.get(tc.id);
|
|
497
|
-
if (!expectedRec) {
|
|
498
|
-
logs.push(
|
|
499
|
-
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
500
|
-
);
|
|
501
|
-
}
|
|
502
|
-
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
503
|
-
if (valuesOk) valueMatchCount++;
|
|
504
|
-
if (valid && valuesOk) {
|
|
505
|
-
correctCount++;
|
|
506
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
507
|
-
} else {
|
|
508
|
-
logs.push(
|
|
509
|
-
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
510
|
-
parsed
|
|
511
|
-
)}`
|
|
512
|
-
);
|
|
513
|
-
}
|
|
514
|
-
} catch (e) {
|
|
515
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
516
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
517
|
-
}
|
|
518
|
-
}
|
|
519
|
-
const total = tests.length;
|
|
520
|
-
const score = correctCount / total;
|
|
521
|
-
return {
|
|
522
|
-
score,
|
|
523
|
-
success: score >= 0.8,
|
|
524
|
-
metrics: {
|
|
525
|
-
total_cases: total,
|
|
526
|
-
correct_count: correctCount,
|
|
527
|
-
schema_valid_count: schemaValidCount,
|
|
528
|
-
value_match_count: valueMatchCount,
|
|
529
|
-
accuracy: score
|
|
530
|
-
},
|
|
531
|
-
logs
|
|
532
|
-
};
|
|
533
|
-
}
|
|
534
|
-
};
|
|
535
|
-
var jsonGenerationSchemaOnlyBenchmark = {
|
|
536
|
-
name: "json-generation-schema-only",
|
|
537
|
-
version: "1.0.1",
|
|
538
|
-
description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
|
|
539
|
-
async run(model) {
|
|
540
|
-
const logs = [];
|
|
541
|
-
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
542
|
-
let tests = [];
|
|
543
|
-
try {
|
|
544
|
-
const dataDir = resolveDataDir();
|
|
545
|
-
const testsJsonl = await fs2.readFile(
|
|
546
|
-
path2.join(dataDir, "json_generation_tests.jsonl"),
|
|
547
|
-
"utf-8"
|
|
548
|
-
);
|
|
549
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
550
|
-
} catch (e) {
|
|
551
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
552
|
-
return {
|
|
553
|
-
score: 0,
|
|
554
|
-
success: false,
|
|
555
|
-
metrics: {},
|
|
556
|
-
logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
|
|
557
|
-
error: e
|
|
558
|
-
};
|
|
559
|
-
}
|
|
560
|
-
let schemaValidCount = 0;
|
|
561
|
-
for (const tc of tests) {
|
|
562
|
-
try {
|
|
563
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
564
|
-
const messages = [
|
|
565
|
-
{
|
|
566
|
-
role: "system",
|
|
567
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
568
|
-
},
|
|
569
|
-
{
|
|
570
|
-
role: "user",
|
|
571
|
-
content: [
|
|
572
|
-
"Generate a JSON object that reflects the following facts.",
|
|
573
|
-
"JSON Schema:",
|
|
574
|
-
schemaStr,
|
|
575
|
-
"Facts:",
|
|
576
|
-
tc.promptFacts,
|
|
577
|
-
"Output must be a single JSON only, with no additional text."
|
|
578
|
-
].join("\n\n")
|
|
579
|
-
}
|
|
580
|
-
];
|
|
581
|
-
const { text } = await generateText({ model, messages });
|
|
582
|
-
let parsed;
|
|
583
|
-
try {
|
|
584
|
-
parsed = extractFirstJsonBlock(text);
|
|
585
|
-
} catch {
|
|
586
|
-
}
|
|
587
|
-
if (parsed === void 0) {
|
|
588
|
-
logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
|
|
589
|
-
continue;
|
|
590
|
-
}
|
|
591
|
-
const validate = ajv.compile(tc.schema);
|
|
592
|
-
const valid = validate(parsed);
|
|
593
|
-
if (valid) {
|
|
594
|
-
schemaValidCount++;
|
|
595
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
596
|
-
} else {
|
|
597
|
-
logs.push(
|
|
598
|
-
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
599
|
-
);
|
|
600
|
-
}
|
|
601
|
-
} catch (e) {
|
|
602
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
603
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
const total = tests.length;
|
|
607
|
-
const score = total > 0 ? schemaValidCount / total : 0;
|
|
608
|
-
return {
|
|
609
|
-
score,
|
|
610
|
-
success: score >= 0.8,
|
|
611
|
-
metrics: {
|
|
612
|
-
total_cases: total,
|
|
613
|
-
schema_valid_count: schemaValidCount,
|
|
614
|
-
accuracy: score
|
|
615
|
-
},
|
|
616
|
-
logs
|
|
617
|
-
};
|
|
618
|
-
}
|
|
619
|
-
};
|
|
620
|
-
|
|
621
|
-
// src/benchmarks/bfcl.ts
|
|
622
|
-
import { generateText as generateText2, jsonSchema, tool } from "ai";
|
|
623
|
-
import { promises as fs3 } from "fs";
|
|
624
|
-
import path3 from "path";
|
|
625
|
-
|
|
626
369
|
// src/benchmarks/bfcl/ast-checker.ts
|
|
627
370
|
function standardizeString(input) {
|
|
628
371
|
if (typeof input !== "string") return input;
|
|
@@ -632,7 +375,7 @@ function standardizeString(input) {
|
|
|
632
375
|
function checkStringValue(param, modelValue, possibleAnswers) {
|
|
633
376
|
const standardizedModelValue = standardizeString(modelValue);
|
|
634
377
|
const standardizedPossibleAnswers = possibleAnswers.map(
|
|
635
|
-
(ans) => standardizeString(ans)
|
|
378
|
+
(ans) => standardizeString(String(ans))
|
|
636
379
|
);
|
|
637
380
|
if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
|
|
638
381
|
return {
|
|
@@ -659,8 +402,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
659
402
|
};
|
|
660
403
|
}
|
|
661
404
|
const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
|
|
405
|
+
const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
|
|
662
406
|
for (const param of requiredParams) {
|
|
663
|
-
if (!(param in
|
|
407
|
+
if (!(param in argsObj)) {
|
|
664
408
|
return {
|
|
665
409
|
valid: false,
|
|
666
410
|
error: `Missing required parameter: '${param}'.`,
|
|
@@ -668,87 +412,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
668
412
|
};
|
|
669
413
|
}
|
|
670
414
|
}
|
|
671
|
-
|
|
672
|
-
const
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
valid: false,
|
|
676
|
-
error: `Unexpected parameter: '${paramName}'.`,
|
|
677
|
-
error_type: "simple_function_checker:unexpected_param"
|
|
678
|
-
};
|
|
679
|
-
}
|
|
680
|
-
const possibleValues = possibleAnswerParams[paramName];
|
|
681
|
-
if (typeof modelValue === "string") {
|
|
682
|
-
const result = checkStringValue(paramName, modelValue, possibleValues);
|
|
683
|
-
if (!result.valid) return result;
|
|
684
|
-
} else if (Array.isArray(modelValue)) {
|
|
685
|
-
const modelValueStr = JSON.stringify(
|
|
686
|
-
modelValue.map((v) => standardizeString(v.toString())).sort()
|
|
687
|
-
);
|
|
688
|
-
const hasMatch = possibleValues.some(
|
|
689
|
-
(p) => JSON.stringify(
|
|
690
|
-
p.map((v) => standardizeString(v.toString())).sort()
|
|
691
|
-
) === modelValueStr
|
|
692
|
-
);
|
|
693
|
-
if (!hasMatch) {
|
|
415
|
+
if (modelArgs && typeof modelArgs === "object") {
|
|
416
|
+
for (const paramName of Object.keys(argsObj)) {
|
|
417
|
+
const modelValue = argsObj[paramName];
|
|
418
|
+
if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
|
|
694
419
|
return {
|
|
695
420
|
valid: false,
|
|
696
|
-
error: `
|
|
697
|
-
|
|
698
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
699
|
-
error_type: "value_error:list"
|
|
421
|
+
error: `Unexpected parameter: '${paramName}'.`,
|
|
422
|
+
error_type: "simple_function_checker:unexpected_param"
|
|
700
423
|
};
|
|
701
424
|
}
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
425
|
+
const possibleValues = possibleAnswerParams[paramName];
|
|
426
|
+
if (typeof modelValue === "string") {
|
|
427
|
+
const result = checkStringValue(
|
|
428
|
+
paramName,
|
|
429
|
+
modelValue,
|
|
430
|
+
possibleValues ?? []
|
|
431
|
+
);
|
|
432
|
+
if (!result.valid) return result;
|
|
433
|
+
} else if (Array.isArray(modelValue)) {
|
|
434
|
+
const modelValueStr = JSON.stringify(
|
|
435
|
+
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
436
|
+
);
|
|
437
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
438
|
+
if (!Array.isArray(p)) return false;
|
|
439
|
+
return JSON.stringify(
|
|
440
|
+
p.map((v) => standardizeString(String(v))).sort()
|
|
441
|
+
) === modelValueStr;
|
|
442
|
+
}) : false;
|
|
443
|
+
if (!hasMatch) {
|
|
444
|
+
return {
|
|
445
|
+
valid: false,
|
|
446
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
447
|
+
modelValue
|
|
448
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
449
|
+
error_type: "value_error:list"
|
|
450
|
+
};
|
|
451
|
+
}
|
|
452
|
+
} else {
|
|
453
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
|
|
454
|
+
if (modelValue === possibleValue) return true;
|
|
455
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
456
|
+
try {
|
|
457
|
+
const normalizeObject = (obj) => {
|
|
458
|
+
if (Array.isArray(obj)) {
|
|
459
|
+
return obj.map(normalizeObject);
|
|
460
|
+
}
|
|
461
|
+
if (obj && typeof obj === "object") {
|
|
462
|
+
const normalized = {};
|
|
463
|
+
for (const [key, value] of Object.entries(
|
|
464
|
+
obj
|
|
465
|
+
)) {
|
|
466
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
467
|
+
normalized[key] = value[0];
|
|
468
|
+
} else {
|
|
469
|
+
normalized[key] = normalizeObject(value);
|
|
470
|
+
}
|
|
718
471
|
}
|
|
472
|
+
return normalized;
|
|
719
473
|
}
|
|
720
|
-
return
|
|
721
|
-
}
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
return false;
|
|
474
|
+
return obj;
|
|
475
|
+
};
|
|
476
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
477
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
478
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
479
|
+
} catch {
|
|
480
|
+
return false;
|
|
481
|
+
}
|
|
729
482
|
}
|
|
483
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
484
|
+
return modelValue.toString() === possibleValue;
|
|
485
|
+
}
|
|
486
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
487
|
+
return modelValue === possibleValue.toString();
|
|
488
|
+
}
|
|
489
|
+
return false;
|
|
490
|
+
}) : false;
|
|
491
|
+
if (!hasMatch) {
|
|
492
|
+
return {
|
|
493
|
+
valid: false,
|
|
494
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
495
|
+
modelValue
|
|
496
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
497
|
+
error_type: "value_error:other"
|
|
498
|
+
};
|
|
730
499
|
}
|
|
731
|
-
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
732
|
-
return modelValue.toString() === possibleValue;
|
|
733
|
-
}
|
|
734
|
-
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
735
|
-
return modelValue === possibleValue.toString();
|
|
736
|
-
}
|
|
737
|
-
return false;
|
|
738
|
-
});
|
|
739
|
-
if (!hasMatch) {
|
|
740
|
-
return {
|
|
741
|
-
valid: false,
|
|
742
|
-
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
743
|
-
modelValue
|
|
744
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
745
|
-
error_type: "value_error:other"
|
|
746
|
-
};
|
|
747
500
|
}
|
|
748
501
|
}
|
|
749
502
|
}
|
|
750
503
|
for (const paramName in possibleAnswerParams) {
|
|
751
|
-
|
|
504
|
+
const val = possibleAnswerParams[paramName];
|
|
505
|
+
const isOptional = Array.isArray(val) && val.includes("");
|
|
506
|
+
if (!(paramName in argsObj) && !isOptional) {
|
|
752
507
|
return {
|
|
753
508
|
valid: false,
|
|
754
509
|
error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
|
|
@@ -834,10 +589,10 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
834
589
|
const category = testCase.id.split("_")[0];
|
|
835
590
|
try {
|
|
836
591
|
if (category === "simple") {
|
|
837
|
-
if (!modelOutput || modelOutput.length !== 1) {
|
|
592
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
838
593
|
return {
|
|
839
594
|
valid: false,
|
|
840
|
-
error: `Expected 1 function call, but got ${modelOutput
|
|
595
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
841
596
|
error_type: "simple:wrong_count"
|
|
842
597
|
};
|
|
843
598
|
}
|
|
@@ -879,19 +634,19 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
879
634
|
name,
|
|
880
635
|
version: "1.0.0",
|
|
881
636
|
description,
|
|
882
|
-
async run(model) {
|
|
637
|
+
async run(model, config) {
|
|
883
638
|
const logs = [];
|
|
884
639
|
let correctCount = 0;
|
|
885
640
|
let testCases = [];
|
|
886
641
|
try {
|
|
887
642
|
const dataPath = resolveDataDir();
|
|
888
643
|
logs.push(`[INFO] Using data dir: ${dataPath}`);
|
|
889
|
-
const testCasesJson = await
|
|
890
|
-
|
|
644
|
+
const testCasesJson = await fs2.readFile(
|
|
645
|
+
path2.join(dataPath, testDataFile),
|
|
891
646
|
"utf-8"
|
|
892
647
|
);
|
|
893
|
-
const possibleAnswersJson = await
|
|
894
|
-
|
|
648
|
+
const possibleAnswersJson = await fs2.readFile(
|
|
649
|
+
path2.join(dataPath, answerDataFile),
|
|
895
650
|
"utf-8"
|
|
896
651
|
);
|
|
897
652
|
testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -908,19 +663,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
908
663
|
);
|
|
909
664
|
}
|
|
910
665
|
const fixSchema = (schema) => {
|
|
911
|
-
if (!schema || typeof schema !== "object")
|
|
666
|
+
if (!schema || typeof schema !== "object")
|
|
667
|
+
return { type: "object", properties: {} };
|
|
912
668
|
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
|
|
913
|
-
if (copy
|
|
914
|
-
if (copy.type
|
|
915
|
-
|
|
916
|
-
copy.type
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
669
|
+
if (!Array.isArray(copy)) {
|
|
670
|
+
if (copy.type) {
|
|
671
|
+
if (copy.type === "dict") copy.type = "object";
|
|
672
|
+
if (copy.type === "integer" || copy.type === "float")
|
|
673
|
+
copy.type = "number";
|
|
674
|
+
}
|
|
675
|
+
if (copy.properties && typeof copy.properties === "object") {
|
|
676
|
+
for (const k of Object.keys(copy.properties)) {
|
|
677
|
+
copy.properties[k] = fixSchema(
|
|
678
|
+
copy.properties[k]
|
|
679
|
+
);
|
|
680
|
+
}
|
|
921
681
|
}
|
|
682
|
+
if (copy.items) copy.items = fixSchema(copy.items);
|
|
683
|
+
return copy;
|
|
922
684
|
}
|
|
923
|
-
if (copy.items) copy.items = fixSchema(copy.items);
|
|
924
685
|
return copy;
|
|
925
686
|
};
|
|
926
687
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
@@ -931,6 +692,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
931
692
|
const runSingleCase = async (testCase) => {
|
|
932
693
|
const caseLogs = [];
|
|
933
694
|
const { function: tools, question: messages } = testCase;
|
|
695
|
+
const temp = config?.temperature;
|
|
696
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
934
697
|
try {
|
|
935
698
|
const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
936
699
|
const nameMap = /* @__PURE__ */ new Map();
|
|
@@ -940,7 +703,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
940
703
|
};
|
|
941
704
|
const transformedTools = tools.map((t) => {
|
|
942
705
|
const fixed = fixSchema(t.parameters);
|
|
943
|
-
const
|
|
706
|
+
const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
|
|
707
|
+
const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
|
|
944
708
|
const sanitized = sanitizeName(t.name);
|
|
945
709
|
nameMap.set(sanitized, t.name);
|
|
946
710
|
return {
|
|
@@ -970,16 +734,20 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
970
734
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
971
735
|
);
|
|
972
736
|
}
|
|
973
|
-
const { toolCalls, text, finishReason } = await
|
|
737
|
+
const { toolCalls, text, finishReason } = await generateText({
|
|
974
738
|
model,
|
|
975
739
|
messages: flatMessages,
|
|
976
740
|
tools: toolsMap,
|
|
977
741
|
toolChoice: "auto",
|
|
742
|
+
...temperature !== void 0 ? { temperature } : {},
|
|
978
743
|
// Pass original schema information to middleware
|
|
979
744
|
providerOptions: {
|
|
980
745
|
toolCallMiddleware: {
|
|
981
746
|
originalToolSchemas: Object.fromEntries(
|
|
982
|
-
transformedTools.map((t) => [
|
|
747
|
+
transformedTools.map((t) => [
|
|
748
|
+
t.name,
|
|
749
|
+
t.inputSchema
|
|
750
|
+
])
|
|
983
751
|
)
|
|
984
752
|
}
|
|
985
753
|
}
|
|
@@ -1032,10 +800,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1032
800
|
const summarizeArgs = (args) => {
|
|
1033
801
|
if (args == null) return args;
|
|
1034
802
|
if (typeof args !== "object") return args;
|
|
1035
|
-
return Object.keys(args).sort().reduce(
|
|
1036
|
-
acc
|
|
1037
|
-
|
|
1038
|
-
|
|
803
|
+
return Object.keys(args).sort().reduce(
|
|
804
|
+
(acc, k) => {
|
|
805
|
+
acc[k] = args[k];
|
|
806
|
+
return acc;
|
|
807
|
+
},
|
|
808
|
+
{}
|
|
809
|
+
);
|
|
1039
810
|
};
|
|
1040
811
|
const expected = {};
|
|
1041
812
|
const actual = {};
|
|
@@ -1056,19 +827,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1056
827
|
diff.push(`- ${expectedFuncName}`);
|
|
1057
828
|
diff.push(`+ ${receivedName}`);
|
|
1058
829
|
}
|
|
1059
|
-
if (expectedParams && receivedArgs) {
|
|
830
|
+
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1060
831
|
const required = funcDesc?.parameters?.required ?? [];
|
|
1061
832
|
for (const req of required) {
|
|
1062
833
|
if (!(req in receivedArgs)) {
|
|
1063
834
|
diff.push(`- missing required param: ${req}`);
|
|
1064
835
|
}
|
|
1065
836
|
}
|
|
1066
|
-
for (const k of Object.keys(
|
|
837
|
+
for (const k of Object.keys(
|
|
838
|
+
receivedArgs
|
|
839
|
+
)) {
|
|
1067
840
|
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1068
841
|
diff.push(`+ unexpected param: ${k}`);
|
|
1069
842
|
}
|
|
1070
843
|
}
|
|
1071
|
-
for (const k of Object.keys(
|
|
844
|
+
for (const k of Object.keys(
|
|
845
|
+
receivedArgs
|
|
846
|
+
)) {
|
|
1072
847
|
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1073
848
|
const allowed = expectedParams[k];
|
|
1074
849
|
const got = receivedArgs[k];
|
|
@@ -1141,13 +916,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1141
916
|
);
|
|
1142
917
|
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1143
918
|
diff.push(`@@ function ${fname}`);
|
|
1144
|
-
if (expectedParamsAllowed && receivedArgs) {
|
|
919
|
+
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1145
920
|
for (const req of requiredParams) {
|
|
1146
921
|
if (!(req in receivedArgs)) {
|
|
1147
922
|
diff.push(`- missing required param: ${req}`);
|
|
1148
923
|
}
|
|
1149
924
|
}
|
|
1150
|
-
for (const k of Object.keys(
|
|
925
|
+
for (const k of Object.keys(
|
|
926
|
+
receivedArgs
|
|
927
|
+
)) {
|
|
1151
928
|
if (!Object.prototype.hasOwnProperty.call(
|
|
1152
929
|
expectedParamsAllowed,
|
|
1153
930
|
k
|
|
@@ -1155,7 +932,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1155
932
|
diff.push(`+ unexpected param: ${k}`);
|
|
1156
933
|
}
|
|
1157
934
|
}
|
|
1158
|
-
for (const k of Object.keys(
|
|
935
|
+
for (const k of Object.keys(
|
|
936
|
+
receivedArgs
|
|
937
|
+
)) {
|
|
1159
938
|
if (Object.prototype.hasOwnProperty.call(
|
|
1160
939
|
expectedParamsAllowed,
|
|
1161
940
|
k
|
|
@@ -1293,6 +1072,274 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
|
1293
1072
|
"BFCL_v3_parallel_multiple.json",
|
|
1294
1073
|
"BFCL_v3_parallel_multiple_possible_answer.json"
|
|
1295
1074
|
);
|
|
1075
|
+
|
|
1076
|
+
// src/benchmarks/json-generation.ts
|
|
1077
|
+
import { generateText as generateText2 } from "ai";
|
|
1078
|
+
import Ajv from "ajv";
|
|
1079
|
+
import { promises as fs3 } from "fs";
|
|
1080
|
+
import path3 from "path";
|
|
1081
|
+
/**
 * Best-effort extraction of a JSON value from free-form model output.
 *
 * Strategies, tried in order:
 *   1. Parse the whole input as JSON.
 *   2. Parse the contents of the first ```json fenced block, or failing
 *      that, the first generic ``` fenced block.
 *   3. Starting at the first `{` or `[`, scan for the matching closing
 *      bracket and parse that slice.
 *
 * The bracket scan skips over JSON string literals (including escaped
 * quotes), so a `}` or `]` that appears inside a string value does not
 * terminate the candidate early.
 *
 * @param {string} text - Raw model output.
 * @returns {unknown} The parsed JSON value, or `undefined` when nothing
 *   parseable was found. A literal `null` document parses to `null`.
 */
function extractFirstJsonBlock(text) {
  try {
    return JSON.parse(text);
  } catch {
    // Not a bare JSON document; fall through to the lenient strategies.
  }
  // The strategies below need string methods; anything else has no block.
  if (typeof text !== "string") return void 0;

  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
  if (fenceMatch) {
    try {
      return JSON.parse(fenceMatch[1].trim());
    } catch {
      // Fenced content is not valid JSON; try the bracket scan instead.
    }
  }

  const start = text.search(/[{\[]/);
  if (start < 0) return void 0;
  const open = text[start];
  const close = open === "{" ? "}" : "]";

  let depth = 0;
  let inString = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (ch === "\\") {
        // Skip the escaped character so `\"` does not close the string.
        i++;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === open) {
      depth++;
    } else if (ch === close) {
      depth--;
      if (depth === 0) {
        // Only the first balanced candidate is attempted.
        try {
          return JSON.parse(text.slice(start, i + 1));
        } catch {
          return void 0;
        }
      }
    }
  }
  return void 0;
}
|
|
1116
|
+
/**
 * Recursively checks that `actual` contains everything `expected` specifies.
 *
 * - Primitives and `null` are compared with strict equality.
 * - Arrays are compared index by index over the length of `expected`;
 *   extra trailing elements in `actual` are ignored.
 * - Objects are compared key by key over the keys of `expected`; extra
 *   keys in `actual` are ignored.
 *
 * An expected plain object never matches an array, and an expected array
 * never matches a non-array.
 *
 * @param {unknown} expected - The reference value.
 * @param {unknown} actual - The value under test.
 * @returns {boolean} `true` when every part of `expected` is found in `actual`.
 */
function subsetMatch(expected, actual) {
  if (expected === null || typeof expected !== "object") {
    return expected === actual;
  }
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) return false;
    for (let i = 0; i < expected.length; i++) {
      if (!subsetMatch(expected[i], actual[i])) return false;
    }
    return true;
  }
  // `typeof [] === "object"`, so arrays must be rejected explicitly here;
  // otherwise an expected `{}` would match an actual `[]`.
  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    if (!subsetMatch(expected[key], actual[key])) return false;
  }
  return true;
}
|
|
1135
|
+
/**
 * Benchmark that asks the model to produce a JSON document for each test
 * case and scores it on both JSON Schema validity and value agreement with
 * an expected record.
 *
 * Test cases are read from `json_generation_tests.jsonl` and expected
 * records from `json_generation_expected.jsonl`, both located in the
 * directory returned by `resolveDataDir()`. Test records are read for
 * `id`, `schema` and `promptFacts`; expected records for `id` and
 * `expected`.
 */
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Runs every test case sequentially against `model`.
   *
   * A case counts as correct only when the parsed output validates against
   * the case's schema AND `subsetMatch` finds the expected values in it.
   * Cases whose output cannot be parsed, or that throw, still count toward
   * the total.
   *
   * @param model - Language model passed through to `generateText`.
   * @param config - Optional settings; a numeric `temperature` is forwarded.
   * @returns Result with `score` (correct / total, or 0 when there are no
   *   cases), `success` (score >= 0.8), `metrics` and `logs`. When the
   *   datasets cannot be loaded, a failed result carrying `error` is
   *   returned instead.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new Ajv({ allErrors: true, strict: false });
    let schemaValidCount = 0;
    let valueMatchCount = 0;
    let correctCount = 0;
    let tests = [];
    const expectedMap = /* @__PURE__ */ new Map();
    try {
      const dataDir = resolveDataDir();
      const testsJsonl = await fs3.readFile(
        path3.join(dataDir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const expectedJsonl = await fs3.readFile(
        path3.join(dataDir, "json_generation_expected.jsonl"),
        "utf-8"
      );
      // JSONL: one JSON document per non-blank line.
      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
      for (const r of expecteds) expectedMap.set(r.id, r);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
        error: e
      };
    }
    for (const tc of tests) {
      try {
        const schemaStr = JSON.stringify(tc.schema, null, 2);
        const messages = [
          {
            role: "system",
            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
          },
          {
            role: "user",
            content: [
              "Generate a JSON object that reflects the following facts.",
              "JSON Schema:",
              schemaStr,
              "Facts:",
              tc.promptFacts,
              "Output must be a single JSON only, with no additional text."
            ].join("\n\n")
          }
        ];
        // Only forward temperature when the caller supplied a number.
        const temp = config?.temperature;
        const temperature = typeof temp === "number" ? temp : void 0;
        const { text } = await generateText2({
          model,
          messages,
          ...temperature !== void 0 ? { temperature } : {}
        });
        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Extraction failure is reported below as an unparseable output.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
          continue;
        }
        const validate = ajv.compile(tc.schema);
        const valid = validate(parsed);
        if (valid) {
          schemaValidCount++;
        } else {
          logs.push(
            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
          );
        }
        const expectedRec = expectedMap.get(tc.id);
        if (!expectedRec) {
          logs.push(
            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
          );
        }
        // Without an expected record the case cannot be a value match.
        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
        if (valuesOk) valueMatchCount++;
        if (valid && valuesOk) {
          correctCount++;
          logs.push(`[PASS] ${tc.id}`);
        } else {
          logs.push(
            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
              parsed
            )}`
          );
        }
      } catch (e) {
        const msg = e instanceof Error ? e.message : String(e);
        logs.push(`[ERROR] ${tc.id}: ${msg}`);
      }
    }
    const total = tests.length;
    // Guard against 0 / 0 => NaN when the dataset contains no cases.
    const score = total > 0 ? correctCount / total : 0;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        correct_count: correctCount,
        schema_valid_count: schemaValidCount,
        value_match_count: valueMatchCount,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1252
|
+
/**
 * Benchmark that checks only whether the model's JSON output validates
 * against each test case's JSON Schema; values are not compared.
 *
 * Test cases come from `json_generation_tests.jsonl` in the directory
 * returned by `resolveDataDir()` and are read for `id`, `schema` and
 * `promptFacts`.
 */
var jsonGenerationSchemaOnlyBenchmark = {
  name: "json-generation-schema-only",
  version: "1.0.1",
  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
  /**
   * Runs every test case sequentially against `model`.
   *
   * @param model - Language model passed through to `generateText`.
   * @param config - Optional settings; a numeric `temperature` is forwarded.
   * @returns Result with `score` (schema-valid / total, 0 for an empty
   *   dataset), `success` (score >= 0.8), `metrics` and `logs`; a failed
   *   result carrying `error` when the test file cannot be loaded.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new Ajv({ allErrors: true, strict: false });

    // Load the JSONL dataset: one JSON document per non-blank line.
    let cases = [];
    try {
      const dir = resolveDataDir();
      const raw = await fs3.readFile(
        path3.join(dir, "json_generation_tests.jsonl"),
        "utf-8"
      );
      const nonBlank = raw.split(/\r?\n/).filter((row) => row.trim().length > 0);
      cases = nonBlank.map((row) => JSON.parse(row));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [`[FATAL] Failed to load schema-only tests: ${reason}`],
        error: err
      };
    }

    let passed = 0;
    for (const tc of cases) {
      try {
        const systemMessage = {
          role: "system",
          content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
        };
        const promptParts = [
          "Generate a JSON object that reflects the following facts.",
          "JSON Schema:",
          JSON.stringify(tc.schema, null, 2),
          "Facts:",
          tc.promptFacts,
          "Output must be a single JSON only, with no additional text."
        ];
        const userMessage = { role: "user", content: promptParts.join("\n\n") };

        // Temperature is forwarded only when the caller gave a number.
        const request = { model, messages: [systemMessage, userMessage] };
        const configured = config?.temperature;
        if (typeof configured === "number") {
          request.temperature = configured;
        }
        const { text } = await generateText2(request);

        let parsed;
        try {
          parsed = extractFirstJsonBlock(text);
        } catch {
          // Leave `parsed` undefined; handled as a parse failure below.
        }
        if (parsed === void 0) {
          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
          continue;
        }

        const validate = ajv.compile(tc.schema);
        if (validate(parsed)) {
          passed++;
          logs.push(`[PASS] ${tc.id}`);
          continue;
        }
        const problems = validate.errors || [];
        const summary = problems.map((p) => `${p.instancePath} ${p.message}`).join(", ");
        logs.push(
          `[FAIL] ${tc.id}: Schema validation errors: ${summary || "unknown"}`
        );
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        logs.push(`[ERROR] ${tc.id}: ${reason}`);
      }
    }

    const total = cases.length;
    const score = total === 0 ? 0 : passed / total;
    return {
      score,
      success: score >= 0.8,
      metrics: {
        total_cases: total,
        schema_valid_count: passed,
        accuracy: score
      },
      logs
    };
  }
};
|
|
1296
1343
|
export {
|
|
1297
1344
|
bfclMultipleBenchmark,
|
|
1298
1345
|
bfclParallelBenchmark,
|