@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/{BFCL_v3_parallel.jsonl → BFCL_v4_parallel.jsonl} +2 -2
- package/data/{BFCL_v3_parallel_possible_answer.jsonl → BFCL_v4_parallel_possible_answer.jsonl} +2 -2
- package/data/BFCL_v4_simple.jsonl +400 -0
- package/data/BFCL_v4_simple_possible_answer.jsonl +400 -0
- package/data/ComplexFuncBench.jsonl +1000 -0
- package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
- package/dist/index.cjs +1264 -263
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +77 -11
- package/dist/index.d.ts +77 -11
- package/dist/index.js +1268 -264
- package/dist/index.js.map +1 -1
- package/package.json +18 -11
- package/data/BFCL_v3_simple.jsonl +0 -400
- package/data/BFCL_v3_simple_possible_answer.jsonl +0 -400
- /package/data/{BFCL_v3_multiple.jsonl → BFCL_v4_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_multiple_possible_answer.jsonl → BFCL_v4_multiple_possible_answer.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple.jsonl → BFCL_v4_parallel_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple_possible_answer.jsonl → BFCL_v4_parallel_multiple_possible_answer.jsonl} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -34,6 +34,7 @@ __export(index_exports, {
|
|
|
34
34
|
bfclParallelBenchmark: () => bfclParallelBenchmark,
|
|
35
35
|
bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
|
|
36
36
|
bfclSimpleBenchmark: () => bfclSimpleBenchmark,
|
|
37
|
+
complexFuncBenchBenchmark: () => complexFuncBenchBenchmark,
|
|
37
38
|
evaluate: () => evaluate,
|
|
38
39
|
jsonGenerationBenchmark: () => jsonGenerationBenchmark,
|
|
39
40
|
jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
|
|
@@ -61,7 +62,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
|
|
|
61
62
|
if (import_node_fs.default.existsSync(dataAtRoot)) {
|
|
62
63
|
return dataAtRoot;
|
|
63
64
|
}
|
|
64
|
-
} catch {
|
|
65
|
+
} catch (e) {
|
|
65
66
|
}
|
|
66
67
|
return null;
|
|
67
68
|
}
|
|
@@ -75,7 +76,7 @@ function tryResolveViaPackageJson(moduleUrl) {
|
|
|
75
76
|
if (import_node_fs.default.existsSync(dataAtPkg)) {
|
|
76
77
|
return dataAtPkg;
|
|
77
78
|
}
|
|
78
|
-
} catch {
|
|
79
|
+
} catch (e) {
|
|
79
80
|
}
|
|
80
81
|
return null;
|
|
81
82
|
}
|
|
@@ -83,7 +84,7 @@ function getStartDir(moduleUrl) {
|
|
|
83
84
|
if (moduleUrl) {
|
|
84
85
|
try {
|
|
85
86
|
return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
|
|
86
|
-
} catch {
|
|
87
|
+
} catch (e) {
|
|
87
88
|
return process.cwd();
|
|
88
89
|
}
|
|
89
90
|
}
|
|
@@ -177,7 +178,7 @@ function valuesMatch(modelValue, possibleValue) {
|
|
|
177
178
|
const normalizedModel = normalizeObject(modelValue);
|
|
178
179
|
const normalizedPossible = normalizeObject(possibleValue);
|
|
179
180
|
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
180
|
-
} catch {
|
|
181
|
+
} catch (e) {
|
|
181
182
|
return false;
|
|
182
183
|
}
|
|
183
184
|
}
|
|
@@ -306,7 +307,7 @@ function checkSingleParameter(paramName, modelValue, context) {
|
|
|
306
307
|
return checkStringValue(
|
|
307
308
|
paramName,
|
|
308
309
|
modelValue,
|
|
309
|
-
possibleValues
|
|
310
|
+
possibleValues != null ? possibleValues : []
|
|
310
311
|
);
|
|
311
312
|
}
|
|
312
313
|
if (Array.isArray(modelValue)) {
|
|
@@ -406,45 +407,99 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
406
407
|
// src/benchmarks/bfcl.ts
|
|
407
408
|
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
408
409
|
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
410
|
+
var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
|
|
411
|
+
function convertGroundTruthToXML(call) {
|
|
412
|
+
const keys = Object.keys(call);
|
|
413
|
+
if (keys.length === 0) {
|
|
414
|
+
return "<empty_call />";
|
|
415
|
+
}
|
|
416
|
+
const funcName = keys[0];
|
|
417
|
+
if (!funcName) {
|
|
418
|
+
return "<undefined_function />";
|
|
419
|
+
}
|
|
420
|
+
const params = call[funcName];
|
|
421
|
+
if (!params || typeof params !== "object") {
|
|
422
|
+
return `<${funcName} />`;
|
|
423
|
+
}
|
|
424
|
+
let xml = `<${funcName}>
|
|
425
|
+
`;
|
|
426
|
+
for (const [key, value] of Object.entries(params)) {
|
|
427
|
+
const displayValue = Array.isArray(value) ? value[0] : value;
|
|
428
|
+
let valueStr;
|
|
429
|
+
if (typeof displayValue === "string") {
|
|
430
|
+
valueStr = displayValue;
|
|
431
|
+
} else if (displayValue === null || displayValue === void 0) {
|
|
432
|
+
valueStr = "";
|
|
433
|
+
} else {
|
|
434
|
+
valueStr = JSON.stringify(displayValue);
|
|
435
|
+
}
|
|
436
|
+
xml += ` <${key}>${valueStr}</${key}>
|
|
437
|
+
`;
|
|
438
|
+
}
|
|
439
|
+
xml += `</${funcName}>`;
|
|
440
|
+
return xml;
|
|
441
|
+
}
|
|
442
|
+
function extractCategory(id) {
|
|
443
|
+
if (id.startsWith("parallel_multiple")) {
|
|
444
|
+
return "parallel_multiple";
|
|
445
|
+
}
|
|
446
|
+
if (id.startsWith("simple_python")) {
|
|
447
|
+
return "simple";
|
|
448
|
+
}
|
|
449
|
+
if (id.startsWith("simple_java")) {
|
|
450
|
+
return "simple";
|
|
451
|
+
}
|
|
452
|
+
if (id.startsWith("simple_javascript")) {
|
|
453
|
+
return "simple";
|
|
454
|
+
}
|
|
455
|
+
if (id.startsWith("parallel")) {
|
|
456
|
+
return "parallel";
|
|
457
|
+
}
|
|
458
|
+
if (id.startsWith("multiple")) {
|
|
459
|
+
return "multiple";
|
|
460
|
+
}
|
|
461
|
+
if (id.startsWith("simple")) {
|
|
462
|
+
return "simple";
|
|
463
|
+
}
|
|
464
|
+
return id.split("_")[0];
|
|
465
|
+
}
|
|
409
466
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
410
|
-
const category = testCase.id
|
|
467
|
+
const category = extractCategory(testCase.id);
|
|
411
468
|
try {
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
469
|
+
switch (category) {
|
|
470
|
+
case "simple": {
|
|
471
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
472
|
+
return {
|
|
473
|
+
valid: false,
|
|
474
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
475
|
+
error_type: "simple:wrong_count"
|
|
476
|
+
};
|
|
477
|
+
}
|
|
478
|
+
return simpleFunctionChecker(
|
|
479
|
+
testCase.function[0],
|
|
480
|
+
modelOutput[0],
|
|
481
|
+
possibleAnswer.ground_truth[0]
|
|
482
|
+
);
|
|
483
|
+
}
|
|
484
|
+
case "multiple": {
|
|
485
|
+
return multipleFunctionChecker(
|
|
486
|
+
testCase.function,
|
|
487
|
+
modelOutput,
|
|
488
|
+
possibleAnswer.ground_truth
|
|
489
|
+
);
|
|
490
|
+
}
|
|
491
|
+
case "parallel":
|
|
492
|
+
case "parallel_multiple": {
|
|
493
|
+
return parallelFunctionCheckerNoOrder(
|
|
494
|
+
testCase.function,
|
|
495
|
+
modelOutput,
|
|
496
|
+
possibleAnswer.ground_truth
|
|
497
|
+
);
|
|
498
|
+
}
|
|
499
|
+
default: {
|
|
500
|
+
return { valid: true };
|
|
419
501
|
}
|
|
420
|
-
return simpleFunctionChecker(
|
|
421
|
-
testCase.function[0],
|
|
422
|
-
modelOutput[0],
|
|
423
|
-
possibleAnswer.ground_truth[0]
|
|
424
|
-
);
|
|
425
|
-
}
|
|
426
|
-
if (category === "parallel") {
|
|
427
|
-
return parallelFunctionCheckerNoOrder(
|
|
428
|
-
testCase.function,
|
|
429
|
-
modelOutput,
|
|
430
|
-
possibleAnswer.ground_truth
|
|
431
|
-
);
|
|
432
|
-
}
|
|
433
|
-
if (category === "multiple") {
|
|
434
|
-
return multipleFunctionChecker(
|
|
435
|
-
testCase.function,
|
|
436
|
-
modelOutput,
|
|
437
|
-
possibleAnswer.ground_truth
|
|
438
|
-
);
|
|
439
|
-
}
|
|
440
|
-
if (category.includes("parallel-multiple")) {
|
|
441
|
-
return parallelFunctionCheckerNoOrder(
|
|
442
|
-
testCase.function,
|
|
443
|
-
modelOutput,
|
|
444
|
-
possibleAnswer.ground_truth
|
|
445
|
-
);
|
|
446
502
|
}
|
|
447
|
-
return { valid: true };
|
|
448
503
|
} catch (e) {
|
|
449
504
|
return {
|
|
450
505
|
valid: false,
|
|
@@ -486,7 +541,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
486
541
|
`[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
|
|
487
542
|
);
|
|
488
543
|
}
|
|
489
|
-
const
|
|
544
|
+
const fixSchemaType2 = (copy) => {
|
|
490
545
|
if (!copy.type) {
|
|
491
546
|
return;
|
|
492
547
|
}
|
|
@@ -510,16 +565,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
510
565
|
);
|
|
511
566
|
}
|
|
512
567
|
};
|
|
513
|
-
const
|
|
568
|
+
const fixSchema2 = (schema) => {
|
|
514
569
|
if (!schema || typeof schema !== "object") {
|
|
515
570
|
return { type: "object", properties: {} };
|
|
516
571
|
}
|
|
517
|
-
const copy = Array.isArray(schema) ? schema.map((v) =>
|
|
572
|
+
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
|
|
518
573
|
if (!Array.isArray(copy)) {
|
|
519
|
-
|
|
520
|
-
fixSchemaProperties(copy,
|
|
574
|
+
fixSchemaType2(copy);
|
|
575
|
+
fixSchemaProperties(copy, fixSchema2);
|
|
521
576
|
if (copy.items) {
|
|
522
|
-
copy.items =
|
|
577
|
+
copy.items = fixSchema2(copy.items);
|
|
523
578
|
}
|
|
524
579
|
return copy;
|
|
525
580
|
}
|
|
@@ -554,13 +609,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
554
609
|
try {
|
|
555
610
|
const arr = JSON.parse(raw);
|
|
556
611
|
return Array.isArray(arr) ? arr : [];
|
|
557
|
-
} catch {
|
|
612
|
+
} catch (e) {
|
|
558
613
|
return [];
|
|
559
614
|
}
|
|
560
615
|
};
|
|
561
616
|
const getSanitizedName = (rawName, transformedTools) => {
|
|
617
|
+
var _a, _b;
|
|
562
618
|
if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
|
|
563
|
-
return transformedTools[Number(rawName)]
|
|
619
|
+
return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
|
|
564
620
|
}
|
|
565
621
|
return rawName;
|
|
566
622
|
};
|
|
@@ -570,25 +626,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
570
626
|
}
|
|
571
627
|
try {
|
|
572
628
|
return JSON.parse(extractedArgs);
|
|
573
|
-
} catch {
|
|
629
|
+
} catch (e) {
|
|
574
630
|
return extractedArgs;
|
|
575
631
|
}
|
|
576
632
|
};
|
|
577
633
|
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
|
|
634
|
+
var _a, _b, _c, _d, _e, _f;
|
|
578
635
|
const call = c;
|
|
579
|
-
const rawName = call.toolName
|
|
636
|
+
const rawName = (_a = call.toolName) != null ? _a : call.name;
|
|
580
637
|
const sanitizedFromIndex = getSanitizedName(
|
|
581
638
|
rawName,
|
|
582
639
|
transformedTools
|
|
583
640
|
);
|
|
584
|
-
const originalName = nameMap.get(sanitizedFromIndex)
|
|
585
|
-
const extractedArgs = call.args
|
|
641
|
+
const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
|
|
642
|
+
const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
|
|
586
643
|
const parsedArgs = parseToolArgs(extractedArgs);
|
|
587
644
|
return {
|
|
588
645
|
...call,
|
|
589
646
|
toolName: originalName,
|
|
590
647
|
name: originalName,
|
|
591
|
-
args: parsedArgs
|
|
648
|
+
args: parsedArgs != null ? parsedArgs : {}
|
|
592
649
|
};
|
|
593
650
|
});
|
|
594
651
|
const summarizeArgs = (args) => {
|
|
@@ -620,7 +677,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
620
677
|
return `- expected one of: ${formatted}`;
|
|
621
678
|
})();
|
|
622
679
|
diffLines.push(expectedLine);
|
|
623
|
-
diffLines.push(`+
|
|
680
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
624
681
|
return diffLines;
|
|
625
682
|
};
|
|
626
683
|
const paramValueMatches = (allowed, got) => {
|
|
@@ -632,7 +689,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
632
689
|
if (Array.isArray(got)) {
|
|
633
690
|
return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
|
|
634
691
|
}
|
|
635
|
-
} catch {
|
|
692
|
+
} catch (e) {
|
|
636
693
|
}
|
|
637
694
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
638
695
|
});
|
|
@@ -670,13 +727,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
670
727
|
}
|
|
671
728
|
};
|
|
672
729
|
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
730
|
+
var _a, _b, _c, _d;
|
|
673
731
|
const funcDesc = tools[0];
|
|
674
|
-
const gt = possibleAnswer.ground_truth
|
|
675
|
-
const expectedFuncName = funcDesc
|
|
732
|
+
const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
|
|
733
|
+
const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
|
|
676
734
|
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
677
735
|
const received = restoredCalls[0];
|
|
678
|
-
const receivedName = received
|
|
679
|
-
const receivedArgs = summarizeArgs(received
|
|
736
|
+
const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
|
|
737
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
680
738
|
const expected = {
|
|
681
739
|
function: expectedFuncName,
|
|
682
740
|
params: expectedParams
|
|
@@ -688,7 +746,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
688
746
|
const diff = [];
|
|
689
747
|
checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
|
|
690
748
|
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
691
|
-
const required = funcDesc
|
|
749
|
+
const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
|
|
692
750
|
checkMissingParams(
|
|
693
751
|
required,
|
|
694
752
|
receivedArgs,
|
|
@@ -725,12 +783,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
725
783
|
}
|
|
726
784
|
};
|
|
727
785
|
const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
|
|
786
|
+
var _a;
|
|
728
787
|
for (let i = 0; i < restoredCalls.length; i += 1) {
|
|
729
788
|
if (usedActual.has(i)) {
|
|
730
789
|
continue;
|
|
731
790
|
}
|
|
732
791
|
const rc = restoredCalls[i];
|
|
733
|
-
const rcName = rc
|
|
792
|
+
const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
|
|
734
793
|
if (rcName === fname) {
|
|
735
794
|
return i;
|
|
736
795
|
}
|
|
@@ -744,6 +803,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
744
803
|
checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
|
|
745
804
|
};
|
|
746
805
|
const processExpectedCall = (options) => {
|
|
806
|
+
var _a, _b;
|
|
747
807
|
const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
|
|
748
808
|
const fname = Object.keys(expectedObj)[0];
|
|
749
809
|
const matchedIndex = findMatchingCallIndex(
|
|
@@ -756,10 +816,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
756
816
|
}
|
|
757
817
|
usedActual.add(matchedIndex);
|
|
758
818
|
const received = restoredCalls[matchedIndex];
|
|
759
|
-
const receivedArgs = summarizeArgs(received
|
|
819
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
760
820
|
const expectedParamsAllowed = expectedObj[fname];
|
|
761
821
|
const funcDesc = tools.find((t) => t.name === fname);
|
|
762
|
-
const requiredParams = funcDesc
|
|
822
|
+
const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
|
|
763
823
|
diff.push(`@@ function ${fname}`);
|
|
764
824
|
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
765
825
|
validateFunctionParams({
|
|
@@ -771,10 +831,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
771
831
|
}
|
|
772
832
|
};
|
|
773
833
|
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
774
|
-
|
|
834
|
+
var _a;
|
|
835
|
+
const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
|
|
775
836
|
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
776
837
|
const actualNames = restoredCalls.map(
|
|
777
|
-
(c) =>
|
|
838
|
+
(c) => {
|
|
839
|
+
var _a2;
|
|
840
|
+
return (_a2 = c.toolName) != null ? _a2 : c.name;
|
|
841
|
+
}
|
|
778
842
|
);
|
|
779
843
|
const expected = {
|
|
780
844
|
functions: expectedNames
|
|
@@ -800,14 +864,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
800
864
|
return { expected, actual, diff };
|
|
801
865
|
};
|
|
802
866
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
803
|
-
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) :
|
|
867
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
|
|
804
868
|
logs.push(
|
|
805
869
|
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
806
870
|
);
|
|
807
871
|
const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
|
|
872
|
+
var _a, _b, _c, _d;
|
|
808
873
|
try {
|
|
809
874
|
const firstTool = transformedTools[0];
|
|
810
|
-
const schemaType = firstTool
|
|
875
|
+
const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
|
|
811
876
|
caseLogs.push(
|
|
812
877
|
`[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
813
878
|
);
|
|
@@ -823,49 +888,103 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
823
888
|
caseLogs.push(
|
|
824
889
|
`[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
825
890
|
);
|
|
826
|
-
} catch {
|
|
891
|
+
} catch (e) {
|
|
827
892
|
caseLogs.push(
|
|
828
893
|
`[DEBUG] ${testCaseId}: failed to serialize toolCalls`
|
|
829
894
|
);
|
|
830
895
|
}
|
|
831
896
|
};
|
|
832
|
-
const
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
flatMessages,
|
|
837
|
-
mwOriginalText,
|
|
838
|
-
text,
|
|
839
|
-
finishReason,
|
|
840
|
-
mwParsedToolCalls,
|
|
841
|
-
restoredCalls,
|
|
842
|
-
possibleAnswer
|
|
843
|
-
} = options;
|
|
844
|
-
const lastUser = (() => {
|
|
845
|
-
const reversed = [...flatMessages].reverse();
|
|
846
|
-
const found = reversed.find(
|
|
847
|
-
(m) => m.role === "user"
|
|
848
|
-
);
|
|
849
|
-
return found?.content ?? void 0;
|
|
850
|
-
})();
|
|
851
|
-
const rawModelText = (() => {
|
|
852
|
-
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
853
|
-
return mwOriginalText;
|
|
897
|
+
const hasPercentPattern = (diff) => {
|
|
898
|
+
return diff.some((d) => {
|
|
899
|
+
if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
|
|
900
|
+
return false;
|
|
854
901
|
}
|
|
855
|
-
|
|
856
|
-
|
|
902
|
+
const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
|
|
903
|
+
if (!numMatch) {
|
|
904
|
+
return false;
|
|
857
905
|
}
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
906
|
+
const num = Number.parseFloat(numMatch[1]);
|
|
907
|
+
return num >= 1 && num <= 100;
|
|
908
|
+
});
|
|
909
|
+
};
|
|
910
|
+
const isValueError = (errorType, diff) => {
|
|
911
|
+
return !!(errorType == null ? void 0 : errorType.includes("value_error")) || diff.some((d) => d.startsWith("@@ param"));
|
|
912
|
+
};
|
|
913
|
+
const isFunctionNameError = (errorType, diff) => {
|
|
914
|
+
return !!(errorType == null ? void 0 : errorType.includes("wrong_func_name")) || diff.some((d) => d.includes("function name"));
|
|
915
|
+
};
|
|
916
|
+
const isMissingParamError = (errorType, diff) => {
|
|
917
|
+
return !!(errorType == null ? void 0 : errorType.includes("missing_required")) || diff.some((d) => d.includes("missing required param"));
|
|
918
|
+
};
|
|
919
|
+
const isUnexpectedParamError = (errorType, diff) => {
|
|
920
|
+
return !!(errorType == null ? void 0 : errorType.includes("unexpected_param")) || diff.some((d) => d.includes("unexpected param"));
|
|
921
|
+
};
|
|
922
|
+
const classifyByErrorPatterns = (errorType, diff) => {
|
|
923
|
+
const patterns = [
|
|
924
|
+
[
|
|
925
|
+
isValueError,
|
|
926
|
+
hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH"
|
|
927
|
+
],
|
|
928
|
+
[isFunctionNameError, "WRONG_FUNCTION"],
|
|
929
|
+
[isMissingParamError, "MISSING_PARAMS"],
|
|
930
|
+
[isUnexpectedParamError, "UNEXPECTED_PARAMS"]
|
|
931
|
+
];
|
|
932
|
+
for (const [classifier, result] of patterns) {
|
|
933
|
+
if (classifier(errorType, diff)) {
|
|
934
|
+
return result;
|
|
935
|
+
}
|
|
936
|
+
}
|
|
937
|
+
if (errorType == null ? void 0 : errorType.includes("cannot_find_match")) {
|
|
938
|
+
return "NO_MATCH";
|
|
939
|
+
}
|
|
940
|
+
return null;
|
|
941
|
+
};
|
|
942
|
+
const classifyByCallCount = (actualCount, expectedCount) => {
|
|
943
|
+
if (actualCount === 0 && expectedCount > 0) {
|
|
944
|
+
return "PARSE_FAILURE";
|
|
945
|
+
}
|
|
946
|
+
if (actualCount > 0 && actualCount < expectedCount) {
|
|
947
|
+
return "PARTIAL_CALLS";
|
|
948
|
+
}
|
|
949
|
+
if (actualCount > expectedCount) {
|
|
950
|
+
return "EXTRA_CALLS";
|
|
951
|
+
}
|
|
952
|
+
return null;
|
|
953
|
+
};
|
|
954
|
+
const classifyFailureType = (options) => {
|
|
955
|
+
const { errorType, restoredCalls, expectedCount, diff } = options;
|
|
956
|
+
const actualCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;
|
|
957
|
+
const countBasedResult = classifyByCallCount(
|
|
958
|
+
actualCount,
|
|
959
|
+
expectedCount
|
|
960
|
+
);
|
|
961
|
+
if (countBasedResult) {
|
|
962
|
+
return countBasedResult;
|
|
963
|
+
}
|
|
964
|
+
const patternBasedResult = classifyByErrorPatterns(errorType, diff);
|
|
965
|
+
if (patternBasedResult) {
|
|
966
|
+
return patternBasedResult;
|
|
967
|
+
}
|
|
968
|
+
return "OTHER";
|
|
969
|
+
};
|
|
970
|
+
const extractRawModelText = (mwOriginalText, text) => {
|
|
971
|
+
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
972
|
+
return mwOriginalText;
|
|
973
|
+
}
|
|
974
|
+
if (typeof text === "string") {
|
|
975
|
+
return text;
|
|
976
|
+
}
|
|
977
|
+
return "";
|
|
978
|
+
};
|
|
979
|
+
const extractLastUserQuery = (flatMessages) => {
|
|
980
|
+
var _a;
|
|
981
|
+
const reversed = [...flatMessages].reverse();
|
|
982
|
+
const found = reversed.find((m) => m.role === "user");
|
|
983
|
+
const content = (_a = found == null ? void 0 : found.content) != null ? _a : "";
|
|
984
|
+
return content.length > 200 ? `${content.slice(0, 200)}...` : content;
|
|
985
|
+
};
|
|
986
|
+
const truncateText = (text, maxLen) => {
|
|
987
|
+
return text.length > maxLen ? `${text.slice(0, maxLen)}...` : text;
|
|
869
988
|
};
|
|
870
989
|
const logFailureDetails = (options) => {
|
|
871
990
|
const {
|
|
@@ -883,43 +1002,37 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
883
1002
|
} = options;
|
|
884
1003
|
try {
|
|
885
1004
|
const category = testCase.id.split("_")[0];
|
|
886
|
-
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
caseLogs.push(
|
|
896
|
-
`[DEBUG-FAIL] ${JSON.stringify({
|
|
897
|
-
id: testCase.id,
|
|
898
|
-
message: checkerResult.error,
|
|
899
|
-
error_type: checkerResult.error_type,
|
|
900
|
-
expected,
|
|
901
|
-
actual,
|
|
902
|
-
diff
|
|
903
|
-
})}`
|
|
904
|
-
);
|
|
905
|
-
try {
|
|
906
|
-
const contextPayload = buildFailureContext({
|
|
907
|
-
testCase,
|
|
908
|
-
tools,
|
|
909
|
-
flatMessages,
|
|
910
|
-
mwOriginalText,
|
|
911
|
-
text,
|
|
912
|
-
finishReason,
|
|
913
|
-
mwParsedToolCalls,
|
|
1005
|
+
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
|
|
1006
|
+
const gtArr = possibleAnswer.ground_truth;
|
|
1007
|
+
const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
|
|
1008
|
+
const rawModelText = extractRawModelText(mwOriginalText, text);
|
|
1009
|
+
const lastUserQuery = extractLastUserQuery(flatMessages);
|
|
1010
|
+
const failurePayload = {
|
|
1011
|
+
id: testCase.id,
|
|
1012
|
+
category: classifyFailureType({
|
|
1013
|
+
errorType: checkerResult.error_type,
|
|
914
1014
|
restoredCalls,
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
1015
|
+
expectedCount,
|
|
1016
|
+
diff
|
|
1017
|
+
}),
|
|
1018
|
+
message: checkerResult.error,
|
|
1019
|
+
error_type: checkerResult.error_type,
|
|
1020
|
+
expected,
|
|
1021
|
+
actual,
|
|
1022
|
+
diff,
|
|
1023
|
+
context: {
|
|
1024
|
+
raw_model_text: truncateText(rawModelText, 500),
|
|
1025
|
+
raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
|
|
1026
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1027
|
+
expected_count: expectedCount,
|
|
1028
|
+
actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
|
|
1029
|
+
finish_reason: finishReason,
|
|
1030
|
+
last_user_query: lastUserQuery,
|
|
1031
|
+
tool_names: tools.map((t) => t.name)
|
|
1032
|
+
}
|
|
1033
|
+
};
|
|
1034
|
+
caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
|
|
1035
|
+
} catch (e) {
|
|
923
1036
|
caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
|
|
924
1037
|
}
|
|
925
1038
|
};
|
|
@@ -998,7 +1111,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
998
1111
|
const flatMessages = flattenMessages(messages);
|
|
999
1112
|
const { transformedTools, nameMap } = buildTransformedTools(
|
|
1000
1113
|
tools,
|
|
1001
|
-
|
|
1114
|
+
fixSchema2
|
|
1002
1115
|
);
|
|
1003
1116
|
const toolsMap = buildToolsMap(transformedTools);
|
|
1004
1117
|
return { flatMessages, transformedTools, nameMap, toolsMap };
|
|
@@ -1020,6 +1133,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1020
1133
|
const mwParsedToolCalls = parseDebugToolCalls(
|
|
1021
1134
|
debugSummaryRef.toolCalls
|
|
1022
1135
|
);
|
|
1136
|
+
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1137
|
+
if (!possibleAnswer) {
|
|
1138
|
+
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1139
|
+
}
|
|
1140
|
+
if (process.env.DEBUG_PARSER_OUTPUT === "true") {
|
|
1141
|
+
const groundTruth = possibleAnswer.ground_truth;
|
|
1142
|
+
const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
|
|
1143
|
+
console.log("\n========== BFCL CASE DEBUG ==========");
|
|
1144
|
+
console.log(`Test Case: ${testCase.id}`);
|
|
1145
|
+
console.log(`Expected count: ${groundTruth.length} call(s)`);
|
|
1146
|
+
console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
|
|
1147
|
+
console.log(expectedXML);
|
|
1148
|
+
console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
|
|
1149
|
+
console.log(mwOriginalText || text || "(empty)");
|
|
1150
|
+
console.log(
|
|
1151
|
+
"\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
|
|
1152
|
+
);
|
|
1153
|
+
console.log(JSON.stringify(toolCalls, null, 2));
|
|
1154
|
+
console.log("======================================\n");
|
|
1155
|
+
}
|
|
1023
1156
|
logRawToolCalls({
|
|
1024
1157
|
toolCalls,
|
|
1025
1158
|
finishReason,
|
|
@@ -1027,10 +1160,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1027
1160
|
testCaseId: testCase.id,
|
|
1028
1161
|
caseLogs
|
|
1029
1162
|
});
|
|
1030
|
-
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1031
|
-
if (!possibleAnswer) {
|
|
1032
|
-
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1033
|
-
}
|
|
1034
1163
|
const restoredCalls = restoreToolCalls(
|
|
1035
1164
|
toolCalls || [],
|
|
1036
1165
|
nameMap,
|
|
@@ -1051,12 +1180,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1051
1180
|
caseLogs
|
|
1052
1181
|
});
|
|
1053
1182
|
};
|
|
1054
|
-
const
|
|
1183
|
+
const runSingleCase2 = async (testCase) => {
|
|
1055
1184
|
const caseLogs = [];
|
|
1056
1185
|
const { function: tools } = testCase;
|
|
1057
|
-
const temp = config
|
|
1186
|
+
const temp = config == null ? void 0 : config.temperature;
|
|
1058
1187
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1059
|
-
const maxTok = config
|
|
1188
|
+
const maxTok = config == null ? void 0 : config.maxTokens;
|
|
1060
1189
|
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
1061
1190
|
try {
|
|
1062
1191
|
const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
|
|
@@ -1082,15 +1211,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1082
1211
|
});
|
|
1083
1212
|
} catch (e) {
|
|
1084
1213
|
caseLogs.push(
|
|
1085
|
-
`[ERROR] ${testCase.id}: Model generation failed: ${e
|
|
1214
|
+
`[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
|
|
1086
1215
|
);
|
|
1087
|
-
if (e
|
|
1216
|
+
if (e == null ? void 0 : e.stack) {
|
|
1088
1217
|
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
1089
1218
|
}
|
|
1090
1219
|
return { valid: false, logs: caseLogs };
|
|
1091
1220
|
}
|
|
1092
1221
|
};
|
|
1093
|
-
const
|
|
1222
|
+
const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
|
|
1094
1223
|
const results = new Array(items.length);
|
|
1095
1224
|
let idx = 0;
|
|
1096
1225
|
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
@@ -1106,10 +1235,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1106
1235
|
await Promise.all(workers);
|
|
1107
1236
|
return results;
|
|
1108
1237
|
};
|
|
1109
|
-
const resultsPerCase = await
|
|
1238
|
+
const resultsPerCase = await mapWithConcurrency2(
|
|
1110
1239
|
testCases,
|
|
1111
1240
|
concurrency,
|
|
1112
|
-
async (tc) =>
|
|
1241
|
+
async (tc) => runSingleCase2(tc)
|
|
1113
1242
|
);
|
|
1114
1243
|
correctCount = resultsPerCase.reduce(
|
|
1115
1244
|
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
@@ -1127,14 +1256,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1127
1256
|
};
|
|
1128
1257
|
}
|
|
1129
1258
|
const score = correctCount / testCases.length;
|
|
1259
|
+
const caseResults = resultsPerCase.map((r, i) => ({
|
|
1260
|
+
id: testCases[i].id,
|
|
1261
|
+
valid: r.valid
|
|
1262
|
+
}));
|
|
1130
1263
|
return {
|
|
1131
1264
|
score,
|
|
1132
1265
|
success: score > 0.95,
|
|
1133
|
-
// High success threshold as requested
|
|
1134
1266
|
metrics: {
|
|
1135
1267
|
correct_count: correctCount,
|
|
1136
1268
|
total_cases: testCases.length,
|
|
1137
|
-
accuracy: score
|
|
1269
|
+
accuracy: score,
|
|
1270
|
+
case_results: JSON.stringify(caseResults)
|
|
1138
1271
|
},
|
|
1139
1272
|
logs
|
|
1140
1273
|
};
|
|
@@ -1154,42 +1287,410 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1154
1287
|
}
|
|
1155
1288
|
var bfclSimpleBenchmark = createBfclBenchmark(
|
|
1156
1289
|
"bfcl-simple",
|
|
1157
|
-
"BFCL Simple Function Calling",
|
|
1158
|
-
"
|
|
1159
|
-
"
|
|
1290
|
+
"BFCL v4 Simple Function Calling",
|
|
1291
|
+
"BFCL_v4_simple.jsonl",
|
|
1292
|
+
"BFCL_v4_simple_possible_answer.jsonl"
|
|
1160
1293
|
);
|
|
1161
1294
|
var bfclParallelBenchmark = createBfclBenchmark(
|
|
1162
1295
|
"bfcl-parallel",
|
|
1163
|
-
"BFCL Parallel Function Calling",
|
|
1164
|
-
"
|
|
1165
|
-
"
|
|
1296
|
+
"BFCL v4 Parallel Function Calling",
|
|
1297
|
+
"BFCL_v4_parallel.jsonl",
|
|
1298
|
+
"BFCL_v4_parallel_possible_answer.jsonl"
|
|
1166
1299
|
);
|
|
1167
1300
|
var bfclMultipleBenchmark = createBfclBenchmark(
|
|
1168
1301
|
"bfcl-multiple",
|
|
1169
|
-
"BFCL Multiple Function Calling",
|
|
1170
|
-
"
|
|
1171
|
-
"
|
|
1302
|
+
"BFCL v4 Multiple Function Calling",
|
|
1303
|
+
"BFCL_v4_multiple.jsonl",
|
|
1304
|
+
"BFCL_v4_multiple_possible_answer.jsonl"
|
|
1172
1305
|
);
|
|
1173
1306
|
var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
1174
1307
|
"bfcl-parallel-multiple",
|
|
1175
|
-
"BFCL Parallel & Multiple Function Calling",
|
|
1176
|
-
"
|
|
1177
|
-
"
|
|
1308
|
+
"BFCL v4 Parallel & Multiple Function Calling",
|
|
1309
|
+
"BFCL_v4_parallel_multiple.jsonl",
|
|
1310
|
+
"BFCL_v4_parallel_multiple_possible_answer.jsonl"
|
|
1178
1311
|
);
|
|
1179
1312
|
|
|
1180
|
-
// src/benchmarks/
|
|
1313
|
+
// src/benchmarks/complex-func-bench.ts
|
|
1181
1314
|
var import_node_fs3 = require("fs");
|
|
1182
1315
|
var import_node_path3 = __toESM(require("path"), 1);
|
|
1183
1316
|
var import_ai2 = require("ai");
|
|
1317
|
+
var LINE_SPLIT_REGEX2 = /\r?\n/;
|
|
1318
|
+
/**
 * Normalizes a value for lenient string comparison.
 *
 * Strings are lower-cased and stripped of surrounding whitespace; any other
 * value is handed back untouched.
 *
 * @param {*} input - Value to normalize.
 * @returns {*} The normalized string, or `input` itself when it is not a string.
 */
function standardizeString2(input) {
  return typeof input === "string" ? input.toLowerCase().trim() : input;
}
|
|
1324
|
+
/**
 * Rebuilds a JSON-like value with object keys in sorted order so that its
 * serialization no longer depends on property insertion order.
 * Arrays keep their element order; values exposing `toJSON` are left as-is.
 *
 * @param {*} value - Value to canonicalize.
 * @returns {*} Canonicalized copy (primitives are returned unchanged).
 */
function sortKeysDeep2(value) {
  if (Array.isArray(value)) {
    return value.map((entry) => sortKeysDeep2(entry));
  }
  if (value === null || typeof value !== "object" || typeof value.toJSON === "function") {
    return value;
  }
  // Null-prototype target so a literal "__proto__" key stays an ordinary property.
  const ordered = Object.create(null);
  for (const key of Object.keys(value).sort()) {
    ordered[key] = sortKeysDeep2(value[key]);
  }
  return ordered;
}

/**
 * Tells whether a number and a string denote the same numeric value.
 * Blank strings never match: `Number("")` is 0, which would otherwise make
 * an empty answer equal to zero.
 *
 * @param {number} num
 * @param {string} text
 * @returns {boolean}
 */
function numberMatchesText2(num, text) {
  if (num.toString() === text) {
    return true;
  }
  return text.trim() !== "" && num === Number(text);
}

/**
 * Leniently compares a model-produced argument value with the expected one.
 *
 * - identical values match;
 * - two strings match after `standardizeString2` normalization;
 * - a number and a string match when they denote the same number;
 * - two objects/arrays match when their JSON serializations are equal,
 *   ignoring object key order (array order still matters).
 *
 * @param {*} modelValue - Value produced by the model.
 * @param {*} expectedValue - Ground-truth value.
 * @returns {boolean} True when the values are considered equivalent.
 */
function valuesMatch2(modelValue, expectedValue) {
  if (modelValue === expectedValue) {
    return true;
  }
  const modelType = typeof modelValue;
  const expectedType = typeof expectedValue;
  if (modelType === "string" && expectedType === "string") {
    return standardizeString2(modelValue) === standardizeString2(expectedValue);
  }
  if (modelType === "number" && expectedType === "string") {
    return numberMatchesText2(modelValue, expectedValue);
  }
  if (modelType === "string" && expectedType === "number") {
    return numberMatchesText2(expectedValue, modelValue);
  }
  if (modelType === "object" && modelValue !== null && expectedType === "object" && expectedValue !== null) {
    try {
      return JSON.stringify(sortKeysDeep2(modelValue)) === JSON.stringify(sortKeysDeep2(expectedValue));
    } catch (e) {
      // Unserializable input (cycles, BigInt, ...) cannot be compared; treat as a mismatch.
      return false;
    }
  }
  return false;
}
|
|
1346
|
+
/**
 * Checks that the model called the function the ground truth expects.
 *
 * @param {string} modelFuncName - Name of the function the model invoked.
 * @param {string} expectedFuncName - Name required by the ground truth.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 *   `{ valid: true }` on an exact (strict-equality) match, otherwise a
 *   failure record tagged `function_name_mismatch`.
 */
function validateFunctionName(modelFuncName, expectedFuncName) {
  const namesAgree = modelFuncName === expectedFuncName;
  if (namesAgree) {
    return { valid: true };
  }
  const error = `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`;
  return { valid: false, error, error_type: "function_name_mismatch" };
}
|
|
1356
|
+
/**
 * Verifies that every required parameter present in the ground truth was
 * also supplied by the model.
 *
 * Only own properties count: the `in` operator would also see inherited
 * members such as `toString` or `constructor`, making a parameter with such
 * a name look present on every object. Non-object argument bags are treated
 * as having no parameters instead of throwing.
 *
 * @param {string[]} requiredParams - Names listed as required by the tool spec.
 * @param {object} modelArgs - Arguments produced by the model.
 * @param {object} expectedArgs - Ground-truth arguments.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 *   `{ valid: true }`, or a failure record tagged `missing_required_param`
 *   for the first missing parameter.
 */
function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
  const hasOwnParam = (bag, key) => bag !== null && typeof bag === "object" && Object.prototype.hasOwnProperty.call(bag, key);
  for (const param of requiredParams) {
    if (hasOwnParam(expectedArgs, param) && !hasOwnParam(modelArgs, param)) {
      return {
        valid: false,
        error: `Missing required parameter: '${param}'`,
        error_type: "missing_required_param"
      };
    }
  }
  return { valid: true };
}
|
|
1368
|
+
/**
 * Compares each ground-truth argument with the value the model supplied.
 *
 * A parameter absent from the model call is tolerated when it is optional
 * and reported as `missing_param` when it is required. Present parameters
 * are compared with `valuesMatch2`; the first mismatch is reported as
 * `value_mismatch`. Parameters the model added beyond the ground truth are
 * not inspected.
 *
 * Presence is decided on own properties only, so inherited members such as
 * `toString` are never mistaken for a supplied argument.
 *
 * @param {object} expectedArgs - Ground-truth arguments.
 * @param {object} modelArgs - Arguments produced by the model.
 * @param {string[]} requiredParams - Names listed as required by the tool spec.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function validateParamValues(expectedArgs, modelArgs, requiredParams) {
  const modelSupplies = (key) => modelArgs !== null && typeof modelArgs === "object" && Object.prototype.hasOwnProperty.call(modelArgs, key);
  for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
    if (!modelSupplies(paramName)) {
      if (requiredParams.includes(paramName)) {
        return {
          valid: false,
          error: `Missing parameter: '${paramName}'`,
          error_type: "missing_param"
        };
      }
      // Optional parameter the model chose to omit.
      continue;
    }
    const modelValue = modelArgs[paramName];
    if (!valuesMatch2(modelValue, expectedValue)) {
      return {
        valid: false,
        error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
        error_type: "value_mismatch"
      };
    }
  }
  return { valid: true };
}
|
|
1391
|
+
/**
 * Validates one model tool call against one ground-truth entry.
 *
 * The ground truth is an object whose first key is the expected function
 * name and whose value is the expected argument map. Checks run in order:
 * function name, required parameters, then parameter values; the first
 * failing check's result is returned.
 *
 * @param {{toolName?: string, name?: string, args?: object}} modelCall
 * @param {object} expected - `{ [functionName]: expectedArgs }`.
 * @param {Array<{name: string, parameters?: {required?: string[]}}>} toolSpecs
 *   Tool definitions; used to look up the required-parameter list.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkFunctionCall(modelCall, expected, toolSpecs) {
  const [expectedFuncName] = Object.keys(expected);
  const expectedArgs = expected[expectedFuncName];
  const calledName = modelCall.toolName != null ? modelCall.toolName : modelCall.name;
  const calledArgs = modelCall.args != null ? modelCall.args : {};

  const nameCheck = validateFunctionName(calledName, expectedFuncName);
  if (!nameCheck.valid) {
    return nameCheck;
  }

  // Missing spec, missing `parameters`, or missing `required` all mean "nothing required".
  const spec = toolSpecs.find((candidate) => candidate.name === expectedFuncName);
  let requiredParams = [];
  if (spec != null && spec.parameters != null && spec.parameters.required != null) {
    requiredParams = spec.parameters.required;
  }

  const requiredCheck = validateRequiredParams(requiredParams, calledArgs, expectedArgs);
  if (!requiredCheck.valid) {
    return requiredCheck;
  }
  return validateParamValues(expectedArgs, calledArgs, requiredParams);
}
|
|
1413
|
+
/**
 * Validates the full set of model tool calls against the ground truth,
 * ignoring call order.
 *
 * The call counts must be equal. A single expected call is checked directly
 * so its detailed error is reported. For several calls, each expected entry
 * must be paired with a distinct model call that `checkFunctionCall`
 * accepts. Pairing uses augmenting paths rather than first-fit: because
 * matching is lenient, one model call may satisfy several expected entries,
 * and a greedy choice could consume the only call another entry can use
 * even though a complete assignment exists.
 *
 * @param {Array<object>} modelCalls - Calls produced by the model.
 * @param {Array<object>} expectedCalls - Ground-truth calls.
 * @param {Array<object>} toolSpecs - Tool definitions forwarded to `checkFunctionCall`.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 *   `wrong_call_count` on a length mismatch, `no_matching_call` naming the
 *   first expected function that cannot be paired, otherwise `{ valid: true }`.
 */
function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
  if (modelCalls.length !== expectedCalls.length) {
    return {
      valid: false,
      error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
      error_type: "wrong_call_count"
    };
  }
  if (expectedCalls.length === 1) {
    return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
  }

  // Lazily computed, memoized compatibility verdicts: verdicts[e][m].
  const verdicts = expectedCalls.map(() => new Array(modelCalls.length));
  const isCompatible = (e, m) => {
    if (verdicts[e][m] === void 0) {
      verdicts[e][m] = Boolean(checkFunctionCall(modelCalls[m], expectedCalls[e], toolSpecs).valid);
    }
    return verdicts[e][m];
  };

  // ownerOf[m] is the index of the expected call currently paired with model call m, or -1.
  const ownerOf = new Array(modelCalls.length).fill(-1);
  const tryAssign = (e, visited) => {
    for (let m = 0; m < modelCalls.length; m++) {
      if (visited.has(m) || !isCompatible(e, m)) {
        continue;
      }
      visited.add(m);
      // Take a free call, or evict the current owner if it can be re-homed elsewhere.
      if (ownerOf[m] === -1 || tryAssign(ownerOf[m], visited)) {
        ownerOf[m] = e;
        return true;
      }
    }
    return false;
  };

  for (let e = 0; e < expectedCalls.length; e++) {
    if (!tryAssign(e, new Set())) {
      const expectedFuncName = Object.keys(expectedCalls[e])[0];
      return {
        valid: false,
        error: `Could not find matching call for function '${expectedFuncName}'`,
        error_type: "no_matching_call"
      };
    }
  }
  return { valid: true };
}
|
|
1449
|
+
/**
 * Rewrites non-standard type names on a schema node, in place, to their
 * JSON Schema counterparts: `dict` -> `object`, `tuple` -> `array`,
 * `integer`/`float` -> `number`. Any other (or missing) type is left alone.
 *
 * @param {{type?: *}} copy - Schema node to adjust; mutated.
 * @returns {void}
 */
var fixSchemaType = (copy) => {
  // `switch` compares with strict equality, so only exact string names are rewritten.
  switch (copy.type) {
    case "dict":
      copy.type = "object";
      break;
    case "tuple":
      copy.type = "array";
      break;
    case "integer":
    case "float":
      copy.type = "number";
      break;
    default:
      break;
  }
};
|
|
1463
|
+
/**
 * Produces a normalized copy of a tool parameter schema.
 *
 * Type names are rewritten via `fixSchemaType`, and the same normalization
 * is applied recursively to `properties` entries and `items`. Anything that
 * is not an object becomes an empty object schema. Arrays are normalized
 * element by element.
 *
 * The input is never modified: `properties` is rebuilt as a fresh object
 * rather than assigned into, because a shallow spread copy still shares the
 * original `properties` object with the caller.
 *
 * @param {*} schema - Schema node (or array of nodes) to normalize.
 * @returns {object|Array} Normalized copy.
 */
var fixSchema = (schema) => {
  if (!schema || typeof schema !== "object") {
    return { type: "object", properties: {} };
  }
  if (Array.isArray(schema)) {
    return schema.map((entry) => fixSchema(entry));
  }
  const copy = { ...schema };
  fixSchemaType(copy);
  if (copy.properties && typeof copy.properties === "object") {
    copy.properties = Object.fromEntries(
      Object.entries(copy.properties).map(([key, sub]) => [key, fixSchema(sub)])
    );
  }
  if (copy.items) {
    copy.items = fixSchema(copy.items);
  }
  return copy;
};
|
|
1483
|
+
/**
 * Converts benchmark tool definitions into the tool map passed to
 * `generateText`, together with a lookup from sanitized back to original
 * tool names.
 *
 * Each tool's parameter schema is normalized with `fixSchema`; when the
 * result is not an object schema an empty object schema is used instead.
 * Names are sanitized to `[a-zA-Z0-9_-]`, capped at 64 characters, and
 * fall back to "tool" when nothing is left.
 *
 * @param {Array<{name: string, description?: *, parameters?: *}>} tools
 * @returns {{nameMap: Map<string, string>, toolsMap: object}}
 *   `nameMap` maps sanitized -> original name; `toolsMap` is keyed by
 *   sanitized name.
 */
function buildTools(tools) {
  const nameMap = /* @__PURE__ */ new Map();
  const prepared = [];
  for (const spec of tools) {
    const normalized = fixSchema(spec.parameters);
    const isObjectSchema = normalized && typeof normalized === "object" && normalized.type === "object";
    const inputSchema = isObjectSchema ? normalized : { type: "object", properties: {} };
    const safeName = spec.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
    nameMap.set(safeName, spec.name);
    prepared.push({ safeName, description: spec.description, inputSchema });
  }

  // Tool objects are created only after every definition has been prepared.
  const toolEntries = prepared.map(({ safeName, description, inputSchema }) => {
    const toolDefinition = (0, import_ai2.tool)({
      description: typeof description === "string" ? description : void 0,
      inputSchema: (0, import_ai2.jsonSchema)(inputSchema)
    });
    return [safeName, toolDefinition];
  });
  return { nameMap, toolsMap: Object.fromEntries(toolEntries) };
}
|
|
1508
|
+
/**
 * Maps `items` through an async `mapper`, running at most
 * `concurrencyLimit` mapper calls at a time and preserving input order in
 * the returned array.
 *
 * The limit is sanitized before use: fractional values are floored, and
 * anything that is not a number >= 1 (0, negatives, NaN) falls back to 1.
 * Without this, a fractional or NaN limit makes array construction throw a
 * RangeError, and a limit of 0 starts no workers and returns an array of
 * unfilled slots.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} concurrencyLimit - Maximum number of in-flight mapper calls.
 * @param {(item: *) => Promise<*>} mapper - Async transform for one item.
 * @returns {Promise<Array>} Results positioned like their inputs. Rejects
 *   with the first mapper rejection.
 */
async function mapWithConcurrency(items, concurrencyLimit, mapper) {
  const results = new Array(items.length);
  const requested = Number(concurrencyLimit);
  // `NaN >= 1` is false, so non-numeric limits also take the fallback.
  const limit = requested >= 1 ? Math.floor(requested) : 1;
  const workerCount = Math.min(limit, items.length);

  let nextIndex = 0;
  const worker = async () => {
    // Claiming an index is synchronous, so workers never process the same item twice.
    while (nextIndex < items.length) {
      const current = nextIndex;
      nextIndex += 1;
      results[current] = await mapper(items[current]);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
1524
|
+
/**
 * Runs one benchmark case: asks the model for tool calls and scores them
 * against the case's ground truth.
 *
 * The ground truth is looked up before the model is invoked, so a case with
 * no recorded answer fails immediately instead of spending a generation
 * whose result could never be scored.
 *
 * Never rejects: any error is recorded as an `[ERROR]` log line and the
 * case is reported invalid.
 *
 * @param {{id: string, function: Array<object>, question: Array<object>}} testCase
 *   `function` holds the tool definitions and `question` the chat messages.
 * @param {*} model - Language model handed to `generateText`.
 * @param {Map<string, {ground_truth: Array<object>}>} possibleAnswersMap
 * @param {number} [temperature] - Forwarded only when defined.
 * @param {number} [maxTokens] - Forwarded as `maxOutputTokens` only when defined.
 * @returns {Promise<{valid: boolean, logs: string[]}>}
 */
async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
  const caseLogs = [];
  const { function: tools, question: messages } = testCase;
  try {
    const possibleAnswer = possibleAnswersMap.get(testCase.id);
    if (!possibleAnswer) {
      throw new Error(`No possible answer for id: ${testCase.id}`);
    }

    const { nameMap, toolsMap } = buildTools(tools);
    const debugSummaryRef = {};
    const providerOptions = {
      toolCallMiddleware: { debugSummary: debugSummaryRef }
    };
    const { toolCalls, finishReason } = await (0, import_ai2.generateText)({
      model,
      messages,
      tools: toolsMap,
      toolChoice: "auto",
      providerOptions,
      ...temperature !== void 0 ? { temperature } : {},
      ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
    });

    // Translate sanitized tool names back to the benchmark's original names.
    const restoredCalls = (toolCalls != null ? toolCalls : []).map((call) => {
      const rawName = call.toolName != null ? call.toolName : call.name;
      const mappedName = nameMap.get(rawName);
      const originalName = mappedName != null ? mappedName : rawName;
      let args = call.input != null ? call.input : call.args;
      if (args == null) {
        args = {};
      }
      return { toolName: originalName, name: originalName, args };
    });
    caseLogs.push(
      `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
    );

    const checkerResult = checkAllFunctionCalls(
      restoredCalls,
      possibleAnswer.ground_truth,
      tools
    );
    if (checkerResult.valid) {
      caseLogs.push(`[PASS] ${testCase.id}`);
      return { valid: true, logs: caseLogs };
    }
    caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
    return { valid: false, logs: caseLogs };
  } catch (e) {
    caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
    return { valid: false, logs: caseLogs };
  }
}
|
|
1575
|
+
/**
 * Reads a JSON Lines test file and returns its records in file order.
 * Blank and whitespace-only lines are skipped.
 *
 * @param {string} dataPath - Directory containing the data files.
 * @param {string} testDataFile - File name of the JSONL test file.
 * @returns {Promise<Array<object>>} One parsed value per non-blank line.
 *   Rejects when the file cannot be read or a line is not valid JSON.
 */
async function loadTestData(dataPath, testDataFile) {
  const filePath = import_node_path3.default.join(dataPath, testDataFile);
  const raw = await import_node_fs3.promises.readFile(filePath, "utf-8");
  const records = [];
  for (const line of raw.split(LINE_SPLIT_REGEX2)) {
    if (line.trim().length === 0) {
      continue;
    }
    records.push(JSON.parse(line));
  }
  return records;
}
|
|
1582
|
+
/**
 * Reads a JSON Lines answer file and indexes its records by `id`.
 * Blank and whitespace-only lines are skipped; when an id repeats, the
 * later record replaces the earlier one.
 *
 * @param {string} dataPath - Directory containing the data files.
 * @param {string} answerDataFile - File name of the JSONL answer file.
 * @returns {Promise<Map<*, object>>} Map from record `id` to the record.
 *   Rejects when the file cannot be read or a line is not valid JSON.
 */
async function loadAnswerData(dataPath, answerDataFile) {
  const filePath = import_node_path3.default.join(dataPath, answerDataFile);
  const raw = await import_node_fs3.promises.readFile(filePath, "utf-8");
  const answersById = /* @__PURE__ */ new Map();
  for (const line of raw.split(LINE_SPLIT_REGEX2)) {
    if (line.trim().length === 0) {
      continue;
    }
    const answer = JSON.parse(line);
    answersById.set(answer.id, answer);
  }
  return answersById;
}
|
|
1590
|
+
/**
 * Collects run settings from the environment and the benchmark config.
 *
 * - `COMPLEXFUNCBENCH_LIMIT`: numeric cap on the number of cases; returned
 *   as-is (possibly NaN) or `undefined` when unset/empty.
 * - `COMPLEXFUNCBENCH_CONCURRENCY`: worker count; defaults to 4 when unset
 *   or not a finite number, otherwise floored to an integer and clamped to
 *   at least 1. Flooring matters because the value is used as an array
 *   length downstream, where a fractional length throws a RangeError.
 * - `config.temperature` / `config.maxTokens`: passed through only when
 *   they are numbers.
 *
 * @param {{temperature?: *, maxTokens?: *}} [config]
 * @returns {{limit: (number|undefined), concurrency: number,
 *            temperature: (number|undefined), maxTokens: (number|undefined)}}
 */
function getConfigValues(config) {
  const readEnvNumber = (raw) => (raw ? Number(raw) : void 0);
  const readConfigNumber = (key) => (config != null && typeof config[key] === "number" ? config[key] : void 0);

  const limit = readEnvNumber(process.env.COMPLEXFUNCBENCH_LIMIT);
  const requestedConcurrency = readEnvNumber(process.env.COMPLEXFUNCBENCH_CONCURRENCY);
  const concurrency = Number.isFinite(requestedConcurrency) ? Math.max(1, Math.floor(requestedConcurrency)) : 4;

  return {
    limit,
    concurrency,
    temperature: readConfigNumber("temperature"),
    maxTokens: readConfigNumber("maxTokens")
  };
}
|
|
1599
|
+
/**
 * Folds per-case outcomes into a single benchmark result.
 *
 * The score is the fraction of valid cases over `testCases.length`, and the
 * run counts as successful when the score is strictly above 0.5. With no
 * test cases, a failed result carrying a single explanatory log line is
 * returned instead.
 *
 * @param {Array<{valid: boolean, logs: string[]}>} resultsPerCase
 * @param {Array} testCases - The cases that were run; only its length is used.
 * @returns {{score: number, success: boolean, metrics: object, logs: string[]}}
 */
function aggregateResults(resultsPerCase, testCases) {
  let correctCount = 0;
  const logs = [];
  for (const outcome of resultsPerCase) {
    if (outcome.valid) {
      correctCount += 1;
    }
    logs.push(...outcome.logs);
  }

  const totalCases = testCases.length;
  if (totalCases === 0) {
    return {
      score: 0,
      success: false,
      metrics: {},
      logs: ["No test cases found."]
    };
  }

  const score = correctCount / totalCases;
  const metrics = {
    correct_count: correctCount,
    total_cases: totalCases,
    accuracy: score
  };
  return { score, success: score > 0.5, metrics, logs };
}
|
|
1628
|
+
/**
 * Builds a ComplexFuncBench-style benchmark object.
 *
 * The returned benchmark's `run` loads the test and answer files from the
 * resolved data directory, optionally truncates the case list to the
 * configured limit, evaluates the cases with bounded concurrency and
 * aggregates the outcomes. Setup `[INFO]` lines are placed ahead of the
 * per-case logs.
 *
 * `run` never rejects: any failure is returned as a zero-score result with
 * a `[FATAL]` log line. The caught value is normalized to an `Error` first,
 * so a thrown `null`/`undefined`/string cannot make the handler itself
 * throw while reading `.message`.
 *
 * @param {string} name - Benchmark identifier.
 * @param {string} description - Human-readable description.
 * @param {string} testDataFile - JSONL file with the test cases.
 * @param {string} answerDataFile - JSONL file with the ground-truth answers.
 * @returns {{name: string, version: string, description: string,
 *            run: (model: *, config?: object) => Promise<object>}}
 */
function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
  return {
    name,
    version: "1.0.0",
    description,
    async run(model, config) {
      const logs = [];
      try {
        const dataPath = resolveDataDir();
        logs.push(`[INFO] Using data dir: ${dataPath}`);
        let testCases = await loadTestData(dataPath, testDataFile);
        const possibleAnswersMap = await loadAnswerData(
          dataPath,
          answerDataFile
        );
        const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
        if (limit && Number.isFinite(limit) && limit > 0) {
          testCases = testCases.slice(0, limit);
          logs.push(`[INFO] Limiting test cases to ${limit}`);
        }
        logs.push(
          `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
        );
        const resultsPerCase = await mapWithConcurrency(
          testCases,
          concurrency,
          (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
        );
        const result = aggregateResults(resultsPerCase, testCases);
        const caseLogs = result.logs != null ? result.logs : [];
        result.logs = [...logs, ...caseLogs];
        return result;
      } catch (e) {
        const error = e instanceof Error ? e : new Error(String(e));
        return {
          score: 0,
          success: false,
          metrics: {},
          error,
          logs: [
            `[FATAL] Failed to run benchmark ${name}: ${error.message}`
          ]
        };
      }
    }
  };
}
|
|
1674
|
+
// Ready-made ComplexFuncBench benchmark wired to the JSONL data files named below.
var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
  "complex-func-bench",
  "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
  "ComplexFuncBench.jsonl",
  "ComplexFuncBench_possible_answer.jsonl"
);
|
|
1680
|
+
|
|
1681
|
+
// src/benchmarks/json-generation.ts
|
|
1682
|
+
var import_node_fs4 = require("fs");
|
|
1683
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
1684
|
+
var import_ai3 = require("ai");
|
|
1184
1685
|
var import_ajv = __toESM(require("ajv"), 1);
|
|
1185
1686
|
var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
|
|
1186
1687
|
var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
|
|
1187
1688
|
var NEWLINE_REGEX = /\r?\n/;
|
|
1188
|
-
var
|
|
1689
|
+
var LINE_SPLIT_REGEX3 = /\r?\n/;
|
|
1189
1690
|
function tryDirectParse(text) {
|
|
1190
1691
|
try {
|
|
1191
1692
|
return JSON.parse(text);
|
|
1192
|
-
} catch {
|
|
1693
|
+
} catch (e) {
|
|
1193
1694
|
return;
|
|
1194
1695
|
}
|
|
1195
1696
|
}
|
|
@@ -1201,7 +1702,7 @@ function tryCodeFenceParse(text) {
|
|
|
1201
1702
|
const inner = fenceMatch[1].trim();
|
|
1202
1703
|
try {
|
|
1203
1704
|
return JSON.parse(inner);
|
|
1204
|
-
} catch {
|
|
1705
|
+
} catch (e) {
|
|
1205
1706
|
return;
|
|
1206
1707
|
}
|
|
1207
1708
|
}
|
|
@@ -1226,7 +1727,7 @@ function tryBracketScan(text) {
|
|
|
1226
1727
|
const candidate = text.slice(start, i + 1);
|
|
1227
1728
|
try {
|
|
1228
1729
|
return JSON.parse(candidate);
|
|
1229
|
-
} catch {
|
|
1730
|
+
} catch (e) {
|
|
1230
1731
|
return;
|
|
1231
1732
|
}
|
|
1232
1733
|
}
|
|
@@ -1274,12 +1775,12 @@ function subsetMatch(expected, actual) {
|
|
|
1274
1775
|
async function loadDatasets() {
|
|
1275
1776
|
try {
|
|
1276
1777
|
const dataDir = resolveDataDir();
|
|
1277
|
-
const testsJsonl = await
|
|
1278
|
-
|
|
1778
|
+
const testsJsonl = await import_node_fs4.promises.readFile(
|
|
1779
|
+
import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
1279
1780
|
"utf-8"
|
|
1280
1781
|
);
|
|
1281
|
-
const expectedJsonl = await
|
|
1282
|
-
|
|
1782
|
+
const expectedJsonl = await import_node_fs4.promises.readFile(
|
|
1783
|
+
import_node_path4.default.join(dataDir, "json_generation_expected.jsonl"),
|
|
1283
1784
|
"utf-8"
|
|
1284
1785
|
);
|
|
1285
1786
|
const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -1335,10 +1836,11 @@ function validateTestCase(tc, parsed, context) {
|
|
|
1335
1836
|
return { valid, valuesOk, parsed };
|
|
1336
1837
|
}
|
|
1337
1838
|
async function processTestCase(tc, context) {
|
|
1839
|
+
var _a;
|
|
1338
1840
|
const messages = buildMessages(tc);
|
|
1339
|
-
const temp = context.config
|
|
1841
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1340
1842
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1341
|
-
const { text } = await (0,
|
|
1843
|
+
const { text } = await (0, import_ai3.generateText)({
|
|
1342
1844
|
model: context.model,
|
|
1343
1845
|
messages,
|
|
1344
1846
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1346,7 +1848,7 @@ async function processTestCase(tc, context) {
|
|
|
1346
1848
|
let parsed;
|
|
1347
1849
|
try {
|
|
1348
1850
|
parsed = extractFirstJsonBlock(text);
|
|
1349
|
-
} catch {
|
|
1851
|
+
} catch (e) {
|
|
1350
1852
|
}
|
|
1351
1853
|
if (parsed === void 0) {
|
|
1352
1854
|
context.validation.logs.push(
|
|
@@ -1440,21 +1942,22 @@ function buildBenchmarkResult(total, counts, logs) {
|
|
|
1440
1942
|
async function loadSchemaOnlyTests() {
|
|
1441
1943
|
try {
|
|
1442
1944
|
const dataDir = resolveDataDir();
|
|
1443
|
-
const testsJsonl = await
|
|
1444
|
-
|
|
1945
|
+
const testsJsonl = await import_node_fs4.promises.readFile(
|
|
1946
|
+
import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
1445
1947
|
"utf-8"
|
|
1446
1948
|
);
|
|
1447
|
-
const tests = testsJsonl.split(
|
|
1949
|
+
const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1448
1950
|
return { tests };
|
|
1449
1951
|
} catch (e) {
|
|
1450
1952
|
return { tests: [], error: e };
|
|
1451
1953
|
}
|
|
1452
1954
|
}
|
|
1453
1955
|
async function processSchemaOnlyTestCase(tc, context) {
|
|
1956
|
+
var _a;
|
|
1454
1957
|
const messages = buildMessages(tc);
|
|
1455
|
-
const temp = context.config
|
|
1958
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1456
1959
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1457
|
-
const { text } = await (0,
|
|
1960
|
+
const { text } = await (0, import_ai3.generateText)({
|
|
1458
1961
|
model: context.model,
|
|
1459
1962
|
messages,
|
|
1460
1963
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1462,7 +1965,7 @@ async function processSchemaOnlyTestCase(tc, context) {
|
|
|
1462
1965
|
let parsed;
|
|
1463
1966
|
try {
|
|
1464
1967
|
parsed = extractFirstJsonBlock(text);
|
|
1465
|
-
} catch {
|
|
1968
|
+
} catch (e) {
|
|
1466
1969
|
}
|
|
1467
1970
|
if (parsed === void 0) {
|
|
1468
1971
|
context.logs.push(
|
|
@@ -1531,38 +2034,144 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1531
2034
|
}
|
|
1532
2035
|
};
|
|
1533
2036
|
|
|
2037
|
+
// src/evaluate.ts
|
|
2038
|
+
var import_middleware = require("@ai-sdk-tool/middleware");
|
|
2039
|
+
var import_ai4 = require("ai");
|
|
2040
|
+
|
|
1534
2041
|
// src/reporters/console.ts
|
|
1535
2042
|
var colors = {
|
|
1536
2043
|
reset: "\x1B[0m",
|
|
2044
|
+
bold: "\x1B[1m",
|
|
1537
2045
|
green: "\x1B[32m",
|
|
1538
2046
|
red: "\x1B[31m",
|
|
1539
2047
|
yellow: "\x1B[33m",
|
|
1540
2048
|
cyan: "\x1B[36m",
|
|
1541
2049
|
magenta: "\x1B[35m",
|
|
1542
|
-
gray: "\x1B[90m"
|
|
2050
|
+
gray: "\x1B[90m",
|
|
2051
|
+
white: "\x1B[37m"
|
|
1543
2052
|
};
|
|
2053
|
+
var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
|
|
2054
|
+
/**
 * Renders the first eight lines of a diff for terminal output.
 *
 * Lines starting with `-` are colored red, `+` green and `@@` cyan; other
 * lines are left unstyled. Lines are joined with a newline followed by the
 * indentation used for continuation lines.
 *
 * @param {string[]} [diff] - Diff lines.
 * @returns {string} Colored text, or an empty string when there is no diff.
 */
function formatDiff(diff) {
  if (!diff || diff.length === 0) {
    return "";
  }
  const colorize = (line) => {
    let tint;
    if (line.startsWith("-")) {
      tint = colors.red;
    } else if (line.startsWith("+")) {
      tint = colors.green;
    } else if (line.startsWith("@@")) {
      tint = colors.cyan;
    } else {
      return line;
    }
    return `${tint}${line}${colors.reset}`;
  };
  const head = diff.slice(0, 8);
  return head.map(colorize).join("\n ");
}
|
|
2071
|
+
/**
 * Extracts structured failure records from benchmark log lines.
 *
 * Only lines matching `DEBUG_FAIL_REGEX` are considered; the prefix is
 * stripped and the remainder parsed as JSON. Extraction is best-effort:
 * lines whose payload is not valid JSON are skipped. Payloads that parse to
 * something other than a plain object (`null`, numbers, strings, arrays)
 * are skipped as well, since consumers read properties such as `category`
 * from every record and would throw on `null`.
 *
 * @param {string[]} logs - Raw benchmark log lines.
 * @returns {object[]} Parsed failure records, in log order.
 */
function parseFailures(logs) {
  const failures = [];
  for (const log of logs) {
    if (!DEBUG_FAIL_REGEX.test(log)) {
      continue;
    }
    let parsed;
    try {
      parsed = JSON.parse(log.replace(DEBUG_FAIL_REGEX, ""));
    } catch (e) {
      // Deliberately ignored: a malformed debug line must not break reporting.
      continue;
    }
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      failures.push(parsed);
    }
  }
  return failures;
}
|
|
2086
|
+
/**
 * Buckets failure records by their `category`, preserving encounter order
 * both for categories and for the records inside each bucket. Records with
 * a missing or empty category are filed under "OTHER".
 *
 * @param {Array<{category?: string}>} failures
 * @returns {Map<string, object[]>} Category name -> records in that category.
 */
function groupFailuresByCategory(failures) {
  const buckets = /* @__PURE__ */ new Map();
  for (const record of failures) {
    const key = record.category || "OTHER";
    if (!buckets.has(key)) {
      buckets.set(key, []);
    }
    buckets.get(key).push(record);
  }
  return buckets;
}
|
|
2099
|
+
/**
 * Prints a short, colored summary of one failure record to the console:
 * a header with id and category ("OTHER" when absent), then - each only
 * when available - the message, the formatted diff, and, for
 * `PARSE_FAILURE` records, the raw model text cut to 80 characters.
 *
 * @param {{id: *, category?: string, message?: string, diff?: string[],
 *          context?: {raw_model_text?: string}}} failure
 * @returns {void}
 */
function printCompactFailure(failure) {
  const categoryLabel = failure.category || "OTHER";
  console.log(
    `\n${colors.red}${failure.id}${colors.reset} [${colors.yellow}${categoryLabel}${colors.reset}]`
  );

  if (failure.message) {
    console.log(` ${failure.message}`);
  }

  const hasDiff = failure.diff && failure.diff.length > 0;
  if (hasDiff) {
    console.log(` ${formatDiff(failure.diff)}`);
  }

  const rawModelText = failure.context == null ? void 0 : failure.context.raw_model_text;
  if (rawModelText && failure.category === "PARSE_FAILURE") {
    const text = failure.context.raw_model_text;
    let preview = text;
    if (text.length > 80) {
      preview = `${text.slice(0, 80)}...`;
    }
    console.log(` ${colors.gray}Model: "${preview}"${colors.reset}`);
  }
}
|
|
2117
|
+
/**
 * Prints an overview of failures to the console: a per-category count
 * (largest category first), compact details for the first five failures,
 * and - when more remain - a one-line note with the remaining count and up
 * to five of their ids.
 *
 * @param {Array<{id: *, category?: string}>} failures
 * @returns {void}
 */
function printFailureSummary(failures) {
  const DETAIL_LIMIT = 5;
  const ID_PREVIEW_LIMIT = 5;

  const byCategory = Array.from(groupFailuresByCategory(failures).entries());
  byCategory.sort(([, left], [, right]) => right.length - left.length);

  console.log(`\n${colors.bold}Failures by category:${colors.reset}`);
  byCategory.forEach(([category, members]) => {
    console.log(
      ` ${colors.yellow}${category}${colors.reset}: ${members.length}`
    );
  });

  failures.slice(0, DETAIL_LIMIT).forEach((failure) => {
    printCompactFailure(failure);
  });

  if (failures.length <= DETAIL_LIMIT) {
    return;
  }
  const hiddenIds = failures.slice(DETAIL_LIMIT).map((f) => f.id);
  const remaining = failures.length - DETAIL_LIMIT;
  const idPreview = hiddenIds.slice(0, ID_PREVIEW_LIMIT).join(", ");
  const more = hiddenIds.length > ID_PREVIEW_LIMIT ? "..." : "";
  console.log(
    `\n${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
  );
}
|
|
1544
2145
|
function printResult(result) {
|
|
1545
2146
|
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
1546
|
-
const
|
|
2147
|
+
const passed = benchmarkResult.metrics.correct_count;
|
|
2148
|
+
const total = benchmarkResult.metrics.total_cases;
|
|
2149
|
+
const scorePercent = (benchmarkResult.score * 100).toFixed(1);
|
|
2150
|
+
const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
|
|
2151
|
+
const statusColor = benchmarkResult.success ? colors.green : colors.red;
|
|
1547
2152
|
console.log(
|
|
1548
2153
|
`
|
|
1549
2154
|
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
1550
2155
|
);
|
|
1551
2156
|
console.log(
|
|
1552
|
-
` \u2514 ${
|
|
2157
|
+
` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
|
|
1553
2158
|
);
|
|
1554
|
-
const metrics = Object.entries(benchmarkResult.metrics);
|
|
1555
|
-
if (metrics.length > 0) {
|
|
1556
|
-
console.log(" Metrics:");
|
|
1557
|
-
for (const [key, value] of metrics) {
|
|
1558
|
-
console.log(` - ${key}: ${value}`);
|
|
1559
|
-
}
|
|
1560
|
-
}
|
|
1561
2159
|
if (benchmarkResult.error) {
|
|
1562
2160
|
console.log(
|
|
1563
2161
|
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
1564
2162
|
);
|
|
1565
2163
|
}
|
|
2164
|
+
if (!benchmarkResult.success && benchmarkResult.logs) {
|
|
2165
|
+
const failures = parseFailures(benchmarkResult.logs);
|
|
2166
|
+
if (failures.length > 0) {
|
|
2167
|
+
printFailureSummary(failures);
|
|
2168
|
+
} else if (benchmarkResult.logs.length > 0) {
|
|
2169
|
+
console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
|
|
2170
|
+
for (const l of benchmarkResult.logs.slice(0, 5)) {
|
|
2171
|
+
console.log(` ${l}`);
|
|
2172
|
+
}
|
|
2173
|
+
}
|
|
2174
|
+
}
|
|
1566
2175
|
}
|
|
1567
2176
|
function consoleReporter(results) {
|
|
1568
2177
|
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
@@ -1617,14 +2226,14 @@ function hasFunctionNameIssue(diff) {
|
|
|
1617
2226
|
);
|
|
1618
2227
|
}
|
|
1619
2228
|
function suggestFunctionNameFix(expected, actual, suggestions) {
|
|
1620
|
-
const expectedName = expected
|
|
1621
|
-
const actualName = actual
|
|
2229
|
+
const expectedName = expected == null ? void 0 : expected.function;
|
|
2230
|
+
const actualName = actual == null ? void 0 : actual.function;
|
|
1622
2231
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
1623
2232
|
suggestions.push(
|
|
1624
2233
|
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
1625
2234
|
);
|
|
1626
2235
|
}
|
|
1627
|
-
if (Array.isArray(expected
|
|
2236
|
+
if (Array.isArray(expected == null ? void 0 : expected.functions)) {
|
|
1628
2237
|
suggestions.push(
|
|
1629
2238
|
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
1630
2239
|
);
|
|
@@ -1679,7 +2288,7 @@ function suggestFromErrorType(error_type, suggestions) {
|
|
|
1679
2288
|
}
|
|
1680
2289
|
function suggestFixFromDiff(parsed) {
|
|
1681
2290
|
const suggestions = [];
|
|
1682
|
-
const { error_type, expected, actual, diff } = parsed
|
|
2291
|
+
const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
|
|
1683
2292
|
if (!Array.isArray(diff)) {
|
|
1684
2293
|
if (suggestions.length === 0 && typeof error_type === "string") {
|
|
1685
2294
|
suggestFromErrorType(error_type, suggestions);
|
|
@@ -1704,15 +2313,16 @@ function suggestFixFromDiff(parsed) {
|
|
|
1704
2313
|
return uniqueLines(suggestions);
|
|
1705
2314
|
}
|
|
1706
2315
|
function getTestIdFromLogLine(line) {
|
|
2316
|
+
var _a, _b;
|
|
1707
2317
|
if (line.startsWith("[FAIL]")) {
|
|
1708
2318
|
const m = line.match(FAIL_ID_REGEX);
|
|
1709
|
-
return m
|
|
2319
|
+
return m == null ? void 0 : m[1];
|
|
1710
2320
|
}
|
|
1711
2321
|
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
1712
2322
|
try {
|
|
1713
2323
|
const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1714
|
-
return String(parsed
|
|
1715
|
-
} catch {
|
|
2324
|
+
return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
|
|
2325
|
+
} catch (e) {
|
|
1716
2326
|
}
|
|
1717
2327
|
}
|
|
1718
2328
|
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
@@ -1720,18 +2330,19 @@ function getTestIdFromLogLine(line) {
|
|
|
1720
2330
|
const parsed = JSON.parse(
|
|
1721
2331
|
line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
|
|
1722
2332
|
);
|
|
1723
|
-
return String(parsed
|
|
1724
|
-
} catch {
|
|
2333
|
+
return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
|
|
2334
|
+
} catch (e) {
|
|
1725
2335
|
}
|
|
1726
2336
|
}
|
|
1727
2337
|
return;
|
|
1728
2338
|
}
|
|
1729
2339
|
function groupLogsByTestId(failLogs) {
|
|
2340
|
+
var _a;
|
|
1730
2341
|
const byId = /* @__PURE__ */ new Map();
|
|
1731
2342
|
for (const line of failLogs) {
|
|
1732
2343
|
const id = getTestIdFromLogLine(line);
|
|
1733
|
-
const key = id
|
|
1734
|
-
const arr = byId.get(key)
|
|
2344
|
+
const key = id != null ? id : "__general__";
|
|
2345
|
+
const arr = (_a = byId.get(key)) != null ? _a : [];
|
|
1735
2346
|
arr.push(line);
|
|
1736
2347
|
byId.set(key, arr);
|
|
1737
2348
|
}
|
|
@@ -1743,10 +2354,10 @@ function collectDebugIds(lines) {
|
|
|
1743
2354
|
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
1744
2355
|
try {
|
|
1745
2356
|
const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1746
|
-
if (parsed
|
|
2357
|
+
if (parsed == null ? void 0 : parsed.id) {
|
|
1747
2358
|
debugIds.add(String(parsed.id));
|
|
1748
2359
|
}
|
|
1749
|
-
} catch {
|
|
2360
|
+
} catch (e) {
|
|
1750
2361
|
}
|
|
1751
2362
|
}
|
|
1752
2363
|
}
|
|
@@ -1782,7 +2393,7 @@ function displayDebugFailLine(line) {
|
|
|
1782
2393
|
console.log(` \u2022 ${s}`);
|
|
1783
2394
|
}
|
|
1784
2395
|
}
|
|
1785
|
-
} catch {
|
|
2396
|
+
} catch (e) {
|
|
1786
2397
|
console.log(` ${line}`);
|
|
1787
2398
|
}
|
|
1788
2399
|
}
|
|
@@ -1826,14 +2437,14 @@ function displayDebugFailContextLine(line) {
|
|
|
1826
2437
|
const ctx = JSON.parse(payload);
|
|
1827
2438
|
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
1828
2439
|
displayContextInfo(ctx);
|
|
1829
|
-
} catch {
|
|
2440
|
+
} catch (e) {
|
|
1830
2441
|
console.log(` ${line}`);
|
|
1831
2442
|
}
|
|
1832
2443
|
}
|
|
1833
2444
|
function displayLogLine(line, debugIds) {
|
|
1834
2445
|
if (line.startsWith("[FAIL]")) {
|
|
1835
2446
|
const m = line.match(FAIL_ID_REGEX);
|
|
1836
|
-
const failId = m
|
|
2447
|
+
const failId = m == null ? void 0 : m[1];
|
|
1837
2448
|
if (failId && debugIds.has(failId)) {
|
|
1838
2449
|
return;
|
|
1839
2450
|
}
|
|
@@ -1903,26 +2514,350 @@ function displayResultHeader(r) {
|
|
|
1903
2514
|
);
|
|
1904
2515
|
}
|
|
1905
2516
|
function consoleDebugReporter(results) {
|
|
2517
|
+
var _a;
|
|
1906
2518
|
console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
|
|
1907
2519
|
for (const r of results) {
|
|
1908
2520
|
displayResultHeader(r);
|
|
1909
2521
|
displayMetrics(Object.entries(r.result.metrics));
|
|
1910
|
-
if (r.result.logs
|
|
2522
|
+
if ((_a = r.result.logs) == null ? void 0 : _a.length) {
|
|
1911
2523
|
displayResultLogs(r.result.logs);
|
|
1912
2524
|
}
|
|
1913
2525
|
}
|
|
1914
2526
|
console.log("\n------------------------------------\n");
|
|
1915
2527
|
}
|
|
1916
2528
|
|
|
2529
|
+
// src/reporters/console.summary.ts
|
|
2530
|
+
var colors3 = {
|
|
2531
|
+
reset: "\x1B[0m",
|
|
2532
|
+
bold: "\x1B[1m",
|
|
2533
|
+
dim: "\x1B[2m",
|
|
2534
|
+
green: "\x1B[32m",
|
|
2535
|
+
red: "\x1B[31m",
|
|
2536
|
+
yellow: "\x1B[33m",
|
|
2537
|
+
cyan: "\x1B[36m",
|
|
2538
|
+
magenta: "\x1B[35m",
|
|
2539
|
+
gray: "\x1B[90m",
|
|
2540
|
+
white: "\x1B[37m"
|
|
2541
|
+
};
|
|
2542
|
+
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
|
|
2543
|
+
var ID_NUM_REGEX = /_(\d+)$/;
|
|
2544
|
+
var REASONING_TAG = "think";
|
|
2545
|
+
var MAX_FAILURES_TO_DISPLAY = 5;
|
|
2546
|
+
var CATEGORY_DESCRIPTIONS = {
|
|
2547
|
+
PARSE_FAILURE: {
|
|
2548
|
+
label: "Parse Failure",
|
|
2549
|
+
description: "No tool calls extracted from model output",
|
|
2550
|
+
hint: "Model may have responded in text instead of tool format"
|
|
2551
|
+
},
|
|
2552
|
+
PARTIAL_CALLS: {
|
|
2553
|
+
label: "Partial Calls",
|
|
2554
|
+
description: "Some expected tool calls missing",
|
|
2555
|
+
hint: "Model stopped early or missed some tools"
|
|
2556
|
+
},
|
|
2557
|
+
EXTRA_CALLS: {
|
|
2558
|
+
label: "Extra Calls",
|
|
2559
|
+
description: "More tool calls than expected",
|
|
2560
|
+
hint: "Model called tools that weren't needed"
|
|
2561
|
+
},
|
|
2562
|
+
PARAM_VALUE_PERCENT: {
|
|
2563
|
+
label: "Param Value (Percent)",
|
|
2564
|
+
description: "Percentage sent as integer instead of decimal",
|
|
2565
|
+
hint: "e.g., 5 instead of 0.05 for 5%"
|
|
2566
|
+
},
|
|
2567
|
+
PARAM_VALUE_MISMATCH: {
|
|
2568
|
+
label: "Param Value Mismatch",
|
|
2569
|
+
description: "Parameter values don't match expected"
|
|
2570
|
+
},
|
|
2571
|
+
WRONG_FUNCTION: {
|
|
2572
|
+
label: "Wrong Function",
|
|
2573
|
+
description: "Called wrong function name"
|
|
2574
|
+
},
|
|
2575
|
+
MISSING_PARAMS: {
|
|
2576
|
+
label: "Missing Params",
|
|
2577
|
+
description: "Required parameters not provided"
|
|
2578
|
+
},
|
|
2579
|
+
UNEXPECTED_PARAMS: {
|
|
2580
|
+
label: "Unexpected Params",
|
|
2581
|
+
description: "Extra parameters that shouldn't be there"
|
|
2582
|
+
},
|
|
2583
|
+
NO_MATCH: {
|
|
2584
|
+
label: "No Match",
|
|
2585
|
+
description: "Function called but couldn't match to expected",
|
|
2586
|
+
hint: "Parameters may be correct but don't match any expected combination"
|
|
2587
|
+
},
|
|
2588
|
+
OTHER: {
|
|
2589
|
+
label: "Other",
|
|
2590
|
+
description: "Uncategorized failure"
|
|
2591
|
+
}
|
|
2592
|
+
};
|
|
2593
|
+
function parseFailureLogs(logs) {
|
|
2594
|
+
return logs.filter((log) => DEBUG_FAIL_REGEX2.test(log)).map((log) => {
|
|
2595
|
+
try {
|
|
2596
|
+
const jsonStr = log.replace(DEBUG_FAIL_REGEX2, "");
|
|
2597
|
+
return JSON.parse(jsonStr);
|
|
2598
|
+
} catch (e) {
|
|
2599
|
+
return null;
|
|
2600
|
+
}
|
|
2601
|
+
}).filter((parsed) => parsed !== null);
|
|
2602
|
+
}
|
|
2603
|
+
function groupByCategory(failures) {
|
|
2604
|
+
const groups = /* @__PURE__ */ new Map();
|
|
2605
|
+
for (const failure of failures) {
|
|
2606
|
+
const category = failure.category || "OTHER";
|
|
2607
|
+
const existing = groups.get(category);
|
|
2608
|
+
if (existing) {
|
|
2609
|
+
existing.failures.push(failure);
|
|
2610
|
+
} else {
|
|
2611
|
+
groups.set(category, { failures: [failure] });
|
|
2612
|
+
}
|
|
2613
|
+
}
|
|
2614
|
+
return groups;
|
|
2615
|
+
}
|
|
2616
|
+
function extractParamNames(failures) {
|
|
2617
|
+
const paramNames = /* @__PURE__ */ new Set();
|
|
2618
|
+
for (const f of failures) {
|
|
2619
|
+
if (!f.diff) {
|
|
2620
|
+
continue;
|
|
2621
|
+
}
|
|
2622
|
+
for (const d of f.diff) {
|
|
2623
|
+
if (d.startsWith("@@ param ")) {
|
|
2624
|
+
paramNames.add(d.replace("@@ param ", ""));
|
|
2625
|
+
}
|
|
2626
|
+
}
|
|
2627
|
+
}
|
|
2628
|
+
return paramNames;
|
|
2629
|
+
}
|
|
2630
|
+
function extractFinishReasons(failures) {
|
|
2631
|
+
var _a;
|
|
2632
|
+
const finishReasons = /* @__PURE__ */ new Set();
|
|
2633
|
+
for (const f of failures) {
|
|
2634
|
+
if ((_a = f.context) == null ? void 0 : _a.finish_reason) {
|
|
2635
|
+
finishReasons.add(String(f.context.finish_reason));
|
|
2636
|
+
}
|
|
2637
|
+
}
|
|
2638
|
+
return finishReasons;
|
|
2639
|
+
}
|
|
2640
|
+
function detectPatterns(group) {
|
|
2641
|
+
const { failures } = group;
|
|
2642
|
+
if (failures.length < 2) {
|
|
2643
|
+
return;
|
|
2644
|
+
}
|
|
2645
|
+
const firstCategory = failures[0].category;
|
|
2646
|
+
if (firstCategory === "PARAM_VALUE_PERCENT") {
|
|
2647
|
+
const paramNames = extractParamNames(failures);
|
|
2648
|
+
if (paramNames.size > 0) {
|
|
2649
|
+
group.pattern = `Affected params: ${[...paramNames].join(", ")}`;
|
|
2650
|
+
}
|
|
2651
|
+
}
|
|
2652
|
+
if (firstCategory === "PARSE_FAILURE") {
|
|
2653
|
+
const finishReasons = extractFinishReasons(failures);
|
|
2654
|
+
if (finishReasons.size === 1) {
|
|
2655
|
+
group.pattern = `All finished with: ${[...finishReasons][0]}`;
|
|
2656
|
+
}
|
|
2657
|
+
}
|
|
2658
|
+
}
|
|
2659
|
+
function getLineColor(line) {
|
|
2660
|
+
if (line.startsWith("+")) {
|
|
2661
|
+
return colors3.green;
|
|
2662
|
+
}
|
|
2663
|
+
if (line.startsWith("-")) {
|
|
2664
|
+
return colors3.red;
|
|
2665
|
+
}
|
|
2666
|
+
if (line.startsWith("@@")) {
|
|
2667
|
+
return colors3.cyan;
|
|
2668
|
+
}
|
|
2669
|
+
return colors3.white;
|
|
2670
|
+
}
|
|
2671
|
+
function formatFunctions(funcs) {
|
|
2672
|
+
if (Array.isArray(funcs)) {
|
|
2673
|
+
return funcs.join(", ");
|
|
2674
|
+
}
|
|
2675
|
+
return String(funcs);
|
|
2676
|
+
}
|
|
2677
|
+
function printExpectedActual(failure) {
|
|
2678
|
+
if (failure.expected) {
|
|
2679
|
+
const expFuncs = failure.expected.functions || failure.expected.function;
|
|
2680
|
+
if (expFuncs) {
|
|
2681
|
+
console.log(
|
|
2682
|
+
` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expFuncs)}`
|
|
2683
|
+
);
|
|
2684
|
+
}
|
|
2685
|
+
}
|
|
2686
|
+
if (failure.actual) {
|
|
2687
|
+
const actFuncs = failure.actual.functions || failure.actual.function;
|
|
2688
|
+
if (actFuncs) {
|
|
2689
|
+
const isEmpty = Array.isArray(actFuncs) && actFuncs.length === 0;
|
|
2690
|
+
const color = isEmpty ? colors3.red : colors3.white;
|
|
2691
|
+
const text = isEmpty ? "(none)" : formatFunctions(actFuncs);
|
|
2692
|
+
console.log(
|
|
2693
|
+
` ${colors3.gray}Actual:${colors3.reset} ${color}${text}${colors3.reset}`
|
|
2694
|
+
);
|
|
2695
|
+
}
|
|
2696
|
+
}
|
|
2697
|
+
}
|
|
2698
|
+
function printDiff(diff) {
|
|
2699
|
+
console.log(` ${colors3.gray}Diff:${colors3.reset}`);
|
|
2700
|
+
for (const line of diff.slice(0, MAX_FAILURES_TO_DISPLAY)) {
|
|
2701
|
+
const lineColor = getLineColor(line);
|
|
2702
|
+
console.log(` ${lineColor}${line}${colors3.reset}`);
|
|
2703
|
+
}
|
|
2704
|
+
}
|
|
2705
|
+
function removeReasoningTags(text) {
|
|
2706
|
+
const openTag = `<${REASONING_TAG}>`;
|
|
2707
|
+
const closeTag = `</${REASONING_TAG}>`;
|
|
2708
|
+
const closedTagPattern = new RegExp(
|
|
2709
|
+
`${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*?${closeTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`,
|
|
2710
|
+
"g"
|
|
2711
|
+
);
|
|
2712
|
+
const unclosedTagPattern = new RegExp(
|
|
2713
|
+
`${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*`,
|
|
2714
|
+
"g"
|
|
2715
|
+
);
|
|
2716
|
+
let result = text.replace(closedTagPattern, "");
|
|
2717
|
+
result = result.replace(unclosedTagPattern, "");
|
|
2718
|
+
return result.trim();
|
|
2719
|
+
}
|
|
2720
|
+
function printModelOutput(failure, category) {
|
|
2721
|
+
var _a, _b;
|
|
2722
|
+
if (category !== "PARSE_FAILURE") {
|
|
2723
|
+
return;
|
|
2724
|
+
}
|
|
2725
|
+
const rawText = ((_a = failure.context) == null ? void 0 : _a.raw_model_text_full) || ((_b = failure.context) == null ? void 0 : _b.raw_model_text) || "";
|
|
2726
|
+
const cleanedText = removeReasoningTags(rawText);
|
|
2727
|
+
if (cleanedText) {
|
|
2728
|
+
console.log(
|
|
2729
|
+
` ${colors3.gray}Model said:${colors3.reset} "${colors3.dim}${cleanedText}${colors3.reset}"`
|
|
2730
|
+
);
|
|
2731
|
+
} else {
|
|
2732
|
+
console.log(
|
|
2733
|
+
` ${colors3.gray}Model said:${colors3.reset} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`
|
|
2734
|
+
);
|
|
2735
|
+
}
|
|
2736
|
+
}
|
|
2737
|
+
function shouldShowDiffByDefault(category) {
|
|
2738
|
+
return category === "PARAM_VALUE_MISMATCH" || category === "PARAM_VALUE_PERCENT";
|
|
2739
|
+
}
|
|
2740
|
+
function printSingleFailure(failure, category, verbose) {
|
|
2741
|
+
console.log(`
|
|
2742
|
+
${colors3.bold}${failure.id}${colors3.reset}`);
|
|
2743
|
+
const hasDiff = failure.diff && failure.diff.length > 0;
|
|
2744
|
+
const showDiffPrimarily = shouldShowDiffByDefault(category) && hasDiff;
|
|
2745
|
+
if (showDiffPrimarily) {
|
|
2746
|
+
printDiff(failure.diff);
|
|
2747
|
+
} else {
|
|
2748
|
+
printExpectedActual(failure);
|
|
2749
|
+
if (hasDiff && verbose) {
|
|
2750
|
+
printDiff(failure.diff);
|
|
2751
|
+
}
|
|
2752
|
+
}
|
|
2753
|
+
printModelOutput(failure, category);
|
|
2754
|
+
}
|
|
2755
|
+
var MAX_SAMPLE_FAILURES = 2;
|
|
2756
|
+
function printRemainingIds(failures) {
|
|
2757
|
+
const remainingIds = failures.slice(MAX_SAMPLE_FAILURES).map((f) => f.id);
|
|
2758
|
+
const idNums = remainingIds.map((id) => {
|
|
2759
|
+
const match = id.match(ID_NUM_REGEX);
|
|
2760
|
+
return match ? match[1] : id;
|
|
2761
|
+
});
|
|
2762
|
+
console.log(
|
|
2763
|
+
`
|
|
2764
|
+
${colors3.dim}+${failures.length - MAX_SAMPLE_FAILURES} more: ${idNums.join(", ")}${colors3.reset}`
|
|
2765
|
+
);
|
|
2766
|
+
}
|
|
2767
|
+
function printCategoryHeader(info, count) {
|
|
2768
|
+
console.log(
|
|
2769
|
+
`
|
|
2770
|
+
${colors3.cyan}\u2500\u2500\u2500\u2500\u2500 ${info.label} (${count}) \u2500\u2500\u2500\u2500\u2500${colors3.reset}`
|
|
2771
|
+
);
|
|
2772
|
+
console.log(`${colors3.dim}${info.description}${colors3.reset}`);
|
|
2773
|
+
}
|
|
2774
|
+
function printCategoryDetails(category, group, verbose) {
|
|
2775
|
+
const info = CATEGORY_DESCRIPTIONS[category] || CATEGORY_DESCRIPTIONS.OTHER;
|
|
2776
|
+
const { failures } = group;
|
|
2777
|
+
printCategoryHeader(info, failures.length);
|
|
2778
|
+
if (group.pattern) {
|
|
2779
|
+
console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
|
|
2780
|
+
}
|
|
2781
|
+
if (info.hint) {
|
|
2782
|
+
console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
|
|
2783
|
+
}
|
|
2784
|
+
const samplesToShow = verbose ? failures : failures.slice(0, 2);
|
|
2785
|
+
for (const failure of samplesToShow) {
|
|
2786
|
+
printSingleFailure(failure, category, verbose);
|
|
2787
|
+
}
|
|
2788
|
+
if (!verbose && failures.length > 2) {
|
|
2789
|
+
printRemainingIds(failures);
|
|
2790
|
+
}
|
|
2791
|
+
}
|
|
2792
|
+
function printResultHeader(result) {
|
|
2793
|
+
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
2794
|
+
const passed = benchmarkResult.metrics.correct_count;
|
|
2795
|
+
const total = benchmarkResult.metrics.total_cases;
|
|
2796
|
+
const scorePercent = (benchmarkResult.score * 100).toFixed(1);
|
|
2797
|
+
const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
|
|
2798
|
+
const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
|
|
2799
|
+
const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
|
|
2800
|
+
const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
|
|
2801
|
+
const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
|
|
2802
|
+
console.log(
|
|
2803
|
+
`
|
|
2804
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
|
|
2805
|
+
);
|
|
2806
|
+
console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
|
|
2807
|
+
}
|
|
2808
|
+
function printResultSummary(result, verbose) {
|
|
2809
|
+
const { result: benchmarkResult } = result;
|
|
2810
|
+
printResultHeader(result);
|
|
2811
|
+
if (!benchmarkResult.logs || benchmarkResult.logs.length === 0) {
|
|
2812
|
+
return;
|
|
2813
|
+
}
|
|
2814
|
+
const failures = parseFailureLogs(benchmarkResult.logs);
|
|
2815
|
+
if (failures.length === 0) {
|
|
2816
|
+
if (!benchmarkResult.success) {
|
|
2817
|
+
console.log(
|
|
2818
|
+
`${colors3.yellow}No structured failure data available${colors3.reset}`
|
|
2819
|
+
);
|
|
2820
|
+
}
|
|
2821
|
+
return;
|
|
2822
|
+
}
|
|
2823
|
+
const groups = groupByCategory(failures);
|
|
2824
|
+
for (const group of groups.values()) {
|
|
2825
|
+
detectPatterns(group);
|
|
2826
|
+
}
|
|
2827
|
+
const sortedCategories = [...groups.entries()].sort(
|
|
2828
|
+
(a, b) => b[1].failures.length - a[1].failures.length
|
|
2829
|
+
);
|
|
2830
|
+
for (const [cat, group] of sortedCategories) {
|
|
2831
|
+
printCategoryDetails(cat, group, verbose);
|
|
2832
|
+
}
|
|
2833
|
+
}
|
|
2834
|
+
function consoleSummaryReporter(results) {
|
|
2835
|
+
const verbose = process.env.VERBOSE === "true";
|
|
2836
|
+
console.log(`
|
|
2837
|
+
${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
|
|
2838
|
+
console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
|
|
2839
|
+
for (const result of results) {
|
|
2840
|
+
printResultSummary(result, verbose);
|
|
2841
|
+
}
|
|
2842
|
+
console.log(
|
|
2843
|
+
`
|
|
2844
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
|
|
2845
|
+
`
|
|
2846
|
+
);
|
|
2847
|
+
}
|
|
2848
|
+
|
|
1917
2849
|
// src/reporters/json.ts
|
|
1918
2850
|
function jsonReporter(results) {
|
|
1919
|
-
const serializableResults = results.map((r) =>
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
...r
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
2851
|
+
const serializableResults = results.map((r) => {
|
|
2852
|
+
var _a;
|
|
2853
|
+
return {
|
|
2854
|
+
...r,
|
|
2855
|
+
result: {
|
|
2856
|
+
...r.result,
|
|
2857
|
+
error: (_a = r.result.error) == null ? void 0 : _a.message
|
|
2858
|
+
}
|
|
2859
|
+
};
|
|
2860
|
+
});
|
|
1926
2861
|
console.log(JSON.stringify(serializableResults, null, 2));
|
|
1927
2862
|
}
|
|
1928
2863
|
|
|
@@ -1930,60 +2865,56 @@ function jsonReporter(results) {
|
|
|
1930
2865
|
var reporters = {
|
|
1931
2866
|
console: consoleReporter,
|
|
1932
2867
|
json: jsonReporter,
|
|
1933
|
-
"console.debug": consoleDebugReporter
|
|
2868
|
+
"console.debug": consoleDebugReporter,
|
|
2869
|
+
"console.summary": consoleSummaryReporter
|
|
1934
2870
|
};
|
|
1935
2871
|
|
|
1936
2872
|
// src/evaluate.ts
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
return
|
|
1948
|
-
model: modelId,
|
|
1949
|
-
modelKey,
|
|
1950
|
-
benchmark: benchmark.name,
|
|
1951
|
-
result
|
|
1952
|
-
};
|
|
1953
|
-
} catch (error) {
|
|
1954
|
-
console.error(
|
|
1955
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
1956
|
-
error
|
|
1957
|
-
);
|
|
1958
|
-
return {
|
|
1959
|
-
model: modelId,
|
|
1960
|
-
modelKey,
|
|
1961
|
-
benchmark: benchmark.name,
|
|
1962
|
-
result: {
|
|
1963
|
-
score: 0,
|
|
1964
|
-
success: false,
|
|
1965
|
-
metrics: {},
|
|
1966
|
-
error: error instanceof Error ? error : new Error(String(error))
|
|
1967
|
-
}
|
|
1968
|
-
};
|
|
2873
|
+
function isModelConfig(value) {
|
|
2874
|
+
if (typeof value !== "object" || value === null) {
|
|
2875
|
+
return false;
|
|
2876
|
+
}
|
|
2877
|
+
const obj = value;
|
|
2878
|
+
if (!("model" in obj)) {
|
|
2879
|
+
return false;
|
|
2880
|
+
}
|
|
2881
|
+
const model = obj.model;
|
|
2882
|
+
if (typeof model !== "object" || model === null) {
|
|
2883
|
+
return false;
|
|
1969
2884
|
}
|
|
2885
|
+
return "modelId" in model;
|
|
2886
|
+
}
|
|
2887
|
+
function isLanguageModel(value) {
|
|
2888
|
+
if (typeof value !== "object" || value === null) {
|
|
2889
|
+
return false;
|
|
2890
|
+
}
|
|
2891
|
+
const obj = value;
|
|
2892
|
+
return "modelId" in obj && typeof obj.modelId === "string";
|
|
2893
|
+
}
|
|
2894
|
+
function extractModelAndMiddleware(input) {
|
|
2895
|
+
if (isModelConfig(input)) {
|
|
2896
|
+
return [input.model, input.middleware];
|
|
2897
|
+
}
|
|
2898
|
+
return [input, void 0];
|
|
1970
2899
|
}
|
|
1971
2900
|
function normalizeModels(models) {
|
|
1972
|
-
const
|
|
2901
|
+
const entries = [];
|
|
1973
2902
|
if (Array.isArray(models)) {
|
|
1974
2903
|
for (const m of models) {
|
|
1975
|
-
|
|
2904
|
+
const [model, middleware] = extractModelAndMiddleware(m);
|
|
2905
|
+
entries.push([void 0, model, middleware]);
|
|
1976
2906
|
}
|
|
1977
|
-
} else if (
|
|
1978
|
-
|
|
2907
|
+
} else if (isModelConfig(models)) {
|
|
2908
|
+
entries.push([void 0, models.model, models.middleware]);
|
|
2909
|
+
} else if (isLanguageModel(models)) {
|
|
2910
|
+
entries.push([void 0, models, void 0]);
|
|
1979
2911
|
} else {
|
|
1980
|
-
for (const [key, m] of Object.entries(
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
modelEntries.push([key, m]);
|
|
2912
|
+
for (const [key, m] of Object.entries(models)) {
|
|
2913
|
+
const [model, middleware] = extractModelAndMiddleware(m);
|
|
2914
|
+
entries.push([key, model, middleware]);
|
|
1984
2915
|
}
|
|
1985
2916
|
}
|
|
1986
|
-
return
|
|
2917
|
+
return entries;
|
|
1987
2918
|
}
|
|
1988
2919
|
function buildConfig(temperature, maxTokens) {
|
|
1989
2920
|
const config = {};
|
|
@@ -2004,21 +2935,90 @@ function executeReporter(reporter, results) {
|
|
|
2004
2935
|
reporters.console(results);
|
|
2005
2936
|
}
|
|
2006
2937
|
}
|
|
2938
|
+
function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
|
|
2939
|
+
var _a, _b;
|
|
2940
|
+
const cacheEnabled = (cacheOptions == null ? void 0 : cacheOptions.enabled) === true;
|
|
2941
|
+
if (!(cacheEnabled || userMiddleware)) {
|
|
2942
|
+
return baseModel;
|
|
2943
|
+
}
|
|
2944
|
+
const cacheMiddleware = cacheEnabled ? (0, import_middleware.createDiskCacheMiddleware)({
|
|
2945
|
+
cacheDir: (_a = cacheOptions.cacheDir) != null ? _a : ".ai-cache",
|
|
2946
|
+
enabled: true,
|
|
2947
|
+
debug: (_b = cacheOptions.debug) != null ? _b : false
|
|
2948
|
+
}) : null;
|
|
2949
|
+
const middlewares = [];
|
|
2950
|
+
if (userMiddleware) {
|
|
2951
|
+
if (Array.isArray(userMiddleware)) {
|
|
2952
|
+
middlewares.push(...userMiddleware);
|
|
2953
|
+
} else {
|
|
2954
|
+
middlewares.push(userMiddleware);
|
|
2955
|
+
}
|
|
2956
|
+
}
|
|
2957
|
+
if (cacheMiddleware) {
|
|
2958
|
+
middlewares.push(cacheMiddleware);
|
|
2959
|
+
}
|
|
2960
|
+
if (middlewares.length === 0) {
|
|
2961
|
+
return baseModel;
|
|
2962
|
+
}
|
|
2963
|
+
return (0, import_ai4.wrapLanguageModel)({
|
|
2964
|
+
// biome-ignore lint/suspicious/noExplicitAny: AI SDK v5/v6 type mismatch
|
|
2965
|
+
model: baseModel,
|
|
2966
|
+
middleware: middlewares.length === 1 ? middlewares[0] : middlewares
|
|
2967
|
+
});
|
|
2968
|
+
}
|
|
2969
|
+
async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
2970
|
+
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
2971
|
+
const prefix = `[${modelId}]${modelKey ? ` (${modelKey})` : ""} ${benchmark.name}`;
|
|
2972
|
+
try {
|
|
2973
|
+
process.stdout.write(`${prefix}: ...`);
|
|
2974
|
+
const result = await benchmark.run(model, config);
|
|
2975
|
+
const scoreDisplay = result.score.toFixed(2);
|
|
2976
|
+
process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}
|
|
2977
|
+
`);
|
|
2978
|
+
return {
|
|
2979
|
+
model: modelId,
|
|
2980
|
+
modelKey,
|
|
2981
|
+
benchmark: benchmark.name,
|
|
2982
|
+
result
|
|
2983
|
+
};
|
|
2984
|
+
} catch (error) {
|
|
2985
|
+
process.stdout.write(`\r${prefix}: .... Score: ERROR
|
|
2986
|
+
`);
|
|
2987
|
+
console.error(error);
|
|
2988
|
+
return {
|
|
2989
|
+
model: modelId,
|
|
2990
|
+
modelKey,
|
|
2991
|
+
benchmark: benchmark.name,
|
|
2992
|
+
result: {
|
|
2993
|
+
score: 0,
|
|
2994
|
+
success: false,
|
|
2995
|
+
metrics: {},
|
|
2996
|
+
error: error instanceof Error ? error : new Error(String(error))
|
|
2997
|
+
}
|
|
2998
|
+
};
|
|
2999
|
+
}
|
|
3000
|
+
}
|
|
2007
3001
|
async function evaluate(options) {
|
|
2008
3002
|
const {
|
|
2009
3003
|
models,
|
|
2010
3004
|
benchmarks,
|
|
2011
3005
|
reporter = "console",
|
|
2012
3006
|
temperature,
|
|
2013
|
-
maxTokens
|
|
3007
|
+
maxTokens,
|
|
3008
|
+
cache
|
|
2014
3009
|
} = options;
|
|
2015
3010
|
const modelEntries = normalizeModels(models);
|
|
2016
3011
|
const config = buildConfig(temperature, maxTokens);
|
|
2017
3012
|
const allResults = [];
|
|
2018
|
-
for (const [modelKey,
|
|
3013
|
+
for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
|
|
3014
|
+
const effectiveModel = buildEffectiveModel(
|
|
3015
|
+
baseModel,
|
|
3016
|
+
userMiddleware,
|
|
3017
|
+
cache
|
|
3018
|
+
);
|
|
2019
3019
|
for (const benchmark of benchmarks) {
|
|
2020
3020
|
const evaluationResult = await runSingleBenchmark(
|
|
2021
|
-
|
|
3021
|
+
effectiveModel,
|
|
2022
3022
|
benchmark,
|
|
2023
3023
|
modelKey,
|
|
2024
3024
|
config
|
|
@@ -2035,6 +3035,7 @@ async function evaluate(options) {
|
|
|
2035
3035
|
bfclParallelBenchmark,
|
|
2036
3036
|
bfclParallelMultipleBenchmark,
|
|
2037
3037
|
bfclSimpleBenchmark,
|
|
3038
|
+
complexFuncBenchBenchmark,
|
|
2038
3039
|
evaluate,
|
|
2039
3040
|
jsonGenerationBenchmark,
|
|
2040
3041
|
jsonGenerationSchemaOnlyBenchmark
|