@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/{BFCL_v3_parallel.jsonl → BFCL_v4_parallel.jsonl} +2 -2
- package/data/{BFCL_v3_parallel_possible_answer.jsonl → BFCL_v4_parallel_possible_answer.jsonl} +2 -2
- package/data/BFCL_v4_simple.jsonl +400 -0
- package/data/BFCL_v4_simple_possible_answer.jsonl +400 -0
- package/data/ComplexFuncBench.jsonl +1000 -0
- package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
- package/dist/index.cjs +1264 -263
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +77 -11
- package/dist/index.d.ts +77 -11
- package/dist/index.js +1268 -264
- package/dist/index.js.map +1 -1
- package/package.json +18 -11
- package/data/BFCL_v3_simple.jsonl +0 -400
- package/data/BFCL_v3_simple_possible_answer.jsonl +0 -400
- /package/data/{BFCL_v3_multiple.jsonl → BFCL_v4_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_multiple_possible_answer.jsonl → BFCL_v4_multiple_possible_answer.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple.jsonl → BFCL_v4_parallel_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple_possible_answer.jsonl → BFCL_v4_parallel_multiple_possible_answer.jsonl} +0 -0
package/dist/index.js
CHANGED
|
@@ -23,7 +23,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
|
|
|
23
23
|
if (fs.existsSync(dataAtRoot)) {
|
|
24
24
|
return dataAtRoot;
|
|
25
25
|
}
|
|
26
|
-
} catch {
|
|
26
|
+
} catch (e) {
|
|
27
27
|
}
|
|
28
28
|
return null;
|
|
29
29
|
}
|
|
@@ -37,7 +37,7 @@ function tryResolveViaPackageJson(moduleUrl) {
|
|
|
37
37
|
if (fs.existsSync(dataAtPkg)) {
|
|
38
38
|
return dataAtPkg;
|
|
39
39
|
}
|
|
40
|
-
} catch {
|
|
40
|
+
} catch (e) {
|
|
41
41
|
}
|
|
42
42
|
return null;
|
|
43
43
|
}
|
|
@@ -45,7 +45,7 @@ function getStartDir(moduleUrl) {
|
|
|
45
45
|
if (moduleUrl) {
|
|
46
46
|
try {
|
|
47
47
|
return path.dirname(fileURLToPath(moduleUrl));
|
|
48
|
-
} catch {
|
|
48
|
+
} catch (e) {
|
|
49
49
|
return process.cwd();
|
|
50
50
|
}
|
|
51
51
|
}
|
|
@@ -139,7 +139,7 @@ function valuesMatch(modelValue, possibleValue) {
|
|
|
139
139
|
const normalizedModel = normalizeObject(modelValue);
|
|
140
140
|
const normalizedPossible = normalizeObject(possibleValue);
|
|
141
141
|
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
142
|
-
} catch {
|
|
142
|
+
} catch (e) {
|
|
143
143
|
return false;
|
|
144
144
|
}
|
|
145
145
|
}
|
|
@@ -268,7 +268,7 @@ function checkSingleParameter(paramName, modelValue, context) {
|
|
|
268
268
|
return checkStringValue(
|
|
269
269
|
paramName,
|
|
270
270
|
modelValue,
|
|
271
|
-
possibleValues
|
|
271
|
+
possibleValues != null ? possibleValues : []
|
|
272
272
|
);
|
|
273
273
|
}
|
|
274
274
|
if (Array.isArray(modelValue)) {
|
|
@@ -368,45 +368,99 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
368
368
|
// src/benchmarks/bfcl.ts
|
|
369
369
|
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
370
370
|
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
371
|
+
var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
|
|
372
|
+
/**
 * Renders one ground-truth call (`{ functionName: { param: value | value[] } }`)
 * as an XML-like snippet for the debug output.
 *
 * Only the first key of `call` is used as the function name. When a parameter
 * value is an array, only its first element is shown.
 *
 * @param {object} call - Ground-truth entry keyed by function name.
 * @returns {string} The XML-like text, or a self-closing placeholder tag when
 *   there is no function name or no parameter object.
 */
function convertGroundTruthToXML(call) {
  const functionNames = Object.keys(call);
  if (functionNames.length === 0) {
    return "<empty_call />";
  }
  const funcName = functionNames[0];
  if (!funcName) {
    return "<undefined_function />";
  }
  const params = call[funcName];
  const hasParamObject = Boolean(params) && typeof params === "object";
  if (!hasParamObject) {
    return `<${funcName} />`;
  }
  const paramLines = Object.entries(params).map(([key, value]) => {
    const shown = Array.isArray(value) ? value[0] : value;
    let rendered;
    if (shown === null || shown === void 0) {
      rendered = "";
    } else if (typeof shown === "string") {
      // Strings are emitted verbatim (no quoting).
      rendered = shown;
    } else {
      rendered = JSON.stringify(shown);
    }
    return ` <${key}>${rendered}</${key}>`;
  });
  // Opening tag, one line per parameter, closing tag — newline separated.
  return [`<${funcName}>`, ...paramLines, `</${funcName}>`].join("\n");
}
|
|
403
|
+
/**
 * Derives the checker category from a test-case id.
 *
 * Ids beginning with "parallel_multiple", "parallel", "multiple" or "simple"
 * (including "simple_python", "simple_java", "simple_javascript") map to that
 * category; any other id falls back to its text before the first underscore.
 *
 * @param {string} id - Test-case identifier.
 * @returns {string} The category name.
 */
function extractCategory(id) {
  // "parallel_multiple" must be tried before "parallel", which is its prefix.
  const knownPrefixes = ["parallel_multiple", "parallel", "multiple", "simple"];
  const matched = knownPrefixes.find((prefix) => id.startsWith(prefix));
  if (matched !== void 0) {
    return matched;
  }
  return id.split("_")[0];
}
|
|
371
427
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
372
|
-
const category = testCase.id
|
|
428
|
+
const category = extractCategory(testCase.id);
|
|
373
429
|
try {
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
430
|
+
switch (category) {
|
|
431
|
+
case "simple": {
|
|
432
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
433
|
+
return {
|
|
434
|
+
valid: false,
|
|
435
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
436
|
+
error_type: "simple:wrong_count"
|
|
437
|
+
};
|
|
438
|
+
}
|
|
439
|
+
return simpleFunctionChecker(
|
|
440
|
+
testCase.function[0],
|
|
441
|
+
modelOutput[0],
|
|
442
|
+
possibleAnswer.ground_truth[0]
|
|
443
|
+
);
|
|
444
|
+
}
|
|
445
|
+
case "multiple": {
|
|
446
|
+
return multipleFunctionChecker(
|
|
447
|
+
testCase.function,
|
|
448
|
+
modelOutput,
|
|
449
|
+
possibleAnswer.ground_truth
|
|
450
|
+
);
|
|
451
|
+
}
|
|
452
|
+
case "parallel":
|
|
453
|
+
case "parallel_multiple": {
|
|
454
|
+
return parallelFunctionCheckerNoOrder(
|
|
455
|
+
testCase.function,
|
|
456
|
+
modelOutput,
|
|
457
|
+
possibleAnswer.ground_truth
|
|
458
|
+
);
|
|
459
|
+
}
|
|
460
|
+
default: {
|
|
461
|
+
return { valid: true };
|
|
381
462
|
}
|
|
382
|
-
return simpleFunctionChecker(
|
|
383
|
-
testCase.function[0],
|
|
384
|
-
modelOutput[0],
|
|
385
|
-
possibleAnswer.ground_truth[0]
|
|
386
|
-
);
|
|
387
|
-
}
|
|
388
|
-
if (category === "parallel") {
|
|
389
|
-
return parallelFunctionCheckerNoOrder(
|
|
390
|
-
testCase.function,
|
|
391
|
-
modelOutput,
|
|
392
|
-
possibleAnswer.ground_truth
|
|
393
|
-
);
|
|
394
|
-
}
|
|
395
|
-
if (category === "multiple") {
|
|
396
|
-
return multipleFunctionChecker(
|
|
397
|
-
testCase.function,
|
|
398
|
-
modelOutput,
|
|
399
|
-
possibleAnswer.ground_truth
|
|
400
|
-
);
|
|
401
|
-
}
|
|
402
|
-
if (category.includes("parallel-multiple")) {
|
|
403
|
-
return parallelFunctionCheckerNoOrder(
|
|
404
|
-
testCase.function,
|
|
405
|
-
modelOutput,
|
|
406
|
-
possibleAnswer.ground_truth
|
|
407
|
-
);
|
|
408
463
|
}
|
|
409
|
-
return { valid: true };
|
|
410
464
|
} catch (e) {
|
|
411
465
|
return {
|
|
412
466
|
valid: false,
|
|
@@ -448,7 +502,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
448
502
|
`[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
|
|
449
503
|
);
|
|
450
504
|
}
|
|
451
|
-
const
|
|
505
|
+
const fixSchemaType2 = (copy) => {
|
|
452
506
|
if (!copy.type) {
|
|
453
507
|
return;
|
|
454
508
|
}
|
|
@@ -472,16 +526,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
472
526
|
);
|
|
473
527
|
}
|
|
474
528
|
};
|
|
475
|
-
const
|
|
529
|
+
const fixSchema2 = (schema) => {
|
|
476
530
|
if (!schema || typeof schema !== "object") {
|
|
477
531
|
return { type: "object", properties: {} };
|
|
478
532
|
}
|
|
479
|
-
const copy = Array.isArray(schema) ? schema.map((v) =>
|
|
533
|
+
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
|
|
480
534
|
if (!Array.isArray(copy)) {
|
|
481
|
-
|
|
482
|
-
fixSchemaProperties(copy,
|
|
535
|
+
fixSchemaType2(copy);
|
|
536
|
+
fixSchemaProperties(copy, fixSchema2);
|
|
483
537
|
if (copy.items) {
|
|
484
|
-
copy.items =
|
|
538
|
+
copy.items = fixSchema2(copy.items);
|
|
485
539
|
}
|
|
486
540
|
return copy;
|
|
487
541
|
}
|
|
@@ -516,13 +570,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
516
570
|
try {
|
|
517
571
|
const arr = JSON.parse(raw);
|
|
518
572
|
return Array.isArray(arr) ? arr : [];
|
|
519
|
-
} catch {
|
|
573
|
+
} catch (e) {
|
|
520
574
|
return [];
|
|
521
575
|
}
|
|
522
576
|
};
|
|
523
577
|
// Resolves a tool name reported by the model. When `rawName` is a string of
// digits it is treated as an index into `transformedTools` and that entry's
// `name` is returned; if the entry or its name is missing, or `rawName` is not
// an all-digit string, `rawName` is returned unchanged.
const getSanitizedName = (rawName, transformedTools) => {
  const looksLikeIndex = typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName);
  if (!looksLikeIndex) {
    return rawName;
  }
  const indexedTool = transformedTools[Number(rawName)];
  if (indexedTool == null) {
    return rawName;
  }
  const indexedName = indexedTool.name;
  return indexedName != null ? indexedName : rawName;
};
|
|
@@ -532,25 +587,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
532
587
|
}
|
|
533
588
|
try {
|
|
534
589
|
return JSON.parse(extractedArgs);
|
|
535
|
-
} catch {
|
|
590
|
+
} catch (e) {
|
|
536
591
|
return extractedArgs;
|
|
537
592
|
}
|
|
538
593
|
};
|
|
539
594
|
// Normalises the model's tool calls for checking. For each call:
//   - the name is read from `toolName`, falling back to `name`;
//   - it is passed through getSanitizedName and then looked up in `nameMap`
//     (the sanitized name is kept when the map has no entry);
//   - arguments are taken from the first non-nullish of `args`, `arguments`,
//     `input`, `params`, `parameters` and passed through parseToolArgs.
// Returns a new array of copies with `toolName`, `name` and `args` set; `args`
// defaults to `{}` when parsing yields null/undefined. A missing `toolCalls`
// yields an empty array.
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => {
  const argumentKeys = ["args", "arguments", "input", "params", "parameters"];
  return (toolCalls || []).map((call) => {
    const declaredName = call.toolName;
    const rawName = declaredName != null ? declaredName : call.name;
    const sanitizedFromIndex = getSanitizedName(rawName, transformedTools);
    const mappedName = nameMap.get(sanitizedFromIndex);
    const originalName = mappedName != null ? mappedName : sanitizedFromIndex;
    // Stop at the first key holding a non-nullish value; if none does, the
    // value of the last key is what remains.
    let extractedArgs;
    for (const key of argumentKeys) {
      extractedArgs = call[key];
      if (extractedArgs != null) {
        break;
      }
    }
    const parsedArgs = parseToolArgs(extractedArgs);
    return {
      ...call,
      toolName: originalName,
      name: originalName,
      args: parsedArgs != null ? parsedArgs : {}
    };
  });
};
|
|
556
612
|
const summarizeArgs = (args) => {
|
|
@@ -582,7 +638,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
582
638
|
return `- expected one of: ${formatted}`;
|
|
583
639
|
})();
|
|
584
640
|
diffLines.push(expectedLine);
|
|
585
|
-
diffLines.push(`+
|
|
641
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
586
642
|
return diffLines;
|
|
587
643
|
};
|
|
588
644
|
const paramValueMatches = (allowed, got) => {
|
|
@@ -594,7 +650,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
594
650
|
if (Array.isArray(got)) {
|
|
595
651
|
return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
|
|
596
652
|
}
|
|
597
|
-
} catch {
|
|
653
|
+
} catch (e) {
|
|
598
654
|
}
|
|
599
655
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
600
656
|
});
|
|
@@ -632,13 +688,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
632
688
|
}
|
|
633
689
|
};
|
|
634
690
|
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
691
|
+
var _a, _b, _c, _d;
|
|
635
692
|
const funcDesc = tools[0];
|
|
636
|
-
const gt = possibleAnswer.ground_truth
|
|
637
|
-
const expectedFuncName = funcDesc
|
|
693
|
+
const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
|
|
694
|
+
const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
|
|
638
695
|
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
639
696
|
const received = restoredCalls[0];
|
|
640
|
-
const receivedName = received
|
|
641
|
-
const receivedArgs = summarizeArgs(received
|
|
697
|
+
const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
|
|
698
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
642
699
|
const expected = {
|
|
643
700
|
function: expectedFuncName,
|
|
644
701
|
params: expectedParams
|
|
@@ -650,7 +707,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
650
707
|
const diff = [];
|
|
651
708
|
checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
|
|
652
709
|
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
653
|
-
const required = funcDesc
|
|
710
|
+
const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
|
|
654
711
|
checkMissingParams(
|
|
655
712
|
required,
|
|
656
713
|
receivedArgs,
|
|
@@ -687,12 +744,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
687
744
|
}
|
|
688
745
|
};
|
|
689
746
|
const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
|
|
747
|
+
var _a;
|
|
690
748
|
for (let i = 0; i < restoredCalls.length; i += 1) {
|
|
691
749
|
if (usedActual.has(i)) {
|
|
692
750
|
continue;
|
|
693
751
|
}
|
|
694
752
|
const rc = restoredCalls[i];
|
|
695
|
-
const rcName = rc
|
|
753
|
+
const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
|
|
696
754
|
if (rcName === fname) {
|
|
697
755
|
return i;
|
|
698
756
|
}
|
|
@@ -706,6 +764,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
706
764
|
checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
|
|
707
765
|
};
|
|
708
766
|
const processExpectedCall = (options) => {
|
|
767
|
+
var _a, _b;
|
|
709
768
|
const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
|
|
710
769
|
const fname = Object.keys(expectedObj)[0];
|
|
711
770
|
const matchedIndex = findMatchingCallIndex(
|
|
@@ -718,10 +777,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
718
777
|
}
|
|
719
778
|
usedActual.add(matchedIndex);
|
|
720
779
|
const received = restoredCalls[matchedIndex];
|
|
721
|
-
const receivedArgs = summarizeArgs(received
|
|
780
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
722
781
|
const expectedParamsAllowed = expectedObj[fname];
|
|
723
782
|
const funcDesc = tools.find((t) => t.name === fname);
|
|
724
|
-
const requiredParams = funcDesc
|
|
783
|
+
const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
|
|
725
784
|
diff.push(`@@ function ${fname}`);
|
|
726
785
|
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
727
786
|
validateFunctionParams({
|
|
@@ -733,10 +792,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
733
792
|
}
|
|
734
793
|
};
|
|
735
794
|
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
736
|
-
|
|
795
|
+
var _a;
|
|
796
|
+
const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
|
|
737
797
|
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
738
798
|
const actualNames = restoredCalls.map(
|
|
739
|
-
(c) =>
|
|
799
|
+
(c) => {
|
|
800
|
+
var _a2;
|
|
801
|
+
return (_a2 = c.toolName) != null ? _a2 : c.name;
|
|
802
|
+
}
|
|
740
803
|
);
|
|
741
804
|
const expected = {
|
|
742
805
|
functions: expectedNames
|
|
@@ -762,14 +825,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
762
825
|
return { expected, actual, diff };
|
|
763
826
|
};
|
|
764
827
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
765
|
-
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) :
|
|
828
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
|
|
766
829
|
logs.push(
|
|
767
830
|
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
768
831
|
);
|
|
769
832
|
const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
|
|
833
|
+
var _a, _b, _c, _d;
|
|
770
834
|
try {
|
|
771
835
|
const firstTool = transformedTools[0];
|
|
772
|
-
const schemaType = firstTool
|
|
836
|
+
const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
|
|
773
837
|
caseLogs.push(
|
|
774
838
|
`[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
775
839
|
);
|
|
@@ -785,49 +849,103 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
785
849
|
caseLogs.push(
|
|
786
850
|
`[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
787
851
|
);
|
|
788
|
-
} catch {
|
|
852
|
+
} catch (e) {
|
|
789
853
|
caseLogs.push(
|
|
790
854
|
`[DEBUG] ${testCaseId}: failed to serialize toolCalls`
|
|
791
855
|
);
|
|
792
856
|
}
|
|
793
857
|
};
|
|
794
|
-
const
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
flatMessages,
|
|
799
|
-
mwOriginalText,
|
|
800
|
-
text,
|
|
801
|
-
finishReason,
|
|
802
|
-
mwParsedToolCalls,
|
|
803
|
-
restoredCalls,
|
|
804
|
-
possibleAnswer
|
|
805
|
-
} = options;
|
|
806
|
-
const lastUser = (() => {
|
|
807
|
-
const reversed = [...flatMessages].reverse();
|
|
808
|
-
const found = reversed.find(
|
|
809
|
-
(m) => m.role === "user"
|
|
810
|
-
);
|
|
811
|
-
return found?.content ?? void 0;
|
|
812
|
-
})();
|
|
813
|
-
const rawModelText = (() => {
|
|
814
|
-
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
815
|
-
return mwOriginalText;
|
|
858
|
+
// Returns true when any diff line that starts with "+ got:" or "- expected:"
// has a number (as captured by DIFF_NUMERIC_EXTRACT_REGEX) in the inclusive
// range 1..100.
const hasPercentPattern = (diff) => {
  const isValueLine = (line) => line.startsWith("+ got:") || line.startsWith("- expected:");
  const holdsNumberInPercentRange = (line) => {
    const captured = line.match(DIFF_NUMERIC_EXTRACT_REGEX);
    if (!captured) {
      return false;
    }
    const parsed = Number.parseFloat(captured[1]);
    return parsed >= 1 && parsed <= 100;
  };
  return diff.some((line) => isValueLine(line) && holdsNumberInPercentRange(line));
};
|
|
871
|
+
// True when the checker's error type mentions "value_error", or when any diff
// line starts with "@@ param". A null/undefined errorType is tolerated.
const isValueError = (errorType, diff) => {
  if (errorType != null && errorType.includes("value_error")) {
    return true;
  }
  return diff.some((line) => line.startsWith("@@ param"));
};
|
|
874
|
+
// True when the checker's error type mentions "wrong_func_name", or when any
// diff line contains "function name". A null/undefined errorType is tolerated.
const isFunctionNameError = (errorType, diff) => {
  if (errorType != null && errorType.includes("wrong_func_name")) {
    return true;
  }
  return diff.some((line) => line.includes("function name"));
};
|
|
877
|
+
// True when the checker's error type mentions "missing_required", or when any
// diff line contains "missing required param". A null/undefined errorType is
// tolerated.
const isMissingParamError = (errorType, diff) => {
  if (errorType != null && errorType.includes("missing_required")) {
    return true;
  }
  return diff.some((line) => line.includes("missing required param"));
};
|
|
880
|
+
// True when the checker's error type mentions "unexpected_param", or when any
// diff line contains "unexpected param". A null/undefined errorType is
// tolerated.
const isUnexpectedParamError = (errorType, diff) => {
  if (errorType != null && errorType.includes("unexpected_param")) {
    return true;
  }
  return diff.some((line) => line.includes("unexpected param"));
};
|
|
883
|
+
// Maps a checker error type plus diff lines to a failure label. The checks run
// in a fixed priority order: value error, wrong function, missing params,
// unexpected params, then the "cannot_find_match" error type. Returns null
// when nothing matches.
const classifyByErrorPatterns = (errorType, diff) => {
  // Computed up front, before any classifier runs.
  const valueErrorLabel = hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH";
  if (isValueError(errorType, diff)) {
    return valueErrorLabel;
  }
  if (isFunctionNameError(errorType, diff)) {
    return "WRONG_FUNCTION";
  }
  if (isMissingParamError(errorType, diff)) {
    return "MISSING_PARAMS";
  }
  if (isUnexpectedParamError(errorType, diff)) {
    return "UNEXPECTED_PARAMS";
  }
  const noMatchReported = errorType != null && errorType.includes("cannot_find_match");
  return noMatchReported ? "NO_MATCH" : null;
};
|
|
903
|
+
// Classifies a failure purely from how many calls were made versus expected:
//   no calls but some expected      -> "PARSE_FAILURE"
//   some calls, fewer than expected -> "PARTIAL_CALLS"
//   more calls than expected        -> "EXTRA_CALLS"
// Returns null when none of these applies (e.g. the counts are equal).
const classifyByCallCount = (actualCount, expectedCount) => {
  const producedNothing = actualCount === 0 && expectedCount > 0;
  const producedTooFew = actualCount > 0 && actualCount < expectedCount;
  const producedTooMany = actualCount > expectedCount;
  if (producedNothing) {
    return "PARSE_FAILURE";
  }
  if (producedTooFew) {
    return "PARTIAL_CALLS";
  }
  return producedTooMany ? "EXTRA_CALLS" : null;
};
|
|
915
|
+
// Picks a failure label for a failed test case. A call-count based label takes
// precedence; otherwise the error-type/diff patterns are consulted; if neither
// yields a label, "OTHER" is returned.
// `options` carries: errorType, restoredCalls, expectedCount, diff.
const classifyFailureType = (options) => {
  const {
    errorType: checkerErrorType,
    restoredCalls: calls,
    expectedCount: wantedCount,
    diff: diffLines
  } = options;
  // A non-array `restoredCalls` counts as zero calls.
  const madeCount = Array.isArray(calls) ? calls.length : 0;
  return classifyByCallCount(madeCount, wantedCount) || classifyByErrorPatterns(checkerErrorType, diffLines) || "OTHER";
};
|
|
931
|
+
// Chooses the text to report as the model's raw output: `mwOriginalText` when
// it is non-empty, otherwise `text` when that is a string, otherwise "".
const extractRawModelText = (mwOriginalText, text) => {
  const hasOriginalText = Boolean(mwOriginalText) && mwOriginalText.length > 0;
  if (hasOriginalText) {
    return mwOriginalText;
  }
  return typeof text === "string" ? text : "";
};
|
|
940
|
+
// Returns the `content` of the last message whose role is "user", shortened to
// its first 200 characters plus "..." when longer. Yields "" when there is no
// user message or its content is null/undefined.
const extractLastUserQuery = (flatMessages) => {
  const messages = [...flatMessages];
  let content = "";
  // Walk from the end so the most recent user message is found first.
  for (let i = messages.length - 1; i >= 0; i -= 1) {
    if (messages[i].role === "user") {
      const latest = messages[i].content;
      if (latest != null) {
        content = latest;
      }
      break;
    }
  }
  if (content.length > 200) {
    return `${content.slice(0, 200)}...`;
  }
  return content;
};
|
|
947
|
+
// Shortens `text` to at most `maxLen` characters, appending "..." when
// anything was cut; shorter input is returned unchanged.
const truncateText = (text, maxLen) => {
  if (text.length > maxLen) {
    return `${text.slice(0, maxLen)}...`;
  }
  return text;
};
|
|
832
950
|
const logFailureDetails = (options) => {
|
|
833
951
|
const {
|
|
@@ -845,43 +963,37 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
845
963
|
} = options;
|
|
846
964
|
try {
|
|
847
965
|
const category = testCase.id.split("_")[0];
|
|
848
|
-
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
caseLogs.push(
|
|
858
|
-
`[DEBUG-FAIL] ${JSON.stringify({
|
|
859
|
-
id: testCase.id,
|
|
860
|
-
message: checkerResult.error,
|
|
861
|
-
error_type: checkerResult.error_type,
|
|
862
|
-
expected,
|
|
863
|
-
actual,
|
|
864
|
-
diff
|
|
865
|
-
})}`
|
|
866
|
-
);
|
|
867
|
-
try {
|
|
868
|
-
const contextPayload = buildFailureContext({
|
|
869
|
-
testCase,
|
|
870
|
-
tools,
|
|
871
|
-
flatMessages,
|
|
872
|
-
mwOriginalText,
|
|
873
|
-
text,
|
|
874
|
-
finishReason,
|
|
875
|
-
mwParsedToolCalls,
|
|
966
|
+
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
|
|
967
|
+
const gtArr = possibleAnswer.ground_truth;
|
|
968
|
+
const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
|
|
969
|
+
const rawModelText = extractRawModelText(mwOriginalText, text);
|
|
970
|
+
const lastUserQuery = extractLastUserQuery(flatMessages);
|
|
971
|
+
const failurePayload = {
|
|
972
|
+
id: testCase.id,
|
|
973
|
+
category: classifyFailureType({
|
|
974
|
+
errorType: checkerResult.error_type,
|
|
876
975
|
restoredCalls,
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
976
|
+
expectedCount,
|
|
977
|
+
diff
|
|
978
|
+
}),
|
|
979
|
+
message: checkerResult.error,
|
|
980
|
+
error_type: checkerResult.error_type,
|
|
981
|
+
expected,
|
|
982
|
+
actual,
|
|
983
|
+
diff,
|
|
984
|
+
context: {
|
|
985
|
+
raw_model_text: truncateText(rawModelText, 500),
|
|
986
|
+
raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
|
|
987
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
988
|
+
expected_count: expectedCount,
|
|
989
|
+
actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
|
|
990
|
+
finish_reason: finishReason,
|
|
991
|
+
last_user_query: lastUserQuery,
|
|
992
|
+
tool_names: tools.map((t) => t.name)
|
|
993
|
+
}
|
|
994
|
+
};
|
|
995
|
+
caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
|
|
996
|
+
} catch (e) {
|
|
885
997
|
caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
|
|
886
998
|
}
|
|
887
999
|
};
|
|
@@ -960,7 +1072,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
960
1072
|
const flatMessages = flattenMessages(messages);
|
|
961
1073
|
const { transformedTools, nameMap } = buildTransformedTools(
|
|
962
1074
|
tools,
|
|
963
|
-
|
|
1075
|
+
fixSchema2
|
|
964
1076
|
);
|
|
965
1077
|
const toolsMap = buildToolsMap(transformedTools);
|
|
966
1078
|
return { flatMessages, transformedTools, nameMap, toolsMap };
|
|
@@ -982,6 +1094,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
982
1094
|
const mwParsedToolCalls = parseDebugToolCalls(
|
|
983
1095
|
debugSummaryRef.toolCalls
|
|
984
1096
|
);
|
|
1097
|
+
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1098
|
+
if (!possibleAnswer) {
|
|
1099
|
+
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1100
|
+
}
|
|
1101
|
+
if (process.env.DEBUG_PARSER_OUTPUT === "true") {
|
|
1102
|
+
const groundTruth = possibleAnswer.ground_truth;
|
|
1103
|
+
const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
|
|
1104
|
+
console.log("\n========== BFCL CASE DEBUG ==========");
|
|
1105
|
+
console.log(`Test Case: ${testCase.id}`);
|
|
1106
|
+
console.log(`Expected count: ${groundTruth.length} call(s)`);
|
|
1107
|
+
console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
|
|
1108
|
+
console.log(expectedXML);
|
|
1109
|
+
console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
|
|
1110
|
+
console.log(mwOriginalText || text || "(empty)");
|
|
1111
|
+
console.log(
|
|
1112
|
+
"\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
|
|
1113
|
+
);
|
|
1114
|
+
console.log(JSON.stringify(toolCalls, null, 2));
|
|
1115
|
+
console.log("======================================\n");
|
|
1116
|
+
}
|
|
985
1117
|
logRawToolCalls({
|
|
986
1118
|
toolCalls,
|
|
987
1119
|
finishReason,
|
|
@@ -989,10 +1121,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
989
1121
|
testCaseId: testCase.id,
|
|
990
1122
|
caseLogs
|
|
991
1123
|
});
|
|
992
|
-
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
993
|
-
if (!possibleAnswer) {
|
|
994
|
-
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
995
|
-
}
|
|
996
1124
|
const restoredCalls = restoreToolCalls(
|
|
997
1125
|
toolCalls || [],
|
|
998
1126
|
nameMap,
|
|
@@ -1013,12 +1141,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1013
1141
|
caseLogs
|
|
1014
1142
|
});
|
|
1015
1143
|
};
|
|
1016
|
-
const
|
|
1144
|
+
const runSingleCase2 = async (testCase) => {
|
|
1017
1145
|
const caseLogs = [];
|
|
1018
1146
|
const { function: tools } = testCase;
|
|
1019
|
-
const temp = config
|
|
1147
|
+
const temp = config == null ? void 0 : config.temperature;
|
|
1020
1148
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1021
|
-
const maxTok = config
|
|
1149
|
+
const maxTok = config == null ? void 0 : config.maxTokens;
|
|
1022
1150
|
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
1023
1151
|
try {
|
|
1024
1152
|
const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
|
|
@@ -1044,15 +1172,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1044
1172
|
});
|
|
1045
1173
|
} catch (e) {
|
|
1046
1174
|
caseLogs.push(
|
|
1047
|
-
`[ERROR] ${testCase.id}: Model generation failed: ${e
|
|
1175
|
+
`[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
|
|
1048
1176
|
);
|
|
1049
|
-
if (e
|
|
1177
|
+
if (e == null ? void 0 : e.stack) {
|
|
1050
1178
|
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
1051
1179
|
}
|
|
1052
1180
|
return { valid: false, logs: caseLogs };
|
|
1053
1181
|
}
|
|
1054
1182
|
};
|
|
1055
|
-
const
|
|
1183
|
+
const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
|
|
1056
1184
|
const results = new Array(items.length);
|
|
1057
1185
|
let idx = 0;
|
|
1058
1186
|
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
@@ -1068,10 +1196,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1068
1196
|
await Promise.all(workers);
|
|
1069
1197
|
return results;
|
|
1070
1198
|
};
|
|
1071
|
-
const resultsPerCase = await
|
|
1199
|
+
const resultsPerCase = await mapWithConcurrency2(
|
|
1072
1200
|
testCases,
|
|
1073
1201
|
concurrency,
|
|
1074
|
-
async (tc) =>
|
|
1202
|
+
async (tc) => runSingleCase2(tc)
|
|
1075
1203
|
);
|
|
1076
1204
|
correctCount = resultsPerCase.reduce(
|
|
1077
1205
|
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
@@ -1089,14 +1217,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1089
1217
|
};
|
|
1090
1218
|
}
|
|
1091
1219
|
const score = correctCount / testCases.length;
|
|
1220
|
+
const caseResults = resultsPerCase.map((r, i) => ({
|
|
1221
|
+
id: testCases[i].id,
|
|
1222
|
+
valid: r.valid
|
|
1223
|
+
}));
|
|
1092
1224
|
return {
|
|
1093
1225
|
score,
|
|
1094
1226
|
success: score > 0.95,
|
|
1095
|
-
// High success threshold as requested
|
|
1096
1227
|
metrics: {
|
|
1097
1228
|
correct_count: correctCount,
|
|
1098
1229
|
total_cases: testCases.length,
|
|
1099
|
-
accuracy: score
|
|
1230
|
+
accuracy: score,
|
|
1231
|
+
case_results: JSON.stringify(caseResults)
|
|
1100
1232
|
},
|
|
1101
1233
|
logs
|
|
1102
1234
|
};
|
|
@@ -1116,42 +1248,414 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1116
1248
|
}
|
|
1117
1249
|
// BFCL v4 benchmark instances. Every entry passes createBfclBenchmark the
// benchmark name, its description, the test-data file and the answer-data file.
var bfclSimpleBenchmark = createBfclBenchmark(
  "bfcl-simple",
  "BFCL v4 Simple Function Calling",
  "BFCL_v4_simple.jsonl",
  "BFCL_v4_simple_possible_answer.jsonl"
);
var bfclParallelBenchmark = createBfclBenchmark(
  "bfcl-parallel",
  "BFCL v4 Parallel Function Calling",
  "BFCL_v4_parallel.jsonl",
  "BFCL_v4_parallel_possible_answer.jsonl"
);
var bfclMultipleBenchmark = createBfclBenchmark(
  "bfcl-multiple",
  "BFCL v4 Multiple Function Calling",
  "BFCL_v4_multiple.jsonl",
  "BFCL_v4_multiple_possible_answer.jsonl"
);
var bfclParallelMultipleBenchmark = createBfclBenchmark(
  "bfcl-parallel-multiple",
  "BFCL v4 Parallel & Multiple Function Calling",
  "BFCL_v4_parallel_multiple.jsonl",
  "BFCL_v4_parallel_multiple_possible_answer.jsonl"
);
|
|
1141
1273
|
|
|
1142
|
-
// src/benchmarks/
|
|
1274
|
+
// src/benchmarks/complex-func-bench.ts
|
|
1143
1275
|
import { promises as fs3 } from "fs";
|
|
1144
1276
|
import path3 from "path";
|
|
1145
|
-
import {
|
|
1277
|
+
import {
|
|
1278
|
+
generateText as generateText2,
|
|
1279
|
+
jsonSchema as jsonSchema2,
|
|
1280
|
+
tool as tool2
|
|
1281
|
+
} from "ai";
|
|
1282
|
+
// Line separator (LF or CRLF) used to split the JSONL data files into records.
var LINE_SPLIT_REGEX2 = /\r?\n/;
|
|
1283
|
+
/**
 * Normalizes a string for lenient comparison: lower-cases it and trims
 * surrounding whitespace.
 *
 * @param {*} input - Value to normalize.
 * @returns {*} The normalized string, or `input` unchanged when it is not a string.
 */
function standardizeString2(input) {
  return typeof input === "string" ? input.toLowerCase().trim() : input;
}
|
|
1289
|
+
/**
 * Decides whether an argument value produced by the model matches the
 * expected value.
 *
 * Matching rules, in order:
 *  - strictly equal values match;
 *  - two strings match when equal after `standardizeString2` normalization;
 *  - a number and a string match when the string is the number's text form or
 *    a non-blank numeric string with the same value;
 *  - two non-null objects (including arrays) match when their JSON forms are
 *    equal, ignoring the order of object keys.
 *
 * @param {*} modelValue - Value taken from the model's tool call.
 * @param {*} expectedValue - Value from the ground truth.
 * @returns {boolean} `true` when the values are considered equivalent.
 */
function valuesMatch2(modelValue, expectedValue) {
  if (modelValue === expectedValue) {
    return true;
  }
  const modelType = typeof modelValue;
  const expectedType = typeof expectedValue;
  if (modelType === "string" && expectedType === "string") {
    return standardizeString2(modelValue) === standardizeString2(expectedValue);
  }
  // Number("") and Number("   ") are 0, so blank strings must be rejected
  // before numeric coercion or they would match the number 0.
  const numberMatchesString = (num, str) => num.toString() === str || str.trim() !== "" && num === Number(str);
  if (modelType === "number" && expectedType === "string") {
    return numberMatchesString(modelValue, expectedValue);
  }
  if (modelType === "string" && expectedType === "number") {
    return numberMatchesString(expectedValue, modelValue);
  }
  if (modelType === "object" && modelValue !== null && expectedType === "object" && expectedValue !== null) {
    // Re-emit plain objects with sorted keys so that key order, which carries
    // no meaning for call arguments, does not cause a mismatch. Arrays keep
    // their element order.
    const sortObjectKeys = (_key, value) => {
      if (value === null || typeof value !== "object" || Array.isArray(value)) {
        return value;
      }
      const sorted = {};
      for (const k of Object.keys(value).sort()) {
        sorted[k] = value[k];
      }
      return sorted;
    };
    try {
      return JSON.stringify(modelValue, sortObjectKeys) === JSON.stringify(expectedValue, sortObjectKeys);
    } catch (e) {
      // Values that cannot be serialized (e.g. cyclic) never match.
      return false;
    }
  }
  return false;
}
|
|
1311
|
+
/**
 * Checks that the model called the expected function.
 *
 * @param {string} modelFuncName - Function name used by the model.
 * @param {string} expectedFuncName - Function name from the ground truth.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 *   `{ valid: true }` on an exact match, otherwise a failure record with
 *   `error_type` set to "function_name_mismatch".
 */
function validateFunctionName(modelFuncName, expectedFuncName) {
  const namesAgree = modelFuncName === expectedFuncName;
  if (namesAgree) {
    return { valid: true };
  }
  return {
    valid: false,
    error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
    error_type: "function_name_mismatch"
  };
}
|
|
1321
|
+
/**
 * Verifies that every required parameter that the ground truth supplies is
 * also supplied by the model.
 *
 * Only own properties count as "supplied": the `in` operator would also see
 * inherited members such as `toString` or `constructor`, letting a missing
 * parameter with such a name slip through.
 *
 * @param {string[]} requiredParams - Names listed as required in the tool spec.
 * @param {object} modelArgs - Arguments from the model's call.
 * @param {object} expectedArgs - Arguments from the ground truth.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 *   `{ valid: true }`, or a failure record with `error_type`
 *   "missing_required_param" for the first missing parameter.
 */
function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  for (const param of requiredParams) {
    if (!hasOwn(modelArgs, param) && hasOwn(expectedArgs, param)) {
      return {
        valid: false,
        error: `Missing required parameter: '${param}'`,
        error_type: "missing_required_param"
      };
    }
  }
  return { valid: true };
}
|
|
1333
|
+
/**
 * Compares each expected argument against the model's arguments.
 *
 * An expected parameter that the model omitted is tolerated when it is not
 * required; otherwise it is reported as missing. Parameters that are present
 * must satisfy `valuesMatch2`. Presence is decided on own properties only, so
 * inherited members (e.g. `constructor`) are never mistaken for a supplied
 * argument.
 *
 * @param {object} expectedArgs - Arguments from the ground truth.
 * @param {object} modelArgs - Arguments from the model's call.
 * @param {string[]} requiredParams - Names listed as required in the tool spec.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 *   `{ valid: true }`, or a failure record whose `error_type` is
 *   "missing_param" or "value_mismatch".
 */
function validateParamValues(expectedArgs, modelArgs, requiredParams) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
    if (!hasOwn(modelArgs, paramName)) {
      if (!requiredParams.includes(paramName)) {
        // Optional parameter left out by the model: acceptable.
        continue;
      }
      return {
        valid: false,
        error: `Missing parameter: '${paramName}'`,
        error_type: "missing_param"
      };
    }
    const modelValue = modelArgs[paramName];
    if (!valuesMatch2(modelValue, expectedValue)) {
      return {
        valid: false,
        error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
        error_type: "value_mismatch"
      };
    }
  }
  return { valid: true };
}
|
|
1356
|
+
/**
 * Validates one model tool call against one expected call.
 *
 * `expected` is read as an object whose first key is the function name and
 * whose value under that key is the expected argument object. The checks run
 * in order — function name, required parameters, parameter values — and the
 * first failure is returned.
 *
 * @param {object} modelCall - Model call; the name is read from `toolName`
 *   (falling back to `name`) and the arguments from `args`.
 * @param {object} expected - Ground-truth call.
 * @param {object[]} toolSpecs - Tool specs; the one whose `name` equals the
 *   expected function supplies `parameters.required`.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkFunctionCall(modelCall, expected, toolSpecs) {
  const [expectedFuncName] = Object.keys(expected);
  const expectedArgs = expected[expectedFuncName];
  const modelFuncName = modelCall.toolName != null ? modelCall.toolName : modelCall.name;
  const modelArgs = modelCall.args != null ? modelCall.args : {};

  const nameCheck = validateFunctionName(modelFuncName, expectedFuncName);
  if (!nameCheck.valid) {
    return nameCheck;
  }

  // A missing spec, or a spec without a `required` list, means nothing is required.
  const spec = toolSpecs.find((candidate) => candidate.name === expectedFuncName);
  let requiredParams = [];
  if (spec != null && spec.parameters != null && spec.parameters.required != null) {
    requiredParams = spec.parameters.required;
  }

  const requiredCheck = validateRequiredParams(requiredParams, modelArgs, expectedArgs);
  if (!requiredCheck.valid) {
    return requiredCheck;
  }
  return validateParamValues(expectedArgs, modelArgs, requiredParams);
}
|
|
1378
|
+
/**
 * Validates the full set of model tool calls against the expected calls.
 *
 * The counts must be equal. A single expected call is checked directly, so
 * its detailed failure is returned. With several expected calls, each one is
 * paired with the first not-yet-used model call that validates against it,
 * regardless of position.
 *
 * @param {object[]} modelCalls - Calls produced by the model.
 * @param {object[]} expectedCalls - Ground-truth calls.
 * @param {object[]} toolSpecs - Tool specs forwarded to `checkFunctionCall`.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
  const expectedCount = expectedCalls.length;
  if (modelCalls.length !== expectedCount) {
    return {
      valid: false,
      error: `Wrong number of function calls: expected ${expectedCount}, got ${modelCalls.length}`,
      error_type: "wrong_call_count"
    };
  }
  if (expectedCount === 1) {
    return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
  }
  // Indices of model calls already paired with an expected call.
  const claimed = /* @__PURE__ */ new Set();
  for (const expected of expectedCalls) {
    const matchIndex = modelCalls.findIndex(
      (call, i) => !claimed.has(i) && checkFunctionCall(call, expected, toolSpecs).valid
    );
    if (matchIndex === -1) {
      const expectedFuncName = Object.keys(expected)[0];
      return {
        valid: false,
        error: `Could not find matching call for function '${expectedFuncName}'`,
        error_type: "no_matching_call"
      };
    }
    claimed.add(matchIndex);
  }
  return { valid: true };
}
|
|
1414
|
+
/**
 * Rewrites the `type` of a schema node, in place, to the name used for the
 * tool input schema: "dict" -> "object", "tuple" -> "array",
 * "integer"/"float" -> "number". Any other (or missing) type is left alone.
 *
 * @param {object} copy - Schema node to adjust; callers pass a private copy.
 */
var fixSchemaType = (copy) => {
  switch (copy.type) {
    case "dict":
      copy.type = "object";
      break;
    case "tuple":
      copy.type = "array";
      break;
    case "integer":
    case "float":
      copy.type = "number";
      break;
    default:
      break;
  }
};
/**
 * Returns a normalized copy of a parameter schema without modifying the
 * input.
 *
 * Anything that is not an object becomes an empty object schema; arrays are
 * mapped element by element; object nodes get their `type` normalized and
 * their `properties` and `items` processed recursively.
 *
 * The `properties` container is copied before its entries are replaced. A
 * shallow copy of the node alone would still share that container with the
 * caller's schema, and the caller's nested definitions would be overwritten.
 *
 * @param {*} schema - Schema node, or an array of schema nodes.
 * @returns {object|Array} Normalized copy.
 */
var fixSchema = (schema) => {
  if (!schema || typeof schema !== "object") {
    return { type: "object", properties: {} };
  }
  if (Array.isArray(schema)) {
    return schema.map((v) => fixSchema(v));
  }
  const copy = { ...schema };
  fixSchemaType(copy);
  if (copy.properties && typeof copy.properties === "object") {
    const fixedProperties = Array.isArray(copy.properties) ? [...copy.properties] : { ...copy.properties };
    for (const k of Object.keys(fixedProperties)) {
      fixedProperties[k] = fixSchema(fixedProperties[k]);
    }
    copy.properties = fixedProperties;
  }
  if (copy.items) {
    copy.items = fixSchema(copy.items);
  }
  return copy;
};
|
|
1448
|
+
/**
 * Converts benchmark tool specs into the tool map handed to `generateText2`.
 *
 * Each spec's parameters are normalized with `fixSchema`; a result that is
 * not an object schema is replaced by an empty object schema. Tool names are
 * sanitized to `[a-zA-Z0-9_-]`, truncated to 64 characters, and fall back to
 * "tool" when nothing is left.
 *
 * @param {object[]} tools - Specs with `name`, `description` and `parameters`.
 * @returns {{nameMap: Map<string, string>, toolsMap: object}} `nameMap` maps
 *   sanitized names back to the original names; `toolsMap` maps sanitized
 *   names to tool definitions.
 */
function buildTools(tools) {
  const nameMap = /* @__PURE__ */ new Map();
  const prepared = [];
  for (const spec of tools) {
    const fixed = fixSchema(spec.parameters);
    const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
    const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
    const sanitized = spec.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
    nameMap.set(sanitized, spec.name);
    prepared.push({ name: sanitized, description: spec.description, inputSchema });
  }
  const entries = prepared.map((entry) => {
    const definition = tool2({
      // Only string descriptions are forwarded.
      description: typeof entry.description === "string" ? entry.description : void 0,
      inputSchema: jsonSchema2(entry.inputSchema)
    });
    return [entry.name, definition];
  });
  return { nameMap, toolsMap: Object.fromEntries(entries) };
}
|
|
1473
|
+
/**
 * Maps `items` through an async `mapper`, running at most `concurrencyLimit`
 * calls at a time and returning results in input order.
 *
 * The limit is floored to an integer and raised to at least 1. A fractional
 * or negative limit would otherwise be an invalid array length for the worker
 * pool, and a limit of 0 or NaN would start no workers and return an array of
 * empty slots.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} concurrencyLimit - Maximum number of concurrent mapper calls.
 * @param {function(*): Promise<*>} mapper - Async function applied to each item.
 * @returns {Promise<Array>} Results, index-aligned with `items`.
 */
async function mapWithConcurrency(items, concurrencyLimit, mapper) {
  const results = new Array(items.length);
  const requested = Math.floor(Number(concurrencyLimit));
  // NaN fails the comparison and falls back to 1; Infinity is capped by items.length below.
  const limit = requested >= 1 ? requested : 1;
  const workerCount = Math.min(limit, items.length);
  let nextIndex = 0;
  // Each worker repeatedly claims the next unprocessed index until none remain.
  const worker = async () => {
    while (nextIndex < items.length) {
      const current = nextIndex;
      nextIndex += 1;
      results[current] = await mapper(items[current]);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
1489
|
+
/**
 * Runs one benchmark case: asks the model for tool calls and validates them
 * against the case's possible answer.
 *
 * Never rejects: any error raised while running the case is logged as an
 * `[ERROR]` line and reported as an invalid result.
 *
 * @param {object} testCase - Case with `id`, `function` (tool specs) and
 *   `question` (passed as the messages).
 * @param {*} model - Model passed through to `generateText2`.
 * @param {Map} possibleAnswersMap - Answers keyed by case id; each carries `ground_truth`.
 * @param {number} [temperature] - Forwarded only when defined.
 * @param {number} [maxTokens] - Forwarded as `maxOutputTokens` only when defined.
 * @returns {Promise<{valid: boolean, logs: string[]}>}
 */
async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
  const caseLogs = [];
  const { function: tools, question: messages } = testCase;
  try {
    const { nameMap, toolsMap } = buildTools(tools);
    const callOptions = {
      model,
      messages,
      tools: toolsMap,
      toolChoice: "auto",
      providerOptions: {
        toolCallMiddleware: { debugSummary: {} }
      }
    };
    if (temperature !== void 0) {
      callOptions.temperature = temperature;
    }
    if (maxTokens !== void 0) {
      callOptions.maxOutputTokens = maxTokens;
    }
    const { toolCalls, finishReason } = await generateText2(callOptions);

    // Translate sanitized tool names back to the names used by the ground truth.
    const restoredCalls = (toolCalls != null ? toolCalls : []).map((call) => {
      const rawName = call.toolName != null ? call.toolName : call.name;
      const mappedName = nameMap.get(rawName);
      const originalName = mappedName != null ? mappedName : rawName;
      let args = call.input != null ? call.input : call.args;
      if (args == null) {
        args = {};
      }
      return { toolName: originalName, name: originalName, args };
    });
    caseLogs.push(
      `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
    );

    const possibleAnswer = possibleAnswersMap.get(testCase.id);
    if (!possibleAnswer) {
      throw new Error(`No possible answer for id: ${testCase.id}`);
    }
    const verdict = checkAllFunctionCalls(restoredCalls, possibleAnswer.ground_truth, tools);
    if (!verdict.valid) {
      caseLogs.push(`[FAIL] ${testCase.id}: ${verdict.error}`);
      return { valid: false, logs: caseLogs };
    }
    caseLogs.push(`[PASS] ${testCase.id}`);
    return { valid: true, logs: caseLogs };
  } catch (e) {
    caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
    return { valid: false, logs: caseLogs };
  }
}
|
|
1540
|
+
/**
 * Reads a JSONL file of test cases.
 *
 * @param {string} dataPath - Directory containing the data file.
 * @param {string} testDataFile - File name inside `dataPath`.
 * @returns {Promise<object[]>} One parsed object per non-blank line.
 */
async function loadTestData(dataPath, testDataFile) {
  const filePath = path3.join(dataPath, testDataFile);
  const raw = await fs3.readFile(filePath, "utf-8");
  const cases = [];
  for (const line of raw.split(LINE_SPLIT_REGEX2)) {
    if (line.trim().length > 0) {
      cases.push(JSON.parse(line));
    }
  }
  return cases;
}
|
|
1547
|
+
/**
 * Reads a JSONL file of possible answers and indexes them by `id`.
 *
 * @param {string} dataPath - Directory containing the data file.
 * @param {string} answerDataFile - File name inside `dataPath`.
 * @returns {Promise<Map>} Map from answer `id` to the parsed answer object;
 *   when an id repeats, the later line wins.
 */
async function loadAnswerData(dataPath, answerDataFile) {
  const filePath = path3.join(dataPath, answerDataFile);
  const raw = await fs3.readFile(filePath, "utf-8");
  const answersById = /* @__PURE__ */ new Map();
  for (const line of raw.split(LINE_SPLIT_REGEX2)) {
    if (line.trim().length === 0) {
      continue;
    }
    const answer = JSON.parse(line);
    answersById.set(answer.id, answer);
  }
  return answersById;
}
|
|
1555
|
+
/**
 * Collects run settings from the environment and the benchmark config.
 *
 * - `COMPLEXFUNCBENCH_LIMIT`: converted with `Number` when set, and returned
 *   as-is (the caller decides whether the value is usable).
 * - `COMPLEXFUNCBENCH_CONCURRENCY`: when set to a finite number it is floored
 *   to an integer and raised to at least 1; otherwise 4 is used. Flooring
 *   matters because the value sizes the worker pool, and a fractional value
 *   such as "2.5" is not a valid array length.
 * - `config.temperature` / `config.maxTokens`: used only when they are numbers.
 *
 * @param {object} [config] - Benchmark config; may be null or undefined.
 * @returns {{limit: (number|undefined), concurrency: number,
 *   temperature: (number|undefined), maxTokens: (number|undefined)}}
 */
function getConfigValues(config) {
  const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
  const limit = limitEnv ? Number(limitEnv) : void 0;
  const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
  const parsedConcurrency = concurrencyEnv ? Number(concurrencyEnv) : NaN;
  const concurrency = Number.isFinite(parsedConcurrency) ? Math.max(1, Math.floor(parsedConcurrency)) : 4;
  const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
  const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
  return { limit, concurrency, temperature, maxTokens };
}
|
|
1564
|
+
/**
 * Folds per-case results into a benchmark result.
 *
 * The score is the fraction of valid cases over `testCases.length`, and the
 * run counts as a success when the score is above 0.5. With no test cases a
 * failed result carrying a single explanatory log line is returned.
 *
 * @param {Array<{valid: boolean, logs: string[]}>} resultsPerCase - Case results.
 * @param {Array} testCases - The cases that were run; only the length is used.
 * @returns {{score: number, success: boolean, metrics: object, logs: string[]}}
 */
function aggregateResults(resultsPerCase, testCases) {
  let correctCount = 0;
  const logs = [];
  for (const caseResult of resultsPerCase) {
    if (caseResult.valid) {
      correctCount += 1;
    }
    logs.push(...caseResult.logs);
  }
  const totalCases = testCases.length;
  if (totalCases === 0) {
    return {
      score: 0,
      success: false,
      metrics: {},
      logs: ["No test cases found."]
    };
  }
  const score = correctCount / totalCases;
  const metrics = {
    correct_count: correctCount,
    total_cases: totalCases,
    accuracy: score
  };
  return { score, success: score > 0.5, metrics, logs };
}
|
|
1593
|
+
/**
 * Builds a benchmark object for a ComplexFuncBench-style data set.
 *
 * The returned `run` loads the test cases and answers from the resolved data
 * directory, optionally truncates the case list to the configured limit, runs
 * the cases with bounded concurrency and aggregates the outcome. It does not
 * reject: a failure is returned as a result with score 0, the caught error
 * and a `[FATAL]` log line.
 *
 * @param {string} name - Benchmark name.
 * @param {string} description - Benchmark description.
 * @param {string} testDataFile - JSONL file with the test cases.
 * @param {string} answerDataFile - JSONL file with the possible answers.
 * @returns {{name: string, version: string, description: string, run: Function}}
 */
function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
  return {
    name,
    version: "1.0.0",
    description,
    async run(model, config) {
      const setupLogs = [];
      try {
        const dataPath = resolveDataDir();
        setupLogs.push(`[INFO] Using data dir: ${dataPath}`);
        const allCases = await loadTestData(dataPath, testDataFile);
        const possibleAnswersMap = await loadAnswerData(dataPath, answerDataFile);
        const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);

        // Truncate only for a positive, finite limit.
        const applyLimit = limit && Number.isFinite(limit) && limit > 0;
        const testCases = applyLimit ? allCases.slice(0, limit) : allCases;
        if (applyLimit) {
          setupLogs.push(`[INFO] Limiting test cases to ${limit}`);
        }
        setupLogs.push(
          `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
        );

        const resultsPerCase = await mapWithConcurrency(
          testCases,
          concurrency,
          (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
        );
        const result = aggregateResults(resultsPerCase, testCases);
        const caseLogs = result.logs != null ? result.logs : [];
        result.logs = [...setupLogs, ...caseLogs];
        return result;
      } catch (e) {
        return {
          score: 0,
          success: false,
          metrics: {},
          error: e,
          logs: [
            `[FATAL] Failed to run benchmark ${name}: ${e.message}`
          ]
        };
      }
    }
  };
}
|
|
1639
|
+
// ComplexFuncBench benchmark instance, wired to its test-data and answer-data files.
var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
  "complex-func-bench",
  "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
  "ComplexFuncBench.jsonl",
  "ComplexFuncBench_possible_answer.jsonl"
);
|
|
1645
|
+
|
|
1646
|
+
// src/benchmarks/json-generation.ts
|
|
1647
|
+
import { promises as fs4 } from "fs";
|
|
1648
|
+
import path4 from "path";
|
|
1649
|
+
import { generateText as generateText3 } from "ai";
|
|
1146
1650
|
import Ajv from "ajv";
|
|
1147
1651
|
// Matches a ```json fenced block (case-insensitive) and captures its contents lazily.
var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
|
|
1148
1652
|
// Matches any ``` fenced block and captures its contents lazily.
var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
|
|
1149
1653
|
// Line separator (LF or CRLF) used to split JSONL contents into lines.
var NEWLINE_REGEX = /\r?\n/;
|
|
1150
|
-
var
|
|
1654
|
+
// Line separator (LF or CRLF); same pattern as NEWLINE_REGEX.
var LINE_SPLIT_REGEX3 = /\r?\n/;
|
|
1151
1655
|
/**
 * Attempts to parse `text` as JSON.
 *
 * @param {string} text - Candidate JSON text.
 * @returns {*} The parsed value, or `undefined` when `text` is not valid JSON.
 */
function tryDirectParse(text) {
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch (e) {
    // Not valid JSON: report "no result" instead of throwing.
    parsed = void 0;
  }
  return parsed;
}
|
|
@@ -1163,7 +1667,7 @@ function tryCodeFenceParse(text) {
|
|
|
1163
1667
|
const inner = fenceMatch[1].trim();
|
|
1164
1668
|
try {
|
|
1165
1669
|
return JSON.parse(inner);
|
|
1166
|
-
} catch {
|
|
1670
|
+
} catch (e) {
|
|
1167
1671
|
return;
|
|
1168
1672
|
}
|
|
1169
1673
|
}
|
|
@@ -1188,7 +1692,7 @@ function tryBracketScan(text) {
|
|
|
1188
1692
|
const candidate = text.slice(start, i + 1);
|
|
1189
1693
|
try {
|
|
1190
1694
|
return JSON.parse(candidate);
|
|
1191
|
-
} catch {
|
|
1695
|
+
} catch (e) {
|
|
1192
1696
|
return;
|
|
1193
1697
|
}
|
|
1194
1698
|
}
|
|
@@ -1236,12 +1740,12 @@ function subsetMatch(expected, actual) {
|
|
|
1236
1740
|
async function loadDatasets() {
|
|
1237
1741
|
try {
|
|
1238
1742
|
const dataDir = resolveDataDir();
|
|
1239
|
-
const testsJsonl = await
|
|
1240
|
-
|
|
1743
|
+
const testsJsonl = await fs4.readFile(
|
|
1744
|
+
path4.join(dataDir, "json_generation_tests.jsonl"),
|
|
1241
1745
|
"utf-8"
|
|
1242
1746
|
);
|
|
1243
|
-
const expectedJsonl = await
|
|
1244
|
-
|
|
1747
|
+
const expectedJsonl = await fs4.readFile(
|
|
1748
|
+
path4.join(dataDir, "json_generation_expected.jsonl"),
|
|
1245
1749
|
"utf-8"
|
|
1246
1750
|
);
|
|
1247
1751
|
const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -1297,10 +1801,11 @@ function validateTestCase(tc, parsed, context) {
|
|
|
1297
1801
|
return { valid, valuesOk, parsed };
|
|
1298
1802
|
}
|
|
1299
1803
|
async function processTestCase(tc, context) {
|
|
1804
|
+
var _a;
|
|
1300
1805
|
const messages = buildMessages(tc);
|
|
1301
|
-
const temp = context.config
|
|
1806
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1302
1807
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1303
|
-
const { text } = await
|
|
1808
|
+
const { text } = await generateText3({
|
|
1304
1809
|
model: context.model,
|
|
1305
1810
|
messages,
|
|
1306
1811
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1308,7 +1813,7 @@ async function processTestCase(tc, context) {
|
|
|
1308
1813
|
let parsed;
|
|
1309
1814
|
try {
|
|
1310
1815
|
parsed = extractFirstJsonBlock(text);
|
|
1311
|
-
} catch {
|
|
1816
|
+
} catch (e) {
|
|
1312
1817
|
}
|
|
1313
1818
|
if (parsed === void 0) {
|
|
1314
1819
|
context.validation.logs.push(
|
|
@@ -1402,21 +1907,22 @@ function buildBenchmarkResult(total, counts, logs) {
|
|
|
1402
1907
|
/**
 * Loads the JSON-generation test cases from `json_generation_tests.jsonl` in
 * the resolved data directory.
 *
 * @returns {Promise<{tests: object[], error?: *}>} The parsed cases (one per
 *   non-blank line); on any failure an empty list together with the caught error.
 */
async function loadSchemaOnlyTests() {
  try {
    const testsPath = path4.join(resolveDataDir(), "json_generation_tests.jsonl");
    const testsJsonl = await fs4.readFile(testsPath, "utf-8");
    const tests = [];
    for (const line of testsJsonl.split(LINE_SPLIT_REGEX3)) {
      if (line.trim().length > 0) {
        tests.push(JSON.parse(line));
      }
    }
    return { tests };
  } catch (e) {
    return { tests: [], error: e };
  }
}
|
|
1415
1920
|
async function processSchemaOnlyTestCase(tc, context) {
|
|
1921
|
+
var _a;
|
|
1416
1922
|
const messages = buildMessages(tc);
|
|
1417
|
-
const temp = context.config
|
|
1923
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1418
1924
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1419
|
-
const { text } = await
|
|
1925
|
+
const { text } = await generateText3({
|
|
1420
1926
|
model: context.model,
|
|
1421
1927
|
messages,
|
|
1422
1928
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1424,7 +1930,7 @@ async function processSchemaOnlyTestCase(tc, context) {
|
|
|
1424
1930
|
let parsed;
|
|
1425
1931
|
try {
|
|
1426
1932
|
parsed = extractFirstJsonBlock(text);
|
|
1427
|
-
} catch {
|
|
1933
|
+
} catch (e) {
|
|
1428
1934
|
}
|
|
1429
1935
|
if (parsed === void 0) {
|
|
1430
1936
|
context.logs.push(
|
|
@@ -1493,38 +1999,144 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1493
1999
|
}
|
|
1494
2000
|
};
|
|
1495
2001
|
|
|
2002
|
+
// src/evaluate.ts
|
|
2003
|
+
import { createDiskCacheMiddleware } from "@ai-sdk-tool/middleware";
|
|
2004
|
+
import { wrapLanguageModel } from "ai";
|
|
2005
|
+
|
|
1496
2006
|
// src/reporters/console.ts
|
|
1497
2007
|
// ANSI SGR escape sequences used to style console output.
var colors = (() => {
  const sgr = (code) => `\x1B[${code}m`;
  return {
    reset: sgr(0),
    bold: sgr(1),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    magenta: sgr(35),
    gray: sgr(90),
    white: sgr(37)
  };
})();
|
|
2018
|
+
// Prefix that marks a structured failure log line; parseFailures strips it before JSON parsing.
var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
|
|
2019
|
+
/**
 * Renders the first eight lines of a diff for console output.
 *
 * Lines starting with "-" are colored red, "+" green and "@@" cyan; other
 * lines are left as they are.
 *
 * @param {string[]} [diff] - Diff lines.
 * @returns {string} The rendered lines joined together, or "" when there is no diff.
 */
function formatDiff(diff) {
  if (!diff || diff.length === 0) {
    return "";
  }
  const paint = (line) => {
    let tint = null;
    if (line.startsWith("-")) {
      tint = colors.red;
    } else if (line.startsWith("+")) {
      tint = colors.green;
    } else if (line.startsWith("@@")) {
      tint = colors.cyan;
    }
    return tint === null ? line : `${tint}${line}${colors.reset}`;
  };
  return diff.slice(0, 8).map((line) => paint(line)).join("\n ");
}
|
|
2036
|
+
/**
 * Extracts structured failure records from log lines.
 *
 * Only lines starting with the `[DEBUG-FAIL] ` prefix are considered; the
 * remainder of each such line is parsed as JSON. Lines whose payload is not
 * valid JSON are skipped.
 *
 * @param {string[]} logs - Log lines.
 * @returns {object[]} Parsed failure records, in log order.
 */
function parseFailures(logs) {
  const failures = [];
  for (const entry of logs) {
    if (DEBUG_FAIL_REGEX.test(entry)) {
      try {
        failures.push(JSON.parse(entry.replace(DEBUG_FAIL_REGEX, "")));
      } catch (e) {
        // Malformed payload: skip this line.
      }
    }
  }
  return failures;
}
|
|
2051
|
+
/**
 * Groups failure records by their `category`; records without one go under "OTHER".
 *
 * @param {object[]} failures - Failure records.
 * @returns {Map<string, object[]>} Category -> failures, with categories in
 *   first-seen order and failures in input order.
 */
function groupFailuresByCategory(failures) {
  const groups = /* @__PURE__ */ new Map();
  for (const failure of failures) {
    const category = failure.category || "OTHER";
    if (!groups.has(category)) {
      groups.set(category, []);
    }
    groups.get(category).push(failure);
  }
  return groups;
}
|
|
2064
|
+
/**
 * Prints a compact description of one failure: id and category, then the
 * message and diff when present. For "PARSE_FAILURE" records that carry
 * `context.raw_model_text`, the model text is also printed, cut to 80
 * characters.
 *
 * @param {object} failure - Failure record.
 */
function printCompactFailure(failure) {
  const category = failure.category || "OTHER";
  console.log(
    `\n${colors.red}${failure.id}${colors.reset} [${colors.yellow}${category}${colors.reset}]`
  );
  if (failure.message) {
    console.log(` ${failure.message}`);
  }
  if (failure.diff && failure.diff.length > 0) {
    console.log(` ${formatDiff(failure.diff)}`);
  }
  const context = failure.context;
  const rawText = context == null ? void 0 : context.raw_model_text;
  if (rawText && failure.category === "PARSE_FAILURE") {
    const truncated = rawText.length > 80 ? `${rawText.slice(0, 80)}...` : rawText;
    console.log(` ${colors.gray}Model: "${truncated}"${colors.reset}`);
  }
}
|
|
2082
|
+
/**
 * Prints a failure overview: per-category counts (largest first), the first
 * five failures in compact form, and a one-line note listing up to five ids
 * of the failures that were not shown.
 *
 * @param {object[]} failures - Failure records.
 */
function printFailureSummary(failures) {
  const byCategory = groupFailuresByCategory(failures);
  const ranked = Array.from(byCategory.entries()).sort(
    (left, right) => right[1].length - left[1].length
  );
  console.log(`\n${colors.bold}Failures by category:${colors.reset}`);
  ranked.forEach(([category, categoryFailures]) => {
    console.log(
      ` ${colors.yellow}${category}${colors.reset}: ${categoryFailures.length}`
    );
  });

  const maxToShow = 5;
  failures.slice(0, maxToShow).forEach((failure) => {
    printCompactFailure(failure);
  });

  const hidden = failures.slice(maxToShow);
  if (hidden.length === 0) {
    return;
  }
  const hiddenIds = hidden.map((f) => f.id);
  const idPreview = hiddenIds.slice(0, 5).join(", ");
  const more = hiddenIds.length > 5 ? "..." : "";
  console.log(
    `\n${colors.gray}+${hidden.length} more: ${idPreview}${more}${colors.reset}`
  );
}
|
|
1506
2110
|
/**
 * Prints one evaluation result: a header with model and benchmark, a status
 * line with the score percentage and pass counts, the error message if any,
 * and — for unsuccessful runs with logs — either a failure summary or a
 * sample of the first five raw log lines.
 *
 * @param {object} result - Entry with `model`, `modelKey`, `benchmark` and
 *   `result` (the benchmark result).
 */
function printResult(result) {
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
  const { correct_count: passed, total_cases: total } = benchmarkResult.metrics;
  const scorePercent = (benchmarkResult.score * 100).toFixed(1);
  const succeeded = benchmarkResult.success;
  const statusIcon = succeeded ? "\u2714" : "\u2716";
  const statusColor = succeeded ? colors.green : colors.red;
  const keySuffix = modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : "";
  // Missing counts are shown as "?".
  const passedLabel = passed != null ? passed : "?";
  const totalLabel = total != null ? total : "?";

  console.log(
    `\n${colors.cyan}[${model}]${colors.reset}${keySuffix} - ${colors.magenta}${benchmark}${colors.reset}`
  );
  console.log(
    ` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passedLabel}/${totalLabel} passed)`
  );
  if (benchmarkResult.error) {
    console.log(
      ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
    );
  }
  if (succeeded || !benchmarkResult.logs) {
    return;
  }
  const failures = parseFailures(benchmarkResult.logs);
  if (failures.length > 0) {
    printFailureSummary(failures);
    return;
  }
  if (benchmarkResult.logs.length > 0) {
    console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
    for (const l of benchmarkResult.logs.slice(0, 5)) {
      console.log(` ${l}`);
    }
  }
}
|
|
1529
2141
|
function consoleReporter(results) {
|
|
1530
2142
|
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
@@ -1579,14 +2191,14 @@ function hasFunctionNameIssue(diff) {
|
|
|
1579
2191
|
);
|
|
1580
2192
|
}
|
|
1581
2193
|
function suggestFunctionNameFix(expected, actual, suggestions) {
|
|
1582
|
-
const expectedName = expected
|
|
1583
|
-
const actualName = actual
|
|
2194
|
+
const expectedName = expected == null ? void 0 : expected.function;
|
|
2195
|
+
const actualName = actual == null ? void 0 : actual.function;
|
|
1584
2196
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
1585
2197
|
suggestions.push(
|
|
1586
2198
|
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
1587
2199
|
);
|
|
1588
2200
|
}
|
|
1589
|
-
if (Array.isArray(expected
|
|
2201
|
+
if (Array.isArray(expected == null ? void 0 : expected.functions)) {
|
|
1590
2202
|
suggestions.push(
|
|
1591
2203
|
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
1592
2204
|
);
|
|
@@ -1641,7 +2253,7 @@ function suggestFromErrorType(error_type, suggestions) {
|
|
|
1641
2253
|
}
|
|
1642
2254
|
function suggestFixFromDiff(parsed) {
|
|
1643
2255
|
const suggestions = [];
|
|
1644
|
-
const { error_type, expected, actual, diff } = parsed
|
|
2256
|
+
const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
|
|
1645
2257
|
if (!Array.isArray(diff)) {
|
|
1646
2258
|
if (suggestions.length === 0 && typeof error_type === "string") {
|
|
1647
2259
|
suggestFromErrorType(error_type, suggestions);
|
|
@@ -1666,15 +2278,16 @@ function suggestFixFromDiff(parsed) {
|
|
|
1666
2278
|
return uniqueLines(suggestions);
|
|
1667
2279
|
}
|
|
1668
2280
|
function getTestIdFromLogLine(line) {
|
|
2281
|
+
var _a, _b;
|
|
1669
2282
|
if (line.startsWith("[FAIL]")) {
|
|
1670
2283
|
const m = line.match(FAIL_ID_REGEX);
|
|
1671
|
-
return m
|
|
2284
|
+
return m == null ? void 0 : m[1];
|
|
1672
2285
|
}
|
|
1673
2286
|
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
1674
2287
|
try {
|
|
1675
2288
|
const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1676
|
-
return String(parsed
|
|
1677
|
-
} catch {
|
|
2289
|
+
return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
|
|
2290
|
+
} catch (e) {
|
|
1678
2291
|
}
|
|
1679
2292
|
}
|
|
1680
2293
|
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
@@ -1682,18 +2295,19 @@ function getTestIdFromLogLine(line) {
|
|
|
1682
2295
|
const parsed = JSON.parse(
|
|
1683
2296
|
line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
|
|
1684
2297
|
);
|
|
1685
|
-
return String(parsed
|
|
1686
|
-
} catch {
|
|
2298
|
+
return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
|
|
2299
|
+
} catch (e) {
|
|
1687
2300
|
}
|
|
1688
2301
|
}
|
|
1689
2302
|
return;
|
|
1690
2303
|
}
|
|
1691
2304
|
function groupLogsByTestId(failLogs) {
|
|
2305
|
+
var _a;
|
|
1692
2306
|
const byId = /* @__PURE__ */ new Map();
|
|
1693
2307
|
for (const line of failLogs) {
|
|
1694
2308
|
const id = getTestIdFromLogLine(line);
|
|
1695
|
-
const key = id
|
|
1696
|
-
const arr = byId.get(key)
|
|
2309
|
+
const key = id != null ? id : "__general__";
|
|
2310
|
+
const arr = (_a = byId.get(key)) != null ? _a : [];
|
|
1697
2311
|
arr.push(line);
|
|
1698
2312
|
byId.set(key, arr);
|
|
1699
2313
|
}
|
|
@@ -1705,10 +2319,10 @@ function collectDebugIds(lines) {
|
|
|
1705
2319
|
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
1706
2320
|
try {
|
|
1707
2321
|
const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1708
|
-
if (parsed
|
|
2322
|
+
if (parsed == null ? void 0 : parsed.id) {
|
|
1709
2323
|
debugIds.add(String(parsed.id));
|
|
1710
2324
|
}
|
|
1711
|
-
} catch {
|
|
2325
|
+
} catch (e) {
|
|
1712
2326
|
}
|
|
1713
2327
|
}
|
|
1714
2328
|
}
|
|
@@ -1744,7 +2358,7 @@ function displayDebugFailLine(line) {
|
|
|
1744
2358
|
console.log(` \u2022 ${s}`);
|
|
1745
2359
|
}
|
|
1746
2360
|
}
|
|
1747
|
-
} catch {
|
|
2361
|
+
} catch (e) {
|
|
1748
2362
|
console.log(` ${line}`);
|
|
1749
2363
|
}
|
|
1750
2364
|
}
|
|
@@ -1788,14 +2402,14 @@ function displayDebugFailContextLine(line) {
|
|
|
1788
2402
|
const ctx = JSON.parse(payload);
|
|
1789
2403
|
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
1790
2404
|
displayContextInfo(ctx);
|
|
1791
|
-
} catch {
|
|
2405
|
+
} catch (e) {
|
|
1792
2406
|
console.log(` ${line}`);
|
|
1793
2407
|
}
|
|
1794
2408
|
}
|
|
1795
2409
|
function displayLogLine(line, debugIds) {
|
|
1796
2410
|
if (line.startsWith("[FAIL]")) {
|
|
1797
2411
|
const m = line.match(FAIL_ID_REGEX);
|
|
1798
|
-
const failId = m
|
|
2412
|
+
const failId = m == null ? void 0 : m[1];
|
|
1799
2413
|
if (failId && debugIds.has(failId)) {
|
|
1800
2414
|
return;
|
|
1801
2415
|
}
|
|
@@ -1865,26 +2479,350 @@ function displayResultHeader(r) {
|
|
|
1865
2479
|
);
|
|
1866
2480
|
}
|
|
1867
2481
|
/**
 * Debug console reporter: prints a banner, then for every evaluation result
 * its header, its metrics and (when any exist) its log lines, and finally a
 * closing rule.
 *
 * @param results Iterable of evaluation entries, each carrying a `result`
 *   object with `metrics` and optional `logs`.
 */
function consoleDebugReporter(results) {
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const entry of results) {
    displayResultHeader(entry);
    displayMetrics(Object.entries(entry.result.metrics));
    const logs = entry.result.logs;
    // Missing and empty log arrays are both skipped.
    if (logs == null || !logs.length) {
      continue;
    }
    displayResultLogs(entry.result.logs);
  }
  console.log("\n------------------------------------\n");
}
|
|
1878
2493
|
|
|
2494
|
+
// src/reporters/console.summary.ts
|
|
2495
|
+
// ANSI escape sequences used by the summary reporter to style console output.
// `reset` ends any styling started by one of the other entries.
var colors3 = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  dim: "\x1B[2m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  white: "\x1B[37m"
};
|
|
2507
|
+
// Prefix that marks a structured failure log line; parseFailureLogs uses it
// both to select such lines and to strip the prefix before JSON parsing.
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
// Captures the trailing "_<digits>" of a test id so printRemainingIds can show
// just the number.
var ID_NUM_REGEX = /_(\d+)$/;
// Tag name whose <tag>...</tag> sections removeReasoningTags strips from model text.
var REASONING_TAG = "think";
// Upper bound applied by printDiff to the number of diff lines it prints.
var MAX_FAILURES_TO_DISPLAY = 5;
|
|
2511
|
+
// Display metadata for each failure category key found in parsed failure logs.
// `label` and `description` are printed by printCategoryHeader; the optional
// `hint` is printed by printCategoryDetails. Categories without an entry here
// are rendered with the OTHER entry.
var CATEGORY_DESCRIPTIONS = {
  PARSE_FAILURE: {
    label: "Parse Failure",
    description: "No tool calls extracted from model output",
    hint: "Model may have responded in text instead of tool format"
  },
  PARTIAL_CALLS: {
    label: "Partial Calls",
    description: "Some expected tool calls missing",
    hint: "Model stopped early or missed some tools"
  },
  EXTRA_CALLS: {
    label: "Extra Calls",
    description: "More tool calls than expected",
    hint: "Model called tools that weren't needed"
  },
  PARAM_VALUE_PERCENT: {
    label: "Param Value (Percent)",
    description: "Percentage sent as integer instead of decimal",
    hint: "e.g., 5 instead of 0.05 for 5%"
  },
  PARAM_VALUE_MISMATCH: {
    label: "Param Value Mismatch",
    description: "Parameter values don't match expected"
  },
  WRONG_FUNCTION: {
    label: "Wrong Function",
    description: "Called wrong function name"
  },
  MISSING_PARAMS: {
    label: "Missing Params",
    description: "Required parameters not provided"
  },
  UNEXPECTED_PARAMS: {
    label: "Unexpected Params",
    description: "Extra parameters that shouldn't be there"
  },
  NO_MATCH: {
    label: "No Match",
    description: "Function called but couldn't match to expected",
    hint: "Parameters may be correct but don't match any expected combination"
  },
  // Fallback entry for unknown or missing categories.
  OTHER: {
    label: "Other",
    description: "Uncategorized failure"
  }
};
|
|
2558
|
+
/**
 * Extracts structured failure records from raw log lines.
 *
 * Only lines carrying the `[DEBUG-FAIL] ` prefix are considered; the prefix is
 * removed and the remainder parsed as JSON. Lines whose payload is not valid
 * JSON, or parses to `null`, are dropped.
 *
 * @param logs Array of log lines.
 * @returns Array of parsed failure payloads, in log order.
 */
function parseFailureLogs(logs) {
  return logs.reduce((collected, entry) => {
    if (!DEBUG_FAIL_REGEX2.test(entry)) {
      return collected;
    }
    let payload;
    try {
      payload = JSON.parse(entry.replace(DEBUG_FAIL_REGEX2, ""));
    } catch (e) {
      // Malformed payloads are skipped rather than reported.
      payload = null;
    }
    if (payload !== null) {
      collected.push(payload);
    }
    return collected;
  }, []);
}
|
|
2568
|
+
/**
 * Buckets failures by their `category`, using "OTHER" when the category is
 * missing or falsy.
 *
 * @param failures Iterable of failure records.
 * @returns Map from category name to `{ failures: [...] }`, in first-seen order.
 */
function groupByCategory(failures) {
  const byCategory = new Map();
  for (const item of failures) {
    const key = item.category ? item.category : "OTHER";
    if (!byCategory.has(key)) {
      byCategory.set(key, { failures: [] });
    }
    byCategory.get(key).failures.push(item);
  }
  return byCategory;
}
|
|
2581
|
+
/**
 * Collects the parameter names mentioned in the failures' diffs.
 *
 * A diff line of the form "@@ param <name>" contributes `<name>`; failures
 * without a diff are ignored.
 *
 * @param failures Iterable of failure records with an optional `diff` list.
 * @returns Set of distinct parameter names in first-seen order.
 */
function extractParamNames(failures) {
  const marker = "@@ param ";
  const names = new Set();
  for (const failure of failures) {
    const diffLines = failure.diff || [];
    for (const diffLine of diffLines) {
      if (!diffLine.startsWith(marker)) {
        continue;
      }
      // The marker sits at index 0, so dropping its length removes it.
      names.add(diffLine.slice(marker.length));
    }
  }
  return names;
}
|
|
2595
|
+
/**
 * Gathers the distinct, truthy `context.finish_reason` values of the given
 * failures, stringified.
 *
 * @param failures Iterable of failure records with an optional `context`.
 * @returns Set of finish-reason strings.
 */
function extractFinishReasons(failures) {
  const reasons = new Set();
  for (const failure of failures) {
    const context = failure.context;
    const reason = context == null ? void 0 : context.finish_reason;
    if (reason) {
      reasons.add(String(reason));
    }
  }
  return reasons;
}
|
|
2605
|
+
/**
 * Annotates a category group with a `pattern` string when its failures share
 * an obvious trait. Groups with fewer than two failures are left untouched.
 *
 * - PARAM_VALUE_PERCENT: lists the affected parameter names, if any.
 * - PARSE_FAILURE: names the finish reason when exactly one was observed.
 *
 * The category is taken from the first failure of the group.
 *
 * @param group Object with a `failures` array; `pattern` may be set on it.
 */
function detectPatterns(group) {
  const items = group.failures;
  if (items.length < 2) {
    return;
  }
  switch (items[0].category) {
    case "PARAM_VALUE_PERCENT": {
      const names = extractParamNames(items);
      if (names.size > 0) {
        group.pattern = `Affected params: ${Array.from(names).join(", ")}`;
      }
      break;
    }
    case "PARSE_FAILURE": {
      const reasons = extractFinishReasons(items);
      if (reasons.size === 1) {
        group.pattern = `All finished with: ${Array.from(reasons)[0]}`;
      }
      break;
    }
    default:
      break;
  }
}
|
|
2624
|
+
/**
 * Picks the color for one diff line from its leading marker:
 * "+" is green, "-" is red, "@@" is cyan, anything else is white.
 *
 * @param line Diff line text.
 * @returns The matching escape sequence from `colors3`.
 */
function getLineColor(line) {
  const markerColors = [
    ["+", "green"],
    ["-", "red"],
    ["@@", "cyan"]
  ];
  for (const [marker, colorName] of markerColors) {
    if (line.startsWith(marker)) {
      return colors3[colorName];
    }
  }
  return colors3.white;
}
|
|
2636
|
+
/**
 * Renders a function name, or a list of them, as display text.
 *
 * @param funcs A single value or an array of values.
 * @returns Array items joined with ", ", otherwise the value stringified.
 */
function formatFunctions(funcs) {
  return Array.isArray(funcs) ? funcs.join(", ") : String(funcs);
}
|
|
2642
|
+
/**
 * Prints the expected and the actual function names of a failure, one line
 * each. The names come from `.functions`, falling back to `.function`; a side
 * with neither is not printed. An empty actual list is shown as "(none)" in red.
 *
 * @param failure Failure record with optional `expected` / `actual` objects.
 */
function printExpectedActual(failure) {
  const expected = failure.expected;
  const expectedFns = expected ? expected.functions || expected.function : null;
  if (expectedFns) {
    console.log(
      ` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expectedFns)}`
    );
  }
  const actual = failure.actual;
  const actualFns = actual ? actual.functions || actual.function : null;
  if (!actualFns) {
    return;
  }
  const nothingCalled = Array.isArray(actualFns) && actualFns.length === 0;
  const shown = nothingCalled ? `${colors3.red}(none)` : `${colors3.white}${formatFunctions(actualFns)}`;
  console.log(
    ` ${colors3.gray}Actual:${colors3.reset} ${shown}${colors3.reset}`
  );
}
|
|
2663
|
+
/**
 * Prints a "Diff:" heading followed by the first `maxLines` diff lines, each
 * colored by its leading marker. When lines are left out, a dimmed trailer
 * says how many, so a truncated diff is not mistaken for a complete one.
 *
 * @param diff Array of diff lines.
 * @param maxLines Maximum number of lines to print; defaults to
 *   MAX_FAILURES_TO_DISPLAY, the limit applied before this parameter existed.
 */
function printDiff(diff, maxLines = MAX_FAILURES_TO_DISPLAY) {
  console.log(` ${colors3.gray}Diff:${colors3.reset}`);
  const shown = diff.slice(0, maxLines);
  for (const line of shown) {
    const lineColor = getLineColor(line);
    console.log(` ${lineColor}${line}${colors3.reset}`);
  }
  const hidden = diff.length - shown.length;
  if (hidden > 0) {
    console.log(` ${colors3.dim}... (+${hidden} more)${colors3.reset}`);
  }
}
|
|
2670
|
+
/**
 * Strips reasoning sections from model output.
 *
 * Every complete `<TAG>...</TAG>` section (TAG being REASONING_TAG) is removed
 * first; then anything from a remaining, unclosed `<TAG>` to the end of the
 * text is removed as well. The result is trimmed.
 *
 * @param text Raw model text.
 * @returns The text without reasoning sections.
 */
function removeReasoningTags(text) {
  // Escape regex metacharacters so the tag is matched literally.
  const escapeForRegex = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const open = escapeForRegex(`<${REASONING_TAG}>`);
  const close = escapeForRegex(`</${REASONING_TAG}>`);
  const closedSection = new RegExp(`${open}[\\s\\S]*?${close}`, "g");
  const danglingSection = new RegExp(`${open}[\\s\\S]*`, "g");
  return text.replace(closedSection, "").replace(danglingSection, "").trim();
}
|
|
2685
|
+
/**
 * For PARSE_FAILURE failures, prints what the model said instead of calling a
 * tool. The text comes from `context.raw_model_text_full`, falling back to
 * `context.raw_model_text`, with reasoning sections removed. When nothing
 * remains, a placeholder is printed. Other categories print nothing.
 *
 * @param failure Failure record with an optional `context`.
 * @param category Category name of the failure.
 */
function printModelOutput(failure, category) {
  if (category !== "PARSE_FAILURE") {
    return;
  }
  const context = failure.context;
  let rawText = "";
  if (context != null) {
    rawText = context.raw_model_text_full || context.raw_model_text || "";
  }
  const visibleText = removeReasoningTags(rawText);
  const lead = ` ${colors3.gray}Model said:${colors3.reset}`;
  const tail = visibleText
    ? `"${colors3.dim}${visibleText}${colors3.reset}"`
    : `${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`;
  console.log(`${lead} ${tail}`);
}
|
|
2702
|
+
/**
 * Tells whether failures of this category are presented diff-first.
 *
 * @param category Category name.
 * @returns true for the two parameter-value categories, false otherwise.
 */
function shouldShowDiffByDefault(category) {
  const diffFirstCategories = ["PARAM_VALUE_MISMATCH", "PARAM_VALUE_PERCENT"];
  return diffFirstCategories.includes(category);
}
|
|
2705
|
+
/**
 * Prints one failure: its id in bold, then either the diff (for diff-first
 * categories that have one) or the expected/actual summary, with the diff
 * appended only in verbose mode. The model-output line is attempted last.
 *
 * @param failure Failure record (`id`, optional `diff`, ...).
 * @param category Category name the failure was grouped under.
 * @param verbose Whether to include the diff for non diff-first categories.
 */
function printSingleFailure(failure, category, verbose) {
  console.log(`\n${colors3.bold}${failure.id}${colors3.reset}`);
  const diffLines = failure.diff;
  const diffPresent = Boolean(diffLines && diffLines.length > 0);
  if (diffPresent && shouldShowDiffByDefault(category)) {
    printDiff(diffLines);
  } else {
    printExpectedActual(failure);
    if (diffPresent && verbose) {
      printDiff(diffLines);
    }
  }
  printModelOutput(failure, category);
}
|
|
2720
|
+
// Number of leading failures that printRemainingIds skips; the ids after that
// offset are the ones it lists as "+N more".
var MAX_SAMPLE_FAILURES = 2;
|
|
2721
|
+
/**
 * Prints a dimmed "+N more: ..." line listing the failures beyond the first
 * MAX_SAMPLE_FAILURES. Ids ending in "_<digits>" are shortened to the digits.
 *
 * Ids are coerced to strings first: they originate from JSON log payloads and
 * may be numbers or missing, which previously made `.match` throw. The count
 * is taken from the listed ids so it can never be negative.
 *
 * @param failures Array of failure records with an `id`.
 */
function printRemainingIds(failures) {
  const remaining = failures.slice(MAX_SAMPLE_FAILURES);
  const idLabels = remaining.map((failure) => {
    const id = String(failure.id);
    const match = id.match(ID_NUM_REGEX);
    return match ? match[1] : id;
  });
  console.log(
    `\n${colors3.dim}+${remaining.length} more: ${idLabels.join(", ")}${colors3.reset}`
  );
}
|
|
2732
|
+
/**
 * Prints the heading of a failure category: a cyan rule carrying the label
 * and the failure count, followed by the dimmed description.
 *
 * @param info Category metadata with `label` and `description`.
 * @param count Number of failures in the category.
 */
function printCategoryHeader(info, count) {
  const rule = "\u2500".repeat(5);
  const title = `${rule} ${info.label} (${count}) ${rule}`;
  console.log(`\n${colors3.cyan}${title}${colors3.reset}`);
  console.log(`${colors3.dim}${info.description}${colors3.reset}`);
}
|
|
2739
|
+
/**
 * Prints everything known about one failure category: heading, detected
 * pattern (if any), hint (if the category has one), then the failures. In
 * non-verbose mode only the first MAX_SAMPLE_FAILURES are shown in full and
 * the rest are listed by id.
 *
 * @param category Category name; unknown names are rendered as OTHER.
 * @param group Object with a `failures` array and an optional `pattern`.
 * @param verbose When true, every failure is printed in full.
 */
function printCategoryDetails(category, group, verbose) {
  // Own-property lookup: the category comes from parsed log JSON, so names
  // such as "constructor" must not resolve to inherited Object members.
  const isKnown = Object.prototype.hasOwnProperty.call(CATEGORY_DESCRIPTIONS, category);
  const info = isKnown ? CATEGORY_DESCRIPTIONS[category] : CATEGORY_DESCRIPTIONS.OTHER;
  const { failures } = group;
  printCategoryHeader(info, failures.length);
  if (group.pattern) {
    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
  }
  if (info.hint) {
    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
  }
  // Same limit printRemainingIds uses as its offset, so the two cannot drift.
  const samplesToShow = verbose ? failures : failures.slice(0, MAX_SAMPLE_FAILURES);
  for (const failure of samplesToShow) {
    printSingleFailure(failure, category, verbose);
  }
  if (!verbose && failures.length > MAX_SAMPLE_FAILURES) {
    printRemainingIds(failures);
  }
}
|
|
2757
|
+
function printResultHeader(result) {
|
|
2758
|
+
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
2759
|
+
const passed = benchmarkResult.metrics.correct_count;
|
|
2760
|
+
const total = benchmarkResult.metrics.total_cases;
|
|
2761
|
+
const scorePercent = (benchmarkResult.score * 100).toFixed(1);
|
|
2762
|
+
const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
|
|
2763
|
+
const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
|
|
2764
|
+
const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
|
|
2765
|
+
const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
|
|
2766
|
+
const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
|
|
2767
|
+
console.log(
|
|
2768
|
+
`
|
|
2769
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
|
|
2770
|
+
);
|
|
2771
|
+
console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
|
|
2772
|
+
}
|
|
2773
|
+
/**
 * Prints the summary of one evaluation result: the header, then the failures
 * grouped by category with the largest category first.
 *
 * Nothing beyond the header is printed when there are no logs. When logs
 * exist but contain no structured failures, a notice is printed only if the
 * benchmark did not succeed.
 *
 * @param result Evaluation entry whose nested `result` carries `logs` and `success`.
 * @param verbose Passed through to the per-category printer.
 */
function printResultSummary(result, verbose) {
  const outcome = result.result;
  printResultHeader(result);
  const logs = outcome.logs;
  if (!logs || logs.length === 0) {
    return;
  }
  const failures = parseFailureLogs(logs);
  if (failures.length === 0) {
    if (!outcome.success) {
      console.log(
        `${colors3.yellow}No structured failure data available${colors3.reset}`
      );
    }
    return;
  }
  const grouped = groupByCategory(failures);
  grouped.forEach((group) => {
    detectPatterns(group);
  });
  // Largest category first.
  const bySizeDescending = Array.from(grouped.entries()).sort(
    (left, right) => right[1].failures.length - left[1].failures.length
  );
  bySizeDescending.forEach(([categoryName, group]) => {
    printCategoryDetails(categoryName, group, verbose);
  });
}
|
|
2799
|
+
function consoleSummaryReporter(results) {
|
|
2800
|
+
const verbose = process.env.VERBOSE === "true";
|
|
2801
|
+
console.log(`
|
|
2802
|
+
${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
|
|
2803
|
+
console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
|
|
2804
|
+
for (const result of results) {
|
|
2805
|
+
printResultSummary(result, verbose);
|
|
2806
|
+
}
|
|
2807
|
+
console.log(
|
|
2808
|
+
`
|
|
2809
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
|
|
2810
|
+
`
|
|
2811
|
+
);
|
|
2812
|
+
}
|
|
2813
|
+
|
|
1879
2814
|
// src/reporters/json.ts
|
|
1880
2815
|
/**
 * JSON reporter: prints all results as pretty-printed JSON. Each result's
 * `error` is replaced by its `message` (or left undefined when there is no
 * error) so the output stays serializable.
 *
 * @param results Array of evaluation entries with a nested `result` object.
 */
function jsonReporter(results) {
  const plainResults = results.map((entry) => {
    const failure = entry.result.error;
    const message = failure == null ? void 0 : failure.message;
    // Spreading `entry` first keeps the `result` key in its original position.
    return { ...entry, result: { ...entry.result, error: message } };
  });
  console.log(JSON.stringify(plainResults, null, 2));
}
|
|
1890
2828
|
|
|
@@ -1892,60 +2830,56 @@ function jsonReporter(results) {
|
|
|
1892
2830
|
// Registry of the available reporters, keyed by reporter name. Each value is a
// function that receives the full list of evaluation results.
var reporters = {
  console: consoleReporter,
  json: jsonReporter,
  "console.debug": consoleDebugReporter,
  "console.summary": consoleSummaryReporter
};
|
|
1897
2836
|
|
|
1898
2837
|
// src/evaluate.ts
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
return
|
|
1910
|
-
model: modelId,
|
|
1911
|
-
modelKey,
|
|
1912
|
-
benchmark: benchmark.name,
|
|
1913
|
-
result
|
|
1914
|
-
};
|
|
1915
|
-
} catch (error) {
|
|
1916
|
-
console.error(
|
|
1917
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
1918
|
-
error
|
|
1919
|
-
);
|
|
1920
|
-
return {
|
|
1921
|
-
model: modelId,
|
|
1922
|
-
modelKey,
|
|
1923
|
-
benchmark: benchmark.name,
|
|
1924
|
-
result: {
|
|
1925
|
-
score: 0,
|
|
1926
|
-
success: false,
|
|
1927
|
-
metrics: {},
|
|
1928
|
-
error: error instanceof Error ? error : new Error(String(error))
|
|
1929
|
-
}
|
|
1930
|
-
};
|
|
2838
|
+
/**
 * Checks whether a value is shaped like a model config: a non-null object
 * whose `model` property is itself a non-null object exposing `modelId`.
 *
 * @param value Anything.
 * @returns true when the shape matches, false otherwise.
 */
function isModelConfig(value) {
  const isNonNullObject = (candidate) => typeof candidate === "object" && candidate !== null;
  if (!isNonNullObject(value) || !("model" in value)) {
    return false;
  }
  const inner = value.model;
  return isNonNullObject(inner) && "modelId" in inner;
}
|
|
2852
|
+
/**
 * Checks whether a value looks like a language model: a non-null object with
 * a string `modelId`.
 *
 * @param value Anything.
 * @returns true when the shape matches, false otherwise.
 */
function isLanguageModel(value) {
  const isNonNullObject = value !== null && typeof value === "object";
  return isNonNullObject && "modelId" in value && typeof value.modelId === "string";
}
|
|
2859
|
+
/**
 * Splits a model input into its model and middleware parts.
 *
 * @param input Either a model config (`{ model, middleware }`) or a bare model.
 * @returns `[model, middleware]`; middleware is undefined for a bare model.
 */
function extractModelAndMiddleware(input) {
  return isModelConfig(input) ? [input.model, input.middleware] : [input, void 0];
}
|
|
1933
2865
|
/**
 * Normalizes the accepted `models` shapes into `[key, model, middleware]`
 * triples.
 *
 * - array: one triple per item, key undefined;
 * - single model config: one triple from its `model` / `middleware`;
 * - single language model: one triple with no middleware;
 * - anything else: treated as a keyed record, one triple per own entry.
 *
 * @param models Models in any of the shapes above.
 * @returns Array of `[key, model, middleware]` triples.
 */
function normalizeModels(models) {
  if (Array.isArray(models)) {
    return Array.from(models, (item) => [void 0, ...extractModelAndMiddleware(item)]);
  }
  if (isModelConfig(models)) {
    return [[void 0, models.model, models.middleware]];
  }
  if (isLanguageModel(models)) {
    return [[void 0, models, void 0]];
  }
  return Object.entries(models).map(([key, item]) => [key, ...extractModelAndMiddleware(item)]);
}
|
|
1950
2884
|
function buildConfig(temperature, maxTokens) {
|
|
1951
2885
|
const config = {};
|
|
@@ -1966,21 +2900,90 @@ function executeReporter(reporter, results) {
|
|
|
1966
2900
|
reporters.console(results);
|
|
1967
2901
|
}
|
|
1968
2902
|
}
|
|
2903
|
+
/**
 * Wraps a model with the user's middleware and, when caching is enabled, a
 * disk-cache middleware appended after the user's middleware.
 *
 * The base model is returned untouched when there is nothing to apply.
 * Caching counts as enabled only when `cacheOptions.enabled === true`; the
 * cache directory defaults to ".ai-cache" and `debug` to false.
 *
 * @param baseModel The model to wrap.
 * @param userMiddleware A middleware, an array of middlewares, or nothing.
 * @param cacheOptions Optional `{ enabled, cacheDir, debug }`.
 * @returns The base model, or the model wrapped via wrapLanguageModel.
 */
function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
  const wantsCache = cacheOptions != null && cacheOptions.enabled === true;
  if (!wantsCache && !userMiddleware) {
    return baseModel;
  }
  let diskCache = null;
  if (wantsCache) {
    const { cacheDir, debug } = cacheOptions;
    diskCache = createDiskCacheMiddleware({
      cacheDir: cacheDir != null ? cacheDir : ".ai-cache",
      enabled: true,
      debug: debug != null ? debug : false
    });
  }
  let chain = [];
  if (userMiddleware) {
    chain = Array.isArray(userMiddleware) ? [...userMiddleware] : [userMiddleware];
  }
  if (diskCache) {
    chain.push(diskCache);
  }
  if (chain.length === 0) {
    return baseModel;
  }
  // A lone middleware is passed as-is rather than as a one-element array.
  const middleware = chain.length === 1 ? chain[0] : chain;
  return wrapLanguageModel({
    // biome-ignore lint/suspicious/noExplicitAny: AI SDK v5/v6 type mismatch
    model: baseModel,
    middleware
  });
}
|
|
2934
|
+
/**
 * Runs one benchmark against one model and reports progress on stdout.
 *
 * A progress line is written first and then overwritten (via "\r") with the
 * score, or with "ERROR" when the run throws. Failures never propagate: they
 * are logged with console.error and returned as a zero-score result whose
 * `error` is always an Error instance.
 *
 * @param model The model to evaluate; its string `modelId` is used for
 *   labelling, otherwise "unknown-model".
 * @param benchmark Object with a `name` and an async `run(model, config)`.
 * @param modelKey Optional key shown next to the model id.
 * @param config Configuration passed through to `benchmark.run`.
 * @returns `{ model, modelKey, benchmark, result }`.
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  const hasStringId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string";
  const modelId = hasStringId ? model.modelId : "unknown-model";
  const keySuffix = modelKey ? ` (${modelKey})` : "";
  const prefix = `[${modelId}]${keySuffix} ${benchmark.name}`;
  const describe = (result) => ({
    model: modelId,
    modelKey,
    benchmark: benchmark.name,
    result
  });
  try {
    process.stdout.write(`${prefix}: ...`);
    const result = await benchmark.run(model, config);
    const scoreDisplay = result.score.toFixed(2);
    process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}\n`);
    return describe(result);
  } catch (error) {
    process.stdout.write(`\r${prefix}: .... Score: ERROR\n`);
    console.error(error);
    const wrapped = error instanceof Error ? error : new Error(String(error));
    return describe({
      score: 0,
      success: false,
      metrics: {},
      error: wrapped
    });
  }
}
|
|
1969
2966
|
async function evaluate(options) {
|
|
1970
2967
|
const {
|
|
1971
2968
|
models,
|
|
1972
2969
|
benchmarks,
|
|
1973
2970
|
reporter = "console",
|
|
1974
2971
|
temperature,
|
|
1975
|
-
maxTokens
|
|
2972
|
+
maxTokens,
|
|
2973
|
+
cache
|
|
1976
2974
|
} = options;
|
|
1977
2975
|
const modelEntries = normalizeModels(models);
|
|
1978
2976
|
const config = buildConfig(temperature, maxTokens);
|
|
1979
2977
|
const allResults = [];
|
|
1980
|
-
for (const [modelKey,
|
|
2978
|
+
for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
|
|
2979
|
+
const effectiveModel = buildEffectiveModel(
|
|
2980
|
+
baseModel,
|
|
2981
|
+
userMiddleware,
|
|
2982
|
+
cache
|
|
2983
|
+
);
|
|
1981
2984
|
for (const benchmark of benchmarks) {
|
|
1982
2985
|
const evaluationResult = await runSingleBenchmark(
|
|
1983
|
-
|
|
2986
|
+
effectiveModel,
|
|
1984
2987
|
benchmark,
|
|
1985
2988
|
modelKey,
|
|
1986
2989
|
config
|
|
@@ -1996,6 +2999,7 @@ export {
|
|
|
1996
2999
|
bfclParallelBenchmark,
|
|
1997
3000
|
bfclParallelMultipleBenchmark,
|
|
1998
3001
|
bfclSimpleBenchmark,
|
|
3002
|
+
complexFuncBenchBenchmark,
|
|
1999
3003
|
evaluate,
|
|
2000
3004
|
jsonGenerationBenchmark,
|
|
2001
3005
|
jsonGenerationSchemaOnlyBenchmark
|