@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0-canary.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/ComplexFuncBench.jsonl +1000 -0
- package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
- package/dist/index.cjs +587 -91
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +29 -9
- package/dist/index.d.ts +29 -9
- package/dist/index.js +591 -92
- package/dist/index.js.map +1 -1
- package/package.json +17 -11
package/dist/index.cjs
CHANGED
|
@@ -34,6 +34,7 @@ __export(index_exports, {
|
|
|
34
34
|
bfclParallelBenchmark: () => bfclParallelBenchmark,
|
|
35
35
|
bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
|
|
36
36
|
bfclSimpleBenchmark: () => bfclSimpleBenchmark,
|
|
37
|
+
complexFuncBenchBenchmark: () => complexFuncBenchBenchmark,
|
|
37
38
|
evaluate: () => evaluate,
|
|
38
39
|
jsonGenerationBenchmark: () => jsonGenerationBenchmark,
|
|
39
40
|
jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
|
|
@@ -61,7 +62,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
|
|
|
61
62
|
if (import_node_fs.default.existsSync(dataAtRoot)) {
|
|
62
63
|
return dataAtRoot;
|
|
63
64
|
}
|
|
64
|
-
} catch {
|
|
65
|
+
} catch (e) {
|
|
65
66
|
}
|
|
66
67
|
return null;
|
|
67
68
|
}
|
|
@@ -75,7 +76,7 @@ function tryResolveViaPackageJson(moduleUrl) {
|
|
|
75
76
|
if (import_node_fs.default.existsSync(dataAtPkg)) {
|
|
76
77
|
return dataAtPkg;
|
|
77
78
|
}
|
|
78
|
-
} catch {
|
|
79
|
+
} catch (e) {
|
|
79
80
|
}
|
|
80
81
|
return null;
|
|
81
82
|
}
|
|
@@ -83,7 +84,7 @@ function getStartDir(moduleUrl) {
|
|
|
83
84
|
if (moduleUrl) {
|
|
84
85
|
try {
|
|
85
86
|
return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
|
|
86
|
-
} catch {
|
|
87
|
+
} catch (e) {
|
|
87
88
|
return process.cwd();
|
|
88
89
|
}
|
|
89
90
|
}
|
|
@@ -177,7 +178,7 @@ function valuesMatch(modelValue, possibleValue) {
|
|
|
177
178
|
const normalizedModel = normalizeObject(modelValue);
|
|
178
179
|
const normalizedPossible = normalizeObject(possibleValue);
|
|
179
180
|
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
180
|
-
} catch {
|
|
181
|
+
} catch (e) {
|
|
181
182
|
return false;
|
|
182
183
|
}
|
|
183
184
|
}
|
|
@@ -306,7 +307,7 @@ function checkSingleParameter(paramName, modelValue, context) {
|
|
|
306
307
|
return checkStringValue(
|
|
307
308
|
paramName,
|
|
308
309
|
modelValue,
|
|
309
|
-
possibleValues
|
|
310
|
+
possibleValues != null ? possibleValues : []
|
|
310
311
|
);
|
|
311
312
|
}
|
|
312
313
|
if (Array.isArray(modelValue)) {
|
|
@@ -406,6 +407,37 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
406
407
|
// src/benchmarks/bfcl.ts
|
|
407
408
|
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
408
409
|
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
410
|
+
function convertGroundTruthToXML(call) {
|
|
411
|
+
const keys = Object.keys(call);
|
|
412
|
+
if (keys.length === 0) {
|
|
413
|
+
return "<empty_call />";
|
|
414
|
+
}
|
|
415
|
+
const funcName = keys[0];
|
|
416
|
+
if (!funcName) {
|
|
417
|
+
return "<undefined_function />";
|
|
418
|
+
}
|
|
419
|
+
const params = call[funcName];
|
|
420
|
+
if (!params || typeof params !== "object") {
|
|
421
|
+
return `<${funcName} />`;
|
|
422
|
+
}
|
|
423
|
+
let xml = `<${funcName}>
|
|
424
|
+
`;
|
|
425
|
+
for (const [key, value] of Object.entries(params)) {
|
|
426
|
+
const displayValue = Array.isArray(value) ? value[0] : value;
|
|
427
|
+
let valueStr;
|
|
428
|
+
if (typeof displayValue === "string") {
|
|
429
|
+
valueStr = displayValue;
|
|
430
|
+
} else if (displayValue === null || displayValue === void 0) {
|
|
431
|
+
valueStr = "";
|
|
432
|
+
} else {
|
|
433
|
+
valueStr = JSON.stringify(displayValue);
|
|
434
|
+
}
|
|
435
|
+
xml += ` <${key}>${valueStr}</${key}>
|
|
436
|
+
`;
|
|
437
|
+
}
|
|
438
|
+
xml += `</${funcName}>`;
|
|
439
|
+
return xml;
|
|
440
|
+
}
|
|
409
441
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
410
442
|
const category = testCase.id.split("_")[0];
|
|
411
443
|
try {
|
|
@@ -486,7 +518,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
486
518
|
`[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
|
|
487
519
|
);
|
|
488
520
|
}
|
|
489
|
-
const
|
|
521
|
+
const fixSchemaType2 = (copy) => {
|
|
490
522
|
if (!copy.type) {
|
|
491
523
|
return;
|
|
492
524
|
}
|
|
@@ -510,16 +542,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
510
542
|
);
|
|
511
543
|
}
|
|
512
544
|
};
|
|
513
|
-
const
|
|
545
|
+
const fixSchema2 = (schema) => {
|
|
514
546
|
if (!schema || typeof schema !== "object") {
|
|
515
547
|
return { type: "object", properties: {} };
|
|
516
548
|
}
|
|
517
|
-
const copy = Array.isArray(schema) ? schema.map((v) =>
|
|
549
|
+
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
|
|
518
550
|
if (!Array.isArray(copy)) {
|
|
519
|
-
|
|
520
|
-
fixSchemaProperties(copy,
|
|
551
|
+
fixSchemaType2(copy);
|
|
552
|
+
fixSchemaProperties(copy, fixSchema2);
|
|
521
553
|
if (copy.items) {
|
|
522
|
-
copy.items =
|
|
554
|
+
copy.items = fixSchema2(copy.items);
|
|
523
555
|
}
|
|
524
556
|
return copy;
|
|
525
557
|
}
|
|
@@ -554,13 +586,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
554
586
|
try {
|
|
555
587
|
const arr = JSON.parse(raw);
|
|
556
588
|
return Array.isArray(arr) ? arr : [];
|
|
557
|
-
} catch {
|
|
589
|
+
} catch (e) {
|
|
558
590
|
return [];
|
|
559
591
|
}
|
|
560
592
|
};
|
|
561
593
|
const getSanitizedName = (rawName, transformedTools) => {
|
|
594
|
+
var _a, _b;
|
|
562
595
|
if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
|
|
563
|
-
return transformedTools[Number(rawName)]
|
|
596
|
+
return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
|
|
564
597
|
}
|
|
565
598
|
return rawName;
|
|
566
599
|
};
|
|
@@ -570,25 +603,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
570
603
|
}
|
|
571
604
|
try {
|
|
572
605
|
return JSON.parse(extractedArgs);
|
|
573
|
-
} catch {
|
|
606
|
+
} catch (e) {
|
|
574
607
|
return extractedArgs;
|
|
575
608
|
}
|
|
576
609
|
};
|
|
577
610
|
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
|
|
611
|
+
var _a, _b, _c, _d, _e, _f;
|
|
578
612
|
const call = c;
|
|
579
|
-
const rawName = call.toolName
|
|
613
|
+
const rawName = (_a = call.toolName) != null ? _a : call.name;
|
|
580
614
|
const sanitizedFromIndex = getSanitizedName(
|
|
581
615
|
rawName,
|
|
582
616
|
transformedTools
|
|
583
617
|
);
|
|
584
|
-
const originalName = nameMap.get(sanitizedFromIndex)
|
|
585
|
-
const extractedArgs = call.args
|
|
618
|
+
const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
|
|
619
|
+
const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
|
|
586
620
|
const parsedArgs = parseToolArgs(extractedArgs);
|
|
587
621
|
return {
|
|
588
622
|
...call,
|
|
589
623
|
toolName: originalName,
|
|
590
624
|
name: originalName,
|
|
591
|
-
args: parsedArgs
|
|
625
|
+
args: parsedArgs != null ? parsedArgs : {}
|
|
592
626
|
};
|
|
593
627
|
});
|
|
594
628
|
const summarizeArgs = (args) => {
|
|
@@ -632,7 +666,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
632
666
|
if (Array.isArray(got)) {
|
|
633
667
|
return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
|
|
634
668
|
}
|
|
635
|
-
} catch {
|
|
669
|
+
} catch (e) {
|
|
636
670
|
}
|
|
637
671
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
638
672
|
});
|
|
@@ -670,13 +704,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
670
704
|
}
|
|
671
705
|
};
|
|
672
706
|
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
707
|
+
var _a, _b, _c, _d;
|
|
673
708
|
const funcDesc = tools[0];
|
|
674
|
-
const gt = possibleAnswer.ground_truth
|
|
675
|
-
const expectedFuncName = funcDesc
|
|
709
|
+
const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
|
|
710
|
+
const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
|
|
676
711
|
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
677
712
|
const received = restoredCalls[0];
|
|
678
|
-
const receivedName = received
|
|
679
|
-
const receivedArgs = summarizeArgs(received
|
|
713
|
+
const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
|
|
714
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
680
715
|
const expected = {
|
|
681
716
|
function: expectedFuncName,
|
|
682
717
|
params: expectedParams
|
|
@@ -688,7 +723,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
688
723
|
const diff = [];
|
|
689
724
|
checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
|
|
690
725
|
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
691
|
-
const required = funcDesc
|
|
726
|
+
const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
|
|
692
727
|
checkMissingParams(
|
|
693
728
|
required,
|
|
694
729
|
receivedArgs,
|
|
@@ -725,12 +760,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
725
760
|
}
|
|
726
761
|
};
|
|
727
762
|
const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
|
|
763
|
+
var _a;
|
|
728
764
|
for (let i = 0; i < restoredCalls.length; i += 1) {
|
|
729
765
|
if (usedActual.has(i)) {
|
|
730
766
|
continue;
|
|
731
767
|
}
|
|
732
768
|
const rc = restoredCalls[i];
|
|
733
|
-
const rcName = rc
|
|
769
|
+
const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
|
|
734
770
|
if (rcName === fname) {
|
|
735
771
|
return i;
|
|
736
772
|
}
|
|
@@ -744,6 +780,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
744
780
|
checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
|
|
745
781
|
};
|
|
746
782
|
const processExpectedCall = (options) => {
|
|
783
|
+
var _a, _b;
|
|
747
784
|
const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
|
|
748
785
|
const fname = Object.keys(expectedObj)[0];
|
|
749
786
|
const matchedIndex = findMatchingCallIndex(
|
|
@@ -756,10 +793,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
756
793
|
}
|
|
757
794
|
usedActual.add(matchedIndex);
|
|
758
795
|
const received = restoredCalls[matchedIndex];
|
|
759
|
-
const receivedArgs = summarizeArgs(received
|
|
796
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
760
797
|
const expectedParamsAllowed = expectedObj[fname];
|
|
761
798
|
const funcDesc = tools.find((t) => t.name === fname);
|
|
762
|
-
const requiredParams = funcDesc
|
|
799
|
+
const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
|
|
763
800
|
diff.push(`@@ function ${fname}`);
|
|
764
801
|
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
765
802
|
validateFunctionParams({
|
|
@@ -771,10 +808,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
771
808
|
}
|
|
772
809
|
};
|
|
773
810
|
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
774
|
-
|
|
811
|
+
var _a;
|
|
812
|
+
const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
|
|
775
813
|
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
776
814
|
const actualNames = restoredCalls.map(
|
|
777
|
-
(c) =>
|
|
815
|
+
(c) => {
|
|
816
|
+
var _a2;
|
|
817
|
+
return (_a2 = c.toolName) != null ? _a2 : c.name;
|
|
818
|
+
}
|
|
778
819
|
);
|
|
779
820
|
const expected = {
|
|
780
821
|
functions: expectedNames
|
|
@@ -800,14 +841,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
800
841
|
return { expected, actual, diff };
|
|
801
842
|
};
|
|
802
843
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
803
|
-
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) :
|
|
844
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
|
|
804
845
|
logs.push(
|
|
805
846
|
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
806
847
|
);
|
|
807
848
|
const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
|
|
849
|
+
var _a, _b, _c, _d;
|
|
808
850
|
try {
|
|
809
851
|
const firstTool = transformedTools[0];
|
|
810
|
-
const schemaType = firstTool
|
|
852
|
+
const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
|
|
811
853
|
caseLogs.push(
|
|
812
854
|
`[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
813
855
|
);
|
|
@@ -823,7 +865,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
823
865
|
caseLogs.push(
|
|
824
866
|
`[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
825
867
|
);
|
|
826
|
-
} catch {
|
|
868
|
+
} catch (e) {
|
|
827
869
|
caseLogs.push(
|
|
828
870
|
`[DEBUG] ${testCaseId}: failed to serialize toolCalls`
|
|
829
871
|
);
|
|
@@ -842,11 +884,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
842
884
|
possibleAnswer
|
|
843
885
|
} = options;
|
|
844
886
|
const lastUser = (() => {
|
|
887
|
+
var _a;
|
|
845
888
|
const reversed = [...flatMessages].reverse();
|
|
846
889
|
const found = reversed.find(
|
|
847
890
|
(m) => m.role === "user"
|
|
848
891
|
);
|
|
849
|
-
return found
|
|
892
|
+
return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
|
|
850
893
|
})();
|
|
851
894
|
const rawModelText = (() => {
|
|
852
895
|
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
@@ -917,9 +960,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
917
960
|
caseLogs.push(
|
|
918
961
|
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
919
962
|
);
|
|
920
|
-
} catch {
|
|
963
|
+
} catch (e) {
|
|
921
964
|
}
|
|
922
|
-
} catch {
|
|
965
|
+
} catch (e) {
|
|
923
966
|
caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
|
|
924
967
|
}
|
|
925
968
|
};
|
|
@@ -998,7 +1041,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
998
1041
|
const flatMessages = flattenMessages(messages);
|
|
999
1042
|
const { transformedTools, nameMap } = buildTransformedTools(
|
|
1000
1043
|
tools,
|
|
1001
|
-
|
|
1044
|
+
fixSchema2
|
|
1002
1045
|
);
|
|
1003
1046
|
const toolsMap = buildToolsMap(transformedTools);
|
|
1004
1047
|
return { flatMessages, transformedTools, nameMap, toolsMap };
|
|
@@ -1020,6 +1063,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1020
1063
|
const mwParsedToolCalls = parseDebugToolCalls(
|
|
1021
1064
|
debugSummaryRef.toolCalls
|
|
1022
1065
|
);
|
|
1066
|
+
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1067
|
+
if (!possibleAnswer) {
|
|
1068
|
+
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1069
|
+
}
|
|
1070
|
+
if (process.env.DEBUG_PARSER_OUTPUT === "true") {
|
|
1071
|
+
const groundTruth = possibleAnswer.ground_truth;
|
|
1072
|
+
const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
|
|
1073
|
+
console.log("\n========== BFCL CASE DEBUG ==========");
|
|
1074
|
+
console.log(`Test Case: ${testCase.id}`);
|
|
1075
|
+
console.log(`Expected count: ${groundTruth.length} call(s)`);
|
|
1076
|
+
console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
|
|
1077
|
+
console.log(expectedXML);
|
|
1078
|
+
console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
|
|
1079
|
+
console.log(mwOriginalText || text || "(empty)");
|
|
1080
|
+
console.log(
|
|
1081
|
+
"\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
|
|
1082
|
+
);
|
|
1083
|
+
console.log(JSON.stringify(toolCalls, null, 2));
|
|
1084
|
+
console.log("======================================\n");
|
|
1085
|
+
}
|
|
1023
1086
|
logRawToolCalls({
|
|
1024
1087
|
toolCalls,
|
|
1025
1088
|
finishReason,
|
|
@@ -1027,10 +1090,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1027
1090
|
testCaseId: testCase.id,
|
|
1028
1091
|
caseLogs
|
|
1029
1092
|
});
|
|
1030
|
-
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1031
|
-
if (!possibleAnswer) {
|
|
1032
|
-
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1033
|
-
}
|
|
1034
1093
|
const restoredCalls = restoreToolCalls(
|
|
1035
1094
|
toolCalls || [],
|
|
1036
1095
|
nameMap,
|
|
@@ -1051,12 +1110,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1051
1110
|
caseLogs
|
|
1052
1111
|
});
|
|
1053
1112
|
};
|
|
1054
|
-
const
|
|
1113
|
+
const runSingleCase2 = async (testCase) => {
|
|
1055
1114
|
const caseLogs = [];
|
|
1056
1115
|
const { function: tools } = testCase;
|
|
1057
|
-
const temp = config
|
|
1116
|
+
const temp = config == null ? void 0 : config.temperature;
|
|
1058
1117
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1059
|
-
const maxTok = config
|
|
1118
|
+
const maxTok = config == null ? void 0 : config.maxTokens;
|
|
1060
1119
|
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
1061
1120
|
try {
|
|
1062
1121
|
const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
|
|
@@ -1082,15 +1141,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1082
1141
|
});
|
|
1083
1142
|
} catch (e) {
|
|
1084
1143
|
caseLogs.push(
|
|
1085
|
-
`[ERROR] ${testCase.id}: Model generation failed: ${e
|
|
1144
|
+
`[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
|
|
1086
1145
|
);
|
|
1087
|
-
if (e
|
|
1146
|
+
if (e == null ? void 0 : e.stack) {
|
|
1088
1147
|
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
1089
1148
|
}
|
|
1090
1149
|
return { valid: false, logs: caseLogs };
|
|
1091
1150
|
}
|
|
1092
1151
|
};
|
|
1093
|
-
const
|
|
1152
|
+
const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
|
|
1094
1153
|
const results = new Array(items.length);
|
|
1095
1154
|
let idx = 0;
|
|
1096
1155
|
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
@@ -1106,10 +1165,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1106
1165
|
await Promise.all(workers);
|
|
1107
1166
|
return results;
|
|
1108
1167
|
};
|
|
1109
|
-
const resultsPerCase = await
|
|
1168
|
+
const resultsPerCase = await mapWithConcurrency2(
|
|
1110
1169
|
testCases,
|
|
1111
1170
|
concurrency,
|
|
1112
|
-
async (tc) =>
|
|
1171
|
+
async (tc) => runSingleCase2(tc)
|
|
1113
1172
|
);
|
|
1114
1173
|
correctCount = resultsPerCase.reduce(
|
|
1115
1174
|
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
@@ -1177,19 +1236,387 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
|
1177
1236
|
"BFCL_v3_parallel_multiple_possible_answer.jsonl"
|
|
1178
1237
|
);
|
|
1179
1238
|
|
|
1180
|
-
// src/benchmarks/
|
|
1239
|
+
// src/benchmarks/complex-func-bench.ts
|
|
1181
1240
|
var import_node_fs3 = require("fs");
|
|
1182
1241
|
var import_node_path3 = __toESM(require("path"), 1);
|
|
1183
1242
|
var import_ai2 = require("ai");
|
|
1243
|
+
var LINE_SPLIT_REGEX2 = /\r?\n/;
|
|
1244
|
+
function standardizeString2(input) {
|
|
1245
|
+
if (typeof input !== "string") {
|
|
1246
|
+
return input;
|
|
1247
|
+
}
|
|
1248
|
+
return input.toLowerCase().trim();
|
|
1249
|
+
}
|
|
1250
|
+
function valuesMatch2(modelValue, expectedValue) {
|
|
1251
|
+
if (modelValue === expectedValue) {
|
|
1252
|
+
return true;
|
|
1253
|
+
}
|
|
1254
|
+
if (typeof modelValue === "string" && typeof expectedValue === "string") {
|
|
1255
|
+
return standardizeString2(modelValue) === standardizeString2(expectedValue);
|
|
1256
|
+
}
|
|
1257
|
+
if (typeof modelValue === "number" && typeof expectedValue === "string") {
|
|
1258
|
+
return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
|
|
1259
|
+
}
|
|
1260
|
+
if (typeof modelValue === "string" && typeof expectedValue === "number") {
|
|
1261
|
+
return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
|
|
1262
|
+
}
|
|
1263
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
|
|
1264
|
+
try {
|
|
1265
|
+
return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
|
|
1266
|
+
} catch (e) {
|
|
1267
|
+
return false;
|
|
1268
|
+
}
|
|
1269
|
+
}
|
|
1270
|
+
return false;
|
|
1271
|
+
}
|
|
1272
|
+
function validateFunctionName(modelFuncName, expectedFuncName) {
|
|
1273
|
+
if (modelFuncName !== expectedFuncName) {
|
|
1274
|
+
return {
|
|
1275
|
+
valid: false,
|
|
1276
|
+
error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
|
|
1277
|
+
error_type: "function_name_mismatch"
|
|
1278
|
+
};
|
|
1279
|
+
}
|
|
1280
|
+
return { valid: true };
|
|
1281
|
+
}
|
|
1282
|
+
function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
|
|
1283
|
+
for (const param of requiredParams) {
|
|
1284
|
+
if (!(param in modelArgs) && param in expectedArgs) {
|
|
1285
|
+
return {
|
|
1286
|
+
valid: false,
|
|
1287
|
+
error: `Missing required parameter: '${param}'`,
|
|
1288
|
+
error_type: "missing_required_param"
|
|
1289
|
+
};
|
|
1290
|
+
}
|
|
1291
|
+
}
|
|
1292
|
+
return { valid: true };
|
|
1293
|
+
}
|
|
1294
|
+
function validateParamValues(expectedArgs, modelArgs, requiredParams) {
|
|
1295
|
+
for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
|
|
1296
|
+
if (!(paramName in modelArgs)) {
|
|
1297
|
+
if (!requiredParams.includes(paramName)) {
|
|
1298
|
+
continue;
|
|
1299
|
+
}
|
|
1300
|
+
return {
|
|
1301
|
+
valid: false,
|
|
1302
|
+
error: `Missing parameter: '${paramName}'`,
|
|
1303
|
+
error_type: "missing_param"
|
|
1304
|
+
};
|
|
1305
|
+
}
|
|
1306
|
+
const modelValue = modelArgs[paramName];
|
|
1307
|
+
if (!valuesMatch2(modelValue, expectedValue)) {
|
|
1308
|
+
return {
|
|
1309
|
+
valid: false,
|
|
1310
|
+
error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
|
|
1311
|
+
error_type: "value_mismatch"
|
|
1312
|
+
};
|
|
1313
|
+
}
|
|
1314
|
+
}
|
|
1315
|
+
return { valid: true };
|
|
1316
|
+
}
|
|
1317
|
+
function checkFunctionCall(modelCall, expected, toolSpecs) {
|
|
1318
|
+
var _a, _b, _c, _d;
|
|
1319
|
+
const expectedFuncName = Object.keys(expected)[0];
|
|
1320
|
+
const expectedArgs = expected[expectedFuncName];
|
|
1321
|
+
const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
|
|
1322
|
+
const modelArgs = (_b = modelCall.args) != null ? _b : {};
|
|
1323
|
+
const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
|
|
1324
|
+
if (!nameResult.valid) {
|
|
1325
|
+
return nameResult;
|
|
1326
|
+
}
|
|
1327
|
+
const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
|
|
1328
|
+
const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
|
|
1329
|
+
const requiredResult = validateRequiredParams(
|
|
1330
|
+
requiredParams,
|
|
1331
|
+
modelArgs,
|
|
1332
|
+
expectedArgs
|
|
1333
|
+
);
|
|
1334
|
+
if (!requiredResult.valid) {
|
|
1335
|
+
return requiredResult;
|
|
1336
|
+
}
|
|
1337
|
+
return validateParamValues(expectedArgs, modelArgs, requiredParams);
|
|
1338
|
+
}
|
|
1339
|
+
function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
|
|
1340
|
+
if (modelCalls.length !== expectedCalls.length) {
|
|
1341
|
+
return {
|
|
1342
|
+
valid: false,
|
|
1343
|
+
error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
|
|
1344
|
+
error_type: "wrong_call_count"
|
|
1345
|
+
};
|
|
1346
|
+
}
|
|
1347
|
+
if (expectedCalls.length === 1) {
|
|
1348
|
+
return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
|
|
1349
|
+
}
|
|
1350
|
+
const matchedIndices = /* @__PURE__ */ new Set();
|
|
1351
|
+
for (const expected of expectedCalls) {
|
|
1352
|
+
let foundMatch = false;
|
|
1353
|
+
for (let i = 0; i < modelCalls.length; i++) {
|
|
1354
|
+
if (matchedIndices.has(i)) {
|
|
1355
|
+
continue;
|
|
1356
|
+
}
|
|
1357
|
+
const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
|
|
1358
|
+
if (result.valid) {
|
|
1359
|
+
matchedIndices.add(i);
|
|
1360
|
+
foundMatch = true;
|
|
1361
|
+
break;
|
|
1362
|
+
}
|
|
1363
|
+
}
|
|
1364
|
+
if (!foundMatch) {
|
|
1365
|
+
const expectedFuncName = Object.keys(expected)[0];
|
|
1366
|
+
return {
|
|
1367
|
+
valid: false,
|
|
1368
|
+
error: `Could not find matching call for function '${expectedFuncName}'`,
|
|
1369
|
+
error_type: "no_matching_call"
|
|
1370
|
+
};
|
|
1371
|
+
}
|
|
1372
|
+
}
|
|
1373
|
+
return { valid: true };
|
|
1374
|
+
}
|
|
1375
|
+
var fixSchemaType = (copy) => {
|
|
1376
|
+
if (!copy.type) {
|
|
1377
|
+
return;
|
|
1378
|
+
}
|
|
1379
|
+
if (copy.type === "dict") {
|
|
1380
|
+
copy.type = "object";
|
|
1381
|
+
}
|
|
1382
|
+
if (copy.type === "tuple") {
|
|
1383
|
+
copy.type = "array";
|
|
1384
|
+
}
|
|
1385
|
+
if (copy.type === "integer" || copy.type === "float") {
|
|
1386
|
+
copy.type = "number";
|
|
1387
|
+
}
|
|
1388
|
+
};
|
|
1389
|
+
var fixSchema = (schema) => {
|
|
1390
|
+
if (!schema || typeof schema !== "object") {
|
|
1391
|
+
return { type: "object", properties: {} };
|
|
1392
|
+
}
|
|
1393
|
+
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
|
|
1394
|
+
if (!Array.isArray(copy)) {
|
|
1395
|
+
fixSchemaType(copy);
|
|
1396
|
+
if (copy.properties && typeof copy.properties === "object") {
|
|
1397
|
+
for (const k of Object.keys(copy.properties)) {
|
|
1398
|
+
copy.properties[k] = fixSchema(
|
|
1399
|
+
copy.properties[k]
|
|
1400
|
+
);
|
|
1401
|
+
}
|
|
1402
|
+
}
|
|
1403
|
+
if (copy.items) {
|
|
1404
|
+
copy.items = fixSchema(copy.items);
|
|
1405
|
+
}
|
|
1406
|
+
}
|
|
1407
|
+
return copy;
|
|
1408
|
+
};
|
|
1409
|
+
function buildTools(tools) {
|
|
1410
|
+
const nameMap = /* @__PURE__ */ new Map();
|
|
1411
|
+
const transformedTools = tools.map((t) => {
|
|
1412
|
+
const fixed = fixSchema(t.parameters);
|
|
1413
|
+
const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
|
|
1414
|
+
const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
|
|
1415
|
+
nameMap.set(sanitized, t.name);
|
|
1416
|
+
return {
|
|
1417
|
+
type: "function",
|
|
1418
|
+
name: sanitized,
|
|
1419
|
+
description: t.description,
|
|
1420
|
+
inputSchema
|
|
1421
|
+
};
|
|
1422
|
+
});
|
|
1423
|
+
const toolsMap = Object.fromEntries(
|
|
1424
|
+
transformedTools.map((t) => [
|
|
1425
|
+
t.name,
|
|
1426
|
+
(0, import_ai2.tool)({
|
|
1427
|
+
description: typeof t.description === "string" ? t.description : void 0,
|
|
1428
|
+
inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
|
|
1429
|
+
})
|
|
1430
|
+
])
|
|
1431
|
+
);
|
|
1432
|
+
return { nameMap, toolsMap };
|
|
1433
|
+
}
|
|
1434
|
+
async function mapWithConcurrency(items, concurrencyLimit, mapper) {
|
|
1435
|
+
const results = new Array(items.length);
|
|
1436
|
+
let idx = 0;
|
|
1437
|
+
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
1438
|
+
while (true) {
|
|
1439
|
+
const current = idx;
|
|
1440
|
+
idx += 1;
|
|
1441
|
+
if (current >= items.length) {
|
|
1442
|
+
break;
|
|
1443
|
+
}
|
|
1444
|
+
results[current] = await mapper(items[current]);
|
|
1445
|
+
}
|
|
1446
|
+
});
|
|
1447
|
+
await Promise.all(workers);
|
|
1448
|
+
return results;
|
|
1449
|
+
}
|
|
1450
|
+
async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
|
|
1451
|
+
const caseLogs = [];
|
|
1452
|
+
const { function: tools, question: messages } = testCase;
|
|
1453
|
+
try {
|
|
1454
|
+
const { nameMap, toolsMap } = buildTools(tools);
|
|
1455
|
+
const debugSummaryRef = {};
|
|
1456
|
+
const providerOptions = {
|
|
1457
|
+
toolCallMiddleware: { debugSummary: debugSummaryRef }
|
|
1458
|
+
};
|
|
1459
|
+
const { toolCalls, finishReason } = await (0, import_ai2.generateText)({
|
|
1460
|
+
model,
|
|
1461
|
+
messages,
|
|
1462
|
+
tools: toolsMap,
|
|
1463
|
+
toolChoice: "auto",
|
|
1464
|
+
providerOptions,
|
|
1465
|
+
...temperature !== void 0 ? { temperature } : {},
|
|
1466
|
+
...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
|
|
1467
|
+
});
|
|
1468
|
+
const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
|
|
1469
|
+
var _a, _b, _c, _d;
|
|
1470
|
+
const rawName = (_a = c.toolName) != null ? _a : c.name;
|
|
1471
|
+
const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
|
|
1472
|
+
return {
|
|
1473
|
+
toolName: originalName,
|
|
1474
|
+
name: originalName,
|
|
1475
|
+
args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
|
|
1476
|
+
};
|
|
1477
|
+
});
|
|
1478
|
+
caseLogs.push(
|
|
1479
|
+
`[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
|
|
1480
|
+
);
|
|
1481
|
+
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1482
|
+
if (!possibleAnswer) {
|
|
1483
|
+
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1484
|
+
}
|
|
1485
|
+
const checkerResult = checkAllFunctionCalls(
|
|
1486
|
+
restoredCalls,
|
|
1487
|
+
possibleAnswer.ground_truth,
|
|
1488
|
+
tools
|
|
1489
|
+
);
|
|
1490
|
+
if (checkerResult.valid) {
|
|
1491
|
+
caseLogs.push(`[PASS] ${testCase.id}`);
|
|
1492
|
+
return { valid: true, logs: caseLogs };
|
|
1493
|
+
}
|
|
1494
|
+
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
1495
|
+
return { valid: false, logs: caseLogs };
|
|
1496
|
+
} catch (e) {
|
|
1497
|
+
caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
|
|
1498
|
+
return { valid: false, logs: caseLogs };
|
|
1499
|
+
}
|
|
1500
|
+
}
|
|
1501
|
+
/**
 * Loads benchmark test cases from a line-delimited JSON file.
 *
 * The file is read as UTF-8, split with LINE_SPLIT_REGEX2 (defined elsewhere
 * in this bundle; presumably a newline pattern), blank lines are skipped and
 * every remaining line is parsed with JSON.parse.
 *
 * @param {string} dataPath - Directory that contains the data file.
 * @param {string} testDataFile - File name, joined onto `dataPath`.
 * @returns {Promise<Array<*>>} Parsed records in file order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON; the message
 *   names the file and the 1-based line number.
 * @throws {Error} Whatever `readFile` rejects with (e.g. missing file).
 */
async function loadTestData(dataPath, testDataFile) {
  const filePath = import_node_path3.default.join(dataPath, testDataFile);
  const raw = await import_node_fs3.promises.readFile(filePath, "utf-8");
  const lines = raw.split(LINE_SPLIT_REGEX2);
  const testCases = [];
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (line.trim().length === 0) {
      continue;
    }
    try {
      testCases.push(JSON.parse(line));
    } catch (e) {
      // A bare JSON.parse error does not say which of the many records is
      // broken, so attach the file and the physical line number.
      throw new SyntaxError(
        `Invalid JSON in ${filePath} at line ${i + 1}: ${e.message}`
      );
    }
  }
  return testCases;
}
|
|
1508
|
+
/**
 * Loads the expected answers from a line-delimited JSON file and indexes
 * them by their `id` property.
 *
 * The file is read as UTF-8, split with LINE_SPLIT_REGEX2 (defined elsewhere
 * in this bundle; presumably a newline pattern), blank lines are skipped and
 * every remaining line is parsed with JSON.parse.
 *
 * @param {string} dataPath - Directory that contains the data file.
 * @param {string} answerDataFile - File name, joined onto `dataPath`.
 * @returns {Promise<Map<*, *>>} Map from `record.id` to the parsed record.
 *   When two records share an id, the later one wins (Map semantics).
 * @throws {SyntaxError} When a non-blank line is not valid JSON; the message
 *   names the file and the 1-based line number.
 * @throws {Error} Whatever `readFile` rejects with (e.g. missing file).
 */
async function loadAnswerData(dataPath, answerDataFile) {
  const filePath = import_node_path3.default.join(dataPath, answerDataFile);
  const raw = await import_node_fs3.promises.readFile(filePath, "utf-8");
  const lines = raw.split(LINE_SPLIT_REGEX2);
  const answersById = new Map();
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (line.trim().length === 0) {
      continue;
    }
    let answer;
    try {
      answer = JSON.parse(line);
    } catch (e) {
      // Point at the broken record instead of surfacing a context-free
      // JSON.parse message.
      throw new SyntaxError(
        `Invalid JSON in ${filePath} at line ${i + 1}: ${e.message}`
      );
    }
    answersById.set(answer.id, answer);
  }
  return answersById;
}
|
|
1516
|
+
/**
 * Collects the run-time settings for the ComplexFuncBench benchmark.
 *
 * `limit` and `concurrency` come from the environment variables
 * COMPLEXFUNCBENCH_LIMIT and COMPLEXFUNCBENCH_CONCURRENCY; `temperature` and
 * `maxTokens` come from `config` and are only accepted when they are numbers.
 *
 * @param {object} [config] - Optional benchmark config.
 * @returns {{limit: (number|undefined), concurrency: number,
 *   temperature: (number|undefined), maxTokens: (number|undefined)}}
 *   `limit` is a positive integer or undefined (no limit); `concurrency` is
 *   an integer >= 1 and defaults to 4.
 */
function getConfigValues(config) {
  // Env values are free-form strings: treat blank or non-numeric input as
  // "not set" and round fractions down so callers always get whole counts.
  const parseEnvInteger = (raw) => {
    if (typeof raw !== "string" || raw.trim().length === 0) {
      return undefined;
    }
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? Math.floor(parsed) : undefined;
  };

  const parsedLimit = parseEnvInteger(process.env.COMPLEXFUNCBENCH_LIMIT);
  const limit =
    parsedLimit !== undefined && parsedLimit > 0 ? parsedLimit : undefined;

  const parsedConcurrency = parseEnvInteger(
    process.env.COMPLEXFUNCBENCH_CONCURRENCY
  );
  const concurrency =
    parsedConcurrency !== undefined ? Math.max(1, parsedConcurrency) : 4;

  const hasConfig = config != null;
  const temperature =
    hasConfig && typeof config.temperature === "number"
      ? config.temperature
      : undefined;
  const maxTokens =
    hasConfig && typeof config.maxTokens === "number"
      ? config.maxTokens
      : undefined;

  return { limit, concurrency, temperature, maxTokens };
}
|
|
1525
|
+
/**
 * Folds the per-case outcomes into a single benchmark result.
 *
 * @param {Array<{valid: boolean, logs?: string[]}>} resultsPerCase - One
 *   entry per executed test case.
 * @param {Array<*>} testCases - The test cases that were run; only its
 *   length is used (as the denominator of the score).
 * @returns {{score: number, success: boolean, metrics: object, logs: string[]}}
 *   With no test cases: score 0, success false, empty metrics and a single
 *   "No test cases found." log line. Otherwise `score` is
 *   correct / total and `success` is true only when score is above 0.5.
 */
function aggregateResults(resultsPerCase, testCases) {
  const totalCases = testCases.length;
  if (totalCases === 0) {
    return {
      score: 0,
      success: false,
      metrics: {},
      logs: ["No test cases found."]
    };
  }

  const logs = [];
  let correctCount = 0;
  for (const caseResult of resultsPerCase) {
    if (caseResult.valid) {
      correctCount += 1;
    }
    // A case result without logs must not abort the whole aggregation.
    if (Array.isArray(caseResult.logs)) {
      for (const entry of caseResult.logs) {
        logs.push(entry);
      }
    }
  }

  const score = correctCount / totalCases;
  return {
    score,
    success: score > 0.5,
    metrics: {
      correct_count: correctCount,
      total_cases: totalCases,
      accuracy: score
    },
    logs
  };
}
|
|
1554
|
+
/**
 * Builds a benchmark object for a ComplexFuncBench-style data set.
 *
 * @param {string} name - Benchmark name; also used in the fatal log line.
 * @param {string} description - Human-readable description.
 * @param {string} testDataFile - Test-case file name inside the data dir.
 * @param {string} answerDataFile - Expected-answer file name inside the data dir.
 * @returns {{name: string, version: string, description: string,
 *   run: function(*, object=): Promise<object>}} Benchmark whose `run`
 *   never rejects: any failure is reported as a result with `score: 0`,
 *   `success: false`, an `error` and a single "[FATAL]" log line.
 */
function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
  return {
    name,
    version: "1.0.0",
    description,
    /**
     * Runs every (optionally limited) test case against `model` and
     * aggregates the outcome.
     */
    async run(model, config) {
      const logs = [];
      try {
        const dataPath = resolveDataDir();
        logs.push(`[INFO] Using data dir: ${dataPath}`);
        let testCases = await loadTestData(dataPath, testDataFile);
        const possibleAnswersMap = await loadAnswerData(
          dataPath,
          answerDataFile
        );
        const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
        if (limit && Number.isFinite(limit) && limit > 0) {
          testCases = testCases.slice(0, limit);
          logs.push(`[INFO] Limiting test cases to ${limit}`);
        }
        logs.push(
          `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
        );
        const resultsPerCase = await mapWithConcurrency(
          testCases,
          concurrency,
          (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
        );
        const result = aggregateResults(resultsPerCase, testCases);
        // Setup logs go first so the report reads chronologically.
        const caseLogs = result.logs != null ? result.logs : [];
        result.logs = [...logs, ...caseLogs];
        return result;
      } catch (e) {
        // Anything can be thrown in JavaScript (null, strings, ...). Reading
        // `.message` off such a value would either throw inside this handler
        // or log "undefined", so normalise to an Error first.
        const error = e instanceof Error ? e : new Error(String(e));
        return {
          score: 0,
          success: false,
          metrics: {},
          error,
          logs: [
            `[FATAL] Failed to run benchmark ${name}: ${error.message}`
          ]
        };
      }
    }
  };
}
|
|
1600
|
+
// Ready-made ComplexFuncBench benchmark, wired to the two JSONL files that
// ship in the package's data directory. Declared with `var` like the other
// top-level bindings of this bundle.
var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
  "complex-func-bench",
  "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
  "ComplexFuncBench.jsonl",
  "ComplexFuncBench_possible_answer.jsonl"
);
|
|
1606
|
+
|
|
1607
|
+
// src/benchmarks/json-generation.ts
|
|
1608
|
+
var import_node_fs4 = require("fs");
|
|
1609
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
1610
|
+
var import_ai3 = require("ai");
|
|
1184
1611
|
var import_ajv = __toESM(require("ajv"), 1);
|
|
1185
1612
|
var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
|
|
1186
1613
|
var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
|
|
1187
1614
|
var NEWLINE_REGEX = /\r?\n/;
|
|
1188
|
-
var
|
|
1615
|
+
var LINE_SPLIT_REGEX3 = /\r?\n/;
|
|
1189
1616
|
/**
 * Attempts to parse `text` as JSON in one shot.
 *
 * @param {string} text - Candidate JSON text.
 * @returns {*} The parsed value, or undefined when `text` is not valid JSON.
 */
function tryDirectParse(text) {
  try {
    return JSON.parse(text);
  } catch (e) {
    // Not JSON: report "no result" instead of propagating the parse error.
    return;
  }
}
|
|
@@ -1201,7 +1628,7 @@ function tryCodeFenceParse(text) {
|
|
|
1201
1628
|
const inner = fenceMatch[1].trim();
|
|
1202
1629
|
try {
|
|
1203
1630
|
return JSON.parse(inner);
|
|
1204
|
-
} catch {
|
|
1631
|
+
} catch (e) {
|
|
1205
1632
|
return;
|
|
1206
1633
|
}
|
|
1207
1634
|
}
|
|
@@ -1226,7 +1653,7 @@ function tryBracketScan(text) {
|
|
|
1226
1653
|
const candidate = text.slice(start, i + 1);
|
|
1227
1654
|
try {
|
|
1228
1655
|
return JSON.parse(candidate);
|
|
1229
|
-
} catch {
|
|
1656
|
+
} catch (e) {
|
|
1230
1657
|
return;
|
|
1231
1658
|
}
|
|
1232
1659
|
}
|
|
@@ -1274,12 +1701,12 @@ function subsetMatch(expected, actual) {
|
|
|
1274
1701
|
async function loadDatasets() {
|
|
1275
1702
|
try {
|
|
1276
1703
|
const dataDir = resolveDataDir();
|
|
1277
|
-
const testsJsonl = await
|
|
1278
|
-
|
|
1704
|
+
const testsJsonl = await import_node_fs4.promises.readFile(
|
|
1705
|
+
import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
1279
1706
|
"utf-8"
|
|
1280
1707
|
);
|
|
1281
|
-
const expectedJsonl = await
|
|
1282
|
-
|
|
1708
|
+
const expectedJsonl = await import_node_fs4.promises.readFile(
|
|
1709
|
+
import_node_path4.default.join(dataDir, "json_generation_expected.jsonl"),
|
|
1283
1710
|
"utf-8"
|
|
1284
1711
|
);
|
|
1285
1712
|
const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -1335,10 +1762,11 @@ function validateTestCase(tc, parsed, context) {
|
|
|
1335
1762
|
return { valid, valuesOk, parsed };
|
|
1336
1763
|
}
|
|
1337
1764
|
async function processTestCase(tc, context) {
|
|
1765
|
+
var _a;
|
|
1338
1766
|
const messages = buildMessages(tc);
|
|
1339
|
-
const temp = context.config
|
|
1767
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1340
1768
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1341
|
-
const { text } = await (0,
|
|
1769
|
+
const { text } = await (0, import_ai3.generateText)({
|
|
1342
1770
|
model: context.model,
|
|
1343
1771
|
messages,
|
|
1344
1772
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1346,7 +1774,7 @@ async function processTestCase(tc, context) {
|
|
|
1346
1774
|
let parsed;
|
|
1347
1775
|
try {
|
|
1348
1776
|
parsed = extractFirstJsonBlock(text);
|
|
1349
|
-
} catch {
|
|
1777
|
+
} catch (e) {
|
|
1350
1778
|
}
|
|
1351
1779
|
if (parsed === void 0) {
|
|
1352
1780
|
context.validation.logs.push(
|
|
@@ -1440,21 +1868,22 @@ function buildBenchmarkResult(total, counts, logs) {
|
|
|
1440
1868
|
/**
 * Loads the schema-only JSON-generation test cases from
 * "json_generation_tests.jsonl" in the resolved data directory.
 *
 * The file is read as UTF-8, split with LINE_SPLIT_REGEX3, blank lines are
 * skipped and every remaining line is parsed with JSON.parse.
 *
 * @returns {Promise<{tests: Array<*>, error?: *}>} `{ tests }` on success.
 *   This function never rejects: on any failure it resolves to
 *   `{ tests: [], error }`. For a malformed line, `error` is a SyntaxError
 *   whose message names the file and the 1-based line number.
 */
async function loadSchemaOnlyTests() {
  try {
    const dataDir = resolveDataDir();
    const filePath = import_node_path4.default.join(
      dataDir,
      "json_generation_tests.jsonl"
    );
    const testsJsonl = await import_node_fs4.promises.readFile(filePath, "utf-8");
    const lines = testsJsonl.split(LINE_SPLIT_REGEX3);
    const tests = [];
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      if (line.trim().length === 0) {
        continue;
      }
      try {
        tests.push(JSON.parse(line));
      } catch (parseError) {
        // Say which record is broken; the outer catch turns this into the
        // `{ tests: [], error }` result.
        throw new SyntaxError(
          `Invalid JSON in ${filePath} at line ${i + 1}: ${parseError.message}`
        );
      }
    }
    return { tests };
  } catch (e) {
    return { tests: [], error: e };
  }
}
|
|
1453
1881
|
async function processSchemaOnlyTestCase(tc, context) {
|
|
1882
|
+
var _a;
|
|
1454
1883
|
const messages = buildMessages(tc);
|
|
1455
|
-
const temp = context.config
|
|
1884
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1456
1885
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1457
|
-
const { text } = await (0,
|
|
1886
|
+
const { text } = await (0, import_ai3.generateText)({
|
|
1458
1887
|
model: context.model,
|
|
1459
1888
|
messages,
|
|
1460
1889
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1462,7 +1891,7 @@ async function processSchemaOnlyTestCase(tc, context) {
|
|
|
1462
1891
|
let parsed;
|
|
1463
1892
|
try {
|
|
1464
1893
|
parsed = extractFirstJsonBlock(text);
|
|
1465
|
-
} catch {
|
|
1894
|
+
} catch (e) {
|
|
1466
1895
|
}
|
|
1467
1896
|
if (parsed === void 0) {
|
|
1468
1897
|
context.logs.push(
|
|
@@ -1539,8 +1968,56 @@ var colors = {
|
|
|
1539
1968
|
yellow: "\x1B[33m",
|
|
1540
1969
|
cyan: "\x1B[36m",
|
|
1541
1970
|
magenta: "\x1B[35m",
|
|
1542
|
-
gray: "\x1B[90m"
|
|
1971
|
+
gray: "\x1B[90m",
|
|
1972
|
+
white: "\x1B[37m",
|
|
1973
|
+
bgRed: "\x1B[41m"
|
|
1543
1974
|
};
|
|
1975
|
+
/**
 * Colourises the lines of a textual diff for console output.
 *
 * Lines starting with "-" are wrapped in `colors.red`, lines starting with
 * "+" in `colors.green`, lines starting with "@@" in `colors.cyan`; other
 * lines are left untouched. The lines are joined with "\n ".
 *
 * @param {Array<*>} diff - Diff lines; non-string entries are stringified.
 * @returns {string} The formatted diff, or "" when `diff` is not an array
 *   or is empty.
 */
function formatDiff(diff) {
  if (!Array.isArray(diff) || diff.length === 0) {
    return "";
  }
  const paint = (color, text) => `${color}${text}${colors.reset}`;
  return diff.map((entry) => {
    // Diff payloads come out of JSON.parse, so an entry is not guaranteed
    // to be a string; coerce before calling string methods on it.
    const line = String(entry);
    if (line.startsWith("-")) {
      return paint(colors.red, line);
    }
    if (line.startsWith("+")) {
      return paint(colors.green, line);
    }
    if (line.startsWith("@@")) {
      return paint(colors.cyan, line);
    }
    return line;
  }).join("\n ");
}
|
|
1992
|
+
/**
 * Prints a readable summary for every "[DEBUG-FAIL]" log entry.
 *
 * The text after the "[DEBUG-FAIL]" marker is parsed as JSON. The case id,
 * error type ("unknown" when absent) and message are always printed; the
 * diff is printed when it is an array; expected/actual are printed when
 * both are present and each serialises to fewer than 100 characters.
 * Entries whose payload cannot be processed are printed verbatim as a
 * "Raw Log" line.
 *
 * @param {Array<*>} logs - Benchmark log lines; entries that are not strings
 *   or do not start with "[DEBUG-FAIL]" are ignored.
 * @returns {void}
 */
function printFailLogs(logs) {
  const marker = "[DEBUG-FAIL]";
  for (const log of logs) {
    if (typeof log !== "string" || !log.startsWith(marker)) {
      continue;
    }
    try {
      // JSON.parse tolerates leading whitespace, so the payload parses
      // whether or not a space follows the marker.
      const data = JSON.parse(log.slice(marker.length));
      console.log(`\n${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
      console.log(
        ` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
      );
      console.log(` Message: ${data.message}`);
      if (Array.isArray(data.diff)) {
        console.log(` Diff:\n${formatDiff(data.diff)}`);
      }
      // Compare against undefined rather than truthiness: 0, false, "" and
      // null are legitimate expected/actual values worth showing.
      if (data.expected !== undefined && data.actual !== undefined) {
        const expStr = JSON.stringify(data.expected);
        const actStr = JSON.stringify(data.actual);
        // Only short values are echoed; long ones are left to the diff.
        if (expStr.length < 100 && actStr.length < 100) {
          console.log(` Expected: ${colors.gray}${expStr}${colors.reset}`);
          console.log(` Actual: ${colors.gray}${actStr}${colors.reset}`);
        }
      }
    } catch (_e) {
      console.log(` Raw Log: ${log}`);
    }
  }
}
|
|
1544
2021
|
function printResult(result) {
|
|
1545
2022
|
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
1546
2023
|
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
@@ -1563,6 +2040,18 @@ function printResult(result) {
|
|
|
1563
2040
|
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
1564
2041
|
);
|
|
1565
2042
|
}
|
|
2043
|
+
if (!benchmarkResult.success && benchmarkResult.logs) {
|
|
2044
|
+
printFailLogs(benchmarkResult.logs);
|
|
2045
|
+
const failLogs = benchmarkResult.logs.filter(
|
|
2046
|
+
(l) => l.startsWith("[DEBUG-FAIL]")
|
|
2047
|
+
);
|
|
2048
|
+
if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
|
|
2049
|
+
console.log(" Raw Logs (Sample):");
|
|
2050
|
+
for (const l of benchmarkResult.logs.slice(0, 10)) {
|
|
2051
|
+
console.log(` ${l}`);
|
|
2052
|
+
}
|
|
2053
|
+
}
|
|
2054
|
+
}
|
|
1566
2055
|
}
|
|
1567
2056
|
function consoleReporter(results) {
|
|
1568
2057
|
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
@@ -1617,14 +2106,14 @@ function hasFunctionNameIssue(diff) {
|
|
|
1617
2106
|
);
|
|
1618
2107
|
}
|
|
1619
2108
|
function suggestFunctionNameFix(expected, actual, suggestions) {
|
|
1620
|
-
const expectedName = expected
|
|
1621
|
-
const actualName = actual
|
|
2109
|
+
const expectedName = expected == null ? void 0 : expected.function;
|
|
2110
|
+
const actualName = actual == null ? void 0 : actual.function;
|
|
1622
2111
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
1623
2112
|
suggestions.push(
|
|
1624
2113
|
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
1625
2114
|
);
|
|
1626
2115
|
}
|
|
1627
|
-
if (Array.isArray(expected
|
|
2116
|
+
if (Array.isArray(expected == null ? void 0 : expected.functions)) {
|
|
1628
2117
|
suggestions.push(
|
|
1629
2118
|
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
1630
2119
|
);
|
|
@@ -1679,7 +2168,7 @@ function suggestFromErrorType(error_type, suggestions) {
|
|
|
1679
2168
|
}
|
|
1680
2169
|
function suggestFixFromDiff(parsed) {
|
|
1681
2170
|
const suggestions = [];
|
|
1682
|
-
const { error_type, expected, actual, diff } = parsed
|
|
2171
|
+
const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
|
|
1683
2172
|
if (!Array.isArray(diff)) {
|
|
1684
2173
|
if (suggestions.length === 0 && typeof error_type === "string") {
|
|
1685
2174
|
suggestFromErrorType(error_type, suggestions);
|
|
@@ -1704,15 +2193,16 @@ function suggestFixFromDiff(parsed) {
|
|
|
1704
2193
|
return uniqueLines(suggestions);
|
|
1705
2194
|
}
|
|
1706
2195
|
function getTestIdFromLogLine(line) {
|
|
2196
|
+
var _a, _b;
|
|
1707
2197
|
if (line.startsWith("[FAIL]")) {
|
|
1708
2198
|
const m = line.match(FAIL_ID_REGEX);
|
|
1709
|
-
return m
|
|
2199
|
+
return m == null ? void 0 : m[1];
|
|
1710
2200
|
}
|
|
1711
2201
|
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
1712
2202
|
try {
|
|
1713
2203
|
const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1714
|
-
return String(parsed
|
|
1715
|
-
} catch {
|
|
2204
|
+
return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
|
|
2205
|
+
} catch (e) {
|
|
1716
2206
|
}
|
|
1717
2207
|
}
|
|
1718
2208
|
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
@@ -1720,18 +2210,19 @@ function getTestIdFromLogLine(line) {
|
|
|
1720
2210
|
const parsed = JSON.parse(
|
|
1721
2211
|
line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
|
|
1722
2212
|
);
|
|
1723
|
-
return String(parsed
|
|
1724
|
-
} catch {
|
|
2213
|
+
return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
|
|
2214
|
+
} catch (e) {
|
|
1725
2215
|
}
|
|
1726
2216
|
}
|
|
1727
2217
|
return;
|
|
1728
2218
|
}
|
|
1729
2219
|
function groupLogsByTestId(failLogs) {
|
|
2220
|
+
var _a;
|
|
1730
2221
|
const byId = /* @__PURE__ */ new Map();
|
|
1731
2222
|
for (const line of failLogs) {
|
|
1732
2223
|
const id = getTestIdFromLogLine(line);
|
|
1733
|
-
const key = id
|
|
1734
|
-
const arr = byId.get(key)
|
|
2224
|
+
const key = id != null ? id : "__general__";
|
|
2225
|
+
const arr = (_a = byId.get(key)) != null ? _a : [];
|
|
1735
2226
|
arr.push(line);
|
|
1736
2227
|
byId.set(key, arr);
|
|
1737
2228
|
}
|
|
@@ -1743,10 +2234,10 @@ function collectDebugIds(lines) {
|
|
|
1743
2234
|
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
1744
2235
|
try {
|
|
1745
2236
|
const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1746
|
-
if (parsed
|
|
2237
|
+
if (parsed == null ? void 0 : parsed.id) {
|
|
1747
2238
|
debugIds.add(String(parsed.id));
|
|
1748
2239
|
}
|
|
1749
|
-
} catch {
|
|
2240
|
+
} catch (e) {
|
|
1750
2241
|
}
|
|
1751
2242
|
}
|
|
1752
2243
|
}
|
|
@@ -1782,7 +2273,7 @@ function displayDebugFailLine(line) {
|
|
|
1782
2273
|
console.log(` \u2022 ${s}`);
|
|
1783
2274
|
}
|
|
1784
2275
|
}
|
|
1785
|
-
} catch {
|
|
2276
|
+
} catch (e) {
|
|
1786
2277
|
console.log(` ${line}`);
|
|
1787
2278
|
}
|
|
1788
2279
|
}
|
|
@@ -1826,14 +2317,14 @@ function displayDebugFailContextLine(line) {
|
|
|
1826
2317
|
const ctx = JSON.parse(payload);
|
|
1827
2318
|
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
1828
2319
|
displayContextInfo(ctx);
|
|
1829
|
-
} catch {
|
|
2320
|
+
} catch (e) {
|
|
1830
2321
|
console.log(` ${line}`);
|
|
1831
2322
|
}
|
|
1832
2323
|
}
|
|
1833
2324
|
function displayLogLine(line, debugIds) {
|
|
1834
2325
|
if (line.startsWith("[FAIL]")) {
|
|
1835
2326
|
const m = line.match(FAIL_ID_REGEX);
|
|
1836
|
-
const failId = m
|
|
2327
|
+
const failId = m == null ? void 0 : m[1];
|
|
1837
2328
|
if (failId && debugIds.has(failId)) {
|
|
1838
2329
|
return;
|
|
1839
2330
|
}
|
|
@@ -1903,11 +2394,12 @@ function displayResultHeader(r) {
|
|
|
1903
2394
|
);
|
|
1904
2395
|
}
|
|
1905
2396
|
function consoleDebugReporter(results) {
|
|
2397
|
+
var _a;
|
|
1906
2398
|
console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
|
|
1907
2399
|
for (const r of results) {
|
|
1908
2400
|
displayResultHeader(r);
|
|
1909
2401
|
displayMetrics(Object.entries(r.result.metrics));
|
|
1910
|
-
if (r.result.logs
|
|
2402
|
+
if ((_a = r.result.logs) == null ? void 0 : _a.length) {
|
|
1911
2403
|
displayResultLogs(r.result.logs);
|
|
1912
2404
|
}
|
|
1913
2405
|
}
|
|
@@ -1916,13 +2408,16 @@ function consoleDebugReporter(results) {
|
|
|
1916
2408
|
|
|
1917
2409
|
// src/reporters/json.ts
|
|
1918
2410
|
/**
 * Prints the evaluation results to stdout as pretty-printed JSON.
 *
 * Each entry is shallow-copied and its `result.error` is replaced by plain
 * data, because Error instances serialise to "{}" with JSON.stringify.
 *
 * @param {Array<{result: {error?: *}}>} results - Evaluation results.
 * @returns {void}
 */
function jsonReporter(results) {
  const serializableResults = results.map((r) => {
    const rawError = r.result.error;
    let error;
    if (rawError != null) {
      // Prefer the message; fall back to the stringified value so a thrown
      // string (or other non-Error) is not silently dropped from the report.
      error = rawError.message !== undefined ? rawError.message : String(rawError);
    }
    return {
      ...r,
      result: {
        ...r.result,
        error
      }
    };
  });
  console.log(JSON.stringify(serializableResults, null, 2));
}
|
|
1928
2423
|
|
|
@@ -2035,6 +2530,7 @@ async function evaluate(options) {
|
|
|
2035
2530
|
bfclParallelBenchmark,
|
|
2036
2531
|
bfclParallelMultipleBenchmark,
|
|
2037
2532
|
bfclSimpleBenchmark,
|
|
2533
|
+
complexFuncBenchBenchmark,
|
|
2038
2534
|
evaluate,
|
|
2039
2535
|
jsonGenerationBenchmark,
|
|
2040
2536
|
jsonGenerationSchemaOnlyBenchmark
|