@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0-canary.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/ComplexFuncBench.jsonl +1000 -0
- package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
- package/dist/index.cjs +587 -91
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +29 -9
- package/dist/index.d.ts +29 -9
- package/dist/index.js +591 -92
- package/dist/index.js.map +1 -1
- package/package.json +17 -11
package/dist/index.js
CHANGED
|
@@ -23,7 +23,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
|
|
|
23
23
|
if (fs.existsSync(dataAtRoot)) {
|
|
24
24
|
return dataAtRoot;
|
|
25
25
|
}
|
|
26
|
-
} catch {
|
|
26
|
+
} catch (e) {
|
|
27
27
|
}
|
|
28
28
|
return null;
|
|
29
29
|
}
|
|
@@ -37,7 +37,7 @@ function tryResolveViaPackageJson(moduleUrl) {
|
|
|
37
37
|
if (fs.existsSync(dataAtPkg)) {
|
|
38
38
|
return dataAtPkg;
|
|
39
39
|
}
|
|
40
|
-
} catch {
|
|
40
|
+
} catch (e) {
|
|
41
41
|
}
|
|
42
42
|
return null;
|
|
43
43
|
}
|
|
@@ -45,7 +45,7 @@ function getStartDir(moduleUrl) {
|
|
|
45
45
|
if (moduleUrl) {
|
|
46
46
|
try {
|
|
47
47
|
return path.dirname(fileURLToPath(moduleUrl));
|
|
48
|
-
} catch {
|
|
48
|
+
} catch (e) {
|
|
49
49
|
return process.cwd();
|
|
50
50
|
}
|
|
51
51
|
}
|
|
@@ -139,7 +139,7 @@ function valuesMatch(modelValue, possibleValue) {
|
|
|
139
139
|
const normalizedModel = normalizeObject(modelValue);
|
|
140
140
|
const normalizedPossible = normalizeObject(possibleValue);
|
|
141
141
|
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
142
|
-
} catch {
|
|
142
|
+
} catch (e) {
|
|
143
143
|
return false;
|
|
144
144
|
}
|
|
145
145
|
}
|
|
@@ -268,7 +268,7 @@ function checkSingleParameter(paramName, modelValue, context) {
|
|
|
268
268
|
return checkStringValue(
|
|
269
269
|
paramName,
|
|
270
270
|
modelValue,
|
|
271
|
-
possibleValues
|
|
271
|
+
possibleValues != null ? possibleValues : []
|
|
272
272
|
);
|
|
273
273
|
}
|
|
274
274
|
if (Array.isArray(modelValue)) {
|
|
@@ -368,6 +368,37 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
368
368
|
// src/benchmarks/bfcl.ts
|
|
369
369
|
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
370
370
|
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
371
|
+
/**
 * Render one ground-truth call (`{ funcName: { param: value | value[] } }`)
 * as an XML-like snippet for the debug dump.
 *
 * Only the first key of `call` is treated as the function name. When a
 * parameter value is an array, only its first element is shown.
 *
 * @param {object|null|undefined} call - A single ground-truth entry.
 * @returns {string} The rendered snippet, or a self-closing placeholder when
 *   there is nothing to render.
 */
function convertGroundTruthToXML(call) {
  // Object.keys() throws on null/undefined; a malformed entry must not abort
  // the whole debug dump, so it is rendered like an empty call.
  if (call === null || typeof call !== "object") {
    return "<empty_call />";
  }
  const keys = Object.keys(call);
  if (keys.length === 0) {
    return "<empty_call />";
  }
  const funcName = keys[0];
  if (!funcName) {
    // An empty-string key cannot be used as a tag name.
    return "<undefined_function />";
  }
  const params = call[funcName];
  if (!params || typeof params !== "object") {
    return `<${funcName} />`;
  }
  const lines = [`<${funcName}>`];
  for (const [key, value] of Object.entries(params)) {
    const displayValue = Array.isArray(value) ? value[0] : value;
    let valueStr;
    if (typeof displayValue === "string") {
      valueStr = displayValue;
    } else if (displayValue === null || displayValue === void 0) {
      valueStr = "";
    } else {
      valueStr = JSON.stringify(displayValue);
    }
    lines.push(` <${key}>${valueStr}</${key}>`);
  }
  lines.push(`</${funcName}>`);
  return lines.join("\n");
}
|
|
371
402
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
372
403
|
const category = testCase.id.split("_")[0];
|
|
373
404
|
try {
|
|
@@ -448,7 +479,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
448
479
|
`[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
|
|
449
480
|
);
|
|
450
481
|
}
|
|
451
|
-
const
|
|
482
|
+
const fixSchemaType2 = (copy) => {
|
|
452
483
|
if (!copy.type) {
|
|
453
484
|
return;
|
|
454
485
|
}
|
|
@@ -472,16 +503,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
472
503
|
);
|
|
473
504
|
}
|
|
474
505
|
};
|
|
475
|
-
const
|
|
506
|
+
const fixSchema2 = (schema) => {
|
|
476
507
|
if (!schema || typeof schema !== "object") {
|
|
477
508
|
return { type: "object", properties: {} };
|
|
478
509
|
}
|
|
479
|
-
const copy = Array.isArray(schema) ? schema.map((v) =>
|
|
510
|
+
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
|
|
480
511
|
if (!Array.isArray(copy)) {
|
|
481
|
-
|
|
482
|
-
fixSchemaProperties(copy,
|
|
512
|
+
fixSchemaType2(copy);
|
|
513
|
+
fixSchemaProperties(copy, fixSchema2);
|
|
483
514
|
if (copy.items) {
|
|
484
|
-
copy.items =
|
|
515
|
+
copy.items = fixSchema2(copy.items);
|
|
485
516
|
}
|
|
486
517
|
return copy;
|
|
487
518
|
}
|
|
@@ -516,13 +547,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
516
547
|
try {
|
|
517
548
|
const arr = JSON.parse(raw);
|
|
518
549
|
return Array.isArray(arr) ? arr : [];
|
|
519
|
-
} catch {
|
|
550
|
+
} catch (e) {
|
|
520
551
|
return [];
|
|
521
552
|
}
|
|
522
553
|
};
|
|
523
554
|
const getSanitizedName = (rawName, transformedTools) => {
|
|
555
|
+
var _a, _b;
|
|
524
556
|
if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
|
|
525
|
-
return transformedTools[Number(rawName)]
|
|
557
|
+
return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
|
|
526
558
|
}
|
|
527
559
|
return rawName;
|
|
528
560
|
};
|
|
@@ -532,25 +564,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
532
564
|
}
|
|
533
565
|
try {
|
|
534
566
|
return JSON.parse(extractedArgs);
|
|
535
|
-
} catch {
|
|
567
|
+
} catch (e) {
|
|
536
568
|
return extractedArgs;
|
|
537
569
|
}
|
|
538
570
|
};
|
|
539
571
|
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
|
|
572
|
+
var _a, _b, _c, _d, _e, _f;
|
|
540
573
|
const call = c;
|
|
541
|
-
const rawName = call.toolName
|
|
574
|
+
const rawName = (_a = call.toolName) != null ? _a : call.name;
|
|
542
575
|
const sanitizedFromIndex = getSanitizedName(
|
|
543
576
|
rawName,
|
|
544
577
|
transformedTools
|
|
545
578
|
);
|
|
546
|
-
const originalName = nameMap.get(sanitizedFromIndex)
|
|
547
|
-
const extractedArgs = call.args
|
|
579
|
+
const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
|
|
580
|
+
const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
|
|
548
581
|
const parsedArgs = parseToolArgs(extractedArgs);
|
|
549
582
|
return {
|
|
550
583
|
...call,
|
|
551
584
|
toolName: originalName,
|
|
552
585
|
name: originalName,
|
|
553
|
-
args: parsedArgs
|
|
586
|
+
args: parsedArgs != null ? parsedArgs : {}
|
|
554
587
|
};
|
|
555
588
|
});
|
|
556
589
|
const summarizeArgs = (args) => {
|
|
@@ -594,7 +627,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
594
627
|
if (Array.isArray(got)) {
|
|
595
628
|
return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
|
|
596
629
|
}
|
|
597
|
-
} catch {
|
|
630
|
+
} catch (e) {
|
|
598
631
|
}
|
|
599
632
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
600
633
|
});
|
|
@@ -632,13 +665,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
632
665
|
}
|
|
633
666
|
};
|
|
634
667
|
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
668
|
+
var _a, _b, _c, _d;
|
|
635
669
|
const funcDesc = tools[0];
|
|
636
|
-
const gt = possibleAnswer.ground_truth
|
|
637
|
-
const expectedFuncName = funcDesc
|
|
670
|
+
const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
|
|
671
|
+
const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
|
|
638
672
|
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
639
673
|
const received = restoredCalls[0];
|
|
640
|
-
const receivedName = received
|
|
641
|
-
const receivedArgs = summarizeArgs(received
|
|
674
|
+
const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
|
|
675
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
642
676
|
const expected = {
|
|
643
677
|
function: expectedFuncName,
|
|
644
678
|
params: expectedParams
|
|
@@ -650,7 +684,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
650
684
|
const diff = [];
|
|
651
685
|
checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
|
|
652
686
|
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
653
|
-
const required = funcDesc
|
|
687
|
+
const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
|
|
654
688
|
checkMissingParams(
|
|
655
689
|
required,
|
|
656
690
|
receivedArgs,
|
|
@@ -687,12 +721,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
687
721
|
}
|
|
688
722
|
};
|
|
689
723
|
const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
|
|
724
|
+
var _a;
|
|
690
725
|
for (let i = 0; i < restoredCalls.length; i += 1) {
|
|
691
726
|
if (usedActual.has(i)) {
|
|
692
727
|
continue;
|
|
693
728
|
}
|
|
694
729
|
const rc = restoredCalls[i];
|
|
695
|
-
const rcName = rc
|
|
730
|
+
const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
|
|
696
731
|
if (rcName === fname) {
|
|
697
732
|
return i;
|
|
698
733
|
}
|
|
@@ -706,6 +741,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
706
741
|
checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
|
|
707
742
|
};
|
|
708
743
|
const processExpectedCall = (options) => {
|
|
744
|
+
var _a, _b;
|
|
709
745
|
const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
|
|
710
746
|
const fname = Object.keys(expectedObj)[0];
|
|
711
747
|
const matchedIndex = findMatchingCallIndex(
|
|
@@ -718,10 +754,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
718
754
|
}
|
|
719
755
|
usedActual.add(matchedIndex);
|
|
720
756
|
const received = restoredCalls[matchedIndex];
|
|
721
|
-
const receivedArgs = summarizeArgs(received
|
|
757
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
722
758
|
const expectedParamsAllowed = expectedObj[fname];
|
|
723
759
|
const funcDesc = tools.find((t) => t.name === fname);
|
|
724
|
-
const requiredParams = funcDesc
|
|
760
|
+
const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
|
|
725
761
|
diff.push(`@@ function ${fname}`);
|
|
726
762
|
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
727
763
|
validateFunctionParams({
|
|
@@ -733,10 +769,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
733
769
|
}
|
|
734
770
|
};
|
|
735
771
|
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
736
|
-
|
|
772
|
+
var _a;
|
|
773
|
+
const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
|
|
737
774
|
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
738
775
|
const actualNames = restoredCalls.map(
|
|
739
|
-
(c) =>
|
|
776
|
+
(c) => {
|
|
777
|
+
var _a2;
|
|
778
|
+
return (_a2 = c.toolName) != null ? _a2 : c.name;
|
|
779
|
+
}
|
|
740
780
|
);
|
|
741
781
|
const expected = {
|
|
742
782
|
functions: expectedNames
|
|
@@ -762,14 +802,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
762
802
|
return { expected, actual, diff };
|
|
763
803
|
};
|
|
764
804
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
765
|
-
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) :
|
|
805
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
|
|
766
806
|
logs.push(
|
|
767
807
|
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
768
808
|
);
|
|
769
809
|
const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
|
|
810
|
+
var _a, _b, _c, _d;
|
|
770
811
|
try {
|
|
771
812
|
const firstTool = transformedTools[0];
|
|
772
|
-
const schemaType = firstTool
|
|
813
|
+
const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
|
|
773
814
|
caseLogs.push(
|
|
774
815
|
`[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
775
816
|
);
|
|
@@ -785,7 +826,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
785
826
|
caseLogs.push(
|
|
786
827
|
`[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
787
828
|
);
|
|
788
|
-
} catch {
|
|
829
|
+
} catch (e) {
|
|
789
830
|
caseLogs.push(
|
|
790
831
|
`[DEBUG] ${testCaseId}: failed to serialize toolCalls`
|
|
791
832
|
);
|
|
@@ -804,11 +845,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
804
845
|
possibleAnswer
|
|
805
846
|
} = options;
|
|
806
847
|
const lastUser = (() => {
|
|
848
|
+
var _a;
|
|
807
849
|
const reversed = [...flatMessages].reverse();
|
|
808
850
|
const found = reversed.find(
|
|
809
851
|
(m) => m.role === "user"
|
|
810
852
|
);
|
|
811
|
-
return found
|
|
853
|
+
return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
|
|
812
854
|
})();
|
|
813
855
|
const rawModelText = (() => {
|
|
814
856
|
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
@@ -879,9 +921,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
879
921
|
caseLogs.push(
|
|
880
922
|
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
881
923
|
);
|
|
882
|
-
} catch {
|
|
924
|
+
} catch (e) {
|
|
883
925
|
}
|
|
884
|
-
} catch {
|
|
926
|
+
} catch (e) {
|
|
885
927
|
caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
|
|
886
928
|
}
|
|
887
929
|
};
|
|
@@ -960,7 +1002,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
960
1002
|
const flatMessages = flattenMessages(messages);
|
|
961
1003
|
const { transformedTools, nameMap } = buildTransformedTools(
|
|
962
1004
|
tools,
|
|
963
|
-
|
|
1005
|
+
fixSchema2
|
|
964
1006
|
);
|
|
965
1007
|
const toolsMap = buildToolsMap(transformedTools);
|
|
966
1008
|
return { flatMessages, transformedTools, nameMap, toolsMap };
|
|
@@ -982,6 +1024,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
982
1024
|
const mwParsedToolCalls = parseDebugToolCalls(
|
|
983
1025
|
debugSummaryRef.toolCalls
|
|
984
1026
|
);
|
|
1027
|
+
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1028
|
+
if (!possibleAnswer) {
|
|
1029
|
+
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1030
|
+
}
|
|
1031
|
+
if (process.env.DEBUG_PARSER_OUTPUT === "true") {
|
|
1032
|
+
const groundTruth = possibleAnswer.ground_truth;
|
|
1033
|
+
const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
|
|
1034
|
+
console.log("\n========== BFCL CASE DEBUG ==========");
|
|
1035
|
+
console.log(`Test Case: ${testCase.id}`);
|
|
1036
|
+
console.log(`Expected count: ${groundTruth.length} call(s)`);
|
|
1037
|
+
console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
|
|
1038
|
+
console.log(expectedXML);
|
|
1039
|
+
console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
|
|
1040
|
+
console.log(mwOriginalText || text || "(empty)");
|
|
1041
|
+
console.log(
|
|
1042
|
+
"\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
|
|
1043
|
+
);
|
|
1044
|
+
console.log(JSON.stringify(toolCalls, null, 2));
|
|
1045
|
+
console.log("======================================\n");
|
|
1046
|
+
}
|
|
985
1047
|
logRawToolCalls({
|
|
986
1048
|
toolCalls,
|
|
987
1049
|
finishReason,
|
|
@@ -989,10 +1051,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
989
1051
|
testCaseId: testCase.id,
|
|
990
1052
|
caseLogs
|
|
991
1053
|
});
|
|
992
|
-
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
993
|
-
if (!possibleAnswer) {
|
|
994
|
-
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
995
|
-
}
|
|
996
1054
|
const restoredCalls = restoreToolCalls(
|
|
997
1055
|
toolCalls || [],
|
|
998
1056
|
nameMap,
|
|
@@ -1013,12 +1071,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1013
1071
|
caseLogs
|
|
1014
1072
|
});
|
|
1015
1073
|
};
|
|
1016
|
-
const
|
|
1074
|
+
const runSingleCase2 = async (testCase) => {
|
|
1017
1075
|
const caseLogs = [];
|
|
1018
1076
|
const { function: tools } = testCase;
|
|
1019
|
-
const temp = config
|
|
1077
|
+
const temp = config == null ? void 0 : config.temperature;
|
|
1020
1078
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1021
|
-
const maxTok = config
|
|
1079
|
+
const maxTok = config == null ? void 0 : config.maxTokens;
|
|
1022
1080
|
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
1023
1081
|
try {
|
|
1024
1082
|
const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
|
|
@@ -1044,15 +1102,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1044
1102
|
});
|
|
1045
1103
|
} catch (e) {
|
|
1046
1104
|
caseLogs.push(
|
|
1047
|
-
`[ERROR] ${testCase.id}: Model generation failed: ${e
|
|
1105
|
+
`[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
|
|
1048
1106
|
);
|
|
1049
|
-
if (e
|
|
1107
|
+
if (e == null ? void 0 : e.stack) {
|
|
1050
1108
|
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
1051
1109
|
}
|
|
1052
1110
|
return { valid: false, logs: caseLogs };
|
|
1053
1111
|
}
|
|
1054
1112
|
};
|
|
1055
|
-
const
|
|
1113
|
+
const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
|
|
1056
1114
|
const results = new Array(items.length);
|
|
1057
1115
|
let idx = 0;
|
|
1058
1116
|
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
@@ -1068,10 +1126,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1068
1126
|
await Promise.all(workers);
|
|
1069
1127
|
return results;
|
|
1070
1128
|
};
|
|
1071
|
-
const resultsPerCase = await
|
|
1129
|
+
const resultsPerCase = await mapWithConcurrency2(
|
|
1072
1130
|
testCases,
|
|
1073
1131
|
concurrency,
|
|
1074
|
-
async (tc) =>
|
|
1132
|
+
async (tc) => runSingleCase2(tc)
|
|
1075
1133
|
);
|
|
1076
1134
|
correctCount = resultsPerCase.reduce(
|
|
1077
1135
|
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
@@ -1139,19 +1197,391 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
|
1139
1197
|
"BFCL_v3_parallel_multiple_possible_answer.jsonl"
|
|
1140
1198
|
);
|
|
1141
1199
|
|
|
1142
|
-
// src/benchmarks/
|
|
1200
|
+
// src/benchmarks/complex-func-bench.ts
|
|
1143
1201
|
import { promises as fs3 } from "fs";
|
|
1144
1202
|
import path3 from "path";
|
|
1145
|
-
import {
|
|
1203
|
+
import {
|
|
1204
|
+
generateText as generateText2,
|
|
1205
|
+
jsonSchema as jsonSchema2,
|
|
1206
|
+
tool as tool2
|
|
1207
|
+
} from "ai";
|
|
1208
|
+
var LINE_SPLIT_REGEX2 = /\r?\n/;
|
|
1209
|
+
/**
 * Normalize a string for case- and padding-insensitive comparison.
 *
 * @param {*} input - Any value.
 * @returns {*} The lower-cased, trimmed string, or `input` itself when it is
 *   not a string.
 */
function standardizeString2(input) {
  const isText = typeof input === "string";
  return isText ? input.toLowerCase().trim() : input;
}
|
|
1215
|
+
/**
 * Decide whether a model-supplied argument value matches the expected one.
 *
 * Rules, in order:
 *  - identical values match;
 *  - two strings match after `standardizeString2` normalization;
 *  - a number and a string match when the string is that number's decimal
 *    form or parses to it. A blank string never matches, because
 *    `Number("")` is 0 and would otherwise equal the number 0;
 *  - two objects/arrays match when structurally equal, ignoring the key
 *    order of plain objects (array order still matters).
 *
 * @param {*} modelValue - Value produced by the model.
 * @param {*} expectedValue - Ground-truth value.
 * @returns {boolean} True when the values are considered equivalent.
 */
function valuesMatch2(modelValue, expectedValue) {
  if (modelValue === expectedValue) {
    return true;
  }
  const modelType = typeof modelValue;
  const expectedType = typeof expectedValue;
  if (modelType === "string" && expectedType === "string") {
    return standardizeString2(modelValue) === standardizeString2(expectedValue);
  }
  const numberMatchesString = (num, str) => num.toString() === str || str.trim() !== "" && num === Number(str);
  if (modelType === "number" && expectedType === "string") {
    return numberMatchesString(modelValue, expectedValue);
  }
  if (modelType === "string" && expectedType === "number") {
    return numberMatchesString(expectedValue, modelValue);
  }
  if (modelType === "object" && modelValue !== null && expectedType === "object" && expectedValue !== null) {
    // Rebuild plain objects with sorted keys so that JSON.stringify yields the
    // same text regardless of the order in which keys were emitted.
    const canonicalize = (value) => {
      if (Array.isArray(value)) {
        return value.map(canonicalize);
      }
      if (value === null || typeof value !== "object") {
        return value;
      }
      const proto = Object.getPrototypeOf(value);
      if (proto !== Object.prototype && proto !== null) {
        // Leave non-plain objects to JSON.stringify (e.g. their toJSON).
        return value;
      }
      const sorted = {};
      for (const key of Object.keys(value).sort()) {
        sorted[key] = canonicalize(value[key]);
      }
      return sorted;
    };
    try {
      return JSON.stringify(canonicalize(modelValue)) === JSON.stringify(canonicalize(expectedValue));
    } catch (e) {
      // Cyclic or otherwise unserializable values cannot be compared.
      return false;
    }
  }
  return false;
}
|
|
1237
|
+
/**
 * Check that the model called the function the ground truth expects.
 *
 * @param {*} modelFuncName - Function name taken from the model's tool call.
 * @param {*} expectedFuncName - Function name from the ground truth.
 * @returns {{valid: boolean, error?: string, error_type?: string}} A passing
 *   verdict when the names are strictly equal, otherwise a
 *   `function_name_mismatch` failure.
 */
function validateFunctionName(modelFuncName, expectedFuncName) {
  const namesAgree = modelFuncName === expectedFuncName;
  if (namesAgree) {
    return { valid: true };
  }
  const error = `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`;
  return { valid: false, error, error_type: "function_name_mismatch" };
}
|
|
1247
|
+
/**
 * Verify that every required parameter present in the ground truth was also
 * supplied by the model.
 *
 * Presence is tested on own properties only. The `in` operator also sees
 * inherited members, so a parameter named e.g. `toString` would always look
 * "supplied". Non-object arguments are treated as supplying nothing instead
 * of raising a TypeError.
 *
 * @param {string[]} requiredParams - Names the tool spec marks as required.
 * @param {*} modelArgs - Arguments supplied by the model.
 * @param {*} expectedArgs - Ground-truth arguments.
 * @returns {{valid: boolean, error?: string, error_type?: string}} A passing
 *   verdict, or a `missing_required_param` failure for the first gap found.
 */
function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
  const hasOwn = (target, key) => target !== null && typeof target === "object" && Object.prototype.hasOwnProperty.call(target, key);
  for (const param of requiredParams) {
    // A required parameter the ground truth does not mention is not enforced.
    if (hasOwn(expectedArgs, param) && !hasOwn(modelArgs, param)) {
      return {
        valid: false,
        error: `Missing required parameter: '${param}'`,
        error_type: "missing_required_param"
      };
    }
  }
  return { valid: true };
}
|
|
1259
|
+
/**
 * Compare every ground-truth parameter with the value the model supplied.
 *
 * A parameter the model omitted is skipped when it is optional and reported
 * as `missing_param` when it is required. Supplied values are compared with
 * `valuesMatch2`. Presence is tested on own properties only, so inherited
 * members such as `toString` are not mistaken for supplied arguments.
 * Non-object model arguments are treated as an empty argument set.
 *
 * @param {object|null|undefined} expectedArgs - Ground-truth arguments.
 * @param {*} modelArgs - Arguments supplied by the model.
 * @param {string[]} requiredParams - Names the tool spec marks as required.
 * @returns {{valid: boolean, error?: string, error_type?: string}} A passing
 *   verdict, or the first `missing_param` / `value_mismatch` failure.
 */
function validateParamValues(expectedArgs, modelArgs, requiredParams) {
  const suppliedArgs = modelArgs !== null && typeof modelArgs === "object" ? modelArgs : {};
  const wantedArgs = expectedArgs != null ? expectedArgs : {};
  for (const [paramName, expectedValue] of Object.entries(wantedArgs)) {
    if (!Object.prototype.hasOwnProperty.call(suppliedArgs, paramName)) {
      if (!requiredParams.includes(paramName)) {
        continue;
      }
      return {
        valid: false,
        error: `Missing parameter: '${paramName}'`,
        error_type: "missing_param"
      };
    }
    const modelValue = suppliedArgs[paramName];
    if (!valuesMatch2(modelValue, expectedValue)) {
      return {
        valid: false,
        error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
        error_type: "value_mismatch"
      };
    }
  }
  return { valid: true };
}
|
|
1282
|
+
/**
 * Validate one model tool call against one ground-truth call.
 *
 * The checks run in order: function name, presence of required parameters,
 * then parameter values. The first failing check's verdict is returned.
 *
 * Malformed inputs produce a failure verdict instead of an exception: an
 * expected entry without a function key is reported as
 * `invalid_expected_call`, and model arguments that are not a plain object
 * (for example an unparsed string) are treated as an empty argument set.
 *
 * @param {*} modelCall - Tool call with `toolName` or `name`, and `args`.
 * @param {*} expected - Ground truth shaped `{ funcName: { param: value } }`.
 * @param {*} toolSpecs - Tool definitions; `parameters.required` of the one
 *   whose `name` equals the expected function is used.
 * @returns {{valid: boolean, error?: string, error_type?: string}} Verdict.
 */
function checkFunctionCall(modelCall, expected, toolSpecs) {
  const isRecord = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
  const expectedFuncName = isRecord(expected) ? Object.keys(expected)[0] : void 0;
  if (expectedFuncName === void 0) {
    return {
      valid: false,
      error: "Expected call does not name a function",
      error_type: "invalid_expected_call"
    };
  }
  const expectedArgs = isRecord(expected[expectedFuncName]) ? expected[expectedFuncName] : {};
  const call = isRecord(modelCall) ? modelCall : {};
  const modelFuncName = call.toolName != null ? call.toolName : call.name;
  const modelArgs = isRecord(call.args) ? call.args : {};
  const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
  if (!nameResult.valid) {
    return nameResult;
  }
  const specs = Array.isArray(toolSpecs) ? toolSpecs : [];
  const toolSpec = specs.find((t) => t != null && t.name === expectedFuncName);
  const declaredRequired = toolSpec != null && toolSpec.parameters != null ? toolSpec.parameters.required : void 0;
  const requiredParams = Array.isArray(declaredRequired) ? declaredRequired : [];
  const requiredResult = validateRequiredParams(
    requiredParams,
    modelArgs,
    expectedArgs
  );
  if (!requiredResult.valid) {
    return requiredResult;
  }
  return validateParamValues(expectedArgs, modelArgs, requiredParams);
}
|
|
1304
|
+
/**
 * Validate the full set of model tool calls against the ground truth.
 *
 * The counts must agree. A single expected call is checked directly so the
 * detailed verdict of `checkFunctionCall` is returned. With several calls,
 * order is ignored: each expected call must be paired with a distinct model
 * call that satisfies it.
 *
 * Pairing uses augmenting paths rather than greedy first-fit. With
 * first-fit, an expected call that is satisfied by several model calls can
 * take the only model call another expected call could use, failing a case
 * for which a complete pairing exists.
 *
 * @param {Array|null|undefined} modelCalls - Normalized model tool calls.
 * @param {Array|null|undefined} expectedCalls - Ground-truth calls.
 * @param {*} toolSpecs - Tool definitions, forwarded to `checkFunctionCall`.
 * @returns {{valid: boolean, error?: string, error_type?: string}} Verdict.
 */
function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
  // Missing lists are treated as empty instead of throwing on `.length`.
  const actual = Array.isArray(modelCalls) ? modelCalls : [];
  const wanted = Array.isArray(expectedCalls) ? expectedCalls : [];
  if (actual.length !== wanted.length) {
    return {
      valid: false,
      error: `Wrong number of function calls: expected ${wanted.length}, got ${actual.length}`,
      error_type: "wrong_call_count"
    };
  }
  if (wanted.length === 1) {
    return checkFunctionCall(actual[0], wanted[0], toolSpecs);
  }
  // Each (expected, actual) pair is checked at most once.
  const verdictCache = wanted.map(() => new Array(actual.length));
  const isCompatible = (expectedIdx, actualIdx) => {
    if (verdictCache[expectedIdx][actualIdx] === void 0) {
      const result = checkFunctionCall(actual[actualIdx], wanted[expectedIdx], toolSpecs);
      verdictCache[expectedIdx][actualIdx] = Boolean(result.valid);
    }
    return verdictCache[expectedIdx][actualIdx];
  };
  // ownerOfActual[i] is the expected index currently paired with model call i.
  const ownerOfActual = new Array(actual.length).fill(-1);
  const tryAssign = (expectedIdx, visited) => {
    for (let i = 0; i < actual.length; i += 1) {
      if (visited.has(i) || !isCompatible(expectedIdx, i)) {
        continue;
      }
      visited.add(i);
      // Take a free model call, or re-home its current owner elsewhere.
      if (ownerOfActual[i] === -1 || tryAssign(ownerOfActual[i], visited)) {
        ownerOfActual[i] = expectedIdx;
        return true;
      }
    }
    return false;
  };
  for (let e = 0; e < wanted.length; e += 1) {
    if (!tryAssign(e, /* @__PURE__ */ new Set())) {
      const expected = wanted[e];
      const expectedFuncName = expected !== null && typeof expected === "object" ? Object.keys(expected)[0] : void 0;
      return {
        valid: false,
        error: `Could not find matching call for function '${expectedFuncName}'`,
        error_type: "no_matching_call"
      };
    }
  }
  return { valid: true };
}
|
|
1340
|
+
/**
 * Rewrite a non-standard `type` keyword on a schema node, in place.
 *
 * `dict` becomes `object`, `tuple` becomes `array`, and `integer` / `float`
 * become `number`. Any other type, and nodes without a truthy `type`, are
 * left untouched.
 *
 * @param {object} copy - Schema node to adjust; mutated directly.
 * @returns {void}
 */
var fixSchemaType = (copy) => {
  if (!copy.type) {
    return;
  }
  // Map lookups use identity-style equality, so a non-string `type`
  // (e.g. an array of types) never matches an alias.
  const aliases = /* @__PURE__ */ new Map([
    ["dict", "object"],
    ["tuple", "array"],
    ["integer", "number"],
    ["float", "number"]
  ]);
  const replacement = aliases.get(copy.type);
  if (replacement !== void 0) {
    copy.type = replacement;
  }
};
|
|
1354
|
+
/**
 * Return a normalized copy of a tool parameter schema.
 *
 * Type aliases are rewritten via `fixSchemaType`, and the same treatment is
 * applied recursively to every entry of `properties` and to `items`. An
 * array input is mapped element by element. A missing or non-object schema
 * yields an empty object schema.
 *
 * The input is never modified. A shallow spread alone would share the
 * nested `properties` container with the caller, so that container is
 * copied before the fixed children are written into it.
 *
 * @param {*} schema - Schema node, array of nodes, or anything else.
 * @returns {object|Array} The normalized copy.
 */
var fixSchema = (schema) => {
  if (!schema || typeof schema !== "object") {
    return { type: "object", properties: {} };
  }
  if (Array.isArray(schema)) {
    return schema.map((entry) => fixSchema(entry));
  }
  const copy = { ...schema };
  fixSchemaType(copy);
  const properties = copy.properties;
  if (properties && typeof properties === "object") {
    const fixedProperties = Array.isArray(properties) ? properties.slice() : { ...properties };
    for (const key of Object.keys(properties)) {
      fixedProperties[key] = fixSchema(properties[key]);
    }
    copy.properties = fixedProperties;
  }
  if (copy.items) {
    copy.items = fixSchema(copy.items);
  }
  return copy;
};
|
|
1374
|
+
/**
 * Convert benchmark tool definitions into the structures passed to the
 * model call.
 *
 * Each tool's `parameters` is normalized with `fixSchema`; anything that
 * does not end up as an object schema is replaced by an empty object schema.
 * Tool names are sanitized to `[a-zA-Z0-9_-]`, at most 64 characters,
 * falling back to "tool" when nothing is left.
 *
 * Sanitizing can map two different names to the same string (for example
 * "get.weather" and "get_weather"). Later entries then receive a numeric
 * suffix (`_2`, `_3`, ...) so that neither the tool nor its name mapping is
 * overwritten. Names that do not collide are unaffected.
 *
 * @param {Array<{name: string, description?: *, parameters?: *}>} tools
 * @returns {{nameMap: Map<string, string>, toolsMap: object}} `nameMap`
 *   maps each sanitized name to the original; `toolsMap` maps each sanitized
 *   name to the value returned by `tool2`.
 */
function buildTools(tools) {
  const nameMap = /* @__PURE__ */ new Map();
  const MAX_NAME_LENGTH = 64;
  const claimUniqueName = (base) => {
    if (!nameMap.has(base)) {
      return base;
    }
    for (let n = 2; ; n += 1) {
      const suffix = `_${n}`;
      // Trim the base so the suffixed name still respects the length cap.
      const candidate = `${base.slice(0, MAX_NAME_LENGTH - suffix.length)}${suffix}`;
      if (!nameMap.has(candidate)) {
        return candidate;
      }
    }
  };
  const transformedTools = tools.map((t) => {
    const fixed = fixSchema(t.parameters);
    const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
    const cleaned = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, MAX_NAME_LENGTH) || "tool";
    const sanitized = claimUniqueName(cleaned);
    nameMap.set(sanitized, t.name);
    return {
      type: "function",
      name: sanitized,
      description: t.description,
      inputSchema
    };
  });
  const toolsMap = Object.fromEntries(
    transformedTools.map((t) => [
      t.name,
      tool2({
        description: typeof t.description === "string" ? t.description : void 0,
        inputSchema: jsonSchema2(t.inputSchema)
      })
    ])
  );
  return { nameMap, toolsMap };
}
|
|
1399
|
+
/**
 * Apply an async `mapper` to every item with a bounded number of calls in
 * flight, returning results in input order.
 *
 * The limit is normalized before use: it is floored to an integer, and
 * anything below 1 or not a number falls back to 1. Without this, a
 * fractional or NaN limit makes the worker array construction throw
 * `RangeError`, and a limit of 0 starts no workers, so every result slot
 * is left unfilled.
 *
 * If `mapper` rejects, the returned promise rejects with that error.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} concurrencyLimit - Maximum simultaneous `mapper` calls.
 * @param {(item: *) => Promise<*>} mapper - Async transform for one item.
 * @returns {Promise<Array>} Results, where `results[i]` belongs to `items[i]`.
 */
async function mapWithConcurrency(items, concurrencyLimit, mapper) {
  const results = new Array(items.length);
  const requested = Math.floor(Number(concurrencyLimit));
  // `requested >= 1` is false for NaN, so invalid limits run sequentially.
  const workerCount = Math.min(requested >= 1 ? requested : 1, items.length);
  let nextIndex = 0;
  const runWorker = async () => {
    // Claiming an index is synchronous, so no two workers get the same item.
    while (nextIndex < items.length) {
      const current = nextIndex;
      nextIndex += 1;
      results[current] = await mapper(items[current]);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return results;
}
|
|
1415
|
+
/**
 * Run one benchmark case: call the model with the case's tools and
 * messages, then score the returned tool calls against the ground truth.
 *
 * The ground truth is looked up before the model is invoked, so a case
 * with no answer fails immediately without spending a model call. Every
 * failure is recorded in the returned logs; this function does not throw.
 *
 * @param {{id: string, function: Array, question: Array}} testCase - Case
 *   with tool definitions (`function`) and chat messages (`question`).
 * @param {*} model - Model handle forwarded to `generateText2`.
 * @param {Map<string, {ground_truth: Array}>} possibleAnswersMap - Ground
 *   truth keyed by case id.
 * @param {number|undefined} temperature - Forwarded only when defined.
 * @param {number|undefined} maxTokens - Forwarded as `maxOutputTokens` only
 *   when defined.
 * @returns {Promise<{valid: boolean, logs: string[]}>} Verdict and log lines.
 */
async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
  const caseLogs = [];
  const { function: tools, question: messages } = testCase;
  try {
    const possibleAnswer = possibleAnswersMap.get(testCase.id);
    if (!possibleAnswer) {
      throw new Error(`No possible answer for id: ${testCase.id}`);
    }
    const { nameMap, toolsMap } = buildTools(tools);
    const debugSummaryRef = {};
    const providerOptions = {
      toolCallMiddleware: { debugSummary: debugSummaryRef }
    };
    const { toolCalls, finishReason } = await generateText2({
      model,
      messages,
      tools: toolsMap,
      toolChoice: "auto",
      providerOptions,
      ...temperature !== void 0 ? { temperature } : {},
      ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
    });
    // Map sanitized tool names back to the names used in the ground truth and
    // expose the arguments under one key regardless of which one was set.
    const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
      const rawName = c.toolName != null ? c.toolName : c.name;
      const mappedName = nameMap.get(rawName);
      const originalName = mappedName != null ? mappedName : rawName;
      const primaryArgs = c.input != null ? c.input : c.args;
      return {
        toolName: originalName,
        name: originalName,
        args: primaryArgs != null ? primaryArgs : {}
      };
    });
    caseLogs.push(
      `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
    );
    const checkerResult = checkAllFunctionCalls(
      restoredCalls,
      possibleAnswer.ground_truth,
      tools
    );
    if (checkerResult.valid) {
      caseLogs.push(`[PASS] ${testCase.id}`);
      return { valid: true, logs: caseLogs };
    }
    caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
    return { valid: false, logs: caseLogs };
  } catch (e) {
    // Thrown values are not guaranteed to be Error instances.
    const reason = e != null && e.message !== void 0 ? e.message : String(e);
    caseLogs.push(`[ERROR] ${testCase.id}: ${reason}`);
    if (e != null && e.stack) {
      caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
    }
    return { valid: false, logs: caseLogs };
  }
}
|
|
1466
|
+
/**
 * Read a JSON Lines file of test cases.
 *
 * Lines are split on LF or CRLF, and blank lines are ignored. A line that is
 * not valid JSON raises a `SyntaxError` naming the file and the 1-based line
 * number, so a corrupt data file can be located.
 *
 * @param {string} dataPath - Directory containing the data file.
 * @param {string} testDataFile - File name inside `dataPath`.
 * @returns {Promise<Array>} Parsed test cases in file order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON.
 */
async function loadTestData(dataPath, testDataFile) {
  const filePath = path3.join(dataPath, testDataFile);
  const testCasesJson = await fs3.readFile(filePath, "utf-8");
  const lines = testCasesJson.split(LINE_SPLIT_REGEX2);
  const testCases = [];
  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i];
    if (line.trim().length === 0) {
      continue;
    }
    try {
      testCases.push(JSON.parse(line));
    } catch (e) {
      const detail = e != null && e.message !== void 0 ? e.message : String(e);
      throw new SyntaxError(`Invalid JSON in ${filePath} at line ${i + 1}: ${detail}`);
    }
  }
  return testCases;
}
|
|
1473
|
+
/**
 * Read a JSON Lines file of ground-truth answers and index it by `id`.
 *
 * Lines are split on LF or CRLF, and blank lines are ignored. When several
 * records share an id, the last one in the file wins. A line that is not
 * valid JSON raises a `SyntaxError` naming the file and the 1-based line
 * number.
 *
 * @param {string} dataPath - Directory containing the answer file.
 * @param {string} answerDataFile - File name inside `dataPath`.
 * @returns {Promise<Map<*, object>>} Answers keyed by their `id` field.
 * @throws {SyntaxError} When a non-blank line is not valid JSON.
 */
async function loadAnswerData(dataPath, answerDataFile) {
  const filePath = path3.join(dataPath, answerDataFile);
  const answersJson = await fs3.readFile(filePath, "utf-8");
  const lines = answersJson.split(LINE_SPLIT_REGEX2);
  const answersById = /* @__PURE__ */ new Map();
  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i];
    if (line.trim().length === 0) {
      continue;
    }
    let answer;
    try {
      answer = JSON.parse(line);
    } catch (e) {
      const detail = e != null && e.message !== void 0 ? e.message : String(e);
      throw new SyntaxError(`Invalid JSON in ${filePath} at line ${i + 1}: ${detail}`);
    }
    answersById.set(answer.id, answer);
  }
  return answersById;
}
|
|
1481
|
+
/**
 * Collects run settings from environment variables and the benchmark config.
 *
 * - `COMPLEXFUNCBENCH_LIMIT`: maximum number of test cases. Returned as a
 *   positive integer, or `undefined` when unset, non-numeric, non-finite or
 *   below 1 after flooring (so "0.5" no longer selects zero cases).
 * - `COMPLEXFUNCBENCH_CONCURRENCY`: floored to an integer and clamped to at
 *   least 1; falls back to 4 when unset or not a finite number.
 *
 * @param {{temperature?: *, maxTokens?: *}|null|undefined} config
 * @returns {{limit: (number|undefined), concurrency: number,
 *   temperature: (number|undefined), maxTokens: (number|undefined)}}
 *   `temperature` and `maxTokens` are passed through only when they are numbers.
 */
function getConfigValues(config) {
  const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
  // Math.floor keeps NaN/Infinity as-is, so one isFinite check covers every bad input.
  const parsedLimit = limitEnv ? Math.floor(Number(limitEnv)) : NaN;
  const limit = Number.isFinite(parsedLimit) && parsedLimit > 0 ? parsedLimit : void 0;

  const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
  const parsedConcurrency = concurrencyEnv ? Number(concurrencyEnv) : NaN;
  const concurrency = Number.isFinite(parsedConcurrency)
    ? Math.max(1, Math.floor(parsedConcurrency))
    : 4;

  const rawTemperature = config == null ? void 0 : config.temperature;
  const temperature = typeof rawTemperature === "number" ? rawTemperature : void 0;
  const rawMaxTokens = config == null ? void 0 : config.maxTokens;
  const maxTokens = typeof rawMaxTokens === "number" ? rawMaxTokens : void 0;

  return { limit, concurrency, temperature, maxTokens };
}
|
|
1490
|
+
/**
 * Folds per-case outcomes into a single benchmark result.
 *
 * @param {Array<{valid: boolean, logs?: string[]}>} resultsPerCase - One entry
 *   per executed case. An entry without a `logs` array contributes no log lines.
 * @param {Array<*>} testCases - The cases that were run; only its length is used,
 *   as the accuracy denominator.
 * @returns {{score: number, success: boolean, metrics: Object, logs: string[]}}
 *   `score` is correct/total, and `success` is true only when score exceeds 0.5.
 *   With zero test cases the result is a failure carrying a single
 *   "No test cases found." log line.
 */
function aggregateResults(resultsPerCase, testCases) {
  // Nothing to score: bail out before touching the per-case data.
  if (testCases.length === 0) {
    return {
      score: 0,
      success: false,
      metrics: {},
      logs: ["No test cases found."]
    };
  }
  let correctCount = 0;
  const logs = [];
  for (const r of resultsPerCase) {
    if (r.valid) {
      correctCount += 1;
    }
    // Append one by one: tolerates a missing `logs` field and avoids the
    // argument-count ceiling that push(...bigArray) can hit.
    if (Array.isArray(r.logs)) {
      for (const entry of r.logs) {
        logs.push(entry);
      }
    }
  }
  const score = correctCount / testCases.length;
  return {
    score,
    success: score > 0.5,
    metrics: {
      correct_count: correctCount,
      total_cases: testCases.length,
      accuracy: score
    },
    logs
  };
}
|
|
1519
|
+
/**
 * Builds a benchmark descriptor whose `run` method evaluates a model against
 * a pair of JSONL files (test cases and their possible answers).
 *
 * @param {string} name - Benchmark name; also used in the fatal-error log line.
 * @param {string} description - Human-readable description.
 * @param {string} testDataFile - JSONL file with the test cases.
 * @param {string} answerDataFile - JSONL file with the possible answers.
 * @returns {{name: string, version: string, description: string,
 *   run: function(*, *): Promise<Object>}}
 */
function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
  return {
    name,
    version: "1.0.0",
    description,
    /**
     * Runs every (optionally limited) test case and aggregates the outcome.
     * Never rejects: any failure is reported as a `success: false` result
     * whose `error` is always an Error instance and whose logs keep the
     * [INFO] lines gathered before the failure.
     *
     * @param {*} model - Model handle forwarded to `runSingleCase`.
     * @param {*} config - Benchmark config forwarded to `getConfigValues`.
     * @returns {Promise<Object>} Aggregated benchmark result.
     */
    async run(model, config) {
      const logs = [];
      try {
        const dataPath = resolveDataDir();
        logs.push(`[INFO] Using data dir: ${dataPath}`);
        // The two files are independent, so read them concurrently.
        const [allTestCases, possibleAnswersMap] = await Promise.all([
          loadTestData(dataPath, testDataFile),
          loadAnswerData(dataPath, answerDataFile)
        ]);
        let testCases = allTestCases;
        const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
        if (limit && Number.isFinite(limit) && limit > 0) {
          testCases = testCases.slice(0, limit);
          logs.push(`[INFO] Limiting test cases to ${limit}`);
        }
        logs.push(
          `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
        );
        const resultsPerCase = await mapWithConcurrency(
          testCases,
          concurrency,
          (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
        );
        const result = aggregateResults(resultsPerCase, testCases);
        const caseLogs = result.logs != null ? result.logs : [];
        result.logs = [...logs, ...caseLogs];
        return result;
      } catch (e) {
        // Anything can be thrown (null, strings, ...); normalise so that
        // building the report below cannot itself throw.
        const error = e instanceof Error ? e : new Error(String(e));
        return {
          score: 0,
          success: false,
          metrics: {},
          error,
          logs: [
            ...logs,
            `[FATAL] Failed to run benchmark ${name}: ${error.message}`
          ]
        };
      }
    }
  };
}
|
|
1565
|
+
// Benchmark instance for the ComplexFuncBench dataset, created with the
// "ComplexFuncBench.jsonl" test-case file and its matching
// "ComplexFuncBench_possible_answer.jsonl" answer file.
var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
  "complex-func-bench",
  "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
  "ComplexFuncBench.jsonl",
  "ComplexFuncBench_possible_answer.jsonl"
);
|
|
1571
|
+
|
|
1572
|
+
// src/benchmarks/json-generation.ts
|
|
1573
|
+
import { promises as fs4 } from "fs";
|
|
1574
|
+
import path4 from "path";
|
|
1575
|
+
import { generateText as generateText3 } from "ai";
|
|
1146
1576
|
import Ajv from "ajv";
|
|
1147
1577
|
var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
|
|
1148
1578
|
var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
|
|
1149
1579
|
var NEWLINE_REGEX = /\r?\n/;
|
|
1150
|
-
var
|
|
1580
|
+
var LINE_SPLIT_REGEX3 = /\r?\n/;
|
|
1151
1581
|
/**
 * Attempts to parse the whole of `text` as JSON in a single step.
 *
 * @param {string} text - Candidate JSON text.
 * @returns {*} The parsed value, or `undefined` when `JSON.parse` throws.
 */
function tryDirectParse(text) {
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch (_err) {
    // Not valid JSON as-is; signal "no result" to the caller.
    parsed = void 0;
  }
  return parsed;
}
|
|
@@ -1163,7 +1593,7 @@ function tryCodeFenceParse(text) {
|
|
|
1163
1593
|
const inner = fenceMatch[1].trim();
|
|
1164
1594
|
try {
|
|
1165
1595
|
return JSON.parse(inner);
|
|
1166
|
-
} catch {
|
|
1596
|
+
} catch (e) {
|
|
1167
1597
|
return;
|
|
1168
1598
|
}
|
|
1169
1599
|
}
|
|
@@ -1188,7 +1618,7 @@ function tryBracketScan(text) {
|
|
|
1188
1618
|
const candidate = text.slice(start, i + 1);
|
|
1189
1619
|
try {
|
|
1190
1620
|
return JSON.parse(candidate);
|
|
1191
|
-
} catch {
|
|
1621
|
+
} catch (e) {
|
|
1192
1622
|
return;
|
|
1193
1623
|
}
|
|
1194
1624
|
}
|
|
@@ -1236,12 +1666,12 @@ function subsetMatch(expected, actual) {
|
|
|
1236
1666
|
async function loadDatasets() {
|
|
1237
1667
|
try {
|
|
1238
1668
|
const dataDir = resolveDataDir();
|
|
1239
|
-
const testsJsonl = await
|
|
1240
|
-
|
|
1669
|
+
const testsJsonl = await fs4.readFile(
|
|
1670
|
+
path4.join(dataDir, "json_generation_tests.jsonl"),
|
|
1241
1671
|
"utf-8"
|
|
1242
1672
|
);
|
|
1243
|
-
const expectedJsonl = await
|
|
1244
|
-
|
|
1673
|
+
const expectedJsonl = await fs4.readFile(
|
|
1674
|
+
path4.join(dataDir, "json_generation_expected.jsonl"),
|
|
1245
1675
|
"utf-8"
|
|
1246
1676
|
);
|
|
1247
1677
|
const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -1297,10 +1727,11 @@ function validateTestCase(tc, parsed, context) {
|
|
|
1297
1727
|
return { valid, valuesOk, parsed };
|
|
1298
1728
|
}
|
|
1299
1729
|
async function processTestCase(tc, context) {
|
|
1730
|
+
var _a;
|
|
1300
1731
|
const messages = buildMessages(tc);
|
|
1301
|
-
const temp = context.config
|
|
1732
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1302
1733
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1303
|
-
const { text } = await
|
|
1734
|
+
const { text } = await generateText3({
|
|
1304
1735
|
model: context.model,
|
|
1305
1736
|
messages,
|
|
1306
1737
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1308,7 +1739,7 @@ async function processTestCase(tc, context) {
|
|
|
1308
1739
|
let parsed;
|
|
1309
1740
|
try {
|
|
1310
1741
|
parsed = extractFirstJsonBlock(text);
|
|
1311
|
-
} catch {
|
|
1742
|
+
} catch (e) {
|
|
1312
1743
|
}
|
|
1313
1744
|
if (parsed === void 0) {
|
|
1314
1745
|
context.validation.logs.push(
|
|
@@ -1402,21 +1833,22 @@ function buildBenchmarkResult(total, counts, logs) {
|
|
|
1402
1833
|
/**
 * Loads the JSON-generation test cases used by the schema-only benchmark from
 * "json_generation_tests.jsonl" in the resolved data directory.
 *
 * @returns {Promise<{tests: Array<*>, error?: *}>} The parsed non-blank lines
 *   as `tests`. On any failure (resolving the directory, reading, parsing)
 *   `tests` is empty and the caught value is returned as `error`.
 */
async function loadSchemaOnlyTests() {
  try {
    const testsFile = path4.join(resolveDataDir(), "json_generation_tests.jsonl");
    const raw = await fs4.readFile(testsFile, "utf-8");
    const tests = [];
    for (const line of raw.split(LINE_SPLIT_REGEX3)) {
      // Skip blank or whitespace-only lines; everything else must be JSON.
      if (line.trim().length > 0) {
        tests.push(JSON.parse(line));
      }
    }
    return { tests };
  } catch (err) {
    return { tests: [], error: err };
  }
}
|
|
1415
1846
|
async function processSchemaOnlyTestCase(tc, context) {
|
|
1847
|
+
var _a;
|
|
1416
1848
|
const messages = buildMessages(tc);
|
|
1417
|
-
const temp = context.config
|
|
1849
|
+
const temp = (_a = context.config) == null ? void 0 : _a.temperature;
|
|
1418
1850
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1419
|
-
const { text } = await
|
|
1851
|
+
const { text } = await generateText3({
|
|
1420
1852
|
model: context.model,
|
|
1421
1853
|
messages,
|
|
1422
1854
|
...temperature !== void 0 ? { temperature } : {}
|
|
@@ -1424,7 +1856,7 @@ async function processSchemaOnlyTestCase(tc, context) {
|
|
|
1424
1856
|
let parsed;
|
|
1425
1857
|
try {
|
|
1426
1858
|
parsed = extractFirstJsonBlock(text);
|
|
1427
|
-
} catch {
|
|
1859
|
+
} catch (e) {
|
|
1428
1860
|
}
|
|
1429
1861
|
if (parsed === void 0) {
|
|
1430
1862
|
context.logs.push(
|
|
@@ -1501,8 +1933,56 @@ var colors = {
|
|
|
1501
1933
|
yellow: "\x1B[33m",
|
|
1502
1934
|
cyan: "\x1B[36m",
|
|
1503
1935
|
magenta: "\x1B[35m",
|
|
1504
|
-
gray: "\x1B[90m"
|
|
1936
|
+
gray: "\x1B[90m",
|
|
1937
|
+
white: "\x1B[37m",
|
|
1938
|
+
bgRed: "\x1B[41m"
|
|
1505
1939
|
};
|
|
1940
|
+
/**
 * Colorizes unified-diff style lines for terminal output.
 *
 * Lines starting with "-" are wrapped in red, "+" in green and "@@" in cyan
 * (each followed by the reset code); any other line is left unchanged.
 *
 * @param {string[]|null|undefined} diff - Diff lines.
 * @returns {string} The decorated lines joined together, or "" when `diff`
 *   is missing or empty.
 */
function formatDiff(diff) {
  if (!diff || diff.length === 0) {
    return "";
  }
  // Checked in order; the first matching prefix decides the color.
  const prefixToColor = [
    ["-", "red"],
    ["+", "green"],
    ["@@", "cyan"]
  ];
  const decorate = (line) => {
    for (const [prefix, colorKey] of prefixToColor) {
      if (line.startsWith(prefix)) {
        return `${colors[colorKey]}${line}${colors.reset}`;
      }
    }
    return line;
  };
  return diff.map((line) => decorate(line)).join("\n ");
}
|
|
1957
|
+
/**
 * Prints a readable summary for every "[DEBUG-FAIL]" entry in `logs`.
 *
 * Each such entry is expected to carry a JSON payload after the
 * "[DEBUG-FAIL] " prefix. For a payload that parses, the case id, error type
 * ("unknown" when empty), message, an optional colorized diff and (only when
 * both serialize to fewer than 100 characters) the expected/actual values are
 * written to the console. If handling an entry throws, the raw entry is
 * printed instead.
 *
 * @param {string[]} logs - Benchmark log lines.
 * @returns {void}
 */
function printFailLogs(logs) {
  const failures = logs.filter((entry) => entry.startsWith("[DEBUG-FAIL]"));
  failures.forEach((entry) => {
    try {
      const data = JSON.parse(entry.replace("[DEBUG-FAIL] ", ""));
      const { red, yellow, gray, reset } = colors;
      console.log(`\n${red}FAILED CASE: ${data.id}${reset}`);
      const errorType = data.error_type || "unknown";
      console.log(` Error Type: ${yellow}${errorType}${reset}`);
      console.log(` Message: ${data.message}`);
      if (Array.isArray(data.diff)) {
        console.log(` Diff:\n${formatDiff(data.diff)}`);
      }
      if (!data.expected || !data.actual) {
        return;
      }
      const expectedJson = JSON.stringify(data.expected);
      const actualJson = JSON.stringify(data.actual);
      // Large payloads are left out to keep the summary compact.
      if (expectedJson.length >= 100 || actualJson.length >= 100) {
        return;
      }
      console.log(` Expected: ${gray}${expectedJson}${reset}`);
      console.log(` Actual: ${gray}${actualJson}${reset}`);
    } catch (_e) {
      console.log(` Raw Log: ${entry}`);
    }
  });
}
|
|
1506
1986
|
function printResult(result) {
|
|
1507
1987
|
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
1508
1988
|
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
@@ -1525,6 +2005,18 @@ function printResult(result) {
|
|
|
1525
2005
|
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
1526
2006
|
);
|
|
1527
2007
|
}
|
|
2008
|
+
if (!benchmarkResult.success && benchmarkResult.logs) {
|
|
2009
|
+
printFailLogs(benchmarkResult.logs);
|
|
2010
|
+
const failLogs = benchmarkResult.logs.filter(
|
|
2011
|
+
(l) => l.startsWith("[DEBUG-FAIL]")
|
|
2012
|
+
);
|
|
2013
|
+
if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
|
|
2014
|
+
console.log(" Raw Logs (Sample):");
|
|
2015
|
+
for (const l of benchmarkResult.logs.slice(0, 10)) {
|
|
2016
|
+
console.log(` ${l}`);
|
|
2017
|
+
}
|
|
2018
|
+
}
|
|
2019
|
+
}
|
|
1528
2020
|
}
|
|
1529
2021
|
function consoleReporter(results) {
|
|
1530
2022
|
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
@@ -1579,14 +2071,14 @@ function hasFunctionNameIssue(diff) {
|
|
|
1579
2071
|
);
|
|
1580
2072
|
}
|
|
1581
2073
|
function suggestFunctionNameFix(expected, actual, suggestions) {
|
|
1582
|
-
const expectedName = expected
|
|
1583
|
-
const actualName = actual
|
|
2074
|
+
const expectedName = expected == null ? void 0 : expected.function;
|
|
2075
|
+
const actualName = actual == null ? void 0 : actual.function;
|
|
1584
2076
|
if (expectedName && actualName && expectedName !== actualName) {
|
|
1585
2077
|
suggestions.push(
|
|
1586
2078
|
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
1587
2079
|
);
|
|
1588
2080
|
}
|
|
1589
|
-
if (Array.isArray(expected
|
|
2081
|
+
if (Array.isArray(expected == null ? void 0 : expected.functions)) {
|
|
1590
2082
|
suggestions.push(
|
|
1591
2083
|
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
1592
2084
|
);
|
|
@@ -1641,7 +2133,7 @@ function suggestFromErrorType(error_type, suggestions) {
|
|
|
1641
2133
|
}
|
|
1642
2134
|
function suggestFixFromDiff(parsed) {
|
|
1643
2135
|
const suggestions = [];
|
|
1644
|
-
const { error_type, expected, actual, diff } = parsed
|
|
2136
|
+
const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
|
|
1645
2137
|
if (!Array.isArray(diff)) {
|
|
1646
2138
|
if (suggestions.length === 0 && typeof error_type === "string") {
|
|
1647
2139
|
suggestFromErrorType(error_type, suggestions);
|
|
@@ -1666,15 +2158,16 @@ function suggestFixFromDiff(parsed) {
|
|
|
1666
2158
|
return uniqueLines(suggestions);
|
|
1667
2159
|
}
|
|
1668
2160
|
function getTestIdFromLogLine(line) {
|
|
2161
|
+
var _a, _b;
|
|
1669
2162
|
if (line.startsWith("[FAIL]")) {
|
|
1670
2163
|
const m = line.match(FAIL_ID_REGEX);
|
|
1671
|
-
return m
|
|
2164
|
+
return m == null ? void 0 : m[1];
|
|
1672
2165
|
}
|
|
1673
2166
|
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
1674
2167
|
try {
|
|
1675
2168
|
const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1676
|
-
return String(parsed
|
|
1677
|
-
} catch {
|
|
2169
|
+
return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
|
|
2170
|
+
} catch (e) {
|
|
1678
2171
|
}
|
|
1679
2172
|
}
|
|
1680
2173
|
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
@@ -1682,18 +2175,19 @@ function getTestIdFromLogLine(line) {
|
|
|
1682
2175
|
const parsed = JSON.parse(
|
|
1683
2176
|
line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
|
|
1684
2177
|
);
|
|
1685
|
-
return String(parsed
|
|
1686
|
-
} catch {
|
|
2178
|
+
return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
|
|
2179
|
+
} catch (e) {
|
|
1687
2180
|
}
|
|
1688
2181
|
}
|
|
1689
2182
|
return;
|
|
1690
2183
|
}
|
|
1691
2184
|
function groupLogsByTestId(failLogs) {
|
|
2185
|
+
var _a;
|
|
1692
2186
|
const byId = /* @__PURE__ */ new Map();
|
|
1693
2187
|
for (const line of failLogs) {
|
|
1694
2188
|
const id = getTestIdFromLogLine(line);
|
|
1695
|
-
const key = id
|
|
1696
|
-
const arr = byId.get(key)
|
|
2189
|
+
const key = id != null ? id : "__general__";
|
|
2190
|
+
const arr = (_a = byId.get(key)) != null ? _a : [];
|
|
1697
2191
|
arr.push(line);
|
|
1698
2192
|
byId.set(key, arr);
|
|
1699
2193
|
}
|
|
@@ -1705,10 +2199,10 @@ function collectDebugIds(lines) {
|
|
|
1705
2199
|
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
1706
2200
|
try {
|
|
1707
2201
|
const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
|
|
1708
|
-
if (parsed
|
|
2202
|
+
if (parsed == null ? void 0 : parsed.id) {
|
|
1709
2203
|
debugIds.add(String(parsed.id));
|
|
1710
2204
|
}
|
|
1711
|
-
} catch {
|
|
2205
|
+
} catch (e) {
|
|
1712
2206
|
}
|
|
1713
2207
|
}
|
|
1714
2208
|
}
|
|
@@ -1744,7 +2238,7 @@ function displayDebugFailLine(line) {
|
|
|
1744
2238
|
console.log(` \u2022 ${s}`);
|
|
1745
2239
|
}
|
|
1746
2240
|
}
|
|
1747
|
-
} catch {
|
|
2241
|
+
} catch (e) {
|
|
1748
2242
|
console.log(` ${line}`);
|
|
1749
2243
|
}
|
|
1750
2244
|
}
|
|
@@ -1788,14 +2282,14 @@ function displayDebugFailContextLine(line) {
|
|
|
1788
2282
|
const ctx = JSON.parse(payload);
|
|
1789
2283
|
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
1790
2284
|
displayContextInfo(ctx);
|
|
1791
|
-
} catch {
|
|
2285
|
+
} catch (e) {
|
|
1792
2286
|
console.log(` ${line}`);
|
|
1793
2287
|
}
|
|
1794
2288
|
}
|
|
1795
2289
|
function displayLogLine(line, debugIds) {
|
|
1796
2290
|
if (line.startsWith("[FAIL]")) {
|
|
1797
2291
|
const m = line.match(FAIL_ID_REGEX);
|
|
1798
|
-
const failId = m
|
|
2292
|
+
const failId = m == null ? void 0 : m[1];
|
|
1799
2293
|
if (failId && debugIds.has(failId)) {
|
|
1800
2294
|
return;
|
|
1801
2295
|
}
|
|
@@ -1865,11 +2359,12 @@ function displayResultHeader(r) {
|
|
|
1865
2359
|
);
|
|
1866
2360
|
}
|
|
1867
2361
|
function consoleDebugReporter(results) {
|
|
2362
|
+
var _a;
|
|
1868
2363
|
console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
|
|
1869
2364
|
for (const r of results) {
|
|
1870
2365
|
displayResultHeader(r);
|
|
1871
2366
|
displayMetrics(Object.entries(r.result.metrics));
|
|
1872
|
-
if (r.result.logs
|
|
2367
|
+
if ((_a = r.result.logs) == null ? void 0 : _a.length) {
|
|
1873
2368
|
displayResultLogs(r.result.logs);
|
|
1874
2369
|
}
|
|
1875
2370
|
}
|
|
@@ -1878,13 +2373,16 @@ function consoleDebugReporter(results) {
|
|
|
1878
2373
|
|
|
1879
2374
|
// src/reporters/json.ts
|
|
1880
2375
|
/**
 * Prints the evaluation results to stdout as pretty-printed JSON.
 *
 * Each result's `error` is flattened to a string so it survives
 * serialization: an error's `message` when it has one, otherwise
 * `String(error)`. This keeps non-Error failures such as a thrown string
 * visible in the report. A missing error stays absent from the output.
 *
 * @param {Array<{result: {error?: *}}>} results - Evaluation results; all
 *   other fields are copied through unchanged.
 * @returns {void}
 */
function jsonReporter(results) {
  const serializableResults = results.map((r) => {
    const rawError = r.result.error;
    let error;
    if (rawError != null) {
      error = typeof rawError.message === "string" ? rawError.message : String(rawError);
    }
    return {
      ...r,
      result: {
        ...r.result,
        error
      }
    };
  });
  console.log(JSON.stringify(serializableResults, null, 2));
}
|
|
1890
2388
|
|
|
@@ -1996,6 +2494,7 @@ export {
|
|
|
1996
2494
|
bfclParallelBenchmark,
|
|
1997
2495
|
bfclParallelMultipleBenchmark,
|
|
1998
2496
|
bfclSimpleBenchmark,
|
|
2497
|
+
complexFuncBenchBenchmark,
|
|
1999
2498
|
evaluate,
|
|
2000
2499
|
jsonGenerationBenchmark,
|
|
2001
2500
|
jsonGenerationSchemaOnlyBenchmark
|