@ai-sdk-tool/eval 1.0.0-canary.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/{BFCL_v3_parallel.jsonl → BFCL_v4_parallel.jsonl} +2 -2
- package/data/{BFCL_v3_parallel_possible_answer.jsonl → BFCL_v4_parallel_possible_answer.jsonl} +2 -2
- package/data/BFCL_v4_simple.jsonl +400 -0
- package/data/BFCL_v4_simple_possible_answer.jsonl +400 -0
- package/dist/index.cjs +715 -210
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -3
- package/dist/index.d.ts +49 -3
- package/dist/index.js +715 -210
- package/dist/index.js.map +1 -1
- package/package.json +6 -5
- package/data/BFCL_v3_simple.jsonl +0 -400
- package/data/BFCL_v3_simple_possible_answer.jsonl +0 -400
- /package/data/{BFCL_v3_multiple.jsonl → BFCL_v4_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_multiple_possible_answer.jsonl → BFCL_v4_multiple_possible_answer.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple.jsonl → BFCL_v4_parallel_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple_possible_answer.jsonl → BFCL_v4_parallel_multiple_possible_answer.jsonl} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -407,6 +407,7 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
407
407
|
// src/benchmarks/bfcl.ts
|
|
408
408
|
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
409
409
|
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
410
|
+
var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
|
|
410
411
|
function convertGroundTruthToXML(call) {
|
|
411
412
|
const keys = Object.keys(call);
|
|
412
413
|
if (keys.length === 0) {
|
|
@@ -438,45 +439,67 @@ function convertGroundTruthToXML(call) {
|
|
|
438
439
|
xml += `</${funcName}>`;
|
|
439
440
|
return xml;
|
|
440
441
|
}
|
|
442
|
+
/**
 * Maps a BFCL test-case id to the checker category it belongs to.
 *
 * Known prefixes are matched first; any other id falls back to its first
 * underscore-separated segment.
 *
 * @param {string} id - Test-case id, e.g. "parallel_multiple_12" or "simple_python_3".
 * @returns {string} "parallel_multiple", "parallel", "multiple", "simple", or the
 *   first "_"-separated segment of the id when no known prefix matches.
 */
function extractCategory(id) {
  // Order matters: "parallel_multiple" must be tested before "parallel".
  // "simple" also covers the simple_python / simple_java / simple_javascript ids.
  const knownCategories = ["parallel_multiple", "parallel", "multiple", "simple"];
  for (const category of knownCategories) {
    if (id.startsWith(category)) {
      return category;
    }
  }
  return id.split("_")[0];
}
|
|
441
466
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
442
|
-
const category = testCase.id
|
|
467
|
+
const category = extractCategory(testCase.id);
|
|
443
468
|
try {
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
469
|
+
switch (category) {
|
|
470
|
+
case "simple": {
|
|
471
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
472
|
+
return {
|
|
473
|
+
valid: false,
|
|
474
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
475
|
+
error_type: "simple:wrong_count"
|
|
476
|
+
};
|
|
477
|
+
}
|
|
478
|
+
return simpleFunctionChecker(
|
|
479
|
+
testCase.function[0],
|
|
480
|
+
modelOutput[0],
|
|
481
|
+
possibleAnswer.ground_truth[0]
|
|
482
|
+
);
|
|
483
|
+
}
|
|
484
|
+
case "multiple": {
|
|
485
|
+
return multipleFunctionChecker(
|
|
486
|
+
testCase.function,
|
|
487
|
+
modelOutput,
|
|
488
|
+
possibleAnswer.ground_truth
|
|
489
|
+
);
|
|
490
|
+
}
|
|
491
|
+
case "parallel":
|
|
492
|
+
case "parallel_multiple": {
|
|
493
|
+
return parallelFunctionCheckerNoOrder(
|
|
494
|
+
testCase.function,
|
|
495
|
+
modelOutput,
|
|
496
|
+
possibleAnswer.ground_truth
|
|
497
|
+
);
|
|
498
|
+
}
|
|
499
|
+
default: {
|
|
500
|
+
return { valid: true };
|
|
451
501
|
}
|
|
452
|
-
return simpleFunctionChecker(
|
|
453
|
-
testCase.function[0],
|
|
454
|
-
modelOutput[0],
|
|
455
|
-
possibleAnswer.ground_truth[0]
|
|
456
|
-
);
|
|
457
|
-
}
|
|
458
|
-
if (category === "parallel") {
|
|
459
|
-
return parallelFunctionCheckerNoOrder(
|
|
460
|
-
testCase.function,
|
|
461
|
-
modelOutput,
|
|
462
|
-
possibleAnswer.ground_truth
|
|
463
|
-
);
|
|
464
|
-
}
|
|
465
|
-
if (category === "multiple") {
|
|
466
|
-
return multipleFunctionChecker(
|
|
467
|
-
testCase.function,
|
|
468
|
-
modelOutput,
|
|
469
|
-
possibleAnswer.ground_truth
|
|
470
|
-
);
|
|
471
|
-
}
|
|
472
|
-
if (category.includes("parallel-multiple")) {
|
|
473
|
-
return parallelFunctionCheckerNoOrder(
|
|
474
|
-
testCase.function,
|
|
475
|
-
modelOutput,
|
|
476
|
-
possibleAnswer.ground_truth
|
|
477
|
-
);
|
|
478
502
|
}
|
|
479
|
-
return { valid: true };
|
|
480
503
|
} catch (e) {
|
|
481
504
|
return {
|
|
482
505
|
valid: false,
|
|
@@ -654,7 +677,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
654
677
|
return `- expected one of: ${formatted}`;
|
|
655
678
|
})();
|
|
656
679
|
diffLines.push(expectedLine);
|
|
657
|
-
diffLines.push(`+
|
|
680
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
658
681
|
return diffLines;
|
|
659
682
|
};
|
|
660
683
|
const paramValueMatches = (allowed, got) => {
|
|
@@ -871,44 +894,97 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
871
894
|
);
|
|
872
895
|
}
|
|
873
896
|
};
|
|
874
|
-
const
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
flatMessages,
|
|
879
|
-
mwOriginalText,
|
|
880
|
-
text,
|
|
881
|
-
finishReason,
|
|
882
|
-
mwParsedToolCalls,
|
|
883
|
-
restoredCalls,
|
|
884
|
-
possibleAnswer
|
|
885
|
-
} = options;
|
|
886
|
-
const lastUser = (() => {
|
|
887
|
-
var _a;
|
|
888
|
-
const reversed = [...flatMessages].reverse();
|
|
889
|
-
const found = reversed.find(
|
|
890
|
-
(m) => m.role === "user"
|
|
891
|
-
);
|
|
892
|
-
return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
|
|
893
|
-
})();
|
|
894
|
-
const rawModelText = (() => {
|
|
895
|
-
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
896
|
-
return mwOriginalText;
|
|
897
|
+
/**
 * Reports whether any "+ got:" or "- expected:" diff line carries a number
 * (the first one captured by DIFF_NUMERIC_EXTRACT_REGEX) between 1 and 100
 * inclusive.
 *
 * @param {string[]} diff - Diff lines produced for a failed case.
 * @returns {boolean} True when at least one such line is found.
 */
const hasPercentPattern = (diff) => {
  const lineLooksLikePercent = (line) => {
    const isValueLine = line.startsWith("+ got:") || line.startsWith("- expected:");
    if (!isValueLine) {
      return false;
    }
    const captured = line.match(DIFF_NUMERIC_EXTRACT_REGEX);
    if (!captured) {
      return false;
    }
    const value = Number.parseFloat(captured[1]);
    return value >= 1 && value <= 100;
  };
  return diff.some((line) => lineLooksLikePercent(line));
};
|
|
910
|
+
/**
 * True when the checker error type mentions "value_error" or the diff
 * contains a line starting with "@@ param".
 *
 * @param {string|undefined|null} errorType - Checker error type, if any.
 * @param {string[]} diff - Diff lines for the failed case.
 * @returns {boolean}
 */
const isValueError = (errorType, diff) => {
  const flaggedByType = errorType != null && Boolean(errorType.includes("value_error"));
  if (flaggedByType) {
    return true;
  }
  return diff.some((line) => line.startsWith("@@ param"));
};
|
|
913
|
+
/**
 * True when the checker error type mentions "wrong_func_name" or any diff
 * line contains the text "function name".
 *
 * @param {string|undefined|null} errorType - Checker error type, if any.
 * @param {string[]} diff - Diff lines for the failed case.
 * @returns {boolean}
 */
const isFunctionNameError = (errorType, diff) => {
  const flaggedByType = errorType != null && Boolean(errorType.includes("wrong_func_name"));
  if (flaggedByType) {
    return true;
  }
  return diff.some((line) => line.includes("function name"));
};
|
|
916
|
+
/**
 * True when the checker error type mentions "missing_required" or any diff
 * line contains the text "missing required param".
 *
 * @param {string|undefined|null} errorType - Checker error type, if any.
 * @param {string[]} diff - Diff lines for the failed case.
 * @returns {boolean}
 */
const isMissingParamError = (errorType, diff) => {
  const flaggedByType = errorType != null && Boolean(errorType.includes("missing_required"));
  if (flaggedByType) {
    return true;
  }
  return diff.some((line) => line.includes("missing required param"));
};
|
|
919
|
+
/**
 * True when the checker error type mentions "unexpected_param" or any diff
 * line contains the text "unexpected param".
 *
 * @param {string|undefined|null} errorType - Checker error type, if any.
 * @param {string[]} diff - Diff lines for the failed case.
 * @returns {boolean}
 */
const isUnexpectedParamError = (errorType, diff) => {
  const flaggedByType = errorType != null && Boolean(errorType.includes("unexpected_param"));
  if (flaggedByType) {
    return true;
  }
  return diff.some((line) => line.includes("unexpected param"));
};
|
|
922
|
+
/**
 * Classifies a failure from its checker error type and diff lines.
 *
 * Checks run in a fixed order and the first hit wins: value error (labelled
 * PARAM_VALUE_PERCENT or PARAM_VALUE_MISMATCH depending on hasPercentPattern),
 * wrong function name, missing params, unexpected params, and finally a
 * "cannot_find_match" error type.
 *
 * @param {string|undefined|null} errorType - Checker error type, if any.
 * @param {string[]} diff - Diff lines for the failed case.
 * @returns {string|null} The category label, or null when nothing matched.
 */
const classifyByErrorPatterns = (errorType, diff) => {
  // Computed up front, before any classifier runs.
  const valueLabel = hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH";
  if (isValueError(errorType, diff)) {
    return valueLabel;
  }
  if (isFunctionNameError(errorType, diff)) {
    return "WRONG_FUNCTION";
  }
  if (isMissingParamError(errorType, diff)) {
    return "MISSING_PARAMS";
  }
  if (isUnexpectedParamError(errorType, diff)) {
    return "UNEXPECTED_PARAMS";
  }
  const noMatch = errorType == null ? void 0 : errorType.includes("cannot_find_match");
  return noMatch ? "NO_MATCH" : null;
};
|
|
942
|
+
/**
 * Classifies a failure purely from how many tool calls were produced.
 *
 * @param {number} actualCount - Number of calls the model produced.
 * @param {number} expectedCount - Number of calls the ground truth expects.
 * @returns {string|null} "EXTRA_CALLS" when there are more calls than expected,
 *   "PARSE_FAILURE" when none were produced but some were expected,
 *   "PARTIAL_CALLS" when some but fewer than expected were produced,
 *   otherwise null.
 */
const classifyByCallCount = (actualCount, expectedCount) => {
  // The three conditions are mutually exclusive, so their order is free.
  if (actualCount > expectedCount) {
    return "EXTRA_CALLS";
  }
  if (actualCount === 0) {
    return expectedCount > 0 ? "PARSE_FAILURE" : null;
  }
  const isPartial = actualCount > 0 && actualCount < expectedCount;
  return isPartial ? "PARTIAL_CALLS" : null;
};
|
|
954
|
+
/**
 * Picks the failure category for a failed case.
 *
 * Call-count classification takes precedence; error/diff pattern
 * classification is consulted next; "OTHER" is the fallback.
 *
 * @param {object} options
 * @param {string} [options.errorType] - Checker error type.
 * @param {unknown} options.restoredCalls - Calls produced by the model; a non-array counts as zero calls.
 * @param {number} options.expectedCount - Number of expected calls.
 * @param {string[]} options.diff - Diff lines for the failed case.
 * @returns {string} The category label.
 */
const classifyFailureType = (options) => {
  const producedCalls = options.restoredCalls;
  const producedCount = Array.isArray(producedCalls) ? producedCalls.length : 0;
  return classifyByCallCount(producedCount, options.expectedCount) || classifyByErrorPatterns(options.errorType, options.diff) || "OTHER";
};
|
|
970
|
+
/**
 * Chooses the raw model text to log: the middleware-provided original text
 * when it is non-empty, otherwise `text` when it is a string, otherwise "".
 *
 * @param {string|undefined|null} mwOriginalText - Original text reported by the middleware.
 * @param {unknown} text - Text returned by the model call.
 * @returns {string}
 */
const extractRawModelText = (mwOriginalText, text) => {
  const hasMiddlewareText = Boolean(mwOriginalText) && mwOriginalText.length > 0;
  if (hasMiddlewareText) {
    return mwOriginalText;
  }
  return typeof text === "string" ? text : "";
};
|
|
979
|
+
/**
 * Returns the content of the most recent message whose role is "user",
 * shortened to 200 characters plus "..." when it is a longer string.
 *
 * The list is scanned from the end, so no copy of it is made. Non-string
 * content is returned unchanged rather than being sliced and interpolated
 * into a string.
 *
 * @param {Array<{role: string, content?: unknown}>} flatMessages - Conversation messages, oldest first.
 * @returns {unknown} The (possibly truncated) content, or "" when there is no
 *   user message or its content is null/undefined.
 */
const extractLastUserQuery = (flatMessages) => {
  let content = "";
  for (let index = flatMessages.length - 1; index >= 0; index -= 1) {
    const message = flatMessages[index];
    if (message.role === "user") {
      content = message.content != null ? message.content : "";
      break;
    }
  }
  if (typeof content !== "string") {
    // Only strings can be truncated meaningfully; leave anything else as-is.
    return content;
  }
  return content.length > 200 ? `${content.slice(0, 200)}...` : content;
};
|
|
986
|
+
/**
 * Shortens `text` to `maxLen` characters followed by "..." when it is longer
 * than `maxLen`; shorter or equal-length text is returned unchanged.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLen - Maximum number of characters to keep.
 * @returns {string}
 */
const truncateText = (text, maxLen) => {
  if (text.length > maxLen) {
    const head = text.slice(0, maxLen);
    return `${head}...`;
  }
  return text;
};
|
|
913
989
|
const logFailureDetails = (options) => {
|
|
914
990
|
const {
|
|
@@ -926,42 +1002,36 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
926
1002
|
} = options;
|
|
927
1003
|
try {
|
|
928
1004
|
const category = testCase.id.split("_")[0];
|
|
929
|
-
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
caseLogs.push(
|
|
939
|
-
`[DEBUG-FAIL] ${JSON.stringify({
|
|
940
|
-
id: testCase.id,
|
|
941
|
-
message: checkerResult.error,
|
|
942
|
-
error_type: checkerResult.error_type,
|
|
943
|
-
expected,
|
|
944
|
-
actual,
|
|
945
|
-
diff
|
|
946
|
-
})}`
|
|
947
|
-
);
|
|
948
|
-
try {
|
|
949
|
-
const contextPayload = buildFailureContext({
|
|
950
|
-
testCase,
|
|
951
|
-
tools,
|
|
952
|
-
flatMessages,
|
|
953
|
-
mwOriginalText,
|
|
954
|
-
text,
|
|
955
|
-
finishReason,
|
|
956
|
-
mwParsedToolCalls,
|
|
1005
|
+
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
|
|
1006
|
+
const gtArr = possibleAnswer.ground_truth;
|
|
1007
|
+
const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
|
|
1008
|
+
const rawModelText = extractRawModelText(mwOriginalText, text);
|
|
1009
|
+
const lastUserQuery = extractLastUserQuery(flatMessages);
|
|
1010
|
+
const failurePayload = {
|
|
1011
|
+
id: testCase.id,
|
|
1012
|
+
category: classifyFailureType({
|
|
1013
|
+
errorType: checkerResult.error_type,
|
|
957
1014
|
restoredCalls,
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
1015
|
+
expectedCount,
|
|
1016
|
+
diff
|
|
1017
|
+
}),
|
|
1018
|
+
message: checkerResult.error,
|
|
1019
|
+
error_type: checkerResult.error_type,
|
|
1020
|
+
expected,
|
|
1021
|
+
actual,
|
|
1022
|
+
diff,
|
|
1023
|
+
context: {
|
|
1024
|
+
raw_model_text: truncateText(rawModelText, 500),
|
|
1025
|
+
raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
|
|
1026
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1027
|
+
expected_count: expectedCount,
|
|
1028
|
+
actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
|
|
1029
|
+
finish_reason: finishReason,
|
|
1030
|
+
last_user_query: lastUserQuery,
|
|
1031
|
+
tool_names: tools.map((t) => t.name)
|
|
1032
|
+
}
|
|
1033
|
+
};
|
|
1034
|
+
caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
|
|
965
1035
|
} catch (e) {
|
|
966
1036
|
caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
|
|
967
1037
|
}
|
|
@@ -1186,14 +1256,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1186
1256
|
};
|
|
1187
1257
|
}
|
|
1188
1258
|
const score = correctCount / testCases.length;
|
|
1259
|
+
const caseResults = resultsPerCase.map((r, i) => ({
|
|
1260
|
+
id: testCases[i].id,
|
|
1261
|
+
valid: r.valid
|
|
1262
|
+
}));
|
|
1189
1263
|
return {
|
|
1190
1264
|
score,
|
|
1191
1265
|
success: score > 0.95,
|
|
1192
|
-
// High success threshold as requested
|
|
1193
1266
|
metrics: {
|
|
1194
1267
|
correct_count: correctCount,
|
|
1195
1268
|
total_cases: testCases.length,
|
|
1196
|
-
accuracy: score
|
|
1269
|
+
accuracy: score,
|
|
1270
|
+
case_results: JSON.stringify(caseResults)
|
|
1197
1271
|
},
|
|
1198
1272
|
logs
|
|
1199
1273
|
};
|
|
@@ -1213,27 +1287,27 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1213
1287
|
}
|
|
1214
1288
|
var bfclSimpleBenchmark = createBfclBenchmark(
|
|
1215
1289
|
"bfcl-simple",
|
|
1216
|
-
"BFCL Simple Function Calling",
|
|
1217
|
-
"
|
|
1218
|
-
"
|
|
1290
|
+
"BFCL v4 Simple Function Calling",
|
|
1291
|
+
"BFCL_v4_simple.jsonl",
|
|
1292
|
+
"BFCL_v4_simple_possible_answer.jsonl"
|
|
1219
1293
|
);
|
|
1220
1294
|
var bfclParallelBenchmark = createBfclBenchmark(
|
|
1221
1295
|
"bfcl-parallel",
|
|
1222
|
-
"BFCL Parallel Function Calling",
|
|
1223
|
-
"
|
|
1224
|
-
"
|
|
1296
|
+
"BFCL v4 Parallel Function Calling",
|
|
1297
|
+
"BFCL_v4_parallel.jsonl",
|
|
1298
|
+
"BFCL_v4_parallel_possible_answer.jsonl"
|
|
1225
1299
|
);
|
|
1226
1300
|
var bfclMultipleBenchmark = createBfclBenchmark(
|
|
1227
1301
|
"bfcl-multiple",
|
|
1228
|
-
"BFCL Multiple Function Calling",
|
|
1229
|
-
"
|
|
1230
|
-
"
|
|
1302
|
+
"BFCL v4 Multiple Function Calling",
|
|
1303
|
+
"BFCL_v4_multiple.jsonl",
|
|
1304
|
+
"BFCL_v4_multiple_possible_answer.jsonl"
|
|
1231
1305
|
);
|
|
1232
1306
|
var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
1233
1307
|
"bfcl-parallel-multiple",
|
|
1234
|
-
"BFCL Parallel & Multiple Function Calling",
|
|
1235
|
-
"
|
|
1236
|
-
"
|
|
1308
|
+
"BFCL v4 Parallel & Multiple Function Calling",
|
|
1309
|
+
"BFCL_v4_parallel_multiple.jsonl",
|
|
1310
|
+
"BFCL_v4_parallel_multiple_possible_answer.jsonl"
|
|
1237
1311
|
);
|
|
1238
1312
|
|
|
1239
1313
|
// src/benchmarks/complex-func-bench.ts
|
|
@@ -1960,23 +2034,28 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1960
2034
|
}
|
|
1961
2035
|
};
|
|
1962
2036
|
|
|
2037
|
+
// src/evaluate.ts
|
|
2038
|
+
var import_middleware = require("@ai-sdk-tool/middleware");
|
|
2039
|
+
var import_ai4 = require("ai");
|
|
2040
|
+
|
|
1963
2041
|
// src/reporters/console.ts
|
|
1964
2042
|
var colors = {
|
|
1965
2043
|
reset: "\x1B[0m",
|
|
2044
|
+
bold: "\x1B[1m",
|
|
1966
2045
|
green: "\x1B[32m",
|
|
1967
2046
|
red: "\x1B[31m",
|
|
1968
2047
|
yellow: "\x1B[33m",
|
|
1969
2048
|
cyan: "\x1B[36m",
|
|
1970
2049
|
magenta: "\x1B[35m",
|
|
1971
2050
|
gray: "\x1B[90m",
|
|
1972
|
-
white: "\x1B[37m"
|
|
1973
|
-
bgRed: "\x1B[41m"
|
|
2051
|
+
white: "\x1B[37m"
|
|
1974
2052
|
};
|
|
2053
|
+
var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
|
|
1975
2054
|
function formatDiff(diff) {
|
|
1976
2055
|
if (!diff || diff.length === 0) {
|
|
1977
2056
|
return "";
|
|
1978
2057
|
}
|
|
1979
|
-
return diff.map((line) => {
|
|
2058
|
+
return diff.slice(0, 8).map((line) => {
|
|
1980
2059
|
if (line.startsWith("-")) {
|
|
1981
2060
|
return `${colors.red}${line}${colors.reset}`;
|
|
1982
2061
|
}
|
|
@@ -1989,65 +2068,106 @@ function formatDiff(diff) {
|
|
|
1989
2068
|
return line;
|
|
1990
2069
|
}).join("\n ");
|
|
1991
2070
|
}
|
|
1992
|
-
function
|
|
1993
|
-
const
|
|
1994
|
-
for (const log of
|
|
2071
|
+
/**
 * Collects the JSON payloads of all "[DEBUG-FAIL] " log lines.
 *
 * Lines that do not match DEBUG_FAIL_REGEX are ignored; lines whose payload
 * is not valid JSON are skipped silently.
 *
 * @param {string[]} logs - Benchmark log lines.
 * @returns {object[]} Parsed failure payloads, in log order.
 */
function parseFailures(logs) {
  const collected = [];
  for (const entry of logs) {
    if (DEBUG_FAIL_REGEX.test(entry)) {
      try {
        const payloadText = entry.replace(DEBUG_FAIL_REGEX, "");
        collected.push(JSON.parse(payloadText));
      } catch (e) {
        // Best effort: a malformed payload is dropped.
      }
    }
  }
  return collected;
}
|
|
2086
|
+
/**
 * Buckets failures by their `category`, using "OTHER" when it is missing or falsy.
 *
 * @param {Array<{category?: string}>} failures - Parsed failure payloads.
 * @returns {Map<string, object[]>} Category -> failures, in first-seen order.
 */
function groupFailuresByCategory(failures) {
  const buckets = /* @__PURE__ */ new Map();
  for (const entry of failures) {
    const key = entry.category || "OTHER";
    const bucket = buckets.get(key);
    if (!bucket) {
      buckets.set(key, [entry]);
      continue;
    }
    bucket.push(entry);
  }
  return buckets;
}
|
|
2099
|
+
/**
 * Prints a short console summary of one failure: id and category, then the
 * message and formatted diff when present. For PARSE_FAILURE entries it also
 * prints the first 80 characters of the raw model text.
 *
 * @param {object} failure - Parsed failure payload.
 * @returns {void}
 */
function printCompactFailure(failure) {
  const categoryLabel = failure.category || "OTHER";
  console.log(
    `
${colors.red}${failure.id}${colors.reset} [${colors.yellow}${categoryLabel}${colors.reset}]`
  );
  if (failure.message) {
    console.log(` ${failure.message}`);
  }
  const diffLines = failure.diff;
  if (diffLines && diffLines.length > 0) {
    console.log(` ${formatDiff(diffLines)}`);
  }
  const context = failure.context;
  const rawText = context == null ? void 0 : context.raw_model_text;
  if (rawText && failure.category === "PARSE_FAILURE") {
    const preview = rawText.length > 80 ? `${rawText.slice(0, 80)}...` : rawText;
    console.log(` ${colors.gray}Model: "${preview}"${colors.reset}`);
  }
}
|
|
2117
|
+
/**
 * Prints the per-category failure counts (largest first), then compact
 * details for the first five failures, then a one-line note listing up to
 * five ids of whatever remains.
 *
 * @param {object[]} failures - Parsed failure payloads.
 * @returns {void}
 */
function printFailureSummary(failures) {
  const maxToShow = 5;
  const byCountDesc = [...groupFailuresByCategory(failures).entries()].sort(
    (left, right) => right[1].length - left[1].length
  );
  console.log(`
${colors.bold}Failures by category:${colors.reset}`);
  byCountDesc.forEach(([category, members]) => {
    console.log(
      ` ${colors.yellow}${category}${colors.reset}: ${members.length}`
    );
  });
  failures.slice(0, maxToShow).forEach((failure) => {
    printCompactFailure(failure);
  });
  if (failures.length <= maxToShow) {
    return;
  }
  const remaining = failures.length - maxToShow;
  const remainingIds = failures.slice(maxToShow).map((f) => f.id);
  const idPreview = remainingIds.slice(0, 5).join(", ");
  const more = remainingIds.length > 5 ? "..." : "";
  console.log(
    `
${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
  );
}
|
|
2021
2145
|
function printResult(result) {
|
|
2022
2146
|
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
2023
|
-
const
|
|
2147
|
+
const passed = benchmarkResult.metrics.correct_count;
|
|
2148
|
+
const total = benchmarkResult.metrics.total_cases;
|
|
2149
|
+
const scorePercent = (benchmarkResult.score * 100).toFixed(1);
|
|
2150
|
+
const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
|
|
2151
|
+
const statusColor = benchmarkResult.success ? colors.green : colors.red;
|
|
2024
2152
|
console.log(
|
|
2025
2153
|
`
|
|
2026
2154
|
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
2027
2155
|
);
|
|
2028
2156
|
console.log(
|
|
2029
|
-
` \u2514 ${
|
|
2157
|
+
` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
|
|
2030
2158
|
);
|
|
2031
|
-
const metrics = Object.entries(benchmarkResult.metrics);
|
|
2032
|
-
if (metrics.length > 0) {
|
|
2033
|
-
console.log(" Metrics:");
|
|
2034
|
-
for (const [key, value] of metrics) {
|
|
2035
|
-
console.log(` - ${key}: ${value}`);
|
|
2036
|
-
}
|
|
2037
|
-
}
|
|
2038
2159
|
if (benchmarkResult.error) {
|
|
2039
2160
|
console.log(
|
|
2040
2161
|
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
2041
2162
|
);
|
|
2042
2163
|
}
|
|
2043
2164
|
if (!benchmarkResult.success && benchmarkResult.logs) {
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
(
|
|
2047
|
-
)
|
|
2048
|
-
|
|
2049
|
-
|
|
2050
|
-
for (const l of benchmarkResult.logs.slice(0, 10)) {
|
|
2165
|
+
const failures = parseFailures(benchmarkResult.logs);
|
|
2166
|
+
if (failures.length > 0) {
|
|
2167
|
+
printFailureSummary(failures);
|
|
2168
|
+
} else if (benchmarkResult.logs.length > 0) {
|
|
2169
|
+
console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
|
|
2170
|
+
for (const l of benchmarkResult.logs.slice(0, 5)) {
|
|
2051
2171
|
console.log(` ${l}`);
|
|
2052
2172
|
}
|
|
2053
2173
|
}
|
|
@@ -2406,6 +2526,326 @@ function consoleDebugReporter(results) {
|
|
|
2406
2526
|
console.log("\n------------------------------------\n");
|
|
2407
2527
|
}
|
|
2408
2528
|
|
|
2529
|
+
// src/reporters/console.summary.ts
|
|
2530
|
+
var colors3 = {
|
|
2531
|
+
reset: "\x1B[0m",
|
|
2532
|
+
bold: "\x1B[1m",
|
|
2533
|
+
dim: "\x1B[2m",
|
|
2534
|
+
green: "\x1B[32m",
|
|
2535
|
+
red: "\x1B[31m",
|
|
2536
|
+
yellow: "\x1B[33m",
|
|
2537
|
+
cyan: "\x1B[36m",
|
|
2538
|
+
magenta: "\x1B[35m",
|
|
2539
|
+
gray: "\x1B[90m",
|
|
2540
|
+
white: "\x1B[37m"
|
|
2541
|
+
};
|
|
2542
|
+
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
|
|
2543
|
+
var ID_NUM_REGEX = /_(\d+)$/;
|
|
2544
|
+
var REASONING_TAG = "think";
|
|
2545
|
+
var MAX_FAILURES_TO_DISPLAY = 5;
|
|
2546
|
+
var CATEGORY_DESCRIPTIONS = {
|
|
2547
|
+
PARSE_FAILURE: {
|
|
2548
|
+
label: "Parse Failure",
|
|
2549
|
+
description: "No tool calls extracted from model output",
|
|
2550
|
+
hint: "Model may have responded in text instead of tool format"
|
|
2551
|
+
},
|
|
2552
|
+
PARTIAL_CALLS: {
|
|
2553
|
+
label: "Partial Calls",
|
|
2554
|
+
description: "Some expected tool calls missing",
|
|
2555
|
+
hint: "Model stopped early or missed some tools"
|
|
2556
|
+
},
|
|
2557
|
+
EXTRA_CALLS: {
|
|
2558
|
+
label: "Extra Calls",
|
|
2559
|
+
description: "More tool calls than expected",
|
|
2560
|
+
hint: "Model called tools that weren't needed"
|
|
2561
|
+
},
|
|
2562
|
+
PARAM_VALUE_PERCENT: {
|
|
2563
|
+
label: "Param Value (Percent)",
|
|
2564
|
+
description: "Percentage sent as integer instead of decimal",
|
|
2565
|
+
hint: "e.g., 5 instead of 0.05 for 5%"
|
|
2566
|
+
},
|
|
2567
|
+
PARAM_VALUE_MISMATCH: {
|
|
2568
|
+
label: "Param Value Mismatch",
|
|
2569
|
+
description: "Parameter values don't match expected"
|
|
2570
|
+
},
|
|
2571
|
+
WRONG_FUNCTION: {
|
|
2572
|
+
label: "Wrong Function",
|
|
2573
|
+
description: "Called wrong function name"
|
|
2574
|
+
},
|
|
2575
|
+
MISSING_PARAMS: {
|
|
2576
|
+
label: "Missing Params",
|
|
2577
|
+
description: "Required parameters not provided"
|
|
2578
|
+
},
|
|
2579
|
+
UNEXPECTED_PARAMS: {
|
|
2580
|
+
label: "Unexpected Params",
|
|
2581
|
+
description: "Extra parameters that shouldn't be there"
|
|
2582
|
+
},
|
|
2583
|
+
NO_MATCH: {
|
|
2584
|
+
label: "No Match",
|
|
2585
|
+
description: "Function called but couldn't match to expected",
|
|
2586
|
+
hint: "Parameters may be correct but don't match any expected combination"
|
|
2587
|
+
},
|
|
2588
|
+
OTHER: {
|
|
2589
|
+
label: "Other",
|
|
2590
|
+
description: "Uncategorized failure"
|
|
2591
|
+
}
|
|
2592
|
+
};
|
|
2593
|
+
/**
 * Extracts the JSON payloads from "[DEBUG-FAIL] " log lines.
 *
 * Lines not matching DEBUG_FAIL_REGEX2 are ignored. Payloads that fail to
 * parse, or that parse to null, are left out of the result.
 *
 * @param {string[]} logs - Benchmark log lines.
 * @returns {object[]} Parsed failure payloads, in log order.
 */
function parseFailureLogs(logs) {
  const parsePayload = (line) => {
    try {
      return JSON.parse(line.replace(DEBUG_FAIL_REGEX2, ""));
    } catch (e) {
      return null;
    }
  };
  const payloads = [];
  for (const line of logs) {
    if (!DEBUG_FAIL_REGEX2.test(line)) {
      continue;
    }
    const payload = parsePayload(line);
    if (payload !== null) {
      payloads.push(payload);
    }
  }
  return payloads;
}
|
|
2603
|
+
/**
 * Groups failures by `category` ("OTHER" when missing or falsy). Each group
 * is an object holding its members under `failures`.
 *
 * @param {Array<{category?: string}>} failures - Parsed failure payloads.
 * @returns {Map<string, {failures: object[]}>} Category -> group, in first-seen order.
 */
function groupByCategory(failures) {
  const buckets = /* @__PURE__ */ new Map();
  for (const item of failures) {
    const key = item.category || "OTHER";
    const bucket = buckets.get(key);
    if (!bucket) {
      buckets.set(key, { failures: [item] });
      continue;
    }
    bucket.failures.push(item);
  }
  return buckets;
}
|
|
2616
|
+
/**
 * Gathers the distinct parameter names announced by "@@ param " diff lines
 * across all failures. Failures without a diff are skipped.
 *
 * @param {Array<{diff?: string[]}>} failures - Parsed failure payloads.
 * @returns {Set<string>} The text following "@@ param " on each such line.
 */
function extractParamNames(failures) {
  const marker = "@@ param ";
  const names = /* @__PURE__ */ new Set();
  for (const failure of failures) {
    if (failure.diff) {
      for (const line of failure.diff) {
        if (line.startsWith(marker)) {
          names.add(line.replace(marker, ""));
        }
      }
    }
  }
  return names;
}
|
|
2630
|
+
/**
 * Gathers the distinct, truthy `context.finish_reason` values of the given
 * failures, each converted to a string.
 *
 * @param {Array<{context?: {finish_reason?: unknown}}>} failures - Parsed failure payloads.
 * @returns {Set<string>}
 */
function extractFinishReasons(failures) {
  const reasons = /* @__PURE__ */ new Set();
  for (const failure of failures) {
    const context = failure.context;
    const reason = context == null ? void 0 : context.finish_reason;
    if (reason) {
      reasons.add(String(reason));
    }
  }
  return reasons;
}
|
|
2640
|
+
/**
 * Annotates a failure group with a `pattern` string when its members share
 * something noteworthy. Groups with fewer than two failures are left alone.
 *
 * - PARAM_VALUE_PERCENT: lists the affected parameter names, if any.
 * - PARSE_FAILURE: notes the finish reason when all members that report one
 *   report the same one.
 *
 * The category is read from the first failure of the group.
 *
 * @param {{failures: object[], pattern?: string}} group - Group to annotate in place.
 * @returns {void}
 */
function detectPatterns(group) {
  const members = group.failures;
  if (members.length < 2) {
    return;
  }
  switch (members[0].category) {
    case "PARAM_VALUE_PERCENT": {
      const affected = extractParamNames(members);
      if (affected.size > 0) {
        group.pattern = `Affected params: ${[...affected].join(", ")}`;
      }
      break;
    }
    case "PARSE_FAILURE": {
      const reasons = extractFinishReasons(members);
      if (reasons.size === 1) {
        group.pattern = `All finished with: ${[...reasons][0]}`;
      }
      break;
    }
    default:
      break;
  }
}
|
|
2659
|
+
/**
 * Picks the console colour for a diff line: green for "+" lines, red for "-"
 * lines, cyan for "@@" lines and white for everything else.
 *
 * @param {string} line - One diff line.
 * @returns {string} An entry of `colors3`.
 */
function getLineColor(line) {
  const prefixColors = [
    ["+", colors3.green],
    ["-", colors3.red],
    ["@@", colors3.cyan]
  ];
  const hit = prefixColors.find(([prefix]) => line.startsWith(prefix));
  return hit ? hit[1] : colors3.white;
}
|
|
2671
|
+
/**
 * Renders a function name or list of names for display: arrays are joined
 * with ", ", anything else is converted with String().
 *
 * @param {unknown} funcs - A function name or an array of names.
 * @returns {string}
 */
function formatFunctions(funcs) {
  return Array.isArray(funcs) ? funcs.join(", ") : String(funcs);
}
|
|
2677
|
+
/**
 * Prints the expected and actual function names of a failure, reading them
 * from `functions` (or `function`) on `failure.expected` / `failure.actual`.
 * An empty actual list is shown as "(none)" in red.
 *
 * @param {object} failure - Parsed failure payload.
 * @returns {void}
 */
function printExpectedActual(failure) {
  const pickFunctions = (side) => side.functions || side.function;
  const expectedFuncs = failure.expected ? pickFunctions(failure.expected) : void 0;
  if (expectedFuncs) {
    console.log(
      ` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expectedFuncs)}`
    );
  }
  const actualFuncs = failure.actual ? pickFunctions(failure.actual) : void 0;
  if (!actualFuncs) {
    return;
  }
  // An empty array is truthy, so "no calls at all" reaches this point.
  const nothingCalled = Array.isArray(actualFuncs) && actualFuncs.length === 0;
  const color = nothingCalled ? colors3.red : colors3.white;
  const text = nothingCalled ? "(none)" : formatFunctions(actualFuncs);
  console.log(
    ` ${colors3.gray}Actual:${colors3.reset} ${color}${text}${colors3.reset}`
  );
}
|
|
2698
|
+
/**
 * Prints a "Diff:" heading followed by the first MAX_FAILURES_TO_DISPLAY
 * lines of the diff, each coloured by getLineColor.
 *
 * @param {string[]} diff - Diff lines to print.
 * @returns {void}
 */
function printDiff(diff) {
  console.log(` ${colors3.gray}Diff:${colors3.reset}`);
  const visibleLines = diff.slice(0, MAX_FAILURES_TO_DISPLAY);
  visibleLines.forEach((line) => {
    console.log(` ${getLineColor(line)}${line}${colors3.reset}`);
  });
}
|
|
2705
|
+
/**
 * Strips reasoning sections delimited by the REASONING_TAG element from the
 * text and trims the result.
 *
 * Closed sections are removed first (shortest match), then an opening tag
 * without a closing tag is removed together with everything after it.
 *
 * @param {string} text - Raw model text.
 * @returns {string} The text without reasoning sections, trimmed.
 */
function removeReasoningTags(text) {
  // Escape regex metacharacters so the tags are matched literally.
  const escapeForRegex = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const openPattern = escapeForRegex(`<${REASONING_TAG}>`);
  const closePattern = escapeForRegex(`</${REASONING_TAG}>`);
  const closedSections = new RegExp(`${openPattern}[\\s\\S]*?${closePattern}`, "g");
  const danglingSection = new RegExp(`${openPattern}[\\s\\S]*`, "g");
  return text.replace(closedSections, "").replace(danglingSection, "").trim();
}
|
|
2720
|
+
/**
 * For PARSE_FAILURE entries, prints what the model said once reasoning
 * sections are removed, or a note that only reasoning was produced. Other
 * categories print nothing.
 *
 * The full raw text is preferred over the truncated one when available.
 *
 * @param {object} failure - Parsed failure payload.
 * @param {string} category - Category the failure is being shown under.
 * @returns {void}
 */
function printModelOutput(failure, category) {
  if (category !== "PARSE_FAILURE") {
    return;
  }
  const context = failure.context;
  const fullText = context == null ? void 0 : context.raw_model_text_full;
  const shortText = context == null ? void 0 : context.raw_model_text;
  const cleanedText = removeReasoningTags(fullText || shortText || "");
  if (!cleanedText) {
    console.log(
      ` ${colors3.gray}Model said:${colors3.reset} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`
    );
    return;
  }
  console.log(
    ` ${colors3.gray}Model said:${colors3.reset} "${colors3.dim}${cleanedText}${colors3.reset}"`
  );
}
|
|
2737
|
+
/**
 * Tells whether failures of this category are shown diff-first: true for the
 * two parameter-value categories only.
 *
 * @param {string} category - Failure category label.
 * @returns {boolean}
 */
function shouldShowDiffByDefault(category) {
  const diffFirstCategories = ["PARAM_VALUE_MISMATCH", "PARAM_VALUE_PERCENT"];
  return diffFirstCategories.includes(category);
}
|
|
2740
|
+
/**
 * Prints one failure in detail: its id, then either the diff (for categories
 * shown diff-first that actually have a diff) or the expected/actual function
 * names, with the diff appended in verbose mode. Model output is printed last.
 *
 * @param {object} failure - Parsed failure payload.
 * @param {string} category - Category the failure is being shown under.
 * @param {boolean} verbose - Whether to include the diff after expected/actual.
 * @returns {void}
 */
function printSingleFailure(failure, category, verbose) {
  console.log(`
${colors3.bold}${failure.id}${colors3.reset}`);
  const diffLines = failure.diff;
  const hasDiff = diffLines && diffLines.length > 0;
  const diffFirst = shouldShowDiffByDefault(category) && hasDiff;
  if (!diffFirst) {
    printExpectedActual(failure);
  }
  if (diffFirst || (hasDiff && verbose)) {
    printDiff(diffLines);
  }
  printModelOutput(failure, category);
}
|
|
2755
|
+
// How many leading failures are treated as already shown; printRemainingIds
// lists only the ids that come after them.
var MAX_SAMPLE_FAILURES = 2;

/**
 * Logs a single "+N more: ..." line naming the failures after the first
 * `MAX_SAMPLE_FAILURES` entries.
 *
 * Each id is shortened to the first capture group of `ID_NUM_REGEX` when the
 * pattern matches; otherwise the full id is used. Nothing is printed when no
 * failures remain beyond the sample limit, which avoids a "+0 more:" or
 * negative-count line.
 *
 * @param {Array<{id: string}>} failures - All failures of one category.
 * @returns {void}
 */
function printRemainingIds(failures) {
  const remaining = failures.slice(MAX_SAMPLE_FAILURES);
  if (remaining.length === 0) {
    return;
  }
  const idNums = remaining.map(({ id }) => {
    const match = id.match(ID_NUM_REGEX);
    return match ? match[1] : id;
  });
  console.log(
    `\n${colors3.dim}+${remaining.length} more: ${idNums.join(", ")}${colors3.reset}`
  );
}
|
|
2767
|
+
/**
 * Prints the heading for one failure category: a ruled title line with the
 * label and count, followed by the category description.
 *
 * @param {{label: string, description: string}} info - Category display data.
 * @param {number} count - Number of failures in the category.
 * @returns {void}
 */
function printCategoryHeader(info, count) {
  const rule = "\u2500\u2500\u2500\u2500\u2500";
  const title = `${rule} ${info.label} (${count}) ${rule}`;
  console.log(`\n${colors3.cyan}${title}${colors3.reset}`);
  console.log(`${colors3.dim}${info.description}${colors3.reset}`);
}
|
|
2774
|
+
/**
 * Prints one failure category: header, optional detected pattern, optional
 * hint, and then the failures themselves.
 *
 * In verbose mode every failure is printed. Otherwise only the first
 * `MAX_SAMPLE_FAILURES` are printed in full and the rest are summarized by
 * `printRemainingIds`. The limit comes from the shared constant so the
 * samples shown here and the ids skipped by `printRemainingIds` cannot drift
 * apart.
 *
 * @param {string} category - Failure category key; unknown keys fall back to
 *   `CATEGORY_DESCRIPTIONS.OTHER`.
 * @param {{failures: Array<object>, pattern?: string}} group - Failures of the category.
 * @param {boolean} verbose - Whether to print every failure in full.
 * @returns {void}
 */
function printCategoryDetails(category, group, verbose) {
  const info = CATEGORY_DESCRIPTIONS[category] || CATEGORY_DESCRIPTIONS.OTHER;
  const { failures } = group;
  printCategoryHeader(info, failures.length);
  if (group.pattern) {
    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
  }
  if (info.hint) {
    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
  }
  const samplesToShow = verbose ? failures : failures.slice(0, MAX_SAMPLE_FAILURES);
  for (const failure of samplesToShow) {
    printSingleFailure(failure, category, verbose);
  }
  if (!verbose && failures.length > MAX_SAMPLE_FAILURES) {
    printRemainingIds(failures);
  }
}
|
|
2792
|
+
function printResultHeader(result) {
|
|
2793
|
+
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
2794
|
+
const passed = benchmarkResult.metrics.correct_count;
|
|
2795
|
+
const total = benchmarkResult.metrics.total_cases;
|
|
2796
|
+
const scorePercent = (benchmarkResult.score * 100).toFixed(1);
|
|
2797
|
+
const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
|
|
2798
|
+
const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
|
|
2799
|
+
const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
|
|
2800
|
+
const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
|
|
2801
|
+
const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
|
|
2802
|
+
console.log(
|
|
2803
|
+
`
|
|
2804
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
|
|
2805
|
+
);
|
|
2806
|
+
console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
|
|
2807
|
+
}
|
|
2808
|
+
/**
 * Prints the summary for one evaluation result: the header, then each
 * failure category, largest first.
 *
 * Stops after the header when the result has no logs. When logs exist but
 * yield no parsed failures, a notice is printed only if the benchmark did
 * not succeed.
 *
 * @param {object} result - Evaluation result whose nested `result` may carry `logs`.
 * @param {boolean} verbose - Forwarded to `printCategoryDetails`.
 * @returns {void}
 */
function printResultSummary(result, verbose) {
  const benchmarkResult = result.result;
  printResultHeader(result);
  const { logs } = benchmarkResult;
  if (!logs || logs.length === 0) {
    return;
  }
  const failures = parseFailureLogs(logs);
  if (failures.length === 0) {
    if (!benchmarkResult.success) {
      console.log(
        `${colors3.yellow}No structured failure data available${colors3.reset}`
      );
    }
    return;
  }
  // NOTE(review): `groups` is used like a Map (values()/entries()) — confirm against groupByCategory.
  const groups = groupByCategory(failures);
  for (const group of groups.values()) {
    detectPatterns(group);
  }
  const largestFirst = [...groups.entries()].sort(
    ([, left], [, right]) => right.failures.length - left.failures.length
  );
  for (const [cat, group] of largestFirst) {
    printCategoryDetails(cat, group, verbose);
  }
}
|
|
2834
|
+
function consoleSummaryReporter(results) {
|
|
2835
|
+
const verbose = process.env.VERBOSE === "true";
|
|
2836
|
+
console.log(`
|
|
2837
|
+
${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
|
|
2838
|
+
console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
|
|
2839
|
+
for (const result of results) {
|
|
2840
|
+
printResultSummary(result, verbose);
|
|
2841
|
+
}
|
|
2842
|
+
console.log(
|
|
2843
|
+
`
|
|
2844
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
|
|
2845
|
+
`
|
|
2846
|
+
);
|
|
2847
|
+
}
|
|
2848
|
+
|
|
2409
2849
|
// src/reporters/json.ts
|
|
2410
2850
|
function jsonReporter(results) {
|
|
2411
2851
|
const serializableResults = results.map((r) => {
|
|
@@ -2425,60 +2865,56 @@ function jsonReporter(results) {
|
|
|
2425
2865
|
var reporters = {
|
|
2426
2866
|
console: consoleReporter,
|
|
2427
2867
|
json: jsonReporter,
|
|
2428
|
-
"console.debug": consoleDebugReporter
|
|
2868
|
+
"console.debug": consoleDebugReporter,
|
|
2869
|
+
"console.summary": consoleSummaryReporter
|
|
2429
2870
|
};
|
|
2430
2871
|
|
|
2431
2872
|
// src/evaluate.ts
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
console.log(
|
|
2440
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
2441
|
-
);
|
|
2442
|
-
return {
|
|
2443
|
-
model: modelId,
|
|
2444
|
-
modelKey,
|
|
2445
|
-
benchmark: benchmark.name,
|
|
2446
|
-
result
|
|
2447
|
-
};
|
|
2448
|
-
} catch (error) {
|
|
2449
|
-
console.error(
|
|
2450
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
2451
|
-
error
|
|
2452
|
-
);
|
|
2453
|
-
return {
|
|
2454
|
-
model: modelId,
|
|
2455
|
-
modelKey,
|
|
2456
|
-
benchmark: benchmark.name,
|
|
2457
|
-
result: {
|
|
2458
|
-
score: 0,
|
|
2459
|
-
success: false,
|
|
2460
|
-
metrics: {},
|
|
2461
|
-
error: error instanceof Error ? error : new Error(String(error))
|
|
2462
|
-
}
|
|
2463
|
-
};
|
|
2873
|
+
/**
 * Checks whether a value looks like a model configuration wrapper: a
 * non-null object whose `model` property is itself a non-null object with a
 * `modelId` property.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when the value has the wrapper shape.
 */
function isModelConfig(value) {
  const isObject = (candidate) => typeof candidate === "object" && candidate !== null;
  if (!isObject(value) || !("model" in value)) {
    return false;
  }
  const { model } = value;
  return isObject(model) && "modelId" in model;
}
|
|
2887
|
+
/**
 * Checks whether a value looks like a language model: a non-null object
 * carrying a string `modelId`.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value.modelId` exists and is a string.
 */
function isLanguageModel(value) {
  return (
    typeof value === "object" &&
    value !== null &&
    "modelId" in value &&
    typeof value.modelId === "string"
  );
}
|
|
2894
|
+
/**
 * Splits a models entry into a `[model, middleware]` pair.
 *
 * A configuration wrapper (as recognized by `isModelConfig`) yields its
 * `model` and `middleware`; any other input is returned as the model itself
 * with `undefined` middleware.
 *
 * @param {unknown} input - A model or a model configuration wrapper.
 * @returns {[unknown, unknown]} The model and its middleware.
 */
function extractModelAndMiddleware(input) {
  return isModelConfig(input) ? [input.model, input.middleware] : [input, void 0];
}
|
|
2466
2900
|
function normalizeModels(models) {
|
|
2467
|
-
const
|
|
2901
|
+
const entries = [];
|
|
2468
2902
|
if (Array.isArray(models)) {
|
|
2469
2903
|
for (const m of models) {
|
|
2470
|
-
|
|
2904
|
+
const [model, middleware] = extractModelAndMiddleware(m);
|
|
2905
|
+
entries.push([void 0, model, middleware]);
|
|
2471
2906
|
}
|
|
2472
|
-
} else if (
|
|
2473
|
-
|
|
2907
|
+
} else if (isModelConfig(models)) {
|
|
2908
|
+
entries.push([void 0, models.model, models.middleware]);
|
|
2909
|
+
} else if (isLanguageModel(models)) {
|
|
2910
|
+
entries.push([void 0, models, void 0]);
|
|
2474
2911
|
} else {
|
|
2475
|
-
for (const [key, m] of Object.entries(
|
|
2476
|
-
|
|
2477
|
-
|
|
2478
|
-
modelEntries.push([key, m]);
|
|
2912
|
+
for (const [key, m] of Object.entries(models)) {
|
|
2913
|
+
const [model, middleware] = extractModelAndMiddleware(m);
|
|
2914
|
+
entries.push([key, model, middleware]);
|
|
2479
2915
|
}
|
|
2480
2916
|
}
|
|
2481
|
-
return
|
|
2917
|
+
return entries;
|
|
2482
2918
|
}
|
|
2483
2919
|
function buildConfig(temperature, maxTokens) {
|
|
2484
2920
|
const config = {};
|
|
@@ -2499,21 +2935,90 @@ function executeReporter(reporter, results) {
|
|
|
2499
2935
|
reporters.console(results);
|
|
2500
2936
|
}
|
|
2501
2937
|
}
|
|
2938
|
+
/**
 * Builds the model actually used for a run by wrapping the base model with
 * the user's middleware and, when enabled, a disk cache middleware.
 *
 * The base model is returned untouched when caching is off (only
 * `cacheOptions.enabled === true` turns it on) and there is no user
 * middleware to apply. User middleware is listed before the cache
 * middleware. A single middleware is passed on its own rather than inside an
 * array.
 *
 * @param {object} baseModel - Model to wrap.
 * @param {object|Array<object>|undefined} userMiddleware - One middleware or a list.
 * @param {{enabled?: boolean, cacheDir?: string, debug?: boolean}|undefined} cacheOptions
 *   Cache settings; `cacheDir` defaults to ".ai-cache" and `debug` to `false`.
 * @returns {object} The base model or the wrapped model.
 */
function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
  const cacheEnabled = cacheOptions != null && cacheOptions.enabled === true;
  if (!cacheEnabled && !userMiddleware) {
    return baseModel;
  }
  let cacheMiddleware = null;
  if (cacheEnabled) {
    const { cacheDir, debug } = cacheOptions;
    cacheMiddleware = (0, import_middleware.createDiskCacheMiddleware)({
      cacheDir: cacheDir != null ? cacheDir : ".ai-cache",
      enabled: true,
      debug: debug != null ? debug : false
    });
  }
  let userList = [];
  if (userMiddleware) {
    userList = Array.isArray(userMiddleware) ? userMiddleware : [userMiddleware];
  }
  const middlewares = cacheMiddleware ? [...userList, cacheMiddleware] : [...userList];
  // An empty user middleware array with caching off leaves nothing to apply.
  if (middlewares.length === 0) {
    return baseModel;
  }
  const [onlyMiddleware] = middlewares;
  return (0, import_ai4.wrapLanguageModel)({
    model: baseModel,
    middleware: middlewares.length === 1 ? onlyMiddleware : middlewares
  });
}
|
|
2969
|
+
/**
 * Runs one benchmark against one model and reports progress on stdout.
 *
 * Writes a "<prefix>: ..." progress marker, then rewrites the line (via a
 * carriage return) with the score formatted to two decimals. Any error
 * thrown while running or formatting is logged with `console.error` and
 * turned into a failed result instead of being rethrown.
 *
 * @param {object} model - Model passed to `benchmark.run`; its string
 *   `modelId` is used for labelling, otherwise "unknown-model".
 * @param {{name: string, run: Function}} benchmark - Benchmark to execute.
 * @param {string|undefined} modelKey - Optional key shown next to the model id.
 * @param {object} config - Configuration forwarded to `benchmark.run`.
 * @returns {Promise<{model: string, modelKey: (string|undefined), benchmark: string, result: object}>}
 *   The evaluation record; on failure `result` has score 0, `success: false`,
 *   empty metrics and the error (wrapped in `Error` if it was not one).
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  const hasStringId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string";
  const modelId = hasStringId ? model.modelId : "unknown-model";
  const keySuffix = modelKey ? ` (${modelKey})` : "";
  const prefix = `[${modelId}]${keySuffix} ${benchmark.name}`;
  const toEvaluation = (result) => ({
    model: modelId,
    modelKey,
    benchmark: benchmark.name,
    result
  });
  try {
    process.stdout.write(`${prefix}: ...`);
    const result = await benchmark.run(model, config);
    const scoreDisplay = result.score.toFixed(2);
    process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}\n`);
    return toEvaluation(result);
  } catch (error) {
    process.stdout.write(`\r${prefix}: .... Score: ERROR\n`);
    console.error(error);
    return toEvaluation({
      score: 0,
      success: false,
      metrics: {},
      error: error instanceof Error ? error : new Error(String(error))
    });
  }
}
|
|
2502
3001
|
async function evaluate(options) {
|
|
2503
3002
|
const {
|
|
2504
3003
|
models,
|
|
2505
3004
|
benchmarks,
|
|
2506
3005
|
reporter = "console",
|
|
2507
3006
|
temperature,
|
|
2508
|
-
maxTokens
|
|
3007
|
+
maxTokens,
|
|
3008
|
+
cache
|
|
2509
3009
|
} = options;
|
|
2510
3010
|
const modelEntries = normalizeModels(models);
|
|
2511
3011
|
const config = buildConfig(temperature, maxTokens);
|
|
2512
3012
|
const allResults = [];
|
|
2513
|
-
for (const [modelKey,
|
|
3013
|
+
for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
|
|
3014
|
+
const effectiveModel = buildEffectiveModel(
|
|
3015
|
+
baseModel,
|
|
3016
|
+
userMiddleware,
|
|
3017
|
+
cache
|
|
3018
|
+
);
|
|
2514
3019
|
for (const benchmark of benchmarks) {
|
|
2515
3020
|
const evaluationResult = await runSingleBenchmark(
|
|
2516
|
-
|
|
3021
|
+
effectiveModel,
|
|
2517
3022
|
benchmark,
|
|
2518
3023
|
modelKey,
|
|
2519
3024
|
config
|