@ai-sdk-tool/eval 1.0.0-canary.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/{BFCL_v3_parallel.jsonl → BFCL_v4_parallel.jsonl} +2 -2
- package/data/{BFCL_v3_parallel_possible_answer.jsonl → BFCL_v4_parallel_possible_answer.jsonl} +2 -2
- package/data/BFCL_v4_simple.jsonl +400 -0
- package/data/BFCL_v4_simple_possible_answer.jsonl +400 -0
- package/dist/index.cjs +715 -210
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -3
- package/dist/index.d.ts +49 -3
- package/dist/index.js +715 -210
- package/dist/index.js.map +1 -1
- package/package.json +6 -5
- package/data/BFCL_v3_simple.jsonl +0 -400
- package/data/BFCL_v3_simple_possible_answer.jsonl +0 -400
- /package/data/{BFCL_v3_multiple.jsonl → BFCL_v4_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_multiple_possible_answer.jsonl → BFCL_v4_multiple_possible_answer.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple.jsonl → BFCL_v4_parallel_multiple.jsonl} +0 -0
- /package/data/{BFCL_v3_parallel_multiple_possible_answer.jsonl → BFCL_v4_parallel_multiple_possible_answer.jsonl} +0 -0
package/dist/index.js
CHANGED
|
@@ -368,6 +368,7 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
368
368
|
// src/benchmarks/bfcl.ts
|
|
369
369
|
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
370
370
|
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
371
|
+
var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
|
|
371
372
|
function convertGroundTruthToXML(call) {
|
|
372
373
|
const keys = Object.keys(call);
|
|
373
374
|
if (keys.length === 0) {
|
|
@@ -399,45 +400,67 @@ function convertGroundTruthToXML(call) {
|
|
|
399
400
|
xml += `</${funcName}>`;
|
|
400
401
|
return xml;
|
|
401
402
|
}
|
|
403
|
+
function extractCategory(id) {
|
|
404
|
+
if (id.startsWith("parallel_multiple")) {
|
|
405
|
+
return "parallel_multiple";
|
|
406
|
+
}
|
|
407
|
+
if (id.startsWith("simple_python")) {
|
|
408
|
+
return "simple";
|
|
409
|
+
}
|
|
410
|
+
if (id.startsWith("simple_java")) {
|
|
411
|
+
return "simple";
|
|
412
|
+
}
|
|
413
|
+
if (id.startsWith("simple_javascript")) {
|
|
414
|
+
return "simple";
|
|
415
|
+
}
|
|
416
|
+
if (id.startsWith("parallel")) {
|
|
417
|
+
return "parallel";
|
|
418
|
+
}
|
|
419
|
+
if (id.startsWith("multiple")) {
|
|
420
|
+
return "multiple";
|
|
421
|
+
}
|
|
422
|
+
if (id.startsWith("simple")) {
|
|
423
|
+
return "simple";
|
|
424
|
+
}
|
|
425
|
+
return id.split("_")[0];
|
|
426
|
+
}
|
|
402
427
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
403
|
-
const category = testCase.id
|
|
428
|
+
const category = extractCategory(testCase.id);
|
|
404
429
|
try {
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
430
|
+
switch (category) {
|
|
431
|
+
case "simple": {
|
|
432
|
+
if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
|
|
433
|
+
return {
|
|
434
|
+
valid: false,
|
|
435
|
+
error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
|
|
436
|
+
error_type: "simple:wrong_count"
|
|
437
|
+
};
|
|
438
|
+
}
|
|
439
|
+
return simpleFunctionChecker(
|
|
440
|
+
testCase.function[0],
|
|
441
|
+
modelOutput[0],
|
|
442
|
+
possibleAnswer.ground_truth[0]
|
|
443
|
+
);
|
|
444
|
+
}
|
|
445
|
+
case "multiple": {
|
|
446
|
+
return multipleFunctionChecker(
|
|
447
|
+
testCase.function,
|
|
448
|
+
modelOutput,
|
|
449
|
+
possibleAnswer.ground_truth
|
|
450
|
+
);
|
|
451
|
+
}
|
|
452
|
+
case "parallel":
|
|
453
|
+
case "parallel_multiple": {
|
|
454
|
+
return parallelFunctionCheckerNoOrder(
|
|
455
|
+
testCase.function,
|
|
456
|
+
modelOutput,
|
|
457
|
+
possibleAnswer.ground_truth
|
|
458
|
+
);
|
|
459
|
+
}
|
|
460
|
+
default: {
|
|
461
|
+
return { valid: true };
|
|
412
462
|
}
|
|
413
|
-
return simpleFunctionChecker(
|
|
414
|
-
testCase.function[0],
|
|
415
|
-
modelOutput[0],
|
|
416
|
-
possibleAnswer.ground_truth[0]
|
|
417
|
-
);
|
|
418
|
-
}
|
|
419
|
-
if (category === "parallel") {
|
|
420
|
-
return parallelFunctionCheckerNoOrder(
|
|
421
|
-
testCase.function,
|
|
422
|
-
modelOutput,
|
|
423
|
-
possibleAnswer.ground_truth
|
|
424
|
-
);
|
|
425
|
-
}
|
|
426
|
-
if (category === "multiple") {
|
|
427
|
-
return multipleFunctionChecker(
|
|
428
|
-
testCase.function,
|
|
429
|
-
modelOutput,
|
|
430
|
-
possibleAnswer.ground_truth
|
|
431
|
-
);
|
|
432
|
-
}
|
|
433
|
-
if (category.includes("parallel-multiple")) {
|
|
434
|
-
return parallelFunctionCheckerNoOrder(
|
|
435
|
-
testCase.function,
|
|
436
|
-
modelOutput,
|
|
437
|
-
possibleAnswer.ground_truth
|
|
438
|
-
);
|
|
439
463
|
}
|
|
440
|
-
return { valid: true };
|
|
441
464
|
} catch (e) {
|
|
442
465
|
return {
|
|
443
466
|
valid: false,
|
|
@@ -615,7 +638,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
615
638
|
return `- expected one of: ${formatted}`;
|
|
616
639
|
})();
|
|
617
640
|
diffLines.push(expectedLine);
|
|
618
|
-
diffLines.push(`+
|
|
641
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
619
642
|
return diffLines;
|
|
620
643
|
};
|
|
621
644
|
const paramValueMatches = (allowed, got) => {
|
|
@@ -832,44 +855,97 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
832
855
|
);
|
|
833
856
|
}
|
|
834
857
|
};
|
|
835
|
-
const
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
flatMessages,
|
|
840
|
-
mwOriginalText,
|
|
841
|
-
text,
|
|
842
|
-
finishReason,
|
|
843
|
-
mwParsedToolCalls,
|
|
844
|
-
restoredCalls,
|
|
845
|
-
possibleAnswer
|
|
846
|
-
} = options;
|
|
847
|
-
const lastUser = (() => {
|
|
848
|
-
var _a;
|
|
849
|
-
const reversed = [...flatMessages].reverse();
|
|
850
|
-
const found = reversed.find(
|
|
851
|
-
(m) => m.role === "user"
|
|
852
|
-
);
|
|
853
|
-
return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
|
|
854
|
-
})();
|
|
855
|
-
const rawModelText = (() => {
|
|
856
|
-
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
857
|
-
return mwOriginalText;
|
|
858
|
+
const hasPercentPattern = (diff) => {
|
|
859
|
+
return diff.some((d) => {
|
|
860
|
+
if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
|
|
861
|
+
return false;
|
|
858
862
|
}
|
|
859
|
-
|
|
860
|
-
|
|
863
|
+
const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
|
|
864
|
+
if (!numMatch) {
|
|
865
|
+
return false;
|
|
861
866
|
}
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
867
|
+
const num = Number.parseFloat(numMatch[1]);
|
|
868
|
+
return num >= 1 && num <= 100;
|
|
869
|
+
});
|
|
870
|
+
};
|
|
871
|
+
const isValueError = (errorType, diff) => {
|
|
872
|
+
return !!(errorType == null ? void 0 : errorType.includes("value_error")) || diff.some((d) => d.startsWith("@@ param"));
|
|
873
|
+
};
|
|
874
|
+
const isFunctionNameError = (errorType, diff) => {
|
|
875
|
+
return !!(errorType == null ? void 0 : errorType.includes("wrong_func_name")) || diff.some((d) => d.includes("function name"));
|
|
876
|
+
};
|
|
877
|
+
const isMissingParamError = (errorType, diff) => {
|
|
878
|
+
return !!(errorType == null ? void 0 : errorType.includes("missing_required")) || diff.some((d) => d.includes("missing required param"));
|
|
879
|
+
};
|
|
880
|
+
const isUnexpectedParamError = (errorType, diff) => {
|
|
881
|
+
return !!(errorType == null ? void 0 : errorType.includes("unexpected_param")) || diff.some((d) => d.includes("unexpected param"));
|
|
882
|
+
};
|
|
883
|
+
const classifyByErrorPatterns = (errorType, diff) => {
|
|
884
|
+
const patterns = [
|
|
885
|
+
[
|
|
886
|
+
isValueError,
|
|
887
|
+
hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH"
|
|
888
|
+
],
|
|
889
|
+
[isFunctionNameError, "WRONG_FUNCTION"],
|
|
890
|
+
[isMissingParamError, "MISSING_PARAMS"],
|
|
891
|
+
[isUnexpectedParamError, "UNEXPECTED_PARAMS"]
|
|
892
|
+
];
|
|
893
|
+
for (const [classifier, result] of patterns) {
|
|
894
|
+
if (classifier(errorType, diff)) {
|
|
895
|
+
return result;
|
|
896
|
+
}
|
|
897
|
+
}
|
|
898
|
+
if (errorType == null ? void 0 : errorType.includes("cannot_find_match")) {
|
|
899
|
+
return "NO_MATCH";
|
|
900
|
+
}
|
|
901
|
+
return null;
|
|
902
|
+
};
|
|
903
|
+
const classifyByCallCount = (actualCount, expectedCount) => {
|
|
904
|
+
if (actualCount === 0 && expectedCount > 0) {
|
|
905
|
+
return "PARSE_FAILURE";
|
|
906
|
+
}
|
|
907
|
+
if (actualCount > 0 && actualCount < expectedCount) {
|
|
908
|
+
return "PARTIAL_CALLS";
|
|
909
|
+
}
|
|
910
|
+
if (actualCount > expectedCount) {
|
|
911
|
+
return "EXTRA_CALLS";
|
|
912
|
+
}
|
|
913
|
+
return null;
|
|
914
|
+
};
|
|
915
|
+
const classifyFailureType = (options) => {
|
|
916
|
+
const { errorType, restoredCalls, expectedCount, diff } = options;
|
|
917
|
+
const actualCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;
|
|
918
|
+
const countBasedResult = classifyByCallCount(
|
|
919
|
+
actualCount,
|
|
920
|
+
expectedCount
|
|
921
|
+
);
|
|
922
|
+
if (countBasedResult) {
|
|
923
|
+
return countBasedResult;
|
|
924
|
+
}
|
|
925
|
+
const patternBasedResult = classifyByErrorPatterns(errorType, diff);
|
|
926
|
+
if (patternBasedResult) {
|
|
927
|
+
return patternBasedResult;
|
|
928
|
+
}
|
|
929
|
+
return "OTHER";
|
|
930
|
+
};
|
|
931
|
+
const extractRawModelText = (mwOriginalText, text) => {
|
|
932
|
+
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
933
|
+
return mwOriginalText;
|
|
934
|
+
}
|
|
935
|
+
if (typeof text === "string") {
|
|
936
|
+
return text;
|
|
937
|
+
}
|
|
938
|
+
return "";
|
|
939
|
+
};
|
|
940
|
+
const extractLastUserQuery = (flatMessages) => {
|
|
941
|
+
var _a;
|
|
942
|
+
const reversed = [...flatMessages].reverse();
|
|
943
|
+
const found = reversed.find((m) => m.role === "user");
|
|
944
|
+
const content = (_a = found == null ? void 0 : found.content) != null ? _a : "";
|
|
945
|
+
return content.length > 200 ? `${content.slice(0, 200)}...` : content;
|
|
946
|
+
};
|
|
947
|
+
const truncateText = (text, maxLen) => {
|
|
948
|
+
return text.length > maxLen ? `${text.slice(0, maxLen)}...` : text;
|
|
873
949
|
};
|
|
874
950
|
const logFailureDetails = (options) => {
|
|
875
951
|
const {
|
|
@@ -887,42 +963,36 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
887
963
|
} = options;
|
|
888
964
|
try {
|
|
889
965
|
const category = testCase.id.split("_")[0];
|
|
890
|
-
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
caseLogs.push(
|
|
900
|
-
`[DEBUG-FAIL] ${JSON.stringify({
|
|
901
|
-
id: testCase.id,
|
|
902
|
-
message: checkerResult.error,
|
|
903
|
-
error_type: checkerResult.error_type,
|
|
904
|
-
expected,
|
|
905
|
-
actual,
|
|
906
|
-
diff
|
|
907
|
-
})}`
|
|
908
|
-
);
|
|
909
|
-
try {
|
|
910
|
-
const contextPayload = buildFailureContext({
|
|
911
|
-
testCase,
|
|
912
|
-
tools,
|
|
913
|
-
flatMessages,
|
|
914
|
-
mwOriginalText,
|
|
915
|
-
text,
|
|
916
|
-
finishReason,
|
|
917
|
-
mwParsedToolCalls,
|
|
966
|
+
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
|
|
967
|
+
const gtArr = possibleAnswer.ground_truth;
|
|
968
|
+
const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
|
|
969
|
+
const rawModelText = extractRawModelText(mwOriginalText, text);
|
|
970
|
+
const lastUserQuery = extractLastUserQuery(flatMessages);
|
|
971
|
+
const failurePayload = {
|
|
972
|
+
id: testCase.id,
|
|
973
|
+
category: classifyFailureType({
|
|
974
|
+
errorType: checkerResult.error_type,
|
|
918
975
|
restoredCalls,
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
976
|
+
expectedCount,
|
|
977
|
+
diff
|
|
978
|
+
}),
|
|
979
|
+
message: checkerResult.error,
|
|
980
|
+
error_type: checkerResult.error_type,
|
|
981
|
+
expected,
|
|
982
|
+
actual,
|
|
983
|
+
diff,
|
|
984
|
+
context: {
|
|
985
|
+
raw_model_text: truncateText(rawModelText, 500),
|
|
986
|
+
raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
|
|
987
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
988
|
+
expected_count: expectedCount,
|
|
989
|
+
actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
|
|
990
|
+
finish_reason: finishReason,
|
|
991
|
+
last_user_query: lastUserQuery,
|
|
992
|
+
tool_names: tools.map((t) => t.name)
|
|
993
|
+
}
|
|
994
|
+
};
|
|
995
|
+
caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
|
|
926
996
|
} catch (e) {
|
|
927
997
|
caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
|
|
928
998
|
}
|
|
@@ -1147,14 +1217,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1147
1217
|
};
|
|
1148
1218
|
}
|
|
1149
1219
|
const score = correctCount / testCases.length;
|
|
1220
|
+
const caseResults = resultsPerCase.map((r, i) => ({
|
|
1221
|
+
id: testCases[i].id,
|
|
1222
|
+
valid: r.valid
|
|
1223
|
+
}));
|
|
1150
1224
|
return {
|
|
1151
1225
|
score,
|
|
1152
1226
|
success: score > 0.95,
|
|
1153
|
-
// High success threshold as requested
|
|
1154
1227
|
metrics: {
|
|
1155
1228
|
correct_count: correctCount,
|
|
1156
1229
|
total_cases: testCases.length,
|
|
1157
|
-
accuracy: score
|
|
1230
|
+
accuracy: score,
|
|
1231
|
+
case_results: JSON.stringify(caseResults)
|
|
1158
1232
|
},
|
|
1159
1233
|
logs
|
|
1160
1234
|
};
|
|
@@ -1174,27 +1248,27 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1174
1248
|
}
|
|
1175
1249
|
var bfclSimpleBenchmark = createBfclBenchmark(
|
|
1176
1250
|
"bfcl-simple",
|
|
1177
|
-
"BFCL Simple Function Calling",
|
|
1178
|
-
"
|
|
1179
|
-
"
|
|
1251
|
+
"BFCL v4 Simple Function Calling",
|
|
1252
|
+
"BFCL_v4_simple.jsonl",
|
|
1253
|
+
"BFCL_v4_simple_possible_answer.jsonl"
|
|
1180
1254
|
);
|
|
1181
1255
|
var bfclParallelBenchmark = createBfclBenchmark(
|
|
1182
1256
|
"bfcl-parallel",
|
|
1183
|
-
"BFCL Parallel Function Calling",
|
|
1184
|
-
"
|
|
1185
|
-
"
|
|
1257
|
+
"BFCL v4 Parallel Function Calling",
|
|
1258
|
+
"BFCL_v4_parallel.jsonl",
|
|
1259
|
+
"BFCL_v4_parallel_possible_answer.jsonl"
|
|
1186
1260
|
);
|
|
1187
1261
|
var bfclMultipleBenchmark = createBfclBenchmark(
|
|
1188
1262
|
"bfcl-multiple",
|
|
1189
|
-
"BFCL Multiple Function Calling",
|
|
1190
|
-
"
|
|
1191
|
-
"
|
|
1263
|
+
"BFCL v4 Multiple Function Calling",
|
|
1264
|
+
"BFCL_v4_multiple.jsonl",
|
|
1265
|
+
"BFCL_v4_multiple_possible_answer.jsonl"
|
|
1192
1266
|
);
|
|
1193
1267
|
var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
1194
1268
|
"bfcl-parallel-multiple",
|
|
1195
|
-
"BFCL Parallel & Multiple Function Calling",
|
|
1196
|
-
"
|
|
1197
|
-
"
|
|
1269
|
+
"BFCL v4 Parallel & Multiple Function Calling",
|
|
1270
|
+
"BFCL_v4_parallel_multiple.jsonl",
|
|
1271
|
+
"BFCL_v4_parallel_multiple_possible_answer.jsonl"
|
|
1198
1272
|
);
|
|
1199
1273
|
|
|
1200
1274
|
// src/benchmarks/complex-func-bench.ts
|
|
@@ -1925,23 +1999,28 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1925
1999
|
}
|
|
1926
2000
|
};
|
|
1927
2001
|
|
|
2002
|
+
// src/evaluate.ts
|
|
2003
|
+
import { createDiskCacheMiddleware } from "@ai-sdk-tool/middleware";
|
|
2004
|
+
import { wrapLanguageModel } from "ai";
|
|
2005
|
+
|
|
1928
2006
|
// src/reporters/console.ts
|
|
1929
2007
|
var colors = {
|
|
1930
2008
|
reset: "\x1B[0m",
|
|
2009
|
+
bold: "\x1B[1m",
|
|
1931
2010
|
green: "\x1B[32m",
|
|
1932
2011
|
red: "\x1B[31m",
|
|
1933
2012
|
yellow: "\x1B[33m",
|
|
1934
2013
|
cyan: "\x1B[36m",
|
|
1935
2014
|
magenta: "\x1B[35m",
|
|
1936
2015
|
gray: "\x1B[90m",
|
|
1937
|
-
white: "\x1B[37m"
|
|
1938
|
-
bgRed: "\x1B[41m"
|
|
2016
|
+
white: "\x1B[37m"
|
|
1939
2017
|
};
|
|
2018
|
+
var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
|
|
1940
2019
|
function formatDiff(diff) {
|
|
1941
2020
|
if (!diff || diff.length === 0) {
|
|
1942
2021
|
return "";
|
|
1943
2022
|
}
|
|
1944
|
-
return diff.map((line) => {
|
|
2023
|
+
return diff.slice(0, 8).map((line) => {
|
|
1945
2024
|
if (line.startsWith("-")) {
|
|
1946
2025
|
return `${colors.red}${line}${colors.reset}`;
|
|
1947
2026
|
}
|
|
@@ -1954,65 +2033,106 @@ function formatDiff(diff) {
|
|
|
1954
2033
|
return line;
|
|
1955
2034
|
}).join("\n ");
|
|
1956
2035
|
}
|
|
1957
|
-
function
|
|
1958
|
-
const
|
|
1959
|
-
for (const log of
|
|
2036
|
+
function parseFailures(logs) {
|
|
2037
|
+
const failures = [];
|
|
2038
|
+
for (const log of logs) {
|
|
2039
|
+
if (!DEBUG_FAIL_REGEX.test(log)) {
|
|
2040
|
+
continue;
|
|
2041
|
+
}
|
|
1960
2042
|
try {
|
|
1961
|
-
const jsonStr = log.replace(
|
|
1962
|
-
const
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
console.log(
|
|
1966
|
-
` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
|
|
1967
|
-
);
|
|
1968
|
-
console.log(` Message: ${data.message}`);
|
|
1969
|
-
if (data.diff && Array.isArray(data.diff)) {
|
|
1970
|
-
console.log(` Diff:
|
|
1971
|
-
${formatDiff(data.diff)}`);
|
|
1972
|
-
}
|
|
1973
|
-
if (data.expected && data.actual) {
|
|
1974
|
-
const expStr = JSON.stringify(data.expected);
|
|
1975
|
-
const actStr = JSON.stringify(data.actual);
|
|
1976
|
-
if (expStr.length < 100 && actStr.length < 100) {
|
|
1977
|
-
console.log(` Expected: ${colors.gray}${expStr}${colors.reset}`);
|
|
1978
|
-
console.log(` Actual: ${colors.gray}${actStr}${colors.reset}`);
|
|
1979
|
-
}
|
|
1980
|
-
}
|
|
1981
|
-
} catch (_e) {
|
|
1982
|
-
console.log(` Raw Log: ${log}`);
|
|
2043
|
+
const jsonStr = log.replace(DEBUG_FAIL_REGEX, "");
|
|
2044
|
+
const parsed = JSON.parse(jsonStr);
|
|
2045
|
+
failures.push(parsed);
|
|
2046
|
+
} catch (e) {
|
|
1983
2047
|
}
|
|
1984
2048
|
}
|
|
2049
|
+
return failures;
|
|
2050
|
+
}
|
|
2051
|
+
function groupFailuresByCategory(failures) {
|
|
2052
|
+
const groups = /* @__PURE__ */ new Map();
|
|
2053
|
+
for (const failure of failures) {
|
|
2054
|
+
const category = failure.category || "OTHER";
|
|
2055
|
+
const existing = groups.get(category);
|
|
2056
|
+
if (existing) {
|
|
2057
|
+
existing.push(failure);
|
|
2058
|
+
} else {
|
|
2059
|
+
groups.set(category, [failure]);
|
|
2060
|
+
}
|
|
2061
|
+
}
|
|
2062
|
+
return groups;
|
|
2063
|
+
}
|
|
2064
|
+
function printCompactFailure(failure) {
|
|
2065
|
+
var _a;
|
|
2066
|
+
console.log(
|
|
2067
|
+
`
|
|
2068
|
+
${colors.red}${failure.id}${colors.reset} [${colors.yellow}${failure.category || "OTHER"}${colors.reset}]`
|
|
2069
|
+
);
|
|
2070
|
+
if (failure.message) {
|
|
2071
|
+
console.log(` ${failure.message}`);
|
|
2072
|
+
}
|
|
2073
|
+
if (failure.diff && failure.diff.length > 0) {
|
|
2074
|
+
console.log(` ${formatDiff(failure.diff)}`);
|
|
2075
|
+
}
|
|
2076
|
+
if (((_a = failure.context) == null ? void 0 : _a.raw_model_text) && failure.category === "PARSE_FAILURE") {
|
|
2077
|
+
const text = failure.context.raw_model_text;
|
|
2078
|
+
const truncated = text.length > 80 ? `${text.slice(0, 80)}...` : text;
|
|
2079
|
+
console.log(` ${colors.gray}Model: "${truncated}"${colors.reset}`);
|
|
2080
|
+
}
|
|
2081
|
+
}
|
|
2082
|
+
function printFailureSummary(failures) {
|
|
2083
|
+
const groups = groupFailuresByCategory(failures);
|
|
2084
|
+
const sorted = [...groups.entries()].sort(
|
|
2085
|
+
(a, b) => b[1].length - a[1].length
|
|
2086
|
+
);
|
|
2087
|
+
console.log(`
|
|
2088
|
+
${colors.bold}Failures by category:${colors.reset}`);
|
|
2089
|
+
for (const [category, categoryFailures] of sorted) {
|
|
2090
|
+
console.log(
|
|
2091
|
+
` ${colors.yellow}${category}${colors.reset}: ${categoryFailures.length}`
|
|
2092
|
+
);
|
|
2093
|
+
}
|
|
2094
|
+
const maxToShow = 5;
|
|
2095
|
+
const shown = failures.slice(0, maxToShow);
|
|
2096
|
+
for (const failure of shown) {
|
|
2097
|
+
printCompactFailure(failure);
|
|
2098
|
+
}
|
|
2099
|
+
if (failures.length > maxToShow) {
|
|
2100
|
+
const remaining = failures.length - maxToShow;
|
|
2101
|
+
const remainingIds = failures.slice(maxToShow).map((f) => f.id);
|
|
2102
|
+
const idPreview = remainingIds.slice(0, 5).join(", ");
|
|
2103
|
+
const more = remainingIds.length > 5 ? "..." : "";
|
|
2104
|
+
console.log(
|
|
2105
|
+
`
|
|
2106
|
+
${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
|
|
2107
|
+
);
|
|
2108
|
+
}
|
|
1985
2109
|
}
|
|
1986
2110
|
function printResult(result) {
|
|
1987
2111
|
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
1988
|
-
const
|
|
2112
|
+
const passed = benchmarkResult.metrics.correct_count;
|
|
2113
|
+
const total = benchmarkResult.metrics.total_cases;
|
|
2114
|
+
const scorePercent = (benchmarkResult.score * 100).toFixed(1);
|
|
2115
|
+
const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
|
|
2116
|
+
const statusColor = benchmarkResult.success ? colors.green : colors.red;
|
|
1989
2117
|
console.log(
|
|
1990
2118
|
`
|
|
1991
2119
|
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
1992
2120
|
);
|
|
1993
2121
|
console.log(
|
|
1994
|
-
` \u2514 ${
|
|
2122
|
+
` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
|
|
1995
2123
|
);
|
|
1996
|
-
const metrics = Object.entries(benchmarkResult.metrics);
|
|
1997
|
-
if (metrics.length > 0) {
|
|
1998
|
-
console.log(" Metrics:");
|
|
1999
|
-
for (const [key, value] of metrics) {
|
|
2000
|
-
console.log(` - ${key}: ${value}`);
|
|
2001
|
-
}
|
|
2002
|
-
}
|
|
2003
2124
|
if (benchmarkResult.error) {
|
|
2004
2125
|
console.log(
|
|
2005
2126
|
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
2006
2127
|
);
|
|
2007
2128
|
}
|
|
2008
2129
|
if (!benchmarkResult.success && benchmarkResult.logs) {
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
(
|
|
2012
|
-
)
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
for (const l of benchmarkResult.logs.slice(0, 10)) {
|
|
2130
|
+
const failures = parseFailures(benchmarkResult.logs);
|
|
2131
|
+
if (failures.length > 0) {
|
|
2132
|
+
printFailureSummary(failures);
|
|
2133
|
+
} else if (benchmarkResult.logs.length > 0) {
|
|
2134
|
+
console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
|
|
2135
|
+
for (const l of benchmarkResult.logs.slice(0, 5)) {
|
|
2016
2136
|
console.log(` ${l}`);
|
|
2017
2137
|
}
|
|
2018
2138
|
}
|
|
@@ -2371,6 +2491,326 @@ function consoleDebugReporter(results) {
|
|
|
2371
2491
|
console.log("\n------------------------------------\n");
|
|
2372
2492
|
}
|
|
2373
2493
|
|
|
2494
|
+
// src/reporters/console.summary.ts
|
|
2495
|
+
var colors3 = {
|
|
2496
|
+
reset: "\x1B[0m",
|
|
2497
|
+
bold: "\x1B[1m",
|
|
2498
|
+
dim: "\x1B[2m",
|
|
2499
|
+
green: "\x1B[32m",
|
|
2500
|
+
red: "\x1B[31m",
|
|
2501
|
+
yellow: "\x1B[33m",
|
|
2502
|
+
cyan: "\x1B[36m",
|
|
2503
|
+
magenta: "\x1B[35m",
|
|
2504
|
+
gray: "\x1B[90m",
|
|
2505
|
+
white: "\x1B[37m"
|
|
2506
|
+
};
|
|
2507
|
+
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
|
|
2508
|
+
var ID_NUM_REGEX = /_(\d+)$/;
|
|
2509
|
+
var REASONING_TAG = "think";
|
|
2510
|
+
var MAX_FAILURES_TO_DISPLAY = 5;
|
|
2511
|
+
var CATEGORY_DESCRIPTIONS = {
|
|
2512
|
+
PARSE_FAILURE: {
|
|
2513
|
+
label: "Parse Failure",
|
|
2514
|
+
description: "No tool calls extracted from model output",
|
|
2515
|
+
hint: "Model may have responded in text instead of tool format"
|
|
2516
|
+
},
|
|
2517
|
+
PARTIAL_CALLS: {
|
|
2518
|
+
label: "Partial Calls",
|
|
2519
|
+
description: "Some expected tool calls missing",
|
|
2520
|
+
hint: "Model stopped early or missed some tools"
|
|
2521
|
+
},
|
|
2522
|
+
EXTRA_CALLS: {
|
|
2523
|
+
label: "Extra Calls",
|
|
2524
|
+
description: "More tool calls than expected",
|
|
2525
|
+
hint: "Model called tools that weren't needed"
|
|
2526
|
+
},
|
|
2527
|
+
PARAM_VALUE_PERCENT: {
|
|
2528
|
+
label: "Param Value (Percent)",
|
|
2529
|
+
description: "Percentage sent as integer instead of decimal",
|
|
2530
|
+
hint: "e.g., 5 instead of 0.05 for 5%"
|
|
2531
|
+
},
|
|
2532
|
+
PARAM_VALUE_MISMATCH: {
|
|
2533
|
+
label: "Param Value Mismatch",
|
|
2534
|
+
description: "Parameter values don't match expected"
|
|
2535
|
+
},
|
|
2536
|
+
WRONG_FUNCTION: {
|
|
2537
|
+
label: "Wrong Function",
|
|
2538
|
+
description: "Called wrong function name"
|
|
2539
|
+
},
|
|
2540
|
+
MISSING_PARAMS: {
|
|
2541
|
+
label: "Missing Params",
|
|
2542
|
+
description: "Required parameters not provided"
|
|
2543
|
+
},
|
|
2544
|
+
UNEXPECTED_PARAMS: {
|
|
2545
|
+
label: "Unexpected Params",
|
|
2546
|
+
description: "Extra parameters that shouldn't be there"
|
|
2547
|
+
},
|
|
2548
|
+
NO_MATCH: {
|
|
2549
|
+
label: "No Match",
|
|
2550
|
+
description: "Function called but couldn't match to expected",
|
|
2551
|
+
hint: "Parameters may be correct but don't match any expected combination"
|
|
2552
|
+
},
|
|
2553
|
+
OTHER: {
|
|
2554
|
+
label: "Other",
|
|
2555
|
+
description: "Uncategorized failure"
|
|
2556
|
+
}
|
|
2557
|
+
};
|
|
2558
|
+
function parseFailureLogs(logs) {
|
|
2559
|
+
return logs.filter((log) => DEBUG_FAIL_REGEX2.test(log)).map((log) => {
|
|
2560
|
+
try {
|
|
2561
|
+
const jsonStr = log.replace(DEBUG_FAIL_REGEX2, "");
|
|
2562
|
+
return JSON.parse(jsonStr);
|
|
2563
|
+
} catch (e) {
|
|
2564
|
+
return null;
|
|
2565
|
+
}
|
|
2566
|
+
}).filter((parsed) => parsed !== null);
|
|
2567
|
+
}
|
|
2568
|
+
function groupByCategory(failures) {
|
|
2569
|
+
const groups = /* @__PURE__ */ new Map();
|
|
2570
|
+
for (const failure of failures) {
|
|
2571
|
+
const category = failure.category || "OTHER";
|
|
2572
|
+
const existing = groups.get(category);
|
|
2573
|
+
if (existing) {
|
|
2574
|
+
existing.failures.push(failure);
|
|
2575
|
+
} else {
|
|
2576
|
+
groups.set(category, { failures: [failure] });
|
|
2577
|
+
}
|
|
2578
|
+
}
|
|
2579
|
+
return groups;
|
|
2580
|
+
}
|
|
2581
|
+
function extractParamNames(failures) {
|
|
2582
|
+
const paramNames = /* @__PURE__ */ new Set();
|
|
2583
|
+
for (const f of failures) {
|
|
2584
|
+
if (!f.diff) {
|
|
2585
|
+
continue;
|
|
2586
|
+
}
|
|
2587
|
+
for (const d of f.diff) {
|
|
2588
|
+
if (d.startsWith("@@ param ")) {
|
|
2589
|
+
paramNames.add(d.replace("@@ param ", ""));
|
|
2590
|
+
}
|
|
2591
|
+
}
|
|
2592
|
+
}
|
|
2593
|
+
return paramNames;
|
|
2594
|
+
}
|
|
2595
|
+
function extractFinishReasons(failures) {
|
|
2596
|
+
var _a;
|
|
2597
|
+
const finishReasons = /* @__PURE__ */ new Set();
|
|
2598
|
+
for (const f of failures) {
|
|
2599
|
+
if ((_a = f.context) == null ? void 0 : _a.finish_reason) {
|
|
2600
|
+
finishReasons.add(String(f.context.finish_reason));
|
|
2601
|
+
}
|
|
2602
|
+
}
|
|
2603
|
+
return finishReasons;
|
|
2604
|
+
}
|
|
2605
|
+
function detectPatterns(group) {
|
|
2606
|
+
const { failures } = group;
|
|
2607
|
+
if (failures.length < 2) {
|
|
2608
|
+
return;
|
|
2609
|
+
}
|
|
2610
|
+
const firstCategory = failures[0].category;
|
|
2611
|
+
if (firstCategory === "PARAM_VALUE_PERCENT") {
|
|
2612
|
+
const paramNames = extractParamNames(failures);
|
|
2613
|
+
if (paramNames.size > 0) {
|
|
2614
|
+
group.pattern = `Affected params: ${[...paramNames].join(", ")}`;
|
|
2615
|
+
}
|
|
2616
|
+
}
|
|
2617
|
+
if (firstCategory === "PARSE_FAILURE") {
|
|
2618
|
+
const finishReasons = extractFinishReasons(failures);
|
|
2619
|
+
if (finishReasons.size === 1) {
|
|
2620
|
+
group.pattern = `All finished with: ${[...finishReasons][0]}`;
|
|
2621
|
+
}
|
|
2622
|
+
}
|
|
2623
|
+
}
|
|
2624
|
+
function getLineColor(line) {
|
|
2625
|
+
if (line.startsWith("+")) {
|
|
2626
|
+
return colors3.green;
|
|
2627
|
+
}
|
|
2628
|
+
if (line.startsWith("-")) {
|
|
2629
|
+
return colors3.red;
|
|
2630
|
+
}
|
|
2631
|
+
if (line.startsWith("@@")) {
|
|
2632
|
+
return colors3.cyan;
|
|
2633
|
+
}
|
|
2634
|
+
return colors3.white;
|
|
2635
|
+
}
|
|
2636
|
+
function formatFunctions(funcs) {
|
|
2637
|
+
if (Array.isArray(funcs)) {
|
|
2638
|
+
return funcs.join(", ");
|
|
2639
|
+
}
|
|
2640
|
+
return String(funcs);
|
|
2641
|
+
}
|
|
2642
|
+
function printExpectedActual(failure) {
|
|
2643
|
+
if (failure.expected) {
|
|
2644
|
+
const expFuncs = failure.expected.functions || failure.expected.function;
|
|
2645
|
+
if (expFuncs) {
|
|
2646
|
+
console.log(
|
|
2647
|
+
` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expFuncs)}`
|
|
2648
|
+
);
|
|
2649
|
+
}
|
|
2650
|
+
}
|
|
2651
|
+
if (failure.actual) {
|
|
2652
|
+
const actFuncs = failure.actual.functions || failure.actual.function;
|
|
2653
|
+
if (actFuncs) {
|
|
2654
|
+
const isEmpty = Array.isArray(actFuncs) && actFuncs.length === 0;
|
|
2655
|
+
const color = isEmpty ? colors3.red : colors3.white;
|
|
2656
|
+
const text = isEmpty ? "(none)" : formatFunctions(actFuncs);
|
|
2657
|
+
console.log(
|
|
2658
|
+
` ${colors3.gray}Actual:${colors3.reset} ${color}${text}${colors3.reset}`
|
|
2659
|
+
);
|
|
2660
|
+
}
|
|
2661
|
+
}
|
|
2662
|
+
}
|
|
2663
|
+
function printDiff(diff) {
|
|
2664
|
+
console.log(` ${colors3.gray}Diff:${colors3.reset}`);
|
|
2665
|
+
for (const line of diff.slice(0, MAX_FAILURES_TO_DISPLAY)) {
|
|
2666
|
+
const lineColor = getLineColor(line);
|
|
2667
|
+
console.log(` ${lineColor}${line}${colors3.reset}`);
|
|
2668
|
+
}
|
|
2669
|
+
}
|
|
2670
|
+
function removeReasoningTags(text) {
|
|
2671
|
+
const openTag = `<${REASONING_TAG}>`;
|
|
2672
|
+
const closeTag = `</${REASONING_TAG}>`;
|
|
2673
|
+
const closedTagPattern = new RegExp(
|
|
2674
|
+
`${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*?${closeTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`,
|
|
2675
|
+
"g"
|
|
2676
|
+
);
|
|
2677
|
+
const unclosedTagPattern = new RegExp(
|
|
2678
|
+
`${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*`,
|
|
2679
|
+
"g"
|
|
2680
|
+
);
|
|
2681
|
+
let result = text.replace(closedTagPattern, "");
|
|
2682
|
+
result = result.replace(unclosedTagPattern, "");
|
|
2683
|
+
return result.trim();
|
|
2684
|
+
}
|
|
2685
|
+
function printModelOutput(failure, category) {
|
|
2686
|
+
var _a, _b;
|
|
2687
|
+
if (category !== "PARSE_FAILURE") {
|
|
2688
|
+
return;
|
|
2689
|
+
}
|
|
2690
|
+
const rawText = ((_a = failure.context) == null ? void 0 : _a.raw_model_text_full) || ((_b = failure.context) == null ? void 0 : _b.raw_model_text) || "";
|
|
2691
|
+
const cleanedText = removeReasoningTags(rawText);
|
|
2692
|
+
if (cleanedText) {
|
|
2693
|
+
console.log(
|
|
2694
|
+
` ${colors3.gray}Model said:${colors3.reset} "${colors3.dim}${cleanedText}${colors3.reset}"`
|
|
2695
|
+
);
|
|
2696
|
+
} else {
|
|
2697
|
+
console.log(
|
|
2698
|
+
` ${colors3.gray}Model said:${colors3.reset} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`
|
|
2699
|
+
);
|
|
2700
|
+
}
|
|
2701
|
+
}
|
|
2702
|
+
function shouldShowDiffByDefault(category) {
|
|
2703
|
+
return category === "PARAM_VALUE_MISMATCH" || category === "PARAM_VALUE_PERCENT";
|
|
2704
|
+
}
|
|
2705
|
+
/**
 * Prints one failure: its id, then either the diff or the expected/actual
 * view, and finally the model output (which printModelOutput may skip).
 *
 * The diff is the primary view when the category asks for it and a
 * non-empty diff exists. Otherwise expected/actual is printed, followed by
 * the diff only in verbose mode.
 *
 * @param {object} failure - Failure entry with `id` and optional `diff`.
 * @param {string} category - Failure category name.
 * @param {boolean} verbose - Whether to print the secondary diff as well.
 */
function printSingleFailure(failure, category, verbose) {
  console.log(`\n${colors3.bold}${failure.id}${colors3.reset}`);

  const hasDiff = failure.diff && failure.diff.length > 0;
  const diffIsPrimary = shouldShowDiffByDefault(category) && hasDiff;

  if (!diffIsPrimary) {
    printExpectedActual(failure);
  }
  if (diffIsPrimary || (hasDiff && verbose)) {
    printDiff(failure.diff);
  }

  printModelOutput(failure, category);
}
|
|
2720
|
+
// Number of failures considered "already shown" before the remaining ids
// are summarised on one line.
var MAX_SAMPLE_FAILURES = 2;

/**
 * Prints a one-line summary of the failures beyond the first
 * MAX_SAMPLE_FAILURES entries.
 *
 * Each id is shortened to the first capture group of ID_NUM_REGEX when the
 * regex matches; otherwise the full id is kept.
 *
 * @param {Array<{id: string}>} failures - All failures of one category.
 */
function printRemainingIds(failures) {
  const shortIds = [];
  for (const { id } of failures.slice(MAX_SAMPLE_FAILURES)) {
    const numeric = id.match(ID_NUM_REGEX);
    shortIds.push(numeric ? numeric[1] : id);
  }
  const hiddenCount = failures.length - MAX_SAMPLE_FAILURES;
  console.log(
    `\n${colors3.dim}+${hiddenCount} more: ${shortIds.join(", ")}${colors3.reset}`
  );
}
|
|
2732
|
+
/**
 * Prints the heading of a failure category: a titled rule line carrying
 * the label and failure count, followed by the category description.
 *
 * @param {{label: string, description: string}} info - Category metadata.
 * @param {number} count - Number of failures in the category.
 */
function printCategoryHeader(info, count) {
  const rule = "\u2500\u2500\u2500\u2500\u2500";
  const title = `${colors3.cyan}${rule} ${info.label} (${count}) ${rule}${colors3.reset}`;
  console.log(`\n${title}`);
  console.log(`${colors3.dim}${info.description}${colors3.reset}`);
}
|
|
2739
|
+
/**
 * Prints everything known about one failure category: header, detected
 * pattern (if any), hint (if any) and the failures themselves.
 *
 * In verbose mode every failure is printed. Otherwise only the first
 * MAX_SAMPLE_FAILURES are printed and the rest are summarised through
 * printRemainingIds. The limit comes from the shared constant rather than
 * a literal so the samples shown here and the "+N more" summary (which
 * slices by the same constant) cannot drift apart.
 *
 * @param {string} category - Category key; unknown keys fall back to
 *   CATEGORY_DESCRIPTIONS.OTHER.
 * @param {{failures: Array<object>, pattern?: string}} group - Failures of
 *   this category plus an optional detected pattern.
 * @param {boolean} verbose - Print all failures instead of a sample.
 */
function printCategoryDetails(category, group, verbose) {
  const info = CATEGORY_DESCRIPTIONS[category] || CATEGORY_DESCRIPTIONS.OTHER;
  const { failures } = group;
  printCategoryHeader(info, failures.length);
  if (group.pattern) {
    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
  }
  if (info.hint) {
    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
  }
  const samplesToShow = verbose ? failures : failures.slice(0, MAX_SAMPLE_FAILURES);
  for (const failure of samplesToShow) {
    printSingleFailure(failure, category, verbose);
  }
  // Only summarise when some failures were actually left out above.
  if (!verbose && failures.length > MAX_SAMPLE_FAILURES) {
    printRemainingIds(failures);
  }
}
|
|
2757
|
+
function printResultHeader(result) {
|
|
2758
|
+
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
2759
|
+
const passed = benchmarkResult.metrics.correct_count;
|
|
2760
|
+
const total = benchmarkResult.metrics.total_cases;
|
|
2761
|
+
const scorePercent = (benchmarkResult.score * 100).toFixed(1);
|
|
2762
|
+
const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
|
|
2763
|
+
const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
|
|
2764
|
+
const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
|
|
2765
|
+
const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
|
|
2766
|
+
const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
|
|
2767
|
+
console.log(
|
|
2768
|
+
`
|
|
2769
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
|
|
2770
|
+
);
|
|
2771
|
+
console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
|
|
2772
|
+
}
|
|
2773
|
+
/**
 * Prints the summary for one evaluation result: the header, then the
 * failure categories ordered from most to fewest failures.
 *
 * Stops after the header when there are no logs. When logs exist but no
 * failure could be parsed from them, a notice is printed only if the
 * benchmark did not succeed.
 *
 * @param {object} result - Evaluation result; `result.result` holds the
 *   benchmark outcome including `logs` and `success`.
 * @param {boolean} verbose - Forwarded to printCategoryDetails.
 */
function printResultSummary(result, verbose) {
  const benchmarkResult = result.result;
  printResultHeader(result);

  const logs = benchmarkResult.logs;
  if (!logs || logs.length === 0) {
    return;
  }

  const failures = parseFailureLogs(logs);
  if (failures.length === 0) {
    if (!benchmarkResult.success) {
      console.log(
        `${colors3.yellow}No structured failure data available${colors3.reset}`
      );
    }
    return;
  }

  const groups = groupByCategory(failures);
  Array.from(groups.values()).forEach((group) => {
    detectPatterns(group);
  });

  // Largest categories first.
  const ranked = Array.from(groups.entries()).sort(
    (left, right) => right[1].failures.length - left[1].failures.length
  );
  ranked.forEach(([categoryName, group]) => {
    printCategoryDetails(categoryName, group, verbose);
  });
}
|
|
2799
|
+
function consoleSummaryReporter(results) {
|
|
2800
|
+
const verbose = process.env.VERBOSE === "true";
|
|
2801
|
+
console.log(`
|
|
2802
|
+
${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
|
|
2803
|
+
console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
|
|
2804
|
+
for (const result of results) {
|
|
2805
|
+
printResultSummary(result, verbose);
|
|
2806
|
+
}
|
|
2807
|
+
console.log(
|
|
2808
|
+
`
|
|
2809
|
+
${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
|
|
2810
|
+
`
|
|
2811
|
+
);
|
|
2812
|
+
}
|
|
2813
|
+
|
|
2374
2814
|
// src/reporters/json.ts
|
|
2375
2815
|
function jsonReporter(results) {
|
|
2376
2816
|
const serializableResults = results.map((r) => {
|
|
@@ -2390,60 +2830,56 @@ function jsonReporter(results) {
|
|
|
2390
2830
|
// Registry of available reporters, keyed by the reporter name callers
// select. "console.summary" is the condensed console report.
var reporters = {
  "console": consoleReporter,
  "json": jsonReporter,
  "console.debug": consoleDebugReporter,
  "console.summary": consoleSummaryReporter
};
|
|
2395
2836
|
|
|
2396
2837
|
// src/evaluate.ts
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
console.log(
|
|
2405
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
2406
|
-
);
|
|
2407
|
-
return {
|
|
2408
|
-
model: modelId,
|
|
2409
|
-
modelKey,
|
|
2410
|
-
benchmark: benchmark.name,
|
|
2411
|
-
result
|
|
2412
|
-
};
|
|
2413
|
-
} catch (error) {
|
|
2414
|
-
console.error(
|
|
2415
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
2416
|
-
error
|
|
2417
|
-
);
|
|
2418
|
-
return {
|
|
2419
|
-
model: modelId,
|
|
2420
|
-
modelKey,
|
|
2421
|
-
benchmark: benchmark.name,
|
|
2422
|
-
result: {
|
|
2423
|
-
score: 0,
|
|
2424
|
-
success: false,
|
|
2425
|
-
metrics: {},
|
|
2426
|
-
error: error instanceof Error ? error : new Error(String(error))
|
|
2427
|
-
}
|
|
2428
|
-
};
|
|
2838
|
+
/**
 * Checks whether a value looks like a model configuration: a non-null
 * object whose `model` property is itself a non-null object carrying a
 * `modelId` property.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when the value has the model-config shape.
 */
function isModelConfig(value) {
  const isNonNullObject = (candidate) =>
    typeof candidate === "object" && candidate !== null;

  if (!isNonNullObject(value) || !("model" in value)) {
    return false;
  }
  const inner = value.model;
  return isNonNullObject(inner) && "modelId" in inner;
}
|
|
2852
|
+
/**
 * Checks whether a value looks like a language model: a non-null object
 * with a string `modelId` property.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when the value has a string `modelId`.
 */
function isLanguageModel(value) {
  return (
    typeof value === "object" &&
    value !== null &&
    "modelId" in value &&
    typeof value.modelId === "string"
  );
}
|
|
2859
|
+
/**
 * Splits a models entry into its model and its middleware.
 *
 * A model configuration yields its `model` and `middleware` properties;
 * anything else is treated as the model itself, with no middleware.
 *
 * @param {object} input - Either a model configuration or a bare model.
 * @returns {[object, unknown]} Pair of model and middleware (or undefined).
 */
function extractModelAndMiddleware(input) {
  return isModelConfig(input)
    ? [input.model, input.middleware]
    : [input, void 0];
}
|
|
2431
2865
|
/**
 * Normalises the `models` option into a list of
 * `[modelKey, model, middleware]` entries.
 *
 * Accepted shapes, checked in this order: an array of entries, a single
 * model configuration, a single language model, or an object whose keys
 * become the model keys. Only the keyed-object shape produces a defined
 * model key.
 *
 * @param {object|Array<object>} models - Models in any accepted shape.
 * @returns {Array<[string|undefined, object, unknown]>} Normalised entries.
 */
function normalizeModels(models) {
  if (Array.isArray(models)) {
    return Array.from(models, (item) => {
      const [model, middleware] = extractModelAndMiddleware(item);
      return [void 0, model, middleware];
    });
  }
  if (isModelConfig(models)) {
    return [[void 0, models.model, models.middleware]];
  }
  if (isLanguageModel(models)) {
    return [[void 0, models, void 0]];
  }
  // Remaining case: an object mapping model keys to entries.
  return Object.entries(models).map(([key, item]) => {
    const [model, middleware] = extractModelAndMiddleware(item);
    return [key, model, middleware];
  });
}
|
|
2448
2884
|
function buildConfig(temperature, maxTokens) {
|
|
2449
2885
|
const config = {};
|
|
@@ -2464,21 +2900,90 @@ function executeReporter(reporter, results) {
|
|
|
2464
2900
|
reporters.console(results);
|
|
2465
2901
|
}
|
|
2466
2902
|
}
|
|
2903
|
+
/**
 * Builds the model that is actually evaluated by wrapping the base model
 * with the user's middleware and, when caching is enabled, a disk cache
 * middleware.
 *
 * User middleware comes first in the list, the cache middleware last. The
 * base model is returned untouched when there is nothing to wrap with.
 *
 * @param {object} baseModel - Model to wrap.
 * @param {object|Array<object>|undefined} userMiddleware - One middleware
 *   or a list of them.
 * @param {{enabled?: boolean, cacheDir?: string, debug?: boolean}|undefined}
 *   cacheOptions - Caching is active only when `enabled` is exactly true;
 *   `cacheDir` defaults to ".ai-cache" and `debug` to false.
 * @returns {object} The base model or its wrapped counterpart.
 */
function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
  const cacheEnabled = cacheOptions != null && cacheOptions.enabled === true;
  if (!cacheEnabled && !userMiddleware) {
    return baseModel;
  }

  let cacheMiddleware = null;
  if (cacheEnabled) {
    const { cacheDir, debug } = cacheOptions;
    cacheMiddleware = createDiskCacheMiddleware({
      cacheDir: cacheDir != null ? cacheDir : ".ai-cache",
      enabled: true,
      debug: debug != null ? debug : false
    });
  }

  const middlewares = [];
  if (userMiddleware) {
    const userList = Array.isArray(userMiddleware) ? userMiddleware : [userMiddleware];
    middlewares.push(...userList);
  }
  if (cacheMiddleware) {
    middlewares.push(cacheMiddleware);
  }

  // An empty user middleware array with caching off leaves nothing to apply.
  if (middlewares.length === 0) {
    return baseModel;
  }

  // A single middleware is passed on its own rather than as a one-item list.
  const middleware = middlewares.length === 1 ? middlewares[0] : middlewares;
  return wrapLanguageModel({ model: baseModel, middleware });
}
|
|
2934
|
+
/**
 * Runs one benchmark against one model and reports progress on stdout.
 *
 * A progress line is written before the run and rewritten (via carriage
 * return) with the score, or with "ERROR" when the run throws. Errors are
 * logged and converted into a failed result instead of being rethrown.
 *
 * @param {object} model - Model passed to `benchmark.run`; its string
 *   `modelId` is used for labelling, otherwise "unknown-model".
 * @param {{name: string, run: Function}} benchmark - Benchmark to execute.
 * @param {string|undefined} modelKey - Optional key shown next to the id.
 * @param {object} config - Configuration forwarded to `benchmark.run`.
 * @returns {Promise<object>} Entry with `model`, `modelKey`, `benchmark`
 *   and `result`.
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  let modelId = "unknown-model";
  if (
    typeof model === "object" &&
    model !== null &&
    "modelId" in model &&
    typeof model.modelId === "string"
  ) {
    modelId = model.modelId;
  }

  const keySuffix = modelKey ? ` (${modelKey})` : "";
  const prefix = `[${modelId}]${keySuffix} ${benchmark.name}`;
  const toEntry = (result) => ({
    model: modelId,
    modelKey,
    benchmark: benchmark.name,
    result
  });

  try {
    process.stdout.write(`${prefix}: ...`);
    const result = await benchmark.run(model, config);
    const scoreDisplay = result.score.toFixed(2);
    process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}\n`);
    return toEntry(result);
  } catch (error) {
    process.stdout.write(`\r${prefix}: .... Score: ERROR\n`);
    console.error(error);
    // Non-Error throwables are wrapped so callers always receive an Error.
    const normalized = error instanceof Error ? error : new Error(String(error));
    return toEntry({
      score: 0,
      success: false,
      metrics: {},
      error: normalized
    });
  }
}
|
|
2467
2966
|
async function evaluate(options) {
|
|
2468
2967
|
const {
|
|
2469
2968
|
models,
|
|
2470
2969
|
benchmarks,
|
|
2471
2970
|
reporter = "console",
|
|
2472
2971
|
temperature,
|
|
2473
|
-
maxTokens
|
|
2972
|
+
maxTokens,
|
|
2973
|
+
cache
|
|
2474
2974
|
} = options;
|
|
2475
2975
|
const modelEntries = normalizeModels(models);
|
|
2476
2976
|
const config = buildConfig(temperature, maxTokens);
|
|
2477
2977
|
const allResults = [];
|
|
2478
|
-
for (const [modelKey,
|
|
2978
|
+
for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
|
|
2979
|
+
const effectiveModel = buildEffectiveModel(
|
|
2980
|
+
baseModel,
|
|
2981
|
+
userMiddleware,
|
|
2982
|
+
cache
|
|
2983
|
+
);
|
|
2479
2984
|
for (const benchmark of benchmarks) {
|
|
2480
2985
|
const evaluationResult = await runSingleBenchmark(
|
|
2481
|
-
|
|
2986
|
+
effectiveModel,
|
|
2482
2987
|
benchmark,
|
|
2483
2988
|
modelKey,
|
|
2484
2989
|
config
|