@ai-sdk-tool/eval 1.0.0-canary.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -407,6 +407,7 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
407
407
  // src/benchmarks/bfcl.ts
408
408
  var LINE_SPLIT_REGEX = /\r?\n/;
409
409
  var NUMERIC_STRING_REGEX = /^\d+$/;
410
+ var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
410
411
  function convertGroundTruthToXML(call) {
411
412
  const keys = Object.keys(call);
412
413
  if (keys.length === 0) {
@@ -438,45 +439,67 @@ function convertGroundTruthToXML(call) {
438
439
  xml += `</${funcName}>`;
439
440
  return xml;
440
441
  }
442
/**
 * Maps a BFCL test-case id to the checker category used by `check`.
 *
 * Known id prefixes are matched with `startsWith`, so language-qualified
 * ids such as "simple_python_3" or "simple_java_0" resolve to "simple"
 * without needing their own branches. Ids that match no known prefix fall
 * back to the text before the first underscore.
 *
 * @param {string} id - Test-case identifier (e.g. "parallel_multiple_12").
 * @returns {string} "parallel_multiple", "parallel", "multiple", "simple",
 *   or the first underscore-separated segment of the id.
 */
function extractCategory(id) {
  // Coerce instead of throwing: a numeric or missing id simply resolves to
  // an unrecognised category rather than raising a TypeError in the caller.
  const key = id == null ? "" : String(id);
  // Order matters: "parallel_multiple" must be tested before "parallel".
  const knownPrefixes = ["parallel_multiple", "parallel", "multiple", "simple"];
  for (const prefix of knownPrefixes) {
    if (key.startsWith(prefix)) {
      return prefix;
    }
  }
  return key.split("_")[0];
}
441
466
  function check(testCase, modelOutput, possibleAnswer) {
442
- const category = testCase.id.split("_")[0];
467
+ const category = extractCategory(testCase.id);
443
468
  try {
444
- if (category === "simple") {
445
- if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
446
- return {
447
- valid: false,
448
- error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
449
- error_type: "simple:wrong_count"
450
- };
469
+ switch (category) {
470
+ case "simple": {
471
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
472
+ return {
473
+ valid: false,
474
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
475
+ error_type: "simple:wrong_count"
476
+ };
477
+ }
478
+ return simpleFunctionChecker(
479
+ testCase.function[0],
480
+ modelOutput[0],
481
+ possibleAnswer.ground_truth[0]
482
+ );
483
+ }
484
+ case "multiple": {
485
+ return multipleFunctionChecker(
486
+ testCase.function,
487
+ modelOutput,
488
+ possibleAnswer.ground_truth
489
+ );
490
+ }
491
+ case "parallel":
492
+ case "parallel_multiple": {
493
+ return parallelFunctionCheckerNoOrder(
494
+ testCase.function,
495
+ modelOutput,
496
+ possibleAnswer.ground_truth
497
+ );
498
+ }
499
+ default: {
500
+ return { valid: true };
451
501
  }
452
- return simpleFunctionChecker(
453
- testCase.function[0],
454
- modelOutput[0],
455
- possibleAnswer.ground_truth[0]
456
- );
457
- }
458
- if (category === "parallel") {
459
- return parallelFunctionCheckerNoOrder(
460
- testCase.function,
461
- modelOutput,
462
- possibleAnswer.ground_truth
463
- );
464
- }
465
- if (category === "multiple") {
466
- return multipleFunctionChecker(
467
- testCase.function,
468
- modelOutput,
469
- possibleAnswer.ground_truth
470
- );
471
- }
472
- if (category.includes("parallel-multiple")) {
473
- return parallelFunctionCheckerNoOrder(
474
- testCase.function,
475
- modelOutput,
476
- possibleAnswer.ground_truth
477
- );
478
502
  }
479
- return { valid: true };
480
503
  } catch (e) {
481
504
  return {
482
505
  valid: false,
@@ -654,7 +677,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
654
677
  return `- expected one of: ${formatted}`;
655
678
  })();
656
679
  diffLines.push(expectedLine);
657
- diffLines.push(`+ got: ${JSON.stringify(got)}`);
680
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
658
681
  return diffLines;
659
682
  };
660
683
  const paramValueMatches = (allowed, got) => {
@@ -871,44 +894,97 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
871
894
  );
872
895
  }
873
896
  };
874
- const buildFailureContext = (options) => {
875
- const {
876
- testCase,
877
- tools,
878
- flatMessages,
879
- mwOriginalText,
880
- text,
881
- finishReason,
882
- mwParsedToolCalls,
883
- restoredCalls,
884
- possibleAnswer
885
- } = options;
886
- const lastUser = (() => {
887
- var _a;
888
- const reversed = [...flatMessages].reverse();
889
- const found = reversed.find(
890
- (m) => m.role === "user"
891
- );
892
- return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
893
- })();
894
- const rawModelText = (() => {
895
- if (mwOriginalText && mwOriginalText.length > 0) {
896
- return mwOriginalText;
897
+ const hasPercentPattern = (diff) => {
898
+ return diff.some((d) => {
899
+ if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
900
+ return false;
897
901
  }
898
- if (typeof text === "string") {
899
- return text;
902
+ const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
903
+ if (!numMatch) {
904
+ return false;
900
905
  }
901
- return "";
902
- })();
903
- return {
904
- id: testCase.id,
905
- tool_schema: tools,
906
- last_user_query: lastUser,
907
- raw_model_text: rawModelText,
908
- finish_reason: finishReason,
909
- parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
910
- ground_truth: possibleAnswer.ground_truth
911
- };
906
+ const num = Number.parseFloat(numMatch[1]);
907
+ return num >= 1 && num <= 100;
908
+ });
909
+ };
910
/**
 * Reports whether a failure is a parameter-value error: either the checker's
 * error type mentions "value_error", or some diff line starts with "@@ param".
 *
 * @param {string|null|undefined} errorType - Error type from the checker result.
 * @param {string[]} diff - Diff lines built for the failure.
 * @returns {boolean}
 */
const isValueError = (errorType, diff) => {
  if (errorType != null && errorType.includes("value_error")) {
    return true;
  }
  return diff.some((line) => line.startsWith("@@ param"));
};
913
/**
 * Reports whether a failure is a wrong-function error: either the error type
 * mentions "wrong_func_name", or some diff line contains "function name".
 *
 * @param {string|null|undefined} errorType - Error type from the checker result.
 * @param {string[]} diff - Diff lines built for the failure.
 * @returns {boolean}
 */
const isFunctionNameError = (errorType, diff) => {
  if (errorType != null && errorType.includes("wrong_func_name")) {
    return true;
  }
  return diff.some((line) => line.includes("function name"));
};
916
/**
 * Reports whether a failure is a missing-parameter error: either the error
 * type mentions "missing_required", or some diff line contains
 * "missing required param".
 *
 * @param {string|null|undefined} errorType - Error type from the checker result.
 * @param {string[]} diff - Diff lines built for the failure.
 * @returns {boolean}
 */
const isMissingParamError = (errorType, diff) => {
  if (errorType != null && errorType.includes("missing_required")) {
    return true;
  }
  return diff.some((line) => line.includes("missing required param"));
};
919
/**
 * Reports whether a failure is an unexpected-parameter error: either the
 * error type mentions "unexpected_param", or some diff line contains
 * "unexpected param".
 *
 * @param {string|null|undefined} errorType - Error type from the checker result.
 * @param {string[]} diff - Diff lines built for the failure.
 * @returns {boolean}
 */
const isUnexpectedParamError = (errorType, diff) => {
  if (errorType != null && errorType.includes("unexpected_param")) {
    return true;
  }
  return diff.some((line) => line.includes("unexpected param"));
};
922
/**
 * Derives a failure category from the checker's error type and the diff lines.
 *
 * Rules are tried in a fixed order: value error, wrong function, missing
 * params, unexpected params, and finally a "cannot_find_match" error type.
 *
 * @param {string|null|undefined} errorType - Error type from the checker result.
 * @param {string[]} diff - Diff lines built for the failure.
 * @returns {string|null} The category label, or null when no rule matches.
 */
const classifyByErrorPatterns = (errorType, diff) => {
  // Computed up front, before any rule is tested, exactly as the rule table did.
  const valueLabel = hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH";
  if (isValueError(errorType, diff)) {
    return valueLabel;
  }
  if (isFunctionNameError(errorType, diff)) {
    return "WRONG_FUNCTION";
  }
  if (isMissingParamError(errorType, diff)) {
    return "MISSING_PARAMS";
  }
  if (isUnexpectedParamError(errorType, diff)) {
    return "UNEXPECTED_PARAMS";
  }
  const noMatch = errorType != null && errorType.includes("cannot_find_match");
  return noMatch ? "NO_MATCH" : null;
};
942
/**
 * Classifies a failure purely by comparing how many tool calls were produced
 * with how many were expected.
 *
 * @param {number} actualCount - Number of tool calls the model produced.
 * @param {number} expectedCount - Number of tool calls in the ground truth.
 * @returns {string|null} "PARSE_FAILURE" (none produced but some expected),
 *   "PARTIAL_CALLS" (some, but fewer than expected), "EXTRA_CALLS" (more than
 *   expected), or null when the counts do not explain the failure.
 */
const classifyByCallCount = (actualCount, expectedCount) => {
  const nothingProduced = actualCount === 0 && expectedCount > 0;
  const tooFew = actualCount > 0 && actualCount < expectedCount;
  const tooMany = actualCount > expectedCount;
  if (nothingProduced) {
    return "PARSE_FAILURE";
  }
  if (tooFew) {
    return "PARTIAL_CALLS";
  }
  return tooMany ? "EXTRA_CALLS" : null;
};
954
/**
 * Picks the failure category for a failed test case.
 *
 * A call-count mismatch takes precedence; otherwise the error type and diff
 * are inspected; if neither yields a category the result is "OTHER".
 *
 * @param {object} options
 * @param {string|null|undefined} options.errorType - Checker error type.
 * @param {Array|*} options.restoredCalls - Tool calls produced by the model;
 *   anything that is not an array counts as zero calls.
 * @param {number} options.expectedCount - Number of expected tool calls.
 * @param {string[]} options.diff - Diff lines built for the failure.
 * @returns {string} The category label.
 */
const classifyFailureType = (options) => {
  const { errorType, restoredCalls, expectedCount, diff } = options;
  const producedCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;
  return (
    classifyByCallCount(producedCount, expectedCount) ||
    classifyByErrorPatterns(errorType, diff) ||
    "OTHER"
  );
};
970
/**
 * Chooses the raw model text to record for a failure.
 *
 * @param {string|null|undefined} mwOriginalText - Preferred whenever it is
 *   non-empty.
 * @param {*} text - Fallback; used only when it is a string.
 * @returns {string} `mwOriginalText` if non-empty, else `text` if it is a
 *   string, else the empty string.
 */
const extractRawModelText = (mwOriginalText, text) => {
  const hasOriginal = Boolean(mwOriginalText) && mwOriginalText.length > 0;
  if (hasOriginal) {
    return mwOriginalText;
  }
  return typeof text === "string" ? text : "";
};
979
/**
 * Returns the content of the most recent message whose role is "user",
 * shortened to 200 characters plus "..." when it is longer than that.
 *
 * @param {Iterable<{role: string, content: *}>} flatMessages - Conversation
 *   messages, oldest first.
 * @returns {*} The (possibly shortened) content, or "" when there is no user
 *   message or its content is null/undefined.
 */
const extractLastUserQuery = (flatMessages) => {
  const messages = [...flatMessages];
  let content = "";
  // Walk from the end so the first hit is the latest user message.
  for (let index = messages.length - 1; index >= 0; index -= 1) {
    const message = messages[index];
    if (message.role === "user") {
      if (message.content != null) {
        content = message.content;
      }
      break;
    }
  }
  if (content.length > 200) {
    return `${content.slice(0, 200)}...`;
  }
  return content;
};
986
/**
 * Shortens `text` to at most `maxLen` characters, appending "..." when
 * anything was cut off.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLen - Maximum number of characters to keep.
 * @returns {string} The original text, or its first `maxLen` characters
 *   followed by "...".
 */
const truncateText = (text, maxLen) => {
  if (text.length > maxLen) {
    const head = text.slice(0, maxLen);
    return `${head}...`;
  }
  return text;
};
913
989
  const logFailureDetails = (options) => {
914
990
  const {
@@ -926,42 +1002,36 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
926
1002
  } = options;
927
1003
  try {
928
1004
  const category = testCase.id.split("_")[0];
929
- const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
930
- tools,
931
- possibleAnswer,
932
- restoredCalls
933
- ) : buildParallelDiff(
934
- tools,
935
- possibleAnswer,
936
- restoredCalls
937
- );
938
- caseLogs.push(
939
- `[DEBUG-FAIL] ${JSON.stringify({
940
- id: testCase.id,
941
- message: checkerResult.error,
942
- error_type: checkerResult.error_type,
943
- expected,
944
- actual,
945
- diff
946
- })}`
947
- );
948
- try {
949
- const contextPayload = buildFailureContext({
950
- testCase,
951
- tools,
952
- flatMessages,
953
- mwOriginalText,
954
- text,
955
- finishReason,
956
- mwParsedToolCalls,
1005
+ const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
1006
+ const gtArr = possibleAnswer.ground_truth;
1007
+ const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
1008
+ const rawModelText = extractRawModelText(mwOriginalText, text);
1009
+ const lastUserQuery = extractLastUserQuery(flatMessages);
1010
+ const failurePayload = {
1011
+ id: testCase.id,
1012
+ category: classifyFailureType({
1013
+ errorType: checkerResult.error_type,
957
1014
  restoredCalls,
958
- possibleAnswer
959
- });
960
- caseLogs.push(
961
- `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
962
- );
963
- } catch (e) {
964
- }
1015
+ expectedCount,
1016
+ diff
1017
+ }),
1018
+ message: checkerResult.error,
1019
+ error_type: checkerResult.error_type,
1020
+ expected,
1021
+ actual,
1022
+ diff,
1023
+ context: {
1024
+ raw_model_text: truncateText(rawModelText, 500),
1025
+ raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
1026
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1027
+ expected_count: expectedCount,
1028
+ actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
1029
+ finish_reason: finishReason,
1030
+ last_user_query: lastUserQuery,
1031
+ tool_names: tools.map((t) => t.name)
1032
+ }
1033
+ };
1034
+ caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
965
1035
  } catch (e) {
966
1036
  caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
967
1037
  }
@@ -1186,14 +1256,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1186
1256
  };
1187
1257
  }
1188
1258
  const score = correctCount / testCases.length;
1259
+ const caseResults = resultsPerCase.map((r, i) => ({
1260
+ id: testCases[i].id,
1261
+ valid: r.valid
1262
+ }));
1189
1263
  return {
1190
1264
  score,
1191
1265
  success: score > 0.95,
1192
- // High success threshold as requested
1193
1266
  metrics: {
1194
1267
  correct_count: correctCount,
1195
1268
  total_cases: testCases.length,
1196
- accuracy: score
1269
+ accuracy: score,
1270
+ case_results: JSON.stringify(caseResults)
1197
1271
  },
1198
1272
  logs
1199
1273
  };
@@ -1213,27 +1287,27 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1213
1287
  }
1214
1288
  var bfclSimpleBenchmark = createBfclBenchmark(
1215
1289
  "bfcl-simple",
1216
- "BFCL Simple Function Calling",
1217
- "BFCL_v3_simple.jsonl",
1218
- "BFCL_v3_simple_possible_answer.jsonl"
1290
+ "BFCL v4 Simple Function Calling",
1291
+ "BFCL_v4_simple.jsonl",
1292
+ "BFCL_v4_simple_possible_answer.jsonl"
1219
1293
  );
1220
1294
  var bfclParallelBenchmark = createBfclBenchmark(
1221
1295
  "bfcl-parallel",
1222
- "BFCL Parallel Function Calling",
1223
- "BFCL_v3_parallel.jsonl",
1224
- "BFCL_v3_parallel_possible_answer.jsonl"
1296
+ "BFCL v4 Parallel Function Calling",
1297
+ "BFCL_v4_parallel.jsonl",
1298
+ "BFCL_v4_parallel_possible_answer.jsonl"
1225
1299
  );
1226
1300
  var bfclMultipleBenchmark = createBfclBenchmark(
1227
1301
  "bfcl-multiple",
1228
- "BFCL Multiple Function Calling",
1229
- "BFCL_v3_multiple.jsonl",
1230
- "BFCL_v3_multiple_possible_answer.jsonl"
1302
+ "BFCL v4 Multiple Function Calling",
1303
+ "BFCL_v4_multiple.jsonl",
1304
+ "BFCL_v4_multiple_possible_answer.jsonl"
1231
1305
  );
1232
1306
  var bfclParallelMultipleBenchmark = createBfclBenchmark(
1233
1307
  "bfcl-parallel-multiple",
1234
- "BFCL Parallel & Multiple Function Calling",
1235
- "BFCL_v3_parallel_multiple.jsonl",
1236
- "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1308
+ "BFCL v4 Parallel & Multiple Function Calling",
1309
+ "BFCL_v4_parallel_multiple.jsonl",
1310
+ "BFCL_v4_parallel_multiple_possible_answer.jsonl"
1237
1311
  );
1238
1312
 
1239
1313
  // src/benchmarks/complex-func-bench.ts
@@ -1960,23 +2034,28 @@ var jsonGenerationSchemaOnlyBenchmark = {
1960
2034
  }
1961
2035
  };
1962
2036
 
2037
+ // src/evaluate.ts
2038
+ var import_middleware = require("@ai-sdk-tool/middleware");
2039
+ var import_ai4 = require("ai");
2040
+
1963
2041
  // src/reporters/console.ts
1964
2042
  var colors = {
1965
2043
  reset: "\x1B[0m",
2044
+ bold: "\x1B[1m",
1966
2045
  green: "\x1B[32m",
1967
2046
  red: "\x1B[31m",
1968
2047
  yellow: "\x1B[33m",
1969
2048
  cyan: "\x1B[36m",
1970
2049
  magenta: "\x1B[35m",
1971
2050
  gray: "\x1B[90m",
1972
- white: "\x1B[37m",
1973
- bgRed: "\x1B[41m"
2051
+ white: "\x1B[37m"
1974
2052
  };
2053
+ var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
1975
2054
  function formatDiff(diff) {
1976
2055
  if (!diff || diff.length === 0) {
1977
2056
  return "";
1978
2057
  }
1979
- return diff.map((line) => {
2058
+ return diff.slice(0, 8).map((line) => {
1980
2059
  if (line.startsWith("-")) {
1981
2060
  return `${colors.red}${line}${colors.reset}`;
1982
2061
  }
@@ -1989,65 +2068,106 @@ function formatDiff(diff) {
1989
2068
  return line;
1990
2069
  }).join("\n ");
1991
2070
  }
1992
- function printFailLogs(logs) {
1993
- const failLogs = logs.filter((l) => l.startsWith("[DEBUG-FAIL]"));
1994
- for (const log of failLogs) {
2071
+ function parseFailures(logs) {
2072
+ const failures = [];
2073
+ for (const log of logs) {
2074
+ if (!DEBUG_FAIL_REGEX.test(log)) {
2075
+ continue;
2076
+ }
1995
2077
  try {
1996
- const jsonStr = log.replace("[DEBUG-FAIL] ", "");
1997
- const data = JSON.parse(jsonStr);
1998
- console.log(`
1999
- ${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
2000
- console.log(
2001
- ` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
2002
- );
2003
- console.log(` Message: ${data.message}`);
2004
- if (data.diff && Array.isArray(data.diff)) {
2005
- console.log(` Diff:
2006
- ${formatDiff(data.diff)}`);
2007
- }
2008
- if (data.expected && data.actual) {
2009
- const expStr = JSON.stringify(data.expected);
2010
- const actStr = JSON.stringify(data.actual);
2011
- if (expStr.length < 100 && actStr.length < 100) {
2012
- console.log(` Expected: ${colors.gray}${expStr}${colors.reset}`);
2013
- console.log(` Actual: ${colors.gray}${actStr}${colors.reset}`);
2014
- }
2015
- }
2016
- } catch (_e) {
2017
- console.log(` Raw Log: ${log}`);
2078
+ const jsonStr = log.replace(DEBUG_FAIL_REGEX, "");
2079
+ const parsed = JSON.parse(jsonStr);
2080
+ failures.push(parsed);
2081
+ } catch (e) {
2018
2082
  }
2019
2083
  }
2084
+ return failures;
2085
+ }
2086
/**
 * Buckets parsed failures by their `category`, using "OTHER" for failures
 * with no (or an empty) category.
 *
 * @param {Array<{category?: string}>} failures - Parsed failure payloads.
 * @returns {Map<string, Array>} Category -> failures, in first-seen order.
 */
function groupFailuresByCategory(failures) {
  const buckets = new Map();
  for (const entry of failures) {
    const key = entry.category || "OTHER";
    if (!buckets.has(key)) {
      buckets.set(key, []);
    }
    buckets.get(key).push(entry);
  }
  return buckets;
}
2099
/**
 * Prints a compact description of one failure to the console: its id and
 * category, then the message and diff when present, and, for PARSE_FAILURE
 * only, the first 80 characters of the recorded model text.
 *
 * @param {object} failure - Parsed failure payload.
 */
function printCompactFailure(failure) {
  const label = failure.category || "OTHER";
  console.log(`\n${colors.red}${failure.id}${colors.reset} [${colors.yellow}${label}${colors.reset}]`);
  if (failure.message) {
    console.log(` ${failure.message}`);
  }
  if (failure.diff && failure.diff.length > 0) {
    console.log(` ${formatDiff(failure.diff)}`);
  }
  const context = failure.context;
  const modelText = context == null ? void 0 : context.raw_model_text;
  if (modelText && failure.category === "PARSE_FAILURE") {
    const preview = modelText.length > 80 ? `${modelText.slice(0, 80)}...` : modelText;
    console.log(` ${colors.gray}Model: "${preview}"${colors.reset}`);
  }
}
2117
/**
 * Prints a per-category failure count (largest category first), followed by
 * compact details for the first five failures and, when there are more, a
 * one-line preview of up to five of the remaining ids.
 *
 * @param {Array<object>} failures - Parsed failure payloads.
 */
function printFailureSummary(failures) {
  const maxToShow = 5;
  const ranked = [...groupFailuresByCategory(failures).entries()].sort(
    (left, right) => right[1].length - left[1].length
  );
  console.log(`\n${colors.bold}Failures by category:${colors.reset}`);
  ranked.forEach(([category, members]) => {
    console.log(` ${colors.yellow}${category}${colors.reset}: ${members.length}`);
  });
  failures.slice(0, maxToShow).forEach((failure) => {
    printCompactFailure(failure);
  });
  if (failures.length > maxToShow) {
    const hiddenIds = failures.slice(maxToShow).map((failure) => failure.id);
    const preview = hiddenIds.slice(0, 5).join(", ");
    const ellipsis = hiddenIds.length > 5 ? "..." : "";
    const hiddenCount = failures.length - maxToShow;
    console.log(`\n${colors.gray}+${hiddenCount} more: ${preview}${ellipsis}${colors.reset}`);
  }
}
2021
2145
  function printResult(result) {
2022
2146
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
2023
- const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
2147
+ const passed = benchmarkResult.metrics.correct_count;
2148
+ const total = benchmarkResult.metrics.total_cases;
2149
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2150
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2151
+ const statusColor = benchmarkResult.success ? colors.green : colors.red;
2024
2152
  console.log(
2025
2153
  `
2026
2154
  ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
2027
2155
  );
2028
2156
  console.log(
2029
- ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
2157
+ ` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
2030
2158
  );
2031
- const metrics = Object.entries(benchmarkResult.metrics);
2032
- if (metrics.length > 0) {
2033
- console.log(" Metrics:");
2034
- for (const [key, value] of metrics) {
2035
- console.log(` - ${key}: ${value}`);
2036
- }
2037
- }
2038
2159
  if (benchmarkResult.error) {
2039
2160
  console.log(
2040
2161
  ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
2041
2162
  );
2042
2163
  }
2043
2164
  if (!benchmarkResult.success && benchmarkResult.logs) {
2044
- printFailLogs(benchmarkResult.logs);
2045
- const failLogs = benchmarkResult.logs.filter(
2046
- (l) => l.startsWith("[DEBUG-FAIL]")
2047
- );
2048
- if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
2049
- console.log(" Raw Logs (Sample):");
2050
- for (const l of benchmarkResult.logs.slice(0, 10)) {
2165
+ const failures = parseFailures(benchmarkResult.logs);
2166
+ if (failures.length > 0) {
2167
+ printFailureSummary(failures);
2168
+ } else if (benchmarkResult.logs.length > 0) {
2169
+ console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
2170
+ for (const l of benchmarkResult.logs.slice(0, 5)) {
2051
2171
  console.log(` ${l}`);
2052
2172
  }
2053
2173
  }
@@ -2406,6 +2526,326 @@ function consoleDebugReporter(results) {
2406
2526
  console.log("\n------------------------------------\n");
2407
2527
  }
2408
2528
 
2529
+ // src/reporters/console.summary.ts
2530
// ANSI escape sequences used by the summary reporter.
var colors3 = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  dim: "\x1B[2m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  white: "\x1B[37m"
};

// Prefix that marks a structured failure entry in the benchmark logs.
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;

// Captures the trailing numeric suffix of a test-case id (e.g. "simple_12").
var ID_NUM_REGEX = /_(\d+)$/;

// Tag name whose <tag>...</tag> content is stripped from model output.
var REASONING_TAG = "think";

// Display cap shared by the summary reporter's printers.
var MAX_FAILURES_TO_DISPLAY = 5;

// Human-readable label, description and optional hint per failure category.
var CATEGORY_DESCRIPTIONS = {
  PARSE_FAILURE: {
    label: "Parse Failure",
    description: "No tool calls extracted from model output",
    hint: "Model may have responded in text instead of tool format"
  },
  PARTIAL_CALLS: {
    label: "Partial Calls",
    description: "Some expected tool calls missing",
    hint: "Model stopped early or missed some tools"
  },
  EXTRA_CALLS: {
    label: "Extra Calls",
    description: "More tool calls than expected",
    hint: "Model called tools that weren't needed"
  },
  PARAM_VALUE_PERCENT: {
    label: "Param Value (Percent)",
    description: "Percentage sent as integer instead of decimal",
    hint: "e.g., 5 instead of 0.05 for 5%"
  },
  PARAM_VALUE_MISMATCH: {
    label: "Param Value Mismatch",
    description: "Parameter values don't match expected"
  },
  WRONG_FUNCTION: {
    label: "Wrong Function",
    description: "Called wrong function name"
  },
  MISSING_PARAMS: {
    label: "Missing Params",
    description: "Required parameters not provided"
  },
  UNEXPECTED_PARAMS: {
    label: "Unexpected Params",
    description: "Extra parameters that shouldn't be there"
  },
  NO_MATCH: {
    label: "No Match",
    description: "Function called but couldn't match to expected",
    hint: "Parameters may be correct but don't match any expected combination"
  },
  OTHER: {
    label: "Other",
    description: "Uncategorized failure"
  }
};
2593
/**
 * Extracts the structured failure payloads from benchmark log lines.
 *
 * Only lines starting with the "[DEBUG-FAIL] " prefix are considered. Lines
 * whose remainder is not valid JSON, or parses to null, are dropped.
 *
 * @param {string[]} logs - Raw benchmark log lines.
 * @returns {Array} Parsed failure payloads, in log order.
 */
function parseFailureLogs(logs) {
  const payloads = [];
  for (const line of logs) {
    if (!DEBUG_FAIL_REGEX2.test(line)) {
      continue;
    }
    let payload = null;
    try {
      payload = JSON.parse(line.replace(DEBUG_FAIL_REGEX2, ""));
    } catch (e) {
      // Malformed entry: leave payload as null so it is skipped below.
      payload = null;
    }
    if (payload !== null) {
      payloads.push(payload);
    }
  }
  return payloads;
}
2603
/**
 * Buckets parsed failures by `category` ("OTHER" when missing or empty).
 * Each bucket is an object holding a `failures` array.
 *
 * @param {Array<{category?: string}>} failures - Parsed failure payloads.
 * @returns {Map<string, {failures: Array}>} Category -> group, in first-seen
 *   order.
 */
function groupByCategory(failures) {
  const buckets = new Map();
  for (const entry of failures) {
    const key = entry.category || "OTHER";
    if (!buckets.has(key)) {
      buckets.set(key, { failures: [] });
    }
    buckets.get(key).failures.push(entry);
  }
  return buckets;
}
2616
/**
 * Collects the distinct parameter names referenced by "@@ param <name>" diff
 * lines across a set of failures.
 *
 * @param {Array<{diff?: string[]}>} failures - Parsed failure payloads.
 * @returns {Set<string>} Parameter names, in first-seen order.
 */
function extractParamNames(failures) {
  const marker = "@@ param ";
  const names = new Set();
  for (const failure of failures) {
    if (failure.diff) {
      for (const line of failure.diff) {
        if (line.startsWith(marker)) {
          names.add(line.slice(marker.length));
        }
      }
    }
  }
  return names;
}
2630
/**
 * Collects the distinct (stringified) finish reasons recorded in the
 * `context` of a set of failures. Failures without a context or with a falsy
 * finish reason are ignored.
 *
 * @param {Array<{context?: {finish_reason?: *}}>} failures - Parsed payloads.
 * @returns {Set<string>} Finish reasons, in first-seen order.
 */
function extractFinishReasons(failures) {
  const reasons = new Set();
  for (const failure of failures) {
    const context = failure.context;
    const reason = context == null ? void 0 : context.finish_reason;
    if (reason) {
      reasons.add(String(reason));
    }
  }
  return reasons;
}
2640
/**
 * Annotates a failure group with a `pattern` string when its failures share
 * something notable. Groups with fewer than two failures are left untouched.
 *
 * - PARAM_VALUE_PERCENT groups: lists the affected parameter names, if any.
 * - PARSE_FAILURE groups: notes the finish reason when exactly one distinct
 *   reason was recorded.
 *
 * The category is taken from the first failure in the group.
 *
 * @param {{failures: Array<object>, pattern?: string}} group - Mutated in place.
 * @returns {void}
 */
function detectPatterns(group) {
  const { failures } = group;
  if (failures.length < 2) {
    return;
  }
  switch (failures[0].category) {
    case "PARAM_VALUE_PERCENT": {
      const names = extractParamNames(failures);
      if (names.size > 0) {
        group.pattern = `Affected params: ${[...names].join(", ")}`;
      }
      break;
    }
    case "PARSE_FAILURE": {
      const reasons = extractFinishReasons(failures);
      if (reasons.size === 1) {
        group.pattern = `All finished with: ${[...reasons][0]}`;
      }
      break;
    }
    default: {
      break;
    }
  }
}
2659
/**
 * Chooses the ANSI color for a diff line from its leading marker:
 * "+" -> green, "-" -> red, "@@" -> cyan, anything else -> white.
 *
 * @param {string} line - A single diff line.
 * @returns {string} The ANSI color escape sequence.
 */
function getLineColor(line) {
  const markerColors = [
    ["+", colors3.green],
    ["-", colors3.red],
    ["@@", colors3.cyan]
  ];
  for (const [marker, color] of markerColors) {
    if (line.startsWith(marker)) {
      return color;
    }
  }
  return colors3.white;
}
2671
/**
 * Renders a function name or a list of function names for display.
 *
 * @param {*} funcs - An array of names, or a single value.
 * @returns {string} Array items joined with ", ", or `String(funcs)`.
 */
function formatFunctions(funcs) {
  return Array.isArray(funcs) ? funcs.join(", ") : String(funcs);
}
2677
/**
 * Prints the expected and actual function names of a failure, when present.
 * Names are read from `functions`, falling back to `function`. An empty
 * actual list is shown as "(none)" in red.
 *
 * @param {{expected?: object, actual?: object}} failure - Parsed payload.
 */
function printExpectedActual(failure) {
  const { expected, actual } = failure;
  if (expected) {
    const expectedNames = expected.functions || expected.function;
    if (expectedNames) {
      console.log(` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expectedNames)}`);
    }
  }
  if (actual) {
    const actualNames = actual.functions || actual.function;
    if (actualNames) {
      const nothingCalled = Array.isArray(actualNames) && actualNames.length === 0;
      const tint = nothingCalled ? colors3.red : colors3.white;
      const shown = nothingCalled ? "(none)" : formatFunctions(actualNames);
      console.log(` ${colors3.gray}Actual:${colors3.reset} ${tint}${shown}${colors3.reset}`);
    }
  }
}
2698
/**
 * Prints a "Diff:" heading followed by the first MAX_FAILURES_TO_DISPLAY
 * diff lines, each colored according to its leading marker.
 *
 * @param {string[]} diff - Diff lines for one failure.
 */
function printDiff(diff) {
  console.log(` ${colors3.gray}Diff:${colors3.reset}`);
  diff.slice(0, MAX_FAILURES_TO_DISPLAY).forEach((line) => {
    console.log(` ${getLineColor(line)}${line}${colors3.reset}`);
  });
}
2705
/**
 * Strips reasoning sections delimited by the REASONING_TAG element from
 * model output: first every closed <tag>...</tag> pair, then anything from a
 * remaining unclosed opening tag to the end of the text. The result is
 * trimmed.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The text with reasoning removed.
 */
function removeReasoningTags(text) {
  // Escape regex metacharacters so the tag is matched literally.
  const escapeForRegex = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const open = escapeForRegex(`<${REASONING_TAG}>`);
  const close = escapeForRegex(`</${REASONING_TAG}>`);
  const closedSection = new RegExp(`${open}[\\s\\S]*?${close}`, "g");
  const danglingSection = new RegExp(`${open}[\\s\\S]*`, "g");
  return text.replace(closedSection, "").replace(danglingSection, "").trim();
}
2720
/**
 * For PARSE_FAILURE failures only, prints what the model said with reasoning
 * sections removed. The full recorded text is preferred over the truncated
 * one; when nothing remains after stripping, a placeholder is printed.
 *
 * @param {{context?: object}} failure - Parsed failure payload.
 * @param {string} category - Category the failure is being printed under.
 */
function printModelOutput(failure, category) {
  if (category !== "PARSE_FAILURE") {
    return;
  }
  const context = failure.context;
  const fullText = context == null ? void 0 : context.raw_model_text_full;
  const shortText = context == null ? void 0 : context.raw_model_text;
  const visibleText = removeReasoningTags(fullText || shortText || "");
  const heading = ` ${colors3.gray}Model said:${colors3.reset}`;
  if (visibleText) {
    console.log(`${heading} "${colors3.dim}${visibleText}${colors3.reset}"`);
    return;
  }
  console.log(`${heading} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`);
}
2737
/**
 * Tells whether failures of this category are best explained by their diff,
 * which is the case for the two parameter-value categories.
 *
 * @param {string} category - Failure category label.
 * @returns {boolean}
 */
function shouldShowDiffByDefault(category) {
  switch (category) {
    case "PARAM_VALUE_MISMATCH":
    case "PARAM_VALUE_PERCENT":
      return true;
    default:
      return false;
  }
}
2740
/**
 * Prints one failure: its id, then either the diff (for categories where the
 * diff is the primary evidence and one exists) or the expected/actual
 * function names, with the diff added in verbose mode. Finally prints the
 * model output where applicable.
 *
 * @param {object} failure - Parsed failure payload.
 * @param {string} category - Category the failure is being printed under.
 * @param {boolean} verbose - Whether to include secondary details.
 */
function printSingleFailure(failure, category, verbose) {
  console.log(`\n${colors3.bold}${failure.id}${colors3.reset}`);
  const hasDiff = failure.diff && failure.diff.length > 0;
  if (shouldShowDiffByDefault(category) && hasDiff) {
    printDiff(failure.diff);
  } else {
    printExpectedActual(failure);
    if (hasDiff && verbose) {
      printDiff(failure.diff);
    }
  }
  printModelOutput(failure, category);
}
2755
// Number of failures printed in full per category when not in verbose mode.
var MAX_SAMPLE_FAILURES = 2;

/**
 * Prints a one-line "+N more" note listing the failures beyond the first
 * MAX_SAMPLE_FAILURES. Each id is shortened to its trailing number when it
 * ends in "_<digits>"; otherwise the full id is shown.
 *
 * @param {Array<{id: string}>} failures - All failures of one category.
 */
function printRemainingIds(failures) {
  const shortIds = failures.slice(MAX_SAMPLE_FAILURES).map((failure) => {
    const suffix = failure.id.match(ID_NUM_REGEX);
    return suffix ? suffix[1] : failure.id;
  });
  const hiddenCount = failures.length - MAX_SAMPLE_FAILURES;
  console.log(`\n${colors3.dim}+${hiddenCount} more: ${shortIds.join(", ")}${colors3.reset}`);
}
2767
/**
 * Prints the heading of a failure category: a ruled line with the label and
 * failure count, followed by the category description.
 *
 * @param {{label: string, description: string}} info - Category metadata.
 * @param {number} count - Number of failures in the category.
 */
function printCategoryHeader(info, count) {
  const rule = "\u2500\u2500\u2500\u2500\u2500";
  console.log(`\n${colors3.cyan}${rule} ${info.label} (${count}) ${rule}${colors3.reset}`);
  console.log(`${colors3.dim}${info.description}${colors3.reset}`);
}
2774
/**
 * Prints everything known about one failure category: heading, detected
 * pattern and hint (when available), then the failures themselves.
 *
 * In verbose mode every failure is printed. Otherwise only the first
 * MAX_SAMPLE_FAILURES are printed and the rest are summarised by
 * `printRemainingIds`, which slices on the same constant. Using the shared
 * constant here keeps the sample size and the "+N more" count in agreement.
 *
 * @param {string} category - Category key; unknown keys use the OTHER metadata.
 * @param {{failures: Array<object>, pattern?: string}} group - Failures of
 *   this category plus an optional detected pattern.
 * @param {boolean} verbose - Whether to print every failure in full.
 */
function printCategoryDetails(category, group, verbose) {
  const info = CATEGORY_DESCRIPTIONS[category] || CATEGORY_DESCRIPTIONS.OTHER;
  const { failures } = group;
  printCategoryHeader(info, failures.length);
  if (group.pattern) {
    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
  }
  if (info.hint) {
    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
  }
  const samplesToShow = verbose ? failures : failures.slice(0, MAX_SAMPLE_FAILURES);
  for (const failure of samplesToShow) {
    printSingleFailure(failure, category, verbose);
  }
  if (!verbose && failures.length > MAX_SAMPLE_FAILURES) {
    printRemainingIds(failures);
  }
}
2792
+ function printResultHeader(result) {
2793
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
2794
+ const passed = benchmarkResult.metrics.correct_count;
2795
+ const total = benchmarkResult.metrics.total_cases;
2796
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2797
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2798
+ const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
2799
+ const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
2800
+ const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
2801
+ const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
2802
+ console.log(
2803
+ `
2804
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
2805
+ );
2806
+ console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
2807
+ }
2808
/**
 * Prints the summary for one evaluation result: the heading, then, if the
 * logs contain structured failures, one section per failure category, with
 * the largest category first.
 *
 * When there are logs but no structured failures, a notice is printed only
 * if the benchmark did not succeed.
 *
 * @param {{result: {logs?: string[], success: boolean}}} result - Evaluation result.
 * @param {boolean} verbose - Whether to print every failure in full.
 */
function printResultSummary(result, verbose) {
  const benchmarkResult = result.result;
  printResultHeader(result);
  const logs = benchmarkResult.logs;
  if (!logs || logs.length === 0) {
    return;
  }
  const failures = parseFailureLogs(logs);
  if (failures.length === 0) {
    if (!benchmarkResult.success) {
      console.log(`${colors3.yellow}No structured failure data available${colors3.reset}`);
    }
    return;
  }
  const groups = groupByCategory(failures);
  groups.forEach((group) => {
    detectPatterns(group);
  });
  const bySizeDescending = [...groups.entries()].sort(
    (left, right) => right[1].failures.length - left[1].failures.length
  );
  bySizeDescending.forEach(([category, group]) => {
    printCategoryDetails(category, group, verbose);
  });
}
2834
+ function consoleSummaryReporter(results) {
2835
+ const verbose = process.env.VERBOSE === "true";
2836
+ console.log(`
2837
+ ${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
2838
+ console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
2839
+ for (const result of results) {
2840
+ printResultSummary(result, verbose);
2841
+ }
2842
+ console.log(
2843
+ `
2844
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
2845
+ `
2846
+ );
2847
+ }
2848
+
2409
2849
  // src/reporters/json.ts
2410
2850
  function jsonReporter(results) {
2411
2851
  const serializableResults = results.map((r) => {
@@ -2425,60 +2865,56 @@ function jsonReporter(results) {
2425
2865
  var reporters = {
2426
2866
  console: consoleReporter,
2427
2867
  json: jsonReporter,
2428
- "console.debug": consoleDebugReporter
2868
+ "console.debug": consoleDebugReporter,
2869
+ "console.summary": consoleSummaryReporter
2429
2870
  };
2430
2871
 
2431
2872
  // src/evaluate.ts
2432
- async function runSingleBenchmark(model, benchmark, modelKey, config) {
2433
- const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
2434
- try {
2435
- console.log(
2436
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
2437
- );
2438
- const result = await benchmark.run(model, config);
2439
- console.log(
2440
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
2441
- );
2442
- return {
2443
- model: modelId,
2444
- modelKey,
2445
- benchmark: benchmark.name,
2446
- result
2447
- };
2448
- } catch (error) {
2449
- console.error(
2450
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
2451
- error
2452
- );
2453
- return {
2454
- model: modelId,
2455
- modelKey,
2456
- benchmark: benchmark.name,
2457
- result: {
2458
- score: 0,
2459
- success: false,
2460
- metrics: {},
2461
- error: error instanceof Error ? error : new Error(String(error))
2462
- }
2463
- };
2873
+ function isModelConfig(value) {
2874
+ if (typeof value !== "object" || value === null) {
2875
+ return false;
2876
+ }
2877
+ const obj = value;
2878
+ if (!("model" in obj)) {
2879
+ return false;
2464
2880
  }
2881
+ const model = obj.model;
2882
+ if (typeof model !== "object" || model === null) {
2883
+ return false;
2884
+ }
2885
+ return "modelId" in model;
2886
+ }
2887
+ function isLanguageModel(value) {
2888
+ if (typeof value !== "object" || value === null) {
2889
+ return false;
2890
+ }
2891
+ const obj = value;
2892
+ return "modelId" in obj && typeof obj.modelId === "string";
2893
+ }
2894
+ function extractModelAndMiddleware(input) {
2895
+ if (isModelConfig(input)) {
2896
+ return [input.model, input.middleware];
2897
+ }
2898
+ return [input, void 0];
2465
2899
  }
2466
2900
  function normalizeModels(models) {
2467
- const modelEntries = [];
2901
+ const entries = [];
2468
2902
  if (Array.isArray(models)) {
2469
2903
  for (const m of models) {
2470
- modelEntries.push([void 0, m]);
2904
+ const [model, middleware] = extractModelAndMiddleware(m);
2905
+ entries.push([void 0, model, middleware]);
2471
2906
  }
2472
- } else if (typeof models === "object" && models !== null && "modelId" in models) {
2473
- modelEntries.push([void 0, models]);
2907
+ } else if (isModelConfig(models)) {
2908
+ entries.push([void 0, models.model, models.middleware]);
2909
+ } else if (isLanguageModel(models)) {
2910
+ entries.push([void 0, models, void 0]);
2474
2911
  } else {
2475
- for (const [key, m] of Object.entries(
2476
- models
2477
- )) {
2478
- modelEntries.push([key, m]);
2912
+ for (const [key, m] of Object.entries(models)) {
2913
+ const [model, middleware] = extractModelAndMiddleware(m);
2914
+ entries.push([key, model, middleware]);
2479
2915
  }
2480
2916
  }
2481
- return modelEntries;
2917
+ return entries;
2482
2918
  }
2483
2919
  function buildConfig(temperature, maxTokens) {
2484
2920
  const config = {};
@@ -2499,21 +2935,90 @@ function executeReporter(reporter, results) {
2499
2935
  reporters.console(results);
2500
2936
  }
2501
2937
  }
2938
+ function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
2939
+ var _a, _b;
2940
+ const cacheEnabled = (cacheOptions == null ? void 0 : cacheOptions.enabled) === true;
2941
+ if (!(cacheEnabled || userMiddleware)) {
2942
+ return baseModel;
2943
+ }
2944
+ const cacheMiddleware = cacheEnabled ? (0, import_middleware.createDiskCacheMiddleware)({
2945
+ cacheDir: (_a = cacheOptions.cacheDir) != null ? _a : ".ai-cache",
2946
+ enabled: true,
2947
+ debug: (_b = cacheOptions.debug) != null ? _b : false
2948
+ }) : null;
2949
+ const middlewares = [];
2950
+ if (userMiddleware) {
2951
+ if (Array.isArray(userMiddleware)) {
2952
+ middlewares.push(...userMiddleware);
2953
+ } else {
2954
+ middlewares.push(userMiddleware);
2955
+ }
2956
+ }
2957
+ if (cacheMiddleware) {
2958
+ middlewares.push(cacheMiddleware);
2959
+ }
2960
+ if (middlewares.length === 0) {
2961
+ return baseModel;
2962
+ }
2963
+ return (0, import_ai4.wrapLanguageModel)({
2964
+ // biome-ignore lint/suspicious/noExplicitAny: AI SDK v5/v6 type mismatch
2965
+ model: baseModel,
2966
+ middleware: middlewares.length === 1 ? middlewares[0] : middlewares
2967
+ });
2968
+ }
2969
+ async function runSingleBenchmark(model, benchmark, modelKey, config) {
2970
+ const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
2971
+ const prefix = `[${modelId}]${modelKey ? ` (${modelKey})` : ""} ${benchmark.name}`;
2972
+ try {
2973
+ process.stdout.write(`${prefix}: ...`);
2974
+ const result = await benchmark.run(model, config);
2975
+ const scoreDisplay = result.score.toFixed(2);
2976
+ process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}
2977
+ `);
2978
+ return {
2979
+ model: modelId,
2980
+ modelKey,
2981
+ benchmark: benchmark.name,
2982
+ result
2983
+ };
2984
+ } catch (error) {
2985
+ process.stdout.write(`\r${prefix}: .... Score: ERROR
2986
+ `);
2987
+ console.error(error);
2988
+ return {
2989
+ model: modelId,
2990
+ modelKey,
2991
+ benchmark: benchmark.name,
2992
+ result: {
2993
+ score: 0,
2994
+ success: false,
2995
+ metrics: {},
2996
+ error: error instanceof Error ? error : new Error(String(error))
2997
+ }
2998
+ };
2999
+ }
3000
+ }
2502
3001
  async function evaluate(options) {
2503
3002
  const {
2504
3003
  models,
2505
3004
  benchmarks,
2506
3005
  reporter = "console",
2507
3006
  temperature,
2508
- maxTokens
3007
+ maxTokens,
3008
+ cache
2509
3009
  } = options;
2510
3010
  const modelEntries = normalizeModels(models);
2511
3011
  const config = buildConfig(temperature, maxTokens);
2512
3012
  const allResults = [];
2513
- for (const [modelKey, model] of modelEntries) {
3013
+ for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
3014
+ const effectiveModel = buildEffectiveModel(
3015
+ baseModel,
3016
+ userMiddleware,
3017
+ cache
3018
+ );
2514
3019
  for (const benchmark of benchmarks) {
2515
3020
  const evaluationResult = await runSingleBenchmark(
2516
- model,
3021
+ effectiveModel,
2517
3022
  benchmark,
2518
3023
  modelKey,
2519
3024
  config