@ai-sdk-tool/eval 1.0.0-canary.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -368,6 +368,7 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
368
368
  // src/benchmarks/bfcl.ts
369
369
  var LINE_SPLIT_REGEX = /\r?\n/;
370
370
  var NUMERIC_STRING_REGEX = /^\d+$/;
371
+ var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
371
372
  function convertGroundTruthToXML(call) {
372
373
  const keys = Object.keys(call);
373
374
  if (keys.length === 0) {
@@ -399,45 +400,67 @@ function convertGroundTruthToXML(call) {
399
400
  xml += `</${funcName}>`;
400
401
  return xml;
401
402
  }
403
/**
 * Derives the checker category from a BFCL test-case id.
 *
 * Ids beginning with "parallel_multiple", "simple" (including the
 * "simple_python" / "simple_java" / "simple_javascript" variants),
 * "parallel" or "multiple" map to that category name. Any other id falls
 * back to the text before its first underscore.
 *
 * @param {string} id - Test-case identifier.
 * @returns {string} Category name.
 */
function extractCategory(id) {
  // Order matters: "parallel_multiple" must be tried before "parallel".
  const knownPrefixes = ["parallel_multiple", "simple", "parallel", "multiple"];
  const matched = knownPrefixes.find((prefix) => id.startsWith(prefix));
  if (matched !== undefined) {
    return matched;
  }
  return id.split("_")[0];
}
402
427
  function check(testCase, modelOutput, possibleAnswer) {
403
- const category = testCase.id.split("_")[0];
428
+ const category = extractCategory(testCase.id);
404
429
  try {
405
- if (category === "simple") {
406
- if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
407
- return {
408
- valid: false,
409
- error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
410
- error_type: "simple:wrong_count"
411
- };
430
+ switch (category) {
431
+ case "simple": {
432
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
433
+ return {
434
+ valid: false,
435
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
436
+ error_type: "simple:wrong_count"
437
+ };
438
+ }
439
+ return simpleFunctionChecker(
440
+ testCase.function[0],
441
+ modelOutput[0],
442
+ possibleAnswer.ground_truth[0]
443
+ );
444
+ }
445
+ case "multiple": {
446
+ return multipleFunctionChecker(
447
+ testCase.function,
448
+ modelOutput,
449
+ possibleAnswer.ground_truth
450
+ );
451
+ }
452
+ case "parallel":
453
+ case "parallel_multiple": {
454
+ return parallelFunctionCheckerNoOrder(
455
+ testCase.function,
456
+ modelOutput,
457
+ possibleAnswer.ground_truth
458
+ );
459
+ }
460
+ default: {
461
+ return { valid: true };
412
462
  }
413
- return simpleFunctionChecker(
414
- testCase.function[0],
415
- modelOutput[0],
416
- possibleAnswer.ground_truth[0]
417
- );
418
- }
419
- if (category === "parallel") {
420
- return parallelFunctionCheckerNoOrder(
421
- testCase.function,
422
- modelOutput,
423
- possibleAnswer.ground_truth
424
- );
425
- }
426
- if (category === "multiple") {
427
- return multipleFunctionChecker(
428
- testCase.function,
429
- modelOutput,
430
- possibleAnswer.ground_truth
431
- );
432
- }
433
- if (category.includes("parallel-multiple")) {
434
- return parallelFunctionCheckerNoOrder(
435
- testCase.function,
436
- modelOutput,
437
- possibleAnswer.ground_truth
438
- );
439
463
  }
440
- return { valid: true };
441
464
  } catch (e) {
442
465
  return {
443
466
  valid: false,
@@ -615,7 +638,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
615
638
  return `- expected one of: ${formatted}`;
616
639
  })();
617
640
  diffLines.push(expectedLine);
618
- diffLines.push(`+ got: ${JSON.stringify(got)}`);
641
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
619
642
  return diffLines;
620
643
  };
621
644
  const paramValueMatches = (allowed, got) => {
@@ -832,44 +855,97 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
832
855
  );
833
856
  }
834
857
  };
835
- const buildFailureContext = (options) => {
836
- const {
837
- testCase,
838
- tools,
839
- flatMessages,
840
- mwOriginalText,
841
- text,
842
- finishReason,
843
- mwParsedToolCalls,
844
- restoredCalls,
845
- possibleAnswer
846
- } = options;
847
- const lastUser = (() => {
848
- var _a;
849
- const reversed = [...flatMessages].reverse();
850
- const found = reversed.find(
851
- (m) => m.role === "user"
852
- );
853
- return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
854
- })();
855
- const rawModelText = (() => {
856
- if (mwOriginalText && mwOriginalText.length > 0) {
857
- return mwOriginalText;
858
+ const hasPercentPattern = (diff) => {
859
+ return diff.some((d) => {
860
+ if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
861
+ return false;
858
862
  }
859
- if (typeof text === "string") {
860
- return text;
863
+ const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
864
+ if (!numMatch) {
865
+ return false;
861
866
  }
862
- return "";
863
- })();
864
- return {
865
- id: testCase.id,
866
- tool_schema: tools,
867
- last_user_query: lastUser,
868
- raw_model_text: rawModelText,
869
- finish_reason: finishReason,
870
- parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
871
- ground_truth: possibleAnswer.ground_truth
872
- };
867
+ const num = Number.parseFloat(numMatch[1]);
868
+ return num >= 1 && num <= 100;
869
+ });
870
+ };
871
// True when the error type mentions "value_error" or any diff line starts with "@@ param".
const isValueError = (errorType, diff) => {
  if (errorType != null && errorType.includes("value_error")) {
    return true;
  }
  return diff.some((line) => line.startsWith("@@ param"));
};
874
// True when the error type mentions "wrong_func_name" or any diff line contains "function name".
const isFunctionNameError = (errorType, diff) => {
  if (errorType != null && errorType.includes("wrong_func_name")) {
    return true;
  }
  return diff.some((line) => line.includes("function name"));
};
877
// True when the error type mentions "missing_required" or any diff line contains "missing required param".
const isMissingParamError = (errorType, diff) => {
  if (errorType != null && errorType.includes("missing_required")) {
    return true;
  }
  return diff.some((line) => line.includes("missing required param"));
};
880
// True when the error type mentions "unexpected_param" or any diff line contains "unexpected param".
const isUnexpectedParamError = (errorType, diff) => {
  if (errorType != null && errorType.includes("unexpected_param")) {
    return true;
  }
  return diff.some((line) => line.includes("unexpected param"));
};
883
/**
 * Maps a checker error type and rendered diff to a failure category.
 *
 * Classifiers are tried in a fixed order: value error, wrong function name,
 * missing parameter, unexpected parameter. A "cannot_find_match" error type
 * is checked last.
 *
 * @param {string | undefined | null} errorType - Checker error type.
 * @param {string[]} diff - Rendered diff lines.
 * @returns {string | null} Category name, or null when nothing matches.
 */
const classifyByErrorPatterns = (errorType, diff) => {
  // Resolved before any classifier runs, as the original table literal did.
  const valueErrorLabel = hasPercentPattern(diff)
    ? "PARAM_VALUE_PERCENT"
    : "PARAM_VALUE_MISMATCH";

  if (isValueError(errorType, diff)) {
    return valueErrorLabel;
  }
  if (isFunctionNameError(errorType, diff)) {
    return "WRONG_FUNCTION";
  }
  if (isMissingParamError(errorType, diff)) {
    return "MISSING_PARAMS";
  }
  if (isUnexpectedParamError(errorType, diff)) {
    return "UNEXPECTED_PARAMS";
  }

  const noMatch = errorType != null && errorType.includes("cannot_find_match");
  return noMatch ? "NO_MATCH" : null;
};
903
/**
 * Classifies a failure purely from how many tool calls were produced.
 *
 * @param {number} actualCount - Number of tool calls the model produced.
 * @param {number} expectedCount - Number of tool calls in the ground truth.
 * @returns {"PARSE_FAILURE" | "PARTIAL_CALLS" | "EXTRA_CALLS" | null}
 *   null when the counts alone do not explain the failure.
 */
const classifyByCallCount = (actualCount, expectedCount) => {
  const producedNothing = actualCount === 0 && expectedCount > 0;
  const producedTooFew = actualCount > 0 && actualCount < expectedCount;
  const producedTooMany = actualCount > expectedCount;

  if (producedNothing) {
    return "PARSE_FAILURE";
  }
  if (producedTooFew) {
    return "PARTIAL_CALLS";
  }
  return producedTooMany ? "EXTRA_CALLS" : null;
};
915
/**
 * Picks the failure category for a failed test case.
 *
 * A call-count classification takes precedence over an error-pattern
 * classification; "OTHER" is used when neither applies.
 *
 * @param {{errorType: (string|undefined), restoredCalls: *, expectedCount: number, diff: string[]}} options
 * @returns {string} Category name.
 */
const classifyFailureType = (options) => {
  const { errorType, restoredCalls, expectedCount, diff } = options;
  // A non-array restoredCalls counts as zero produced calls.
  const actualCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;

  return (
    classifyByCallCount(actualCount, expectedCount) ||
    classifyByErrorPatterns(errorType, diff) ||
    "OTHER"
  );
};
931
/**
 * Chooses the text to report as the model's raw output.
 *
 * @param {string | undefined | null} mwOriginalText - Preferred text when non-empty.
 * @param {*} text - Fallback; used only when it is a string.
 * @returns {string} The chosen text, or "" when neither is usable.
 */
const extractRawModelText = (mwOriginalText, text) => {
  const hasPreferredText = Boolean(mwOriginalText) && mwOriginalText.length > 0;
  if (hasPreferredText) {
    return mwOriginalText;
  }
  return typeof text === "string" ? text : "";
};
940
/**
 * Returns the content of the last message whose role is "user".
 *
 * Content longer than 200 is cut to its first 200 items and suffixed with
 * "..."; a missing message or nullish content yields "".
 *
 * @param {Iterable<{role: string, content?: *}>} flatMessages - Conversation messages.
 * @returns {*} The (possibly truncated) content.
 */
const extractLastUserQuery = (flatMessages) => {
  const messages = [...flatMessages];
  let content = "";

  // Walk backwards so the most recent user message wins.
  for (let index = messages.length - 1; index >= 0; index -= 1) {
    const message = messages[index];
    if (message.role === "user") {
      content = message.content != null ? message.content : "";
      break;
    }
  }

  if (content.length > 200) {
    return `${content.slice(0, 200)}...`;
  }
  return content;
};
947
// Cuts `text` to `maxLen` characters and appends "..." when it is longer; otherwise returns it unchanged.
const truncateText = (text, maxLen) => {
  if (text.length > maxLen) {
    return `${text.slice(0, maxLen)}...`;
  }
  return text;
};
874
950
  const logFailureDetails = (options) => {
875
951
  const {
@@ -887,42 +963,36 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
887
963
  } = options;
888
964
  try {
889
965
  const category = testCase.id.split("_")[0];
890
- const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
891
- tools,
892
- possibleAnswer,
893
- restoredCalls
894
- ) : buildParallelDiff(
895
- tools,
896
- possibleAnswer,
897
- restoredCalls
898
- );
899
- caseLogs.push(
900
- `[DEBUG-FAIL] ${JSON.stringify({
901
- id: testCase.id,
902
- message: checkerResult.error,
903
- error_type: checkerResult.error_type,
904
- expected,
905
- actual,
906
- diff
907
- })}`
908
- );
909
- try {
910
- const contextPayload = buildFailureContext({
911
- testCase,
912
- tools,
913
- flatMessages,
914
- mwOriginalText,
915
- text,
916
- finishReason,
917
- mwParsedToolCalls,
966
+ const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
967
+ const gtArr = possibleAnswer.ground_truth;
968
+ const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
969
+ const rawModelText = extractRawModelText(mwOriginalText, text);
970
+ const lastUserQuery = extractLastUserQuery(flatMessages);
971
+ const failurePayload = {
972
+ id: testCase.id,
973
+ category: classifyFailureType({
974
+ errorType: checkerResult.error_type,
918
975
  restoredCalls,
919
- possibleAnswer
920
- });
921
- caseLogs.push(
922
- `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
923
- );
924
- } catch (e) {
925
- }
976
+ expectedCount,
977
+ diff
978
+ }),
979
+ message: checkerResult.error,
980
+ error_type: checkerResult.error_type,
981
+ expected,
982
+ actual,
983
+ diff,
984
+ context: {
985
+ raw_model_text: truncateText(rawModelText, 500),
986
+ raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
987
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
988
+ expected_count: expectedCount,
989
+ actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
990
+ finish_reason: finishReason,
991
+ last_user_query: lastUserQuery,
992
+ tool_names: tools.map((t) => t.name)
993
+ }
994
+ };
995
+ caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
926
996
  } catch (e) {
927
997
  caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
928
998
  }
@@ -1147,14 +1217,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1147
1217
  };
1148
1218
  }
1149
1219
  const score = correctCount / testCases.length;
1220
+ const caseResults = resultsPerCase.map((r, i) => ({
1221
+ id: testCases[i].id,
1222
+ valid: r.valid
1223
+ }));
1150
1224
  return {
1151
1225
  score,
1152
1226
  success: score > 0.95,
1153
- // High success threshold as requested
1154
1227
  metrics: {
1155
1228
  correct_count: correctCount,
1156
1229
  total_cases: testCases.length,
1157
- accuracy: score
1230
+ accuracy: score,
1231
+ case_results: JSON.stringify(caseResults)
1158
1232
  },
1159
1233
  logs
1160
1234
  };
@@ -1174,27 +1248,27 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1174
1248
  }
1175
1249
  var bfclSimpleBenchmark = createBfclBenchmark(
1176
1250
  "bfcl-simple",
1177
- "BFCL Simple Function Calling",
1178
- "BFCL_v3_simple.jsonl",
1179
- "BFCL_v3_simple_possible_answer.jsonl"
1251
+ "BFCL v4 Simple Function Calling",
1252
+ "BFCL_v4_simple.jsonl",
1253
+ "BFCL_v4_simple_possible_answer.jsonl"
1180
1254
  );
1181
1255
  var bfclParallelBenchmark = createBfclBenchmark(
1182
1256
  "bfcl-parallel",
1183
- "BFCL Parallel Function Calling",
1184
- "BFCL_v3_parallel.jsonl",
1185
- "BFCL_v3_parallel_possible_answer.jsonl"
1257
+ "BFCL v4 Parallel Function Calling",
1258
+ "BFCL_v4_parallel.jsonl",
1259
+ "BFCL_v4_parallel_possible_answer.jsonl"
1186
1260
  );
1187
1261
  var bfclMultipleBenchmark = createBfclBenchmark(
1188
1262
  "bfcl-multiple",
1189
- "BFCL Multiple Function Calling",
1190
- "BFCL_v3_multiple.jsonl",
1191
- "BFCL_v3_multiple_possible_answer.jsonl"
1263
+ "BFCL v4 Multiple Function Calling",
1264
+ "BFCL_v4_multiple.jsonl",
1265
+ "BFCL_v4_multiple_possible_answer.jsonl"
1192
1266
  );
1193
1267
  var bfclParallelMultipleBenchmark = createBfclBenchmark(
1194
1268
  "bfcl-parallel-multiple",
1195
- "BFCL Parallel & Multiple Function Calling",
1196
- "BFCL_v3_parallel_multiple.jsonl",
1197
- "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1269
+ "BFCL v4 Parallel & Multiple Function Calling",
1270
+ "BFCL_v4_parallel_multiple.jsonl",
1271
+ "BFCL_v4_parallel_multiple_possible_answer.jsonl"
1198
1272
  );
1199
1273
 
1200
1274
  // src/benchmarks/complex-func-bench.ts
@@ -1925,23 +1999,28 @@ var jsonGenerationSchemaOnlyBenchmark = {
1925
1999
  }
1926
2000
  };
1927
2001
 
2002
+ // src/evaluate.ts
2003
+ import { createDiskCacheMiddleware } from "@ai-sdk-tool/middleware";
2004
+ import { wrapLanguageModel } from "ai";
2005
+
1928
2006
  // src/reporters/console.ts
1929
2007
  var colors = {
1930
2008
  reset: "\x1B[0m",
2009
+ bold: "\x1B[1m",
1931
2010
  green: "\x1B[32m",
1932
2011
  red: "\x1B[31m",
1933
2012
  yellow: "\x1B[33m",
1934
2013
  cyan: "\x1B[36m",
1935
2014
  magenta: "\x1B[35m",
1936
2015
  gray: "\x1B[90m",
1937
- white: "\x1B[37m",
1938
- bgRed: "\x1B[41m"
2016
+ white: "\x1B[37m"
1939
2017
  };
2018
+ var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
1940
2019
  function formatDiff(diff) {
1941
2020
  if (!diff || diff.length === 0) {
1942
2021
  return "";
1943
2022
  }
1944
- return diff.map((line) => {
2023
+ return diff.slice(0, 8).map((line) => {
1945
2024
  if (line.startsWith("-")) {
1946
2025
  return `${colors.red}${line}${colors.reset}`;
1947
2026
  }
@@ -1954,65 +2033,106 @@ function formatDiff(diff) {
1954
2033
  return line;
1955
2034
  }).join("\n ");
1956
2035
  }
1957
- function printFailLogs(logs) {
1958
- const failLogs = logs.filter((l) => l.startsWith("[DEBUG-FAIL]"));
1959
- for (const log of failLogs) {
2036
+ function parseFailures(logs) {
2037
+ const failures = [];
2038
+ for (const log of logs) {
2039
+ if (!DEBUG_FAIL_REGEX.test(log)) {
2040
+ continue;
2041
+ }
1960
2042
  try {
1961
- const jsonStr = log.replace("[DEBUG-FAIL] ", "");
1962
- const data = JSON.parse(jsonStr);
1963
- console.log(`
1964
- ${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
1965
- console.log(
1966
- ` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
1967
- );
1968
- console.log(` Message: ${data.message}`);
1969
- if (data.diff && Array.isArray(data.diff)) {
1970
- console.log(` Diff:
1971
- ${formatDiff(data.diff)}`);
1972
- }
1973
- if (data.expected && data.actual) {
1974
- const expStr = JSON.stringify(data.expected);
1975
- const actStr = JSON.stringify(data.actual);
1976
- if (expStr.length < 100 && actStr.length < 100) {
1977
- console.log(` Expected: ${colors.gray}${expStr}${colors.reset}`);
1978
- console.log(` Actual: ${colors.gray}${actStr}${colors.reset}`);
1979
- }
1980
- }
1981
- } catch (_e) {
1982
- console.log(` Raw Log: ${log}`);
2043
+ const jsonStr = log.replace(DEBUG_FAIL_REGEX, "");
2044
+ const parsed = JSON.parse(jsonStr);
2045
+ failures.push(parsed);
2046
+ } catch (e) {
1983
2047
  }
1984
2048
  }
2049
+ return failures;
2050
+ }
2051
/**
 * Buckets failures by their `category`, in first-seen order.
 *
 * @param {Array<{category?: string}>} failures - Parsed failure payloads.
 * @returns {Map<string, Array<object>>} Category name -> failures; a falsy
 *   category is filed under "OTHER".
 */
function groupFailuresByCategory(failures) {
  const buckets = new Map();
  for (const entry of failures) {
    const key = entry.category || "OTHER";
    if (!buckets.has(key)) {
      buckets.set(key, []);
    }
    buckets.get(key).push(entry);
  }
  return buckets;
}
2064
/**
 * Prints a short report for one failure: id and category, then the message
 * and diff when present. For PARSE_FAILURE entries it also prints the first
 * 80 characters of the raw model text.
 *
 * @param {object} failure - Parsed failure payload.
 */
function printCompactFailure(failure) {
  const categoryLabel = failure.category || "OTHER";
  console.log(
    `\n${colors.red}${failure.id}${colors.reset} [${colors.yellow}${categoryLabel}${colors.reset}]`
  );

  if (failure.message) {
    console.log(` ${failure.message}`);
  }

  const hasDiff = failure.diff && failure.diff.length > 0;
  if (hasDiff) {
    console.log(` ${formatDiff(failure.diff)}`);
  }

  const rawText = failure.context == null ? undefined : failure.context.raw_model_text;
  if (rawText && failure.category === "PARSE_FAILURE") {
    const preview = rawText.length > 80 ? `${rawText.slice(0, 80)}...` : rawText;
    console.log(` ${colors.gray}Model: "${preview}"${colors.reset}`);
  }
}
2082
/**
 * Prints per-category failure counts (largest first), compact details for
 * the first five failures, and a one-line preview of the remaining ids.
 *
 * @param {Array<object>} failures - Parsed failure payloads.
 */
function printFailureSummary(failures) {
  const maxToShow = 5;
  const ranked = Array.from(groupFailuresByCategory(failures).entries()).sort(
    (left, right) => right[1].length - left[1].length
  );

  console.log(`\n${colors.bold}Failures by category:${colors.reset}`);
  ranked.forEach(([category, categoryFailures]) => {
    console.log(
      ` ${colors.yellow}${category}${colors.reset}: ${categoryFailures.length}`
    );
  });

  failures.slice(0, maxToShow).forEach((failure) => {
    printCompactFailure(failure);
  });

  if (failures.length <= maxToShow) {
    return;
  }

  // Only the first five hidden ids are listed; "..." marks that more exist.
  const hiddenIds = failures.slice(maxToShow).map((f) => f.id);
  const remaining = failures.length - maxToShow;
  const idPreview = hiddenIds.slice(0, 5).join(", ");
  const more = hiddenIds.length > 5 ? "..." : "";
  console.log(
    `\n${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
  );
}
1986
2110
  function printResult(result) {
1987
2111
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
1988
- const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
2112
+ const passed = benchmarkResult.metrics.correct_count;
2113
+ const total = benchmarkResult.metrics.total_cases;
2114
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2115
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2116
+ const statusColor = benchmarkResult.success ? colors.green : colors.red;
1989
2117
  console.log(
1990
2118
  `
1991
2119
  ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
1992
2120
  );
1993
2121
  console.log(
1994
- ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
2122
+ ` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
1995
2123
  );
1996
- const metrics = Object.entries(benchmarkResult.metrics);
1997
- if (metrics.length > 0) {
1998
- console.log(" Metrics:");
1999
- for (const [key, value] of metrics) {
2000
- console.log(` - ${key}: ${value}`);
2001
- }
2002
- }
2003
2124
  if (benchmarkResult.error) {
2004
2125
  console.log(
2005
2126
  ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
2006
2127
  );
2007
2128
  }
2008
2129
  if (!benchmarkResult.success && benchmarkResult.logs) {
2009
- printFailLogs(benchmarkResult.logs);
2010
- const failLogs = benchmarkResult.logs.filter(
2011
- (l) => l.startsWith("[DEBUG-FAIL]")
2012
- );
2013
- if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
2014
- console.log(" Raw Logs (Sample):");
2015
- for (const l of benchmarkResult.logs.slice(0, 10)) {
2130
+ const failures = parseFailures(benchmarkResult.logs);
2131
+ if (failures.length > 0) {
2132
+ printFailureSummary(failures);
2133
+ } else if (benchmarkResult.logs.length > 0) {
2134
+ console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
2135
+ for (const l of benchmarkResult.logs.slice(0, 5)) {
2016
2136
  console.log(` ${l}`);
2017
2137
  }
2018
2138
  }
@@ -2371,6 +2491,326 @@ function consoleDebugReporter(results) {
2371
2491
  console.log("\n------------------------------------\n");
2372
2492
  }
2373
2493
 
2494
+ // src/reporters/console.summary.ts
2495
// ANSI SGR escape sequences used by the summary reporter.
var colors3 = {
  reset: "\u001b[0m",
  bold: "\u001b[1m",
  dim: "\u001b[2m",
  green: "\u001b[32m",
  red: "\u001b[31m",
  yellow: "\u001b[33m",
  cyan: "\u001b[36m",
  magenta: "\u001b[35m",
  gray: "\u001b[90m",
  white: "\u001b[37m"
};
// Prefix that marks a structured failure log line.
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
// Captures the trailing "_<digits>" suffix of a test-case id.
var ID_NUM_REGEX = /_(\d+)$/;
// Tag name whose <tag>...</tag> content is stripped from displayed model output.
var REASONING_TAG = "think";
// Display cap referenced by printDiff.
var MAX_FAILURES_TO_DISPLAY = 5;
2511
// Display metadata for each failure category: a heading label, a one-line
// description, and (for some categories) a troubleshooting hint.
var CATEGORY_DESCRIPTIONS = {
  // Call-count categories.
  PARSE_FAILURE: { label: "Parse Failure", description: "No tool calls extracted from model output", hint: "Model may have responded in text instead of tool format" },
  PARTIAL_CALLS: { label: "Partial Calls", description: "Some expected tool calls missing", hint: "Model stopped early or missed some tools" },
  EXTRA_CALLS: { label: "Extra Calls", description: "More tool calls than expected", hint: "Model called tools that weren't needed" },
  // Parameter-value categories.
  PARAM_VALUE_PERCENT: { label: "Param Value (Percent)", description: "Percentage sent as integer instead of decimal", hint: "e.g., 5 instead of 0.05 for 5%" },
  PARAM_VALUE_MISMATCH: { label: "Param Value Mismatch", description: "Parameter values don't match expected" },
  // Function-name and parameter-set categories.
  WRONG_FUNCTION: { label: "Wrong Function", description: "Called wrong function name" },
  MISSING_PARAMS: { label: "Missing Params", description: "Required parameters not provided" },
  UNEXPECTED_PARAMS: { label: "Unexpected Params", description: "Extra parameters that shouldn't be there" },
  NO_MATCH: { label: "No Match", description: "Function called but couldn't match to expected", hint: "Parameters may be correct but don't match any expected combination" },
  // Fallback for anything uncategorized.
  OTHER: { label: "Other", description: "Uncategorized failure" }
};
2558
/**
 * Extracts the JSON payloads from "[DEBUG-FAIL] " log lines.
 *
 * Lines without the prefix, lines whose payload is not valid JSON, and
 * payloads that parse to null are all dropped.
 *
 * @param {string[]} logs - Benchmark log lines.
 * @returns {Array<*>} Parsed payloads in log order.
 */
function parseFailureLogs(logs) {
  const payloads = [];
  for (const line of logs) {
    if (!DEBUG_FAIL_REGEX2.test(line)) {
      continue;
    }
    let payload = null;
    try {
      payload = JSON.parse(line.replace(DEBUG_FAIL_REGEX2, ""));
    } catch (e) {
      // Malformed payloads are deliberately skipped.
      payload = null;
    }
    if (payload !== null) {
      payloads.push(payload);
    }
  }
  return payloads;
}
2568
/**
 * Buckets failures by category into `{ failures: [...] }` group records.
 *
 * @param {Array<{category?: string}>} failures - Parsed failure payloads.
 * @returns {Map<string, {failures: Array<object>}>} Groups in first-seen
 *   order; a falsy category is filed under "OTHER".
 */
function groupByCategory(failures) {
  const grouped = new Map();
  for (const entry of failures) {
    const key = entry.category || "OTHER";
    if (!grouped.has(key)) {
      grouped.set(key, { failures: [] });
    }
    grouped.get(key).failures.push(entry);
  }
  return grouped;
}
2581
/**
 * Collects the text following "@@ param " from every such diff line.
 *
 * @param {Array<{diff?: string[]}>} failures - Parsed failure payloads.
 * @returns {Set<string>} Distinct names in first-seen order.
 */
function extractParamNames(failures) {
  const marker = "@@ param ";
  const names = new Set();
  for (const failure of failures) {
    const lines = failure.diff;
    if (!lines) {
      continue;
    }
    for (const line of lines) {
      if (line.startsWith(marker)) {
        // The marker sits at index 0, so slicing equals removing its first occurrence.
        names.add(line.slice(marker.length));
      }
    }
  }
  return names;
}
2595
/**
 * Collects the distinct, stringified `context.finish_reason` values.
 *
 * @param {Array<{context?: {finish_reason?: *}}>} failures - Parsed failure payloads.
 * @returns {Set<string>} Finish reasons in first-seen order; falsy values are ignored.
 */
function extractFinishReasons(failures) {
  const reasons = Array.from(failures)
    .map((failure) =>
      failure.context == null ? undefined : failure.context.finish_reason
    )
    .filter((reason) => Boolean(reason))
    .map((reason) => String(reason));
  return new Set(reasons);
}
2605
/**
 * Sets `group.pattern` when the group's failures share a notable trait.
 *
 * Groups with fewer than two failures are left untouched. The category is
 * read from the first failure in the group.
 *
 * @param {{failures: Array<object>, pattern?: string}} group - Mutated in place.
 * @returns {void}
 */
function detectPatterns(group) {
  const { failures } = group;
  if (failures.length < 2) {
    return;
  }

  switch (failures[0].category) {
    case "PARAM_VALUE_PERCENT": {
      const paramNames = extractParamNames(failures);
      if (paramNames.size > 0) {
        group.pattern = `Affected params: ${[...paramNames].join(", ")}`;
      }
      break;
    }
    case "PARSE_FAILURE": {
      const finishReasons = extractFinishReasons(failures);
      // Only reported when every failure ended with the same finish reason.
      if (finishReasons.size === 1) {
        group.pattern = `All finished with: ${[...finishReasons][0]}`;
      }
      break;
    }
    default:
      break;
  }
}
2624
/**
 * Picks the ANSI color for a diff line from its prefix: "+" green, "-" red,
 * "@@" cyan, anything else white.
 *
 * @param {string} line - One diff line.
 * @returns {string} Escape sequence from `colors3`.
 */
function getLineColor(line) {
  const prefixColors = [
    ["+", colors3.green],
    ["-", colors3.red],
    ["@@", colors3.cyan]
  ];
  const match = prefixColors.find(([prefix]) => line.startsWith(prefix));
  return match ? match[1] : colors3.white;
}
2636
// Renders function name(s) for display: arrays are joined with ", ", anything else is stringified.
function formatFunctions(funcs) {
  return Array.isArray(funcs) ? funcs.join(", ") : String(funcs);
}
2642
/**
 * Prints the expected and actual function names of a failure, reading
 * `functions` first and falling back to `function`. An empty actual list is
 * shown in red as "(none)".
 *
 * @param {{expected?: object, actual?: object}} failure - Parsed failure payload.
 */
function printExpectedActual(failure) {
  const { expected, actual } = failure;

  const expFuncs = expected ? expected.functions || expected.function : undefined;
  if (expFuncs) {
    console.log(
      ` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expFuncs)}`
    );
  }

  const actFuncs = actual ? actual.functions || actual.function : undefined;
  if (!actFuncs) {
    return;
  }
  const calledNothing = Array.isArray(actFuncs) && actFuncs.length === 0;
  const color = calledNothing ? colors3.red : colors3.white;
  const text = calledNothing ? "(none)" : formatFunctions(actFuncs);
  console.log(
    ` ${colors3.gray}Actual:${colors3.reset} ${color}${text}${colors3.reset}`
  );
}
2663
/**
 * Prints a "Diff:" heading followed by the first MAX_FAILURES_TO_DISPLAY
 * diff lines, each colored by its prefix.
 *
 * @param {string[]} diff - Rendered diff lines.
 */
function printDiff(diff) {
  console.log(` ${colors3.gray}Diff:${colors3.reset}`);
  diff.slice(0, MAX_FAILURES_TO_DISPLAY).forEach((line) => {
    console.log(` ${getLineColor(line)}${line}${colors3.reset}`);
  });
}
2670
/**
 * Strips reasoning sections delimited by the REASONING_TAG element.
 *
 * Closed `<tag>...</tag>` sections are removed first (non-greedy); then
 * everything from any remaining unclosed opening tag to the end of the text
 * is removed. The result is trimmed.
 *
 * @param {string} text - Raw model output.
 * @returns {string} Text without reasoning sections.
 */
function removeReasoningTags(text) {
  const escapeForRegex = (literal) =>
    literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const openTag = escapeForRegex(`<${REASONING_TAG}>`);
  const closeTag = escapeForRegex(`</${REASONING_TAG}>`);

  const closedSections = new RegExp(`${openTag}[\\s\\S]*?${closeTag}`, "g");
  const unclosedTail = new RegExp(`${openTag}[\\s\\S]*`, "g");

  return text.replace(closedSections, "").replace(unclosedTail, "").trim();
}
2685
/**
 * For PARSE_FAILURE entries, prints what the model said with reasoning
 * sections removed, or a placeholder when nothing remains. Other
 * categories print nothing.
 *
 * @param {{context?: {raw_model_text_full?: string, raw_model_text?: string}}} failure
 * @param {string} category - Category the failure is being reported under.
 */
function printModelOutput(failure, category) {
  if (category !== "PARSE_FAILURE") {
    return;
  }

  const context = failure.context;
  // Prefer the untruncated text when the payload carries it.
  const rawText =
    (context == null ? undefined : context.raw_model_text_full) ||
    (context == null ? undefined : context.raw_model_text) ||
    "";
  const cleanedText = removeReasoningTags(rawText);

  const rendered = cleanedText
    ? ` ${colors3.gray}Model said:${colors3.reset} "${colors3.dim}${cleanedText}${colors3.reset}"`
    : ` ${colors3.gray}Model said:${colors3.reset} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`;
  console.log(rendered);
}
2702
// The two parameter-value categories lead with the diff instead of expected/actual names.
function shouldShowDiffByDefault(category) {
  const diffFirstCategories = ["PARAM_VALUE_MISMATCH", "PARAM_VALUE_PERCENT"];
  return diffFirstCategories.includes(category);
}
2705
/**
 * Prints one failure: its id, then either the diff (parameter-value
 * categories that have one) or the expected/actual names (plus the diff in
 * verbose mode), and finally the model output section.
 *
 * @param {object} failure - Parsed failure payload.
 * @param {string} category - Category the failure is reported under.
 * @param {boolean} verbose - Whether to include the diff for non-value categories.
 */
function printSingleFailure(failure, category, verbose) {
  console.log(`\n${colors3.bold}${failure.id}${colors3.reset}`);

  const hasDiff = failure.diff && failure.diff.length > 0;
  const diffFirst = shouldShowDiffByDefault(category) && hasDiff;

  if (!diffFirst) {
    printExpectedActual(failure);
  }
  if (diffFirst || (hasDiff && verbose)) {
    printDiff(failure.diff);
  }

  printModelOutput(failure, category);
}
2720
// Number of failures whose ids are skipped by printRemainingIds.
var MAX_SAMPLE_FAILURES = 2;
/**
 * Prints a "+N more: ..." line listing the failures after the first
 * MAX_SAMPLE_FAILURES. Ids ending in "_<digits>" are shortened to the digits.
 *
 * @param {Array<{id: string}>} failures - All failures of one category.
 */
function printRemainingIds(failures) {
  const shortIds = failures.slice(MAX_SAMPLE_FAILURES).map(({ id }) => {
    const numeric = id.match(ID_NUM_REGEX);
    return numeric ? numeric[1] : id;
  });
  const hiddenCount = failures.length - MAX_SAMPLE_FAILURES;
  console.log(
    `\n${colors3.dim}+${hiddenCount} more: ${shortIds.join(", ")}${colors3.reset}`
  );
}
2732
/**
 * Prints the heading line for a failure category followed by its description.
 *
 * @param {{label: string, description: string}} info - Category display metadata.
 * @param {number} count - Number of failures in the category.
 */
function printCategoryHeader(info, count) {
  const rule = "\u2500".repeat(5);
  console.log(
    `\n${colors3.cyan}${rule} ${info.label} (${count}) ${rule}${colors3.reset}`
  );
  console.log(`${colors3.dim}${info.description}${colors3.reset}`);
}
2739
/**
 * Prints one category section: header, detected pattern, hint, sample
 * failures, and (in non-verbose mode) the ids of the failures not shown.
 *
 * @param {string} category - Category key; unknown keys use the OTHER metadata.
 * @param {{failures: Array<object>, pattern?: string}} group - Failures of this category.
 * @param {boolean} verbose - When true every failure is printed instead of a sample.
 */
function printCategoryDetails(category, group, verbose) {
  // Own-property lookup only: a bare `CATEGORY_DESCRIPTIONS[category]` would
  // resolve keys such as "toString" or "constructor" to Object.prototype
  // members and print an undefined label/description.
  const isKnownCategory = Object.prototype.hasOwnProperty.call(
    CATEGORY_DESCRIPTIONS,
    category
  );
  const info =
    (isKnownCategory && CATEGORY_DESCRIPTIONS[category]) ||
    CATEGORY_DESCRIPTIONS.OTHER;
  const { failures } = group;

  printCategoryHeader(info, failures.length);
  if (group.pattern) {
    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
  }
  if (info.hint) {
    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
  }

  // Same limit printRemainingIds uses to decide which ids it lists.
  const samplesToShow = verbose
    ? failures
    : failures.slice(0, MAX_SAMPLE_FAILURES);
  for (const failure of samplesToShow) {
    printSingleFailure(failure, category, verbose);
  }
  if (!verbose && failures.length > MAX_SAMPLE_FAILURES) {
    printRemainingIds(failures);
  }
}
2757
+ function printResultHeader(result) {
2758
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
2759
+ const passed = benchmarkResult.metrics.correct_count;
2760
+ const total = benchmarkResult.metrics.total_cases;
2761
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2762
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2763
+ const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
2764
+ const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
2765
+ const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
2766
+ const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
2767
+ console.log(
2768
+ `
2769
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
2770
+ );
2771
+ console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
2772
+ }
2773
/**
 * Prints the header for one benchmark result and, when its logs contain
 * structured failure entries, one section per failure category ordered by
 * descending failure count.
 *
 * @param {{result: {logs?: string[], success: boolean}}} result - One evaluation result.
 * @param {boolean} verbose - Passed through to the per-category printer.
 */
function printResultSummary(result, verbose) {
  const benchmarkResult = result.result;
  printResultHeader(result);

  const logs = benchmarkResult.logs;
  if (!logs || logs.length === 0) {
    return;
  }

  const failures = parseFailureLogs(logs);
  if (failures.length === 0) {
    // A failed run without parseable failure entries still gets a notice.
    if (!benchmarkResult.success) {
      console.log(
        `${colors3.yellow}No structured failure data available${colors3.reset}`
      );
    }
    return;
  }

  const groups = groupByCategory(failures);
  groups.forEach((group) => {
    detectPatterns(group);
  });

  const byDescendingSize = Array.from(groups.entries()).sort(
    (left, right) => right[1].failures.length - left[1].failures.length
  );
  byDescendingSize.forEach(([cat, group]) => {
    printCategoryDetails(cat, group, verbose);
  });
}
2799
+ function consoleSummaryReporter(results) {
2800
+ const verbose = process.env.VERBOSE === "true";
2801
+ console.log(`
2802
+ ${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
2803
+ console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
2804
+ for (const result of results) {
2805
+ printResultSummary(result, verbose);
2806
+ }
2807
+ console.log(
2808
+ `
2809
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
2810
+ `
2811
+ );
2812
+ }
2813
+
2374
2814
  // src/reporters/json.ts
2375
2815
  function jsonReporter(results) {
2376
2816
  const serializableResults = results.map((r) => {
@@ -2390,60 +2830,56 @@ function jsonReporter(results) {
2390
2830
  var reporters = {
2391
2831
  console: consoleReporter,
2392
2832
  json: jsonReporter,
2393
- "console.debug": consoleDebugReporter
2833
+ "console.debug": consoleDebugReporter,
2834
+ "console.summary": consoleSummaryReporter
2394
2835
  };
2395
2836
 
2396
2837
  // src/evaluate.ts
2397
- async function runSingleBenchmark(model, benchmark, modelKey, config) {
2398
- const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
2399
- try {
2400
- console.log(
2401
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
2402
- );
2403
- const result = await benchmark.run(model, config);
2404
- console.log(
2405
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
2406
- );
2407
- return {
2408
- model: modelId,
2409
- modelKey,
2410
- benchmark: benchmark.name,
2411
- result
2412
- };
2413
- } catch (error) {
2414
- console.error(
2415
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
2416
- error
2417
- );
2418
- return {
2419
- model: modelId,
2420
- modelKey,
2421
- benchmark: benchmark.name,
2422
- result: {
2423
- score: 0,
2424
- success: false,
2425
- metrics: {},
2426
- error: error instanceof Error ? error : new Error(String(error))
2427
- }
2428
- };
2838
/**
 * Tells whether `value` is a model configuration wrapper: a non-null object
 * whose `model` property is itself a non-null object carrying a `modelId`
 * property. The type of `modelId` is not inspected.
 *
 * @param {unknown} value - Candidate to test.
 * @returns {boolean} True when `value` has the wrapper shape described above.
 */
function isModelConfig(value) {
  const isNonNullObject = (candidate) =>
    typeof candidate === "object" && candidate !== null;

  return (
    isNonNullObject(value) &&
    "model" in value &&
    isNonNullObject(value.model) &&
    "modelId" in value.model
  );
}
2852
/**
 * Tells whether `value` looks like a bare language model: a non-null object
 * with a `modelId` property whose value is a string.
 *
 * @param {unknown} value - Candidate to test.
 * @returns {boolean} True when `value` is an object with a string `modelId`.
 */
function isLanguageModel(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!("modelId" in value)) {
    return false;
  }
  return typeof value.modelId === "string";
}
2859
/**
 * Splits a model input into a `[model, middleware]` pair.
 *
 * When `input` satisfies isModelConfig, its `model` and `middleware`
 * properties are returned; otherwise `input` itself is treated as the model
 * and the middleware slot is `undefined`.
 *
 * @param {unknown} input - A bare model or a `{ model, middleware }` wrapper.
 * @returns {[unknown, unknown]} The model and its (possibly undefined) middleware.
 */
function extractModelAndMiddleware(input) {
  return isModelConfig(input)
    ? [input.model, input.middleware]
    : [input, undefined];
}
2431
2865
  function normalizeModels(models) {
2432
- const modelEntries = [];
2866
+ const entries = [];
2433
2867
  if (Array.isArray(models)) {
2434
2868
  for (const m of models) {
2435
- modelEntries.push([void 0, m]);
2869
+ const [model, middleware] = extractModelAndMiddleware(m);
2870
+ entries.push([void 0, model, middleware]);
2436
2871
  }
2437
- } else if (typeof models === "object" && models !== null && "modelId" in models) {
2438
- modelEntries.push([void 0, models]);
2872
+ } else if (isModelConfig(models)) {
2873
+ entries.push([void 0, models.model, models.middleware]);
2874
+ } else if (isLanguageModel(models)) {
2875
+ entries.push([void 0, models, void 0]);
2439
2876
  } else {
2440
- for (const [key, m] of Object.entries(
2441
- models
2442
- )) {
2443
- modelEntries.push([key, m]);
2877
+ for (const [key, m] of Object.entries(models)) {
2878
+ const [model, middleware] = extractModelAndMiddleware(m);
2879
+ entries.push([key, model, middleware]);
2444
2880
  }
2445
2881
  }
2446
- return modelEntries;
2882
+ return entries;
2447
2883
  }
2448
2884
  function buildConfig(temperature, maxTokens) {
2449
2885
  const config = {};
@@ -2464,21 +2900,90 @@ function executeReporter(reporter, results) {
2464
2900
  reporters.console(results);
2465
2901
  }
2466
2902
  }
2903
/**
 * Produces the model that benchmarks actually run against.
 *
 * User-supplied middleware (single value or array) comes first, followed by a
 * disk-cache middleware when `cacheOptions.enabled` is strictly `true`. If no
 * middleware ends up in the list, `baseModel` is returned untouched;
 * otherwise the model is wrapped with wrapLanguageModel, passing a lone
 * middleware directly and several as an array.
 *
 * @param {object} baseModel - The model to run, returned as-is when nothing wraps it.
 * @param {object|Array<object>|undefined} userMiddleware - Optional middleware or list of middleware.
 * @param {{enabled?: boolean, cacheDir?: string, debug?: boolean}|undefined} cacheOptions - Optional cache settings.
 * @returns {object} `baseModel` or the wrapped model.
 */
function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
  const cacheEnabled = cacheOptions != null && cacheOptions.enabled === true;
  if (!cacheEnabled && !userMiddleware) {
    return baseModel;
  }

  let cacheMiddleware = null;
  if (cacheEnabled) {
    const { cacheDir, debug } = cacheOptions;
    cacheMiddleware = createDiskCacheMiddleware({
      cacheDir: cacheDir != null ? cacheDir : ".ai-cache",
      enabled: true,
      debug: debug != null ? debug : false
    });
  }

  // User middleware precedes the cache middleware in the list.
  let chain = [];
  if (userMiddleware) {
    chain = Array.isArray(userMiddleware) ? [...userMiddleware] : [userMiddleware];
  }
  if (cacheMiddleware) {
    chain.push(cacheMiddleware);
  }

  // An empty user array with caching off leaves nothing to apply.
  if (chain.length === 0) {
    return baseModel;
  }

  const middleware = chain.length === 1 ? chain[0] : chain;
  return wrapLanguageModel({ model: baseModel, middleware });
}
2934
/**
 * Runs one benchmark against one model, writing a single progress line to
 * stdout, and always resolves with an evaluation entry (never rejects for a
 * failing benchmark).
 *
 * On success the benchmark's own result object is returned unchanged. If
 * `benchmark.run` throws or rejects, the error is logged and a synthetic
 * failed result (`score: 0`, `success: false`) is returned instead.
 *
 * @param {object} model - Model passed to `benchmark.run`; its string `modelId` is used as a label when present.
 * @param {{name: string, run: Function}} benchmark - Benchmark to execute.
 * @param {string|undefined} modelKey - Optional key shown next to the model id and echoed in the entry.
 * @param {object} config - Run configuration forwarded to `benchmark.run`.
 * @returns {Promise<{model: string, modelKey: (string|undefined), benchmark: string, result: object}>}
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
  const prefix = `[${modelId}]${modelKey ? ` (${modelKey})` : ""} ${benchmark.name}`;
  try {
    process.stdout.write(`${prefix}: ...`);
    const result = await benchmark.run(model, config);
    // Format defensively: a missing or non-numeric score must not throw here,
    // otherwise a completed run would be misreported as an ERROR and its real
    // result replaced by the synthetic failure below.
    const score = result.score;
    const scoreDisplay = typeof score === "number" ? score.toFixed(2) : String(score);
    // "\r" rewinds to overwrite the in-progress line written above.
    process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}\n`);
    return {
      model: modelId,
      modelKey,
      benchmark: benchmark.name,
      result
    };
  } catch (error) {
    process.stdout.write(`\r${prefix}: .... Score: ERROR\n`);
    console.error(error);
    return {
      model: modelId,
      modelKey,
      benchmark: benchmark.name,
      result: {
        score: 0,
        success: false,
        metrics: {},
        // Normalize thrown non-Error values so callers always get an Error.
        error: error instanceof Error ? error : new Error(String(error))
      }
    };
  }
}
2467
2966
  async function evaluate(options) {
2468
2967
  const {
2469
2968
  models,
2470
2969
  benchmarks,
2471
2970
  reporter = "console",
2472
2971
  temperature,
2473
- maxTokens
2972
+ maxTokens,
2973
+ cache
2474
2974
  } = options;
2475
2975
  const modelEntries = normalizeModels(models);
2476
2976
  const config = buildConfig(temperature, maxTokens);
2477
2977
  const allResults = [];
2478
- for (const [modelKey, model] of modelEntries) {
2978
+ for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
2979
+ const effectiveModel = buildEffectiveModel(
2980
+ baseModel,
2981
+ userMiddleware,
2982
+ cache
2983
+ );
2479
2984
  for (const benchmark of benchmarks) {
2480
2985
  const evaluationResult = await runSingleBenchmark(
2481
- model,
2986
+ effectiveModel,
2482
2987
  benchmark,
2483
2988
  modelKey,
2484
2989
  config