npm - @ai-sdk-tool/eval - Versions diffs - 1.0.0-canary.1 → 1.0.0 - Mend

@ai-sdk-tool/eval 1.0.0-canary.1 → 1.0.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17)

package/data/{BFCL_v3_parallel.jsonl → BFCL_v4_parallel.jsonl} +2 -2
package/data/{BFCL_v3_parallel_possible_answer.jsonl → BFCL_v4_parallel_possible_answer.jsonl} +2 -2
package/data/BFCL_v4_simple.jsonl +400 -0
package/data/BFCL_v4_simple_possible_answer.jsonl +400 -0
package/dist/index.cjs +715 -210
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +49 -3
package/dist/index.d.ts +49 -3
package/dist/index.js +715 -210
package/dist/index.js.map +1 -1
package/package.json +6 -5
package/data/BFCL_v3_simple.jsonl +0 -400
package/data/BFCL_v3_simple_possible_answer.jsonl +0 -400
/package/data/{BFCL_v3_multiple.jsonl → BFCL_v4_multiple.jsonl} +0 -0
/package/data/{BFCL_v3_multiple_possible_answer.jsonl → BFCL_v4_multiple_possible_answer.jsonl} +0 -0
/package/data/{BFCL_v3_parallel_multiple.jsonl → BFCL_v4_parallel_multiple.jsonl} +0 -0
/package/data/{BFCL_v3_parallel_multiple_possible_answer.jsonl → BFCL_v4_parallel_multiple_possible_answer.jsonl} +0 -0

package/dist/index.js CHANGED

@@ -368,6 +368,7 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
 // src/benchmarks/bfcl.ts
 var LINE_SPLIT_REGEX = /\r?\n/;
 var NUMERIC_STRING_REGEX = /^\d+$/;
+var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
 function convertGroundTruthToXML(call) {
   const keys = Object.keys(call);
   if (keys.length === 0) {
@@ -399,45 +400,67 @@ function convertGroundTruthToXML(call) {
   xml += `</${funcName}>`;
   return xml;
 }
+function extractCategory(id) {
+  if (id.startsWith("parallel_multiple")) {
+    return "parallel_multiple";
+  }
+  if (id.startsWith("simple_python")) {
+    return "simple";
+  }
+  if (id.startsWith("simple_java")) {
+    return "simple";
+  }
+  if (id.startsWith("simple_javascript")) {
+    return "simple";
+  }
+  if (id.startsWith("parallel")) {
+    return "parallel";
+  }
+  if (id.startsWith("multiple")) {
+    return "multiple";
+  }
+  if (id.startsWith("simple")) {
+    return "simple";
+  }
+  return id.split("_")[0];
+}
 function check(testCase, modelOutput, possibleAnswer) {
-  const category = testCase.id.split("_")[0];
+  const category = extractCategory(testCase.id);
   try {
-    if (category === "simple") {
-      if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
-        return {
-          valid: false,
-          error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
-          error_type: "simple:wrong_count"
-        };
+    switch (category) {
+      case "simple": {
+        if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
+          return {
+            valid: false,
+            error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
+            error_type: "simple:wrong_count"
+          };
+        }
+        return simpleFunctionChecker(
+          testCase.function[0],
+          modelOutput[0],
+          possibleAnswer.ground_truth[0]
+        );
+      }
+      case "multiple": {
+        return multipleFunctionChecker(
+          testCase.function,
+          modelOutput,
+          possibleAnswer.ground_truth
+        );
+      }
+      case "parallel":
+      case "parallel_multiple": {
+        return parallelFunctionCheckerNoOrder(
+          testCase.function,
+          modelOutput,
+          possibleAnswer.ground_truth
+        );
+      }
+      default: {
+        return { valid: true };
       }
-      return simpleFunctionChecker(
-        testCase.function[0],
-        modelOutput[0],
-        possibleAnswer.ground_truth[0]
-      );
-    }
-    if (category === "parallel") {
-      return parallelFunctionCheckerNoOrder(
-        testCase.function,
-        modelOutput,
-        possibleAnswer.ground_truth
-      );
-    }
-    if (category === "multiple") {
-      return multipleFunctionChecker(
-        testCase.function,
-        modelOutput,
-        possibleAnswer.ground_truth
-      );
-    }
-    if (category.includes("parallel-multiple")) {
-      return parallelFunctionCheckerNoOrder(
-        testCase.function,
-        modelOutput,
-        possibleAnswer.ground_truth
-      );
     }
-    return { valid: true };
   } catch (e) {
     return {
       valid: false,
@@ -615,7 +638,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             return `- expected one of: ${formatted}`;
           })();
           diffLines.push(expectedLine);
-          diffLines.push(`+ got: ${JSON.stringify(got)}`);
+          diffLines.push(`+      got: ${JSON.stringify(got)}`);
           return diffLines;
         };
         const paramValueMatches = (allowed, got) => {
@@ -832,44 +855,97 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             );
           }
         };
-        const buildFailureContext = (options) => {
-          const {
-            testCase,
-            tools,
-            flatMessages,
-            mwOriginalText,
-            text,
-            finishReason,
-            mwParsedToolCalls,
-            restoredCalls,
-            possibleAnswer
-          } = options;
-          const lastUser = (() => {
-            var _a;
-            const reversed = [...flatMessages].reverse();
-            const found = reversed.find(
-              (m) => m.role === "user"
-            );
-            return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
-          })();
-          const rawModelText = (() => {
-            if (mwOriginalText && mwOriginalText.length > 0) {
-              return mwOriginalText;
+        const hasPercentPattern = (diff) => {
+          return diff.some((d) => {
+            if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
+              return false;
             }
-            if (typeof text === "string") {
-              return text;
+            const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
+            if (!numMatch) {
+              return false;
             }
-            return "";
-          })();
-          return {
-            id: testCase.id,
-            tool_schema: tools,
-            last_user_query: lastUser,
-            raw_model_text: rawModelText,
-            finish_reason: finishReason,
-            parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
-            ground_truth: possibleAnswer.ground_truth
-          };
+            const num = Number.parseFloat(numMatch[1]);
+            return num >= 1 && num <= 100;
+          });
+        };
+        const isValueError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("value_error")) || diff.some((d) => d.startsWith("@@ param"));
+        };
+        const isFunctionNameError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("wrong_func_name")) || diff.some((d) => d.includes("function name"));
+        };
+        const isMissingParamError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("missing_required")) || diff.some((d) => d.includes("missing required param"));
+        };
+        const isUnexpectedParamError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("unexpected_param")) || diff.some((d) => d.includes("unexpected param"));
+        };
+        const classifyByErrorPatterns = (errorType, diff) => {
+          const patterns = [
+            [
+              isValueError,
+              hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH"
+            ],
+            [isFunctionNameError, "WRONG_FUNCTION"],
+            [isMissingParamError, "MISSING_PARAMS"],
+            [isUnexpectedParamError, "UNEXPECTED_PARAMS"]
+          ];
+          for (const [classifier, result] of patterns) {
+            if (classifier(errorType, diff)) {
+              return result;
+            }
+          }
+          if (errorType == null ? void 0 : errorType.includes("cannot_find_match")) {
+            return "NO_MATCH";
+          }
+          return null;
+        };
+        const classifyByCallCount = (actualCount, expectedCount) => {
+          if (actualCount === 0 && expectedCount > 0) {
+            return "PARSE_FAILURE";
+          }
+          if (actualCount > 0 && actualCount < expectedCount) {
+            return "PARTIAL_CALLS";
+          }
+          if (actualCount > expectedCount) {
+            return "EXTRA_CALLS";
+          }
+          return null;
+        };
+        const classifyFailureType = (options) => {
+          const { errorType, restoredCalls, expectedCount, diff } = options;
+          const actualCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;
+          const countBasedResult = classifyByCallCount(
+            actualCount,
+            expectedCount
+          );
+          if (countBasedResult) {
+            return countBasedResult;
+          }
+          const patternBasedResult = classifyByErrorPatterns(errorType, diff);
+          if (patternBasedResult) {
+            return patternBasedResult;
+          }
+          return "OTHER";
+        };
+        const extractRawModelText = (mwOriginalText, text) => {
+          if (mwOriginalText && mwOriginalText.length > 0) {
+            return mwOriginalText;
+          }
+          if (typeof text === "string") {
+            return text;
+          }
+          return "";
+        };
+        const extractLastUserQuery = (flatMessages) => {
+          var _a;
+          const reversed = [...flatMessages].reverse();
+          const found = reversed.find((m) => m.role === "user");
+          const content = (_a = found == null ? void 0 : found.content) != null ? _a : "";
+          return content.length > 200 ? `${content.slice(0, 200)}...` : content;
+        };
+        const truncateText = (text, maxLen) => {
+          return text.length > maxLen ? `${text.slice(0, maxLen)}...` : text;
         };
         const logFailureDetails = (options) => {
           const {
@@ -887,42 +963,36 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           } = options;
           try {
             const category = testCase.id.split("_")[0];
-            const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
-              tools,
-              possibleAnswer,
-              restoredCalls
-            ) : buildParallelDiff(
-              tools,
-              possibleAnswer,
-              restoredCalls
-            );
-            caseLogs.push(
-              `[DEBUG-FAIL] ${JSON.stringify({
-                id: testCase.id,
-                message: checkerResult.error,
-                error_type: checkerResult.error_type,
-                expected,
-                actual,
-                diff
-              })}`
-            );
-            try {
-              const contextPayload = buildFailureContext({
-                testCase,
-                tools,
-                flatMessages,
-                mwOriginalText,
-                text,
-                finishReason,
-                mwParsedToolCalls,
+            const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
+            const gtArr = possibleAnswer.ground_truth;
+            const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
+            const rawModelText = extractRawModelText(mwOriginalText, text);
+            const lastUserQuery = extractLastUserQuery(flatMessages);
+            const failurePayload = {
+              id: testCase.id,
+              category: classifyFailureType({
+                errorType: checkerResult.error_type,
                 restoredCalls,
-                possibleAnswer
-              });
-              caseLogs.push(
-                `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
-              );
-            } catch (e) {
-            }
+                expectedCount,
+                diff
+              }),
+              message: checkerResult.error,
+              error_type: checkerResult.error_type,
+              expected,
+              actual,
+              diff,
+              context: {
+                raw_model_text: truncateText(rawModelText, 500),
+                raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
+                parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
+                expected_count: expectedCount,
+                actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
+                finish_reason: finishReason,
+                last_user_query: lastUserQuery,
+                tool_names: tools.map((t) => t.name)
+              }
+            };
+            caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
           } catch (e) {
             caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
           }
@@ -1147,14 +1217,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           };
         }
         const score = correctCount / testCases.length;
+        const caseResults = resultsPerCase.map((r, i) => ({
+          id: testCases[i].id,
+          valid: r.valid
+        }));
         return {
           score,
           success: score > 0.95,
-          // High success threshold as requested
           metrics: {
             correct_count: correctCount,
             total_cases: testCases.length,
-            accuracy: score
+            accuracy: score,
+            case_results: JSON.stringify(caseResults)
           },
           logs
         };
@@ -1174,27 +1248,27 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
 }
 var bfclSimpleBenchmark = createBfclBenchmark(
   "bfcl-simple",
-  "BFCL Simple Function Calling",
-  "BFCL_v3_simple.jsonl",
-  "BFCL_v3_simple_possible_answer.jsonl"
+  "BFCL v4 Simple Function Calling",
+  "BFCL_v4_simple.jsonl",
+  "BFCL_v4_simple_possible_answer.jsonl"
 );
 var bfclParallelBenchmark = createBfclBenchmark(
   "bfcl-parallel",
-  "BFCL Parallel Function Calling",
-  "BFCL_v3_parallel.jsonl",
-  "BFCL_v3_parallel_possible_answer.jsonl"
+  "BFCL v4 Parallel Function Calling",
+  "BFCL_v4_parallel.jsonl",
+  "BFCL_v4_parallel_possible_answer.jsonl"
 );
 var bfclMultipleBenchmark = createBfclBenchmark(
   "bfcl-multiple",
-  "BFCL Multiple Function Calling",
-  "BFCL_v3_multiple.jsonl",
-  "BFCL_v3_multiple_possible_answer.jsonl"
+  "BFCL v4 Multiple Function Calling",
+  "BFCL_v4_multiple.jsonl",
+  "BFCL_v4_multiple_possible_answer.jsonl"
 );
 var bfclParallelMultipleBenchmark = createBfclBenchmark(
   "bfcl-parallel-multiple",
-  "BFCL Parallel & Multiple Function Calling",
-  "BFCL_v3_parallel_multiple.jsonl",
-  "BFCL_v3_parallel_multiple_possible_answer.jsonl"
+  "BFCL v4 Parallel & Multiple Function Calling",
+  "BFCL_v4_parallel_multiple.jsonl",
+  "BFCL_v4_parallel_multiple_possible_answer.jsonl"
 );
 // src/benchmarks/complex-func-bench.ts
@@ -1925,23 +1999,28 @@ var jsonGenerationSchemaOnlyBenchmark = {
   }
 };
+// src/evaluate.ts
+import { createDiskCacheMiddleware } from "@ai-sdk-tool/middleware";
+import { wrapLanguageModel } from "ai";
 // src/reporters/console.ts
 var colors = {
   reset: "\x1B[0m",
+  bold: "\x1B[1m",
   green: "\x1B[32m",
   red: "\x1B[31m",
   yellow: "\x1B[33m",
   cyan: "\x1B[36m",
   magenta: "\x1B[35m",
   gray: "\x1B[90m",
-  white: "\x1B[37m",
-  bgRed: "\x1B[41m"
+  white: "\x1B[37m"
 };
+var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
 function formatDiff(diff) {
   if (!diff || diff.length === 0) {
     return "";
   }
-  return diff.map((line) => {
+  return diff.slice(0, 8).map((line) => {
     if (line.startsWith("-")) {
       return `${colors.red}${line}${colors.reset}`;
     }
@@ -1954,65 +2033,106 @@ function formatDiff(diff) {
     return line;
   }).join("\n      ");
 }
-function printFailLogs(logs) {
-  const failLogs = logs.filter((l) => l.startsWith("[DEBUG-FAIL]"));
-  for (const log of failLogs) {
+function parseFailures(logs) {
+  const failures = [];
+  for (const log of logs) {
+    if (!DEBUG_FAIL_REGEX.test(log)) {
+      continue;
+    }
     try {
-      const jsonStr = log.replace("[DEBUG-FAIL] ", "");
-      const data = JSON.parse(jsonStr);
-      console.log(`
-    ${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
-      console.log(
-        `    Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
-      );
-      console.log(`    Message: ${data.message}`);
-      if (data.diff && Array.isArray(data.diff)) {
-        console.log(`    Diff:
-      ${formatDiff(data.diff)}`);
-      }
-      if (data.expected && data.actual) {
-        const expStr = JSON.stringify(data.expected);
-        const actStr = JSON.stringify(data.actual);
-        if (expStr.length < 100 && actStr.length < 100) {
-          console.log(`    Expected: ${colors.gray}${expStr}${colors.reset}`);
-          console.log(`    Actual:   ${colors.gray}${actStr}${colors.reset}`);
-        }
-      }
-    } catch (_e) {
-      console.log(`    Raw Log: ${log}`);
+      const jsonStr = log.replace(DEBUG_FAIL_REGEX, "");
+      const parsed = JSON.parse(jsonStr);
+      failures.push(parsed);
+    } catch (e) {
     }
   }
+  return failures;
+}
+function groupFailuresByCategory(failures) {
+  const groups = /* @__PURE__ */ new Map();
+  for (const failure of failures) {
+    const category = failure.category || "OTHER";
+    const existing = groups.get(category);
+    if (existing) {
+      existing.push(failure);
+    } else {
+      groups.set(category, [failure]);
+    }
+  }
+  return groups;
+}
+function printCompactFailure(failure) {
+  var _a;
+  console.log(
+    `
+    ${colors.red}${failure.id}${colors.reset} [${colors.yellow}${failure.category || "OTHER"}${colors.reset}]`
+  );
+  if (failure.message) {
+    console.log(`      ${failure.message}`);
+  }
+  if (failure.diff && failure.diff.length > 0) {
+    console.log(`      ${formatDiff(failure.diff)}`);
+  }
+  if (((_a = failure.context) == null ? void 0 : _a.raw_model_text) && failure.category === "PARSE_FAILURE") {
+    const text = failure.context.raw_model_text;
+    const truncated = text.length > 80 ? `${text.slice(0, 80)}...` : text;
+    console.log(`      ${colors.gray}Model: "${truncated}"${colors.reset}`);
+  }
+}
+function printFailureSummary(failures) {
+  const groups = groupFailuresByCategory(failures);
+  const sorted = [...groups.entries()].sort(
+    (a, b) => b[1].length - a[1].length
+  );
+  console.log(`
+    ${colors.bold}Failures by category:${colors.reset}`);
+  for (const [category, categoryFailures] of sorted) {
+    console.log(
+      `      ${colors.yellow}${category}${colors.reset}: ${categoryFailures.length}`
+    );
+  }
+  const maxToShow = 5;
+  const shown = failures.slice(0, maxToShow);
+  for (const failure of shown) {
+    printCompactFailure(failure);
+  }
+  if (failures.length > maxToShow) {
+    const remaining = failures.length - maxToShow;
+    const remainingIds = failures.slice(maxToShow).map((f) => f.id);
+    const idPreview = remainingIds.slice(0, 5).join(", ");
+    const more = remainingIds.length > 5 ? "..." : "";
+    console.log(
+      `
+    ${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
+    );
+  }
 }
 function printResult(result) {
   const { model, modelKey, benchmark, result: benchmarkResult } = result;
-  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
+  const passed = benchmarkResult.metrics.correct_count;
+  const total = benchmarkResult.metrics.total_cases;
+  const scorePercent = (benchmarkResult.score * 100).toFixed(1);
+  const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
+  const statusColor = benchmarkResult.success ? colors.green : colors.red;
   console.log(
     `
  ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
   );
   console.log(
-    `  \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
+    `  \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
   );
-  const metrics = Object.entries(benchmarkResult.metrics);
-  if (metrics.length > 0) {
-    console.log("    Metrics:");
-    for (const [key, value] of metrics) {
-      console.log(`      - ${key}: ${value}`);
-    }
-  }
   if (benchmarkResult.error) {
     console.log(
       `    ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
     );
   }
   if (!benchmarkResult.success && benchmarkResult.logs) {
-    printFailLogs(benchmarkResult.logs);
-    const failLogs = benchmarkResult.logs.filter(
-      (l) => l.startsWith("[DEBUG-FAIL]")
-    );
-    if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
-      console.log("    Raw Logs (Sample):");
-      for (const l of benchmarkResult.logs.slice(0, 10)) {
+    const failures = parseFailures(benchmarkResult.logs);
+    if (failures.length > 0) {
+      printFailureSummary(failures);
+    } else if (benchmarkResult.logs.length > 0) {
+      console.log(`    ${colors.gray}Raw Logs (Sample):${colors.reset}`);
+      for (const l of benchmarkResult.logs.slice(0, 5)) {
         console.log(`      ${l}`);
       }
     }
@@ -2371,6 +2491,326 @@ function consoleDebugReporter(results) {
   console.log("\n------------------------------------\n");
 }
+// src/reporters/console.summary.ts
+var colors3 = {
+  reset: "\x1B[0m",
+  bold: "\x1B[1m",
+  dim: "\x1B[2m",
+  green: "\x1B[32m",
+  red: "\x1B[31m",
+  yellow: "\x1B[33m",
+  cyan: "\x1B[36m",
+  magenta: "\x1B[35m",
+  gray: "\x1B[90m",
+  white: "\x1B[37m"
+};
+var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
+var ID_NUM_REGEX = /_(\d+)$/;
+var REASONING_TAG = "think";
+var MAX_FAILURES_TO_DISPLAY = 5;
+var CATEGORY_DESCRIPTIONS = {
+  PARSE_FAILURE: {
+    label: "Parse Failure",
+    description: "No tool calls extracted from model output",
+    hint: "Model may have responded in text instead of tool format"
+  },
+  PARTIAL_CALLS: {
+    label: "Partial Calls",
+    description: "Some expected tool calls missing",
+    hint: "Model stopped early or missed some tools"
+  },
+  EXTRA_CALLS: {
+    label: "Extra Calls",
+    description: "More tool calls than expected",
+    hint: "Model called tools that weren't needed"
+  },
+  PARAM_VALUE_PERCENT: {
+    label: "Param Value (Percent)",
+    description: "Percentage sent as integer instead of decimal",
+    hint: "e.g., 5 instead of 0.05 for 5%"
+  },
+  PARAM_VALUE_MISMATCH: {
+    label: "Param Value Mismatch",
+    description: "Parameter values don't match expected"
+  },
+  WRONG_FUNCTION: {
+    label: "Wrong Function",
+    description: "Called wrong function name"
+  },
+  MISSING_PARAMS: {
+    label: "Missing Params",
+    description: "Required parameters not provided"
+  },
+  UNEXPECTED_PARAMS: {
+    label: "Unexpected Params",
+    description: "Extra parameters that shouldn't be there"
+  },
+  NO_MATCH: {
+    label: "No Match",
+    description: "Function called but couldn't match to expected",
+    hint: "Parameters may be correct but don't match any expected combination"
+  },
+  OTHER: {
+    label: "Other",
+    description: "Uncategorized failure"
+  }
+};
+function parseFailureLogs(logs) {
+  return logs.filter((log) => DEBUG_FAIL_REGEX2.test(log)).map((log) => {
+    try {
+      const jsonStr = log.replace(DEBUG_FAIL_REGEX2, "");
+      return JSON.parse(jsonStr);
+    } catch (e) {
+      return null;
+    }
+  }).filter((parsed) => parsed !== null);
+}
+function groupByCategory(failures) {
+  const groups = /* @__PURE__ */ new Map();
+  for (const failure of failures) {
+    const category = failure.category || "OTHER";
+    const existing = groups.get(category);
+    if (existing) {
+      existing.failures.push(failure);
+    } else {
+      groups.set(category, { failures: [failure] });
+    }
+  }
+  return groups;
+}
+function extractParamNames(failures) {
+  const paramNames = /* @__PURE__ */ new Set();
+  for (const f of failures) {
+    if (!f.diff) {
+      continue;
+    }
+    for (const d of f.diff) {
+      if (d.startsWith("@@ param ")) {
+        paramNames.add(d.replace("@@ param ", ""));
+      }
+    }
+  }
+  return paramNames;
+}
+function extractFinishReasons(failures) {
+  var _a;
+  const finishReasons = /* @__PURE__ */ new Set();
+  for (const f of failures) {
+    if ((_a = f.context) == null ? void 0 : _a.finish_reason) {
+      finishReasons.add(String(f.context.finish_reason));
+    }
+  }
+  return finishReasons;
+}
+function detectPatterns(group) {
+  const { failures } = group;
+  if (failures.length < 2) {
+    return;
+  }
+  const firstCategory = failures[0].category;
+  if (firstCategory === "PARAM_VALUE_PERCENT") {
+    const paramNames = extractParamNames(failures);
+    if (paramNames.size > 0) {
+      group.pattern = `Affected params: ${[...paramNames].join(", ")}`;
+    }
+  }
+  if (firstCategory === "PARSE_FAILURE") {
+    const finishReasons = extractFinishReasons(failures);
+    if (finishReasons.size === 1) {
+      group.pattern = `All finished with: ${[...finishReasons][0]}`;
+    }
+  }
+}
+function getLineColor(line) {
+  if (line.startsWith("+")) {
+    return colors3.green;
+  }
+  if (line.startsWith("-")) {
+    return colors3.red;
+  }
+  if (line.startsWith("@@")) {
+    return colors3.cyan;
+  }
+  return colors3.white;
+}
+function formatFunctions(funcs) {
+  if (Array.isArray(funcs)) {
+    return funcs.join(", ");
+  }
+  return String(funcs);
+}
+function printExpectedActual(failure) {
+  if (failure.expected) {
+    const expFuncs = failure.expected.functions || failure.expected.function;
+    if (expFuncs) {
+      console.log(
+        `    ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expFuncs)}`
+      );
+    }
+  }
+  if (failure.actual) {
+    const actFuncs = failure.actual.functions || failure.actual.function;
+    if (actFuncs) {
+      const isEmpty = Array.isArray(actFuncs) && actFuncs.length === 0;
+      const color = isEmpty ? colors3.red : colors3.white;
+      const text = isEmpty ? "(none)" : formatFunctions(actFuncs);
+      console.log(
+        `    ${colors3.gray}Actual:${colors3.reset}   ${color}${text}${colors3.reset}`
+      );
+    }
+  }
+}
+function printDiff(diff) {
+  console.log(`    ${colors3.gray}Diff:${colors3.reset}`);
+  for (const line of diff.slice(0, MAX_FAILURES_TO_DISPLAY)) {
+    const lineColor = getLineColor(line);
+    console.log(`      ${lineColor}${line}${colors3.reset}`);
+  }
+}
+function removeReasoningTags(text) {
+  const openTag = `<${REASONING_TAG}>`;
+  const closeTag = `</${REASONING_TAG}>`;
+  const closedTagPattern = new RegExp(
+    `${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*?${closeTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`,
+    "g"
+  );
+  const unclosedTagPattern = new RegExp(
+    `${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*`,
+    "g"
+  );
+  let result = text.replace(closedTagPattern, "");
+  result = result.replace(unclosedTagPattern, "");
+  return result.trim();
+}
+function printModelOutput(failure, category) {
+  var _a, _b;
+  if (category !== "PARSE_FAILURE") {
+    return;
+  }
+  const rawText = ((_a = failure.context) == null ? void 0 : _a.raw_model_text_full) || ((_b = failure.context) == null ? void 0 : _b.raw_model_text) || "";
+  const cleanedText = removeReasoningTags(rawText);
+  if (cleanedText) {
+    console.log(
+      `    ${colors3.gray}Model said:${colors3.reset} "${colors3.dim}${cleanedText}${colors3.reset}"`
+    );
+  } else {
+    console.log(
+      `    ${colors3.gray}Model said:${colors3.reset} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`
+    );
+  }
+}
+function shouldShowDiffByDefault(category) {
+  return category === "PARAM_VALUE_MISMATCH" || category === "PARAM_VALUE_PERCENT";
+}
+function printSingleFailure(failure, category, verbose) {
+  console.log(`
+  ${colors3.bold}${failure.id}${colors3.reset}`);
+  const hasDiff = failure.diff && failure.diff.length > 0;
+  const showDiffPrimarily = shouldShowDiffByDefault(category) && hasDiff;
+  if (showDiffPrimarily) {
+    printDiff(failure.diff);
+  } else {
+    printExpectedActual(failure);
+    if (hasDiff && verbose) {
+      printDiff(failure.diff);
+    }
+  }
+  printModelOutput(failure, category);
+}
+var MAX_SAMPLE_FAILURES = 2;
+function printRemainingIds(failures) {
+  const remainingIds = failures.slice(MAX_SAMPLE_FAILURES).map((f) => f.id);
+  const idNums = remainingIds.map((id) => {
+    const match = id.match(ID_NUM_REGEX);
+    return match ? match[1] : id;
+  });
+  console.log(
+    `
+  ${colors3.dim}+${failures.length - MAX_SAMPLE_FAILURES} more: ${idNums.join(", ")}${colors3.reset}`
+  );
+}
+function printCategoryHeader(info, count) {
+  console.log(
+    `
+${colors3.cyan}\u2500\u2500\u2500\u2500\u2500 ${info.label} (${count}) \u2500\u2500\u2500\u2500\u2500${colors3.reset}`
+  );
+  console.log(`${colors3.dim}${info.description}${colors3.reset}`);
+}
+function printCategoryDetails(category, group, verbose) {
+  const info = CATEGORY_DESCRIPTIONS[category] || CATEGORY_DESCRIPTIONS.OTHER;
+  const { failures } = group;
+  printCategoryHeader(info, failures.length);
+  if (group.pattern) {
+    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
+  }
+  if (info.hint) {
+    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
+  }
+  const samplesToShow = verbose ? failures : failures.slice(0, 2);
+  for (const failure of samplesToShow) {
+    printSingleFailure(failure, category, verbose);
+  }
+  if (!verbose && failures.length > 2) {
+    printRemainingIds(failures);
+  }
+}
+function printResultHeader(result) {
+  const { model, modelKey, benchmark, result: benchmarkResult } = result;
+  const passed = benchmarkResult.metrics.correct_count;
+  const total = benchmarkResult.metrics.total_cases;
+  const scorePercent = (benchmarkResult.score * 100).toFixed(1);
+  const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
+  const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
+  const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
+  const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
+  const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
+  console.log(
+    `
+${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
+  );
+  console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
+}
+function printResultSummary(result, verbose) {
+  const { result: benchmarkResult } = result;
+  printResultHeader(result);
+  if (!benchmarkResult.logs || benchmarkResult.logs.length === 0) {
+    return;
+  }
+  const failures = parseFailureLogs(benchmarkResult.logs);
+  if (failures.length === 0) {
+    if (!benchmarkResult.success) {
+      console.log(
+        `${colors3.yellow}No structured failure data available${colors3.reset}`
+      );
+    }
+    return;
+  }
+  const groups = groupByCategory(failures);
+  for (const group of groups.values()) {
+    detectPatterns(group);
+  }
+  const sortedCategories = [...groups.entries()].sort(
+    (a, b) => b[1].failures.length - a[1].failures.length
+  );
+  for (const [cat, group] of sortedCategories) {
+    printCategoryDetails(cat, group, verbose);
+  }
+}
+function consoleSummaryReporter(results) {
+  const verbose = process.env.VERBOSE === "true";
+  console.log(`
+${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
+  console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
+  for (const result of results) {
+    printResultSummary(result, verbose);
+  }
+  console.log(
+    `
+${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
+`
+  );
+}
 // src/reporters/json.ts
 function jsonReporter(results) {
   const serializableResults = results.map((r) => {
@@ -2390,60 +2830,56 @@ function jsonReporter(results) {
+// Lookup table of reporter functions keyed by reporter name.
 var reporters = {
   console: consoleReporter,
   json: jsonReporter,
-  "console.debug": consoleDebugReporter
+  "console.debug": consoleDebugReporter,
+  // Summary reporter; its detail level is read from the VERBOSE environment variable.
+  "console.summary": consoleSummaryReporter
 };
 // src/evaluate.ts
-async function runSingleBenchmark(model, benchmark, modelKey, config) {
-  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
-  try {
-    console.log(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
-    );
-    const result = await benchmark.run(model, config);
-    console.log(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
-    );
-    return {
-      model: modelId,
-      modelKey,
-      benchmark: benchmark.name,
-      result
-    };
-  } catch (error) {
-    console.error(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
-      error
-    );
-    return {
-      model: modelId,
-      modelKey,
-      benchmark: benchmark.name,
-      result: {
-        score: 0,
-        success: false,
-        metrics: {},
-        error: error instanceof Error ? error : new Error(String(error))
-      }
-    };
+/**
+ * Reports whether `value` is a wrapper object whose `model` property is a
+ * non-null object that has a `modelId` key.
+ * Only the presence of `modelId` is checked here, not its type.
+ *
+ * @param value - Candidate to inspect.
+ * @returns true when the shape matches, false otherwise.
+ */
+function isModelConfig(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const obj = value;
+  if (!("model" in obj)) {
+    return false;
   }
+  const model = obj.model;
+  if (typeof model !== "object" || model === null) {
+    return false;
+  }
+  // NOTE(review): isLanguageModel also requires modelId to be a string; this
+  // guard does not - confirm whether the difference is intentional.
+  return "modelId" in model;
+}
+/**
+ * Reports whether `value` is a non-null object with a string `modelId`
+ * property.
+ *
+ * @param value - Candidate to inspect.
+ * @returns true when the shape matches, false otherwise.
+ */
+function isLanguageModel(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const obj = value;
+  return "modelId" in obj && typeof obj.modelId === "string";
+}
+/**
+ * Splits a model input into a `[model, middleware]` pair.
+ * A config wrapper (as recognised by isModelConfig) yields its `model` and
+ * `middleware` properties; anything else is returned as the model itself with
+ * `undefined` middleware.
+ *
+ * @param input - Either a config wrapper or a bare model.
+ * @returns Two-element array `[model, middleware]`.
+ */
+function extractModelAndMiddleware(input) {
+  if (isModelConfig(input)) {
+    return [input.model, input.middleware];
+  }
+  return [input, void 0];
 }
+/**
+ * Normalises the `models` option into a flat list of
+ * `[key, model, middleware]` entries.
+ * Accepted inputs, checked in this order: an array, a single config wrapper,
+ * a single bare model, or an object whose own enumerable entries map a key to
+ * a model or config wrapper.
+ *
+ * @param models - Model input in any of the shapes listed above.
+ * @returns Array of `[key, model, middleware]`; `key` is `undefined` for
+ *   every shape except the keyed object.
+ */
 function normalizeModels(models) {
-  const modelEntries = [];
+  const entries = [];
   if (Array.isArray(models)) {
     for (const m of models) {
-      modelEntries.push([void 0, m]);
+      // Array elements may be bare models or config wrappers.
+      const [model, middleware] = extractModelAndMiddleware(m);
+      entries.push([void 0, model, middleware]);
     }
-  } else if (typeof models === "object" && models !== null && "modelId" in models) {
-    modelEntries.push([void 0, models]);
+  // The wrapper check runs before the bare-model check.
+  } else if (isModelConfig(models)) {
+    entries.push([void 0, models.model, models.middleware]);
+  } else if (isLanguageModel(models)) {
+    entries.push([void 0, models, void 0]);
   } else {
-    for (const [key, m] of Object.entries(
-      models
-    )) {
-      modelEntries.push([key, m]);
+    // Anything else is treated as a keyed collection of models.
+    for (const [key, m] of Object.entries(models)) {
+      const [model, middleware] = extractModelAndMiddleware(m);
+      entries.push([key, model, middleware]);
     }
   }
-  return modelEntries;
+  return entries;
 }
 function buildConfig(temperature, maxTokens) {
   const config = {};
@@ -2464,21 +2900,90 @@ function executeReporter(reporter, results) {
     reporters.console(results);
   }
 }
+/**
+ * Returns the model to benchmark: `baseModel` itself, or `baseModel` wrapped
+ * with the user middleware and/or a disk-cache middleware.
+ *
+ * @param baseModel - Model to wrap.
+ * @param userMiddleware - Optional single middleware or array of middlewares.
+ * @param cacheOptions - Optional cache settings; caching is on only when
+ *   `enabled` is exactly `true`. `cacheDir` defaults to ".ai-cache" and
+ *   `debug` defaults to `false`.
+ * @returns `baseModel` unchanged when no middleware applies, otherwise the
+ *   result of wrapLanguageModel.
+ */
+function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
+  // Temporaries for the down-levelled nullish-coalescing expressions below.
+  var _a, _b;
+  const cacheEnabled = (cacheOptions == null ? void 0 : cacheOptions.enabled) === true;
+  // Fast path: no caching and no user middleware.
+  if (!(cacheEnabled || userMiddleware)) {
+    return baseModel;
+  }
+  const cacheMiddleware = cacheEnabled ? createDiskCacheMiddleware({
+    cacheDir: (_a = cacheOptions.cacheDir) != null ? _a : ".ai-cache",
+    enabled: true,
+    debug: (_b = cacheOptions.debug) != null ? _b : false
+  }) : null;
+  // User middleware is listed first, the cache middleware last.
+  // NOTE(review): the effective execution order depends on how
+  // wrapLanguageModel applies the array - confirm against its documentation.
+  const middlewares = [];
+  if (userMiddleware) {
+    if (Array.isArray(userMiddleware)) {
+      middlewares.push(...userMiddleware);
+    } else {
+      middlewares.push(userMiddleware);
+    }
+  }
+  if (cacheMiddleware) {
+    middlewares.push(cacheMiddleware);
+  }
+  // Reached when userMiddleware is an empty array and caching is off.
+  if (middlewares.length === 0) {
+    return baseModel;
+  }
+  return wrapLanguageModel({
+    // biome-ignore lint/suspicious/noExplicitAny: AI SDK v5/v6 type mismatch
+    model: baseModel,
+    // A lone middleware is passed directly rather than as a one-element array.
+    middleware: middlewares.length === 1 ? middlewares[0] : middlewares
+  });
+}
+/**
+ * Runs one benchmark against one model and reports progress on stdout.
+ * Errors are caught, not rethrown: they are logged and converted into a
+ * failed result with score 0.
+ *
+ * @param model - Model passed to `benchmark.run`; its string `modelId` is
+ *   used for labelling, falling back to "unknown-model".
+ * @param benchmark - Object with a `name` and an async `run(model, config)`.
+ * @param modelKey - Optional key shown in parentheses after the model id.
+ * @param config - Passed through to `benchmark.run`.
+ * @returns `{ model, modelKey, benchmark, result }`, where `result` is the
+ *   benchmark's own result or the synthesized failure result.
+ */
+async function runSingleBenchmark(model, benchmark, modelKey, config) {
+  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
+  const prefix = `[${modelId}]${modelKey ? ` (${modelKey})` : ""} ${benchmark.name}`;
+  try {
+    // Progress marker without a newline; the later "\r" writes return to the
+    // start of the same line.
+    process.stdout.write(`${prefix}: ...`);
+    const result = await benchmark.run(model, config);
+    // NOTE(review): toFixed sits inside the try, so a result whose score is
+    // not a number would throw here and be reported as ERROR - confirm intended.
+    const scoreDisplay = result.score.toFixed(2);
+    process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}
+`);
+    return {
+      model: modelId,
+      modelKey,
+      benchmark: benchmark.name,
+      result
+    };
+  } catch (error) {
+    process.stdout.write(`\r${prefix}: .... Score: ERROR
+`);
+    console.error(error);
+    return {
+      model: modelId,
+      modelKey,
+      benchmark: benchmark.name,
+      result: {
+        score: 0,
+        success: false,
+        metrics: {},
+        // Non-Error throwables are wrapped so `error` is always an Error.
+        error: error instanceof Error ? error : new Error(String(error))
+      }
+    };
+  }
+}
 async function evaluate(options) {
   const {
     models,
     benchmarks,
     reporter = "console",
     temperature,
-    maxTokens
+    maxTokens,
+    cache
   } = options;
   const modelEntries = normalizeModels(models);
   const config = buildConfig(temperature, maxTokens);
   const allResults = [];
-  for (const [modelKey, model] of modelEntries) {
+  for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
+    const effectiveModel = buildEffectiveModel(
+      baseModel,
+      userMiddleware,
+      cache
+    );
     for (const benchmark of benchmarks) {
       const evaluationResult = await runSingleBenchmark(
-        model,
+        effectiveModel,
         benchmark,
         modelKey,
         config