npm - @ai-sdk-tool/eval - Versions diffs - 1.0.0-canary.0 → 1.0.0 - Mend

@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/data/{BFCL_v3_parallel.jsonl → BFCL_v4_parallel.jsonl} +2 -2
package/data/{BFCL_v3_parallel_possible_answer.jsonl → BFCL_v4_parallel_possible_answer.jsonl} +2 -2
package/data/BFCL_v4_simple.jsonl +400 -0
package/data/BFCL_v4_simple_possible_answer.jsonl +400 -0
package/data/ComplexFuncBench.jsonl +1000 -0
package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
package/dist/index.cjs +1264 -263
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +77 -11
package/dist/index.d.ts +77 -11
package/dist/index.js +1268 -264
package/dist/index.js.map +1 -1
package/package.json +18 -11
package/data/BFCL_v3_simple.jsonl +0 -400
package/data/BFCL_v3_simple_possible_answer.jsonl +0 -400
package/data/{BFCL_v3_multiple.jsonl → BFCL_v4_multiple.jsonl} +0 -0
package/data/{BFCL_v3_multiple_possible_answer.jsonl → BFCL_v4_multiple_possible_answer.jsonl} +0 -0
package/data/{BFCL_v3_parallel_multiple.jsonl → BFCL_v4_parallel_multiple.jsonl} +0 -0
package/data/{BFCL_v3_parallel_multiple_possible_answer.jsonl → BFCL_v4_parallel_multiple_possible_answer.jsonl} +0 -0

package/dist/index.cjs CHANGED

@@ -34,6 +34,7 @@ __export(index_exports, {
   bfclParallelBenchmark: () => bfclParallelBenchmark,
   bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
   bfclSimpleBenchmark: () => bfclSimpleBenchmark,
+  complexFuncBenchBenchmark: () => complexFuncBenchBenchmark,
   evaluate: () => evaluate,
   jsonGenerationBenchmark: () => jsonGenerationBenchmark,
   jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
@@ -61,7 +62,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
     if (import_node_fs.default.existsSync(dataAtRoot)) {
       return dataAtRoot;
     }
-  } catch {
+  } catch (e) {
   }
   return null;
 }
@@ -75,7 +76,7 @@ function tryResolveViaPackageJson(moduleUrl) {
     if (import_node_fs.default.existsSync(dataAtPkg)) {
       return dataAtPkg;
     }
-  } catch {
+  } catch (e) {
   }
   return null;
 }
@@ -83,7 +84,7 @@ function getStartDir(moduleUrl) {
   if (moduleUrl) {
     try {
       return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
-    } catch {
+    } catch (e) {
       return process.cwd();
     }
   }
@@ -177,7 +178,7 @@ function valuesMatch(modelValue, possibleValue) {
       const normalizedModel = normalizeObject(modelValue);
       const normalizedPossible = normalizeObject(possibleValue);
       return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
-    } catch {
+    } catch (e) {
       return false;
     }
   }
@@ -306,7 +307,7 @@ function checkSingleParameter(paramName, modelValue, context) {
     return checkStringValue(
       paramName,
       modelValue,
-      possibleValues ?? []
+      possibleValues != null ? possibleValues : []
     );
   }
   if (Array.isArray(modelValue)) {
@@ -406,45 +407,99 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
 // src/benchmarks/bfcl.ts
 var LINE_SPLIT_REGEX = /\r?\n/;
 var NUMERIC_STRING_REGEX = /^\d+$/;
+var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
+function convertGroundTruthToXML(call) {
+  const keys = Object.keys(call);
+  if (keys.length === 0) {
+    return "<empty_call />";
+  }
+  const funcName = keys[0];
+  if (!funcName) {
+    return "<undefined_function />";
+  }
+  const params = call[funcName];
+  if (!params || typeof params !== "object") {
+    return `<${funcName} />`;
+  }
+  let xml = `<${funcName}>
+`;
+  for (const [key, value] of Object.entries(params)) {
+    const displayValue = Array.isArray(value) ? value[0] : value;
+    let valueStr;
+    if (typeof displayValue === "string") {
+      valueStr = displayValue;
+    } else if (displayValue === null || displayValue === void 0) {
+      valueStr = "";
+    } else {
+      valueStr = JSON.stringify(displayValue);
+    }
+    xml += `  <${key}>${valueStr}</${key}>
+`;
+  }
+  xml += `</${funcName}>`;
+  return xml;
+}
+function extractCategory(id) {
+  if (id.startsWith("parallel_multiple")) {
+    return "parallel_multiple";
+  }
+  if (id.startsWith("simple_python")) {
+    return "simple";
+  }
+  if (id.startsWith("simple_java")) {
+    return "simple";
+  }
+  if (id.startsWith("simple_javascript")) {
+    return "simple";
+  }
+  if (id.startsWith("parallel")) {
+    return "parallel";
+  }
+  if (id.startsWith("multiple")) {
+    return "multiple";
+  }
+  if (id.startsWith("simple")) {
+    return "simple";
+  }
+  return id.split("_")[0];
+}
 function check(testCase, modelOutput, possibleAnswer) {
-  const category = testCase.id.split("_")[0];
+  const category = extractCategory(testCase.id);
   try {
-    if (category === "simple") {
-      if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
-        return {
-          valid: false,
-          error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
-          error_type: "simple:wrong_count"
-        };
+    switch (category) {
+      case "simple": {
+        if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
+          return {
+            valid: false,
+            error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
+            error_type: "simple:wrong_count"
+          };
+        }
+        return simpleFunctionChecker(
+          testCase.function[0],
+          modelOutput[0],
+          possibleAnswer.ground_truth[0]
+        );
+      }
+      case "multiple": {
+        return multipleFunctionChecker(
+          testCase.function,
+          modelOutput,
+          possibleAnswer.ground_truth
+        );
+      }
+      case "parallel":
+      case "parallel_multiple": {
+        return parallelFunctionCheckerNoOrder(
+          testCase.function,
+          modelOutput,
+          possibleAnswer.ground_truth
+        );
+      }
+      default: {
+        return { valid: true };
       }
-      return simpleFunctionChecker(
-        testCase.function[0],
-        modelOutput[0],
-        possibleAnswer.ground_truth[0]
-      );
-    }
-    if (category === "parallel") {
-      return parallelFunctionCheckerNoOrder(
-        testCase.function,
-        modelOutput,
-        possibleAnswer.ground_truth
-      );
-    }
-    if (category === "multiple") {
-      return multipleFunctionChecker(
-        testCase.function,
-        modelOutput,
-        possibleAnswer.ground_truth
-      );
-    }
-    if (category.includes("parallel-multiple")) {
-      return parallelFunctionCheckerNoOrder(
-        testCase.function,
-        modelOutput,
-        possibleAnswer.ground_truth
-      );
     }
-    return { valid: true };
   } catch (e) {
     return {
       valid: false,
@@ -486,7 +541,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
           );
         }
-        const fixSchemaType = (copy) => {
+        const fixSchemaType2 = (copy) => {
           if (!copy.type) {
             return;
           }
@@ -510,16 +565,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             );
           }
         };
-        const fixSchema = (schema) => {
+        const fixSchema2 = (schema) => {
           if (!schema || typeof schema !== "object") {
             return { type: "object", properties: {} };
           }
-          const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
+          const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
           if (!Array.isArray(copy)) {
-            fixSchemaType(copy);
-            fixSchemaProperties(copy, fixSchema);
+            fixSchemaType2(copy);
+            fixSchemaProperties(copy, fixSchema2);
             if (copy.items) {
-              copy.items = fixSchema(copy.items);
+              copy.items = fixSchema2(copy.items);
             }
             return copy;
           }
@@ -554,13 +609,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           try {
             const arr = JSON.parse(raw);
             return Array.isArray(arr) ? arr : [];
-          } catch {
+          } catch (e) {
             return [];
           }
         };
         const getSanitizedName = (rawName, transformedTools) => {
+          var _a, _b;
           if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
-            return transformedTools[Number(rawName)]?.name ?? rawName;
+            return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
           }
           return rawName;
         };
@@ -570,25 +626,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
           try {
             return JSON.parse(extractedArgs);
-          } catch {
+          } catch (e) {
             return extractedArgs;
           }
         };
         const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
+          var _a, _b, _c, _d, _e, _f;
           const call = c;
-          const rawName = call.toolName ?? call.name;
+          const rawName = (_a = call.toolName) != null ? _a : call.name;
           const sanitizedFromIndex = getSanitizedName(
             rawName,
             transformedTools
           );
-          const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
-          const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
+          const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
+          const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
           const parsedArgs = parseToolArgs(extractedArgs);
           return {
             ...call,
             toolName: originalName,
             name: originalName,
-            args: parsedArgs ?? {}
+            args: parsedArgs != null ? parsedArgs : {}
           };
         });
         const summarizeArgs = (args) => {
@@ -620,7 +677,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             return `- expected one of: ${formatted}`;
           })();
           diffLines.push(expectedLine);
-          diffLines.push(`+ got: ${JSON.stringify(got)}`);
+          diffLines.push(`+      got: ${JSON.stringify(got)}`);
           return diffLines;
         };
         const paramValueMatches = (allowed, got) => {
@@ -632,7 +689,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
               if (Array.isArray(got)) {
                 return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
               }
-            } catch {
+            } catch (e) {
             }
             return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
           });
@@ -670,13 +727,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
         };
         const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
+          var _a, _b, _c, _d;
           const funcDesc = tools[0];
-          const gt = possibleAnswer.ground_truth?.[0];
-          const expectedFuncName = funcDesc?.name;
+          const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
+          const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
           const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
           const received = restoredCalls[0];
-          const receivedName = received?.toolName ?? received?.name;
-          const receivedArgs = summarizeArgs(received?.args);
+          const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
+          const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
           const expected = {
             function: expectedFuncName,
             params: expectedParams
@@ -688,7 +746,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           const diff = [];
           checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
           if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
-            const required = funcDesc?.parameters?.required ?? [];
+            const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
             checkMissingParams(
               required,
               receivedArgs,
@@ -725,12 +783,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
         };
         const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
+          var _a;
           for (let i = 0; i < restoredCalls.length; i += 1) {
             if (usedActual.has(i)) {
               continue;
             }
             const rc = restoredCalls[i];
-            const rcName = rc?.toolName ?? rc?.name;
+            const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
             if (rcName === fname) {
               return i;
             }
@@ -744,6 +803,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
         };
         const processExpectedCall = (options) => {
+          var _a, _b;
           const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
           const fname = Object.keys(expectedObj)[0];
           const matchedIndex = findMatchingCallIndex(
@@ -756,10 +816,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
           usedActual.add(matchedIndex);
           const received = restoredCalls[matchedIndex];
-          const receivedArgs = summarizeArgs(received?.args);
+          const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
           const expectedParamsAllowed = expectedObj[fname];
           const funcDesc = tools.find((t) => t.name === fname);
-          const requiredParams = funcDesc?.parameters?.required ?? [];
+          const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
           diff.push(`@@ function ${fname}`);
           if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
             validateFunctionParams({
@@ -771,10 +831,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
         };
         const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
-          const gtArr = possibleAnswer.ground_truth ?? [];
+          var _a;
+          const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
           const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
           const actualNames = restoredCalls.map(
-            (c) => c.toolName ?? c.name
+            (c) => {
+              var _a2;
+              return (_a2 = c.toolName) != null ? _a2 : c.name;
+            }
           );
           const expected = {
             functions: expectedNames
@@ -800,14 +864,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           return { expected, actual, diff };
         };
         const concurrencyEnv = process.env.BFCL_CONCURRENCY;
-        const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
+        const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
         logs.push(
           `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
         );
         const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
+          var _a, _b, _c, _d;
           try {
             const firstTool = transformedTools[0];
-            const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
+            const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
             caseLogs.push(
               `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
             );
@@ -823,49 +888,103 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             caseLogs.push(
               `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
             );
-          } catch {
+          } catch (e) {
             caseLogs.push(
               `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
             );
           }
         };
-        const buildFailureContext = (options) => {
-          const {
-            testCase,
-            tools,
-            flatMessages,
-            mwOriginalText,
-            text,
-            finishReason,
-            mwParsedToolCalls,
-            restoredCalls,
-            possibleAnswer
-          } = options;
-          const lastUser = (() => {
-            const reversed = [...flatMessages].reverse();
-            const found = reversed.find(
-              (m) => m.role === "user"
-            );
-            return found?.content ?? void 0;
-          })();
-          const rawModelText = (() => {
-            if (mwOriginalText && mwOriginalText.length > 0) {
-              return mwOriginalText;
+        const hasPercentPattern = (diff) => {
+          return diff.some((d) => {
+            if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
+              return false;
             }
-            if (typeof text === "string") {
-              return text;
+            const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
+            if (!numMatch) {
+              return false;
             }
-            return "";
-          })();
-          return {
-            id: testCase.id,
-            tool_schema: tools,
-            last_user_query: lastUser,
-            raw_model_text: rawModelText,
-            finish_reason: finishReason,
-            parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
-            ground_truth: possibleAnswer.ground_truth
-          };
+            const num = Number.parseFloat(numMatch[1]);
+            return num >= 1 && num <= 100;
+          });
+        };
+        const isValueError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("value_error")) || diff.some((d) => d.startsWith("@@ param"));
+        };
+        const isFunctionNameError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("wrong_func_name")) || diff.some((d) => d.includes("function name"));
+        };
+        const isMissingParamError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("missing_required")) || diff.some((d) => d.includes("missing required param"));
+        };
+        const isUnexpectedParamError = (errorType, diff) => {
+          return !!(errorType == null ? void 0 : errorType.includes("unexpected_param")) || diff.some((d) => d.includes("unexpected param"));
+        };
+        const classifyByErrorPatterns = (errorType, diff) => {
+          const patterns = [
+            [
+              isValueError,
+              hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH"
+            ],
+            [isFunctionNameError, "WRONG_FUNCTION"],
+            [isMissingParamError, "MISSING_PARAMS"],
+            [isUnexpectedParamError, "UNEXPECTED_PARAMS"]
+          ];
+          for (const [classifier, result] of patterns) {
+            if (classifier(errorType, diff)) {
+              return result;
+            }
+          }
+          if (errorType == null ? void 0 : errorType.includes("cannot_find_match")) {
+            return "NO_MATCH";
+          }
+          return null;
+        };
+        const classifyByCallCount = (actualCount, expectedCount) => {
+          if (actualCount === 0 && expectedCount > 0) {
+            return "PARSE_FAILURE";
+          }
+          if (actualCount > 0 && actualCount < expectedCount) {
+            return "PARTIAL_CALLS";
+          }
+          if (actualCount > expectedCount) {
+            return "EXTRA_CALLS";
+          }
+          return null;
+        };
+        const classifyFailureType = (options) => {
+          const { errorType, restoredCalls, expectedCount, diff } = options;
+          const actualCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;
+          const countBasedResult = classifyByCallCount(
+            actualCount,
+            expectedCount
+          );
+          if (countBasedResult) {
+            return countBasedResult;
+          }
+          const patternBasedResult = classifyByErrorPatterns(errorType, diff);
+          if (patternBasedResult) {
+            return patternBasedResult;
+          }
+          return "OTHER";
+        };
+        const extractRawModelText = (mwOriginalText, text) => {
+          if (mwOriginalText && mwOriginalText.length > 0) {
+            return mwOriginalText;
+          }
+          if (typeof text === "string") {
+            return text;
+          }
+          return "";
+        };
+        const extractLastUserQuery = (flatMessages) => {
+          var _a;
+          const reversed = [...flatMessages].reverse();
+          const found = reversed.find((m) => m.role === "user");
+          const content = (_a = found == null ? void 0 : found.content) != null ? _a : "";
+          return content.length > 200 ? `${content.slice(0, 200)}...` : content;
+        };
+        const truncateText = (text, maxLen) => {
+          return text.length > maxLen ? `${text.slice(0, maxLen)}...` : text;
         };
         const logFailureDetails = (options) => {
           const {
@@ -883,43 +1002,37 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           } = options;
           try {
             const category = testCase.id.split("_")[0];
-            const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
-              tools,
-              possibleAnswer,
-              restoredCalls
-            ) : buildParallelDiff(
-              tools,
-              possibleAnswer,
-              restoredCalls
-            );
-            caseLogs.push(
-              `[DEBUG-FAIL] ${JSON.stringify({
-                id: testCase.id,
-                message: checkerResult.error,
-                error_type: checkerResult.error_type,
-                expected,
-                actual,
-                diff
-              })}`
-            );
-            try {
-              const contextPayload = buildFailureContext({
-                testCase,
-                tools,
-                flatMessages,
-                mwOriginalText,
-                text,
-                finishReason,
-                mwParsedToolCalls,
+            const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
+            const gtArr = possibleAnswer.ground_truth;
+            const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
+            const rawModelText = extractRawModelText(mwOriginalText, text);
+            const lastUserQuery = extractLastUserQuery(flatMessages);
+            const failurePayload = {
+              id: testCase.id,
+              category: classifyFailureType({
+                errorType: checkerResult.error_type,
                 restoredCalls,
-                possibleAnswer
-              });
-              caseLogs.push(
-                `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
-              );
-            } catch {
-            }
-          } catch {
+                expectedCount,
+                diff
+              }),
+              message: checkerResult.error,
+              error_type: checkerResult.error_type,
+              expected,
+              actual,
+              diff,
+              context: {
+                raw_model_text: truncateText(rawModelText, 500),
+                raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
+                parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
+                expected_count: expectedCount,
+                actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
+                finish_reason: finishReason,
+                last_user_query: lastUserQuery,
+                tool_names: tools.map((t) => t.name)
+              }
+            };
+            caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
+          } catch (e) {
             caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
           }
         };
@@ -998,7 +1111,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           const flatMessages = flattenMessages(messages);
           const { transformedTools, nameMap } = buildTransformedTools(
             tools,
-            fixSchema
+            fixSchema2
           );
           const toolsMap = buildToolsMap(transformedTools);
           return { flatMessages, transformedTools, nameMap, toolsMap };
@@ -1020,6 +1133,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           const mwParsedToolCalls = parseDebugToolCalls(
             debugSummaryRef.toolCalls
           );
+          const possibleAnswer = possibleAnswersMap.get(testCase.id);
+          if (!possibleAnswer) {
+            throw new Error(`No possible answer for id: ${testCase.id}`);
+          }
+          if (process.env.DEBUG_PARSER_OUTPUT === "true") {
+            const groundTruth = possibleAnswer.ground_truth;
+            const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
+            console.log("\n========== BFCL CASE DEBUG ==========");
+            console.log(`Test Case: ${testCase.id}`);
+            console.log(`Expected count: ${groundTruth.length} call(s)`);
+            console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
+            console.log(expectedXML);
+            console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
+            console.log(mwOriginalText || text || "(empty)");
+            console.log(
+              "\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
+            );
+            console.log(JSON.stringify(toolCalls, null, 2));
+            console.log("======================================\n");
+          }
           logRawToolCalls({
             toolCalls,
             finishReason,
@@ -1027,10 +1160,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             testCaseId: testCase.id,
             caseLogs
           });
-          const possibleAnswer = possibleAnswersMap.get(testCase.id);
-          if (!possibleAnswer) {
-            throw new Error(`No possible answer for id: ${testCase.id}`);
-          }
           const restoredCalls = restoreToolCalls(
             toolCalls || [],
             nameMap,
@@ -1051,12 +1180,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             caseLogs
           });
         };
-        const runSingleCase = async (testCase) => {
+        const runSingleCase2 = async (testCase) => {
           const caseLogs = [];
           const { function: tools } = testCase;
-          const temp = config?.temperature;
+          const temp = config == null ? void 0 : config.temperature;
           const temperature = typeof temp === "number" ? temp : void 0;
-          const maxTok = config?.maxTokens;
+          const maxTok = config == null ? void 0 : config.maxTokens;
           const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
           try {
             const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
@@ -1082,15 +1211,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             });
           } catch (e) {
             caseLogs.push(
-              `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
+              `[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
             );
-            if (e?.stack) {
+            if (e == null ? void 0 : e.stack) {
               caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
             }
             return { valid: false, logs: caseLogs };
           }
         };
-        const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
+        const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
           const results = new Array(items.length);
           let idx = 0;
           const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
@@ -1106,10 +1235,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           await Promise.all(workers);
           return results;
         };
-        const resultsPerCase = await mapWithConcurrency(
+        const resultsPerCase = await mapWithConcurrency2(
           testCases,
           concurrency,
-          async (tc) => runSingleCase(tc)
+          async (tc) => runSingleCase2(tc)
         );
         correctCount = resultsPerCase.reduce(
           (acc, r) => acc + (r.valid ? 1 : 0),
@@ -1127,14 +1256,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           };
         }
         const score = correctCount / testCases.length;
+        const caseResults = resultsPerCase.map((r, i) => ({
+          id: testCases[i].id,
+          valid: r.valid
+        }));
         return {
           score,
           success: score > 0.95,
-          // High success threshold as requested
           metrics: {
             correct_count: correctCount,
             total_cases: testCases.length,
-            accuracy: score
+            accuracy: score,
+            case_results: JSON.stringify(caseResults)
           },
           logs
         };
@@ -1154,42 +1287,410 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
 }
 var bfclSimpleBenchmark = createBfclBenchmark(
   "bfcl-simple",
-  "BFCL Simple Function Calling",
-  "BFCL_v3_simple.jsonl",
-  "BFCL_v3_simple_possible_answer.jsonl"
+  "BFCL v4 Simple Function Calling",
+  "BFCL_v4_simple.jsonl",
+  "BFCL_v4_simple_possible_answer.jsonl"
 );
 var bfclParallelBenchmark = createBfclBenchmark(
   "bfcl-parallel",
-  "BFCL Parallel Function Calling",
-  "BFCL_v3_parallel.jsonl",
-  "BFCL_v3_parallel_possible_answer.jsonl"
+  "BFCL v4 Parallel Function Calling",
+  "BFCL_v4_parallel.jsonl",
+  "BFCL_v4_parallel_possible_answer.jsonl"
 );
 var bfclMultipleBenchmark = createBfclBenchmark(
   "bfcl-multiple",
-  "BFCL Multiple Function Calling",
-  "BFCL_v3_multiple.jsonl",
-  "BFCL_v3_multiple_possible_answer.jsonl"
+  "BFCL v4 Multiple Function Calling",
+  "BFCL_v4_multiple.jsonl",
+  "BFCL_v4_multiple_possible_answer.jsonl"
 );
 var bfclParallelMultipleBenchmark = createBfclBenchmark(
   "bfcl-parallel-multiple",
-  "BFCL Parallel & Multiple Function Calling",
-  "BFCL_v3_parallel_multiple.jsonl",
-  "BFCL_v3_parallel_multiple_possible_answer.jsonl"
+  "BFCL v4 Parallel & Multiple Function Calling",
+  "BFCL_v4_parallel_multiple.jsonl",
+  "BFCL_v4_parallel_multiple_possible_answer.jsonl"
 );
-// src/benchmarks/json-generation.ts
+// src/benchmarks/complex-func-bench.ts
 var import_node_fs3 = require("fs");
 var import_node_path3 = __toESM(require("path"), 1);
 var import_ai2 = require("ai");
+var LINE_SPLIT_REGEX2 = /\r?\n/;
+function standardizeString2(input) {
+  if (typeof input !== "string") {
+    return input;
+  }
+  return input.toLowerCase().trim();
+}
+function valuesMatch2(modelValue, expectedValue) {
+  if (modelValue === expectedValue) {
+    return true;
+  }
+  if (typeof modelValue === "string" && typeof expectedValue === "string") {
+    return standardizeString2(modelValue) === standardizeString2(expectedValue);
+  }
+  if (typeof modelValue === "number" && typeof expectedValue === "string") {
+    return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
+  }
+  if (typeof modelValue === "string" && typeof expectedValue === "number") {
+    return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
+  }
+  if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
+    try {
+      return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
+    } catch (e) {
+      return false;
+    }
+  }
+  return false;
+}
+function validateFunctionName(modelFuncName, expectedFuncName) {
+  if (modelFuncName !== expectedFuncName) {
+    return {
+      valid: false,
+      error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
+      error_type: "function_name_mismatch"
+    };
+  }
+  return { valid: true };
+}
+function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
+  for (const param of requiredParams) {
+    if (!(param in modelArgs) && param in expectedArgs) {
+      return {
+        valid: false,
+        error: `Missing required parameter: '${param}'`,
+        error_type: "missing_required_param"
+      };
+    }
+  }
+  return { valid: true };
+}
+function validateParamValues(expectedArgs, modelArgs, requiredParams) {
+  for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
+    if (!(paramName in modelArgs)) {
+      if (!requiredParams.includes(paramName)) {
+        continue;
+      }
+      return {
+        valid: false,
+        error: `Missing parameter: '${paramName}'`,
+        error_type: "missing_param"
+      };
+    }
+    const modelValue = modelArgs[paramName];
+    if (!valuesMatch2(modelValue, expectedValue)) {
+      return {
+        valid: false,
+        error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
+        error_type: "value_mismatch"
+      };
+    }
+  }
+  return { valid: true };
+}
+function checkFunctionCall(modelCall, expected, toolSpecs) {
+  var _a, _b, _c, _d;
+  const expectedFuncName = Object.keys(expected)[0];
+  const expectedArgs = expected[expectedFuncName];
+  const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
+  const modelArgs = (_b = modelCall.args) != null ? _b : {};
+  const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
+  if (!nameResult.valid) {
+    return nameResult;
+  }
+  const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
+  const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
+  const requiredResult = validateRequiredParams(
+    requiredParams,
+    modelArgs,
+    expectedArgs
+  );
+  if (!requiredResult.valid) {
+    return requiredResult;
+  }
+  return validateParamValues(expectedArgs, modelArgs, requiredParams);
+}
+function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
+  if (modelCalls.length !== expectedCalls.length) {
+    return {
+      valid: false,
+      error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
+      error_type: "wrong_call_count"
+    };
+  }
+  if (expectedCalls.length === 1) {
+    return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
+  }
+  const matchedIndices = /* @__PURE__ */ new Set();
+  for (const expected of expectedCalls) {
+    let foundMatch = false;
+    for (let i = 0; i < modelCalls.length; i++) {
+      if (matchedIndices.has(i)) {
+        continue;
+      }
+      const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
+      if (result.valid) {
+        matchedIndices.add(i);
+        foundMatch = true;
+        break;
+      }
+    }
+    if (!foundMatch) {
+      const expectedFuncName = Object.keys(expected)[0];
+      return {
+        valid: false,
+        error: `Could not find matching call for function '${expectedFuncName}'`,
+        error_type: "no_matching_call"
+      };
+    }
+  }
+  return { valid: true };
+}
+var fixSchemaType = (copy) => {
+  if (!copy.type) {
+    return;
+  }
+  if (copy.type === "dict") {
+    copy.type = "object";
+  }
+  if (copy.type === "tuple") {
+    copy.type = "array";
+  }
+  if (copy.type === "integer" || copy.type === "float") {
+    copy.type = "number";
+  }
+};
+var fixSchema = (schema) => {
+  if (!schema || typeof schema !== "object") {
+    return { type: "object", properties: {} };
+  }
+  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
+  if (!Array.isArray(copy)) {
+    fixSchemaType(copy);
+    if (copy.properties && typeof copy.properties === "object") {
+      for (const k of Object.keys(copy.properties)) {
+        copy.properties[k] = fixSchema(
+          copy.properties[k]
+        );
+      }
+    }
+    if (copy.items) {
+      copy.items = fixSchema(copy.items);
+    }
+  }
+  return copy;
+};
+function buildTools(tools) {
+  const nameMap = /* @__PURE__ */ new Map();
+  const transformedTools = tools.map((t) => {
+    const fixed = fixSchema(t.parameters);
+    const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
+    const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
+    nameMap.set(sanitized, t.name);
+    return {
+      type: "function",
+      name: sanitized,
+      description: t.description,
+      inputSchema
+    };
+  });
+  const toolsMap = Object.fromEntries(
+    transformedTools.map((t) => [
+      t.name,
+      (0, import_ai2.tool)({
+        description: typeof t.description === "string" ? t.description : void 0,
+        inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
+      })
+    ])
+  );
+  return { nameMap, toolsMap };
+}
+async function mapWithConcurrency(items, concurrencyLimit, mapper) {
+  const results = new Array(items.length);
+  let idx = 0;
+  const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
+    while (true) {
+      const current = idx;
+      idx += 1;
+      if (current >= items.length) {
+        break;
+      }
+      results[current] = await mapper(items[current]);
+    }
+  });
+  await Promise.all(workers);
+  return results;
+}
+async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
+  const caseLogs = [];
+  const { function: tools, question: messages } = testCase;
+  try {
+    const { nameMap, toolsMap } = buildTools(tools);
+    const debugSummaryRef = {};
+    const providerOptions = {
+      toolCallMiddleware: { debugSummary: debugSummaryRef }
+    };
+    const { toolCalls, finishReason } = await (0, import_ai2.generateText)({
+      model,
+      messages,
+      tools: toolsMap,
+      toolChoice: "auto",
+      providerOptions,
+      ...temperature !== void 0 ? { temperature } : {},
+      ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
+    });
+    const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
+      var _a, _b, _c, _d;
+      const rawName = (_a = c.toolName) != null ? _a : c.name;
+      const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
+      return {
+        toolName: originalName,
+        name: originalName,
+        args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
+      };
+    });
+    caseLogs.push(
+      `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
+    );
+    const possibleAnswer = possibleAnswersMap.get(testCase.id);
+    if (!possibleAnswer) {
+      throw new Error(`No possible answer for id: ${testCase.id}`);
+    }
+    const checkerResult = checkAllFunctionCalls(
+      restoredCalls,
+      possibleAnswer.ground_truth,
+      tools
+    );
+    if (checkerResult.valid) {
+      caseLogs.push(`[PASS] ${testCase.id}`);
+      return { valid: true, logs: caseLogs };
+    }
+    caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
+    return { valid: false, logs: caseLogs };
+  } catch (e) {
+    caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
+    return { valid: false, logs: caseLogs };
+  }
+}
+async function loadTestData(dataPath, testDataFile) {
+  const testCasesJson = await import_node_fs3.promises.readFile(
+    import_node_path3.default.join(dataPath, testDataFile),
+    "utf-8"
+  );
+  return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+}
+async function loadAnswerData(dataPath, answerDataFile) {
+  const answersJson = await import_node_fs3.promises.readFile(
+    import_node_path3.default.join(dataPath, answerDataFile),
+    "utf-8"
+  );
+  const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+  return new Map(answers.map((ans) => [ans.id, ans]));
+}
+function getConfigValues(config) {
+  const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
+  const limit = limitEnv ? Number(limitEnv) : void 0;
+  const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
+  const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
+  const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
+  const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
+  return { limit, concurrency, temperature, maxTokens };
+}
+function aggregateResults(resultsPerCase, testCases) {
+  const logs = [];
+  const correctCount = resultsPerCase.reduce(
+    (acc, r) => acc + (r.valid ? 1 : 0),
+    0
+  );
+  for (const r of resultsPerCase) {
+    logs.push(...r.logs);
+  }
+  if (testCases.length === 0) {
+    return {
+      score: 0,
+      success: false,
+      metrics: {},
+      logs: ["No test cases found."]
+    };
+  }
+  const score = correctCount / testCases.length;
+  return {
+    score,
+    success: score > 0.5,
+    metrics: {
+      correct_count: correctCount,
+      total_cases: testCases.length,
+      accuracy: score
+    },
+    logs
+  };
+}
+function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
+  return {
+    name,
+    version: "1.0.0",
+    description,
+    async run(model, config) {
+      var _a;
+      const logs = [];
+      try {
+        const dataPath = resolveDataDir();
+        logs.push(`[INFO] Using data dir: ${dataPath}`);
+        let testCases = await loadTestData(dataPath, testDataFile);
+        const possibleAnswersMap = await loadAnswerData(
+          dataPath,
+          answerDataFile
+        );
+        const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
+        if (limit && Number.isFinite(limit) && limit > 0) {
+          testCases = testCases.slice(0, limit);
+          logs.push(`[INFO] Limiting test cases to ${limit}`);
+        }
+        logs.push(
+          `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
+        );
+        const resultsPerCase = await mapWithConcurrency(
+          testCases,
+          concurrency,
+          (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
+        );
+        const result = aggregateResults(resultsPerCase, testCases);
+        result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
+        return result;
+      } catch (e) {
+        return {
+          score: 0,
+          success: false,
+          metrics: {},
+          error: e,
+          logs: [
+            `[FATAL] Failed to run benchmark ${name}: ${e.message}`
+          ]
+        };
+      }
+    }
+  };
+}
+var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
+  "complex-func-bench",
+  "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
+  "ComplexFuncBench.jsonl",
+  "ComplexFuncBench_possible_answer.jsonl"
+);
+// src/benchmarks/json-generation.ts
+var import_node_fs4 = require("fs");
+var import_node_path4 = __toESM(require("path"), 1);
+var import_ai3 = require("ai");
 var import_ajv = __toESM(require("ajv"), 1);
 var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
 var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
 var NEWLINE_REGEX = /\r?\n/;
-var LINE_SPLIT_REGEX2 = /\r?\n/;
+var LINE_SPLIT_REGEX3 = /\r?\n/;
 function tryDirectParse(text) {
   try {
     return JSON.parse(text);
-  } catch {
+  } catch (e) {
     return;
   }
 }
@@ -1201,7 +1702,7 @@ function tryCodeFenceParse(text) {
   const inner = fenceMatch[1].trim();
   try {
     return JSON.parse(inner);
-  } catch {
+  } catch (e) {
     return;
   }
 }
@@ -1226,7 +1727,7 @@ function tryBracketScan(text) {
       const candidate = text.slice(start, i + 1);
       try {
         return JSON.parse(candidate);
-      } catch {
+      } catch (e) {
         return;
       }
     }
@@ -1274,12 +1775,12 @@ function subsetMatch(expected, actual) {
 async function loadDatasets() {
   try {
     const dataDir = resolveDataDir();
-    const testsJsonl = await import_node_fs3.promises.readFile(
-      import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
+    const testsJsonl = await import_node_fs4.promises.readFile(
+      import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
       "utf-8"
     );
-    const expectedJsonl = await import_node_fs3.promises.readFile(
-      import_node_path3.default.join(dataDir, "json_generation_expected.jsonl"),
+    const expectedJsonl = await import_node_fs4.promises.readFile(
+      import_node_path4.default.join(dataDir, "json_generation_expected.jsonl"),
       "utf-8"
     );
     const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -1335,10 +1836,11 @@ function validateTestCase(tc, parsed, context) {
   return { valid, valuesOk, parsed };
 }
 async function processTestCase(tc, context) {
+  var _a;
   const messages = buildMessages(tc);
-  const temp = context.config?.temperature;
+  const temp = (_a = context.config) == null ? void 0 : _a.temperature;
   const temperature = typeof temp === "number" ? temp : void 0;
-  const { text } = await (0, import_ai2.generateText)({
+  const { text } = await (0, import_ai3.generateText)({
     model: context.model,
     messages,
     ...temperature !== void 0 ? { temperature } : {}
@@ -1346,7 +1848,7 @@ async function processTestCase(tc, context) {
   let parsed;
   try {
     parsed = extractFirstJsonBlock(text);
-  } catch {
+  } catch (e) {
   }
   if (parsed === void 0) {
     context.validation.logs.push(
@@ -1440,21 +1942,22 @@ function buildBenchmarkResult(total, counts, logs) {
 async function loadSchemaOnlyTests() {
   try {
     const dataDir = resolveDataDir();
-    const testsJsonl = await import_node_fs3.promises.readFile(
-      import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
+    const testsJsonl = await import_node_fs4.promises.readFile(
+      import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
       "utf-8"
     );
-    const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+    const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
     return { tests };
   } catch (e) {
     return { tests: [], error: e };
   }
 }
 async function processSchemaOnlyTestCase(tc, context) {
+  var _a;
   const messages = buildMessages(tc);
-  const temp = context.config?.temperature;
+  const temp = (_a = context.config) == null ? void 0 : _a.temperature;
   const temperature = typeof temp === "number" ? temp : void 0;
-  const { text } = await (0, import_ai2.generateText)({
+  const { text } = await (0, import_ai3.generateText)({
     model: context.model,
     messages,
     ...temperature !== void 0 ? { temperature } : {}
@@ -1462,7 +1965,7 @@ async function processSchemaOnlyTestCase(tc, context) {
   let parsed;
   try {
     parsed = extractFirstJsonBlock(text);
-  } catch {
+  } catch (e) {
   }
   if (parsed === void 0) {
     context.logs.push(
@@ -1531,38 +2034,144 @@ var jsonGenerationSchemaOnlyBenchmark = {
   }
 };
+// src/evaluate.ts
+var import_middleware = require("@ai-sdk-tool/middleware");
+var import_ai4 = require("ai");
 // src/reporters/console.ts
 var colors = {
   reset: "\x1B[0m",
+  bold: "\x1B[1m",
   green: "\x1B[32m",
   red: "\x1B[31m",
   yellow: "\x1B[33m",
   cyan: "\x1B[36m",
   magenta: "\x1B[35m",
-  gray: "\x1B[90m"
+  gray: "\x1B[90m",
+  white: "\x1B[37m"
 };
+var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
+function formatDiff(diff) {
+  if (!diff || diff.length === 0) {
+    return "";
+  }
+  return diff.slice(0, 8).map((line) => {
+    if (line.startsWith("-")) {
+      return `${colors.red}${line}${colors.reset}`;
+    }
+    if (line.startsWith("+")) {
+      return `${colors.green}${line}${colors.reset}`;
+    }
+    if (line.startsWith("@@")) {
+      return `${colors.cyan}${line}${colors.reset}`;
+    }
+    return line;
+  }).join("\n      ");
+}
+function parseFailures(logs) {
+  const failures = [];
+  for (const log of logs) {
+    if (!DEBUG_FAIL_REGEX.test(log)) {
+      continue;
+    }
+    try {
+      const jsonStr = log.replace(DEBUG_FAIL_REGEX, "");
+      const parsed = JSON.parse(jsonStr);
+      failures.push(parsed);
+    } catch (e) {
+    }
+  }
+  return failures;
+}
+function groupFailuresByCategory(failures) {
+  const groups = /* @__PURE__ */ new Map();
+  for (const failure of failures) {
+    const category = failure.category || "OTHER";
+    const existing = groups.get(category);
+    if (existing) {
+      existing.push(failure);
+    } else {
+      groups.set(category, [failure]);
+    }
+  }
+  return groups;
+}
+function printCompactFailure(failure) {
+  var _a;
+  console.log(
+    `
+    ${colors.red}${failure.id}${colors.reset} [${colors.yellow}${failure.category || "OTHER"}${colors.reset}]`
+  );
+  if (failure.message) {
+    console.log(`      ${failure.message}`);
+  }
+  if (failure.diff && failure.diff.length > 0) {
+    console.log(`      ${formatDiff(failure.diff)}`);
+  }
+  if (((_a = failure.context) == null ? void 0 : _a.raw_model_text) && failure.category === "PARSE_FAILURE") {
+    const text = failure.context.raw_model_text;
+    const truncated = text.length > 80 ? `${text.slice(0, 80)}...` : text;
+    console.log(`      ${colors.gray}Model: "${truncated}"${colors.reset}`);
+  }
+}
+function printFailureSummary(failures) {
+  const groups = groupFailuresByCategory(failures);
+  const sorted = [...groups.entries()].sort(
+    (a, b) => b[1].length - a[1].length
+  );
+  console.log(`
+    ${colors.bold}Failures by category:${colors.reset}`);
+  for (const [category, categoryFailures] of sorted) {
+    console.log(
+      `      ${colors.yellow}${category}${colors.reset}: ${categoryFailures.length}`
+    );
+  }
+  const maxToShow = 5;
+  const shown = failures.slice(0, maxToShow);
+  for (const failure of shown) {
+    printCompactFailure(failure);
+  }
+  if (failures.length > maxToShow) {
+    const remaining = failures.length - maxToShow;
+    const remainingIds = failures.slice(maxToShow).map((f) => f.id);
+    const idPreview = remainingIds.slice(0, 5).join(", ");
+    const more = remainingIds.length > 5 ? "..." : "";
+    console.log(
+      `
+    ${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
+    );
+  }
+}
 function printResult(result) {
   const { model, modelKey, benchmark, result: benchmarkResult } = result;
-  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
+  const passed = benchmarkResult.metrics.correct_count;
+  const total = benchmarkResult.metrics.total_cases;
+  const scorePercent = (benchmarkResult.score * 100).toFixed(1);
+  const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
+  const statusColor = benchmarkResult.success ? colors.green : colors.red;
   console.log(
     `
  ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
   );
   console.log(
-    `  \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
+    `  \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
   );
-  const metrics = Object.entries(benchmarkResult.metrics);
-  if (metrics.length > 0) {
-    console.log("    Metrics:");
-    for (const [key, value] of metrics) {
-      console.log(`      - ${key}: ${value}`);
-    }
-  }
   if (benchmarkResult.error) {
     console.log(
       `    ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
     );
   }
+  if (!benchmarkResult.success && benchmarkResult.logs) {
+    const failures = parseFailures(benchmarkResult.logs);
+    if (failures.length > 0) {
+      printFailureSummary(failures);
+    } else if (benchmarkResult.logs.length > 0) {
+      console.log(`    ${colors.gray}Raw Logs (Sample):${colors.reset}`);
+      for (const l of benchmarkResult.logs.slice(0, 5)) {
+        console.log(`      ${l}`);
+      }
+    }
+  }
 }
 function consoleReporter(results) {
   console.log("\n--- \u{1F4CA} Evaluation Report ---");
@@ -1617,14 +2226,14 @@ function hasFunctionNameIssue(diff) {
   );
 }
 function suggestFunctionNameFix(expected, actual, suggestions) {
-  const expectedName = expected?.function;
-  const actualName = actual?.function;
+  const expectedName = expected == null ? void 0 : expected.function;
+  const actualName = actual == null ? void 0 : actual.function;
   if (expectedName && actualName && expectedName !== actualName) {
     suggestions.push(
       `Call the function '${expectedName}' instead of '${actualName}'.`
     );
   }
-  if (Array.isArray(expected?.functions)) {
+  if (Array.isArray(expected == null ? void 0 : expected.functions)) {
     suggestions.push(
       `Ensure tool calls include: ${expected.functions.join(", ")}.`
     );
@@ -1679,7 +2288,7 @@ function suggestFromErrorType(error_type, suggestions) {
 }
 function suggestFixFromDiff(parsed) {
   const suggestions = [];
-  const { error_type, expected, actual, diff } = parsed ?? {};
+  const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
   if (!Array.isArray(diff)) {
     if (suggestions.length === 0 && typeof error_type === "string") {
       suggestFromErrorType(error_type, suggestions);
@@ -1704,15 +2313,16 @@ function suggestFixFromDiff(parsed) {
   return uniqueLines(suggestions);
 }
 function getTestIdFromLogLine(line) {
+  var _a, _b;
   if (line.startsWith("[FAIL]")) {
     const m = line.match(FAIL_ID_REGEX);
-    return m?.[1];
+    return m == null ? void 0 : m[1];
   }
   if (line.startsWith("[DEBUG-FAIL]")) {
     try {
       const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
-      return String(parsed?.id ?? "");
-    } catch {
+      return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
+    } catch (e) {
     }
   }
   if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
@@ -1720,18 +2330,19 @@ function getTestIdFromLogLine(line) {
       const parsed = JSON.parse(
         line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
       );
-      return String(parsed?.id ?? "");
-    } catch {
+      return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
+    } catch (e) {
     }
   }
   return;
 }
 function groupLogsByTestId(failLogs) {
+  var _a;
   const byId = /* @__PURE__ */ new Map();
   for (const line of failLogs) {
     const id = getTestIdFromLogLine(line);
-    const key = id ?? "__general__";
-    const arr = byId.get(key) ?? [];
+    const key = id != null ? id : "__general__";
+    const arr = (_a = byId.get(key)) != null ? _a : [];
     arr.push(line);
     byId.set(key, arr);
   }
@@ -1743,10 +2354,10 @@ function collectDebugIds(lines) {
     if (l.startsWith("[DEBUG-FAIL]")) {
       try {
         const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
-        if (parsed?.id) {
+        if (parsed == null ? void 0 : parsed.id) {
           debugIds.add(String(parsed.id));
         }
-      } catch {
+      } catch (e) {
       }
     }
   }
@@ -1782,7 +2393,7 @@ function displayDebugFailLine(line) {
         console.log(`            \u2022 ${s}`);
       }
     }
-  } catch {
+  } catch (e) {
     console.log(`        ${line}`);
   }
 }
@@ -1826,14 +2437,14 @@ function displayDebugFailContextLine(line) {
     const ctx = JSON.parse(payload);
     console.log(`        ${colors2.gray}context:${colors2.reset}`);
     displayContextInfo(ctx);
-  } catch {
+  } catch (e) {
     console.log(`        ${line}`);
   }
 }
 function displayLogLine(line, debugIds) {
   if (line.startsWith("[FAIL]")) {
     const m = line.match(FAIL_ID_REGEX);
-    const failId = m?.[1];
+    const failId = m == null ? void 0 : m[1];
     if (failId && debugIds.has(failId)) {
       return;
     }
@@ -1903,26 +2514,350 @@ function displayResultHeader(r) {
   );
 }
 function consoleDebugReporter(results) {
+  var _a;
   console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
   for (const r of results) {
     displayResultHeader(r);
     displayMetrics(Object.entries(r.result.metrics));
-    if (r.result.logs?.length) {
+    if ((_a = r.result.logs) == null ? void 0 : _a.length) {
       displayResultLogs(r.result.logs);
     }
   }
   console.log("\n------------------------------------\n");
 }
+// src/reporters/console.summary.ts
+var colors3 = {
+  reset: "\x1B[0m",
+  bold: "\x1B[1m",
+  dim: "\x1B[2m",
+  green: "\x1B[32m",
+  red: "\x1B[31m",
+  yellow: "\x1B[33m",
+  cyan: "\x1B[36m",
+  magenta: "\x1B[35m",
+  gray: "\x1B[90m",
+  white: "\x1B[37m"
+};
+var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
+var ID_NUM_REGEX = /_(\d+)$/;
+var REASONING_TAG = "think";
+var MAX_FAILURES_TO_DISPLAY = 5;
+var CATEGORY_DESCRIPTIONS = {
+  PARSE_FAILURE: {
+    label: "Parse Failure",
+    description: "No tool calls extracted from model output",
+    hint: "Model may have responded in text instead of tool format"
+  },
+  PARTIAL_CALLS: {
+    label: "Partial Calls",
+    description: "Some expected tool calls missing",
+    hint: "Model stopped early or missed some tools"
+  },
+  EXTRA_CALLS: {
+    label: "Extra Calls",
+    description: "More tool calls than expected",
+    hint: "Model called tools that weren't needed"
+  },
+  PARAM_VALUE_PERCENT: {
+    label: "Param Value (Percent)",
+    description: "Percentage sent as integer instead of decimal",
+    hint: "e.g., 5 instead of 0.05 for 5%"
+  },
+  PARAM_VALUE_MISMATCH: {
+    label: "Param Value Mismatch",
+    description: "Parameter values don't match expected"
+  },
+  WRONG_FUNCTION: {
+    label: "Wrong Function",
+    description: "Called wrong function name"
+  },
+  MISSING_PARAMS: {
+    label: "Missing Params",
+    description: "Required parameters not provided"
+  },
+  UNEXPECTED_PARAMS: {
+    label: "Unexpected Params",
+    description: "Extra parameters that shouldn't be there"
+  },
+  NO_MATCH: {
+    label: "No Match",
+    description: "Function called but couldn't match to expected",
+    hint: "Parameters may be correct but don't match any expected combination"
+  },
+  OTHER: {
+    label: "Other",
+    description: "Uncategorized failure"
+  }
+};
+function parseFailureLogs(logs) {
+  return logs.filter((log) => DEBUG_FAIL_REGEX2.test(log)).map((log) => {
+    try {
+      const jsonStr = log.replace(DEBUG_FAIL_REGEX2, "");
+      return JSON.parse(jsonStr);
+    } catch (e) {
+      return null;
+    }
+  }).filter((parsed) => parsed !== null);
+}
+function groupByCategory(failures) {
+  const groups = /* @__PURE__ */ new Map();
+  for (const failure of failures) {
+    const category = failure.category || "OTHER";
+    const existing = groups.get(category);
+    if (existing) {
+      existing.failures.push(failure);
+    } else {
+      groups.set(category, { failures: [failure] });
+    }
+  }
+  return groups;
+}
+function extractParamNames(failures) {
+  const paramNames = /* @__PURE__ */ new Set();
+  for (const f of failures) {
+    if (!f.diff) {
+      continue;
+    }
+    for (const d of f.diff) {
+      if (d.startsWith("@@ param ")) {
+        paramNames.add(d.replace("@@ param ", ""));
+      }
+    }
+  }
+  return paramNames;
+}
+function extractFinishReasons(failures) {
+  var _a;
+  const finishReasons = /* @__PURE__ */ new Set();
+  for (const f of failures) {
+    if ((_a = f.context) == null ? void 0 : _a.finish_reason) {
+      finishReasons.add(String(f.context.finish_reason));
+    }
+  }
+  return finishReasons;
+}
+function detectPatterns(group) {
+  const { failures } = group;
+  if (failures.length < 2) {
+    return;
+  }
+  const firstCategory = failures[0].category;
+  if (firstCategory === "PARAM_VALUE_PERCENT") {
+    const paramNames = extractParamNames(failures);
+    if (paramNames.size > 0) {
+      group.pattern = `Affected params: ${[...paramNames].join(", ")}`;
+    }
+  }
+  if (firstCategory === "PARSE_FAILURE") {
+    const finishReasons = extractFinishReasons(failures);
+    if (finishReasons.size === 1) {
+      group.pattern = `All finished with: ${[...finishReasons][0]}`;
+    }
+  }
+}
+function getLineColor(line) {
+  if (line.startsWith("+")) {
+    return colors3.green;
+  }
+  if (line.startsWith("-")) {
+    return colors3.red;
+  }
+  if (line.startsWith("@@")) {
+    return colors3.cyan;
+  }
+  return colors3.white;
+}
+function formatFunctions(funcs) {
+  if (Array.isArray(funcs)) {
+    return funcs.join(", ");
+  }
+  return String(funcs);
+}
+function printExpectedActual(failure) {
+  if (failure.expected) {
+    const expFuncs = failure.expected.functions || failure.expected.function;
+    if (expFuncs) {
+      console.log(
+        `    ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expFuncs)}`
+      );
+    }
+  }
+  if (failure.actual) {
+    const actFuncs = failure.actual.functions || failure.actual.function;
+    if (actFuncs) {
+      const isEmpty = Array.isArray(actFuncs) && actFuncs.length === 0;
+      const color = isEmpty ? colors3.red : colors3.white;
+      const text = isEmpty ? "(none)" : formatFunctions(actFuncs);
+      console.log(
+        `    ${colors3.gray}Actual:${colors3.reset}   ${color}${text}${colors3.reset}`
+      );
+    }
+  }
+}
+function printDiff(diff) {
+  console.log(`    ${colors3.gray}Diff:${colors3.reset}`);
+  for (const line of diff.slice(0, MAX_FAILURES_TO_DISPLAY)) {
+    const lineColor = getLineColor(line);
+    console.log(`      ${lineColor}${line}${colors3.reset}`);
+  }
+}
+function removeReasoningTags(text) {
+  const openTag = `<${REASONING_TAG}>`;
+  const closeTag = `</${REASONING_TAG}>`;
+  const closedTagPattern = new RegExp(
+    `${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*?${closeTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`,
+    "g"
+  );
+  const unclosedTagPattern = new RegExp(
+    `${openTag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[\\s\\S]*`,
+    "g"
+  );
+  let result = text.replace(closedTagPattern, "");
+  result = result.replace(unclosedTagPattern, "");
+  return result.trim();
+}
+function printModelOutput(failure, category) {
+  var _a, _b;
+  if (category !== "PARSE_FAILURE") {
+    return;
+  }
+  const rawText = ((_a = failure.context) == null ? void 0 : _a.raw_model_text_full) || ((_b = failure.context) == null ? void 0 : _b.raw_model_text) || "";
+  const cleanedText = removeReasoningTags(rawText);
+  if (cleanedText) {
+    console.log(
+      `    ${colors3.gray}Model said:${colors3.reset} "${colors3.dim}${cleanedText}${colors3.reset}"`
+    );
+  } else {
+    console.log(
+      `    ${colors3.gray}Model said:${colors3.reset} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`
+    );
+  }
+}
+function shouldShowDiffByDefault(category) {
+  return category === "PARAM_VALUE_MISMATCH" || category === "PARAM_VALUE_PERCENT";
+}
+function printSingleFailure(failure, category, verbose) {
+  console.log(`
+  ${colors3.bold}${failure.id}${colors3.reset}`);
+  const hasDiff = failure.diff && failure.diff.length > 0;
+  const showDiffPrimarily = shouldShowDiffByDefault(category) && hasDiff;
+  if (showDiffPrimarily) {
+    printDiff(failure.diff);
+  } else {
+    printExpectedActual(failure);
+    if (hasDiff && verbose) {
+      printDiff(failure.diff);
+    }
+  }
+  printModelOutput(failure, category);
+}
+var MAX_SAMPLE_FAILURES = 2;
+function printRemainingIds(failures) {
+  const remainingIds = failures.slice(MAX_SAMPLE_FAILURES).map((f) => f.id);
+  const idNums = remainingIds.map((id) => {
+    const match = id.match(ID_NUM_REGEX);
+    return match ? match[1] : id;
+  });
+  console.log(
+    `
+  ${colors3.dim}+${failures.length - MAX_SAMPLE_FAILURES} more: ${idNums.join(", ")}${colors3.reset}`
+  );
+}
+function printCategoryHeader(info, count) {
+  console.log(
+    `
+${colors3.cyan}\u2500\u2500\u2500\u2500\u2500 ${info.label} (${count}) \u2500\u2500\u2500\u2500\u2500${colors3.reset}`
+  );
+  console.log(`${colors3.dim}${info.description}${colors3.reset}`);
+}
+function printCategoryDetails(category, group, verbose) {
+  const info = CATEGORY_DESCRIPTIONS[category] || CATEGORY_DESCRIPTIONS.OTHER;
+  const { failures } = group;
+  printCategoryHeader(info, failures.length);
+  if (group.pattern) {
+    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
+  }
+  if (info.hint) {
+    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
+  }
+  const samplesToShow = verbose ? failures : failures.slice(0, 2);
+  for (const failure of samplesToShow) {
+    printSingleFailure(failure, category, verbose);
+  }
+  if (!verbose && failures.length > 2) {
+    printRemainingIds(failures);
+  }
+}
+function printResultHeader(result) {
+  const { model, modelKey, benchmark, result: benchmarkResult } = result;
+  const passed = benchmarkResult.metrics.correct_count;
+  const total = benchmarkResult.metrics.total_cases;
+  const scorePercent = (benchmarkResult.score * 100).toFixed(1);
+  const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
+  const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
+  const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
+  const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
+  const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
+  console.log(
+    `
+${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
+  );
+  console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
+}
+function printResultSummary(result, verbose) {
+  const { result: benchmarkResult } = result;
+  printResultHeader(result);
+  if (!benchmarkResult.logs || benchmarkResult.logs.length === 0) {
+    return;
+  }
+  const failures = parseFailureLogs(benchmarkResult.logs);
+  if (failures.length === 0) {
+    if (!benchmarkResult.success) {
+      console.log(
+        `${colors3.yellow}No structured failure data available${colors3.reset}`
+      );
+    }
+    return;
+  }
+  const groups = groupByCategory(failures);
+  for (const group of groups.values()) {
+    detectPatterns(group);
+  }
+  const sortedCategories = [...groups.entries()].sort(
+    (a, b) => b[1].failures.length - a[1].failures.length
+  );
+  for (const [cat, group] of sortedCategories) {
+    printCategoryDetails(cat, group, verbose);
+  }
+}
+function consoleSummaryReporter(results) {
+  const verbose = process.env.VERBOSE === "true";
+  console.log(`
+${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
+  console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
+  for (const result of results) {
+    printResultSummary(result, verbose);
+  }
+  console.log(
+    `
+${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
+`
+  );
+}
 // src/reporters/json.ts
 function jsonReporter(results) {
-  const serializableResults = results.map((r) => ({
-    ...r,
-    result: {
-      ...r.result,
-      error: r.result.error?.message
-    }
-  }));
+  const serializableResults = results.map((r) => {
+    var _a;
+    return {
+      ...r,
+      result: {
+        ...r.result,
+        error: (_a = r.result.error) == null ? void 0 : _a.message
+      }
+    };
+  });
   console.log(JSON.stringify(serializableResults, null, 2));
 }
@@ -1930,60 +2865,56 @@ function jsonReporter(results) {
 var reporters = {
   console: consoleReporter,
   json: jsonReporter,
-  "console.debug": consoleDebugReporter
+  "console.debug": consoleDebugReporter,
+  "console.summary": consoleSummaryReporter
 };
 // src/evaluate.ts
-async function runSingleBenchmark(model, benchmark, modelKey, config) {
-  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
-  try {
-    console.log(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
-    );
-    const result = await benchmark.run(model, config);
-    console.log(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
-    );
-    return {
-      model: modelId,
-      modelKey,
-      benchmark: benchmark.name,
-      result
-    };
-  } catch (error) {
-    console.error(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
-      error
-    );
-    return {
-      model: modelId,
-      modelKey,
-      benchmark: benchmark.name,
-      result: {
-        score: 0,
-        success: false,
-        metrics: {},
-        error: error instanceof Error ? error : new Error(String(error))
-      }
-    };
+function isModelConfig(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const obj = value;
+  if (!("model" in obj)) {
+    return false;
+  }
+  const model = obj.model;
+  if (typeof model !== "object" || model === null) {
+    return false;
   }
+  return "modelId" in model;
+}
+function isLanguageModel(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const obj = value;
+  return "modelId" in obj && typeof obj.modelId === "string";
+}
+function extractModelAndMiddleware(input) {
+  if (isModelConfig(input)) {
+    return [input.model, input.middleware];
+  }
+  return [input, void 0];
 }
 function normalizeModels(models) {
-  const modelEntries = [];
+  const entries = [];
   if (Array.isArray(models)) {
     for (const m of models) {
-      modelEntries.push([void 0, m]);
+      const [model, middleware] = extractModelAndMiddleware(m);
+      entries.push([void 0, model, middleware]);
     }
-  } else if (typeof models === "object" && models !== null && "modelId" in models) {
-    modelEntries.push([void 0, models]);
+  } else if (isModelConfig(models)) {
+    entries.push([void 0, models.model, models.middleware]);
+  } else if (isLanguageModel(models)) {
+    entries.push([void 0, models, void 0]);
   } else {
-    for (const [key, m] of Object.entries(
-      models
-    )) {
-      modelEntries.push([key, m]);
+    for (const [key, m] of Object.entries(models)) {
+      const [model, middleware] = extractModelAndMiddleware(m);
+      entries.push([key, model, middleware]);
     }
   }
-  return modelEntries;
+  return entries;
 }
 function buildConfig(temperature, maxTokens) {
   const config = {};
@@ -2004,21 +2935,90 @@ function executeReporter(reporter, results) {
     reporters.console(results);
   }
 }
+function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
+  var _a, _b;
+  const cacheEnabled = (cacheOptions == null ? void 0 : cacheOptions.enabled) === true;
+  if (!(cacheEnabled || userMiddleware)) {
+    return baseModel;
+  }
+  const cacheMiddleware = cacheEnabled ? (0, import_middleware.createDiskCacheMiddleware)({
+    cacheDir: (_a = cacheOptions.cacheDir) != null ? _a : ".ai-cache",
+    enabled: true,
+    debug: (_b = cacheOptions.debug) != null ? _b : false
+  }) : null;
+  const middlewares = [];
+  if (userMiddleware) {
+    if (Array.isArray(userMiddleware)) {
+      middlewares.push(...userMiddleware);
+    } else {
+      middlewares.push(userMiddleware);
+    }
+  }
+  if (cacheMiddleware) {
+    middlewares.push(cacheMiddleware);
+  }
+  if (middlewares.length === 0) {
+    return baseModel;
+  }
+  return (0, import_ai4.wrapLanguageModel)({
+    // biome-ignore lint/suspicious/noExplicitAny: AI SDK v5/v6 type mismatch
+    model: baseModel,
+    middleware: middlewares.length === 1 ? middlewares[0] : middlewares
+  });
+}
+async function runSingleBenchmark(model, benchmark, modelKey, config) {
+  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
+  const prefix = `[${modelId}]${modelKey ? ` (${modelKey})` : ""} ${benchmark.name}`;
+  try {
+    process.stdout.write(`${prefix}: ...`);
+    const result = await benchmark.run(model, config);
+    const scoreDisplay = result.score.toFixed(2);
+    process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}
+`);
+    return {
+      model: modelId,
+      modelKey,
+      benchmark: benchmark.name,
+      result
+    };
+  } catch (error) {
+    process.stdout.write(`\r${prefix}: .... Score: ERROR
+`);
+    console.error(error);
+    return {
+      model: modelId,
+      modelKey,
+      benchmark: benchmark.name,
+      result: {
+        score: 0,
+        success: false,
+        metrics: {},
+        error: error instanceof Error ? error : new Error(String(error))
+      }
+    };
+  }
+}
 async function evaluate(options) {
   const {
     models,
     benchmarks,
     reporter = "console",
     temperature,
-    maxTokens
+    maxTokens,
+    cache
   } = options;
   const modelEntries = normalizeModels(models);
   const config = buildConfig(temperature, maxTokens);
   const allResults = [];
-  for (const [modelKey, model] of modelEntries) {
+  for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
+    const effectiveModel = buildEffectiveModel(
+      baseModel,
+      userMiddleware,
+      cache
+    );
     for (const benchmark of benchmarks) {
       const evaluationResult = await runSingleBenchmark(
-        model,
+        effectiveModel,
         benchmark,
         modelKey,
         config
@@ -2035,6 +3035,7 @@ async function evaluate(options) {
   bfclParallelBenchmark,
   bfclParallelMultipleBenchmark,
   bfclSimpleBenchmark,
+  complexFuncBenchBenchmark,
   evaluate,
   jsonGenerationBenchmark,
   jsonGenerationSchemaOnlyBenchmark