npm - @ai-sdk-tool/eval - Versions diffs - 0.1.8 → 1.0.0-canary.0 - Mend

@ai-sdk-tool/eval 0.1.8 → 1.0.0-canary.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/dist/index.js CHANGED Viewed

@@ -1,424 +1,18 @@
-// src/reporters/console.ts
-var colors = {
-  reset: "\x1B[0m",
-  green: "\x1B[32m",
-  red: "\x1B[31m",
-  yellow: "\x1B[33m",
-  cyan: "\x1B[36m",
-  magenta: "\x1B[35m",
-  gray: "\x1B[90m"
-};
-function printResult(result) {
-  const { model, modelKey, benchmark, result: benchmarkResult } = result;
-  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
-  console.log(
-    `
- ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
-  );
-  console.log(
-    `  \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
-  );
-  const metrics = Object.entries(benchmarkResult.metrics);
-  if (metrics.length > 0) {
-    console.log("    Metrics:");
-    for (const [key, value] of metrics) {
-      console.log(`      - ${key}: ${value}`);
-    }
-  }
-  if (benchmarkResult.error) {
-    console.log(
-      `    ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
-    );
-  }
-}
-function consoleReporter(results) {
-  console.log("\n--- \u{1F4CA} Evaluation Report ---");
-  for (const result of results) {
-    printResult(result);
-  }
-  console.log("\n---------------------------\n");
-}
-// src/reporters/console.debug.ts
-var colors2 = {
-  reset: "\x1B[0m",
-  green: "\x1B[32m",
-  red: "\x1B[31m",
-  yellow: "\x1B[33m",
-  cyan: "\x1B[36m",
-  magenta: "\x1B[35m",
-  gray: "\x1B[90m",
-  bold: "\x1B[1m",
-  underline: "\x1B[4m"
-};
-function colorizeDiffLine(line) {
-  if (line.startsWith("+")) return `${colors2.green}${line}${colors2.reset}`;
-  if (line.startsWith("-")) return `${colors2.red}${line}${colors2.reset}`;
-  if (line.startsWith("@"))
-    return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
-  return line;
-}
-function uniqueLines(lines) {
-  const seen = /* @__PURE__ */ new Set();
-  const out = [];
-  for (const l of lines) {
-    if (seen.has(l)) continue;
-    seen.add(l);
-    out.push(l);
-  }
-  return out;
-}
-function suggestFixFromDiff(parsed) {
-  const suggestions = [];
-  const { error_type, expected, actual, diff } = parsed ?? {};
-  if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
-    const expectedName = expected?.function;
-    const actualName = actual?.function;
-    if (expectedName && actualName && expectedName !== actualName) {
-      suggestions.push(
-        `Call the function '${expectedName}' instead of '${actualName}'.`
-      );
-    }
-    if (Array.isArray(expected?.functions)) {
-      suggestions.push(
-        `Ensure tool calls include: ${expected.functions.join(", ")}.`
-      );
-    }
-  }
-  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
-    const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
-    if (missing.length) {
-      suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
-    }
-  }
-  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
-    const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
-    if (extras.length) {
-      suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
-    }
-  }
-  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
-    const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
-    for (const param of targets) {
-      const allowedOneOfLine = diff.find(
-        (d) => String(d).startsWith("- expected one of:")
-      );
-      const allowedSingleLine = diff.find(
-        (d) => String(d).startsWith("- expected:")
-      );
-      if (allowedSingleLine) {
-        const value = allowedSingleLine.replace("- expected: ", "");
-        suggestions.push(`Set '${param}' to: ${value}.`);
-      } else if (allowedOneOfLine) {
-        const allowed = allowedOneOfLine.replace("- expected one of: ", "");
-        suggestions.push(`Set '${param}' to one of: ${allowed}.`);
-      } else {
-        suggestions.push(`Adjust '${param}' to an allowed value.`);
-      }
-    }
-  }
-  if (suggestions.length === 0 && typeof error_type === "string") {
-    if (error_type.includes("missing_required")) {
-      suggestions.push(
-        "Add all required parameters defined by the tool schema."
-      );
-    } else if (error_type.includes("unexpected_param")) {
-      suggestions.push("Remove parameters not present in the tool schema.");
-    } else if (error_type.includes("wrong_count")) {
-      suggestions.push(
-        "Adjust the number of tool calls to match expected count."
-      );
-    } else if (error_type.includes("wrong_func_name")) {
-      suggestions.push("Use the exact expected function name from the schema.");
-    } else if (error_type.includes("value_error")) {
-      suggestions.push("Choose a value from the allowed options.");
-    }
-  }
-  return uniqueLines(suggestions);
-}
-function consoleDebugReporter(results) {
-  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
-  for (const r of results) {
-    const { model, modelKey, benchmark, result } = r;
-    const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
-    console.log(
-      `
- ${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
-    );
-    console.log(
-      `  \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
-    );
-    const metrics = Object.entries(result.metrics);
-    if (metrics.length > 0) {
-      console.log("    Metrics:");
-      for (const [k, v] of metrics) console.log(`      - ${k}: ${v}`);
-    }
-    if (result.logs && result.logs.length) {
-      const failLogs = result.logs.filter(
-        (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
-      );
-      const hasFails = failLogs.length > 0;
-      if (hasFails) {
-        let getTestIdFromLogLine2 = function(line) {
-          if (line.startsWith("[FAIL]")) {
-            const m = line.match(/^\[FAIL\]\s+([^:]+):/);
-            return m?.[1];
-          }
-          if (line.startsWith("[DEBUG-FAIL]")) {
-            try {
-              const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
-              return String(parsed?.id ?? "");
-            } catch {
-            }
-          }
-          if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
-            try {
-              const parsed = JSON.parse(
-                line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
-              );
-              return String(parsed?.id ?? "");
-            } catch {
-            }
-          }
-          return void 0;
-        };
-        var getTestIdFromLogLine = getTestIdFromLogLine2;
-        const byId = /* @__PURE__ */ new Map();
-        for (const line of failLogs) {
-          const id = getTestIdFromLogLine2(line);
-          const key = id ?? "__general__";
-          const arr = byId.get(key) ?? [];
-          arr.push(line);
-          byId.set(key, arr);
-        }
-        console.log(
-          `    ${colors2.bold}Failure details (grouped):${colors2.reset}`
-        );
-        for (const [groupId, lines] of byId) {
-          if (groupId !== "__general__") {
-            console.log(`      ${colors2.underline}${groupId}${colors2.reset}`);
-          }
-          const debugIds = /* @__PURE__ */ new Set();
-          for (const l of lines) {
-            if (l.startsWith("[DEBUG-FAIL]")) {
-              try {
-                const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
-                if (parsed?.id) debugIds.add(String(parsed.id));
-              } catch {
-              }
-            }
-          }
-          for (const line of lines) {
-            if (line.startsWith("[FAIL]")) {
-              const m = line.match(/^\[FAIL\]\s+([^:]+):/);
-              const failId = m?.[1];
-              if (failId && debugIds.has(failId)) continue;
-              console.log(`        ${colors2.red}${line}${colors2.reset}`);
-            } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
-              console.log(`        ${colors2.yellow}${line}${colors2.reset}`);
-            } else if (line.startsWith("[STACK]")) {
-              console.log(`        ${colors2.gray}${line}${colors2.reset}`);
-            } else if (line.startsWith("[DEBUG-FAIL]")) {
-              const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
-              try {
-                const parsed = JSON.parse(payload);
-                const { message, diff, expected, actual } = parsed;
-                if (message)
-                  console.log(
-                    `        ${colors2.bold}${message}${colors2.reset}`
-                  );
-                if (diff && Array.isArray(diff)) {
-                  for (const dLine of diff)
-                    console.log("          " + colorizeDiffLine(dLine));
-                } else {
-                  console.log("          expected:");
-                  console.log(
-                    colors2.green + "            " + JSON.stringify(expected, null, 2).split("\n").join("\n            ") + colors2.reset
-                  );
-                  console.log("          actual:");
-                  console.log(
-                    colors2.red + "            " + JSON.stringify(actual, null, 2).split("\n").join("\n            ") + colors2.reset
-                  );
-                }
-                const suggestions = suggestFixFromDiff(parsed);
-                if (suggestions.length) {
-                  console.log(
-                    `          ${colors2.bold}Suggested fix:${colors2.reset}`
-                  );
-                  for (const s of suggestions)
-                    console.log(`            \u2022 ${s}`);
-                }
-              } catch {
-                console.log(`        ${line}`);
-              }
-            } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
-              const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
-              try {
-                const ctx = JSON.parse(payload);
-                console.log(`        ${colors2.gray}context:${colors2.reset}`);
-                if (ctx.tool_schema) {
-                  console.log(
-                    colors2.gray + "          tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n            ") + colors2.reset
-                  );
-                }
-                if (ctx.last_user_query) {
-                  console.log(
-                    colors2.gray + "          last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
-                  );
-                }
-                if (ctx.raw_model_text) {
-                  console.log(
-                    colors2.gray + "          raw model text (middleware parsed):\n            " + String(ctx.raw_model_text).split("\n").join("\n            ") + colors2.reset
-                  );
-                }
-                if (ctx.parsed_tool_calls) {
-                  console.log(
-                    colors2.gray + "          parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n            ") + colors2.reset
-                  );
-                }
-                if (ctx.ground_truth) {
-                  console.log(
-                    colors2.gray + "          ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n            ") + colors2.reset
-                  );
-                }
-                if (ctx.finish_reason) {
-                  console.log(
-                    colors2.gray + "          finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
-                  );
-                }
-              } catch {
-                console.log(`        ${line}`);
-              }
-            }
-          }
-        }
-      } else {
-        const info = result.logs.filter(
-          (l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
-        );
-        for (const line of info)
-          console.log(`      ${colors2.gray}${line}${colors2.reset}`);
-      }
-    }
-  }
-  console.log("\n------------------------------------\n");
-}
-// src/reporters/json.ts
-function jsonReporter(results) {
-  const serializableResults = results.map((r) => ({
-    ...r,
-    result: {
-      ...r.result,
-      error: r.result.error?.message
-    }
-  }));
-  console.log(JSON.stringify(serializableResults, null, 2));
-}
-// src/reporters/index.ts
-var reporters = {
-  console: consoleReporter,
-  json: jsonReporter,
-  "console.debug": consoleDebugReporter
-};
-// src/evaluate.ts
-async function runSingleBenchmark(model, benchmark, modelKey, config) {
-  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
-  try {
-    console.log(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
-    );
-    const result = await benchmark.run(model, config);
-    console.log(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
-    );
-    return {
-      model: modelId,
-      modelKey,
-      benchmark: benchmark.name,
-      result
-    };
-  } catch (error) {
-    console.error(
-      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
-      error
-    );
-    return {
-      model: modelId,
-      modelKey,
-      benchmark: benchmark.name,
-      result: {
-        score: 0,
-        success: false,
-        metrics: {},
-        error: error instanceof Error ? error : new Error(String(error))
-      }
-    };
-  }
-}
-async function evaluate(options) {
-  const {
-    models,
-    benchmarks,
-    reporter = "console",
-    temperature,
-    maxTokens
-  } = options;
-  const modelEntries = [];
-  if (Array.isArray(models)) {
-    for (const m of models) modelEntries.push([void 0, m]);
-  } else if (typeof models === "object" && models !== null && "modelId" in models) {
-    modelEntries.push([void 0, models]);
-  } else {
-    for (const [key, m] of Object.entries(
-      models
-    )) {
-      modelEntries.push([key, m]);
-    }
-  }
-  const allResults = [];
-  for (const [modelKey, model] of modelEntries) {
-    for (const benchmark of benchmarks) {
-      const config = {};
-      if (temperature !== void 0) config.temperature = temperature;
-      if (maxTokens !== void 0) config.maxTokens = maxTokens;
-      const evaluationResult = await runSingleBenchmark(
-        model,
-        benchmark,
-        modelKey,
-        Object.keys(config).length > 0 ? config : void 0
-      );
-      allResults.push(evaluationResult);
-    }
-  }
-  const report = reporters[reporter];
-  if (report) {
-    report(allResults);
-  } else {
-    console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
-    reporters.console(allResults);
-  }
-  return allResults;
-}
 // src/benchmarks/bfcl.ts
-import { generateText, jsonSchema, tool } from "ai";
 import { promises as fs2 } from "fs";
 import path2 from "path";
+import {
+  generateText,
+  jsonSchema,
+  tool
+} from "ai";
 // src/utils/paths.ts
 import fs from "fs";
 import { createRequire } from "module";
 import path from "path";
 import { fileURLToPath } from "url";
-function resolveDataDir(fromModuleUrl) {
-  const moduleUrl = fromModuleUrl;
-  const override = process.env.BFCL_DATA_DIR;
-  if (override && override.trim().length > 0) {
-    return override;
-  }
+function tryResolveViaPackageEntry(moduleUrl) {
   try {
     const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
     const requireFromEntry = createRequire(baseForRequireEntry);
@@ -426,43 +20,80 @@ function resolveDataDir(fromModuleUrl) {
     const entryDir = path.dirname(entryPath);
     const guessPkgRoot = fs.existsSync(path.join(entryDir, "..")) ? path.resolve(entryDir, "..") : entryDir;
     const dataAtRoot = path.join(guessPkgRoot, "data");
-    if (fs.existsSync(dataAtRoot)) return dataAtRoot;
+    if (fs.existsSync(dataAtRoot)) {
+      return dataAtRoot;
+    }
   } catch {
   }
+  return null;
+}
+function tryResolveViaPackageJson(moduleUrl) {
   try {
     const baseForRequire = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
     const require2 = createRequire(baseForRequire);
     const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
     const pkgDir = path.dirname(pkgJsonPath);
     const dataAtPkg = path.join(pkgDir, "data");
-    if (fs.existsSync(dataAtPkg)) return dataAtPkg;
+    if (fs.existsSync(dataAtPkg)) {
+      return dataAtPkg;
+    }
   } catch {
   }
-  let startDir;
+  return null;
+}
+function getStartDir(moduleUrl) {
   if (moduleUrl) {
     try {
-      startDir = path.dirname(fileURLToPath(moduleUrl));
+      return path.dirname(fileURLToPath(moduleUrl));
     } catch {
-      startDir = process.cwd();
+      return process.cwd();
     }
-  } else {
-    startDir = process.cwd();
   }
+  return process.cwd();
+}
+function findDataDirByTraversal(startDir) {
   let dir = startDir;
-  for (let i = 0; i < 6; i++) {
+  const MAX_PARENT_TRAVERSAL_DEPTH = 6;
+  for (let i = 0; i < MAX_PARENT_TRAVERSAL_DEPTH; i += 1) {
     const dataCandidate = path.join(dir, "data");
-    if (fs.existsSync(dataCandidate)) return dataCandidate;
+    if (fs.existsSync(dataCandidate)) {
+      return dataCandidate;
+    }
     const parent = path.resolve(dir, "..");
-    if (parent === dir) break;
+    if (parent === dir) {
+      break;
+    }
     dir = parent;
   }
+  return null;
+}
+function resolveDataDir(fromModuleUrl) {
+  const override = process.env.BFCL_DATA_DIR;
+  if (override && override.trim().length > 0) {
+    return override;
+  }
+  const viaEntry = tryResolveViaPackageEntry(fromModuleUrl);
+  if (viaEntry) {
+    return viaEntry;
+  }
+  const viaPackageJson = tryResolveViaPackageJson(fromModuleUrl);
+  if (viaPackageJson) {
+    return viaPackageJson;
+  }
+  const startDir = getStartDir(fromModuleUrl);
+  const viaTraversal = findDataDirByTraversal(startDir);
+  if (viaTraversal) {
+    return viaTraversal;
+  }
   const pkgRoot = path.resolve(startDir, "..", "..");
   return path.join(pkgRoot, "data");
 }
 // src/benchmarks/bfcl/ast-checker.ts
 function standardizeString(input) {
-  if (typeof input !== "string") return input;
+  if (typeof input !== "string") {
+    return input;
+  }
   const regex = /[ ,./\\-_*^]/g;
   return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
 }
@@ -482,127 +113,181 @@ function checkStringValue(param, modelValue, possibleAnswers) {
   }
   return { valid: true };
 }
-function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
-  const modelArgs = modelToolCall.args;
-  const modelFuncName = modelToolCall.toolName;
-  const expectedFuncName = funcDescription.name;
-  const expectedParams = funcDescription.parameters.properties;
-  const requiredParams = funcDescription.parameters.required;
-  if (modelFuncName !== expectedFuncName) {
-    return {
-      valid: false,
-      error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
-      error_type: "simple_function_checker:wrong_func_name"
-    };
+function normalizeObject(obj) {
+  if (Array.isArray(obj)) {
+    return obj.map(normalizeObject);
   }
-  const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
-  const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
-  for (const param of requiredParams) {
-    if (!(param in argsObj)) {
-      return {
+  if (obj && typeof obj === "object") {
+    const normalized = {};
+    for (const [key, value] of Object.entries(obj)) {
+      if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
+        normalized[key] = value[0];
+      } else {
+        normalized[key] = normalizeObject(value);
+      }
+    }
+    return normalized;
+  }
+  return obj;
+}
+function valuesMatch(modelValue, possibleValue) {
+  if (modelValue === possibleValue) {
+    return true;
+  }
+  if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
+    try {
+      const normalizedModel = normalizeObject(modelValue);
+      const normalizedPossible = normalizeObject(possibleValue);
+      return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
+    } catch {
+      return false;
+    }
+  }
+  if (typeof modelValue === "number" && typeof possibleValue === "string") {
+    return modelValue.toString() === possibleValue;
+  }
+  if (typeof modelValue === "string" && typeof possibleValue === "number") {
+    return modelValue === possibleValue.toString();
+  }
+  return false;
+}
+function checkArrayValue(paramName, modelValue, possibleValues) {
+  const modelValueStr = JSON.stringify(
+    modelValue.map((v) => standardizeString(String(v))).sort()
+  );
+  const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
+    if (!Array.isArray(p)) {
+      return false;
+    }
+    return JSON.stringify(p.map((v) => standardizeString(String(v))).sort()) === modelValueStr;
+  }) : false;
+  if (!hasMatch) {
+    return {
+      valid: false,
+      error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
+        modelValue
+      )}. Expected one of ${JSON.stringify(possibleValues)}.`,
+      error_type: "value_error:list"
+    };
+  }
+  return { valid: true };
+}
+function checkObjectValue(paramName, modelValue, possibleValues) {
+  const hasMatch = Array.isArray(possibleValues) ? possibleValues.some(
+    (possibleValue) => valuesMatch(modelValue, possibleValue)
+  ) : false;
+  if (!hasMatch) {
+    return {
+      valid: false,
+      error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
+        modelValue
+      )}. Expected one of ${JSON.stringify(possibleValues)}.`,
+      error_type: "value_error:other"
+    };
+  }
+  return { valid: true };
+}
+function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
+  const funcNameCheck = checkFunctionName(
+    funcDescription.name,
+    modelToolCall.toolName
+  );
+  if (!funcNameCheck.valid) {
+    return funcNameCheck;
+  }
+  const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
+  const argsObj = modelToolCall.args && typeof modelToolCall.args === "object" ? modelToolCall.args : {};
+  const context = {
+    funcDescription,
+    modelToolCall,
+    possibleAnswerParams,
+    expectedParams: funcDescription.parameters.properties
+  };
+  const requiredCheck = checkRequiredParams(
+    funcDescription.parameters.required,
+    argsObj
+  );
+  if (!requiredCheck.valid) {
+    return requiredCheck;
+  }
+  const paramsCheck = checkAllParameters(argsObj, context);
+  if (!paramsCheck.valid) {
+    return paramsCheck;
+  }
+  const optionalCheck = checkOptionalParams(argsObj, possibleAnswerParams);
+  if (!optionalCheck.valid) {
+    return optionalCheck;
+  }
+  return { valid: true };
+}
+function checkFunctionName(expected, actual) {
+  if (actual !== expected) {
+    return {
+      valid: false,
+      error: `Function name '${actual}' does not match expected '${expected}'.`,
+      error_type: "simple_function_checker:wrong_func_name"
+    };
+  }
+  return { valid: true };
+}
+function checkRequiredParams(requiredParams, argsObj) {
+  for (const param of requiredParams) {
+    if (!(param in argsObj)) {
+      return {
         valid: false,
         error: `Missing required parameter: '${param}'.`,
         error_type: "simple_function_checker:missing_required"
       };
     }
   }
-  if (modelArgs && typeof modelArgs === "object") {
-    for (const paramName of Object.keys(argsObj)) {
-      const modelValue = argsObj[paramName];
-      if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
+  return { valid: true };
+}
+function checkAllParameters(argsObj, context) {
+  for (const paramName of Object.keys(argsObj)) {
+    const paramCheck = checkSingleParameter(
+      paramName,
+      argsObj[paramName],
+      context
+    );
+    if (!paramCheck.valid) {
+      return paramCheck;
+    }
+  }
+  return { valid: true };
+}
+function checkSingleParameter(paramName, modelValue, context) {
+  if (!(paramName in context.expectedParams && paramName in context.possibleAnswerParams)) {
+    return {
+      valid: false,
+      error: `Unexpected parameter: '${paramName}'.`,
+      error_type: "simple_function_checker:unexpected_param"
+    };
+  }
+  const possibleValues = context.possibleAnswerParams[paramName];
+  if (typeof modelValue === "string") {
+    return checkStringValue(
+      paramName,
+      modelValue,
+      possibleValues ?? []
+    );
+  }
+  if (Array.isArray(modelValue)) {
+    return checkArrayValue(paramName, modelValue, possibleValues);
+  }
+  return checkObjectValue(paramName, modelValue, possibleValues);
+}
+function checkOptionalParams(argsObj, possibleAnswerParams) {
+  for (const paramName in possibleAnswerParams) {
+    if (Object.hasOwn(possibleAnswerParams, paramName)) {
+      const val = possibleAnswerParams[paramName];
+      const isOptional = Array.isArray(val) && val.includes("");
+      if (!(paramName in argsObj || isOptional)) {
         return {
           valid: false,
-          error: `Unexpected parameter: '${paramName}'.`,
-          error_type: "simple_function_checker:unexpected_param"
+          error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
+          error_type: "simple_function_checker:missing_optional"
         };
       }
-      const possibleValues = possibleAnswerParams[paramName];
-      if (typeof modelValue === "string") {
-        const result = checkStringValue(
-          paramName,
-          modelValue,
-          possibleValues ?? []
-        );
-        if (!result.valid) return result;
-      } else if (Array.isArray(modelValue)) {
-        const modelValueStr = JSON.stringify(
-          modelValue.map((v) => standardizeString(String(v))).sort()
-        );
-        const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
-          if (!Array.isArray(p)) return false;
-          return JSON.stringify(
-            p.map((v) => standardizeString(String(v))).sort()
-          ) === modelValueStr;
-        }) : false;
-        if (!hasMatch) {
-          return {
-            valid: false,
-            error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
-              modelValue
-            )}. Expected one of ${JSON.stringify(possibleValues)}.`,
-            error_type: "value_error:list"
-          };
-        }
-      } else {
-        const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
-          if (modelValue === possibleValue) return true;
-          if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
-            try {
-              const normalizeObject = (obj) => {
-                if (Array.isArray(obj)) {
-                  return obj.map(normalizeObject);
-                }
-                if (obj && typeof obj === "object") {
-                  const normalized = {};
-                  for (const [key, value] of Object.entries(
-                    obj
-                  )) {
-                    if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
-                      normalized[key] = value[0];
-                    } else {
-                      normalized[key] = normalizeObject(value);
-                    }
-                  }
-                  return normalized;
-                }
-                return obj;
-              };
-              const normalizedModel = normalizeObject(modelValue);
-              const normalizedPossible = normalizeObject(possibleValue);
-              return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
-            } catch {
-              return false;
-            }
-          }
-          if (typeof modelValue === "number" && typeof possibleValue === "string") {
-            return modelValue.toString() === possibleValue;
-          }
-          if (typeof modelValue === "string" && typeof possibleValue === "number") {
-            return modelValue === possibleValue.toString();
-          }
-          return false;
-        }) : false;
-        if (!hasMatch) {
-          return {
-            valid: false,
-            error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
-              modelValue
-            )}. Expected one of ${JSON.stringify(possibleValues)}.`,
-            error_type: "value_error:other"
-          };
-        }
-      }
-    }
-  }
-  for (const paramName in possibleAnswerParams) {
-    const val = possibleAnswerParams[paramName];
-    const isOptional = Array.isArray(val) && val.includes("");
-    if (!(paramName in argsObj) && !isOptional) {
-      return {
-        valid: false,
-        error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
-        error_type: "simple_function_checker:missing_optional"
-      };
     }
   }
   return { valid: true };
@@ -629,8 +314,10 @@ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possib
       };
     }
     let foundMatch = false;
-    for (let i = 0; i < modelToolCalls.length; i++) {
-      if (matchedModelCallIndices.has(i)) continue;
+    for (let i = 0; i < modelToolCalls.length; i += 1) {
+      if (matchedModelCallIndices.has(i)) {
+        continue;
+      }
       const checkerResult = simpleFunctionChecker(
         funcDescription,
         modelToolCalls[i],
@@ -679,6 +366,8 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
 }
 // src/benchmarks/bfcl.ts
+var LINE_SPLIT_REGEX = /\r?\n/;
+var NUMERIC_STRING_REGEX = /^\d+$/;
 function check(testCase, modelOutput, possibleAnswer) {
   const category = testCase.id.split("_")[0];
   try {
@@ -695,19 +384,22 @@ function check(testCase, modelOutput, possibleAnswer) {
         modelOutput[0],
         possibleAnswer.ground_truth[0]
       );
-    } else if (category === "parallel") {
+    }
+    if (category === "parallel") {
       return parallelFunctionCheckerNoOrder(
         testCase.function,
         modelOutput,
         possibleAnswer.ground_truth
       );
-    } else if (category === "multiple") {
+    }
+    if (category === "multiple") {
       return multipleFunctionChecker(
         testCase.function,
         modelOutput,
         possibleAnswer.ground_truth
       );
-    } else if (category.includes("parallel-multiple")) {
+    }
+    if (category.includes("parallel-multiple")) {
       return parallelFunctionCheckerNoOrder(
         testCase.function,
         modelOutput,
@@ -743,8 +435,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           path2.join(dataPath, answerDataFile),
           "utf-8"
         );
-        testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
-        const possibleAnswers = possibleAnswersJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+        testCases = testCasesJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+        const possibleAnswers = possibleAnswersJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
         const possibleAnswersMap = new Map(
           possibleAnswers.map((ans) => [ans.id, ans])
         );
@@ -756,373 +448,600 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
           );
         }
+        // Rewrites non-JSON-Schema type names on `copy` in place:
+        // "dict" -> "object", "tuple" -> "array", "integer"/"float" -> "number".
+        // A missing, falsy or unrecognised `type` is left untouched.
+        const fixSchemaType = (copy) => {
+          switch (copy.type) {
+            case "dict":
+              copy.type = "object";
+              break;
+            case "tuple":
+              copy.type = "array";
+              break;
+            case "integer":
+            case "float":
+              copy.type = "number";
+              break;
+            default:
+              break;
+          }
+        };
+        // Applies `fixSchemaFn` to every entry of `copy.properties` and stores the
+        // results on `copy`. Does nothing when `properties` is missing or not an object.
+        //
+        // `copy` is only a shallow clone of the caller's schema, so its `properties`
+        // container is still the one owned by the original tool definition. The fixed
+        // entries therefore go into a fresh container; writing into the shared one
+        // would silently rewrite the declared types of the original definition, which
+        // is also handed to the checker.
+        const fixSchemaProperties = (copy, fixSchemaFn) => {
+          const original = copy.properties;
+          if (!original || typeof original !== "object") {
+            return;
+          }
+          // Keep the container kind (array vs. plain object) of the input.
+          const fixed = Array.isArray(original) ? [] : {};
+          for (const k of Object.keys(original)) {
+            fixed[k] = fixSchemaFn(original[k]);
+          }
+          copy.properties = fixed;
+        };
         const fixSchema = (schema) => {
+          // Returns a normalised copy of `schema`. Falsy or non-object input is
+          // replaced by an empty object schema.
-          if (!schema || typeof schema !== "object")
+          if (!schema || typeof schema !== "object") {
             return { type: "object", properties: {} };
+          }
+          // Arrays are normalised element by element; objects are shallow-copied.
           const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
           if (!Array.isArray(copy)) {
-            if (copy.type) {
-              if (copy.type === "dict") copy.type = "object";
-              if (copy.type === "integer" || copy.type === "float")
-                copy.type = "number";
+            // Fix the type name, then recurse into `properties` and `items`.
+            fixSchemaType(copy);
+            fixSchemaProperties(copy, fixSchema);
+            if (copy.items) {
+              copy.items = fixSchema(copy.items);
             }
-            if (copy.properties && typeof copy.properties === "object") {
-              for (const k of Object.keys(copy.properties)) {
-                copy.properties[k] = fixSchema(
-                  copy.properties[k]
-                );
-              }
-            }
-            if (copy.items) copy.items = fixSchema(copy.items);
             return copy;
           }
           return copy;
         };
+        // Flattens `messages` by one level when it is an array holding at least one
+        // nested array; any other input is returned unchanged (same reference).
+        const flattenMessages = (messages) => {
+          if (!Array.isArray(messages)) {
+            return messages;
+          }
+          const hasNestedArray = messages.some((entry) => Array.isArray(entry));
+          return hasNestedArray ? messages.flat(1) : messages;
+        };
+        // Produces a tool name limited to [a-zA-Z0-9_-] and at most 64 characters:
+        // every other character becomes "_". An empty result falls back to "tool".
+        const sanitizeName = (toolName) => {
+          const replaced = toolName.replace(/[^a-zA-Z0-9_-]/g, "_");
+          const truncated = replaced.slice(0, 64);
+          if (truncated.length === 0) {
+            return "tool";
+          }
+          return truncated;
+        };
+        // Converts the benchmark's tool definitions into `{ type, name, description,
+        // inputSchema }` records with sanitized names. Also returns `nameMap`
+        // (sanitized name -> original name); if two tools sanitize to the same name
+        // the later one wins in the map.
+        const buildTransformedTools = (tools, fixSchemaFn) => {
+          const nameMap = /* @__PURE__ */ new Map();
+          const transformedTools = tools.map((toolDef) => {
+            const fixed = fixSchemaFn(toolDef.parameters);
+            // Anything that did not normalise to an object schema is replaced by an
+            // empty object schema.
+            let inputSchema = { type: "object", properties: {} };
+            if (fixed && typeof fixed === "object" && fixed.type === "object") {
+              inputSchema = fixed;
+            }
+            const sanitized = sanitizeName(toolDef.name);
+            nameMap.set(sanitized, toolDef.name);
+            return {
+              type: "function",
+              name: sanitized,
+              description: toolDef.description,
+              inputSchema
+            };
+          });
+          return { transformedTools, nameMap };
+        };
+        // Parses the JSON-encoded tool-call list from the debug summary.
+        // Returns [] for empty input, malformed JSON, or JSON that is not an array.
+        const parseDebugToolCalls = (raw) => {
+          let parsed = [];
+          if (raw) {
+            try {
+              parsed = JSON.parse(raw);
+            } catch {
+              // Malformed debug payload: treat it as "no tool calls".
+              parsed = [];
+            }
+          }
+          return Array.isArray(parsed) ? parsed : [];
+        };
+        // When the reported tool name is a digits-only string, treats it as an index
+        // into `transformedTools` and returns that tool's name if one exists there.
+        // Every other value is returned unchanged.
+        const getSanitizedName = (rawName, transformedTools) => {
+          const looksLikeIndex = typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName);
+          if (!looksLikeIndex) {
+            return rawName;
+          }
+          const indexedTool = transformedTools[Number(rawName)];
+          return indexedTool?.name ?? rawName;
+        };
+        // Decodes string arguments as JSON when possible. Non-string input and
+        // strings that are not valid JSON are returned unchanged.
+        const parseToolArgs = (extractedArgs) => {
+          if (typeof extractedArgs === "string") {
+            try {
+              return JSON.parse(extractedArgs);
+            } catch {
+              // Not JSON: fall through and hand back the raw string.
+            }
+          }
+          return extractedArgs;
+        };
+        // Maps the model's tool calls back onto the benchmark's vocabulary: resolves
+        // the reported name (possibly an index) to the original tool name and puts the
+        // decoded arguments under `args`. A missing `toolCalls` yields [].
+        const restoreToolCalls = (toolCalls, nameMap, transformedTools) => {
+          const calls = toolCalls || [];
+          return calls.map((call) => {
+            const reportedName = call.toolName ?? call.name;
+            const sanitizedName = getSanitizedName(reportedName, transformedTools);
+            // Names that were never sanitized are passed through as they are.
+            const originalName = nameMap.get(sanitizedName) ?? sanitizedName;
+            // The arguments may arrive under any of these keys; the first non-nullish wins.
+            const rawArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
+            const args = parseToolArgs(rawArgs) ?? {};
+            return {
+              ...call,
+              toolName: originalName,
+              name: originalName,
+              args
+            };
+          });
+        };
+        // Returns a copy of `args` whose keys are inserted in sorted order, so that
+        // serialized output is stable. null/undefined and non-object values are
+        // returned unchanged.
+        const summarizeArgs = (args) => {
+          if (args == null || typeof args !== "object") {
+            return args;
+          }
+          const ordered = {};
+          for (const key of Object.keys(args).sort()) {
+            ordered[key] = args[key];
+          }
+          return ordered;
+        };
+        // Builds the three diff lines for one mismatching parameter: a header, the
+        // expected value(s), and the received value. `allowed` may be a single value
+        // or a list of acceptable values.
+        const generateParamMismatchDiff = (paramName, allowed, got) => {
+          const candidates = Array.isArray(allowed) ? allowed : [allowed];
+          // Arrays and other non-null objects are shown as JSON, everything else via String().
+          const render = (v) => typeof v === "object" && v !== null ? JSON.stringify(v) : String(v);
+          let expectedLine;
+          if (candidates.length === 1) {
+            expectedLine = `- expected: ${JSON.stringify(candidates[0])}`;
+          } else {
+            expectedLine = `- expected one of: ${candidates.map((v) => render(v)).join(", ")}`;
+          }
+          return [
+            `@@ param ${paramName}`,
+            expectedLine,
+            `+ got: ${JSON.stringify(got)}`
+          ];
+        };
+        // Reports whether `got` matches at least one entry of `allowed`. Used only to
+        // decide which parameters to list in the debug diff.
+        //  - `allowed` that is not an array never matches.
+        //  - array vs. array: compared as order-insensitive lists of stringified items.
+        //  - otherwise: compared case-insensitively with all whitespace removed.
+        // Plain objects are compared by key-sorted JSON instead of String(), because
+        // String() turns every object into "[object Object]" and would make any two
+        // objects look equal.
+        const paramValueMatches = (allowed, got) => {
+          if (!Array.isArray(allowed)) {
+            return false;
+          }
+          const isPlainObject = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
+          // JSON with object keys emitted in sorted order at every nesting level.
+          const stableJson = (value) => JSON.stringify(value, (_key, val) => {
+            if (!isPlainObject(val)) {
+              return val;
+            }
+            return Object.keys(val).sort().reduce((acc, k) => {
+              acc[k] = val[k];
+              return acc;
+            }, {});
+          });
+          const normalize = (value) => {
+            const textForm = isPlainObject(value) ? stableJson(value) : String(value);
+            return textForm.toLowerCase().replace(/\s+/g, "");
+          };
+          const sortedItems = (list) => JSON.stringify(list.map((x) => String(x)).sort());
+          return allowed.some((v) => {
+            if (Array.isArray(got) && Array.isArray(v)) {
+              return sortedItems(got) === sortedItems(v);
+            }
+            // Also reached when only one side is an array: compare the text forms.
+            return normalize(v) === normalize(got);
+          });
+        };
+        // Appends a "function name" section to `diff` when the received name differs
+        // from the expected one; leaves `diff` untouched otherwise.
+        const checkFunctionNameMismatch = (expectedName, receivedName, diff) => {
+          if (expectedName === receivedName) {
+            return;
+          }
+          diff.push(
+            "@@ function name",
+            `- ${expectedName}`,
+            `+ ${receivedName}`
+          );
+        };
+        // Appends one diff line per name in `required` that `receivedArgs` does not
+        // carry as an own property.
+        // Own-property lookup is used on purpose: the `in` operator also sees members
+        // inherited from Object.prototype, so a required parameter called "toString"
+        // or "constructor" would never be reported as missing.
+        const checkMissingParams = (required, receivedArgs, diff) => {
+          for (const req of required) {
+            if (!Object.prototype.hasOwnProperty.call(receivedArgs, req)) {
+              diff.push(`- missing required param: ${req}`);
+            }
+          }
+        };
+        // Appends one diff line per received argument that is not an own key of
+        // `expectedParams`.
+        // Own-property lookup is used on purpose: with the `in` operator, arguments
+        // named like Object.prototype members ("constructor", "toString", ...) would
+        // always count as expected and never be reported.
+        const checkUnexpectedParams = (expectedParams, receivedArgs, diff) => {
+          for (const k of Object.keys(receivedArgs)) {
+            if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
+              diff.push(`+ unexpected param: ${k}`);
+            }
+          }
+        };
+        // For every received argument that is an own key of `expectedParams`, appends
+        // a parameter-mismatch section to `diff` when its value matches none of the
+        // allowed values.
+        // Own-property lookup is used on purpose: with the `in` operator an argument
+        // named like an Object.prototype member would be compared against that
+        // inherited member and produce a bogus mismatch entry.
+        const checkParamValueMismatches = (expectedParams, receivedArgs, diff) => {
+          for (const k of Object.keys(receivedArgs)) {
+            if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
+              continue;
+            }
+            const allowed = expectedParams[k];
+            const got = receivedArgs[k];
+            if (!paramValueMatches(allowed, got)) {
+              diff.push(...generateParamMismatchDiff(k, allowed, got));
+            }
+          }
+        };
+        // Builds the debug comparison for a single-call test case: the first tool
+        // definition and first ground-truth entry against the first restored call.
+        // Returns `{ expected, actual, diff }`, where `diff` is a list of text lines.
+        const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
+          const funcDesc = tools[0];
+          const firstCall = restoredCalls[0];
+          const groundTruth = possibleAnswer.ground_truth?.[0];
+          // A ground-truth entry is keyed by function name; its value holds the
+          // allowed values per parameter.
+          const expectedParams = groundTruth ? groundTruth[Object.keys(groundTruth)[0]] : void 0;
+          const expected = {
+            function: funcDesc?.name,
+            params: expectedParams
+          };
+          const actual = {
+            function: firstCall?.toolName ?? firstCall?.name,
+            args: summarizeArgs(firstCall?.args)
+          };
+          const diff = [];
+          checkFunctionNameMismatch(expected.function, actual.function, diff);
+          // Parameter checks need both the allowed values and an object of arguments.
+          const canCompareParams = expectedParams && actual.args && typeof actual.args === "object";
+          if (canCompareParams) {
+            const required = funcDesc?.parameters?.required ?? [];
+            checkMissingParams(required, actual.args, diff);
+            checkUnexpectedParams(expectedParams, actual.args, diff);
+            checkParamValueMismatches(expectedParams, actual.args, diff);
+          }
+          return { expected, actual, diff };
+        };
+        // Appends a "call count" section to `diff` when the number of received calls
+        // differs from the number of expected calls.
+        const checkCallCountMismatch = (expectedCount, actualCount, diff) => {
+          if (expectedCount === actualCount) {
+            return;
+          }
+          diff.push(
+            "@@ call count",
+            `- expected ${expectedCount}`,
+            `+ got ${actualCount}`
+          );
+        };
+        // Appends a line for every expected function name that was never called,
+        // followed by a line for every called name that was not expected. Membership
+        // only: repeated names are not counted.
+        const addMissingAndExtraFunctions = (expectedNames, actualNames, diff) => {
+          const calledNames = new Set(actualNames);
+          const wantedNames = new Set(expectedNames);
+          for (const name of expectedNames) {
+            if (!calledNames.has(name)) {
+              diff.push(`- missing function: ${name}`);
+            }
+          }
+          for (const name of actualNames) {
+            if (!wantedNames.has(name)) {
+              diff.push(`+ unexpected function: ${name}`);
+            }
+          }
+        };
+        // Returns the index of the first restored call named `fname` whose index is
+        // not already in `usedActual`, or -1 when there is none.
+        const findMatchingCallIndex = (fname, restoredCalls, usedActual) => restoredCalls.findIndex(
+          (candidate, position) => !usedActual.has(position) && (candidate?.toolName ?? candidate?.name) === fname
+        );
+        // Runs the three parameter checks (missing, unexpected, value mismatch) for
+        // one matched call, appending their findings to `diff` in that order.
+        const validateFunctionParams = ({ receivedArgs, expectedParamsAllowed, requiredParams, diff }) => {
+          checkMissingParams(requiredParams, receivedArgs, diff);
+          checkUnexpectedParams(expectedParamsAllowed, receivedArgs, diff);
+          checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
+        };
+        // Pairs one ground-truth entry with the first unused restored call of the
+        // same name. When a call is found, it is marked as used, a function header is
+        // appended to `diff`, and the call's parameters are validated. When no call is
+        // found nothing is appended.
+        const processExpectedCall = ({ expectedObj, restoredCalls, tools, usedActual, diff }) => {
+          // A ground-truth entry is keyed by the expected function name.
+          const fname = Object.keys(expectedObj)[0];
+          const matchedIndex = findMatchingCallIndex(fname, restoredCalls, usedActual);
+          if (matchedIndex === -1) {
+            return;
+          }
+          usedActual.add(matchedIndex);
+          const receivedArgs = summarizeArgs(restoredCalls[matchedIndex]?.args);
+          const expectedParamsAllowed = expectedObj[fname];
+          const requiredParams = tools.find((t) => t.name === fname)?.parameters?.required ?? [];
+          diff.push(`@@ function ${fname}`);
+          const canCompareParams = expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object";
+          if (canCompareParams) {
+            validateFunctionParams({
+              receivedArgs,
+              expectedParamsAllowed,
+              requiredParams,
+              diff
+            });
+          }
+        };
+        // Builds the debug comparison for multi-call test cases: compares the list of
+        // expected function names with the called ones, then validates each expected
+        // call against an unused restored call of the same name.
+        // Returns `{ expected, actual, diff }`, where `diff` is a list of text lines.
+        const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
+          const gtArr = possibleAnswer.ground_truth ?? [];
+          const expectedNames = gtArr.map((entry) => Object.keys(entry)[0]);
+          const actualNames = restoredCalls.map((call) => call.toolName ?? call.name);
+          const diff = [];
+          checkCallCountMismatch(expectedNames.length, actualNames.length, diff);
+          addMissingAndExtraFunctions(expectedNames, actualNames, diff);
+          // Indices of restored calls already paired with an expected call.
+          const usedActual = /* @__PURE__ */ new Set();
+          gtArr.forEach((expectedObj) => {
+            processExpectedCall({ expectedObj, restoredCalls, tools, usedActual, diff });
+          });
+          return {
+            expected: { functions: expectedNames },
+            actual: { functions: actualNames },
+            diff
+          };
+        };
         const concurrencyEnv = process.env.BFCL_CONCURRENCY;
         const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
         logs.push(
           `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
         );
-        const runSingleCase = async (testCase) => {
-          const caseLogs = [];
-          const { function: tools, question: messages } = testCase;
-          const temp = config?.temperature;
-          const temperature = typeof temp === "number" ? temp : void 0;
-          const maxTok = config?.maxTokens;
-          const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
+        const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
           try {
-            const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
-            const nameMap = /* @__PURE__ */ new Map();
-            const sanitizeName = (name2) => {
-              const s = name2.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
-              return s.length > 0 ? s : "tool";
-            };
-            const transformedTools = tools.map((t) => {
-              const fixed = fixSchema(t.parameters);
-              const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
-              const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
-              const sanitized = sanitizeName(t.name);
-              nameMap.set(sanitized, t.name);
-              return {
-                type: "function",
-                name: sanitized,
-                description: t.description,
-                inputSchema
-              };
-            });
-            const toolsMap = Object.fromEntries(
-              transformedTools.map((t) => [
-                t.name,
-                tool({
-                  description: typeof t.description === "string" ? t.description : void 0,
-                  inputSchema: jsonSchema(t.inputSchema)
-                })
-              ])
+            const firstTool = transformedTools[0];
+            const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
+            caseLogs.push(
+              `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
             );
-            try {
-              const firstTool = transformedTools[0];
-              const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
-              caseLogs.push(
-                `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
-              );
-            } catch (e) {
-              caseLogs.push(
-                `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
-              );
+          } catch (e) {
+            caseLogs.push(
+              `[DEBUG] ${testCaseId}: failed to introspect tools: ${e.message}`
+            );
+          }
+        };
+        // Appends one [DEBUG] line with the raw tool calls, finish reason and text of
+        // a model response. If the payload cannot be serialized, a fallback line is
+        // appended instead.
+        const logRawToolCalls = ({ toolCalls, finishReason, text, testCaseId, caseLogs }) => {
+          let line;
+          try {
+            line = `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`;
+          } catch {
+            line = `[DEBUG] ${testCaseId}: failed to serialize toolCalls`;
+          }
+          caseLogs.push(line);
+        };
+        const buildFailureContext = (options) => {
+          const {
+            testCase,
+            tools,
+            flatMessages,
+            mwOriginalText,
+            text,
+            finishReason,
+            mwParsedToolCalls,
+            restoredCalls,
+            possibleAnswer
+          } = options;
+          const lastUser = (() => {
+            const reversed = [...flatMessages].reverse();
+            const found = reversed.find(
+              (m) => m.role === "user"
+            );
+            return found?.content ?? void 0;
+          })();
+          const rawModelText = (() => {
+            if (mwOriginalText && mwOriginalText.length > 0) {
+              return mwOriginalText;
             }
-            const debugSummaryRef = {};
-            const providerOptions = {
-              toolCallMiddleware: {
-                debugSummary: debugSummaryRef
-              }
-            };
-            const { toolCalls, text, finishReason } = await generateText({
-              model,
-              messages: flatMessages,
-              tools: toolsMap,
-              toolChoice: "auto",
-              providerOptions,
-              ...temperature !== void 0 ? { temperature } : {},
-              ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
-            });
-            const mwOriginalText = debugSummaryRef.originalText;
-            const mwParsedToolCalls = (() => {
-              const raw = debugSummaryRef.toolCalls;
-              if (!raw) return [];
-              try {
-                const arr = JSON.parse(raw);
-                return Array.isArray(arr) ? arr : [];
-              } catch {
-                return [];
-              }
-            })();
+            if (typeof text === "string") {
+              return text;
+            }
+            return "";
+          })();
+          return {
+            id: testCase.id,
+            tool_schema: tools,
+            last_user_query: lastUser,
+            raw_model_text: rawModelText,
+            finish_reason: finishReason,
+            parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
+            ground_truth: possibleAnswer.ground_truth
+          };
+        };
+        const logFailureDetails = (options) => {
+          const {
+            testCase,
+            tools,
+            possibleAnswer,
+            restoredCalls,
+            checkerResult,
+            flatMessages,
+            mwOriginalText,
+            text,
+            finishReason,
+            mwParsedToolCalls,
+            caseLogs
+          } = options;
+          try {
+            const category = testCase.id.split("_")[0];
+            const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
+              tools,
+              possibleAnswer,
+              restoredCalls
+            ) : buildParallelDiff(
+              tools,
+              possibleAnswer,
+              restoredCalls
+            );
+            caseLogs.push(
+              `[DEBUG-FAIL] ${JSON.stringify({
+                id: testCase.id,
+                message: checkerResult.error,
+                error_type: checkerResult.error_type,
+                expected,
+                actual,
+                diff
+              })}`
+            );
             try {
+              const contextPayload = buildFailureContext({
+                testCase,
+                tools,
+                flatMessages,
+                mwOriginalText,
+                text,
+                finishReason,
+                mwParsedToolCalls,
+                restoredCalls,
+                possibleAnswer
+              });
               caseLogs.push(
-                `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
+                `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
               );
             } catch {
-              caseLogs.push(
-                `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
-              );
             }
-            const possibleAnswer = possibleAnswersMap.get(testCase.id);
-            if (!possibleAnswer) {
-              throw new Error(`No possible answer for id: ${testCase.id}`);
+          } catch {
+            caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
+          }
+        };
+        // Builds the `{ [toolName]: tool(...) }` object passed to the model call.
+        // A description that is not a string is passed as undefined.
+        const buildToolsMap = (transformedTools) => {
+          const entries = transformedTools.map((t) => {
+            const description = typeof t.description === "string" ? t.description : void 0;
+            const inputSchema = jsonSchema(t.inputSchema);
+            return [t.name, tool({ description, inputSchema })];
+          });
+          return Object.fromEntries(entries);
+        };
+        const executeModelGeneration = async (options) => {
+          const {
+            model: modelInstance,
+            flatMessages,
+            toolsMap,
+            temperature,
+            maxTokens
+          } = options;
+          const debugSummaryRef = {};
+          const providerOptions = {
+            toolCallMiddleware: {
+              debugSummary: debugSummaryRef
             }
-            const restoredCalls = (toolCalls || []).map((c) => {
-              const rawName = c.toolName ?? c.name;
-              const sanitizedFromIndex = typeof rawName === "string" && /^\d+$/.test(rawName) ? transformedTools[Number(rawName)]?.name ?? rawName : rawName;
-              const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
-              const extractedArgs = c.args ?? c.arguments ?? c.input ?? c.params ?? c.parameters ?? void 0;
-              let parsedArgs = extractedArgs;
-              if (typeof parsedArgs === "string") {
-                try {
-                  parsedArgs = JSON.parse(parsedArgs);
-                } catch {
-                }
-              }
-              return {
-                ...c,
-                toolName: originalName,
-                name: originalName,
-                args: parsedArgs ?? {}
-              };
+          };
+          const { toolCalls, text, finishReason } = await generateText({
+            model: modelInstance,
+            messages: flatMessages,
+            tools: toolsMap,
+            toolChoice: "auto",
+            providerOptions,
+            ...temperature !== void 0 ? { temperature } : {},
+            ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
+          });
+          return { toolCalls, text, finishReason, debugSummaryRef };
+        };
+        // Turns a checker verdict into the per-case result `{ valid, logs }`.
+        // A pass logs one [PASS] line; a failure logs a [FAIL] line followed by the
+        // detailed failure diagnostics.
+        const processValidationResult = (options) => {
+          const { checkerResult, testCase, caseLogs } = options;
+          if (!checkerResult.valid) {
+            caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
+            logFailureDetails({
+              testCase,
+              tools: options.tools,
+              possibleAnswer: options.possibleAnswer,
+              restoredCalls: options.restoredCalls,
+              checkerResult,
+              flatMessages: options.flatMessages,
+              mwOriginalText: options.mwOriginalText,
+              text: options.text,
+              finishReason: options.finishReason,
+              mwParsedToolCalls: options.mwParsedToolCalls,
+              caseLogs
+            });
+            return { valid: false, logs: caseLogs };
+          }
+          caseLogs.push(`[PASS] ${testCase.id}`);
+          return { valid: true, logs: caseLogs };
+        };
+        // Derives everything the model call needs from one test case: the flattened
+        // messages, the transformed tools with their name map, and the tools object.
+        const prepareTestCaseData = (testCase) => {
+          const flatMessages = flattenMessages(testCase.question);
+          const built = buildTransformedTools(testCase.function, fixSchema);
+          return {
+            flatMessages,
+            transformedTools: built.transformedTools,
+            nameMap: built.nameMap,
+            toolsMap: buildToolsMap(built.transformedTools)
+          };
+        };
+        // Scores one model response: logs the raw tool calls, looks up the ground
+        // truth for the test case, restores the original tool names and arguments,
+        // runs `check`, and returns the `{ valid, logs }` outcome.
+        // Throws when there is no ground-truth entry for the test case id.
+        const processModelResponse = (options) => {
+          const { testCase, toolCalls, text, finishReason, debugSummaryRef, caseLogs } = options;
+          const { nameMap, transformedTools, flatMessages, tools } = options;
+          // Data the tool-call middleware left in the debug summary.
+          const mwOriginalText = debugSummaryRef.originalText;
+          const mwParsedToolCalls = parseDebugToolCalls(debugSummaryRef.toolCalls);
+          logRawToolCalls({ toolCalls, finishReason, text, testCaseId: testCase.id, caseLogs });
+          const possibleAnswer = possibleAnswersMap.get(testCase.id);
+          if (!possibleAnswer) {
+            throw new Error(`No possible answer for id: ${testCase.id}`);
+          }
+          const restoredCalls = restoreToolCalls(toolCalls || [], nameMap, transformedTools);
+          return processValidationResult({
+            checkerResult: check(testCase, restoredCalls, possibleAnswer),
+            testCase,
+            tools,
+            possibleAnswer,
+            restoredCalls,
+            flatMessages,
+            mwOriginalText,
+            text,
+            finishReason,
+            mwParsedToolCalls,
+            caseLogs
+          });
+        };
+        const runSingleCase = async (testCase) => {
+          const caseLogs = [];
+          const { function: tools } = testCase;
+          const temp = config?.temperature;
+          const temperature = typeof temp === "number" ? temp : void 0;
+          const maxTok = config?.maxTokens;
+          const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
+          try {
+            const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
+            logFirstToolDebug(transformedTools, testCase.id, caseLogs);
+            const { toolCalls, text, finishReason, debugSummaryRef } = await executeModelGeneration({
+              model,
+              flatMessages,
+              toolsMap,
+              temperature,
+              maxTokens
             });
-            const checkerResult = check(
+            return processModelResponse({
               testCase,
-              restoredCalls,
-              possibleAnswer
-            );
-            if (checkerResult.valid) {
-              caseLogs.push(`[PASS] ${testCase.id}`);
-              return { valid: true, logs: caseLogs };
-            } else {
-              caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
-              try {
-                let generateParamMismatchDiff2 = function(paramName, allowed, got) {
-                  const diffLines = [];
-                  diffLines.push(`@@ param ${paramName}`);
-                  const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
-                  const expectedLine = (() => {
-                    if (allowedArray.length === 1) {
-                      return `- expected: ${JSON.stringify(allowedArray[0])}`;
-                    }
-                    const formatted = allowedArray.map(
-                      (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
-                    ).join(", ");
-                    return `- expected one of: ${formatted}`;
-                  })();
-                  diffLines.push(expectedLine);
-                  diffLines.push(`+ got: ${JSON.stringify(got)}`);
-                  return diffLines;
-                };
-                var generateParamMismatchDiff = generateParamMismatchDiff2;
-                const category = testCase.id.split("_")[0];
-                const diff = [];
-                const summarizeArgs = (args) => {
-                  if (args == null) return args;
-                  if (typeof args !== "object") return args;
-                  return Object.keys(args).sort().reduce(
-                    (acc, k) => {
-                      acc[k] = args[k];
-                      return acc;
-                    },
-                    {}
-                  );
-                };
-                const expected = {};
-                const actual = {};
-                if (category === "simple") {
-                  const funcDesc = tools[0];
-                  const gt = possibleAnswer.ground_truth?.[0];
-                  const expectedFuncName = funcDesc?.name;
-                  const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
-                  const received = restoredCalls[0];
-                  const receivedName = received?.toolName ?? received?.name;
-                  const receivedArgs = summarizeArgs(received?.args);
-                  expected.function = expectedFuncName;
-                  expected.params = expectedParams;
-                  actual.function = receivedName;
-                  actual.args = receivedArgs;
-                  if (expectedFuncName !== receivedName) {
-                    diff.push(`@@ function name`);
-                    diff.push(`- ${expectedFuncName}`);
-                    diff.push(`+ ${receivedName}`);
-                  }
-                  if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
-                    const required = funcDesc?.parameters?.required ?? [];
-                    for (const req of required) {
-                      if (!(req in receivedArgs)) {
-                        diff.push(`- missing required param: ${req}`);
-                      }
-                    }
-                    for (const k of Object.keys(
-                      receivedArgs
-                    )) {
-                      if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
-                        diff.push(`+ unexpected param: ${k}`);
-                      }
-                    }
-                    for (const k of Object.keys(
-                      receivedArgs
-                    )) {
-                      if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
-                        const allowed = expectedParams[k];
-                        const got = receivedArgs[k];
-                        const includes = Array.isArray(allowed) && allowed.some((v) => {
-                          try {
-                            if (Array.isArray(got)) {
-                              return JSON.stringify(
-                                got.map((x) => String(x)).sort()
-                              ) === JSON.stringify(
-                                v.map((x) => String(x)).sort()
-                              );
-                            }
-                          } catch {
-                          }
-                          return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
-                        });
-                        if (!includes) {
-                          diff.push(
-                            ...generateParamMismatchDiff2(k, allowed, got)
-                          );
-                        }
-                      }
-                    }
-                  }
-                } else {
-                  const gtArr = possibleAnswer.ground_truth ?? [];
-                  const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
-                  const actualNames = restoredCalls.map(
-                    (c) => c.toolName ?? c.name
-                  );
-                  expected.functions = expectedNames;
-                  actual.functions = actualNames;
-                  if (expectedNames.length !== actualNames.length) {
-                    diff.push(`@@ call count`);
-                    diff.push(`- expected ${expectedNames.length}`);
-                    diff.push(`+ got ${actualNames.length}`);
-                  }
-                  const missing = expectedNames.filter(
-                    (n) => !actualNames.includes(n)
-                  );
-                  const extra = actualNames.filter(
-                    (n) => !expectedNames.includes(n)
-                  );
-                  for (const m of missing)
-                    diff.push(`- missing function: ${m}`);
-                  for (const e of extra)
-                    diff.push(`+ unexpected function: ${e}`);
-                  const usedActual = /* @__PURE__ */ new Set();
-                  for (const expectedObj of gtArr) {
-                    const fname = Object.keys(expectedObj)[0];
-                    let matchedIndex = -1;
-                    for (let i = 0; i < restoredCalls.length; i++) {
-                      if (usedActual.has(i)) continue;
-                      const rc = restoredCalls[i];
-                      const rcName = rc?.toolName ?? rc?.name;
-                      if (rcName === fname) {
-                        matchedIndex = i;
-                        break;
-                      }
-                    }
-                    if (matchedIndex === -1) continue;
-                    usedActual.add(matchedIndex);
-                    const received = restoredCalls[matchedIndex];
-                    const receivedArgs = summarizeArgs(received?.args);
-                    const expectedParamsAllowed = expectedObj[fname];
-                    const funcDesc = tools.find(
-                      (t) => t.name === fname
-                    );
-                    const requiredParams = funcDesc?.parameters?.required ?? [];
-                    diff.push(`@@ function ${fname}`);
-                    if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
-                      for (const req of requiredParams) {
-                        if (!(req in receivedArgs)) {
-                          diff.push(`- missing required param: ${req}`);
-                        }
-                      }
-                      for (const k of Object.keys(
-                        receivedArgs
-                      )) {
-                        if (!Object.prototype.hasOwnProperty.call(
-                          expectedParamsAllowed,
-                          k
-                        )) {
-                          diff.push(`+ unexpected param: ${k}`);
-                        }
-                      }
-                      for (const k of Object.keys(
-                        receivedArgs
-                      )) {
-                        if (Object.prototype.hasOwnProperty.call(
-                          expectedParamsAllowed,
-                          k
-                        )) {
-                          const allowed = expectedParamsAllowed[k];
-                          const got = receivedArgs[k];
-                          const includes = Array.isArray(allowed) && allowed.some((v) => {
-                            try {
-                              if (Array.isArray(got)) {
-                                return JSON.stringify(
-                                  got.map((x) => String(x)).sort()
-                                ) === JSON.stringify(
-                                  v.map((x) => String(x)).sort()
-                                );
-                              }
-                            } catch {
-                            }
-                            return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
-                          });
-                          if (!includes) {
-                            diff.push(
-                              ...generateParamMismatchDiff2(k, allowed, got)
-                            );
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-                caseLogs.push(
-                  `[DEBUG-FAIL] ${JSON.stringify({
-                    id: testCase.id,
-                    message: checkerResult.error,
-                    error_type: checkerResult.error_type,
-                    expected,
-                    actual,
-                    diff
-                  })}`
-                );
-                try {
-                  const lastUser = (() => {
-                    const reversed = [...flatMessages].reverse();
-                    const found = reversed.find(
-                      (m) => m.role === "user"
-                    );
-                    return found?.content ?? void 0;
-                  })();
-                  const contextPayload = {
-                    id: testCase.id,
-                    tool_schema: tools,
-                    last_user_query: lastUser,
-                    raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
-                    finish_reason: finishReason,
-                    parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
-                    ground_truth: possibleAnswer.ground_truth
-                  };
-                  caseLogs.push(
-                    `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
-                  );
-                } catch {
-                }
-              } catch {
-                caseLogs.push(
-                  `[DEBUG] ${testCase.id}: failed to build debug diff`
-                );
-              }
-              return { valid: false, logs: caseLogs };
-            }
+              toolCalls,
+              text,
+              finishReason,
+              debugSummaryRef,
+              nameMap,
+              transformedTools,
+              flatMessages,
+              tools,
+              caseLogs
+            });
           } catch (e) {
             caseLogs.push(
               `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
@@ -1133,13 +1052,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             return { valid: false, logs: caseLogs };
           }
         };
-        const mapWithConcurrency = async (items, limit2, mapper) => {
+        const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
           const results = new Array(items.length);
           let idx = 0;
-          const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => {
+          const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
             while (true) {
-              const current = idx++;
-              if (current >= items.length) break;
+              const current = idx;
+              idx += 1;
+              if (current >= items.length) {
+                break;
+              }
               results[current] = await mapper(items[current], current);
             }
           });
@@ -1155,7 +1077,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           (acc, r) => acc + (r.valid ? 1 : 0),
           0
         );
-        for (const r of resultsPerCase) logs.push(...r.logs);
+        for (const r of resultsPerCase) {
+          logs.push(...r.logs);
+        }
         if (testCases.length === 0) {
           return {
             score: 0,
@@ -1182,7 +1106,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           success: false,
           metrics: {},
           error: e,
-          logs: [`[FATAL] Failed to run benchmark ${name}: ${e.message}`]
+          logs: [
+            `[FATAL] Failed to run benchmark ${name}: ${e.message}`
+          ]
         };
       }
     }
@@ -1191,87 +1117,222 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
 var bfclSimpleBenchmark = createBfclBenchmark(
   "bfcl-simple",
   "BFCL Simple Function Calling",
-  "BFCL_v3_simple.json",
-  "BFCL_v3_simple_possible_answer.json"
+  "BFCL_v3_simple.jsonl",
+  "BFCL_v3_simple_possible_answer.jsonl"
 );
 var bfclParallelBenchmark = createBfclBenchmark(
   "bfcl-parallel",
   "BFCL Parallel Function Calling",
-  "BFCL_v3_parallel.json",
-  "BFCL_v3_parallel_possible_answer.json"
+  "BFCL_v3_parallel.jsonl",
+  "BFCL_v3_parallel_possible_answer.jsonl"
 );
 var bfclMultipleBenchmark = createBfclBenchmark(
   "bfcl-multiple",
   "BFCL Multiple Function Calling",
-  "BFCL_v3_multiple.json",
-  "BFCL_v3_multiple_possible_answer.json"
+  "BFCL_v3_multiple.jsonl",
+  "BFCL_v3_multiple_possible_answer.jsonl"
 );
 var bfclParallelMultipleBenchmark = createBfclBenchmark(
   "bfcl-parallel-multiple",
   "BFCL Parallel & Multiple Function Calling",
-  "BFCL_v3_parallel_multiple.json",
-  "BFCL_v3_parallel_multiple_possible_answer.json"
+  "BFCL_v3_parallel_multiple.jsonl",
+  "BFCL_v3_parallel_multiple_possible_answer.jsonl"
 );
 // src/benchmarks/json-generation.ts
-import { generateText as generateText2 } from "ai";
-import Ajv from "ajv";
 import { promises as fs3 } from "fs";
 import path3 from "path";
-function extractFirstJsonBlock(text) {
+import { generateText as generateText2 } from "ai";
+import Ajv from "ajv";
+var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
+var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
+var NEWLINE_REGEX = /\r?\n/;
+var LINE_SPLIT_REGEX2 = /\r?\n/;
+function tryDirectParse(text) {
   try {
     return JSON.parse(text);
   } catch {
+    return;
   }
-  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
-  if (fenceMatch) {
-    const inner = fenceMatch[1].trim();
-    try {
-      return JSON.parse(inner);
-    } catch {
-    }
+}
+function tryCodeFenceParse(text) {
+  const fenceMatch = text.match(JSON_FENCE_REGEX) || text.match(CODE_FENCE_REGEX);
+  if (!fenceMatch) {
+    return;
+  }
+  const inner = fenceMatch[1].trim();
+  try {
+    return JSON.parse(inner);
+  } catch {
+    return;
   }
+}
+function tryBracketScan(text) {
   const startIdxObj = text.indexOf("{");
   const startIdxArr = text.indexOf("[");
   const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
-  if (start === void 0) return void 0;
+  if (start === void 0) {
+    return;
+  }
   const open = text[start] === "{" ? "{" : "[";
   const close = open === "{" ? "}" : "]";
   let depth = 0;
-  for (let i = start; i < text.length; i++) {
+  for (let i = start; i < text.length; i += 1) {
     const ch = text[i];
-    if (ch === open) depth++;
-    else if (ch === close) depth--;
+    if (ch === open) {
+      depth += 1;
+    } else if (ch === close) {
+      depth -= 1;
+    }
     if (depth === 0) {
       const candidate = text.slice(start, i + 1);
       try {
         return JSON.parse(candidate);
       } catch {
+        return;
       }
-      break;
     }
   }
-  return void 0;
+  return;
+}
+function extractFirstJsonBlock(text) {
+  const directResult = tryDirectParse(text);
+  if (directResult !== void 0) {
+    return directResult;
+  }
+  const fenceResult = tryCodeFenceParse(text);
+  if (fenceResult !== void 0) {
+    return fenceResult;
+  }
+  return tryBracketScan(text);
 }
 function subsetMatch(expected, actual) {
   if (expected === null || typeof expected !== "object") {
     return expected === actual;
   }
   if (Array.isArray(expected)) {
-    if (!Array.isArray(actual)) return false;
-    for (let i = 0; i < expected.length; i++) {
-      if (!subsetMatch(expected[i], actual[i])) return false;
+    if (!Array.isArray(actual)) {
+      return false;
+    }
+    for (let i = 0; i < expected.length; i += 1) {
+      if (!subsetMatch(expected[i], actual[i])) {
+        return false;
+      }
     }
     return true;
   }
-  if (actual === null || typeof actual !== "object") return false;
+  if (actual === null || typeof actual !== "object") {
+    return false;
+  }
   const eObj = expected;
   const aObj = actual;
   for (const key of Object.keys(eObj)) {
-    if (!subsetMatch(eObj[key], aObj[key])) return false;
+    if (!subsetMatch(eObj[key], aObj[key])) {
+      return false;
+    }
   }
   return true;
 }
+async function loadDatasets() {
+  try {
+    const dataDir = resolveDataDir();
+    const testsJsonl = await fs3.readFile(
+      path3.join(dataDir, "json_generation_tests.jsonl"),
+      "utf-8"
+    );
+    const expectedJsonl = await fs3.readFile(
+      path3.join(dataDir, "json_generation_expected.jsonl"),
+      "utf-8"
+    );
+    const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+    const expecteds = expectedJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+    const expectedMap = /* @__PURE__ */ new Map();
+    for (const r of expecteds) {
+      expectedMap.set(r.id, r);
+    }
+    return { tests, expectedMap };
+  } catch (e) {
+    return {
+      tests: [],
+      expectedMap: /* @__PURE__ */ new Map(),
+      error: e
+    };
+  }
+}
+function buildMessages(tc) {
+  const schemaStr = JSON.stringify(tc.schema, null, 2);
+  return [
+    {
+      role: "system",
+      content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
+    },
+    {
+      role: "user",
+      content: [
+        "Generate a JSON object that reflects the following facts.",
+        "JSON Schema:",
+        schemaStr,
+        "Facts:",
+        tc.promptFacts,
+        "Output must be a single JSON only, with no additional text."
+      ].join("\n\n")
+    }
+  ];
+}
+function validateTestCase(tc, parsed, context) {
+  const validate = context.ajv.compile(tc.schema);
+  const valid = validate(parsed);
+  if (!valid) {
+    context.logs.push(
+      `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
+    );
+  }
+  const expectedRec = context.expectedMap.get(tc.id);
+  if (!expectedRec) {
+    context.logs.push(
+      `[WARN] ${tc.id}: No expected record found. Skipping value match.`
+    );
+  }
+  const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
+  return { valid, valuesOk, parsed };
+}
+async function processTestCase(tc, context) {
+  const messages = buildMessages(tc);
+  const temp = context.config?.temperature;
+  const temperature = typeof temp === "number" ? temp : void 0;
+  const { text } = await generateText2({
+    model: context.model,
+    messages,
+    ...temperature !== void 0 ? { temperature } : {}
+  });
+  let parsed;
+  try {
+    parsed = extractFirstJsonBlock(text);
+  } catch {
+  }
+  if (parsed === void 0) {
+    context.validation.logs.push(
+      `[FAIL] ${tc.id}: Unable to parse JSON from model output.`
+    );
+    return { schemaValid: false, valueMatch: false, correct: false };
+  }
+  const {
+    valid,
+    valuesOk,
+    parsed: validatedParsed
+  } = validateTestCase(tc, parsed, context.validation);
+  const correct = valid && valuesOk;
+  if (correct) {
+    context.validation.logs.push(`[PASS] ${tc.id}`);
+  } else {
+    context.validation.logs.push(
+      `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
+        validatedParsed
+      )}`
+    );
+  }
+  return { schemaValid: valid, valueMatch: valuesOk, correct };
+}
 var jsonGenerationBenchmark = {
   name: "json-generation",
   version: "2.1.0",
@@ -1279,116 +1340,124 @@ var jsonGenerationBenchmark = {
   async run(model, config) {
     const logs = [];
     const ajv = new Ajv({ allErrors: true, strict: false });
-    let schemaValidCount = 0;
-    let valueMatchCount = 0;
-    let correctCount = 0;
-    let tests = [];
-    const expectedMap = /* @__PURE__ */ new Map();
-    try {
-      const dataDir = resolveDataDir();
-      const testsJsonl = await fs3.readFile(
-        path3.join(dataDir, "json_generation_tests.jsonl"),
-        "utf-8"
-      );
-      const expectedJsonl = await fs3.readFile(
-        path3.join(dataDir, "json_generation_expected.jsonl"),
-        "utf-8"
-      );
-      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
-      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
-      for (const r of expecteds) expectedMap.set(r.id, r);
-    } catch (e) {
-      const msg = e instanceof Error ? e.message : String(e);
+    const { tests, expectedMap, error } = await loadDatasets();
+    if (error) {
       return {
         score: 0,
         success: false,
         metrics: {},
-        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
-        error: e
+        logs: [
+          `[FATAL] Failed to load json-generation datasets: ${error.message}`
+        ],
+        error
       };
     }
-    for (const tc of tests) {
-      try {
-        const schemaStr = JSON.stringify(tc.schema, null, 2);
-        const messages = [
-          {
-            role: "system",
-            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
-          },
-          {
-            role: "user",
-            content: [
-              "Generate a JSON object that reflects the following facts.",
-              "JSON Schema:",
-              schemaStr,
-              "Facts:",
-              tc.promptFacts,
-              "Output must be a single JSON only, with no additional text."
-            ].join("\n\n")
-          }
-        ];
-        const temp = config?.temperature;
-        const temperature = typeof temp === "number" ? temp : void 0;
-        const { text } = await generateText2({
-          model,
-          messages,
-          ...temperature !== void 0 ? { temperature } : {}
-        });
-        let parsed;
-        try {
-          parsed = extractFirstJsonBlock(text);
-        } catch {
-        }
-        if (parsed === void 0) {
-          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
-          continue;
-        }
-        const validate = ajv.compile(tc.schema);
-        const valid = validate(parsed);
-        if (valid) schemaValidCount++;
-        else
-          logs.push(
-            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
-          );
-        const expectedRec = expectedMap.get(tc.id);
-        if (!expectedRec) {
-          logs.push(
-            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
-          );
-        }
-        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
-        if (valuesOk) valueMatchCount++;
-        if (valid && valuesOk) {
-          correctCount++;
-          logs.push(`[PASS] ${tc.id}`);
-        } else {
-          logs.push(
-            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
-              parsed
-            )}`
-          );
-        }
-      } catch (e) {
-        const msg = e instanceof Error ? e.message : String(e);
-        logs.push(`[ERROR] ${tc.id}: ${msg}`);
-      }
-    }
-    const total = tests.length;
-    const score = correctCount / total;
-    return {
-      score,
-      success: score >= 0.8,
-      metrics: {
-        total_cases: total,
-        correct_count: correctCount,
-        schema_valid_count: schemaValidCount,
-        value_match_count: valueMatchCount,
-        accuracy: score
-      },
-      logs
+    const context = {
+      model,
+      config,
+      validation: { expectedMap, ajv, logs }
     };
+    const counts = await processAllTests(tests, context);
+    return buildBenchmarkResult(tests.length, counts, logs);
   }
 };
+async function processAllTests(tests, context) {
+  let schemaValidCount = 0;
+  let valueMatchCount = 0;
+  let correctCount = 0;
+  for (const tc of tests) {
+    try {
+      const result = await processTestCase(tc, context);
+      if (result.schemaValid) {
+        schemaValidCount += 1;
+      }
+      if (result.valueMatch) {
+        valueMatchCount += 1;
+      }
+      if (result.correct) {
+        correctCount += 1;
+      }
+    } catch (e) {
+      const msg = e instanceof Error ? e.message : String(e);
+      context.validation.logs.push(`[ERROR] ${tc.id}: ${msg}`);
+    }
+  }
+  return { schemaValidCount, valueMatchCount, correctCount };
+}
+function buildBenchmarkResult(total, counts, logs) {
+  const score = counts.correctCount / total;
+  return {
+    score,
+    success: score >= 0.8,
+    metrics: {
+      total_cases: total,
+      correct_count: counts.correctCount,
+      schema_valid_count: counts.schemaValidCount,
+      value_match_count: counts.valueMatchCount,
+      accuracy: score
+    },
+    logs
+  };
+}
+async function loadSchemaOnlyTests() {
+  try {
+    const dataDir = resolveDataDir();
+    const testsJsonl = await fs3.readFile(
+      path3.join(dataDir, "json_generation_tests.jsonl"),
+      "utf-8"
+    );
+    const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+    return { tests };
+  } catch (e) {
+    return { tests: [], error: e };
+  }
+}
+async function processSchemaOnlyTestCase(tc, context) {
+  const messages = buildMessages(tc);
+  const temp = context.config?.temperature;
+  const temperature = typeof temp === "number" ? temp : void 0;
+  const { text } = await generateText2({
+    model: context.model,
+    messages,
+    ...temperature !== void 0 ? { temperature } : {}
+  });
+  let parsed;
+  try {
+    parsed = extractFirstJsonBlock(text);
+  } catch {
+  }
+  if (parsed === void 0) {
+    context.logs.push(
+      `[FAIL] ${tc.id}: Could not parse JSON from model output.`
+    );
+    return false;
+  }
+  const validate = context.ajv.compile(tc.schema);
+  const valid = validate(parsed);
+  if (valid) {
+    context.logs.push(`[PASS] ${tc.id}`);
+    return true;
+  }
+  context.logs.push(
+    `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
+  );
+  return false;
+}
+async function runSchemaOnlyTests(tests, context) {
+  let schemaValidCount = 0;
+  for (const tc of tests) {
+    try {
+      const isValid = await processSchemaOnlyTestCase(tc, context);
+      if (isValid) {
+        schemaValidCount += 1;
+      }
+    } catch (e) {
+      const msg = e instanceof Error ? e.message : String(e);
+      context.logs.push(`[ERROR] ${tc.id}: ${msg}`);
+    }
+  }
+  return schemaValidCount;
+}
 var jsonGenerationSchemaOnlyBenchmark = {
   name: "json-generation-schema-only",
   version: "1.0.1",
@@ -1396,76 +1465,19 @@ var jsonGenerationSchemaOnlyBenchmark = {
   async run(model, config) {
     const logs = [];
     const ajv = new Ajv({ allErrors: true, strict: false });
-    let tests = [];
-    try {
-      const dataDir = resolveDataDir();
-      const testsJsonl = await fs3.readFile(
-        path3.join(dataDir, "json_generation_tests.jsonl"),
-        "utf-8"
-      );
-      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
-    } catch (e) {
-      const msg = e instanceof Error ? e.message : String(e);
+    const { tests, error } = await loadSchemaOnlyTests();
+    if (error) {
+      const msg = error.message;
       return {
         score: 0,
         success: false,
         metrics: {},
         logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
-        error: e
+        error
       };
     }
-    let schemaValidCount = 0;
-    for (const tc of tests) {
-      try {
-        const schemaStr = JSON.stringify(tc.schema, null, 2);
-        const messages = [
-          {
-            role: "system",
-            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
-          },
-          {
-            role: "user",
-            content: [
-              "Generate a JSON object that reflects the following facts.",
-              "JSON Schema:",
-              schemaStr,
-              "Facts:",
-              tc.promptFacts,
-              "Output must be a single JSON only, with no additional text."
-            ].join("\n\n")
-          }
-        ];
-        const temp = config?.temperature;
-        const temperature = typeof temp === "number" ? temp : void 0;
-        const { text } = await generateText2({
-          model,
-          messages,
-          ...temperature !== void 0 ? { temperature } : {}
-        });
-        let parsed;
-        try {
-          parsed = extractFirstJsonBlock(text);
-        } catch {
-        }
-        if (parsed === void 0) {
-          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
-          continue;
-        }
-        const validate = ajv.compile(tc.schema);
-        const valid = validate(parsed);
-        if (valid) {
-          schemaValidCount++;
-          logs.push(`[PASS] ${tc.id}`);
-        } else {
-          logs.push(
-            `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
-          );
-        }
-      } catch (e) {
-        const msg = e instanceof Error ? e.message : String(e);
-        logs.push(`[ERROR] ${tc.id}: ${msg}`);
-      }
-    }
+    const context = { model, config, ajv, logs };
+    const schemaValidCount = await runSchemaOnlyTests(tests, context);
     const total = tests.length;
     const score = total > 0 ? schemaValidCount / total : 0;
     return {
@@ -1480,6 +1492,505 @@ var jsonGenerationSchemaOnlyBenchmark = {
     };
   }
 };
+// src/reporters/console.ts
+var colors = { // ANSI escape sequences used by the plain console reporter
+  reset: "\x1B[0m", // appended after every colored span
+  green: "\x1B[32m", // success status
+  red: "\x1B[31m", // failure status and error messages
+  yellow: "\x1B[33m", // score value
+  cyan: "\x1B[36m", // model name
+  magenta: "\x1B[35m", // benchmark name
+  gray: "\x1B[90m" // optional model key
+};
+/**
+ * Print one evaluation result to the console: a header with the model and
+ * benchmark names, the status/score line, then the metrics and the error
+ * message when they are present.
+ *
+ * @param {object} result - entry with `model`, `modelKey`, `benchmark` and `result`.
+ */
+function printResult(result) {
+  const {
+    model: modelName,
+    modelKey: alias,
+    benchmark: benchmarkName,
+    result: outcome
+  } = result;
+  const statusLabel = outcome.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
+  const aliasTag = alias ? ` ${colors.gray}(${alias})${colors.reset}` : "";
+  console.log(
+    `
+ ${colors.cyan}[${modelName}]${colors.reset}${aliasTag} - ${colors.magenta}${benchmarkName}${colors.reset}`
+  );
+  console.log(
+    `  \u2514 ${statusLabel} | Score: ${colors.yellow}${outcome.score.toFixed(2)}${colors.reset}`
+  );
+  const metricEntries = Object.entries(outcome.metrics);
+  if (metricEntries.length > 0) {
+    console.log("    Metrics:");
+    for (const [metricName, metricValue] of metricEntries) {
+      console.log(`      - ${metricName}: ${metricValue}`);
+    }
+  }
+  if (outcome.error) {
+    console.log(
+      `    ${colors.red}Error: ${outcome.error.message}${colors.reset}`
+    );
+  }
+}
+/**
+ * Plain console reporter: prints a banner, one section per result via
+ * printResult, and a closing rule.
+ *
+ * @param {Iterable<object>} results - evaluation results to print.
+ */
+function consoleReporter(results) {
+  const banner = "\n--- \u{1F4CA} Evaluation Report ---";
+  const footer = "\n---------------------------\n";
+  console.log(banner);
+  for (const entry of results) {
+    printResult(entry);
+  }
+  console.log(footer);
+}
+// src/reporters/console.debug.ts
+var FAIL_ID_REGEX = /^\[FAIL\]\s+([^:]+):/; // group 1 = text between "[FAIL] " and the first ":"
+var DEBUG_FAIL_PREFIX_REGEX = /^\[DEBUG-FAIL\] /; // leading "[DEBUG-FAIL] " tag, removed before JSON.parse
+var DEBUG_FAIL_CONTEXT_PREFIX_REGEX = /^\[DEBUG-FAIL-CONTEXT\] /; // leading "[DEBUG-FAIL-CONTEXT] " tag, removed before JSON.parse
+var colors2 = { // ANSI escape sequences used by the debug console reporter
+  reset: "\x1B[0m", // appended after every styled span
+  green: "\x1B[32m", // "+" diff lines, expected values, success status
+  red: "\x1B[31m", // "-" diff lines, actual values, [FAIL] lines
+  yellow: "\x1B[33m", // score value, [ERROR]/[FATAL] lines
+  cyan: "\x1B[36m", // "@" diff headers, model name
+  magenta: "\x1B[35m", // benchmark name
+  gray: "\x1B[90m", // context details, [STACK]/[INFO]/[PASS] lines
+  bold: "\x1B[1m", // headings and failure messages
+  underline: "\x1B[4m" // per-test group headings
+};
+/**
+ * Colorize one diff entry for terminal output.
+ *
+ * Entries starting with "+" are wrapped in green, "-" in red and "@" in
+ * bold cyan; anything else is returned without color.
+ *
+ * The entry is coerced with String() first, matching how the suggestion
+ * helpers read diff entries, so a non-string entry in a parsed payload no
+ * longer throws and aborts the detailed rendering.
+ *
+ * @param {unknown} line - one entry of a parsed `diff` array.
+ * @returns {string} the entry as text, wrapped in color codes when it matches.
+ */
+function colorizeDiffLine(line) {
+  const text = String(line);
+  if (text.startsWith("+")) {
+    return `${colors2.green}${text}${colors2.reset}`;
+  }
+  if (text.startsWith("-")) {
+    return `${colors2.red}${text}${colors2.reset}`;
+  }
+  if (text.startsWith("@")) {
+    return `${colors2.cyan}${colors2.bold}${text}${colors2.reset}`;
+  }
+  return text;
+}
+/**
+ * Return the distinct entries of `lines`, keeping first-occurrence order.
+ *
+ * @param {Iterable<string>} lines
+ * @returns {string[]} a new array without repeated entries.
+ */
+function uniqueLines(lines) {
+  // A Set ignores repeated values and iterates in insertion order.
+  const ordered = new Set();
+  for (const entry of lines) {
+    ordered.add(entry);
+  }
+  return [...ordered];
+}
+/**
+ * Report whether any diff entry mentions a function-name problem, i.e.
+ * contains "function name" or "missing function:".
+ *
+ * @param {unknown[]} diff - diff entries; each one is read via String().
+ * @returns {boolean}
+ */
+function hasFunctionNameIssue(diff) {
+  const markers = ["function name", "missing function:"];
+  return diff.some((entry) => {
+    const text = String(entry);
+    return markers.some((marker) => text.includes(marker));
+  });
+}
+/**
+ * Append function-name suggestions to `suggestions`.
+ *
+ * Adds a "call X instead of Y" hint when both sides name a function and
+ * the names differ, and an "ensure tool calls include" hint when
+ * `expected.functions` is an array.
+ *
+ * @param {object|undefined} expected - expected call description.
+ * @param {object|undefined} actual - actual call description.
+ * @param {string[]} suggestions - output list, mutated in place.
+ */
+function suggestFunctionNameFix(expected, actual, suggestions) {
+  const wanted = expected?.function;
+  const got = actual?.function;
+  const namesDiffer = Boolean(wanted) && Boolean(got) && wanted !== got;
+  if (namesDiffer) {
+    suggestions.push(`Call the function '${wanted}' instead of '${got}'.`);
+  }
+  const wantedList = expected?.functions;
+  if (Array.isArray(wantedList)) {
+    suggestions.push(`Ensure tool calls include: ${wantedList.join(", ")}.`);
+  }
+}
+/**
+ * Collect the parameter names from "- missing required param:" diff
+ * entries and, when there are any, append one combined suggestion.
+ *
+ * @param {unknown[]} diff - diff entries; each one is read via String().
+ * @param {string[]} suggestions - output list, mutated in place.
+ */
+function suggestMissingParamFix(diff, suggestions) {
+  const names = diff.flatMap((entry) => {
+    const text = String(entry);
+    if (!text.startsWith("- missing required param:")) {
+      return [];
+    }
+    return [text.replace("- missing required param: ", "")];
+  });
+  if (names.length > 0) {
+    suggestions.push(`Add required parameter(s): ${names.join(", ")}.`);
+  }
+}
+/**
+ * Collect the parameter names from "+ unexpected param:" diff entries
+ * and, when there are any, append one combined suggestion.
+ *
+ * @param {unknown[]} diff - diff entries; each one is read via String().
+ * @param {string[]} suggestions - output list, mutated in place.
+ */
+function suggestUnexpectedParamFix(diff, suggestions) {
+  const names = diff.flatMap((entry) => {
+    const text = String(entry);
+    if (!text.startsWith("+ unexpected param:")) {
+      return [];
+    }
+    return [text.replace("+ unexpected param: ", "")];
+  });
+  if (names.length > 0) {
+    suggestions.push(`Remove unexpected parameter(s): ${names.join(", ")}.`);
+  }
+}
+/**
+ * Append one value suggestion per "@@ param <name>" header in `diff`.
+ *
+ * The expected value is taken from the entries that follow that header,
+ * up to the next "@@" header. Previously every parameter read the first
+ * "- expected…" entry of the whole diff, so with several mismatched
+ * parameters each one was told to use the first parameter's value.
+ *
+ * Within a section a "- expected:" entry takes precedence over a
+ * "- expected one of:" entry; with neither, a generic hint is added.
+ *
+ * @param {unknown[]} diff - diff entries; each one is read via String().
+ * @param {string[]} suggestions - output list, mutated in place.
+ */
+function suggestParamValueFix(diff, suggestions) {
+  const lines = diff.map((d) => String(d));
+  for (let i = 0; i < lines.length; i++) {
+    if (!lines[i].startsWith("@@ param ")) {
+      continue;
+    }
+    const param = lines[i].replace("@@ param ", "");
+    let singleLine;
+    let oneOfLine;
+    // Scan only this parameter's section of the diff.
+    for (let j = i + 1; j < lines.length && !lines[j].startsWith("@@"); j++) {
+      if (singleLine === void 0 && lines[j].startsWith("- expected:")) {
+        singleLine = lines[j];
+      } else if (oneOfLine === void 0 && lines[j].startsWith("- expected one of:")) {
+        oneOfLine = lines[j];
+      }
+    }
+    if (singleLine !== void 0) {
+      const value = singleLine.replace("- expected: ", "");
+      suggestions.push(`Set '${param}' to: ${value}.`);
+    } else if (oneOfLine !== void 0) {
+      const allowed = oneOfLine.replace("- expected one of: ", "");
+      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
+    } else {
+      suggestions.push(`Adjust '${param}' to an allowed value.`);
+    }
+  }
+}
+/**
+ * Append at most one generic suggestion chosen from the error type.
+ *
+ * The table is checked in order and only the first marker contained in
+ * `error_type` contributes; an unrecognized type adds nothing.
+ *
+ * @param {string} error_type - error classification string.
+ * @param {string[]} suggestions - output list, mutated in place.
+ */
+function suggestFromErrorType(error_type, suggestions) {
+  const hints = [
+    ["missing_required", "Add all required parameters defined by the tool schema."],
+    ["unexpected_param", "Remove parameters not present in the tool schema."],
+    ["wrong_count", "Adjust the number of tool calls to match expected count."],
+    ["wrong_func_name", "Use the exact expected function name from the schema."],
+    ["value_error", "Choose a value from the allowed options."]
+  ];
+  const match = hints.find(([marker]) => error_type.includes(marker));
+  if (match) {
+    suggestions.push(match[1]);
+  }
+}
+/**
+ * Build the list of fix suggestions for a parsed debug-failure payload.
+ *
+ * When the payload carries a `diff` array, specific suggestions are
+ * derived from its entries. If that yields nothing (or there is no diff
+ * array) and `error_type` is a string, one generic suggestion is derived
+ * from the error type instead.
+ *
+ * @param {object|null|undefined} parsed - payload with optional
+ *   `error_type`, `expected`, `actual` and `diff` fields.
+ * @returns {string[]} suggestions without repeated entries.
+ */
+function suggestFixFromDiff(parsed) {
+  const { error_type, expected, actual, diff } = parsed ?? {};
+  const suggestions = [];
+  if (Array.isArray(diff)) {
+    const anyStartsWith = (prefix) => diff.some((d) => String(d).startsWith(prefix));
+    if (hasFunctionNameIssue(diff)) {
+      suggestFunctionNameFix(expected, actual, suggestions);
+    }
+    if (anyStartsWith("- missing required param:")) {
+      suggestMissingParamFix(diff, suggestions);
+    }
+    if (anyStartsWith("+ unexpected param:")) {
+      suggestUnexpectedParamFix(diff, suggestions);
+    }
+    if (anyStartsWith("@@ param ")) {
+      suggestParamValueFix(diff, suggestions);
+    }
+  }
+  const nothingSpecific = suggestions.length === 0;
+  if (nothingSpecific && typeof error_type === "string") {
+    suggestFromErrorType(error_type, suggestions);
+  }
+  return uniqueLines(suggestions);
+}
+/**
+ * Extract the test id from a log line.
+ *
+ * - "[FAIL] <id>: …" lines: the id captured by FAIL_ID_REGEX.
+ * - "[DEBUG-FAIL] {json}" and "[DEBUG-FAIL-CONTEXT] {json}" lines: the
+ *   payload's `id` as a string ("" when the payload has no id).
+ *
+ * @param {string} line
+ * @returns {string|undefined} undefined for any other line, or when the
+ *   regex does not match or the payload is not valid JSON.
+ */
+function getTestIdFromLogLine(line) {
+  if (line.startsWith("[FAIL]")) {
+    return line.match(FAIL_ID_REGEX)?.[1];
+  }
+  let prefixRegex;
+  if (line.startsWith("[DEBUG-FAIL]")) {
+    prefixRegex = DEBUG_FAIL_PREFIX_REGEX;
+  } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
+    prefixRegex = DEBUG_FAIL_CONTEXT_PREFIX_REGEX;
+  } else {
+    return void 0;
+  }
+  try {
+    const payload = JSON.parse(line.replace(prefixRegex, ""));
+    return String(payload?.id ?? "");
+  } catch {
+    // Not a JSON payload: the line has no recoverable id.
+    return void 0;
+  }
+}
+/**
+ * Group failure log lines by the test id they refer to.
+ *
+ * Lines without a usable id are collected under "__general__". That
+ * includes the empty-string id returned for debug payloads that carry no
+ * `id`: previously those formed a separate "" group, which was printed
+ * with a blank heading.
+ *
+ * @param {string[]} failLogs - failure-related log lines.
+ * @returns {Map<string, string[]>} lines per test id, in first-seen order.
+ */
+function groupLogsByTestId(failLogs) {
+  const byId = /* @__PURE__ */ new Map();
+  for (const line of failLogs) {
+    const id = getTestIdFromLogLine(line);
+    // Both undefined and "" mean "no id".
+    const key = id ? id : "__general__";
+    const bucket = byId.get(key);
+    if (bucket) {
+      bucket.push(line);
+    } else {
+      byId.set(key, [line]);
+    }
+  }
+  return byId;
+}
+/**
+ * Collect the ids of all "[DEBUG-FAIL]" lines whose JSON payload has a
+ * truthy `id`. Lines with a payload that is not valid JSON are ignored.
+ *
+ * @param {string[]} lines - log lines of one group.
+ * @returns {Set<string>} ids, stringified.
+ */
+function collectDebugIds(lines) {
+  const ids = new Set();
+  for (const entry of lines) {
+    if (!entry.startsWith("[DEBUG-FAIL]")) {
+      continue;
+    }
+    let payload;
+    try {
+      payload = JSON.parse(entry.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
+    } catch {
+      continue;
+    }
+    if (payload?.id) {
+      ids.add(String(payload.id));
+    }
+  }
+  return ids;
+}
+/**
+ * Print `data` as pretty JSON after `prefix`, wrapped in `color`, with
+ * continuation lines indented by twelve spaces.
+ *
+ * JSON.stringify returns undefined (not a string) for values such as
+ * `undefined` or functions; those are printed via String() instead of
+ * throwing on `.split`.
+ *
+ * @param {string} prefix - text printed before the JSON.
+ * @param {unknown} data - value to render.
+ * @param {string} color - escape sequence emitted before the prefix.
+ */
+function printIndentedJson(prefix, data, color) {
+  const json = JSON.stringify(data, null, 2) ?? String(data);
+  const body = json.split("\n").join("\n            ");
+  console.log(color + prefix + body + colors2.reset);
+}
+/**
+ * Render one "[DEBUG-FAIL] {json}" log line.
+ *
+ * Prints the payload's message (if any), then either the colorized diff
+ * entries or, when there is no diff array, the expected and actual
+ * values, followed by any suggested fixes. If anything in that sequence
+ * throws (for example the payload is not valid JSON) the raw line is
+ * printed instead.
+ *
+ * @param {string} line - the full log line including its tag.
+ */
+function displayDebugFailLine(line) {
+  const payload = line.replace(DEBUG_FAIL_PREFIX_REGEX, "");
+  try {
+    const record = JSON.parse(payload);
+    const { message, diff, expected, actual } = record;
+    if (message) {
+      console.log(`        ${colors2.bold}${message}${colors2.reset}`);
+    }
+    if (Array.isArray(diff)) {
+      for (const entry of diff) {
+        console.log(`          ${colorizeDiffLine(entry)}`);
+      }
+    } else {
+      console.log("          expected:");
+      printIndentedJson("            ", expected, colors2.green);
+      console.log("          actual:");
+      printIndentedJson("            ", actual, colors2.red);
+    }
+    const fixes = suggestFixFromDiff(record);
+    if (fixes.length > 0) {
+      console.log(`          ${colors2.bold}Suggested fix:${colors2.reset}`);
+      for (const fix of fixes) {
+        console.log(`            \u2022 ${fix}`);
+      }
+    }
+  } catch {
+    // Fall back to the raw line when the payload cannot be rendered.
+    console.log(`        ${line}`);
+  }
+}
+/**
+ * Print the truthy fields of a failure-context object in gray, in a
+ * fixed order: tool schema, last user query, raw model text, parsed tool
+ * calls, ground truth, finish reason.
+ *
+ * @param {object} ctx - parsed "[DEBUG-FAIL-CONTEXT]" payload.
+ */
+function displayContextInfo(ctx) {
+  const grayLine = (text) => console.log(colors2.gray + text + colors2.reset);
+  const grayJson = (label, value) => printIndentedJson(label, value, colors2.gray);
+  if (ctx.tool_schema) {
+    grayJson("          tool schema: ", ctx.tool_schema);
+  }
+  if (ctx.last_user_query) {
+    grayLine("          last user: " + JSON.stringify(ctx.last_user_query));
+  }
+  if (ctx.raw_model_text) {
+    const indented = String(ctx.raw_model_text).split("\n").join("\n            ");
+    grayLine("          raw model text (middleware parsed):\n            " + indented);
+  }
+  if (ctx.parsed_tool_calls) {
+    grayJson("          parsed tool calls: ", ctx.parsed_tool_calls);
+  }
+  if (ctx.ground_truth) {
+    grayJson("          ground truth: ", ctx.ground_truth);
+  }
+  if (ctx.finish_reason) {
+    grayLine("          finish reason: " + JSON.stringify(ctx.finish_reason));
+  }
+}
+/**
+ * Render one "[DEBUG-FAIL-CONTEXT] {json}" log line: a "context:"
+ * heading followed by the context fields. If parsing or rendering
+ * throws, the raw line is printed instead.
+ *
+ * @param {string} line - the full log line including its tag.
+ */
+function displayDebugFailContextLine(line) {
+  const json = line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "");
+  try {
+    const context = JSON.parse(json);
+    console.log(`        ${colors2.gray}context:${colors2.reset}`);
+    displayContextInfo(context);
+  } catch {
+    // Fall back to the raw line when the payload cannot be rendered.
+    console.log(`        ${line}`);
+  }
+}
+/**
+ * Print one failure-related log line according to its tag.
+ *
+ * A "[FAIL]" line is skipped when its id is in `debugIds`, since the
+ * matching "[DEBUG-FAIL]" line is rendered in detail. Lines with any
+ * other tag than the ones handled below print nothing.
+ *
+ * @param {string} line - log line.
+ * @param {Set<string>} debugIds - ids that have a "[DEBUG-FAIL]" line.
+ */
+function displayLogLine(line, debugIds) {
+  const paint = (color) => console.log(`        ${color}${line}${colors2.reset}`);
+  if (line.startsWith("[FAIL]")) {
+    const failId = line.match(FAIL_ID_REGEX)?.[1];
+    const coveredByDebug = Boolean(failId) && debugIds.has(failId);
+    if (!coveredByDebug) {
+      paint(colors2.red);
+    }
+    return;
+  }
+  if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
+    paint(colors2.yellow);
+    return;
+  }
+  if (line.startsWith("[STACK]")) {
+    paint(colors2.gray);
+    return;
+  }
+  if (line.startsWith("[DEBUG-FAIL]")) {
+    displayDebugFailLine(line);
+    return;
+  }
+  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
+    displayDebugFailContextLine(line);
+  }
+}
+/**
+ * Print the failure details, one group per test id. Every group except
+ * "__general__" gets an underlined heading with its id.
+ *
+ * @param {Iterable<[string, string[]]>} byId - grouped failure lines.
+ */
+function displayGroupedFailures(byId) {
+  console.log(`    ${colors2.bold}Failure details (grouped):${colors2.reset}`);
+  for (const [testId, groupLines] of byId) {
+    const isGeneral = testId === "__general__";
+    if (!isGeneral) {
+      console.log(`      ${colors2.underline}${testId}${colors2.reset}`);
+    }
+    const idsWithDebug = collectDebugIds(groupLines);
+    for (const entry of groupLines) {
+      displayLogLine(entry, idsWithDebug);
+    }
+  }
+}
+/**
+ * Print the "[INFO]" and "[PASS]" lines of `logs` in gray; all other
+ * lines are ignored.
+ *
+ * @param {string[]} logs - benchmark log lines.
+ */
+function displaySuccessLogs(logs) {
+  const isInformational = (entry) => entry.startsWith("[INFO]") || entry.startsWith("[PASS]");
+  for (const entry of logs.filter(isInformational)) {
+    console.log(`      ${colors2.gray}${entry}${colors2.reset}`);
+  }
+}
+/**
+ * Keep only the log lines that start with one of the failure-related
+ * tags.
+ *
+ * @param {string[]} logs - benchmark log lines.
+ * @returns {string[]} the matching lines, in their original order.
+ */
+function filterFailureLogs(logs) {
+  const failureTags = [
+    "[FAIL]",
+    "[ERROR]",
+    "[FATAL]",
+    "[STACK]",
+    "[DEBUG-FAIL]",
+    "[DEBUG-FAIL-CONTEXT]"
+  ];
+  return logs.filter((entry) => failureTags.some((tag) => entry.startsWith(tag)));
+}
+/**
+ * Print the logs of one result: grouped failure details when there is
+ * at least one failure-related line, otherwise the informational lines.
+ *
+ * @param {string[]} logs - benchmark log lines.
+ */
+function displayResultLogs(logs) {
+  const failures = filterFailureLogs(logs);
+  if (failures.length === 0) {
+    displaySuccessLogs(logs);
+    return;
+  }
+  displayGroupedFailures(groupLogsByTestId(failures));
+}
+/**
+ * Print a "Metrics:" heading followed by one "- name: value" line per
+ * entry. Prints nothing when there are no entries.
+ *
+ * @param {Array<[string, unknown]>} metrics - [name, value] pairs.
+ */
+function displayMetrics(metrics) {
+  if (!(metrics.length > 0)) {
+    return;
+  }
+  console.log("    Metrics:");
+  for (const [name, value] of metrics) {
+    console.log(`      - ${name}: ${value}`);
+  }
+}
+/**
+ * Print the two header lines of a result: "[model] (key) - benchmark"
+ * and the status with the score formatted to two decimals.
+ *
+ * @param {object} r - entry with `model`, `modelKey`, `benchmark` and `result`.
+ */
+function displayResultHeader(r) {
+  const {
+    model: modelName,
+    modelKey: alias,
+    benchmark: benchmarkName,
+    result: outcome
+  } = r;
+  const statusLabel = outcome.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
+  const aliasTag = alias ? ` ${colors2.gray}(${alias})${colors2.reset}` : "";
+  console.log(
+    `
+ ${colors2.cyan}[${modelName}]${colors2.reset}${aliasTag} - ${colors2.magenta}${benchmarkName}${colors2.reset}`
+  );
+  console.log(
+    `  \u2514 ${statusLabel} | Score: ${colors2.yellow}${outcome.score.toFixed(2)}${colors2.reset}`
+  );
+}
+/**
+ * Debug console reporter: for every result prints the header, the
+ * metrics and, when the result has a non-empty `logs` array, the logs.
+ *
+ * @param {Iterable<object>} results - evaluation results to print.
+ */
+function consoleDebugReporter(results) {
+  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
+  for (const entry of results) {
+    displayResultHeader(entry);
+    displayMetrics(Object.entries(entry.result.metrics));
+    const logs = entry.result.logs;
+    if (logs?.length) {
+      displayResultLogs(logs);
+    }
+  }
+  console.log("\n------------------------------------\n");
+}
+// src/reporters/json.ts
+/**
+ * JSON reporter: prints all results as one pretty-printed JSON array.
+ * Each result's `error` is replaced by its message, so that it survives
+ * serialization.
+ *
+ * @param {object[]} results - evaluation results to print.
+ */
+function jsonReporter(results) {
+  const toSerializable = (entry) => {
+    const outcome = entry.result;
+    return {
+      ...entry,
+      result: { ...outcome, error: outcome.error?.message }
+    };
+  };
+  console.log(JSON.stringify(results.map(toSerializable), null, 2));
+}
+// src/reporters/index.ts
+var reporters = { // reporter name -> function that prints an array of results
+  console: consoleReporter, // also the fallback for unknown names
+  json: jsonReporter,
+  "console.debug": consoleDebugReporter
+};
+// src/evaluate.ts
+/**
+ * Run one benchmark against one model and wrap the outcome.
+ *
+ * Progress is logged before and after the run. A failure of the run is
+ * not rethrown: it is logged with console.error and turned into a result
+ * with score 0, `success: false`, empty metrics and the error attached.
+ *
+ * @param {unknown} model - model; its string `modelId` is used as the
+ *   display name, otherwise "unknown-model".
+ * @param {object} benchmark - object with `name` and `run(model, config)`.
+ * @param {string|undefined} modelKey - optional key identifying the model.
+ * @param {object|undefined} config - forwarded to `benchmark.run`.
+ * @returns {Promise<object>} `{ model, modelKey, benchmark, result }`.
+ */
+async function runSingleBenchmark(model, benchmark, modelKey, config) {
+  const hasModelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string";
+  const modelId = hasModelId ? model.modelId : "unknown-model";
+  const label = `[${modelId}]${modelKey ? ` (${modelKey})` : ""}`;
+  let result;
+  try {
+    console.log(`${label} Running benchmark: ${benchmark.name}...`);
+    result = await benchmark.run(model, config);
+    console.log(
+      `${label} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
+    );
+  } catch (error) {
+    console.error(`${label} Error running benchmark: ${benchmark.name}`, error);
+    return {
+      model: modelId,
+      modelKey,
+      benchmark: benchmark.name,
+      result: {
+        score: 0,
+        success: false,
+        metrics: {},
+        error: error instanceof Error ? error : new Error(String(error))
+      }
+    };
+  }
+  return {
+    model: modelId,
+    modelKey,
+    benchmark: benchmark.name,
+    result
+  };
+}
+/**
+ * Normalize the `models` option into a list of [key, model] pairs.
+ *
+ * - an array of models     -> every model paired with an undefined key
+ * - a single model object  -> one pair with an undefined key (detected
+ *   by the presence of a `modelId` property)
+ * - any other object       -> its own enumerable entries, keyed by name
+ *
+ * @param {unknown} models
+ * @returns {Array<[string|undefined, unknown]>}
+ */
+function normalizeModels(models) {
+  if (Array.isArray(models)) {
+    return Array.from(models, (m) => [void 0, m]);
+  }
+  const isSingleModel = typeof models === "object" && models !== null && "modelId" in models;
+  if (isSingleModel) {
+    return [[void 0, models]];
+  }
+  return Object.entries(models).map(([key, m]) => [key, m]);
+}
+/**
+ * Build the benchmark config from the optional settings.
+ *
+ * @param {number|undefined} temperature
+ * @param {number|undefined} maxTokens
+ * @returns {object|undefined} an object holding only the settings that
+ *   are not undefined, or undefined when neither was given.
+ */
+function buildConfig(temperature, maxTokens) {
+  const hasTemperature = temperature !== void 0;
+  const hasMaxTokens = maxTokens !== void 0;
+  if (!hasTemperature && !hasMaxTokens) {
+    return void 0;
+  }
+  return {
+    ...hasTemperature ? { temperature } : {},
+    ...hasMaxTokens ? { maxTokens } : {}
+  };
+}
+/**
+ * Run the reporter registered under `reporter`, falling back to the
+ * console reporter (with a warning) for an unknown name.
+ *
+ * Only own properties of the registry count as reporters. A plain
+ * `reporters[reporter]` lookup also resolves inherited members such as
+ * "toString" or "constructor", which would then be invoked as a reporter
+ * and silently produce no report.
+ *
+ * @param {string} reporter - reporter name.
+ * @param {object[]} results - evaluation results to report.
+ */
+function executeReporter(reporter, results) {
+  const isRegistered = Object.prototype.hasOwnProperty.call(reporters, reporter);
+  const report = isRegistered ? reporters[reporter] : void 0;
+  if (typeof report === "function") {
+    report(results);
+  } else {
+    console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
+    reporters.console(results);
+  }
+}
+/**
+ * Run every benchmark against every model, report the results and
+ * return them.
+ *
+ * Runs are awaited one at a time, models in the outer loop and
+ * benchmarks in the inner loop, so results appear in that order.
+ *
+ * @param {object} options
+ * @param {unknown} options.models - array of models, a single model, or
+ *   an object of named models.
+ * @param {object[]} options.benchmarks - benchmarks to run.
+ * @param {string} [options.reporter="console"] - reporter name.
+ * @param {number} [options.temperature] - forwarded in the config.
+ * @param {number} [options.maxTokens] - forwarded in the config.
+ * @returns {Promise<object[]>} one entry per (model, benchmark) pair.
+ */
+async function evaluate(options) {
+  const { models, benchmarks, reporter = "console", temperature, maxTokens } = options;
+  const entries = normalizeModels(models);
+  const sharedConfig = buildConfig(temperature, maxTokens);
+  const collected = [];
+  for (const [key, candidate] of entries) {
+    for (const bench of benchmarks) {
+      collected.push(
+        await runSingleBenchmark(candidate, bench, key, sharedConfig)
+      );
+    }
+  }
+  executeReporter(reporter, collected);
+  return collected;
+}
 export {
   bfclMultipleBenchmark,
   bfclParallelBenchmark,