npm - @ai-sdk-tool/eval - Versions diffs - 0.1.2 → 0.1.3 - Mend

@ai-sdk-tool/eval 0.1.2 → 0.1.3

This diff shows the changes between two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -47,14 +47,15 @@ var colors = {
   red: "\x1B[31m",
   yellow: "\x1B[33m",
   cyan: "\x1B[36m",
-  magenta: "\x1B[35m"
+  magenta: "\x1B[35m",
+  gray: "\x1B[90m"
 };
 function printResult(result) {
-  const { model, benchmark, result: benchmarkResult } = result;
+  const { model, modelKey, benchmark, result: benchmarkResult } = result;
   const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
   console.log(
     `
- ${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`
+ ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
   );
   console.log(
     `  \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
@@ -80,6 +81,186 @@ function consoleReporter(results) {
   console.log("\n---------------------------\n");
 }
+// src/reporters/console.debug.ts
+var colors2 = { // Terminal escape sequences used by the debug reporter functions below to style console output.
+  reset: "\x1B[0m", // appended after styled text to end the styling
+  green: "\x1B[32m",
+  red: "\x1B[31m",
+  yellow: "\x1B[33m",
+  cyan: "\x1B[36m",
+  magenta: "\x1B[35m",
+  gray: "\x1B[90m",
+  bold: "\x1B[1m",
+  underline: "\x1B[4m"
+};
+function colorizeDiffLine(line) { // Colors one diff line according to its first character; a line with no recognized marker is returned unchanged.
+  if (line.startsWith("+")) return `${colors2.green}${line}${colors2.reset}`; // "+" lines: green
+  if (line.startsWith("-")) return `${colors2.red}${line}${colors2.reset}`; // "-" lines: red
+  if (line.startsWith("@"))
+    return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`; // "@" lines (section markers such as "@@ param x"): bold cyan
+  return line; // NOTE(review): `line` must be a string; a non-string entry throws on startsWith above
+}
+function uniqueLines(lines) { // Returns a new array with duplicate entries removed, keeping the first occurrence of each in input order.
+  const seen = /* @__PURE__ */ new Set(); // entries already copied to `out`
+  const out = [];
+  for (const l of lines) {
+    if (seen.has(l)) continue; // repeat of an earlier entry: skip it
+    seen.add(l);
+    out.push(l);
+  }
+  return out;
+}
+function suggestFixFromDiff(parsed) { // Derives fix suggestions from a parsed failure payload ({ error_type, expected, actual, diff }); returns a de-duplicated array of strings, possibly empty.
+  const suggestions = [];
+  const { error_type, expected, actual, diff } = parsed ?? {}; // a null/undefined payload is treated as an empty object
+  if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) { // function-level problems; NOTE(review): `.some` is called on any truthy diff, so a non-array diff throws here
+    const expectedName = expected?.function;
+    const actualName = actual?.function;
+    if (expectedName && actualName && expectedName !== actualName) { // single-function shape: both names known and different
+      suggestions.push(
+        `Call the function '${expectedName}' instead of '${actualName}'.`
+      );
+    }
+    if (Array.isArray(expected?.functions)) { // multi-function shape: list every expected function name
+      suggestions.push(
+        `Ensure tool calls include: ${expected.functions.join(", ")}.`
+      );
+    }
+  }
+  if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
+    const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", "")); // strip the prefix, leaving the parameter names
+    if (missing.length) {
+      suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
+    }
+  }
+  if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
+    const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", "")); // strip the prefix, leaving the parameter names
+    if (extras.length) {
+      suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
+    }
+  }
+  if (diff && diff.some((d) => d.startsWith("@@ param "))) { // one "@@ param <name>" entry per parameter whose value was rejected
+    const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
+    for (const param of targets) {
+      const allowedLine = diff.find( // NOTE(review): returns the FIRST "- expected one of:" line in the whole diff, regardless of `param`; with several mismatched params every suggestion repeats the first one's allowed values
+        (d) => d.startsWith("- expected one of:")
+      );
+      if (allowedLine) {
+        const allowed = allowedLine.replace("- expected one of: ", "");
+        suggestions.push(`Set '${param}' to one of: ${allowed}.`);
+      } else {
+        suggestions.push(`Adjust '${param}' to an allowed value.`);
+      }
+    }
+  }
+  if (suggestions.length === 0 && typeof error_type === "string") { // fallback: generic advice chosen by substring of error_type, used only when the diff produced nothing
+    if (error_type.includes("missing_required")) {
+      suggestions.push(
+        "Add all required parameters defined by the tool schema."
+      );
+    } else if (error_type.includes("unexpected_param")) {
+      suggestions.push("Remove parameters not present in the tool schema.");
+    } else if (error_type.includes("wrong_count")) {
+      suggestions.push(
+        "Adjust the number of tool calls to match expected count."
+      );
+    } else if (error_type.includes("wrong_func_name")) {
+      suggestions.push("Use the exact expected function name from the schema.");
+    } else if (error_type.includes("value_error")) {
+      suggestions.push("Choose a value from the allowed options.");
+    }
+  }
+  return uniqueLines(suggestions); // drop repeated suggestions, keeping first-seen order
+}
+function consoleDebugReporter(results) { // Prints a verbose report for each result: status and score, metrics, then either failure details (with diffs and suggested fixes) or the [INFO]/[PASS] logs.
+  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
+  for (const r of results) {
+    const { model, modelKey, benchmark, result } = r;
+    const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
+    console.log(
+      `
+ ${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
+    ); // header: [model], then "(modelKey)" only when modelKey is truthy, then the benchmark name
+    console.log(
+      `  \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
+    );
+    const metrics = Object.entries(result.metrics); // NOTE(review): throws if result.metrics is null/undefined — confirm every result carries a metrics object
+    if (metrics.length > 0) {
+      console.log("    Metrics:");
+      for (const [k, v] of metrics) console.log(`      - ${k}: ${v}`);
+    }
+    if (result.logs && result.logs.length) {
+      const failLogs = result.logs.filter( // keep only failure-type entries, selected by their bracketed prefix
+        (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
+      );
+      const hasFails = failLogs.length > 0;
+      if (hasFails) {
+        console.log(`    ${colors2.bold}Failure details:${colors2.reset}`);
+        const debugIds = /* @__PURE__ */ new Set(); // ids that have a parseable [DEBUG-FAIL] entry
+        for (const l of failLogs) { // first pass: collect those ids
+          if (l.startsWith("[DEBUG-FAIL]")) {
+            try {
+              const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
+              if (parsed?.id) debugIds.add(String(parsed.id));
+            } catch { // unparseable payload: ignored in this pass; the second pass prints such a line raw
+            }
+          }
+        }
+        for (const line of failLogs) { // second pass: print each entry, styled by its prefix
+          if (line.startsWith("[FAIL]")) {
+            const m = line.match(/^\[FAIL\]\s+([^:]+):/); // id = text between "[FAIL] " and the first ":"
+            const failId = m?.[1];
+            if (failId && debugIds.has(failId)) continue; // skipped: the [DEBUG-FAIL] entry with the same id is printed in detail below
+            console.log(`      ${colors2.red}${line}${colors2.reset}`);
+          } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
+            console.log(`      ${colors2.yellow}${line}${colors2.reset}`);
+          } else if (line.startsWith("[STACK]")) {
+            console.log(`      ${colors2.gray}${line}${colors2.reset}`);
+          } else if (line.startsWith("[DEBUG-FAIL]")) {
+            const payload = line.replace(/^\[DEBUG-FAIL\] /, ""); // the remainder is expected to be a JSON object
+            try {
+              const parsed = JSON.parse(payload);
+              const { id, expected, actual, message, diff } = parsed;
+              console.log(
+                `      ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
+              );
+              if (diff && Array.isArray(diff)) { // preferred view: the precomputed diff, one colorized line per entry
+                for (const dLine of diff)
+                  console.log("        " + colorizeDiffLine(dLine));
+              } else { // no diff array: dump expected and actual as indented JSON instead
+                console.log("        expected:");
+                console.log(
+                  colors2.green + "          " + JSON.stringify(expected, null, 2).split("\n").join("\n          ") + colors2.reset
+                ); // NOTE(review): JSON.stringify returns undefined for an undefined value, so .split would throw and control would jump to the catch below
+                console.log("        actual:");
+                console.log(
+                  colors2.red + "          " + JSON.stringify(actual, null, 2).split("\n").join("\n          ") + colors2.reset
+                );
+              }
+              const suggestions = suggestFixFromDiff(parsed);
+              if (suggestions.length) {
+                console.log(
+                  `        ${colors2.bold}Suggested fix:${colors2.reset}`
+                );
+                for (const s of suggestions) console.log(`          \u2022 ${s}`);
+              }
+            } catch { // parsing or rendering failed: fall back to the raw log line (anything already printed for this entry stays)
+              console.log(`      ${line}`);
+            }
+          }
+        }
+      } else { // no failure-type entries: show only the [INFO] and [PASS] lines, in gray
+        const info = result.logs.filter(
+          (l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
+        );
+        for (const line of info)
+          console.log(`      ${colors2.gray}${line}${colors2.reset}`);
+      }
+    }
+  }
+  console.log("\n------------------------------------\n");
+}
 // src/reporters/json.ts
 function jsonReporter(results) {
   const serializableResults = results.map((r) => ({
@@ -95,30 +276,35 @@ function jsonReporter(results) {
 // src/reporters/index.ts
 var reporters = {
   console: consoleReporter,
-  json: jsonReporter
+  json: jsonReporter,
+  "console.debug": consoleDebugReporter
 };
 // src/evaluate.ts
-async function runSingleBenchmark(model, benchmark) {
+async function runSingleBenchmark(model, benchmark, modelKey) {
   const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
   try {
-    console.log(`[${modelId}] Running benchmark: ${benchmark.name}...`);
+    console.log(
+      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
+    );
     const result = await benchmark.run(model);
     console.log(
-      `[${modelId}] Finished benchmark: ${benchmark.name}. Score: ${result.score}`
+      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
     );
     return {
       model: modelId,
+      modelKey,
       benchmark: benchmark.name,
       result
     };
   } catch (error) {
     console.error(
-      `[${modelId}] Error running benchmark: ${benchmark.name}`,
+      `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
       error
     );
     return {
       model: modelId,
+      modelKey,
       benchmark: benchmark.name,
       result: {
         score: 0,
@@ -131,11 +317,26 @@ async function runSingleBenchmark(model, benchmark) {
 }
 async function evaluate(options) {
   const { models, benchmarks, reporter = "console" } = options;
-  const modelsArray = Array.isArray(models) ? models : [models];
+  const modelEntries = [];
+  if (Array.isArray(models)) {
+    for (const m of models) modelEntries.push([void 0, m]);
+  } else if (typeof models === "object" && models !== null && "modelId" in models) {
+    modelEntries.push([void 0, models]);
+  } else {
+    for (const [key, m] of Object.entries(
+      models
+    )) {
+      modelEntries.push([key, m]);
+    }
+  }
   const allResults = [];
-  for (const model of modelsArray) {
+  for (const [modelKey, model] of modelEntries) {
     for (const benchmark of benchmarks) {
-      const evaluationResult = await runSingleBenchmark(model, benchmark);
+      const evaluationResult = await runSingleBenchmark(
+        model,
+        benchmark,
+        modelKey
+      );
       allResults.push(evaluationResult);
     }
   }
@@ -478,7 +679,9 @@ function checkStringValue(param, modelValue, possibleAnswers) {
   if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
     return {
       valid: false,
-      error: `Invalid value for parameter '${param}': '${modelValue}'. Expected one of ${possibleAnswers.join(", ")}.`,
+      error: `Invalid value for parameter '${param}': ${JSON.stringify(
+        modelValue
+      )}. Expected one of ${JSON.stringify(possibleAnswers)}.`,
       error_type: "value_error:string"
     };
   }
@@ -532,15 +735,55 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
       if (!hasMatch) {
         return {
           valid: false,
-          error: `Invalid value for list parameter '${paramName}'.`,
+          error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
+            modelValue
+          )}. Expected one of ${JSON.stringify(possibleValues)}.`,
           error_type: "value_error:list"
         };
       }
     } else {
-      if (!possibleValues.includes(modelValue)) {
+      const hasMatch = possibleValues.some((possibleValue) => {
+        if (modelValue === possibleValue) return true;
+        if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
+          try {
+            const normalizeObject = (obj) => { // Recursively copies `obj`, replacing any property whose value is a one-element array holding a primitive or null with that element.
+              if (Array.isArray(obj)) {
+                return obj.map(normalizeObject); // arrays are normalized element by element; the array itself is never unwrapped in this branch
+              }
+              if (obj && typeof obj === "object") {
+                const normalized = {}; // rebuilt in Object.entries order, so key order follows the input
+                for (const [key, value] of Object.entries(obj)) {
+                  if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
+                    normalized[key] = value[0]; // e.g. { a: ["x"] } becomes { a: "x" }
+                  } else {
+                    normalized[key] = normalizeObject(value); // anything else (longer arrays, nested objects, scalars) is handled recursively
+                  }
+                }
+                return normalized;
+              }
+              return obj; // primitives and null pass through unchanged
+            };
+            const normalizedModel = normalizeObject(modelValue);
+            const normalizedPossible = normalizeObject(possibleValue);
+            return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
+          } catch {
+            return false;
+          }
+        }
+        if (typeof modelValue === "number" && typeof possibleValue === "string") {
+          return modelValue.toString() === possibleValue;
+        }
+        if (typeof modelValue === "string" && typeof possibleValue === "number") {
+          return modelValue === possibleValue.toString();
+        }
+        return false;
+      });
+      if (!hasMatch) {
         return {
           valid: false,
-          error: `Invalid value for parameter '${paramName}': got '${modelValue}', expected one of '${possibleValues}'.`,
+          error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
+            modelValue
+          )}. Expected one of ${JSON.stringify(possibleValues)}.`,
           error_type: "value_error:other"
         };
       }
@@ -636,7 +879,8 @@ function check(testCase, modelOutput, possibleAnswer) {
       if (!modelOutput || modelOutput.length !== 1) {
         return {
           valid: false,
-          error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`
+          error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
+          error_type: "simple:wrong_count"
         };
       }
       return simpleFunctionChecker(
@@ -665,7 +909,11 @@ function check(testCase, modelOutput, possibleAnswer) {
     }
     return { valid: true };
   } catch (e) {
-    return { valid: false, error: `Checker Error: ${e.message}` };
+    return {
+      valid: false,
+      error: `Checker Error: ${e.message}`,
+      error_type: "checker_error"
+    };
   }
 }
 function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
@@ -717,7 +965,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           if (copy.items) copy.items = fixSchema(copy.items);
           return copy;
         };
-        for (const testCase of testCases) {
+        const concurrencyEnv = process.env.BFCL_CONCURRENCY;
+        const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
+        logs.push(
+          `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
+        );
+        const runSingleCase = async (testCase) => {
+          const caseLogs = [];
           const { function: tools, question: messages } = testCase;
           try {
             const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
@@ -750,11 +1004,11 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             try {
               const firstTool = transformedTools[0];
               const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
-              logs.push(
+              caseLogs.push(
                 `[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
               );
             } catch (e) {
-              logs.push(
+              caseLogs.push(
                 `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
               );
             }
@@ -762,14 +1016,22 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
               model,
               messages: flatMessages,
               tools: toolsMap,
-              toolChoice: "auto"
+              toolChoice: "auto",
+              // Pass original schema information to middleware
+              providerOptions: {
+                toolCallMiddleware: {
+                  originalToolSchemas: Object.fromEntries(
+                    transformedTools.map((t) => [t.name, t.inputSchema])
+                  )
+                }
+              }
             });
             try {
-              logs.push(
+              caseLogs.push(
                 `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
               );
             } catch {
-              logs.push(
+              caseLogs.push(
                 `[DEBUG] ${testCase.id}: failed to serialize toolCalls`
               );
             }
@@ -802,20 +1064,221 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
               possibleAnswer
             );
             if (checkerResult.valid) {
-              correctCount++;
-              logs.push(`[PASS] ${testCase.id}`);
+              caseLogs.push(`[PASS] ${testCase.id}`);
+              return { valid: true, logs: caseLogs };
             } else {
-              logs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
+              caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
+              try {
+                const category = testCase.id.split("_")[0];
+                const diff = [];
+                const summarizeArgs = (args) => { // Returns a shallow copy of `args` with its keys inserted in sorted order; null, undefined and non-objects are returned as-is.
+                  if (args == null) return args; // covers both null and undefined
+                  if (typeof args !== "object") return args;
+                  return Object.keys(args).sort().reduce((acc, k) => { // NOTE(review): arrays are not special-cased, so an array argument comes back as a plain object keyed by index
+                    acc[k] = args[k]; // values are copied by reference, not cloned
+                    return acc;
+                  }, {});
+                };
+                const expected = {};
+                const actual = {};
+                if (category === "simple") {
+                  const funcDesc = tools[0];
+                  const gt = possibleAnswer.ground_truth?.[0];
+                  const expectedFuncName = funcDesc?.name;
+                  const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
+                  const received = restoredCalls[0];
+                  const receivedName = received?.toolName ?? received?.name;
+                  const receivedArgs = summarizeArgs(received?.args);
+                  expected.function = expectedFuncName;
+                  expected.params = expectedParams;
+                  actual.function = receivedName;
+                  actual.args = receivedArgs;
+                  if (expectedFuncName !== receivedName) {
+                    diff.push(`@@ function name`);
+                    diff.push(`- ${expectedFuncName}`);
+                    diff.push(`+ ${receivedName}`);
+                  }
+                  if (expectedParams && receivedArgs) {
+                    const required = funcDesc?.parameters?.required ?? [];
+                    for (const req of required) {
+                      if (!(req in receivedArgs)) {
+                        diff.push(`- missing required param: ${req}`);
+                      }
+                    }
+                    for (const k of Object.keys(receivedArgs)) {
+                      if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
+                        diff.push(`+ unexpected param: ${k}`);
+                      }
+                    }
+                    for (const k of Object.keys(receivedArgs)) {
+                      if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
+                        const allowed = expectedParams[k];
+                        const got = receivedArgs[k];
+                        const includes = Array.isArray(allowed) && allowed.some((v) => {
+                          try {
+                            if (Array.isArray(got)) {
+                              return JSON.stringify(
+                                got.map((x) => String(x)).sort()
+                              ) === JSON.stringify(
+                                v.map((x) => String(x)).sort()
+                              );
+                            }
+                          } catch {
+                          }
+                          return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
+                        });
+                        if (!includes) {
+                          diff.push(`@@ param ${k}`);
+                          diff.push(
+                            `- expected one of: ${JSON.stringify(allowed)}`
+                          );
+                          diff.push(`+ got: ${JSON.stringify(got)}`);
+                        }
+                      }
+                    }
+                  }
+                } else {
+                  const gtArr = possibleAnswer.ground_truth ?? [];
+                  const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
+                  const actualNames = restoredCalls.map(
+                    (c) => c.toolName ?? c.name
+                  );
+                  expected.functions = expectedNames;
+                  actual.functions = actualNames;
+                  if (expectedNames.length !== actualNames.length) {
+                    diff.push(`@@ call count`);
+                    diff.push(`- expected ${expectedNames.length}`);
+                    diff.push(`+ got ${actualNames.length}`);
+                  }
+                  const missing = expectedNames.filter(
+                    (n) => !actualNames.includes(n)
+                  );
+                  const extra = actualNames.filter(
+                    (n) => !expectedNames.includes(n)
+                  );
+                  for (const m of missing)
+                    diff.push(`- missing function: ${m}`);
+                  for (const e of extra)
+                    diff.push(`+ unexpected function: ${e}`);
+                  const usedActual = /* @__PURE__ */ new Set();
+                  for (const expectedObj of gtArr) {
+                    const fname = Object.keys(expectedObj)[0];
+                    let matchedIndex = -1;
+                    for (let i = 0; i < restoredCalls.length; i++) {
+                      if (usedActual.has(i)) continue;
+                      const rc = restoredCalls[i];
+                      const rcName = rc?.toolName ?? rc?.name;
+                      if (rcName === fname) {
+                        matchedIndex = i;
+                        break;
+                      }
+                    }
+                    if (matchedIndex === -1) continue;
+                    usedActual.add(matchedIndex);
+                    const received = restoredCalls[matchedIndex];
+                    const receivedArgs = summarizeArgs(received?.args);
+                    const expectedParamsAllowed = expectedObj[fname];
+                    const funcDesc = tools.find(
+                      (t) => t.name === fname
+                    );
+                    const requiredParams = funcDesc?.parameters?.required ?? [];
+                    diff.push(`@@ function ${fname}`);
+                    if (expectedParamsAllowed && receivedArgs) {
+                      for (const req of requiredParams) {
+                        if (!(req in receivedArgs)) {
+                          diff.push(`- missing required param: ${req}`);
+                        }
+                      }
+                      for (const k of Object.keys(receivedArgs)) {
+                        if (!Object.prototype.hasOwnProperty.call(
+                          expectedParamsAllowed,
+                          k
+                        )) {
+                          diff.push(`+ unexpected param: ${k}`);
+                        }
+                      }
+                      for (const k of Object.keys(receivedArgs)) {
+                        if (Object.prototype.hasOwnProperty.call(
+                          expectedParamsAllowed,
+                          k
+                        )) {
+                          const allowed = expectedParamsAllowed[k];
+                          const got = receivedArgs[k];
+                          const includes = Array.isArray(allowed) && allowed.some((v) => {
+                            try {
+                              if (Array.isArray(got)) {
+                                return JSON.stringify(
+                                  got.map((x) => String(x)).sort()
+                                ) === JSON.stringify(
+                                  v.map((x) => String(x)).sort()
+                                );
+                              }
+                            } catch {
+                            }
+                            return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
+                          });
+                          if (!includes) {
+                            diff.push(`@@ param ${k}`);
+                            diff.push(
+                              `- expected one of: ${JSON.stringify(allowed)}`
+                            );
+                            diff.push(`+ got: ${JSON.stringify(got)}`);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+                caseLogs.push(
+                  `[DEBUG-FAIL] ${JSON.stringify({
+                    id: testCase.id,
+                    message: checkerResult.error,
+                    error_type: checkerResult.error_type,
+                    expected,
+                    actual,
+                    diff
+                  })}`
+                );
+              } catch {
+                caseLogs.push(
+                  `[DEBUG] ${testCase.id}: failed to build debug diff`
+                );
+              }
+              return { valid: false, logs: caseLogs };
             }
           } catch (e) {
-            logs.push(
+            caseLogs.push(
               `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
             );
             if (e?.stack) {
-              logs.push(`[STACK] ${testCase.id}: ${e.stack}`);
+              caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
             }
+            return { valid: false, logs: caseLogs };
           }
-        }
+        };
+        const mapWithConcurrency = async (items, limit2, mapper) => { // Runs mapper(item, index) over `items` with at most `limit2` calls in flight; resolves to the results in the same order as `items`.
+          const results = new Array(items.length);
+          let idx = 0; // next unclaimed index, shared by all workers
+          const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => { // NOTE(review): a fractional limit2 smaller than items.length (e.g. BFCL_CONCURRENCY=2.5 passes the Number.isFinite check above) makes new Array(...) throw a RangeError
+            while (true) {
+              const current = idx++; // claimed synchronously, before any await, so no two workers take the same index
+              if (current >= items.length) break; // nothing left to claim: this worker finishes
+              results[current] = await mapper(items[current], current); // stored at the item's own index, which preserves input order
+            }
+          });
+          await Promise.all(workers); // rejects if any mapper call rejects
+          return results;
+        };
+        const resultsPerCase = await mapWithConcurrency(
+          testCases,
+          concurrency,
+          async (tc) => runSingleCase(tc)
+        );
+        correctCount = resultsPerCase.reduce(
+          (acc, r) => acc + (r.valid ? 1 : 0),
+          0
+        );
+        for (const r of resultsPerCase) logs.push(...r.logs);
         if (testCases.length === 0) {
           return {
             score: 0,