npm - @ai-sdk-tool/eval - Versions diffs - 0.1.7 → 0.1.8 - Mend

@ai-sdk-tool/eval 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.cjs (changed)

@@ -142,11 +142,17 @@ function suggestFixFromDiff(parsed) {
   if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
     const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
     for (const param of targets) {
-      const allowedLine = diff.find(
+      const allowedOneOfLine = diff.find(
         (d) => String(d).startsWith("- expected one of:")
       );
-      if (allowedLine) {
-        const allowed = allowedLine.replace("- expected one of: ", "");
+      const allowedSingleLine = diff.find(
+        (d) => String(d).startsWith("- expected:")
+      );
+      if (allowedSingleLine) {
+        const value = allowedSingleLine.replace("- expected: ", "");
+        suggestions.push(`Set '${param}' to: ${value}.`);
+      } else if (allowedOneOfLine) {
+        const allowed = allowedOneOfLine.replace("- expected one of: ", "");
         suggestions.push(`Set '${param}' to one of: ${allowed}.`);
       } else {
         suggestions.push(`Adjust '${param}' to an allowed value.`);
@@ -191,61 +197,140 @@ function consoleDebugReporter(results) {
     }
     if (result.logs && result.logs.length) {
       const failLogs = result.logs.filter(
-        (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
+        (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
       );
       const hasFails = failLogs.length > 0;
       if (hasFails) {
-        console.log(`    ${colors2.bold}Failure details:${colors2.reset}`);
-        const debugIds = /* @__PURE__ */ new Set();
-        for (const l of failLogs) {
-          if (l.startsWith("[DEBUG-FAIL]")) {
+        let getTestIdFromLogLine2 = function(line) { // Maps one failure log line to the test id it refers to; the result is used below as the grouping key.
+          if (line.startsWith("[FAIL]")) {
+            const m = line.match(/^\[FAIL\]\s+([^:]+):/); // capture group 1 is the text between the [FAIL] tag and the first colon
+            return m?.[1]; // undefined when the line does not match the pattern
+          }
+          if (line.startsWith("[DEBUG-FAIL]")) {
             try {
-              const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
-              if (parsed?.id) debugIds.add(String(parsed.id));
+              const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, "")); // payload is the JSON text that follows the tag and one space
+              return String(parsed?.id ?? ""); // NOTE(review): a payload without an id yields an empty string rather than undefined, so a nullish fallback on the result will not apply
             } catch {
             }
           }
-        }
-        for (const line of failLogs) {
-          if (line.startsWith("[FAIL]")) {
-            const m = line.match(/^\[FAIL\]\s+([^:]+):/);
-            const failId = m?.[1];
-            if (failId && debugIds.has(failId)) continue;
-            console.log(`      ${colors2.red}${line}${colors2.reset}`);
-          } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
-            console.log(`      ${colors2.yellow}${line}${colors2.reset}`);
-          } else if (line.startsWith("[STACK]")) {
-            console.log(`      ${colors2.gray}${line}${colors2.reset}`);
-          } else if (line.startsWith("[DEBUG-FAIL]")) {
-            const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
+          if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) { // context lines are read the same way: JSON after the tag, keyed by its id field
             try {
-              const parsed = JSON.parse(payload);
-              const { id, expected, actual, message, diff } = parsed;
-              console.log(
-                `      ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
+              const parsed = JSON.parse(
+                line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
               );
-              if (diff && Array.isArray(diff)) {
-                for (const dLine of diff)
-                  console.log("        " + colorizeDiffLine(dLine));
-              } else {
-                console.log("        expected:");
-                console.log(
-                  colors2.green + "          " + JSON.stringify(expected, null, 2).split("\n").join("\n          ") + colors2.reset
-                );
-                console.log("        actual:");
-                console.log(
-                  colors2.red + "          " + JSON.stringify(actual, null, 2).split("\n").join("\n          ") + colors2.reset
-                );
+              return String(parsed?.id ?? ""); // same empty-string result as the [DEBUG-FAIL] branch when id is missing
+            } catch { // malformed JSON is ignored on purpose; control falls through to the final return
+            }
+          }
+          return void 0; // reached for any other tag and for payloads that fail JSON.parse
+        };
+        var getTestIdFromLogLine = getTestIdFromLogLine2;
+        const byId = /* @__PURE__ */ new Map();
+        for (const line of failLogs) {
+          const id = getTestIdFromLogLine2(line);
+          const key = id ?? "__general__";
+          const arr = byId.get(key) ?? [];
+          arr.push(line);
+          byId.set(key, arr);
+        }
+        console.log(
+          `    ${colors2.bold}Failure details (grouped):${colors2.reset}`
+        );
+        for (const [groupId, lines] of byId) {
+          if (groupId !== "__general__") {
+            console.log(`      ${colors2.underline}${groupId}${colors2.reset}`);
+          }
+          const debugIds = /* @__PURE__ */ new Set();
+          for (const l of lines) {
+            if (l.startsWith("[DEBUG-FAIL]")) {
+              try {
+                const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
+                if (parsed?.id) debugIds.add(String(parsed.id));
+              } catch {
               }
-              const suggestions = suggestFixFromDiff(parsed);
-              if (suggestions.length) {
-                console.log(
-                  `        ${colors2.bold}Suggested fix:${colors2.reset}`
-                );
-                for (const s of suggestions) console.log(`          \u2022 ${s}`);
+            }
+          }
+          for (const line of lines) {
+            if (line.startsWith("[FAIL]")) {
+              const m = line.match(/^\[FAIL\]\s+([^:]+):/);
+              const failId = m?.[1];
+              if (failId && debugIds.has(failId)) continue;
+              console.log(`        ${colors2.red}${line}${colors2.reset}`);
+            } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
+              console.log(`        ${colors2.yellow}${line}${colors2.reset}`);
+            } else if (line.startsWith("[STACK]")) {
+              console.log(`        ${colors2.gray}${line}${colors2.reset}`);
+            } else if (line.startsWith("[DEBUG-FAIL]")) {
+              const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
+              try {
+                const parsed = JSON.parse(payload);
+                const { message, diff, expected, actual } = parsed;
+                if (message)
+                  console.log(
+                    `        ${colors2.bold}${message}${colors2.reset}`
+                  );
+                if (diff && Array.isArray(diff)) {
+                  for (const dLine of diff)
+                    console.log("          " + colorizeDiffLine(dLine));
+                } else {
+                  console.log("          expected:");
+                  console.log(
+                    colors2.green + "            " + JSON.stringify(expected, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                  console.log("          actual:");
+                  console.log(
+                    colors2.red + "            " + JSON.stringify(actual, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                const suggestions = suggestFixFromDiff(parsed);
+                if (suggestions.length) {
+                  console.log(
+                    `          ${colors2.bold}Suggested fix:${colors2.reset}`
+                  );
+                  for (const s of suggestions)
+                    console.log(`            \u2022 ${s}`);
+                }
+              } catch {
+                console.log(`        ${line}`);
+              }
+            } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
+              const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
+              try {
+                const ctx = JSON.parse(payload);
+                console.log(`        ${colors2.gray}context:${colors2.reset}`);
+                if (ctx.tool_schema) {
+                  console.log(
+                    colors2.gray + "          tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.last_user_query) {
+                  console.log(
+                    colors2.gray + "          last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
+                  );
+                }
+                if (ctx.raw_model_text) {
+                  console.log(
+                    colors2.gray + "          raw model text (middleware parsed):\n            " + String(ctx.raw_model_text).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.parsed_tool_calls) {
+                  console.log(
+                    colors2.gray + "          parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.ground_truth) {
+                  console.log(
+                    colors2.gray + "          ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.finish_reason) {
+                  console.log(
+                    colors2.gray + "          finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
+                  );
+                }
+              } catch {
+                console.log(`        ${line}`);
               }
-            } catch {
-              console.log(`      ${line}`);
             }
           }
         }
@@ -787,14 +872,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                 `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
               );
             }
+            const debugSummaryRef = {};
+            const providerOptions = {
+              toolCallMiddleware: {
+                debugSummary: debugSummaryRef
+              }
+            };
             const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
               model,
               messages: flatMessages,
               tools: toolsMap,
               toolChoice: "auto",
+              providerOptions,
               ...temperature !== void 0 ? { temperature } : {},
               ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
             });
+            const mwOriginalText = debugSummaryRef.originalText;
+            const mwParsedToolCalls = (() => {
+              const raw = debugSummaryRef.toolCalls;
+              if (!raw) return [];
+              try {
+                const arr = JSON.parse(raw);
+                return Array.isArray(arr) ? arr : [];
+              } catch {
+                return [];
+              }
+            })();
             try {
               caseLogs.push(
                 `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
@@ -838,6 +941,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             } else {
               caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
               try {
+                let generateParamMismatchDiff2 = function(paramName, allowed, got) { // Builds the diff lines for one mismatched parameter: a header, the expected value(s), and the value actually received.
+                  const diffLines = [];
+                  diffLines.push(`@@ param ${paramName}`); // header line naming the parameter
+                  const allowedArray = Array.isArray(allowed) ? allowed : [allowed]; // a non-array allowed value is treated as a one-element list
+                  const expectedLine = (() => {
+                    if (allowedArray.length === 1) { // exactly one allowed value: use the singular form
+                      return `- expected: ${JSON.stringify(allowedArray[0])}`; // JSON-encoded, so a string value appears quoted here
+                    }
+                    const formatted = allowedArray.map(
+                      (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v) // arrays and non-null objects are JSON-encoded; everything else goes through String(), so strings appear unquoted
+                    ).join(", ");
+                    return `- expected one of: ${formatted}`; // also taken when the list is empty, which leaves nothing after the colon
+                  })();
+                  diffLines.push(expectedLine);
+                  diffLines.push(`+ got: ${JSON.stringify(got)}`); // actual value, JSON-encoded
+                  return diffLines; // always three entries: header, expected, got
+                };
+                var generateParamMismatchDiff = generateParamMismatchDiff2;
                 const category = testCase.id.split("_")[0];
                 const diff = [];
                 const summarizeArgs = (args) => {
@@ -904,11 +1025,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                           return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
                         });
                         if (!includes) {
-                          diff.push(`@@ param ${k}`);
                           diff.push(
-                            `- expected one of: ${JSON.stringify(allowed)}`
+                            ...generateParamMismatchDiff2(k, allowed, got)
                           );
-                          diff.push(`+ got: ${JSON.stringify(got)}`);
                         }
                       }
                     }
@@ -998,11 +1117,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                             return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
                           });
                           if (!includes) {
-                            diff.push(`@@ param ${k}`);
                             diff.push(
-                              `- expected one of: ${JSON.stringify(allowed)}`
+                              ...generateParamMismatchDiff2(k, allowed, got)
                             );
-                            diff.push(`+ got: ${JSON.stringify(got)}`);
                           }
                         }
                       }
@@ -1019,6 +1136,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                     diff
                   })}`
                 );
+                try {
+                  const lastUser = (() => {
+                    const reversed = [...flatMessages].reverse();
+                    const found = reversed.find(
+                      (m) => m.role === "user"
+                    );
+                    return found?.content ?? void 0;
+                  })();
+                  const contextPayload = {
+                    id: testCase.id,
+                    tool_schema: tools,
+                    last_user_query: lastUser,
+                    raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
+                    finish_reason: finishReason,
+                    parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
+                    ground_truth: possibleAnswer.ground_truth
+                  };
+                  caseLogs.push(
+                    `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
+                  );
+                } catch {
+                }
               } catch {
                 caseLogs.push(
                   `[DEBUG] ${testCase.id}: failed to build debug diff`