npm - @ai-sdk-tool/eval - Versions diffs - 0.1.6 → 0.1.8 - Mend

@ai-sdk-tool/eval 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.js CHANGED Viewed

@@ -100,11 +100,17 @@ function suggestFixFromDiff(parsed) {
   if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
     const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
     for (const param of targets) {
-      const allowedLine = diff.find(
+      const allowedOneOfLine = diff.find(
         (d) => String(d).startsWith("- expected one of:")
       );
-      if (allowedLine) {
-        const allowed = allowedLine.replace("- expected one of: ", "");
+      const allowedSingleLine = diff.find(
+        (d) => String(d).startsWith("- expected:")
+      );
+      if (allowedSingleLine) {
+        const value = allowedSingleLine.replace("- expected: ", "");
+        suggestions.push(`Set '${param}' to: ${value}.`);
+      } else if (allowedOneOfLine) {
+        const allowed = allowedOneOfLine.replace("- expected one of: ", "");
         suggestions.push(`Set '${param}' to one of: ${allowed}.`);
       } else {
         suggestions.push(`Adjust '${param}' to an allowed value.`);
@@ -149,61 +155,140 @@ function consoleDebugReporter(results) {
     }
     if (result.logs && result.logs.length) {
       const failLogs = result.logs.filter(
-        (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
+        (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
       );
       const hasFails = failLogs.length > 0;
       if (hasFails) {
-        console.log(`    ${colors2.bold}Failure details:${colors2.reset}`);
-        const debugIds = /* @__PURE__ */ new Set();
-        for (const l of failLogs) {
-          if (l.startsWith("[DEBUG-FAIL]")) {
+        let getTestIdFromLogLine2 = function(line) {
+          if (line.startsWith("[FAIL]")) {
+            const m = line.match(/^\[FAIL\]\s+([^:]+):/);
+            return m?.[1];
+          }
+          if (line.startsWith("[DEBUG-FAIL]")) {
             try {
-              const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
-              if (parsed?.id) debugIds.add(String(parsed.id));
+              const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
+              return String(parsed?.id ?? "");
             } catch {
             }
           }
-        }
-        for (const line of failLogs) {
-          if (line.startsWith("[FAIL]")) {
-            const m = line.match(/^\[FAIL\]\s+([^:]+):/);
-            const failId = m?.[1];
-            if (failId && debugIds.has(failId)) continue;
-            console.log(`      ${colors2.red}${line}${colors2.reset}`);
-          } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
-            console.log(`      ${colors2.yellow}${line}${colors2.reset}`);
-          } else if (line.startsWith("[STACK]")) {
-            console.log(`      ${colors2.gray}${line}${colors2.reset}`);
-          } else if (line.startsWith("[DEBUG-FAIL]")) {
-            const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
+          if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
             try {
-              const parsed = JSON.parse(payload);
-              const { id, expected, actual, message, diff } = parsed;
-              console.log(
-                `      ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
+              const parsed = JSON.parse(
+                line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
               );
-              if (diff && Array.isArray(diff)) {
-                for (const dLine of diff)
-                  console.log("        " + colorizeDiffLine(dLine));
-              } else {
-                console.log("        expected:");
-                console.log(
-                  colors2.green + "          " + JSON.stringify(expected, null, 2).split("\n").join("\n          ") + colors2.reset
-                );
-                console.log("        actual:");
-                console.log(
-                  colors2.red + "          " + JSON.stringify(actual, null, 2).split("\n").join("\n          ") + colors2.reset
-                );
+              return String(parsed?.id ?? "");
+            } catch {
+            }
+          }
+          return void 0;
+        };
+        var getTestIdFromLogLine = getTestIdFromLogLine2;
+        const byId = /* @__PURE__ */ new Map();
+        for (const line of failLogs) {
+          const id = getTestIdFromLogLine2(line);
+          const key = id ?? "__general__";
+          const arr = byId.get(key) ?? [];
+          arr.push(line);
+          byId.set(key, arr);
+        }
+        console.log(
+          `    ${colors2.bold}Failure details (grouped):${colors2.reset}`
+        );
+        for (const [groupId, lines] of byId) {
+          if (groupId !== "__general__") {
+            console.log(`      ${colors2.underline}${groupId}${colors2.reset}`);
+          }
+          const debugIds = /* @__PURE__ */ new Set();
+          for (const l of lines) {
+            if (l.startsWith("[DEBUG-FAIL]")) {
+              try {
+                const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
+                if (parsed?.id) debugIds.add(String(parsed.id));
+              } catch {
               }
-              const suggestions = suggestFixFromDiff(parsed);
-              if (suggestions.length) {
-                console.log(
-                  `        ${colors2.bold}Suggested fix:${colors2.reset}`
-                );
-                for (const s of suggestions) console.log(`          \u2022 ${s}`);
+            }
+          }
+          for (const line of lines) {
+            if (line.startsWith("[FAIL]")) {
+              const m = line.match(/^\[FAIL\]\s+([^:]+):/);
+              const failId = m?.[1];
+              if (failId && debugIds.has(failId)) continue;
+              console.log(`        ${colors2.red}${line}${colors2.reset}`);
+            } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
+              console.log(`        ${colors2.yellow}${line}${colors2.reset}`);
+            } else if (line.startsWith("[STACK]")) {
+              console.log(`        ${colors2.gray}${line}${colors2.reset}`);
+            } else if (line.startsWith("[DEBUG-FAIL]")) {
+              const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
+              try {
+                const parsed = JSON.parse(payload);
+                const { message, diff, expected, actual } = parsed;
+                if (message)
+                  console.log(
+                    `        ${colors2.bold}${message}${colors2.reset}`
+                  );
+                if (diff && Array.isArray(diff)) {
+                  for (const dLine of diff)
+                    console.log("          " + colorizeDiffLine(dLine));
+                } else {
+                  console.log("          expected:");
+                  console.log(
+                    colors2.green + "            " + JSON.stringify(expected, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                  console.log("          actual:");
+                  console.log(
+                    colors2.red + "            " + JSON.stringify(actual, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                const suggestions = suggestFixFromDiff(parsed);
+                if (suggestions.length) {
+                  console.log(
+                    `          ${colors2.bold}Suggested fix:${colors2.reset}`
+                  );
+                  for (const s of suggestions)
+                    console.log(`            \u2022 ${s}`);
+                }
+              } catch {
+                console.log(`        ${line}`);
+              }
+            } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
+              const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
+              try {
+                const ctx = JSON.parse(payload);
+                console.log(`        ${colors2.gray}context:${colors2.reset}`);
+                if (ctx.tool_schema) {
+                  console.log(
+                    colors2.gray + "          tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.last_user_query) {
+                  console.log(
+                    colors2.gray + "          last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
+                  );
+                }
+                if (ctx.raw_model_text) {
+                  console.log(
+                    colors2.gray + "          raw model text (middleware parsed):\n            " + String(ctx.raw_model_text).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.parsed_tool_calls) {
+                  console.log(
+                    colors2.gray + "          parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.ground_truth) {
+                  console.log(
+                    colors2.gray + "          ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n            ") + colors2.reset
+                  );
+                }
+                if (ctx.finish_reason) {
+                  console.log(
+                    colors2.gray + "          finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
+                  );
+                }
+              } catch {
+                console.log(`        ${line}`);
               }
-            } catch {
-              console.log(`      ${line}`);
             }
           }
         }
@@ -274,7 +359,13 @@ async function runSingleBenchmark(model, benchmark, modelKey, config) {
   }
 }
 async function evaluate(options) {
-  const { models, benchmarks, reporter = "console", temperature } = options;
+  const {
+    models,
+    benchmarks,
+    reporter = "console",
+    temperature,
+    maxTokens
+  } = options;
   const modelEntries = [];
   if (Array.isArray(models)) {
     for (const m of models) modelEntries.push([void 0, m]);
@@ -290,11 +381,14 @@ async function evaluate(options) {
   const allResults = [];
   for (const [modelKey, model] of modelEntries) {
     for (const benchmark of benchmarks) {
+      const config = {};
+      if (temperature !== void 0) config.temperature = temperature;
+      if (maxTokens !== void 0) config.maxTokens = maxTokens;
       const evaluationResult = await runSingleBenchmark(
         model,
         benchmark,
         modelKey,
-        temperature !== void 0 ? { temperature } : void 0
+        Object.keys(config).length > 0 ? config : void 0
       );
       allResults.push(evaluationResult);
     }
@@ -694,6 +788,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           const { function: tools, question: messages } = testCase;
           const temp = config?.temperature;
           const temperature = typeof temp === "number" ? temp : void 0;
+          const maxTok = config?.maxTokens;
+          const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
           try {
             const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
             const nameMap = /* @__PURE__ */ new Map();
@@ -734,24 +830,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                 `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
               );
             }
+            const debugSummaryRef = {};
+            const providerOptions = {
+              toolCallMiddleware: {
+                debugSummary: debugSummaryRef
+              }
+            };
             const { toolCalls, text, finishReason } = await generateText({
               model,
               messages: flatMessages,
               tools: toolsMap,
               toolChoice: "auto",
+              providerOptions,
               ...temperature !== void 0 ? { temperature } : {},
-              // Pass original schema information to middleware
-              providerOptions: {
-                toolCallMiddleware: {
-                  originalToolSchemas: Object.fromEntries(
-                    transformedTools.map((t) => [
-                      t.name,
-                      t.inputSchema
-                    ])
-                  )
-                }
-              }
+              ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
             });
+            const mwOriginalText = debugSummaryRef.originalText;
+            const mwParsedToolCalls = (() => {
+              const raw = debugSummaryRef.toolCalls;
+              if (!raw) return [];
+              try {
+                const arr = JSON.parse(raw);
+                return Array.isArray(arr) ? arr : [];
+              } catch {
+                return [];
+              }
+            })();
             try {
               caseLogs.push(
                 `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
@@ -795,6 +899,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             } else {
               caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
               try {
+                let generateParamMismatchDiff2 = function(paramName, allowed, got) {
+                  const diffLines = [];
+                  diffLines.push(`@@ param ${paramName}`);
+                  const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
+                  const expectedLine = (() => {
+                    if (allowedArray.length === 1) {
+                      return `- expected: ${JSON.stringify(allowedArray[0])}`;
+                    }
+                    const formatted = allowedArray.map(
+                      (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
+                    ).join(", ");
+                    return `- expected one of: ${formatted}`;
+                  })();
+                  diffLines.push(expectedLine);
+                  diffLines.push(`+ got: ${JSON.stringify(got)}`);
+                  return diffLines;
+                };
+                var generateParamMismatchDiff = generateParamMismatchDiff2;
                 const category = testCase.id.split("_")[0];
                 const diff = [];
                 const summarizeArgs = (args) => {
@@ -861,11 +983,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                           return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
                         });
                         if (!includes) {
-                          diff.push(`@@ param ${k}`);
                           diff.push(
-                            `- expected one of: ${JSON.stringify(allowed)}`
+                            ...generateParamMismatchDiff2(k, allowed, got)
                           );
-                          diff.push(`+ got: ${JSON.stringify(got)}`);
                         }
                       }
                     }
@@ -955,11 +1075,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                             return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
                           });
                           if (!includes) {
-                            diff.push(`@@ param ${k}`);
                             diff.push(
-                              `- expected one of: ${JSON.stringify(allowed)}`
+                              ...generateParamMismatchDiff2(k, allowed, got)
                             );
-                            diff.push(`+ got: ${JSON.stringify(got)}`);
                           }
                         }
                       }
@@ -976,6 +1094,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                     diff
                   })}`
                 );
+                try {
+                  const lastUser = (() => {
+                    const reversed = [...flatMessages].reverse();
+                    const found = reversed.find(
+                      (m) => m.role === "user"
+                    );
+                    return found?.content ?? void 0;
+                  })();
+                  const contextPayload = {
+                    id: testCase.id,
+                    tool_schema: tools,
+                    last_user_query: lastUser,
+                    raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
+                    finish_reason: finishReason,
+                    parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
+                    ground_truth: possibleAnswer.ground_truth
+                  };
+                  caseLogs.push(
+                    `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
+                  );
+                } catch {
+                }
               } catch {
                 caseLogs.push(
                   `[DEBUG] ${testCase.id}: failed to build debug diff`