npm - @wix/evalforge-evaluator - Versions diffs - 0.22.0 → 0.24.0 - Mend

@wix/evalforge-evaluator 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/build/index.js +309 -78
package/build/index.js.map +4 -4
package/build/index.mjs +309 -78
package/build/index.mjs.map +4 -4
package/build/types/run-scenario/file-diff.d.ts +30 -0
package/package.json +3 -3

package/build/index.js CHANGED

@@ -6360,40 +6360,59 @@ async function executeWithClaudeCode(skill, scenario, options) {
       traceContext.authToken
     );
   }
+  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
+  let timeoutHandle;
+  let timedOut = false;
   try {
-    for await (const message of query({
-      prompt: scenario.triggerPrompt,
-      options: queryOptions
-    })) {
-      messageCount++;
-      console.log("[SDK Message]", JSON.stringify(message, null, 2));
-      allMessages.push(message);
-      if (messageCount <= 3) {
-        console.error(
-          "[DEBUG-H5] SDK message received",
-          JSON.stringify({
-            messageCount,
-            type: message.type,
-            timestamp: Date.now()
-          })
-        );
-      }
-      if (traceContext && isAssistantMessage(message)) {
-        traceStepNumber++;
-        const traceEvent = createTraceEventFromMessage(
-          message,
-          traceContext,
-          traceStepNumber,
-          false
-          // Not complete yet
-        );
-        emitTraceEvent(
-          traceEvent,
-          traceContext.tracePushUrl,
-          traceContext.routeHeader,
-          traceContext.authToken
+    const timeoutPromise = new Promise((_, reject) => {
+      timeoutHandle = setTimeout(() => {
+        timedOut = true;
+        reject(
+          new Error(
+            `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
+          )
         );
+      }, SDK_TIMEOUT_MS);
+    });
+    const sdkPromise = (async () => {
+      for await (const message of query({
+        prompt: scenario.triggerPrompt,
+        options: queryOptions
+      })) {
+        messageCount++;
+        console.log("[SDK Message]", JSON.stringify(message, null, 2));
+        allMessages.push(message);
+        if (messageCount <= 3) {
+          console.error(
+            "[DEBUG-H5] SDK message received",
+            JSON.stringify({
+              messageCount,
+              type: message.type,
+              timestamp: Date.now()
+            })
+          );
+        }
+        if (traceContext && isAssistantMessage(message)) {
+          traceStepNumber++;
+          const traceEvent = createTraceEventFromMessage(
+            message,
+            traceContext,
+            traceStepNumber,
+            false
+            // Not complete yet
+          );
+          emitTraceEvent(
+            traceEvent,
+            traceContext.tracePushUrl,
+            traceContext.routeHeader,
+            traceContext.authToken
+          );
+        }
       }
+    })();
+    await Promise.race([sdkPromise, timeoutPromise]);
+    if (timeoutHandle) {
+      clearTimeout(timeoutHandle);
     }
     console.log(
       "[executeWithClaudeCode] Claude Agent SDK query completed, received",
@@ -6401,6 +6420,12 @@ async function executeWithClaudeCode(skill, scenario, options) {
       "messages"
     );
   } catch (sdkError) {
+    if (timeoutHandle) {
+      clearTimeout(timeoutHandle);
+    }
+    if (timedOut) {
+      console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
+    }
     console.error("[SDK-ERROR] ====== CLAUDE SDK EXECUTION FAILED ======");
     console.error("[SDK-ERROR] Timestamp:", (/* @__PURE__ */ new Date()).toISOString());
     console.error(
@@ -6794,6 +6819,154 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
   };
 }
+// src/run-scenario/file-diff.ts
+var import_fs6 = require("fs");
+var import_path6 = require("path");
+var IGNORED_PATTERNS = [
+  "node_modules",
+  ".git",
+  ".claude",
+  ".cursor",
+  "dist",
+  "build",
+  ".next",
+  ".turbo",
+  "__pycache__",
+  ".pytest_cache",
+  ".venv",
+  "venv",
+  ".DS_Store"
+];
+var BINARY_EXTENSIONS = [
+  ".png",
+  ".jpg",
+  ".jpeg",
+  ".gif",
+  ".webp",
+  ".ico",
+  ".svg",
+  ".woff",
+  ".woff2",
+  ".ttf",
+  ".eot",
+  ".mp3",
+  ".mp4",
+  ".wav",
+  ".avi",
+  ".mov",
+  ".pdf",
+  ".zip",
+  ".tar",
+  ".gz",
+  ".rar",
+  ".7z",
+  ".exe",
+  ".dll",
+  ".so",
+  ".dylib"
+];
+var MAX_FILE_SIZE = 100 * 1024;
+function shouldIgnore(name2) {
+  return IGNORED_PATTERNS.some(
+    (pattern) => name2 === pattern || name2.startsWith(pattern + "/")
+  );
+}
+function isBinaryFile(filename) {
+  const lower = filename.toLowerCase();
+  return BINARY_EXTENSIONS.some((ext) => lower.endsWith(ext));
+}
+function snapshotDirectory(dir, baseDir) {
+  const snapshot = {};
+  const base = baseDir || dir;
+  if (!(0, import_fs6.existsSync)(dir)) {
+    return snapshot;
+  }
+  const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
+  for (const entry of entries) {
+    const fullPath = (0, import_path6.join)(dir, entry.name);
+    const relativePath = (0, import_path6.relative)(base, fullPath);
+    if (shouldIgnore(entry.name)) {
+      continue;
+    }
+    if (entry.isDirectory()) {
+      const subSnapshot = snapshotDirectory(fullPath, base);
+      Object.assign(snapshot, subSnapshot);
+    } else if (entry.isFile()) {
+      if (isBinaryFile(entry.name)) {
+        continue;
+      }
+      try {
+        const stats = (0, import_fs6.statSync)(fullPath);
+        if (stats.size > MAX_FILE_SIZE) {
+          continue;
+        }
+        const content = (0, import_fs6.readFileSync)(fullPath, "utf-8");
+        snapshot[relativePath] = content;
+      } catch {
+        continue;
+      }
+    }
+  }
+  return snapshot;
+}
+function generateDiffLines(before, after) {
+  const beforeLines = before.split("\n");
+  const afterLines = after.split("\n");
+  const result = [];
+  let lineNumber = 1;
+  const maxLines = Math.max(beforeLines.length, afterLines.length);
+  for (let i = 0; i < maxLines; i++) {
+    const beforeLine = beforeLines[i];
+    const afterLine = afterLines[i];
+    if (beforeLine === afterLine) {
+      if (beforeLine !== void 0) {
+        result.push({
+          type: "unchanged",
+          content: beforeLine,
+          lineNumber: lineNumber++
+        });
+      }
+    } else {
+      if (beforeLine !== void 0) {
+        result.push({
+          type: "removed",
+          content: beforeLine,
+          lineNumber
+        });
+      }
+      if (afterLine !== void 0) {
+        result.push({
+          type: "added",
+          content: afterLine,
+          lineNumber
+        });
+      }
+      lineNumber++;
+    }
+  }
+  return result;
+}
+function diffSnapshots(before, after) {
+  const diffs = [];
+  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
+  for (const path10 of allPaths) {
+    const beforeContent = before[path10] ?? "";
+    const afterContent = after[path10] ?? "";
+    if (beforeContent === afterContent) {
+      continue;
+    }
+    const diffLines = generateDiffLines(beforeContent, afterContent);
+    diffs.push({
+      path: path10,
+      expected: beforeContent,
+      actual: afterContent,
+      diffLines
+    });
+  }
+  diffs.sort((a, b) => a.path.localeCompare(b.path));
+  return diffs;
+}
 // src/run-scenario/callSkill.ts
 async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
   if (agent && agent.runCommand !== "claude") {
@@ -6802,6 +6975,7 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
     );
   }
   const startedAt = (/* @__PURE__ */ new Date()).toISOString();
+  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
   const { result, llmTrace } = await executeWithClaudeCode(skill, scenario, {
     cwd: workDir || process.cwd(),
     systemPrompt: skill.skillMd,
@@ -6822,6 +6996,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
     }
   });
   const completedAt = (/* @__PURE__ */ new Date()).toISOString();
+  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
+  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
   return {
     id: (0, import_crypto2.randomUUID)(),
     targetId: skill.id,
@@ -6831,6 +7007,7 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
     modelConfig: agent?.modelConfig,
     duration: result.durationMs,
     outputText: result.outputText,
+    fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
     startedAt,
     completedAt,
     llmTrace
@@ -7442,33 +7619,14 @@ async function testClaudeDirectExecution(config) {
   const versionResult = await runAndLog(
     "claude --version",
     `${envExports} && "${claudePath}" --version 2>&1`,
-    15e3
-  );
-  const fullCmdResult = await runAndLog(
-    "claude -p (with json output)",
-    `${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
-    45e3
-  );
-  const simpleCmdResult = await runAndLog(
-    "claude -p (simple)",
-    `${envExports} && "${claudePath}" -p "Hello" 2>&1`,
-    45e3
-  );
-  const printFlagResult = await runAndLog(
-    "claude --print (long flag)",
-    `${envExports} && "${claudePath}" --print "Hi" 2>&1`,
-    45e3
-  );
-  const positionalResult = await runAndLog(
-    'claude "prompt" (positional)',
-    `${envExports} && "${claudePath}" "Hello world" 2>&1`,
-    45e3
+    1e4
+    // Short timeout - should complete in <2s
   );
-  await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
+  await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 1e4);
   await runAndLog(
     "claude --version (no custom env)",
     `"${claudePath}" --version 2>&1`,
-    15e3
+    1e4
   );
   const homeDir = process.env.HOME || "/tmp";
   const claudeConfigDir = path9.join(homeDir, ".claude");
@@ -7478,35 +7636,110 @@ async function testClaudeDirectExecution(config) {
     try {
       const configContents = fs11.readdirSync(claudeConfigDir);
       details.claudeConfigContents = configContents;
-      for (const file of configContents) {
-        if (file.includes("log") || file.includes("error")) {
-          const logPath = path9.join(claudeConfigDir, file);
-          const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
-          const logContent = await execCommand(catCmd);
-          details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
-        }
-      }
     } catch (e) {
       details.claudeConfigError = e instanceof Error ? e.message : String(e);
     }
   }
   details.commandResults = commandResults;
-  const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
   const versionWorked = versionResult.exitCode === 0;
-  const passed = anyPromptWorked;
-  let errorMsg;
-  if (!passed) {
-    const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
-    errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
-  }
+  const passed = versionWorked;
   return {
-    name: "claude-direct-execution",
+    name: "claude-cli-basic",
     passed,
     details,
-    error: errorMsg,
+    error: passed ? void 0 : `Claude CLI --version failed with exit code ${versionResult.exitCode}`,
     durationMs: Date.now() - start
   };
 }
+async function testClaudeSdkExecution(config) {
+  const start = Date.now();
+  const details = {};
+  const SDK_TIMEOUT_MS = 3e4;
+  try {
+    console.error("[SDK-DIAG] Importing Claude Agent SDK...");
+    const sdk = await import("@anthropic-ai/claude-agent-sdk");
+    details.sdkImported = true;
+    const env = { ...process.env };
+    const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
+    env.ANTHROPIC_API_KEY = placeholderApiKey;
+    env.ANTHROPIC_AUTH_TOKEN = placeholderApiKey;
+    if (config.aiGatewayUrl) {
+      env.ANTHROPIC_BASE_URL = config.aiGatewayUrl;
+    }
+    if (config.aiGatewayHeaders) {
+      const headerLines = Object.entries(config.aiGatewayHeaders).map(([key, value]) => `${key}:${value}`).join("\n");
+      env.ANTHROPIC_CUSTOM_HEADERS = headerLines;
+    }
+    details.envConfigured = true;
+    details.hasBaseUrl = !!env.ANTHROPIC_BASE_URL;
+    details.hasCustomHeaders = !!env.ANTHROPIC_CUSTOM_HEADERS;
+    const canUseTool = async () => {
+      console.error("[SDK-DIAG] canUseTool callback called - returning allow");
+      return { behavior: "allow" };
+    };
+    const queryOptions = {
+      env,
+      cwd: "/tmp",
+      settingSources: ["project"],
+      allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
+      model: "claude-3-5-sonnet-latest",
+      maxTurns: 1,
+      // Just one turn for this test
+      permissionMode: "default",
+      canUseTool
+    };
+    details.queryOptionsConfigured = true;
+    console.error("[SDK-DIAG] Starting SDK query with canUseTool...");
+    const timeoutPromise = new Promise((_, reject) => {
+      setTimeout(
+        () => reject(new Error(`SDK test timed out after ${SDK_TIMEOUT_MS}ms`)),
+        SDK_TIMEOUT_MS
+      );
+    });
+    const sdkPromise = (async () => {
+      let messageCount2 = 0;
+      let resultMessage2 = null;
+      for await (const message of sdk.query({
+        prompt: 'Say "SDK test successful" and nothing else.',
+        ...queryOptions
+      })) {
+        messageCount2++;
+        console.error(`[SDK-DIAG] Received message ${messageCount2}:`, message);
+        if (message.type === "result" || message.type === "assistant") {
+          resultMessage2 = message;
+        }
+      }
+      return { messageCount: messageCount2, resultMessage: resultMessage2 };
+    })();
+    const { messageCount, resultMessage } = await Promise.race([
+      sdkPromise,
+      timeoutPromise
+    ]);
+    details.messageCount = messageCount;
+    details.hasResultMessage = !!resultMessage;
+    details.resultPreview = resultMessage && typeof resultMessage === "object" ? JSON.stringify(resultMessage).slice(0, 500) : null;
+    const passed = messageCount > 0;
+    return {
+      name: "claude-sdk-execution",
+      passed,
+      details,
+      error: passed ? void 0 : "SDK query completed but returned no messages",
+      durationMs: Date.now() - start
+    };
+  } catch (err) {
+    const error = err instanceof Error ? err.message : String(err);
+    details.error = error;
+    details.errorStack = err instanceof Error ? err.stack?.split("\n").slice(0, 5) : void 0;
+    console.error("[SDK-DIAG] SDK test failed:", error);
+    return {
+      name: "claude-sdk-execution",
+      passed: false,
+      details,
+      error: `SDK execution failed: ${error}`,
+      durationMs: Date.now() - start
+    };
+  }
+}
 async function testChildProcessSpawning() {
   const start = Date.now();
   const details = {};
@@ -7711,15 +7944,13 @@ async function runDiagnostics(config, evalRunId2) {
   await runTest("claude-cli-execution", testClaudeExecution);
   await runTest("environment-dump", testEnvironmentDump);
   await runTest("file-system-structure", testFileSystemStructure);
-  await runTest("network-connectivity", () => testNetworkConnectivity(config));
-  await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
-  await runTest(
-    "claude-direct-execution",
-    () => testClaudeDirectExecution(config)
-  );
   await runTest("child-process-spawning", testChildProcessSpawning);
-  await runTest("sdk-import", testSdkImport);
   await runTest("file-system-write", testFileSystemWrite);
+  await runTest("sdk-import", testSdkImport);
+  await runTest("network-connectivity", () => testNetworkConnectivity(config));
+  await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
+  await runTest("claude-cli-basic", () => testClaudeDirectExecution(config));
+  await runTest("claude-sdk-execution", () => testClaudeSdkExecution(config));
   const completedAt = (/* @__PURE__ */ new Date()).toISOString();
   const totalDurationMs = Date.now() - startTime;
   const report = {