@wix/evalforge-evaluator 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -6360,40 +6360,59 @@ async function executeWithClaudeCode(skill, scenario, options) {
6360
6360
  traceContext.authToken
6361
6361
  );
6362
6362
  }
6363
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6364
+ let timeoutHandle;
6365
+ let timedOut = false;
6363
6366
  try {
6364
- for await (const message of query({
6365
- prompt: scenario.triggerPrompt,
6366
- options: queryOptions
6367
- })) {
6368
- messageCount++;
6369
- console.log("[SDK Message]", JSON.stringify(message, null, 2));
6370
- allMessages.push(message);
6371
- if (messageCount <= 3) {
6372
- console.error(
6373
- "[DEBUG-H5] SDK message received",
6374
- JSON.stringify({
6375
- messageCount,
6376
- type: message.type,
6377
- timestamp: Date.now()
6378
- })
6379
- );
6380
- }
6381
- if (traceContext && isAssistantMessage(message)) {
6382
- traceStepNumber++;
6383
- const traceEvent = createTraceEventFromMessage(
6384
- message,
6385
- traceContext,
6386
- traceStepNumber,
6387
- false
6388
- // Not complete yet
6389
- );
6390
- emitTraceEvent(
6391
- traceEvent,
6392
- traceContext.tracePushUrl,
6393
- traceContext.routeHeader,
6394
- traceContext.authToken
6367
+ const timeoutPromise = new Promise((_, reject) => {
6368
+ timeoutHandle = setTimeout(() => {
6369
+ timedOut = true;
6370
+ reject(
6371
+ new Error(
6372
+ `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6373
+ )
6395
6374
  );
6375
+ }, SDK_TIMEOUT_MS);
6376
+ });
6377
+ const sdkPromise = (async () => {
6378
+ for await (const message of query({
6379
+ prompt: scenario.triggerPrompt,
6380
+ options: queryOptions
6381
+ })) {
6382
+ messageCount++;
6383
+ console.log("[SDK Message]", JSON.stringify(message, null, 2));
6384
+ allMessages.push(message);
6385
+ if (messageCount <= 3) {
6386
+ console.error(
6387
+ "[DEBUG-H5] SDK message received",
6388
+ JSON.stringify({
6389
+ messageCount,
6390
+ type: message.type,
6391
+ timestamp: Date.now()
6392
+ })
6393
+ );
6394
+ }
6395
+ if (traceContext && isAssistantMessage(message)) {
6396
+ traceStepNumber++;
6397
+ const traceEvent = createTraceEventFromMessage(
6398
+ message,
6399
+ traceContext,
6400
+ traceStepNumber,
6401
+ false
6402
+ // Not complete yet
6403
+ );
6404
+ emitTraceEvent(
6405
+ traceEvent,
6406
+ traceContext.tracePushUrl,
6407
+ traceContext.routeHeader,
6408
+ traceContext.authToken
6409
+ );
6410
+ }
6396
6411
  }
6412
+ })();
6413
+ await Promise.race([sdkPromise, timeoutPromise]);
6414
+ if (timeoutHandle) {
6415
+ clearTimeout(timeoutHandle);
6397
6416
  }
6398
6417
  console.log(
6399
6418
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
@@ -6401,6 +6420,12 @@ async function executeWithClaudeCode(skill, scenario, options) {
6401
6420
  "messages"
6402
6421
  );
6403
6422
  } catch (sdkError) {
6423
+ if (timeoutHandle) {
6424
+ clearTimeout(timeoutHandle);
6425
+ }
6426
+ if (timedOut) {
6427
+ console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6428
+ }
6404
6429
  console.error("[SDK-ERROR] ====== CLAUDE SDK EXECUTION FAILED ======");
6405
6430
  console.error("[SDK-ERROR] Timestamp:", (/* @__PURE__ */ new Date()).toISOString());
6406
6431
  console.error(
@@ -6794,6 +6819,154 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
6794
6819
  };
6795
6820
  }
6796
6821
 
6822
+ // src/run-scenario/file-diff.ts
6823
+ var import_fs6 = require("fs");
6824
+ var import_path6 = require("path");
6825
// Entry names that are skipped wholesale while snapshotting a workspace.
// The order is irrelevant to the result; it only groups related names.
var IGNORED_PATTERNS = [
  // dependencies, version control and tool metadata
  "node_modules", ".git", ".claude", ".cursor",
  // build output
  "dist", "build", ".next", ".turbo",
  // Python caches and virtual environments
  "__pycache__", ".pytest_cache", ".venv", "venv",
  // macOS folder metadata
  ".DS_Store"
];

// File-name suffixes (lower-case, leading dot included) whose content is
// never read into a snapshot.
var BINARY_EXTENSIONS = [
  // images
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".svg",
  // fonts
  ".woff", ".woff2", ".ttf", ".eot",
  // audio / video
  ".mp3", ".mp4", ".wav", ".avi", ".mov",
  // documents and archives
  ".pdf", ".zip", ".tar", ".gz", ".rar", ".7z",
  // native binaries
  ".exe", ".dll", ".so", ".dylib"
];

// Size limit in bytes (100 KiB).
var MAX_FILE_SIZE = 100 * 1024;

/**
 * Tell whether an entry name matches the ignore list.
 *
 * A name matches when it equals an ignored pattern exactly, or when it starts
 * with an ignored pattern followed by "/".
 *
 * @param {string} name2 - Entry name (or slash-separated path) to test.
 * @returns {boolean} true when the name should be skipped.
 */
function shouldIgnore(name2) {
  for (const ignored of IGNORED_PATTERNS) {
    if (name2 === ignored) {
      return true;
    }
    if (name2.startsWith(ignored + "/")) {
      return true;
    }
  }
  return false;
}

/**
 * Tell whether a file name ends with one of the known binary extensions.
 * The comparison is case-insensitive.
 *
 * @param {string} filename - File name to test.
 * @returns {boolean} true when the name carries a binary extension.
 */
function isBinaryFile(filename) {
  const normalized = filename.toLowerCase();
  for (const extension of BINARY_EXTENSIONS) {
    if (normalized.endsWith(extension)) {
      return true;
    }
  }
  return false;
}
6878
/**
 * Capture the readable text files under `dir` as a map of
 * relative path -> UTF-8 content.
 *
 * Skipped entries: names rejected by `shouldIgnore`, files accepted by
 * `isBinaryFile`, files larger than `MAX_FILE_SIZE` bytes, and anything that
 * is neither a regular file nor a directory as reported by `readdirSync`.
 * The snapshot is best-effort: a file or directory that cannot be read is
 * left out instead of aborting the whole walk.
 *
 * @param {string} dir - Directory to walk.
 * @param {string} [baseDir] - Root that keys are made relative to
 *   (defaults to `dir`).
 * @returns {Object<string, string>} Map of relative path to file content;
 *   empty when `dir` does not exist.
 */
function snapshotDirectory(dir, baseDir) {
  const snapshot = {};
  const base = baseDir || dir;
  if (!(0, import_fs6.existsSync)(dir)) {
    return snapshot;
  }
  // Walk with one shared accumulator instead of merging per-directory
  // snapshots, so nested entries are not re-copied at every level.
  const walk = (currentDir) => {
    let entries;
    try {
      entries = (0, import_fs6.readdirSync)(currentDir, { withFileTypes: true });
    } catch (err) {
      // An unreadable directory (permissions, removed mid-walk, not a
      // directory) must not abort the snapshot; per-file errors are
      // tolerated the same way below.
      console.error(
        "[file-diff] Skipping unreadable directory:",
        currentDir,
        err instanceof Error ? err.message : String(err)
      );
      return;
    }
    for (const entry of entries) {
      if (shouldIgnore(entry.name)) {
        continue;
      }
      const fullPath = (0, import_path6.join)(currentDir, entry.name);
      if (entry.isDirectory()) {
        walk(fullPath);
        continue;
      }
      if (!entry.isFile() || isBinaryFile(entry.name)) {
        continue;
      }
      try {
        const stats = (0, import_fs6.statSync)(fullPath);
        if (stats.size > MAX_FILE_SIZE) {
          continue;
        }
        const content = (0, import_fs6.readFileSync)(fullPath, "utf-8");
        // defineProperty rather than `snapshot[key] = content`: plain
        // assignment to the key "__proto__" is silently discarded for string
        // values, which would drop a file with that name.
        Object.defineProperty(snapshot, (0, import_path6.relative)(base, fullPath), {
          value: content,
          enumerable: true,
          writable: true,
          configurable: true
        });
      } catch {
        // Best-effort: a file that vanished or cannot be read is left out.
        continue;
      }
    }
  };
  walk(dir);
  return snapshot;
}
6912
/**
 * Build a positional, line-by-line comparison of two texts.
 *
 * Line i of `before` is compared with line i of `after`; this is not a
 * longest-common-subsequence diff, so an inserted line shifts every line
 * after it into the "changed" state. `lineNumber` is the 1-based position of
 * the compared pair; a removed and an added entry for the same position share
 * it.
 *
 * @param {string|null|undefined} before - Original text; null/undefined means
 *   "no file", which contributes zero lines.
 * @param {string|null|undefined} after - New text; null/undefined means
 *   "no file", which contributes zero lines.
 * @returns {Array<{type: string, content: string, lineNumber: number}>}
 *   Entries typed "unchanged", "removed" or "added".
 */
function generateDiffLines(before, after) {
  // "".split("\n") is [""], so an absent side must not be passed as "" or it
  // would show up as one phantom blank line.
  const beforeLines = before == null ? [] : before.split("\n");
  const afterLines = after == null ? [] : after.split("\n");
  const result = [];
  const maxLines = Math.max(beforeLines.length, afterLines.length);
  for (let i = 0; i < maxLines; i++) {
    const lineNumber = i + 1;
    const beforeLine = beforeLines[i];
    const afterLine = afterLines[i];
    if (beforeLine === afterLine) {
      result.push({ type: "unchanged", content: beforeLine, lineNumber });
      continue;
    }
    if (beforeLine !== void 0) {
      result.push({ type: "removed", content: beforeLine, lineNumber });
    }
    if (afterLine !== void 0) {
      result.push({ type: "added", content: afterLine, lineNumber });
    }
  }
  return result;
}

/**
 * Compare two path -> content snapshots and describe every file that was
 * created, deleted or modified.
 *
 * @param {Object<string, string>} before - Snapshot taken first.
 * @param {Object<string, string>} after - Snapshot taken afterwards.
 * @returns {Array<{path: string, expected: string, actual: string, diffLines: Array}>}
 *   One entry per differing path, sorted by path. `expected` / `actual` are
 *   "" for the side on which the file does not exist.
 */
function diffSnapshots(before, after) {
  // Own-property check: a path such as "constructor" or "toString" must not
  // resolve to the member inherited from Object.prototype.
  const has = (snapshot, key) => Object.prototype.hasOwnProperty.call(snapshot, key);
  const diffs = [];
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
  for (const path10 of allPaths) {
    const existedBefore = has(before, path10);
    const existsAfter = has(after, path10);
    const beforeContent = existedBefore ? before[path10] ?? "" : "";
    const afterContent = existsAfter ? after[path10] ?? "" : "";
    // Presence is compared as well as content, so creating or deleting an
    // empty file is still reported.
    if (existedBefore === existsAfter && beforeContent === afterContent) {
      continue;
    }
    diffs.push({
      path: path10,
      expected: beforeContent,
      actual: afterContent,
      diffLines: generateDiffLines(
        existedBefore ? beforeContent : null,
        existsAfter ? afterContent : null
      )
    });
  }
  diffs.sort((a, b) => a.path.localeCompare(b.path));
  return diffs;
}
6969
+
6797
6970
  // src/run-scenario/callSkill.ts
6798
6971
  async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6799
6972
  if (agent && agent.runCommand !== "claude") {
@@ -6802,6 +6975,7 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6802
6975
  );
6803
6976
  }
6804
6977
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
6978
+ const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
6805
6979
  const { result, llmTrace } = await executeWithClaudeCode(skill, scenario, {
6806
6980
  cwd: workDir || process.cwd(),
6807
6981
  systemPrompt: skill.skillMd,
@@ -6822,6 +6996,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6822
6996
  }
6823
6997
  });
6824
6998
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
6999
+ const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
7000
+ const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
6825
7001
  return {
6826
7002
  id: (0, import_crypto2.randomUUID)(),
6827
7003
  targetId: skill.id,
@@ -6831,6 +7007,7 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6831
7007
  modelConfig: agent?.modelConfig,
6832
7008
  duration: result.durationMs,
6833
7009
  outputText: result.outputText,
7010
+ fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
6834
7011
  startedAt,
6835
7012
  completedAt,
6836
7013
  llmTrace
@@ -7442,33 +7619,14 @@ async function testClaudeDirectExecution(config) {
7442
7619
  const versionResult = await runAndLog(
7443
7620
  "claude --version",
7444
7621
  `${envExports} && "${claudePath}" --version 2>&1`,
7445
- 15e3
7446
- );
7447
- const fullCmdResult = await runAndLog(
7448
- "claude -p (with json output)",
7449
- `${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
7450
- 45e3
7451
- );
7452
- const simpleCmdResult = await runAndLog(
7453
- "claude -p (simple)",
7454
- `${envExports} && "${claudePath}" -p "Hello" 2>&1`,
7455
- 45e3
7456
- );
7457
- const printFlagResult = await runAndLog(
7458
- "claude --print (long flag)",
7459
- `${envExports} && "${claudePath}" --print "Hi" 2>&1`,
7460
- 45e3
7461
- );
7462
- const positionalResult = await runAndLog(
7463
- 'claude "prompt" (positional)',
7464
- `${envExports} && "${claudePath}" "Hello world" 2>&1`,
7465
- 45e3
7622
+ 1e4
7623
+ // Short timeout - should complete in <2s
7466
7624
  );
7467
- await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
7625
+ await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 1e4);
7468
7626
  await runAndLog(
7469
7627
  "claude --version (no custom env)",
7470
7628
  `"${claudePath}" --version 2>&1`,
7471
- 15e3
7629
+ 1e4
7472
7630
  );
7473
7631
  const homeDir = process.env.HOME || "/tmp";
7474
7632
  const claudeConfigDir = path9.join(homeDir, ".claude");
@@ -7478,35 +7636,110 @@ async function testClaudeDirectExecution(config) {
7478
7636
  try {
7479
7637
  const configContents = fs11.readdirSync(claudeConfigDir);
7480
7638
  details.claudeConfigContents = configContents;
7481
- for (const file of configContents) {
7482
- if (file.includes("log") || file.includes("error")) {
7483
- const logPath = path9.join(claudeConfigDir, file);
7484
- const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
7485
- const logContent = await execCommand(catCmd);
7486
- details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
7487
- }
7488
- }
7489
7639
  } catch (e) {
7490
7640
  details.claudeConfigError = e instanceof Error ? e.message : String(e);
7491
7641
  }
7492
7642
  }
7493
7643
  details.commandResults = commandResults;
7494
- const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
7495
7644
  const versionWorked = versionResult.exitCode === 0;
7496
- const passed = anyPromptWorked;
7497
- let errorMsg;
7498
- if (!passed) {
7499
- const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
7500
- errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
7501
- }
7645
+ const passed = versionWorked;
7502
7646
  return {
7503
- name: "claude-direct-execution",
7647
+ name: "claude-cli-basic",
7504
7648
  passed,
7505
7649
  details,
7506
- error: errorMsg,
7650
+ error: passed ? void 0 : `Claude CLI --version failed with exit code ${versionResult.exitCode}`,
7507
7651
  durationMs: Date.now() - start
7508
7652
  };
7509
7653
  }
7654
/**
 * Diagnostic: run a single-turn query through the Claude Agent SDK and report
 * whether any message came back.
 *
 * The SDK is imported dynamically, the environment is pointed at the AI
 * gateway described by `config`, and the query is raced against a 30 s
 * timeout. The function never throws; every failure is folded into the
 * returned result.
 *
 * @param {{aiGatewayUrl?: string, aiGatewayHeaders?: Object<string, string>}} config
 *   Gateway settings: the base URL and extra headers to send.
 * @returns {Promise<{name: string, passed: boolean, details: Object, error: (string|undefined), durationMs: number}>}
 *   Result named "claude-sdk-execution"; `passed` is true when at least one
 *   SDK message was received.
 */
async function testClaudeSdkExecution(config) {
  const start = Date.now();
  const details = {};
  const SDK_TIMEOUT_MS = 3e4;
  // Lets the timeout cancel the in-flight SDK query instead of leaving it
  // running after this diagnostic has already reported a failure.
  const abortController = new AbortController();
  let timeoutHandle;
  try {
    console.error("[SDK-DIAG] Importing Claude Agent SDK...");
    const sdk = await import("@anthropic-ai/claude-agent-sdk");
    details.sdkImported = true;
    const env = { ...process.env };
    // The key is a placeholder; the variable name says authentication is
    // handled by the gateway.
    const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
    env.ANTHROPIC_API_KEY = placeholderApiKey;
    env.ANTHROPIC_AUTH_TOKEN = placeholderApiKey;
    if (config.aiGatewayUrl) {
      env.ANTHROPIC_BASE_URL = config.aiGatewayUrl;
    }
    if (config.aiGatewayHeaders) {
      // One "name:value" pair per line.
      const headerLines = Object.entries(config.aiGatewayHeaders).map(([key, value]) => `${key}:${value}`).join("\n");
      env.ANTHROPIC_CUSTOM_HEADERS = headerLines;
    }
    details.envConfigured = true;
    details.hasBaseUrl = !!env.ANTHROPIC_BASE_URL;
    details.hasCustomHeaders = !!env.ANTHROPIC_CUSTOM_HEADERS;
    const canUseTool = async () => {
      console.error("[SDK-DIAG] canUseTool callback called - returning allow");
      return { behavior: "allow" };
    };
    const queryOptions = {
      env,
      cwd: "/tmp",
      settingSources: ["project"],
      allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
      model: "claude-3-5-sonnet-latest",
      // Just one turn for this test
      maxTurns: 1,
      permissionMode: "default",
      canUseTool,
      abortController
    };
    details.queryOptionsConfigured = true;
    console.error("[SDK-DIAG] Starting SDK query with canUseTool...");
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        // Reject first so the race settles with the timeout error, then
        // cancel the query that is still running.
        reject(new Error(`SDK test timed out after ${SDK_TIMEOUT_MS}ms`));
        abortController.abort();
      }, SDK_TIMEOUT_MS);
    });
    const sdkPromise = (async () => {
      let messageCount2 = 0;
      let resultMessage2 = null;
      // The settings go under `options`, as executeWithClaudeCode passes
      // them; spread at the top level they would not be applied.
      for await (const message of sdk.query({
        prompt: 'Say "SDK test successful" and nothing else.',
        options: queryOptions
      })) {
        messageCount2++;
        console.error(`[SDK-DIAG] Received message ${messageCount2}:`, message);
        if (message.type === "result" || message.type === "assistant") {
          resultMessage2 = message;
        }
      }
      return { messageCount: messageCount2, resultMessage: resultMessage2 };
    })();
    const { messageCount, resultMessage } = await Promise.race([
      sdkPromise,
      timeoutPromise
    ]);
    details.messageCount = messageCount;
    details.hasResultMessage = !!resultMessage;
    details.resultPreview = resultMessage && typeof resultMessage === "object" ? JSON.stringify(resultMessage).slice(0, 500) : null;
    const passed = messageCount > 0;
    return {
      name: "claude-sdk-execution",
      passed,
      details,
      error: passed ? void 0 : "SDK query completed but returned no messages",
      durationMs: Date.now() - start
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    details.error = error;
    details.errorStack = err instanceof Error ? err.stack?.split("\n").slice(0, 5) : void 0;
    console.error("[SDK-DIAG] SDK test failed:", error);
    return {
      name: "claude-sdk-execution",
      passed: false,
      details,
      error: `SDK execution failed: ${error}`,
      durationMs: Date.now() - start
    };
  } finally {
    // Without this the pending timer keeps the event loop alive for the
    // rest of the timeout window even after the query has finished.
    clearTimeout(timeoutHandle);
  }
}
7510
7743
  async function testChildProcessSpawning() {
7511
7744
  const start = Date.now();
7512
7745
  const details = {};
@@ -7711,15 +7944,13 @@ async function runDiagnostics(config, evalRunId2) {
7711
7944
  await runTest("claude-cli-execution", testClaudeExecution);
7712
7945
  await runTest("environment-dump", testEnvironmentDump);
7713
7946
  await runTest("file-system-structure", testFileSystemStructure);
7714
- await runTest("network-connectivity", () => testNetworkConnectivity(config));
7715
- await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7716
- await runTest(
7717
- "claude-direct-execution",
7718
- () => testClaudeDirectExecution(config)
7719
- );
7720
7947
  await runTest("child-process-spawning", testChildProcessSpawning);
7721
- await runTest("sdk-import", testSdkImport);
7722
7948
  await runTest("file-system-write", testFileSystemWrite);
7949
+ await runTest("sdk-import", testSdkImport);
7950
+ await runTest("network-connectivity", () => testNetworkConnectivity(config));
7951
+ await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7952
+ await runTest("claude-cli-basic", () => testClaudeDirectExecution(config));
7953
+ await runTest("claude-sdk-execution", () => testClaudeSdkExecution(config));
7723
7954
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7724
7955
  const totalDurationMs = Date.now() - startTime;
7725
7956
  const report = {