@wix/evalforge-evaluator 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -6343,40 +6343,59 @@ async function executeWithClaudeCode(skill, scenario, options) {
6343
6343
  traceContext.authToken
6344
6344
  );
6345
6345
  }
6346
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6347
+ let timeoutHandle;
6348
+ let timedOut = false;
6346
6349
  try {
6347
- for await (const message of query({
6348
- prompt: scenario.triggerPrompt,
6349
- options: queryOptions
6350
- })) {
6351
- messageCount++;
6352
- console.log("[SDK Message]", JSON.stringify(message, null, 2));
6353
- allMessages.push(message);
6354
- if (messageCount <= 3) {
6355
- console.error(
6356
- "[DEBUG-H5] SDK message received",
6357
- JSON.stringify({
6358
- messageCount,
6359
- type: message.type,
6360
- timestamp: Date.now()
6361
- })
6362
- );
6363
- }
6364
- if (traceContext && isAssistantMessage(message)) {
6365
- traceStepNumber++;
6366
- const traceEvent = createTraceEventFromMessage(
6367
- message,
6368
- traceContext,
6369
- traceStepNumber,
6370
- false
6371
- // Not complete yet
6372
- );
6373
- emitTraceEvent(
6374
- traceEvent,
6375
- traceContext.tracePushUrl,
6376
- traceContext.routeHeader,
6377
- traceContext.authToken
6350
+ const timeoutPromise = new Promise((_, reject) => {
6351
+ timeoutHandle = setTimeout(() => {
6352
+ timedOut = true;
6353
+ reject(
6354
+ new Error(
6355
+ `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6356
+ )
6378
6357
  );
6358
+ }, SDK_TIMEOUT_MS);
6359
+ });
6360
+ const sdkPromise = (async () => {
6361
+ for await (const message of query({
6362
+ prompt: scenario.triggerPrompt,
6363
+ options: queryOptions
6364
+ })) {
6365
+ messageCount++;
6366
+ console.log("[SDK Message]", JSON.stringify(message, null, 2));
6367
+ allMessages.push(message);
6368
+ if (messageCount <= 3) {
6369
+ console.error(
6370
+ "[DEBUG-H5] SDK message received",
6371
+ JSON.stringify({
6372
+ messageCount,
6373
+ type: message.type,
6374
+ timestamp: Date.now()
6375
+ })
6376
+ );
6377
+ }
6378
+ if (traceContext && isAssistantMessage(message)) {
6379
+ traceStepNumber++;
6380
+ const traceEvent = createTraceEventFromMessage(
6381
+ message,
6382
+ traceContext,
6383
+ traceStepNumber,
6384
+ false
6385
+ // Not complete yet
6386
+ );
6387
+ emitTraceEvent(
6388
+ traceEvent,
6389
+ traceContext.tracePushUrl,
6390
+ traceContext.routeHeader,
6391
+ traceContext.authToken
6392
+ );
6393
+ }
6379
6394
  }
6395
+ })();
6396
+ await Promise.race([sdkPromise, timeoutPromise]);
6397
+ if (timeoutHandle) {
6398
+ clearTimeout(timeoutHandle);
6380
6399
  }
6381
6400
  console.log(
6382
6401
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
@@ -6384,6 +6403,12 @@ async function executeWithClaudeCode(skill, scenario, options) {
6384
6403
  "messages"
6385
6404
  );
6386
6405
  } catch (sdkError) {
6406
+ if (timeoutHandle) {
6407
+ clearTimeout(timeoutHandle);
6408
+ }
6409
+ if (timedOut) {
6410
+ console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6411
+ }
6387
6412
  console.error("[SDK-ERROR] ====== CLAUDE SDK EXECUTION FAILED ======");
6388
6413
  console.error("[SDK-ERROR] Timestamp:", (/* @__PURE__ */ new Date()).toISOString());
6389
6414
  console.error(
@@ -6777,6 +6802,154 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
6777
6802
  };
6778
6803
  }
6779
6804
 
6805
+ // src/run-scenario/file-diff.ts
6806
+ import { readdirSync as readdirSync2, readFileSync, statSync, existsSync as existsSync2 } from "fs";
6807
+ import { join as join3, relative } from "path";
6808
// Entry names that shouldIgnore() matches: dependency trees, VCS and tool
// metadata, build output, caches and Python virtual environments.
var IGNORED_PATTERNS = [
  // dependencies, version control and editor/agent state
  "node_modules", ".git", ".claude", ".cursor",
  // build output and framework caches
  "dist", "build", ".next", ".turbo",
  // Python caches and virtual environments
  "__pycache__", ".pytest_cache", ".venv", "venv",
  // macOS folder metadata
  ".DS_Store"
];

// File-name suffixes that isBinaryFile() matches (compared against the
// lower-cased file name).
var BINARY_EXTENSIONS = [
  // images
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".svg",
  // fonts
  ".woff", ".woff2", ".ttf", ".eot",
  // audio and video
  ".mp3", ".mp4", ".wav", ".avi", ".mov",
  // documents and archives
  ".pdf", ".zip", ".tar", ".gz", ".rar", ".7z",
  // executables and shared libraries
  ".exe", ".dll", ".so", ".dylib"
];

// Size limit in bytes (100 KiB); snapshotDirectory() skips larger files.
var MAX_FILE_SIZE = 1024 * 100;
6852
/**
 * Reports whether a name matches an entry of IGNORED_PATTERNS.
 *
 * A name matches when it is exactly equal to an entry, or when it begins
 * with that entry immediately followed by "/".
 *
 * @param {string} name2 - Entry name (or "/"-separated path) to check.
 * @returns {boolean} true when the name matches an ignored pattern.
 */
function shouldIgnore(name2) {
  for (const ignored of IGNORED_PATTERNS) {
    if (name2 === ignored) {
      return true;
    }
    if (name2.startsWith(`${ignored}/`)) {
      return true;
    }
  }
  return false;
}
6857
/**
 * Reports whether a file name ends with one of BINARY_EXTENSIONS.
 * The comparison is case-insensitive: the name is lower-cased first.
 *
 * @param {string} filename - File name to classify.
 * @returns {boolean} true when the name carries a listed extension.
 */
function isBinaryFile(filename) {
  const normalized = filename.toLowerCase();
  for (const extension of BINARY_EXTENSIONS) {
    if (normalized.endsWith(extension)) {
      return true;
    }
  }
  return false;
}
6861
/**
 * Recursively captures the text files under a directory.
 *
 * Entries matched by shouldIgnore() are skipped (directories are not
 * descended into), as are files matched by isBinaryFile() and files larger
 * than MAX_FILE_SIZE bytes. Only directories and regular files are visited.
 *
 * The snapshot is best-effort: a directory that is missing, is not a
 * directory, or cannot be listed contributes nothing, and a file that cannot
 * be stat'ed or read is skipped. Nothing is thrown for these cases.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} [baseDir] - Root that result keys are made relative to;
 *   defaults to `dir`. Recursive calls pass the original root.
 * @returns {Object<string, string>} Map of path (relative to the base) to the
 *   file's content decoded as UTF-8.
 */
function snapshotDirectory(dir, baseDir) {
  const snapshot = {};
  const base = baseDir || dir;
  let entries;
  try {
    entries = readdirSync2(dir, { withFileTypes: true });
  } catch {
    // Missing path, non-directory or permission error: treat like an empty
    // directory so one bad subtree cannot abort the whole snapshot.
    return snapshot;
  }
  for (const entry of entries) {
    if (shouldIgnore(entry.name)) {
      continue;
    }
    const fullPath = join3(dir, entry.name);
    if (entry.isDirectory()) {
      Object.assign(snapshot, snapshotDirectory(fullPath, base));
      continue;
    }
    if (!entry.isFile() || isBinaryFile(entry.name)) {
      continue;
    }
    try {
      if (statSync(fullPath).size > MAX_FILE_SIZE) {
        continue;
      }
      snapshot[relative(base, fullPath)] = readFileSync(fullPath, "utf-8");
    } catch {
      // Unreadable or vanished file: skip it and keep going.
      continue;
    }
  }
  return snapshot;
}
6895
/**
 * Builds a line-by-line comparison of two texts.
 *
 * Lines are compared by position (line i of `before` against line i of
 * `after`); this is not a longest-common-subsequence diff, so an insertion
 * shifts every following line into the "changed" state.
 *
 * Empty content is treated as zero lines. Without that, `"".split("\n")`
 * produces one empty line, which showed up as a phantom "removed" line at the
 * top of every created file and a phantom "added" line for every deleted one.
 *
 * @param {string} before - Original text.
 * @param {string} after - New text.
 * @returns {Array<{type: "unchanged"|"removed"|"added", content: string, lineNumber: number}>}
 *   Entries in line order; a changed position emits "removed" then "added",
 *   both carrying the same 1-based lineNumber.
 */
function generateDiffLines(before, after) {
  const toLines = (text) => (text === "" ? [] : text.split("\n"));
  const beforeLines = toLines(before);
  const afterLines = toLines(after);
  const result = [];
  const maxLines = Math.max(beforeLines.length, afterLines.length);
  for (let i = 0; i < maxLines; i++) {
    const lineNumber = i + 1;
    const beforeLine = beforeLines[i];
    const afterLine = afterLines[i];
    if (beforeLine === afterLine) {
      result.push({ type: "unchanged", content: beforeLine, lineNumber });
      continue;
    }
    // undefined means that side has run out of lines at this position.
    if (beforeLine !== void 0) {
      result.push({ type: "removed", content: beforeLine, lineNumber });
    }
    if (afterLine !== void 0) {
      result.push({ type: "added", content: afterLine, lineNumber });
    }
  }
  return result;
}
6932
/**
 * Compares two directory snapshots (path -> content maps) and lists every
 * path that was created, deleted or modified.
 *
 * Lookups use own properties only, so a file named like an Object.prototype
 * member ("toString", "constructor", ...) is not mistaken for existing
 * content. Presence is tracked separately from content, so creating or
 * deleting an empty file is reported rather than being indistinguishable
 * from "no change".
 *
 * @param {Object<string, string>} before - Snapshot taken first.
 * @param {Object<string, string>} after - Snapshot taken afterwards.
 * @returns {Array<{path: string, expected: string, actual: string, diffLines: Array}>}
 *   One entry per changed path, sorted by path with localeCompare.
 *   `expected` is the content from `before`, `actual` the content from
 *   `after`; a side where the path is absent is represented as "".
 */
function diffSnapshots(before, after) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  const allPaths = new Set([...Object.keys(before), ...Object.keys(after)]);
  const diffs = [];
  for (const path10 of allPaths) {
    const existedBefore = hasOwn(before, path10);
    const existsAfter = hasOwn(after, path10);
    const beforeContent = existedBefore ? before[path10] ?? "" : "";
    const afterContent = existsAfter ? after[path10] ?? "" : "";
    // Unchanged only when the path is present on both sides with equal content.
    if (existedBefore === existsAfter && beforeContent === afterContent) {
      continue;
    }
    diffs.push({
      path: path10,
      expected: beforeContent,
      actual: afterContent,
      diffLines: generateDiffLines(beforeContent, afterContent)
    });
  }
  diffs.sort((a, b) => a.path.localeCompare(b.path));
  return diffs;
}
6952
+
6780
6953
  // src/run-scenario/callSkill.ts
6781
6954
  async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6782
6955
  if (agent && agent.runCommand !== "claude") {
@@ -6785,6 +6958,7 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6785
6958
  );
6786
6959
  }
6787
6960
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
6961
+ const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
6788
6962
  const { result, llmTrace } = await executeWithClaudeCode(skill, scenario, {
6789
6963
  cwd: workDir || process.cwd(),
6790
6964
  systemPrompt: skill.skillMd,
@@ -6805,6 +6979,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6805
6979
  }
6806
6980
  });
6807
6981
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
6982
+ const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
6983
+ const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
6808
6984
  return {
6809
6985
  id: randomUUID2(),
6810
6986
  targetId: skill.id,
@@ -6814,6 +6990,7 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
6814
6990
  modelConfig: agent?.modelConfig,
6815
6991
  duration: result.durationMs,
6816
6992
  outputText: result.outputText,
6993
+ fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
6817
6994
  startedAt,
6818
6995
  completedAt,
6819
6996
  llmTrace
@@ -7425,33 +7602,14 @@ async function testClaudeDirectExecution(config) {
7425
7602
  const versionResult = await runAndLog(
7426
7603
  "claude --version",
7427
7604
  `${envExports} && "${claudePath}" --version 2>&1`,
7428
- 15e3
7429
- );
7430
- const fullCmdResult = await runAndLog(
7431
- "claude -p (with json output)",
7432
- `${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
7433
- 45e3
7434
- );
7435
- const simpleCmdResult = await runAndLog(
7436
- "claude -p (simple)",
7437
- `${envExports} && "${claudePath}" -p "Hello" 2>&1`,
7438
- 45e3
7439
- );
7440
- const printFlagResult = await runAndLog(
7441
- "claude --print (long flag)",
7442
- `${envExports} && "${claudePath}" --print "Hi" 2>&1`,
7443
- 45e3
7444
- );
7445
- const positionalResult = await runAndLog(
7446
- 'claude "prompt" (positional)',
7447
- `${envExports} && "${claudePath}" "Hello world" 2>&1`,
7448
- 45e3
7605
+ 1e4
7606
+ // Short timeout - should complete in <2s
7449
7607
  );
7450
- await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
7608
+ await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 1e4);
7451
7609
  await runAndLog(
7452
7610
  "claude --version (no custom env)",
7453
7611
  `"${claudePath}" --version 2>&1`,
7454
- 15e3
7612
+ 1e4
7455
7613
  );
7456
7614
  const homeDir = process.env.HOME || "/tmp";
7457
7615
  const claudeConfigDir = path9.join(homeDir, ".claude");
@@ -7461,35 +7619,110 @@ async function testClaudeDirectExecution(config) {
7461
7619
  try {
7462
7620
  const configContents = fs11.readdirSync(claudeConfigDir);
7463
7621
  details.claudeConfigContents = configContents;
7464
- for (const file of configContents) {
7465
- if (file.includes("log") || file.includes("error")) {
7466
- const logPath = path9.join(claudeConfigDir, file);
7467
- const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
7468
- const logContent = await execCommand(catCmd);
7469
- details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
7470
- }
7471
- }
7472
7622
  } catch (e) {
7473
7623
  details.claudeConfigError = e instanceof Error ? e.message : String(e);
7474
7624
  }
7475
7625
  }
7476
7626
  details.commandResults = commandResults;
7477
- const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
7478
7627
  const versionWorked = versionResult.exitCode === 0;
7479
- const passed = anyPromptWorked;
7480
- let errorMsg;
7481
- if (!passed) {
7482
- const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
7483
- errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
7484
- }
7628
+ const passed = versionWorked;
7485
7629
  return {
7486
- name: "claude-direct-execution",
7630
+ name: "claude-cli-basic",
7487
7631
  passed,
7488
7632
  details,
7489
- error: errorMsg,
7633
+ error: passed ? void 0 : `Claude CLI --version failed with exit code ${versionResult.exitCode}`,
7490
7634
  durationMs: Date.now() - start
7491
7635
  };
7492
7636
  }
7637
/**
 * Diagnostic: runs one single-turn query through the Claude Agent SDK and
 * reports whether any message came back.
 *
 * The SDK is imported dynamically and given a copy of process.env with a
 * placeholder API key plus, when present on `config`, the AI gateway base
 * URL and custom headers. The query races against a 30s timeout.
 *
 * @param {{aiGatewayUrl?: string, aiGatewayHeaders?: Object<string, string>}} config
 *   Gateway settings read by this test.
 * @returns {Promise<{name: string, passed: boolean, details: object, error: (string|undefined), durationMs: number}>}
 *   Never rejects: import, SDK and timeout failures are reported as
 *   `passed: false` with the message in `error`.
 */
async function testClaudeSdkExecution(config) {
  const start = Date.now();
  const details = {};
  const SDK_TIMEOUT_MS = 3e4;
  let timeoutHandle;
  try {
    console.error("[SDK-DIAG] Importing Claude Agent SDK...");
    const sdk = await import("@anthropic-ai/claude-agent-sdk");
    details.sdkImported = true;
    const env = { ...process.env };
    const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
    env.ANTHROPIC_API_KEY = placeholderApiKey;
    env.ANTHROPIC_AUTH_TOKEN = placeholderApiKey;
    if (config.aiGatewayUrl) {
      env.ANTHROPIC_BASE_URL = config.aiGatewayUrl;
    }
    if (config.aiGatewayHeaders) {
      const headerLines = Object.entries(config.aiGatewayHeaders).map(([key, value]) => `${key}:${value}`).join("\n");
      env.ANTHROPIC_CUSTOM_HEADERS = headerLines;
    }
    details.envConfigured = true;
    details.hasBaseUrl = !!env.ANTHROPIC_BASE_URL;
    details.hasCustomHeaders = !!env.ANTHROPIC_CUSTOM_HEADERS;
    const canUseTool = async () => {
      console.error("[SDK-DIAG] canUseTool callback called - returning allow");
      return { behavior: "allow" };
    };
    // Lets the timeout below stop the SDK run instead of leaving it going
    // in the background after this test has already reported a failure.
    const abortController = new AbortController();
    const queryOptions = {
      env,
      cwd: "/tmp",
      settingSources: ["project"],
      allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
      model: "claude-3-5-sonnet-latest",
      maxTurns: 1,
      // Just one turn for this test
      permissionMode: "default",
      canUseTool,
      abortController
    };
    details.queryOptionsConfigured = true;
    console.error("[SDK-DIAG] Starting SDK query with canUseTool...");
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        abortController.abort();
        reject(new Error(`SDK test timed out after ${SDK_TIMEOUT_MS}ms`));
      }, SDK_TIMEOUT_MS);
    });
    const sdkPromise = (async () => {
      let messageCount2 = 0;
      let resultMessage2 = null;
      // Options go under `options`, matching the query({ prompt, options })
      // call in executeWithClaudeCode; spread next to `prompt` they are not
      // applied, so the gateway env above would not be exercised.
      // NOTE(review): some SDK versions may only accept canUseTool together
      // with a streaming (AsyncIterable) prompt - confirm against the
      // installed SDK version.
      for await (const message of sdk.query({
        prompt: 'Say "SDK test successful" and nothing else.',
        options: queryOptions
      })) {
        messageCount2++;
        console.error(`[SDK-DIAG] Received message ${messageCount2}:`, message);
        if (message.type === "result" || message.type === "assistant") {
          resultMessage2 = message;
        }
      }
      return { messageCount: messageCount2, resultMessage: resultMessage2 };
    })();
    const { messageCount, resultMessage } = await Promise.race([
      sdkPromise,
      timeoutPromise
    ]);
    details.messageCount = messageCount;
    details.hasResultMessage = !!resultMessage;
    details.resultPreview = resultMessage && typeof resultMessage === "object" ? JSON.stringify(resultMessage).slice(0, 500) : null;
    const passed = messageCount > 0;
    return {
      name: "claude-sdk-execution",
      passed,
      details,
      error: passed ? void 0 : "SDK query completed but returned no messages",
      durationMs: Date.now() - start
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    details.error = error;
    details.errorStack = err instanceof Error ? err.stack?.split("\n").slice(0, 5) : void 0;
    console.error("[SDK-DIAG] SDK test failed:", error);
    return {
      name: "claude-sdk-execution",
      passed: false,
      details,
      error: `SDK execution failed: ${error}`,
      durationMs: Date.now() - start
    };
  } finally {
    // Always release the timer so a finished test does not keep the event
    // loop alive for the rest of the timeout window.
    clearTimeout(timeoutHandle);
  }
}
7493
7726
  async function testChildProcessSpawning() {
7494
7727
  const start = Date.now();
7495
7728
  const details = {};
@@ -7694,15 +7927,13 @@ async function runDiagnostics(config, evalRunId2) {
7694
7927
  await runTest("claude-cli-execution", testClaudeExecution);
7695
7928
  await runTest("environment-dump", testEnvironmentDump);
7696
7929
  await runTest("file-system-structure", testFileSystemStructure);
7697
- await runTest("network-connectivity", () => testNetworkConnectivity(config));
7698
- await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7699
- await runTest(
7700
- "claude-direct-execution",
7701
- () => testClaudeDirectExecution(config)
7702
- );
7703
7930
  await runTest("child-process-spawning", testChildProcessSpawning);
7704
- await runTest("sdk-import", testSdkImport);
7705
7931
  await runTest("file-system-write", testFileSystemWrite);
7932
+ await runTest("sdk-import", testSdkImport);
7933
+ await runTest("network-connectivity", () => testNetworkConnectivity(config));
7934
+ await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7935
+ await runTest("claude-cli-basic", () => testClaudeDirectExecution(config));
7936
+ await runTest("claude-sdk-execution", () => testClaudeSdkExecution(config));
7706
7937
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7707
7938
  const totalDurationMs = Date.now() - startTime;
7708
7939
  const report = {