@wix/evalforge-evaluator 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -6360,40 +6360,59 @@ async function executeWithClaudeCode(skill, scenario, options) {
6360
6360
  traceContext.authToken
6361
6361
  );
6362
6362
  }
6363
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6364
+ let timeoutHandle;
6365
+ let timedOut = false;
6363
6366
  try {
6364
- for await (const message of query({
6365
- prompt: scenario.triggerPrompt,
6366
- options: queryOptions
6367
- })) {
6368
- messageCount++;
6369
- console.log("[SDK Message]", JSON.stringify(message, null, 2));
6370
- allMessages.push(message);
6371
- if (messageCount <= 3) {
6372
- console.error(
6373
- "[DEBUG-H5] SDK message received",
6374
- JSON.stringify({
6375
- messageCount,
6376
- type: message.type,
6377
- timestamp: Date.now()
6378
- })
6379
- );
6380
- }
6381
- if (traceContext && isAssistantMessage(message)) {
6382
- traceStepNumber++;
6383
- const traceEvent = createTraceEventFromMessage(
6384
- message,
6385
- traceContext,
6386
- traceStepNumber,
6387
- false
6388
- // Not complete yet
6389
- );
6390
- emitTraceEvent(
6391
- traceEvent,
6392
- traceContext.tracePushUrl,
6393
- traceContext.routeHeader,
6394
- traceContext.authToken
6367
+ const timeoutPromise = new Promise((_, reject) => {
6368
+ timeoutHandle = setTimeout(() => {
6369
+ timedOut = true;
6370
+ reject(
6371
+ new Error(
6372
+ `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6373
+ )
6395
6374
  );
6375
+ }, SDK_TIMEOUT_MS);
6376
+ });
6377
+ const sdkPromise = (async () => {
6378
+ for await (const message of query({
6379
+ prompt: scenario.triggerPrompt,
6380
+ options: queryOptions
6381
+ })) {
6382
+ messageCount++;
6383
+ console.log("[SDK Message]", JSON.stringify(message, null, 2));
6384
+ allMessages.push(message);
6385
+ if (messageCount <= 3) {
6386
+ console.error(
6387
+ "[DEBUG-H5] SDK message received",
6388
+ JSON.stringify({
6389
+ messageCount,
6390
+ type: message.type,
6391
+ timestamp: Date.now()
6392
+ })
6393
+ );
6394
+ }
6395
+ if (traceContext && isAssistantMessage(message)) {
6396
+ traceStepNumber++;
6397
+ const traceEvent = createTraceEventFromMessage(
6398
+ message,
6399
+ traceContext,
6400
+ traceStepNumber,
6401
+ false
6402
+ // Not complete yet
6403
+ );
6404
+ emitTraceEvent(
6405
+ traceEvent,
6406
+ traceContext.tracePushUrl,
6407
+ traceContext.routeHeader,
6408
+ traceContext.authToken
6409
+ );
6410
+ }
6396
6411
  }
6412
+ })();
6413
+ await Promise.race([sdkPromise, timeoutPromise]);
6414
+ if (timeoutHandle) {
6415
+ clearTimeout(timeoutHandle);
6397
6416
  }
6398
6417
  console.log(
6399
6418
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
@@ -6401,6 +6420,12 @@ async function executeWithClaudeCode(skill, scenario, options) {
6401
6420
  "messages"
6402
6421
  );
6403
6422
  } catch (sdkError) {
6423
+ if (timeoutHandle) {
6424
+ clearTimeout(timeoutHandle);
6425
+ }
6426
+ if (timedOut) {
6427
+ console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6428
+ }
6404
6429
  console.error("[SDK-ERROR] ====== CLAUDE SDK EXECUTION FAILED ======");
6405
6430
  console.error("[SDK-ERROR] Timestamp:", (/* @__PURE__ */ new Date()).toISOString());
6406
6431
  console.error(
@@ -7442,33 +7467,14 @@ async function testClaudeDirectExecution(config) {
7442
7467
  const versionResult = await runAndLog(
7443
7468
  "claude --version",
7444
7469
  `${envExports} && "${claudePath}" --version 2>&1`,
7445
- 15e3
7446
- );
7447
- const fullCmdResult = await runAndLog(
7448
- "claude -p (with json output)",
7449
- `${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
7450
- 45e3
7451
- );
7452
- const simpleCmdResult = await runAndLog(
7453
- "claude -p (simple)",
7454
- `${envExports} && "${claudePath}" -p "Hello" 2>&1`,
7455
- 45e3
7470
+ 1e4
7471
+ // Short timeout - should complete in <2s
7456
7472
  );
7457
- const printFlagResult = await runAndLog(
7458
- "claude --print (long flag)",
7459
- `${envExports} && "${claudePath}" --print "Hi" 2>&1`,
7460
- 45e3
7461
- );
7462
- const positionalResult = await runAndLog(
7463
- 'claude "prompt" (positional)',
7464
- `${envExports} && "${claudePath}" "Hello world" 2>&1`,
7465
- 45e3
7466
- );
7467
- await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
7473
+ await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 1e4);
7468
7474
  await runAndLog(
7469
7475
  "claude --version (no custom env)",
7470
7476
  `"${claudePath}" --version 2>&1`,
7471
- 15e3
7477
+ 1e4
7472
7478
  );
7473
7479
  const homeDir = process.env.HOME || "/tmp";
7474
7480
  const claudeConfigDir = path9.join(homeDir, ".claude");
@@ -7478,35 +7484,110 @@ async function testClaudeDirectExecution(config) {
7478
7484
  try {
7479
7485
  const configContents = fs11.readdirSync(claudeConfigDir);
7480
7486
  details.claudeConfigContents = configContents;
7481
- for (const file of configContents) {
7482
- if (file.includes("log") || file.includes("error")) {
7483
- const logPath = path9.join(claudeConfigDir, file);
7484
- const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
7485
- const logContent = await execCommand(catCmd);
7486
- details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
7487
- }
7488
- }
7489
7487
  } catch (e) {
7490
7488
  details.claudeConfigError = e instanceof Error ? e.message : String(e);
7491
7489
  }
7492
7490
  }
7493
7491
  details.commandResults = commandResults;
7494
- const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
7495
7492
  const versionWorked = versionResult.exitCode === 0;
7496
- const passed = anyPromptWorked;
7497
- let errorMsg;
7498
- if (!passed) {
7499
- const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
7500
- errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
7501
- }
7493
+ const passed = versionWorked;
7502
7494
  return {
7503
- name: "claude-direct-execution",
7495
+ name: "claude-cli-basic",
7504
7496
  passed,
7505
7497
  details,
7506
- error: errorMsg,
7498
+ error: passed ? void 0 : `Claude CLI --version failed with exit code ${versionResult.exitCode}`,
7507
7499
  durationMs: Date.now() - start
7508
7500
  };
7509
7501
  }
7502
/**
 * Diagnostic: runs a minimal single-turn query through the Claude Agent SDK
 * and reports whether any message came back.
 *
 * The SDK is imported dynamically so that an import failure is reported as a
 * failed diagnostic instead of crashing the caller. The query is raced against
 * a fixed timeout; on timeout the query is aborted and the diagnostic fails.
 *
 * @param {object} config - Diagnostic configuration.
 * @param {string} [config.aiGatewayUrl] - Exported as ANTHROPIC_BASE_URL when set.
 * @param {Object<string, string>} [config.aiGatewayHeaders] - Exported as
 *   newline-separated `key:value` pairs in ANTHROPIC_CUSTOM_HEADERS when set.
 * @returns {Promise<{name: string, passed: boolean, details: object, error: (string|undefined), durationMs: number}>}
 *   Never rejects; every failure (import, query, timeout) is folded into the result.
 */
async function testClaudeSdkExecution(config) {
  const start = Date.now();
  const details = {};
  const SDK_TIMEOUT_MS = 3e4;
  // Lets the timeout path cancel the in-flight query instead of orphaning it.
  const abortController = new AbortController();
  let timeoutHandle;
  try {
    console.error("[SDK-DIAG] Importing Claude Agent SDK...");
    const sdk = await import("@anthropic-ai/claude-agent-sdk");
    details.sdkImported = true;
    const env = { ...process.env };
    // Both credentials are set to the same placeholder value; the literal
    // itself says authentication is handled by the gateway.
    const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
    env.ANTHROPIC_API_KEY = placeholderApiKey;
    env.ANTHROPIC_AUTH_TOKEN = placeholderApiKey;
    if (config.aiGatewayUrl) {
      env.ANTHROPIC_BASE_URL = config.aiGatewayUrl;
    }
    if (config.aiGatewayHeaders) {
      const headerLines = Object.entries(config.aiGatewayHeaders).map(([key, value]) => `${key}:${value}`).join("\n");
      env.ANTHROPIC_CUSTOM_HEADERS = headerLines;
    }
    details.envConfigured = true;
    details.hasBaseUrl = !!env.ANTHROPIC_BASE_URL;
    details.hasCustomHeaders = !!env.ANTHROPIC_CUSTOM_HEADERS;
    const canUseTool = async () => {
      console.error("[SDK-DIAG] canUseTool callback called - returning allow");
      return { behavior: "allow" };
    };
    const queryOptions = {
      env,
      cwd: "/tmp",
      settingSources: ["project"],
      allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
      model: "claude-3-5-sonnet-latest",
      maxTurns: 1,
      // Just one turn for this test
      permissionMode: "default",
      canUseTool,
      abortController
    };
    details.queryOptionsConfigured = true;
    console.error("[SDK-DIAG] Starting SDK query with canUseTool...");
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        // Reject first so the race settles with the timeout error, then
        // cancel the query so it does not keep running in the background.
        reject(new Error(`SDK test timed out after ${SDK_TIMEOUT_MS}ms`));
        abortController.abort();
      }, SDK_TIMEOUT_MS);
    });
    const sdkPromise = (async () => {
      let messageCount2 = 0;
      let resultMessage2 = null;
      // Options are nested under `options`, the same call shape
      // executeWithClaudeCode uses; spreading them next to `prompt` would
      // leave env/model/maxTurns/canUseTool unread by the SDK.
      // NOTE(review): canUseTool is now actually delivered to the SDK; confirm
      // the installed SDK version accepts it together with a string prompt.
      for await (const message of sdk.query({
        prompt: 'Say "SDK test successful" and nothing else.',
        options: queryOptions
      })) {
        messageCount2++;
        console.error(`[SDK-DIAG] Received message ${messageCount2}:`, message);
        // Keep the most recent assistant/result message for the preview.
        if (message.type === "result" || message.type === "assistant") {
          resultMessage2 = message;
        }
      }
      return { messageCount: messageCount2, resultMessage: resultMessage2 };
    })();
    const { messageCount, resultMessage } = await Promise.race([
      sdkPromise,
      timeoutPromise
    ]);
    details.messageCount = messageCount;
    details.hasResultMessage = !!resultMessage;
    details.resultPreview = resultMessage && typeof resultMessage === "object" ? JSON.stringify(resultMessage).slice(0, 500) : null;
    const passed = messageCount > 0;
    return {
      name: "claude-sdk-execution",
      passed,
      details,
      error: passed ? void 0 : "SDK query completed but returned no messages",
      durationMs: Date.now() - start
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    details.error = error;
    // Only the first five stack lines are kept to bound the report size.
    details.errorStack = err instanceof Error ? err.stack?.split("\n").slice(0, 5) : void 0;
    console.error("[SDK-DIAG] SDK test failed:", error);
    return {
      name: "claude-sdk-execution",
      passed: false,
      details,
      error: `SDK execution failed: ${error}`,
      durationMs: Date.now() - start
    };
  } finally {
    // Always release the timer so a finished diagnostic does not keep the
    // event loop alive for the rest of the timeout window.
    clearTimeout(timeoutHandle);
  }
}
7510
7591
  async function testChildProcessSpawning() {
7511
7592
  const start = Date.now();
7512
7593
  const details = {};
@@ -7711,15 +7792,13 @@ async function runDiagnostics(config, evalRunId2) {
7711
7792
  await runTest("claude-cli-execution", testClaudeExecution);
7712
7793
  await runTest("environment-dump", testEnvironmentDump);
7713
7794
  await runTest("file-system-structure", testFileSystemStructure);
7714
- await runTest("network-connectivity", () => testNetworkConnectivity(config));
7715
- await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7716
- await runTest(
7717
- "claude-direct-execution",
7718
- () => testClaudeDirectExecution(config)
7719
- );
7720
7795
  await runTest("child-process-spawning", testChildProcessSpawning);
7721
- await runTest("sdk-import", testSdkImport);
7722
7796
  await runTest("file-system-write", testFileSystemWrite);
7797
+ await runTest("sdk-import", testSdkImport);
7798
+ await runTest("network-connectivity", () => testNetworkConnectivity(config));
7799
+ await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7800
+ await runTest("claude-cli-basic", () => testClaudeDirectExecution(config));
7801
+ await runTest("claude-sdk-execution", () => testClaudeSdkExecution(config));
7723
7802
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7724
7803
  const totalDurationMs = Date.now() - startTime;
7725
7804
  const report = {