@wix/evalforge-evaluator 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -6343,40 +6343,59 @@ async function executeWithClaudeCode(skill, scenario, options) {
6343
6343
  traceContext.authToken
6344
6344
  );
6345
6345
  }
6346
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6347
+ let timeoutHandle;
6348
+ let timedOut = false;
6346
6349
  try {
6347
- for await (const message of query({
6348
- prompt: scenario.triggerPrompt,
6349
- options: queryOptions
6350
- })) {
6351
- messageCount++;
6352
- console.log("[SDK Message]", JSON.stringify(message, null, 2));
6353
- allMessages.push(message);
6354
- if (messageCount <= 3) {
6355
- console.error(
6356
- "[DEBUG-H5] SDK message received",
6357
- JSON.stringify({
6358
- messageCount,
6359
- type: message.type,
6360
- timestamp: Date.now()
6361
- })
6362
- );
6363
- }
6364
- if (traceContext && isAssistantMessage(message)) {
6365
- traceStepNumber++;
6366
- const traceEvent = createTraceEventFromMessage(
6367
- message,
6368
- traceContext,
6369
- traceStepNumber,
6370
- false
6371
- // Not complete yet
6372
- );
6373
- emitTraceEvent(
6374
- traceEvent,
6375
- traceContext.tracePushUrl,
6376
- traceContext.routeHeader,
6377
- traceContext.authToken
6350
+ const timeoutPromise = new Promise((_, reject) => {
6351
+ timeoutHandle = setTimeout(() => {
6352
+ timedOut = true;
6353
+ reject(
6354
+ new Error(
6355
+ `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6356
+ )
6378
6357
  );
6358
+ }, SDK_TIMEOUT_MS);
6359
+ });
6360
+ const sdkPromise = (async () => {
6361
+ for await (const message of query({
6362
+ prompt: scenario.triggerPrompt,
6363
+ options: queryOptions
6364
+ })) {
6365
+ messageCount++;
6366
+ console.log("[SDK Message]", JSON.stringify(message, null, 2));
6367
+ allMessages.push(message);
6368
+ if (messageCount <= 3) {
6369
+ console.error(
6370
+ "[DEBUG-H5] SDK message received",
6371
+ JSON.stringify({
6372
+ messageCount,
6373
+ type: message.type,
6374
+ timestamp: Date.now()
6375
+ })
6376
+ );
6377
+ }
6378
+ if (traceContext && isAssistantMessage(message)) {
6379
+ traceStepNumber++;
6380
+ const traceEvent = createTraceEventFromMessage(
6381
+ message,
6382
+ traceContext,
6383
+ traceStepNumber,
6384
+ false
6385
+ // Not complete yet
6386
+ );
6387
+ emitTraceEvent(
6388
+ traceEvent,
6389
+ traceContext.tracePushUrl,
6390
+ traceContext.routeHeader,
6391
+ traceContext.authToken
6392
+ );
6393
+ }
6379
6394
  }
6395
+ })();
6396
+ await Promise.race([sdkPromise, timeoutPromise]);
6397
+ if (timeoutHandle) {
6398
+ clearTimeout(timeoutHandle);
6380
6399
  }
6381
6400
  console.log(
6382
6401
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
@@ -6384,6 +6403,12 @@ async function executeWithClaudeCode(skill, scenario, options) {
6384
6403
  "messages"
6385
6404
  );
6386
6405
  } catch (sdkError) {
6406
+ if (timeoutHandle) {
6407
+ clearTimeout(timeoutHandle);
6408
+ }
6409
+ if (timedOut) {
6410
+ console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6411
+ }
6387
6412
  console.error("[SDK-ERROR] ====== CLAUDE SDK EXECUTION FAILED ======");
6388
6413
  console.error("[SDK-ERROR] Timestamp:", (/* @__PURE__ */ new Date()).toISOString());
6389
6414
  console.error(
@@ -7425,33 +7450,14 @@ async function testClaudeDirectExecution(config) {
7425
7450
  const versionResult = await runAndLog(
7426
7451
  "claude --version",
7427
7452
  `${envExports} && "${claudePath}" --version 2>&1`,
7428
- 15e3
7429
- );
7430
- const fullCmdResult = await runAndLog(
7431
- "claude -p (with json output)",
7432
- `${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
7433
- 45e3
7434
- );
7435
- const simpleCmdResult = await runAndLog(
7436
- "claude -p (simple)",
7437
- `${envExports} && "${claudePath}" -p "Hello" 2>&1`,
7438
- 45e3
7453
+ 1e4
7454
+ // Short timeout - should complete in <2s
7439
7455
  );
7440
- const printFlagResult = await runAndLog(
7441
- "claude --print (long flag)",
7442
- `${envExports} && "${claudePath}" --print "Hi" 2>&1`,
7443
- 45e3
7444
- );
7445
- const positionalResult = await runAndLog(
7446
- 'claude "prompt" (positional)',
7447
- `${envExports} && "${claudePath}" "Hello world" 2>&1`,
7448
- 45e3
7449
- );
7450
- await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
7456
+ await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 1e4);
7451
7457
  await runAndLog(
7452
7458
  "claude --version (no custom env)",
7453
7459
  `"${claudePath}" --version 2>&1`,
7454
- 15e3
7460
+ 1e4
7455
7461
  );
7456
7462
  const homeDir = process.env.HOME || "/tmp";
7457
7463
  const claudeConfigDir = path9.join(homeDir, ".claude");
@@ -7461,35 +7467,110 @@ async function testClaudeDirectExecution(config) {
7461
7467
  try {
7462
7468
  const configContents = fs11.readdirSync(claudeConfigDir);
7463
7469
  details.claudeConfigContents = configContents;
7464
- for (const file of configContents) {
7465
- if (file.includes("log") || file.includes("error")) {
7466
- const logPath = path9.join(claudeConfigDir, file);
7467
- const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
7468
- const logContent = await execCommand(catCmd);
7469
- details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
7470
- }
7471
- }
7472
7470
  } catch (e) {
7473
7471
  details.claudeConfigError = e instanceof Error ? e.message : String(e);
7474
7472
  }
7475
7473
  }
7476
7474
  details.commandResults = commandResults;
7477
- const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
7478
7475
  const versionWorked = versionResult.exitCode === 0;
7479
- const passed = anyPromptWorked;
7480
- let errorMsg;
7481
- if (!passed) {
7482
- const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
7483
- errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
7484
- }
7476
+ const passed = versionWorked;
7485
7477
  return {
7486
- name: "claude-direct-execution",
7478
+ name: "claude-cli-basic",
7487
7479
  passed,
7488
7480
  details,
7489
- error: errorMsg,
7481
+ error: passed ? void 0 : `Claude CLI --version failed with exit code ${versionResult.exitCode}`,
7490
7482
  durationMs: Date.now() - start
7491
7483
  };
7492
7484
  }
7485
/**
 * Diagnostic: runs a single-turn query through the Claude Agent SDK and
 * reports whether any message came back before the timeout.
 *
 * The SDK is loaded with a dynamic import so that a missing or broken package
 * is reported as a failed test result instead of crashing the diagnostics run.
 *
 * @param {object} config - Diagnostics configuration.
 * @param {string} [config.aiGatewayUrl] - If set, exported to the SDK as ANTHROPIC_BASE_URL.
 * @param {Object<string, string>} [config.aiGatewayHeaders] - If set, serialized as
 *   newline-separated `key:value` lines into ANTHROPIC_CUSTOM_HEADERS.
 * @returns {Promise<{name: string, passed: boolean, details: object, error: (string|undefined), durationMs: number}>}
 *   Test result; never rejects - every failure is folded into `passed: false`.
 */
async function testClaudeSdkExecution(config) {
  const start = Date.now();
  const details = {};
  const SDK_TIMEOUT_MS = 3e4;
  // Lets the timeout path ask the SDK to stop instead of leaving the query running.
  const abortController = new AbortController();
  let timeoutHandle;
  try {
    console.error("[SDK-DIAG] Importing Claude Agent SDK...");
    const sdk = await import("@anthropic-ai/claude-agent-sdk");
    details.sdkImported = true;

    const env = { ...process.env };
    // The key is a placeholder only; the string itself says auth is handled by the gateway.
    const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
    env.ANTHROPIC_API_KEY = placeholderApiKey;
    env.ANTHROPIC_AUTH_TOKEN = placeholderApiKey;
    if (config.aiGatewayUrl) {
      env.ANTHROPIC_BASE_URL = config.aiGatewayUrl;
    }
    if (config.aiGatewayHeaders) {
      const headerLines = Object.entries(config.aiGatewayHeaders)
        .map(([key, value]) => `${key}:${value}`)
        .join("\n");
      env.ANTHROPIC_CUSTOM_HEADERS = headerLines;
    }
    details.envConfigured = true;
    details.hasBaseUrl = !!env.ANTHROPIC_BASE_URL;
    details.hasCustomHeaders = !!env.ANTHROPIC_CUSTOM_HEADERS;

    const canUseTool = async () => {
      console.error("[SDK-DIAG] canUseTool callback called - returning allow");
      return { behavior: "allow" };
    };
    const queryOptions = {
      env,
      cwd: "/tmp",
      settingSources: ["project"],
      allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
      model: "claude-3-5-sonnet-latest",
      // Just one turn for this test
      maxTurns: 1,
      permissionMode: "default",
      canUseTool,
      abortController
    };
    details.queryOptionsConfigured = true;

    console.error("[SDK-DIAG] Starting SDK query with canUseTool...");
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        // Reject first so the timeout error wins the race, then stop the SDK.
        reject(new Error(`SDK test timed out after ${SDK_TIMEOUT_MS}ms`));
        abortController.abort();
      }, SDK_TIMEOUT_MS);
    });
    const sdkPromise = (async () => {
      let received = 0;
      let lastRelevant = null;
      // Options go under `options`, matching the query() call in
      // executeWithClaudeCode; spreading them next to `prompt` would mean the
      // env/model/maxTurns configured above never reach the SDK.
      for await (const message of sdk.query({
        prompt: 'Say "SDK test successful" and nothing else.',
        options: queryOptions
      })) {
        received++;
        console.error(`[SDK-DIAG] Received message ${received}:`, message);
        if (message.type === "result" || message.type === "assistant") {
          lastRelevant = message;
        }
      }
      return { messageCount: received, resultMessage: lastRelevant };
    })();

    const { messageCount, resultMessage } = await Promise.race([
      sdkPromise,
      timeoutPromise
    ]);
    details.messageCount = messageCount;
    details.hasResultMessage = !!resultMessage;
    details.resultPreview =
      resultMessage && typeof resultMessage === "object"
        ? JSON.stringify(resultMessage).slice(0, 500)
        : null;
    const passed = messageCount > 0;
    return {
      name: "claude-sdk-execution",
      passed,
      details,
      error: passed ? void 0 : "SDK query completed but returned no messages",
      durationMs: Date.now() - start
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    details.error = error;
    details.errorStack = err instanceof Error ? err.stack?.split("\n").slice(0, 5) : void 0;
    console.error("[SDK-DIAG] SDK test failed:", error);
    return {
      name: "claude-sdk-execution",
      passed: false,
      details,
      error: `SDK execution failed: ${error}`,
      durationMs: Date.now() - start
    };
  } finally {
    // Without this the pending 30s timer keeps the event loop alive after a
    // fast success or an early failure.
    clearTimeout(timeoutHandle);
  }
}
7493
7574
  async function testChildProcessSpawning() {
7494
7575
  const start = Date.now();
7495
7576
  const details = {};
@@ -7694,15 +7775,13 @@ async function runDiagnostics(config, evalRunId2) {
7694
7775
  await runTest("claude-cli-execution", testClaudeExecution);
7695
7776
  await runTest("environment-dump", testEnvironmentDump);
7696
7777
  await runTest("file-system-structure", testFileSystemStructure);
7697
- await runTest("network-connectivity", () => testNetworkConnectivity(config));
7698
- await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7699
- await runTest(
7700
- "claude-direct-execution",
7701
- () => testClaudeDirectExecution(config)
7702
- );
7703
7778
  await runTest("child-process-spawning", testChildProcessSpawning);
7704
- await runTest("sdk-import", testSdkImport);
7705
7779
  await runTest("file-system-write", testFileSystemWrite);
7780
+ await runTest("sdk-import", testSdkImport);
7781
+ await runTest("network-connectivity", () => testNetworkConnectivity(config));
7782
+ await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
7783
+ await runTest("claude-cli-basic", () => testClaudeDirectExecution(config));
7784
+ await runTest("claude-sdk-execution", () => testClaudeSdkExecution(config));
7706
7785
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7707
7786
  const totalDurationMs = Date.now() - startTime;
7708
7787
  const report = {