@wix/evalforge-evaluator 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +157 -78
- package/build/index.js.map +3 -3
- package/build/index.mjs +157 -78
- package/build/index.mjs.map +3 -3
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -6360,40 +6360,59 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6360
6360
|
traceContext.authToken
|
|
6361
6361
|
);
|
|
6362
6362
|
}
|
|
6363
|
+
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
6364
|
+
let timeoutHandle;
|
|
6365
|
+
let timedOut = false;
|
|
6363
6366
|
try {
|
|
6364
|
-
|
|
6365
|
-
|
|
6366
|
-
|
|
6367
|
-
|
|
6368
|
-
|
|
6369
|
-
|
|
6370
|
-
|
|
6371
|
-
if (messageCount <= 3) {
|
|
6372
|
-
console.error(
|
|
6373
|
-
"[DEBUG-H5] SDK message received",
|
|
6374
|
-
JSON.stringify({
|
|
6375
|
-
messageCount,
|
|
6376
|
-
type: message.type,
|
|
6377
|
-
timestamp: Date.now()
|
|
6378
|
-
})
|
|
6379
|
-
);
|
|
6380
|
-
}
|
|
6381
|
-
if (traceContext && isAssistantMessage(message)) {
|
|
6382
|
-
traceStepNumber++;
|
|
6383
|
-
const traceEvent = createTraceEventFromMessage(
|
|
6384
|
-
message,
|
|
6385
|
-
traceContext,
|
|
6386
|
-
traceStepNumber,
|
|
6387
|
-
false
|
|
6388
|
-
// Not complete yet
|
|
6389
|
-
);
|
|
6390
|
-
emitTraceEvent(
|
|
6391
|
-
traceEvent,
|
|
6392
|
-
traceContext.tracePushUrl,
|
|
6393
|
-
traceContext.routeHeader,
|
|
6394
|
-
traceContext.authToken
|
|
6367
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
6368
|
+
timeoutHandle = setTimeout(() => {
|
|
6369
|
+
timedOut = true;
|
|
6370
|
+
reject(
|
|
6371
|
+
new Error(
|
|
6372
|
+
`SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
|
|
6373
|
+
)
|
|
6395
6374
|
);
|
|
6375
|
+
}, SDK_TIMEOUT_MS);
|
|
6376
|
+
});
|
|
6377
|
+
const sdkPromise = (async () => {
|
|
6378
|
+
for await (const message of query({
|
|
6379
|
+
prompt: scenario.triggerPrompt,
|
|
6380
|
+
options: queryOptions
|
|
6381
|
+
})) {
|
|
6382
|
+
messageCount++;
|
|
6383
|
+
console.log("[SDK Message]", JSON.stringify(message, null, 2));
|
|
6384
|
+
allMessages.push(message);
|
|
6385
|
+
if (messageCount <= 3) {
|
|
6386
|
+
console.error(
|
|
6387
|
+
"[DEBUG-H5] SDK message received",
|
|
6388
|
+
JSON.stringify({
|
|
6389
|
+
messageCount,
|
|
6390
|
+
type: message.type,
|
|
6391
|
+
timestamp: Date.now()
|
|
6392
|
+
})
|
|
6393
|
+
);
|
|
6394
|
+
}
|
|
6395
|
+
if (traceContext && isAssistantMessage(message)) {
|
|
6396
|
+
traceStepNumber++;
|
|
6397
|
+
const traceEvent = createTraceEventFromMessage(
|
|
6398
|
+
message,
|
|
6399
|
+
traceContext,
|
|
6400
|
+
traceStepNumber,
|
|
6401
|
+
false
|
|
6402
|
+
// Not complete yet
|
|
6403
|
+
);
|
|
6404
|
+
emitTraceEvent(
|
|
6405
|
+
traceEvent,
|
|
6406
|
+
traceContext.tracePushUrl,
|
|
6407
|
+
traceContext.routeHeader,
|
|
6408
|
+
traceContext.authToken
|
|
6409
|
+
);
|
|
6410
|
+
}
|
|
6396
6411
|
}
|
|
6412
|
+
})();
|
|
6413
|
+
await Promise.race([sdkPromise, timeoutPromise]);
|
|
6414
|
+
if (timeoutHandle) {
|
|
6415
|
+
clearTimeout(timeoutHandle);
|
|
6397
6416
|
}
|
|
6398
6417
|
console.log(
|
|
6399
6418
|
"[executeWithClaudeCode] Claude Agent SDK query completed, received",
|
|
@@ -6401,6 +6420,12 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6401
6420
|
"messages"
|
|
6402
6421
|
);
|
|
6403
6422
|
} catch (sdkError) {
|
|
6423
|
+
if (timeoutHandle) {
|
|
6424
|
+
clearTimeout(timeoutHandle);
|
|
6425
|
+
}
|
|
6426
|
+
if (timedOut) {
|
|
6427
|
+
console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
|
|
6428
|
+
}
|
|
6404
6429
|
console.error("[SDK-ERROR] ====== CLAUDE SDK EXECUTION FAILED ======");
|
|
6405
6430
|
console.error("[SDK-ERROR] Timestamp:", (/* @__PURE__ */ new Date()).toISOString());
|
|
6406
6431
|
console.error(
|
|
@@ -7442,33 +7467,14 @@ async function testClaudeDirectExecution(config) {
|
|
|
7442
7467
|
const versionResult = await runAndLog(
|
|
7443
7468
|
"claude --version",
|
|
7444
7469
|
`${envExports} && "${claudePath}" --version 2>&1`,
|
|
7445
|
-
|
|
7446
|
-
|
|
7447
|
-
const fullCmdResult = await runAndLog(
|
|
7448
|
-
"claude -p (with json output)",
|
|
7449
|
-
`${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
|
|
7450
|
-
45e3
|
|
7451
|
-
);
|
|
7452
|
-
const simpleCmdResult = await runAndLog(
|
|
7453
|
-
"claude -p (simple)",
|
|
7454
|
-
`${envExports} && "${claudePath}" -p "Hello" 2>&1`,
|
|
7455
|
-
45e3
|
|
7470
|
+
1e4
|
|
7471
|
+
// Short timeout - should complete in <2s
|
|
7456
7472
|
);
|
|
7457
|
-
|
|
7458
|
-
"claude --print (long flag)",
|
|
7459
|
-
`${envExports} && "${claudePath}" --print "Hi" 2>&1`,
|
|
7460
|
-
45e3
|
|
7461
|
-
);
|
|
7462
|
-
const positionalResult = await runAndLog(
|
|
7463
|
-
'claude "prompt" (positional)',
|
|
7464
|
-
`${envExports} && "${claudePath}" "Hello world" 2>&1`,
|
|
7465
|
-
45e3
|
|
7466
|
-
);
|
|
7467
|
-
await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
|
|
7473
|
+
await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 1e4);
|
|
7468
7474
|
await runAndLog(
|
|
7469
7475
|
"claude --version (no custom env)",
|
|
7470
7476
|
`"${claudePath}" --version 2>&1`,
|
|
7471
|
-
|
|
7477
|
+
1e4
|
|
7472
7478
|
);
|
|
7473
7479
|
const homeDir = process.env.HOME || "/tmp";
|
|
7474
7480
|
const claudeConfigDir = path9.join(homeDir, ".claude");
|
|
@@ -7478,35 +7484,110 @@ async function testClaudeDirectExecution(config) {
|
|
|
7478
7484
|
try {
|
|
7479
7485
|
const configContents = fs11.readdirSync(claudeConfigDir);
|
|
7480
7486
|
details.claudeConfigContents = configContents;
|
|
7481
|
-
for (const file of configContents) {
|
|
7482
|
-
if (file.includes("log") || file.includes("error")) {
|
|
7483
|
-
const logPath = path9.join(claudeConfigDir, file);
|
|
7484
|
-
const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
|
|
7485
|
-
const logContent = await execCommand(catCmd);
|
|
7486
|
-
details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
|
|
7487
|
-
}
|
|
7488
|
-
}
|
|
7489
7487
|
} catch (e) {
|
|
7490
7488
|
details.claudeConfigError = e instanceof Error ? e.message : String(e);
|
|
7491
7489
|
}
|
|
7492
7490
|
}
|
|
7493
7491
|
details.commandResults = commandResults;
|
|
7494
|
-
const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
|
|
7495
7492
|
const versionWorked = versionResult.exitCode === 0;
|
|
7496
|
-
const passed =
|
|
7497
|
-
let errorMsg;
|
|
7498
|
-
if (!passed) {
|
|
7499
|
-
const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
|
|
7500
|
-
errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
|
|
7501
|
-
}
|
|
7493
|
+
const passed = versionWorked;
|
|
7502
7494
|
return {
|
|
7503
|
-
name: "claude-
|
|
7495
|
+
name: "claude-cli-basic",
|
|
7504
7496
|
passed,
|
|
7505
7497
|
details,
|
|
7506
|
-
error:
|
|
7498
|
+
error: passed ? void 0 : `Claude CLI --version failed with exit code ${versionResult.exitCode}`,
|
|
7507
7499
|
durationMs: Date.now() - start
|
|
7508
7500
|
};
|
|
7509
7501
|
}
|
|
7502
|
+
/**
 * Diagnostic test: runs a single-turn query through the Claude Agent SDK and
 * reports whether any message came back.
 *
 * The SDK is imported dynamically so that a missing or broken install is
 * reported as a failed diagnostic rather than crashing the caller. The query is
 * raced against a 30s timeout; on timeout the query is aborted.
 *
 * @param {object} config - Diagnostics configuration.
 * @param {string} [config.aiGatewayUrl] - When set, exported to the SDK as
 *   `ANTHROPIC_BASE_URL`.
 * @param {Object<string, string>} [config.aiGatewayHeaders] - When set,
 *   serialized as newline-separated `key:value` lines into
 *   `ANTHROPIC_CUSTOM_HEADERS`.
 * @returns {Promise<{name: string, passed: boolean, details: object, error: (string|undefined), durationMs: number}>}
 *   Never rejects: every failure (import error, SDK error, timeout) is turned
 *   into a `passed: false` result.
 */
async function testClaudeSdkExecution(config) {
  const start = Date.now();
  const details = {};
  const SDK_TIMEOUT_MS = 3e4;
  // Lets the timeout path cancel the in-flight query instead of abandoning it.
  const abortController = new AbortController();
  let timeoutHandle;
  try {
    console.error("[SDK-DIAG] Importing Claude Agent SDK...");
    const sdk = await import("@anthropic-ai/claude-agent-sdk");
    details.sdkImported = true;

    const env = { ...process.env };
    // The same placeholder is written to both credential variables.
    const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
    env.ANTHROPIC_API_KEY = placeholderApiKey;
    env.ANTHROPIC_AUTH_TOKEN = placeholderApiKey;
    if (config.aiGatewayUrl) {
      env.ANTHROPIC_BASE_URL = config.aiGatewayUrl;
    }
    if (config.aiGatewayHeaders) {
      const headerLines = Object.entries(config.aiGatewayHeaders)
        .map(([key, value]) => `${key}:${value}`)
        .join("\n");
      env.ANTHROPIC_CUSTOM_HEADERS = headerLines;
    }
    details.envConfigured = true;
    details.hasBaseUrl = !!env.ANTHROPIC_BASE_URL;
    details.hasCustomHeaders = !!env.ANTHROPIC_CUSTOM_HEADERS;

    const canUseTool = async () => {
      console.error("[SDK-DIAG] canUseTool callback called - returning allow");
      return { behavior: "allow" };
    };
    const queryOptions = {
      env,
      cwd: "/tmp",
      settingSources: ["project"],
      allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
      model: "claude-3-5-sonnet-latest",
      // Just one turn for this test
      maxTurns: 1,
      permissionMode: "default",
      canUseTool,
      abortController
    };
    details.queryOptionsConfigured = true;
    console.error("[SDK-DIAG] Starting SDK query with canUseTool...");

    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        // Reject first so the timeout error, not the abort error, wins the race.
        reject(new Error(`SDK test timed out after ${SDK_TIMEOUT_MS}ms`));
        abortController.abort();
      }, SDK_TIMEOUT_MS);
    });
    const sdkPromise = (async () => {
      let received = 0;
      let lastRelevantMessage = null;
      // Options must be nested under `options`; spreading them next to `prompt`
      // means env, maxTurns, canUseTool, etc. never reach the SDK.
      for await (const message of sdk.query({
        prompt: 'Say "SDK test successful" and nothing else.',
        options: queryOptions
      })) {
        received++;
        console.error(`[SDK-DIAG] Received message ${received}:`, message);
        if (message.type === "result" || message.type === "assistant") {
          lastRelevantMessage = message;
        }
      }
      return { messageCount: received, resultMessage: lastRelevantMessage };
    })();

    const { messageCount, resultMessage } = await Promise.race([
      sdkPromise,
      timeoutPromise
    ]);
    details.messageCount = messageCount;
    details.hasResultMessage = !!resultMessage;
    details.resultPreview = resultMessage && typeof resultMessage === "object" ? JSON.stringify(resultMessage).slice(0, 500) : null;
    const passed = messageCount > 0;
    return {
      name: "claude-sdk-execution",
      passed,
      details,
      error: passed ? void 0 : "SDK query completed but returned no messages",
      durationMs: Date.now() - start
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    details.error = error;
    details.errorStack = err instanceof Error ? err.stack?.split("\n").slice(0, 5) : void 0;
    console.error("[SDK-DIAG] SDK test failed:", error);
    return {
      name: "claude-sdk-execution",
      passed: false,
      details,
      error: `SDK execution failed: ${error}`,
      durationMs: Date.now() - start
    };
  } finally {
    // Always release the timer so it cannot keep the event loop alive for the
    // remainder of the 30s window after the test has already settled.
    if (timeoutHandle) {
      clearTimeout(timeoutHandle);
    }
  }
}
|
|
7510
7591
|
async function testChildProcessSpawning() {
|
|
7511
7592
|
const start = Date.now();
|
|
7512
7593
|
const details = {};
|
|
@@ -7711,15 +7792,13 @@ async function runDiagnostics(config, evalRunId2) {
|
|
|
7711
7792
|
await runTest("claude-cli-execution", testClaudeExecution);
|
|
7712
7793
|
await runTest("environment-dump", testEnvironmentDump);
|
|
7713
7794
|
await runTest("file-system-structure", testFileSystemStructure);
|
|
7714
|
-
await runTest("network-connectivity", () => testNetworkConnectivity(config));
|
|
7715
|
-
await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
|
|
7716
|
-
await runTest(
|
|
7717
|
-
"claude-direct-execution",
|
|
7718
|
-
() => testClaudeDirectExecution(config)
|
|
7719
|
-
);
|
|
7720
7795
|
await runTest("child-process-spawning", testChildProcessSpawning);
|
|
7721
|
-
await runTest("sdk-import", testSdkImport);
|
|
7722
7796
|
await runTest("file-system-write", testFileSystemWrite);
|
|
7797
|
+
await runTest("sdk-import", testSdkImport);
|
|
7798
|
+
await runTest("network-connectivity", () => testNetworkConnectivity(config));
|
|
7799
|
+
await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
|
|
7800
|
+
await runTest("claude-cli-basic", () => testClaudeDirectExecution(config));
|
|
7801
|
+
await runTest("claude-sdk-execution", () => testClaudeSdkExecution(config));
|
|
7723
7802
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
7724
7803
|
const totalDurationMs = Date.now() - startTime;
|
|
7725
7804
|
const report = {
|