@wix/evalforge-evaluator 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +157 -78
- package/build/index.js.map +3 -3
- package/build/index.mjs +157 -78
- package/build/index.mjs.map +3 -3
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -6343,40 +6343,59 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6343
6343
|
traceContext.authToken
|
|
6344
6344
|
);
|
|
6345
6345
|
}
|
|
6346
|
+
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
6347
|
+
let timeoutHandle;
|
|
6348
|
+
let timedOut = false;
|
|
6346
6349
|
try {
|
|
6347
|
-
|
|
6348
|
-
|
|
6349
|
-
|
|
6350
|
-
|
|
6351
|
-
|
|
6352
|
-
|
|
6353
|
-
|
|
6354
|
-
if (messageCount <= 3) {
|
|
6355
|
-
console.error(
|
|
6356
|
-
"[DEBUG-H5] SDK message received",
|
|
6357
|
-
JSON.stringify({
|
|
6358
|
-
messageCount,
|
|
6359
|
-
type: message.type,
|
|
6360
|
-
timestamp: Date.now()
|
|
6361
|
-
})
|
|
6362
|
-
);
|
|
6363
|
-
}
|
|
6364
|
-
if (traceContext && isAssistantMessage(message)) {
|
|
6365
|
-
traceStepNumber++;
|
|
6366
|
-
const traceEvent = createTraceEventFromMessage(
|
|
6367
|
-
message,
|
|
6368
|
-
traceContext,
|
|
6369
|
-
traceStepNumber,
|
|
6370
|
-
false
|
|
6371
|
-
// Not complete yet
|
|
6372
|
-
);
|
|
6373
|
-
emitTraceEvent(
|
|
6374
|
-
traceEvent,
|
|
6375
|
-
traceContext.tracePushUrl,
|
|
6376
|
-
traceContext.routeHeader,
|
|
6377
|
-
traceContext.authToken
|
|
6350
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
6351
|
+
timeoutHandle = setTimeout(() => {
|
|
6352
|
+
timedOut = true;
|
|
6353
|
+
reject(
|
|
6354
|
+
new Error(
|
|
6355
|
+
`SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
|
|
6356
|
+
)
|
|
6378
6357
|
);
|
|
6358
|
+
}, SDK_TIMEOUT_MS);
|
|
6359
|
+
});
|
|
6360
|
+
const sdkPromise = (async () => {
|
|
6361
|
+
for await (const message of query({
|
|
6362
|
+
prompt: scenario.triggerPrompt,
|
|
6363
|
+
options: queryOptions
|
|
6364
|
+
})) {
|
|
6365
|
+
messageCount++;
|
|
6366
|
+
console.log("[SDK Message]", JSON.stringify(message, null, 2));
|
|
6367
|
+
allMessages.push(message);
|
|
6368
|
+
if (messageCount <= 3) {
|
|
6369
|
+
console.error(
|
|
6370
|
+
"[DEBUG-H5] SDK message received",
|
|
6371
|
+
JSON.stringify({
|
|
6372
|
+
messageCount,
|
|
6373
|
+
type: message.type,
|
|
6374
|
+
timestamp: Date.now()
|
|
6375
|
+
})
|
|
6376
|
+
);
|
|
6377
|
+
}
|
|
6378
|
+
if (traceContext && isAssistantMessage(message)) {
|
|
6379
|
+
traceStepNumber++;
|
|
6380
|
+
const traceEvent = createTraceEventFromMessage(
|
|
6381
|
+
message,
|
|
6382
|
+
traceContext,
|
|
6383
|
+
traceStepNumber,
|
|
6384
|
+
false
|
|
6385
|
+
// Not complete yet
|
|
6386
|
+
);
|
|
6387
|
+
emitTraceEvent(
|
|
6388
|
+
traceEvent,
|
|
6389
|
+
traceContext.tracePushUrl,
|
|
6390
|
+
traceContext.routeHeader,
|
|
6391
|
+
traceContext.authToken
|
|
6392
|
+
);
|
|
6393
|
+
}
|
|
6379
6394
|
}
|
|
6395
|
+
})();
|
|
6396
|
+
await Promise.race([sdkPromise, timeoutPromise]);
|
|
6397
|
+
if (timeoutHandle) {
|
|
6398
|
+
clearTimeout(timeoutHandle);
|
|
6380
6399
|
}
|
|
6381
6400
|
console.log(
|
|
6382
6401
|
"[executeWithClaudeCode] Claude Agent SDK query completed, received",
|
|
@@ -6384,6 +6403,12 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6384
6403
|
"messages"
|
|
6385
6404
|
);
|
|
6386
6405
|
} catch (sdkError) {
|
|
6406
|
+
if (timeoutHandle) {
|
|
6407
|
+
clearTimeout(timeoutHandle);
|
|
6408
|
+
}
|
|
6409
|
+
if (timedOut) {
|
|
6410
|
+
console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
|
|
6411
|
+
}
|
|
6387
6412
|
console.error("[SDK-ERROR] ====== CLAUDE SDK EXECUTION FAILED ======");
|
|
6388
6413
|
console.error("[SDK-ERROR] Timestamp:", (/* @__PURE__ */ new Date()).toISOString());
|
|
6389
6414
|
console.error(
|
|
@@ -7425,33 +7450,14 @@ async function testClaudeDirectExecution(config) {
|
|
|
7425
7450
|
const versionResult = await runAndLog(
|
|
7426
7451
|
"claude --version",
|
|
7427
7452
|
`${envExports} && "${claudePath}" --version 2>&1`,
|
|
7428
|
-
|
|
7429
|
-
|
|
7430
|
-
const fullCmdResult = await runAndLog(
|
|
7431
|
-
"claude -p (with json output)",
|
|
7432
|
-
`${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
|
|
7433
|
-
45e3
|
|
7434
|
-
);
|
|
7435
|
-
const simpleCmdResult = await runAndLog(
|
|
7436
|
-
"claude -p (simple)",
|
|
7437
|
-
`${envExports} && "${claudePath}" -p "Hello" 2>&1`,
|
|
7438
|
-
45e3
|
|
7453
|
+
1e4
|
|
7454
|
+
// Short timeout - should complete in <2s
|
|
7439
7455
|
);
|
|
7440
|
-
|
|
7441
|
-
"claude --print (long flag)",
|
|
7442
|
-
`${envExports} && "${claudePath}" --print "Hi" 2>&1`,
|
|
7443
|
-
45e3
|
|
7444
|
-
);
|
|
7445
|
-
const positionalResult = await runAndLog(
|
|
7446
|
-
'claude "prompt" (positional)',
|
|
7447
|
-
`${envExports} && "${claudePath}" "Hello world" 2>&1`,
|
|
7448
|
-
45e3
|
|
7449
|
-
);
|
|
7450
|
-
await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
|
|
7456
|
+
await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 1e4);
|
|
7451
7457
|
await runAndLog(
|
|
7452
7458
|
"claude --version (no custom env)",
|
|
7453
7459
|
`"${claudePath}" --version 2>&1`,
|
|
7454
|
-
|
|
7460
|
+
1e4
|
|
7455
7461
|
);
|
|
7456
7462
|
const homeDir = process.env.HOME || "/tmp";
|
|
7457
7463
|
const claudeConfigDir = path9.join(homeDir, ".claude");
|
|
@@ -7461,35 +7467,110 @@ async function testClaudeDirectExecution(config) {
|
|
|
7461
7467
|
try {
|
|
7462
7468
|
const configContents = fs11.readdirSync(claudeConfigDir);
|
|
7463
7469
|
details.claudeConfigContents = configContents;
|
|
7464
|
-
for (const file of configContents) {
|
|
7465
|
-
if (file.includes("log") || file.includes("error")) {
|
|
7466
|
-
const logPath = path9.join(claudeConfigDir, file);
|
|
7467
|
-
const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
|
|
7468
|
-
const logContent = await execCommand(catCmd);
|
|
7469
|
-
details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
|
|
7470
|
-
}
|
|
7471
|
-
}
|
|
7472
7470
|
} catch (e) {
|
|
7473
7471
|
details.claudeConfigError = e instanceof Error ? e.message : String(e);
|
|
7474
7472
|
}
|
|
7475
7473
|
}
|
|
7476
7474
|
details.commandResults = commandResults;
|
|
7477
|
-
const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
|
|
7478
7475
|
const versionWorked = versionResult.exitCode === 0;
|
|
7479
|
-
const passed =
|
|
7480
|
-
let errorMsg;
|
|
7481
|
-
if (!passed) {
|
|
7482
|
-
const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
|
|
7483
|
-
errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
|
|
7484
|
-
}
|
|
7476
|
+
const passed = versionWorked;
|
|
7485
7477
|
return {
|
|
7486
|
-
name: "claude-
|
|
7478
|
+
name: "claude-cli-basic",
|
|
7487
7479
|
passed,
|
|
7488
7480
|
details,
|
|
7489
|
-
error:
|
|
7481
|
+
error: passed ? void 0 : `Claude CLI --version failed with exit code ${versionResult.exitCode}`,
|
|
7490
7482
|
durationMs: Date.now() - start
|
|
7491
7483
|
};
|
|
7492
7484
|
}
|
|
7485
|
+
/**
 * Diagnostic test: imports the Claude Agent SDK and runs a single one-turn
 * query through it, bounded by a 30 second timeout.
 *
 * The environment handed to the SDK is a copy of `process.env` with placeholder
 * Anthropic credentials plus, when present on `config`, the AI gateway base URL
 * and custom headers.
 *
 * @param {object} config - Diagnostics configuration.
 * @param {string} [config.aiGatewayUrl] - Copied to `ANTHROPIC_BASE_URL` when set.
 * @param {Object<string, string>} [config.aiGatewayHeaders] - Serialized as
 *   newline-separated `key:value` lines into `ANTHROPIC_CUSTOM_HEADERS` when set.
 * @returns {Promise<{name: string, passed: boolean, details: object, error: (string|undefined), durationMs: number}>}
 *   Test result; `passed` is true when at least one SDK message was received.
 *   Never rejects: import, query and timeout failures are reported in the result.
 */
async function testClaudeSdkExecution(config) {
  const start = Date.now();
  const details = {};
  const SDK_TIMEOUT_MS = 3e4;
  // Kept outside the try block so the finally clause can always release the timer.
  let timeoutHandle;
  // Lets the timeout path cancel the in-flight SDK query instead of abandoning it.
  const abortController = new AbortController();
  try {
    console.error("[SDK-DIAG] Importing Claude Agent SDK...");
    const sdk = await import("@anthropic-ai/claude-agent-sdk");
    details.sdkImported = true;

    // Work on a copy so the diagnostic never mutates the real process environment.
    const env = { ...process.env };
    const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
    env.ANTHROPIC_API_KEY = placeholderApiKey;
    env.ANTHROPIC_AUTH_TOKEN = placeholderApiKey;
    if (config.aiGatewayUrl) {
      env.ANTHROPIC_BASE_URL = config.aiGatewayUrl;
    }
    if (config.aiGatewayHeaders) {
      const headerLines = Object.entries(config.aiGatewayHeaders).map(([key, value]) => `${key}:${value}`).join("\n");
      env.ANTHROPIC_CUSTOM_HEADERS = headerLines;
    }
    details.envConfigured = true;
    details.hasBaseUrl = !!env.ANTHROPIC_BASE_URL;
    details.hasCustomHeaders = !!env.ANTHROPIC_CUSTOM_HEADERS;

    // Permission callback: allow every tool call, echoing the tool input back unchanged.
    const canUseTool = async (_toolName, input) => {
      console.error("[SDK-DIAG] canUseTool callback called - returning allow");
      return { behavior: "allow", updatedInput: input };
    };
    const queryOptions = {
      env,
      cwd: "/tmp",
      settingSources: ["project"],
      allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
      model: "claude-3-5-sonnet-latest",
      maxTurns: 1,
      // Just one turn for this test
      permissionMode: "default",
      canUseTool,
      abortController
    };
    details.queryOptionsConfigured = true;
    console.error("[SDK-DIAG] Starting SDK query with canUseTool...");

    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        // Reject first so the race settles with the timeout error, then cancel the query.
        reject(new Error(`SDK test timed out after ${SDK_TIMEOUT_MS}ms`));
        abortController.abort();
      }, SDK_TIMEOUT_MS);
    });
    const sdkPromise = (async () => {
      let messageCount2 = 0;
      let resultMessage2 = null;
      // Options must be nested under `options` (same call shape as executeWithClaudeCode);
      // spreading them next to `prompt` would leave env/model/maxTurns unused.
      for await (const message of sdk.query({
        prompt: 'Say "SDK test successful" and nothing else.',
        options: queryOptions
      })) {
        messageCount2++;
        console.error(`[SDK-DIAG] Received message ${messageCount2}:`, message);
        if (message.type === "result" || message.type === "assistant") {
          resultMessage2 = message;
        }
      }
      return { messageCount: messageCount2, resultMessage: resultMessage2 };
    })();
    const { messageCount, resultMessage } = await Promise.race([
      sdkPromise,
      timeoutPromise
    ]);
    details.messageCount = messageCount;
    details.hasResultMessage = !!resultMessage;
    details.resultPreview = resultMessage && typeof resultMessage === "object" ? JSON.stringify(resultMessage).slice(0, 500) : null;
    const passed = messageCount > 0;
    return {
      name: "claude-sdk-execution",
      passed,
      details,
      error: passed ? void 0 : "SDK query completed but returned no messages",
      durationMs: Date.now() - start
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    details.error = error;
    details.errorStack = err instanceof Error ? err.stack?.split("\n").slice(0, 5) : void 0;
    console.error("[SDK-DIAG] SDK test failed:", error);
    return {
      name: "claude-sdk-execution",
      passed: false,
      details,
      error: `SDK execution failed: ${error}`,
      durationMs: Date.now() - start
    };
  } finally {
    // Always release the timer so a finished test cannot keep the event loop alive.
    clearTimeout(timeoutHandle);
  }
}
|
|
7493
7574
|
async function testChildProcessSpawning() {
|
|
7494
7575
|
const start = Date.now();
|
|
7495
7576
|
const details = {};
|
|
@@ -7694,15 +7775,13 @@ async function runDiagnostics(config, evalRunId2) {
|
|
|
7694
7775
|
await runTest("claude-cli-execution", testClaudeExecution);
|
|
7695
7776
|
await runTest("environment-dump", testEnvironmentDump);
|
|
7696
7777
|
await runTest("file-system-structure", testFileSystemStructure);
|
|
7697
|
-
await runTest("network-connectivity", () => testNetworkConnectivity(config));
|
|
7698
|
-
await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
|
|
7699
|
-
await runTest(
|
|
7700
|
-
"claude-direct-execution",
|
|
7701
|
-
() => testClaudeDirectExecution(config)
|
|
7702
|
-
);
|
|
7703
7778
|
await runTest("child-process-spawning", testChildProcessSpawning);
|
|
7704
|
-
await runTest("sdk-import", testSdkImport);
|
|
7705
7779
|
await runTest("file-system-write", testFileSystemWrite);
|
|
7780
|
+
await runTest("sdk-import", testSdkImport);
|
|
7781
|
+
await runTest("network-connectivity", () => testNetworkConnectivity(config));
|
|
7782
|
+
await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
|
|
7783
|
+
await runTest("claude-cli-basic", () => testClaudeDirectExecution(config));
|
|
7784
|
+
await runTest("claude-sdk-execution", () => testClaudeSdkExecution(config));
|
|
7706
7785
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
7707
7786
|
const totalDurationMs = Date.now() - startTime;
|
|
7708
7787
|
const report = {
|