@wix/evalforge-evaluator 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +105 -42
- package/build/index.js.map +2 -2
- package/build/index.mjs +105 -42
- package/build/index.mjs.map +2 -2
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -6285,6 +6285,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6285
6285
|
})
|
|
6286
6286
|
);
|
|
6287
6287
|
let messageCount = 0;
|
|
6288
|
+
const canUseTool = async () => {
|
|
6289
|
+
return { behavior: "allow" };
|
|
6290
|
+
};
|
|
6288
6291
|
const queryOptions = {
|
|
6289
6292
|
env: sdkEnv,
|
|
6290
6293
|
cwd: options.cwd,
|
|
@@ -6294,8 +6297,10 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6294
6297
|
maxTurns,
|
|
6295
6298
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
6296
6299
|
mcpServers: options.mcpServers,
|
|
6297
|
-
|
|
6298
|
-
|
|
6300
|
+
// Use 'default' permission mode with custom canUseTool handler
|
|
6301
|
+
// instead of 'bypassPermissions' which fails on root
|
|
6302
|
+
permissionMode: "default",
|
|
6303
|
+
canUseTool
|
|
6299
6304
|
};
|
|
6300
6305
|
if (options.temperature !== void 0) {
|
|
6301
6306
|
queryOptions.temperature = options.temperature;
|
|
@@ -6314,8 +6319,8 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6314
6319
|
console.log("[SDK-DEBUG] maxTokens:", queryOptions.maxTokens);
|
|
6315
6320
|
console.log("[SDK-DEBUG] permissionMode:", queryOptions.permissionMode);
|
|
6316
6321
|
console.log(
|
|
6317
|
-
"[SDK-DEBUG]
|
|
6318
|
-
queryOptions.
|
|
6322
|
+
"[SDK-DEBUG] canUseTool:",
|
|
6323
|
+
queryOptions.canUseTool ? "custom handler (auto-allow)" : "not set"
|
|
6319
6324
|
);
|
|
6320
6325
|
console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
|
|
6321
6326
|
console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
|
|
@@ -7365,6 +7370,35 @@ async function testAiGatewayApiCall(config) {
|
|
|
7365
7370
|
async function testClaudeDirectExecution(config) {
|
|
7366
7371
|
const start = Date.now();
|
|
7367
7372
|
const details = {};
|
|
7373
|
+
const commandResults = [];
|
|
7374
|
+
const runAndLog = async (name2, command, timeoutMs = 3e4) => {
|
|
7375
|
+
console.error(`
|
|
7376
|
+
[CLAUDE-DIAG] ========== ${name2} ==========`);
|
|
7377
|
+
const cmdPreview = command.length > 500 ? command.slice(0, 500) + "..." : command;
|
|
7378
|
+
console.error(`[CLAUDE-DIAG] Command: ${cmdPreview}`);
|
|
7379
|
+
const cmdStart = Date.now();
|
|
7380
|
+
const result = await execCommand(command, timeoutMs);
|
|
7381
|
+
const cmdDuration = Date.now() - cmdStart;
|
|
7382
|
+
console.error(`[CLAUDE-DIAG] Exit code: ${result.exitCode}`);
|
|
7383
|
+
console.error(`[CLAUDE-DIAG] Duration: ${cmdDuration}ms`);
|
|
7384
|
+
console.error(`[CLAUDE-DIAG] Stdout (${result.stdout.length} chars):`);
|
|
7385
|
+
console.error(result.stdout || "(empty)");
|
|
7386
|
+
if (result.stderr) {
|
|
7387
|
+
console.error(`[CLAUDE-DIAG] Stderr (${result.stderr.length} chars):`);
|
|
7388
|
+
console.error(result.stderr);
|
|
7389
|
+
}
|
|
7390
|
+
console.error(`[CLAUDE-DIAG] ========== END ${name2} ==========
|
|
7391
|
+
`);
|
|
7392
|
+
commandResults.push({
|
|
7393
|
+
name: name2,
|
|
7394
|
+
command: command.slice(0, 300),
|
|
7395
|
+
exitCode: result.exitCode,
|
|
7396
|
+
stdout: result.stdout.slice(0, 1500),
|
|
7397
|
+
stderr: result.stderr.slice(0, 500),
|
|
7398
|
+
durationMs: cmdDuration
|
|
7399
|
+
});
|
|
7400
|
+
return result;
|
|
7401
|
+
};
|
|
7368
7402
|
const npmRootResult = await execCommand("npm root -g");
|
|
7369
7403
|
const npmRoot = npmRootResult.stdout;
|
|
7370
7404
|
const claudePath = path9.join(
|
|
@@ -7378,6 +7412,7 @@ async function testClaudeDirectExecution(config) {
|
|
|
7378
7412
|
details.claudePath = claudePath;
|
|
7379
7413
|
details.claudeExists = fs11.existsSync(claudePath);
|
|
7380
7414
|
if (!details.claudeExists) {
|
|
7415
|
+
details.commandResults = commandResults;
|
|
7381
7416
|
return {
|
|
7382
7417
|
name: "claude-direct-execution",
|
|
7383
7418
|
passed: false,
|
|
@@ -7390,34 +7425,51 @@ async function testClaudeDirectExecution(config) {
|
|
|
7390
7425
|
const headers = config.aiGatewayHeaders;
|
|
7391
7426
|
details.gatewayUrl = gatewayUrl;
|
|
7392
7427
|
details.hasHeaders = !!headers;
|
|
7393
|
-
const
|
|
7394
|
-
const
|
|
7395
|
-
`ANTHROPIC_API_KEY=sk-ant-api03-placeholder-auth-via-gateway-000000000000`,
|
|
7396
|
-
`ANTHROPIC_AUTH_TOKEN=sk-ant-api03-placeholder-auth-via-gateway-000000000000`,
|
|
7397
|
-
`ANTHROPIC_BASE_URL
|
|
7398
|
-
`ANTHROPIC_CUSTOM_HEADERS="$
|
|
7399
|
-
|
|
7400
|
-
`
|
|
7428
|
+
const headerLinesEscaped = headers ? Object.entries(headers).map(([key, value]) => `${key}:${value}`).join("\\n") : "";
|
|
7429
|
+
const envExportParts = [
|
|
7430
|
+
`export ANTHROPIC_API_KEY="sk-ant-api03-placeholder-auth-via-gateway-000000000000"`,
|
|
7431
|
+
`export ANTHROPIC_AUTH_TOKEN="sk-ant-api03-placeholder-auth-via-gateway-000000000000"`,
|
|
7432
|
+
`export ANTHROPIC_BASE_URL="${gatewayUrl || ""}"`,
|
|
7433
|
+
`export ANTHROPIC_CUSTOM_HEADERS="$(printf '${headerLinesEscaped}')"`,
|
|
7434
|
+
// Use printf!
|
|
7435
|
+
`export HOME="${process.env.HOME || "/tmp"}"`,
|
|
7436
|
+
`export PATH="${process.env.PATH || ""}"`
|
|
7401
7437
|
];
|
|
7402
|
-
const envExports =
|
|
7403
|
-
|
|
7404
|
-
|
|
7405
|
-
|
|
7406
|
-
|
|
7407
|
-
|
|
7408
|
-
|
|
7409
|
-
|
|
7410
|
-
|
|
7411
|
-
|
|
7412
|
-
|
|
7413
|
-
|
|
7414
|
-
|
|
7415
|
-
|
|
7416
|
-
|
|
7417
|
-
|
|
7418
|
-
|
|
7419
|
-
|
|
7420
|
-
|
|
7438
|
+
const envExports = envExportParts.join(" && ");
|
|
7439
|
+
details.envExportsPreview = envExportParts.map(
|
|
7440
|
+
(e) => e.includes("SECRET") || e.includes("secret") ? e.replace(/:.+/, ":[REDACTED]") : e
|
|
7441
|
+
).join("\n");
|
|
7442
|
+
const versionResult = await runAndLog(
|
|
7443
|
+
"claude --version",
|
|
7444
|
+
`${envExports} && "${claudePath}" --version 2>&1`,
|
|
7445
|
+
15e3
|
|
7446
|
+
);
|
|
7447
|
+
const fullCmdResult = await runAndLog(
|
|
7448
|
+
"claude -p (with json output)",
|
|
7449
|
+
`${envExports} && "${claudePath}" -p "Say hello" --output-format json 2>&1`,
|
|
7450
|
+
45e3
|
|
7451
|
+
);
|
|
7452
|
+
const simpleCmdResult = await runAndLog(
|
|
7453
|
+
"claude -p (simple)",
|
|
7454
|
+
`${envExports} && "${claudePath}" -p "Hello" 2>&1`,
|
|
7455
|
+
45e3
|
|
7456
|
+
);
|
|
7457
|
+
const printFlagResult = await runAndLog(
|
|
7458
|
+
"claude --print (long flag)",
|
|
7459
|
+
`${envExports} && "${claudePath}" --print "Hi" 2>&1`,
|
|
7460
|
+
45e3
|
|
7461
|
+
);
|
|
7462
|
+
const positionalResult = await runAndLog(
|
|
7463
|
+
'claude "prompt" (positional)',
|
|
7464
|
+
`${envExports} && "${claudePath}" "Hello world" 2>&1`,
|
|
7465
|
+
45e3
|
|
7466
|
+
);
|
|
7467
|
+
await runAndLog("claude --help", `"${claudePath}" --help 2>&1`, 15e3);
|
|
7468
|
+
await runAndLog(
|
|
7469
|
+
"claude --version (no custom env)",
|
|
7470
|
+
`"${claudePath}" --version 2>&1`,
|
|
7471
|
+
15e3
|
|
7472
|
+
);
|
|
7421
7473
|
const homeDir = process.env.HOME || "/tmp";
|
|
7422
7474
|
const claudeConfigDir = path9.join(homeDir, ".claude");
|
|
7423
7475
|
details.claudeConfigDir = claudeConfigDir;
|
|
@@ -7426,24 +7478,32 @@ async function testClaudeDirectExecution(config) {
|
|
|
7426
7478
|
try {
|
|
7427
7479
|
const configContents = fs11.readdirSync(claudeConfigDir);
|
|
7428
7480
|
details.claudeConfigContents = configContents;
|
|
7481
|
+
for (const file of configContents) {
|
|
7482
|
+
if (file.includes("log") || file.includes("error")) {
|
|
7483
|
+
const logPath = path9.join(claudeConfigDir, file);
|
|
7484
|
+
const catCmd = `cat "${logPath}" 2>&1 | tail -50`;
|
|
7485
|
+
const logContent = await execCommand(catCmd);
|
|
7486
|
+
details[`claudeLogFile_${file}`] = logContent.stdout.slice(0, 1e3);
|
|
7487
|
+
}
|
|
7488
|
+
}
|
|
7429
7489
|
} catch (e) {
|
|
7430
7490
|
details.claudeConfigError = e instanceof Error ? e.message : String(e);
|
|
7431
7491
|
}
|
|
7432
7492
|
}
|
|
7433
|
-
|
|
7434
|
-
const
|
|
7435
|
-
const
|
|
7436
|
-
|
|
7437
|
-
|
|
7438
|
-
|
|
7439
|
-
|
|
7440
|
-
|
|
7441
|
-
|
|
7493
|
+
details.commandResults = commandResults;
|
|
7494
|
+
const anyPromptWorked = fullCmdResult.exitCode === 0 || simpleCmdResult.exitCode === 0 || printFlagResult.exitCode === 0 || positionalResult.exitCode === 0;
|
|
7495
|
+
const versionWorked = versionResult.exitCode === 0;
|
|
7496
|
+
const passed = anyPromptWorked;
|
|
7497
|
+
let errorMsg;
|
|
7498
|
+
if (!passed) {
|
|
7499
|
+
const failedCmds = commandResults.filter((r) => r.exitCode !== 0).map((r) => `${r.name}: exit=${r.exitCode}`).join(", ");
|
|
7500
|
+
errorMsg = `All Claude CLI commands failed. ${failedCmds}. Version works: ${versionWorked}`;
|
|
7501
|
+
}
|
|
7442
7502
|
return {
|
|
7443
7503
|
name: "claude-direct-execution",
|
|
7444
7504
|
passed,
|
|
7445
7505
|
details,
|
|
7446
|
-
error:
|
|
7506
|
+
error: errorMsg,
|
|
7447
7507
|
durationMs: Date.now() - start
|
|
7448
7508
|
};
|
|
7449
7509
|
}
|
|
@@ -7653,7 +7713,10 @@ async function runDiagnostics(config, evalRunId2) {
|
|
|
7653
7713
|
await runTest("file-system-structure", testFileSystemStructure);
|
|
7654
7714
|
await runTest("network-connectivity", () => testNetworkConnectivity(config));
|
|
7655
7715
|
await runTest("ai-gateway-api-call", () => testAiGatewayApiCall(config));
|
|
7656
|
-
await runTest(
|
|
7716
|
+
await runTest(
|
|
7717
|
+
"claude-direct-execution",
|
|
7718
|
+
() => testClaudeDirectExecution(config)
|
|
7719
|
+
);
|
|
7657
7720
|
await runTest("child-process-spawning", testChildProcessSpawning);
|
|
7658
7721
|
await runTest("sdk-import", testSdkImport);
|
|
7659
7722
|
await runTest("file-system-write", testFileSystemWrite);
|