@wix/evalforge-evaluator 0.117.0 → 0.119.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1200,10 +1200,10 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1200
1200
  };
1201
1201
  }
1202
1202
  async function prepareClaudeCodeEnvironment(cwd, skills, options) {
1203
- const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1203
+ const { mkdir: mkdirAsync, writeFile: writeFile7 } = await import("fs/promises");
1204
1204
  const claudeDir = `${cwd}/.claude`;
1205
1205
  await mkdirAsync(claudeDir, { recursive: true });
1206
- await writeFile6(`${claudeDir}/settings.json`, "{}", {
1206
+ await writeFile7(`${claudeDir}/settings.json`, "{}", {
1207
1207
  flag: "wx"
1208
1208
  }).catch(() => {
1209
1209
  });
@@ -2162,9 +2162,18 @@ defaultRegistry.register(claudeCodeAdapter);
2162
2162
  var import_evalforge_types9 = require("@wix/evalforge-types");
2163
2163
 
2164
2164
  // src/run-scenario/agents/opencode/execute.ts
2165
- var import_os3 = require("os");
2165
+ var import_child_process = require("child_process");
2166
2166
  var import_evalforge_types8 = require("@wix/evalforge-types");
2167
2167
 
2168
+ // src/run-scenario/agents/opencode/types.ts
2169
+ function tryParseJson(text) {
2170
+ try {
2171
+ return JSON.parse(text);
2172
+ } catch {
2173
+ return null;
2174
+ }
2175
+ }
2176
+
2168
2177
  // src/run-scenario/agents/opencode/write-skills.ts
2169
2178
  var import_promises7 = require("fs/promises");
2170
2179
  var import_path8 = require("path");
@@ -2260,6 +2269,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
2260
2269
  }
2261
2270
 
2262
2271
  // src/run-scenario/agents/opencode/config.ts
2272
+ var import_os3 = require("os");
2263
2273
  var import_evalforge_types6 = require("@wix/evalforge-types");
2264
2274
  var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2265
2275
  function parseModel(model) {
@@ -2312,7 +2322,14 @@ function toOpenCodeMcpConfig(servers) {
2312
2322
  }
2313
2323
  return result;
2314
2324
  }
2315
- async function buildOpenCodeConfig(options) {
2325
+ function ensureOpenCodeInPath(currentPath) {
2326
+ const opencodeBin = `${(0, import_os3.homedir)()}/.opencode/bin`;
2327
+ if (currentPath.includes(opencodeBin)) {
2328
+ return currentPath;
2329
+ }
2330
+ return `${opencodeBin}:${currentPath}`;
2331
+ }
2332
+ async function buildOpenCodeEnv(options) {
2316
2333
  const modelStr = options.model || DEFAULT_MODEL2;
2317
2334
  const { providerID, modelID } = parseModel(modelStr);
2318
2335
  const provider = {};
@@ -2324,9 +2341,7 @@ async function buildOpenCodeConfig(options) {
2324
2341
  if (options.aiGatewayHeaders) {
2325
2342
  providerOptions.headers = { ...options.aiGatewayHeaders };
2326
2343
  }
2327
- provider[providerID] = {
2328
- options: providerOptions
2329
- };
2344
+ provider[providerID] = { options: providerOptions };
2330
2345
  }
2331
2346
  let mcp;
2332
2347
  if (options.mcps && options.mcps.length > 0) {
@@ -2367,70 +2382,81 @@ async function buildOpenCodeConfig(options) {
2367
2382
  },
2368
2383
  ...mcp ? { mcp } : {}
2369
2384
  };
2370
- return { config, providerID, modelID };
2385
+ const env = {
2386
+ ...process.env,
2387
+ PATH: ensureOpenCodeInPath(process.env.PATH || ""),
2388
+ OPENCODE_CONFIG_CONTENT: JSON.stringify(config),
2389
+ OPENCODE_DISABLE_LSP_DOWNLOAD: "true"
2390
+ };
2391
+ return { env, providerID, modelID };
2371
2392
  }
2372
2393
 
2373
2394
  // src/run-scenario/agents/opencode/build-trace.ts
2374
2395
  var import_evalforge_types7 = require("@wix/evalforge-types");
2375
2396
  var import_crypto2 = require("crypto");
2376
- function buildLLMTrace(messages, totalDurationMs, model, provider) {
2377
- const assistantMessages = messages.filter(
2378
- (m) => m.info.role === "assistant"
2379
- );
2380
- const allSteps = assistantMessages.flatMap((msg, turnIndex) => {
2381
- const { info, parts } = msg;
2382
- let text = "";
2383
- let thinking = "";
2384
- const toolCalls = [];
2385
- let stepInputTokens = 0;
2386
- let stepOutputTokens = 0;
2387
- let stepCost = 0;
2388
- let finishReason = "unknown";
2389
- for (const part of parts) {
2390
- switch (part.type) {
2391
- case "text": {
2392
- const textPart = part;
2393
- text += textPart.text;
2394
- break;
2395
- }
2396
- case "reasoning": {
2397
- const reasoningPart = part;
2398
- thinking += reasoningPart.text;
2399
- break;
2400
- }
2401
- case "tool": {
2402
- const toolPart = part;
2403
- toolCalls.push({
2404
- toolName: toolPart.tool,
2405
- args: toolPart.state.input
2406
- });
2407
- break;
2408
- }
2409
- case "step-finish": {
2410
- const sf = part;
2411
- stepInputTokens += sf.tokens.input;
2412
- stepOutputTokens += sf.tokens.output;
2413
- stepCost += sf.cost;
2414
- finishReason = sf.reason;
2415
- break;
2416
- }
2397
+ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, executionStartTime) {
2398
+ const turns = [];
2399
+ let current = {
2400
+ textParts: [],
2401
+ reasoningParts: [],
2402
+ toolCalls: []
2403
+ };
2404
+ for (const { event: evt, receivedAt } of timestampedEvents) {
2405
+ switch (evt.type) {
2406
+ case "text":
2407
+ current.textParts.push(evt.part.text);
2408
+ break;
2409
+ case "reasoning":
2410
+ current.reasoningParts.push(evt.part.text);
2411
+ break;
2412
+ case "tool_use": {
2413
+ const tu = evt;
2414
+ current.toolCalls.push({
2415
+ toolName: tu.part.tool,
2416
+ args: tu.part.state.input
2417
+ });
2418
+ break;
2419
+ }
2420
+ case "step_finish": {
2421
+ const sf = evt;
2422
+ current.stepFinish = sf.part;
2423
+ current.receivedAt = receivedAt;
2424
+ turns.push(current);
2425
+ current = {
2426
+ textParts: [],
2427
+ reasoningParts: [],
2428
+ toolCalls: []
2429
+ };
2430
+ break;
2417
2431
  }
2418
2432
  }
2419
- if (stepInputTokens === 0 && stepOutputTokens === 0) {
2420
- stepInputTokens = info.tokens.input;
2421
- stepOutputTokens = info.tokens.output;
2422
- stepCost = info.cost;
2423
- }
2424
- const startedAt = new Date(info.time.created).toISOString();
2425
- const completedAt = info.time.completed ? info.time.completed : turnIndex + 1 < assistantMessages.length ? assistantMessages[turnIndex + 1].info.time.created : info.time.created + totalDurationMs;
2426
- const durationMs = Math.max(0, completedAt - info.time.created);
2427
- const isSuccess = finishReason !== "error";
2428
- const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2429
- const stepModel = info.modelID || model;
2430
- const stepProvider = info.providerID || provider;
2431
- const toolCallCount = toolCalls.length;
2433
+ }
2434
+ if (current.textParts.length > 0 || current.reasoningParts.length > 0 || current.toolCalls.length > 0) {
2435
+ if (timestampedEvents.length > 0) {
2436
+ current.receivedAt = timestampedEvents[timestampedEvents.length - 1].receivedAt;
2437
+ }
2438
+ turns.push(current);
2439
+ }
2440
+ const executionStartMs = executionStartTime.getTime();
2441
+ const allSteps = turns.flatMap((turn, turnIndex) => {
2442
+ const sf = turn.stepFinish;
2443
+ const stepInputTokens = sf?.tokens.input ?? 0;
2444
+ const stepOutputTokens = sf?.tokens.output ?? 0;
2445
+ const stepCost = sf?.cost ?? 0;
2446
+ const finishReason = sf?.reason ?? "unknown";
2447
+ const stepModel = sf?.modelID || model;
2448
+ const stepProvider = sf?.providerID || provider;
2449
+ const turnEndMs = turn.receivedAt ?? executionStartMs + totalDurationMs;
2450
+ const prevEndMs = turnIndex > 0 ? turns[turnIndex - 1].receivedAt ?? executionStartMs : executionStartMs;
2451
+ const durationMs = Math.max(0, turnEndMs - prevEndMs);
2452
+ const startedAt = new Date(prevEndMs).toISOString();
2453
+ const text = turn.textParts.join("");
2454
+ const thinking = turn.reasoningParts.join("");
2455
+ const toolCallCount = turn.toolCalls.length;
2432
2456
  const hasThinking = !!thinking;
2433
2457
  const hasText = !!text;
2458
+ const isSuccess = finishReason !== "error";
2459
+ const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2434
2460
  const subSteps = [];
2435
2461
  const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
2436
2462
  const toolSubSteps = toolCallCount;
@@ -2440,7 +2466,6 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2440
2466
  subSteps.push({
2441
2467
  id: (0, import_crypto2.randomUUID)(),
2442
2468
  stepNumber: 0,
2443
- // renumbered below
2444
2469
  turnIndex,
2445
2470
  type: import_evalforge_types7.LLMStepType.THINKING,
2446
2471
  model: stepModel,
@@ -2462,7 +2487,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2462
2487
  }
2463
2488
  if (toolCallCount > 0) {
2464
2489
  for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
2465
- const tc = toolCalls[tcIdx];
2490
+ const tc = turn.toolCalls[tcIdx];
2466
2491
  const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
2467
2492
  const toolBudgetSteps = toolSubSteps + textSubSteps;
2468
2493
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
@@ -2541,11 +2566,21 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2541
2566
  }
2542
2567
  return subSteps;
2543
2568
  }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
2544
- const totalTokens = buildTotalTokens(assistantMessages);
2545
- const totalCost = assistantMessages.reduce((sum, m) => {
2546
- const aMsg = m.info;
2547
- return sum + aMsg.cost;
2548
- }, 0);
2569
+ let totalPrompt = 0;
2570
+ let totalCompletion = 0;
2571
+ let totalCost = 0;
2572
+ for (const turn of turns) {
2573
+ if (turn.stepFinish) {
2574
+ totalPrompt += turn.stepFinish.tokens.input;
2575
+ totalCompletion += turn.stepFinish.tokens.output;
2576
+ totalCost += turn.stepFinish.cost;
2577
+ }
2578
+ }
2579
+ const totalTokens = {
2580
+ prompt: totalPrompt,
2581
+ completion: totalCompletion,
2582
+ total: totalPrompt + totalCompletion
2583
+ };
2549
2584
  const stepTypeBreakdown = {};
2550
2585
  for (const step of allSteps) {
2551
2586
  const entry = stepTypeBreakdown[step.type] ?? {
@@ -2563,7 +2598,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2563
2598
  const modelUsed = allSteps[0]?.model || model;
2564
2599
  const summary = {
2565
2600
  totalSteps: allSteps.length,
2566
- totalTurns: assistantMessages.length,
2601
+ totalTurns: turns.length,
2567
2602
  totalDurationMs,
2568
2603
  totalTokens,
2569
2604
  totalCostUsd: totalCost,
@@ -2584,116 +2619,100 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2584
2619
  summary
2585
2620
  };
2586
2621
  }
2587
- function buildTotalTokens(assistantMessages) {
2588
- let prompt = 0;
2589
- let completion = 0;
2590
- for (const { info } of assistantMessages) {
2591
- prompt += info.tokens.input;
2592
- completion += info.tokens.output;
2593
- }
2594
- return { prompt, completion, total: prompt + completion };
2595
- }
2596
2622
 
2597
2623
  // src/run-scenario/agents/opencode/build-conversation.ts
2598
- function buildConversation2(messages) {
2624
+ function buildConversation2(timestampedEvents) {
2599
2625
  const result = [];
2600
- for (const { info, parts } of messages) {
2601
- const timestamp = new Date(info.time.created).toISOString();
2602
- if (info.role === "assistant") {
2603
- const content = [];
2604
- for (const part of parts) {
2605
- switch (part.type) {
2606
- case "text": {
2607
- const textPart = part;
2608
- content.push({ type: "text", text: textPart.text });
2609
- break;
2610
- }
2611
- case "reasoning": {
2612
- const reasoningPart = part;
2613
- content.push({ type: "thinking", thinking: reasoningPart.text });
2614
- break;
2615
- }
2616
- case "tool": {
2617
- const toolPart = part;
2618
- content.push({
2619
- type: "tool_use",
2620
- toolName: toolPart.tool,
2621
- toolId: toolPart.callID,
2622
- input: toolPart.state.input
2623
- });
2624
- break;
2625
- }
2626
- }
2626
+ let assistantContent = [];
2627
+ let userContent = [];
2628
+ let latestReceivedAt = 0;
2629
+ const flushAssistant = () => {
2630
+ if (assistantContent.length > 0) {
2631
+ const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
2632
+ result.push({ role: "assistant", content: assistantContent, timestamp });
2633
+ assistantContent = [];
2634
+ }
2635
+ };
2636
+ const flushUser = () => {
2637
+ if (userContent.length > 0) {
2638
+ const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
2639
+ result.push({ role: "user", content: userContent, timestamp });
2640
+ userContent = [];
2641
+ }
2642
+ };
2643
+ for (const { event: evt, receivedAt } of timestampedEvents) {
2644
+ latestReceivedAt = receivedAt;
2645
+ switch (evt.type) {
2646
+ case "text": {
2647
+ const te = evt;
2648
+ assistantContent.push({ type: "text", text: te.part.text });
2649
+ break;
2627
2650
  }
2628
- if (content.length > 0) {
2629
- result.push({ role: "assistant", content, timestamp });
2651
+ case "reasoning": {
2652
+ const re = evt;
2653
+ assistantContent.push({ type: "thinking", thinking: re.part.text });
2654
+ break;
2630
2655
  }
2631
- } else if (info.role === "user") {
2632
- const content = [];
2633
- for (const part of parts) {
2634
- if (part.type === "text") {
2635
- const textPart = part;
2636
- content.push({ type: "text", text: textPart.text });
2637
- } else if (part.type === "tool") {
2638
- const toolPart = part;
2639
- const state = toolPart.state;
2640
- if (state.status === "completed") {
2641
- const completed = state;
2642
- content.push({
2643
- type: "tool_result",
2644
- toolUseId: toolPart.callID,
2645
- content: completed.output
2646
- });
2647
- } else if (state.status === "error") {
2648
- const errState = state;
2649
- content.push({
2650
- type: "tool_result",
2651
- toolUseId: toolPart.callID,
2652
- content: errState.error,
2653
- isError: true
2654
- });
2655
- }
2656
+ case "tool_use": {
2657
+ const tu = evt;
2658
+ assistantContent.push({
2659
+ type: "tool_use",
2660
+ toolName: tu.part.tool,
2661
+ toolId: tu.part.callID,
2662
+ input: tu.part.state.input
2663
+ });
2664
+ if (tu.part.state.status === "completed" || tu.part.state.status === "error") {
2665
+ flushAssistant();
2666
+ const isError = tu.part.state.status === "error";
2667
+ const content = isError ? tu.part.state.error || "Tool execution failed" : tu.part.state.output || "";
2668
+ userContent.push({
2669
+ type: "tool_result",
2670
+ toolUseId: tu.part.callID,
2671
+ content,
2672
+ ...isError ? { isError: true } : {}
2673
+ });
2674
+ flushUser();
2656
2675
  }
2676
+ break;
2657
2677
  }
2658
- if (content.length > 0) {
2659
- result.push({ role: "user", content, timestamp });
2678
+ case "step_finish": {
2679
+ flushAssistant();
2680
+ flushUser();
2681
+ break;
2660
2682
  }
2661
2683
  }
2662
2684
  }
2685
+ flushAssistant();
2686
+ flushUser();
2663
2687
  return result;
2664
2688
  }
2665
2689
 
2666
2690
  // src/run-scenario/agents/opencode/execute.ts
2667
- var DEFAULT_MODEL3 = `anthropic/${import_evalforge_types8.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2668
- function ensureOpenCodeInPath() {
2669
- const opencodeBin = `${(0, import_os3.homedir)()}/.opencode/bin`;
2670
- const currentPath = process.env.PATH || "";
2671
- if (!currentPath.includes(opencodeBin)) {
2672
- process.env.PATH = `${opencodeBin}:${currentPath}`;
2673
- }
2674
- }
2691
+ var import_promises9 = require("fs/promises");
2692
+ var import_path10 = require("path");
2693
+ var KILL_GRACE_PERIOD_MS = 5e3;
2694
+ var IDLE_TIMEOUT_MS = 12e4;
2695
+ var IDLE_CHECK_INTERVAL_MS = 15e3;
2675
2696
  function extractToolAction(toolName, args) {
2676
2697
  if (!toolName) return "Using tool...";
2677
- const a = args;
2678
- if ((toolName === "Task" || toolName === "dispatch_agent") && a?.description) {
2679
- const desc = String(a.description).slice(0, 55);
2680
- return `Task: ${desc}${String(a.description).length > 55 ? "..." : ""}`;
2681
- }
2682
- if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && a?.command) {
2683
- const cmd = String(a.command).slice(0, 50);
2684
- return `Running: ${cmd}${String(a.command).length > 50 ? "..." : ""}`;
2685
- }
2686
- if (a?.file_path || a?.path || a?.target_file) {
2687
- const filePath = String(a.file_path || a.path || a.target_file).slice(
2688
- 0,
2689
- 50
2690
- );
2698
+ if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
2699
+ const desc = String(args.description).slice(0, 55);
2700
+ return `Task: ${desc}${String(args.description).length > 55 ? "..." : ""}`;
2701
+ }
2702
+ if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && args?.command) {
2703
+ const cmd = String(args.command).slice(0, 50);
2704
+ return `Running: ${cmd}${String(args.command).length > 50 ? "..." : ""}`;
2705
+ }
2706
+ if (args?.file_path || args?.path || args?.target_file) {
2707
+ const filePath = String(
2708
+ args.file_path || args.path || args.target_file
2709
+ ).slice(0, 50);
2691
2710
  if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
2692
2711
  if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
2693
2712
  }
2694
2713
  return `Using ${toolName}...`;
2695
2714
  }
2696
- function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2715
+ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
2697
2716
  const base = {
2698
2717
  evalRunId: context.evalRunId,
2699
2718
  scenarioId: context.scenarioId,
@@ -2704,42 +2723,41 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2704
2723
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2705
2724
  isComplete
2706
2725
  };
2707
- switch (part.type) {
2726
+ switch (evt.type) {
2708
2727
  case "text": {
2709
- const textPart = part;
2728
+ const te = evt;
2710
2729
  return {
2711
2730
  ...base,
2712
2731
  type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
2713
- outputPreview: textPart.text.slice(0, 500)
2732
+ outputPreview: te.part.text.slice(0, 500)
2714
2733
  };
2715
2734
  }
2716
- case "reasoning": {
2717
- const reasoningPart = part;
2735
+ case "reasoning":
2718
2736
  return {
2719
2737
  ...base,
2720
2738
  type: import_evalforge_types8.LiveTraceEventType.THINKING,
2721
- thinking: reasoningPart.text.slice(0, 500)
2739
+ thinking: evt.part.text.slice(0, 500)
2722
2740
  };
2723
- }
2724
- case "tool": {
2725
- const toolPart = part;
2726
- const toolName = toolPart.tool;
2727
- const args = toolPart.state.input;
2741
+ case "tool_use": {
2742
+ const tu = evt;
2743
+ const toolName = tu.part.tool;
2744
+ const args = tu.part.state.input;
2728
2745
  const toolArgs = JSON.stringify(args).slice(0, 500);
2729
2746
  let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
2730
2747
  let filePath;
2731
- const a = args;
2732
- if (a.file_path || a.path || a.target_file) {
2733
- filePath = String(a.file_path || a.path || a.target_file);
2734
- if (/write|edit/i.test(toolName)) {
2735
- type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
2736
- } else if (/read|view/i.test(toolName)) {
2737
- type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
2748
+ if (args) {
2749
+ if (args.file_path || args.path || args.target_file) {
2750
+ filePath = String(args.file_path || args.path || args.target_file);
2751
+ if (/write|edit/i.test(toolName)) {
2752
+ type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
2753
+ } else if (/read|view/i.test(toolName)) {
2754
+ type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
2755
+ }
2738
2756
  }
2739
2757
  }
2740
2758
  return { ...base, type, toolName, toolArgs, filePath };
2741
2759
  }
2742
- case "step-finish":
2760
+ case "step_finish":
2743
2761
  return {
2744
2762
  ...base,
2745
2763
  type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
@@ -2769,6 +2787,37 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2769
2787
  );
2770
2788
  }
2771
2789
  }
2790
+ async function writeSystemPromptRule(cwd, systemPrompt) {
2791
+ const rulesDir = (0, import_path10.join)(cwd, ".opencode", "rules");
2792
+ await (0, import_promises9.mkdir)(rulesDir, { recursive: true });
2793
+ await (0, import_promises9.writeFile)(
2794
+ (0, import_path10.join)(rulesDir, "evalforge-system-prompt.md"),
2795
+ systemPrompt,
2796
+ "utf-8"
2797
+ );
2798
+ }
2799
+ function killProcess(child, resolved) {
2800
+ if (!child) return;
2801
+ const killSignal = (signal) => {
2802
+ if (child.pid) {
2803
+ try {
2804
+ process.kill(-child.pid, signal);
2805
+ console.log(
2806
+ `[OpenCode] Sent ${signal} to process group (pid: -${child.pid})`
2807
+ );
2808
+ return;
2809
+ } catch {
2810
+ }
2811
+ }
2812
+ child.kill(signal);
2813
+ };
2814
+ killSignal("SIGTERM");
2815
+ setTimeout(() => {
2816
+ if (child && !resolved) {
2817
+ killSignal("SIGKILL");
2818
+ }
2819
+ }, KILL_GRACE_PERIOD_MS);
2820
+ }
2772
2821
  async function executeWithOpenCode(skills, scenario, options) {
2773
2822
  const skillNames = skills.map((s) => s.name).join(", ");
2774
2823
  console.log("[executeWithOpenCode] Starting execution", {
@@ -2783,7 +2832,8 @@ async function executeWithOpenCode(skills, scenario, options) {
2783
2832
  });
2784
2833
  const startTime = /* @__PURE__ */ new Date();
2785
2834
  const maxTurns = options.maxTurns ?? 10;
2786
- const { config, providerID, modelID } = await buildOpenCodeConfig({
2835
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2836
+ const { env, providerID, modelID } = await buildOpenCodeEnv({
2787
2837
  model: options.model,
2788
2838
  temperature: options.temperature,
2789
2839
  maxTurns,
@@ -2792,12 +2842,6 @@ async function executeWithOpenCode(skills, scenario, options) {
2792
2842
  mcps: options.mcps,
2793
2843
  cwd: options.cwd
2794
2844
  });
2795
- const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
2796
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2797
- const abortController = new AbortController();
2798
- let timeoutHandle;
2799
- let heartbeatHandle;
2800
- let timedOut = false;
2801
2845
  const traceContext = options.traceContext;
2802
2846
  let traceStepNumber = 0;
2803
2847
  let lastAction = "Starting...";
@@ -2814,7 +2858,7 @@ async function executeWithOpenCode(skills, scenario, options) {
2814
2858
  stepNumber: 0,
2815
2859
  type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2816
2860
  outputPreview: JSON.stringify({
2817
- event: "pre-sdk-execution",
2861
+ event: "pre-cli-execution",
2818
2862
  model: `${providerID}/${modelID}`,
2819
2863
  maxTurns,
2820
2864
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
@@ -2827,105 +2871,200 @@ async function executeWithOpenCode(skills, scenario, options) {
2827
2871
  traceContext.authToken
2828
2872
  );
2829
2873
  }
2830
- let server;
2831
- try {
2832
- ensureOpenCodeInPath();
2833
- console.log("[SDK-DEBUG] Starting OpenCode server...");
2834
- server = await createOpencodeServer({
2835
- config,
2836
- signal: abortController.signal,
2837
- timeout: 3e4
2838
- });
2839
- console.log(`[SDK-DEBUG] Server started at ${server.url}`);
2840
- const client = createOpencodeClient({
2841
- baseUrl: server.url,
2842
- directory: options.cwd
2843
- });
2844
- const session = await client.session.create({
2845
- body: { title: `eval-${scenario.name}` }
2846
- });
2847
- if (!session.data) {
2848
- const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
2849
- throw new Error(
2850
- `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
2874
+ let systemPrompt;
2875
+ if (options.systemPrompt === null || options.systemPrompt === "") {
2876
+ } else if (options.systemPrompt != null) {
2877
+ systemPrompt = options.systemPrompt;
2878
+ } else {
2879
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
2880
+ }
2881
+ if (systemPrompt) {
2882
+ await writeSystemPromptRule(options.cwd, systemPrompt);
2883
+ }
2884
+ const args = [
2885
+ "run",
2886
+ "--format",
2887
+ "json",
2888
+ "--thinking",
2889
+ "--variant",
2890
+ "high",
2891
+ "--model",
2892
+ `${providerID}/${modelID}`,
2893
+ "--dir",
2894
+ options.cwd,
2895
+ // NOTE: Trigger prompt is passed as a positional CLI arg. On Linux a single
2896
+ // arg is capped at 128 KB (MAX_ARG_STRLEN); on macOS the combined args+env
2897
+ // share a ~1 MB limit. Prompts exceeding this would fail with E2BIG.
2898
+ // In practice eval prompts are well under this limit.
2899
+ scenario.triggerPrompt
2900
+ ];
2901
+ console.log("[executeWithOpenCode] Spawning: opencode", args.slice(0, 5));
2902
+ return new Promise((resolve2, reject) => {
2903
+ let resolved = false;
2904
+ let stderr = "";
2905
+ let lineBuffer = "";
2906
+ let lastOutputTime = Date.now();
2907
+ const allEvents = [];
2908
+ const timers = {};
2909
+ const cleanup = () => {
2910
+ if (timers.timeout) clearTimeout(timers.timeout);
2911
+ if (timers.idleCheck) clearInterval(timers.idleCheck);
2912
+ if (timers.heartbeat) clearInterval(timers.heartbeat);
2913
+ };
2914
+ const finalize = (success, error) => {
2915
+ if (resolved) return;
2916
+ resolved = true;
2917
+ cleanup();
2918
+ if (!success) {
2919
+ if (traceContext) {
2920
+ emitTraceEvent(
2921
+ {
2922
+ evalRunId: traceContext.evalRunId,
2923
+ scenarioId: traceContext.scenarioId,
2924
+ scenarioName: traceContext.scenarioName,
2925
+ targetId: traceContext.targetId,
2926
+ targetName: traceContext.targetName,
2927
+ stepNumber: traceStepNumber + 1,
2928
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2929
+ outputPreview: JSON.stringify({
2930
+ event: "cli-execution-failed",
2931
+ error: error?.message ?? "Unknown error"
2932
+ }).slice(0, 2e3),
2933
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2934
+ isComplete: true
2935
+ },
2936
+ traceContext.tracePushUrl,
2937
+ traceContext.routeHeader,
2938
+ traceContext.authToken
2939
+ );
2940
+ }
2941
+ reject(
2942
+ error ?? new Error(
2943
+ `OpenCode CLI execution failed (exit code unknown).
2944
+ Stderr: ${stderr.slice(0, 1e3)}`
2945
+ )
2946
+ );
2947
+ return;
2948
+ }
2949
+ const endTime = /* @__PURE__ */ new Date();
2950
+ const totalDurationMs = endTime.getTime() - startTime.getTime();
2951
+ let outputText = "";
2952
+ for (const { event: evt } of allEvents) {
2953
+ if (evt.type === "text") {
2954
+ outputText += evt.part.text;
2955
+ }
2956
+ }
2957
+ if (!outputText) {
2958
+ reject(
2959
+ new Error(
2960
+ `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
2961
+ )
2962
+ );
2963
+ return;
2964
+ }
2965
+ let inputTokens = 0;
2966
+ let outputTokens = 0;
2967
+ let costUsd = 0;
2968
+ for (const { event: evt } of allEvents) {
2969
+ if (evt.type === "step_finish") {
2970
+ const sf = evt;
2971
+ inputTokens += sf.part.tokens.input;
2972
+ outputTokens += sf.part.tokens.output;
2973
+ costUsd += sf.part.cost;
2974
+ }
2975
+ }
2976
+ if (traceContext) {
2977
+ emitTraceEvent(
2978
+ {
2979
+ evalRunId: traceContext.evalRunId,
2980
+ scenarioId: traceContext.scenarioId,
2981
+ scenarioName: traceContext.scenarioName,
2982
+ targetId: traceContext.targetId,
2983
+ targetName: traceContext.targetName,
2984
+ stepNumber: traceStepNumber + 1,
2985
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
2986
+ outputPreview: "Scenario execution completed",
2987
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2988
+ isComplete: true
2989
+ },
2990
+ traceContext.tracePushUrl,
2991
+ traceContext.routeHeader,
2992
+ traceContext.authToken
2993
+ );
2994
+ }
2995
+ const modelStr = options.model || `${providerID}/${modelID}`;
2996
+ const llmTrace = buildLLMTrace(
2997
+ allEvents,
2998
+ totalDurationMs,
2999
+ modelStr,
3000
+ providerID,
3001
+ startTime
2851
3002
  );
2852
- }
2853
- const sessionId = session.data.id;
2854
- console.log(`[SDK-DEBUG] Session created: ${sessionId}`);
2855
- let eventStreamAbort;
3003
+ const conversation = buildConversation2(allEvents);
3004
+ resolve2({
3005
+ result: {
3006
+ outputText,
3007
+ durationMs: totalDurationMs,
3008
+ usage: {
3009
+ inputTokens,
3010
+ outputTokens,
3011
+ totalTokens: inputTokens + outputTokens
3012
+ },
3013
+ costUsd
3014
+ },
3015
+ llmTrace,
3016
+ conversation
3017
+ });
3018
+ };
3019
+ let child;
3020
+ try {
3021
+ child = (0, import_child_process.spawn)("opencode", args, {
3022
+ cwd: options.cwd,
3023
+ env,
3024
+ stdio: ["ignore", "pipe", "pipe"],
3025
+ detached: true
3026
+ });
3027
+ } catch (spawnError) {
3028
+ reject(
3029
+ new Error(
3030
+ `Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
3031
+ )
3032
+ );
3033
+ return;
3034
+ }
3035
+ timers.timeout = setTimeout(() => {
3036
+ if (!resolved) {
3037
+ console.error(`[OpenCode] Process timed out after ${SDK_TIMEOUT_MS}ms`);
3038
+ killProcess(child, resolved);
3039
+ finalize(
3040
+ false,
3041
+ new Error(
3042
+ `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
3043
+ )
3044
+ );
3045
+ }
3046
+ }, SDK_TIMEOUT_MS);
3047
+ timers.idleCheck = setInterval(() => {
3048
+ if (resolved) return;
3049
+ const idleTime = Date.now() - lastOutputTime;
3050
+ if (idleTime >= IDLE_TIMEOUT_MS) {
3051
+ console.warn(
3052
+ `[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
3053
+ );
3054
+ killProcess(child, resolved);
3055
+ finalize(
3056
+ false,
3057
+ new Error(
3058
+ `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenario.name}`
3059
+ )
3060
+ );
3061
+ }
3062
+ }, IDLE_CHECK_INTERVAL_MS);
2856
3063
  if (traceContext) {
2857
- eventStreamAbort = new AbortController();
2858
3064
  const executionStartTime = Date.now();
2859
- (async () => {
2860
- try {
2861
- const events = await client.event.subscribe();
2862
- for await (const event of events.stream) {
2863
- if (eventStreamAbort.signal.aborted) break;
2864
- const evt = event;
2865
- if (evt.type === "message.part.updated") {
2866
- const { part } = evt.properties;
2867
- traceStepNumber++;
2868
- const traceEvent = createTraceEventFromPart(
2869
- part,
2870
- traceContext,
2871
- traceStepNumber,
2872
- false
2873
- );
2874
- if (traceEvent) {
2875
- lastToolName = traceEvent.toolName;
2876
- lastFilePath = traceEvent.filePath;
2877
- if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
2878
- lastAction = "Thinking...";
2879
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
2880
- lastAction = extractToolAction(
2881
- traceEvent.toolName ?? "",
2882
- void 0
2883
- );
2884
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
2885
- lastAction = `Writing: ${traceEvent.filePath || "file"}`;
2886
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
2887
- lastAction = `Reading: ${traceEvent.filePath || "file"}`;
2888
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
2889
- lastAction = "Processing response...";
2890
- }
2891
- emitTraceEvent(
2892
- traceEvent,
2893
- traceContext.tracePushUrl,
2894
- traceContext.routeHeader,
2895
- traceContext.authToken
2896
- );
2897
- }
2898
- } else if (evt.type === "session.error") {
2899
- const props = evt.properties;
2900
- traceStepNumber++;
2901
- emitTraceEvent(
2902
- {
2903
- evalRunId: traceContext.evalRunId,
2904
- scenarioId: traceContext.scenarioId,
2905
- scenarioName: traceContext.scenarioName,
2906
- targetId: traceContext.targetId,
2907
- targetName: traceContext.targetName,
2908
- stepNumber: traceStepNumber,
2909
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2910
- outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
2911
- 0,
2912
- 500
2913
- ),
2914
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2915
- isComplete: false
2916
- },
2917
- traceContext.tracePushUrl,
2918
- traceContext.routeHeader,
2919
- traceContext.authToken
2920
- );
2921
- }
2922
- }
2923
- } catch {
2924
- }
2925
- })();
2926
3065
  let lastReportedAction = "";
2927
3066
  let sameActionCount = 0;
2928
- heartbeatHandle = setInterval(() => {
3067
+ timers.heartbeat = setInterval(() => {
2929
3068
  const elapsedMs = Date.now() - executionStartTime;
2930
3069
  let progressMessage = lastAction;
2931
3070
  if (lastAction === lastReportedAction) {
@@ -2966,212 +3105,83 @@ async function executeWithOpenCode(skills, scenario, options) {
2966
3105
  );
2967
3106
  }, 1e4);
2968
3107
  }
2969
- const promptPromise = (async () => {
2970
- let systemPrompt;
2971
- if (options.systemPrompt === null || options.systemPrompt === "") {
2972
- } else if (options.systemPrompt != null) {
2973
- systemPrompt = options.systemPrompt;
2974
- } else {
2975
- systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
2976
- }
2977
- console.log("[SDK-DEBUG] Sending prompt...");
2978
- const result = await client.session.prompt({
2979
- path: { id: sessionId },
2980
- body: {
2981
- model: { providerID, modelID },
2982
- ...systemPrompt ? { system: systemPrompt } : {},
2983
- parts: [{ type: "text", text: scenario.triggerPrompt }]
2984
- }
2985
- });
2986
- return result;
2987
- })();
2988
- const timeoutPromise = new Promise((_, reject) => {
2989
- timeoutHandle = setTimeout(() => {
2990
- timedOut = true;
2991
- client.session.abort({ path: { id: sessionId } }).catch(() => {
2992
- });
2993
- reject(
2994
- new Error(
2995
- `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
2996
- )
2997
- );
2998
- }, SDK_TIMEOUT_MS);
2999
- });
3000
- const promptResult = await Promise.race([promptPromise, timeoutPromise]);
3001
- if (timeoutHandle) clearTimeout(timeoutHandle);
3002
- if (heartbeatHandle) clearInterval(heartbeatHandle);
3003
- if (eventStreamAbort) eventStreamAbort.abort();
3004
- if ("error" in promptResult && promptResult.error) {
3005
- const errPayload = promptResult.error;
3006
- throw new Error(
3007
- `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
3008
- );
3009
- }
3010
- console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
3011
- const messagesResponse = await client.session.messages({
3012
- path: { id: sessionId }
3013
- });
3014
- const allMessages = messagesResponse.data ?? [];
3015
- console.log(
3016
- `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
3017
- );
3018
- if (traceContext) {
3019
- emitTraceEvent(
3020
- {
3021
- evalRunId: traceContext.evalRunId,
3022
- scenarioId: traceContext.scenarioId,
3023
- scenarioName: traceContext.scenarioName,
3024
- targetId: traceContext.targetId,
3025
- targetName: traceContext.targetName,
3026
- stepNumber: traceStepNumber + 1,
3027
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
3028
- outputPreview: "Scenario execution completed",
3029
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3030
- isComplete: true
3031
- },
3032
- traceContext.tracePushUrl,
3033
- traceContext.routeHeader,
3034
- traceContext.authToken
3035
- );
3036
- }
3037
- const endTime = /* @__PURE__ */ new Date();
3038
- const totalDurationMs = endTime.getTime() - startTime.getTime();
3039
- const resultData = promptResult.data;
3040
- const lastAssistantInfo = resultData?.info;
3041
- if (lastAssistantInfo?.error) {
3042
- const err = lastAssistantInfo.error;
3043
- throw new Error(
3044
- `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
3045
- );
3046
- }
3047
- let outputText = "";
3048
- if (resultData?.parts) {
3049
- for (const part of resultData.parts) {
3050
- if (part.type === "text") {
3051
- outputText += part.text;
3052
- }
3053
- }
3054
- }
3055
- if (!outputText && allMessages.length > 0) {
3056
- for (let i = allMessages.length - 1; i >= 0; i--) {
3057
- const msg = allMessages[i];
3058
- if (msg.info.role === "assistant") {
3059
- const assistantInfo = msg.info;
3060
- if (assistantInfo.error) {
3061
- throw new Error(
3062
- `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
3063
- );
3064
- }
3065
- for (const part of msg.parts) {
3066
- if (part.type === "text") {
3067
- outputText += part.text;
3108
+ child.stdout?.on("data", (data) => {
3109
+ const text = data.toString();
3110
+ lastOutputTime = Date.now();
3111
+ lineBuffer += text;
3112
+ const lines = lineBuffer.split("\n");
3113
+ lineBuffer = lines.pop() || "";
3114
+ for (const line of lines) {
3115
+ if (!line.trim()) continue;
3116
+ const evt = tryParseJson(line);
3117
+ if (!evt || !evt.type) continue;
3118
+ allEvents.push({ event: evt, receivedAt: Date.now() });
3119
+ if (traceContext) {
3120
+ traceStepNumber++;
3121
+ const traceEvt = createTraceEventFromNdjson(
3122
+ evt,
3123
+ traceContext,
3124
+ traceStepNumber,
3125
+ false
3126
+ );
3127
+ if (traceEvt) {
3128
+ lastToolName = traceEvt.toolName;
3129
+ lastFilePath = traceEvt.filePath;
3130
+ if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
3131
+ lastAction = "Thinking...";
3132
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
3133
+ lastAction = extractToolAction(
3134
+ traceEvt.toolName ?? "",
3135
+ void 0
3136
+ );
3137
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
3138
+ lastAction = `Writing: ${traceEvt.filePath || "file"}`;
3139
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
3140
+ lastAction = `Reading: ${traceEvt.filePath || "file"}`;
3141
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
3142
+ lastAction = "Processing response...";
3068
3143
  }
3144
+ emitTraceEvent(
3145
+ traceEvt,
3146
+ traceContext.tracePushUrl,
3147
+ traceContext.routeHeader,
3148
+ traceContext.authToken
3149
+ );
3069
3150
  }
3070
- if (outputText) break;
3071
3151
  }
3072
3152
  }
3073
- }
3074
- if (!outputText) {
3075
- const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
3076
- if (!hasAssistant) {
3077
- throw new Error(
3078
- `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
3079
- );
3080
- }
3081
- }
3082
- const usage = lastAssistantInfo ? {
3083
- inputTokens: lastAssistantInfo.tokens.input,
3084
- outputTokens: lastAssistantInfo.tokens.output,
3085
- totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
3086
- } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
3087
- const costUsd = lastAssistantInfo?.cost;
3088
- const modelStr = options.model || DEFAULT_MODEL3;
3089
- const llmTrace = buildLLMTrace(
3090
- allMessages,
3091
- totalDurationMs,
3092
- modelStr,
3093
- providerID
3094
- );
3095
- const conversation = buildConversation2(allMessages);
3096
- return {
3097
- result: {
3098
- outputText,
3099
- durationMs: totalDurationMs,
3100
- usage,
3101
- costUsd
3102
- },
3103
- llmTrace,
3104
- conversation
3105
- };
3106
- } catch (sdkError) {
3107
- if (timeoutHandle) clearTimeout(timeoutHandle);
3108
- if (heartbeatHandle) clearInterval(heartbeatHandle);
3109
- if (timedOut) {
3110
- console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
3111
- }
3112
- const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
3113
- const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
3114
- const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
3115
- const causeDetails = [];
3116
- let current = sdkError;
3117
- while (current instanceof Error && current.cause) {
3118
- current = current.cause;
3119
- if (current instanceof Error) {
3120
- causeDetails.push(`${current.name}: ${current.message}`);
3121
- } else {
3122
- causeDetails.push(String(current));
3153
+ });
3154
+ child.stderr?.on("data", (data) => {
3155
+ const text = data.toString();
3156
+ stderr += text;
3157
+ lastOutputTime = Date.now();
3158
+ });
3159
+ child.on("close", (code) => {
3160
+ if (lineBuffer.trim()) {
3161
+ const evt = tryParseJson(lineBuffer);
3162
+ if (evt && evt.type) {
3163
+ allEvents.push({ event: evt, receivedAt: Date.now() });
3164
+ }
3123
3165
  }
3124
- }
3125
- const causeChain = causeDetails.length > 0 ? `
3126
- Cause chain: ${causeDetails.join(" -> ")}` : "";
3127
- console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
3128
- console.error("[SDK-ERROR] Error name:", errorName);
3129
- console.error("[SDK-ERROR] Error message:", errorMessage);
3130
- if (causeDetails.length > 0) {
3131
- console.error("[SDK-ERROR] Cause chain:", causeDetails.join(" -> "));
3132
- }
3133
- if (errorStack) {
3134
- console.error("[SDK-ERROR] Stack:", errorStack);
3135
- }
3136
- if (traceContext) {
3137
- emitTraceEvent(
3138
- {
3139
- evalRunId: traceContext.evalRunId,
3140
- scenarioId: traceContext.scenarioId,
3141
- scenarioName: traceContext.scenarioName,
3142
- targetId: traceContext.targetId,
3143
- targetName: traceContext.targetName,
3144
- stepNumber: traceStepNumber + 1,
3145
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3146
- outputPreview: JSON.stringify({
3147
- event: "sdk-execution-failed",
3148
- error: errorMessage,
3149
- errorName,
3150
- ...causeDetails.length > 0 && {
3151
- causeChain: causeDetails.join(" -> ")
3152
- }
3153
- }).slice(0, 2e3),
3154
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3155
- isComplete: true
3156
- },
3157
- traceContext.tracePushUrl,
3158
- traceContext.routeHeader,
3159
- traceContext.authToken
3166
+ console.log(
3167
+ `[executeWithOpenCode] Process exited with code ${code}, ${allEvents.length} events collected`
3160
3168
  );
3161
- }
3162
- throw new Error(
3163
- `OpenCode SDK execution failed: ${errorMessage}` + causeChain + (errorStack ? `
3164
- Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
3165
- );
3166
- } finally {
3167
- if (server) {
3168
- try {
3169
- server.close();
3170
- console.log("[SDK-DEBUG] OpenCode server closed");
3171
- } catch {
3169
+ if (code === 0) {
3170
+ finalize(true);
3171
+ } else {
3172
+ finalize(
3173
+ false,
3174
+ new Error(
3175
+ `OpenCode CLI exited with code ${code}.
3176
+ Stderr: ${stderr.slice(0, 1e3)}`
3177
+ )
3178
+ );
3172
3179
  }
3173
- }
3174
- }
3180
+ });
3181
+ child.on("error", (error) => {
3182
+ finalize(false, new Error(`OpenCode CLI spawn error: ${error.message}`));
3183
+ });
3184
+ });
3175
3185
  }
3176
3186
 
3177
3187
  // src/run-scenario/agents/opencode/opencode-adapter.ts
@@ -3757,7 +3767,7 @@ defaultRegistry.register(simpleAgentAdapter);
3757
3767
 
3758
3768
  // src/run-scenario/file-diff.ts
3759
3769
  var import_fs2 = require("fs");
3760
- var import_path10 = require("path");
3770
+ var import_path11 = require("path");
3761
3771
 
3762
3772
  // ../../node_modules/diff/lib/index.mjs
3763
3773
  function Diff() {
@@ -3933,7 +3943,7 @@ Diff.prototype = {
3933
3943
  tokenize: function tokenize(value) {
3934
3944
  return Array.from(value);
3935
3945
  },
3936
- join: function join8(chars) {
3946
+ join: function join9(chars) {
3937
3947
  return chars.join("");
3938
3948
  },
3939
3949
  postProcess: function postProcess(changeObjects) {
@@ -4382,8 +4392,8 @@ function snapshotDirectory(dir, baseDir) {
4382
4392
  }
4383
4393
  const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
4384
4394
  for (const entry of entries) {
4385
- const fullPath = (0, import_path10.join)(dir, entry.name);
4386
- const relativePath = (0, import_path10.relative)(base, fullPath);
4395
+ const fullPath = (0, import_path11.join)(dir, entry.name);
4396
+ const relativePath = (0, import_path11.relative)(base, fullPath);
4387
4397
  if (shouldIgnore(entry.name)) {
4388
4398
  continue;
4389
4399
  }