@wix/evalforge-evaluator 0.118.0 → 0.120.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -32,6 +32,7 @@ function loadConfig() {
32
32
  aiGatewayHeaders[key] = value;
33
33
  }
34
34
  }
35
+ aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
35
36
  const tracePushUrl = process.env.TRACE_PUSH_URL;
36
37
  const routeHeader = process.env.EVAL_ROUTE_HEADER;
37
38
  const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -1192,10 +1193,10 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1192
1193
  };
1193
1194
  }
1194
1195
  async function prepareClaudeCodeEnvironment(cwd, skills, options) {
1195
- const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1196
+ const { mkdir: mkdirAsync, writeFile: writeFile7 } = await import("fs/promises");
1196
1197
  const claudeDir = `${cwd}/.claude`;
1197
1198
  await mkdirAsync(claudeDir, { recursive: true });
1198
- await writeFile6(`${claudeDir}/settings.json`, "{}", {
1199
+ await writeFile7(`${claudeDir}/settings.json`, "{}", {
1199
1200
  flag: "wx"
1200
1201
  }).catch(() => {
1201
1202
  });
@@ -2154,13 +2155,21 @@ defaultRegistry.register(claudeCodeAdapter);
2154
2155
  import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
2155
2156
 
2156
2157
  // src/run-scenario/agents/opencode/execute.ts
2157
- import { homedir as homedir2 } from "os";
2158
+ import { spawn } from "child_process";
2158
2159
  import {
2159
- ClaudeModel as ClaudeModel3,
2160
2160
  DEFAULT_EVALUATOR_SYSTEM_PROMPT as DEFAULT_EVALUATOR_SYSTEM_PROMPT2,
2161
2161
  LiveTraceEventType as LiveTraceEventType2
2162
2162
  } from "@wix/evalforge-types";
2163
2163
 
2164
+ // src/run-scenario/agents/opencode/types.ts
2165
+ function tryParseJson(text) {
2166
+ try {
2167
+ return JSON.parse(text);
2168
+ } catch {
2169
+ return null;
2170
+ }
2171
+ }
2172
+
2164
2173
  // src/run-scenario/agents/opencode/write-skills.ts
2165
2174
  import { mkdir as mkdir5 } from "fs/promises";
2166
2175
  import { join as join6 } from "path";
@@ -2258,6 +2267,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHub
2258
2267
  }
2259
2268
 
2260
2269
  // src/run-scenario/agents/opencode/config.ts
2270
+ import { homedir as homedir2 } from "os";
2261
2271
  import {
2262
2272
  ClaudeModel as ClaudeModel2,
2263
2273
  AVAILABLE_OPENAI_MODEL_IDS
@@ -2313,7 +2323,14 @@ function toOpenCodeMcpConfig(servers) {
2313
2323
  }
2314
2324
  return result;
2315
2325
  }
2316
- async function buildOpenCodeConfig(options) {
2326
+ function ensureOpenCodeInPath(currentPath) {
2327
+ const opencodeBin = `${homedir2()}/.opencode/bin`;
2328
+ if (currentPath.includes(opencodeBin)) {
2329
+ return currentPath;
2330
+ }
2331
+ return `${opencodeBin}:${currentPath}`;
2332
+ }
2333
+ async function buildOpenCodeEnv(options) {
2317
2334
  const modelStr = options.model || DEFAULT_MODEL2;
2318
2335
  const { providerID, modelID } = parseModel(modelStr);
2319
2336
  const provider = {};
@@ -2325,9 +2342,7 @@ async function buildOpenCodeConfig(options) {
2325
2342
  if (options.aiGatewayHeaders) {
2326
2343
  providerOptions.headers = { ...options.aiGatewayHeaders };
2327
2344
  }
2328
- provider[providerID] = {
2329
- options: providerOptions
2330
- };
2345
+ provider[providerID] = { options: providerOptions };
2331
2346
  }
2332
2347
  let mcp;
2333
2348
  if (options.mcps && options.mcps.length > 0) {
@@ -2368,70 +2383,81 @@ async function buildOpenCodeConfig(options) {
2368
2383
  },
2369
2384
  ...mcp ? { mcp } : {}
2370
2385
  };
2371
- return { config, providerID, modelID };
2386
+ const env = {
2387
+ ...process.env,
2388
+ PATH: ensureOpenCodeInPath(process.env.PATH || ""),
2389
+ OPENCODE_CONFIG_CONTENT: JSON.stringify(config),
2390
+ OPENCODE_DISABLE_LSP_DOWNLOAD: "true"
2391
+ };
2392
+ return { env, providerID, modelID };
2372
2393
  }
2373
2394
 
2374
2395
  // src/run-scenario/agents/opencode/build-trace.ts
2375
2396
  import { LLMStepType as LLMStepType2 } from "@wix/evalforge-types";
2376
2397
  import { randomUUID as randomUUID2 } from "crypto";
2377
- function buildLLMTrace(messages, totalDurationMs, model, provider) {
2378
- const assistantMessages = messages.filter(
2379
- (m) => m.info.role === "assistant"
2380
- );
2381
- const allSteps = assistantMessages.flatMap((msg, turnIndex) => {
2382
- const { info, parts } = msg;
2383
- let text = "";
2384
- let thinking = "";
2385
- const toolCalls = [];
2386
- let stepInputTokens = 0;
2387
- let stepOutputTokens = 0;
2388
- let stepCost = 0;
2389
- let finishReason = "unknown";
2390
- for (const part of parts) {
2391
- switch (part.type) {
2392
- case "text": {
2393
- const textPart = part;
2394
- text += textPart.text;
2395
- break;
2396
- }
2397
- case "reasoning": {
2398
- const reasoningPart = part;
2399
- thinking += reasoningPart.text;
2400
- break;
2401
- }
2402
- case "tool": {
2403
- const toolPart = part;
2404
- toolCalls.push({
2405
- toolName: toolPart.tool,
2406
- args: toolPart.state.input
2407
- });
2408
- break;
2409
- }
2410
- case "step-finish": {
2411
- const sf = part;
2412
- stepInputTokens += sf.tokens.input;
2413
- stepOutputTokens += sf.tokens.output;
2414
- stepCost += sf.cost;
2415
- finishReason = sf.reason;
2416
- break;
2417
- }
2398
+ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, executionStartTime) {
2399
+ const turns = [];
2400
+ let current = {
2401
+ textParts: [],
2402
+ reasoningParts: [],
2403
+ toolCalls: []
2404
+ };
2405
+ for (const { event: evt, receivedAt } of timestampedEvents) {
2406
+ switch (evt.type) {
2407
+ case "text":
2408
+ current.textParts.push(evt.part.text);
2409
+ break;
2410
+ case "reasoning":
2411
+ current.reasoningParts.push(evt.part.text);
2412
+ break;
2413
+ case "tool_use": {
2414
+ const tu = evt;
2415
+ current.toolCalls.push({
2416
+ toolName: tu.part.tool,
2417
+ args: tu.part.state.input
2418
+ });
2419
+ break;
2420
+ }
2421
+ case "step_finish": {
2422
+ const sf = evt;
2423
+ current.stepFinish = sf.part;
2424
+ current.receivedAt = receivedAt;
2425
+ turns.push(current);
2426
+ current = {
2427
+ textParts: [],
2428
+ reasoningParts: [],
2429
+ toolCalls: []
2430
+ };
2431
+ break;
2418
2432
  }
2419
2433
  }
2420
- if (stepInputTokens === 0 && stepOutputTokens === 0) {
2421
- stepInputTokens = info.tokens.input;
2422
- stepOutputTokens = info.tokens.output;
2423
- stepCost = info.cost;
2424
- }
2425
- const startedAt = new Date(info.time.created).toISOString();
2426
- const completedAt = info.time.completed ? info.time.completed : turnIndex + 1 < assistantMessages.length ? assistantMessages[turnIndex + 1].info.time.created : info.time.created + totalDurationMs;
2427
- const durationMs = Math.max(0, completedAt - info.time.created);
2428
- const isSuccess = finishReason !== "error";
2429
- const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2430
- const stepModel = info.modelID || model;
2431
- const stepProvider = info.providerID || provider;
2432
- const toolCallCount = toolCalls.length;
2434
+ }
2435
+ if (current.textParts.length > 0 || current.reasoningParts.length > 0 || current.toolCalls.length > 0) {
2436
+ if (timestampedEvents.length > 0) {
2437
+ current.receivedAt = timestampedEvents[timestampedEvents.length - 1].receivedAt;
2438
+ }
2439
+ turns.push(current);
2440
+ }
2441
+ const executionStartMs = executionStartTime.getTime();
2442
+ const allSteps = turns.flatMap((turn, turnIndex) => {
2443
+ const sf = turn.stepFinish;
2444
+ const stepInputTokens = sf?.tokens.input ?? 0;
2445
+ const stepOutputTokens = sf?.tokens.output ?? 0;
2446
+ const stepCost = sf?.cost ?? 0;
2447
+ const finishReason = sf?.reason ?? "unknown";
2448
+ const stepModel = sf?.modelID || model;
2449
+ const stepProvider = sf?.providerID || provider;
2450
+ const turnEndMs = turn.receivedAt ?? executionStartMs + totalDurationMs;
2451
+ const prevEndMs = turnIndex > 0 ? turns[turnIndex - 1].receivedAt ?? executionStartMs : executionStartMs;
2452
+ const durationMs = Math.max(0, turnEndMs - prevEndMs);
2453
+ const startedAt = new Date(prevEndMs).toISOString();
2454
+ const text = turn.textParts.join("");
2455
+ const thinking = turn.reasoningParts.join("");
2456
+ const toolCallCount = turn.toolCalls.length;
2433
2457
  const hasThinking = !!thinking;
2434
2458
  const hasText = !!text;
2459
+ const isSuccess = finishReason !== "error";
2460
+ const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2435
2461
  const subSteps = [];
2436
2462
  const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
2437
2463
  const toolSubSteps = toolCallCount;
@@ -2441,7 +2467,6 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2441
2467
  subSteps.push({
2442
2468
  id: randomUUID2(),
2443
2469
  stepNumber: 0,
2444
- // renumbered below
2445
2470
  turnIndex,
2446
2471
  type: LLMStepType2.THINKING,
2447
2472
  model: stepModel,
@@ -2463,7 +2488,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2463
2488
  }
2464
2489
  if (toolCallCount > 0) {
2465
2490
  for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
2466
- const tc = toolCalls[tcIdx];
2491
+ const tc = turn.toolCalls[tcIdx];
2467
2492
  const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
2468
2493
  const toolBudgetSteps = toolSubSteps + textSubSteps;
2469
2494
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
@@ -2542,11 +2567,21 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2542
2567
  }
2543
2568
  return subSteps;
2544
2569
  }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
2545
- const totalTokens = buildTotalTokens(assistantMessages);
2546
- const totalCost = assistantMessages.reduce((sum, m) => {
2547
- const aMsg = m.info;
2548
- return sum + aMsg.cost;
2549
- }, 0);
2570
+ let totalPrompt = 0;
2571
+ let totalCompletion = 0;
2572
+ let totalCost = 0;
2573
+ for (const turn of turns) {
2574
+ if (turn.stepFinish) {
2575
+ totalPrompt += turn.stepFinish.tokens.input;
2576
+ totalCompletion += turn.stepFinish.tokens.output;
2577
+ totalCost += turn.stepFinish.cost;
2578
+ }
2579
+ }
2580
+ const totalTokens = {
2581
+ prompt: totalPrompt,
2582
+ completion: totalCompletion,
2583
+ total: totalPrompt + totalCompletion
2584
+ };
2550
2585
  const stepTypeBreakdown = {};
2551
2586
  for (const step of allSteps) {
2552
2587
  const entry = stepTypeBreakdown[step.type] ?? {
@@ -2564,7 +2599,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2564
2599
  const modelUsed = allSteps[0]?.model || model;
2565
2600
  const summary = {
2566
2601
  totalSteps: allSteps.length,
2567
- totalTurns: assistantMessages.length,
2602
+ totalTurns: turns.length,
2568
2603
  totalDurationMs,
2569
2604
  totalTokens,
2570
2605
  totalCostUsd: totalCost,
@@ -2585,116 +2620,100 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2585
2620
  summary
2586
2621
  };
2587
2622
  }
2588
- function buildTotalTokens(assistantMessages) {
2589
- let prompt = 0;
2590
- let completion = 0;
2591
- for (const { info } of assistantMessages) {
2592
- prompt += info.tokens.input;
2593
- completion += info.tokens.output;
2594
- }
2595
- return { prompt, completion, total: prompt + completion };
2596
- }
2597
2623
 
2598
2624
  // src/run-scenario/agents/opencode/build-conversation.ts
2599
- function buildConversation2(messages) {
2625
+ function buildConversation2(timestampedEvents) {
2600
2626
  const result = [];
2601
- for (const { info, parts } of messages) {
2602
- const timestamp = new Date(info.time.created).toISOString();
2603
- if (info.role === "assistant") {
2604
- const content = [];
2605
- for (const part of parts) {
2606
- switch (part.type) {
2607
- case "text": {
2608
- const textPart = part;
2609
- content.push({ type: "text", text: textPart.text });
2610
- break;
2611
- }
2612
- case "reasoning": {
2613
- const reasoningPart = part;
2614
- content.push({ type: "thinking", thinking: reasoningPart.text });
2615
- break;
2616
- }
2617
- case "tool": {
2618
- const toolPart = part;
2619
- content.push({
2620
- type: "tool_use",
2621
- toolName: toolPart.tool,
2622
- toolId: toolPart.callID,
2623
- input: toolPart.state.input
2624
- });
2625
- break;
2626
- }
2627
- }
2627
+ let assistantContent = [];
2628
+ let userContent = [];
2629
+ let latestReceivedAt = 0;
2630
+ const flushAssistant = () => {
2631
+ if (assistantContent.length > 0) {
2632
+ const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
2633
+ result.push({ role: "assistant", content: assistantContent, timestamp });
2634
+ assistantContent = [];
2635
+ }
2636
+ };
2637
+ const flushUser = () => {
2638
+ if (userContent.length > 0) {
2639
+ const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
2640
+ result.push({ role: "user", content: userContent, timestamp });
2641
+ userContent = [];
2642
+ }
2643
+ };
2644
+ for (const { event: evt, receivedAt } of timestampedEvents) {
2645
+ latestReceivedAt = receivedAt;
2646
+ switch (evt.type) {
2647
+ case "text": {
2648
+ const te = evt;
2649
+ assistantContent.push({ type: "text", text: te.part.text });
2650
+ break;
2628
2651
  }
2629
- if (content.length > 0) {
2630
- result.push({ role: "assistant", content, timestamp });
2652
+ case "reasoning": {
2653
+ const re = evt;
2654
+ assistantContent.push({ type: "thinking", thinking: re.part.text });
2655
+ break;
2631
2656
  }
2632
- } else if (info.role === "user") {
2633
- const content = [];
2634
- for (const part of parts) {
2635
- if (part.type === "text") {
2636
- const textPart = part;
2637
- content.push({ type: "text", text: textPart.text });
2638
- } else if (part.type === "tool") {
2639
- const toolPart = part;
2640
- const state = toolPart.state;
2641
- if (state.status === "completed") {
2642
- const completed = state;
2643
- content.push({
2644
- type: "tool_result",
2645
- toolUseId: toolPart.callID,
2646
- content: completed.output
2647
- });
2648
- } else if (state.status === "error") {
2649
- const errState = state;
2650
- content.push({
2651
- type: "tool_result",
2652
- toolUseId: toolPart.callID,
2653
- content: errState.error,
2654
- isError: true
2655
- });
2656
- }
2657
+ case "tool_use": {
2658
+ const tu = evt;
2659
+ assistantContent.push({
2660
+ type: "tool_use",
2661
+ toolName: tu.part.tool,
2662
+ toolId: tu.part.callID,
2663
+ input: tu.part.state.input
2664
+ });
2665
+ if (tu.part.state.status === "completed" || tu.part.state.status === "error") {
2666
+ flushAssistant();
2667
+ const isError = tu.part.state.status === "error";
2668
+ const content = isError ? tu.part.state.error || "Tool execution failed" : tu.part.state.output || "";
2669
+ userContent.push({
2670
+ type: "tool_result",
2671
+ toolUseId: tu.part.callID,
2672
+ content,
2673
+ ...isError ? { isError: true } : {}
2674
+ });
2675
+ flushUser();
2657
2676
  }
2677
+ break;
2658
2678
  }
2659
- if (content.length > 0) {
2660
- result.push({ role: "user", content, timestamp });
2679
+ case "step_finish": {
2680
+ flushAssistant();
2681
+ flushUser();
2682
+ break;
2661
2683
  }
2662
2684
  }
2663
2685
  }
2686
+ flushAssistant();
2687
+ flushUser();
2664
2688
  return result;
2665
2689
  }
2666
2690
 
2667
2691
  // src/run-scenario/agents/opencode/execute.ts
2668
- var DEFAULT_MODEL3 = `anthropic/${ClaudeModel3.CLAUDE_4_5_SONNET_1_0}`;
2669
- function ensureOpenCodeInPath() {
2670
- const opencodeBin = `${homedir2()}/.opencode/bin`;
2671
- const currentPath = process.env.PATH || "";
2672
- if (!currentPath.includes(opencodeBin)) {
2673
- process.env.PATH = `${opencodeBin}:${currentPath}`;
2674
- }
2675
- }
2692
+ import { writeFile as writeFile6, mkdir as mkdir7 } from "fs/promises";
2693
+ import { join as join8 } from "path";
2694
+ var KILL_GRACE_PERIOD_MS = 5e3;
2695
+ var IDLE_TIMEOUT_MS = 12e4;
2696
+ var IDLE_CHECK_INTERVAL_MS = 15e3;
2676
2697
  function extractToolAction(toolName, args) {
2677
2698
  if (!toolName) return "Using tool...";
2678
- const a = args;
2679
- if ((toolName === "Task" || toolName === "dispatch_agent") && a?.description) {
2680
- const desc = String(a.description).slice(0, 55);
2681
- return `Task: ${desc}${String(a.description).length > 55 ? "..." : ""}`;
2682
- }
2683
- if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && a?.command) {
2684
- const cmd = String(a.command).slice(0, 50);
2685
- return `Running: ${cmd}${String(a.command).length > 50 ? "..." : ""}`;
2686
- }
2687
- if (a?.file_path || a?.path || a?.target_file) {
2688
- const filePath = String(a.file_path || a.path || a.target_file).slice(
2689
- 0,
2690
- 50
2691
- );
2699
+ if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
2700
+ const desc = String(args.description).slice(0, 55);
2701
+ return `Task: ${desc}${String(args.description).length > 55 ? "..." : ""}`;
2702
+ }
2703
+ if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && args?.command) {
2704
+ const cmd = String(args.command).slice(0, 50);
2705
+ return `Running: ${cmd}${String(args.command).length > 50 ? "..." : ""}`;
2706
+ }
2707
+ if (args?.file_path || args?.path || args?.target_file) {
2708
+ const filePath = String(
2709
+ args.file_path || args.path || args.target_file
2710
+ ).slice(0, 50);
2692
2711
  if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
2693
2712
  if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
2694
2713
  }
2695
2714
  return `Using ${toolName}...`;
2696
2715
  }
2697
- function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2716
+ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
2698
2717
  const base = {
2699
2718
  evalRunId: context.evalRunId,
2700
2719
  scenarioId: context.scenarioId,
@@ -2705,42 +2724,41 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2705
2724
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2706
2725
  isComplete
2707
2726
  };
2708
- switch (part.type) {
2727
+ switch (evt.type) {
2709
2728
  case "text": {
2710
- const textPart = part;
2729
+ const te = evt;
2711
2730
  return {
2712
2731
  ...base,
2713
2732
  type: LiveTraceEventType2.COMPLETION,
2714
- outputPreview: textPart.text.slice(0, 500)
2733
+ outputPreview: te.part.text.slice(0, 500)
2715
2734
  };
2716
2735
  }
2717
- case "reasoning": {
2718
- const reasoningPart = part;
2736
+ case "reasoning":
2719
2737
  return {
2720
2738
  ...base,
2721
2739
  type: LiveTraceEventType2.THINKING,
2722
- thinking: reasoningPart.text.slice(0, 500)
2740
+ thinking: evt.part.text.slice(0, 500)
2723
2741
  };
2724
- }
2725
- case "tool": {
2726
- const toolPart = part;
2727
- const toolName = toolPart.tool;
2728
- const args = toolPart.state.input;
2742
+ case "tool_use": {
2743
+ const tu = evt;
2744
+ const toolName = tu.part.tool;
2745
+ const args = tu.part.state.input;
2729
2746
  const toolArgs = JSON.stringify(args).slice(0, 500);
2730
2747
  let type = LiveTraceEventType2.TOOL_USE;
2731
2748
  let filePath;
2732
- const a = args;
2733
- if (a.file_path || a.path || a.target_file) {
2734
- filePath = String(a.file_path || a.path || a.target_file);
2735
- if (/write|edit/i.test(toolName)) {
2736
- type = LiveTraceEventType2.FILE_WRITE;
2737
- } else if (/read|view/i.test(toolName)) {
2738
- type = LiveTraceEventType2.FILE_READ;
2749
+ if (args) {
2750
+ if (args.file_path || args.path || args.target_file) {
2751
+ filePath = String(args.file_path || args.path || args.target_file);
2752
+ if (/write|edit/i.test(toolName)) {
2753
+ type = LiveTraceEventType2.FILE_WRITE;
2754
+ } else if (/read|view/i.test(toolName)) {
2755
+ type = LiveTraceEventType2.FILE_READ;
2756
+ }
2739
2757
  }
2740
2758
  }
2741
2759
  return { ...base, type, toolName, toolArgs, filePath };
2742
2760
  }
2743
- case "step-finish":
2761
+ case "step_finish":
2744
2762
  return {
2745
2763
  ...base,
2746
2764
  type: LiveTraceEventType2.PROGRESS,
@@ -2770,6 +2788,37 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2770
2788
  );
2771
2789
  }
2772
2790
  }
2791
+ async function writeSystemPromptRule(cwd, systemPrompt) {
2792
+ const rulesDir = join8(cwd, ".opencode", "rules");
2793
+ await mkdir7(rulesDir, { recursive: true });
2794
+ await writeFile6(
2795
+ join8(rulesDir, "evalforge-system-prompt.md"),
2796
+ systemPrompt,
2797
+ "utf-8"
2798
+ );
2799
+ }
2800
+ function killProcess(child, resolved) {
2801
+ if (!child) return;
2802
+ const killSignal = (signal) => {
2803
+ if (child.pid) {
2804
+ try {
2805
+ process.kill(-child.pid, signal);
2806
+ console.log(
2807
+ `[OpenCode] Sent ${signal} to process group (pid: -${child.pid})`
2808
+ );
2809
+ return;
2810
+ } catch {
2811
+ }
2812
+ }
2813
+ child.kill(signal);
2814
+ };
2815
+ killSignal("SIGTERM");
2816
+ setTimeout(() => {
2817
+ if (child && !resolved) {
2818
+ killSignal("SIGKILL");
2819
+ }
2820
+ }, KILL_GRACE_PERIOD_MS);
2821
+ }
2773
2822
  async function executeWithOpenCode(skills, scenario, options) {
2774
2823
  const skillNames = skills.map((s) => s.name).join(", ");
2775
2824
  console.log("[executeWithOpenCode] Starting execution", {
@@ -2784,7 +2833,8 @@ async function executeWithOpenCode(skills, scenario, options) {
2784
2833
  });
2785
2834
  const startTime = /* @__PURE__ */ new Date();
2786
2835
  const maxTurns = options.maxTurns ?? 10;
2787
- const { config, providerID, modelID } = await buildOpenCodeConfig({
2836
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2837
+ const { env, providerID, modelID } = await buildOpenCodeEnv({
2788
2838
  model: options.model,
2789
2839
  temperature: options.temperature,
2790
2840
  maxTurns,
@@ -2793,12 +2843,6 @@ async function executeWithOpenCode(skills, scenario, options) {
2793
2843
  mcps: options.mcps,
2794
2844
  cwd: options.cwd
2795
2845
  });
2796
- const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
2797
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2798
- const abortController = new AbortController();
2799
- let timeoutHandle;
2800
- let heartbeatHandle;
2801
- let timedOut = false;
2802
2846
  const traceContext = options.traceContext;
2803
2847
  let traceStepNumber = 0;
2804
2848
  let lastAction = "Starting...";
@@ -2815,7 +2859,7 @@ async function executeWithOpenCode(skills, scenario, options) {
2815
2859
  stepNumber: 0,
2816
2860
  type: LiveTraceEventType2.DIAGNOSTIC,
2817
2861
  outputPreview: JSON.stringify({
2818
- event: "pre-sdk-execution",
2862
+ event: "pre-cli-execution",
2819
2863
  model: `${providerID}/${modelID}`,
2820
2864
  maxTurns,
2821
2865
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
@@ -2828,105 +2872,200 @@ async function executeWithOpenCode(skills, scenario, options) {
2828
2872
  traceContext.authToken
2829
2873
  );
2830
2874
  }
2831
- let server;
2832
- try {
2833
- ensureOpenCodeInPath();
2834
- console.log("[SDK-DEBUG] Starting OpenCode server...");
2835
- server = await createOpencodeServer({
2836
- config,
2837
- signal: abortController.signal,
2838
- timeout: 3e4
2839
- });
2840
- console.log(`[SDK-DEBUG] Server started at ${server.url}`);
2841
- const client = createOpencodeClient({
2842
- baseUrl: server.url,
2843
- directory: options.cwd
2844
- });
2845
- const session = await client.session.create({
2846
- body: { title: `eval-${scenario.name}` }
2847
- });
2848
- if (!session.data) {
2849
- const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
2850
- throw new Error(
2851
- `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
2875
+ let systemPrompt;
2876
+ if (options.systemPrompt === null || options.systemPrompt === "") {
2877
+ } else if (options.systemPrompt != null) {
2878
+ systemPrompt = options.systemPrompt;
2879
+ } else {
2880
+ systemPrompt = DEFAULT_EVALUATOR_SYSTEM_PROMPT2;
2881
+ }
2882
+ if (systemPrompt) {
2883
+ await writeSystemPromptRule(options.cwd, systemPrompt);
2884
+ }
2885
+ const args = [
2886
+ "run",
2887
+ "--format",
2888
+ "json",
2889
+ "--thinking",
2890
+ "--variant",
2891
+ "high",
2892
+ "--model",
2893
+ `${providerID}/${modelID}`,
2894
+ "--dir",
2895
+ options.cwd,
2896
+ // NOTE: Trigger prompt is passed as a positional CLI arg. On Linux a single
2897
+ // arg is capped at 128 KB (MAX_ARG_STRLEN); on macOS the combined args+env
2898
+ // share a ~1 MB limit. Prompts exceeding this would fail with E2BIG.
2899
+ // In practice eval prompts are well under this limit.
2900
+ scenario.triggerPrompt
2901
+ ];
2902
+ console.log("[executeWithOpenCode] Spawning: opencode", args.slice(0, 5));
2903
+ return new Promise((resolve2, reject) => {
2904
+ let resolved = false;
2905
+ let stderr = "";
2906
+ let lineBuffer = "";
2907
+ let lastOutputTime = Date.now();
2908
+ const allEvents = [];
2909
+ const timers = {};
2910
+ const cleanup = () => {
2911
+ if (timers.timeout) clearTimeout(timers.timeout);
2912
+ if (timers.idleCheck) clearInterval(timers.idleCheck);
2913
+ if (timers.heartbeat) clearInterval(timers.heartbeat);
2914
+ };
2915
+ const finalize = (success, error) => {
2916
+ if (resolved) return;
2917
+ resolved = true;
2918
+ cleanup();
2919
+ if (!success) {
2920
+ if (traceContext) {
2921
+ emitTraceEvent(
2922
+ {
2923
+ evalRunId: traceContext.evalRunId,
2924
+ scenarioId: traceContext.scenarioId,
2925
+ scenarioName: traceContext.scenarioName,
2926
+ targetId: traceContext.targetId,
2927
+ targetName: traceContext.targetName,
2928
+ stepNumber: traceStepNumber + 1,
2929
+ type: LiveTraceEventType2.DIAGNOSTIC,
2930
+ outputPreview: JSON.stringify({
2931
+ event: "cli-execution-failed",
2932
+ error: error?.message ?? "Unknown error"
2933
+ }).slice(0, 2e3),
2934
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2935
+ isComplete: true
2936
+ },
2937
+ traceContext.tracePushUrl,
2938
+ traceContext.routeHeader,
2939
+ traceContext.authToken
2940
+ );
2941
+ }
2942
+ reject(
2943
+ error ?? new Error(
2944
+ `OpenCode CLI execution failed (exit code unknown).
2945
+ Stderr: ${stderr.slice(0, 1e3)}`
2946
+ )
2947
+ );
2948
+ return;
2949
+ }
2950
+ const endTime = /* @__PURE__ */ new Date();
2951
+ const totalDurationMs = endTime.getTime() - startTime.getTime();
2952
+ let outputText = "";
2953
+ for (const { event: evt } of allEvents) {
2954
+ if (evt.type === "text") {
2955
+ outputText += evt.part.text;
2956
+ }
2957
+ }
2958
+ if (!outputText) {
2959
+ reject(
2960
+ new Error(
2961
+ `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
2962
+ )
2963
+ );
2964
+ return;
2965
+ }
2966
+ let inputTokens = 0;
2967
+ let outputTokens = 0;
2968
+ let costUsd = 0;
2969
+ for (const { event: evt } of allEvents) {
2970
+ if (evt.type === "step_finish") {
2971
+ const sf = evt;
2972
+ inputTokens += sf.part.tokens.input;
2973
+ outputTokens += sf.part.tokens.output;
2974
+ costUsd += sf.part.cost;
2975
+ }
2976
+ }
2977
+ if (traceContext) {
2978
+ emitTraceEvent(
2979
+ {
2980
+ evalRunId: traceContext.evalRunId,
2981
+ scenarioId: traceContext.scenarioId,
2982
+ scenarioName: traceContext.scenarioName,
2983
+ targetId: traceContext.targetId,
2984
+ targetName: traceContext.targetName,
2985
+ stepNumber: traceStepNumber + 1,
2986
+ type: LiveTraceEventType2.COMPLETION,
2987
+ outputPreview: "Scenario execution completed",
2988
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2989
+ isComplete: true
2990
+ },
2991
+ traceContext.tracePushUrl,
2992
+ traceContext.routeHeader,
2993
+ traceContext.authToken
2994
+ );
2995
+ }
2996
+ const modelStr = options.model || `${providerID}/${modelID}`;
2997
+ const llmTrace = buildLLMTrace(
2998
+ allEvents,
2999
+ totalDurationMs,
3000
+ modelStr,
3001
+ providerID,
3002
+ startTime
2852
3003
  );
2853
- }
2854
- const sessionId = session.data.id;
2855
- console.log(`[SDK-DEBUG] Session created: ${sessionId}`);
2856
- let eventStreamAbort;
3004
+ const conversation = buildConversation2(allEvents);
3005
+ resolve2({
3006
+ result: {
3007
+ outputText,
3008
+ durationMs: totalDurationMs,
3009
+ usage: {
3010
+ inputTokens,
3011
+ outputTokens,
3012
+ totalTokens: inputTokens + outputTokens
3013
+ },
3014
+ costUsd
3015
+ },
3016
+ llmTrace,
3017
+ conversation
3018
+ });
3019
+ };
3020
+ let child;
3021
+ try {
3022
+ child = spawn("opencode", args, {
3023
+ cwd: options.cwd,
3024
+ env,
3025
+ stdio: ["ignore", "pipe", "pipe"],
3026
+ detached: true
3027
+ });
3028
+ } catch (spawnError) {
3029
+ reject(
3030
+ new Error(
3031
+ `Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
3032
+ )
3033
+ );
3034
+ return;
3035
+ }
3036
+ timers.timeout = setTimeout(() => {
3037
+ if (!resolved) {
3038
+ console.error(`[OpenCode] Process timed out after ${SDK_TIMEOUT_MS}ms`);
3039
+ killProcess(child, resolved);
3040
+ finalize(
3041
+ false,
3042
+ new Error(
3043
+ `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
3044
+ )
3045
+ );
3046
+ }
3047
+ }, SDK_TIMEOUT_MS);
3048
+ timers.idleCheck = setInterval(() => {
3049
+ if (resolved) return;
3050
+ const idleTime = Date.now() - lastOutputTime;
3051
+ if (idleTime >= IDLE_TIMEOUT_MS) {
3052
+ console.warn(
3053
+ `[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
3054
+ );
3055
+ killProcess(child, resolved);
3056
+ finalize(
3057
+ false,
3058
+ new Error(
3059
+ `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenario.name}`
3060
+ )
3061
+ );
3062
+ }
3063
+ }, IDLE_CHECK_INTERVAL_MS);
2857
3064
  if (traceContext) {
2858
- eventStreamAbort = new AbortController();
2859
3065
  const executionStartTime = Date.now();
2860
- (async () => {
2861
- try {
2862
- const events = await client.event.subscribe();
2863
- for await (const event of events.stream) {
2864
- if (eventStreamAbort.signal.aborted) break;
2865
- const evt = event;
2866
- if (evt.type === "message.part.updated") {
2867
- const { part } = evt.properties;
2868
- traceStepNumber++;
2869
- const traceEvent = createTraceEventFromPart(
2870
- part,
2871
- traceContext,
2872
- traceStepNumber,
2873
- false
2874
- );
2875
- if (traceEvent) {
2876
- lastToolName = traceEvent.toolName;
2877
- lastFilePath = traceEvent.filePath;
2878
- if (traceEvent.type === LiveTraceEventType2.THINKING) {
2879
- lastAction = "Thinking...";
2880
- } else if (traceEvent.type === LiveTraceEventType2.TOOL_USE) {
2881
- lastAction = extractToolAction(
2882
- traceEvent.toolName ?? "",
2883
- void 0
2884
- );
2885
- } else if (traceEvent.type === LiveTraceEventType2.FILE_WRITE) {
2886
- lastAction = `Writing: ${traceEvent.filePath || "file"}`;
2887
- } else if (traceEvent.type === LiveTraceEventType2.FILE_READ) {
2888
- lastAction = `Reading: ${traceEvent.filePath || "file"}`;
2889
- } else if (traceEvent.type === LiveTraceEventType2.COMPLETION) {
2890
- lastAction = "Processing response...";
2891
- }
2892
- emitTraceEvent(
2893
- traceEvent,
2894
- traceContext.tracePushUrl,
2895
- traceContext.routeHeader,
2896
- traceContext.authToken
2897
- );
2898
- }
2899
- } else if (evt.type === "session.error") {
2900
- const props = evt.properties;
2901
- traceStepNumber++;
2902
- emitTraceEvent(
2903
- {
2904
- evalRunId: traceContext.evalRunId,
2905
- scenarioId: traceContext.scenarioId,
2906
- scenarioName: traceContext.scenarioName,
2907
- targetId: traceContext.targetId,
2908
- targetName: traceContext.targetName,
2909
- stepNumber: traceStepNumber,
2910
- type: LiveTraceEventType2.DIAGNOSTIC,
2911
- outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
2912
- 0,
2913
- 500
2914
- ),
2915
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2916
- isComplete: false
2917
- },
2918
- traceContext.tracePushUrl,
2919
- traceContext.routeHeader,
2920
- traceContext.authToken
2921
- );
2922
- }
2923
- }
2924
- } catch {
2925
- }
2926
- })();
2927
3066
  let lastReportedAction = "";
2928
3067
  let sameActionCount = 0;
2929
- heartbeatHandle = setInterval(() => {
3068
+ timers.heartbeat = setInterval(() => {
2930
3069
  const elapsedMs = Date.now() - executionStartTime;
2931
3070
  let progressMessage = lastAction;
2932
3071
  if (lastAction === lastReportedAction) {
@@ -2967,212 +3106,83 @@ async function executeWithOpenCode(skills, scenario, options) {
2967
3106
  );
2968
3107
  }, 1e4);
2969
3108
  }
2970
- const promptPromise = (async () => {
2971
- let systemPrompt;
2972
- if (options.systemPrompt === null || options.systemPrompt === "") {
2973
- } else if (options.systemPrompt != null) {
2974
- systemPrompt = options.systemPrompt;
2975
- } else {
2976
- systemPrompt = DEFAULT_EVALUATOR_SYSTEM_PROMPT2;
2977
- }
2978
- console.log("[SDK-DEBUG] Sending prompt...");
2979
- const result = await client.session.prompt({
2980
- path: { id: sessionId },
2981
- body: {
2982
- model: { providerID, modelID },
2983
- ...systemPrompt ? { system: systemPrompt } : {},
2984
- parts: [{ type: "text", text: scenario.triggerPrompt }]
2985
- }
2986
- });
2987
- return result;
2988
- })();
2989
- const timeoutPromise = new Promise((_, reject) => {
2990
- timeoutHandle = setTimeout(() => {
2991
- timedOut = true;
2992
- client.session.abort({ path: { id: sessionId } }).catch(() => {
2993
- });
2994
- reject(
2995
- new Error(
2996
- `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
2997
- )
2998
- );
2999
- }, SDK_TIMEOUT_MS);
3000
- });
3001
- const promptResult = await Promise.race([promptPromise, timeoutPromise]);
3002
- if (timeoutHandle) clearTimeout(timeoutHandle);
3003
- if (heartbeatHandle) clearInterval(heartbeatHandle);
3004
- if (eventStreamAbort) eventStreamAbort.abort();
3005
- if ("error" in promptResult && promptResult.error) {
3006
- const errPayload = promptResult.error;
3007
- throw new Error(
3008
- `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
3009
- );
3010
- }
3011
- console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
3012
- const messagesResponse = await client.session.messages({
3013
- path: { id: sessionId }
3014
- });
3015
- const allMessages = messagesResponse.data ?? [];
3016
- console.log(
3017
- `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
3018
- );
3019
- if (traceContext) {
3020
- emitTraceEvent(
3021
- {
3022
- evalRunId: traceContext.evalRunId,
3023
- scenarioId: traceContext.scenarioId,
3024
- scenarioName: traceContext.scenarioName,
3025
- targetId: traceContext.targetId,
3026
- targetName: traceContext.targetName,
3027
- stepNumber: traceStepNumber + 1,
3028
- type: LiveTraceEventType2.COMPLETION,
3029
- outputPreview: "Scenario execution completed",
3030
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3031
- isComplete: true
3032
- },
3033
- traceContext.tracePushUrl,
3034
- traceContext.routeHeader,
3035
- traceContext.authToken
3036
- );
3037
- }
3038
- const endTime = /* @__PURE__ */ new Date();
3039
- const totalDurationMs = endTime.getTime() - startTime.getTime();
3040
- const resultData = promptResult.data;
3041
- const lastAssistantInfo = resultData?.info;
3042
- if (lastAssistantInfo?.error) {
3043
- const err = lastAssistantInfo.error;
3044
- throw new Error(
3045
- `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
3046
- );
3047
- }
3048
- let outputText = "";
3049
- if (resultData?.parts) {
3050
- for (const part of resultData.parts) {
3051
- if (part.type === "text") {
3052
- outputText += part.text;
3053
- }
3054
- }
3055
- }
3056
- if (!outputText && allMessages.length > 0) {
3057
- for (let i = allMessages.length - 1; i >= 0; i--) {
3058
- const msg = allMessages[i];
3059
- if (msg.info.role === "assistant") {
3060
- const assistantInfo = msg.info;
3061
- if (assistantInfo.error) {
3062
- throw new Error(
3063
- `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
3064
- );
3065
- }
3066
- for (const part of msg.parts) {
3067
- if (part.type === "text") {
3068
- outputText += part.text;
3109
+ child.stdout?.on("data", (data) => {
3110
+ const text = data.toString();
3111
+ lastOutputTime = Date.now();
3112
+ lineBuffer += text;
3113
+ const lines = lineBuffer.split("\n");
3114
+ lineBuffer = lines.pop() || "";
3115
+ for (const line of lines) {
3116
+ if (!line.trim()) continue;
3117
+ const evt = tryParseJson(line);
3118
+ if (!evt || !evt.type) continue;
3119
+ allEvents.push({ event: evt, receivedAt: Date.now() });
3120
+ if (traceContext) {
3121
+ traceStepNumber++;
3122
+ const traceEvt = createTraceEventFromNdjson(
3123
+ evt,
3124
+ traceContext,
3125
+ traceStepNumber,
3126
+ false
3127
+ );
3128
+ if (traceEvt) {
3129
+ lastToolName = traceEvt.toolName;
3130
+ lastFilePath = traceEvt.filePath;
3131
+ if (traceEvt.type === LiveTraceEventType2.THINKING) {
3132
+ lastAction = "Thinking...";
3133
+ } else if (traceEvt.type === LiveTraceEventType2.TOOL_USE) {
3134
+ lastAction = extractToolAction(
3135
+ traceEvt.toolName ?? "",
3136
+ void 0
3137
+ );
3138
+ } else if (traceEvt.type === LiveTraceEventType2.FILE_WRITE) {
3139
+ lastAction = `Writing: ${traceEvt.filePath || "file"}`;
3140
+ } else if (traceEvt.type === LiveTraceEventType2.FILE_READ) {
3141
+ lastAction = `Reading: ${traceEvt.filePath || "file"}`;
3142
+ } else if (traceEvt.type === LiveTraceEventType2.COMPLETION) {
3143
+ lastAction = "Processing response...";
3069
3144
  }
3145
+ emitTraceEvent(
3146
+ traceEvt,
3147
+ traceContext.tracePushUrl,
3148
+ traceContext.routeHeader,
3149
+ traceContext.authToken
3150
+ );
3070
3151
  }
3071
- if (outputText) break;
3072
3152
  }
3073
3153
  }
3074
- }
3075
- if (!outputText) {
3076
- const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
3077
- if (!hasAssistant) {
3078
- throw new Error(
3079
- `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
3080
- );
3081
- }
3082
- }
3083
- const usage = lastAssistantInfo ? {
3084
- inputTokens: lastAssistantInfo.tokens.input,
3085
- outputTokens: lastAssistantInfo.tokens.output,
3086
- totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
3087
- } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
3088
- const costUsd = lastAssistantInfo?.cost;
3089
- const modelStr = options.model || DEFAULT_MODEL3;
3090
- const llmTrace = buildLLMTrace(
3091
- allMessages,
3092
- totalDurationMs,
3093
- modelStr,
3094
- providerID
3095
- );
3096
- const conversation = buildConversation2(allMessages);
3097
- return {
3098
- result: {
3099
- outputText,
3100
- durationMs: totalDurationMs,
3101
- usage,
3102
- costUsd
3103
- },
3104
- llmTrace,
3105
- conversation
3106
- };
3107
- } catch (sdkError) {
3108
- if (timeoutHandle) clearTimeout(timeoutHandle);
3109
- if (heartbeatHandle) clearInterval(heartbeatHandle);
3110
- if (timedOut) {
3111
- console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
3112
- }
3113
- const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
3114
- const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
3115
- const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
3116
- const causeDetails = [];
3117
- let current = sdkError;
3118
- while (current instanceof Error && current.cause) {
3119
- current = current.cause;
3120
- if (current instanceof Error) {
3121
- causeDetails.push(`${current.name}: ${current.message}`);
3122
- } else {
3123
- causeDetails.push(String(current));
3154
+ });
3155
+ child.stderr?.on("data", (data) => {
3156
+ const text = data.toString();
3157
+ stderr += text;
3158
+ lastOutputTime = Date.now();
3159
+ });
3160
+ child.on("close", (code) => {
3161
+ if (lineBuffer.trim()) {
3162
+ const evt = tryParseJson(lineBuffer);
3163
+ if (evt && evt.type) {
3164
+ allEvents.push({ event: evt, receivedAt: Date.now() });
3165
+ }
3124
3166
  }
3125
- }
3126
- const causeChain = causeDetails.length > 0 ? `
3127
- Cause chain: ${causeDetails.join(" -> ")}` : "";
3128
- console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
3129
- console.error("[SDK-ERROR] Error name:", errorName);
3130
- console.error("[SDK-ERROR] Error message:", errorMessage);
3131
- if (causeDetails.length > 0) {
3132
- console.error("[SDK-ERROR] Cause chain:", causeDetails.join(" -> "));
3133
- }
3134
- if (errorStack) {
3135
- console.error("[SDK-ERROR] Stack:", errorStack);
3136
- }
3137
- if (traceContext) {
3138
- emitTraceEvent(
3139
- {
3140
- evalRunId: traceContext.evalRunId,
3141
- scenarioId: traceContext.scenarioId,
3142
- scenarioName: traceContext.scenarioName,
3143
- targetId: traceContext.targetId,
3144
- targetName: traceContext.targetName,
3145
- stepNumber: traceStepNumber + 1,
3146
- type: LiveTraceEventType2.DIAGNOSTIC,
3147
- outputPreview: JSON.stringify({
3148
- event: "sdk-execution-failed",
3149
- error: errorMessage,
3150
- errorName,
3151
- ...causeDetails.length > 0 && {
3152
- causeChain: causeDetails.join(" -> ")
3153
- }
3154
- }).slice(0, 2e3),
3155
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3156
- isComplete: true
3157
- },
3158
- traceContext.tracePushUrl,
3159
- traceContext.routeHeader,
3160
- traceContext.authToken
3167
+ console.log(
3168
+ `[executeWithOpenCode] Process exited with code ${code}, ${allEvents.length} events collected`
3161
3169
  );
3162
- }
3163
- throw new Error(
3164
- `OpenCode SDK execution failed: ${errorMessage}` + causeChain + (errorStack ? `
3165
- Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
3166
- );
3167
- } finally {
3168
- if (server) {
3169
- try {
3170
- server.close();
3171
- console.log("[SDK-DEBUG] OpenCode server closed");
3172
- } catch {
3170
+ if (code === 0) {
3171
+ finalize(true);
3172
+ } else {
3173
+ finalize(
3174
+ false,
3175
+ new Error(
3176
+ `OpenCode CLI exited with code ${code}.
3177
+ Stderr: ${stderr.slice(0, 1e3)}`
3178
+ )
3179
+ );
3173
3180
  }
3174
- }
3175
- }
3181
+ });
3182
+ child.on("error", (error) => {
3183
+ finalize(false, new Error(`OpenCode CLI spawn error: ${error.message}`));
3184
+ });
3185
+ });
3176
3186
  }
3177
3187
 
3178
3188
  // src/run-scenario/agents/opencode/opencode-adapter.ts
@@ -3403,7 +3413,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
3403
3413
  }
3404
3414
 
3405
3415
  // src/run-scenario/agents/simple-agent/build-conversation.ts
3406
- function buildConversation3(triggerPrompt, steps, executionStartMs) {
3416
+ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
3407
3417
  const messages = [];
3408
3418
  messages.push({
3409
3419
  role: "user",
@@ -3412,11 +3422,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3412
3422
  });
3413
3423
  for (let i = 0; i < steps.length; i++) {
3414
3424
  const step = steps[i];
3415
- const stepTimestamp = estimateStepTimestamp(
3416
- executionStartMs,
3417
- i,
3418
- steps.length
3419
- );
3425
+ const stepTimestamp = new Date(
3426
+ stepTimestamps[i] ?? executionStartMs
3427
+ ).toISOString();
3420
3428
  const assistantContent = [];
3421
3429
  if (step.reasoningText) {
3422
3430
  assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3459,10 +3467,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3459
3467
  }
3460
3468
  return messages;
3461
3469
  }
3462
- function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
3463
- const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
3464
- return new Date(startMs + Math.round(offset * 1e3)).toISOString();
3465
- }
3466
3470
 
3467
3471
  // src/run-scenario/agents/simple-agent/execute.ts
3468
3472
  var PROVIDER_ANTHROPIC2 = "anthropic";
@@ -3547,6 +3551,7 @@ async function executeWithAiSdk(context) {
3547
3551
  }
3548
3552
  }
3549
3553
  };
3554
+ const stepTimestamps = [];
3550
3555
  const result = await generateText({
3551
3556
  model,
3552
3557
  system: systemPrompt,
@@ -3555,7 +3560,34 @@ async function executeWithAiSdk(context) {
3555
3560
  maxOutputTokens: modelConfig.maxTokens,
3556
3561
  tools: mcpTools,
3557
3562
  stopWhen: mcpTools ? stepCountIs(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : stepCountIs(1),
3558
- providerOptions: providerOpts
3563
+ providerOptions: providerOpts,
3564
+ onStepFinish: (step) => {
3565
+ stepTimestamps.push(Date.now());
3566
+ if (traceContext) {
3567
+ const isToolStep = step.toolCalls.length > 0;
3568
+ const firstToolCall = step.toolCalls[0];
3569
+ emitTraceEvent(
3570
+ {
3571
+ evalRunId: traceContext.evalRunId,
3572
+ scenarioId: traceContext.scenarioId,
3573
+ scenarioName: traceContext.scenarioName,
3574
+ targetId: traceContext.targetId,
3575
+ targetName: traceContext.targetName,
3576
+ stepNumber: stepTimestamps.length,
3577
+ type: isToolStep ? LiveTraceEventType3.TOOL_USE : LiveTraceEventType3.COMPLETION,
3578
+ toolName: firstToolCall?.toolName,
3579
+ toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3580
+ outputPreview: step.text?.slice(0, 500),
3581
+ elapsedMs: Date.now() - startTime,
3582
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3583
+ isComplete: false
3584
+ },
3585
+ traceContext.tracePushUrl,
3586
+ traceContext.routeHeader,
3587
+ traceContext.authToken
3588
+ );
3589
+ }
3590
+ }
3559
3591
  });
3560
3592
  const durationMs = Date.now() - startTime;
3561
3593
  const usage = {
@@ -3569,16 +3601,17 @@ async function executeWithAiSdk(context) {
3569
3601
  usage,
3570
3602
  modelConfig.model,
3571
3603
  provider,
3572
- startTime
3604
+ startTime,
3605
+ stepTimestamps
3573
3606
  );
3574
3607
  if (traceContext) {
3575
- emitStepEvents(traceContext, result.steps, startTime);
3576
- emitCompletionEvent(traceContext, result.steps.length + 1);
3608
+ emitCompletionEvent(traceContext, stepTimestamps.length + 1);
3577
3609
  }
3578
3610
  const conversation = buildConversation3(
3579
3611
  scenario.triggerPrompt,
3580
3612
  result.steps,
3581
- startTime
3613
+ startTime,
3614
+ stepTimestamps
3582
3615
  );
3583
3616
  return {
3584
3617
  outputText: result.text,
@@ -3619,20 +3652,16 @@ function findToolResultError(step) {
3619
3652
  }
3620
3653
  return null;
3621
3654
  }
3622
- function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
3623
- const totalStepTokens = steps.reduce(
3624
- (sum, s) => sum + (s.usage.totalTokens ?? 0),
3625
- 0
3626
- );
3655
+ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
3627
3656
  const traceSteps = steps.map((step, i) => {
3628
- const stepTokens = step.usage.totalTokens ?? 0;
3629
- const proportion = totalStepTokens > 0 ? stepTokens / totalStepTokens : 0;
3630
- const stepDurationMs = Math.round(totalDurationMs * proportion);
3657
+ const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
3658
+ const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
3659
+ const stepDurationMs = stepFinishedAt - stepStartedAt;
3631
3660
  const firstToolCall = step.toolCalls[0];
3632
3661
  const tokenUsage = {
3633
3662
  prompt: step.usage.inputTokens ?? 0,
3634
3663
  completion: step.usage.outputTokens ?? 0,
3635
- total: stepTokens
3664
+ total: step.usage.totalTokens ?? 0
3636
3665
  };
3637
3666
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
3638
3667
  const toolResultError = findToolResultError(step);
@@ -3643,9 +3672,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
3643
3672
  type: step.toolCalls.length > 0 ? LLMStepType3.TOOL_USE : LLMStepType3.COMPLETION,
3644
3673
  model: modelId,
3645
3674
  provider,
3646
- startedAt: new Date(
3647
- executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
3648
- ).toISOString(),
3675
+ startedAt: new Date(stepStartedAt).toISOString(),
3649
3676
  durationMs: stepDurationMs,
3650
3677
  tokenUsage,
3651
3678
  costUsd,
@@ -3703,33 +3730,6 @@ function emitStartEvent(traceContext, startTime) {
3703
3730
  traceContext.authToken
3704
3731
  );
3705
3732
  }
3706
- function emitStepEvents(traceContext, steps, startTime) {
3707
- for (let i = 0; i < steps.length; i++) {
3708
- const step = steps[i];
3709
- const isToolStep = step.toolCalls.length > 0;
3710
- const firstToolCall = step.toolCalls[0];
3711
- emitTraceEvent(
3712
- {
3713
- evalRunId: traceContext.evalRunId,
3714
- scenarioId: traceContext.scenarioId,
3715
- scenarioName: traceContext.scenarioName,
3716
- targetId: traceContext.targetId,
3717
- targetName: traceContext.targetName,
3718
- stepNumber: i + 1,
3719
- type: isToolStep ? LiveTraceEventType3.TOOL_USE : LiveTraceEventType3.COMPLETION,
3720
- toolName: firstToolCall?.toolName,
3721
- toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3722
- outputPreview: step.text?.slice(0, 500),
3723
- elapsedMs: Date.now() - startTime,
3724
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3725
- isComplete: false
3726
- },
3727
- traceContext.tracePushUrl,
3728
- traceContext.routeHeader,
3729
- traceContext.authToken
3730
- );
3731
- }
3732
- }
3733
3733
  function emitCompletionEvent(traceContext, stepNumber) {
3734
3734
  emitTraceEvent(
3735
3735
  {
@@ -3766,7 +3766,7 @@ defaultRegistry.register(simpleAgentAdapter);
3766
3766
 
3767
3767
  // src/run-scenario/file-diff.ts
3768
3768
  import { readdirSync, readFileSync as readFileSync2, statSync, existsSync as existsSync2 } from "fs";
3769
- import { join as join9, relative } from "path";
3769
+ import { join as join10, relative } from "path";
3770
3770
 
3771
3771
  // ../../node_modules/diff/lib/index.mjs
3772
3772
  function Diff() {
@@ -3942,7 +3942,7 @@ Diff.prototype = {
3942
3942
  tokenize: function tokenize(value) {
3943
3943
  return Array.from(value);
3944
3944
  },
3945
- join: function join8(chars) {
3945
+ join: function join9(chars) {
3946
3946
  return chars.join("");
3947
3947
  },
3948
3948
  postProcess: function postProcess(changeObjects) {
@@ -4391,7 +4391,7 @@ function snapshotDirectory(dir, baseDir) {
4391
4391
  }
4392
4392
  const entries = readdirSync(dir, { withFileTypes: true });
4393
4393
  for (const entry of entries) {
4394
- const fullPath = join9(dir, entry.name);
4394
+ const fullPath = join10(dir, entry.name);
4395
4395
  const relativePath = relative(base, fullPath);
4396
4396
  if (shouldIgnore(entry.name)) {
4397
4397
  continue;