@wix/evalforge-evaluator 0.118.0 → 0.120.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -55,6 +55,7 @@ function loadConfig() {
55
55
  aiGatewayHeaders[key] = value;
56
56
  }
57
57
  }
58
+ aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
58
59
  const tracePushUrl = process.env.TRACE_PUSH_URL;
59
60
  const routeHeader = process.env.EVAL_ROUTE_HEADER;
60
61
  const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -1200,10 +1201,10 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1200
1201
  };
1201
1202
  }
1202
1203
  async function prepareClaudeCodeEnvironment(cwd, skills, options) {
1203
- const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1204
+ const { mkdir: mkdirAsync, writeFile: writeFile7 } = await import("fs/promises");
1204
1205
  const claudeDir = `${cwd}/.claude`;
1205
1206
  await mkdirAsync(claudeDir, { recursive: true });
1206
- await writeFile6(`${claudeDir}/settings.json`, "{}", {
1207
+ await writeFile7(`${claudeDir}/settings.json`, "{}", {
1207
1208
  flag: "wx"
1208
1209
  }).catch(() => {
1209
1210
  });
@@ -2162,9 +2163,18 @@ defaultRegistry.register(claudeCodeAdapter);
2162
2163
  var import_evalforge_types9 = require("@wix/evalforge-types");
2163
2164
 
2164
2165
  // src/run-scenario/agents/opencode/execute.ts
2165
- var import_os3 = require("os");
2166
+ var import_child_process = require("child_process");
2166
2167
  var import_evalforge_types8 = require("@wix/evalforge-types");
2167
2168
 
2169
+ // src/run-scenario/agents/opencode/types.ts
2170
+ function tryParseJson(text) {
2171
+ try {
2172
+ return JSON.parse(text);
2173
+ } catch {
2174
+ return null;
2175
+ }
2176
+ }
2177
+
2168
2178
  // src/run-scenario/agents/opencode/write-skills.ts
2169
2179
  var import_promises7 = require("fs/promises");
2170
2180
  var import_path8 = require("path");
@@ -2260,6 +2270,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
2260
2270
  }
2261
2271
 
2262
2272
  // src/run-scenario/agents/opencode/config.ts
2273
+ var import_os3 = require("os");
2263
2274
  var import_evalforge_types6 = require("@wix/evalforge-types");
2264
2275
  var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2265
2276
  function parseModel(model) {
@@ -2312,7 +2323,14 @@ function toOpenCodeMcpConfig(servers) {
2312
2323
  }
2313
2324
  return result;
2314
2325
  }
2315
- async function buildOpenCodeConfig(options) {
2326
+ function ensureOpenCodeInPath(currentPath) {
2327
+ const opencodeBin = `${(0, import_os3.homedir)()}/.opencode/bin`;
2328
+ if (currentPath.includes(opencodeBin)) {
2329
+ return currentPath;
2330
+ }
2331
+ return `${opencodeBin}:${currentPath}`;
2332
+ }
2333
+ async function buildOpenCodeEnv(options) {
2316
2334
  const modelStr = options.model || DEFAULT_MODEL2;
2317
2335
  const { providerID, modelID } = parseModel(modelStr);
2318
2336
  const provider = {};
@@ -2324,9 +2342,7 @@ async function buildOpenCodeConfig(options) {
2324
2342
  if (options.aiGatewayHeaders) {
2325
2343
  providerOptions.headers = { ...options.aiGatewayHeaders };
2326
2344
  }
2327
- provider[providerID] = {
2328
- options: providerOptions
2329
- };
2345
+ provider[providerID] = { options: providerOptions };
2330
2346
  }
2331
2347
  let mcp;
2332
2348
  if (options.mcps && options.mcps.length > 0) {
@@ -2367,70 +2383,81 @@ async function buildOpenCodeConfig(options) {
2367
2383
  },
2368
2384
  ...mcp ? { mcp } : {}
2369
2385
  };
2370
- return { config, providerID, modelID };
2386
+ const env = {
2387
+ ...process.env,
2388
+ PATH: ensureOpenCodeInPath(process.env.PATH || ""),
2389
+ OPENCODE_CONFIG_CONTENT: JSON.stringify(config),
2390
+ OPENCODE_DISABLE_LSP_DOWNLOAD: "true"
2391
+ };
2392
+ return { env, providerID, modelID };
2371
2393
  }
2372
2394
 
2373
2395
  // src/run-scenario/agents/opencode/build-trace.ts
2374
2396
  var import_evalforge_types7 = require("@wix/evalforge-types");
2375
2397
  var import_crypto2 = require("crypto");
2376
- function buildLLMTrace(messages, totalDurationMs, model, provider) {
2377
- const assistantMessages = messages.filter(
2378
- (m) => m.info.role === "assistant"
2379
- );
2380
- const allSteps = assistantMessages.flatMap((msg, turnIndex) => {
2381
- const { info, parts } = msg;
2382
- let text = "";
2383
- let thinking = "";
2384
- const toolCalls = [];
2385
- let stepInputTokens = 0;
2386
- let stepOutputTokens = 0;
2387
- let stepCost = 0;
2388
- let finishReason = "unknown";
2389
- for (const part of parts) {
2390
- switch (part.type) {
2391
- case "text": {
2392
- const textPart = part;
2393
- text += textPart.text;
2394
- break;
2395
- }
2396
- case "reasoning": {
2397
- const reasoningPart = part;
2398
- thinking += reasoningPart.text;
2399
- break;
2400
- }
2401
- case "tool": {
2402
- const toolPart = part;
2403
- toolCalls.push({
2404
- toolName: toolPart.tool,
2405
- args: toolPart.state.input
2406
- });
2407
- break;
2408
- }
2409
- case "step-finish": {
2410
- const sf = part;
2411
- stepInputTokens += sf.tokens.input;
2412
- stepOutputTokens += sf.tokens.output;
2413
- stepCost += sf.cost;
2414
- finishReason = sf.reason;
2415
- break;
2416
- }
2398
+ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, executionStartTime) {
2399
+ const turns = [];
2400
+ let current = {
2401
+ textParts: [],
2402
+ reasoningParts: [],
2403
+ toolCalls: []
2404
+ };
2405
+ for (const { event: evt, receivedAt } of timestampedEvents) {
2406
+ switch (evt.type) {
2407
+ case "text":
2408
+ current.textParts.push(evt.part.text);
2409
+ break;
2410
+ case "reasoning":
2411
+ current.reasoningParts.push(evt.part.text);
2412
+ break;
2413
+ case "tool_use": {
2414
+ const tu = evt;
2415
+ current.toolCalls.push({
2416
+ toolName: tu.part.tool,
2417
+ args: tu.part.state.input
2418
+ });
2419
+ break;
2420
+ }
2421
+ case "step_finish": {
2422
+ const sf = evt;
2423
+ current.stepFinish = sf.part;
2424
+ current.receivedAt = receivedAt;
2425
+ turns.push(current);
2426
+ current = {
2427
+ textParts: [],
2428
+ reasoningParts: [],
2429
+ toolCalls: []
2430
+ };
2431
+ break;
2417
2432
  }
2418
2433
  }
2419
- if (stepInputTokens === 0 && stepOutputTokens === 0) {
2420
- stepInputTokens = info.tokens.input;
2421
- stepOutputTokens = info.tokens.output;
2422
- stepCost = info.cost;
2423
- }
2424
- const startedAt = new Date(info.time.created).toISOString();
2425
- const completedAt = info.time.completed ? info.time.completed : turnIndex + 1 < assistantMessages.length ? assistantMessages[turnIndex + 1].info.time.created : info.time.created + totalDurationMs;
2426
- const durationMs = Math.max(0, completedAt - info.time.created);
2427
- const isSuccess = finishReason !== "error";
2428
- const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2429
- const stepModel = info.modelID || model;
2430
- const stepProvider = info.providerID || provider;
2431
- const toolCallCount = toolCalls.length;
2434
+ }
2435
+ if (current.textParts.length > 0 || current.reasoningParts.length > 0 || current.toolCalls.length > 0) {
2436
+ if (timestampedEvents.length > 0) {
2437
+ current.receivedAt = timestampedEvents[timestampedEvents.length - 1].receivedAt;
2438
+ }
2439
+ turns.push(current);
2440
+ }
2441
+ const executionStartMs = executionStartTime.getTime();
2442
+ const allSteps = turns.flatMap((turn, turnIndex) => {
2443
+ const sf = turn.stepFinish;
2444
+ const stepInputTokens = sf?.tokens.input ?? 0;
2445
+ const stepOutputTokens = sf?.tokens.output ?? 0;
2446
+ const stepCost = sf?.cost ?? 0;
2447
+ const finishReason = sf?.reason ?? "unknown";
2448
+ const stepModel = sf?.modelID || model;
2449
+ const stepProvider = sf?.providerID || provider;
2450
+ const turnEndMs = turn.receivedAt ?? executionStartMs + totalDurationMs;
2451
+ const prevEndMs = turnIndex > 0 ? turns[turnIndex - 1].receivedAt ?? executionStartMs : executionStartMs;
2452
+ const durationMs = Math.max(0, turnEndMs - prevEndMs);
2453
+ const startedAt = new Date(prevEndMs).toISOString();
2454
+ const text = turn.textParts.join("");
2455
+ const thinking = turn.reasoningParts.join("");
2456
+ const toolCallCount = turn.toolCalls.length;
2432
2457
  const hasThinking = !!thinking;
2433
2458
  const hasText = !!text;
2459
+ const isSuccess = finishReason !== "error";
2460
+ const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2434
2461
  const subSteps = [];
2435
2462
  const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
2436
2463
  const toolSubSteps = toolCallCount;
@@ -2440,7 +2467,6 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2440
2467
  subSteps.push({
2441
2468
  id: (0, import_crypto2.randomUUID)(),
2442
2469
  stepNumber: 0,
2443
- // renumbered below
2444
2470
  turnIndex,
2445
2471
  type: import_evalforge_types7.LLMStepType.THINKING,
2446
2472
  model: stepModel,
@@ -2462,7 +2488,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2462
2488
  }
2463
2489
  if (toolCallCount > 0) {
2464
2490
  for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
2465
- const tc = toolCalls[tcIdx];
2491
+ const tc = turn.toolCalls[tcIdx];
2466
2492
  const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
2467
2493
  const toolBudgetSteps = toolSubSteps + textSubSteps;
2468
2494
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
@@ -2541,11 +2567,21 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2541
2567
  }
2542
2568
  return subSteps;
2543
2569
  }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
2544
- const totalTokens = buildTotalTokens(assistantMessages);
2545
- const totalCost = assistantMessages.reduce((sum, m) => {
2546
- const aMsg = m.info;
2547
- return sum + aMsg.cost;
2548
- }, 0);
2570
+ let totalPrompt = 0;
2571
+ let totalCompletion = 0;
2572
+ let totalCost = 0;
2573
+ for (const turn of turns) {
2574
+ if (turn.stepFinish) {
2575
+ totalPrompt += turn.stepFinish.tokens.input;
2576
+ totalCompletion += turn.stepFinish.tokens.output;
2577
+ totalCost += turn.stepFinish.cost;
2578
+ }
2579
+ }
2580
+ const totalTokens = {
2581
+ prompt: totalPrompt,
2582
+ completion: totalCompletion,
2583
+ total: totalPrompt + totalCompletion
2584
+ };
2549
2585
  const stepTypeBreakdown = {};
2550
2586
  for (const step of allSteps) {
2551
2587
  const entry = stepTypeBreakdown[step.type] ?? {
@@ -2563,7 +2599,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2563
2599
  const modelUsed = allSteps[0]?.model || model;
2564
2600
  const summary = {
2565
2601
  totalSteps: allSteps.length,
2566
- totalTurns: assistantMessages.length,
2602
+ totalTurns: turns.length,
2567
2603
  totalDurationMs,
2568
2604
  totalTokens,
2569
2605
  totalCostUsd: totalCost,
@@ -2584,116 +2620,100 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2584
2620
  summary
2585
2621
  };
2586
2622
  }
2587
- function buildTotalTokens(assistantMessages) {
2588
- let prompt = 0;
2589
- let completion = 0;
2590
- for (const { info } of assistantMessages) {
2591
- prompt += info.tokens.input;
2592
- completion += info.tokens.output;
2593
- }
2594
- return { prompt, completion, total: prompt + completion };
2595
- }
2596
2623
 
2597
2624
  // src/run-scenario/agents/opencode/build-conversation.ts
2598
- function buildConversation2(messages) {
2625
+ function buildConversation2(timestampedEvents) {
2599
2626
  const result = [];
2600
- for (const { info, parts } of messages) {
2601
- const timestamp = new Date(info.time.created).toISOString();
2602
- if (info.role === "assistant") {
2603
- const content = [];
2604
- for (const part of parts) {
2605
- switch (part.type) {
2606
- case "text": {
2607
- const textPart = part;
2608
- content.push({ type: "text", text: textPart.text });
2609
- break;
2610
- }
2611
- case "reasoning": {
2612
- const reasoningPart = part;
2613
- content.push({ type: "thinking", thinking: reasoningPart.text });
2614
- break;
2615
- }
2616
- case "tool": {
2617
- const toolPart = part;
2618
- content.push({
2619
- type: "tool_use",
2620
- toolName: toolPart.tool,
2621
- toolId: toolPart.callID,
2622
- input: toolPart.state.input
2623
- });
2624
- break;
2625
- }
2626
- }
2627
+ let assistantContent = [];
2628
+ let userContent = [];
2629
+ let latestReceivedAt = 0;
2630
+ const flushAssistant = () => {
2631
+ if (assistantContent.length > 0) {
2632
+ const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
2633
+ result.push({ role: "assistant", content: assistantContent, timestamp });
2634
+ assistantContent = [];
2635
+ }
2636
+ };
2637
+ const flushUser = () => {
2638
+ if (userContent.length > 0) {
2639
+ const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
2640
+ result.push({ role: "user", content: userContent, timestamp });
2641
+ userContent = [];
2642
+ }
2643
+ };
2644
+ for (const { event: evt, receivedAt } of timestampedEvents) {
2645
+ latestReceivedAt = receivedAt;
2646
+ switch (evt.type) {
2647
+ case "text": {
2648
+ const te = evt;
2649
+ assistantContent.push({ type: "text", text: te.part.text });
2650
+ break;
2627
2651
  }
2628
- if (content.length > 0) {
2629
- result.push({ role: "assistant", content, timestamp });
2652
+ case "reasoning": {
2653
+ const re = evt;
2654
+ assistantContent.push({ type: "thinking", thinking: re.part.text });
2655
+ break;
2630
2656
  }
2631
- } else if (info.role === "user") {
2632
- const content = [];
2633
- for (const part of parts) {
2634
- if (part.type === "text") {
2635
- const textPart = part;
2636
- content.push({ type: "text", text: textPart.text });
2637
- } else if (part.type === "tool") {
2638
- const toolPart = part;
2639
- const state = toolPart.state;
2640
- if (state.status === "completed") {
2641
- const completed = state;
2642
- content.push({
2643
- type: "tool_result",
2644
- toolUseId: toolPart.callID,
2645
- content: completed.output
2646
- });
2647
- } else if (state.status === "error") {
2648
- const errState = state;
2649
- content.push({
2650
- type: "tool_result",
2651
- toolUseId: toolPart.callID,
2652
- content: errState.error,
2653
- isError: true
2654
- });
2655
- }
2657
+ case "tool_use": {
2658
+ const tu = evt;
2659
+ assistantContent.push({
2660
+ type: "tool_use",
2661
+ toolName: tu.part.tool,
2662
+ toolId: tu.part.callID,
2663
+ input: tu.part.state.input
2664
+ });
2665
+ if (tu.part.state.status === "completed" || tu.part.state.status === "error") {
2666
+ flushAssistant();
2667
+ const isError = tu.part.state.status === "error";
2668
+ const content = isError ? tu.part.state.error || "Tool execution failed" : tu.part.state.output || "";
2669
+ userContent.push({
2670
+ type: "tool_result",
2671
+ toolUseId: tu.part.callID,
2672
+ content,
2673
+ ...isError ? { isError: true } : {}
2674
+ });
2675
+ flushUser();
2656
2676
  }
2677
+ break;
2657
2678
  }
2658
- if (content.length > 0) {
2659
- result.push({ role: "user", content, timestamp });
2679
+ case "step_finish": {
2680
+ flushAssistant();
2681
+ flushUser();
2682
+ break;
2660
2683
  }
2661
2684
  }
2662
2685
  }
2686
+ flushAssistant();
2687
+ flushUser();
2663
2688
  return result;
2664
2689
  }
2665
2690
 
2666
2691
  // src/run-scenario/agents/opencode/execute.ts
2667
- var DEFAULT_MODEL3 = `anthropic/${import_evalforge_types8.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2668
- function ensureOpenCodeInPath() {
2669
- const opencodeBin = `${(0, import_os3.homedir)()}/.opencode/bin`;
2670
- const currentPath = process.env.PATH || "";
2671
- if (!currentPath.includes(opencodeBin)) {
2672
- process.env.PATH = `${opencodeBin}:${currentPath}`;
2673
- }
2674
- }
2692
+ var import_promises9 = require("fs/promises");
2693
+ var import_path10 = require("path");
2694
+ var KILL_GRACE_PERIOD_MS = 5e3;
2695
+ var IDLE_TIMEOUT_MS = 12e4;
2696
+ var IDLE_CHECK_INTERVAL_MS = 15e3;
2675
2697
  function extractToolAction(toolName, args) {
2676
2698
  if (!toolName) return "Using tool...";
2677
- const a = args;
2678
- if ((toolName === "Task" || toolName === "dispatch_agent") && a?.description) {
2679
- const desc = String(a.description).slice(0, 55);
2680
- return `Task: ${desc}${String(a.description).length > 55 ? "..." : ""}`;
2681
- }
2682
- if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && a?.command) {
2683
- const cmd = String(a.command).slice(0, 50);
2684
- return `Running: ${cmd}${String(a.command).length > 50 ? "..." : ""}`;
2685
- }
2686
- if (a?.file_path || a?.path || a?.target_file) {
2687
- const filePath = String(a.file_path || a.path || a.target_file).slice(
2688
- 0,
2689
- 50
2690
- );
2699
+ if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
2700
+ const desc = String(args.description).slice(0, 55);
2701
+ return `Task: ${desc}${String(args.description).length > 55 ? "..." : ""}`;
2702
+ }
2703
+ if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && args?.command) {
2704
+ const cmd = String(args.command).slice(0, 50);
2705
+ return `Running: ${cmd}${String(args.command).length > 50 ? "..." : ""}`;
2706
+ }
2707
+ if (args?.file_path || args?.path || args?.target_file) {
2708
+ const filePath = String(
2709
+ args.file_path || args.path || args.target_file
2710
+ ).slice(0, 50);
2691
2711
  if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
2692
2712
  if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
2693
2713
  }
2694
2714
  return `Using ${toolName}...`;
2695
2715
  }
2696
- function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2716
+ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
2697
2717
  const base = {
2698
2718
  evalRunId: context.evalRunId,
2699
2719
  scenarioId: context.scenarioId,
@@ -2704,42 +2724,41 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2704
2724
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2705
2725
  isComplete
2706
2726
  };
2707
- switch (part.type) {
2727
+ switch (evt.type) {
2708
2728
  case "text": {
2709
- const textPart = part;
2729
+ const te = evt;
2710
2730
  return {
2711
2731
  ...base,
2712
2732
  type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
2713
- outputPreview: textPart.text.slice(0, 500)
2733
+ outputPreview: te.part.text.slice(0, 500)
2714
2734
  };
2715
2735
  }
2716
- case "reasoning": {
2717
- const reasoningPart = part;
2736
+ case "reasoning":
2718
2737
  return {
2719
2738
  ...base,
2720
2739
  type: import_evalforge_types8.LiveTraceEventType.THINKING,
2721
- thinking: reasoningPart.text.slice(0, 500)
2740
+ thinking: evt.part.text.slice(0, 500)
2722
2741
  };
2723
- }
2724
- case "tool": {
2725
- const toolPart = part;
2726
- const toolName = toolPart.tool;
2727
- const args = toolPart.state.input;
2742
+ case "tool_use": {
2743
+ const tu = evt;
2744
+ const toolName = tu.part.tool;
2745
+ const args = tu.part.state.input;
2728
2746
  const toolArgs = JSON.stringify(args).slice(0, 500);
2729
2747
  let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
2730
2748
  let filePath;
2731
- const a = args;
2732
- if (a.file_path || a.path || a.target_file) {
2733
- filePath = String(a.file_path || a.path || a.target_file);
2734
- if (/write|edit/i.test(toolName)) {
2735
- type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
2736
- } else if (/read|view/i.test(toolName)) {
2737
- type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
2749
+ if (args) {
2750
+ if (args.file_path || args.path || args.target_file) {
2751
+ filePath = String(args.file_path || args.path || args.target_file);
2752
+ if (/write|edit/i.test(toolName)) {
2753
+ type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
2754
+ } else if (/read|view/i.test(toolName)) {
2755
+ type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
2756
+ }
2738
2757
  }
2739
2758
  }
2740
2759
  return { ...base, type, toolName, toolArgs, filePath };
2741
2760
  }
2742
- case "step-finish":
2761
+ case "step_finish":
2743
2762
  return {
2744
2763
  ...base,
2745
2764
  type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
@@ -2769,6 +2788,37 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2769
2788
  );
2770
2789
  }
2771
2790
  }
2791
+ async function writeSystemPromptRule(cwd, systemPrompt) {
2792
+ const rulesDir = (0, import_path10.join)(cwd, ".opencode", "rules");
2793
+ await (0, import_promises9.mkdir)(rulesDir, { recursive: true });
2794
+ await (0, import_promises9.writeFile)(
2795
+ (0, import_path10.join)(rulesDir, "evalforge-system-prompt.md"),
2796
+ systemPrompt,
2797
+ "utf-8"
2798
+ );
2799
+ }
2800
+ function killProcess(child, resolved) {
2801
+ if (!child) return;
2802
+ const killSignal = (signal) => {
2803
+ if (child.pid) {
2804
+ try {
2805
+ process.kill(-child.pid, signal);
2806
+ console.log(
2807
+ `[OpenCode] Sent ${signal} to process group (pid: -${child.pid})`
2808
+ );
2809
+ return;
2810
+ } catch {
2811
+ }
2812
+ }
2813
+ child.kill(signal);
2814
+ };
2815
+ killSignal("SIGTERM");
2816
+ setTimeout(() => {
2817
+ if (child && !resolved) {
2818
+ killSignal("SIGKILL");
2819
+ }
2820
+ }, KILL_GRACE_PERIOD_MS);
2821
+ }
2772
2822
  async function executeWithOpenCode(skills, scenario, options) {
2773
2823
  const skillNames = skills.map((s) => s.name).join(", ");
2774
2824
  console.log("[executeWithOpenCode] Starting execution", {
@@ -2783,7 +2833,8 @@ async function executeWithOpenCode(skills, scenario, options) {
2783
2833
  });
2784
2834
  const startTime = /* @__PURE__ */ new Date();
2785
2835
  const maxTurns = options.maxTurns ?? 10;
2786
- const { config, providerID, modelID } = await buildOpenCodeConfig({
2836
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2837
+ const { env, providerID, modelID } = await buildOpenCodeEnv({
2787
2838
  model: options.model,
2788
2839
  temperature: options.temperature,
2789
2840
  maxTurns,
@@ -2792,12 +2843,6 @@ async function executeWithOpenCode(skills, scenario, options) {
2792
2843
  mcps: options.mcps,
2793
2844
  cwd: options.cwd
2794
2845
  });
2795
- const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
2796
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2797
- const abortController = new AbortController();
2798
- let timeoutHandle;
2799
- let heartbeatHandle;
2800
- let timedOut = false;
2801
2846
  const traceContext = options.traceContext;
2802
2847
  let traceStepNumber = 0;
2803
2848
  let lastAction = "Starting...";
@@ -2814,7 +2859,7 @@ async function executeWithOpenCode(skills, scenario, options) {
2814
2859
  stepNumber: 0,
2815
2860
  type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2816
2861
  outputPreview: JSON.stringify({
2817
- event: "pre-sdk-execution",
2862
+ event: "pre-cli-execution",
2818
2863
  model: `${providerID}/${modelID}`,
2819
2864
  maxTurns,
2820
2865
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
@@ -2827,105 +2872,200 @@ async function executeWithOpenCode(skills, scenario, options) {
2827
2872
  traceContext.authToken
2828
2873
  );
2829
2874
  }
2830
- let server;
2831
- try {
2832
- ensureOpenCodeInPath();
2833
- console.log("[SDK-DEBUG] Starting OpenCode server...");
2834
- server = await createOpencodeServer({
2835
- config,
2836
- signal: abortController.signal,
2837
- timeout: 3e4
2838
- });
2839
- console.log(`[SDK-DEBUG] Server started at ${server.url}`);
2840
- const client = createOpencodeClient({
2841
- baseUrl: server.url,
2842
- directory: options.cwd
2843
- });
2844
- const session = await client.session.create({
2845
- body: { title: `eval-${scenario.name}` }
2846
- });
2847
- if (!session.data) {
2848
- const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
2849
- throw new Error(
2850
- `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
2875
+ let systemPrompt;
2876
+ if (options.systemPrompt === null || options.systemPrompt === "") {
2877
+ } else if (options.systemPrompt != null) {
2878
+ systemPrompt = options.systemPrompt;
2879
+ } else {
2880
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
2881
+ }
2882
+ if (systemPrompt) {
2883
+ await writeSystemPromptRule(options.cwd, systemPrompt);
2884
+ }
2885
+ const args = [
2886
+ "run",
2887
+ "--format",
2888
+ "json",
2889
+ "--thinking",
2890
+ "--variant",
2891
+ "high",
2892
+ "--model",
2893
+ `${providerID}/${modelID}`,
2894
+ "--dir",
2895
+ options.cwd,
2896
+ // NOTE: Trigger prompt is passed as a positional CLI arg. On Linux a single
2897
+ // arg is capped at 128 KB (MAX_ARG_STRLEN); on macOS the combined args+env
2898
+ // share a ~1 MB limit. Prompts exceeding this would fail with E2BIG.
2899
+ // In practice eval prompts are well under this limit.
2900
+ scenario.triggerPrompt
2901
+ ];
2902
+ console.log("[executeWithOpenCode] Spawning: opencode", args.slice(0, 5));
2903
+ return new Promise((resolve2, reject) => {
2904
+ let resolved = false;
2905
+ let stderr = "";
2906
+ let lineBuffer = "";
2907
+ let lastOutputTime = Date.now();
2908
+ const allEvents = [];
2909
+ const timers = {};
2910
+ const cleanup = () => {
2911
+ if (timers.timeout) clearTimeout(timers.timeout);
2912
+ if (timers.idleCheck) clearInterval(timers.idleCheck);
2913
+ if (timers.heartbeat) clearInterval(timers.heartbeat);
2914
+ };
2915
+ const finalize = (success, error) => {
2916
+ if (resolved) return;
2917
+ resolved = true;
2918
+ cleanup();
2919
+ if (!success) {
2920
+ if (traceContext) {
2921
+ emitTraceEvent(
2922
+ {
2923
+ evalRunId: traceContext.evalRunId,
2924
+ scenarioId: traceContext.scenarioId,
2925
+ scenarioName: traceContext.scenarioName,
2926
+ targetId: traceContext.targetId,
2927
+ targetName: traceContext.targetName,
2928
+ stepNumber: traceStepNumber + 1,
2929
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2930
+ outputPreview: JSON.stringify({
2931
+ event: "cli-execution-failed",
2932
+ error: error?.message ?? "Unknown error"
2933
+ }).slice(0, 2e3),
2934
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2935
+ isComplete: true
2936
+ },
2937
+ traceContext.tracePushUrl,
2938
+ traceContext.routeHeader,
2939
+ traceContext.authToken
2940
+ );
2941
+ }
2942
+ reject(
2943
+ error ?? new Error(
2944
+ `OpenCode CLI execution failed (exit code unknown).
2945
+ Stderr: ${stderr.slice(0, 1e3)}`
2946
+ )
2947
+ );
2948
+ return;
2949
+ }
2950
+ const endTime = /* @__PURE__ */ new Date();
2951
+ const totalDurationMs = endTime.getTime() - startTime.getTime();
2952
+ let outputText = "";
2953
+ for (const { event: evt } of allEvents) {
2954
+ if (evt.type === "text") {
2955
+ outputText += evt.part.text;
2956
+ }
2957
+ }
2958
+ if (!outputText) {
2959
+ reject(
2960
+ new Error(
2961
+ `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
2962
+ )
2963
+ );
2964
+ return;
2965
+ }
2966
+ let inputTokens = 0;
2967
+ let outputTokens = 0;
2968
+ let costUsd = 0;
2969
+ for (const { event: evt } of allEvents) {
2970
+ if (evt.type === "step_finish") {
2971
+ const sf = evt;
2972
+ inputTokens += sf.part.tokens.input;
2973
+ outputTokens += sf.part.tokens.output;
2974
+ costUsd += sf.part.cost;
2975
+ }
2976
+ }
2977
+ if (traceContext) {
2978
+ emitTraceEvent(
2979
+ {
2980
+ evalRunId: traceContext.evalRunId,
2981
+ scenarioId: traceContext.scenarioId,
2982
+ scenarioName: traceContext.scenarioName,
2983
+ targetId: traceContext.targetId,
2984
+ targetName: traceContext.targetName,
2985
+ stepNumber: traceStepNumber + 1,
2986
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
2987
+ outputPreview: "Scenario execution completed",
2988
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2989
+ isComplete: true
2990
+ },
2991
+ traceContext.tracePushUrl,
2992
+ traceContext.routeHeader,
2993
+ traceContext.authToken
2994
+ );
2995
+ }
2996
+ const modelStr = options.model || `${providerID}/${modelID}`;
2997
+ const llmTrace = buildLLMTrace(
2998
+ allEvents,
2999
+ totalDurationMs,
3000
+ modelStr,
3001
+ providerID,
3002
+ startTime
2851
3003
  );
2852
- }
2853
- const sessionId = session.data.id;
2854
- console.log(`[SDK-DEBUG] Session created: ${sessionId}`);
2855
- let eventStreamAbort;
3004
+ const conversation = buildConversation2(allEvents);
3005
+ resolve2({
3006
+ result: {
3007
+ outputText,
3008
+ durationMs: totalDurationMs,
3009
+ usage: {
3010
+ inputTokens,
3011
+ outputTokens,
3012
+ totalTokens: inputTokens + outputTokens
3013
+ },
3014
+ costUsd
3015
+ },
3016
+ llmTrace,
3017
+ conversation
3018
+ });
3019
+ };
3020
+ let child;
3021
+ try {
3022
+ child = (0, import_child_process.spawn)("opencode", args, {
3023
+ cwd: options.cwd,
3024
+ env,
3025
+ stdio: ["ignore", "pipe", "pipe"],
3026
+ detached: true
3027
+ });
3028
+ } catch (spawnError) {
3029
+ reject(
3030
+ new Error(
3031
+ `Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
3032
+ )
3033
+ );
3034
+ return;
3035
+ }
3036
+ timers.timeout = setTimeout(() => {
3037
+ if (!resolved) {
3038
+ console.error(`[OpenCode] Process timed out after ${SDK_TIMEOUT_MS}ms`);
3039
+ killProcess(child, resolved);
3040
+ finalize(
3041
+ false,
3042
+ new Error(
3043
+ `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
3044
+ )
3045
+ );
3046
+ }
3047
+ }, SDK_TIMEOUT_MS);
3048
+ timers.idleCheck = setInterval(() => {
3049
+ if (resolved) return;
3050
+ const idleTime = Date.now() - lastOutputTime;
3051
+ if (idleTime >= IDLE_TIMEOUT_MS) {
3052
+ console.warn(
3053
+ `[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
3054
+ );
3055
+ killProcess(child, resolved);
3056
+ finalize(
3057
+ false,
3058
+ new Error(
3059
+ `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenario.name}`
3060
+ )
3061
+ );
3062
+ }
3063
+ }, IDLE_CHECK_INTERVAL_MS);
2856
3064
  if (traceContext) {
2857
- eventStreamAbort = new AbortController();
2858
3065
  const executionStartTime = Date.now();
2859
- (async () => {
2860
- try {
2861
- const events = await client.event.subscribe();
2862
- for await (const event of events.stream) {
2863
- if (eventStreamAbort.signal.aborted) break;
2864
- const evt = event;
2865
- if (evt.type === "message.part.updated") {
2866
- const { part } = evt.properties;
2867
- traceStepNumber++;
2868
- const traceEvent = createTraceEventFromPart(
2869
- part,
2870
- traceContext,
2871
- traceStepNumber,
2872
- false
2873
- );
2874
- if (traceEvent) {
2875
- lastToolName = traceEvent.toolName;
2876
- lastFilePath = traceEvent.filePath;
2877
- if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
2878
- lastAction = "Thinking...";
2879
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
2880
- lastAction = extractToolAction(
2881
- traceEvent.toolName ?? "",
2882
- void 0
2883
- );
2884
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
2885
- lastAction = `Writing: ${traceEvent.filePath || "file"}`;
2886
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
2887
- lastAction = `Reading: ${traceEvent.filePath || "file"}`;
2888
- } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
2889
- lastAction = "Processing response...";
2890
- }
2891
- emitTraceEvent(
2892
- traceEvent,
2893
- traceContext.tracePushUrl,
2894
- traceContext.routeHeader,
2895
- traceContext.authToken
2896
- );
2897
- }
2898
- } else if (evt.type === "session.error") {
2899
- const props = evt.properties;
2900
- traceStepNumber++;
2901
- emitTraceEvent(
2902
- {
2903
- evalRunId: traceContext.evalRunId,
2904
- scenarioId: traceContext.scenarioId,
2905
- scenarioName: traceContext.scenarioName,
2906
- targetId: traceContext.targetId,
2907
- targetName: traceContext.targetName,
2908
- stepNumber: traceStepNumber,
2909
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2910
- outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
2911
- 0,
2912
- 500
2913
- ),
2914
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2915
- isComplete: false
2916
- },
2917
- traceContext.tracePushUrl,
2918
- traceContext.routeHeader,
2919
- traceContext.authToken
2920
- );
2921
- }
2922
- }
2923
- } catch {
2924
- }
2925
- })();
2926
3066
  let lastReportedAction = "";
2927
3067
  let sameActionCount = 0;
2928
- heartbeatHandle = setInterval(() => {
3068
+ timers.heartbeat = setInterval(() => {
2929
3069
  const elapsedMs = Date.now() - executionStartTime;
2930
3070
  let progressMessage = lastAction;
2931
3071
  if (lastAction === lastReportedAction) {
@@ -2966,212 +3106,83 @@ async function executeWithOpenCode(skills, scenario, options) {
2966
3106
  );
2967
3107
  }, 1e4);
2968
3108
  }
2969
- const promptPromise = (async () => {
2970
- let systemPrompt;
2971
- if (options.systemPrompt === null || options.systemPrompt === "") {
2972
- } else if (options.systemPrompt != null) {
2973
- systemPrompt = options.systemPrompt;
2974
- } else {
2975
- systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
2976
- }
2977
- console.log("[SDK-DEBUG] Sending prompt...");
2978
- const result = await client.session.prompt({
2979
- path: { id: sessionId },
2980
- body: {
2981
- model: { providerID, modelID },
2982
- ...systemPrompt ? { system: systemPrompt } : {},
2983
- parts: [{ type: "text", text: scenario.triggerPrompt }]
2984
- }
2985
- });
2986
- return result;
2987
- })();
2988
- const timeoutPromise = new Promise((_, reject) => {
2989
- timeoutHandle = setTimeout(() => {
2990
- timedOut = true;
2991
- client.session.abort({ path: { id: sessionId } }).catch(() => {
2992
- });
2993
- reject(
2994
- new Error(
2995
- `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
2996
- )
2997
- );
2998
- }, SDK_TIMEOUT_MS);
2999
- });
3000
- const promptResult = await Promise.race([promptPromise, timeoutPromise]);
3001
- if (timeoutHandle) clearTimeout(timeoutHandle);
3002
- if (heartbeatHandle) clearInterval(heartbeatHandle);
3003
- if (eventStreamAbort) eventStreamAbort.abort();
3004
- if ("error" in promptResult && promptResult.error) {
3005
- const errPayload = promptResult.error;
3006
- throw new Error(
3007
- `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
3008
- );
3009
- }
3010
- console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
3011
- const messagesResponse = await client.session.messages({
3012
- path: { id: sessionId }
3013
- });
3014
- const allMessages = messagesResponse.data ?? [];
3015
- console.log(
3016
- `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
3017
- );
3018
- if (traceContext) {
3019
- emitTraceEvent(
3020
- {
3021
- evalRunId: traceContext.evalRunId,
3022
- scenarioId: traceContext.scenarioId,
3023
- scenarioName: traceContext.scenarioName,
3024
- targetId: traceContext.targetId,
3025
- targetName: traceContext.targetName,
3026
- stepNumber: traceStepNumber + 1,
3027
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
3028
- outputPreview: "Scenario execution completed",
3029
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3030
- isComplete: true
3031
- },
3032
- traceContext.tracePushUrl,
3033
- traceContext.routeHeader,
3034
- traceContext.authToken
3035
- );
3036
- }
3037
- const endTime = /* @__PURE__ */ new Date();
3038
- const totalDurationMs = endTime.getTime() - startTime.getTime();
3039
- const resultData = promptResult.data;
3040
- const lastAssistantInfo = resultData?.info;
3041
- if (lastAssistantInfo?.error) {
3042
- const err = lastAssistantInfo.error;
3043
- throw new Error(
3044
- `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
3045
- );
3046
- }
3047
- let outputText = "";
3048
- if (resultData?.parts) {
3049
- for (const part of resultData.parts) {
3050
- if (part.type === "text") {
3051
- outputText += part.text;
3052
- }
3053
- }
3054
- }
3055
- if (!outputText && allMessages.length > 0) {
3056
- for (let i = allMessages.length - 1; i >= 0; i--) {
3057
- const msg = allMessages[i];
3058
- if (msg.info.role === "assistant") {
3059
- const assistantInfo = msg.info;
3060
- if (assistantInfo.error) {
3061
- throw new Error(
3062
- `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
3063
- );
3064
- }
3065
- for (const part of msg.parts) {
3066
- if (part.type === "text") {
3067
- outputText += part.text;
3109
+ child.stdout?.on("data", (data) => {
3110
+ const text = data.toString();
3111
+ lastOutputTime = Date.now();
3112
+ lineBuffer += text;
3113
+ const lines = lineBuffer.split("\n");
3114
+ lineBuffer = lines.pop() || "";
3115
+ for (const line of lines) {
3116
+ if (!line.trim()) continue;
3117
+ const evt = tryParseJson(line);
3118
+ if (!evt || !evt.type) continue;
3119
+ allEvents.push({ event: evt, receivedAt: Date.now() });
3120
+ if (traceContext) {
3121
+ traceStepNumber++;
3122
+ const traceEvt = createTraceEventFromNdjson(
3123
+ evt,
3124
+ traceContext,
3125
+ traceStepNumber,
3126
+ false
3127
+ );
3128
+ if (traceEvt) {
3129
+ lastToolName = traceEvt.toolName;
3130
+ lastFilePath = traceEvt.filePath;
3131
+ if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
3132
+ lastAction = "Thinking...";
3133
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
3134
+ lastAction = extractToolAction(
3135
+ traceEvt.toolName ?? "",
3136
+ void 0
3137
+ );
3138
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
3139
+ lastAction = `Writing: ${traceEvt.filePath || "file"}`;
3140
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
3141
+ lastAction = `Reading: ${traceEvt.filePath || "file"}`;
3142
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
3143
+ lastAction = "Processing response...";
3068
3144
  }
3145
+ emitTraceEvent(
3146
+ traceEvt,
3147
+ traceContext.tracePushUrl,
3148
+ traceContext.routeHeader,
3149
+ traceContext.authToken
3150
+ );
3069
3151
  }
3070
- if (outputText) break;
3071
3152
  }
3072
3153
  }
3073
- }
3074
- if (!outputText) {
3075
- const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
3076
- if (!hasAssistant) {
3077
- throw new Error(
3078
- `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
3079
- );
3080
- }
3081
- }
3082
- const usage = lastAssistantInfo ? {
3083
- inputTokens: lastAssistantInfo.tokens.input,
3084
- outputTokens: lastAssistantInfo.tokens.output,
3085
- totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
3086
- } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
3087
- const costUsd = lastAssistantInfo?.cost;
3088
- const modelStr = options.model || DEFAULT_MODEL3;
3089
- const llmTrace = buildLLMTrace(
3090
- allMessages,
3091
- totalDurationMs,
3092
- modelStr,
3093
- providerID
3094
- );
3095
- const conversation = buildConversation2(allMessages);
3096
- return {
3097
- result: {
3098
- outputText,
3099
- durationMs: totalDurationMs,
3100
- usage,
3101
- costUsd
3102
- },
3103
- llmTrace,
3104
- conversation
3105
- };
3106
- } catch (sdkError) {
3107
- if (timeoutHandle) clearTimeout(timeoutHandle);
3108
- if (heartbeatHandle) clearInterval(heartbeatHandle);
3109
- if (timedOut) {
3110
- console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
3111
- }
3112
- const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
3113
- const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
3114
- const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
3115
- const causeDetails = [];
3116
- let current = sdkError;
3117
- while (current instanceof Error && current.cause) {
3118
- current = current.cause;
3119
- if (current instanceof Error) {
3120
- causeDetails.push(`${current.name}: ${current.message}`);
3121
- } else {
3122
- causeDetails.push(String(current));
3154
+ });
3155
+ child.stderr?.on("data", (data) => {
3156
+ const text = data.toString();
3157
+ stderr += text;
3158
+ lastOutputTime = Date.now();
3159
+ });
3160
+ child.on("close", (code) => {
3161
+ if (lineBuffer.trim()) {
3162
+ const evt = tryParseJson(lineBuffer);
3163
+ if (evt && evt.type) {
3164
+ allEvents.push({ event: evt, receivedAt: Date.now() });
3165
+ }
3123
3166
  }
3124
- }
3125
- const causeChain = causeDetails.length > 0 ? `
3126
- Cause chain: ${causeDetails.join(" -> ")}` : "";
3127
- console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
3128
- console.error("[SDK-ERROR] Error name:", errorName);
3129
- console.error("[SDK-ERROR] Error message:", errorMessage);
3130
- if (causeDetails.length > 0) {
3131
- console.error("[SDK-ERROR] Cause chain:", causeDetails.join(" -> "));
3132
- }
3133
- if (errorStack) {
3134
- console.error("[SDK-ERROR] Stack:", errorStack);
3135
- }
3136
- if (traceContext) {
3137
- emitTraceEvent(
3138
- {
3139
- evalRunId: traceContext.evalRunId,
3140
- scenarioId: traceContext.scenarioId,
3141
- scenarioName: traceContext.scenarioName,
3142
- targetId: traceContext.targetId,
3143
- targetName: traceContext.targetName,
3144
- stepNumber: traceStepNumber + 1,
3145
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3146
- outputPreview: JSON.stringify({
3147
- event: "sdk-execution-failed",
3148
- error: errorMessage,
3149
- errorName,
3150
- ...causeDetails.length > 0 && {
3151
- causeChain: causeDetails.join(" -> ")
3152
- }
3153
- }).slice(0, 2e3),
3154
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3155
- isComplete: true
3156
- },
3157
- traceContext.tracePushUrl,
3158
- traceContext.routeHeader,
3159
- traceContext.authToken
3167
+ console.log(
3168
+ `[executeWithOpenCode] Process exited with code ${code}, ${allEvents.length} events collected`
3160
3169
  );
3161
- }
3162
- throw new Error(
3163
- `OpenCode SDK execution failed: ${errorMessage}` + causeChain + (errorStack ? `
3164
- Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
3165
- );
3166
- } finally {
3167
- if (server) {
3168
- try {
3169
- server.close();
3170
- console.log("[SDK-DEBUG] OpenCode server closed");
3171
- } catch {
3170
+ if (code === 0) {
3171
+ finalize(true);
3172
+ } else {
3173
+ finalize(
3174
+ false,
3175
+ new Error(
3176
+ `OpenCode CLI exited with code ${code}.
3177
+ Stderr: ${stderr.slice(0, 1e3)}`
3178
+ )
3179
+ );
3172
3180
  }
3173
- }
3174
- }
3181
+ });
3182
+ child.on("error", (error) => {
3183
+ finalize(false, new Error(`OpenCode CLI spawn error: ${error.message}`));
3184
+ });
3185
+ });
3175
3186
  }
3176
3187
 
3177
3188
  // src/run-scenario/agents/opencode/opencode-adapter.ts
@@ -3394,7 +3405,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
3394
3405
  }
3395
3406
 
3396
3407
  // src/run-scenario/agents/simple-agent/build-conversation.ts
3397
- function buildConversation3(triggerPrompt, steps, executionStartMs) {
3408
+ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
3398
3409
  const messages = [];
3399
3410
  messages.push({
3400
3411
  role: "user",
@@ -3403,11 +3414,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3403
3414
  });
3404
3415
  for (let i = 0; i < steps.length; i++) {
3405
3416
  const step = steps[i];
3406
- const stepTimestamp = estimateStepTimestamp(
3407
- executionStartMs,
3408
- i,
3409
- steps.length
3410
- );
3417
+ const stepTimestamp = new Date(
3418
+ stepTimestamps[i] ?? executionStartMs
3419
+ ).toISOString();
3411
3420
  const assistantContent = [];
3412
3421
  if (step.reasoningText) {
3413
3422
  assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3450,10 +3459,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3450
3459
  }
3451
3460
  return messages;
3452
3461
  }
3453
- function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
3454
- const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
3455
- return new Date(startMs + Math.round(offset * 1e3)).toISOString();
3456
- }
3457
3462
 
3458
3463
  // src/run-scenario/agents/simple-agent/execute.ts
3459
3464
  var PROVIDER_ANTHROPIC2 = "anthropic";
@@ -3538,6 +3543,7 @@ async function executeWithAiSdk(context) {
3538
3543
  }
3539
3544
  }
3540
3545
  };
3546
+ const stepTimestamps = [];
3541
3547
  const result = await (0, import_ai.generateText)({
3542
3548
  model,
3543
3549
  system: systemPrompt,
@@ -3546,7 +3552,34 @@ async function executeWithAiSdk(context) {
3546
3552
  maxOutputTokens: modelConfig.maxTokens,
3547
3553
  tools: mcpTools,
3548
3554
  stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
3549
- providerOptions: providerOpts
3555
+ providerOptions: providerOpts,
3556
+ onStepFinish: (step) => {
3557
+ stepTimestamps.push(Date.now());
3558
+ if (traceContext) {
3559
+ const isToolStep = step.toolCalls.length > 0;
3560
+ const firstToolCall = step.toolCalls[0];
3561
+ emitTraceEvent(
3562
+ {
3563
+ evalRunId: traceContext.evalRunId,
3564
+ scenarioId: traceContext.scenarioId,
3565
+ scenarioName: traceContext.scenarioName,
3566
+ targetId: traceContext.targetId,
3567
+ targetName: traceContext.targetName,
3568
+ stepNumber: stepTimestamps.length,
3569
+ type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
3570
+ toolName: firstToolCall?.toolName,
3571
+ toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3572
+ outputPreview: step.text?.slice(0, 500),
3573
+ elapsedMs: Date.now() - startTime,
3574
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3575
+ isComplete: false
3576
+ },
3577
+ traceContext.tracePushUrl,
3578
+ traceContext.routeHeader,
3579
+ traceContext.authToken
3580
+ );
3581
+ }
3582
+ }
3550
3583
  });
3551
3584
  const durationMs = Date.now() - startTime;
3552
3585
  const usage = {
@@ -3560,16 +3593,17 @@ async function executeWithAiSdk(context) {
3560
3593
  usage,
3561
3594
  modelConfig.model,
3562
3595
  provider,
3563
- startTime
3596
+ startTime,
3597
+ stepTimestamps
3564
3598
  );
3565
3599
  if (traceContext) {
3566
- emitStepEvents(traceContext, result.steps, startTime);
3567
- emitCompletionEvent(traceContext, result.steps.length + 1);
3600
+ emitCompletionEvent(traceContext, stepTimestamps.length + 1);
3568
3601
  }
3569
3602
  const conversation = buildConversation3(
3570
3603
  scenario.triggerPrompt,
3571
3604
  result.steps,
3572
- startTime
3605
+ startTime,
3606
+ stepTimestamps
3573
3607
  );
3574
3608
  return {
3575
3609
  outputText: result.text,
@@ -3610,20 +3644,16 @@ function findToolResultError(step) {
3610
3644
  }
3611
3645
  return null;
3612
3646
  }
3613
- function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
3614
- const totalStepTokens = steps.reduce(
3615
- (sum, s) => sum + (s.usage.totalTokens ?? 0),
3616
- 0
3617
- );
3647
+ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
3618
3648
  const traceSteps = steps.map((step, i) => {
3619
- const stepTokens = step.usage.totalTokens ?? 0;
3620
- const proportion = totalStepTokens > 0 ? stepTokens / totalStepTokens : 0;
3621
- const stepDurationMs = Math.round(totalDurationMs * proportion);
3649
+ const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
3650
+ const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
3651
+ const stepDurationMs = stepFinishedAt - stepStartedAt;
3622
3652
  const firstToolCall = step.toolCalls[0];
3623
3653
  const tokenUsage = {
3624
3654
  prompt: step.usage.inputTokens ?? 0,
3625
3655
  completion: step.usage.outputTokens ?? 0,
3626
- total: stepTokens
3656
+ total: step.usage.totalTokens ?? 0
3627
3657
  };
3628
3658
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
3629
3659
  const toolResultError = findToolResultError(step);
@@ -3634,9 +3664,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
3634
3664
  type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
3635
3665
  model: modelId,
3636
3666
  provider,
3637
- startedAt: new Date(
3638
- executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
3639
- ).toISOString(),
3667
+ startedAt: new Date(stepStartedAt).toISOString(),
3640
3668
  durationMs: stepDurationMs,
3641
3669
  tokenUsage,
3642
3670
  costUsd,
@@ -3694,33 +3722,6 @@ function emitStartEvent(traceContext, startTime) {
3694
3722
  traceContext.authToken
3695
3723
  );
3696
3724
  }
3697
- function emitStepEvents(traceContext, steps, startTime) {
3698
- for (let i = 0; i < steps.length; i++) {
3699
- const step = steps[i];
3700
- const isToolStep = step.toolCalls.length > 0;
3701
- const firstToolCall = step.toolCalls[0];
3702
- emitTraceEvent(
3703
- {
3704
- evalRunId: traceContext.evalRunId,
3705
- scenarioId: traceContext.scenarioId,
3706
- scenarioName: traceContext.scenarioName,
3707
- targetId: traceContext.targetId,
3708
- targetName: traceContext.targetName,
3709
- stepNumber: i + 1,
3710
- type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
3711
- toolName: firstToolCall?.toolName,
3712
- toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3713
- outputPreview: step.text?.slice(0, 500),
3714
- elapsedMs: Date.now() - startTime,
3715
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3716
- isComplete: false
3717
- },
3718
- traceContext.tracePushUrl,
3719
- traceContext.routeHeader,
3720
- traceContext.authToken
3721
- );
3722
- }
3723
- }
3724
3725
  function emitCompletionEvent(traceContext, stepNumber) {
3725
3726
  emitTraceEvent(
3726
3727
  {
@@ -3757,7 +3758,7 @@ defaultRegistry.register(simpleAgentAdapter);
3757
3758
 
3758
3759
  // src/run-scenario/file-diff.ts
3759
3760
  var import_fs2 = require("fs");
3760
- var import_path10 = require("path");
3761
+ var import_path11 = require("path");
3761
3762
 
3762
3763
  // ../../node_modules/diff/lib/index.mjs
3763
3764
  function Diff() {
@@ -3933,7 +3934,7 @@ Diff.prototype = {
3933
3934
  tokenize: function tokenize(value) {
3934
3935
  return Array.from(value);
3935
3936
  },
3936
- join: function join8(chars) {
3937
+ join: function join9(chars) {
3937
3938
  return chars.join("");
3938
3939
  },
3939
3940
  postProcess: function postProcess(changeObjects) {
@@ -4382,8 +4383,8 @@ function snapshotDirectory(dir, baseDir) {
4382
4383
  }
4383
4384
  const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
4384
4385
  for (const entry of entries) {
4385
- const fullPath = (0, import_path10.join)(dir, entry.name);
4386
- const relativePath = (0, import_path10.relative)(base, fullPath);
4386
+ const fullPath = (0, import_path11.join)(dir, entry.name);
4387
+ const relativePath = (0, import_path11.relative)(base, fullPath);
4387
4388
  if (shouldIgnore(entry.name)) {
4388
4389
  continue;
4389
4390
  }