@wix/evalforge-evaluator 0.122.0 → 0.124.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -2606,8 +2606,8 @@ var import_promises9 = require("fs/promises");
2606
2606
  var import_path10 = require("path");
2607
2607
  var KILL_GRACE_PERIOD_MS = 5e3;
2608
2608
  var IDLE_TIMEOUT_MS = 12e4;
2609
- var TOOL_RUNNING_IDLE_TIMEOUT_MS = 36e4;
2610
2609
  var IDLE_CHECK_INTERVAL_MS = 15e3;
2610
+ var MAX_IDLE_RETRIES = 3;
2611
2611
  function extractToolAction(toolName, args) {
2612
2612
  if (!toolName) return "Using tool...";
2613
2613
  if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
@@ -2733,246 +2733,126 @@ function killProcess(child, resolved) {
2733
2733
  }
2734
2734
  }, KILL_GRACE_PERIOD_MS);
2735
2735
  }
2736
- async function executeWithOpenCode(skills, scenario, options) {
2737
- const skillNames = skills.map((s) => s.name).join(", ");
2738
- console.log("[executeWithOpenCode] Starting execution", {
2739
- skillCount: skills.length,
2740
- skillNames,
2741
- scenarioId: scenario.id,
2742
- scenarioName: scenario.name,
2743
- cwd: options.cwd,
2744
- aiGatewayUrl: options.aiGatewayUrl,
2745
- hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2746
- model: options.model
2747
- });
2748
- const startTime = /* @__PURE__ */ new Date();
2749
- const maxTurns = options.maxTurns ?? 10;
2750
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2751
- const { env, providerID, modelID } = await buildOpenCodeEnv({
2752
- model: options.model,
2753
- temperature: options.temperature,
2754
- maxTurns,
2755
- aiGatewayUrl: options.aiGatewayUrl,
2756
- aiGatewayHeaders: options.aiGatewayHeaders,
2757
- mcps: options.mcps,
2758
- cwd: options.cwd
2759
- });
2760
- const traceContext = options.traceContext;
2761
- let traceStepNumber = 0;
2762
- let lastAction = "Starting...";
2763
- let lastToolName;
2764
- let lastFilePath;
2765
- let isToolRunning = false;
2766
- if (traceContext) {
2767
- emitTraceEvent(
2768
- {
2769
- evalRunId: traceContext.evalRunId,
2770
- scenarioId: traceContext.scenarioId,
2771
- scenarioName: traceContext.scenarioName,
2772
- targetId: traceContext.targetId,
2773
- targetName: traceContext.targetName,
2774
- stepNumber: 0,
2775
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2776
- outputPreview: JSON.stringify({
2777
- event: "pre-cli-execution",
2778
- model: `${providerID}/${modelID}`,
2779
- maxTurns,
2780
- timestamp: (/* @__PURE__ */ new Date()).toISOString()
2781
- }),
2782
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2783
- isComplete: false
2784
- },
2785
- traceContext.tracePushUrl,
2786
- traceContext.routeHeader,
2787
- traceContext.authToken
2788
- );
2789
- }
2790
- let systemPrompt;
2791
- if (options.systemPrompt === null || options.systemPrompt === "") {
2792
- } else if (options.systemPrompt != null) {
2793
- systemPrompt = options.systemPrompt;
2794
- } else {
2795
- systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
2796
- }
2797
- if (systemPrompt) {
2798
- await writeSystemPromptRule(options.cwd, systemPrompt);
2736
+ function summarizeToolActions(events) {
2737
+ const actions = [];
2738
+ for (const { event: evt } of events) {
2739
+ if (evt.type === "tool_use") {
2740
+ const tu = evt;
2741
+ const tool = tu.part.tool;
2742
+ const input = tu.part.state.input;
2743
+ const filePath = input?.file_path || input?.path || input?.target_file;
2744
+ if (filePath) {
2745
+ actions.push(`- ${tool}: ${String(filePath)}`);
2746
+ } else if (input?.command) {
2747
+ actions.push(`- ${tool}: ${String(input.command).slice(0, 80)}`);
2748
+ } else {
2749
+ actions.push(`- ${tool}`);
2750
+ }
2751
+ }
2799
2752
  }
2800
- const args = [
2801
- "run",
2802
- "--format",
2803
- "json",
2804
- "--thinking",
2805
- "--variant",
2806
- "high",
2807
- "--model",
2808
- `${providerID}/${modelID}`,
2809
- "--dir",
2810
- options.cwd,
2811
- // NOTE: Trigger prompt is passed as a positional CLI arg. On Linux a single
2812
- // arg is capped at 128 KB (MAX_ARG_STRLEN); on macOS the combined args+env
2813
- // share a ~1 MB limit. Prompts exceeding this would fail with E2BIG.
2814
- // In practice eval prompts are well under this limit.
2815
- scenario.triggerPrompt
2816
- ];
2817
- console.log("[executeWithOpenCode] Spawning: opencode", args.slice(0, 5));
2818
- return new Promise((resolve2, reject) => {
2753
+ return actions.length > 0 ? actions.join("\n") : "(no tool actions recorded)";
2754
+ }
2755
+ function buildRecoveryPrompt(originalPrompt, events) {
2756
+ const toolSummary = summarizeToolActions(events);
2757
+ return `You are continuing a task that was interrupted due to a session error.
2758
+
2759
+ ORIGINAL TASK:
2760
+ ${originalPrompt}
2761
+
2762
+ ACTIONS ALREADY COMPLETED IN THE PREVIOUS SESSION:
2763
+ ${toolSummary}
2764
+
2765
+ INSTRUCTIONS:
2766
+ 1. Review the actions listed above that were already completed in the previous session
2767
+ 2. Check the filesystem to verify what was already done
2768
+ 3. Continue with any remaining work needed to fulfill the original task
2769
+ 4. Do NOT redo work that is already done \u2014 only continue from where the previous session left off`;
2770
+ }
2771
+ function spawnOpenCodeProcess(opts) {
2772
+ const {
2773
+ args,
2774
+ env,
2775
+ cwd,
2776
+ skillNames,
2777
+ scenarioName,
2778
+ sdkTimeoutMs,
2779
+ traceContext,
2780
+ initialStepNumber
2781
+ } = opts;
2782
+ return new Promise((resolve2) => {
2819
2783
  let resolved = false;
2820
2784
  let stderr = "";
2821
2785
  let lineBuffer = "";
2822
2786
  let lastOutputTime = Date.now();
2823
- const allEvents = [];
2787
+ let traceStepNumber = initialStepNumber;
2788
+ let lastAction = "Starting...";
2789
+ let lastToolName;
2790
+ let lastFilePath;
2791
+ const events = [];
2824
2792
  const timers = {};
2825
2793
  const cleanup = () => {
2826
2794
  if (timers.timeout) clearTimeout(timers.timeout);
2827
2795
  if (timers.idleCheck) clearInterval(timers.idleCheck);
2828
2796
  if (timers.heartbeat) clearInterval(timers.heartbeat);
2829
2797
  };
2830
- const finalize = (success, error) => {
2798
+ const finalize = (success, isIdleTimeout, error) => {
2831
2799
  if (resolved) return;
2832
2800
  resolved = true;
2833
2801
  cleanup();
2834
- if (!success) {
2835
- if (traceContext) {
2836
- emitTraceEvent(
2837
- {
2838
- evalRunId: traceContext.evalRunId,
2839
- scenarioId: traceContext.scenarioId,
2840
- scenarioName: traceContext.scenarioName,
2841
- targetId: traceContext.targetId,
2842
- targetName: traceContext.targetName,
2843
- stepNumber: traceStepNumber + 1,
2844
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2845
- outputPreview: JSON.stringify({
2846
- event: "cli-execution-failed",
2847
- error: error?.message ?? "Unknown error"
2848
- }).slice(0, 2e3),
2849
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2850
- isComplete: true
2851
- },
2852
- traceContext.tracePushUrl,
2853
- traceContext.routeHeader,
2854
- traceContext.authToken
2855
- );
2856
- }
2857
- reject(
2858
- error ?? new Error(
2859
- `OpenCode CLI execution failed (exit code unknown).
2860
- Stderr: ${stderr.slice(0, 1e3)}`
2861
- )
2862
- );
2863
- return;
2864
- }
2865
- const endTime = /* @__PURE__ */ new Date();
2866
- const totalDurationMs = endTime.getTime() - startTime.getTime();
2867
- let outputText = "";
2868
- for (const { event: evt } of allEvents) {
2869
- if (evt.type === "text") {
2870
- outputText += evt.part.text;
2871
- }
2872
- }
2873
- if (!outputText) {
2874
- reject(
2875
- new Error(
2876
- `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
2877
- )
2878
- );
2879
- return;
2880
- }
2881
- let inputTokens = 0;
2882
- let outputTokens = 0;
2883
- let costUsd = 0;
2884
- for (const { event: evt } of allEvents) {
2885
- if (evt.type === "step_finish") {
2886
- const sf = evt;
2887
- inputTokens += sf.part.tokens.input;
2888
- outputTokens += sf.part.tokens.output;
2889
- costUsd += sf.part.cost;
2890
- }
2891
- }
2892
- if (traceContext) {
2893
- emitTraceEvent(
2894
- {
2895
- evalRunId: traceContext.evalRunId,
2896
- scenarioId: traceContext.scenarioId,
2897
- scenarioName: traceContext.scenarioName,
2898
- targetId: traceContext.targetId,
2899
- targetName: traceContext.targetName,
2900
- stepNumber: traceStepNumber + 1,
2901
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
2902
- outputPreview: "Scenario execution completed",
2903
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2904
- isComplete: true
2905
- },
2906
- traceContext.tracePushUrl,
2907
- traceContext.routeHeader,
2908
- traceContext.authToken
2909
- );
2910
- }
2911
- const modelStr = options.model || `${providerID}/${modelID}`;
2912
- const llmTrace = buildLLMTrace(
2913
- allEvents,
2914
- totalDurationMs,
2915
- modelStr,
2916
- providerID,
2917
- startTime
2918
- );
2919
- const conversation = buildConversation2(allEvents);
2920
2802
  resolve2({
2921
- result: {
2922
- outputText,
2923
- durationMs: totalDurationMs,
2924
- usage: {
2925
- inputTokens,
2926
- outputTokens,
2927
- totalTokens: inputTokens + outputTokens
2928
- },
2929
- costUsd
2930
- },
2931
- llmTrace,
2932
- conversation
2803
+ events,
2804
+ success,
2805
+ isIdleTimeout,
2806
+ error,
2807
+ finalStepNumber: traceStepNumber
2933
2808
  });
2934
2809
  };
2935
2810
  let child;
2936
2811
  try {
2937
2812
  child = (0, import_child_process.spawn)("opencode", args, {
2938
- cwd: options.cwd,
2813
+ cwd,
2939
2814
  env,
2940
2815
  stdio: ["ignore", "pipe", "pipe"],
2941
2816
  detached: true
2942
2817
  });
2943
2818
  } catch (spawnError) {
2944
- reject(
2945
- new Error(
2819
+ resolve2({
2820
+ events: [],
2821
+ success: false,
2822
+ isIdleTimeout: false,
2823
+ error: new Error(
2946
2824
  `Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
2947
- )
2948
- );
2825
+ ),
2826
+ finalStepNumber: traceStepNumber
2827
+ });
2949
2828
  return;
2950
2829
  }
2951
2830
  timers.timeout = setTimeout(() => {
2952
2831
  if (!resolved) {
2953
- console.error(`[OpenCode] Process timed out after ${SDK_TIMEOUT_MS}ms`);
2832
+ console.error(`[OpenCode] Process timed out after ${sdkTimeoutMs}ms`);
2954
2833
  killProcess(child, resolved);
2955
2834
  finalize(
2835
+ false,
2956
2836
  false,
2957
2837
  new Error(
2958
- `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
2838
+ `OpenCode execution timed out after ${sdkTimeoutMs}ms. Skills: ${skillNames}, Scenario: ${scenarioName}`
2959
2839
  )
2960
2840
  );
2961
2841
  }
2962
- }, SDK_TIMEOUT_MS);
2842
+ }, sdkTimeoutMs);
2963
2843
  timers.idleCheck = setInterval(() => {
2964
2844
  if (resolved) return;
2965
2845
  const idleTime = Date.now() - lastOutputTime;
2966
- const effectiveTimeout = isToolRunning ? TOOL_RUNNING_IDLE_TIMEOUT_MS : IDLE_TIMEOUT_MS;
2967
- if (idleTime >= effectiveTimeout) {
2846
+ if (idleTime >= IDLE_TIMEOUT_MS) {
2968
2847
  console.warn(
2969
- `[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s (tool running: ${isToolRunning}). Killing process.`
2848
+ `[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
2970
2849
  );
2971
2850
  killProcess(child, resolved);
2972
2851
  finalize(
2973
2852
  false,
2853
+ true,
2974
2854
  new Error(
2975
- `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout, tool running: ${isToolRunning}). Skills: ${skillNames}, Scenario: ${scenario.name}`
2855
+ `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenarioName}`
2976
2856
  )
2977
2857
  );
2978
2858
  }
@@ -3032,14 +2912,7 @@ Stderr: ${stderr.slice(0, 1e3)}`
3032
2912
  if (!line.trim()) continue;
3033
2913
  const evt = tryParseJson(line);
3034
2914
  if (!evt || !evt.type) continue;
3035
- allEvents.push({ event: evt, receivedAt: Date.now() });
3036
- if (evt.type === "tool_use") {
3037
- const tu = evt;
3038
- const status = tu.part.state.status;
3039
- isToolRunning = status !== "completed" && status !== "error";
3040
- } else {
3041
- isToolRunning = false;
3042
- }
2915
+ events.push({ event: evt, receivedAt: Date.now() });
3043
2916
  if (traceContext) {
3044
2917
  traceStepNumber++;
3045
2918
  const traceEvt = createTraceEventFromNdjson(
@@ -3084,16 +2957,17 @@ Stderr: ${stderr.slice(0, 1e3)}`
3084
2957
  if (lineBuffer.trim()) {
3085
2958
  const evt = tryParseJson(lineBuffer);
3086
2959
  if (evt && evt.type) {
3087
- allEvents.push({ event: evt, receivedAt: Date.now() });
2960
+ events.push({ event: evt, receivedAt: Date.now() });
3088
2961
  }
3089
2962
  }
3090
2963
  console.log(
3091
- `[executeWithOpenCode] Process exited with code ${code}, ${allEvents.length} events collected`
2964
+ `[executeWithOpenCode] Process exited with code ${code}, ${events.length} events collected`
3092
2965
  );
3093
2966
  if (code === 0) {
3094
- finalize(true);
2967
+ finalize(true, false);
3095
2968
  } else {
3096
2969
  finalize(
2970
+ false,
3097
2971
  false,
3098
2972
  new Error(
3099
2973
  `OpenCode CLI exited with code ${code}.
@@ -3103,10 +2977,240 @@ Stderr: ${stderr.slice(0, 1e3)}`
3103
2977
  }
3104
2978
  });
3105
2979
  child.on("error", (error) => {
3106
- finalize(false, new Error(`OpenCode CLI spawn error: ${error.message}`));
2980
+ finalize(
2981
+ false,
2982
+ false,
2983
+ new Error(`OpenCode CLI spawn error: ${error.message}`)
2984
+ );
3107
2985
  });
3108
2986
  });
3109
2987
  }
2988
+ async function executeWithOpenCode(skills, scenario, options) {
2989
+ const skillNames = skills.map((s) => s.name).join(", ");
2990
+ console.log("[executeWithOpenCode] Starting execution", {
2991
+ skillCount: skills.length,
2992
+ skillNames,
2993
+ scenarioId: scenario.id,
2994
+ scenarioName: scenario.name,
2995
+ cwd: options.cwd,
2996
+ aiGatewayUrl: options.aiGatewayUrl,
2997
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2998
+ model: options.model
2999
+ });
3000
+ const startTime = /* @__PURE__ */ new Date();
3001
+ const maxTurns = options.maxTurns ?? 10;
3002
+ const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
3003
+ const { env, providerID, modelID } = await buildOpenCodeEnv({
3004
+ model: options.model,
3005
+ temperature: options.temperature,
3006
+ maxTurns,
3007
+ aiGatewayUrl: options.aiGatewayUrl,
3008
+ aiGatewayHeaders: options.aiGatewayHeaders,
3009
+ mcps: options.mcps,
3010
+ cwd: options.cwd
3011
+ });
3012
+ const traceContext = options.traceContext;
3013
+ if (traceContext) {
3014
+ emitTraceEvent(
3015
+ {
3016
+ evalRunId: traceContext.evalRunId,
3017
+ scenarioId: traceContext.scenarioId,
3018
+ scenarioName: traceContext.scenarioName,
3019
+ targetId: traceContext.targetId,
3020
+ targetName: traceContext.targetName,
3021
+ stepNumber: 0,
3022
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3023
+ outputPreview: JSON.stringify({
3024
+ event: "pre-cli-execution",
3025
+ model: `${providerID}/${modelID}`,
3026
+ maxTurns,
3027
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
3028
+ }),
3029
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3030
+ isComplete: false
3031
+ },
3032
+ traceContext.tracePushUrl,
3033
+ traceContext.routeHeader,
3034
+ traceContext.authToken
3035
+ );
3036
+ }
3037
+ let systemPrompt;
3038
+ if (options.systemPrompt === null || options.systemPrompt === "") {
3039
+ } else if (options.systemPrompt != null) {
3040
+ systemPrompt = options.systemPrompt;
3041
+ } else {
3042
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
3043
+ }
3044
+ if (systemPrompt) {
3045
+ await writeSystemPromptRule(options.cwd, systemPrompt);
3046
+ }
3047
+ const baseArgs = [
3048
+ "run",
3049
+ "--format",
3050
+ "json",
3051
+ "--thinking",
3052
+ "--variant",
3053
+ "high",
3054
+ "--model",
3055
+ `${providerID}/${modelID}`,
3056
+ "--dir",
3057
+ options.cwd
3058
+ ];
3059
+ const accumulatedEvents = [];
3060
+ let traceStepNumber = 0;
3061
+ let lastAttemptResult;
3062
+ for (let attempt = 1; attempt <= MAX_IDLE_RETRIES; attempt++) {
3063
+ const prompt = attempt === 1 ? scenario.triggerPrompt : buildRecoveryPrompt(scenario.triggerPrompt, accumulatedEvents);
3064
+ if (attempt > 1) {
3065
+ console.log(
3066
+ `[OpenCode] Retry attempt ${attempt}/${MAX_IDLE_RETRIES} \u2014 starting fresh session with recovery context`
3067
+ );
3068
+ if (traceContext) {
3069
+ emitTraceEvent(
3070
+ {
3071
+ evalRunId: traceContext.evalRunId,
3072
+ scenarioId: traceContext.scenarioId,
3073
+ scenarioName: traceContext.scenarioName,
3074
+ targetId: traceContext.targetId,
3075
+ targetName: traceContext.targetName,
3076
+ stepNumber: traceStepNumber + 1,
3077
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3078
+ outputPreview: JSON.stringify({
3079
+ event: "idle-timeout-retry",
3080
+ attempt,
3081
+ maxRetries: MAX_IDLE_RETRIES,
3082
+ eventsFromPreviousAttempts: accumulatedEvents.length
3083
+ }),
3084
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3085
+ isComplete: false
3086
+ },
3087
+ traceContext.tracePushUrl,
3088
+ traceContext.routeHeader,
3089
+ traceContext.authToken
3090
+ );
3091
+ }
3092
+ }
3093
+ const args = [...baseArgs, prompt];
3094
+ console.log(
3095
+ `[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
3096
+ args.slice(0, 5)
3097
+ );
3098
+ lastAttemptResult = await spawnOpenCodeProcess({
3099
+ args,
3100
+ env,
3101
+ cwd: options.cwd,
3102
+ skillNames,
3103
+ scenarioName: scenario.name,
3104
+ sdkTimeoutMs,
3105
+ traceContext,
3106
+ initialStepNumber: traceStepNumber
3107
+ });
3108
+ accumulatedEvents.push(...lastAttemptResult.events);
3109
+ traceStepNumber = lastAttemptResult.finalStepNumber;
3110
+ if (lastAttemptResult.success) {
3111
+ break;
3112
+ }
3113
+ if (!lastAttemptResult.isIdleTimeout || attempt >= MAX_IDLE_RETRIES) {
3114
+ if (traceContext) {
3115
+ emitTraceEvent(
3116
+ {
3117
+ evalRunId: traceContext.evalRunId,
3118
+ scenarioId: traceContext.scenarioId,
3119
+ scenarioName: traceContext.scenarioName,
3120
+ targetId: traceContext.targetId,
3121
+ targetName: traceContext.targetName,
3122
+ stepNumber: traceStepNumber + 1,
3123
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3124
+ outputPreview: JSON.stringify({
3125
+ event: "cli-execution-failed",
3126
+ error: lastAttemptResult.error?.message ?? "Unknown error",
3127
+ attempt,
3128
+ isIdleTimeout: lastAttemptResult.isIdleTimeout
3129
+ }).slice(0, 2e3),
3130
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3131
+ isComplete: true
3132
+ },
3133
+ traceContext.tracePushUrl,
3134
+ traceContext.routeHeader,
3135
+ traceContext.authToken
3136
+ );
3137
+ }
3138
+ throw lastAttemptResult.error ?? new Error(
3139
+ `OpenCode CLI execution failed.
3140
+ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
3141
+ );
3142
+ }
3143
+ console.warn(
3144
+ `[OpenCode] Attempt ${attempt} failed due to idle timeout, will retry`
3145
+ );
3146
+ }
3147
+ const endTime = /* @__PURE__ */ new Date();
3148
+ const totalDurationMs = endTime.getTime() - startTime.getTime();
3149
+ let outputText = "";
3150
+ for (const { event: evt } of accumulatedEvents) {
3151
+ if (evt.type === "text") {
3152
+ outputText += evt.part.text;
3153
+ }
3154
+ }
3155
+ if (!outputText) {
3156
+ throw new Error(
3157
+ `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${accumulatedEvents.length}`
3158
+ );
3159
+ }
3160
+ let inputTokens = 0;
3161
+ let outputTokens = 0;
3162
+ let costUsd = 0;
3163
+ for (const { event: evt } of accumulatedEvents) {
3164
+ if (evt.type === "step_finish") {
3165
+ const sf = evt;
3166
+ inputTokens += sf.part.tokens.input;
3167
+ outputTokens += sf.part.tokens.output;
3168
+ costUsd += sf.part.cost;
3169
+ }
3170
+ }
3171
+ if (traceContext) {
3172
+ emitTraceEvent(
3173
+ {
3174
+ evalRunId: traceContext.evalRunId,
3175
+ scenarioId: traceContext.scenarioId,
3176
+ scenarioName: traceContext.scenarioName,
3177
+ targetId: traceContext.targetId,
3178
+ targetName: traceContext.targetName,
3179
+ stepNumber: traceStepNumber + 1,
3180
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
3181
+ outputPreview: "Scenario execution completed",
3182
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3183
+ isComplete: true
3184
+ },
3185
+ traceContext.tracePushUrl,
3186
+ traceContext.routeHeader,
3187
+ traceContext.authToken
3188
+ );
3189
+ }
3190
+ const modelStr = options.model || `${providerID}/${modelID}`;
3191
+ const llmTrace = buildLLMTrace(
3192
+ accumulatedEvents,
3193
+ totalDurationMs,
3194
+ modelStr,
3195
+ providerID,
3196
+ startTime
3197
+ );
3198
+ const conversation = buildConversation2(accumulatedEvents);
3199
+ return {
3200
+ result: {
3201
+ outputText,
3202
+ durationMs: totalDurationMs,
3203
+ usage: {
3204
+ inputTokens,
3205
+ outputTokens,
3206
+ totalTokens: inputTokens + outputTokens
3207
+ },
3208
+ costUsd
3209
+ },
3210
+ llmTrace,
3211
+ conversation
3212
+ };
3213
+ }
3110
3214
 
3111
3215
  // src/run-scenario/agents/opencode/opencode-adapter.ts
3112
3216
  var OpenCodeAdapter = class {
@@ -4465,13 +4569,14 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4465
4569
  infrastructurePaths
4466
4570
  );
4467
4571
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4572
+ const resolvedModelConfig = agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
4468
4573
  return {
4469
4574
  id: (0, import_crypto4.randomUUID)(),
4470
4575
  targetId,
4471
4576
  targetName,
4472
4577
  scenarioId: scenario.id,
4473
4578
  scenarioName: scenario.name,
4474
- modelConfig: agent?.modelConfig,
4579
+ modelConfig: resolvedModelConfig,
4475
4580
  duration: durationMs,
4476
4581
  outputText,
4477
4582
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,