@wix/evalforge-evaluator 0.121.0 → 0.123.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -2607,6 +2607,7 @@ var import_path10 = require("path");
2607
2607
  var KILL_GRACE_PERIOD_MS = 5e3;
2608
2608
  var IDLE_TIMEOUT_MS = 12e4;
2609
2609
  var IDLE_CHECK_INTERVAL_MS = 15e3;
2610
+ var MAX_IDLE_RETRIES = 3;
2610
2611
  function extractToolAction(toolName, args) {
2611
2612
  if (!toolName) return "Using tool...";
2612
2613
  if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
@@ -2732,232 +2733,113 @@ function killProcess(child, resolved) {
2732
2733
  }
2733
2734
  }, KILL_GRACE_PERIOD_MS);
2734
2735
  }
2735
- async function executeWithOpenCode(skills, scenario, options) {
2736
- const skillNames = skills.map((s) => s.name).join(", ");
2737
- console.log("[executeWithOpenCode] Starting execution", {
2738
- skillCount: skills.length,
2739
- skillNames,
2740
- scenarioId: scenario.id,
2741
- scenarioName: scenario.name,
2742
- cwd: options.cwd,
2743
- aiGatewayUrl: options.aiGatewayUrl,
2744
- hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2745
- model: options.model
2746
- });
2747
- const startTime = /* @__PURE__ */ new Date();
2748
- const maxTurns = options.maxTurns ?? 10;
2749
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2750
- const { env, providerID, modelID } = await buildOpenCodeEnv({
2751
- model: options.model,
2752
- temperature: options.temperature,
2753
- maxTurns,
2754
- aiGatewayUrl: options.aiGatewayUrl,
2755
- aiGatewayHeaders: options.aiGatewayHeaders,
2756
- mcps: options.mcps,
2757
- cwd: options.cwd
2758
- });
2759
- const traceContext = options.traceContext;
2760
- let traceStepNumber = 0;
2761
- let lastAction = "Starting...";
2762
- let lastToolName;
2763
- let lastFilePath;
2764
- if (traceContext) {
2765
- emitTraceEvent(
2766
- {
2767
- evalRunId: traceContext.evalRunId,
2768
- scenarioId: traceContext.scenarioId,
2769
- scenarioName: traceContext.scenarioName,
2770
- targetId: traceContext.targetId,
2771
- targetName: traceContext.targetName,
2772
- stepNumber: 0,
2773
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2774
- outputPreview: JSON.stringify({
2775
- event: "pre-cli-execution",
2776
- model: `${providerID}/${modelID}`,
2777
- maxTurns,
2778
- timestamp: (/* @__PURE__ */ new Date()).toISOString()
2779
- }),
2780
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2781
- isComplete: false
2782
- },
2783
- traceContext.tracePushUrl,
2784
- traceContext.routeHeader,
2785
- traceContext.authToken
2786
- );
2787
- }
2788
- let systemPrompt;
2789
- if (options.systemPrompt === null || options.systemPrompt === "") {
2790
- } else if (options.systemPrompt != null) {
2791
- systemPrompt = options.systemPrompt;
2792
- } else {
2793
- systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
2794
- }
2795
- if (systemPrompt) {
2796
- await writeSystemPromptRule(options.cwd, systemPrompt);
2736
+ function summarizeToolActions(events) {
2737
+ const actions = [];
2738
+ for (const { event: evt } of events) {
2739
+ if (evt.type === "tool_use") {
2740
+ const tu = evt;
2741
+ const tool = tu.part.tool;
2742
+ const input = tu.part.state.input;
2743
+ const filePath = input?.file_path || input?.path || input?.target_file;
2744
+ if (filePath) {
2745
+ actions.push(`- ${tool}: ${String(filePath)}`);
2746
+ } else if (input?.command) {
2747
+ actions.push(`- ${tool}: ${String(input.command).slice(0, 80)}`);
2748
+ } else {
2749
+ actions.push(`- ${tool}`);
2750
+ }
2751
+ }
2797
2752
  }
2798
- const args = [
2799
- "run",
2800
- "--format",
2801
- "json",
2802
- "--thinking",
2803
- "--variant",
2804
- "high",
2805
- "--model",
2806
- `${providerID}/${modelID}`,
2807
- "--dir",
2808
- options.cwd,
2809
- // NOTE: Trigger prompt is passed as a positional CLI arg. On Linux a single
2810
- // arg is capped at 128 KB (MAX_ARG_STRLEN); on macOS the combined args+env
2811
- // share a ~1 MB limit. Prompts exceeding this would fail with E2BIG.
2812
- // In practice eval prompts are well under this limit.
2813
- scenario.triggerPrompt
2814
- ];
2815
- console.log("[executeWithOpenCode] Spawning: opencode", args.slice(0, 5));
2816
- return new Promise((resolve2, reject) => {
2753
+ return actions.length > 0 ? actions.join("\n") : "(no tool actions recorded)";
2754
+ }
2755
+ function buildRecoveryPrompt(originalPrompt, events) {
2756
+ const toolSummary = summarizeToolActions(events);
2757
+ return `You are continuing a task that was interrupted due to a session error.
2758
+
2759
+ ORIGINAL TASK:
2760
+ ${originalPrompt}
2761
+
2762
+ ACTIONS ALREADY COMPLETED IN THE PREVIOUS SESSION:
2763
+ ${toolSummary}
2764
+
2765
+ INSTRUCTIONS:
2766
+ 1. Review the actions listed above that were already completed in the previous session
2767
+ 2. Check the filesystem to verify what was already done
2768
+ 3. Continue with any remaining work needed to fulfill the original task
2769
+ 4. Do NOT redo work that is already done \u2014 only continue from where the previous session left off`;
2770
+ }
2771
+ function spawnOpenCodeProcess(opts) {
2772
+ const {
2773
+ args,
2774
+ env,
2775
+ cwd,
2776
+ skillNames,
2777
+ scenarioName,
2778
+ sdkTimeoutMs,
2779
+ traceContext,
2780
+ initialStepNumber
2781
+ } = opts;
2782
+ return new Promise((resolve2) => {
2817
2783
  let resolved = false;
2818
2784
  let stderr = "";
2819
2785
  let lineBuffer = "";
2820
2786
  let lastOutputTime = Date.now();
2821
- const allEvents = [];
2787
+ let traceStepNumber = initialStepNumber;
2788
+ let lastAction = "Starting...";
2789
+ let lastToolName;
2790
+ let lastFilePath;
2791
+ const events = [];
2822
2792
  const timers = {};
2823
2793
  const cleanup = () => {
2824
2794
  if (timers.timeout) clearTimeout(timers.timeout);
2825
2795
  if (timers.idleCheck) clearInterval(timers.idleCheck);
2826
2796
  if (timers.heartbeat) clearInterval(timers.heartbeat);
2827
2797
  };
2828
- const finalize = (success, error) => {
2798
+ const finalize = (success, isIdleTimeout, error) => {
2829
2799
  if (resolved) return;
2830
2800
  resolved = true;
2831
2801
  cleanup();
2832
- if (!success) {
2833
- if (traceContext) {
2834
- emitTraceEvent(
2835
- {
2836
- evalRunId: traceContext.evalRunId,
2837
- scenarioId: traceContext.scenarioId,
2838
- scenarioName: traceContext.scenarioName,
2839
- targetId: traceContext.targetId,
2840
- targetName: traceContext.targetName,
2841
- stepNumber: traceStepNumber + 1,
2842
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2843
- outputPreview: JSON.stringify({
2844
- event: "cli-execution-failed",
2845
- error: error?.message ?? "Unknown error"
2846
- }).slice(0, 2e3),
2847
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2848
- isComplete: true
2849
- },
2850
- traceContext.tracePushUrl,
2851
- traceContext.routeHeader,
2852
- traceContext.authToken
2853
- );
2854
- }
2855
- reject(
2856
- error ?? new Error(
2857
- `OpenCode CLI execution failed (exit code unknown).
2858
- Stderr: ${stderr.slice(0, 1e3)}`
2859
- )
2860
- );
2861
- return;
2862
- }
2863
- const endTime = /* @__PURE__ */ new Date();
2864
- const totalDurationMs = endTime.getTime() - startTime.getTime();
2865
- let outputText = "";
2866
- for (const { event: evt } of allEvents) {
2867
- if (evt.type === "text") {
2868
- outputText += evt.part.text;
2869
- }
2870
- }
2871
- if (!outputText) {
2872
- reject(
2873
- new Error(
2874
- `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
2875
- )
2876
- );
2877
- return;
2878
- }
2879
- let inputTokens = 0;
2880
- let outputTokens = 0;
2881
- let costUsd = 0;
2882
- for (const { event: evt } of allEvents) {
2883
- if (evt.type === "step_finish") {
2884
- const sf = evt;
2885
- inputTokens += sf.part.tokens.input;
2886
- outputTokens += sf.part.tokens.output;
2887
- costUsd += sf.part.cost;
2888
- }
2889
- }
2890
- if (traceContext) {
2891
- emitTraceEvent(
2892
- {
2893
- evalRunId: traceContext.evalRunId,
2894
- scenarioId: traceContext.scenarioId,
2895
- scenarioName: traceContext.scenarioName,
2896
- targetId: traceContext.targetId,
2897
- targetName: traceContext.targetName,
2898
- stepNumber: traceStepNumber + 1,
2899
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
2900
- outputPreview: "Scenario execution completed",
2901
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2902
- isComplete: true
2903
- },
2904
- traceContext.tracePushUrl,
2905
- traceContext.routeHeader,
2906
- traceContext.authToken
2907
- );
2908
- }
2909
- const modelStr = options.model || `${providerID}/${modelID}`;
2910
- const llmTrace = buildLLMTrace(
2911
- allEvents,
2912
- totalDurationMs,
2913
- modelStr,
2914
- providerID,
2915
- startTime
2916
- );
2917
- const conversation = buildConversation2(allEvents);
2918
2802
  resolve2({
2919
- result: {
2920
- outputText,
2921
- durationMs: totalDurationMs,
2922
- usage: {
2923
- inputTokens,
2924
- outputTokens,
2925
- totalTokens: inputTokens + outputTokens
2926
- },
2927
- costUsd
2928
- },
2929
- llmTrace,
2930
- conversation
2803
+ events,
2804
+ success,
2805
+ isIdleTimeout,
2806
+ error,
2807
+ finalStepNumber: traceStepNumber
2931
2808
  });
2932
2809
  };
2933
2810
  let child;
2934
2811
  try {
2935
2812
  child = (0, import_child_process.spawn)("opencode", args, {
2936
- cwd: options.cwd,
2813
+ cwd,
2937
2814
  env,
2938
2815
  stdio: ["ignore", "pipe", "pipe"],
2939
2816
  detached: true
2940
2817
  });
2941
2818
  } catch (spawnError) {
2942
- reject(
2943
- new Error(
2819
+ resolve2({
2820
+ events: [],
2821
+ success: false,
2822
+ isIdleTimeout: false,
2823
+ error: new Error(
2944
2824
  `Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
2945
- )
2946
- );
2825
+ ),
2826
+ finalStepNumber: traceStepNumber
2827
+ });
2947
2828
  return;
2948
2829
  }
2949
2830
  timers.timeout = setTimeout(() => {
2950
2831
  if (!resolved) {
2951
- console.error(`[OpenCode] Process timed out after ${SDK_TIMEOUT_MS}ms`);
2832
+ console.error(`[OpenCode] Process timed out after ${sdkTimeoutMs}ms`);
2952
2833
  killProcess(child, resolved);
2953
2834
  finalize(
2835
+ false,
2954
2836
  false,
2955
2837
  new Error(
2956
- `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
2838
+ `OpenCode execution timed out after ${sdkTimeoutMs}ms. Skills: ${skillNames}, Scenario: ${scenarioName}`
2957
2839
  )
2958
2840
  );
2959
2841
  }
2960
- }, SDK_TIMEOUT_MS);
2842
+ }, sdkTimeoutMs);
2961
2843
  timers.idleCheck = setInterval(() => {
2962
2844
  if (resolved) return;
2963
2845
  const idleTime = Date.now() - lastOutputTime;
@@ -2968,8 +2850,9 @@ Stderr: ${stderr.slice(0, 1e3)}`
2968
2850
  killProcess(child, resolved);
2969
2851
  finalize(
2970
2852
  false,
2853
+ true,
2971
2854
  new Error(
2972
- `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenario.name}`
2855
+ `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenarioName}`
2973
2856
  )
2974
2857
  );
2975
2858
  }
@@ -3029,7 +2912,7 @@ Stderr: ${stderr.slice(0, 1e3)}`
3029
2912
  if (!line.trim()) continue;
3030
2913
  const evt = tryParseJson(line);
3031
2914
  if (!evt || !evt.type) continue;
3032
- allEvents.push({ event: evt, receivedAt: Date.now() });
2915
+ events.push({ event: evt, receivedAt: Date.now() });
3033
2916
  if (traceContext) {
3034
2917
  traceStepNumber++;
3035
2918
  const traceEvt = createTraceEventFromNdjson(
@@ -3074,16 +2957,17 @@ Stderr: ${stderr.slice(0, 1e3)}`
3074
2957
  if (lineBuffer.trim()) {
3075
2958
  const evt = tryParseJson(lineBuffer);
3076
2959
  if (evt && evt.type) {
3077
- allEvents.push({ event: evt, receivedAt: Date.now() });
2960
+ events.push({ event: evt, receivedAt: Date.now() });
3078
2961
  }
3079
2962
  }
3080
2963
  console.log(
3081
- `[executeWithOpenCode] Process exited with code ${code}, ${allEvents.length} events collected`
2964
+ `[executeWithOpenCode] Process exited with code ${code}, ${events.length} events collected`
3082
2965
  );
3083
2966
  if (code === 0) {
3084
- finalize(true);
2967
+ finalize(true, false);
3085
2968
  } else {
3086
2969
  finalize(
2970
+ false,
3087
2971
  false,
3088
2972
  new Error(
3089
2973
  `OpenCode CLI exited with code ${code}.
@@ -3093,10 +2977,240 @@ Stderr: ${stderr.slice(0, 1e3)}`
3093
2977
  }
3094
2978
  });
3095
2979
  child.on("error", (error) => {
3096
- finalize(false, new Error(`OpenCode CLI spawn error: ${error.message}`));
2980
+ finalize(
2981
+ false,
2982
+ false,
2983
+ new Error(`OpenCode CLI spawn error: ${error.message}`)
2984
+ );
3097
2985
  });
3098
2986
  });
3099
2987
  }
2988
+ async function executeWithOpenCode(skills, scenario, options) {
2989
+ const skillNames = skills.map((s) => s.name).join(", ");
2990
+ console.log("[executeWithOpenCode] Starting execution", {
2991
+ skillCount: skills.length,
2992
+ skillNames,
2993
+ scenarioId: scenario.id,
2994
+ scenarioName: scenario.name,
2995
+ cwd: options.cwd,
2996
+ aiGatewayUrl: options.aiGatewayUrl,
2997
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2998
+ model: options.model
2999
+ });
3000
+ const startTime = /* @__PURE__ */ new Date();
3001
+ const maxTurns = options.maxTurns ?? 10;
3002
+ const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
3003
+ const { env, providerID, modelID } = await buildOpenCodeEnv({
3004
+ model: options.model,
3005
+ temperature: options.temperature,
3006
+ maxTurns,
3007
+ aiGatewayUrl: options.aiGatewayUrl,
3008
+ aiGatewayHeaders: options.aiGatewayHeaders,
3009
+ mcps: options.mcps,
3010
+ cwd: options.cwd
3011
+ });
3012
+ const traceContext = options.traceContext;
3013
+ if (traceContext) {
3014
+ emitTraceEvent(
3015
+ {
3016
+ evalRunId: traceContext.evalRunId,
3017
+ scenarioId: traceContext.scenarioId,
3018
+ scenarioName: traceContext.scenarioName,
3019
+ targetId: traceContext.targetId,
3020
+ targetName: traceContext.targetName,
3021
+ stepNumber: 0,
3022
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3023
+ outputPreview: JSON.stringify({
3024
+ event: "pre-cli-execution",
3025
+ model: `${providerID}/${modelID}`,
3026
+ maxTurns,
3027
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
3028
+ }),
3029
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3030
+ isComplete: false
3031
+ },
3032
+ traceContext.tracePushUrl,
3033
+ traceContext.routeHeader,
3034
+ traceContext.authToken
3035
+ );
3036
+ }
3037
+ let systemPrompt;
3038
+ if (options.systemPrompt === null || options.systemPrompt === "") {
3039
+ } else if (options.systemPrompt != null) {
3040
+ systemPrompt = options.systemPrompt;
3041
+ } else {
3042
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
3043
+ }
3044
+ if (systemPrompt) {
3045
+ await writeSystemPromptRule(options.cwd, systemPrompt);
3046
+ }
3047
+ const baseArgs = [
3048
+ "run",
3049
+ "--format",
3050
+ "json",
3051
+ "--thinking",
3052
+ "--variant",
3053
+ "high",
3054
+ "--model",
3055
+ `${providerID}/${modelID}`,
3056
+ "--dir",
3057
+ options.cwd
3058
+ ];
3059
+ const accumulatedEvents = [];
3060
+ let traceStepNumber = 0;
3061
+ let lastAttemptResult;
3062
+ for (let attempt = 1; attempt <= MAX_IDLE_RETRIES; attempt++) {
3063
+ const prompt = attempt === 1 ? scenario.triggerPrompt : buildRecoveryPrompt(scenario.triggerPrompt, accumulatedEvents);
3064
+ if (attempt > 1) {
3065
+ console.log(
3066
+ `[OpenCode] Retry attempt ${attempt}/${MAX_IDLE_RETRIES} \u2014 starting fresh session with recovery context`
3067
+ );
3068
+ if (traceContext) {
3069
+ emitTraceEvent(
3070
+ {
3071
+ evalRunId: traceContext.evalRunId,
3072
+ scenarioId: traceContext.scenarioId,
3073
+ scenarioName: traceContext.scenarioName,
3074
+ targetId: traceContext.targetId,
3075
+ targetName: traceContext.targetName,
3076
+ stepNumber: traceStepNumber + 1,
3077
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3078
+ outputPreview: JSON.stringify({
3079
+ event: "idle-timeout-retry",
3080
+ attempt,
3081
+ maxRetries: MAX_IDLE_RETRIES,
3082
+ eventsFromPreviousAttempts: accumulatedEvents.length
3083
+ }),
3084
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3085
+ isComplete: false
3086
+ },
3087
+ traceContext.tracePushUrl,
3088
+ traceContext.routeHeader,
3089
+ traceContext.authToken
3090
+ );
3091
+ }
3092
+ }
3093
+ const args = [...baseArgs, prompt];
3094
+ console.log(
3095
+ `[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
3096
+ args.slice(0, 5)
3097
+ );
3098
+ lastAttemptResult = await spawnOpenCodeProcess({
3099
+ args,
3100
+ env,
3101
+ cwd: options.cwd,
3102
+ skillNames,
3103
+ scenarioName: scenario.name,
3104
+ sdkTimeoutMs,
3105
+ traceContext,
3106
+ initialStepNumber: traceStepNumber
3107
+ });
3108
+ accumulatedEvents.push(...lastAttemptResult.events);
3109
+ traceStepNumber = lastAttemptResult.finalStepNumber;
3110
+ if (lastAttemptResult.success) {
3111
+ break;
3112
+ }
3113
+ if (!lastAttemptResult.isIdleTimeout || attempt >= MAX_IDLE_RETRIES) {
3114
+ if (traceContext) {
3115
+ emitTraceEvent(
3116
+ {
3117
+ evalRunId: traceContext.evalRunId,
3118
+ scenarioId: traceContext.scenarioId,
3119
+ scenarioName: traceContext.scenarioName,
3120
+ targetId: traceContext.targetId,
3121
+ targetName: traceContext.targetName,
3122
+ stepNumber: traceStepNumber + 1,
3123
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3124
+ outputPreview: JSON.stringify({
3125
+ event: "cli-execution-failed",
3126
+ error: lastAttemptResult.error?.message ?? "Unknown error",
3127
+ attempt,
3128
+ isIdleTimeout: lastAttemptResult.isIdleTimeout
3129
+ }).slice(0, 2e3),
3130
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3131
+ isComplete: true
3132
+ },
3133
+ traceContext.tracePushUrl,
3134
+ traceContext.routeHeader,
3135
+ traceContext.authToken
3136
+ );
3137
+ }
3138
+ throw lastAttemptResult.error ?? new Error(
3139
+ `OpenCode CLI execution failed.
3140
+ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
3141
+ );
3142
+ }
3143
+ console.warn(
3144
+ `[OpenCode] Attempt ${attempt} failed due to idle timeout, will retry`
3145
+ );
3146
+ }
3147
+ const endTime = /* @__PURE__ */ new Date();
3148
+ const totalDurationMs = endTime.getTime() - startTime.getTime();
3149
+ let outputText = "";
3150
+ for (const { event: evt } of accumulatedEvents) {
3151
+ if (evt.type === "text") {
3152
+ outputText += evt.part.text;
3153
+ }
3154
+ }
3155
+ if (!outputText) {
3156
+ throw new Error(
3157
+ `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${accumulatedEvents.length}`
3158
+ );
3159
+ }
3160
+ let inputTokens = 0;
3161
+ let outputTokens = 0;
3162
+ let costUsd = 0;
3163
+ for (const { event: evt } of accumulatedEvents) {
3164
+ if (evt.type === "step_finish") {
3165
+ const sf = evt;
3166
+ inputTokens += sf.part.tokens.input;
3167
+ outputTokens += sf.part.tokens.output;
3168
+ costUsd += sf.part.cost;
3169
+ }
3170
+ }
3171
+ if (traceContext) {
3172
+ emitTraceEvent(
3173
+ {
3174
+ evalRunId: traceContext.evalRunId,
3175
+ scenarioId: traceContext.scenarioId,
3176
+ scenarioName: traceContext.scenarioName,
3177
+ targetId: traceContext.targetId,
3178
+ targetName: traceContext.targetName,
3179
+ stepNumber: traceStepNumber + 1,
3180
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
3181
+ outputPreview: "Scenario execution completed",
3182
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3183
+ isComplete: true
3184
+ },
3185
+ traceContext.tracePushUrl,
3186
+ traceContext.routeHeader,
3187
+ traceContext.authToken
3188
+ );
3189
+ }
3190
+ const modelStr = options.model || `${providerID}/${modelID}`;
3191
+ const llmTrace = buildLLMTrace(
3192
+ accumulatedEvents,
3193
+ totalDurationMs,
3194
+ modelStr,
3195
+ providerID,
3196
+ startTime
3197
+ );
3198
+ const conversation = buildConversation2(accumulatedEvents);
3199
+ return {
3200
+ result: {
3201
+ outputText,
3202
+ durationMs: totalDurationMs,
3203
+ usage: {
3204
+ inputTokens,
3205
+ outputTokens,
3206
+ totalTokens: inputTokens + outputTokens
3207
+ },
3208
+ costUsd
3209
+ },
3210
+ llmTrace,
3211
+ conversation
3212
+ };
3213
+ }
3100
3214
 
3101
3215
  // src/run-scenario/agents/opencode/opencode-adapter.ts
3102
3216
  var OpenCodeAdapter = class {
@@ -4455,13 +4569,14 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4455
4569
  infrastructurePaths
4456
4570
  );
4457
4571
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4572
+ const resolvedModelConfig = agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
4458
4573
  return {
4459
4574
  id: (0, import_crypto4.randomUUID)(),
4460
4575
  targetId,
4461
4576
  targetName,
4462
4577
  scenarioId: scenario.id,
4463
4578
  scenarioName: scenario.name,
4464
- modelConfig: agent?.modelConfig,
4579
+ modelConfig: resolvedModelConfig,
4465
4580
  duration: durationMs,
4466
4581
  outputText,
4467
4582
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,