@wix/evalforge-evaluator 0.122.0 → 0.124.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +313 -208
- package/build/index.js.map +2 -2
- package/build/index.mjs +313 -208
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/opencode/execute.d.ts +8 -3
- package/package.json +3 -3
package/build/index.js
CHANGED
|
@@ -2606,8 +2606,8 @@ var import_promises9 = require("fs/promises");
|
|
|
2606
2606
|
var import_path10 = require("path");
|
|
2607
2607
|
var KILL_GRACE_PERIOD_MS = 5e3;
|
|
2608
2608
|
var IDLE_TIMEOUT_MS = 12e4;
|
|
2609
|
-
var TOOL_RUNNING_IDLE_TIMEOUT_MS = 36e4;
|
|
2610
2609
|
var IDLE_CHECK_INTERVAL_MS = 15e3;
|
|
2610
|
+
var MAX_IDLE_RETRIES = 3;
|
|
2611
2611
|
function extractToolAction(toolName, args) {
|
|
2612
2612
|
if (!toolName) return "Using tool...";
|
|
2613
2613
|
if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
|
|
@@ -2733,246 +2733,126 @@ function killProcess(child, resolved) {
|
|
|
2733
2733
|
}
|
|
2734
2734
|
}, KILL_GRACE_PERIOD_MS);
|
|
2735
2735
|
}
|
|
2736
|
-
|
|
2737
|
-
const
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2743
|
-
|
|
2744
|
-
|
|
2745
|
-
|
|
2746
|
-
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
model: options.model,
|
|
2753
|
-
temperature: options.temperature,
|
|
2754
|
-
maxTurns,
|
|
2755
|
-
aiGatewayUrl: options.aiGatewayUrl,
|
|
2756
|
-
aiGatewayHeaders: options.aiGatewayHeaders,
|
|
2757
|
-
mcps: options.mcps,
|
|
2758
|
-
cwd: options.cwd
|
|
2759
|
-
});
|
|
2760
|
-
const traceContext = options.traceContext;
|
|
2761
|
-
let traceStepNumber = 0;
|
|
2762
|
-
let lastAction = "Starting...";
|
|
2763
|
-
let lastToolName;
|
|
2764
|
-
let lastFilePath;
|
|
2765
|
-
let isToolRunning = false;
|
|
2766
|
-
if (traceContext) {
|
|
2767
|
-
emitTraceEvent(
|
|
2768
|
-
{
|
|
2769
|
-
evalRunId: traceContext.evalRunId,
|
|
2770
|
-
scenarioId: traceContext.scenarioId,
|
|
2771
|
-
scenarioName: traceContext.scenarioName,
|
|
2772
|
-
targetId: traceContext.targetId,
|
|
2773
|
-
targetName: traceContext.targetName,
|
|
2774
|
-
stepNumber: 0,
|
|
2775
|
-
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
2776
|
-
outputPreview: JSON.stringify({
|
|
2777
|
-
event: "pre-cli-execution",
|
|
2778
|
-
model: `${providerID}/${modelID}`,
|
|
2779
|
-
maxTurns,
|
|
2780
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2781
|
-
}),
|
|
2782
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2783
|
-
isComplete: false
|
|
2784
|
-
},
|
|
2785
|
-
traceContext.tracePushUrl,
|
|
2786
|
-
traceContext.routeHeader,
|
|
2787
|
-
traceContext.authToken
|
|
2788
|
-
);
|
|
2789
|
-
}
|
|
2790
|
-
let systemPrompt;
|
|
2791
|
-
if (options.systemPrompt === null || options.systemPrompt === "") {
|
|
2792
|
-
} else if (options.systemPrompt != null) {
|
|
2793
|
-
systemPrompt = options.systemPrompt;
|
|
2794
|
-
} else {
|
|
2795
|
-
systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
2796
|
-
}
|
|
2797
|
-
if (systemPrompt) {
|
|
2798
|
-
await writeSystemPromptRule(options.cwd, systemPrompt);
|
|
2736
|
+
/**
 * Renders the tool calls found in a list of collected OpenCode events as a
 * newline-separated bullet list, for embedding in a recovery prompt.
 *
 * Only entries whose `event.type` is "tool_use" contribute. Each bullet is
 * `- <tool>`, optionally followed by `: <detail>` where the detail is the
 * first non-empty of `input.file_path`, `input.path`, `input.target_file`,
 * or else the first 80 characters of `input.command`.
 *
 * Events are parsed from the CLI's output stream, so their shape is not
 * guaranteed: missing `part` / `state` / `input` objects are tolerated instead
 * of throwing, which would otherwise abort the retry that needs this summary.
 *
 * A tool call whose `state.status` is present and not "completed" (for
 * example "error") is suffixed with ` (<status>)` so that the reader of the
 * summary does not mistake a failed or unfinished call for finished work.
 *
 * @param {Array<{event: object}>|null|undefined} events - Collected event wrappers.
 * @returns {string} The bullet list, or "(no tool actions recorded)" when no
 *   tool call was found.
 */
function summarizeToolActions(events) {
  const actions = [];
  for (const entry of events ?? []) {
    const evt = entry?.event;
    if (evt?.type !== "tool_use") continue;

    const part = evt.part;
    const tool = part?.tool ?? "unknown tool";
    const input = part?.state?.input;
    const status = part?.state?.status;

    // `||` (not `??`) on purpose: an empty-string path falls through to the next candidate.
    const filePath = input?.file_path || input?.path || input?.target_file;

    let line = `- ${tool}`;
    if (filePath) {
      line += `: ${String(filePath)}`;
    } else if (input?.command) {
      // Commands can be arbitrarily long; keep the summary compact.
      line += `: ${String(input.command).slice(0, 80)}`;
    }
    if (status && status !== "completed") {
      line += ` (${String(status)})`;
    }
    actions.push(line);
  }
  return actions.length > 0 ? actions.join("\n") : "(no tool actions recorded)";
}
|
|
2755
|
+
/**
 * Composes the prompt handed to a fresh session when an earlier session was
 * interrupted: it restates the original task, lists the tool actions recorded
 * so far (via `summarizeToolActions`), and appends fixed continuation
 * instructions.
 *
 * @param {string} originalPrompt - The task text given to the first session.
 * @param {Array<{event: object}>} events - Events collected from earlier sessions.
 * @returns {string} The multi-line recovery prompt.
 */
function buildRecoveryPrompt(originalPrompt, events) {
  const completedActions = summarizeToolActions(events);
  const sections = [
    "You are continuing a task that was interrupted due to a session error.",
    "",
    "ORIGINAL TASK:",
    // Interpolated (not passed raw) so stringification matches a template literal.
    `${originalPrompt}`,
    "",
    "ACTIONS ALREADY COMPLETED IN THE PREVIOUS SESSION:",
    `${completedActions}`,
    "",
    "INSTRUCTIONS:",
    "1. Review the actions listed above that were already completed in the previous session",
    "2. Check the filesystem to verify what was already done",
    "3. Continue with any remaining work needed to fulfill the original task",
    "4. Do NOT redo work that is already done \u2014 only continue from where the previous session left off"
  ];
  return sections.join("\n");
}
|
|
2771
|
+
function spawnOpenCodeProcess(opts) {
|
|
2772
|
+
const {
|
|
2773
|
+
args,
|
|
2774
|
+
env,
|
|
2775
|
+
cwd,
|
|
2776
|
+
skillNames,
|
|
2777
|
+
scenarioName,
|
|
2778
|
+
sdkTimeoutMs,
|
|
2779
|
+
traceContext,
|
|
2780
|
+
initialStepNumber
|
|
2781
|
+
} = opts;
|
|
2782
|
+
return new Promise((resolve2) => {
|
|
2819
2783
|
let resolved = false;
|
|
2820
2784
|
let stderr = "";
|
|
2821
2785
|
let lineBuffer = "";
|
|
2822
2786
|
let lastOutputTime = Date.now();
|
|
2823
|
-
|
|
2787
|
+
let traceStepNumber = initialStepNumber;
|
|
2788
|
+
let lastAction = "Starting...";
|
|
2789
|
+
let lastToolName;
|
|
2790
|
+
let lastFilePath;
|
|
2791
|
+
const events = [];
|
|
2824
2792
|
const timers = {};
|
|
2825
2793
|
const cleanup = () => {
|
|
2826
2794
|
if (timers.timeout) clearTimeout(timers.timeout);
|
|
2827
2795
|
if (timers.idleCheck) clearInterval(timers.idleCheck);
|
|
2828
2796
|
if (timers.heartbeat) clearInterval(timers.heartbeat);
|
|
2829
2797
|
};
|
|
2830
|
-
const finalize = (success, error) => {
|
|
2798
|
+
const finalize = (success, isIdleTimeout, error) => {
|
|
2831
2799
|
if (resolved) return;
|
|
2832
2800
|
resolved = true;
|
|
2833
2801
|
cleanup();
|
|
2834
|
-
if (!success) {
|
|
2835
|
-
if (traceContext) {
|
|
2836
|
-
emitTraceEvent(
|
|
2837
|
-
{
|
|
2838
|
-
evalRunId: traceContext.evalRunId,
|
|
2839
|
-
scenarioId: traceContext.scenarioId,
|
|
2840
|
-
scenarioName: traceContext.scenarioName,
|
|
2841
|
-
targetId: traceContext.targetId,
|
|
2842
|
-
targetName: traceContext.targetName,
|
|
2843
|
-
stepNumber: traceStepNumber + 1,
|
|
2844
|
-
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
2845
|
-
outputPreview: JSON.stringify({
|
|
2846
|
-
event: "cli-execution-failed",
|
|
2847
|
-
error: error?.message ?? "Unknown error"
|
|
2848
|
-
}).slice(0, 2e3),
|
|
2849
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2850
|
-
isComplete: true
|
|
2851
|
-
},
|
|
2852
|
-
traceContext.tracePushUrl,
|
|
2853
|
-
traceContext.routeHeader,
|
|
2854
|
-
traceContext.authToken
|
|
2855
|
-
);
|
|
2856
|
-
}
|
|
2857
|
-
reject(
|
|
2858
|
-
error ?? new Error(
|
|
2859
|
-
`OpenCode CLI execution failed (exit code unknown).
|
|
2860
|
-
Stderr: ${stderr.slice(0, 1e3)}`
|
|
2861
|
-
)
|
|
2862
|
-
);
|
|
2863
|
-
return;
|
|
2864
|
-
}
|
|
2865
|
-
const endTime = /* @__PURE__ */ new Date();
|
|
2866
|
-
const totalDurationMs = endTime.getTime() - startTime.getTime();
|
|
2867
|
-
let outputText = "";
|
|
2868
|
-
for (const { event: evt } of allEvents) {
|
|
2869
|
-
if (evt.type === "text") {
|
|
2870
|
-
outputText += evt.part.text;
|
|
2871
|
-
}
|
|
2872
|
-
}
|
|
2873
|
-
if (!outputText) {
|
|
2874
|
-
reject(
|
|
2875
|
-
new Error(
|
|
2876
|
-
`Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
|
|
2877
|
-
)
|
|
2878
|
-
);
|
|
2879
|
-
return;
|
|
2880
|
-
}
|
|
2881
|
-
let inputTokens = 0;
|
|
2882
|
-
let outputTokens = 0;
|
|
2883
|
-
let costUsd = 0;
|
|
2884
|
-
for (const { event: evt } of allEvents) {
|
|
2885
|
-
if (evt.type === "step_finish") {
|
|
2886
|
-
const sf = evt;
|
|
2887
|
-
inputTokens += sf.part.tokens.input;
|
|
2888
|
-
outputTokens += sf.part.tokens.output;
|
|
2889
|
-
costUsd += sf.part.cost;
|
|
2890
|
-
}
|
|
2891
|
-
}
|
|
2892
|
-
if (traceContext) {
|
|
2893
|
-
emitTraceEvent(
|
|
2894
|
-
{
|
|
2895
|
-
evalRunId: traceContext.evalRunId,
|
|
2896
|
-
scenarioId: traceContext.scenarioId,
|
|
2897
|
-
scenarioName: traceContext.scenarioName,
|
|
2898
|
-
targetId: traceContext.targetId,
|
|
2899
|
-
targetName: traceContext.targetName,
|
|
2900
|
-
stepNumber: traceStepNumber + 1,
|
|
2901
|
-
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
2902
|
-
outputPreview: "Scenario execution completed",
|
|
2903
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2904
|
-
isComplete: true
|
|
2905
|
-
},
|
|
2906
|
-
traceContext.tracePushUrl,
|
|
2907
|
-
traceContext.routeHeader,
|
|
2908
|
-
traceContext.authToken
|
|
2909
|
-
);
|
|
2910
|
-
}
|
|
2911
|
-
const modelStr = options.model || `${providerID}/${modelID}`;
|
|
2912
|
-
const llmTrace = buildLLMTrace(
|
|
2913
|
-
allEvents,
|
|
2914
|
-
totalDurationMs,
|
|
2915
|
-
modelStr,
|
|
2916
|
-
providerID,
|
|
2917
|
-
startTime
|
|
2918
|
-
);
|
|
2919
|
-
const conversation = buildConversation2(allEvents);
|
|
2920
2802
|
resolve2({
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
outputTokens,
|
|
2927
|
-
totalTokens: inputTokens + outputTokens
|
|
2928
|
-
},
|
|
2929
|
-
costUsd
|
|
2930
|
-
},
|
|
2931
|
-
llmTrace,
|
|
2932
|
-
conversation
|
|
2803
|
+
events,
|
|
2804
|
+
success,
|
|
2805
|
+
isIdleTimeout,
|
|
2806
|
+
error,
|
|
2807
|
+
finalStepNumber: traceStepNumber
|
|
2933
2808
|
});
|
|
2934
2809
|
};
|
|
2935
2810
|
let child;
|
|
2936
2811
|
try {
|
|
2937
2812
|
child = (0, import_child_process.spawn)("opencode", args, {
|
|
2938
|
-
cwd
|
|
2813
|
+
cwd,
|
|
2939
2814
|
env,
|
|
2940
2815
|
stdio: ["ignore", "pipe", "pipe"],
|
|
2941
2816
|
detached: true
|
|
2942
2817
|
});
|
|
2943
2818
|
} catch (spawnError) {
|
|
2944
|
-
|
|
2945
|
-
|
|
2819
|
+
resolve2({
|
|
2820
|
+
events: [],
|
|
2821
|
+
success: false,
|
|
2822
|
+
isIdleTimeout: false,
|
|
2823
|
+
error: new Error(
|
|
2946
2824
|
`Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
|
|
2947
|
-
)
|
|
2948
|
-
|
|
2825
|
+
),
|
|
2826
|
+
finalStepNumber: traceStepNumber
|
|
2827
|
+
});
|
|
2949
2828
|
return;
|
|
2950
2829
|
}
|
|
2951
2830
|
timers.timeout = setTimeout(() => {
|
|
2952
2831
|
if (!resolved) {
|
|
2953
|
-
console.error(`[OpenCode] Process timed out after ${
|
|
2832
|
+
console.error(`[OpenCode] Process timed out after ${sdkTimeoutMs}ms`);
|
|
2954
2833
|
killProcess(child, resolved);
|
|
2955
2834
|
finalize(
|
|
2835
|
+
false,
|
|
2956
2836
|
false,
|
|
2957
2837
|
new Error(
|
|
2958
|
-
`OpenCode execution timed out after ${
|
|
2838
|
+
`OpenCode execution timed out after ${sdkTimeoutMs}ms. Skills: ${skillNames}, Scenario: ${scenarioName}`
|
|
2959
2839
|
)
|
|
2960
2840
|
);
|
|
2961
2841
|
}
|
|
2962
|
-
},
|
|
2842
|
+
}, sdkTimeoutMs);
|
|
2963
2843
|
timers.idleCheck = setInterval(() => {
|
|
2964
2844
|
if (resolved) return;
|
|
2965
2845
|
const idleTime = Date.now() - lastOutputTime;
|
|
2966
|
-
|
|
2967
|
-
if (idleTime >= effectiveTimeout) {
|
|
2846
|
+
if (idleTime >= IDLE_TIMEOUT_MS) {
|
|
2968
2847
|
console.warn(
|
|
2969
|
-
`[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s
|
|
2848
|
+
`[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
|
|
2970
2849
|
);
|
|
2971
2850
|
killProcess(child, resolved);
|
|
2972
2851
|
finalize(
|
|
2973
2852
|
false,
|
|
2853
|
+
true,
|
|
2974
2854
|
new Error(
|
|
2975
|
-
`OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout
|
|
2855
|
+
`OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenarioName}`
|
|
2976
2856
|
)
|
|
2977
2857
|
);
|
|
2978
2858
|
}
|
|
@@ -3032,14 +2912,7 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3032
2912
|
if (!line.trim()) continue;
|
|
3033
2913
|
const evt = tryParseJson(line);
|
|
3034
2914
|
if (!evt || !evt.type) continue;
|
|
3035
|
-
|
|
3036
|
-
if (evt.type === "tool_use") {
|
|
3037
|
-
const tu = evt;
|
|
3038
|
-
const status = tu.part.state.status;
|
|
3039
|
-
isToolRunning = status !== "completed" && status !== "error";
|
|
3040
|
-
} else {
|
|
3041
|
-
isToolRunning = false;
|
|
3042
|
-
}
|
|
2915
|
+
events.push({ event: evt, receivedAt: Date.now() });
|
|
3043
2916
|
if (traceContext) {
|
|
3044
2917
|
traceStepNumber++;
|
|
3045
2918
|
const traceEvt = createTraceEventFromNdjson(
|
|
@@ -3084,16 +2957,17 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3084
2957
|
if (lineBuffer.trim()) {
|
|
3085
2958
|
const evt = tryParseJson(lineBuffer);
|
|
3086
2959
|
if (evt && evt.type) {
|
|
3087
|
-
|
|
2960
|
+
events.push({ event: evt, receivedAt: Date.now() });
|
|
3088
2961
|
}
|
|
3089
2962
|
}
|
|
3090
2963
|
console.log(
|
|
3091
|
-
`[executeWithOpenCode] Process exited with code ${code}, ${
|
|
2964
|
+
`[executeWithOpenCode] Process exited with code ${code}, ${events.length} events collected`
|
|
3092
2965
|
);
|
|
3093
2966
|
if (code === 0) {
|
|
3094
|
-
finalize(true);
|
|
2967
|
+
finalize(true, false);
|
|
3095
2968
|
} else {
|
|
3096
2969
|
finalize(
|
|
2970
|
+
false,
|
|
3097
2971
|
false,
|
|
3098
2972
|
new Error(
|
|
3099
2973
|
`OpenCode CLI exited with code ${code}.
|
|
@@ -3103,10 +2977,240 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3103
2977
|
}
|
|
3104
2978
|
});
|
|
3105
2979
|
child.on("error", (error) => {
|
|
3106
|
-
finalize(
|
|
2980
|
+
finalize(
|
|
2981
|
+
false,
|
|
2982
|
+
false,
|
|
2983
|
+
new Error(`OpenCode CLI spawn error: ${error.message}`)
|
|
2984
|
+
);
|
|
3107
2985
|
});
|
|
3108
2986
|
});
|
|
3109
2987
|
}
|
|
2988
|
+
/**
 * Runs a scenario through the `opencode` CLI and aggregates the outcome.
 *
 * Flow:
 *  1. Build the CLI environment (`buildOpenCodeEnv`) and, unless disabled,
 *     write the system prompt rule into the working directory.
 *  2. Spawn the CLI (`spawnOpenCodeProcess`). When an attempt ends with an
 *     idle timeout it is retried, up to `MAX_IDLE_RETRIES` attempts in total,
 *     each retry using a recovery prompt built from the events collected so
 *     far. Any other failure, or an idle timeout on the last attempt, is
 *     rethrown.
 *  3. Concatenate the text output and sum token usage / cost across the events
 *     of all attempts, then build the LLM trace and conversation.
 *
 * Live trace events are emitted only when `options.traceContext` is set.
 *
 * @param {Array<{name: string}>} skills - Skills in play; only `name` is read here (logging / error context).
 * @param {{id: *, name: string, triggerPrompt: string}} scenario - Scenario to run.
 * @param {object} options - Reads `cwd`, `model`, `temperature`, `maxTurns`
 *   (default 10), `aiGatewayUrl`, `aiGatewayHeaders`, `mcps`, `systemPrompt`
 *   (`undefined` selects the default prompt; `null` / `""` disable it) and
 *   `traceContext`.
 * @returns {Promise<{result: {outputText: string, durationMs: number, usage: {inputTokens: number, outputTokens: number, totalTokens: number}, costUsd: number}, llmTrace: *, conversation: *}>}
 * @throws {Error} When the CLI fails, when idle-timeout retries are exhausted,
 *   or when no text output was produced.
 */
async function executeWithOpenCode(skills, scenario, options) {
  const skillNames = skills.map((s) => s.name).join(", ");
  console.log("[executeWithOpenCode] Starting execution", {
    skillCount: skills.length,
    skillNames,
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    cwd: options.cwd,
    aiGatewayUrl: options.aiGatewayUrl,
    hasAiGatewayHeaders: !!options.aiGatewayHeaders,
    model: options.model
  });

  const startTime = new Date();
  const maxTurns = options.maxTurns ?? 10;
  // Per-attempt timeout: one minute per turn, never less than five minutes.
  // NOTE(review): each retry gets the full timeout again, so total wall-clock
  // time can reach MAX_IDLE_RETRIES times this value; confirm that is intended.
  const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);

  const { env, providerID, modelID } = await buildOpenCodeEnv({
    model: options.model,
    temperature: options.temperature,
    maxTurns,
    aiGatewayUrl: options.aiGatewayUrl,
    aiGatewayHeaders: options.aiGatewayHeaders,
    mcps: options.mcps,
    cwd: options.cwd
  });

  const traceContext = options.traceContext;
  const { DIAGNOSTIC, COMPLETION } = import_evalforge_types8.LiveTraceEventType;

  // Single place that wraps a payload in the run/scenario/target envelope.
  // No-op when tracing is not configured.
  const emitTrace = (stepNumber, type, outputPreview, isComplete) => {
    if (!traceContext) return;
    emitTraceEvent(
      {
        evalRunId: traceContext.evalRunId,
        scenarioId: traceContext.scenarioId,
        scenarioName: traceContext.scenarioName,
        targetId: traceContext.targetId,
        targetName: traceContext.targetName,
        stepNumber,
        type,
        outputPreview,
        timestamp: new Date().toISOString(),
        isComplete
      },
      traceContext.tracePushUrl,
      traceContext.routeHeader,
      traceContext.authToken
    );
  };

  emitTrace(
    0,
    DIAGNOSTIC,
    JSON.stringify({
      event: "pre-cli-execution",
      model: `${providerID}/${modelID}`,
      maxTurns,
      timestamp: new Date().toISOString()
    }),
    false
  );

  // `undefined` means "use the default"; null / "" are falsy and therefore
  // skip writing the rule file altogether.
  const systemPrompt = options.systemPrompt === undefined
    ? import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT
    : options.systemPrompt;
  if (systemPrompt) {
    await writeSystemPromptRule(options.cwd, systemPrompt);
  }

  const baseArgs = [
    "run",
    "--format",
    "json",
    "--thinking",
    "--variant",
    "high",
    "--model",
    `${providerID}/${modelID}`,
    "--dir",
    options.cwd
  ];

  const accumulatedEvents = [];
  let traceStepNumber = 0;

  for (let attempt = 1; attempt <= MAX_IDLE_RETRIES; attempt++) {
    const isRetry = attempt > 1;
    const prompt = isRetry
      ? buildRecoveryPrompt(scenario.triggerPrompt, accumulatedEvents)
      : scenario.triggerPrompt;

    if (isRetry) {
      console.log(
        `[OpenCode] Retry attempt ${attempt}/${MAX_IDLE_RETRIES} \u2014 starting fresh session with recovery context`
      );
      emitTrace(
        traceStepNumber + 1,
        DIAGNOSTIC,
        JSON.stringify({
          event: "idle-timeout-retry",
          attempt,
          maxRetries: MAX_IDLE_RETRIES,
          eventsFromPreviousAttempts: accumulatedEvents.length
        }),
        false
      );
    }

    const args = [...baseArgs, prompt];
    // Only the leading flags are logged; the prompt itself is the last element.
    console.log(
      `[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
      args.slice(0, 5)
    );

    const attemptResult = await spawnOpenCodeProcess({
      args,
      env,
      cwd: options.cwd,
      skillNames,
      scenarioName: scenario.name,
      sdkTimeoutMs,
      traceContext,
      initialStepNumber: traceStepNumber
    });

    // Appended one by one: spreading a very large array into push() can exceed
    // the engine's argument-count limit and throw a RangeError.
    for (const collected of attemptResult.events) {
      accumulatedEvents.push(collected);
    }
    traceStepNumber = attemptResult.finalStepNumber;

    if (attemptResult.success) {
      break;
    }

    const canRetry = attemptResult.isIdleTimeout && attempt < MAX_IDLE_RETRIES;
    if (!canRetry) {
      emitTrace(
        traceStepNumber + 1,
        DIAGNOSTIC,
        JSON.stringify({
          event: "cli-execution-failed",
          error: attemptResult.error?.message ?? "Unknown error",
          attempt,
          isIdleTimeout: attemptResult.isIdleTimeout
        }).slice(0, 2e3),
        true
      );
      throw attemptResult.error ?? new Error(
        `OpenCode CLI execution failed.
Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
      );
    }

    console.warn(
      `[OpenCode] Attempt ${attempt} failed due to idle timeout, will retry`
    );
  }

  const totalDurationMs = Date.now() - startTime.getTime();

  // Aggregate over the events of every attempt. Fields are read defensively:
  // a record missing `part` / `tokens` contributes nothing rather than
  // throwing after an otherwise successful run.
  let outputText = "";
  let inputTokens = 0;
  let outputTokens = 0;
  let costUsd = 0;
  for (const { event: evt } of accumulatedEvents) {
    if (evt.type === "text") {
      outputText += evt.part?.text ?? "";
    } else if (evt.type === "step_finish") {
      inputTokens += evt.part?.tokens?.input ?? 0;
      outputTokens += evt.part?.tokens?.output ?? 0;
      costUsd += evt.part?.cost ?? 0;
    }
  }

  if (!outputText) {
    // NOTE(review): no terminal trace event is emitted on this path, so a live
    // trace consumer never sees `isComplete: true` here; confirm that is acceptable.
    throw new Error(
      `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${accumulatedEvents.length}`
    );
  }

  emitTrace(
    traceStepNumber + 1,
    COMPLETION,
    "Scenario execution completed",
    true
  );

  const modelStr = options.model || `${providerID}/${modelID}`;
  const llmTrace = buildLLMTrace(
    accumulatedEvents,
    totalDurationMs,
    modelStr,
    providerID,
    startTime
  );
  const conversation = buildConversation2(accumulatedEvents);

  return {
    result: {
      outputText,
      durationMs: totalDurationMs,
      usage: {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens
      },
      costUsd
    },
    llmTrace,
    conversation
  };
}
|
|
3110
3214
|
|
|
3111
3215
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
3112
3216
|
var OpenCodeAdapter = class {
|
|
@@ -4465,13 +4569,14 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4465
4569
|
infrastructurePaths
|
|
4466
4570
|
);
|
|
4467
4571
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4572
|
+
const resolvedModelConfig = agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
4468
4573
|
return {
|
|
4469
4574
|
id: (0, import_crypto4.randomUUID)(),
|
|
4470
4575
|
targetId,
|
|
4471
4576
|
targetName,
|
|
4472
4577
|
scenarioId: scenario.id,
|
|
4473
4578
|
scenarioName: scenario.name,
|
|
4474
|
-
modelConfig:
|
|
4579
|
+
modelConfig: resolvedModelConfig,
|
|
4475
4580
|
duration: durationMs,
|
|
4476
4581
|
outputText,
|
|
4477
4582
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|