@wix/evalforge-evaluator 0.121.0 → 0.123.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +311 -196
- package/build/index.js.map +2 -2
- package/build/index.mjs +311 -196
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/opencode/execute.d.ts +8 -3
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -2607,6 +2607,7 @@ var import_path10 = require("path");
|
|
|
2607
2607
|
var KILL_GRACE_PERIOD_MS = 5e3;
|
|
2608
2608
|
var IDLE_TIMEOUT_MS = 12e4;
|
|
2609
2609
|
var IDLE_CHECK_INTERVAL_MS = 15e3;
|
|
2610
|
+
var MAX_IDLE_RETRIES = 3;
|
|
2610
2611
|
function extractToolAction(toolName, args) {
|
|
2611
2612
|
if (!toolName) return "Using tool...";
|
|
2612
2613
|
if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
|
|
@@ -2732,232 +2733,113 @@ function killProcess(child, resolved) {
|
|
|
2732
2733
|
}
|
|
2733
2734
|
}, KILL_GRACE_PERIOD_MS);
|
|
2734
2735
|
}
|
|
2735
|
-
|
|
2736
|
-
const
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2743
|
-
|
|
2744
|
-
|
|
2745
|
-
|
|
2746
|
-
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
model: options.model,
|
|
2752
|
-
temperature: options.temperature,
|
|
2753
|
-
maxTurns,
|
|
2754
|
-
aiGatewayUrl: options.aiGatewayUrl,
|
|
2755
|
-
aiGatewayHeaders: options.aiGatewayHeaders,
|
|
2756
|
-
mcps: options.mcps,
|
|
2757
|
-
cwd: options.cwd
|
|
2758
|
-
});
|
|
2759
|
-
const traceContext = options.traceContext;
|
|
2760
|
-
let traceStepNumber = 0;
|
|
2761
|
-
let lastAction = "Starting...";
|
|
2762
|
-
let lastToolName;
|
|
2763
|
-
let lastFilePath;
|
|
2764
|
-
if (traceContext) {
|
|
2765
|
-
emitTraceEvent(
|
|
2766
|
-
{
|
|
2767
|
-
evalRunId: traceContext.evalRunId,
|
|
2768
|
-
scenarioId: traceContext.scenarioId,
|
|
2769
|
-
scenarioName: traceContext.scenarioName,
|
|
2770
|
-
targetId: traceContext.targetId,
|
|
2771
|
-
targetName: traceContext.targetName,
|
|
2772
|
-
stepNumber: 0,
|
|
2773
|
-
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
2774
|
-
outputPreview: JSON.stringify({
|
|
2775
|
-
event: "pre-cli-execution",
|
|
2776
|
-
model: `${providerID}/${modelID}`,
|
|
2777
|
-
maxTurns,
|
|
2778
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2779
|
-
}),
|
|
2780
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2781
|
-
isComplete: false
|
|
2782
|
-
},
|
|
2783
|
-
traceContext.tracePushUrl,
|
|
2784
|
-
traceContext.routeHeader,
|
|
2785
|
-
traceContext.authToken
|
|
2786
|
-
);
|
|
2787
|
-
}
|
|
2788
|
-
let systemPrompt;
|
|
2789
|
-
if (options.systemPrompt === null || options.systemPrompt === "") {
|
|
2790
|
-
} else if (options.systemPrompt != null) {
|
|
2791
|
-
systemPrompt = options.systemPrompt;
|
|
2792
|
-
} else {
|
|
2793
|
-
systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
2794
|
-
}
|
|
2795
|
-
if (systemPrompt) {
|
|
2796
|
-
await writeSystemPromptRule(options.cwd, systemPrompt);
|
|
2736
|
+
/**
 * Produces a human-readable bullet list of the tool invocations found in a
 * list of collected events.
 *
 * Each element of `events` is a wrapper object whose `event` property holds
 * the parsed event. Only events whose `type` is "tool_use" contribute a line:
 *   - `- <tool>: <path>` when the tool input carries `file_path`, `path` or
 *     `target_file` (checked in that order, first truthy value wins);
 *   - `- <tool>: <command>` (command truncated to 80 characters) when the
 *     input carries a `command`;
 *   - `- <tool>` otherwise.
 *
 * @param {Array<{event: object}>} events Collected event wrappers.
 * @returns {string} One line per tool invocation joined with "\n", or the
 *   placeholder "(no tool actions recorded)" when there were none.
 */
function summarizeToolActions(events) {
  const actions = [];
  for (const { event: evt } of events) {
    if (evt.type !== "tool_use") continue;
    // The events are parsed from external process output and only `type` is
    // validated upstream, so `part` / `state` may be absent (for example a
    // tool call that never reached a state carrying input). Guard the access
    // so a partial event degrades to a bare tool line instead of throwing.
    const tool = evt.part?.tool ?? "unknown";
    const input = evt.part?.state?.input;
    const filePath = input?.file_path || input?.path || input?.target_file;
    if (filePath) {
      actions.push(`- ${tool}: ${String(filePath)}`);
    } else if (input?.command) {
      // Keep long shell commands from dominating the summary.
      actions.push(`- ${tool}: ${String(input.command).slice(0, 80)}`);
    } else {
      actions.push(`- ${tool}`);
    }
  }
  return actions.length > 0 ? actions.join("\n") : "(no tool actions recorded)";
}
|
|
2755
|
+
/**
 * Composes the prompt used to restart a task in a fresh session after an
 * earlier session was interrupted.
 *
 * The prompt restates the original task, lists the tool actions recorded in
 * `events` (as rendered by `summarizeToolActions`) and instructs the agent to
 * verify the filesystem and continue rather than redo finished work.
 *
 * @param {string} originalPrompt The prompt the interrupted session was given.
 * @param {Array<{event: object}>} events Event wrappers collected so far.
 * @returns {string} The multi-line recovery prompt.
 */
function buildRecoveryPrompt(originalPrompt, events) {
  const completedActions = summarizeToolActions(events);
  const sections = [
    "You are continuing a task that was interrupted due to a session error.",
    "",
    "ORIGINAL TASK:",
    // Interpolate explicitly: Array.prototype.join would render a
    // null/undefined element as an empty string instead of its text form.
    `${originalPrompt}`,
    "",
    "ACTIONS ALREADY COMPLETED IN THE PREVIOUS SESSION:",
    `${completedActions}`,
    "",
    "INSTRUCTIONS:",
    "1. Review the actions listed above that were already completed in the previous session",
    "2. Check the filesystem to verify what was already done",
    "3. Continue with any remaining work needed to fulfill the original task",
    "4. Do NOT redo work that is already done \u2014 only continue from where the previous session left off"
  ];
  return sections.join("\n");
}
|
|
2771
|
+
function spawnOpenCodeProcess(opts) {
|
|
2772
|
+
const {
|
|
2773
|
+
args,
|
|
2774
|
+
env,
|
|
2775
|
+
cwd,
|
|
2776
|
+
skillNames,
|
|
2777
|
+
scenarioName,
|
|
2778
|
+
sdkTimeoutMs,
|
|
2779
|
+
traceContext,
|
|
2780
|
+
initialStepNumber
|
|
2781
|
+
} = opts;
|
|
2782
|
+
return new Promise((resolve2) => {
|
|
2817
2783
|
let resolved = false;
|
|
2818
2784
|
let stderr = "";
|
|
2819
2785
|
let lineBuffer = "";
|
|
2820
2786
|
let lastOutputTime = Date.now();
|
|
2821
|
-
|
|
2787
|
+
let traceStepNumber = initialStepNumber;
|
|
2788
|
+
let lastAction = "Starting...";
|
|
2789
|
+
let lastToolName;
|
|
2790
|
+
let lastFilePath;
|
|
2791
|
+
const events = [];
|
|
2822
2792
|
const timers = {};
|
|
2823
2793
|
const cleanup = () => {
|
|
2824
2794
|
if (timers.timeout) clearTimeout(timers.timeout);
|
|
2825
2795
|
if (timers.idleCheck) clearInterval(timers.idleCheck);
|
|
2826
2796
|
if (timers.heartbeat) clearInterval(timers.heartbeat);
|
|
2827
2797
|
};
|
|
2828
|
-
const finalize = (success, error) => {
|
|
2798
|
+
const finalize = (success, isIdleTimeout, error) => {
|
|
2829
2799
|
if (resolved) return;
|
|
2830
2800
|
resolved = true;
|
|
2831
2801
|
cleanup();
|
|
2832
|
-
if (!success) {
|
|
2833
|
-
if (traceContext) {
|
|
2834
|
-
emitTraceEvent(
|
|
2835
|
-
{
|
|
2836
|
-
evalRunId: traceContext.evalRunId,
|
|
2837
|
-
scenarioId: traceContext.scenarioId,
|
|
2838
|
-
scenarioName: traceContext.scenarioName,
|
|
2839
|
-
targetId: traceContext.targetId,
|
|
2840
|
-
targetName: traceContext.targetName,
|
|
2841
|
-
stepNumber: traceStepNumber + 1,
|
|
2842
|
-
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
2843
|
-
outputPreview: JSON.stringify({
|
|
2844
|
-
event: "cli-execution-failed",
|
|
2845
|
-
error: error?.message ?? "Unknown error"
|
|
2846
|
-
}).slice(0, 2e3),
|
|
2847
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2848
|
-
isComplete: true
|
|
2849
|
-
},
|
|
2850
|
-
traceContext.tracePushUrl,
|
|
2851
|
-
traceContext.routeHeader,
|
|
2852
|
-
traceContext.authToken
|
|
2853
|
-
);
|
|
2854
|
-
}
|
|
2855
|
-
reject(
|
|
2856
|
-
error ?? new Error(
|
|
2857
|
-
`OpenCode CLI execution failed (exit code unknown).
|
|
2858
|
-
Stderr: ${stderr.slice(0, 1e3)}`
|
|
2859
|
-
)
|
|
2860
|
-
);
|
|
2861
|
-
return;
|
|
2862
|
-
}
|
|
2863
|
-
const endTime = /* @__PURE__ */ new Date();
|
|
2864
|
-
const totalDurationMs = endTime.getTime() - startTime.getTime();
|
|
2865
|
-
let outputText = "";
|
|
2866
|
-
for (const { event: evt } of allEvents) {
|
|
2867
|
-
if (evt.type === "text") {
|
|
2868
|
-
outputText += evt.part.text;
|
|
2869
|
-
}
|
|
2870
|
-
}
|
|
2871
|
-
if (!outputText) {
|
|
2872
|
-
reject(
|
|
2873
|
-
new Error(
|
|
2874
|
-
`Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
|
|
2875
|
-
)
|
|
2876
|
-
);
|
|
2877
|
-
return;
|
|
2878
|
-
}
|
|
2879
|
-
let inputTokens = 0;
|
|
2880
|
-
let outputTokens = 0;
|
|
2881
|
-
let costUsd = 0;
|
|
2882
|
-
for (const { event: evt } of allEvents) {
|
|
2883
|
-
if (evt.type === "step_finish") {
|
|
2884
|
-
const sf = evt;
|
|
2885
|
-
inputTokens += sf.part.tokens.input;
|
|
2886
|
-
outputTokens += sf.part.tokens.output;
|
|
2887
|
-
costUsd += sf.part.cost;
|
|
2888
|
-
}
|
|
2889
|
-
}
|
|
2890
|
-
if (traceContext) {
|
|
2891
|
-
emitTraceEvent(
|
|
2892
|
-
{
|
|
2893
|
-
evalRunId: traceContext.evalRunId,
|
|
2894
|
-
scenarioId: traceContext.scenarioId,
|
|
2895
|
-
scenarioName: traceContext.scenarioName,
|
|
2896
|
-
targetId: traceContext.targetId,
|
|
2897
|
-
targetName: traceContext.targetName,
|
|
2898
|
-
stepNumber: traceStepNumber + 1,
|
|
2899
|
-
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
2900
|
-
outputPreview: "Scenario execution completed",
|
|
2901
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2902
|
-
isComplete: true
|
|
2903
|
-
},
|
|
2904
|
-
traceContext.tracePushUrl,
|
|
2905
|
-
traceContext.routeHeader,
|
|
2906
|
-
traceContext.authToken
|
|
2907
|
-
);
|
|
2908
|
-
}
|
|
2909
|
-
const modelStr = options.model || `${providerID}/${modelID}`;
|
|
2910
|
-
const llmTrace = buildLLMTrace(
|
|
2911
|
-
allEvents,
|
|
2912
|
-
totalDurationMs,
|
|
2913
|
-
modelStr,
|
|
2914
|
-
providerID,
|
|
2915
|
-
startTime
|
|
2916
|
-
);
|
|
2917
|
-
const conversation = buildConversation2(allEvents);
|
|
2918
2802
|
resolve2({
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
outputTokens,
|
|
2925
|
-
totalTokens: inputTokens + outputTokens
|
|
2926
|
-
},
|
|
2927
|
-
costUsd
|
|
2928
|
-
},
|
|
2929
|
-
llmTrace,
|
|
2930
|
-
conversation
|
|
2803
|
+
events,
|
|
2804
|
+
success,
|
|
2805
|
+
isIdleTimeout,
|
|
2806
|
+
error,
|
|
2807
|
+
finalStepNumber: traceStepNumber
|
|
2931
2808
|
});
|
|
2932
2809
|
};
|
|
2933
2810
|
let child;
|
|
2934
2811
|
try {
|
|
2935
2812
|
child = (0, import_child_process.spawn)("opencode", args, {
|
|
2936
|
-
cwd
|
|
2813
|
+
cwd,
|
|
2937
2814
|
env,
|
|
2938
2815
|
stdio: ["ignore", "pipe", "pipe"],
|
|
2939
2816
|
detached: true
|
|
2940
2817
|
});
|
|
2941
2818
|
} catch (spawnError) {
|
|
2942
|
-
|
|
2943
|
-
|
|
2819
|
+
resolve2({
|
|
2820
|
+
events: [],
|
|
2821
|
+
success: false,
|
|
2822
|
+
isIdleTimeout: false,
|
|
2823
|
+
error: new Error(
|
|
2944
2824
|
`Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
|
|
2945
|
-
)
|
|
2946
|
-
|
|
2825
|
+
),
|
|
2826
|
+
finalStepNumber: traceStepNumber
|
|
2827
|
+
});
|
|
2947
2828
|
return;
|
|
2948
2829
|
}
|
|
2949
2830
|
timers.timeout = setTimeout(() => {
|
|
2950
2831
|
if (!resolved) {
|
|
2951
|
-
console.error(`[OpenCode] Process timed out after ${
|
|
2832
|
+
console.error(`[OpenCode] Process timed out after ${sdkTimeoutMs}ms`);
|
|
2952
2833
|
killProcess(child, resolved);
|
|
2953
2834
|
finalize(
|
|
2835
|
+
false,
|
|
2954
2836
|
false,
|
|
2955
2837
|
new Error(
|
|
2956
|
-
`OpenCode execution timed out after ${
|
|
2838
|
+
`OpenCode execution timed out after ${sdkTimeoutMs}ms. Skills: ${skillNames}, Scenario: ${scenarioName}`
|
|
2957
2839
|
)
|
|
2958
2840
|
);
|
|
2959
2841
|
}
|
|
2960
|
-
},
|
|
2842
|
+
}, sdkTimeoutMs);
|
|
2961
2843
|
timers.idleCheck = setInterval(() => {
|
|
2962
2844
|
if (resolved) return;
|
|
2963
2845
|
const idleTime = Date.now() - lastOutputTime;
|
|
@@ -2968,8 +2850,9 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
2968
2850
|
killProcess(child, resolved);
|
|
2969
2851
|
finalize(
|
|
2970
2852
|
false,
|
|
2853
|
+
true,
|
|
2971
2854
|
new Error(
|
|
2972
|
-
`OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${
|
|
2855
|
+
`OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenarioName}`
|
|
2973
2856
|
)
|
|
2974
2857
|
);
|
|
2975
2858
|
}
|
|
@@ -3029,7 +2912,7 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3029
2912
|
if (!line.trim()) continue;
|
|
3030
2913
|
const evt = tryParseJson(line);
|
|
3031
2914
|
if (!evt || !evt.type) continue;
|
|
3032
|
-
|
|
2915
|
+
events.push({ event: evt, receivedAt: Date.now() });
|
|
3033
2916
|
if (traceContext) {
|
|
3034
2917
|
traceStepNumber++;
|
|
3035
2918
|
const traceEvt = createTraceEventFromNdjson(
|
|
@@ -3074,16 +2957,17 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3074
2957
|
if (lineBuffer.trim()) {
|
|
3075
2958
|
const evt = tryParseJson(lineBuffer);
|
|
3076
2959
|
if (evt && evt.type) {
|
|
3077
|
-
|
|
2960
|
+
events.push({ event: evt, receivedAt: Date.now() });
|
|
3078
2961
|
}
|
|
3079
2962
|
}
|
|
3080
2963
|
console.log(
|
|
3081
|
-
`[executeWithOpenCode] Process exited with code ${code}, ${
|
|
2964
|
+
`[executeWithOpenCode] Process exited with code ${code}, ${events.length} events collected`
|
|
3082
2965
|
);
|
|
3083
2966
|
if (code === 0) {
|
|
3084
|
-
finalize(true);
|
|
2967
|
+
finalize(true, false);
|
|
3085
2968
|
} else {
|
|
3086
2969
|
finalize(
|
|
2970
|
+
false,
|
|
3087
2971
|
false,
|
|
3088
2972
|
new Error(
|
|
3089
2973
|
`OpenCode CLI exited with code ${code}.
|
|
@@ -3093,10 +2977,240 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3093
2977
|
}
|
|
3094
2978
|
});
|
|
3095
2979
|
child.on("error", (error) => {
|
|
3096
|
-
finalize(
|
|
2980
|
+
finalize(
|
|
2981
|
+
false,
|
|
2982
|
+
false,
|
|
2983
|
+
new Error(`OpenCode CLI spawn error: ${error.message}`)
|
|
2984
|
+
);
|
|
3097
2985
|
});
|
|
3098
2986
|
});
|
|
3099
2987
|
}
|
|
2988
|
+
/**
 * Runs a scenario through the `opencode` CLI and returns the aggregated result.
 *
 * Flow:
 *   1. Build the CLI environment via `buildOpenCodeEnv` and, when a system
 *      prompt applies, write it with `writeSystemPromptRule`.
 *   2. Spawn the CLI via `spawnOpenCodeProcess`. When an attempt ends with an
 *      idle timeout, a fresh attempt is started with a recovery prompt built
 *      from the events collected so far. At most `MAX_IDLE_RETRIES` attempts
 *      are made in total (the constant bounds attempts, not extra retries).
 *      Any other failure, or an idle timeout on the last attempt, is thrown.
 *   3. Aggregate text output, token usage and cost across the events of all
 *      attempts and build the LLM trace and conversation from them.
 *
 * When `options.traceContext` is set, diagnostic/completion trace events are
 * pushed with `emitTraceEvent` along the way (not awaited, as before).
 *
 * @param {Array<{name: string}>} skills Skills in play; only `name` is read here (for logs/errors).
 * @param {{id: *, name: string, triggerPrompt: string}} scenario Scenario to run.
 * @param {object} options Reads `cwd`, `model`, `temperature`, `maxTurns`
 *   (defaults to 10), `aiGatewayUrl`, `aiGatewayHeaders`, `mcps`,
 *   `systemPrompt` and `traceContext`.
 * @returns {Promise<{result: {outputText: string, durationMs: number, usage: {inputTokens: number, outputTokens: number, totalTokens: number}, costUsd: number}, llmTrace: *, conversation: *}>}
 * @throws {Error} The error reported by the failed attempt, or an error when
 *   the agent produced no text output.
 */
async function executeWithOpenCode(skills, scenario, options) {
  const skillNames = skills.map((s) => s.name).join(", ");
  console.log("[executeWithOpenCode] Starting execution", {
    skillCount: skills.length,
    skillNames,
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    cwd: options.cwd,
    aiGatewayUrl: options.aiGatewayUrl,
    hasAiGatewayHeaders: !!options.aiGatewayHeaders,
    model: options.model
  });
  const startTime = new Date();
  const maxTurns = options.maxTurns ?? 10;
  // Overall per-attempt timeout: one minute per turn, never less than five minutes.
  const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
  const { env, providerID, modelID } = await buildOpenCodeEnv({
    model: options.model,
    temperature: options.temperature,
    maxTurns,
    aiGatewayUrl: options.aiGatewayUrl,
    aiGatewayHeaders: options.aiGatewayHeaders,
    mcps: options.mcps,
    cwd: options.cwd
  });
  const traceContext = options.traceContext;

  // Single place that wraps event-specific fields in the shared trace envelope.
  // `buildFields` is invoked lazily so nothing is computed when tracing is off.
  const pushTrace = (buildFields) => {
    if (!traceContext) return;
    const { stepNumber, type, outputPreview, isComplete } = buildFields();
    emitTraceEvent(
      {
        evalRunId: traceContext.evalRunId,
        scenarioId: traceContext.scenarioId,
        scenarioName: traceContext.scenarioName,
        targetId: traceContext.targetId,
        targetName: traceContext.targetName,
        stepNumber,
        type,
        outputPreview,
        timestamp: new Date().toISOString(),
        isComplete
      },
      traceContext.tracePushUrl,
      traceContext.routeHeader,
      traceContext.authToken
    );
  };

  pushTrace(() => ({
    stepNumber: 0,
    type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
    outputPreview: JSON.stringify({
      event: "pre-cli-execution",
      model: `${providerID}/${modelID}`,
      maxTurns,
      timestamp: new Date().toISOString()
    }),
    isComplete: false
  }));

  // `undefined` selects the default prompt; `null` / "" explicitly disable the
  // system prompt; any other value is used as given (and written only if truthy).
  const systemPrompt = options.systemPrompt === void 0 ? import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT : options.systemPrompt;
  if (systemPrompt) {
    await writeSystemPromptRule(options.cwd, systemPrompt);
  }

  const baseArgs = [
    "run",
    "--format",
    "json",
    "--thinking",
    "--variant",
    "high",
    "--model",
    `${providerID}/${modelID}`,
    "--dir",
    options.cwd
  ];
  const accumulatedEvents = [];
  let traceStepNumber = 0;

  for (let attempt = 1; attempt <= MAX_IDLE_RETRIES; attempt++) {
    const isRetry = attempt > 1;
    const prompt = isRetry ? buildRecoveryPrompt(scenario.triggerPrompt, accumulatedEvents) : scenario.triggerPrompt;
    if (isRetry) {
      console.log(
        `[OpenCode] Retry attempt ${attempt}/${MAX_IDLE_RETRIES} \u2014 starting fresh session with recovery context`
      );
      pushTrace(() => ({
        stepNumber: traceStepNumber + 1,
        type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
        outputPreview: JSON.stringify({
          event: "idle-timeout-retry",
          attempt,
          maxRetries: MAX_IDLE_RETRIES,
          eventsFromPreviousAttempts: accumulatedEvents.length
        }),
        isComplete: false
      }));
    }
    const args = [...baseArgs, prompt];
    console.log(
      `[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
      args.slice(0, 5)
    );
    const attemptResult = await spawnOpenCodeProcess({
      args,
      env,
      cwd: options.cwd,
      skillNames,
      scenarioName: scenario.name,
      sdkTimeoutMs,
      traceContext,
      initialStepNumber: traceStepNumber
    });
    // Events of failed attempts are kept: they feed both the recovery prompt
    // and the final aggregation.
    accumulatedEvents.push(...attemptResult.events);
    traceStepNumber = attemptResult.finalStepNumber;
    if (attemptResult.success) {
      break;
    }
    // Only idle timeouts are retried, and only while attempts remain.
    if (!attemptResult.isIdleTimeout || attempt >= MAX_IDLE_RETRIES) {
      pushTrace(() => ({
        stepNumber: traceStepNumber + 1,
        type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
        outputPreview: JSON.stringify({
          event: "cli-execution-failed",
          error: attemptResult.error?.message ?? "Unknown error",
          attempt,
          isIdleTimeout: attemptResult.isIdleTimeout
        }).slice(0, 2e3),
        isComplete: true
      }));
      throw attemptResult.error ?? new Error(
        `OpenCode CLI execution failed.
Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
      );
    }
    console.warn(
      `[OpenCode] Attempt ${attempt} failed due to idle timeout, will retry`
    );
  }

  const endTime = new Date();
  const totalDurationMs = endTime.getTime() - startTime.getTime();

  // Single pass over the events: concatenate text parts and sum usage/cost.
  // Events come from parsed CLI output, so tolerate missing fields rather than
  // appending "undefined" to the output or turning the totals into NaN.
  let outputText = "";
  let inputTokens = 0;
  let outputTokens = 0;
  let costUsd = 0;
  for (const { event: evt } of accumulatedEvents) {
    if (evt.type === "text") {
      outputText += evt.part?.text ?? "";
    } else if (evt.type === "step_finish") {
      inputTokens += evt.part?.tokens?.input ?? 0;
      outputTokens += evt.part?.tokens?.output ?? 0;
      costUsd += evt.part?.cost ?? 0;
    }
  }
  if (!outputText) {
    throw new Error(
      `Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${accumulatedEvents.length}`
    );
  }

  pushTrace(() => ({
    stepNumber: traceStepNumber + 1,
    type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
    outputPreview: "Scenario execution completed",
    isComplete: true
  }));

  const modelStr = options.model || `${providerID}/${modelID}`;
  const llmTrace = buildLLMTrace(
    accumulatedEvents,
    totalDurationMs,
    modelStr,
    providerID,
    startTime
  );
  const conversation = buildConversation2(accumulatedEvents);
  return {
    result: {
      outputText,
      durationMs: totalDurationMs,
      usage: {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens
      },
      costUsd
    },
    llmTrace,
    conversation
  };
}
|
|
3100
3214
|
|
|
3101
3215
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
3102
3216
|
var OpenCodeAdapter = class {
|
|
@@ -4455,13 +4569,14 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4455
4569
|
infrastructurePaths
|
|
4456
4570
|
);
|
|
4457
4571
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4572
|
+
const resolvedModelConfig = agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
4458
4573
|
return {
|
|
4459
4574
|
id: (0, import_crypto4.randomUUID)(),
|
|
4460
4575
|
targetId,
|
|
4461
4576
|
targetName,
|
|
4462
4577
|
scenarioId: scenario.id,
|
|
4463
4578
|
scenarioName: scenario.name,
|
|
4464
|
-
modelConfig:
|
|
4579
|
+
modelConfig: resolvedModelConfig,
|
|
4465
4580
|
duration: durationMs,
|
|
4466
4581
|
outputText,
|
|
4467
4582
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|