@wix/evalforge-evaluator 0.117.0 → 0.119.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +493 -483
- package/build/index.js.map +4 -4
- package/build/index.mjs +492 -483
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/opencode/build-conversation.d.ts +7 -4
- package/build/types/run-scenario/agents/opencode/build-trace.d.ts +6 -9
- package/build/types/run-scenario/agents/opencode/config.d.ts +5 -11
- package/build/types/run-scenario/agents/opencode/execute.d.ts +3 -4
- package/build/types/run-scenario/agents/opencode/index.d.ts +1 -1
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +2 -3
- package/build/types/run-scenario/agents/opencode/types.d.ts +51 -6
- package/package.json +7 -8
package/build/index.js
CHANGED
|
@@ -1200,10 +1200,10 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1200
1200
|
};
|
|
1201
1201
|
}
|
|
1202
1202
|
async function prepareClaudeCodeEnvironment(cwd, skills, options) {
|
|
1203
|
-
const { mkdir: mkdirAsync, writeFile:
|
|
1203
|
+
const { mkdir: mkdirAsync, writeFile: writeFile7 } = await import("fs/promises");
|
|
1204
1204
|
const claudeDir = `${cwd}/.claude`;
|
|
1205
1205
|
await mkdirAsync(claudeDir, { recursive: true });
|
|
1206
|
-
await
|
|
1206
|
+
await writeFile7(`${claudeDir}/settings.json`, "{}", {
|
|
1207
1207
|
flag: "wx"
|
|
1208
1208
|
}).catch(() => {
|
|
1209
1209
|
});
|
|
@@ -2162,9 +2162,18 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
2162
2162
|
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
2163
2163
|
|
|
2164
2164
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2165
|
-
var
|
|
2165
|
+
var import_child_process = require("child_process");
|
|
2166
2166
|
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
2167
2167
|
|
|
2168
|
+
// src/run-scenario/agents/opencode/types.ts
|
|
2169
|
+
function tryParseJson(text) {
|
|
2170
|
+
try {
|
|
2171
|
+
return JSON.parse(text);
|
|
2172
|
+
} catch {
|
|
2173
|
+
return null;
|
|
2174
|
+
}
|
|
2175
|
+
}
|
|
2176
|
+
|
|
2168
2177
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
2169
2178
|
var import_promises7 = require("fs/promises");
|
|
2170
2179
|
var import_path8 = require("path");
|
|
@@ -2260,6 +2269,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
|
|
|
2260
2269
|
}
|
|
2261
2270
|
|
|
2262
2271
|
// src/run-scenario/agents/opencode/config.ts
|
|
2272
|
+
var import_os3 = require("os");
|
|
2263
2273
|
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
2264
2274
|
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2265
2275
|
function parseModel(model) {
|
|
@@ -2312,7 +2322,14 @@ function toOpenCodeMcpConfig(servers) {
|
|
|
2312
2322
|
}
|
|
2313
2323
|
return result;
|
|
2314
2324
|
}
|
|
2315
|
-
|
|
2325
|
+
function ensureOpenCodeInPath(currentPath) {
|
|
2326
|
+
const opencodeBin = `${(0, import_os3.homedir)()}/.opencode/bin`;
|
|
2327
|
+
if (currentPath.includes(opencodeBin)) {
|
|
2328
|
+
return currentPath;
|
|
2329
|
+
}
|
|
2330
|
+
return `${opencodeBin}:${currentPath}`;
|
|
2331
|
+
}
|
|
2332
|
+
async function buildOpenCodeEnv(options) {
|
|
2316
2333
|
const modelStr = options.model || DEFAULT_MODEL2;
|
|
2317
2334
|
const { providerID, modelID } = parseModel(modelStr);
|
|
2318
2335
|
const provider = {};
|
|
@@ -2324,9 +2341,7 @@ async function buildOpenCodeConfig(options) {
|
|
|
2324
2341
|
if (options.aiGatewayHeaders) {
|
|
2325
2342
|
providerOptions.headers = { ...options.aiGatewayHeaders };
|
|
2326
2343
|
}
|
|
2327
|
-
provider[providerID] = {
|
|
2328
|
-
options: providerOptions
|
|
2329
|
-
};
|
|
2344
|
+
provider[providerID] = { options: providerOptions };
|
|
2330
2345
|
}
|
|
2331
2346
|
let mcp;
|
|
2332
2347
|
if (options.mcps && options.mcps.length > 0) {
|
|
@@ -2367,70 +2382,81 @@ async function buildOpenCodeConfig(options) {
|
|
|
2367
2382
|
},
|
|
2368
2383
|
...mcp ? { mcp } : {}
|
|
2369
2384
|
};
|
|
2370
|
-
|
|
2385
|
+
const env = {
|
|
2386
|
+
...process.env,
|
|
2387
|
+
PATH: ensureOpenCodeInPath(process.env.PATH || ""),
|
|
2388
|
+
OPENCODE_CONFIG_CONTENT: JSON.stringify(config),
|
|
2389
|
+
OPENCODE_DISABLE_LSP_DOWNLOAD: "true"
|
|
2390
|
+
};
|
|
2391
|
+
return { env, providerID, modelID };
|
|
2371
2392
|
}
|
|
2372
2393
|
|
|
2373
2394
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
2374
2395
|
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2375
2396
|
var import_crypto2 = require("crypto");
|
|
2376
|
-
function buildLLMTrace(
|
|
2377
|
-
const
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
}
|
|
2409
|
-
|
|
2410
|
-
const sf = part;
|
|
2411
|
-
stepInputTokens += sf.tokens.input;
|
|
2412
|
-
stepOutputTokens += sf.tokens.output;
|
|
2413
|
-
stepCost += sf.cost;
|
|
2414
|
-
finishReason = sf.reason;
|
|
2415
|
-
break;
|
|
2416
|
-
}
|
|
2397
|
+
function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, executionStartTime) {
|
|
2398
|
+
const turns = [];
|
|
2399
|
+
let current = {
|
|
2400
|
+
textParts: [],
|
|
2401
|
+
reasoningParts: [],
|
|
2402
|
+
toolCalls: []
|
|
2403
|
+
};
|
|
2404
|
+
for (const { event: evt, receivedAt } of timestampedEvents) {
|
|
2405
|
+
switch (evt.type) {
|
|
2406
|
+
case "text":
|
|
2407
|
+
current.textParts.push(evt.part.text);
|
|
2408
|
+
break;
|
|
2409
|
+
case "reasoning":
|
|
2410
|
+
current.reasoningParts.push(evt.part.text);
|
|
2411
|
+
break;
|
|
2412
|
+
case "tool_use": {
|
|
2413
|
+
const tu = evt;
|
|
2414
|
+
current.toolCalls.push({
|
|
2415
|
+
toolName: tu.part.tool,
|
|
2416
|
+
args: tu.part.state.input
|
|
2417
|
+
});
|
|
2418
|
+
break;
|
|
2419
|
+
}
|
|
2420
|
+
case "step_finish": {
|
|
2421
|
+
const sf = evt;
|
|
2422
|
+
current.stepFinish = sf.part;
|
|
2423
|
+
current.receivedAt = receivedAt;
|
|
2424
|
+
turns.push(current);
|
|
2425
|
+
current = {
|
|
2426
|
+
textParts: [],
|
|
2427
|
+
reasoningParts: [],
|
|
2428
|
+
toolCalls: []
|
|
2429
|
+
};
|
|
2430
|
+
break;
|
|
2417
2431
|
}
|
|
2418
2432
|
}
|
|
2419
|
-
|
|
2420
|
-
|
|
2421
|
-
|
|
2422
|
-
|
|
2423
|
-
}
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2427
|
-
|
|
2428
|
-
const
|
|
2429
|
-
const
|
|
2430
|
-
const
|
|
2431
|
-
const
|
|
2433
|
+
}
|
|
2434
|
+
if (current.textParts.length > 0 || current.reasoningParts.length > 0 || current.toolCalls.length > 0) {
|
|
2435
|
+
if (timestampedEvents.length > 0) {
|
|
2436
|
+
current.receivedAt = timestampedEvents[timestampedEvents.length - 1].receivedAt;
|
|
2437
|
+
}
|
|
2438
|
+
turns.push(current);
|
|
2439
|
+
}
|
|
2440
|
+
const executionStartMs = executionStartTime.getTime();
|
|
2441
|
+
const allSteps = turns.flatMap((turn, turnIndex) => {
|
|
2442
|
+
const sf = turn.stepFinish;
|
|
2443
|
+
const stepInputTokens = sf?.tokens.input ?? 0;
|
|
2444
|
+
const stepOutputTokens = sf?.tokens.output ?? 0;
|
|
2445
|
+
const stepCost = sf?.cost ?? 0;
|
|
2446
|
+
const finishReason = sf?.reason ?? "unknown";
|
|
2447
|
+
const stepModel = sf?.modelID || model;
|
|
2448
|
+
const stepProvider = sf?.providerID || provider;
|
|
2449
|
+
const turnEndMs = turn.receivedAt ?? executionStartMs + totalDurationMs;
|
|
2450
|
+
const prevEndMs = turnIndex > 0 ? turns[turnIndex - 1].receivedAt ?? executionStartMs : executionStartMs;
|
|
2451
|
+
const durationMs = Math.max(0, turnEndMs - prevEndMs);
|
|
2452
|
+
const startedAt = new Date(prevEndMs).toISOString();
|
|
2453
|
+
const text = turn.textParts.join("");
|
|
2454
|
+
const thinking = turn.reasoningParts.join("");
|
|
2455
|
+
const toolCallCount = turn.toolCalls.length;
|
|
2432
2456
|
const hasThinking = !!thinking;
|
|
2433
2457
|
const hasText = !!text;
|
|
2458
|
+
const isSuccess = finishReason !== "error";
|
|
2459
|
+
const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
|
|
2434
2460
|
const subSteps = [];
|
|
2435
2461
|
const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
|
|
2436
2462
|
const toolSubSteps = toolCallCount;
|
|
@@ -2440,7 +2466,6 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
|
|
|
2440
2466
|
subSteps.push({
|
|
2441
2467
|
id: (0, import_crypto2.randomUUID)(),
|
|
2442
2468
|
stepNumber: 0,
|
|
2443
|
-
// renumbered below
|
|
2444
2469
|
turnIndex,
|
|
2445
2470
|
type: import_evalforge_types7.LLMStepType.THINKING,
|
|
2446
2471
|
model: stepModel,
|
|
@@ -2462,7 +2487,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
|
|
|
2462
2487
|
}
|
|
2463
2488
|
if (toolCallCount > 0) {
|
|
2464
2489
|
for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
|
|
2465
|
-
const tc = toolCalls[tcIdx];
|
|
2490
|
+
const tc = turn.toolCalls[tcIdx];
|
|
2466
2491
|
const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
|
|
2467
2492
|
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
2468
2493
|
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
@@ -2541,11 +2566,21 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
|
|
|
2541
2566
|
}
|
|
2542
2567
|
return subSteps;
|
|
2543
2568
|
}).map((s, i) => ({ ...s, stepNumber: i + 1 }));
|
|
2544
|
-
|
|
2545
|
-
|
|
2546
|
-
|
|
2547
|
-
|
|
2548
|
-
|
|
2569
|
+
let totalPrompt = 0;
|
|
2570
|
+
let totalCompletion = 0;
|
|
2571
|
+
let totalCost = 0;
|
|
2572
|
+
for (const turn of turns) {
|
|
2573
|
+
if (turn.stepFinish) {
|
|
2574
|
+
totalPrompt += turn.stepFinish.tokens.input;
|
|
2575
|
+
totalCompletion += turn.stepFinish.tokens.output;
|
|
2576
|
+
totalCost += turn.stepFinish.cost;
|
|
2577
|
+
}
|
|
2578
|
+
}
|
|
2579
|
+
const totalTokens = {
|
|
2580
|
+
prompt: totalPrompt,
|
|
2581
|
+
completion: totalCompletion,
|
|
2582
|
+
total: totalPrompt + totalCompletion
|
|
2583
|
+
};
|
|
2549
2584
|
const stepTypeBreakdown = {};
|
|
2550
2585
|
for (const step of allSteps) {
|
|
2551
2586
|
const entry = stepTypeBreakdown[step.type] ?? {
|
|
@@ -2563,7 +2598,7 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
|
|
|
2563
2598
|
const modelUsed = allSteps[0]?.model || model;
|
|
2564
2599
|
const summary = {
|
|
2565
2600
|
totalSteps: allSteps.length,
|
|
2566
|
-
totalTurns:
|
|
2601
|
+
totalTurns: turns.length,
|
|
2567
2602
|
totalDurationMs,
|
|
2568
2603
|
totalTokens,
|
|
2569
2604
|
totalCostUsd: totalCost,
|
|
@@ -2584,116 +2619,100 @@ function buildLLMTrace(messages, totalDurationMs, model, provider) {
|
|
|
2584
2619
|
summary
|
|
2585
2620
|
};
|
|
2586
2621
|
}
|
|
2587
|
-
function buildTotalTokens(assistantMessages) {
|
|
2588
|
-
let prompt = 0;
|
|
2589
|
-
let completion = 0;
|
|
2590
|
-
for (const { info } of assistantMessages) {
|
|
2591
|
-
prompt += info.tokens.input;
|
|
2592
|
-
completion += info.tokens.output;
|
|
2593
|
-
}
|
|
2594
|
-
return { prompt, completion, total: prompt + completion };
|
|
2595
|
-
}
|
|
2596
2622
|
|
|
2597
2623
|
// src/run-scenario/agents/opencode/build-conversation.ts
|
|
2598
|
-
function buildConversation2(
|
|
2624
|
+
function buildConversation2(timestampedEvents) {
|
|
2599
2625
|
const result = [];
|
|
2600
|
-
|
|
2601
|
-
|
|
2602
|
-
|
|
2603
|
-
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
|
|
2607
|
-
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
|
|
2611
|
-
|
|
2612
|
-
|
|
2613
|
-
|
|
2614
|
-
|
|
2615
|
-
|
|
2616
|
-
|
|
2617
|
-
|
|
2618
|
-
|
|
2619
|
-
|
|
2620
|
-
|
|
2621
|
-
|
|
2622
|
-
|
|
2623
|
-
|
|
2624
|
-
break;
|
|
2625
|
-
}
|
|
2626
|
-
}
|
|
2626
|
+
let assistantContent = [];
|
|
2627
|
+
let userContent = [];
|
|
2628
|
+
let latestReceivedAt = 0;
|
|
2629
|
+
const flushAssistant = () => {
|
|
2630
|
+
if (assistantContent.length > 0) {
|
|
2631
|
+
const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
|
|
2632
|
+
result.push({ role: "assistant", content: assistantContent, timestamp });
|
|
2633
|
+
assistantContent = [];
|
|
2634
|
+
}
|
|
2635
|
+
};
|
|
2636
|
+
const flushUser = () => {
|
|
2637
|
+
if (userContent.length > 0) {
|
|
2638
|
+
const timestamp = latestReceivedAt > 0 ? new Date(latestReceivedAt).toISOString() : (/* @__PURE__ */ new Date()).toISOString();
|
|
2639
|
+
result.push({ role: "user", content: userContent, timestamp });
|
|
2640
|
+
userContent = [];
|
|
2641
|
+
}
|
|
2642
|
+
};
|
|
2643
|
+
for (const { event: evt, receivedAt } of timestampedEvents) {
|
|
2644
|
+
latestReceivedAt = receivedAt;
|
|
2645
|
+
switch (evt.type) {
|
|
2646
|
+
case "text": {
|
|
2647
|
+
const te = evt;
|
|
2648
|
+
assistantContent.push({ type: "text", text: te.part.text });
|
|
2649
|
+
break;
|
|
2627
2650
|
}
|
|
2628
|
-
|
|
2629
|
-
|
|
2651
|
+
case "reasoning": {
|
|
2652
|
+
const re = evt;
|
|
2653
|
+
assistantContent.push({ type: "thinking", thinking: re.part.text });
|
|
2654
|
+
break;
|
|
2630
2655
|
}
|
|
2631
|
-
|
|
2632
|
-
|
|
2633
|
-
|
|
2634
|
-
|
|
2635
|
-
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
2643
|
-
|
|
2644
|
-
|
|
2645
|
-
|
|
2646
|
-
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
|
|
2650
|
-
type: "tool_result",
|
|
2651
|
-
toolUseId: toolPart.callID,
|
|
2652
|
-
content: errState.error,
|
|
2653
|
-
isError: true
|
|
2654
|
-
});
|
|
2655
|
-
}
|
|
2656
|
+
case "tool_use": {
|
|
2657
|
+
const tu = evt;
|
|
2658
|
+
assistantContent.push({
|
|
2659
|
+
type: "tool_use",
|
|
2660
|
+
toolName: tu.part.tool,
|
|
2661
|
+
toolId: tu.part.callID,
|
|
2662
|
+
input: tu.part.state.input
|
|
2663
|
+
});
|
|
2664
|
+
if (tu.part.state.status === "completed" || tu.part.state.status === "error") {
|
|
2665
|
+
flushAssistant();
|
|
2666
|
+
const isError = tu.part.state.status === "error";
|
|
2667
|
+
const content = isError ? tu.part.state.error || "Tool execution failed" : tu.part.state.output || "";
|
|
2668
|
+
userContent.push({
|
|
2669
|
+
type: "tool_result",
|
|
2670
|
+
toolUseId: tu.part.callID,
|
|
2671
|
+
content,
|
|
2672
|
+
...isError ? { isError: true } : {}
|
|
2673
|
+
});
|
|
2674
|
+
flushUser();
|
|
2656
2675
|
}
|
|
2676
|
+
break;
|
|
2657
2677
|
}
|
|
2658
|
-
|
|
2659
|
-
|
|
2678
|
+
case "step_finish": {
|
|
2679
|
+
flushAssistant();
|
|
2680
|
+
flushUser();
|
|
2681
|
+
break;
|
|
2660
2682
|
}
|
|
2661
2683
|
}
|
|
2662
2684
|
}
|
|
2685
|
+
flushAssistant();
|
|
2686
|
+
flushUser();
|
|
2663
2687
|
return result;
|
|
2664
2688
|
}
|
|
2665
2689
|
|
|
2666
2690
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2667
|
-
var
|
|
2668
|
-
|
|
2669
|
-
|
|
2670
|
-
|
|
2671
|
-
|
|
2672
|
-
process.env.PATH = `${opencodeBin}:${currentPath}`;
|
|
2673
|
-
}
|
|
2674
|
-
}
|
|
2691
|
+
var import_promises9 = require("fs/promises");
|
|
2692
|
+
var import_path10 = require("path");
|
|
2693
|
+
var KILL_GRACE_PERIOD_MS = 5e3;
|
|
2694
|
+
var IDLE_TIMEOUT_MS = 12e4;
|
|
2695
|
+
var IDLE_CHECK_INTERVAL_MS = 15e3;
|
|
2675
2696
|
function extractToolAction(toolName, args) {
|
|
2676
2697
|
if (!toolName) return "Using tool...";
|
|
2677
|
-
|
|
2678
|
-
|
|
2679
|
-
|
|
2680
|
-
|
|
2681
|
-
|
|
2682
|
-
|
|
2683
|
-
|
|
2684
|
-
|
|
2685
|
-
|
|
2686
|
-
|
|
2687
|
-
|
|
2688
|
-
|
|
2689
|
-
50
|
|
2690
|
-
);
|
|
2698
|
+
if ((toolName === "Task" || toolName === "dispatch_agent") && args?.description) {
|
|
2699
|
+
const desc = String(args.description).slice(0, 55);
|
|
2700
|
+
return `Task: ${desc}${String(args.description).length > 55 ? "..." : ""}`;
|
|
2701
|
+
}
|
|
2702
|
+
if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && args?.command) {
|
|
2703
|
+
const cmd = String(args.command).slice(0, 50);
|
|
2704
|
+
return `Running: ${cmd}${String(args.command).length > 50 ? "..." : ""}`;
|
|
2705
|
+
}
|
|
2706
|
+
if (args?.file_path || args?.path || args?.target_file) {
|
|
2707
|
+
const filePath = String(
|
|
2708
|
+
args.file_path || args.path || args.target_file
|
|
2709
|
+
).slice(0, 50);
|
|
2691
2710
|
if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
|
|
2692
2711
|
if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
|
|
2693
2712
|
}
|
|
2694
2713
|
return `Using ${toolName}...`;
|
|
2695
2714
|
}
|
|
2696
|
-
function
|
|
2715
|
+
function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
2697
2716
|
const base = {
|
|
2698
2717
|
evalRunId: context.evalRunId,
|
|
2699
2718
|
scenarioId: context.scenarioId,
|
|
@@ -2704,42 +2723,41 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
|
|
|
2704
2723
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2705
2724
|
isComplete
|
|
2706
2725
|
};
|
|
2707
|
-
switch (
|
|
2726
|
+
switch (evt.type) {
|
|
2708
2727
|
case "text": {
|
|
2709
|
-
const
|
|
2728
|
+
const te = evt;
|
|
2710
2729
|
return {
|
|
2711
2730
|
...base,
|
|
2712
2731
|
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
2713
|
-
outputPreview:
|
|
2732
|
+
outputPreview: te.part.text.slice(0, 500)
|
|
2714
2733
|
};
|
|
2715
2734
|
}
|
|
2716
|
-
case "reasoning":
|
|
2717
|
-
const reasoningPart = part;
|
|
2735
|
+
case "reasoning":
|
|
2718
2736
|
return {
|
|
2719
2737
|
...base,
|
|
2720
2738
|
type: import_evalforge_types8.LiveTraceEventType.THINKING,
|
|
2721
|
-
thinking:
|
|
2739
|
+
thinking: evt.part.text.slice(0, 500)
|
|
2722
2740
|
};
|
|
2723
|
-
|
|
2724
|
-
|
|
2725
|
-
const
|
|
2726
|
-
const
|
|
2727
|
-
const args = toolPart.state.input;
|
|
2741
|
+
case "tool_use": {
|
|
2742
|
+
const tu = evt;
|
|
2743
|
+
const toolName = tu.part.tool;
|
|
2744
|
+
const args = tu.part.state.input;
|
|
2728
2745
|
const toolArgs = JSON.stringify(args).slice(0, 500);
|
|
2729
2746
|
let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
|
|
2730
2747
|
let filePath;
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
|
|
2734
|
-
|
|
2735
|
-
|
|
2736
|
-
|
|
2737
|
-
|
|
2748
|
+
if (args) {
|
|
2749
|
+
if (args.file_path || args.path || args.target_file) {
|
|
2750
|
+
filePath = String(args.file_path || args.path || args.target_file);
|
|
2751
|
+
if (/write|edit/i.test(toolName)) {
|
|
2752
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
|
|
2753
|
+
} else if (/read|view/i.test(toolName)) {
|
|
2754
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
|
|
2755
|
+
}
|
|
2738
2756
|
}
|
|
2739
2757
|
}
|
|
2740
2758
|
return { ...base, type, toolName, toolArgs, filePath };
|
|
2741
2759
|
}
|
|
2742
|
-
case "
|
|
2760
|
+
case "step_finish":
|
|
2743
2761
|
return {
|
|
2744
2762
|
...base,
|
|
2745
2763
|
type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
|
|
@@ -2769,6 +2787,37 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
2769
2787
|
);
|
|
2770
2788
|
}
|
|
2771
2789
|
}
|
|
2790
|
+
async function writeSystemPromptRule(cwd, systemPrompt) {
|
|
2791
|
+
const rulesDir = (0, import_path10.join)(cwd, ".opencode", "rules");
|
|
2792
|
+
await (0, import_promises9.mkdir)(rulesDir, { recursive: true });
|
|
2793
|
+
await (0, import_promises9.writeFile)(
|
|
2794
|
+
(0, import_path10.join)(rulesDir, "evalforge-system-prompt.md"),
|
|
2795
|
+
systemPrompt,
|
|
2796
|
+
"utf-8"
|
|
2797
|
+
);
|
|
2798
|
+
}
|
|
2799
|
+
function killProcess(child, resolved) {
|
|
2800
|
+
if (!child) return;
|
|
2801
|
+
const killSignal = (signal) => {
|
|
2802
|
+
if (child.pid) {
|
|
2803
|
+
try {
|
|
2804
|
+
process.kill(-child.pid, signal);
|
|
2805
|
+
console.log(
|
|
2806
|
+
`[OpenCode] Sent ${signal} to process group (pid: -${child.pid})`
|
|
2807
|
+
);
|
|
2808
|
+
return;
|
|
2809
|
+
} catch {
|
|
2810
|
+
}
|
|
2811
|
+
}
|
|
2812
|
+
child.kill(signal);
|
|
2813
|
+
};
|
|
2814
|
+
killSignal("SIGTERM");
|
|
2815
|
+
setTimeout(() => {
|
|
2816
|
+
if (child && !resolved) {
|
|
2817
|
+
killSignal("SIGKILL");
|
|
2818
|
+
}
|
|
2819
|
+
}, KILL_GRACE_PERIOD_MS);
|
|
2820
|
+
}
|
|
2772
2821
|
async function executeWithOpenCode(skills, scenario, options) {
|
|
2773
2822
|
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2774
2823
|
console.log("[executeWithOpenCode] Starting execution", {
|
|
@@ -2783,7 +2832,8 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2783
2832
|
});
|
|
2784
2833
|
const startTime = /* @__PURE__ */ new Date();
|
|
2785
2834
|
const maxTurns = options.maxTurns ?? 10;
|
|
2786
|
-
const
|
|
2835
|
+
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
2836
|
+
const { env, providerID, modelID } = await buildOpenCodeEnv({
|
|
2787
2837
|
model: options.model,
|
|
2788
2838
|
temperature: options.temperature,
|
|
2789
2839
|
maxTurns,
|
|
@@ -2792,12 +2842,6 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2792
2842
|
mcps: options.mcps,
|
|
2793
2843
|
cwd: options.cwd
|
|
2794
2844
|
});
|
|
2795
|
-
const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
|
|
2796
|
-
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
2797
|
-
const abortController = new AbortController();
|
|
2798
|
-
let timeoutHandle;
|
|
2799
|
-
let heartbeatHandle;
|
|
2800
|
-
let timedOut = false;
|
|
2801
2845
|
const traceContext = options.traceContext;
|
|
2802
2846
|
let traceStepNumber = 0;
|
|
2803
2847
|
let lastAction = "Starting...";
|
|
@@ -2814,7 +2858,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2814
2858
|
stepNumber: 0,
|
|
2815
2859
|
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
2816
2860
|
outputPreview: JSON.stringify({
|
|
2817
|
-
event: "pre-
|
|
2861
|
+
event: "pre-cli-execution",
|
|
2818
2862
|
model: `${providerID}/${modelID}`,
|
|
2819
2863
|
maxTurns,
|
|
2820
2864
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
@@ -2827,105 +2871,200 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2827
2871
|
traceContext.authToken
|
|
2828
2872
|
);
|
|
2829
2873
|
}
|
|
2830
|
-
let
|
|
2831
|
-
|
|
2832
|
-
|
|
2833
|
-
|
|
2834
|
-
|
|
2835
|
-
|
|
2836
|
-
|
|
2837
|
-
|
|
2838
|
-
|
|
2839
|
-
|
|
2840
|
-
|
|
2841
|
-
|
|
2842
|
-
|
|
2843
|
-
|
|
2844
|
-
|
|
2845
|
-
|
|
2846
|
-
|
|
2847
|
-
|
|
2848
|
-
|
|
2849
|
-
|
|
2850
|
-
|
|
2874
|
+
let systemPrompt;
|
|
2875
|
+
if (options.systemPrompt === null || options.systemPrompt === "") {
|
|
2876
|
+
} else if (options.systemPrompt != null) {
|
|
2877
|
+
systemPrompt = options.systemPrompt;
|
|
2878
|
+
} else {
|
|
2879
|
+
systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
2880
|
+
}
|
|
2881
|
+
if (systemPrompt) {
|
|
2882
|
+
await writeSystemPromptRule(options.cwd, systemPrompt);
|
|
2883
|
+
}
|
|
2884
|
+
const args = [
|
|
2885
|
+
"run",
|
|
2886
|
+
"--format",
|
|
2887
|
+
"json",
|
|
2888
|
+
"--thinking",
|
|
2889
|
+
"--variant",
|
|
2890
|
+
"high",
|
|
2891
|
+
"--model",
|
|
2892
|
+
`${providerID}/${modelID}`,
|
|
2893
|
+
"--dir",
|
|
2894
|
+
options.cwd,
|
|
2895
|
+
// NOTE: Trigger prompt is passed as a positional CLI arg. On Linux a single
|
|
2896
|
+
// arg is capped at 128 KB (MAX_ARG_STRLEN); on macOS the combined args+env
|
|
2897
|
+
// share a ~1 MB limit. Prompts exceeding this would fail with E2BIG.
|
|
2898
|
+
// In practice eval prompts are well under this limit.
|
|
2899
|
+
scenario.triggerPrompt
|
|
2900
|
+
];
|
|
2901
|
+
console.log("[executeWithOpenCode] Spawning: opencode", args.slice(0, 5));
|
|
2902
|
+
return new Promise((resolve2, reject) => {
|
|
2903
|
+
let resolved = false;
|
|
2904
|
+
let stderr = "";
|
|
2905
|
+
let lineBuffer = "";
|
|
2906
|
+
let lastOutputTime = Date.now();
|
|
2907
|
+
const allEvents = [];
|
|
2908
|
+
const timers = {};
|
|
2909
|
+
const cleanup = () => {
|
|
2910
|
+
if (timers.timeout) clearTimeout(timers.timeout);
|
|
2911
|
+
if (timers.idleCheck) clearInterval(timers.idleCheck);
|
|
2912
|
+
if (timers.heartbeat) clearInterval(timers.heartbeat);
|
|
2913
|
+
};
|
|
2914
|
+
const finalize = (success, error) => {
|
|
2915
|
+
if (resolved) return;
|
|
2916
|
+
resolved = true;
|
|
2917
|
+
cleanup();
|
|
2918
|
+
if (!success) {
|
|
2919
|
+
if (traceContext) {
|
|
2920
|
+
emitTraceEvent(
|
|
2921
|
+
{
|
|
2922
|
+
evalRunId: traceContext.evalRunId,
|
|
2923
|
+
scenarioId: traceContext.scenarioId,
|
|
2924
|
+
scenarioName: traceContext.scenarioName,
|
|
2925
|
+
targetId: traceContext.targetId,
|
|
2926
|
+
targetName: traceContext.targetName,
|
|
2927
|
+
stepNumber: traceStepNumber + 1,
|
|
2928
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
2929
|
+
outputPreview: JSON.stringify({
|
|
2930
|
+
event: "cli-execution-failed",
|
|
2931
|
+
error: error?.message ?? "Unknown error"
|
|
2932
|
+
}).slice(0, 2e3),
|
|
2933
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2934
|
+
isComplete: true
|
|
2935
|
+
},
|
|
2936
|
+
traceContext.tracePushUrl,
|
|
2937
|
+
traceContext.routeHeader,
|
|
2938
|
+
traceContext.authToken
|
|
2939
|
+
);
|
|
2940
|
+
}
|
|
2941
|
+
reject(
|
|
2942
|
+
error ?? new Error(
|
|
2943
|
+
`OpenCode CLI execution failed (exit code unknown).
|
|
2944
|
+
Stderr: ${stderr.slice(0, 1e3)}`
|
|
2945
|
+
)
|
|
2946
|
+
);
|
|
2947
|
+
return;
|
|
2948
|
+
}
|
|
2949
|
+
const endTime = /* @__PURE__ */ new Date();
|
|
2950
|
+
const totalDurationMs = endTime.getTime() - startTime.getTime();
|
|
2951
|
+
let outputText = "";
|
|
2952
|
+
for (const { event: evt } of allEvents) {
|
|
2953
|
+
if (evt.type === "text") {
|
|
2954
|
+
outputText += evt.part.text;
|
|
2955
|
+
}
|
|
2956
|
+
}
|
|
2957
|
+
if (!outputText) {
|
|
2958
|
+
reject(
|
|
2959
|
+
new Error(
|
|
2960
|
+
`Agent produced no text output. Model: ${providerID}/${modelID}, Events: ${allEvents.length}`
|
|
2961
|
+
)
|
|
2962
|
+
);
|
|
2963
|
+
return;
|
|
2964
|
+
}
|
|
2965
|
+
let inputTokens = 0;
|
|
2966
|
+
let outputTokens = 0;
|
|
2967
|
+
let costUsd = 0;
|
|
2968
|
+
for (const { event: evt } of allEvents) {
|
|
2969
|
+
if (evt.type === "step_finish") {
|
|
2970
|
+
const sf = evt;
|
|
2971
|
+
inputTokens += sf.part.tokens.input;
|
|
2972
|
+
outputTokens += sf.part.tokens.output;
|
|
2973
|
+
costUsd += sf.part.cost;
|
|
2974
|
+
}
|
|
2975
|
+
}
|
|
2976
|
+
if (traceContext) {
|
|
2977
|
+
emitTraceEvent(
|
|
2978
|
+
{
|
|
2979
|
+
evalRunId: traceContext.evalRunId,
|
|
2980
|
+
scenarioId: traceContext.scenarioId,
|
|
2981
|
+
scenarioName: traceContext.scenarioName,
|
|
2982
|
+
targetId: traceContext.targetId,
|
|
2983
|
+
targetName: traceContext.targetName,
|
|
2984
|
+
stepNumber: traceStepNumber + 1,
|
|
2985
|
+
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
2986
|
+
outputPreview: "Scenario execution completed",
|
|
2987
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2988
|
+
isComplete: true
|
|
2989
|
+
},
|
|
2990
|
+
traceContext.tracePushUrl,
|
|
2991
|
+
traceContext.routeHeader,
|
|
2992
|
+
traceContext.authToken
|
|
2993
|
+
);
|
|
2994
|
+
}
|
|
2995
|
+
const modelStr = options.model || `${providerID}/${modelID}`;
|
|
2996
|
+
const llmTrace = buildLLMTrace(
|
|
2997
|
+
allEvents,
|
|
2998
|
+
totalDurationMs,
|
|
2999
|
+
modelStr,
|
|
3000
|
+
providerID,
|
|
3001
|
+
startTime
|
|
2851
3002
|
);
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
|
|
2855
|
-
|
|
3003
|
+
const conversation = buildConversation2(allEvents);
|
|
3004
|
+
resolve2({
|
|
3005
|
+
result: {
|
|
3006
|
+
outputText,
|
|
3007
|
+
durationMs: totalDurationMs,
|
|
3008
|
+
usage: {
|
|
3009
|
+
inputTokens,
|
|
3010
|
+
outputTokens,
|
|
3011
|
+
totalTokens: inputTokens + outputTokens
|
|
3012
|
+
},
|
|
3013
|
+
costUsd
|
|
3014
|
+
},
|
|
3015
|
+
llmTrace,
|
|
3016
|
+
conversation
|
|
3017
|
+
});
|
|
3018
|
+
};
|
|
3019
|
+
let child;
|
|
3020
|
+
try {
|
|
3021
|
+
child = (0, import_child_process.spawn)("opencode", args, {
|
|
3022
|
+
cwd: options.cwd,
|
|
3023
|
+
env,
|
|
3024
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
3025
|
+
detached: true
|
|
3026
|
+
});
|
|
3027
|
+
} catch (spawnError) {
|
|
3028
|
+
reject(
|
|
3029
|
+
new Error(
|
|
3030
|
+
`Failed to spawn opencode: ${spawnError instanceof Error ? spawnError.message : String(spawnError)}`
|
|
3031
|
+
)
|
|
3032
|
+
);
|
|
3033
|
+
return;
|
|
3034
|
+
}
|
|
3035
|
+
timers.timeout = setTimeout(() => {
|
|
3036
|
+
if (!resolved) {
|
|
3037
|
+
console.error(`[OpenCode] Process timed out after ${SDK_TIMEOUT_MS}ms`);
|
|
3038
|
+
killProcess(child, resolved);
|
|
3039
|
+
finalize(
|
|
3040
|
+
false,
|
|
3041
|
+
new Error(
|
|
3042
|
+
`OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
|
|
3043
|
+
)
|
|
3044
|
+
);
|
|
3045
|
+
}
|
|
3046
|
+
}, SDK_TIMEOUT_MS);
|
|
3047
|
+
timers.idleCheck = setInterval(() => {
|
|
3048
|
+
if (resolved) return;
|
|
3049
|
+
const idleTime = Date.now() - lastOutputTime;
|
|
3050
|
+
if (idleTime >= IDLE_TIMEOUT_MS) {
|
|
3051
|
+
console.warn(
|
|
3052
|
+
`[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
|
|
3053
|
+
);
|
|
3054
|
+
killProcess(child, resolved);
|
|
3055
|
+
finalize(
|
|
3056
|
+
false,
|
|
3057
|
+
new Error(
|
|
3058
|
+
`OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenario.name}`
|
|
3059
|
+
)
|
|
3060
|
+
);
|
|
3061
|
+
}
|
|
3062
|
+
}, IDLE_CHECK_INTERVAL_MS);
|
|
2856
3063
|
if (traceContext) {
|
|
2857
|
-
eventStreamAbort = new AbortController();
|
|
2858
3064
|
const executionStartTime = Date.now();
|
|
2859
|
-
(async () => {
|
|
2860
|
-
try {
|
|
2861
|
-
const events = await client.event.subscribe();
|
|
2862
|
-
for await (const event of events.stream) {
|
|
2863
|
-
if (eventStreamAbort.signal.aborted) break;
|
|
2864
|
-
const evt = event;
|
|
2865
|
-
if (evt.type === "message.part.updated") {
|
|
2866
|
-
const { part } = evt.properties;
|
|
2867
|
-
traceStepNumber++;
|
|
2868
|
-
const traceEvent = createTraceEventFromPart(
|
|
2869
|
-
part,
|
|
2870
|
-
traceContext,
|
|
2871
|
-
traceStepNumber,
|
|
2872
|
-
false
|
|
2873
|
-
);
|
|
2874
|
-
if (traceEvent) {
|
|
2875
|
-
lastToolName = traceEvent.toolName;
|
|
2876
|
-
lastFilePath = traceEvent.filePath;
|
|
2877
|
-
if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
|
|
2878
|
-
lastAction = "Thinking...";
|
|
2879
|
-
} else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
|
|
2880
|
-
lastAction = extractToolAction(
|
|
2881
|
-
traceEvent.toolName ?? "",
|
|
2882
|
-
void 0
|
|
2883
|
-
);
|
|
2884
|
-
} else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
|
|
2885
|
-
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
2886
|
-
} else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
|
|
2887
|
-
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
2888
|
-
} else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
|
|
2889
|
-
lastAction = "Processing response...";
|
|
2890
|
-
}
|
|
2891
|
-
emitTraceEvent(
|
|
2892
|
-
traceEvent,
|
|
2893
|
-
traceContext.tracePushUrl,
|
|
2894
|
-
traceContext.routeHeader,
|
|
2895
|
-
traceContext.authToken
|
|
2896
|
-
);
|
|
2897
|
-
}
|
|
2898
|
-
} else if (evt.type === "session.error") {
|
|
2899
|
-
const props = evt.properties;
|
|
2900
|
-
traceStepNumber++;
|
|
2901
|
-
emitTraceEvent(
|
|
2902
|
-
{
|
|
2903
|
-
evalRunId: traceContext.evalRunId,
|
|
2904
|
-
scenarioId: traceContext.scenarioId,
|
|
2905
|
-
scenarioName: traceContext.scenarioName,
|
|
2906
|
-
targetId: traceContext.targetId,
|
|
2907
|
-
targetName: traceContext.targetName,
|
|
2908
|
-
stepNumber: traceStepNumber,
|
|
2909
|
-
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
2910
|
-
outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
|
|
2911
|
-
0,
|
|
2912
|
-
500
|
|
2913
|
-
),
|
|
2914
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2915
|
-
isComplete: false
|
|
2916
|
-
},
|
|
2917
|
-
traceContext.tracePushUrl,
|
|
2918
|
-
traceContext.routeHeader,
|
|
2919
|
-
traceContext.authToken
|
|
2920
|
-
);
|
|
2921
|
-
}
|
|
2922
|
-
}
|
|
2923
|
-
} catch {
|
|
2924
|
-
}
|
|
2925
|
-
})();
|
|
2926
3065
|
let lastReportedAction = "";
|
|
2927
3066
|
let sameActionCount = 0;
|
|
2928
|
-
|
|
3067
|
+
timers.heartbeat = setInterval(() => {
|
|
2929
3068
|
const elapsedMs = Date.now() - executionStartTime;
|
|
2930
3069
|
let progressMessage = lastAction;
|
|
2931
3070
|
if (lastAction === lastReportedAction) {
|
|
@@ -2966,212 +3105,83 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2966
3105
|
);
|
|
2967
3106
|
}, 1e4);
|
|
2968
3107
|
}
|
|
2969
|
-
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
|
|
2980
|
-
|
|
2981
|
-
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3004
|
-
if ("error" in promptResult && promptResult.error) {
|
|
3005
|
-
const errPayload = promptResult.error;
|
|
3006
|
-
throw new Error(
|
|
3007
|
-
`Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
|
|
3008
|
-
);
|
|
3009
|
-
}
|
|
3010
|
-
console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
|
|
3011
|
-
const messagesResponse = await client.session.messages({
|
|
3012
|
-
path: { id: sessionId }
|
|
3013
|
-
});
|
|
3014
|
-
const allMessages = messagesResponse.data ?? [];
|
|
3015
|
-
console.log(
|
|
3016
|
-
`[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
|
|
3017
|
-
);
|
|
3018
|
-
if (traceContext) {
|
|
3019
|
-
emitTraceEvent(
|
|
3020
|
-
{
|
|
3021
|
-
evalRunId: traceContext.evalRunId,
|
|
3022
|
-
scenarioId: traceContext.scenarioId,
|
|
3023
|
-
scenarioName: traceContext.scenarioName,
|
|
3024
|
-
targetId: traceContext.targetId,
|
|
3025
|
-
targetName: traceContext.targetName,
|
|
3026
|
-
stepNumber: traceStepNumber + 1,
|
|
3027
|
-
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
3028
|
-
outputPreview: "Scenario execution completed",
|
|
3029
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3030
|
-
isComplete: true
|
|
3031
|
-
},
|
|
3032
|
-
traceContext.tracePushUrl,
|
|
3033
|
-
traceContext.routeHeader,
|
|
3034
|
-
traceContext.authToken
|
|
3035
|
-
);
|
|
3036
|
-
}
|
|
3037
|
-
const endTime = /* @__PURE__ */ new Date();
|
|
3038
|
-
const totalDurationMs = endTime.getTime() - startTime.getTime();
|
|
3039
|
-
const resultData = promptResult.data;
|
|
3040
|
-
const lastAssistantInfo = resultData?.info;
|
|
3041
|
-
if (lastAssistantInfo?.error) {
|
|
3042
|
-
const err = lastAssistantInfo.error;
|
|
3043
|
-
throw new Error(
|
|
3044
|
-
`Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
|
|
3045
|
-
);
|
|
3046
|
-
}
|
|
3047
|
-
let outputText = "";
|
|
3048
|
-
if (resultData?.parts) {
|
|
3049
|
-
for (const part of resultData.parts) {
|
|
3050
|
-
if (part.type === "text") {
|
|
3051
|
-
outputText += part.text;
|
|
3052
|
-
}
|
|
3053
|
-
}
|
|
3054
|
-
}
|
|
3055
|
-
if (!outputText && allMessages.length > 0) {
|
|
3056
|
-
for (let i = allMessages.length - 1; i >= 0; i--) {
|
|
3057
|
-
const msg = allMessages[i];
|
|
3058
|
-
if (msg.info.role === "assistant") {
|
|
3059
|
-
const assistantInfo = msg.info;
|
|
3060
|
-
if (assistantInfo.error) {
|
|
3061
|
-
throw new Error(
|
|
3062
|
-
`Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
|
|
3063
|
-
);
|
|
3064
|
-
}
|
|
3065
|
-
for (const part of msg.parts) {
|
|
3066
|
-
if (part.type === "text") {
|
|
3067
|
-
outputText += part.text;
|
|
3108
|
+
child.stdout?.on("data", (data) => {
|
|
3109
|
+
const text = data.toString();
|
|
3110
|
+
lastOutputTime = Date.now();
|
|
3111
|
+
lineBuffer += text;
|
|
3112
|
+
const lines = lineBuffer.split("\n");
|
|
3113
|
+
lineBuffer = lines.pop() || "";
|
|
3114
|
+
for (const line of lines) {
|
|
3115
|
+
if (!line.trim()) continue;
|
|
3116
|
+
const evt = tryParseJson(line);
|
|
3117
|
+
if (!evt || !evt.type) continue;
|
|
3118
|
+
allEvents.push({ event: evt, receivedAt: Date.now() });
|
|
3119
|
+
if (traceContext) {
|
|
3120
|
+
traceStepNumber++;
|
|
3121
|
+
const traceEvt = createTraceEventFromNdjson(
|
|
3122
|
+
evt,
|
|
3123
|
+
traceContext,
|
|
3124
|
+
traceStepNumber,
|
|
3125
|
+
false
|
|
3126
|
+
);
|
|
3127
|
+
if (traceEvt) {
|
|
3128
|
+
lastToolName = traceEvt.toolName;
|
|
3129
|
+
lastFilePath = traceEvt.filePath;
|
|
3130
|
+
if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
|
|
3131
|
+
lastAction = "Thinking...";
|
|
3132
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
|
|
3133
|
+
lastAction = extractToolAction(
|
|
3134
|
+
traceEvt.toolName ?? "",
|
|
3135
|
+
void 0
|
|
3136
|
+
);
|
|
3137
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
|
|
3138
|
+
lastAction = `Writing: ${traceEvt.filePath || "file"}`;
|
|
3139
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
|
|
3140
|
+
lastAction = `Reading: ${traceEvt.filePath || "file"}`;
|
|
3141
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
|
|
3142
|
+
lastAction = "Processing response...";
|
|
3068
3143
|
}
|
|
3144
|
+
emitTraceEvent(
|
|
3145
|
+
traceEvt,
|
|
3146
|
+
traceContext.tracePushUrl,
|
|
3147
|
+
traceContext.routeHeader,
|
|
3148
|
+
traceContext.authToken
|
|
3149
|
+
);
|
|
3069
3150
|
}
|
|
3070
|
-
if (outputText) break;
|
|
3071
3151
|
}
|
|
3072
3152
|
}
|
|
3073
|
-
}
|
|
3074
|
-
|
|
3075
|
-
const
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
|
|
3086
|
-
} : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
|
|
3087
|
-
const costUsd = lastAssistantInfo?.cost;
|
|
3088
|
-
const modelStr = options.model || DEFAULT_MODEL3;
|
|
3089
|
-
const llmTrace = buildLLMTrace(
|
|
3090
|
-
allMessages,
|
|
3091
|
-
totalDurationMs,
|
|
3092
|
-
modelStr,
|
|
3093
|
-
providerID
|
|
3094
|
-
);
|
|
3095
|
-
const conversation = buildConversation2(allMessages);
|
|
3096
|
-
return {
|
|
3097
|
-
result: {
|
|
3098
|
-
outputText,
|
|
3099
|
-
durationMs: totalDurationMs,
|
|
3100
|
-
usage,
|
|
3101
|
-
costUsd
|
|
3102
|
-
},
|
|
3103
|
-
llmTrace,
|
|
3104
|
-
conversation
|
|
3105
|
-
};
|
|
3106
|
-
} catch (sdkError) {
|
|
3107
|
-
if (timeoutHandle) clearTimeout(timeoutHandle);
|
|
3108
|
-
if (heartbeatHandle) clearInterval(heartbeatHandle);
|
|
3109
|
-
if (timedOut) {
|
|
3110
|
-
console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
|
|
3111
|
-
}
|
|
3112
|
-
const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
|
|
3113
|
-
const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
|
|
3114
|
-
const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
|
|
3115
|
-
const causeDetails = [];
|
|
3116
|
-
let current = sdkError;
|
|
3117
|
-
while (current instanceof Error && current.cause) {
|
|
3118
|
-
current = current.cause;
|
|
3119
|
-
if (current instanceof Error) {
|
|
3120
|
-
causeDetails.push(`${current.name}: ${current.message}`);
|
|
3121
|
-
} else {
|
|
3122
|
-
causeDetails.push(String(current));
|
|
3153
|
+
});
|
|
3154
|
+
child.stderr?.on("data", (data) => {
|
|
3155
|
+
const text = data.toString();
|
|
3156
|
+
stderr += text;
|
|
3157
|
+
lastOutputTime = Date.now();
|
|
3158
|
+
});
|
|
3159
|
+
child.on("close", (code) => {
|
|
3160
|
+
if (lineBuffer.trim()) {
|
|
3161
|
+
const evt = tryParseJson(lineBuffer);
|
|
3162
|
+
if (evt && evt.type) {
|
|
3163
|
+
allEvents.push({ event: evt, receivedAt: Date.now() });
|
|
3164
|
+
}
|
|
3123
3165
|
}
|
|
3124
|
-
|
|
3125
|
-
|
|
3126
|
-
Cause chain: ${causeDetails.join(" -> ")}` : "";
|
|
3127
|
-
console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
|
|
3128
|
-
console.error("[SDK-ERROR] Error name:", errorName);
|
|
3129
|
-
console.error("[SDK-ERROR] Error message:", errorMessage);
|
|
3130
|
-
if (causeDetails.length > 0) {
|
|
3131
|
-
console.error("[SDK-ERROR] Cause chain:", causeDetails.join(" -> "));
|
|
3132
|
-
}
|
|
3133
|
-
if (errorStack) {
|
|
3134
|
-
console.error("[SDK-ERROR] Stack:", errorStack);
|
|
3135
|
-
}
|
|
3136
|
-
if (traceContext) {
|
|
3137
|
-
emitTraceEvent(
|
|
3138
|
-
{
|
|
3139
|
-
evalRunId: traceContext.evalRunId,
|
|
3140
|
-
scenarioId: traceContext.scenarioId,
|
|
3141
|
-
scenarioName: traceContext.scenarioName,
|
|
3142
|
-
targetId: traceContext.targetId,
|
|
3143
|
-
targetName: traceContext.targetName,
|
|
3144
|
-
stepNumber: traceStepNumber + 1,
|
|
3145
|
-
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
3146
|
-
outputPreview: JSON.stringify({
|
|
3147
|
-
event: "sdk-execution-failed",
|
|
3148
|
-
error: errorMessage,
|
|
3149
|
-
errorName,
|
|
3150
|
-
...causeDetails.length > 0 && {
|
|
3151
|
-
causeChain: causeDetails.join(" -> ")
|
|
3152
|
-
}
|
|
3153
|
-
}).slice(0, 2e3),
|
|
3154
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3155
|
-
isComplete: true
|
|
3156
|
-
},
|
|
3157
|
-
traceContext.tracePushUrl,
|
|
3158
|
-
traceContext.routeHeader,
|
|
3159
|
-
traceContext.authToken
|
|
3166
|
+
console.log(
|
|
3167
|
+
`[executeWithOpenCode] Process exited with code ${code}, ${allEvents.length} events collected`
|
|
3160
3168
|
);
|
|
3161
|
-
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
|
|
3165
|
-
|
|
3166
|
-
|
|
3167
|
-
|
|
3168
|
-
|
|
3169
|
-
|
|
3170
|
-
|
|
3171
|
-
} catch {
|
|
3169
|
+
if (code === 0) {
|
|
3170
|
+
finalize(true);
|
|
3171
|
+
} else {
|
|
3172
|
+
finalize(
|
|
3173
|
+
false,
|
|
3174
|
+
new Error(
|
|
3175
|
+
`OpenCode CLI exited with code ${code}.
|
|
3176
|
+
Stderr: ${stderr.slice(0, 1e3)}`
|
|
3177
|
+
)
|
|
3178
|
+
);
|
|
3172
3179
|
}
|
|
3173
|
-
}
|
|
3174
|
-
|
|
3180
|
+
});
|
|
3181
|
+
child.on("error", (error) => {
|
|
3182
|
+
finalize(false, new Error(`OpenCode CLI spawn error: ${error.message}`));
|
|
3183
|
+
});
|
|
3184
|
+
});
|
|
3175
3185
|
}
|
|
3176
3186
|
|
|
3177
3187
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
@@ -3757,7 +3767,7 @@ defaultRegistry.register(simpleAgentAdapter);
|
|
|
3757
3767
|
|
|
3758
3768
|
// src/run-scenario/file-diff.ts
|
|
3759
3769
|
var import_fs2 = require("fs");
|
|
3760
|
-
var
|
|
3770
|
+
var import_path11 = require("path");
|
|
3761
3771
|
|
|
3762
3772
|
// ../../node_modules/diff/lib/index.mjs
|
|
3763
3773
|
function Diff() {
|
|
@@ -3933,7 +3943,7 @@ Diff.prototype = {
|
|
|
3933
3943
|
tokenize: function tokenize(value) {
|
|
3934
3944
|
return Array.from(value);
|
|
3935
3945
|
},
|
|
3936
|
-
join: function
|
|
3946
|
+
join: function join9(chars) {
|
|
3937
3947
|
return chars.join("");
|
|
3938
3948
|
},
|
|
3939
3949
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -4382,8 +4392,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
4382
4392
|
}
|
|
4383
4393
|
const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
|
|
4384
4394
|
for (const entry of entries) {
|
|
4385
|
-
const fullPath = (0,
|
|
4386
|
-
const relativePath = (0,
|
|
4395
|
+
const fullPath = (0, import_path11.join)(dir, entry.name);
|
|
4396
|
+
const relativePath = (0, import_path11.relative)(base, fullPath);
|
|
4387
4397
|
if (shouldIgnore(entry.name)) {
|
|
4388
4398
|
continue;
|
|
4389
4399
|
}
|