@wix/evalforge-evaluator 0.112.0 → 0.113.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -581,7 +581,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
581
581
  }
582
582
 
583
583
  // src/run-scenario/run-agent-with-context.ts
584
- import { randomUUID as randomUUID3 } from "crypto";
584
+ import { randomUUID as randomUUID4 } from "crypto";
585
585
 
586
586
  // src/run-scenario/agents/registry.ts
587
587
  var AgentAdapterRegistry = class {
@@ -1214,10 +1214,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
1214
1214
  }
1215
1215
  const startTime = /* @__PURE__ */ new Date();
1216
1216
  const allMessages = [];
1217
- const { mkdir: mkdirAsync, writeFile: writeFile5 } = await import("fs/promises");
1217
+ const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1218
1218
  const claudeDir = `${options.cwd}/.claude`;
1219
1219
  await mkdirAsync(claudeDir, { recursive: true });
1220
- await writeFile5(`${claudeDir}/settings.json`, "{}", {
1220
+ await writeFile6(`${claudeDir}/settings.json`, "{}", {
1221
1221
  flag: "wx"
1222
1222
  }).catch(() => {
1223
1223
  });
@@ -2137,197 +2137,1249 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
2137
2137
  // src/run-scenario/agents/claude-code/index.ts
2138
2138
  defaultRegistry.register(claudeCodeAdapter);
2139
2139
 
2140
- // src/run-scenario/agents/simple-agent/execute.ts
2141
- import {
2142
- generateText,
2143
- stepCountIs
2144
- } from "ai";
2145
- import { createAnthropic } from "@ai-sdk/anthropic";
2146
- import { createOpenAI } from "@ai-sdk/openai";
2140
+ // src/run-scenario/agents/opencode/opencode-adapter.ts
2141
+ import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
2142
+
2143
+ // src/run-scenario/agents/opencode/execute.ts
2147
2144
  import {
2148
- AVAILABLE_CLAUDE_MODEL_IDS,
2149
- OPENAI_RESPONSES_MODEL_IDS,
2150
- LLMStepType as LLMStepType2,
2145
+ ClaudeModel as ClaudeModel3,
2146
+ DEFAULT_EVALUATOR_SYSTEM_PROMPT as DEFAULT_EVALUATOR_SYSTEM_PROMPT2,
2151
2147
  LiveTraceEventType as LiveTraceEventType2
2152
2148
  } from "@wix/evalforge-types";
2153
- import { randomUUID as randomUUID2 } from "crypto";
2154
2149
 
2155
- // src/run-scenario/agents/simple-agent/mcp-tools.ts
2156
- import { createMCPClient } from "@ai-sdk/mcp";
2157
- import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio";
2158
- async function buildMcpTools(mcps, cwd) {
2159
- const allTools = {};
2160
- const clients = [];
2161
- try {
2162
- for (const mcp of mcps) {
2163
- const resolvedConfig = await resolveMcpPlaceholders(
2164
- mcp.config,
2165
- { cwd }
2150
+ // src/run-scenario/agents/opencode/write-skills.ts
2151
+ import { mkdir as mkdir5 } from "fs/promises";
2152
+ import { join as join6 } from "path";
2153
+ import { fetchGitHubFolder as fetchGitHubFolder3 } from "@wix/evalforge-github-client";
2154
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = fetchGitHubFolder3) {
2155
+ await Promise.all(
2156
+ skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
2157
+ );
2158
+ }
2159
+ async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
2160
+ const skillName = skill.name;
2161
+ const skillDir = join6(cwd, ".opencode", "skills", skillName);
2162
+ await mkdir5(skillDir, { recursive: true });
2163
+ const version = skill.latestVersion;
2164
+ if (version?.files && version.files.length > 0) {
2165
+ await writeFilesToDirectory(skillDir, version.files);
2166
+ console.log(
2167
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
2168
+ );
2169
+ } else if (skill.source) {
2170
+ try {
2171
+ const files = await fetchFn(skill.source, {
2172
+ userAgent: "EvalForge-Evaluator"
2173
+ });
2174
+ await writeFilesToDirectory(skillDir, files);
2175
+ console.log(
2176
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
2177
+ );
2178
+ } catch (error) {
2179
+ const message = error instanceof Error ? error.message : "Unknown error";
2180
+ console.error(
2181
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
2182
+ );
2183
+ throw new Error(
2184
+ `Failed to write skill ${skillName} to filesystem: ${message}`
2166
2185
  );
2167
- for (const [serverName, serverConfig] of Object.entries(resolvedConfig)) {
2168
- const config = serverConfig;
2169
- const transport = buildTransport(serverName, config, cwd);
2170
- const client = await createMCPClient({ transport });
2171
- clients.push(client);
2172
- const tools = await client.tools();
2173
- for (const [toolName, tool] of Object.entries(tools)) {
2174
- allTools[`${serverName}__${toolName}`] = tool;
2175
- }
2176
- }
2177
2186
  }
2178
- } catch (err) {
2179
- await closeMcpClients(clients);
2180
- throw err;
2187
+ } else {
2188
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
2181
2189
  }
2182
- return { tools: allTools, clients };
2183
2190
  }
2184
- async function closeMcpClients(clients) {
2185
- await Promise.allSettled(clients.map((c) => c.close()));
2191
+
2192
+ // src/run-scenario/agents/opencode/write-sub-agents.ts
2193
+ import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
2194
+ import { join as join7 } from "path";
2195
+ import {
2196
+ fetchGitHubFile as fetchGitHubFile2
2197
+ } from "@wix/evalforge-github-client";
2198
+ var AGENTS_DIR2 = ".opencode/agents";
2199
+ function toAgentFilename2(name, index, nameCount) {
2200
+ const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
2201
+ const count = nameCount.get(base) ?? 0;
2202
+ nameCount.set(base, count + 1);
2203
+ return count === 0 ? base : `${base}-${count + 1}`;
2186
2204
  }
2187
- function buildTransport(serverName, config, cwd) {
2188
- const type = config.type;
2189
- const headers = config.headers;
2190
- if (type === "stdio" || config.command) {
2191
- return new Experimental_StdioMCPTransport({
2192
- command: config.command,
2193
- args: config.args ?? [],
2194
- env: { ...config.env, PWD: cwd },
2195
- cwd
2196
- });
2205
+ async function resolveSubAgentContent2(agent, fetchFn) {
2206
+ if (agent.source) {
2207
+ try {
2208
+ const content = await fetchFn(agent.source, {
2209
+ userAgent: "EvalForge-Evaluator"
2210
+ });
2211
+ console.log(
2212
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
2213
+ );
2214
+ return content;
2215
+ } catch (error) {
2216
+ const message = error instanceof Error ? error.message : "Unknown error";
2217
+ console.error(
2218
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
2219
+ );
2220
+ throw new Error(
2221
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
2222
+ );
2223
+ }
2197
2224
  }
2198
- if (type === "http") {
2199
- return {
2200
- type: "http",
2201
- url: config.url,
2202
- ...headers && { headers }
2203
- };
2225
+ if (!agent.subAgentMd) {
2226
+ console.warn(
2227
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
2228
+ );
2204
2229
  }
2205
- if (type === "sse" || config.url) {
2230
+ return agent.subAgentMd;
2231
+ }
2232
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHubFile2) {
2233
+ if (subAgents.length === 0) return;
2234
+ const agentsDir = join7(cwd, AGENTS_DIR2);
2235
+ await mkdir6(agentsDir, { recursive: true });
2236
+ const nameCount = /* @__PURE__ */ new Map();
2237
+ for (const [i, agent] of subAgents.entries()) {
2238
+ const filename = toAgentFilename2(agent.name, i, nameCount);
2239
+ const filePath = join7(agentsDir, `${filename}.md`);
2240
+ const content = await resolveSubAgentContent2(agent, fetchFn);
2241
+ await writeFile5(filePath, content, "utf8");
2242
+ }
2243
+ console.log(`[SubAgents] Written to ${agentsDir}`);
2244
+ }
2245
+
2246
+ // src/run-scenario/agents/opencode/config.ts
2247
+ import {
2248
+ ClaudeModel as ClaudeModel2,
2249
+ AVAILABLE_OPENAI_MODEL_IDS
2250
+ } from "@wix/evalforge-types";
2251
+ var DEFAULT_MODEL2 = `${ClaudeModel2.CLAUDE_4_5_SONNET_1_0}`;
2252
+ function parseModel(model) {
2253
+ const slashIndex = model.indexOf("/");
2254
+ if (slashIndex > 0) {
2206
2255
  return {
2207
- type: "sse",
2208
- url: config.url,
2209
- ...headers && { headers }
2256
+ providerID: model.slice(0, slashIndex),
2257
+ modelID: model.slice(slashIndex + 1)
2210
2258
  };
2211
2259
  }
2212
- throw new Error(
2213
- `MCP server "${serverName}" has unsupported transport config (type=${type ?? "unset"}). Expected type "stdio", "http", or "sse", or a config with "command" or "url".`
2260
+ const isOpenAI = AVAILABLE_OPENAI_MODEL_IDS.includes(
2261
+ model
2214
2262
  );
2263
+ return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
2215
2264
  }
2216
-
2217
- // src/run-scenario/agents/shared/detect-tool-error.ts
2218
- function detectMcpToolError(output) {
2219
- if (output == null) return null;
2220
- if (typeof output === "object" && "isError" in output) {
2221
- const obj = output;
2222
- if (obj.isError === true) {
2223
- return extractErrorText(obj.content);
2265
+ function toOpenCodeMcpConfig(servers) {
2266
+ const result = {};
2267
+ for (const [name, entry] of Object.entries(servers)) {
2268
+ if (entry.type === "local" || entry.type === "remote") {
2269
+ result[name] = entry;
2270
+ continue;
2224
2271
  }
2225
- }
2226
- const str = typeof output === "string" ? output : null;
2227
- if (str && (str.includes('"isError":true') || str.includes('"isError": true'))) {
2228
- try {
2229
- const parsed = JSON.parse(str);
2230
- if (parsed.isError === true) {
2231
- return extractErrorText(parsed.content);
2232
- }
2233
- } catch {
2234
- return str.slice(0, 500);
2272
+ if (entry.url && typeof entry.url === "string") {
2273
+ result[name] = {
2274
+ type: "remote",
2275
+ url: entry.url,
2276
+ ...entry.headers ? { headers: entry.headers } : {},
2277
+ ...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
2278
+ };
2279
+ continue;
2235
2280
  }
2281
+ if (entry.command && typeof entry.command === "string") {
2282
+ const commandArray = [
2283
+ entry.command,
2284
+ ...entry.args || []
2285
+ ];
2286
+ result[name] = {
2287
+ type: "local",
2288
+ command: commandArray,
2289
+ ...entry.env ? { environment: entry.env } : {},
2290
+ ...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
2291
+ };
2292
+ continue;
2293
+ }
2294
+ console.warn(
2295
+ `[MCP] Server "${name}" has unrecognized format, passing through as-is:`,
2296
+ JSON.stringify(entry)
2297
+ );
2298
+ result[name] = entry;
2236
2299
  }
2237
- return null;
2238
- }
2239
- function extractErrorText(content) {
2240
- if (Array.isArray(content)) {
2241
- const text = content.filter((c) => typeof c.text === "string").map((c) => c.text).join("\n");
2242
- if (text) return text.slice(0, 500);
2243
- }
2244
- return "Tool call failed";
2300
+ return result;
2245
2301
  }
2246
-
2247
- // src/run-scenario/agents/simple-agent/cost-calculation.ts
2248
- import { normalizeModelId } from "@wix/evalforge-types";
2249
- var PROVIDER_ANTHROPIC = "anthropic";
2250
- var MODEL_PRICING = {
2251
- // Anthropic Claude 4.6
2252
- "claude-sonnet-4-6": { input: 3, output: 15 },
2253
- "claude-opus-4-6": { input: 15, output: 75 },
2254
- // Anthropic — Claude 4.5
2255
- "claude-opus-4-5": { input: 5, output: 25 },
2256
- "claude-sonnet-4-5": { input: 3, output: 15 },
2257
- "claude-haiku-4-5": { input: 1, output: 5 },
2258
- // Anthropic — Claude 4
2259
- "claude-opus-4": { input: 15, output: 75 },
2260
- "claude-sonnet-4": { input: 3, output: 15 },
2261
- // OpenAI — GPT-5
2262
- "gpt-5": { input: 1.25, output: 10 },
2263
- "gpt-5-mini": { input: 0.25, output: 2 },
2264
- "gpt-5-nano": { input: 0.05, output: 0.4 },
2265
- // OpenAI — GPT-4.1
2266
- "gpt-4.1": { input: 2, output: 8 },
2267
- "gpt-4.1-mini": { input: 0.4, output: 1.6 },
2268
- "gpt-4.1-nano": { input: 0.1, output: 0.4 },
2269
- // OpenAI — GPT-4o
2270
- "gpt-4o": { input: 2.5, output: 10 },
2271
- "gpt-4o-mini": { input: 0.15, output: 0.6 },
2272
- // OpenAI — Reasoning
2273
- o3: { input: 2, output: 8 },
2274
- "o4-mini": { input: 1.1, output: 4.4 },
2275
- "o3-mini": { input: 1.1, output: 4.4 },
2276
- o1: { input: 15, output: 60 }
2277
- };
2278
- function extractGatewayCost(step, provider) {
2279
- try {
2280
- if (provider === PROVIDER_ANTHROPIC) {
2281
- const meta = step.providerMetadata;
2282
- const anthropic = meta?.anthropic;
2283
- const usage = anthropic?.usage;
2284
- const cost2 = usage?.total_cost_usd;
2285
- return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
2302
+ async function buildOpenCodeConfig(options) {
2303
+ const modelStr = options.model || DEFAULT_MODEL2;
2304
+ const { providerID, modelID } = parseModel(modelStr);
2305
+ const provider = {};
2306
+ if (options.aiGatewayUrl) {
2307
+ const providerOptions = {
2308
+ baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
2309
+ apiKey: "sk-placeholder-auth-handled-by-gateway"
2310
+ };
2311
+ if (options.aiGatewayHeaders) {
2312
+ providerOptions.headers = { ...options.aiGatewayHeaders };
2286
2313
  }
2287
- const body = step.response?.body;
2288
- const cost = body?.total_cost_usd;
2289
- return typeof cost === "number" && cost > 0 ? cost : void 0;
2290
- } catch {
2291
- return void 0;
2314
+ provider[providerID] = {
2315
+ options: providerOptions
2316
+ };
2292
2317
  }
2293
- }
2294
- function calculateFromPricing(modelId, tokenUsage) {
2295
- const normalized = normalizeModelId(modelId);
2296
- const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
2297
- if (!pricing) return 0;
2298
- return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
2299
- }
2300
- function calculateStepCost(step, modelId, provider, tokenUsage) {
2301
- return extractGatewayCost(step, provider) ?? calculateFromPricing(modelId, tokenUsage);
2318
+ let mcp;
2319
+ if (options.mcps && options.mcps.length > 0) {
2320
+ const mcpServers = {};
2321
+ for (const mcpEntity of options.mcps) {
2322
+ const entityConfig = mcpEntity.config;
2323
+ for (const [key, value] of Object.entries(entityConfig)) {
2324
+ if (typeof value !== "object" || value === null || Array.isArray(value)) {
2325
+ throw new Error(
2326
+ `MCP "${mcpEntity.name}" has invalid config: value for key "${key}" must be an object (got ${typeof value}).`
2327
+ );
2328
+ }
2329
+ mcpServers[key] = value;
2330
+ }
2331
+ }
2332
+ const resolved = await resolveMcpPlaceholders(mcpServers, {
2333
+ cwd: options.cwd
2334
+ });
2335
+ mcp = toOpenCodeMcpConfig(resolved);
2336
+ }
2337
+ const agentOverrides = {};
2338
+ if (options.temperature != null) {
2339
+ agentOverrides.temperature = options.temperature;
2340
+ }
2341
+ if (options.maxTurns != null) {
2342
+ agentOverrides.maxSteps = options.maxTurns;
2343
+ }
2344
+ const config = {
2345
+ model: `${providerID}/${modelID}`,
2346
+ provider,
2347
+ ...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
2348
+ permission: {
2349
+ edit: "allow",
2350
+ bash: "allow",
2351
+ webfetch: "allow",
2352
+ doom_loop: "allow",
2353
+ external_directory: "allow"
2354
+ },
2355
+ ...mcp ? { mcp } : {}
2356
+ };
2357
+ return { config, providerID, modelID };
2302
2358
  }
2303
2359
 
2304
- // src/run-scenario/agents/simple-agent/build-conversation.ts
2305
- function buildConversation2(triggerPrompt, steps, executionStartMs) {
2306
- const messages = [];
2307
- messages.push({
2308
- role: "user",
2309
- content: [{ type: "text", text: triggerPrompt }],
2310
- timestamp: new Date(executionStartMs).toISOString()
2311
- });
2312
- for (let i = 0; i < steps.length; i++) {
2313
- const step = steps[i];
2314
- const stepTimestamp = estimateStepTimestamp(
2315
- executionStartMs,
2316
- i,
2317
- steps.length
2318
- );
2319
- const assistantContent = [];
2320
- if (step.reasoningText) {
2321
- assistantContent.push({ type: "thinking", thinking: step.reasoningText });
2322
- }
2323
- if (step.text) {
2324
- assistantContent.push({ type: "text", text: step.text });
2325
- }
2326
- for (const tc of step.toolCalls) {
2327
- assistantContent.push({
2328
- type: "tool_use",
2329
- toolName: tc.toolName,
2330
- toolId: tc.toolCallId,
2360
+ // src/run-scenario/agents/opencode/build-trace.ts
2361
+ import { LLMStepType as LLMStepType2 } from "@wix/evalforge-types";
2362
+ import { randomUUID as randomUUID2 } from "crypto";
2363
+ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2364
+ const assistantMessages = messages.filter(
2365
+ (m) => m.info.role === "assistant"
2366
+ );
2367
+ const allSteps = assistantMessages.flatMap((msg, turnIndex) => {
2368
+ const { info, parts } = msg;
2369
+ let text = "";
2370
+ let thinking = "";
2371
+ const toolCalls = [];
2372
+ let stepInputTokens = 0;
2373
+ let stepOutputTokens = 0;
2374
+ let stepCost = 0;
2375
+ let finishReason = "unknown";
2376
+ for (const part of parts) {
2377
+ switch (part.type) {
2378
+ case "text": {
2379
+ const textPart = part;
2380
+ text += textPart.text;
2381
+ break;
2382
+ }
2383
+ case "reasoning": {
2384
+ const reasoningPart = part;
2385
+ thinking += reasoningPart.text;
2386
+ break;
2387
+ }
2388
+ case "tool": {
2389
+ const toolPart = part;
2390
+ toolCalls.push({
2391
+ toolName: toolPart.tool,
2392
+ args: toolPart.state.input
2393
+ });
2394
+ break;
2395
+ }
2396
+ case "step-finish": {
2397
+ const sf = part;
2398
+ stepInputTokens += sf.tokens.input;
2399
+ stepOutputTokens += sf.tokens.output;
2400
+ stepCost += sf.cost;
2401
+ finishReason = sf.reason;
2402
+ break;
2403
+ }
2404
+ }
2405
+ }
2406
+ if (stepInputTokens === 0 && stepOutputTokens === 0) {
2407
+ stepInputTokens = info.tokens.input;
2408
+ stepOutputTokens = info.tokens.output;
2409
+ stepCost = info.cost;
2410
+ }
2411
+ const startedAt = new Date(info.time.created).toISOString();
2412
+ const completedAt = info.time.completed ? info.time.completed : turnIndex + 1 < assistantMessages.length ? assistantMessages[turnIndex + 1].info.time.created : info.time.created + totalDurationMs;
2413
+ const durationMs = Math.max(0, completedAt - info.time.created);
2414
+ const isSuccess = finishReason !== "error";
2415
+ const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2416
+ const stepModel = info.modelID || model;
2417
+ const stepProvider = info.providerID || provider;
2418
+ const toolCallCount = toolCalls.length;
2419
+ const hasThinking = !!thinking;
2420
+ const hasText = !!text;
2421
+ const subSteps = [];
2422
+ const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
2423
+ const toolSubSteps = toolCallCount;
2424
+ const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
2425
+ const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
2426
+ if (hasThinking && (hasText || toolCallCount > 0)) {
2427
+ subSteps.push({
2428
+ id: randomUUID2(),
2429
+ stepNumber: 0,
2430
+ // renumbered below
2431
+ turnIndex,
2432
+ type: LLMStepType2.THINKING,
2433
+ model: stepModel,
2434
+ provider: stepProvider,
2435
+ startedAt,
2436
+ durationMs: Math.round(durationMs / totalSubSteps),
2437
+ tokenUsage: {
2438
+ prompt: Math.round(stepInputTokens / totalSubSteps),
2439
+ completion: Math.round(stepOutputTokens / totalSubSteps),
2440
+ total: Math.round(
2441
+ (stepInputTokens + stepOutputTokens) / totalSubSteps
2442
+ )
2443
+ },
2444
+ costUsd: stepCost / totalSubSteps,
2445
+ outputPreview: thinking.slice(0, 200),
2446
+ success: isSuccess,
2447
+ error: errorMsg
2448
+ });
2449
+ }
2450
+ if (toolCallCount > 0) {
2451
+ for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
2452
+ const tc = toolCalls[tcIdx];
2453
+ const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
2454
+ const toolBudgetSteps = toolSubSteps + textSubSteps;
2455
+ const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
2456
+ const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
2457
+ subSteps.push({
2458
+ id: randomUUID2(),
2459
+ stepNumber: 0,
2460
+ turnIndex,
2461
+ type: LLMStepType2.TOOL_USE,
2462
+ model: stepModel,
2463
+ provider: stepProvider,
2464
+ startedAt,
2465
+ durationMs: isLast ? durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(durationMs * remainingFraction * toolFraction),
2466
+ tokenUsage: {
2467
+ prompt: Math.round(
2468
+ stepInputTokens * remainingFraction * toolFraction
2469
+ ),
2470
+ completion: Math.round(
2471
+ stepOutputTokens * remainingFraction * toolFraction
2472
+ ),
2473
+ total: Math.round(
2474
+ (stepInputTokens + stepOutputTokens) * remainingFraction * toolFraction
2475
+ )
2476
+ },
2477
+ costUsd: stepCost * remainingFraction * toolFraction,
2478
+ toolName: tc.toolName,
2479
+ toolArguments: JSON.stringify(tc.args),
2480
+ outputPreview: tcIdx === 0 && !hasText ? (text || thinking)?.slice(0, 200) : void 0,
2481
+ success: isSuccess,
2482
+ error: errorMsg
2483
+ });
2484
+ }
2485
+ }
2486
+ if (hasText && toolCallCount > 0) {
2487
+ subSteps.push({
2488
+ id: randomUUID2(),
2489
+ stepNumber: 0,
2490
+ turnIndex,
2491
+ type: LLMStepType2.COMPLETION,
2492
+ model: stepModel,
2493
+ provider: stepProvider,
2494
+ startedAt,
2495
+ durationMs: durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
2496
+ tokenUsage: {
2497
+ prompt: stepInputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
2498
+ completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
2499
+ total: stepInputTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
2500
+ },
2501
+ costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
2502
+ outputPreview: text.slice(0, 200),
2503
+ success: isSuccess,
2504
+ error: errorMsg
2505
+ });
2506
+ }
2507
+ if (subSteps.length === 0) {
2508
+ const stepType = hasThinking && !hasText ? LLMStepType2.THINKING : LLMStepType2.COMPLETION;
2509
+ subSteps.push({
2510
+ id: randomUUID2(),
2511
+ stepNumber: 0,
2512
+ turnIndex,
2513
+ type: stepType,
2514
+ model: stepModel,
2515
+ provider: stepProvider,
2516
+ startedAt,
2517
+ durationMs,
2518
+ tokenUsage: {
2519
+ prompt: stepInputTokens,
2520
+ completion: stepOutputTokens,
2521
+ total: stepInputTokens + stepOutputTokens
2522
+ },
2523
+ costUsd: stepCost,
2524
+ outputPreview: (text || thinking)?.slice(0, 200),
2525
+ success: isSuccess,
2526
+ error: errorMsg
2527
+ });
2528
+ }
2529
+ return subSteps;
2530
+ }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
2531
+ const totalTokens = buildTotalTokens(assistantMessages);
2532
+ const totalCost = assistantMessages.reduce((sum, m) => {
2533
+ const aMsg = m.info;
2534
+ return sum + aMsg.cost;
2535
+ }, 0);
2536
+ const stepTypeBreakdown = {};
2537
+ for (const step of allSteps) {
2538
+ const entry = stepTypeBreakdown[step.type] ?? {
2539
+ count: 0,
2540
+ durationMs: 0,
2541
+ tokens: 0,
2542
+ costUsd: 0
2543
+ };
2544
+ entry.count += 1;
2545
+ entry.durationMs += step.durationMs;
2546
+ entry.tokens += step.tokenUsage.total;
2547
+ entry.costUsd += step.costUsd;
2548
+ stepTypeBreakdown[step.type] = entry;
2549
+ }
2550
+ const modelUsed = allSteps[0]?.model || model;
2551
+ const summary = {
2552
+ totalSteps: allSteps.length,
2553
+ totalTurns: assistantMessages.length,
2554
+ totalDurationMs,
2555
+ totalTokens,
2556
+ totalCostUsd: totalCost,
2557
+ modelBreakdown: {
2558
+ [modelUsed]: {
2559
+ count: allSteps.length,
2560
+ durationMs: totalDurationMs,
2561
+ tokens: totalTokens.total,
2562
+ costUsd: totalCost
2563
+ }
2564
+ },
2565
+ modelsUsed: [modelUsed],
2566
+ stepTypeBreakdown
2567
+ };
2568
+ return {
2569
+ id: randomUUID2(),
2570
+ steps: allSteps,
2571
+ summary
2572
+ };
2573
+ }
2574
+ function buildTotalTokens(assistantMessages) {
2575
+ let prompt = 0;
2576
+ let completion = 0;
2577
+ for (const { info } of assistantMessages) {
2578
+ prompt += info.tokens.input;
2579
+ completion += info.tokens.output;
2580
+ }
2581
+ return { prompt, completion, total: prompt + completion };
2582
+ }
2583
+
2584
+ // src/run-scenario/agents/opencode/build-conversation.ts
2585
+ function buildConversation2(messages) {
2586
+ const result = [];
2587
+ for (const { info, parts } of messages) {
2588
+ const timestamp = new Date(info.time.created).toISOString();
2589
+ if (info.role === "assistant") {
2590
+ const content = [];
2591
+ for (const part of parts) {
2592
+ switch (part.type) {
2593
+ case "text": {
2594
+ const textPart = part;
2595
+ content.push({ type: "text", text: textPart.text });
2596
+ break;
2597
+ }
2598
+ case "reasoning": {
2599
+ const reasoningPart = part;
2600
+ content.push({ type: "thinking", thinking: reasoningPart.text });
2601
+ break;
2602
+ }
2603
+ case "tool": {
2604
+ const toolPart = part;
2605
+ content.push({
2606
+ type: "tool_use",
2607
+ toolName: toolPart.tool,
2608
+ toolId: toolPart.callID,
2609
+ input: toolPart.state.input
2610
+ });
2611
+ break;
2612
+ }
2613
+ }
2614
+ }
2615
+ if (content.length > 0) {
2616
+ result.push({ role: "assistant", content, timestamp });
2617
+ }
2618
+ } else if (info.role === "user") {
2619
+ const content = [];
2620
+ for (const part of parts) {
2621
+ if (part.type === "text") {
2622
+ const textPart = part;
2623
+ content.push({ type: "text", text: textPart.text });
2624
+ } else if (part.type === "tool") {
2625
+ const toolPart = part;
2626
+ const state = toolPart.state;
2627
+ if (state.status === "completed") {
2628
+ const completed = state;
2629
+ content.push({
2630
+ type: "tool_result",
2631
+ toolUseId: toolPart.callID,
2632
+ content: completed.output
2633
+ });
2634
+ } else if (state.status === "error") {
2635
+ const errState = state;
2636
+ content.push({
2637
+ type: "tool_result",
2638
+ toolUseId: toolPart.callID,
2639
+ content: errState.error,
2640
+ isError: true
2641
+ });
2642
+ }
2643
+ }
2644
+ }
2645
+ if (content.length > 0) {
2646
+ result.push({ role: "user", content, timestamp });
2647
+ }
2648
+ }
2649
+ }
2650
+ return result;
2651
+ }
2652
+
2653
+ // src/run-scenario/agents/opencode/execute.ts
2654
+ var DEFAULT_MODEL3 = `anthropic/${ClaudeModel3.CLAUDE_4_5_SONNET_1_0}`;
2655
+ function extractToolAction(toolName, args) {
2656
+ if (!toolName) return "Using tool...";
2657
+ const a = args;
2658
+ if ((toolName === "Task" || toolName === "dispatch_agent") && a?.description) {
2659
+ const desc = String(a.description).slice(0, 55);
2660
+ return `Task: ${desc}${String(a.description).length > 55 ? "..." : ""}`;
2661
+ }
2662
+ if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && a?.command) {
2663
+ const cmd = String(a.command).slice(0, 50);
2664
+ return `Running: ${cmd}${String(a.command).length > 50 ? "..." : ""}`;
2665
+ }
2666
+ if (a?.file_path || a?.path || a?.target_file) {
2667
+ const filePath = String(a.file_path || a.path || a.target_file).slice(
2668
+ 0,
2669
+ 50
2670
+ );
2671
+ if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
2672
+ if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
2673
+ }
2674
+ return `Using ${toolName}...`;
2675
+ }
2676
+ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2677
+ const base = {
2678
+ evalRunId: context.evalRunId,
2679
+ scenarioId: context.scenarioId,
2680
+ scenarioName: context.scenarioName,
2681
+ targetId: context.targetId,
2682
+ targetName: context.targetName,
2683
+ stepNumber,
2684
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2685
+ isComplete
2686
+ };
2687
+ switch (part.type) {
2688
+ case "text": {
2689
+ const textPart = part;
2690
+ return {
2691
+ ...base,
2692
+ type: LiveTraceEventType2.COMPLETION,
2693
+ outputPreview: textPart.text.slice(0, 500)
2694
+ };
2695
+ }
2696
+ case "reasoning": {
2697
+ const reasoningPart = part;
2698
+ return {
2699
+ ...base,
2700
+ type: LiveTraceEventType2.THINKING,
2701
+ thinking: reasoningPart.text.slice(0, 500)
2702
+ };
2703
+ }
2704
+ case "tool": {
2705
+ const toolPart = part;
2706
+ const toolName = toolPart.tool;
2707
+ const args = toolPart.state.input;
2708
+ const toolArgs = JSON.stringify(args).slice(0, 500);
2709
+ let type = LiveTraceEventType2.TOOL_USE;
2710
+ let filePath;
2711
+ const a = args;
2712
+ if (a.file_path || a.path || a.target_file) {
2713
+ filePath = String(a.file_path || a.path || a.target_file);
2714
+ if (/write|edit/i.test(toolName)) {
2715
+ type = LiveTraceEventType2.FILE_WRITE;
2716
+ } else if (/read|view/i.test(toolName)) {
2717
+ type = LiveTraceEventType2.FILE_READ;
2718
+ }
2719
+ }
2720
+ return { ...base, type, toolName, toolArgs, filePath };
2721
+ }
2722
+ case "step-finish":
2723
+ return {
2724
+ ...base,
2725
+ type: LiveTraceEventType2.PROGRESS,
2726
+ outputPreview: "Step completed"
2727
+ };
2728
+ default:
2729
+ return null;
2730
+ }
2731
+ }
2732
+ async function executeWithOpenCode(skills, scenario, options) {
2733
+ const skillNames = skills.map((s) => s.name).join(", ");
2734
+ console.log("[executeWithOpenCode] Starting execution", {
2735
+ skillCount: skills.length,
2736
+ skillNames,
2737
+ scenarioId: scenario.id,
2738
+ scenarioName: scenario.name,
2739
+ cwd: options.cwd,
2740
+ aiGatewayUrl: options.aiGatewayUrl,
2741
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2742
+ model: options.model
2743
+ });
2744
+ const startTime = /* @__PURE__ */ new Date();
2745
+ if (options.mcps && options.mcps.length > 0) {
2746
+ console.log(
2747
+ `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
2748
+ );
2749
+ }
2750
+ if (options.subAgents && options.subAgents.length > 0) {
2751
+ await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
2752
+ }
2753
+ if (options.rules && options.rules.length > 0) {
2754
+ await writeRulesToFilesystem(options.cwd, options.rules);
2755
+ }
2756
+ try {
2757
+ await writeSkillsToFilesystem2(options.cwd, skills);
2758
+ } catch (writeError) {
2759
+ throw new Error(
2760
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
2761
+ );
2762
+ }
2763
+ const maxTurns = options.maxTurns ?? 10;
2764
+ const { config, providerID, modelID } = await buildOpenCodeConfig({
2765
+ model: options.model,
2766
+ temperature: options.temperature,
2767
+ maxTurns,
2768
+ aiGatewayUrl: options.aiGatewayUrl,
2769
+ aiGatewayHeaders: options.aiGatewayHeaders,
2770
+ mcps: options.mcps,
2771
+ cwd: options.cwd
2772
+ });
2773
+ const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
2774
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2775
+ const abortController = new AbortController();
2776
+ let timeoutHandle;
2777
+ let heartbeatHandle;
2778
+ let timedOut = false;
2779
+ const traceContext = options.traceContext;
2780
+ let traceStepNumber = 0;
2781
+ let lastAction = "Starting...";
2782
+ let lastToolName;
2783
+ let lastFilePath;
2784
+ if (traceContext) {
2785
+ emitTraceEvent(
2786
+ {
2787
+ evalRunId: traceContext.evalRunId,
2788
+ scenarioId: traceContext.scenarioId,
2789
+ scenarioName: traceContext.scenarioName,
2790
+ targetId: traceContext.targetId,
2791
+ targetName: traceContext.targetName,
2792
+ stepNumber: 0,
2793
+ type: LiveTraceEventType2.DIAGNOSTIC,
2794
+ outputPreview: JSON.stringify({
2795
+ event: "pre-sdk-execution",
2796
+ model: `${providerID}/${modelID}`,
2797
+ maxTurns,
2798
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
2799
+ }),
2800
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2801
+ isComplete: false
2802
+ },
2803
+ traceContext.tracePushUrl,
2804
+ traceContext.routeHeader,
2805
+ traceContext.authToken
2806
+ );
2807
+ }
2808
+ let server;
2809
+ try {
2810
+ console.log("[SDK-DEBUG] Starting OpenCode server...");
2811
+ server = await createOpencodeServer({
2812
+ config,
2813
+ signal: abortController.signal,
2814
+ timeout: 3e4
2815
+ });
2816
+ console.log(`[SDK-DEBUG] Server started at ${server.url}`);
2817
+ const client = createOpencodeClient({
2818
+ baseUrl: server.url,
2819
+ directory: options.cwd
2820
+ });
2821
+ const session = await client.session.create({
2822
+ body: { title: `eval-${scenario.name}` }
2823
+ });
2824
+ if (!session.data) {
2825
+ const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
2826
+ throw new Error(
2827
+ `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
2828
+ );
2829
+ }
2830
+ const sessionId = session.data.id;
2831
+ console.log(`[SDK-DEBUG] Session created: ${sessionId}`);
2832
+ let eventStreamAbort;
2833
+ if (traceContext) {
2834
+ eventStreamAbort = new AbortController();
2835
+ const executionStartTime = Date.now();
2836
+ (async () => {
2837
+ try {
2838
+ const events = await client.event.subscribe();
2839
+ for await (const event of events.stream) {
2840
+ if (eventStreamAbort.signal.aborted) break;
2841
+ const evt = event;
2842
+ if (evt.type === "message.part.updated") {
2843
+ const { part } = evt.properties;
2844
+ traceStepNumber++;
2845
+ const traceEvent = createTraceEventFromPart(
2846
+ part,
2847
+ traceContext,
2848
+ traceStepNumber,
2849
+ false
2850
+ );
2851
+ if (traceEvent) {
2852
+ lastToolName = traceEvent.toolName;
2853
+ lastFilePath = traceEvent.filePath;
2854
+ if (traceEvent.type === LiveTraceEventType2.THINKING) {
2855
+ lastAction = "Thinking...";
2856
+ } else if (traceEvent.type === LiveTraceEventType2.TOOL_USE) {
2857
+ lastAction = extractToolAction(
2858
+ traceEvent.toolName ?? "",
2859
+ void 0
2860
+ );
2861
+ } else if (traceEvent.type === LiveTraceEventType2.FILE_WRITE) {
2862
+ lastAction = `Writing: ${traceEvent.filePath || "file"}`;
2863
+ } else if (traceEvent.type === LiveTraceEventType2.FILE_READ) {
2864
+ lastAction = `Reading: ${traceEvent.filePath || "file"}`;
2865
+ } else if (traceEvent.type === LiveTraceEventType2.COMPLETION) {
2866
+ lastAction = "Processing response...";
2867
+ }
2868
+ emitTraceEvent(
2869
+ traceEvent,
2870
+ traceContext.tracePushUrl,
2871
+ traceContext.routeHeader,
2872
+ traceContext.authToken
2873
+ );
2874
+ }
2875
+ } else if (evt.type === "session.error") {
2876
+ const props = evt.properties;
2877
+ traceStepNumber++;
2878
+ emitTraceEvent(
2879
+ {
2880
+ evalRunId: traceContext.evalRunId,
2881
+ scenarioId: traceContext.scenarioId,
2882
+ scenarioName: traceContext.scenarioName,
2883
+ targetId: traceContext.targetId,
2884
+ targetName: traceContext.targetName,
2885
+ stepNumber: traceStepNumber,
2886
+ type: LiveTraceEventType2.DIAGNOSTIC,
2887
+ outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
2888
+ 0,
2889
+ 500
2890
+ ),
2891
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2892
+ isComplete: false
2893
+ },
2894
+ traceContext.tracePushUrl,
2895
+ traceContext.routeHeader,
2896
+ traceContext.authToken
2897
+ );
2898
+ }
2899
+ }
2900
+ } catch {
2901
+ }
2902
+ })();
2903
+ let lastReportedAction = "";
2904
+ let sameActionCount = 0;
2905
+ heartbeatHandle = setInterval(() => {
2906
+ const elapsedMs = Date.now() - executionStartTime;
2907
+ let progressMessage = lastAction;
2908
+ if (lastAction === lastReportedAction) {
2909
+ sameActionCount++;
2910
+ } else {
2911
+ sameActionCount = 1;
2912
+ lastReportedAction = lastAction;
2913
+ }
2914
+ const isTaskTool = lastToolName === "Task" || lastToolName === "dispatch_agent";
2915
+ if (isTaskTool && sameActionCount > 1) {
2916
+ progressMessage = `Waiting for ${lastAction}`;
2917
+ } else if (lastToolName && lastFilePath) {
2918
+ progressMessage = `${lastToolName}: ${lastFilePath}`;
2919
+ } else if (lastToolName && !isTaskTool) {
2920
+ progressMessage = `Using ${lastToolName}...`;
2921
+ }
2922
+ const elapsedSec = Math.round(elapsedMs / 1e3);
2923
+ progressMessage += ` (${elapsedSec}s, step ${traceStepNumber})`;
2924
+ emitTraceEvent(
2925
+ {
2926
+ evalRunId: traceContext.evalRunId,
2927
+ scenarioId: traceContext.scenarioId,
2928
+ scenarioName: traceContext.scenarioName,
2929
+ targetId: traceContext.targetId,
2930
+ targetName: traceContext.targetName,
2931
+ stepNumber: traceStepNumber,
2932
+ type: LiveTraceEventType2.PROGRESS,
2933
+ outputPreview: progressMessage,
2934
+ toolName: lastToolName,
2935
+ filePath: lastFilePath,
2936
+ elapsedMs,
2937
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2938
+ isComplete: false
2939
+ },
2940
+ traceContext.tracePushUrl,
2941
+ traceContext.routeHeader,
2942
+ traceContext.authToken
2943
+ );
2944
+ }, 1e4);
2945
+ }
2946
+ const promptPromise = (async () => {
2947
+ let systemPrompt;
2948
+ if (options.systemPrompt === null || options.systemPrompt === "") {
2949
+ } else if (options.systemPrompt != null) {
2950
+ systemPrompt = options.systemPrompt;
2951
+ } else {
2952
+ systemPrompt = DEFAULT_EVALUATOR_SYSTEM_PROMPT2;
2953
+ }
2954
+ console.log("[SDK-DEBUG] Sending prompt...");
2955
+ const result = await client.session.prompt({
2956
+ path: { id: sessionId },
2957
+ body: {
2958
+ model: { providerID, modelID },
2959
+ ...systemPrompt ? { system: systemPrompt } : {},
2960
+ parts: [{ type: "text", text: scenario.triggerPrompt }]
2961
+ }
2962
+ });
2963
+ return result;
2964
+ })();
2965
+ const timeoutPromise = new Promise((_, reject) => {
2966
+ timeoutHandle = setTimeout(() => {
2967
+ timedOut = true;
2968
+ client.session.abort({ path: { id: sessionId } }).catch(() => {
2969
+ });
2970
+ reject(
2971
+ new Error(
2972
+ `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
2973
+ )
2974
+ );
2975
+ }, SDK_TIMEOUT_MS);
2976
+ });
2977
+ const promptResult = await Promise.race([promptPromise, timeoutPromise]);
2978
+ if (timeoutHandle) clearTimeout(timeoutHandle);
2979
+ if (heartbeatHandle) clearInterval(heartbeatHandle);
2980
+ if (eventStreamAbort) eventStreamAbort.abort();
2981
+ if ("error" in promptResult && promptResult.error) {
2982
+ const errPayload = promptResult.error;
2983
+ throw new Error(
2984
+ `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
2985
+ );
2986
+ }
2987
+ console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
2988
+ const messagesResponse = await client.session.messages({
2989
+ path: { id: sessionId }
2990
+ });
2991
+ const allMessages = messagesResponse.data ?? [];
2992
+ console.log(
2993
+ `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
2994
+ );
2995
+ if (traceContext) {
2996
+ emitTraceEvent(
2997
+ {
2998
+ evalRunId: traceContext.evalRunId,
2999
+ scenarioId: traceContext.scenarioId,
3000
+ scenarioName: traceContext.scenarioName,
3001
+ targetId: traceContext.targetId,
3002
+ targetName: traceContext.targetName,
3003
+ stepNumber: traceStepNumber + 1,
3004
+ type: LiveTraceEventType2.COMPLETION,
3005
+ outputPreview: "Scenario execution completed",
3006
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3007
+ isComplete: true
3008
+ },
3009
+ traceContext.tracePushUrl,
3010
+ traceContext.routeHeader,
3011
+ traceContext.authToken
3012
+ );
3013
+ }
3014
+ const endTime = /* @__PURE__ */ new Date();
3015
+ const totalDurationMs = endTime.getTime() - startTime.getTime();
3016
+ const resultData = promptResult.data;
3017
+ const lastAssistantInfo = resultData?.info;
3018
+ if (lastAssistantInfo?.error) {
3019
+ const err = lastAssistantInfo.error;
3020
+ throw new Error(
3021
+ `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
3022
+ );
3023
+ }
3024
+ let outputText = "";
3025
+ if (resultData?.parts) {
3026
+ for (const part of resultData.parts) {
3027
+ if (part.type === "text") {
3028
+ outputText += part.text;
3029
+ }
3030
+ }
3031
+ }
3032
+ if (!outputText && allMessages.length > 0) {
3033
+ for (let i = allMessages.length - 1; i >= 0; i--) {
3034
+ const msg = allMessages[i];
3035
+ if (msg.info.role === "assistant") {
3036
+ const assistantInfo = msg.info;
3037
+ if (assistantInfo.error) {
3038
+ throw new Error(
3039
+ `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
3040
+ );
3041
+ }
3042
+ for (const part of msg.parts) {
3043
+ if (part.type === "text") {
3044
+ outputText += part.text;
3045
+ }
3046
+ }
3047
+ if (outputText) break;
3048
+ }
3049
+ }
3050
+ }
3051
+ if (!outputText) {
3052
+ const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
3053
+ if (!hasAssistant) {
3054
+ throw new Error(
3055
+ `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
3056
+ );
3057
+ }
3058
+ }
3059
+ const usage = lastAssistantInfo ? {
3060
+ inputTokens: lastAssistantInfo.tokens.input,
3061
+ outputTokens: lastAssistantInfo.tokens.output,
3062
+ totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
3063
+ } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
3064
+ const costUsd = lastAssistantInfo?.cost;
3065
+ const modelStr = options.model || DEFAULT_MODEL3;
3066
+ const llmTrace = buildLLMTrace(
3067
+ allMessages,
3068
+ totalDurationMs,
3069
+ modelStr,
3070
+ providerID
3071
+ );
3072
+ const conversation = buildConversation2(allMessages);
3073
+ return {
3074
+ result: {
3075
+ outputText,
3076
+ durationMs: totalDurationMs,
3077
+ usage,
3078
+ costUsd
3079
+ },
3080
+ llmTrace,
3081
+ conversation
3082
+ };
3083
+ } catch (sdkError) {
3084
+ if (timeoutHandle) clearTimeout(timeoutHandle);
3085
+ if (heartbeatHandle) clearInterval(heartbeatHandle);
3086
+ if (timedOut) {
3087
+ console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
3088
+ }
3089
+ const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
3090
+ const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
3091
+ const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
3092
+ console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
3093
+ console.error("[SDK-ERROR] Error name:", errorName);
3094
+ console.error("[SDK-ERROR] Error message:", errorMessage);
3095
+ if (errorStack) {
3096
+ console.error("[SDK-ERROR] Stack:", errorStack);
3097
+ }
3098
+ if (traceContext) {
3099
+ emitTraceEvent(
3100
+ {
3101
+ evalRunId: traceContext.evalRunId,
3102
+ scenarioId: traceContext.scenarioId,
3103
+ scenarioName: traceContext.scenarioName,
3104
+ targetId: traceContext.targetId,
3105
+ targetName: traceContext.targetName,
3106
+ stepNumber: traceStepNumber + 1,
3107
+ type: LiveTraceEventType2.DIAGNOSTIC,
3108
+ outputPreview: JSON.stringify({
3109
+ event: "sdk-execution-failed",
3110
+ error: errorMessage,
3111
+ errorName
3112
+ }).slice(0, 2e3),
3113
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3114
+ isComplete: true
3115
+ },
3116
+ traceContext.tracePushUrl,
3117
+ traceContext.routeHeader,
3118
+ traceContext.authToken
3119
+ );
3120
+ }
3121
+ throw new Error(
3122
+ `OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `
3123
+ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
3124
+ );
3125
+ } finally {
3126
+ if (server) {
3127
+ try {
3128
+ server.close();
3129
+ console.log("[SDK-DEBUG] OpenCode server closed");
3130
+ } catch {
3131
+ }
3132
+ }
3133
+ }
3134
+ }
3135
+
3136
+ // src/run-scenario/agents/opencode/opencode-adapter.ts
3137
+ var OpenCodeAdapter = class {
3138
+ id = "opencode";
3139
+ name = "OpenCode";
3140
+ supportedCommands = [AgentRunCommand2.OPENCODE];
3141
+ async execute(context) {
3142
+ const {
3143
+ skills,
3144
+ scenario,
3145
+ cwd,
3146
+ modelConfig,
3147
+ aiGatewayUrl,
3148
+ aiGatewayHeaders,
3149
+ traceContext,
3150
+ mcps,
3151
+ subAgents,
3152
+ rules,
3153
+ systemPrompt
3154
+ } = context;
3155
+ const options = {
3156
+ cwd,
3157
+ model: modelConfig?.model,
3158
+ temperature: modelConfig?.temperature,
3159
+ maxTurns: modelConfig?.maxTurns,
3160
+ aiGatewayUrl,
3161
+ aiGatewayHeaders,
3162
+ traceContext,
3163
+ mcps,
3164
+ subAgents,
3165
+ rules,
3166
+ systemPrompt
3167
+ };
3168
+ const { result, llmTrace, conversation } = await executeWithOpenCode(
3169
+ skills,
3170
+ scenario,
3171
+ options
3172
+ );
3173
+ return {
3174
+ outputText: result.outputText,
3175
+ durationMs: result.durationMs,
3176
+ usage: {
3177
+ inputTokens: result.usage.inputTokens,
3178
+ outputTokens: result.usage.outputTokens,
3179
+ totalTokens: result.usage.totalTokens
3180
+ },
3181
+ costUsd: result.costUsd,
3182
+ llmTrace,
3183
+ conversation
3184
+ };
3185
+ }
3186
+ };
3187
+ var openCodeAdapter = new OpenCodeAdapter();
3188
+
3189
+ // src/run-scenario/agents/opencode/index.ts
3190
+ defaultRegistry.register(openCodeAdapter);
3191
+
3192
+ // src/run-scenario/agents/simple-agent/execute.ts
3193
+ import {
3194
+ generateText,
3195
+ stepCountIs
3196
+ } from "ai";
3197
+ import { createAnthropic } from "@ai-sdk/anthropic";
3198
+ import { createOpenAI } from "@ai-sdk/openai";
3199
+ import {
3200
+ AVAILABLE_CLAUDE_MODEL_IDS,
3201
+ OPENAI_RESPONSES_MODEL_IDS,
3202
+ LLMStepType as LLMStepType3,
3203
+ LiveTraceEventType as LiveTraceEventType3
3204
+ } from "@wix/evalforge-types";
3205
+ import { randomUUID as randomUUID3 } from "crypto";
3206
+
3207
+ // src/run-scenario/agents/simple-agent/mcp-tools.ts
3208
+ import { createMCPClient } from "@ai-sdk/mcp";
3209
+ import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio";
3210
+ async function buildMcpTools(mcps, cwd) {
3211
+ const allTools = {};
3212
+ const clients = [];
3213
+ try {
3214
+ for (const mcp of mcps) {
3215
+ const resolvedConfig = await resolveMcpPlaceholders(
3216
+ mcp.config,
3217
+ { cwd }
3218
+ );
3219
+ for (const [serverName, serverConfig] of Object.entries(resolvedConfig)) {
3220
+ const config = serverConfig;
3221
+ const transport = buildTransport(serverName, config, cwd);
3222
+ const client = await createMCPClient({ transport });
3223
+ clients.push(client);
3224
+ const tools = await client.tools();
3225
+ for (const [toolName, tool] of Object.entries(tools)) {
3226
+ allTools[`${serverName}__${toolName}`] = tool;
3227
+ }
3228
+ }
3229
+ }
3230
+ } catch (err) {
3231
+ await closeMcpClients(clients);
3232
+ throw err;
3233
+ }
3234
+ return { tools: allTools, clients };
3235
+ }
3236
+ async function closeMcpClients(clients) {
3237
+ await Promise.allSettled(clients.map((c) => c.close()));
3238
+ }
3239
+ function buildTransport(serverName, config, cwd) {
3240
+ const type = config.type;
3241
+ const headers = config.headers;
3242
+ if (type === "stdio" || config.command) {
3243
+ return new Experimental_StdioMCPTransport({
3244
+ command: config.command,
3245
+ args: config.args ?? [],
3246
+ env: { ...config.env, PWD: cwd },
3247
+ cwd
3248
+ });
3249
+ }
3250
+ if (type === "http") {
3251
+ return {
3252
+ type: "http",
3253
+ url: config.url,
3254
+ ...headers && { headers }
3255
+ };
3256
+ }
3257
+ if (type === "sse" || config.url) {
3258
+ return {
3259
+ type: "sse",
3260
+ url: config.url,
3261
+ ...headers && { headers }
3262
+ };
3263
+ }
3264
+ throw new Error(
3265
+ `MCP server "${serverName}" has unsupported transport config (type=${type ?? "unset"}). Expected type "stdio", "http", or "sse", or a config with "command" or "url".`
3266
+ );
3267
+ }
3268
+
3269
+ // src/run-scenario/agents/shared/detect-tool-error.ts
3270
+ function detectMcpToolError(output) {
3271
+ if (output == null) return null;
3272
+ if (typeof output === "object" && "isError" in output) {
3273
+ const obj = output;
3274
+ if (obj.isError === true) {
3275
+ return extractErrorText(obj.content);
3276
+ }
3277
+ }
3278
+ const str = typeof output === "string" ? output : null;
3279
+ if (str && (str.includes('"isError":true') || str.includes('"isError": true'))) {
3280
+ try {
3281
+ const parsed = JSON.parse(str);
3282
+ if (parsed.isError === true) {
3283
+ return extractErrorText(parsed.content);
3284
+ }
3285
+ } catch {
3286
+ return str.slice(0, 500);
3287
+ }
3288
+ }
3289
+ return null;
3290
+ }
3291
+ function extractErrorText(content) {
3292
+ if (Array.isArray(content)) {
3293
+ const text = content.filter((c) => typeof c.text === "string").map((c) => c.text).join("\n");
3294
+ if (text) return text.slice(0, 500);
3295
+ }
3296
+ return "Tool call failed";
3297
+ }
3298
+
3299
+ // src/run-scenario/agents/simple-agent/cost-calculation.ts
3300
+ import { normalizeModelId } from "@wix/evalforge-types";
3301
+ var PROVIDER_ANTHROPIC = "anthropic";
3302
+ var MODEL_PRICING = {
3303
+ // Anthropic — Claude 4.6
3304
+ "claude-sonnet-4-6": { input: 3, output: 15 },
3305
+ "claude-opus-4-6": { input: 15, output: 75 },
3306
+ // Anthropic — Claude 4.5
3307
+ "claude-opus-4-5": { input: 5, output: 25 },
3308
+ "claude-sonnet-4-5": { input: 3, output: 15 },
3309
+ "claude-haiku-4-5": { input: 1, output: 5 },
3310
+ // Anthropic — Claude 4
3311
+ "claude-opus-4": { input: 15, output: 75 },
3312
+ "claude-sonnet-4": { input: 3, output: 15 },
3313
+ // OpenAI — GPT-5
3314
+ "gpt-5": { input: 1.25, output: 10 },
3315
+ "gpt-5-mini": { input: 0.25, output: 2 },
3316
+ "gpt-5-nano": { input: 0.05, output: 0.4 },
3317
+ // OpenAI — GPT-4.1
3318
+ "gpt-4.1": { input: 2, output: 8 },
3319
+ "gpt-4.1-mini": { input: 0.4, output: 1.6 },
3320
+ "gpt-4.1-nano": { input: 0.1, output: 0.4 },
3321
+ // OpenAI — GPT-4o
3322
+ "gpt-4o": { input: 2.5, output: 10 },
3323
+ "gpt-4o-mini": { input: 0.15, output: 0.6 },
3324
+ // OpenAI — Reasoning
3325
+ o3: { input: 2, output: 8 },
3326
+ "o4-mini": { input: 1.1, output: 4.4 },
3327
+ "o3-mini": { input: 1.1, output: 4.4 },
3328
+ o1: { input: 15, output: 60 }
3329
+ };
3330
+ function extractGatewayCost(step, provider) {
3331
+ try {
3332
+ if (provider === PROVIDER_ANTHROPIC) {
3333
+ const meta = step.providerMetadata;
3334
+ const anthropic = meta?.anthropic;
3335
+ const usage = anthropic?.usage;
3336
+ const cost2 = usage?.total_cost_usd;
3337
+ return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
3338
+ }
3339
+ const body = step.response?.body;
3340
+ const cost = body?.total_cost_usd;
3341
+ return typeof cost === "number" && cost > 0 ? cost : void 0;
3342
+ } catch {
3343
+ return void 0;
3344
+ }
3345
+ }
3346
+ function calculateFromPricing(modelId, tokenUsage) {
3347
+ const normalized = normalizeModelId(modelId);
3348
+ const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
3349
+ if (!pricing) return 0;
3350
+ return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
3351
+ }
3352
+ function calculateStepCost(step, modelId, provider, tokenUsage) {
3353
+ return extractGatewayCost(step, provider) ?? calculateFromPricing(modelId, tokenUsage);
3354
+ }
3355
+
3356
+ // src/run-scenario/agents/simple-agent/build-conversation.ts
3357
+ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3358
+ const messages = [];
3359
+ messages.push({
3360
+ role: "user",
3361
+ content: [{ type: "text", text: triggerPrompt }],
3362
+ timestamp: new Date(executionStartMs).toISOString()
3363
+ });
3364
+ for (let i = 0; i < steps.length; i++) {
3365
+ const step = steps[i];
3366
+ const stepTimestamp = estimateStepTimestamp(
3367
+ executionStartMs,
3368
+ i,
3369
+ steps.length
3370
+ );
3371
+ const assistantContent = [];
3372
+ if (step.reasoningText) {
3373
+ assistantContent.push({ type: "thinking", thinking: step.reasoningText });
3374
+ }
3375
+ if (step.text) {
3376
+ assistantContent.push({ type: "text", text: step.text });
3377
+ }
3378
+ for (const tc of step.toolCalls) {
3379
+ assistantContent.push({
3380
+ type: "tool_use",
3381
+ toolName: tc.toolName,
3382
+ toolId: tc.toolCallId,
2331
3383
  input: tc.input
2332
3384
  });
2333
3385
  }
@@ -2462,7 +3514,7 @@ async function executeWithAiSdk(context) {
2462
3514
  outputTokens: result.usage.outputTokens ?? 0,
2463
3515
  totalTokens: result.usage.totalTokens ?? 0
2464
3516
  };
2465
- const llmTrace = buildLLMTrace(
3517
+ const llmTrace = buildLLMTrace2(
2466
3518
  result.steps,
2467
3519
  durationMs,
2468
3520
  usage,
@@ -2474,7 +3526,7 @@ async function executeWithAiSdk(context) {
2474
3526
  emitStepEvents(traceContext, result.steps, startTime);
2475
3527
  emitCompletionEvent(traceContext, result.steps.length + 1);
2476
3528
  }
2477
- const conversation = buildConversation2(
3529
+ const conversation = buildConversation3(
2478
3530
  scenario.triggerPrompt,
2479
3531
  result.steps,
2480
3532
  startTime
@@ -2518,7 +3570,7 @@ function findToolResultError(step) {
2518
3570
  }
2519
3571
  return null;
2520
3572
  }
2521
- function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
3573
+ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
2522
3574
  const totalStepTokens = steps.reduce(
2523
3575
  (sum, s) => sum + (s.usage.totalTokens ?? 0),
2524
3576
  0
@@ -2536,10 +3588,10 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
2536
3588
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
2537
3589
  const toolResultError = findToolResultError(step);
2538
3590
  return {
2539
- id: randomUUID2(),
3591
+ id: randomUUID3(),
2540
3592
  stepNumber: i + 1,
2541
3593
  turnIndex: i,
2542
- type: step.toolCalls.length > 0 ? LLMStepType2.TOOL_USE : LLMStepType2.COMPLETION,
3594
+ type: step.toolCalls.length > 0 ? LLMStepType3.TOOL_USE : LLMStepType3.COMPLETION,
2543
3595
  model: modelId,
2544
3596
  provider,
2545
3597
  startedAt: new Date(
@@ -2562,7 +3614,7 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
2562
3614
  total: totalUsage.totalTokens
2563
3615
  };
2564
3616
  return {
2565
- id: randomUUID2(),
3617
+ id: randomUUID3(),
2566
3618
  steps: traceSteps,
2567
3619
  summary: {
2568
3620
  totalSteps: traceSteps.length,
@@ -2591,7 +3643,7 @@ function emitStartEvent(traceContext, startTime) {
2591
3643
  targetId: traceContext.targetId,
2592
3644
  targetName: traceContext.targetName,
2593
3645
  stepNumber: 0,
2594
- type: LiveTraceEventType2.PROGRESS,
3646
+ type: LiveTraceEventType3.PROGRESS,
2595
3647
  outputPreview: "Starting Simple Agent execution...",
2596
3648
  elapsedMs: Date.now() - startTime,
2597
3649
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -2615,7 +3667,7 @@ function emitStepEvents(traceContext, steps, startTime) {
2615
3667
  targetId: traceContext.targetId,
2616
3668
  targetName: traceContext.targetName,
2617
3669
  stepNumber: i + 1,
2618
- type: isToolStep ? LiveTraceEventType2.TOOL_USE : LiveTraceEventType2.COMPLETION,
3670
+ type: isToolStep ? LiveTraceEventType3.TOOL_USE : LiveTraceEventType3.COMPLETION,
2619
3671
  toolName: firstToolCall?.toolName,
2620
3672
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
2621
3673
  outputPreview: step.text?.slice(0, 500),
@@ -2638,7 +3690,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
2638
3690
  targetId: traceContext.targetId,
2639
3691
  targetName: traceContext.targetName,
2640
3692
  stepNumber,
2641
- type: LiveTraceEventType2.COMPLETION,
3693
+ type: LiveTraceEventType3.COMPLETION,
2642
3694
  outputPreview: "Scenario execution completed",
2643
3695
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2644
3696
  isComplete: true
@@ -2665,7 +3717,7 @@ defaultRegistry.register(simpleAgentAdapter);
2665
3717
 
2666
3718
  // src/run-scenario/file-diff.ts
2667
3719
  import { readdirSync, readFileSync as readFileSync2, statSync, existsSync as existsSync2 } from "fs";
2668
- import { join as join7, relative } from "path";
3720
+ import { join as join9, relative } from "path";
2669
3721
 
2670
3722
  // ../../node_modules/diff/lib/index.mjs
2671
3723
  function Diff() {
@@ -2841,7 +3893,7 @@ Diff.prototype = {
2841
3893
  tokenize: function tokenize(value) {
2842
3894
  return Array.from(value);
2843
3895
  },
2844
- join: function join6(chars) {
3896
+ join: function join8(chars) {
2845
3897
  return chars.join("");
2846
3898
  },
2847
3899
  postProcess: function postProcess(changeObjects) {
@@ -3281,7 +4333,7 @@ function snapshotDirectory(dir, baseDir) {
3281
4333
  }
3282
4334
  const entries = readdirSync(dir, { withFileTypes: true });
3283
4335
  for (const entry of entries) {
3284
- const fullPath = join7(dir, entry.name);
4336
+ const fullPath = join9(dir, entry.name);
3285
4337
  const relativePath = relative(base, fullPath);
3286
4338
  if (shouldIgnore(entry.name)) {
3287
4339
  continue;
@@ -3390,8 +4442,8 @@ function extractTemplateFiles(before, after) {
3390
4442
  }
3391
4443
 
3392
4444
  // src/run-scenario/run-agent-with-context.ts
3393
- import { AgentRunCommand as AgentRunCommand2, AgentType } from "@wix/evalforge-types";
3394
- var DEFAULT_AGENT_COMMAND = AgentRunCommand2.CLAUDE;
4445
+ import { AgentRunCommand as AgentRunCommand3, AgentType } from "@wix/evalforge-types";
4446
+ var DEFAULT_AGENT_COMMAND = AgentRunCommand3.CLAUDE;
3395
4447
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
3396
4448
  const agent = evalData.agent ?? void 0;
3397
4449
  const isSDK = agent?.agentType === AgentType.SDK;
@@ -3429,7 +4481,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
3429
4481
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
3430
4482
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
3431
4483
  return {
3432
- id: randomUUID3(),
4484
+ id: randomUUID4(),
3433
4485
  targetId,
3434
4486
  targetName,
3435
4487
  scenarioId: scenario.id,