@wix/evalforge-evaluator 0.112.0 → 0.113.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +1240 -197
- package/build/index.js.map +4 -4
- package/build/index.mjs +1237 -185
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +1 -1
- package/build/types/run-scenario/agents/index.d.ts +2 -0
- package/build/types/run-scenario/agents/opencode/build-conversation.d.ts +7 -0
- package/build/types/run-scenario/agents/opencode/build-trace.d.ts +13 -0
- package/build/types/run-scenario/agents/opencode/config.d.ts +27 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/index.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +18 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +32 -0
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +12 -0
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +7 -0
- package/package.json +8 -7
package/build/index.mjs
CHANGED
|
@@ -581,7 +581,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
581
581
|
}
|
|
582
582
|
|
|
583
583
|
// src/run-scenario/run-agent-with-context.ts
|
|
584
|
-
import { randomUUID as
|
|
584
|
+
import { randomUUID as randomUUID4 } from "crypto";
|
|
585
585
|
|
|
586
586
|
// src/run-scenario/agents/registry.ts
|
|
587
587
|
var AgentAdapterRegistry = class {
|
|
@@ -1214,10 +1214,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1214
1214
|
}
|
|
1215
1215
|
const startTime = /* @__PURE__ */ new Date();
|
|
1216
1216
|
const allMessages = [];
|
|
1217
|
-
const { mkdir: mkdirAsync, writeFile:
|
|
1217
|
+
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1218
1218
|
const claudeDir = `${options.cwd}/.claude`;
|
|
1219
1219
|
await mkdirAsync(claudeDir, { recursive: true });
|
|
1220
|
-
await
|
|
1220
|
+
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1221
1221
|
flag: "wx"
|
|
1222
1222
|
}).catch(() => {
|
|
1223
1223
|
});
|
|
@@ -2137,197 +2137,1249 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
2137
2137
|
// src/run-scenario/agents/claude-code/index.ts
|
|
2138
2138
|
defaultRegistry.register(claudeCodeAdapter);
|
|
2139
2139
|
|
|
2140
|
-
// src/run-scenario/agents/
|
|
2141
|
-
import {
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
} from "ai";
|
|
2145
|
-
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
2146
|
-
import { createOpenAI } from "@ai-sdk/openai";
|
|
2140
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
2141
|
+
import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
|
|
2142
|
+
|
|
2143
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2147
2144
|
import {
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
LLMStepType as LLMStepType2,
|
|
2145
|
+
ClaudeModel as ClaudeModel3,
|
|
2146
|
+
DEFAULT_EVALUATOR_SYSTEM_PROMPT as DEFAULT_EVALUATOR_SYSTEM_PROMPT2,
|
|
2151
2147
|
LiveTraceEventType as LiveTraceEventType2
|
|
2152
2148
|
} from "@wix/evalforge-types";
|
|
2153
|
-
import { randomUUID as randomUUID2 } from "crypto";
|
|
2154
2149
|
|
|
2155
|
-
// src/run-scenario/agents/
|
|
2156
|
-
import {
|
|
2157
|
-
import {
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2150
|
+
// src/run-scenario/agents/opencode/write-skills.ts
|
|
2151
|
+
import { mkdir as mkdir5 } from "fs/promises";
|
|
2152
|
+
import { join as join6 } from "path";
|
|
2153
|
+
import { fetchGitHubFolder as fetchGitHubFolder3 } from "@wix/evalforge-github-client";
|
|
2154
|
+
/**
 * Writes every skill into the working directory, starting all writes at once.
 *
 * Each skill is delegated to `writeSkillToFilesystem2`; the returned promise
 * settles once all of them have finished and rejects as soon as one rejects.
 *
 * @param {string} cwd - Working directory the skills are written under.
 * @param {Array<object>} skills - Skill descriptors to materialize.
 * @param {Function} [fetchFn] - Folder fetcher handed through to each skill write
 *   (defaults to the GitHub folder fetcher imported by this module).
 * @returns {Promise<void>}
 */
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = fetchGitHubFolder3) {
  const startWrite = (skill) => writeSkillToFilesystem2(cwd, skill, fetchFn);
  const pendingWrites = skills.map(startWrite);
  await Promise.all(pendingWrites);
}
|
|
2159
|
+
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
2160
|
+
const skillName = skill.name;
|
|
2161
|
+
const skillDir = join6(cwd, ".opencode", "skills", skillName);
|
|
2162
|
+
await mkdir5(skillDir, { recursive: true });
|
|
2163
|
+
const version = skill.latestVersion;
|
|
2164
|
+
if (version?.files && version.files.length > 0) {
|
|
2165
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
2166
|
+
console.log(
|
|
2167
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
2168
|
+
);
|
|
2169
|
+
} else if (skill.source) {
|
|
2170
|
+
try {
|
|
2171
|
+
const files = await fetchFn(skill.source, {
|
|
2172
|
+
userAgent: "EvalForge-Evaluator"
|
|
2173
|
+
});
|
|
2174
|
+
await writeFilesToDirectory(skillDir, files);
|
|
2175
|
+
console.log(
|
|
2176
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
2177
|
+
);
|
|
2178
|
+
} catch (error) {
|
|
2179
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
2180
|
+
console.error(
|
|
2181
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
2182
|
+
);
|
|
2183
|
+
throw new Error(
|
|
2184
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
2166
2185
|
);
|
|
2167
|
-
for (const [serverName, serverConfig] of Object.entries(resolvedConfig)) {
|
|
2168
|
-
const config = serverConfig;
|
|
2169
|
-
const transport = buildTransport(serverName, config, cwd);
|
|
2170
|
-
const client = await createMCPClient({ transport });
|
|
2171
|
-
clients.push(client);
|
|
2172
|
-
const tools = await client.tools();
|
|
2173
|
-
for (const [toolName, tool] of Object.entries(tools)) {
|
|
2174
|
-
allTools[`${serverName}__${toolName}`] = tool;
|
|
2175
|
-
}
|
|
2176
|
-
}
|
|
2177
2186
|
}
|
|
2178
|
-
}
|
|
2179
|
-
|
|
2180
|
-
throw err;
|
|
2187
|
+
} else {
|
|
2188
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
2181
2189
|
}
|
|
2182
|
-
return { tools: allTools, clients };
|
|
2183
2190
|
}
|
|
2184
|
-
|
|
2185
|
-
|
|
2191
|
+
|
|
2192
|
+
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
2193
|
+
import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
|
|
2194
|
+
import { join as join7 } from "path";
|
|
2195
|
+
import {
|
|
2196
|
+
fetchGitHubFile as fetchGitHubFile2
|
|
2197
|
+
} from "@wix/evalforge-github-client";
|
|
2198
|
+
var AGENTS_DIR2 = ".opencode/agents";
|
|
2199
|
+
/**
 * Derives a unique, filesystem-safe base filename (without extension) for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become "-", every character outside
 * `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed. When nothing is
 * left, `sub-agent-<index>` is used instead. Repeated slugs get a numeric suffix
 * ("-2", "-3", ...).
 *
 * @param {string|null|undefined} name - Display name of the sub-agent.
 * @param {number} index - Position of the agent in its list; only used for the fallback slug.
 * @param {Map<string, number>} nameCount - Bookkeeping shared by all calls that target the
 *   same directory. It is mutated: it counts how often each slug was requested and also
 *   records every suffixed filename that has been handed out.
 * @returns {string} A filename that has not been returned before for this `nameCount`.
 */
function toAgentFilename2(name, index, nameCount) {
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
  let count = nameCount.get(base) ?? 0;
  let candidate = count === 0 ? base : `${base}-${count + 1}`;
  // A suffixed candidate such as "review-2" may equal the literal slug of another
  // agent (or a suffix issued earlier); keep bumping until the name is free so two
  // agents never end up writing the same file.
  while (candidate !== base && nameCount.has(candidate)) {
    count += 1;
    candidate = `${base}-${count + 1}`;
  }
  nameCount.set(base, count + 1);
  if (candidate !== base) {
    // Reserve the suffixed name so a later agent whose own slug equals it is
    // treated as a repeat and receives a fresh suffix.
    nameCount.set(candidate, 1);
  }
  return candidate;
}
|
|
2187
|
-
function
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
|
|
2194
|
-
|
|
2195
|
-
|
|
2196
|
-
|
|
2205
|
+
async function resolveSubAgentContent2(agent, fetchFn) {
|
|
2206
|
+
if (agent.source) {
|
|
2207
|
+
try {
|
|
2208
|
+
const content = await fetchFn(agent.source, {
|
|
2209
|
+
userAgent: "EvalForge-Evaluator"
|
|
2210
|
+
});
|
|
2211
|
+
console.log(
|
|
2212
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
2213
|
+
);
|
|
2214
|
+
return content;
|
|
2215
|
+
} catch (error) {
|
|
2216
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
2217
|
+
console.error(
|
|
2218
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
2219
|
+
);
|
|
2220
|
+
throw new Error(
|
|
2221
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
2222
|
+
);
|
|
2223
|
+
}
|
|
2197
2224
|
}
|
|
2198
|
-
if (
|
|
2199
|
-
|
|
2200
|
-
|
|
2201
|
-
|
|
2202
|
-
...headers && { headers }
|
|
2203
|
-
};
|
|
2225
|
+
if (!agent.subAgentMd) {
|
|
2226
|
+
console.warn(
|
|
2227
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
2228
|
+
);
|
|
2204
2229
|
}
|
|
2205
|
-
|
|
2230
|
+
return agent.subAgentMd;
|
|
2231
|
+
}
|
|
2232
|
+
/**
 * Writes one Markdown file per sub-agent into `<cwd>/.opencode/agents`.
 *
 * Does nothing when the list is empty. Otherwise the directory is created
 * (recursively) and the agents are processed one after another: a filename is
 * derived from the agent name, the content is resolved, and the file is written
 * as UTF-8.
 *
 * @param {string} cwd - Working directory that will contain the agents folder.
 * @param {Array<object>} subAgents - Sub-agent descriptors.
 * @param {Function} [fetchFn] - File fetcher forwarded to the content resolver
 *   (defaults to the GitHub file fetcher imported by this module).
 * @returns {Promise<void>}
 */
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHubFile2) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = join7(cwd, AGENTS_DIR2);
  await mkdir6(targetDir, { recursive: true });
  // Shared across the loop so duplicate agent names get distinct filenames.
  const usedNames = /* @__PURE__ */ new Map();
  let position = 0;
  for (const agent of subAgents) {
    const stem = toAgentFilename2(agent.name, position, usedNames);
    position += 1;
    const destination = join7(targetDir, `${stem}.md`);
    const markdown = await resolveSubAgentContent2(agent, fetchFn);
    await writeFile5(destination, markdown, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
|
|
2245
|
+
|
|
2246
|
+
// src/run-scenario/agents/opencode/config.ts
|
|
2247
|
+
import {
|
|
2248
|
+
ClaudeModel as ClaudeModel2,
|
|
2249
|
+
AVAILABLE_OPENAI_MODEL_IDS
|
|
2250
|
+
} from "@wix/evalforge-types";
|
|
2251
|
+
var DEFAULT_MODEL2 = `${ClaudeModel2.CLAUDE_4_5_SONNET_1_0}`;
|
|
2252
|
+
function parseModel(model) {
|
|
2253
|
+
const slashIndex = model.indexOf("/");
|
|
2254
|
+
if (slashIndex > 0) {
|
|
2206
2255
|
return {
|
|
2207
|
-
|
|
2208
|
-
|
|
2209
|
-
...headers && { headers }
|
|
2256
|
+
providerID: model.slice(0, slashIndex),
|
|
2257
|
+
modelID: model.slice(slashIndex + 1)
|
|
2210
2258
|
};
|
|
2211
2259
|
}
|
|
2212
|
-
|
|
2213
|
-
|
|
2260
|
+
const isOpenAI = AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
2261
|
+
model
|
|
2214
2262
|
);
|
|
2263
|
+
return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
|
|
2215
2264
|
}
|
|
2216
|
-
|
|
2217
|
-
|
|
2218
|
-
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
if (obj.isError === true) {
|
|
2223
|
-
return extractErrorText(obj.content);
|
|
2265
|
+
function toOpenCodeMcpConfig(servers) {
|
|
2266
|
+
const result = {};
|
|
2267
|
+
for (const [name, entry] of Object.entries(servers)) {
|
|
2268
|
+
if (entry.type === "local" || entry.type === "remote") {
|
|
2269
|
+
result[name] = entry;
|
|
2270
|
+
continue;
|
|
2224
2271
|
}
|
|
2225
|
-
|
|
2226
|
-
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
|
|
2232
|
-
|
|
2233
|
-
} catch {
|
|
2234
|
-
return str.slice(0, 500);
|
|
2272
|
+
if (entry.url && typeof entry.url === "string") {
|
|
2273
|
+
result[name] = {
|
|
2274
|
+
type: "remote",
|
|
2275
|
+
url: entry.url,
|
|
2276
|
+
...entry.headers ? { headers: entry.headers } : {},
|
|
2277
|
+
...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
|
|
2278
|
+
};
|
|
2279
|
+
continue;
|
|
2235
2280
|
}
|
|
2281
|
+
if (entry.command && typeof entry.command === "string") {
|
|
2282
|
+
const commandArray = [
|
|
2283
|
+
entry.command,
|
|
2284
|
+
...entry.args || []
|
|
2285
|
+
];
|
|
2286
|
+
result[name] = {
|
|
2287
|
+
type: "local",
|
|
2288
|
+
command: commandArray,
|
|
2289
|
+
...entry.env ? { environment: entry.env } : {},
|
|
2290
|
+
...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
|
|
2291
|
+
};
|
|
2292
|
+
continue;
|
|
2293
|
+
}
|
|
2294
|
+
console.warn(
|
|
2295
|
+
`[MCP] Server "${name}" has unrecognized format, passing through as-is:`,
|
|
2296
|
+
JSON.stringify(entry)
|
|
2297
|
+
);
|
|
2298
|
+
result[name] = entry;
|
|
2236
2299
|
}
|
|
2237
|
-
return
|
|
2238
|
-
}
|
|
2239
|
-
function extractErrorText(content) {
|
|
2240
|
-
if (Array.isArray(content)) {
|
|
2241
|
-
const text = content.filter((c) => typeof c.text === "string").map((c) => c.text).join("\n");
|
|
2242
|
-
if (text) return text.slice(0, 500);
|
|
2243
|
-
}
|
|
2244
|
-
return "Tool call failed";
|
|
2300
|
+
return result;
|
|
2245
2301
|
}
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
"claude-haiku-4-5": { input: 1, output: 5 },
|
|
2258
|
-
// Anthropic — Claude 4
|
|
2259
|
-
"claude-opus-4": { input: 15, output: 75 },
|
|
2260
|
-
"claude-sonnet-4": { input: 3, output: 15 },
|
|
2261
|
-
// OpenAI — GPT-5
|
|
2262
|
-
"gpt-5": { input: 1.25, output: 10 },
|
|
2263
|
-
"gpt-5-mini": { input: 0.25, output: 2 },
|
|
2264
|
-
"gpt-5-nano": { input: 0.05, output: 0.4 },
|
|
2265
|
-
// OpenAI — GPT-4.1
|
|
2266
|
-
"gpt-4.1": { input: 2, output: 8 },
|
|
2267
|
-
"gpt-4.1-mini": { input: 0.4, output: 1.6 },
|
|
2268
|
-
"gpt-4.1-nano": { input: 0.1, output: 0.4 },
|
|
2269
|
-
// OpenAI — GPT-4o
|
|
2270
|
-
"gpt-4o": { input: 2.5, output: 10 },
|
|
2271
|
-
"gpt-4o-mini": { input: 0.15, output: 0.6 },
|
|
2272
|
-
// OpenAI — Reasoning
|
|
2273
|
-
o3: { input: 2, output: 8 },
|
|
2274
|
-
"o4-mini": { input: 1.1, output: 4.4 },
|
|
2275
|
-
"o3-mini": { input: 1.1, output: 4.4 },
|
|
2276
|
-
o1: { input: 15, output: 60 }
|
|
2277
|
-
};
|
|
2278
|
-
function extractGatewayCost(step, provider) {
|
|
2279
|
-
try {
|
|
2280
|
-
if (provider === PROVIDER_ANTHROPIC) {
|
|
2281
|
-
const meta = step.providerMetadata;
|
|
2282
|
-
const anthropic = meta?.anthropic;
|
|
2283
|
-
const usage = anthropic?.usage;
|
|
2284
|
-
const cost2 = usage?.total_cost_usd;
|
|
2285
|
-
return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
|
|
2302
|
+
async function buildOpenCodeConfig(options) {
|
|
2303
|
+
const modelStr = options.model || DEFAULT_MODEL2;
|
|
2304
|
+
const { providerID, modelID } = parseModel(modelStr);
|
|
2305
|
+
const provider = {};
|
|
2306
|
+
if (options.aiGatewayUrl) {
|
|
2307
|
+
const providerOptions = {
|
|
2308
|
+
baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
|
|
2309
|
+
apiKey: "sk-placeholder-auth-handled-by-gateway"
|
|
2310
|
+
};
|
|
2311
|
+
if (options.aiGatewayHeaders) {
|
|
2312
|
+
providerOptions.headers = { ...options.aiGatewayHeaders };
|
|
2286
2313
|
}
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
} catch {
|
|
2291
|
-
return void 0;
|
|
2314
|
+
provider[providerID] = {
|
|
2315
|
+
options: providerOptions
|
|
2316
|
+
};
|
|
2292
2317
|
}
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
|
|
2318
|
+
let mcp;
|
|
2319
|
+
if (options.mcps && options.mcps.length > 0) {
|
|
2320
|
+
const mcpServers = {};
|
|
2321
|
+
for (const mcpEntity of options.mcps) {
|
|
2322
|
+
const entityConfig = mcpEntity.config;
|
|
2323
|
+
for (const [key, value] of Object.entries(entityConfig)) {
|
|
2324
|
+
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
2325
|
+
throw new Error(
|
|
2326
|
+
`MCP "${mcpEntity.name}" has invalid config: value for key "${key}" must be an object (got ${typeof value}).`
|
|
2327
|
+
);
|
|
2328
|
+
}
|
|
2329
|
+
mcpServers[key] = value;
|
|
2330
|
+
}
|
|
2331
|
+
}
|
|
2332
|
+
const resolved = await resolveMcpPlaceholders(mcpServers, {
|
|
2333
|
+
cwd: options.cwd
|
|
2334
|
+
});
|
|
2335
|
+
mcp = toOpenCodeMcpConfig(resolved);
|
|
2336
|
+
}
|
|
2337
|
+
const agentOverrides = {};
|
|
2338
|
+
if (options.temperature != null) {
|
|
2339
|
+
agentOverrides.temperature = options.temperature;
|
|
2340
|
+
}
|
|
2341
|
+
if (options.maxTurns != null) {
|
|
2342
|
+
agentOverrides.maxSteps = options.maxTurns;
|
|
2343
|
+
}
|
|
2344
|
+
const config = {
|
|
2345
|
+
model: `${providerID}/${modelID}`,
|
|
2346
|
+
provider,
|
|
2347
|
+
...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
|
|
2348
|
+
permission: {
|
|
2349
|
+
edit: "allow",
|
|
2350
|
+
bash: "allow",
|
|
2351
|
+
webfetch: "allow",
|
|
2352
|
+
doom_loop: "allow",
|
|
2353
|
+
external_directory: "allow"
|
|
2354
|
+
},
|
|
2355
|
+
...mcp ? { mcp } : {}
|
|
2356
|
+
};
|
|
2357
|
+
return { config, providerID, modelID };
|
|
2302
2358
|
}
|
|
2303
2359
|
|
|
2304
|
-
// src/run-scenario/agents/
|
|
2305
|
-
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2360
|
+
// src/run-scenario/agents/opencode/build-trace.ts
|
|
2361
|
+
import { LLMStepType as LLMStepType2 } from "@wix/evalforge-types";
|
|
2362
|
+
import { randomUUID as randomUUID2 } from "crypto";
|
|
2363
|
+
/**
 * Builds an LLM trace (flat step list plus summary) from a list of messages.
 *
 * Only messages whose `info.role` is "assistant" are considered; each one is a
 * "turn". A turn is expanded into sub-steps in this order: one THINKING step
 * (when reasoning is accompanied by text or tool calls), one TOOL_USE step per
 * tool part, and one COMPLETION step (when text accompanies tool calls). A turn
 * that produces none of these becomes a single step carrying the whole turn.
 *
 * The turn's duration, tokens and cost are split evenly across its sub-steps.
 * The final sub-step of a turn receives whatever is left after the earlier
 * (rounded) shares, so the sub-steps always add up to the turn totals; remainders
 * are clamped at zero so rounding can never produce a negative figure. Every
 * `tokenUsage.total` equals `prompt + completion`.
 *
 * @param {Array<{info: object, parts: Array<object>}>} messages - Session messages.
 * @param {number} totalDurationMs - Wall-clock duration of the run; also used to
 *   estimate the end of the last turn when it has no completion time.
 * @param {string} model - Fallback model id when a message carries none.
 * @param {string} provider - Fallback provider id when a message carries none.
 * @returns {{id: string, steps: Array<object>, summary: object}} The trace.
 */
function buildLLMTrace(messages, totalDurationMs, model, provider) {
  const assistantMessages = messages.filter(
    (m) => m.info.role === "assistant"
  );
  const allSteps = assistantMessages.flatMap((msg, turnIndex) => {
    const { info, parts } = msg;
    let text = "";
    let thinking = "";
    const toolCalls = [];
    let stepInputTokens = 0;
    let stepOutputTokens = 0;
    let stepCost = 0;
    let finishReason = "unknown";
    for (const part of parts) {
      switch (part.type) {
        case "text": {
          text += part.text;
          break;
        }
        case "reasoning": {
          thinking += part.text;
          break;
        }
        case "tool": {
          toolCalls.push({
            toolName: part.tool,
            args: part.state.input
          });
          break;
        }
        case "step-finish": {
          stepInputTokens += part.tokens.input;
          stepOutputTokens += part.tokens.output;
          stepCost += part.cost;
          finishReason = part.reason;
          break;
        }
      }
    }
    // No usable step-finish usage: fall back to the message-level figures.
    if (stepInputTokens === 0 && stepOutputTokens === 0) {
      stepInputTokens = info.tokens.input;
      stepOutputTokens = info.tokens.output;
      stepCost = info.cost;
    }
    const startedAt = new Date(info.time.created).toISOString();
    // End of the turn: its own completion time, else the start of the next
    // assistant turn, else start + total run duration.
    const completedAt = info.time.completed ? info.time.completed : turnIndex + 1 < assistantMessages.length ? assistantMessages[turnIndex + 1].info.time.created : info.time.created + totalDurationMs;
    const durationMs = Math.max(0, completedAt - info.time.created);
    const isSuccess = finishReason !== "error";
    const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
    const stepModel = info.modelID || model;
    const stepProvider = info.providerID || provider;
    const toolCallCount = toolCalls.length;
    const hasThinking = !!thinking;
    const hasText = !!text;
    const subSteps = [];
    // What is left of `total` after the sub-steps emitted so far, never below zero.
    const remainderOf = (total, pick) => Math.max(0, total - subSteps.reduce((sum, ss) => sum + pick(ss), 0));
    // Keeps the three token figures of a sub-step mutually consistent.
    const tokenUsageOf = (prompt, completion) => ({
      prompt,
      completion,
      total: prompt + completion
    });
    const remainingTokenUsage = () => tokenUsageOf(
      remainderOf(stepInputTokens, (ss) => ss.tokenUsage.prompt),
      remainderOf(stepOutputTokens, (ss) => ss.tokenUsage.completion)
    );
    const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
    const toolSubSteps = toolCallCount;
    const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
    const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
    if (hasThinking && (hasText || toolCallCount > 0)) {
      subSteps.push({
        id: randomUUID2(),
        stepNumber: 0,
        // renumbered below
        turnIndex,
        type: LLMStepType2.THINKING,
        model: stepModel,
        provider: stepProvider,
        startedAt,
        durationMs: Math.round(durationMs / totalSubSteps),
        tokenUsage: tokenUsageOf(
          Math.round(stepInputTokens / totalSubSteps),
          Math.round(stepOutputTokens / totalSubSteps)
        ),
        costUsd: stepCost / totalSubSteps,
        outputPreview: thinking.slice(0, 200),
        success: isSuccess,
        error: errorMsg
      });
    }
    if (toolCallCount > 0) {
      // Share of the non-thinking budget that each tool (and the optional text) step gets.
      const toolBudgetSteps = toolSubSteps + textSubSteps;
      const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
      const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
      for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
        const tc = toolCalls[tcIdx];
        // The last tool call closes the turn only when no text step follows it.
        const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
        subSteps.push({
          id: randomUUID2(),
          stepNumber: 0,
          turnIndex,
          type: LLMStepType2.TOOL_USE,
          model: stepModel,
          provider: stepProvider,
          startedAt,
          durationMs: isLast ? remainderOf(durationMs, (ss) => ss.durationMs) : Math.round(durationMs * remainingFraction * toolFraction),
          tokenUsage: isLast ? remainingTokenUsage() : tokenUsageOf(
            Math.round(stepInputTokens * remainingFraction * toolFraction),
            Math.round(stepOutputTokens * remainingFraction * toolFraction)
          ),
          costUsd: isLast ? remainderOf(stepCost, (ss) => ss.costUsd) : stepCost * remainingFraction * toolFraction,
          toolName: tc.toolName,
          toolArguments: JSON.stringify(tc.args),
          outputPreview: tcIdx === 0 && !hasText ? (text || thinking)?.slice(0, 200) : void 0,
          success: isSuccess,
          error: errorMsg
        });
      }
    }
    if (hasText && toolCallCount > 0) {
      subSteps.push({
        id: randomUUID2(),
        stepNumber: 0,
        turnIndex,
        type: LLMStepType2.COMPLETION,
        model: stepModel,
        provider: stepProvider,
        startedAt,
        durationMs: remainderOf(durationMs, (ss) => ss.durationMs),
        tokenUsage: remainingTokenUsage(),
        costUsd: remainderOf(stepCost, (ss) => ss.costUsd),
        outputPreview: text.slice(0, 200),
        success: isSuccess,
        error: errorMsg
      });
    }
    if (subSteps.length === 0) {
      // Plain turn (text only, thinking only, or empty): one step with the full totals.
      const stepType = hasThinking && !hasText ? LLMStepType2.THINKING : LLMStepType2.COMPLETION;
      subSteps.push({
        id: randomUUID2(),
        stepNumber: 0,
        turnIndex,
        type: stepType,
        model: stepModel,
        provider: stepProvider,
        startedAt,
        durationMs,
        tokenUsage: tokenUsageOf(stepInputTokens, stepOutputTokens),
        costUsd: stepCost,
        outputPreview: (text || thinking)?.slice(0, 200),
        success: isSuccess,
        error: errorMsg
      });
    }
    return subSteps;
  }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
  const totalTokens = buildTotalTokens(assistantMessages);
  const totalCost = assistantMessages.reduce((sum, m) => sum + m.info.cost, 0);
  const stepTypeBreakdown = {};
  for (const step of allSteps) {
    const entry = stepTypeBreakdown[step.type] ?? {
      count: 0,
      durationMs: 0,
      tokens: 0,
      costUsd: 0
    };
    entry.count += 1;
    entry.durationMs += step.durationMs;
    entry.tokens += step.tokenUsage.total;
    entry.costUsd += step.costUsd;
    stepTypeBreakdown[step.type] = entry;
  }
  // The summary attributes the whole run to the model of the first step.
  const modelUsed = allSteps[0]?.model || model;
  const summary = {
    totalSteps: allSteps.length,
    totalTurns: assistantMessages.length,
    totalDurationMs,
    totalTokens,
    totalCostUsd: totalCost,
    modelBreakdown: {
      [modelUsed]: {
        count: allSteps.length,
        durationMs: totalDurationMs,
        tokens: totalTokens.total,
        costUsd: totalCost
      }
    },
    modelsUsed: [modelUsed],
    stepTypeBreakdown
  };
  return {
    id: randomUUID2(),
    steps: allSteps,
    summary
  };
}
|
|
2574
|
+
/**
 * Sums the message-level token usage of the given assistant messages.
 *
 * @param {Array<{info: {tokens: {input: number, output: number}}}>} assistantMessages
 * @returns {{prompt: number, completion: number, total: number}} Input tokens as
 *   `prompt`, output tokens as `completion`, and their sum as `total`.
 */
function buildTotalTokens(assistantMessages) {
  const prompt = assistantMessages.reduce(
    (sum, message) => sum + message.info.tokens.input,
    0
  );
  const completion = assistantMessages.reduce(
    (sum, message) => sum + message.info.tokens.output,
    0
  );
  return { prompt, completion, total: prompt + completion };
}
|
|
2583
|
+
|
|
2584
|
+
// src/run-scenario/agents/opencode/build-conversation.ts
|
|
2585
|
+
/**
 * Converts session messages into a flat conversation of role-tagged entries.
 *
 * Assistant messages contribute `text`, `thinking` (from "reasoning" parts) and
 * `tool_use` blocks. User messages contribute `text` blocks and `tool_result`
 * blocks for tool parts whose state is "completed" or "error" (the latter flagged
 * with `isError: true`). Other part types, other roles, and messages that end up
 * with no blocks are omitted. Each entry is stamped with the ISO form of the
 * message's creation time.
 *
 * @param {Array<{info: object, parts: Array<object>}>} messages - Session messages.
 * @returns {Array<{role: string, content: Array<object>, timestamp: string}>}
 */
function buildConversation2(messages) {
  const textBlock = (part) => ({ type: "text", text: part.text });
  const fromAssistantPart = (part) => {
    if (part.type === "text") {
      return textBlock(part);
    }
    if (part.type === "reasoning") {
      return { type: "thinking", thinking: part.text };
    }
    if (part.type === "tool") {
      return {
        type: "tool_use",
        toolName: part.tool,
        toolId: part.callID,
        input: part.state.input
      };
    }
    return null;
  };
  const fromUserPart = (part) => {
    if (part.type === "text") {
      return textBlock(part);
    }
    if (part.type !== "tool") {
      return null;
    }
    const outcome = part.state;
    if (outcome.status === "completed") {
      return {
        type: "tool_result",
        toolUseId: part.callID,
        content: outcome.output
      };
    }
    if (outcome.status === "error") {
      return {
        type: "tool_result",
        toolUseId: part.callID,
        content: outcome.error,
        isError: true
      };
    }
    // Tool calls that have not finished yet carry no result.
    return null;
  };
  const conversation = [];
  for (const message of messages) {
    const { info, parts } = message;
    const timestamp = new Date(info.time.created).toISOString();
    let convert = null;
    if (info.role === "assistant") {
      convert = fromAssistantPart;
    } else if (info.role === "user") {
      convert = fromUserPart;
    }
    if (convert === null) {
      continue;
    }
    const content = [];
    for (const part of parts) {
      const block = convert(part);
      if (block !== null) {
        content.push(block);
      }
    }
    if (content.length > 0) {
      conversation.push({ role: info.role, content, timestamp });
    }
  }
  return conversation;
}
|
|
2652
|
+
|
|
2653
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2654
|
+
var DEFAULT_MODEL3 = `anthropic/${ClaudeModel3.CLAUDE_4_5_SONNET_1_0}`;
|
|
2655
|
+
/**
 * Produces a short, human-readable label for a tool invocation.
 *
 * - no tool name: "Using tool..."
 * - "Task" / "dispatch_agent" with a description: "Task: <first 55 chars>" (+ "..." if cut)
 * - "Bash" / "bash" / "execute" with a command: "Running: <first 50 chars>" (+ "..." if cut)
 * - a file argument (`file_path`, `path` or `target_file`) and a tool name matching
 *   write/edit or read/view: "Writing: <first 50 chars>" / "Reading: <first 50 chars>"
 * - anything else: "Using <toolName>..."
 *
 * @param {string|undefined} toolName - Name of the tool being invoked.
 * @param {object|undefined|null} args - Tool arguments, if any.
 * @returns {string} The label.
 */
function extractToolAction(toolName, args) {
  if (!toolName) {
    return "Using tool...";
  }
  // Cuts the stringified value to `limit` characters, marking the cut with "...".
  const clip = (value, limit) => {
    const full = String(value);
    return full.length > limit ? `${full.slice(0, limit)}...` : full;
  };
  const isTaskTool = ["Task", "dispatch_agent"].includes(toolName);
  if (isTaskTool && args?.description) {
    return `Task: ${clip(args.description, 55)}`;
  }
  const isShellTool = ["Bash", "bash", "execute"].includes(toolName);
  if (isShellTool && args?.command) {
    return `Running: ${clip(args.command, 50)}`;
  }
  const target = args?.file_path || args?.path || args?.target_file;
  if (target) {
    // File paths are cut to 50 characters without an ellipsis.
    const shownPath = String(target).slice(0, 50);
    if (/write|edit/i.test(toolName)) {
      return `Writing: ${shownPath}`;
    }
    if (/read|view/i.test(toolName)) {
      return `Reading: ${shownPath}`;
    }
  }
  return `Using ${toolName}...`;
}
|
|
2676
|
+
/**
 * Maps a single message part to a live trace event.
 *
 * - "text" parts become COMPLETION events with the first 500 characters as preview.
 * - "reasoning" parts become THINKING events with the first 500 characters.
 * - "tool" parts become TOOL_USE events, or FILE_WRITE / FILE_READ when the input
 *   names a file (`file_path`, `path` or `target_file`) and the tool name matches
 *   write/edit or read/view. The serialized input is cut to 500 characters.
 * - "step-finish" parts become PROGRESS events.
 * - Any other part type yields `null`.
 *
 * @param {object} part - The message part to convert.
 * @param {object} context - Trace context supplying run, scenario and target identifiers.
 * @param {number} stepNumber - Sequence number recorded on the event.
 * @param {boolean} isComplete - Completion flag recorded on the event.
 * @returns {object|null} The trace event, or `null` for part types that are not traced.
 */
function createTraceEventFromPart(part, context, stepNumber, isComplete) {
  const base = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };
  switch (part.type) {
    case "text": {
      return {
        ...base,
        type: LiveTraceEventType2.COMPLETION,
        outputPreview: part.text.slice(0, 500)
      };
    }
    case "reasoning": {
      return {
        ...base,
        type: LiveTraceEventType2.THINKING,
        thinking: part.text.slice(0, 500)
      };
    }
    case "tool": {
      const toolName = part.tool;
      // A tool part may be observed before its input is available; treat that as
      // "no arguments" instead of throwing.
      const args = part.state?.input;
      // JSON.stringify returns undefined (not a string) for an undefined input.
      const toolArgs = (JSON.stringify(args) ?? "").slice(0, 500);
      let type = LiveTraceEventType2.TOOL_USE;
      let filePath;
      const target = args?.file_path || args?.path || args?.target_file;
      if (target) {
        filePath = String(target);
        if (/write|edit/i.test(toolName)) {
          type = LiveTraceEventType2.FILE_WRITE;
        } else if (/read|view/i.test(toolName)) {
          type = LiveTraceEventType2.FILE_READ;
        }
      }
      return { ...base, type, toolName, toolArgs, filePath };
    }
    case "step-finish":
      return {
        ...base,
        type: LiveTraceEventType2.PROGRESS,
        outputPreview: "Step completed"
      };
    default:
      return null;
  }
}
|
|
2732
|
+
/**
 * Runs one scenario through the OpenCode SDK and collects its result.
 *
 * Flow: write sub-agents, rules and skills into `options.cwd`, build the
 * OpenCode config, start an OpenCode server, create a session, send
 * `scenario.triggerPrompt`, then race the prompt against a timeout of
 * max(300000 ms, maxTurns * 60000 ms). When `options.traceContext` is set,
 * live trace events are pushed while the prompt runs (one per streamed
 * message part, plus a heartbeat every 10 s).
 *
 * @param skills   Skills to write to the working directory (each has `name`).
 * @param scenario Scenario providing `id`, `name` and `triggerPrompt`.
 * @param options  Reads `cwd`, `model`, `temperature`, `maxTurns`,
 *                 `aiGatewayUrl`, `aiGatewayHeaders`, `mcps`, `subAgents`,
 *                 `rules`, `systemPrompt` and `traceContext`.
 * @returns `{ result: { outputText, durationMs, usage, costUsd }, llmTrace, conversation }`
 * @throws {Error} When skills cannot be written, or (wrapped as
 *                 "OpenCode SDK execution failed: ...") when the server,
 *                 session, prompt or timeout fails. The original error is
 *                 attached as `cause`.
 */
async function executeWithOpenCode(skills, scenario, options) {
  const skillNames = skills.map((s) => s.name).join(", ");
  console.log("[executeWithOpenCode] Starting execution", {
    skillCount: skills.length,
    skillNames,
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    cwd: options.cwd,
    aiGatewayUrl: options.aiGatewayUrl,
    hasAiGatewayHeaders: !!options.aiGatewayHeaders,
    model: options.model
  });
  const startTime = new Date();
  if (options.mcps && options.mcps.length > 0) {
    console.log(
      `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
    );
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(options.cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem2(options.cwd, skills);
  } catch (writeError) {
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
  const maxTurns = options.maxTurns ?? 10;
  const { config, providerID, modelID } = await buildOpenCodeConfig({
    model: options.model,
    temperature: options.temperature,
    maxTurns,
    aiGatewayUrl: options.aiGatewayUrl,
    aiGatewayHeaders: options.aiGatewayHeaders,
    mcps: options.mcps,
    cwd: options.cwd
  });
  const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
  const abortController = new AbortController();
  const traceContext = options.traceContext;
  let timeoutHandle;
  let heartbeatHandle;
  // Hoisted out of the try block so the failure path can stop the subscriber too.
  let eventStreamAbort;
  let timedOut = false;
  let traceStepNumber = 0;
  let lastAction = "Starting...";
  let lastToolName;
  let lastFilePath;

  // Pushes one live trace event; a no-op when tracing is not configured.
  // Identity fields come first and timestamp/isComplete last, matching the
  // field order of every event this function emits.
  const emitTrace = (fields, isComplete) => {
    if (!traceContext) return;
    emitTraceEvent(
      {
        evalRunId: traceContext.evalRunId,
        scenarioId: traceContext.scenarioId,
        scenarioName: traceContext.scenarioName,
        targetId: traceContext.targetId,
        targetName: traceContext.targetName,
        ...fields,
        timestamp: new Date().toISOString(),
        isComplete
      },
      traceContext.tracePushUrl,
      traceContext.routeHeader,
      traceContext.authToken
    );
  };

  // Stops the timeout, the heartbeat and the event subscriber. Safe to call repeatedly.
  const stopLiveTracing = () => {
    if (timeoutHandle) clearTimeout(timeoutHandle);
    if (heartbeatHandle) clearInterval(heartbeatHandle);
    if (eventStreamAbort) eventStreamAbort.abort();
  };

  emitTrace(
    {
      stepNumber: 0,
      type: LiveTraceEventType2.DIAGNOSTIC,
      outputPreview: JSON.stringify({
        event: "pre-sdk-execution",
        model: `${providerID}/${modelID}`,
        maxTurns,
        timestamp: new Date().toISOString()
      })
    },
    false
  );

  let server;
  try {
    console.log("[SDK-DEBUG] Starting OpenCode server...");
    server = await createOpencodeServer({
      config,
      signal: abortController.signal,
      timeout: 3e4
    });
    console.log(`[SDK-DEBUG] Server started at ${server.url}`);
    const client = createOpencodeClient({
      baseUrl: server.url,
      directory: options.cwd
    });
    const session = await client.session.create({
      body: { title: `eval-${scenario.name}` }
    });
    if (!session.data) {
      const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
      throw new Error(
        `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
      );
    }
    const sessionId = session.data.id;
    console.log(`[SDK-DEBUG] Session created: ${sessionId}`);

    if (traceContext) {
      const streamAbort = new AbortController();
      eventStreamAbort = streamAbort;
      const executionStartTime = Date.now();

      // Background subscriber: deliberately not awaited so it runs alongside
      // the prompt. It ends when aborted or when the stream closes.
      void (async () => {
        try {
          const events = await client.event.subscribe();
          for await (const evt of events.stream) {
            if (streamAbort.signal.aborted) break;
            if (evt.type === "message.part.updated") {
              const { part } = evt.properties;
              traceStepNumber++;
              const traceEvent = createTraceEventFromPart(
                part,
                traceContext,
                traceStepNumber,
                false
              );
              if (traceEvent) {
                lastToolName = traceEvent.toolName;
                lastFilePath = traceEvent.filePath;
                if (traceEvent.type === LiveTraceEventType2.THINKING) {
                  lastAction = "Thinking...";
                } else if (traceEvent.type === LiveTraceEventType2.TOOL_USE) {
                  lastAction = extractToolAction(
                    traceEvent.toolName ?? "",
                    void 0
                  );
                } else if (traceEvent.type === LiveTraceEventType2.FILE_WRITE) {
                  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
                } else if (traceEvent.type === LiveTraceEventType2.FILE_READ) {
                  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
                } else if (traceEvent.type === LiveTraceEventType2.COMPLETION) {
                  lastAction = "Processing response...";
                }
                emitTraceEvent(
                  traceEvent,
                  traceContext.tracePushUrl,
                  traceContext.routeHeader,
                  traceContext.authToken
                );
              }
            } else if (evt.type === "session.error") {
              const props = evt.properties;
              traceStepNumber++;
              emitTrace(
                {
                  stepNumber: traceStepNumber,
                  type: LiveTraceEventType2.DIAGNOSTIC,
                  outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
                    0,
                    500
                  )
                },
                false
              );
            }
          }
        } catch (streamError) {
          // Live tracing is best-effort and must never fail the run, but an
          // unexpected stream failure is worth a log line. Errors after an
          // intentional abort are expected and stay quiet.
          if (!streamAbort.signal.aborted) {
            console.warn(
              "[SDK-DEBUG] Live trace event stream failed:",
              streamError instanceof Error ? streamError.message : String(streamError)
            );
          }
        }
      })();

      let lastReportedAction = "";
      let sameActionCount = 0;
      heartbeatHandle = setInterval(() => {
        const elapsedMs = Date.now() - executionStartTime;
        let progressMessage = lastAction;
        if (lastAction === lastReportedAction) {
          sameActionCount++;
        } else {
          sameActionCount = 1;
          lastReportedAction = lastAction;
        }
        const isTaskTool = lastToolName === "Task" || lastToolName === "dispatch_agent";
        if (isTaskTool && sameActionCount > 1) {
          progressMessage = `Waiting for ${lastAction}`;
        } else if (lastToolName && lastFilePath) {
          progressMessage = `${lastToolName}: ${lastFilePath}`;
        } else if (lastToolName && !isTaskTool) {
          progressMessage = `Using ${lastToolName}...`;
        }
        const elapsedSec = Math.round(elapsedMs / 1e3);
        progressMessage += ` (${elapsedSec}s, step ${traceStepNumber})`;
        emitTrace(
          {
            stepNumber: traceStepNumber,
            type: LiveTraceEventType2.PROGRESS,
            outputPreview: progressMessage,
            toolName: lastToolName,
            filePath: lastFilePath,
            elapsedMs
          },
          false
        );
      }, 1e4);
    }

    // systemPrompt: undefined -> evaluator default; null or "" -> send none;
    // anything else -> use as given.
    let systemPrompt;
    if (options.systemPrompt === void 0) {
      systemPrompt = DEFAULT_EVALUATOR_SYSTEM_PROMPT2;
    } else if (options.systemPrompt !== null && options.systemPrompt !== "") {
      systemPrompt = options.systemPrompt;
    }
    const promptPromise = (async () => {
      console.log("[SDK-DEBUG] Sending prompt...");
      return client.session.prompt({
        path: { id: sessionId },
        body: {
          model: { providerID, modelID },
          ...(systemPrompt ? { system: systemPrompt } : {}),
          parts: [{ type: "text", text: scenario.triggerPrompt }]
        }
      });
    })();
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        // Best-effort abort of the remote session; the timeout error below is
        // what the caller sees either way.
        client.session.abort({ path: { id: sessionId } }).catch(() => {
        });
        reject(
          new Error(
            `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
          )
        );
      }, SDK_TIMEOUT_MS);
    });
    const promptResult = await Promise.race([promptPromise, timeoutPromise]);
    stopLiveTracing();

    if ("error" in promptResult && promptResult.error) {
      const errPayload = promptResult.error;
      throw new Error(
        `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
      );
    }
    console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
    const messagesResponse = await client.session.messages({
      path: { id: sessionId }
    });
    const allMessages = messagesResponse.data ?? [];
    console.log(
      `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
    );
    emitTrace(
      {
        stepNumber: traceStepNumber + 1,
        type: LiveTraceEventType2.COMPLETION,
        outputPreview: "Scenario execution completed"
      },
      true
    );

    const endTime = new Date();
    const totalDurationMs = endTime.getTime() - startTime.getTime();
    const resultData = promptResult.data;
    const lastAssistantInfo = resultData?.info;
    if (lastAssistantInfo?.error) {
      const err = lastAssistantInfo.error;
      throw new Error(
        `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
      );
    }

    // Prefer the text parts of the prompt response itself.
    let outputText = "";
    if (resultData?.parts) {
      for (const part of resultData.parts) {
        if (part.type === "text") {
          outputText += part.text;
        }
      }
    }
    // Otherwise fall back to the most recent assistant message that has text.
    if (!outputText && allMessages.length > 0) {
      for (let i = allMessages.length - 1; i >= 0; i--) {
        const msg = allMessages[i];
        if (msg.info.role === "assistant") {
          const assistantInfo = msg.info;
          if (assistantInfo.error) {
            throw new Error(
              `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
            );
          }
          for (const part of msg.parts) {
            if (part.type === "text") {
              outputText += part.text;
            }
          }
          if (outputText) break;
        }
      }
    }
    if (!outputText) {
      const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
      if (!hasAssistant) {
        throw new Error(
          `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
        );
      }
    }

    const usage = lastAssistantInfo ? {
      inputTokens: lastAssistantInfo.tokens.input,
      outputTokens: lastAssistantInfo.tokens.output,
      totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
    } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    const costUsd = lastAssistantInfo?.cost;
    const modelStr = options.model || DEFAULT_MODEL3;
    const llmTrace = buildLLMTrace(
      allMessages,
      totalDurationMs,
      modelStr,
      providerID
    );
    const conversation = buildConversation2(allMessages);
    return {
      result: {
        outputText,
        durationMs: totalDurationMs,
        usage,
        costUsd
      },
      llmTrace,
      conversation
    };
  } catch (sdkError) {
    stopLiveTracing();
    if (timedOut) {
      console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
    }
    const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
    const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
    const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
    console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
    console.error("[SDK-ERROR] Error name:", errorName);
    console.error("[SDK-ERROR] Error message:", errorMessage);
    if (errorStack) {
      console.error("[SDK-ERROR] Stack:", errorStack);
    }
    emitTrace(
      {
        stepNumber: traceStepNumber + 1,
        type: LiveTraceEventType2.DIAGNOSTIC,
        outputPreview: JSON.stringify({
          event: "sdk-execution-failed",
          error: errorMessage,
          errorName
        }).slice(0, 2e3)
      },
      true
    );
    // Only the first five stack lines go into the message; the full original
    // error stays reachable through `cause`.
    throw new Error(
      `OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `\nStack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : ""),
      { cause: sdkError }
    );
  } finally {
    stopLiveTracing();
    if (server) {
      try {
        server.close();
        console.log("[SDK-DEBUG] OpenCode server closed");
      } catch {
        // Closing is best-effort; nothing useful can be done if it fails.
      }
    }
  }
}
|
|
3135
|
+
|
|
3136
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
3137
|
+
/**
 * Agent adapter that runs scenarios through OpenCode.
 * `execute` maps the generic adapter context onto the options expected by
 * `executeWithOpenCode` and flattens its result for the caller.
 */
var OpenCodeAdapter = class {
  id = "opencode";
  name = "OpenCode";
  supportedCommands = [AgentRunCommand2.OPENCODE];

  /**
   * @param context Adapter context; reads skills, scenario, cwd, modelConfig,
   *                aiGatewayUrl, aiGatewayHeaders, traceContext, mcps,
   *                subAgents, rules and systemPrompt.
   * @returns `{ outputText, durationMs, usage, costUsd, llmTrace, conversation }`
   */
  async execute(context) {
    const { skills, scenario, modelConfig } = context;
    const executionOptions = {
      cwd: context.cwd,
      model: modelConfig?.model,
      temperature: modelConfig?.temperature,
      maxTurns: modelConfig?.maxTurns,
      aiGatewayUrl: context.aiGatewayUrl,
      aiGatewayHeaders: context.aiGatewayHeaders,
      traceContext: context.traceContext,
      mcps: context.mcps,
      subAgents: context.subAgents,
      rules: context.rules,
      systemPrompt: context.systemPrompt
    };
    const execution = await executeWithOpenCode(
      skills,
      scenario,
      executionOptions
    );
    const { outputText, durationMs, usage, costUsd } = execution.result;
    const { inputTokens, outputTokens, totalTokens } = usage;
    return {
      outputText,
      durationMs,
      usage: { inputTokens, outputTokens, totalTokens },
      costUsd,
      llmTrace: execution.llmTrace,
      conversation: execution.conversation
    };
  }
};
var openCodeAdapter = new OpenCodeAdapter();

// src/run-scenario/agents/opencode/index.ts
defaultRegistry.register(openCodeAdapter);
|
|
3191
|
+
|
|
3192
|
+
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3193
|
+
import {
|
|
3194
|
+
generateText,
|
|
3195
|
+
stepCountIs
|
|
3196
|
+
} from "ai";
|
|
3197
|
+
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
3198
|
+
import { createOpenAI } from "@ai-sdk/openai";
|
|
3199
|
+
import {
|
|
3200
|
+
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
3201
|
+
OPENAI_RESPONSES_MODEL_IDS,
|
|
3202
|
+
LLMStepType as LLMStepType3,
|
|
3203
|
+
LiveTraceEventType as LiveTraceEventType3
|
|
3204
|
+
} from "@wix/evalforge-types";
|
|
3205
|
+
import { randomUUID as randomUUID3 } from "crypto";
|
|
3206
|
+
|
|
3207
|
+
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
3208
|
+
import { createMCPClient } from "@ai-sdk/mcp";
|
|
3209
|
+
import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio";
|
|
3210
|
+
/**
 * Connects to every MCP server described in `mcps` and gathers their tools
 * into one map keyed `<serverName>__<toolName>`.
 *
 * @param mcps Entries whose `config` maps server names to server configs
 *             (placeholders are resolved against `cwd` first).
 * @param cwd  Working directory handed to placeholder resolution and transports.
 * @returns `{ tools, clients }`; the caller owns `clients` and must close them.
 * @throws Re-throws the first failure after closing every client opened so far.
 */
async function buildMcpTools(mcps, cwd) {
  const openedClients = [];
  const toolsByQualifiedName = {};
  try {
    for (const mcp of mcps) {
      const servers = await resolveMcpPlaceholders(
        mcp.config,
        { cwd }
      );
      for (const [serverName, serverConfig] of Object.entries(servers)) {
        const mcpClient = await createMCPClient({
          transport: buildTransport(serverName, serverConfig, cwd)
        });
        // Track the client before listing tools so a failing tools() call still gets it closed.
        openedClients.push(mcpClient);
        const serverTools = await mcpClient.tools();
        for (const [toolName, tool] of Object.entries(serverTools)) {
          toolsByQualifiedName[`${serverName}__${toolName}`] = tool;
        }
      }
    }
  } catch (err) {
    await closeMcpClients(openedClients);
    throw err;
  }
  return { tools: toolsByQualifiedName, clients: openedClients };
}
|
|
3236
|
+
/**
 * Closes every MCP client, best-effort: one client failing to close never
 * prevents the others from being closed, and this function never rejects
 * because of a close failure.
 *
 * @param clients Objects exposing a `close()` method.
 * @returns Resolves once every close attempt has settled.
 */
async function closeMcpClients(clients) {
  // The async wrapper turns a synchronous throw from close() into a rejection,
  // so allSettled sees it instead of map() aborting midway.
  await Promise.allSettled(clients.map(async (client) => client.close()));
}
|
|
3239
|
+
/**
 * Picks the MCP transport for one server config.
 *
 * Resolution order: stdio (type "stdio" or a `command` is present), then
 * "http" (explicit type only), then "sse" (type "sse" or a `url` is present).
 *
 * @param serverName Used only in the error message.
 * @param config     Server config (`type`, `command`, `args`, `env`, `url`, `headers`).
 * @param cwd        Working directory for stdio servers; also exported to them as PWD.
 * @returns A stdio transport instance, or a `{ type, url, headers? }` descriptor.
 * @throws {Error} When none of the supported shapes matches.
 */
function buildTransport(serverName, config, cwd) {
  const { type, headers } = config;

  if (type === "stdio" || config.command) {
    const stdioOptions = {
      command: config.command,
      args: config.args ?? [],
      env: { ...config.env, PWD: cwd },
      cwd
    };
    return new Experimental_StdioMCPTransport(stdioOptions);
  }

  // headers are attached only when truthy, so the descriptor never carries an undefined key.
  const describeRemote = (kind) => headers ? { type: kind, url: config.url, headers } : { type: kind, url: config.url };

  if (type === "http") {
    return describeRemote("http");
  }
  if (type === "sse" || config.url) {
    return describeRemote("sse");
  }

  throw new Error(
    `MCP server "${serverName}" has unsupported transport config (type=${type ?? "unset"}). Expected type "stdio", "http", or "sse", or a config with "command" or "url".`
  );
}
|
|
3268
|
+
|
|
3269
|
+
// src/run-scenario/agents/shared/detect-tool-error.ts
|
|
3270
|
+
/**
 * Detects an MCP-style tool failure in a tool output.
 *
 * Recognised shapes: an object with `isError === true`, or a string that
 * contains an `"isError":true` marker. If such a string is not valid JSON,
 * its first 500 characters are reported as the error.
 *
 * @param output Tool output of any type.
 * @returns The error text, or `null` when the output is not an error.
 */
function detectMcpToolError(output) {
  if (output === null || output === void 0) {
    return null;
  }
  if (typeof output === "object") {
    const flaggedAsError = "isError" in output && output.isError === true;
    return flaggedAsError ? extractErrorText(output.content) : null;
  }
  if (typeof output !== "string") {
    return null;
  }
  const errorMarkers = ['"isError":true', '"isError": true'];
  if (!errorMarkers.some((marker) => output.includes(marker))) {
    return null;
  }
  try {
    const decoded = JSON.parse(output);
    if (decoded.isError === true) {
      return extractErrorText(decoded.content);
    }
  } catch {
    // The marker is present but the payload is not parseable: surface the raw text.
    return output.slice(0, 500);
  }
  return null;
}
|
|
3291
|
+
/**
 * Builds a readable error message from MCP tool-result content.
 *
 * @param content Expected to be an array of content parts; parts with a string
 *                `text` are joined with newlines. Any other value, or an array
 *                without text, yields the generic message.
 * @returns At most 500 characters of error text, or "Tool call failed".
 */
function extractErrorText(content) {
  if (Array.isArray(content)) {
    // `part?.text` tolerates null/undefined entries instead of throwing on them.
    const text = content
      .filter((part) => typeof part?.text === "string")
      .map((part) => part.text)
      .join("\n");
    if (text) return text.slice(0, 500);
  }
  return "Tool call failed";
}
|
|
3298
|
+
|
|
3299
|
+
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
3300
|
+
import { normalizeModelId } from "@wix/evalforge-types";
|
|
3301
|
+
// Provider id whose gateway cost is read from provider metadata.
var PROVIDER_ANTHROPIC = "anthropic";

// Fallback price table keyed by normalized model id. `input` and `output` are
// the rates applied per 1,000,000 prompt / completion tokens.
// Key order is significant to code that scans the keys; do not reorder.
var MODEL_PRICING = {
  // Anthropic — Claude 4.6
  "claude-sonnet-4-6": { input: 3, output: 15 },
  "claude-opus-4-6": { input: 15, output: 75 },

  // Anthropic — Claude 4.5
  "claude-opus-4-5": { input: 5, output: 25 },
  "claude-sonnet-4-5": { input: 3, output: 15 },
  "claude-haiku-4-5": { input: 1, output: 5 },

  // Anthropic — Claude 4
  "claude-opus-4": { input: 15, output: 75 },
  "claude-sonnet-4": { input: 3, output: 15 },

  // OpenAI — GPT-5
  "gpt-5": { input: 1.25, output: 10 },
  "gpt-5-mini": { input: 0.25, output: 2 },
  "gpt-5-nano": { input: 0.05, output: 0.4 },

  // OpenAI — GPT-4.1
  "gpt-4.1": { input: 2, output: 8 },
  "gpt-4.1-mini": { input: 0.4, output: 1.6 },
  "gpt-4.1-nano": { input: 0.1, output: 0.4 },

  // OpenAI — GPT-4o
  "gpt-4o": { input: 2.5, output: 10 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },

  // OpenAI — Reasoning
  "o3": { input: 2, output: 8 },
  "o4-mini": { input: 1.1, output: 4.4 },
  "o3-mini": { input: 1.1, output: 4.4 },
  "o1": { input: 15, output: 60 }
};
|
|
3330
|
+
/**
 * Reads the cost the gateway reported for one step, if there is one.
 *
 * For the Anthropic provider the value is taken from
 * `step.providerMetadata.anthropic.usage.total_cost_usd`; for every other
 * provider from `step.response.body.total_cost_usd`.
 *
 * @param step     Step result to inspect.
 * @param provider Provider id.
 * @returns The cost when it is a number greater than zero, otherwise `undefined`
 *          (also `undefined` when reading the step throws).
 */
function extractGatewayCost(step, provider) {
  const positiveNumberOrNothing = (value) => typeof value === "number" && value > 0 ? value : void 0;
  try {
    const reportedCost = provider === PROVIDER_ANTHROPIC
      ? step.providerMetadata?.anthropic?.usage?.total_cost_usd
      : step.response?.body?.total_cost_usd;
    return positiveNumberOrNothing(reportedCost);
  } catch {
    return void 0;
  }
}
|
|
3346
|
+
/**
 * Estimates a step's cost from the static MODEL_PRICING table.
 *
 * The normalized model id is looked up exactly first. Otherwise the LONGEST
 * table key that prefixes the id is used, so a versioned id such as
 * "gpt-5-mini-2025-08-07" resolves to "gpt-5-mini" rather than to the shorter
 * "gpt-5" that merely appears earlier in the table.
 *
 * @param modelId    Model id; normalized with normalizeModelId before lookup.
 * @param tokenUsage `{ prompt, completion }` token counts.
 * @returns Cost computed from the per-1,000,000-token rates, or 0 when the
 *          model has no pricing entry.
 */
function calculateFromPricing(modelId, tokenUsage) {
  const normalized = normalizeModelId(modelId);
  let pricing = MODEL_PRICING[normalized];
  if (!pricing) {
    let longestPrefix = "";
    for (const key of Object.keys(MODEL_PRICING)) {
      if (key.length > longestPrefix.length && normalized.startsWith(key)) {
        longestPrefix = key;
      }
    }
    pricing = MODEL_PRICING[longestPrefix];
  }
  if (!pricing) return 0;
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
}
|
|
3352
|
+
/**
 * Cost of a single step: the gateway-reported cost when one is available,
 * otherwise an estimate from the pricing table.
 *
 * @param step       Step result passed to the gateway-cost lookup.
 * @param modelId    Model id used for the pricing-table estimate.
 * @param provider   Provider id passed to the gateway-cost lookup.
 * @param tokenUsage `{ prompt, completion }` token counts for the estimate.
 * @returns The step cost.
 */
function calculateStepCost(step, modelId, provider, tokenUsage) {
  const gatewayCost = extractGatewayCost(step, provider);
  if (gatewayCost !== null && gatewayCost !== void 0) {
    return gatewayCost;
  }
  return calculateFromPricing(modelId, tokenUsage);
}
|
|
3355
|
+
|
|
3356
|
+
// src/run-scenario/agents/simple-agent/build-conversation.ts
|
|
3357
|
+
function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
3358
|
+
const messages = [];
|
|
3359
|
+
messages.push({
|
|
3360
|
+
role: "user",
|
|
3361
|
+
content: [{ type: "text", text: triggerPrompt }],
|
|
3362
|
+
timestamp: new Date(executionStartMs).toISOString()
|
|
3363
|
+
});
|
|
3364
|
+
for (let i = 0; i < steps.length; i++) {
|
|
3365
|
+
const step = steps[i];
|
|
3366
|
+
const stepTimestamp = estimateStepTimestamp(
|
|
3367
|
+
executionStartMs,
|
|
3368
|
+
i,
|
|
3369
|
+
steps.length
|
|
3370
|
+
);
|
|
3371
|
+
const assistantContent = [];
|
|
3372
|
+
if (step.reasoningText) {
|
|
3373
|
+
assistantContent.push({ type: "thinking", thinking: step.reasoningText });
|
|
3374
|
+
}
|
|
3375
|
+
if (step.text) {
|
|
3376
|
+
assistantContent.push({ type: "text", text: step.text });
|
|
3377
|
+
}
|
|
3378
|
+
for (const tc of step.toolCalls) {
|
|
3379
|
+
assistantContent.push({
|
|
3380
|
+
type: "tool_use",
|
|
3381
|
+
toolName: tc.toolName,
|
|
3382
|
+
toolId: tc.toolCallId,
|
|
2331
3383
|
input: tc.input
|
|
2332
3384
|
});
|
|
2333
3385
|
}
|
|
@@ -2462,7 +3514,7 @@ async function executeWithAiSdk(context) {
|
|
|
2462
3514
|
outputTokens: result.usage.outputTokens ?? 0,
|
|
2463
3515
|
totalTokens: result.usage.totalTokens ?? 0
|
|
2464
3516
|
};
|
|
2465
|
-
const llmTrace =
|
|
3517
|
+
const llmTrace = buildLLMTrace2(
|
|
2466
3518
|
result.steps,
|
|
2467
3519
|
durationMs,
|
|
2468
3520
|
usage,
|
|
@@ -2474,7 +3526,7 @@ async function executeWithAiSdk(context) {
|
|
|
2474
3526
|
emitStepEvents(traceContext, result.steps, startTime);
|
|
2475
3527
|
emitCompletionEvent(traceContext, result.steps.length + 1);
|
|
2476
3528
|
}
|
|
2477
|
-
const conversation =
|
|
3529
|
+
const conversation = buildConversation3(
|
|
2478
3530
|
scenario.triggerPrompt,
|
|
2479
3531
|
result.steps,
|
|
2480
3532
|
startTime
|
|
@@ -2518,7 +3570,7 @@ function findToolResultError(step) {
|
|
|
2518
3570
|
}
|
|
2519
3571
|
return null;
|
|
2520
3572
|
}
|
|
2521
|
-
function
|
|
3573
|
+
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
|
|
2522
3574
|
const totalStepTokens = steps.reduce(
|
|
2523
3575
|
(sum, s) => sum + (s.usage.totalTokens ?? 0),
|
|
2524
3576
|
0
|
|
@@ -2536,10 +3588,10 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2536
3588
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
2537
3589
|
const toolResultError = findToolResultError(step);
|
|
2538
3590
|
return {
|
|
2539
|
-
id:
|
|
3591
|
+
id: randomUUID3(),
|
|
2540
3592
|
stepNumber: i + 1,
|
|
2541
3593
|
turnIndex: i,
|
|
2542
|
-
type: step.toolCalls.length > 0 ?
|
|
3594
|
+
type: step.toolCalls.length > 0 ? LLMStepType3.TOOL_USE : LLMStepType3.COMPLETION,
|
|
2543
3595
|
model: modelId,
|
|
2544
3596
|
provider,
|
|
2545
3597
|
startedAt: new Date(
|
|
@@ -2562,7 +3614,7 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2562
3614
|
total: totalUsage.totalTokens
|
|
2563
3615
|
};
|
|
2564
3616
|
return {
|
|
2565
|
-
id:
|
|
3617
|
+
id: randomUUID3(),
|
|
2566
3618
|
steps: traceSteps,
|
|
2567
3619
|
summary: {
|
|
2568
3620
|
totalSteps: traceSteps.length,
|
|
@@ -2591,7 +3643,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
2591
3643
|
targetId: traceContext.targetId,
|
|
2592
3644
|
targetName: traceContext.targetName,
|
|
2593
3645
|
stepNumber: 0,
|
|
2594
|
-
type:
|
|
3646
|
+
type: LiveTraceEventType3.PROGRESS,
|
|
2595
3647
|
outputPreview: "Starting Simple Agent execution...",
|
|
2596
3648
|
elapsedMs: Date.now() - startTime,
|
|
2597
3649
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -2615,7 +3667,7 @@ function emitStepEvents(traceContext, steps, startTime) {
|
|
|
2615
3667
|
targetId: traceContext.targetId,
|
|
2616
3668
|
targetName: traceContext.targetName,
|
|
2617
3669
|
stepNumber: i + 1,
|
|
2618
|
-
type: isToolStep ?
|
|
3670
|
+
type: isToolStep ? LiveTraceEventType3.TOOL_USE : LiveTraceEventType3.COMPLETION,
|
|
2619
3671
|
toolName: firstToolCall?.toolName,
|
|
2620
3672
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
2621
3673
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -2638,7 +3690,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
2638
3690
|
targetId: traceContext.targetId,
|
|
2639
3691
|
targetName: traceContext.targetName,
|
|
2640
3692
|
stepNumber,
|
|
2641
|
-
type:
|
|
3693
|
+
type: LiveTraceEventType3.COMPLETION,
|
|
2642
3694
|
outputPreview: "Scenario execution completed",
|
|
2643
3695
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2644
3696
|
isComplete: true
|
|
@@ -2665,7 +3717,7 @@ defaultRegistry.register(simpleAgentAdapter);
|
|
|
2665
3717
|
|
|
2666
3718
|
// src/run-scenario/file-diff.ts
|
|
2667
3719
|
import { readdirSync, readFileSync as readFileSync2, statSync, existsSync as existsSync2 } from "fs";
|
|
2668
|
-
import { join as
|
|
3720
|
+
import { join as join9, relative } from "path";
|
|
2669
3721
|
|
|
2670
3722
|
// ../../node_modules/diff/lib/index.mjs
|
|
2671
3723
|
function Diff() {
|
|
@@ -2841,7 +3893,7 @@ Diff.prototype = {
|
|
|
2841
3893
|
tokenize: function tokenize(value) {
|
|
2842
3894
|
return Array.from(value);
|
|
2843
3895
|
},
|
|
2844
|
-
join: function
|
|
3896
|
+
join: function join8(chars) {
|
|
2845
3897
|
return chars.join("");
|
|
2846
3898
|
},
|
|
2847
3899
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -3281,7 +4333,7 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
3281
4333
|
}
|
|
3282
4334
|
const entries = readdirSync(dir, { withFileTypes: true });
|
|
3283
4335
|
for (const entry of entries) {
|
|
3284
|
-
const fullPath =
|
|
4336
|
+
const fullPath = join9(dir, entry.name);
|
|
3285
4337
|
const relativePath = relative(base, fullPath);
|
|
3286
4338
|
if (shouldIgnore(entry.name)) {
|
|
3287
4339
|
continue;
|
|
@@ -3390,8 +4442,8 @@ function extractTemplateFiles(before, after) {
|
|
|
3390
4442
|
}
|
|
3391
4443
|
|
|
3392
4444
|
// src/run-scenario/run-agent-with-context.ts
|
|
3393
|
-
import { AgentRunCommand as
|
|
3394
|
-
var DEFAULT_AGENT_COMMAND =
|
|
4445
|
+
import { AgentRunCommand as AgentRunCommand3, AgentType } from "@wix/evalforge-types";
|
|
4446
|
+
var DEFAULT_AGENT_COMMAND = AgentRunCommand3.CLAUDE;
|
|
3395
4447
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
3396
4448
|
const agent = evalData.agent ?? void 0;
|
|
3397
4449
|
const isSDK = agent?.agentType === AgentType.SDK;
|
|
@@ -3429,7 +4481,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
3429
4481
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
3430
4482
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
3431
4483
|
return {
|
|
3432
|
-
id:
|
|
4484
|
+
id: randomUUID4(),
|
|
3433
4485
|
targetId,
|
|
3434
4486
|
targetName,
|
|
3435
4487
|
scenarioId: scenario.id,
|