@wix/evalforge-evaluator 0.112.0 → 0.114.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +1240 -197
- package/build/index.js.map +4 -4
- package/build/index.mjs +1237 -185
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +1 -1
- package/build/types/run-scenario/agents/index.d.ts +2 -0
- package/build/types/run-scenario/agents/opencode/build-conversation.d.ts +7 -0
- package/build/types/run-scenario/agents/opencode/build-trace.d.ts +13 -0
- package/build/types/run-scenario/agents/opencode/config.d.ts +27 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/index.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +18 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +32 -0
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +12 -0
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +7 -0
- package/package.json +8 -7
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types15 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -509,7 +509,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
509
509
|
}
|
|
510
510
|
|
|
511
511
|
// src/run-scenario/index.ts
|
|
512
|
-
var
|
|
512
|
+
var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
513
513
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
514
514
|
|
|
515
515
|
// src/run-scenario/environment.ts
|
|
@@ -596,7 +596,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
596
596
|
}
|
|
597
597
|
|
|
598
598
|
// src/run-scenario/run-agent-with-context.ts
|
|
599
|
-
var
|
|
599
|
+
var import_crypto4 = require("crypto");
|
|
600
600
|
|
|
601
601
|
// src/run-scenario/agents/registry.ts
|
|
602
602
|
var AgentAdapterRegistry = class {
|
|
@@ -1222,10 +1222,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1222
1222
|
}
|
|
1223
1223
|
const startTime = /* @__PURE__ */ new Date();
|
|
1224
1224
|
const allMessages = [];
|
|
1225
|
-
const { mkdir: mkdirAsync, writeFile:
|
|
1225
|
+
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1226
1226
|
const claudeDir = `${options.cwd}/.claude`;
|
|
1227
1227
|
await mkdirAsync(claudeDir, { recursive: true });
|
|
1228
|
-
await
|
|
1228
|
+
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1229
1229
|
flag: "wx"
|
|
1230
1230
|
}).catch(() => {
|
|
1231
1231
|
});
|
|
@@ -2145,192 +2145,1235 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
2145
2145
|
// src/run-scenario/agents/claude-code/index.ts
|
|
2146
2146
|
defaultRegistry.register(claudeCodeAdapter);
|
|
2147
2147
|
|
|
2148
|
-
// src/run-scenario/agents/
|
|
2149
|
-
var
|
|
2150
|
-
var import_anthropic = require("@ai-sdk/anthropic");
|
|
2151
|
-
var import_openai = require("@ai-sdk/openai");
|
|
2152
|
-
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2153
|
-
var import_crypto2 = require("crypto");
|
|
2148
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
2149
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
2154
2150
|
|
|
2155
|
-
// src/run-scenario/agents/
|
|
2156
|
-
var
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2151
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2152
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
2153
|
+
|
|
2154
|
+
// src/run-scenario/agents/opencode/write-skills.ts
|
|
2155
|
+
var import_promises7 = require("fs/promises");
|
|
2156
|
+
var import_path8 = require("path");
|
|
2157
|
+
var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
|
|
2158
|
+
/**
 * Write every skill into the OpenCode skills directory under `cwd`.
 *
 * All per-skill writes are started up front and awaited together, so the
 * returned promise rejects as soon as any single skill fails.
 *
 * @param {string} cwd - Working directory that receives the skill folders.
 * @param {Array<object>} skills - Skill entities to materialise.
 * @param {Function} [fetchFn] - Folder fetcher handed to each per-skill write.
 * @returns {Promise<void>}
 */
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
  const pendingWrites = [];
  for (const entry of skills) {
    pendingWrites.push(writeSkillToFilesystem2(cwd, entry, fetchFn));
  }
  await Promise.all(pendingWrites);
}
|
|
2163
|
+
/**
 * Write a single skill into `<cwd>/.opencode/skills/<skill.name>`.
 *
 * The skill directory is always created first. Files then come from the
 * snapshot on `skill.latestVersion.files` when it is non-empty, otherwise
 * from `fetchFn(skill.source, ...)`.
 *
 * @param {string} cwd - Working directory that receives the skill folder.
 * @param {object} skill - Skill entity; reads `name`, `latestVersion`, `source`.
 * @param {Function} fetchFn - Async fetcher called with the skill source and
 *   `{ userAgent }`; its result is passed to `writeFilesToDirectory`.
 * @returns {Promise<void>}
 * @throws {Error} When the fetch (or writing the fetched files) fails — the
 *   original error is attached as `cause` — or when the skill has neither
 *   snapshot files nor a source.
 */
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
  const skillName = skill.name;
  const skillDir = (0, import_path8.join)(cwd, ".opencode", "skills", skillName);
  await (0, import_promises7.mkdir)(skillDir, { recursive: true });

  const version = skill.latestVersion;
  if (version?.files && version.files.length > 0) {
    await writeFilesToDirectory(skillDir, version.files);
    console.log(
      `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
    );
    return;
  }

  if (!skill.source) {
    throw new Error(`Skill ${skillName} has no files and no source configured`);
  }

  try {
    const files = await fetchFn(skill.source, {
      userAgent: "EvalForge-Evaluator"
    });
    await writeFilesToDirectory(skillDir, files);
    console.log(
      `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
    );
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    console.error(
      `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
    );
    // Keep the underlying failure reachable for callers and logs instead of
    // reducing it to its message text.
    throw new Error(
      `Failed to write skill ${skillName} to filesystem: ${message}`,
      { cause: error }
    );
  }
}
|
|
2184
|
-
|
|
2185
|
-
|
|
2195
|
+
|
|
2196
|
+
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
2197
|
+
var import_promises8 = require("fs/promises");
|
|
2198
|
+
var import_path9 = require("path");
|
|
2199
|
+
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
2200
|
+
var AGENTS_DIR2 = ".opencode/agents";
|
|
2201
|
+
/**
 * Derive a unique, filesystem-friendly base filename for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become `-`, every character
 * outside `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed.
 * When nothing is left, `sub-agent-<index>` is used instead.
 *
 * Uniqueness is tracked in `nameCount`. Every filename handed out is
 * recorded as a key, so a numbered duplicate (`foo-2`) can never clash with
 * an agent whose own name already sanitises to `foo-2`.
 *
 * @param {string} name - Display name of the sub-agent (may be empty).
 * @param {number} index - Position of the agent, used for the fallback name.
 * @param {Map<string, number>} nameCount - Shared registry, mutated in place.
 * @returns {string} Filename without extension, unique within `nameCount`.
 */
function toAgentFilename2(name, index, nameCount) {
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
  const seen = nameCount.get(base) ?? 0;
  if (seen === 0) {
    nameCount.set(base, 1);
    return base;
  }
  // Start at the suffix the plain counter would give, then skip any
  // candidate that was already handed out under another base name.
  let suffix = seen + 1;
  while (nameCount.has(`${base}-${suffix}`)) {
    suffix += 1;
  }
  const unique = `${base}-${suffix}`;
  nameCount.set(base, suffix);
  nameCount.set(unique, 1);
  return unique;
}
|
|
2187
|
-
function
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
|
|
2194
|
-
|
|
2195
|
-
|
|
2196
|
-
|
|
2207
|
+
/**
 * Resolve the markdown body for a sub-agent definition.
 *
 * When `agent.source` is set the content is fetched through `fetchFn`;
 * otherwise the inline `agent.subAgentMd` is used. Missing inline content
 * yields an empty string (after a warning), so the caller always receives a
 * string it can write to disk.
 *
 * @param {object} agent - Sub-agent entity; reads `name`, `source`, `subAgentMd`.
 * @param {Function} fetchFn - Async fetcher called with the source and
 *   `{ userAgent }`; only invoked when `agent.source` is set.
 * @returns {Promise<string>} Fetched or inline agent markdown.
 * @throws {Error} When the fetch fails; the original error is attached as `cause`.
 */
async function resolveSubAgentContent2(agent, fetchFn) {
  if (agent.source) {
    try {
      const content = await fetchFn(agent.source, {
        userAgent: "EvalForge-Evaluator"
      });
      console.log(
        `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
      );
      return content;
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
      );
      throw new Error(
        `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`,
        { cause: error }
      );
    }
  }
  if (!agent.subAgentMd) {
    console.warn(
      `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
    );
  }
  // The warning above promises a blank file, so never hand `undefined`/`null`
  // to the caller.
  return agent.subAgentMd ?? "";
}
|
|
2234
|
+
/**
 * Write each sub-agent as `<AGENTS_DIR2>/<filename>.md` beneath `cwd`.
 *
 * Does nothing for an empty list. Agents are handled one after another, in
 * input order; filenames are de-duplicated through a registry shared across
 * the whole call.
 *
 * @param {string} cwd - Working directory that receives the agents folder.
 * @param {Array<object>} subAgents - Sub-agent entities to write.
 * @param {Function} [fetchFn] - File fetcher used for source-backed agents.
 * @returns {Promise<void>}
 */
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = (0, import_path9.join)(cwd, AGENTS_DIR2);
  await (0, import_promises8.mkdir)(targetDir, { recursive: true });
  const usedNames = /* @__PURE__ */ new Map();
  for (let position = 0; position < subAgents.length; position += 1) {
    const current = subAgents[position];
    const stem = toAgentFilename2(current.name, position, usedNames);
    const destination = (0, import_path9.join)(targetDir, `${stem}.md`);
    const markdown = await resolveSubAgentContent2(current, fetchFn);
    await (0, import_promises8.writeFile)(destination, markdown, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
|
|
2247
|
+
|
|
2248
|
+
// src/run-scenario/agents/opencode/config.ts
|
|
2249
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
2250
|
+
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2251
|
+
/**
 * Split a model string into provider and model identifiers.
 *
 * `"<provider>/<model>"` is cut at the first slash, provided the slash is
 * not the very first character. Any other string is treated as a bare model
 * id: the provider is `"openai"` when the id is listed in
 * `AVAILABLE_OPENAI_MODEL_IDS`, and `"anthropic"` otherwise.
 *
 * @param {string} model - Model string, with or without a provider prefix.
 * @returns {{providerID: string, modelID: string}}
 */
function parseModel(model) {
  const separatorAt = model.indexOf("/");
  if (separatorAt <= 0) {
    const knownOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
      model
    );
    const inferredProvider = knownOpenAI ? "openai" : "anthropic";
    return { providerID: inferredProvider, modelID: model };
  }
  const providerID = model.slice(0, separatorAt);
  const modelID = model.slice(separatorAt + 1);
  return { providerID, modelID };
}
|
|
2216
|
-
|
|
2217
|
-
|
|
2218
|
-
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
if (obj.isError === true) {
|
|
2223
|
-
return extractErrorText(obj.content);
|
|
2264
|
+
/**
 * Convert a map of MCP server definitions into OpenCode's `mcp` config shape.
 *
 * Per entry, the first matching rule wins:
 *  1. `type` is already `"local"` or `"remote"`: passed through untouched.
 *  2. a non-empty string `url`: `{ type: "remote", url, headers?, enabled? }`.
 *  3. a non-empty string `command`:
 *     `{ type: "local", command: [command, ...args], environment?, enabled? }`.
 *  4. anything else: a warning is logged and the entry is passed through.
 *
 * `args` is only honoured when it is an array. Any other value is ignored
 * with a warning instead of being spread (a string would be split into
 * single characters, a plain object would throw).
 *
 * @param {Record<string, object>} servers - Server definitions keyed by name.
 * @returns {Record<string, object>} OpenCode MCP entries keyed by name.
 */
function toOpenCodeMcpConfig(servers) {
  const result = {};
  for (const [name, entry] of Object.entries(servers)) {
    if (entry.type === "local" || entry.type === "remote") {
      result[name] = entry;
      continue;
    }
    if (entry.url && typeof entry.url === "string") {
      result[name] = {
        type: "remote",
        url: entry.url,
        ...entry.headers ? { headers: entry.headers } : {},
        ...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
      };
      continue;
    }
    if (entry.command && typeof entry.command === "string") {
      if (entry.args != null && !Array.isArray(entry.args)) {
        console.warn(
          `[MCP] Server "${name}" has non-array "args"; ignoring them`
        );
      }
      const extraArgs = Array.isArray(entry.args) ? entry.args : [];
      result[name] = {
        type: "local",
        command: [entry.command, ...extraArgs],
        ...entry.env ? { environment: entry.env } : {},
        ...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
      };
      continue;
    }
    console.warn(
      `[MCP] Server "${name}" has unrecognized format, passing through as-is:`,
      JSON.stringify(entry)
    );
    result[name] = entry;
  }
  return result;
}
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
"claude-haiku-4-5": { input: 1, output: 5 },
|
|
2258
|
-
// Anthropic — Claude 4
|
|
2259
|
-
"claude-opus-4": { input: 15, output: 75 },
|
|
2260
|
-
"claude-sonnet-4": { input: 3, output: 15 },
|
|
2261
|
-
// OpenAI — GPT-5
|
|
2262
|
-
"gpt-5": { input: 1.25, output: 10 },
|
|
2263
|
-
"gpt-5-mini": { input: 0.25, output: 2 },
|
|
2264
|
-
"gpt-5-nano": { input: 0.05, output: 0.4 },
|
|
2265
|
-
// OpenAI — GPT-4.1
|
|
2266
|
-
"gpt-4.1": { input: 2, output: 8 },
|
|
2267
|
-
"gpt-4.1-mini": { input: 0.4, output: 1.6 },
|
|
2268
|
-
"gpt-4.1-nano": { input: 0.1, output: 0.4 },
|
|
2269
|
-
// OpenAI — GPT-4o
|
|
2270
|
-
"gpt-4o": { input: 2.5, output: 10 },
|
|
2271
|
-
"gpt-4o-mini": { input: 0.15, output: 0.6 },
|
|
2272
|
-
// OpenAI — Reasoning
|
|
2273
|
-
o3: { input: 2, output: 8 },
|
|
2274
|
-
"o4-mini": { input: 1.1, output: 4.4 },
|
|
2275
|
-
"o3-mini": { input: 1.1, output: 4.4 },
|
|
2276
|
-
o1: { input: 15, output: 60 }
|
|
2277
|
-
};
|
|
2278
|
-
function extractGatewayCost(step, provider) {
|
|
2279
|
-
try {
|
|
2280
|
-
if (provider === PROVIDER_ANTHROPIC) {
|
|
2281
|
-
const meta = step.providerMetadata;
|
|
2282
|
-
const anthropic = meta?.anthropic;
|
|
2283
|
-
const usage = anthropic?.usage;
|
|
2284
|
-
const cost2 = usage?.total_cost_usd;
|
|
2285
|
-
return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
|
|
2301
|
+
/**
 * Assemble the OpenCode configuration object for one run.
 *
 * - The model comes from `options.model` (falling back to `DEFAULT_MODEL2`)
 *   and is split into provider and model ids by `parseModel`.
 * - When `options.aiGatewayUrl` is set, that provider is pointed at
 *   `<aiGatewayUrl>/proxy/<providerID>` with a placeholder API key and an
 *   optional copy of `options.aiGatewayHeaders`.
 * - All server entries from `options.mcps[].config` are merged into one map
 *   (later keys overwrite earlier ones), run through `resolveMcpPlaceholders`
 *   and converted with `toOpenCodeMcpConfig`.
 * - `temperature` and `maxTurns` (written as `maxSteps`) become overrides for
 *   the `build` agent.
 * - Every permission listed in the config is set to `"allow"`.
 *
 * @param {object} options - Reads `model`, `aiGatewayUrl`, `aiGatewayHeaders`,
 *   `mcps`, `cwd`, `temperature`, `maxTurns`.
 * @returns {Promise<{config: object, providerID: string, modelID: string}>}
 * @throws {Error} When an MCP config value is not a plain object; the message
 *   names the actual kind of value (`null`, `array`, or its `typeof`).
 */
async function buildOpenCodeConfig(options) {
  const modelStr = options.model || DEFAULT_MODEL2;
  const { providerID, modelID } = parseModel(modelStr);
  const provider = {};
  if (options.aiGatewayUrl) {
    const providerOptions = {
      baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
      apiKey: "sk-placeholder-auth-handled-by-gateway"
    };
    if (options.aiGatewayHeaders) {
      providerOptions.headers = { ...options.aiGatewayHeaders };
    }
    provider[providerID] = {
      options: providerOptions
    };
  }
  let mcp;
  if (options.mcps && options.mcps.length > 0) {
    const mcpServers = {};
    for (const mcpEntity of options.mcps) {
      const entityConfig = mcpEntity.config;
      for (const [key, value] of Object.entries(entityConfig)) {
        if (typeof value !== "object" || value === null || Array.isArray(value)) {
          // `typeof` reports "object" for both null and arrays, which would
          // produce the self-contradictory "must be an object (got object)".
          const actualKind = value === null ? "null" : Array.isArray(value) ? "array" : typeof value;
          throw new Error(
            `MCP "${mcpEntity.name}" has invalid config: value for key "${key}" must be an object (got ${actualKind}).`
          );
        }
        mcpServers[key] = value;
      }
    }
    const resolved = await resolveMcpPlaceholders(mcpServers, {
      cwd: options.cwd
    });
    mcp = toOpenCodeMcpConfig(resolved);
  }
  const agentOverrides = {};
  if (options.temperature != null) {
    agentOverrides.temperature = options.temperature;
  }
  if (options.maxTurns != null) {
    agentOverrides.maxSteps = options.maxTurns;
  }
  const config = {
    model: `${providerID}/${modelID}`,
    provider,
    ...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
    permission: {
      edit: "allow",
      bash: "allow",
      webfetch: "allow",
      doom_loop: "allow",
      external_directory: "allow"
    },
    ...mcp ? { mcp } : {}
  };
  return { config, providerID, modelID };
}
|
|
2303
2358
|
|
|
2304
|
-
// src/run-scenario/agents/
|
|
2305
|
-
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2331
|
-
|
|
2332
|
-
|
|
2333
|
-
|
|
2359
|
+
// src/run-scenario/agents/opencode/build-trace.ts
|
|
2360
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2361
|
+
var import_crypto2 = require("crypto");
|
|
2362
|
+
/**
 * Build an LLM trace (a flat step list plus a summary) from session messages.
 *
 * Only messages whose `info.role` is `"assistant"` are considered; each one
 * is a "turn". A turn is expanded into one or more sub-steps:
 *  - a THINKING step when reasoning text exists alongside text or tool calls,
 *  - one TOOL_USE step per tool part,
 *  - a COMPLETION step when text exists alongside tool calls,
 *  - otherwise a single THINKING/COMPLETION step covering the whole turn.
 * The turn's duration, tokens and cost are apportioned across its sub-steps.
 *
 * @param {Array<{info: object, parts: Array<object>}>} messages - Session messages.
 * @param {number} totalDurationMs - Overall run duration; reported in the
 *   summary and used as the fallback length of a final turn that has no
 *   completion time.
 * @param {string} model - Fallback model id when a message has no `modelID`.
 * @param {string} provider - Fallback provider id when a message has no `providerID`.
 * @returns {{id: string, steps: Array<object>, summary: object}}
 */
function buildLLMTrace(messages, totalDurationMs, model, provider) {
  const assistantMessages = messages.filter(
    (m) => m.info.role === "assistant"
  );
  const allSteps = assistantMessages.flatMap((msg, turnIndex) => {
    const { info, parts } = msg;
    let text = "";
    let thinking = "";
    const toolCalls = [];
    let stepInputTokens = 0;
    let stepOutputTokens = 0;
    let stepCost = 0;
    let finishReason = "unknown";
    // Collect the turn's text, reasoning, tool calls and usage from its parts.
    for (const part of parts) {
      switch (part.type) {
        case "text": {
          const textPart = part;
          text += textPart.text;
          break;
        }
        case "reasoning": {
          const reasoningPart = part;
          thinking += reasoningPart.text;
          break;
        }
        case "tool": {
          const toolPart = part;
          toolCalls.push({
            toolName: toolPart.tool,
            args: toolPart.state.input
          });
          break;
        }
        case "step-finish": {
          // Usage is summed over all step-finish parts; the reason of the
          // last one wins.
          const sf = part;
          stepInputTokens += sf.tokens.input;
          stepOutputTokens += sf.tokens.output;
          stepCost += sf.cost;
          finishReason = sf.reason;
          break;
        }
      }
    }
    // No usage from step-finish parts: fall back to the message-level totals.
    if (stepInputTokens === 0 && stepOutputTokens === 0) {
      stepInputTokens = info.tokens.input;
      stepOutputTokens = info.tokens.output;
      stepCost = info.cost;
    }
    // NOTE(review): `info.time.*` is fed to `new Date(...)` and mixed with
    // `totalDurationMs`, so it is presumably epoch milliseconds — confirm.
    const startedAt = new Date(info.time.created).toISOString();
    // End of the turn: its own completion time, else the start of the next
    // assistant message, else start + total run duration.
    const completedAt = info.time.completed ? info.time.completed : turnIndex + 1 < assistantMessages.length ? assistantMessages[turnIndex + 1].info.time.created : info.time.created + totalDurationMs;
    const durationMs = Math.max(0, completedAt - info.time.created);
    const isSuccess = finishReason !== "error";
    const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
    const stepModel = info.modelID || model;
    const stepProvider = info.providerID || provider;
    const toolCallCount = toolCalls.length;
    const hasThinking = !!thinking;
    const hasText = !!text;
    const subSteps = [];
    // Number of sub-steps of each kind this turn is going to emit; the
    // `|| 1` covers the single-step case handled at the end.
    const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
    const toolSubSteps = toolCallCount;
    const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
    const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
    // Thinking sub-step: receives an equal 1/totalSubSteps share.
    if (hasThinking && (hasText || toolCallCount > 0)) {
      subSteps.push({
        id: (0, import_crypto2.randomUUID)(),
        stepNumber: 0,
        // renumbered below
        turnIndex,
        type: import_evalforge_types7.LLMStepType.THINKING,
        model: stepModel,
        provider: stepProvider,
        startedAt,
        durationMs: Math.round(durationMs / totalSubSteps),
        tokenUsage: {
          prompt: Math.round(stepInputTokens / totalSubSteps),
          completion: Math.round(stepOutputTokens / totalSubSteps),
          total: Math.round(
            (stepInputTokens + stepOutputTokens) / totalSubSteps
          )
        },
        costUsd: stepCost / totalSubSteps,
        outputPreview: thinking.slice(0, 200),
        success: isSuccess,
        error: errorMsg
      });
    }
    // One sub-step per tool call.
    if (toolCallCount > 0) {
      for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
        const tc = toolCalls[tcIdx];
        // The last tool step absorbs the leftover duration, but only when no
        // text sub-step follows (that one absorbs the remainder instead).
        const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
        const toolBudgetSteps = toolSubSteps + textSubSteps;
        const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
        // Share of the turn left over once the thinking sub-step is taken out.
        const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
        subSteps.push({
          id: (0, import_crypto2.randomUUID)(),
          stepNumber: 0,
          turnIndex,
          type: import_evalforge_types7.LLMStepType.TOOL_USE,
          model: stepModel,
          provider: stepProvider,
          startedAt,
          durationMs: isLast ? durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(durationMs * remainingFraction * toolFraction),
          // Tokens and cost always use the proportional share, including on
          // the last tool step (only duration gets the remainder treatment).
          tokenUsage: {
            prompt: Math.round(
              stepInputTokens * remainingFraction * toolFraction
            ),
            completion: Math.round(
              stepOutputTokens * remainingFraction * toolFraction
            ),
            total: Math.round(
              (stepInputTokens + stepOutputTokens) * remainingFraction * toolFraction
            )
          },
          costUsd: stepCost * remainingFraction * toolFraction,
          toolName: tc.toolName,
          toolArguments: JSON.stringify(tc.args),
          // Only the first tool step of a text-less turn carries a preview,
          // which at that point can only come from the reasoning text.
          outputPreview: tcIdx === 0 && !hasText ? (text || thinking)?.slice(0, 200) : void 0,
          success: isSuccess,
          error: errorMsg
        });
      }
    }
    // Trailing completion sub-step: takes whatever duration, tokens and cost
    // the earlier sub-steps of this turn did not consume.
    if (hasText && toolCallCount > 0) {
      subSteps.push({
        id: (0, import_crypto2.randomUUID)(),
        stepNumber: 0,
        turnIndex,
        type: import_evalforge_types7.LLMStepType.COMPLETION,
        model: stepModel,
        provider: stepProvider,
        startedAt,
        durationMs: durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
        tokenUsage: {
          prompt: stepInputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
          completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
          total: stepInputTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
        },
        costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
        outputPreview: text.slice(0, 200),
        success: isSuccess,
        error: errorMsg
      });
    }
    // Nothing emitted so far (no tool calls): the whole turn is one step,
    // typed THINKING only when there is reasoning but no text.
    if (subSteps.length === 0) {
      const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
      subSteps.push({
        id: (0, import_crypto2.randomUUID)(),
        stepNumber: 0,
        turnIndex,
        type: stepType,
        model: stepModel,
        provider: stepProvider,
        startedAt,
        durationMs,
        tokenUsage: {
          prompt: stepInputTokens,
          completion: stepOutputTokens,
          total: stepInputTokens + stepOutputTokens
        },
        costUsd: stepCost,
        outputPreview: (text || thinking)?.slice(0, 200),
        success: isSuccess,
        error: errorMsg
      });
    }
    return subSteps;
    // Assign 1-based step numbers across all turns.
  }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
  // Summary totals come from the message-level `info` fields, not from the
  // apportioned sub-steps.
  const totalTokens = buildTotalTokens(assistantMessages);
  const totalCost = assistantMessages.reduce((sum, m) => {
    const aMsg = m.info;
    return sum + aMsg.cost;
  }, 0);
  // Aggregate count/duration/tokens/cost per step type.
  const stepTypeBreakdown = {};
  for (const step of allSteps) {
    const entry = stepTypeBreakdown[step.type] ?? {
      count: 0,
      durationMs: 0,
      tokens: 0,
      costUsd: 0
    };
    entry.count += 1;
    entry.durationMs += step.durationMs;
    entry.tokens += step.tokenUsage.total;
    entry.costUsd += step.costUsd;
    stepTypeBreakdown[step.type] = entry;
  }
  // The model breakdown attributes the entire run to a single model: the one
  // on the first step, or the `model` argument when there are no steps.
  const modelUsed = allSteps[0]?.model || model;
  const summary = {
    totalSteps: allSteps.length,
    totalTurns: assistantMessages.length,
    totalDurationMs,
    totalTokens,
    totalCostUsd: totalCost,
    modelBreakdown: {
      [modelUsed]: {
        count: allSteps.length,
        durationMs: totalDurationMs,
        tokens: totalTokens.total,
        costUsd: totalCost
      }
    },
    modelsUsed: [modelUsed],
    stepTypeBreakdown
  };
  return {
    id: (0, import_crypto2.randomUUID)(),
    steps: allSteps,
    summary
  };
}
|
|
2573
|
+
/**
 * Sum the message-level token usage over a list of assistant messages.
 *
 * @param {Iterable<{info: {tokens: {input: number, output: number}}}>} assistantMessages
 * @returns {{prompt: number, completion: number, total: number}} Input tokens
 *   as `prompt`, output tokens as `completion`, and their sum as `total`.
 */
function buildTotalTokens(assistantMessages) {
  const totals = { prompt: 0, completion: 0, total: 0 };
  for (const message of assistantMessages) {
    const usage = message.info.tokens;
    totals.prompt += usage.input;
    totals.completion += usage.output;
  }
  totals.total = totals.prompt + totals.completion;
  return totals;
}
|
|
2582
|
+
|
|
2583
|
+
// src/run-scenario/agents/opencode/build-conversation.ts
|
|
2584
|
+
/**
 * Convert session messages into a simplified conversation transcript.
 *
 * Assistant messages contribute `text`, `thinking` (from "reasoning" parts)
 * and `tool_use` blocks. User messages contribute `text` blocks and
 * `tool_result` blocks for tool parts whose state is "completed" or "error".
 * Other part types, other roles, and messages that end up with no blocks are
 * left out.
 *
 * @param {Iterable<{info: object, parts: Array<object>}>} messages
 * @returns {Array<{role: string, content: Array<object>, timestamp: string}>}
 *   Entries in input order; `timestamp` is the ISO form of `info.time.created`.
 */
function buildConversation2(messages) {
  // Map one assistant part to a content block (undefined = skip).
  const toAssistantBlock = (part) => {
    if (part.type === "text") {
      return { type: "text", text: part.text };
    }
    if (part.type === "reasoning") {
      return { type: "thinking", thinking: part.text };
    }
    if (part.type === "tool") {
      return {
        type: "tool_use",
        toolName: part.tool,
        toolId: part.callID,
        input: part.state.input
      };
    }
    return void 0;
  };
  // Map one user part to a content block (undefined = skip).
  const toUserBlock = (part) => {
    if (part.type === "text") {
      return { type: "text", text: part.text };
    }
    if (part.type !== "tool") {
      return void 0;
    }
    const toolState = part.state;
    switch (toolState.status) {
      case "completed":
        return {
          type: "tool_result",
          toolUseId: part.callID,
          content: toolState.output
        };
      case "error":
        return {
          type: "tool_result",
          toolUseId: part.callID,
          content: toolState.error,
          isError: true
        };
      default:
        return void 0;
    }
  };

  const conversation = [];
  for (const message of messages) {
    const { info, parts } = message;
    // Computed for every message, before the role is looked at.
    const timestamp = new Date(info.time.created).toISOString();
    const role = info.role;
    if (role !== "assistant" && role !== "user") {
      continue;
    }
    const convert = role === "assistant" ? toAssistantBlock : toUserBlock;
    const content = [];
    for (const part of parts) {
      const block = convert(part);
      if (block) {
        content.push(block);
      }
    }
    if (content.length > 0) {
      conversation.push({ role, content, timestamp });
    }
  }
  return conversation;
}
|
|
2651
|
+
|
|
2652
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2653
|
+
var DEFAULT_MODEL3 = `anthropic/${import_evalforge_types8.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2654
|
+
/**
 * Produce a short human-readable label for a tool invocation.
 *
 * - no tool name: `"Using tool..."`
 * - `Task` / `dispatch_agent` with a description: `"Task: <first 55 chars>"`
 * - `Bash` / `bash` / `execute` with a command: `"Running: <first 50 chars>"`
 *   (both of the above append `...` when the text was cut)
 * - a `file_path` / `path` / `target_file` argument, cut to 50 chars:
 *   `"Writing: ..."` when the tool name contains write/edit, `"Reading: ..."`
 *   when it contains read/view (case-insensitive)
 * - anything else: `"Using <toolName>..."`
 *
 * @param {string} toolName - Name of the tool being invoked.
 * @param {object} [args] - Tool arguments; may be absent.
 * @returns {string}
 */
function extractToolAction(toolName, args) {
  if (!toolName) {
    return "Using tool...";
  }
  // Cut `value` to `limit` characters, marking the cut with an ellipsis.
  const abbreviate = (value, limit) => {
    const full = String(value);
    const suffix = full.length > limit ? "..." : "";
    return `${full.slice(0, limit)}${suffix}`;
  };
  const params = args;
  const isTaskTool = ["Task", "dispatch_agent"].includes(toolName);
  if (isTaskTool && params?.description) {
    return `Task: ${abbreviate(params.description, 55)}`;
  }
  const isShellTool = ["Bash", "bash", "execute"].includes(toolName);
  if (isShellTool && params?.command) {
    return `Running: ${abbreviate(params.command, 50)}`;
  }
  const rawPath = params?.file_path || params?.path || params?.target_file;
  if (rawPath) {
    const shownPath = String(rawPath).slice(0, 50);
    if (/write|edit/i.test(toolName)) {
      return `Writing: ${shownPath}`;
    }
    if (/read|view/i.test(toolName)) {
      return `Reading: ${shownPath}`;
    }
  }
  return `Using ${toolName}...`;
}
|
|
2675
|
+
/**
 * Translate one message part into a live trace event.
 *
 * Every event carries the run/scenario/target identifiers from `context`,
 * the given `stepNumber` and `isComplete` flag, and the current time as an
 * ISO timestamp. Part types map as follows:
 *  - `"text"`: COMPLETION with the first 500 characters as `outputPreview`
 *  - `"reasoning"`: THINKING with the first 500 characters as `thinking`
 *  - `"tool"`: TOOL_USE, or FILE_WRITE / FILE_READ when the input names a
 *    file (`file_path` / `path` / `target_file`) and the tool name contains
 *    write/edit or read/view (case-insensitive)
 *  - `"step-finish"`: PROGRESS with the preview `"Step completed"`
 *  - any other type: `null`
 *
 * A tool part with no state or no input yet is tolerated: `toolArgs` is then
 * an empty string and `filePath` stays undefined.
 *
 * @param {object} part - Message part; `type` selects the mapping.
 * @param {object} context - Provides `evalRunId`, `scenarioId`,
 *   `scenarioName`, `targetId`, `targetName`.
 * @param {number} stepNumber - Step number to stamp on the event.
 * @param {boolean} isComplete - Completion flag to stamp on the event.
 * @returns {object|null} The trace event, or `null` for unmapped part types.
 */
function createTraceEventFromPart(part, context, stepNumber, isComplete) {
  const base = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };
  switch (part.type) {
    case "text": {
      const textPart = part;
      return {
        ...base,
        type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
        outputPreview: textPart.text.slice(0, 500)
      };
    }
    case "reasoning": {
      const reasoningPart = part;
      return {
        ...base,
        type: import_evalforge_types8.LiveTraceEventType.THINKING,
        thinking: reasoningPart.text.slice(0, 500)
      };
    }
    case "tool": {
      const toolPart = part;
      const toolName = toolPart.tool;
      const args = toolPart.state?.input;
      // JSON.stringify(undefined) is undefined, so guard before slicing.
      const toolArgs = (JSON.stringify(args) ?? "").slice(0, 500);
      let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
      let filePath;
      const a = args;
      if (a?.file_path || a?.path || a?.target_file) {
        filePath = String(a.file_path || a.path || a.target_file);
        if (/write|edit/i.test(toolName)) {
          type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
        } else if (/read|view/i.test(toolName)) {
          type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
        }
      }
      return { ...base, type, toolName, toolArgs, filePath };
    }
    case "step-finish":
      return {
        ...base,
        type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
        outputPreview: "Step completed"
      };
    default:
      return null;
  }
}
|
|
2731
|
+
/**
 * Runs a scenario's trigger prompt through an OpenCode server and session.
 *
 * Flow: write sub-agents, rules and skills into `options.cwd`; build the OpenCode
 * config; start a server and create a session; optionally stream live trace events
 * and a 10-second heartbeat; send the prompt under a timeout; then read the session
 * history and assemble the output text, usage, LLM trace and conversation.
 *
 * @param {Array<object>} skills - Skills written to the filesystem before execution.
 * @param {object} scenario - Uses `id`, `name` and `triggerPrompt`.
 * @param {object} options - Uses `cwd`, `model`, `temperature`, `maxTurns`,
 *   `aiGatewayUrl`, `aiGatewayHeaders`, `mcps`, `subAgents`, `rules`,
 *   `systemPrompt` and `traceContext`.
 * @returns {Promise<{result: object, llmTrace: object, conversation: object}>}
 * @throws {Error} When skills cannot be written, or when the SDK run fails or times
 *   out (wrapped as "OpenCode SDK execution failed: ..." with the original as `cause`).
 */
async function executeWithOpenCode(skills, scenario, options) {
  const skillNames = skills.map((s) => s.name).join(", ");
  console.log("[executeWithOpenCode] Starting execution", {
    skillCount: skills.length,
    skillNames,
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    cwd: options.cwd,
    aiGatewayUrl: options.aiGatewayUrl,
    hasAiGatewayHeaders: !!options.aiGatewayHeaders,
    model: options.model
  });
  const startTime = /* @__PURE__ */ new Date();
  if (options.mcps && options.mcps.length > 0) {
    console.log(
      `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
    );
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(options.cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem2(options.cwd, skills);
  } catch (writeError) {
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
    );
  }
  const maxTurns = options.maxTurns ?? 10;
  const { config, providerID, modelID } = await buildOpenCodeConfig({
    model: options.model,
    temperature: options.temperature,
    maxTurns,
    aiGatewayUrl: options.aiGatewayUrl,
    aiGatewayHeaders: options.aiGatewayHeaders,
    mcps: options.mcps,
    cwd: options.cwd
  });
  const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
  // At least five minutes, otherwise one minute per allowed turn.
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
  const abortController = new AbortController();
  let timeoutHandle;
  let heartbeatHandle;
  let timedOut = false;
  const traceContext = options.traceContext;
  let traceStepNumber = 0;
  let lastAction = "Starting...";
  let lastToolName;
  let lastFilePath;

  // Wraps `fields` in the common trace envelope and pushes it; no-op without a trace context.
  const pushTrace = (fields, isComplete = false) => {
    if (!traceContext) return;
    emitTraceEvent(
      {
        evalRunId: traceContext.evalRunId,
        scenarioId: traceContext.scenarioId,
        scenarioName: traceContext.scenarioName,
        targetId: traceContext.targetId,
        targetName: traceContext.targetName,
        ...fields,
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        isComplete
      },
      traceContext.tracePushUrl,
      traceContext.routeHeader,
      traceContext.authToken
    );
  };

  if (traceContext) {
    pushTrace({
      stepNumber: 0,
      type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
      outputPreview: JSON.stringify({
        event: "pre-sdk-execution",
        model: `${providerID}/${modelID}`,
        maxTurns,
        timestamp: (/* @__PURE__ */ new Date()).toISOString()
      })
    });
  }

  let server;
  // Declared outside `try` so the failure path can stop the event stream too.
  let eventStreamAbort;
  try {
    console.log("[SDK-DEBUG] Starting OpenCode server...");
    server = await createOpencodeServer({
      config,
      signal: abortController.signal,
      timeout: 3e4
    });
    console.log(`[SDK-DEBUG] Server started at ${server.url}`);
    const client = createOpencodeClient({
      baseUrl: server.url,
      directory: options.cwd
    });
    const session = await client.session.create({
      body: { title: `eval-${scenario.name}` }
    });
    if (!session.data) {
      const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
      throw new Error(
        `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
      );
    }
    const sessionId = session.data.id;
    console.log(`[SDK-DEBUG] Session created: ${sessionId}`);

    if (traceContext) {
      const streamAbort = new AbortController();
      eventStreamAbort = streamAbort;
      const executionStartTime = Date.now();

      // Fire-and-forget consumer: live tracing must never block or fail the prompt itself.
      void (async () => {
        try {
          const events = await client.event.subscribe();
          for await (const event of events.stream) {
            if (streamAbort.signal.aborted) break;
            const evt = event;
            if (evt.type === "message.part.updated") {
              const { part } = evt.properties;
              traceStepNumber++;
              const traceEvent = createTraceEventFromPart(
                part,
                traceContext,
                traceStepNumber,
                false
              );
              if (traceEvent) {
                lastToolName = traceEvent.toolName;
                lastFilePath = traceEvent.filePath;
                if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
                  lastAction = "Thinking...";
                } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
                  lastAction = extractToolAction(
                    traceEvent.toolName ?? "",
                    void 0
                  );
                } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
                  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
                } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
                  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
                } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
                  lastAction = "Processing response...";
                }
                emitTraceEvent(
                  traceEvent,
                  traceContext.tracePushUrl,
                  traceContext.routeHeader,
                  traceContext.authToken
                );
              }
            } else if (evt.type === "session.error") {
              const props = evt.properties;
              traceStepNumber++;
              pushTrace({
                stepNumber: traceStepNumber,
                type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
                outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
                  0,
                  500
                )
              });
            }
          }
        } catch (streamError) {
          // Best-effort: a broken stream only costs live progress, so report it and carry on.
          // Errors after a deliberate abort are expected and stay quiet.
          if (!streamAbort.signal.aborted) {
            console.warn(
              "[SDK-DEBUG] Event stream stopped:",
              streamError instanceof Error ? streamError.message : String(streamError)
            );
          }
        }
      })();

      let lastReportedAction = "";
      let sameActionCount = 0;
      // Heartbeat: every 10 s, report the most recent action so long steps still show progress.
      heartbeatHandle = setInterval(() => {
        const elapsedMs = Date.now() - executionStartTime;
        let progressMessage = lastAction;
        if (lastAction === lastReportedAction) {
          sameActionCount++;
        } else {
          sameActionCount = 1;
          lastReportedAction = lastAction;
        }
        const isTaskTool = lastToolName === "Task" || lastToolName === "dispatch_agent";
        if (isTaskTool && sameActionCount > 1) {
          progressMessage = `Waiting for ${lastAction}`;
        } else if (lastToolName && lastFilePath) {
          progressMessage = `${lastToolName}: ${lastFilePath}`;
        } else if (lastToolName && !isTaskTool) {
          progressMessage = `Using ${lastToolName}...`;
        }
        const elapsedSec = Math.round(elapsedMs / 1e3);
        progressMessage += ` (${elapsedSec}s, step ${traceStepNumber})`;
        pushTrace({
          stepNumber: traceStepNumber,
          type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
          outputPreview: progressMessage,
          toolName: lastToolName,
          filePath: lastFilePath,
          elapsedMs
        });
      }, 1e4);
    }

    const promptPromise = (async () => {
      // undefined -> default evaluator prompt; null or "" -> no system prompt; otherwise as given.
      let systemPrompt;
      if (options.systemPrompt === void 0) {
        systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
      } else if (options.systemPrompt !== null && options.systemPrompt !== "") {
        systemPrompt = options.systemPrompt;
      }
      console.log("[SDK-DEBUG] Sending prompt...");
      return client.session.prompt({
        path: { id: sessionId },
        body: {
          model: { providerID, modelID },
          ...systemPrompt ? { system: systemPrompt } : {},
          parts: [{ type: "text", text: scenario.triggerPrompt }]
        }
      });
    })();
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        // Best-effort abort; the timeout error below is what the caller sees.
        client.session.abort({ path: { id: sessionId } }).catch(() => {
        });
        reject(
          new Error(
            `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
          )
        );
      }, SDK_TIMEOUT_MS);
    });
    const promptResult = await Promise.race([promptPromise, timeoutPromise]);
    // Stop the timers right away so the timeout cannot fire while the history is fetched.
    if (timeoutHandle) clearTimeout(timeoutHandle);
    if (heartbeatHandle) clearInterval(heartbeatHandle);
    if (eventStreamAbort) eventStreamAbort.abort();
    if ("error" in promptResult && promptResult.error) {
      const errPayload = promptResult.error;
      throw new Error(
        `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
      );
    }
    console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
    const messagesResponse = await client.session.messages({
      path: { id: sessionId }
    });
    const allMessages = messagesResponse.data ?? [];
    console.log(
      `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
    );
    pushTrace(
      {
        stepNumber: traceStepNumber + 1,
        type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
        outputPreview: "Scenario execution completed"
      },
      true
    );
    const endTime = /* @__PURE__ */ new Date();
    const totalDurationMs = endTime.getTime() - startTime.getTime();
    const resultData = promptResult.data;
    const lastAssistantInfo = resultData?.info;
    if (lastAssistantInfo?.error) {
      const err = lastAssistantInfo.error;
      throw new Error(
        `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
      );
    }
    let outputText = "";
    if (resultData?.parts) {
      for (const part of resultData.parts) {
        if (part.type === "text") {
          outputText += part.text;
        }
      }
    }
    // Fallback: if the prompt response carried no text, take it from the newest
    // assistant message in the history that has any.
    if (!outputText && allMessages.length > 0) {
      for (let i = allMessages.length - 1; i >= 0; i--) {
        const msg = allMessages[i];
        if (msg.info.role === "assistant") {
          const assistantInfo = msg.info;
          if (assistantInfo.error) {
            throw new Error(
              `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
            );
          }
          for (const part of msg.parts) {
            if (part.type === "text") {
              outputText += part.text;
            }
          }
          if (outputText) break;
        }
      }
    }
    if (!outputText) {
      const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
      if (!hasAssistant) {
        throw new Error(
          `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
        );
      }
    }
    const usage = lastAssistantInfo ? {
      inputTokens: lastAssistantInfo.tokens.input,
      outputTokens: lastAssistantInfo.tokens.output,
      totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
    } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    const costUsd = lastAssistantInfo?.cost;
    const modelStr = options.model || DEFAULT_MODEL3;
    const llmTrace = buildLLMTrace(
      allMessages,
      totalDurationMs,
      modelStr,
      providerID
    );
    const conversation = buildConversation2(allMessages);
    return {
      result: {
        outputText,
        durationMs: totalDurationMs,
        usage,
        costUsd
      },
      llmTrace,
      conversation
    };
  } catch (sdkError) {
    if (timedOut) {
      console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
    }
    const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
    const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
    const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
    console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
    console.error("[SDK-ERROR] Error name:", errorName);
    console.error("[SDK-ERROR] Error message:", errorMessage);
    if (errorStack) {
      console.error("[SDK-ERROR] Stack:", errorStack);
    }
    pushTrace(
      {
        stepNumber: traceStepNumber + 1,
        type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
        outputPreview: JSON.stringify({
          event: "sdk-execution-failed",
          error: errorMessage,
          errorName
        }).slice(0, 2e3)
      },
      true
    );
    // Keep the original error reachable through `cause`; the message format is unchanged.
    throw new Error(
      `OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `\nStack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : ""),
      { cause: sdkError }
    );
  } finally {
    // Runs on both success and failure, so no timer or stream consumer outlives this call.
    if (timeoutHandle) clearTimeout(timeoutHandle);
    if (heartbeatHandle) clearInterval(heartbeatHandle);
    if (eventStreamAbort) eventStreamAbort.abort();
    if (server) {
      try {
        server.close();
        console.log("[SDK-DEBUG] OpenCode server closed");
      } catch (closeError) {
        // Closing is best-effort; report it without masking the real outcome.
        console.warn(
          "[SDK-DEBUG] Failed to close OpenCode server:",
          closeError instanceof Error ? closeError.message : String(closeError)
        );
      }
    }
  }
}
|
|
3134
|
+
|
|
3135
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
3136
|
+
/**
 * Agent adapter that delegates scenario execution to executeWithOpenCode.
 */
var OpenCodeAdapter = class {
  id = "opencode";
  name = "OpenCode";
  supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];

  /**
   * Maps the adapter context onto executeWithOpenCode options and flattens its result.
   *
   * @param {object} context - Provides skills, scenario, cwd, modelConfig, aiGatewayUrl,
   *   aiGatewayHeaders, traceContext, mcps, subAgents, rules and systemPrompt.
   * @returns {Promise<object>} outputText, durationMs, usage, costUsd, llmTrace and conversation.
   */
  async execute(context) {
    const modelSettings = context.modelConfig;
    const runOptions = {
      cwd: context.cwd,
      // modelConfig is optional; absent settings are passed through as undefined.
      model: modelSettings?.model,
      temperature: modelSettings?.temperature,
      maxTurns: modelSettings?.maxTurns,
      aiGatewayUrl: context.aiGatewayUrl,
      aiGatewayHeaders: context.aiGatewayHeaders,
      traceContext: context.traceContext,
      mcps: context.mcps,
      subAgents: context.subAgents,
      rules: context.rules,
      systemPrompt: context.systemPrompt
    };
    const outcome = await executeWithOpenCode(
      context.skills,
      context.scenario,
      runOptions
    );
    const { outputText, durationMs, usage, costUsd } = outcome.result;
    return {
      outputText,
      durationMs,
      // Copy only the three known counters so extra fields on usage are not leaked.
      usage: {
        inputTokens: usage.inputTokens,
        outputTokens: usage.outputTokens,
        totalTokens: usage.totalTokens
      },
      costUsd,
      llmTrace: outcome.llmTrace,
      conversation: outcome.conversation
    };
  }
};
|
|
3186
|
+
// Single OpenCodeAdapter instance created when this module is evaluated.
var openCodeAdapter = new OpenCodeAdapter();
|
|
3187
|
+
|
|
3188
|
+
// src/run-scenario/agents/opencode/index.ts
|
|
3189
|
+
// Register the OpenCode adapter instance with the default adapter registry.
defaultRegistry.register(openCodeAdapter);
|
|
3190
|
+
|
|
3191
|
+
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3192
|
+
var import_ai = require("ai");
|
|
3193
|
+
var import_anthropic = require("@ai-sdk/anthropic");
|
|
3194
|
+
var import_openai = require("@ai-sdk/openai");
|
|
3195
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
3196
|
+
var import_crypto3 = require("crypto");
|
|
3197
|
+
|
|
3198
|
+
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
3199
|
+
var import_mcp = require("@ai-sdk/mcp");
|
|
3200
|
+
var import_mcp_stdio = require("@ai-sdk/mcp/mcp-stdio");
|
|
3201
|
+
/**
 * Connects to every MCP server described by `mcps` and collects their tools.
 *
 * Servers are connected one after another. Each tool is registered under
 * `<serverName>__<toolName>`. If any step fails, the clients opened so far are
 * closed and the original error is rethrown.
 *
 * @param {Array<{config: object}>} mcps - MCP definitions; `config` maps server names to server configs.
 * @param {string} cwd - Working directory used for placeholder resolution and transports.
 * @returns {Promise<{tools: object, clients: Array<object>}>} Merged tools plus the open clients.
 */
async function buildMcpTools(mcps, cwd) {
  const toolRegistry = {};
  const openClients = [];

  // Opens one server connection and merges its tools into the registry.
  const attachServer = async (serverName, serverConfig) => {
    const transport = buildTransport(serverName, serverConfig, cwd);
    const mcpClient = await (0, import_mcp.createMCPClient)({ transport });
    // Track the client before listing tools so a failure there still closes it.
    openClients.push(mcpClient);
    const exposedTools = await mcpClient.tools();
    for (const [toolName, tool] of Object.entries(exposedTools)) {
      toolRegistry[`${serverName}__${toolName}`] = tool;
    }
  };

  try {
    for (const mcp of mcps) {
      const servers = await resolveMcpPlaceholders(
        mcp.config,
        { cwd }
      );
      for (const [serverName, serverConfig] of Object.entries(servers)) {
        await attachServer(serverName, serverConfig);
      }
    }
  } catch (failure) {
    await closeMcpClients(openClients);
    throw failure;
  }
  return { tools: toolRegistry, clients: openClients };
}
|
|
3227
|
+
/**
 * Closes all given MCP clients and waits for every close to settle.
 * A close that rejects does not cause this function to reject.
 *
 * @param {Array<{close: Function}>} clients - Clients to close.
 * @returns {Promise<void>}
 */
async function closeMcpClients(clients) {
  const pendingCloses = clients.map((client) => client.close());
  await Promise.allSettled(pendingCloses);
}
|
|
3230
|
+
/**
 * Builds the MCP transport for one server config.
 *
 * Selection order: stdio (type "stdio" or a `command` is present), then http
 * (type "http"), then sse (type "sse" or a `url` is present).
 *
 * @param {string} serverName - Used only in the error message.
 * @param {object} config - Server config; reads type, headers, command, args, env and url.
 * @param {string} cwd - Working directory for stdio servers; also exported to them as PWD.
 * @returns {object} A stdio transport instance, or a `{ type, url, headers? }` descriptor.
 * @throws {Error} When no transport can be derived from the config.
 */
function buildTransport(serverName, config, cwd) {
  const { type, headers } = config;

  // Remote descriptors only carry `headers` when some were configured.
  const remoteTransport = (kind) => headers ? { type: kind, url: config.url, headers } : { type: kind, url: config.url };

  const wantsStdio = type === "stdio" || Boolean(config.command);
  if (wantsStdio) {
    const stdioOptions = {
      command: config.command,
      args: config.args ?? [],
      env: { ...config.env, PWD: cwd },
      cwd
    };
    return new import_mcp_stdio.Experimental_StdioMCPTransport(stdioOptions);
  }
  if (type === "http") {
    return remoteTransport("http");
  }
  const wantsSse = type === "sse" || Boolean(config.url);
  if (wantsSse) {
    return remoteTransport("sse");
  }
  throw new Error(
    `MCP server "${serverName}" has unsupported transport config (type=${type ?? "unset"}). Expected type "stdio", "http", or "sse", or a config with "command" or "url".`
  );
}
|
|
3259
|
+
|
|
3260
|
+
// src/run-scenario/agents/shared/detect-tool-error.ts
|
|
3261
|
+
/**
 * Detects an MCP-style error result in a tool output.
 *
 * Handles two shapes: an object whose `isError` is true, and a string containing an
 * `"isError":true` marker that is then parsed as JSON.
 *
 * @param {unknown} output - Raw tool output.
 * @returns {string|null} Error text, or null when no error is detected.
 */
function detectMcpToolError(output) {
  if (output === null || output === undefined) return null;

  if (typeof output === "object") {
    const flagged = "isError" in output && output.isError === true;
    return flagged ? extractErrorText(output.content) : null;
  }
  if (typeof output !== "string") return null;

  // Cheap substring check first, so ordinary string outputs are never JSON-parsed.
  const markers = ['"isError":true', '"isError": true'];
  if (!markers.some((marker) => output.includes(marker))) return null;

  try {
    const payload = JSON.parse(output);
    return payload.isError === true ? extractErrorText(payload.content) : null;
  } catch {
    // The marker is present but the string is not usable JSON: report a truncated copy.
    return output.slice(0, 500);
  }
}
|
|
3282
|
+
/**
 * Extracts a readable error message from an MCP result's `content` value.
 *
 * @param {unknown} content - Expected to be an array of content items; items whose
 *   `text` is a string are joined with newlines. Null or non-text items are ignored.
 * @returns {string} Up to 500 characters of joined text, or "Tool call failed" when
 *   no text is available.
 */
function extractErrorText(content) {
  if (Array.isArray(content)) {
    // `item?.text` tolerates null/undefined entries, which previously threw a TypeError.
    const text = content.filter((item) => typeof item?.text === "string").map((item) => item.text).join("\n");
    if (text) return text.slice(0, 500);
  }
  return "Tool call failed";
}
|
|
3289
|
+
|
|
3290
|
+
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
3291
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
3292
|
+
// Provider identifier that extractGatewayCost compares its `provider` argument against.
var PROVIDER_ANTHROPIC = "anthropic";
|
|
3293
|
+
// Fallback price table keyed by model id. calculateFromPricing divides token counts
// by 1e6 before applying these rates, so each value is a price per million tokens
// (presumably USD, matching the gateway's `total_cost_usd` — confirm).
var MODEL_PRICING = {
  // Anthropic: Claude 4.6
  "claude-sonnet-4-6": { input: 3, output: 15 },
  "claude-opus-4-6": { input: 15, output: 75 },

  // Anthropic: Claude 4.5
  "claude-opus-4-5": { input: 5, output: 25 },
  "claude-sonnet-4-5": { input: 3, output: 15 },
  "claude-haiku-4-5": { input: 1, output: 5 },

  // Anthropic: Claude 4
  "claude-opus-4": { input: 15, output: 75 },
  "claude-sonnet-4": { input: 3, output: 15 },

  // OpenAI: GPT-5
  "gpt-5": { input: 1.25, output: 10 },
  "gpt-5-mini": { input: 0.25, output: 2 },
  "gpt-5-nano": { input: 0.05, output: 0.4 },

  // OpenAI: GPT-4.1
  "gpt-4.1": { input: 2, output: 8 },
  "gpt-4.1-mini": { input: 0.4, output: 1.6 },
  "gpt-4.1-nano": { input: 0.1, output: 0.4 },

  // OpenAI: GPT-4o
  "gpt-4o": { input: 2.5, output: 10 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },

  // OpenAI: reasoning models
  "o3": { input: 2, output: 8 },
  "o4-mini": { input: 1.1, output: 4.4 },
  "o3-mini": { input: 1.1, output: 4.4 },
  "o1": { input: 15, output: 60 }
};
|
|
3321
|
+
/**
 * Reads the cost reported for a step by the gateway, if there is one.
 *
 * For the Anthropic provider the value is read from
 * `step.providerMetadata.anthropic.usage.total_cost_usd`; for any other provider
 * from `step.response.body.total_cost_usd`.
 *
 * @param {object} step - Step result to inspect.
 * @param {string} provider - Provider identifier.
 * @returns {number|undefined} The reported cost when it is a number greater than 0, else undefined.
 */
function extractGatewayCost(step, provider) {
  let reported;
  try {
    reported = provider === PROVIDER_ANTHROPIC ? step.providerMetadata?.anthropic?.usage?.total_cost_usd : step.response?.body?.total_cost_usd;
  } catch {
    // A missing or malformed step simply means "no gateway cost available".
    return void 0;
  }
  if (typeof reported !== "number") return void 0;
  return reported > 0 ? reported : void 0;
}
|
|
3337
|
+
/**
 * Estimates a step's cost from the MODEL_PRICING table.
 *
 * The normalized model id is looked up exactly first. Failing that, the table key
 * that is the longest prefix of the id is used, so a variant such as
 * "gpt-5-mini-..." resolves to "gpt-5-mini" rather than the shorter "gpt-5"
 * regardless of the order in which the table lists them.
 *
 * @param {string} modelId - Model identifier; normalized with normalizeModelId.
 * @param {{prompt: number, completion: number}} tokenUsage - Token counts for the step.
 * @returns {number} Estimated cost (rates are applied per million tokens), or 0 when
 *   the model has no pricing entry.
 */
function calculateFromPricing(modelId, tokenUsage) {
  const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
  let pricing = MODEL_PRICING[normalized];
  if (!pricing) {
    let bestKey = "";
    for (const key of Object.keys(MODEL_PRICING)) {
      // Prefer the most specific (longest) matching prefix.
      if (key.length > bestKey.length && normalized.startsWith(key)) {
        bestKey = key;
      }
    }
    pricing = bestKey ? MODEL_PRICING[bestKey] : void 0;
  }
  if (!pricing) return 0;
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
}
|
|
3343
|
+
/**
 * Returns the cost of a step, preferring the gateway-reported cost and falling back
 * to the pricing table when the gateway reported none.
 *
 * @param {object} step - Step result passed to extractGatewayCost.
 * @param {string} modelId - Model identifier for the pricing fallback.
 * @param {string} provider - Provider identifier passed to extractGatewayCost.
 * @param {{prompt: number, completion: number}} tokenUsage - Token counts for the fallback.
 * @returns {number} Cost of the step.
 */
function calculateStepCost(step, modelId, provider, tokenUsage) {
  const gatewayCost = extractGatewayCost(step, provider);
  if (gatewayCost !== null && gatewayCost !== void 0) {
    return gatewayCost;
  }
  return calculateFromPricing(modelId, tokenUsage);
}
|
|
3346
|
+
|
|
3347
|
+
// src/run-scenario/agents/simple-agent/build-conversation.ts
|
|
3348
|
+
function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
3349
|
+
const messages = [];
|
|
3350
|
+
messages.push({
|
|
3351
|
+
role: "user",
|
|
3352
|
+
content: [{ type: "text", text: triggerPrompt }],
|
|
3353
|
+
timestamp: new Date(executionStartMs).toISOString()
|
|
3354
|
+
});
|
|
3355
|
+
for (let i = 0; i < steps.length; i++) {
|
|
3356
|
+
const step = steps[i];
|
|
3357
|
+
const stepTimestamp = estimateStepTimestamp(
|
|
3358
|
+
executionStartMs,
|
|
3359
|
+
i,
|
|
3360
|
+
steps.length
|
|
3361
|
+
);
|
|
3362
|
+
const assistantContent = [];
|
|
3363
|
+
if (step.reasoningText) {
|
|
3364
|
+
assistantContent.push({ type: "thinking", thinking: step.reasoningText });
|
|
3365
|
+
}
|
|
3366
|
+
if (step.text) {
|
|
3367
|
+
assistantContent.push({ type: "text", text: step.text });
|
|
3368
|
+
}
|
|
3369
|
+
for (const tc of step.toolCalls) {
|
|
3370
|
+
assistantContent.push({
|
|
3371
|
+
type: "tool_use",
|
|
3372
|
+
toolName: tc.toolName,
|
|
3373
|
+
toolId: tc.toolCallId,
|
|
3374
|
+
input: tc.input
|
|
3375
|
+
});
|
|
3376
|
+
}
|
|
2334
3377
|
if (assistantContent.length > 0) {
|
|
2335
3378
|
messages.push({
|
|
2336
3379
|
role: "assistant",
|
|
@@ -2382,7 +3425,7 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
2382
3425
|
apiKey: "proxy-auth",
|
|
2383
3426
|
headers
|
|
2384
3427
|
});
|
|
2385
|
-
if ([...
|
|
3428
|
+
if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
2386
3429
|
(id) => modelId === id || modelId.startsWith(id)
|
|
2387
3430
|
)) {
|
|
2388
3431
|
return openai.responses(modelId);
|
|
@@ -2390,7 +3433,7 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
2390
3433
|
return openai.chat(modelId);
|
|
2391
3434
|
}
|
|
2392
3435
|
function isClaudeModelId(modelId) {
|
|
2393
|
-
return
|
|
3436
|
+
return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
|
|
2394
3437
|
(id) => modelId === id || modelId.startsWith(id)
|
|
2395
3438
|
);
|
|
2396
3439
|
}
|
|
@@ -2426,7 +3469,7 @@ async function executeWithAiSdk(context) {
|
|
|
2426
3469
|
}
|
|
2427
3470
|
try {
|
|
2428
3471
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
2429
|
-
const isResponsesAPI = [...
|
|
3472
|
+
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
2430
3473
|
(id) => modelConfig.model === id || modelConfig.model.startsWith(id)
|
|
2431
3474
|
);
|
|
2432
3475
|
const supportsThinking = isAnthropic || isResponsesAPI;
|
|
@@ -2462,7 +3505,7 @@ async function executeWithAiSdk(context) {
|
|
|
2462
3505
|
outputTokens: result.usage.outputTokens ?? 0,
|
|
2463
3506
|
totalTokens: result.usage.totalTokens ?? 0
|
|
2464
3507
|
};
|
|
2465
|
-
const llmTrace =
|
|
3508
|
+
const llmTrace = buildLLMTrace2(
|
|
2466
3509
|
result.steps,
|
|
2467
3510
|
durationMs,
|
|
2468
3511
|
usage,
|
|
@@ -2474,7 +3517,7 @@ async function executeWithAiSdk(context) {
|
|
|
2474
3517
|
emitStepEvents(traceContext, result.steps, startTime);
|
|
2475
3518
|
emitCompletionEvent(traceContext, result.steps.length + 1);
|
|
2476
3519
|
}
|
|
2477
|
-
const conversation =
|
|
3520
|
+
const conversation = buildConversation3(
|
|
2478
3521
|
scenario.triggerPrompt,
|
|
2479
3522
|
result.steps,
|
|
2480
3523
|
startTime
|
|
@@ -2518,7 +3561,7 @@ function findToolResultError(step) {
|
|
|
2518
3561
|
}
|
|
2519
3562
|
return null;
|
|
2520
3563
|
}
|
|
2521
|
-
function
|
|
3564
|
+
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
|
|
2522
3565
|
const totalStepTokens = steps.reduce(
|
|
2523
3566
|
(sum, s) => sum + (s.usage.totalTokens ?? 0),
|
|
2524
3567
|
0
|
|
@@ -2536,10 +3579,10 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2536
3579
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
2537
3580
|
const toolResultError = findToolResultError(step);
|
|
2538
3581
|
return {
|
|
2539
|
-
id: (0,
|
|
3582
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2540
3583
|
stepNumber: i + 1,
|
|
2541
3584
|
turnIndex: i,
|
|
2542
|
-
type: step.toolCalls.length > 0 ?
|
|
3585
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
2543
3586
|
model: modelId,
|
|
2544
3587
|
provider,
|
|
2545
3588
|
startedAt: new Date(
|
|
@@ -2562,7 +3605,7 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2562
3605
|
total: totalUsage.totalTokens
|
|
2563
3606
|
};
|
|
2564
3607
|
return {
|
|
2565
|
-
id: (0,
|
|
3608
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2566
3609
|
steps: traceSteps,
|
|
2567
3610
|
summary: {
|
|
2568
3611
|
totalSteps: traceSteps.length,
|
|
@@ -2591,7 +3634,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
2591
3634
|
targetId: traceContext.targetId,
|
|
2592
3635
|
targetName: traceContext.targetName,
|
|
2593
3636
|
stepNumber: 0,
|
|
2594
|
-
type:
|
|
3637
|
+
type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
|
|
2595
3638
|
outputPreview: "Starting Simple Agent execution...",
|
|
2596
3639
|
elapsedMs: Date.now() - startTime,
|
|
2597
3640
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -2615,7 +3658,7 @@ function emitStepEvents(traceContext, steps, startTime) {
|
|
|
2615
3658
|
targetId: traceContext.targetId,
|
|
2616
3659
|
targetName: traceContext.targetName,
|
|
2617
3660
|
stepNumber: i + 1,
|
|
2618
|
-
type: isToolStep ?
|
|
3661
|
+
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
2619
3662
|
toolName: firstToolCall?.toolName,
|
|
2620
3663
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
2621
3664
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -2638,7 +3681,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
2638
3681
|
targetId: traceContext.targetId,
|
|
2639
3682
|
targetName: traceContext.targetName,
|
|
2640
3683
|
stepNumber,
|
|
2641
|
-
type:
|
|
3684
|
+
type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
2642
3685
|
outputPreview: "Scenario execution completed",
|
|
2643
3686
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2644
3687
|
isComplete: true
|
|
@@ -2665,7 +3708,7 @@ defaultRegistry.register(simpleAgentAdapter);
|
|
|
2665
3708
|
|
|
2666
3709
|
// src/run-scenario/file-diff.ts
|
|
2667
3710
|
var import_fs2 = require("fs");
|
|
2668
|
-
var
|
|
3711
|
+
var import_path10 = require("path");
|
|
2669
3712
|
|
|
2670
3713
|
// ../../node_modules/diff/lib/index.mjs
|
|
2671
3714
|
function Diff() {
|
|
@@ -2841,7 +3884,7 @@ Diff.prototype = {
|
|
|
2841
3884
|
tokenize: function tokenize(value) {
|
|
2842
3885
|
return Array.from(value);
|
|
2843
3886
|
},
|
|
2844
|
-
join: function
|
|
3887
|
+
join: function join8(chars) {
|
|
2845
3888
|
return chars.join("");
|
|
2846
3889
|
},
|
|
2847
3890
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -3281,8 +4324,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
3281
4324
|
}
|
|
3282
4325
|
const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
|
|
3283
4326
|
for (const entry of entries) {
|
|
3284
|
-
const fullPath = (0,
|
|
3285
|
-
const relativePath = (0,
|
|
4327
|
+
const fullPath = (0, import_path10.join)(dir, entry.name);
|
|
4328
|
+
const relativePath = (0, import_path10.relative)(base, fullPath);
|
|
3286
4329
|
if (shouldIgnore(entry.name)) {
|
|
3287
4330
|
continue;
|
|
3288
4331
|
}
|
|
@@ -3390,11 +4433,11 @@ function extractTemplateFiles(before, after) {
|
|
|
3390
4433
|
}
|
|
3391
4434
|
|
|
3392
4435
|
// src/run-scenario/run-agent-with-context.ts
|
|
3393
|
-
var
|
|
3394
|
-
var DEFAULT_AGENT_COMMAND =
|
|
4436
|
+
var import_evalforge_types12 = require("@wix/evalforge-types");
|
|
4437
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
|
|
3395
4438
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
3396
4439
|
const agent = evalData.agent ?? void 0;
|
|
3397
|
-
const isSDK = agent?.agentType ===
|
|
4440
|
+
const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
|
|
3398
4441
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
3399
4442
|
const adapter = getAdapter(identifier);
|
|
3400
4443
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -3429,7 +4472,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
3429
4472
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
3430
4473
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
3431
4474
|
return {
|
|
3432
|
-
id: (0,
|
|
4475
|
+
id: (0, import_crypto4.randomUUID)(),
|
|
3433
4476
|
targetId,
|
|
3434
4477
|
targetName,
|
|
3435
4478
|
scenarioId: scenario.id,
|
|
@@ -3480,7 +4523,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3480
4523
|
})),
|
|
3481
4524
|
durationMs: partialResult.duration
|
|
3482
4525
|
};
|
|
3483
|
-
const defaultJudgeModel =
|
|
4526
|
+
const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
|
|
3484
4527
|
const assertionContext = {
|
|
3485
4528
|
workDir,
|
|
3486
4529
|
defaultJudgeModel,
|
|
@@ -3495,10 +4538,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3495
4538
|
assertionContext
|
|
3496
4539
|
) : [];
|
|
3497
4540
|
const passed = assertionResults.filter(
|
|
3498
|
-
(r) => r.status ===
|
|
4541
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
|
|
3499
4542
|
).length;
|
|
3500
4543
|
const failed = assertionResults.filter(
|
|
3501
|
-
(r) => r.status ===
|
|
4544
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
|
|
3502
4545
|
).length;
|
|
3503
4546
|
const total = assertionResults.length;
|
|
3504
4547
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -3512,7 +4555,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3512
4555
|
}
|
|
3513
4556
|
|
|
3514
4557
|
// src/error-reporter.ts
|
|
3515
|
-
var
|
|
4558
|
+
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
3516
4559
|
function formatError(error, phase, context) {
|
|
3517
4560
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
3518
4561
|
if (error instanceof Error) {
|
|
@@ -3757,7 +4800,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
3757
4800
|
};
|
|
3758
4801
|
try {
|
|
3759
4802
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
3760
|
-
status:
|
|
4803
|
+
status: import_evalforge_types15.EvalStatus.COMPLETED,
|
|
3761
4804
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
3762
4805
|
});
|
|
3763
4806
|
} catch (updateErr) {
|
|
@@ -3798,7 +4841,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
3798
4841
|
authToken: config.authToken
|
|
3799
4842
|
});
|
|
3800
4843
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
3801
|
-
status:
|
|
4844
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
3802
4845
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3803
4846
|
jobError,
|
|
3804
4847
|
jobStatus: "FAILED"
|
|
@@ -3821,7 +4864,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
3821
4864
|
authToken
|
|
3822
4865
|
});
|
|
3823
4866
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
3824
|
-
status:
|
|
4867
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
3825
4868
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3826
4869
|
jobError: `Config load failed, then: ${jobError}`,
|
|
3827
4870
|
jobStatus: "FAILED"
|