@wix/evalforge-evaluator 0.111.0 → 0.113.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +1242 -117
- package/build/index.js.map +4 -4
- package/build/index.mjs +1241 -107
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +1 -1
- package/build/types/run-scenario/agents/index.d.ts +2 -0
- package/build/types/run-scenario/agents/opencode/build-conversation.d.ts +7 -0
- package/build/types/run-scenario/agents/opencode/build-trace.d.ts +13 -0
- package/build/types/run-scenario/agents/opencode/config.d.ts +27 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/index.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +18 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +32 -0
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +12 -0
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +7 -0
- package/package.json +14 -13
package/build/index.mjs
CHANGED
|
@@ -581,7 +581,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
581
581
|
}
|
|
582
582
|
|
|
583
583
|
// src/run-scenario/run-agent-with-context.ts
|
|
584
|
-
import { randomUUID as
|
|
584
|
+
import { randomUUID as randomUUID4 } from "crypto";
|
|
585
585
|
|
|
586
586
|
// src/run-scenario/agents/registry.ts
|
|
587
587
|
var AgentAdapterRegistry = class {
|
|
@@ -1214,10 +1214,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1214
1214
|
}
|
|
1215
1215
|
const startTime = /* @__PURE__ */ new Date();
|
|
1216
1216
|
const allMessages = [];
|
|
1217
|
-
const { mkdir: mkdirAsync, writeFile:
|
|
1217
|
+
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1218
1218
|
const claudeDir = `${options.cwd}/.claude`;
|
|
1219
1219
|
await mkdirAsync(claudeDir, { recursive: true });
|
|
1220
|
-
await
|
|
1220
|
+
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1221
1221
|
flag: "wx"
|
|
1222
1222
|
}).catch(() => {
|
|
1223
1223
|
});
|
|
@@ -1253,7 +1253,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1253
1253
|
"Edit",
|
|
1254
1254
|
"Bash",
|
|
1255
1255
|
"Glob",
|
|
1256
|
-
"Grep"
|
|
1256
|
+
"Grep",
|
|
1257
|
+
"Agent",
|
|
1258
|
+
"WebFetch",
|
|
1259
|
+
"WebSearch"
|
|
1257
1260
|
];
|
|
1258
1261
|
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
1259
1262
|
const queryOptions = {
|
|
@@ -1888,13 +1891,15 @@ function extractTotalUsage(result) {
|
|
|
1888
1891
|
}
|
|
1889
1892
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
1890
1893
|
const totalCost = usage.costUsd ?? 0;
|
|
1891
|
-
const
|
|
1892
|
-
|
|
1894
|
+
const effectiveInput = (s) => s.usage.inputTokens + (s.usage.cacheReadTokens ?? 0) + (s.usage.cacheWriteTokens ?? 0);
|
|
1895
|
+
const totalStepEffectiveInput = steps.reduce(
|
|
1896
|
+
(sum, s) => sum + effectiveInput(s),
|
|
1893
1897
|
0
|
|
1894
1898
|
);
|
|
1895
1899
|
const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
|
|
1896
|
-
const
|
|
1897
|
-
const
|
|
1900
|
+
const authoritativeEffectiveInput = usage.inputTokens + (usage.cacheReadTokens ?? 0) + (usage.cacheWriteTokens ?? 0);
|
|
1901
|
+
const inputTokensDuplicated = authoritativeEffectiveInput > 0 && totalStepEffectiveInput > authoritativeEffectiveInput * 1.2;
|
|
1902
|
+
const traceSteps = steps.flatMap((step, turnIndex) => {
|
|
1898
1903
|
let stepPromptTokens;
|
|
1899
1904
|
let stepOutputTokens;
|
|
1900
1905
|
let proportion;
|
|
@@ -1903,34 +1908,128 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1903
1908
|
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
1904
1909
|
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
1905
1910
|
} else {
|
|
1906
|
-
|
|
1907
|
-
|
|
1911
|
+
const stepEffective = effectiveInput(step);
|
|
1912
|
+
proportion = totalStepEffectiveInput > 0 ? stepEffective / totalStepEffectiveInput : 0;
|
|
1913
|
+
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
1908
1914
|
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
1909
1915
|
}
|
|
1910
|
-
const stepTotalTokens = stepPromptTokens + stepOutputTokens;
|
|
1911
1916
|
const costProportion = proportion;
|
|
1912
|
-
const
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1917
|
+
const toolCallCount = step.toolCalls?.length ?? 0;
|
|
1918
|
+
const isSuccess = step.finishReason !== "error" && !step.hasToolError;
|
|
1919
|
+
const errorMsg = step.hasToolError ? step.toolErrorContent ?? "Tool call failed" : step.finishReason === "error" ? "Generation failed" : void 0;
|
|
1920
|
+
const subSteps = [];
|
|
1921
|
+
const stepCost = totalCost * costProportion;
|
|
1922
|
+
const hasThinking = !!step.thinking;
|
|
1923
|
+
const hasText = !!step.text;
|
|
1924
|
+
const thinkingSubSteps = hasThinking ? 1 : 0;
|
|
1925
|
+
const toolSubSteps = toolCallCount > 0 ? toolCallCount : 0;
|
|
1926
|
+
const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
|
|
1927
|
+
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
1928
|
+
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
1929
|
+
subSteps.push({
|
|
1930
|
+
id: randomUUID(),
|
|
1931
|
+
stepNumber: 0,
|
|
1932
|
+
// renumbered below
|
|
1933
|
+
turnIndex,
|
|
1934
|
+
type: LLMStepType.THINKING,
|
|
1935
|
+
model,
|
|
1936
|
+
provider: "anthropic",
|
|
1937
|
+
startedAt: step.startedAt.toISOString(),
|
|
1938
|
+
durationMs: Math.round(step.durationMs / totalSubSteps),
|
|
1939
|
+
tokenUsage: {
|
|
1940
|
+
prompt: Math.round(stepPromptTokens / totalSubSteps),
|
|
1941
|
+
completion: Math.round(stepOutputTokens / totalSubSteps),
|
|
1942
|
+
total: Math.round(
|
|
1943
|
+
(stepPromptTokens + stepOutputTokens) / totalSubSteps
|
|
1944
|
+
)
|
|
1945
|
+
},
|
|
1946
|
+
costUsd: stepCost / totalSubSteps,
|
|
1947
|
+
outputPreview: step.thinking?.slice(0, 200),
|
|
1948
|
+
success: isSuccess,
|
|
1949
|
+
error: errorMsg
|
|
1950
|
+
});
|
|
1951
|
+
}
|
|
1952
|
+
if (toolCallCount > 0) {
|
|
1953
|
+
for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
|
|
1954
|
+
const tc = step.toolCalls[tcIdx];
|
|
1955
|
+
const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
|
|
1956
|
+
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
1957
|
+
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
1958
|
+
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
1959
|
+
subSteps.push({
|
|
1960
|
+
id: randomUUID(),
|
|
1961
|
+
stepNumber: 0,
|
|
1962
|
+
turnIndex,
|
|
1963
|
+
type: LLMStepType.TOOL_USE,
|
|
1964
|
+
model,
|
|
1965
|
+
provider: "anthropic",
|
|
1966
|
+
startedAt: step.startedAt.toISOString(),
|
|
1967
|
+
durationMs: isLast ? step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(step.durationMs * remainingFraction * toolFraction),
|
|
1968
|
+
tokenUsage: {
|
|
1969
|
+
prompt: Math.round(
|
|
1970
|
+
stepPromptTokens * remainingFraction * toolFraction
|
|
1971
|
+
),
|
|
1972
|
+
completion: Math.round(
|
|
1973
|
+
stepOutputTokens * remainingFraction * toolFraction
|
|
1974
|
+
),
|
|
1975
|
+
total: Math.round(
|
|
1976
|
+
(stepPromptTokens + stepOutputTokens) * remainingFraction * toolFraction
|
|
1977
|
+
)
|
|
1978
|
+
},
|
|
1979
|
+
costUsd: stepCost * remainingFraction * toolFraction,
|
|
1980
|
+
toolName: tc.toolName,
|
|
1981
|
+
toolArguments: JSON.stringify(tc.args),
|
|
1982
|
+
outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
|
|
1983
|
+
success: isSuccess,
|
|
1984
|
+
error: errorMsg
|
|
1985
|
+
});
|
|
1986
|
+
}
|
|
1987
|
+
}
|
|
1988
|
+
if (hasText && toolCallCount > 0) {
|
|
1989
|
+
subSteps.push({
|
|
1990
|
+
id: randomUUID(),
|
|
1991
|
+
stepNumber: 0,
|
|
1992
|
+
turnIndex,
|
|
1993
|
+
type: LLMStepType.COMPLETION,
|
|
1994
|
+
model,
|
|
1995
|
+
provider: "anthropic",
|
|
1996
|
+
startedAt: step.startedAt.toISOString(),
|
|
1997
|
+
durationMs: step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
|
|
1998
|
+
tokenUsage: {
|
|
1999
|
+
prompt: stepPromptTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
|
|
2000
|
+
completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
|
|
2001
|
+
total: stepPromptTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
|
|
2002
|
+
},
|
|
2003
|
+
costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
|
|
2004
|
+
outputPreview: step.text?.slice(0, 200),
|
|
2005
|
+
success: isSuccess,
|
|
2006
|
+
error: errorMsg
|
|
2007
|
+
});
|
|
2008
|
+
}
|
|
2009
|
+
if (subSteps.length === 0) {
|
|
2010
|
+
const stepType = hasThinking && !hasText ? LLMStepType.THINKING : LLMStepType.COMPLETION;
|
|
2011
|
+
subSteps.push({
|
|
2012
|
+
id: randomUUID(),
|
|
2013
|
+
stepNumber: 0,
|
|
2014
|
+
turnIndex,
|
|
2015
|
+
type: stepType,
|
|
2016
|
+
model,
|
|
2017
|
+
provider: "anthropic",
|
|
2018
|
+
startedAt: step.startedAt.toISOString(),
|
|
2019
|
+
durationMs: step.durationMs,
|
|
2020
|
+
tokenUsage: {
|
|
2021
|
+
prompt: stepPromptTokens,
|
|
2022
|
+
completion: stepOutputTokens,
|
|
2023
|
+
total: stepPromptTokens + stepOutputTokens
|
|
2024
|
+
},
|
|
2025
|
+
costUsd: stepCost,
|
|
2026
|
+
outputPreview: (step.text || step.thinking)?.slice(0, 200),
|
|
2027
|
+
success: isSuccess,
|
|
2028
|
+
error: errorMsg
|
|
2029
|
+
});
|
|
2030
|
+
}
|
|
2031
|
+
return subSteps;
|
|
2032
|
+
}).map((s, i) => ({ ...s, stepNumber: i + 1 }));
|
|
1934
2033
|
const finalTokens = {
|
|
1935
2034
|
prompt: usage.inputTokens,
|
|
1936
2035
|
completion: usage.outputTokens,
|
|
@@ -1952,6 +2051,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1952
2051
|
}
|
|
1953
2052
|
const summary = {
|
|
1954
2053
|
totalSteps: traceSteps.length,
|
|
2054
|
+
totalTurns: steps.length,
|
|
1955
2055
|
totalDurationMs,
|
|
1956
2056
|
totalTokens: finalTokens,
|
|
1957
2057
|
totalCostUsd: totalCost,
|
|
@@ -2037,6 +2137,1058 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
2037
2137
|
// src/run-scenario/agents/claude-code/index.ts
|
|
2038
2138
|
defaultRegistry.register(claudeCodeAdapter);
|
|
2039
2139
|
|
|
2140
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
2141
|
+
import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
|
|
2142
|
+
|
|
2143
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2144
|
+
import {
|
|
2145
|
+
ClaudeModel as ClaudeModel3,
|
|
2146
|
+
DEFAULT_EVALUATOR_SYSTEM_PROMPT as DEFAULT_EVALUATOR_SYSTEM_PROMPT2,
|
|
2147
|
+
LiveTraceEventType as LiveTraceEventType2
|
|
2148
|
+
} from "@wix/evalforge-types";
|
|
2149
|
+
|
|
2150
|
+
// src/run-scenario/agents/opencode/write-skills.ts
|
|
2151
|
+
import { mkdir as mkdir5 } from "fs/promises";
|
|
2152
|
+
import { join as join6 } from "path";
|
|
2153
|
+
import { fetchGitHubFolder as fetchGitHubFolder3 } from "@wix/evalforge-github-client";
|
|
2154
|
+
/**
 * Writes every skill in `skills` by delegating to `writeSkillToFilesystem2`.
 *
 * All per-skill writes are started up front and awaited together, so the
 * returned promise rejects as soon as any single write rejects.
 *
 * @param {string} cwd - Base directory forwarded to each per-skill write.
 * @param {Array<object>} skills - Skill descriptors to write.
 * @param {Function} [fetchFn] - Fetcher forwarded to each per-skill write.
 * @returns {Promise<void>}
 */
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = fetchGitHubFolder3) {
  const pendingWrites = skills.map(function startWrite(skill) {
    return writeSkillToFilesystem2(cwd, skill, fetchFn);
  });
  await Promise.all(pendingWrites);
}
|
|
2159
|
+
/**
 * Writes one skill into `<cwd>/.opencode/skills/<skill.name>`.
 *
 * Files are taken from `skill.latestVersion.files` when that array is
 * non-empty; otherwise they are fetched with `fetchFn(skill.source, …)`.
 *
 * @param {string} cwd - Base directory under which `.opencode/skills` is created.
 * @param {object} skill - Skill descriptor; reads `name`, `latestVersion?.files` and `source`.
 * @param {Function} fetchFn - Async function called as `fetchFn(source, { userAgent })`;
 *   its resolved value is passed to `writeFilesToDirectory`.
 * @returns {Promise<void>}
 * @throws {Error} When fetching/writing from `source` fails (the original error
 *   is attached as `cause`), or when the skill has neither files nor a source.
 */
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
  const skillName = skill.name;
  // NOTE(review): skill.name is joined into the path as-is — confirm names are
  // validated upstream so values such as "../x" cannot escape the skills dir.
  const skillDir = join6(cwd, ".opencode", "skills", skillName);
  await mkdir5(skillDir, { recursive: true });
  const version = skill.latestVersion;
  if (version?.files && version.files.length > 0) {
    await writeFilesToDirectory(skillDir, version.files);
    console.log(
      `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
    );
  } else if (skill.source) {
    try {
      const files = await fetchFn(skill.source, {
        userAgent: "EvalForge-Evaluator"
      });
      await writeFilesToDirectory(skillDir, files);
      console.log(
        `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
      );
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
      );
      // Keep the underlying failure (and its stack) reachable for callers.
      throw new Error(
        `Failed to write skill ${skillName} to filesystem: ${message}`,
        { cause: error }
      );
    }
  } else {
    throw new Error(`Skill ${skillName} has no files and no source configured`);
  }
}
|
|
2191
|
+
|
|
2192
|
+
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
2193
|
+
import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
|
|
2194
|
+
import { join as join7 } from "path";
|
|
2195
|
+
import {
|
|
2196
|
+
fetchGitHubFile as fetchGitHubFile2
|
|
2197
|
+
} from "@wix/evalforge-github-client";
|
|
2198
|
+
var AGENTS_DIR2 = ".opencode/agents";
|
|
2199
|
+
/**
 * Derives a unique, filesystem-friendly base filename for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become "-", every character outside
 * `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed. An empty
 * result falls back to `sub-agent-<index>`.
 *
 * Repeated names get a numeric suffix (`foo`, `foo-2`, `foo-3`, …). Every name
 * handed out is also recorded in `nameCount`, so a later agent whose own name
 * is literally `foo-2` cannot collide with a suffix generated earlier (and a
 * generated suffix skips over names that are already taken).
 *
 * @param {string} name - Human-readable agent name (may be empty/undefined).
 * @param {number} index - Position of the agent, used for the fallback name.
 * @param {Map<string, number>} nameCount - Mutable registry shared across calls.
 * @returns {string} Filename without extension, unique within `nameCount`.
 */
function toAgentFilename2(name, index, nameCount) {
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
  const seen = nameCount.get(base) ?? 0;
  if (seen === 0) {
    nameCount.set(base, 1);
    return base;
  }
  // The base is taken: find the first free numbered variant.
  let suffix = seen + 1;
  let candidate = `${base}-${suffix}`;
  while (nameCount.has(candidate)) {
    suffix += 1;
    candidate = `${base}-${suffix}`;
  }
  nameCount.set(base, suffix);
  // Reserve the generated name so it cannot be handed out again as a base.
  nameCount.set(candidate, 1);
  return candidate;
}
|
|
2205
|
+
/**
 * Resolves the markdown content for a sub-agent definition.
 *
 * When `agent.source` is set the content is fetched with
 * `fetchFn(agent.source, { userAgent })`; otherwise the inline
 * `agent.subAgentMd` is used.
 *
 * @param {object} agent - Sub-agent descriptor; reads `name`, `source` and `subAgentMd`.
 * @param {Function} fetchFn - Async fetcher resolving to the file content.
 * @returns {Promise<string>} The resolved content. Missing inline content
 *   resolves to an empty string so the caller really writes a blank file, as
 *   the warning states, instead of handing `undefined` to `writeFile`.
 * @throws {Error} When the fetch fails; the original error is attached as `cause`.
 */
async function resolveSubAgentContent2(agent, fetchFn) {
  if (agent.source) {
    try {
      const content = await fetchFn(agent.source, {
        userAgent: "EvalForge-Evaluator"
      });
      console.log(
        `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
      );
      return content;
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
      );
      // Preserve the underlying failure for callers that inspect it.
      throw new Error(
        `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`,
        { cause: error }
      );
    }
  }
  if (!agent.subAgentMd) {
    console.warn(
      `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
    );
  }
  // undefined/null would make the subsequent writeFile call throw.
  return agent.subAgentMd ?? "";
}
|
|
2232
|
+
/**
 * Writes one `<filename>.md` file per sub-agent into `<cwd>/.opencode/agents`.
 *
 * Does nothing when `subAgents` is empty. Agents are processed one after
 * another, in order; filenames come from `toAgentFilename2` and content from
 * `resolveSubAgentContent2`.
 *
 * @param {string} cwd - Base directory.
 * @param {Array<object>} subAgents - Sub-agent descriptors.
 * @param {Function} [fetchFn] - Fetcher forwarded to `resolveSubAgentContent2`.
 * @returns {Promise<void>}
 */
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHubFile2) {
  const total = subAgents.length;
  if (total === 0) {
    return;
  }
  const targetDir = join7(cwd, AGENTS_DIR2);
  await mkdir6(targetDir, { recursive: true });
  // Shared across iterations so repeated names receive distinct filenames.
  const usedNames = /* @__PURE__ */ new Map();
  for (let position = 0; position < total; position += 1) {
    const subAgent = subAgents[position];
    const baseName = toAgentFilename2(subAgent.name, position, usedNames);
    const destination = join7(targetDir, `${baseName}.md`);
    const markdown = await resolveSubAgentContent2(subAgent, fetchFn);
    await writeFile5(destination, markdown, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
|
|
2245
|
+
|
|
2246
|
+
// src/run-scenario/agents/opencode/config.ts
|
|
2247
|
+
import {
|
|
2248
|
+
ClaudeModel as ClaudeModel2,
|
|
2249
|
+
AVAILABLE_OPENAI_MODEL_IDS
|
|
2250
|
+
} from "@wix/evalforge-types";
|
|
2251
|
+
var DEFAULT_MODEL2 = `${ClaudeModel2.CLAUDE_4_5_SONNET_1_0}`;
|
|
2252
|
+
/**
 * Splits a model string into provider and model identifiers.
 *
 * A string with a "/" after its first character is split at the first "/".
 * Anything else is treated as a bare model id: the provider is "openai" when
 * the id is listed in `AVAILABLE_OPENAI_MODEL_IDS`, otherwise "anthropic".
 *
 * @param {string} model - Either "provider/model" or a bare model id.
 * @returns {{ providerID: string, modelID: string }}
 */
function parseModel(model) {
  const separatorAt = model.indexOf("/");
  if (separatorAt <= 0) {
    // No usable provider prefix: infer the provider from the known OpenAI ids.
    const inferredProvider = AVAILABLE_OPENAI_MODEL_IDS.includes(model) ? "openai" : "anthropic";
    return { providerID: inferredProvider, modelID: model };
  }
  const providerID = model.slice(0, separatorAt);
  const modelID = model.slice(separatorAt + 1);
  return { providerID, modelID };
}
|
|
2265
|
+
/**
 * Converts a map of MCP server entries into the `local` / `remote` shape.
 *
 * - Entries whose `type` is already "local" or "remote" are kept as-is.
 * - Entries with a string `url` become `{ type: "remote", url, headers?, enabled? }`.
 * - Entries with a string `command` become
 *   `{ type: "local", command: [command, ...args], environment?, enabled? }`.
 * - Anything else is logged with a warning and passed through unchanged.
 *
 * @param {Record<string, object>} servers - Server entries keyed by name.
 * @returns {Record<string, object>} Converted entries under the same keys.
 */
function toOpenCodeMcpConfig(servers) {
  const converted = {};
  Object.entries(servers).forEach(([serverName, server]) => {
    const isNative = server.type === "local" || server.type === "remote";
    if (isNative) {
      converted[serverName] = server;
      return;
    }
    if (server.url && typeof server.url === "string") {
      const remote = { type: "remote", url: server.url };
      if (server.headers) {
        remote.headers = server.headers;
      }
      if (typeof server.enabled === "boolean") {
        remote.enabled = server.enabled;
      }
      converted[serverName] = remote;
      return;
    }
    if (server.command && typeof server.command === "string") {
      // The executable and its arguments are flattened into one argv array.
      const argv = [server.command];
      argv.push(...(server.args || []));
      const local = { type: "local", command: argv };
      if (server.env) {
        local.environment = server.env;
      }
      if (typeof server.enabled === "boolean") {
        local.enabled = server.enabled;
      }
      converted[serverName] = local;
      return;
    }
    console.warn(
      `[MCP] Server "${serverName}" has unrecognized format, passing through as-is:`,
      JSON.stringify(server)
    );
    converted[serverName] = server;
  });
  return converted;
}
|
|
2302
|
+
/**
 * Builds the OpenCode configuration object for a run.
 *
 * - The model comes from `options.model` (falling back to `DEFAULT_MODEL2`)
 *   and is split into provider/model by `parseModel`.
 * - When `options.aiGatewayUrl` is set, the provider is pointed at
 *   `<gateway>/proxy/<providerID>` with a placeholder API key and optional
 *   `options.aiGatewayHeaders`.
 * - MCP entity configs are merged into one server map, passed through
 *   `resolveMcpPlaceholders`, then converted with `toOpenCodeMcpConfig`.
 * - `temperature` and `maxTurns` (written as `maxSteps`) become overrides
 *   under `agent.build`.
 *
 * @param {object} options - Reads `model`, `aiGatewayUrl`, `aiGatewayHeaders`,
 *   `mcps`, `cwd`, `temperature` and `maxTurns`.
 * @returns {Promise<{ config: object, providerID: string, modelID: string }>}
 * @throws {Error} When an MCP entity config contains a value that is not a plain object.
 */
async function buildOpenCodeConfig(options) {
  const modelStr = options.model || DEFAULT_MODEL2;
  const { providerID, modelID } = parseModel(modelStr);
  const provider = {};
  if (options.aiGatewayUrl) {
    // Drop trailing slashes so the result never contains "//proxy/...".
    const gatewayBase = String(options.aiGatewayUrl).replace(/\/+$/, "");
    const providerOptions = {
      baseURL: `${gatewayBase}/proxy/${providerID}`,
      apiKey: "sk-placeholder-auth-handled-by-gateway"
    };
    if (options.aiGatewayHeaders) {
      providerOptions.headers = { ...options.aiGatewayHeaders };
    }
    provider[providerID] = {
      options: providerOptions
    };
  }
  let mcp;
  if (options.mcps && options.mcps.length > 0) {
    const mcpServers = {};
    for (const mcpEntity of options.mcps) {
      const entityConfig = mcpEntity.config;
      for (const [key, value] of Object.entries(entityConfig)) {
        if (typeof value !== "object" || value === null || Array.isArray(value)) {
          // typeof reports "object" for null and arrays, which made the old
          // message read "must be an object (got object)".
          const actualKind = value === null ? "null" : Array.isArray(value) ? "array" : typeof value;
          throw new Error(
            `MCP "${mcpEntity.name}" has invalid config: value for key "${key}" must be an object (got ${actualKind}).`
          );
        }
        // Later entities overwrite earlier ones that use the same server key.
        mcpServers[key] = value;
      }
    }
    const resolved = await resolveMcpPlaceholders(mcpServers, {
      cwd: options.cwd
    });
    mcp = toOpenCodeMcpConfig(resolved);
  }
  const agentOverrides = {};
  if (options.temperature != null) {
    agentOverrides.temperature = options.temperature;
  }
  if (options.maxTurns != null) {
    agentOverrides.maxSteps = options.maxTurns;
  }
  const config = {
    model: `${providerID}/${modelID}`,
    provider,
    ...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
    // Every listed permission is granted outright.
    permission: {
      edit: "allow",
      bash: "allow",
      webfetch: "allow",
      doom_loop: "allow",
      external_directory: "allow"
    },
    ...mcp ? { mcp } : {}
  };
  return { config, providerID, modelID };
}
|
|
2359
|
+
|
|
2360
|
+
// src/run-scenario/agents/opencode/build-trace.ts
|
|
2361
|
+
import { LLMStepType as LLMStepType2 } from "@wix/evalforge-types";
|
|
2362
|
+
import { randomUUID as randomUUID2 } from "crypto";
|
|
2363
|
+
/**
 * Builds an LLM trace (steps plus summary) from session messages.
 *
 * Only messages whose `info.role` is "assistant" are considered; each one is a
 * "turn". A turn is expanded into sub-steps:
 *   - one THINKING step when reasoning accompanies text or tool calls,
 *   - one TOOL_USE step per tool part,
 *   - one COMPLETION step when text accompanies tool calls,
 *   - otherwise a single THINKING/COMPLETION step carrying the whole turn.
 * Duration, tokens and cost of the turn are divided across its sub-steps; the
 * trailing COMPLETION step (or, for duration only, the last tool step)
 * receives the remainder.
 *
 * @param {Array<{info: object, parts: Array<object>}>} messages - Session messages.
 * @param {number} totalDurationMs - Overall run duration, also used as the
 *   fallback end time of the last turn.
 * @param {string} model - Fallback model id when a message has no `modelID`.
 * @param {string} provider - Fallback provider id when a message has no `providerID`.
 * @returns {{ id: string, steps: Array<object>, summary: object }}
 */
function buildLLMTrace(messages, totalDurationMs, model, provider) {
  const assistantMessages = messages.filter(({ info }) => info.role === "assistant");

  const expandTurn = ({ info, parts }, turnIndex) => {
    // Gather the turn's content and usage from its parts.
    let text = "";
    let thinking = "";
    const toolCalls = [];
    let stepInputTokens = 0;
    let stepOutputTokens = 0;
    let stepCost = 0;
    let finishReason = "unknown";
    for (const part of parts) {
      const kind = part.type;
      if (kind === "text") {
        text += part.text;
      } else if (kind === "reasoning") {
        thinking += part.text;
      } else if (kind === "tool") {
        toolCalls.push({ toolName: part.tool, args: part.state.input });
      } else if (kind === "step-finish") {
        stepInputTokens += part.tokens.input;
        stepOutputTokens += part.tokens.output;
        stepCost += part.cost;
        finishReason = part.reason;
      }
    }
    // No usage reported by step-finish parts: use the message-level figures.
    if (stepInputTokens === 0 && stepOutputTokens === 0) {
      stepInputTokens = info.tokens.input;
      stepOutputTokens = info.tokens.output;
      stepCost = info.cost;
    }

    const createdAt = info.time.created;
    const startedAt = new Date(createdAt).toISOString();
    let completedAt;
    if (info.time.completed) {
      completedAt = info.time.completed;
    } else if (turnIndex + 1 < assistantMessages.length) {
      // Unfinished turn: assume it ended when the next assistant turn began.
      completedAt = assistantMessages[turnIndex + 1].info.time.created;
    } else {
      completedAt = createdAt + totalDurationMs;
    }
    const durationMs = Math.max(0, completedAt - createdAt);

    const failed = finishReason === "error";
    const isSuccess = !failed;
    const errorMsg = failed ? "Generation failed" : void 0;
    const stepModel = info.modelID || model;
    const stepProvider = info.providerID || provider;

    const toolCallCount = toolCalls.length;
    const hasThinking = !!thinking;
    const hasText = !!text;
    const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
    const toolSubSteps = toolCallCount;
    const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
    const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;

    const subSteps = [];
    // Appends a sub-step; `details` supplies the fields that vary per kind.
    const emit = (type, details) => {
      subSteps.push({
        id: randomUUID2(),
        stepNumber: 0,
        // renumbered after flattening
        turnIndex,
        type,
        model: stepModel,
        provider: stepProvider,
        startedAt,
        ...details,
        success: isSuccess,
        error: errorMsg
      });
    };
    // Sum of one metric over the sub-steps emitted so far.
    const spent = (pick) => subSteps.reduce((acc, ss) => acc + pick(ss), 0);

    if (thinkingSubSteps === 1) {
      emit(LLMStepType2.THINKING, {
        durationMs: Math.round(durationMs / totalSubSteps),
        tokenUsage: {
          prompt: Math.round(stepInputTokens / totalSubSteps),
          completion: Math.round(stepOutputTokens / totalSubSteps),
          total: Math.round(
            (stepInputTokens + stepOutputTokens) / totalSubSteps
          )
        },
        costUsd: stepCost / totalSubSteps,
        outputPreview: thinking.slice(0, 200)
      });
    }

    if (toolCallCount > 0) {
      const toolBudgetSteps = toolSubSteps + textSubSteps;
      const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
      const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
      // Share of a turn-level amount attributed to a single tool call.
      const toolShare = (amount) => amount * remainingFraction * toolFraction;
      toolCalls.forEach((tc, tcIdx) => {
        const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
        emit(LLMStepType2.TOOL_USE, {
          durationMs: isLast ? durationMs - spent((ss) => ss.durationMs) : Math.round(toolShare(durationMs)),
          tokenUsage: {
            prompt: Math.round(toolShare(stepInputTokens)),
            completion: Math.round(toolShare(stepOutputTokens)),
            total: Math.round(toolShare(stepInputTokens + stepOutputTokens))
          },
          costUsd: toolShare(stepCost),
          toolName: tc.toolName,
          toolArguments: JSON.stringify(tc.args),
          outputPreview: tcIdx === 0 && !hasText ? (text || thinking)?.slice(0, 200) : void 0
        });
      });
    }

    if (textSubSteps === 1) {
      // The text step absorbs whatever the earlier sub-steps did not claim.
      emit(LLMStepType2.COMPLETION, {
        durationMs: durationMs - spent((ss) => ss.durationMs),
        tokenUsage: {
          prompt: stepInputTokens - spent((ss) => ss.tokenUsage.prompt),
          completion: stepOutputTokens - spent((ss) => ss.tokenUsage.completion),
          total: stepInputTokens + stepOutputTokens - spent((ss) => ss.tokenUsage.total)
        },
        costUsd: stepCost - spent((ss) => ss.costUsd),
        outputPreview: text.slice(0, 200)
      });
    }

    if (subSteps.length === 0) {
      const soleType = hasThinking && !hasText ? LLMStepType2.THINKING : LLMStepType2.COMPLETION;
      emit(soleType, {
        durationMs,
        tokenUsage: {
          prompt: stepInputTokens,
          completion: stepOutputTokens,
          total: stepInputTokens + stepOutputTokens
        },
        costUsd: stepCost,
        outputPreview: (text || thinking)?.slice(0, 200)
      });
    }
    return subSteps;
  };

  const allSteps = assistantMessages.flatMap(expandTurn).map((step, position) => ({ ...step, stepNumber: position + 1 }));

  const totalTokens = buildTotalTokens(assistantMessages);
  const totalCost = assistantMessages.reduce((running, message) => running + message.info.cost, 0);

  const stepTypeBreakdown = {};
  allSteps.forEach((step) => {
    const bucket = stepTypeBreakdown[step.type] ?? {
      count: 0,
      durationMs: 0,
      tokens: 0,
      costUsd: 0
    };
    bucket.count += 1;
    bucket.durationMs += step.durationMs;
    bucket.tokens += step.tokenUsage.total;
    bucket.costUsd += step.costUsd;
    stepTypeBreakdown[step.type] = bucket;
  });

  // The whole run is attributed to the model of the first step (or the fallback).
  const modelUsed = allSteps[0]?.model || model;
  const summary = {
    totalSteps: allSteps.length,
    totalTurns: assistantMessages.length,
    totalDurationMs,
    totalTokens,
    totalCostUsd: totalCost,
    modelBreakdown: {
      [modelUsed]: {
        count: allSteps.length,
        durationMs: totalDurationMs,
        tokens: totalTokens.total,
        costUsd: totalCost
      }
    },
    modelsUsed: [modelUsed],
    stepTypeBreakdown
  };
  return {
    id: randomUUID2(),
    steps: allSteps,
    summary
  };
}
|
|
2574
|
+
/**
 * Sums message-level token usage across the given messages.
 *
 * @param {Iterable<{info: {tokens: {input: number, output: number}}}>} assistantMessages
 *   Messages whose `info.tokens.input` / `info.tokens.output` are added up.
 * @returns {{ prompt: number, completion: number, total: number }}
 *   Input total, output total, and their sum.
 */
function buildTotalTokens(assistantMessages) {
  const totals = { prompt: 0, completion: 0, total: 0 };
  for (const message of assistantMessages) {
    const usage = message.info.tokens;
    totals.prompt += usage.input;
    totals.completion += usage.output;
  }
  totals.total = totals.prompt + totals.completion;
  return totals;
}
|
|
2583
|
+
|
|
2584
|
+
// src/run-scenario/agents/opencode/build-conversation.ts
|
|
2585
|
+
/**
 * Converts session messages into a flat conversation transcript.
 *
 * Assistant messages contribute `text`, `thinking` (from "reasoning" parts)
 * and `tool_use` blocks. User messages contribute `text` blocks and
 * `tool_result` blocks for tool parts whose state is "completed" or "error".
 * Other part types and other roles are ignored, and messages that end up with
 * no blocks are omitted.
 *
 * @param {Array<{info: object, parts: Array<object>}>} messages - Session messages.
 * @returns {Array<{role: string, content: Array<object>, timestamp: string}>}
 *   Entries in message order; `timestamp` is the ISO form of `info.time.created`.
 */
function buildConversation2(messages) {
  // Maps an assistant part to a transcript block, or null when not representable.
  const toAssistantBlock = (part) => {
    if (part.type === "text") {
      return { type: "text", text: part.text };
    }
    if (part.type === "reasoning") {
      return { type: "thinking", thinking: part.text };
    }
    if (part.type === "tool") {
      return {
        type: "tool_use",
        toolName: part.tool,
        toolId: part.callID,
        input: part.state.input
      };
    }
    return null;
  };

  // Maps a user part to a transcript block, or null when not representable.
  const toUserBlock = (part) => {
    if (part.type === "text") {
      return { type: "text", text: part.text };
    }
    if (part.type !== "tool") {
      return null;
    }
    const toolState = part.state;
    if (toolState.status === "completed") {
      return {
        type: "tool_result",
        toolUseId: part.callID,
        content: toolState.output
      };
    }
    if (toolState.status === "error") {
      return {
        type: "tool_result",
        toolUseId: part.callID,
        content: toolState.error,
        isError: true
      };
    }
    return null;
  };

  const transcript = [];
  for (const { info, parts } of messages) {
    const timestamp = new Date(info.time.created).toISOString();
    let convertPart;
    if (info.role === "assistant") {
      convertPart = toAssistantBlock;
    } else if (info.role === "user") {
      convertPart = toUserBlock;
    } else {
      continue;
    }
    const content = [];
    for (const part of parts) {
      const block = convertPart(part);
      if (block !== null) {
        content.push(block);
      }
    }
    if (content.length > 0) {
      transcript.push({ role: info.role, content, timestamp });
    }
  }
  return transcript;
}
|
|
2652
|
+
|
|
2653
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2654
|
+
// Fallback "anthropic/<model id>" string used by executeWithOpenCode when options.model is falsy.
var DEFAULT_MODEL3 = `anthropic/${ClaudeModel3.CLAUDE_4_5_SONNET_1_0}`;
|
|
2655
|
+
/**
 * Build a short human-readable status line for a tool invocation.
 *
 * Tool names are matched case-insensitively so that both "Task"/"Bash" and
 * lowercase "task"/"bash" spellings are recognised. Long descriptions,
 * commands and file paths are truncated and marked with "...".
 *
 * @param {string} toolName Name of the tool being invoked (may be empty).
 * @param {object} [args] Tool input; may be undefined.
 * @returns {string} Status text such as "Running: ls" or "Using grep...".
 */
function extractToolAction(toolName, args) {
  if (!toolName) return "Using tool...";
  const a = args;
  const normalizedName = String(toolName).toLowerCase();
  // Truncate to `max` characters, flagging the cut with an ellipsis.
  const clip = (value, max) => {
    const text = String(value);
    return text.length > max ? `${text.slice(0, max)}...` : text;
  };
  if ((normalizedName === "task" || normalizedName === "dispatch_agent") && a?.description) {
    return `Task: ${clip(a.description, 55)}`;
  }
  if ((normalizedName === "bash" || normalizedName === "execute") && a?.command) {
    return `Running: ${clip(a.command, 50)}`;
  }
  // `filePath` is checked last so inputs using the older key names resolve as before.
  const rawPath = a?.file_path || a?.path || a?.target_file || a?.filePath;
  if (rawPath) {
    const filePath = clip(rawPath, 50);
    if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
    if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
  }
  return `Using ${toolName}...`;
}
|
|
2676
|
+
/**
 * Translate one OpenCode message part into a live trace event.
 *
 * @param {object} part Message part; `part.type` selects the event kind.
 * @param {object} context Trace context supplying evalRunId, scenarioId,
 *   scenarioName, targetId and targetName.
 * @param {number} stepNumber Step number recorded on the event.
 * @param {boolean} isComplete Value copied to the event's `isComplete` flag.
 * @returns {object|null} The trace event, or null for part types that are not
 *   "text", "reasoning", "tool" or "step-finish".
 */
function createTraceEventFromPart(part, context, stepNumber, isComplete) {
  const base = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: new Date().toISOString(),
    isComplete
  };
  switch (part.type) {
    case "text":
      return {
        ...base,
        type: LiveTraceEventType2.COMPLETION,
        outputPreview: (part.text ?? "").slice(0, 500)
      };
    case "reasoning":
      return {
        ...base,
        type: LiveTraceEventType2.THINKING,
        thinking: (part.text ?? "").slice(0, 500)
      };
    case "tool": {
      const toolName = part.tool;
      const args = part.state?.input;
      // JSON.stringify(undefined) is undefined, so fall back to "" instead of
      // throwing when a tool part carries no input yet.
      const toolArgs = (JSON.stringify(args) ?? "").slice(0, 500);
      let type = LiveTraceEventType2.TOOL_USE;
      let filePath;
      // `filePath` is checked last so inputs using the older key names resolve as before.
      const rawPath = args?.file_path || args?.path || args?.target_file || args?.filePath;
      if (rawPath) {
        filePath = String(rawPath);
        if (/write|edit/i.test(toolName)) {
          type = LiveTraceEventType2.FILE_WRITE;
        } else if (/read|view/i.test(toolName)) {
          type = LiveTraceEventType2.FILE_READ;
        }
      }
      return { ...base, type, toolName, toolArgs, filePath };
    }
    case "step-finish":
      return {
        ...base,
        type: LiveTraceEventType2.PROGRESS,
        outputPreview: "Step completed"
      };
    default:
      return null;
  }
}
|
|
2732
|
+
/**
 * Run one evaluation scenario through the OpenCode SDK.
 *
 * Flow: write sub-agents, rules and skills into `options.cwd`; build the
 * OpenCode config; start a local OpenCode server and create a session; send
 * `scenario.triggerPrompt`, racing it against a timeout of
 * max(300000 ms, maxTurns * 60000 ms); then read the session history and
 * derive output text, usage, cost, LLM trace and conversation from it.
 *
 * When `options.traceContext` is set, live trace events are pushed: a
 * diagnostic before execution, one event per streamed message part, a
 * progress heartbeat every 10 seconds, and a final completion or failure
 * diagnostic.
 *
 * @param {Array<{name: string}>} skills Skills written to the working directory.
 * @param {{id: string, name: string, triggerPrompt: string}} scenario Scenario to run.
 * @param {object} options Execution options (cwd, model, temperature, maxTurns,
 *   aiGatewayUrl, aiGatewayHeaders, traceContext, mcps, subAgents, rules,
 *   systemPrompt).
 * @returns {Promise<{result: {outputText: string, durationMs: number, usage: object, costUsd: (number|undefined)}, llmTrace: object, conversation: Array<object>}>}
 * @throws {Error} If skills cannot be written, or if the SDK run fails or
 *   times out (wrapped as "OpenCode SDK execution failed: ...").
 */
async function executeWithOpenCode(skills, scenario, options) {
  const skillNames = skills.map((s) => s.name).join(", ");
  console.log("[executeWithOpenCode] Starting execution", {
    skillCount: skills.length,
    skillNames,
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    cwd: options.cwd,
    aiGatewayUrl: options.aiGatewayUrl,
    hasAiGatewayHeaders: !!options.aiGatewayHeaders,
    model: options.model
  });
  const startTime = new Date();
  if (options.mcps && options.mcps.length > 0) {
    console.log(
      `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
    );
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(options.cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem2(options.cwd, skills);
  } catch (writeError) {
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
  const maxTurns = options.maxTurns ?? 10;
  const { config, providerID, modelID } = await buildOpenCodeConfig({
    model: options.model,
    temperature: options.temperature,
    maxTurns,
    aiGatewayUrl: options.aiGatewayUrl,
    aiGatewayHeaders: options.aiGatewayHeaders,
    mcps: options.mcps,
    cwd: options.cwd
  });
  const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
  const abortController = new AbortController();
  let timeoutHandle;
  let heartbeatHandle;
  let eventStreamAbort;
  let timedOut = false;
  const traceContext = options.traceContext;
  let traceStepNumber = 0;
  let lastAction = "Starting...";
  let lastToolName;
  let lastFilePath;

  // Emit a trace event with the scenario/target identity fields prepended.
  // Only called when traceContext is set.
  const pushTrace = (fields) => emitTraceEvent(
    {
      evalRunId: traceContext.evalRunId,
      scenarioId: traceContext.scenarioId,
      scenarioName: traceContext.scenarioName,
      targetId: traceContext.targetId,
      targetName: traceContext.targetName,
      ...fields
    },
    traceContext.tracePushUrl,
    traceContext.routeHeader,
    traceContext.authToken
  );

  // Stop the timeout, the heartbeat and the event-stream reader. Safe to call repeatedly.
  const stopBackgroundWork = () => {
    if (timeoutHandle) clearTimeout(timeoutHandle);
    if (heartbeatHandle) clearInterval(heartbeatHandle);
    if (eventStreamAbort) eventStreamAbort.abort();
  };

  // Sub-agent tools are matched case-insensitively ("Task" and "task").
  const isTaskToolName = (name) => {
    const normalized = typeof name === "string" ? name.toLowerCase() : "";
    return normalized === "task" || normalized === "dispatch_agent";
  };

  if (traceContext) {
    pushTrace({
      stepNumber: 0,
      type: LiveTraceEventType2.DIAGNOSTIC,
      outputPreview: JSON.stringify({
        event: "pre-sdk-execution",
        model: `${providerID}/${modelID}`,
        maxTurns,
        timestamp: new Date().toISOString()
      }),
      timestamp: new Date().toISOString(),
      isComplete: false
    });
  }
  let server;
  try {
    console.log("[SDK-DEBUG] Starting OpenCode server...");
    server = await createOpencodeServer({
      config,
      signal: abortController.signal,
      timeout: 3e4
    });
    console.log(`[SDK-DEBUG] Server started at ${server.url}`);
    const client = createOpencodeClient({
      baseUrl: server.url,
      directory: options.cwd
    });
    const session = await client.session.create({
      body: { title: `eval-${scenario.name}` }
    });
    if (!session.data) {
      const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
      throw new Error(
        `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
      );
    }
    const sessionId = session.data.id;
    console.log(`[SDK-DEBUG] Session created: ${sessionId}`);
    if (traceContext) {
      eventStreamAbort = new AbortController();
      const streamSignal = eventStreamAbort.signal;
      const executionStartTime = Date.now();
      // Fire-and-forget reader: live tracing is best-effort and must never
      // fail the scenario, so errors are logged rather than propagated.
      void (async () => {
        try {
          const events = await client.event.subscribe();
          for await (const event of events.stream) {
            if (streamSignal.aborted) break;
            const evt = event;
            if (evt.type === "message.part.updated") {
              const { part } = evt.properties;
              traceStepNumber++;
              const traceEvent = createTraceEventFromPart(
                part,
                traceContext,
                traceStepNumber,
                false
              );
              if (traceEvent) {
                lastToolName = traceEvent.toolName;
                lastFilePath = traceEvent.filePath;
                if (traceEvent.type === LiveTraceEventType2.THINKING) {
                  lastAction = "Thinking...";
                } else if (traceEvent.type === LiveTraceEventType2.TOOL_USE) {
                  // Pass the tool input so Task/Bash summaries can be built.
                  lastAction = extractToolAction(
                    traceEvent.toolName ?? "",
                    part.state?.input
                  );
                } else if (traceEvent.type === LiveTraceEventType2.FILE_WRITE) {
                  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
                } else if (traceEvent.type === LiveTraceEventType2.FILE_READ) {
                  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
                } else if (traceEvent.type === LiveTraceEventType2.COMPLETION) {
                  lastAction = "Processing response...";
                }
                emitTraceEvent(
                  traceEvent,
                  traceContext.tracePushUrl,
                  traceContext.routeHeader,
                  traceContext.authToken
                );
              }
            } else if (evt.type === "session.error") {
              const props = evt.properties;
              traceStepNumber++;
              pushTrace({
                stepNumber: traceStepNumber,
                type: LiveTraceEventType2.DIAGNOSTIC,
                outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
                  0,
                  500
                ),
                timestamp: new Date().toISOString(),
                isComplete: false
              });
            }
          }
        } catch (streamError) {
          // Errors after a deliberate abort are expected; anything else is
          // surfaced for debugging without interrupting execution.
          if (!streamSignal.aborted) {
            console.warn(
              "[SDK-DEBUG] Trace event stream stopped:",
              streamError instanceof Error ? streamError.message : String(streamError)
            );
          }
        }
      })();
      let lastReportedAction = "";
      let sameActionCount = 0;
      heartbeatHandle = setInterval(() => {
        const elapsedMs = Date.now() - executionStartTime;
        let progressMessage = lastAction;
        if (lastAction === lastReportedAction) {
          sameActionCount++;
        } else {
          sameActionCount = 1;
          lastReportedAction = lastAction;
        }
        const isTaskTool = isTaskToolName(lastToolName);
        if (isTaskTool && sameActionCount > 1) {
          progressMessage = `Waiting for ${lastAction}`;
        } else if (lastToolName && lastFilePath) {
          progressMessage = `${lastToolName}: ${lastFilePath}`;
        } else if (lastToolName && !isTaskTool) {
          progressMessage = `Using ${lastToolName}...`;
        }
        const elapsedSec = Math.round(elapsedMs / 1e3);
        progressMessage += ` (${elapsedSec}s, step ${traceStepNumber})`;
        pushTrace({
          stepNumber: traceStepNumber,
          type: LiveTraceEventType2.PROGRESS,
          outputPreview: progressMessage,
          toolName: lastToolName,
          filePath: lastFilePath,
          elapsedMs,
          timestamp: new Date().toISOString(),
          isComplete: false
        });
      }, 1e4);
    }
    const promptPromise = (async () => {
      // undefined -> default evaluator prompt; null or "" -> no system prompt;
      // anything else -> caller-supplied prompt.
      let systemPrompt;
      if (options.systemPrompt === void 0) {
        systemPrompt = DEFAULT_EVALUATOR_SYSTEM_PROMPT2;
      } else if (options.systemPrompt !== null && options.systemPrompt !== "") {
        systemPrompt = options.systemPrompt;
      }
      console.log("[SDK-DEBUG] Sending prompt...");
      return client.session.prompt({
        path: { id: sessionId },
        body: {
          model: { providerID, modelID },
          ...systemPrompt ? { system: systemPrompt } : {},
          parts: [{ type: "text", text: scenario.triggerPrompt }]
        }
      });
    })();
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        // Best-effort abort of the running session; the timeout error below
        // is what the caller sees either way.
        client.session.abort({ path: { id: sessionId } }).catch(() => {
        });
        reject(
          new Error(
            `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
          )
        );
      }, SDK_TIMEOUT_MS);
    });
    const promptResult = await Promise.race([promptPromise, timeoutPromise]);
    stopBackgroundWork();
    if ("error" in promptResult && promptResult.error) {
      const errPayload = promptResult.error;
      throw new Error(
        `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
      );
    }
    console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
    const messagesResponse = await client.session.messages({
      path: { id: sessionId }
    });
    const allMessages = messagesResponse.data ?? [];
    console.log(
      `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
    );
    if (traceContext) {
      pushTrace({
        stepNumber: traceStepNumber + 1,
        type: LiveTraceEventType2.COMPLETION,
        outputPreview: "Scenario execution completed",
        timestamp: new Date().toISOString(),
        isComplete: true
      });
    }
    const endTime = new Date();
    const totalDurationMs = endTime.getTime() - startTime.getTime();
    const resultData = promptResult.data;
    const lastAssistantInfo = resultData?.info;
    if (lastAssistantInfo?.error) {
      const err = lastAssistantInfo.error;
      throw new Error(
        `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
      );
    }
    let outputText = "";
    if (resultData?.parts) {
      for (const part of resultData.parts) {
        if (part.type === "text") {
          outputText += part.text;
        }
      }
    }
    // Fall back to the most recent assistant message in history that has text.
    if (!outputText && allMessages.length > 0) {
      for (let i = allMessages.length - 1; i >= 0; i--) {
        const msg = allMessages[i];
        if (msg.info.role === "assistant") {
          const assistantInfo = msg.info;
          if (assistantInfo.error) {
            throw new Error(
              `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
            );
          }
          for (const part of msg.parts) {
            if (part.type === "text") {
              outputText += part.text;
            }
          }
          if (outputText) break;
        }
      }
    }
    if (!outputText) {
      const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
      if (!hasAssistant) {
        throw new Error(
          `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
        );
      }
    }
    const usage = lastAssistantInfo ? {
      inputTokens: lastAssistantInfo.tokens.input,
      outputTokens: lastAssistantInfo.tokens.output,
      totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
    } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    const costUsd = lastAssistantInfo?.cost;
    const modelStr = options.model || DEFAULT_MODEL3;
    const llmTrace = buildLLMTrace(
      allMessages,
      totalDurationMs,
      modelStr,
      providerID
    );
    const conversation = buildConversation2(allMessages);
    return {
      result: {
        outputText,
        durationMs: totalDurationMs,
        usage,
        costUsd
      },
      llmTrace,
      conversation
    };
  } catch (sdkError) {
    stopBackgroundWork();
    if (timedOut) {
      console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
    }
    const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
    const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
    const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
    console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
    console.error("[SDK-ERROR] Error name:", errorName);
    console.error("[SDK-ERROR] Error message:", errorMessage);
    if (errorStack) {
      console.error("[SDK-ERROR] Stack:", errorStack);
    }
    if (traceContext) {
      pushTrace({
        stepNumber: traceStepNumber + 1,
        type: LiveTraceEventType2.DIAGNOSTIC,
        outputPreview: JSON.stringify({
          event: "sdk-execution-failed",
          error: errorMessage,
          errorName
        }).slice(0, 2e3),
        timestamp: new Date().toISOString(),
        isComplete: true
      });
    }
    throw new Error(
      `OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `\nStack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : ""),
      { cause: sdkError }
    );
  } finally {
    stopBackgroundWork();
    if (server) {
      try {
        server.close();
        console.log("[SDK-DEBUG] OpenCode server closed");
      } catch (closeError) {
        // Shutdown is best-effort; report but never mask the real outcome.
        console.warn(
          "[SDK-DEBUG] Failed to close OpenCode server:",
          closeError instanceof Error ? closeError.message : String(closeError)
        );
      }
    }
  }
}
|
|
3135
|
+
|
|
3136
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
3137
|
+
/**
 * Agent adapter that runs scenarios through OpenCode.
 *
 * `execute` maps the generic run context onto executeWithOpenCode's options
 * and flattens its `{ result, llmTrace, conversation }` return value into a
 * single result object.
 */
var OpenCodeAdapter = class {
  id = "opencode";
  name = "OpenCode";
  supportedCommands = [AgentRunCommand2.OPENCODE];

  /**
   * @param {object} context Run context (skills, scenario, cwd, modelConfig,
   *   aiGatewayUrl, aiGatewayHeaders, traceContext, mcps, subAgents, rules,
   *   systemPrompt).
   * @returns {Promise<object>} outputText, durationMs, usage, costUsd,
   *   llmTrace and conversation.
   */
  async execute(context) {
    const model = context.modelConfig;
    const executeOptions = {
      cwd: context.cwd,
      model: model?.model,
      temperature: model?.temperature,
      maxTurns: model?.maxTurns,
      aiGatewayUrl: context.aiGatewayUrl,
      aiGatewayHeaders: context.aiGatewayHeaders,
      traceContext: context.traceContext,
      mcps: context.mcps,
      subAgents: context.subAgents,
      rules: context.rules,
      systemPrompt: context.systemPrompt
    };
    const outcome = await executeWithOpenCode(
      context.skills,
      context.scenario,
      executeOptions
    );
    const run = outcome.result;
    // Copy the usage counters into a fresh object rather than passing the reference through.
    const usage = {
      inputTokens: run.usage.inputTokens,
      outputTokens: run.usage.outputTokens,
      totalTokens: run.usage.totalTokens
    };
    return {
      outputText: run.outputText,
      durationMs: run.durationMs,
      usage,
      costUsd: run.costUsd,
      llmTrace: outcome.llmTrace,
      conversation: outcome.conversation
    };
  }
};
|
|
3187
|
+
// Single OpenCodeAdapter instance created at module load; it is the object passed to defaultRegistry.register.
var openCodeAdapter = new OpenCodeAdapter();
|
|
3188
|
+
|
|
3189
|
+
// src/run-scenario/agents/opencode/index.ts
|
|
3190
|
+
// Module-load side effect: register the OpenCode adapter instance with the default agent registry.
defaultRegistry.register(openCodeAdapter);
|
|
3191
|
+
|
|
2040
3192
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
2041
3193
|
import {
|
|
2042
3194
|
generateText,
|
|
@@ -2047,10 +3199,10 @@ import { createOpenAI } from "@ai-sdk/openai";
|
|
|
2047
3199
|
import {
|
|
2048
3200
|
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
2049
3201
|
OPENAI_RESPONSES_MODEL_IDS,
|
|
2050
|
-
LLMStepType as
|
|
2051
|
-
LiveTraceEventType as
|
|
3202
|
+
LLMStepType as LLMStepType3,
|
|
3203
|
+
LiveTraceEventType as LiveTraceEventType3
|
|
2052
3204
|
} from "@wix/evalforge-types";
|
|
2053
|
-
import { randomUUID as
|
|
3205
|
+
import { randomUUID as randomUUID3 } from "crypto";
|
|
2054
3206
|
|
|
2055
3207
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
2056
3208
|
import { createMCPClient } from "@ai-sdk/mcp";
|
|
@@ -2145,48 +3297,35 @@ function extractErrorText(content) {
|
|
|
2145
3297
|
}
|
|
2146
3298
|
|
|
2147
3299
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
3300
|
+
import { normalizeModelId } from "@wix/evalforge-types";
|
|
2148
3301
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
2149
3302
|
// Per-model price table keyed by normalized model id. calculateFromPricing
// multiplies these rates by token counts divided by 1e6, i.e. each rate is a
// price per million tokens (presumably USD; confirm against billing).
// Row format: [model id, input rate, output rate]. Row order is preserved as
// the table's key order.
var MODEL_PRICING = Object.fromEntries(
  [
    // Anthropic — Claude 4.6
    ["claude-sonnet-4-6", 3, 15],
    ["claude-opus-4-6", 15, 75],
    // Anthropic — Claude 4.5
    ["claude-opus-4-5", 5, 25],
    ["claude-sonnet-4-5", 3, 15],
    ["claude-haiku-4-5", 1, 5],
    // Anthropic — Claude 4
    ["claude-opus-4", 15, 75],
    ["claude-sonnet-4", 3, 15],
    // OpenAI — GPT-5
    ["gpt-5", 1.25, 10],
    ["gpt-5-mini", 0.25, 2],
    ["gpt-5-nano", 0.05, 0.4],
    // OpenAI — GPT-4.1
    ["gpt-4.1", 2, 8],
    ["gpt-4.1-mini", 0.4, 1.6],
    ["gpt-4.1-nano", 0.1, 0.4],
    // OpenAI — GPT-4o
    ["gpt-4o", 2.5, 10],
    ["gpt-4o-mini", 0.15, 0.6],
    // OpenAI — Reasoning
    ["o3", 2, 8],
    ["o4-mini", 1.1, 4.4],
    ["o3-mini", 1.1, 4.4],
    ["o1", 15, 60]
  ].map(([modelId, input, output]) => [modelId, { input, output }])
);
|
|
2191
3330
|
function extractGatewayCost(step, provider) {
|
|
2192
3331
|
try {
|
|
@@ -2205,7 +3344,8 @@ function extractGatewayCost(step, provider) {
|
|
|
2205
3344
|
}
|
|
2206
3345
|
}
|
|
2207
3346
|
/**
 * Estimate the cost of a call from the static MODEL_PRICING table.
 *
 * The normalized model id is looked up exactly first. If that misses, the
 * longest table key that is a prefix of the id is used, so that a variant
 * such as "gpt-5-mini-…" resolves to "gpt-5-mini" rather than the shorter
 * "gpt-5".
 *
 * @param {string} modelId Model identifier; normalized via normalizeModelId.
 * @param {{prompt: number, completion: number}} tokenUsage Token counts.
 * @returns {number} Rate-weighted cost (rates are per million tokens), or 0
 *   when no pricing entry matches.
 */
function calculateFromPricing(modelId, tokenUsage) {
  const normalized = normalizeModelId(modelId);
  let pricing = MODEL_PRICING[normalized];
  if (!pricing) {
    // Prefer the most specific (longest) matching key, independent of the
    // order in which the table happens to list its entries.
    let bestKey = "";
    for (const key of Object.keys(MODEL_PRICING)) {
      if (key.length > bestKey.length && normalized.startsWith(key)) {
        bestKey = key;
      }
    }
    pricing = bestKey ? MODEL_PRICING[bestKey] : void 0;
  }
  if (!pricing) return 0;
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
}
|
|
@@ -2214,7 +3354,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
|
|
|
2214
3354
|
}
|
|
2215
3355
|
|
|
2216
3356
|
// src/run-scenario/agents/simple-agent/build-conversation.ts
|
|
2217
|
-
function
|
|
3357
|
+
function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
2218
3358
|
const messages = [];
|
|
2219
3359
|
messages.push({
|
|
2220
3360
|
role: "user",
|
|
@@ -2280,9 +3420,7 @@ var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
|
2280
3420
|
var PROVIDER_OPENAI = "openai";
|
|
2281
3421
|
var DEFAULT_MAX_TOOL_STEPS = 25;
|
|
2282
3422
|
function createModel(modelId, baseUrl, headers) {
|
|
2283
|
-
const isClaudeModel =
|
|
2284
|
-
modelId
|
|
2285
|
-
);
|
|
3423
|
+
const isClaudeModel = isClaudeModelId(modelId);
|
|
2286
3424
|
if (isClaudeModel) {
|
|
2287
3425
|
const anthropic = createAnthropic({
|
|
2288
3426
|
baseURL: `${baseUrl}/proxy/anthropic`,
|
|
@@ -2296,13 +3434,17 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
2296
3434
|
apiKey: "proxy-auth",
|
|
2297
3435
|
headers
|
|
2298
3436
|
});
|
|
2299
|
-
if (OPENAI_RESPONSES_MODEL_IDS.
|
|
3437
|
+
if ([...OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3438
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3439
|
+
)) {
|
|
2300
3440
|
return openai.responses(modelId);
|
|
2301
3441
|
}
|
|
2302
3442
|
return openai.chat(modelId);
|
|
2303
3443
|
}
|
|
2304
3444
|
/**
 * Report whether a model id is a known Claude model.
 *
 * @param {string} modelId Model identifier to classify.
 * @returns {boolean} True when `modelId` equals, or starts with, one of the
 *   ids in AVAILABLE_CLAUDE_MODEL_IDS.
 */
function isClaudeModelId(modelId) {
  const matchesKnownId = (knownId) => {
    if (modelId === knownId) return true;
    return modelId.startsWith(knownId);
  };
  return AVAILABLE_CLAUDE_MODEL_IDS.some(matchesKnownId);
}
|
|
2307
3449
|
function extractSkillContent(files) {
|
|
2308
3450
|
if (!files || files.length === 0) return void 0;
|
|
@@ -2336,7 +3478,9 @@ async function executeWithAiSdk(context) {
|
|
|
2336
3478
|
}
|
|
2337
3479
|
try {
|
|
2338
3480
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
2339
|
-
const isResponsesAPI = OPENAI_RESPONSES_MODEL_IDS.
|
|
3481
|
+
const isResponsesAPI = [...OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3482
|
+
(id) => modelConfig.model === id || modelConfig.model.startsWith(id)
|
|
3483
|
+
);
|
|
2340
3484
|
const supportsThinking = isAnthropic || isResponsesAPI;
|
|
2341
3485
|
const providerOpts = {
|
|
2342
3486
|
...isAnthropic && {
|
|
@@ -2370,7 +3514,7 @@ async function executeWithAiSdk(context) {
|
|
|
2370
3514
|
outputTokens: result.usage.outputTokens ?? 0,
|
|
2371
3515
|
totalTokens: result.usage.totalTokens ?? 0
|
|
2372
3516
|
};
|
|
2373
|
-
const llmTrace =
|
|
3517
|
+
const llmTrace = buildLLMTrace2(
|
|
2374
3518
|
result.steps,
|
|
2375
3519
|
durationMs,
|
|
2376
3520
|
usage,
|
|
@@ -2382,7 +3526,7 @@ async function executeWithAiSdk(context) {
|
|
|
2382
3526
|
emitStepEvents(traceContext, result.steps, startTime);
|
|
2383
3527
|
emitCompletionEvent(traceContext, result.steps.length + 1);
|
|
2384
3528
|
}
|
|
2385
|
-
const conversation =
|
|
3529
|
+
const conversation = buildConversation3(
|
|
2386
3530
|
scenario.triggerPrompt,
|
|
2387
3531
|
result.steps,
|
|
2388
3532
|
startTime
|
|
@@ -2426,7 +3570,7 @@ function findToolResultError(step) {
|
|
|
2426
3570
|
}
|
|
2427
3571
|
return null;
|
|
2428
3572
|
}
|
|
2429
|
-
function
|
|
3573
|
+
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
|
|
2430
3574
|
const totalStepTokens = steps.reduce(
|
|
2431
3575
|
(sum, s) => sum + (s.usage.totalTokens ?? 0),
|
|
2432
3576
|
0
|
|
@@ -2444,9 +3588,10 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2444
3588
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
2445
3589
|
const toolResultError = findToolResultError(step);
|
|
2446
3590
|
return {
|
|
2447
|
-
id:
|
|
3591
|
+
id: randomUUID3(),
|
|
2448
3592
|
stepNumber: i + 1,
|
|
2449
|
-
|
|
3593
|
+
turnIndex: i,
|
|
3594
|
+
type: step.toolCalls.length > 0 ? LLMStepType3.TOOL_USE : LLMStepType3.COMPLETION,
|
|
2450
3595
|
model: modelId,
|
|
2451
3596
|
provider,
|
|
2452
3597
|
startedAt: new Date(
|
|
@@ -2469,10 +3614,11 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2469
3614
|
total: totalUsage.totalTokens
|
|
2470
3615
|
};
|
|
2471
3616
|
return {
|
|
2472
|
-
id:
|
|
3617
|
+
id: randomUUID3(),
|
|
2473
3618
|
steps: traceSteps,
|
|
2474
3619
|
summary: {
|
|
2475
3620
|
totalSteps: traceSteps.length,
|
|
3621
|
+
totalTurns: traceSteps.length,
|
|
2476
3622
|
totalDurationMs,
|
|
2477
3623
|
totalTokens: finalTokens,
|
|
2478
3624
|
totalCostUsd,
|
|
@@ -2497,7 +3643,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
2497
3643
|
targetId: traceContext.targetId,
|
|
2498
3644
|
targetName: traceContext.targetName,
|
|
2499
3645
|
stepNumber: 0,
|
|
2500
|
-
type:
|
|
3646
|
+
type: LiveTraceEventType3.PROGRESS,
|
|
2501
3647
|
outputPreview: "Starting Simple Agent execution...",
|
|
2502
3648
|
elapsedMs: Date.now() - startTime,
|
|
2503
3649
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -2521,7 +3667,7 @@ function emitStepEvents(traceContext, steps, startTime) {
|
|
|
2521
3667
|
targetId: traceContext.targetId,
|
|
2522
3668
|
targetName: traceContext.targetName,
|
|
2523
3669
|
stepNumber: i + 1,
|
|
2524
|
-
type: isToolStep ?
|
|
3670
|
+
type: isToolStep ? LiveTraceEventType3.TOOL_USE : LiveTraceEventType3.COMPLETION,
|
|
2525
3671
|
toolName: firstToolCall?.toolName,
|
|
2526
3672
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
2527
3673
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -2544,7 +3690,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
2544
3690
|
targetId: traceContext.targetId,
|
|
2545
3691
|
targetName: traceContext.targetName,
|
|
2546
3692
|
stepNumber,
|
|
2547
|
-
type:
|
|
3693
|
+
type: LiveTraceEventType3.COMPLETION,
|
|
2548
3694
|
outputPreview: "Scenario execution completed",
|
|
2549
3695
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2550
3696
|
isComplete: true
|
|
@@ -2571,7 +3717,7 @@ defaultRegistry.register(simpleAgentAdapter);
|
|
|
2571
3717
|
|
|
2572
3718
|
// src/run-scenario/file-diff.ts
|
|
2573
3719
|
import { readdirSync, readFileSync as readFileSync2, statSync, existsSync as existsSync2 } from "fs";
|
|
2574
|
-
import { join as
|
|
3720
|
+
import { join as join9, relative } from "path";
|
|
2575
3721
|
|
|
2576
3722
|
// ../../node_modules/diff/lib/index.mjs
|
|
2577
3723
|
function Diff() {
|
|
@@ -2747,7 +3893,7 @@ Diff.prototype = {
|
|
|
2747
3893
|
tokenize: function tokenize(value) {
|
|
2748
3894
|
return Array.from(value);
|
|
2749
3895
|
},
|
|
2750
|
-
join: function
|
|
3896
|
+
join: function join8(chars) {
|
|
2751
3897
|
return chars.join("");
|
|
2752
3898
|
},
|
|
2753
3899
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -3187,7 +4333,7 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
3187
4333
|
}
|
|
3188
4334
|
const entries = readdirSync(dir, { withFileTypes: true });
|
|
3189
4335
|
for (const entry of entries) {
|
|
3190
|
-
const fullPath =
|
|
4336
|
+
const fullPath = join9(dir, entry.name);
|
|
3191
4337
|
const relativePath = relative(base, fullPath);
|
|
3192
4338
|
if (shouldIgnore(entry.name)) {
|
|
3193
4339
|
continue;
|
|
@@ -3296,15 +4442,9 @@ function extractTemplateFiles(before, after) {
|
|
|
3296
4442
|
}
|
|
3297
4443
|
|
|
3298
4444
|
// src/run-scenario/run-agent-with-context.ts
|
|
3299
|
-
import { AgentRunCommand as
|
|
3300
|
-
var DEFAULT_AGENT_COMMAND =
|
|
4445
|
+
import { AgentRunCommand as AgentRunCommand3, AgentType } from "@wix/evalforge-types";
|
|
4446
|
+
var DEFAULT_AGENT_COMMAND = AgentRunCommand3.CLAUDE;
|
|
3301
4447
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
3302
|
-
const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
|
|
3303
|
-
if (!hasEntities) {
|
|
3304
|
-
throw new Error(
|
|
3305
|
-
`Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
3306
|
-
);
|
|
3307
|
-
}
|
|
3308
4448
|
const agent = evalData.agent ?? void 0;
|
|
3309
4449
|
const isSDK = agent?.agentType === AgentType.SDK;
|
|
3310
4450
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
@@ -3341,7 +4481,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
3341
4481
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
3342
4482
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
3343
4483
|
return {
|
|
3344
|
-
id:
|
|
4484
|
+
id: randomUUID4(),
|
|
3345
4485
|
targetId,
|
|
3346
4486
|
targetName,
|
|
3347
4487
|
scenarioId: scenario.id,
|
|
@@ -3598,13 +4738,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
3598
4738
|
presetId: evalData.evalRun.presetId,
|
|
3599
4739
|
skillIds: evalData.evalRun.skillIds
|
|
3600
4740
|
};
|
|
3601
|
-
|
|
3602
|
-
if (scenarioItems.length > 0 && !hasEntities) {
|
|
3603
|
-
throw new Error(
|
|
3604
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
3605
|
-
);
|
|
3606
|
-
}
|
|
3607
|
-
if (scenarioItems.length > 0 && hasEntities && !agent) {
|
|
4741
|
+
if (scenarioItems.length > 0 && !agent) {
|
|
3608
4742
|
throw new Error(
|
|
3609
4743
|
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
3610
4744
|
);
|