@wix/evalforge-evaluator 0.111.0 → 0.113.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +1242 -117
- package/build/index.js.map +4 -4
- package/build/index.mjs +1241 -107
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +1 -1
- package/build/types/run-scenario/agents/index.d.ts +2 -0
- package/build/types/run-scenario/agents/opencode/build-conversation.d.ts +7 -0
- package/build/types/run-scenario/agents/opencode/build-trace.d.ts +13 -0
- package/build/types/run-scenario/agents/opencode/config.d.ts +27 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/index.d.ts +14 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +18 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +32 -0
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +12 -0
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +7 -0
- package/package.json +14 -13
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types15 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -509,7 +509,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
509
509
|
}
|
|
510
510
|
|
|
511
511
|
// src/run-scenario/index.ts
|
|
512
|
-
var
|
|
512
|
+
var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
513
513
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
514
514
|
|
|
515
515
|
// src/run-scenario/environment.ts
|
|
@@ -596,7 +596,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
596
596
|
}
|
|
597
597
|
|
|
598
598
|
// src/run-scenario/run-agent-with-context.ts
|
|
599
|
-
var
|
|
599
|
+
var import_crypto4 = require("crypto");
|
|
600
600
|
|
|
601
601
|
// src/run-scenario/agents/registry.ts
|
|
602
602
|
var AgentAdapterRegistry = class {
|
|
@@ -1222,10 +1222,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1222
1222
|
}
|
|
1223
1223
|
const startTime = /* @__PURE__ */ new Date();
|
|
1224
1224
|
const allMessages = [];
|
|
1225
|
-
const { mkdir: mkdirAsync, writeFile:
|
|
1225
|
+
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1226
1226
|
const claudeDir = `${options.cwd}/.claude`;
|
|
1227
1227
|
await mkdirAsync(claudeDir, { recursive: true });
|
|
1228
|
-
await
|
|
1228
|
+
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1229
1229
|
flag: "wx"
|
|
1230
1230
|
}).catch(() => {
|
|
1231
1231
|
});
|
|
@@ -1261,7 +1261,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1261
1261
|
"Edit",
|
|
1262
1262
|
"Bash",
|
|
1263
1263
|
"Glob",
|
|
1264
|
-
"Grep"
|
|
1264
|
+
"Grep",
|
|
1265
|
+
"Agent",
|
|
1266
|
+
"WebFetch",
|
|
1267
|
+
"WebSearch"
|
|
1265
1268
|
];
|
|
1266
1269
|
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
1267
1270
|
const queryOptions = {
|
|
@@ -1896,13 +1899,15 @@ function extractTotalUsage(result) {
|
|
|
1896
1899
|
}
|
|
1897
1900
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
1898
1901
|
const totalCost = usage.costUsd ?? 0;
|
|
1899
|
-
const
|
|
1900
|
-
|
|
1902
|
+
const effectiveInput = (s) => s.usage.inputTokens + (s.usage.cacheReadTokens ?? 0) + (s.usage.cacheWriteTokens ?? 0);
|
|
1903
|
+
const totalStepEffectiveInput = steps.reduce(
|
|
1904
|
+
(sum, s) => sum + effectiveInput(s),
|
|
1901
1905
|
0
|
|
1902
1906
|
);
|
|
1903
1907
|
const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
|
|
1904
|
-
const
|
|
1905
|
-
const
|
|
1908
|
+
const authoritativeEffectiveInput = usage.inputTokens + (usage.cacheReadTokens ?? 0) + (usage.cacheWriteTokens ?? 0);
|
|
1909
|
+
const inputTokensDuplicated = authoritativeEffectiveInput > 0 && totalStepEffectiveInput > authoritativeEffectiveInput * 1.2;
|
|
1910
|
+
const traceSteps = steps.flatMap((step, turnIndex) => {
|
|
1906
1911
|
let stepPromptTokens;
|
|
1907
1912
|
let stepOutputTokens;
|
|
1908
1913
|
let proportion;
|
|
@@ -1911,34 +1916,128 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1911
1916
|
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
1912
1917
|
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
1913
1918
|
} else {
|
|
1914
|
-
|
|
1915
|
-
|
|
1919
|
+
const stepEffective = effectiveInput(step);
|
|
1920
|
+
proportion = totalStepEffectiveInput > 0 ? stepEffective / totalStepEffectiveInput : 0;
|
|
1921
|
+
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
1916
1922
|
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
1917
1923
|
}
|
|
1918
|
-
const stepTotalTokens = stepPromptTokens + stepOutputTokens;
|
|
1919
1924
|
const costProportion = proportion;
|
|
1920
|
-
const
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1925
|
+
const toolCallCount = step.toolCalls?.length ?? 0;
|
|
1926
|
+
const isSuccess = step.finishReason !== "error" && !step.hasToolError;
|
|
1927
|
+
const errorMsg = step.hasToolError ? step.toolErrorContent ?? "Tool call failed" : step.finishReason === "error" ? "Generation failed" : void 0;
|
|
1928
|
+
const subSteps = [];
|
|
1929
|
+
const stepCost = totalCost * costProportion;
|
|
1930
|
+
const hasThinking = !!step.thinking;
|
|
1931
|
+
const hasText = !!step.text;
|
|
1932
|
+
const thinkingSubSteps = hasThinking ? 1 : 0;
|
|
1933
|
+
const toolSubSteps = toolCallCount > 0 ? toolCallCount : 0;
|
|
1934
|
+
const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
|
|
1935
|
+
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
1936
|
+
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
1937
|
+
subSteps.push({
|
|
1938
|
+
id: (0, import_crypto.randomUUID)(),
|
|
1939
|
+
stepNumber: 0,
|
|
1940
|
+
// renumbered below
|
|
1941
|
+
turnIndex,
|
|
1942
|
+
type: import_evalforge_types4.LLMStepType.THINKING,
|
|
1943
|
+
model,
|
|
1944
|
+
provider: "anthropic",
|
|
1945
|
+
startedAt: step.startedAt.toISOString(),
|
|
1946
|
+
durationMs: Math.round(step.durationMs / totalSubSteps),
|
|
1947
|
+
tokenUsage: {
|
|
1948
|
+
prompt: Math.round(stepPromptTokens / totalSubSteps),
|
|
1949
|
+
completion: Math.round(stepOutputTokens / totalSubSteps),
|
|
1950
|
+
total: Math.round(
|
|
1951
|
+
(stepPromptTokens + stepOutputTokens) / totalSubSteps
|
|
1952
|
+
)
|
|
1953
|
+
},
|
|
1954
|
+
costUsd: stepCost / totalSubSteps,
|
|
1955
|
+
outputPreview: step.thinking?.slice(0, 200),
|
|
1956
|
+
success: isSuccess,
|
|
1957
|
+
error: errorMsg
|
|
1958
|
+
});
|
|
1959
|
+
}
|
|
1960
|
+
if (toolCallCount > 0) {
|
|
1961
|
+
for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
|
|
1962
|
+
const tc = step.toolCalls[tcIdx];
|
|
1963
|
+
const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
|
|
1964
|
+
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
1965
|
+
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
1966
|
+
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
1967
|
+
subSteps.push({
|
|
1968
|
+
id: (0, import_crypto.randomUUID)(),
|
|
1969
|
+
stepNumber: 0,
|
|
1970
|
+
turnIndex,
|
|
1971
|
+
type: import_evalforge_types4.LLMStepType.TOOL_USE,
|
|
1972
|
+
model,
|
|
1973
|
+
provider: "anthropic",
|
|
1974
|
+
startedAt: step.startedAt.toISOString(),
|
|
1975
|
+
durationMs: isLast ? step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(step.durationMs * remainingFraction * toolFraction),
|
|
1976
|
+
tokenUsage: {
|
|
1977
|
+
prompt: Math.round(
|
|
1978
|
+
stepPromptTokens * remainingFraction * toolFraction
|
|
1979
|
+
),
|
|
1980
|
+
completion: Math.round(
|
|
1981
|
+
stepOutputTokens * remainingFraction * toolFraction
|
|
1982
|
+
),
|
|
1983
|
+
total: Math.round(
|
|
1984
|
+
(stepPromptTokens + stepOutputTokens) * remainingFraction * toolFraction
|
|
1985
|
+
)
|
|
1986
|
+
},
|
|
1987
|
+
costUsd: stepCost * remainingFraction * toolFraction,
|
|
1988
|
+
toolName: tc.toolName,
|
|
1989
|
+
toolArguments: JSON.stringify(tc.args),
|
|
1990
|
+
outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
|
|
1991
|
+
success: isSuccess,
|
|
1992
|
+
error: errorMsg
|
|
1993
|
+
});
|
|
1994
|
+
}
|
|
1995
|
+
}
|
|
1996
|
+
if (hasText && toolCallCount > 0) {
|
|
1997
|
+
subSteps.push({
|
|
1998
|
+
id: (0, import_crypto.randomUUID)(),
|
|
1999
|
+
stepNumber: 0,
|
|
2000
|
+
turnIndex,
|
|
2001
|
+
type: import_evalforge_types4.LLMStepType.COMPLETION,
|
|
2002
|
+
model,
|
|
2003
|
+
provider: "anthropic",
|
|
2004
|
+
startedAt: step.startedAt.toISOString(),
|
|
2005
|
+
durationMs: step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
|
|
2006
|
+
tokenUsage: {
|
|
2007
|
+
prompt: stepPromptTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
|
|
2008
|
+
completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
|
|
2009
|
+
total: stepPromptTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
|
|
2010
|
+
},
|
|
2011
|
+
costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
|
|
2012
|
+
outputPreview: step.text?.slice(0, 200),
|
|
2013
|
+
success: isSuccess,
|
|
2014
|
+
error: errorMsg
|
|
2015
|
+
});
|
|
2016
|
+
}
|
|
2017
|
+
if (subSteps.length === 0) {
|
|
2018
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
2019
|
+
subSteps.push({
|
|
2020
|
+
id: (0, import_crypto.randomUUID)(),
|
|
2021
|
+
stepNumber: 0,
|
|
2022
|
+
turnIndex,
|
|
2023
|
+
type: stepType,
|
|
2024
|
+
model,
|
|
2025
|
+
provider: "anthropic",
|
|
2026
|
+
startedAt: step.startedAt.toISOString(),
|
|
2027
|
+
durationMs: step.durationMs,
|
|
2028
|
+
tokenUsage: {
|
|
2029
|
+
prompt: stepPromptTokens,
|
|
2030
|
+
completion: stepOutputTokens,
|
|
2031
|
+
total: stepPromptTokens + stepOutputTokens
|
|
2032
|
+
},
|
|
2033
|
+
costUsd: stepCost,
|
|
2034
|
+
outputPreview: (step.text || step.thinking)?.slice(0, 200),
|
|
2035
|
+
success: isSuccess,
|
|
2036
|
+
error: errorMsg
|
|
2037
|
+
});
|
|
2038
|
+
}
|
|
2039
|
+
return subSteps;
|
|
2040
|
+
}).map((s, i) => ({ ...s, stepNumber: i + 1 }));
|
|
1942
2041
|
const finalTokens = {
|
|
1943
2042
|
prompt: usage.inputTokens,
|
|
1944
2043
|
completion: usage.outputTokens,
|
|
@@ -1960,6 +2059,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1960
2059
|
}
|
|
1961
2060
|
const summary = {
|
|
1962
2061
|
totalSteps: traceSteps.length,
|
|
2062
|
+
totalTurns: steps.length,
|
|
1963
2063
|
totalDurationMs,
|
|
1964
2064
|
totalTokens: finalTokens,
|
|
1965
2065
|
totalCostUsd: totalCost,
|
|
@@ -2045,12 +2145,1055 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
2045
2145
|
// src/run-scenario/agents/claude-code/index.ts
|
|
2046
2146
|
defaultRegistry.register(claudeCodeAdapter);
|
|
2047
2147
|
|
|
2148
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
2149
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
2150
|
+
|
|
2151
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2152
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
2153
|
+
|
|
2154
|
+
// src/run-scenario/agents/opencode/write-skills.ts
|
|
2155
|
+
var import_promises7 = require("fs/promises");
|
|
2156
|
+
var import_path8 = require("path");
|
|
2157
|
+
var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
|
|
2158
|
+
/**
 * Writes every skill into the working directory, starting all writes at once
 * and resolving when the last one finishes.
 *
 * @param {string} cwd - Working directory that receives the skill folders.
 * @param {Array<object>} skills - Skills to materialize.
 * @param {Function} [fetchFn] - Fetcher handed to each per-skill write;
 *   defaults to the GitHub client's `fetchGitHubFolder`.
 * @returns {Promise<void>} Rejects as soon as any single skill write rejects.
 */
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
  const pendingWrites = skills.map((entry) => {
    return writeSkillToFilesystem2(cwd, entry, fetchFn);
  });
  await Promise.all(pendingWrites);
}
|
|
2163
|
+
/**
 * Materializes a single skill under `<cwd>/.opencode/skills/<skill.name>`.
 *
 * Files are taken from `skill.latestVersion.files` when that list is
 * non-empty; otherwise they are fetched through `fetchFn(skill.source, ...)`.
 * In both cases the actual writing is delegated to `writeFilesToDirectory`.
 *
 * @param {string} cwd - Working directory that receives the skill folder.
 * @param {object} skill - Skill with `name`, optional `latestVersion.files`
 *   and optional `source`.
 * @param {Function} fetchFn - Called as `fetchFn(source, { userAgent })` and
 *   expected to resolve to the list of files to write.
 * @returns {Promise<void>}
 * @throws {Error} When the skill has neither snapshot files nor a source, or
 *   when fetching/writing the live files fails (the underlying error is kept
 *   on `cause`).
 */
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
  const skillName = skill.name;
  const skillDir = (0, import_path8.join)(cwd, ".opencode", "skills", skillName);
  await (0, import_promises7.mkdir)(skillDir, { recursive: true });
  const version = skill.latestVersion;
  if (version?.files && version.files.length > 0) {
    await writeFilesToDirectory(skillDir, version.files);
    console.log(
      `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
    );
  } else if (skill.source) {
    try {
      const files = await fetchFn(skill.source, {
        userAgent: "EvalForge-Evaluator"
      });
      await writeFilesToDirectory(skillDir, files);
      console.log(
        `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
      );
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
      );
      // Keep the original failure (and its stack) reachable for debugging.
      throw new Error(
        `Failed to write skill ${skillName} to filesystem: ${message}`,
        { cause: error }
      );
    }
  } else {
    throw new Error(`Skill ${skillName} has no files and no source configured`);
  }
}
|
|
2195
|
+
|
|
2196
|
+
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
2197
|
+
var import_promises8 = require("fs/promises");
|
|
2198
|
+
var import_path9 = require("path");
|
|
2199
|
+
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
2200
|
+
var AGENTS_DIR2 = ".opencode/agents";
|
|
2201
|
+
/**
 * Derives a unique, filesystem-friendly file stem for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become "-", every character
 * outside `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed.
 * An empty result falls back to `sub-agent-<index>`. Repeated stems receive
 * a numeric suffix (`-2`, `-3`, ...).
 *
 * Suffixed names that are handed out are also recorded in `nameCount`, and a
 * suffix already taken by another agent's literal name is skipped, so two
 * agents can never map to the same stem (e.g. "foo", "foo", "foo-2").
 *
 * @param {string} name - Human-readable agent name; may be empty or nullish.
 * @param {number} index - Position of the agent, used for the fallback stem.
 * @param {Map<string, number>} nameCount - Shared registry of stems already
 *   issued; mutated by this call.
 * @returns {string} The stem to use (without extension).
 */
function toAgentFilename2(name, index, nameCount) {
  const slug = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "");
  const base = slug || `sub-agent-${index}`;
  let count = nameCount.get(base) ?? 0;
  if (count === 0) {
    nameCount.set(base, 1);
    return base;
  }
  let candidate = `${base}-${count + 1}`;
  // Skip suffixes that collide with a stem issued for a different agent.
  while (nameCount.has(candidate)) {
    count += 1;
    candidate = `${base}-${count + 1}`;
  }
  nameCount.set(base, count + 1);
  // Reserve the suffixed stem so a later literal name cannot reuse it.
  nameCount.set(candidate, 1);
  return candidate;
}
|
|
2207
|
+
/**
 * Resolves the markdown body for a sub-agent.
 *
 * When `agent.source` is set the content is fetched through
 * `fetchFn(agent.source, { userAgent })`; otherwise the inline
 * `agent.subAgentMd` is used. Missing inline content produces a warning and
 * an empty string, so the caller really does write a blank file as the
 * warning announces (instead of receiving `undefined`).
 *
 * @param {object} agent - Sub-agent with `name`, optional `source`
 *   (`owner`, `repo`, `path`, `ref`) and optional inline `subAgentMd`.
 * @param {Function} fetchFn - Fetcher used for sourced agents.
 * @returns {Promise<string>} The agent file content.
 * @throws {Error} When the fetch fails; the underlying error is kept on
 *   `cause`.
 */
async function resolveSubAgentContent2(agent, fetchFn) {
  if (agent.source) {
    try {
      const content = await fetchFn(agent.source, {
        userAgent: "EvalForge-Evaluator"
      });
      console.log(
        `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
      );
      return content;
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
      );
      // Keep the original failure (and its stack) reachable for debugging.
      throw new Error(
        `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`,
        { cause: error }
      );
    }
  }
  if (!agent.subAgentMd) {
    console.warn(
      `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
    );
  }
  // Normalize null/undefined to "" so the result is always writable text.
  return agent.subAgentMd ?? "";
}
|
|
2234
|
+
/**
 * Writes one markdown file per sub-agent into `<cwd>/.opencode/agents`.
 *
 * Does nothing for an empty list. Agents are processed one after another:
 * a stem is chosen via `toAgentFilename2`, the content is resolved via
 * `resolveSubAgentContent2`, and the result is written as UTF-8.
 *
 * @param {string} cwd - Working directory that receives the agents folder.
 * @param {Array<object>} subAgents - Sub-agents to write.
 * @param {Function} [fetchFn] - Fetcher for sourced agents; defaults to the
 *   GitHub client's `fetchGitHubFile`.
 * @returns {Promise<void>}
 */
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = (0, import_path9.join)(cwd, AGENTS_DIR2);
  await (0, import_promises8.mkdir)(targetDir, { recursive: true });

  // Shared across the loop so repeated names get distinct stems.
  const issuedStems = /* @__PURE__ */ new Map();
  let position = 0;
  for (const agent of subAgents) {
    const stem = toAgentFilename2(agent.name, position, issuedStems);
    position += 1;
    const destination = (0, import_path9.join)(targetDir, `${stem}.md`);
    const markdown = await resolveSubAgentContent2(agent, fetchFn);
    await (0, import_promises8.writeFile)(destination, markdown, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
|
|
2247
|
+
|
|
2248
|
+
// src/run-scenario/agents/opencode/config.ts
|
|
2249
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
2250
|
+
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2251
|
+
/**
 * Splits a model string into provider and model identifiers.
 *
 * A string with a "/" after its first character is treated as
 * `<providerID>/<modelID>`, split at the first slash. Anything else is a bare
 * model id whose provider is "openai" when the id is listed in
 * `AVAILABLE_OPENAI_MODEL_IDS`, and "anthropic" otherwise.
 *
 * @param {string} model - Model string, with or without a provider prefix.
 * @returns {{ providerID: string, modelID: string }}
 */
function parseModel(model) {
  const separatorAt = model.indexOf("/");
  if (separatorAt <= 0) {
    const listedAsOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(model);
    const providerID = listedAsOpenAI ? "openai" : "anthropic";
    return { providerID, modelID: model };
  }
  const providerID = model.slice(0, separatorAt);
  const modelID = model.slice(separatorAt + 1);
  return { providerID, modelID };
}
|
|
2264
|
+
/**
 * Converts a map of MCP server definitions into the shape used for the
 * OpenCode `mcp` config section.
 *
 * Per entry, the first matching rule wins:
 *  - `type` is already "local" or "remote": kept as-is (same object);
 *  - a non-empty string `url`: becomes a "remote" entry, carrying over
 *    `headers` when present;
 *  - a non-empty string `command`: becomes a "local" entry whose `command`
 *    is `[command, ...args]`, with `env` exposed as `environment`;
 *  - anything else: a warning is logged and the entry is kept as-is.
 * A boolean `enabled` flag is copied onto converted entries.
 *
 * @param {Record<string, object>} servers - Server definitions keyed by name.
 * @returns {Record<string, object>} Converted definitions, in the same order.
 */
function toOpenCodeMcpConfig(servers) {
  const converted = {};
  for (const name of Object.keys(servers)) {
    const entry = servers[name];
    const enabledFlag = typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {};

    if (entry.type === "local" || entry.type === "remote") {
      converted[name] = entry;
    } else if (entry.url && typeof entry.url === "string") {
      const headerPart = entry.headers ? { headers: entry.headers } : {};
      converted[name] = { type: "remote", url: entry.url, ...headerPart, ...enabledFlag };
    } else if (entry.command && typeof entry.command === "string") {
      const environmentPart = entry.env ? { environment: entry.env } : {};
      converted[name] = {
        type: "local",
        command: [entry.command, ...(entry.args || [])],
        ...environmentPart,
        ...enabledFlag
      };
    } else {
      console.warn(
        `[MCP] Server "${name}" has unrecognized format, passing through as-is:`,
        JSON.stringify(entry)
      );
      converted[name] = entry;
    }
  }
  return converted;
}
|
|
2301
|
+
/**
 * Assembles the OpenCode configuration for a run.
 *
 * - The model comes from `options.model` (or `DEFAULT_MODEL2`) and is
 *   normalized through `parseModel` to `<providerID>/<modelID>`.
 * - When `options.aiGatewayUrl` is set, the provider is pointed at
 *   `<aiGatewayUrl>/proxy/<providerID>` with a placeholder API key and a copy
 *   of `options.aiGatewayHeaders` (if any).
 * - MCP entities are flattened into one server map, passed through
 *   `resolveMcpPlaceholders` and converted with `toOpenCodeMcpConfig`.
 * - `temperature` and `maxTurns` (as `maxSteps`) become overrides of the
 *   `build` agent when provided.
 *
 * @param {object} options - Reads `model`, `aiGatewayUrl`, `aiGatewayHeaders`,
 *   `mcps`, `cwd`, `temperature` and `maxTurns`.
 * @returns {Promise<{ config: object, providerID: string, modelID: string }>}
 * @throws {Error} When an MCP entity's config holds a value that is not a
 *   plain (non-null, non-array) object.
 */
async function buildOpenCodeConfig(options) {
  const { providerID, modelID } = parseModel(options.model || DEFAULT_MODEL2);

  const provider = {};
  if (options.aiGatewayUrl) {
    const gatewayOptions = {
      baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
      apiKey: "sk-placeholder-auth-handled-by-gateway"
    };
    if (options.aiGatewayHeaders) {
      gatewayOptions.headers = { ...options.aiGatewayHeaders };
    }
    provider[providerID] = { options: gatewayOptions };
  }

  let mcp;
  if (options.mcps && options.mcps.length > 0) {
    const mergedServers = {};
    for (const mcpEntity of options.mcps) {
      for (const [key, value] of Object.entries(mcpEntity.config)) {
        const isPlainObject = typeof value === "object" && value !== null && !Array.isArray(value);
        if (!isPlainObject) {
          throw new Error(
            `MCP "${mcpEntity.name}" has invalid config: value for key "${key}" must be an object (got ${typeof value}).`
          );
        }
        // Later entities win when two of them declare the same server key.
        mergedServers[key] = value;
      }
    }
    const resolvedServers = await resolveMcpPlaceholders(mergedServers, {
      cwd: options.cwd
    });
    mcp = toOpenCodeMcpConfig(resolvedServers);
  }

  const buildAgentOverrides = {};
  if (options.temperature != null) {
    buildAgentOverrides.temperature = options.temperature;
  }
  if (options.maxTurns != null) {
    buildAgentOverrides.maxSteps = options.maxTurns;
  }

  // Properties are added in a fixed order: model, provider, agent, permission, mcp.
  const config = {
    model: `${providerID}/${modelID}`,
    provider
  };
  if (Object.keys(buildAgentOverrides).length > 0) {
    config.agent = { build: buildAgentOverrides };
  }
  config.permission = {
    edit: "allow",
    bash: "allow",
    webfetch: "allow",
    doom_loop: "allow",
    external_directory: "allow"
  };
  if (mcp) {
    config.mcp = mcp;
  }
  return { config, providerID, modelID };
}
|
|
2358
|
+
|
|
2359
|
+
// src/run-scenario/agents/opencode/build-trace.ts
|
|
2360
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2361
|
+
var import_crypto2 = require("crypto");
|
|
2362
|
+
/**
 * Builds an LLM trace (steps plus summary) from OpenCode session messages.
 *
 * Only assistant messages are considered; each one is a "turn". A turn is
 * expanded into sub-steps in this order: one THINKING step (when reasoning is
 * accompanied by text or tool calls), one TOOL_USE step per tool part, and
 * one COMPLETION step (when text accompanies tool calls). A turn that yields
 * none of these gets a single step carrying the whole turn. Duration, tokens
 * and cost of the turn are divided between its sub-steps; the last sub-step
 * absorbs the duration remainder, and a trailing COMPLETION step also absorbs
 * the token and cost remainders.
 *
 * Per-turn usage is summed from `step-finish` parts and falls back to the
 * message-level `info.tokens` / `info.cost` when those sums are both zero.
 *
 * NOTE(review): a turn with reasoning and text but no tool calls produces
 * only a THINKING sub-step (no COMPLETION) - confirm this is intended.
 *
 * @param {Array<{info: object, parts: Array<object>}>} messages - Session messages.
 * @param {number} totalDurationMs - Wall-clock duration of the whole run.
 * @param {string} model - Fallback model id when a message carries none.
 * @param {string} provider - Fallback provider id when a message carries none.
 * @returns {{ id: string, steps: Array<object>, summary: object }}
 */
function buildLLMTrace(messages, totalDurationMs, model, provider) {
  const assistantMessages = messages.filter(({ info }) => info.role === "assistant");

  // Expands one assistant message into its ordered list of sub-steps.
  const expandTurn = ({ info, parts }, turnIndex) => {
    const toolCalls = [];
    let text = "";
    let thinking = "";
    let inputTokens = 0;
    let outputTokens = 0;
    let cost = 0;
    let finishReason = "unknown";

    for (const part of parts) {
      const kind = part.type;
      if (kind === "text") {
        text += part.text;
      } else if (kind === "reasoning") {
        thinking += part.text;
      } else if (kind === "tool") {
        toolCalls.push({ toolName: part.tool, args: part.state.input });
      } else if (kind === "step-finish") {
        inputTokens += part.tokens.input;
        outputTokens += part.tokens.output;
        cost += part.cost;
        finishReason = part.reason;
      }
    }
    if (inputTokens === 0 && outputTokens === 0) {
      // No usable step-finish usage: fall back to the message-level numbers.
      inputTokens = info.tokens.input;
      outputTokens = info.tokens.output;
      cost = info.cost;
    }

    const createdAt = info.time.created;
    const startedAt = new Date(createdAt).toISOString();
    let completedAt;
    if (info.time.completed) {
      completedAt = info.time.completed;
    } else if (turnIndex + 1 < assistantMessages.length) {
      // Unfinished turn: assume it ran until the next assistant message began.
      completedAt = assistantMessages[turnIndex + 1].info.time.created;
    } else {
      completedAt = createdAt + totalDurationMs;
    }
    const durationMs = Math.max(0, completedAt - createdAt);

    const failed = finishReason === "error";
    const stepModel = info.modelID || model;
    const stepProvider = info.providerID || provider;
    const toolCount = toolCalls.length;
    const hasThinking = thinking !== "";
    const hasText = text !== "";
    const thinkingSlots = hasThinking && (hasText || toolCount > 0) ? 1 : 0;
    const textSlots = hasText && toolCount > 0 ? 1 : 0;
    const slotCount = thinkingSlots + toolCount + textSlots || 1;
    const combinedTokens = inputTokens + outputTokens;

    const subSteps = [];
    // Sum of one numeric field over the sub-steps emitted so far.
    const alreadySpent = (pick) => subSteps.reduce((acc, sub) => acc + pick(sub), 0);
    const makeSubStep = (type, duration, tokenUsage, costUsd, extras) => ({
      id: (0, import_crypto2.randomUUID)(),
      stepNumber: 0,
      // placeholder; sequential numbers are assigned after flattening
      turnIndex,
      type,
      model: stepModel,
      provider: stepProvider,
      startedAt,
      durationMs: duration,
      tokenUsage,
      costUsd,
      ...extras,
      success: !failed,
      error: failed ? "Generation failed" : void 0
    });

    if (thinkingSlots === 1) {
      subSteps.push(
        makeSubStep(
          import_evalforge_types7.LLMStepType.THINKING,
          Math.round(durationMs / slotCount),
          {
            prompt: Math.round(inputTokens / slotCount),
            completion: Math.round(outputTokens / slotCount),
            total: Math.round(combinedTokens / slotCount)
          },
          cost / slotCount,
          { outputPreview: thinking.slice(0, 200) }
        )
      );
    }

    // Share of the turn left after the thinking slot, split evenly between
    // the tool calls and the optional trailing text step.
    const toolBudgetSlots = toolCount + textSlots;
    const toolFraction = toolBudgetSlots > 0 ? 1 / toolBudgetSlots : 1;
    const remainingFraction = (slotCount - thinkingSlots) / slotCount;
    toolCalls.forEach((call, position) => {
      const takesRemainder = position === toolCount - 1 && textSlots === 0;
      const duration = takesRemainder
        ? durationMs - alreadySpent((sub) => sub.durationMs)
        : Math.round(durationMs * remainingFraction * toolFraction);
      subSteps.push(
        makeSubStep(
          import_evalforge_types7.LLMStepType.TOOL_USE,
          duration,
          {
            prompt: Math.round(inputTokens * remainingFraction * toolFraction),
            completion: Math.round(outputTokens * remainingFraction * toolFraction),
            total: Math.round(combinedTokens * remainingFraction * toolFraction)
          },
          cost * remainingFraction * toolFraction,
          {
            toolName: call.toolName,
            toolArguments: JSON.stringify(call.args),
            // Only the first tool step of a text-less turn carries a preview.
            outputPreview: position === 0 && !hasText ? thinking.slice(0, 200) : void 0
          }
        )
      );
    });

    if (textSlots === 1) {
      // The closing text step takes whatever the earlier sub-steps left over.
      subSteps.push(
        makeSubStep(
          import_evalforge_types7.LLMStepType.COMPLETION,
          durationMs - alreadySpent((sub) => sub.durationMs),
          {
            prompt: inputTokens - alreadySpent((sub) => sub.tokenUsage.prompt),
            completion: outputTokens - alreadySpent((sub) => sub.tokenUsage.completion),
            total: combinedTokens - alreadySpent((sub) => sub.tokenUsage.total)
          },
          cost - alreadySpent((sub) => sub.costUsd),
          { outputPreview: text.slice(0, 200) }
        )
      );
    }

    if (subSteps.length === 0) {
      const onlyThinking = hasThinking && !hasText;
      subSteps.push(
        makeSubStep(
          onlyThinking ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION,
          durationMs,
          { prompt: inputTokens, completion: outputTokens, total: combinedTokens },
          cost,
          { outputPreview: (text || thinking).slice(0, 200) }
        )
      );
    }
    return subSteps;
  };

  const allSteps = assistantMessages
    .flatMap(expandTurn)
    .map((step, position) => ({ ...step, stepNumber: position + 1 }));

  const totalTokens = buildTotalTokens(assistantMessages);
  let totalCost = 0;
  for (const { info } of assistantMessages) {
    totalCost += info.cost;
  }

  const stepTypeBreakdown = {};
  for (const step of allSteps) {
    const bucket = stepTypeBreakdown[step.type] ?? {
      count: 0,
      durationMs: 0,
      tokens: 0,
      costUsd: 0
    };
    bucket.count += 1;
    bucket.durationMs += step.durationMs;
    bucket.tokens += step.tokenUsage.total;
    bucket.costUsd += step.costUsd;
    stepTypeBreakdown[step.type] = bucket;
  }

  // The whole run is attributed to the first step's model.
  const modelUsed = allSteps[0]?.model || model;
  const summary = {
    totalSteps: allSteps.length,
    totalTurns: assistantMessages.length,
    totalDurationMs,
    totalTokens,
    totalCostUsd: totalCost,
    modelBreakdown: {
      [modelUsed]: {
        count: allSteps.length,
        durationMs: totalDurationMs,
        tokens: totalTokens.total,
        costUsd: totalCost
      }
    },
    modelsUsed: [modelUsed],
    stepTypeBreakdown
  };

  return {
    id: (0, import_crypto2.randomUUID)(),
    steps: allSteps,
    summary
  };
}
|
|
2573
|
+
/**
 * Sums the message-level token usage of the given messages.
 *
 * @param {Iterable<{info: {tokens: {input: number, output: number}}}>} assistantMessages
 * @returns {{ prompt: number, completion: number, total: number }} Input
 *   tokens as `prompt`, output tokens as `completion`, and their sum.
 */
function buildTotalTokens(assistantMessages) {
  const sums = { prompt: 0, completion: 0 };
  for (const message of assistantMessages) {
    const usage = message.info.tokens;
    sums.prompt += usage.input;
    sums.completion += usage.output;
  }
  return {
    prompt: sums.prompt,
    completion: sums.completion,
    total: sums.prompt + sums.completion
  };
}
|
|
2582
|
+
|
|
2583
|
+
// src/run-scenario/agents/opencode/build-conversation.ts
|
|
2584
|
+
/**
 * Converts OpenCode session messages into a flat conversation transcript.
 *
 * Assistant messages contribute `text`, `thinking` (from reasoning parts) and
 * `tool_use` blocks. User messages contribute `text` blocks and `tool_result`
 * blocks for tool parts whose state is "completed" (output) or "error"
 * (error text, flagged with `isError`). Other part types, other tool states,
 * other roles and messages that end up with no content are omitted.
 *
 * @param {Array<{info: object, parts: Array<object>}>} messages - Session messages.
 * @returns {Array<{role: string, content: Array<object>, timestamp: string}>}
 *   Entries in message order; `timestamp` is the ISO form of `info.time.created`.
 */
function buildConversation2(messages) {
  const transcript = [];
  for (const message of messages) {
    const { info, parts } = message;
    const timestamp = new Date(info.time.created).toISOString();
    const role = info.role;
    if (role !== "assistant" && role !== "user") {
      continue;
    }

    const content = [];
    if (role === "assistant") {
      for (const part of parts) {
        if (part.type === "text") {
          content.push({ type: "text", text: part.text });
        } else if (part.type === "reasoning") {
          content.push({ type: "thinking", thinking: part.text });
        } else if (part.type === "tool") {
          content.push({
            type: "tool_use",
            toolName: part.tool,
            toolId: part.callID,
            input: part.state.input
          });
        }
      }
    } else {
      for (const part of parts) {
        if (part.type === "text") {
          content.push({ type: "text", text: part.text });
          continue;
        }
        if (part.type !== "tool") {
          continue;
        }
        const toolState = part.state;
        if (toolState.status === "completed") {
          content.push({
            type: "tool_result",
            toolUseId: part.callID,
            content: toolState.output
          });
        } else if (toolState.status === "error") {
          content.push({
            type: "tool_result",
            toolUseId: part.callID,
            content: toolState.error,
            isError: true
          });
        }
      }
    }

    if (content.length > 0) {
      transcript.push({ role, content, timestamp });
    }
  }
  return transcript;
}
|
|
2651
|
+
|
|
2652
|
+
// src/run-scenario/agents/opencode/execute.ts
|
|
2653
|
+
// Fallback "anthropic/<model id>" string; executeWithOpenCode uses it for the
// trace's model label when options.model is not set.
var DEFAULT_MODEL3 = `anthropic/${import_evalforge_types8.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2654
|
+
/**
 * Builds a short human-readable label describing a tool invocation, for
 * progress display.
 *
 * Tool names are matched case-insensitively so that both capitalised and
 * lowercase tool names (e.g. "Task" / "task", "Bash" / "bash") produce the
 * same label.
 *
 * @param {string} toolName - Name of the tool being invoked.
 * @param {object} [args] - Tool input; `description`, `command`, `file_path`,
 *   `path` and `target_file` are inspected when present.
 * @returns {string} A label such as "Task: …", "Running: …", "Writing: …",
 *   "Reading: …" or "Using <tool>...".
 */
function extractToolAction(toolName, args) {
  if (!toolName) return "Using tool...";
  const a = args;
  const normalizedName = String(toolName).toLowerCase();

  // Cuts `value` to `max` characters and appends "..." only when it was cut.
  const truncate = (value, max) => {
    const text = String(value);
    return text.length > max ? `${text.slice(0, max)}...` : text;
  };

  if ((normalizedName === "task" || normalizedName === "dispatch_agent") && a?.description) {
    return `Task: ${truncate(a.description, 55)}`;
  }
  if ((normalizedName === "bash" || normalizedName === "execute") && a?.command) {
    return `Running: ${truncate(a.command, 50)}`;
  }
  if (a?.file_path || a?.path || a?.target_file) {
    // File paths are cut to 50 characters without an ellipsis.
    const filePath = String(a.file_path || a.path || a.target_file).slice(0, 50);
    if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
    if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
  }
  return `Using ${toolName}...`;
}
|
|
2675
|
+
/**
 * Converts one streamed message part into a live trace event.
 *
 * @param {object} part - Message part; `type` selects the event kind.
 * @param {object} context - Trace context supplying evalRunId, scenarioId,
 *   scenarioName, targetId and targetName.
 * @param {number} stepNumber - Step number stamped on the event.
 * @param {boolean} isComplete - Completion flag stamped on the event.
 * @returns {object|null} The trace event, or null for part types that are not
 *   traced.
 */
function createTraceEventFromPart(part, context, stepNumber, isComplete) {
  const base = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: new Date().toISOString(),
    isComplete
  };
  switch (part.type) {
    case "text":
      return {
        ...base,
        type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
        outputPreview: String(part.text ?? "").slice(0, 500)
      };
    case "reasoning":
      return {
        ...base,
        type: import_evalforge_types8.LiveTraceEventType.THINKING,
        thinking: String(part.text ?? "").slice(0, 500)
      };
    case "tool": {
      const toolName = part.tool;
      // A tool part may arrive before its input is populated; treat a missing
      // state/input as "no arguments" instead of throwing.
      const args = part.state?.input;
      // JSON.stringify(undefined) yields undefined, hence the fallback.
      const toolArgs = (JSON.stringify(args) ?? "").slice(0, 500);
      let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
      let filePath;
      const rawPath = args?.file_path || args?.path || args?.target_file;
      if (rawPath) {
        filePath = String(rawPath);
        if (/write|edit/i.test(toolName)) {
          type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
        } else if (/read|view/i.test(toolName)) {
          type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
        }
      }
      return { ...base, type, toolName, toolArgs, filePath };
    }
    case "step-finish":
      return {
        ...base,
        type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
        outputPreview: "Step completed"
      };
    default:
      return null;
  }
}
|
|
2731
|
+
/**
 * Runs one scenario through the OpenCode SDK and returns its result, LLM trace
 * and conversation.
 *
 * Steps: write sub-agents, rules and skills into `options.cwd`; build the
 * OpenCode config; start a local OpenCode server and session; optionally
 * stream live trace events and a 10-second heartbeat when
 * `options.traceContext` is set; send the scenario's trigger prompt, racing it
 * against a timeout of `max(300000, maxTurns * 60000)` ms; then read the
 * session history and assemble the result.
 *
 * @param {Array<{name: string}>} skills - Skills written to the filesystem before execution.
 * @param {{id: string, name: string, triggerPrompt: string}} scenario - Scenario to run.
 * @param {object} options - Execution options (cwd, model, temperature,
 *   maxTurns, aiGatewayUrl, aiGatewayHeaders, traceContext, mcps, subAgents,
 *   rules, systemPrompt).
 * @returns {Promise<{result: {outputText: string, durationMs: number, usage: object, costUsd: (number|undefined)}, llmTrace: object, conversation: object[]}>}
 * @throws {Error} When skills cannot be written, or when the SDK run fails or
 *   times out; the original error is attached as `cause`.
 */
async function executeWithOpenCode(skills, scenario, options) {
  const skillNames = skills.map((s) => s.name).join(", ");
  console.log("[executeWithOpenCode] Starting execution", {
    skillCount: skills.length,
    skillNames,
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    cwd: options.cwd,
    aiGatewayUrl: options.aiGatewayUrl,
    hasAiGatewayHeaders: !!options.aiGatewayHeaders,
    model: options.model
  });
  const startTime = new Date();
  if (options.mcps && options.mcps.length > 0) {
    console.log(
      `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
    );
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(options.cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem2(options.cwd, skills);
  } catch (writeError) {
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
  const maxTurns = options.maxTurns ?? 10;
  const { config, providerID, modelID } = await buildOpenCodeConfig({
    model: options.model,
    temperature: options.temperature,
    maxTurns,
    aiGatewayUrl: options.aiGatewayUrl,
    aiGatewayHeaders: options.aiGatewayHeaders,
    mcps: options.mcps,
    cwd: options.cwd
  });
  const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
  const abortController = new AbortController();
  let timeoutHandle;
  let heartbeatHandle;
  let eventStreamAbort;
  let timedOut = false;
  const traceContext = options.traceContext;
  let traceStepNumber = 0;
  let lastAction = "Starting...";
  let lastToolName;
  let lastFilePath;

  // Fields shared by every trace event emitted from this function.
  const traceIdentity = traceContext ? {
    evalRunId: traceContext.evalRunId,
    scenarioId: traceContext.scenarioId,
    scenarioName: traceContext.scenarioName,
    targetId: traceContext.targetId,
    targetName: traceContext.targetName
  } : void 0;
  // Only called when traceContext is set.
  const pushTrace = (event) => emitTraceEvent(
    event,
    traceContext.tracePushUrl,
    traceContext.routeHeader,
    traceContext.authToken
  );
  // Stops the timeout, the heartbeat and the event-stream consumer. Safe to
  // call more than once; invoked on success, on failure and from `finally`.
  const stopBackgroundWork = () => {
    if (timeoutHandle) clearTimeout(timeoutHandle);
    if (heartbeatHandle) clearInterval(heartbeatHandle);
    if (eventStreamAbort) eventStreamAbort.abort();
  };

  if (traceContext) {
    pushTrace({
      ...traceIdentity,
      stepNumber: 0,
      type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
      outputPreview: JSON.stringify({
        event: "pre-sdk-execution",
        model: `${providerID}/${modelID}`,
        maxTurns,
        timestamp: new Date().toISOString()
      }),
      timestamp: new Date().toISOString(),
      isComplete: false
    });
  }
  let server;
  try {
    console.log("[SDK-DEBUG] Starting OpenCode server...");
    server = await createOpencodeServer({
      config,
      signal: abortController.signal,
      timeout: 3e4
    });
    console.log(`[SDK-DEBUG] Server started at ${server.url}`);
    const client = createOpencodeClient({
      baseUrl: server.url,
      directory: options.cwd
    });
    const session = await client.session.create({
      body: { title: `eval-${scenario.name}` }
    });
    if (!session.data) {
      const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
      throw new Error(
        `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
      );
    }
    const sessionId = session.data.id;
    console.log(`[SDK-DEBUG] Session created: ${sessionId}`);
    if (traceContext) {
      const streamAbort = new AbortController();
      eventStreamAbort = streamAbort;
      const executionStartTime = Date.now();

      // Translates one SDK stream event into trace output and progress state.
      const handleStreamEvent = (evt) => {
        if (evt.type === "message.part.updated") {
          const { part } = evt.properties;
          traceStepNumber++;
          const traceEvent = createTraceEventFromPart(
            part,
            traceContext,
            traceStepNumber,
            false
          );
          if (!traceEvent) return;
          lastToolName = traceEvent.toolName;
          lastFilePath = traceEvent.filePath;
          if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
            lastAction = "Thinking...";
          } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
            // Pass the tool input so command/description labels can be built.
            lastAction = extractToolAction(
              traceEvent.toolName ?? "",
              part.state?.input
            );
          } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
            lastAction = `Writing: ${traceEvent.filePath || "file"}`;
          } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
            lastAction = `Reading: ${traceEvent.filePath || "file"}`;
          } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
            lastAction = "Processing response...";
          }
          pushTrace(traceEvent);
        } else if (evt.type === "session.error") {
          const props = evt.properties;
          traceStepNumber++;
          pushTrace({
            ...traceIdentity,
            stepNumber: traceStepNumber,
            type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
            outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
              0,
              500
            ),
            timestamp: new Date().toISOString(),
            isComplete: false
          });
        }
      };

      // Fire-and-forget consumer: tracing is best-effort and must never fail
      // the run, so errors are logged rather than propagated.
      (async () => {
        try {
          const events = await client.event.subscribe();
          for await (const event of events.stream) {
            if (streamAbort.signal.aborted) break;
            try {
              handleStreamEvent(event);
            } catch (eventError) {
              // One malformed event must not end live tracing for the run.
              console.warn(
                "[SDK-DEBUG] Failed to process stream event:",
                eventError instanceof Error ? eventError.message : String(eventError)
              );
            }
          }
        } catch (streamError) {
          if (!streamAbort.signal.aborted) {
            console.warn(
              "[SDK-DEBUG] Event stream ended with error:",
              streamError instanceof Error ? streamError.message : String(streamError)
            );
          }
        }
      })();

      let lastReportedAction = "";
      let sameActionCount = 0;
      heartbeatHandle = setInterval(() => {
        const elapsedMs = Date.now() - executionStartTime;
        let progressMessage = lastAction;
        if (lastAction === lastReportedAction) {
          sameActionCount++;
        } else {
          sameActionCount = 1;
          lastReportedAction = lastAction;
        }
        const isTaskTool = lastToolName === "Task" || lastToolName === "dispatch_agent";
        if (isTaskTool && sameActionCount > 1) {
          progressMessage = `Waiting for ${lastAction}`;
        } else if (lastToolName && lastFilePath) {
          progressMessage = `${lastToolName}: ${lastFilePath}`;
        } else if (lastToolName && !isTaskTool) {
          progressMessage = `Using ${lastToolName}...`;
        }
        const elapsedSec = Math.round(elapsedMs / 1e3);
        progressMessage += ` (${elapsedSec}s, step ${traceStepNumber})`;
        pushTrace({
          ...traceIdentity,
          stepNumber: traceStepNumber,
          type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
          outputPreview: progressMessage,
          toolName: lastToolName,
          filePath: lastFilePath,
          elapsedMs,
          timestamp: new Date().toISOString(),
          isComplete: false
        });
      }, 1e4);
    }
    const promptPromise = (async () => {
      // undefined -> default evaluator prompt; null or "" -> no system prompt;
      // anything else -> the caller's prompt.
      let systemPrompt;
      if (options.systemPrompt === null || options.systemPrompt === "") {
        systemPrompt = void 0;
      } else if (options.systemPrompt != null) {
        systemPrompt = options.systemPrompt;
      } else {
        systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
      }
      console.log("[SDK-DEBUG] Sending prompt...");
      const result = await client.session.prompt({
        path: { id: sessionId },
        body: {
          model: { providerID, modelID },
          ...systemPrompt ? { system: systemPrompt } : {},
          parts: [{ type: "text", text: scenario.triggerPrompt }]
        }
      });
      return result;
    })();
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        // Best-effort abort of the remote session; the run fails regardless.
        client.session.abort({ path: { id: sessionId } }).catch(() => {
        });
        reject(
          new Error(
            `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
          )
        );
      }, SDK_TIMEOUT_MS);
    });
    const promptResult = await Promise.race([promptPromise, timeoutPromise]);
    stopBackgroundWork();
    if ("error" in promptResult && promptResult.error) {
      const errPayload = promptResult.error;
      throw new Error(
        `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
      );
    }
    console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
    const messagesResponse = await client.session.messages({
      path: { id: sessionId }
    });
    const allMessages = messagesResponse.data ?? [];
    console.log(
      `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
    );
    if (traceContext) {
      pushTrace({
        ...traceIdentity,
        stepNumber: traceStepNumber + 1,
        type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
        outputPreview: "Scenario execution completed",
        timestamp: new Date().toISOString(),
        isComplete: true
      });
    }
    const endTime = new Date();
    const totalDurationMs = endTime.getTime() - startTime.getTime();
    const resultData = promptResult.data;
    const lastAssistantInfo = resultData?.info;
    if (lastAssistantInfo?.error) {
      const err = lastAssistantInfo.error;
      throw new Error(
        `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
      );
    }
    let outputText = "";
    if (resultData?.parts) {
      for (const part of resultData.parts) {
        if (part.type === "text") {
          outputText += part.text;
        }
      }
    }
    // Fall back to the most recent assistant message in history that has text.
    if (!outputText && allMessages.length > 0) {
      for (let i = allMessages.length - 1; i >= 0; i--) {
        const msg = allMessages[i];
        if (msg.info.role === "assistant") {
          const assistantInfo = msg.info;
          if (assistantInfo.error) {
            throw new Error(
              `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
            );
          }
          for (const part of msg.parts) {
            if (part.type === "text") {
              outputText += part.text;
            }
          }
          if (outputText) break;
        }
      }
    }
    if (!outputText) {
      const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
      if (!hasAssistant) {
        throw new Error(
          `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
        );
      }
    }
    // Usage and cost are taken from the message returned by the prompt call.
    const usage = lastAssistantInfo ? {
      inputTokens: lastAssistantInfo.tokens.input,
      outputTokens: lastAssistantInfo.tokens.output,
      totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
    } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    const costUsd = lastAssistantInfo?.cost;
    const modelStr = options.model || DEFAULT_MODEL3;
    const llmTrace = buildLLMTrace(
      allMessages,
      totalDurationMs,
      modelStr,
      providerID
    );
    const conversation = buildConversation2(allMessages);
    return {
      result: {
        outputText,
        durationMs: totalDurationMs,
        usage,
        costUsd
      },
      llmTrace,
      conversation
    };
  } catch (sdkError) {
    stopBackgroundWork();
    if (timedOut) {
      console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
    }
    const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
    const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
    const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
    console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
    console.error("[SDK-ERROR] Error name:", errorName);
    console.error("[SDK-ERROR] Error message:", errorMessage);
    if (errorStack) {
      console.error("[SDK-ERROR] Stack:", errorStack);
    }
    if (traceContext) {
      pushTrace({
        ...traceIdentity,
        stepNumber: traceStepNumber + 1,
        type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
        outputPreview: JSON.stringify({
          event: "sdk-execution-failed",
          error: errorMessage,
          errorName
        }).slice(0, 2e3),
        timestamp: new Date().toISOString(),
        isComplete: true
      });
    }
    throw new Error(
      `OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `\nStack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : ""),
      { cause: sdkError }
    );
  } finally {
    // Safety net: timers and the stream consumer never outlive this call.
    stopBackgroundWork();
    if (server) {
      try {
        server.close();
        console.log("[SDK-DEBUG] OpenCode server closed");
      } catch {
        // Closing is best-effort; the run's outcome is already decided.
      }
    }
  }
}
|
|
3134
|
+
|
|
3135
|
+
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
3136
|
+
/**
 * Agent adapter that runs scenarios via `executeWithOpenCode`.
 *
 * `execute` repackages the adapter context into the options object expected by
 * `executeWithOpenCode` and flattens its `{ result, llmTrace, conversation }`
 * return value into a single result object.
 */
var OpenCodeAdapter = class {
  id = "opencode";
  name = "OpenCode";
  supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];

  /**
   * @param {object} context - Adapter context (skills, scenario, cwd,
   *   modelConfig, aiGatewayUrl, aiGatewayHeaders, traceContext, mcps,
   *   subAgents, rules, systemPrompt).
   * @returns {Promise<object>} outputText, durationMs, usage, costUsd,
   *   llmTrace and conversation.
   */
  async execute(context) {
    const modelConfig = context.modelConfig;
    const execution = await executeWithOpenCode(context.skills, context.scenario, {
      cwd: context.cwd,
      model: modelConfig?.model,
      temperature: modelConfig?.temperature,
      maxTurns: modelConfig?.maxTurns,
      aiGatewayUrl: context.aiGatewayUrl,
      aiGatewayHeaders: context.aiGatewayHeaders,
      traceContext: context.traceContext,
      mcps: context.mcps,
      subAgents: context.subAgents,
      rules: context.rules,
      systemPrompt: context.systemPrompt
    });
    const { outputText, durationMs, usage, costUsd } = execution.result;
    // Copy only the three known usage counters.
    const { inputTokens, outputTokens, totalTokens } = usage;
    return {
      outputText,
      durationMs,
      usage: { inputTokens, outputTokens, totalTokens },
      costUsd,
      llmTrace: execution.llmTrace,
      conversation: execution.conversation
    };
  }
};
|
|
3186
|
+
// Module-level OpenCodeAdapter instance, created once when this module loads.
var openCodeAdapter = new OpenCodeAdapter();
|
|
3187
|
+
|
|
3188
|
+
// src/run-scenario/agents/opencode/index.ts
|
|
3189
|
+
// Module-load side effect: registers the OpenCode adapter with the default registry.
defaultRegistry.register(openCodeAdapter);
|
|
3190
|
+
|
|
2048
3191
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
2049
3192
|
var import_ai = require("ai");
|
|
2050
3193
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
2051
3194
|
var import_openai = require("@ai-sdk/openai");
|
|
2052
|
-
var
|
|
2053
|
-
var
|
|
3195
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
3196
|
+
var import_crypto3 = require("crypto");
|
|
2054
3197
|
|
|
2055
3198
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
2056
3199
|
var import_mcp = require("@ai-sdk/mcp");
|
|
@@ -2145,48 +3288,35 @@ function extractErrorText(content) {
|
|
|
2145
3288
|
}
|
|
2146
3289
|
|
|
2147
3290
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
3291
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
2148
3292
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
2149
3293
|
// Per-model price table keyed by model id. calculateFromPricing multiplies
// `input` by prompt tokens / 1e6 and `output` by completion tokens / 1e6, so
// each figure is a price per one million tokens (presumably USD — the result
// feeds `costUsd`; confirm against the provider price lists).
// Keys are looked up by the normalized model id, exactly or as a prefix of it.
var MODEL_PRICING = {
  // Anthropic — Claude 4.6
  "claude-sonnet-4-6": { input: 3, output: 15 },
  "claude-opus-4-6": { input: 15, output: 75 },
  // Anthropic — Claude 4.5
  "claude-opus-4-5": { input: 5, output: 25 },
  "claude-sonnet-4-5": { input: 3, output: 15 },
  "claude-haiku-4-5": { input: 1, output: 5 },
  // Anthropic — Claude 4
  "claude-opus-4": { input: 15, output: 75 },
  "claude-sonnet-4": { input: 3, output: 15 },
  // OpenAI — GPT-5
  "gpt-5": { input: 1.25, output: 10 },
  "gpt-5-mini": { input: 0.25, output: 2 },
  "gpt-5-nano": { input: 0.05, output: 0.4 },
  // OpenAI — GPT-4.1
  "gpt-4.1": { input: 2, output: 8 },
  "gpt-4.1-mini": { input: 0.4, output: 1.6 },
  "gpt-4.1-nano": { input: 0.1, output: 0.4 },
  // OpenAI — GPT-4o
  "gpt-4o": { input: 2.5, output: 10 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  // OpenAI — Reasoning
  o3: { input: 2, output: 8 },
  "o4-mini": { input: 1.1, output: 4.4 },
  "o3-mini": { input: 1.1, output: 4.4 },
  o1: { input: 15, output: 60 }
};
|
|
2191
3321
|
function extractGatewayCost(step, provider) {
|
|
2192
3322
|
try {
|
|
@@ -2205,7 +3335,8 @@ function extractGatewayCost(step, provider) {
|
|
|
2205
3335
|
}
|
|
2206
3336
|
}
|
|
2207
3337
|
/**
 * Estimates the cost of a call from the static MODEL_PRICING table.
 *
 * The model id is normalized, then matched exactly; if there is no exact
 * entry, the LONGEST table key that is a prefix of the normalized id is used,
 * so e.g. a "gpt-5-mini-…" id resolves to "gpt-5-mini" rather than "gpt-5".
 *
 * @param {string} modelId - Raw model identifier.
 * @param {{prompt: number, completion: number}} tokenUsage - Token counts.
 * @returns {number} Cost per the table (prices are per 1e6 tokens), or 0 when
 *   the model has no pricing entry.
 */
function calculateFromPricing(modelId, tokenUsage) {
  const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
  // Own-property check so ids like "constructor" never hit Object.prototype.
  const hasEntry = (key) => Object.prototype.hasOwnProperty.call(MODEL_PRICING, key);

  let pricingKey = hasEntry(normalized) ? normalized : void 0;
  if (pricingKey === void 0) {
    for (const key of Object.keys(MODEL_PRICING)) {
      const isBetterPrefix = normalized.startsWith(key) && (pricingKey === void 0 || key.length > pricingKey.length);
      if (isBetterPrefix) {
        pricingKey = key;
      }
    }
  }

  const pricing = pricingKey === void 0 ? void 0 : MODEL_PRICING[pricingKey];
  if (!pricing) return 0;
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
}
|
|
@@ -2214,7 +3345,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
|
|
|
2214
3345
|
}
|
|
2215
3346
|
|
|
2216
3347
|
// src/run-scenario/agents/simple-agent/build-conversation.ts
|
|
2217
|
-
function
|
|
3348
|
+
function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
2218
3349
|
const messages = [];
|
|
2219
3350
|
messages.push({
|
|
2220
3351
|
role: "user",
|
|
@@ -2280,9 +3411,7 @@ var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
|
2280
3411
|
var PROVIDER_OPENAI = "openai";
|
|
2281
3412
|
var DEFAULT_MAX_TOOL_STEPS = 25;
|
|
2282
3413
|
function createModel(modelId, baseUrl, headers) {
|
|
2283
|
-
const isClaudeModel =
|
|
2284
|
-
modelId
|
|
2285
|
-
);
|
|
3414
|
+
const isClaudeModel = isClaudeModelId(modelId);
|
|
2286
3415
|
if (isClaudeModel) {
|
|
2287
3416
|
const anthropic = (0, import_anthropic.createAnthropic)({
|
|
2288
3417
|
baseURL: `${baseUrl}/proxy/anthropic`,
|
|
@@ -2296,13 +3425,17 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
2296
3425
|
apiKey: "proxy-auth",
|
|
2297
3426
|
headers
|
|
2298
3427
|
});
|
|
2299
|
-
if (
|
|
3428
|
+
if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3429
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3430
|
+
)) {
|
|
2300
3431
|
return openai.responses(modelId);
|
|
2301
3432
|
}
|
|
2302
3433
|
return openai.chat(modelId);
|
|
2303
3434
|
}
|
|
2304
3435
|
/**
 * Reports whether `modelId` equals, or starts with, one of the ids listed in
 * AVAILABLE_CLAUDE_MODEL_IDS.
 *
 * @param {string} modelId - Model identifier to classify.
 * @returns {boolean} true when a listed Claude id matches exactly or as a prefix.
 */
function isClaudeModelId(modelId) {
  for (const knownId of import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS) {
    if (modelId === knownId || modelId.startsWith(knownId)) {
      return true;
    }
  }
  return false;
}
|
|
2307
3440
|
function extractSkillContent(files) {
|
|
2308
3441
|
if (!files || files.length === 0) return void 0;
|
|
@@ -2336,7 +3469,9 @@ async function executeWithAiSdk(context) {
|
|
|
2336
3469
|
}
|
|
2337
3470
|
try {
|
|
2338
3471
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
2339
|
-
const isResponsesAPI =
|
|
3472
|
+
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3473
|
+
(id) => modelConfig.model === id || modelConfig.model.startsWith(id)
|
|
3474
|
+
);
|
|
2340
3475
|
const supportsThinking = isAnthropic || isResponsesAPI;
|
|
2341
3476
|
const providerOpts = {
|
|
2342
3477
|
...isAnthropic && {
|
|
@@ -2370,7 +3505,7 @@ async function executeWithAiSdk(context) {
|
|
|
2370
3505
|
outputTokens: result.usage.outputTokens ?? 0,
|
|
2371
3506
|
totalTokens: result.usage.totalTokens ?? 0
|
|
2372
3507
|
};
|
|
2373
|
-
const llmTrace =
|
|
3508
|
+
const llmTrace = buildLLMTrace2(
|
|
2374
3509
|
result.steps,
|
|
2375
3510
|
durationMs,
|
|
2376
3511
|
usage,
|
|
@@ -2382,7 +3517,7 @@ async function executeWithAiSdk(context) {
|
|
|
2382
3517
|
emitStepEvents(traceContext, result.steps, startTime);
|
|
2383
3518
|
emitCompletionEvent(traceContext, result.steps.length + 1);
|
|
2384
3519
|
}
|
|
2385
|
-
const conversation =
|
|
3520
|
+
const conversation = buildConversation3(
|
|
2386
3521
|
scenario.triggerPrompt,
|
|
2387
3522
|
result.steps,
|
|
2388
3523
|
startTime
|
|
@@ -2426,7 +3561,7 @@ function findToolResultError(step) {
|
|
|
2426
3561
|
}
|
|
2427
3562
|
return null;
|
|
2428
3563
|
}
|
|
2429
|
-
function
|
|
3564
|
+
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
|
|
2430
3565
|
const totalStepTokens = steps.reduce(
|
|
2431
3566
|
(sum, s) => sum + (s.usage.totalTokens ?? 0),
|
|
2432
3567
|
0
|
|
@@ -2444,9 +3579,10 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2444
3579
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
2445
3580
|
const toolResultError = findToolResultError(step);
|
|
2446
3581
|
return {
|
|
2447
|
-
id: (0,
|
|
3582
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2448
3583
|
stepNumber: i + 1,
|
|
2449
|
-
|
|
3584
|
+
turnIndex: i,
|
|
3585
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
2450
3586
|
model: modelId,
|
|
2451
3587
|
provider,
|
|
2452
3588
|
startedAt: new Date(
|
|
@@ -2469,10 +3605,11 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2469
3605
|
total: totalUsage.totalTokens
|
|
2470
3606
|
};
|
|
2471
3607
|
return {
|
|
2472
|
-
id: (0,
|
|
3608
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2473
3609
|
steps: traceSteps,
|
|
2474
3610
|
summary: {
|
|
2475
3611
|
totalSteps: traceSteps.length,
|
|
3612
|
+
totalTurns: traceSteps.length,
|
|
2476
3613
|
totalDurationMs,
|
|
2477
3614
|
totalTokens: finalTokens,
|
|
2478
3615
|
totalCostUsd,
|
|
@@ -2497,7 +3634,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
2497
3634
|
targetId: traceContext.targetId,
|
|
2498
3635
|
targetName: traceContext.targetName,
|
|
2499
3636
|
stepNumber: 0,
|
|
2500
|
-
type:
|
|
3637
|
+
type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
|
|
2501
3638
|
outputPreview: "Starting Simple Agent execution...",
|
|
2502
3639
|
elapsedMs: Date.now() - startTime,
|
|
2503
3640
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -2521,7 +3658,7 @@ function emitStepEvents(traceContext, steps, startTime) {
|
|
|
2521
3658
|
targetId: traceContext.targetId,
|
|
2522
3659
|
targetName: traceContext.targetName,
|
|
2523
3660
|
stepNumber: i + 1,
|
|
2524
|
-
type: isToolStep ?
|
|
3661
|
+
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
2525
3662
|
toolName: firstToolCall?.toolName,
|
|
2526
3663
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
2527
3664
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -2544,7 +3681,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
2544
3681
|
targetId: traceContext.targetId,
|
|
2545
3682
|
targetName: traceContext.targetName,
|
|
2546
3683
|
stepNumber,
|
|
2547
|
-
type:
|
|
3684
|
+
type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
2548
3685
|
outputPreview: "Scenario execution completed",
|
|
2549
3686
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2550
3687
|
isComplete: true
|
|
@@ -2571,7 +3708,7 @@ defaultRegistry.register(simpleAgentAdapter);
|
|
|
2571
3708
|
|
|
2572
3709
|
// src/run-scenario/file-diff.ts
|
|
2573
3710
|
var import_fs2 = require("fs");
|
|
2574
|
-
var
|
|
3711
|
+
var import_path10 = require("path");
|
|
2575
3712
|
|
|
2576
3713
|
// ../../node_modules/diff/lib/index.mjs
|
|
2577
3714
|
function Diff() {
|
|
@@ -2747,7 +3884,7 @@ Diff.prototype = {
|
|
|
2747
3884
|
tokenize: function tokenize(value) {
|
|
2748
3885
|
return Array.from(value);
|
|
2749
3886
|
},
|
|
2750
|
-
join: function
|
|
3887
|
+
join: function join8(chars) {
|
|
2751
3888
|
return chars.join("");
|
|
2752
3889
|
},
|
|
2753
3890
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -3187,8 +4324,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
3187
4324
|
}
|
|
3188
4325
|
const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
|
|
3189
4326
|
for (const entry of entries) {
|
|
3190
|
-
const fullPath = (0,
|
|
3191
|
-
const relativePath = (0,
|
|
4327
|
+
const fullPath = (0, import_path10.join)(dir, entry.name);
|
|
4328
|
+
const relativePath = (0, import_path10.relative)(base, fullPath);
|
|
3192
4329
|
if (shouldIgnore(entry.name)) {
|
|
3193
4330
|
continue;
|
|
3194
4331
|
}
|
|
@@ -3296,17 +4433,11 @@ function extractTemplateFiles(before, after) {
|
|
|
3296
4433
|
}
|
|
3297
4434
|
|
|
3298
4435
|
// src/run-scenario/run-agent-with-context.ts
|
|
3299
|
-
var
|
|
3300
|
-
var DEFAULT_AGENT_COMMAND =
|
|
4436
|
+
var import_evalforge_types12 = require("@wix/evalforge-types");
|
|
4437
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
|
|
3301
4438
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
3302
|
-
const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
|
|
3303
|
-
if (!hasEntities) {
|
|
3304
|
-
throw new Error(
|
|
3305
|
-
`Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
3306
|
-
);
|
|
3307
|
-
}
|
|
3308
4439
|
const agent = evalData.agent ?? void 0;
|
|
3309
|
-
const isSDK = agent?.agentType ===
|
|
4440
|
+
const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
|
|
3310
4441
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
3311
4442
|
const adapter = getAdapter(identifier);
|
|
3312
4443
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -3341,7 +4472,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
3341
4472
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
3342
4473
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
3343
4474
|
return {
|
|
3344
|
-
id: (0,
|
|
4475
|
+
id: (0, import_crypto4.randomUUID)(),
|
|
3345
4476
|
targetId,
|
|
3346
4477
|
targetName,
|
|
3347
4478
|
scenarioId: scenario.id,
|
|
@@ -3392,7 +4523,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3392
4523
|
})),
|
|
3393
4524
|
durationMs: partialResult.duration
|
|
3394
4525
|
};
|
|
3395
|
-
const defaultJudgeModel =
|
|
4526
|
+
const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
|
|
3396
4527
|
const assertionContext = {
|
|
3397
4528
|
workDir,
|
|
3398
4529
|
defaultJudgeModel,
|
|
@@ -3407,10 +4538,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3407
4538
|
assertionContext
|
|
3408
4539
|
) : [];
|
|
3409
4540
|
const passed = assertionResults.filter(
|
|
3410
|
-
(r) => r.status ===
|
|
4541
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
|
|
3411
4542
|
).length;
|
|
3412
4543
|
const failed = assertionResults.filter(
|
|
3413
|
-
(r) => r.status ===
|
|
4544
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
|
|
3414
4545
|
).length;
|
|
3415
4546
|
const total = assertionResults.length;
|
|
3416
4547
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -3424,7 +4555,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3424
4555
|
}
|
|
3425
4556
|
|
|
3426
4557
|
// src/error-reporter.ts
|
|
3427
|
-
var
|
|
4558
|
+
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
3428
4559
|
function formatError(error, phase, context) {
|
|
3429
4560
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
3430
4561
|
if (error instanceof Error) {
|
|
@@ -3598,13 +4729,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
3598
4729
|
presetId: evalData.evalRun.presetId,
|
|
3599
4730
|
skillIds: evalData.evalRun.skillIds
|
|
3600
4731
|
};
|
|
3601
|
-
|
|
3602
|
-
if (scenarioItems.length > 0 && !hasEntities) {
|
|
3603
|
-
throw new Error(
|
|
3604
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
3605
|
-
);
|
|
3606
|
-
}
|
|
3607
|
-
if (scenarioItems.length > 0 && hasEntities && !agent) {
|
|
4732
|
+
if (scenarioItems.length > 0 && !agent) {
|
|
3608
4733
|
throw new Error(
|
|
3609
4734
|
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
3610
4735
|
);
|
|
@@ -3675,7 +4800,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
3675
4800
|
};
|
|
3676
4801
|
try {
|
|
3677
4802
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
3678
|
-
status:
|
|
4803
|
+
status: import_evalforge_types15.EvalStatus.COMPLETED,
|
|
3679
4804
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
3680
4805
|
});
|
|
3681
4806
|
} catch (updateErr) {
|
|
@@ -3716,7 +4841,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
3716
4841
|
authToken: config.authToken
|
|
3717
4842
|
});
|
|
3718
4843
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
3719
|
-
status:
|
|
4844
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
3720
4845
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3721
4846
|
jobError,
|
|
3722
4847
|
jobStatus: "FAILED"
|
|
@@ -3739,7 +4864,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
3739
4864
|
authToken
|
|
3740
4865
|
});
|
|
3741
4866
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
3742
|
-
status:
|
|
4867
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
3743
4868
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3744
4869
|
jobError: `Config load failed, then: ${jobError}`,
|
|
3745
4870
|
jobStatus: "FAILED"
|