@wix/evalforge-evaluator 0.111.0 → 0.112.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +184 -102
- package/build/index.js.map +3 -3
- package/build/index.mjs +167 -85
- package/build/index.mjs.map +2 -2
- package/package.json +11 -11
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -509,7 +509,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
509
509
|
}
|
|
510
510
|
|
|
511
511
|
// src/run-scenario/index.ts
|
|
512
|
-
var
|
|
512
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
513
513
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
514
514
|
|
|
515
515
|
// src/run-scenario/environment.ts
|
|
@@ -1261,7 +1261,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1261
1261
|
"Edit",
|
|
1262
1262
|
"Bash",
|
|
1263
1263
|
"Glob",
|
|
1264
|
-
"Grep"
|
|
1264
|
+
"Grep",
|
|
1265
|
+
"Agent",
|
|
1266
|
+
"WebFetch",
|
|
1267
|
+
"WebSearch"
|
|
1265
1268
|
];
|
|
1266
1269
|
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
1267
1270
|
const queryOptions = {
|
|
@@ -1896,13 +1899,15 @@ function extractTotalUsage(result) {
|
|
|
1896
1899
|
}
|
|
1897
1900
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
1898
1901
|
const totalCost = usage.costUsd ?? 0;
|
|
1899
|
-
const
|
|
1900
|
-
|
|
1902
|
+
const effectiveInput = (s) => s.usage.inputTokens + (s.usage.cacheReadTokens ?? 0) + (s.usage.cacheWriteTokens ?? 0);
|
|
1903
|
+
const totalStepEffectiveInput = steps.reduce(
|
|
1904
|
+
(sum, s) => sum + effectiveInput(s),
|
|
1901
1905
|
0
|
|
1902
1906
|
);
|
|
1903
1907
|
const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
|
|
1904
|
-
const
|
|
1905
|
-
const
|
|
1908
|
+
const authoritativeEffectiveInput = usage.inputTokens + (usage.cacheReadTokens ?? 0) + (usage.cacheWriteTokens ?? 0);
|
|
1909
|
+
const inputTokensDuplicated = authoritativeEffectiveInput > 0 && totalStepEffectiveInput > authoritativeEffectiveInput * 1.2;
|
|
1910
|
+
const traceSteps = steps.flatMap((step, turnIndex) => {
|
|
1906
1911
|
let stepPromptTokens;
|
|
1907
1912
|
let stepOutputTokens;
|
|
1908
1913
|
let proportion;
|
|
@@ -1911,34 +1916,128 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1911
1916
|
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
1912
1917
|
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
1913
1918
|
} else {
|
|
1914
|
-
|
|
1915
|
-
|
|
1919
|
+
const stepEffective = effectiveInput(step);
|
|
1920
|
+
proportion = totalStepEffectiveInput > 0 ? stepEffective / totalStepEffectiveInput : 0;
|
|
1921
|
+
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
1916
1922
|
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
1917
1923
|
}
|
|
1918
|
-
const stepTotalTokens = stepPromptTokens + stepOutputTokens;
|
|
1919
1924
|
const costProportion = proportion;
|
|
1920
|
-
const
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1925
|
+
const toolCallCount = step.toolCalls?.length ?? 0;
|
|
1926
|
+
const isSuccess = step.finishReason !== "error" && !step.hasToolError;
|
|
1927
|
+
const errorMsg = step.hasToolError ? step.toolErrorContent ?? "Tool call failed" : step.finishReason === "error" ? "Generation failed" : void 0;
|
|
1928
|
+
const subSteps = [];
|
|
1929
|
+
const stepCost = totalCost * costProportion;
|
|
1930
|
+
const hasThinking = !!step.thinking;
|
|
1931
|
+
const hasText = !!step.text;
|
|
1932
|
+
const thinkingSubSteps = hasThinking ? 1 : 0;
|
|
1933
|
+
const toolSubSteps = toolCallCount > 0 ? toolCallCount : 0;
|
|
1934
|
+
const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
|
|
1935
|
+
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
1936
|
+
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
1937
|
+
subSteps.push({
|
|
1938
|
+
id: (0, import_crypto.randomUUID)(),
|
|
1939
|
+
stepNumber: 0,
|
|
1940
|
+
// renumbered below
|
|
1941
|
+
turnIndex,
|
|
1942
|
+
type: import_evalforge_types4.LLMStepType.THINKING,
|
|
1943
|
+
model,
|
|
1944
|
+
provider: "anthropic",
|
|
1945
|
+
startedAt: step.startedAt.toISOString(),
|
|
1946
|
+
durationMs: Math.round(step.durationMs / totalSubSteps),
|
|
1947
|
+
tokenUsage: {
|
|
1948
|
+
prompt: Math.round(stepPromptTokens / totalSubSteps),
|
|
1949
|
+
completion: Math.round(stepOutputTokens / totalSubSteps),
|
|
1950
|
+
total: Math.round(
|
|
1951
|
+
(stepPromptTokens + stepOutputTokens) / totalSubSteps
|
|
1952
|
+
)
|
|
1953
|
+
},
|
|
1954
|
+
costUsd: stepCost / totalSubSteps,
|
|
1955
|
+
outputPreview: step.thinking?.slice(0, 200),
|
|
1956
|
+
success: isSuccess,
|
|
1957
|
+
error: errorMsg
|
|
1958
|
+
});
|
|
1959
|
+
}
|
|
1960
|
+
if (toolCallCount > 0) {
|
|
1961
|
+
for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
|
|
1962
|
+
const tc = step.toolCalls[tcIdx];
|
|
1963
|
+
const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
|
|
1964
|
+
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
1965
|
+
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
1966
|
+
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
1967
|
+
subSteps.push({
|
|
1968
|
+
id: (0, import_crypto.randomUUID)(),
|
|
1969
|
+
stepNumber: 0,
|
|
1970
|
+
turnIndex,
|
|
1971
|
+
type: import_evalforge_types4.LLMStepType.TOOL_USE,
|
|
1972
|
+
model,
|
|
1973
|
+
provider: "anthropic",
|
|
1974
|
+
startedAt: step.startedAt.toISOString(),
|
|
1975
|
+
durationMs: isLast ? step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(step.durationMs * remainingFraction * toolFraction),
|
|
1976
|
+
tokenUsage: {
|
|
1977
|
+
prompt: Math.round(
|
|
1978
|
+
stepPromptTokens * remainingFraction * toolFraction
|
|
1979
|
+
),
|
|
1980
|
+
completion: Math.round(
|
|
1981
|
+
stepOutputTokens * remainingFraction * toolFraction
|
|
1982
|
+
),
|
|
1983
|
+
total: Math.round(
|
|
1984
|
+
(stepPromptTokens + stepOutputTokens) * remainingFraction * toolFraction
|
|
1985
|
+
)
|
|
1986
|
+
},
|
|
1987
|
+
costUsd: stepCost * remainingFraction * toolFraction,
|
|
1988
|
+
toolName: tc.toolName,
|
|
1989
|
+
toolArguments: JSON.stringify(tc.args),
|
|
1990
|
+
outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
|
|
1991
|
+
success: isSuccess,
|
|
1992
|
+
error: errorMsg
|
|
1993
|
+
});
|
|
1994
|
+
}
|
|
1995
|
+
}
|
|
1996
|
+
if (hasText && toolCallCount > 0) {
|
|
1997
|
+
subSteps.push({
|
|
1998
|
+
id: (0, import_crypto.randomUUID)(),
|
|
1999
|
+
stepNumber: 0,
|
|
2000
|
+
turnIndex,
|
|
2001
|
+
type: import_evalforge_types4.LLMStepType.COMPLETION,
|
|
2002
|
+
model,
|
|
2003
|
+
provider: "anthropic",
|
|
2004
|
+
startedAt: step.startedAt.toISOString(),
|
|
2005
|
+
durationMs: step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
|
|
2006
|
+
tokenUsage: {
|
|
2007
|
+
prompt: stepPromptTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
|
|
2008
|
+
completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
|
|
2009
|
+
total: stepPromptTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
|
|
2010
|
+
},
|
|
2011
|
+
costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
|
|
2012
|
+
outputPreview: step.text?.slice(0, 200),
|
|
2013
|
+
success: isSuccess,
|
|
2014
|
+
error: errorMsg
|
|
2015
|
+
});
|
|
2016
|
+
}
|
|
2017
|
+
if (subSteps.length === 0) {
|
|
2018
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
2019
|
+
subSteps.push({
|
|
2020
|
+
id: (0, import_crypto.randomUUID)(),
|
|
2021
|
+
stepNumber: 0,
|
|
2022
|
+
turnIndex,
|
|
2023
|
+
type: stepType,
|
|
2024
|
+
model,
|
|
2025
|
+
provider: "anthropic",
|
|
2026
|
+
startedAt: step.startedAt.toISOString(),
|
|
2027
|
+
durationMs: step.durationMs,
|
|
2028
|
+
tokenUsage: {
|
|
2029
|
+
prompt: stepPromptTokens,
|
|
2030
|
+
completion: stepOutputTokens,
|
|
2031
|
+
total: stepPromptTokens + stepOutputTokens
|
|
2032
|
+
},
|
|
2033
|
+
costUsd: stepCost,
|
|
2034
|
+
outputPreview: (step.text || step.thinking)?.slice(0, 200),
|
|
2035
|
+
success: isSuccess,
|
|
2036
|
+
error: errorMsg
|
|
2037
|
+
});
|
|
2038
|
+
}
|
|
2039
|
+
return subSteps;
|
|
2040
|
+
}).map((s, i) => ({ ...s, stepNumber: i + 1 }));
|
|
1942
2041
|
const finalTokens = {
|
|
1943
2042
|
prompt: usage.inputTokens,
|
|
1944
2043
|
completion: usage.outputTokens,
|
|
@@ -1960,6 +2059,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1960
2059
|
}
|
|
1961
2060
|
const summary = {
|
|
1962
2061
|
totalSteps: traceSteps.length,
|
|
2062
|
+
totalTurns: steps.length,
|
|
1963
2063
|
totalDurationMs,
|
|
1964
2064
|
totalTokens: finalTokens,
|
|
1965
2065
|
totalCostUsd: totalCost,
|
|
@@ -2049,7 +2149,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
2049
2149
|
var import_ai = require("ai");
|
|
2050
2150
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
2051
2151
|
var import_openai = require("@ai-sdk/openai");
|
|
2052
|
-
var
|
|
2152
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2053
2153
|
var import_crypto2 = require("crypto");
|
|
2054
2154
|
|
|
2055
2155
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
@@ -2145,48 +2245,35 @@ function extractErrorText(content) {
|
|
|
2145
2245
|
}
|
|
2146
2246
|
|
|
2147
2247
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
2248
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
2148
2249
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
2149
2250
|
var MODEL_PRICING = {
|
|
2251
|
+
// Anthropic — Claude 4.6
|
|
2252
|
+
"claude-sonnet-4-6": { input: 3, output: 15 },
|
|
2253
|
+
"claude-opus-4-6": { input: 15, output: 75 },
|
|
2150
2254
|
// Anthropic — Claude 4.5
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
// Anthropic — Claude 4
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
CLAUDE_4_SONNET_1_0: { input: 3, output: 15 },
|
|
2158
|
-
// Anthropic — Claude 3.x
|
|
2159
|
-
CLAUDE_3_5_SONNET_2_0: { input: 3, output: 15 },
|
|
2160
|
-
CLAUDE_3_5_SONNET_1_0: { input: 3, output: 15 },
|
|
2161
|
-
CLAUDE_3_HAIKU_1_0: { input: 0.25, output: 1.25 },
|
|
2255
|
+
"claude-opus-4-5": { input: 5, output: 25 },
|
|
2256
|
+
"claude-sonnet-4-5": { input: 3, output: 15 },
|
|
2257
|
+
"claude-haiku-4-5": { input: 1, output: 5 },
|
|
2258
|
+
// Anthropic — Claude 4
|
|
2259
|
+
"claude-opus-4": { input: 15, output: 75 },
|
|
2260
|
+
"claude-sonnet-4": { input: 3, output: 15 },
|
|
2162
2261
|
// OpenAI — GPT-5
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
GPT_5_NANO_2025_08_07: { input: 0.05, output: 0.4 },
|
|
2262
|
+
"gpt-5": { input: 1.25, output: 10 },
|
|
2263
|
+
"gpt-5-mini": { input: 0.25, output: 2 },
|
|
2264
|
+
"gpt-5-nano": { input: 0.05, output: 0.4 },
|
|
2167
2265
|
// OpenAI — GPT-4.1
|
|
2168
|
-
|
|
2169
|
-
|
|
2170
|
-
|
|
2266
|
+
"gpt-4.1": { input: 2, output: 8 },
|
|
2267
|
+
"gpt-4.1-mini": { input: 0.4, output: 1.6 },
|
|
2268
|
+
"gpt-4.1-nano": { input: 0.1, output: 0.4 },
|
|
2171
2269
|
// OpenAI — GPT-4o
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
GPT_4O_2024_11_20: { input: 2.5, output: 10 },
|
|
2175
|
-
GPT_4O_MINI_2024_07_18: { input: 0.15, output: 0.6 },
|
|
2270
|
+
"gpt-4o": { input: 2.5, output: 10 },
|
|
2271
|
+
"gpt-4o-mini": { input: 0.15, output: 0.6 },
|
|
2176
2272
|
// OpenAI — Reasoning
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
O1_MINI: { input: 1.1, output: 4.4 },
|
|
2182
|
-
O1_MINI_2024_09_12: { input: 1.1, output: 4.4 },
|
|
2183
|
-
O1_PREVIEW: { input: 15, output: 60 },
|
|
2184
|
-
O1_PREVIEW_2024_09_12: { input: 15, output: 60 },
|
|
2185
|
-
// OpenAI — Legacy
|
|
2186
|
-
GPT_4_TURBO_2024_04_09: { input: 10, output: 30 },
|
|
2187
|
-
GPT_4_1106_PREVIEW: { input: 10, output: 30 },
|
|
2188
|
-
GPT_3_5_TURBO: { input: 0.5, output: 1.5 },
|
|
2189
|
-
GPT_3_5_TURBO_0125: { input: 0.5, output: 1.5 }
|
|
2273
|
+
o3: { input: 2, output: 8 },
|
|
2274
|
+
"o4-mini": { input: 1.1, output: 4.4 },
|
|
2275
|
+
"o3-mini": { input: 1.1, output: 4.4 },
|
|
2276
|
+
o1: { input: 15, output: 60 }
|
|
2190
2277
|
};
|
|
2191
2278
|
function extractGatewayCost(step, provider) {
|
|
2192
2279
|
try {
|
|
@@ -2205,7 +2292,8 @@ function extractGatewayCost(step, provider) {
|
|
|
2205
2292
|
}
|
|
2206
2293
|
}
|
|
2207
2294
|
function calculateFromPricing(modelId, tokenUsage) {
|
|
2208
|
-
const
|
|
2295
|
+
const normalized = (0, import_evalforge_types6.normalizeModelId)(modelId);
|
|
2296
|
+
const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
|
|
2209
2297
|
if (!pricing) return 0;
|
|
2210
2298
|
return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
|
|
2211
2299
|
}
|
|
@@ -2280,9 +2368,7 @@ var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
|
2280
2368
|
var PROVIDER_OPENAI = "openai";
|
|
2281
2369
|
var DEFAULT_MAX_TOOL_STEPS = 25;
|
|
2282
2370
|
function createModel(modelId, baseUrl, headers) {
|
|
2283
|
-
const isClaudeModel =
|
|
2284
|
-
modelId
|
|
2285
|
-
);
|
|
2371
|
+
const isClaudeModel = isClaudeModelId(modelId);
|
|
2286
2372
|
if (isClaudeModel) {
|
|
2287
2373
|
const anthropic = (0, import_anthropic.createAnthropic)({
|
|
2288
2374
|
baseURL: `${baseUrl}/proxy/anthropic`,
|
|
@@ -2296,13 +2382,17 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
2296
2382
|
apiKey: "proxy-auth",
|
|
2297
2383
|
headers
|
|
2298
2384
|
});
|
|
2299
|
-
if (
|
|
2385
|
+
if ([...import_evalforge_types7.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
2386
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
2387
|
+
)) {
|
|
2300
2388
|
return openai.responses(modelId);
|
|
2301
2389
|
}
|
|
2302
2390
|
return openai.chat(modelId);
|
|
2303
2391
|
}
|
|
2304
2392
|
function isClaudeModelId(modelId) {
|
|
2305
|
-
return
|
|
2393
|
+
return import_evalforge_types7.AVAILABLE_CLAUDE_MODEL_IDS.some(
|
|
2394
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
2395
|
+
);
|
|
2306
2396
|
}
|
|
2307
2397
|
function extractSkillContent(files) {
|
|
2308
2398
|
if (!files || files.length === 0) return void 0;
|
|
@@ -2336,7 +2426,9 @@ async function executeWithAiSdk(context) {
|
|
|
2336
2426
|
}
|
|
2337
2427
|
try {
|
|
2338
2428
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
2339
|
-
const isResponsesAPI =
|
|
2429
|
+
const isResponsesAPI = [...import_evalforge_types7.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
2430
|
+
(id) => modelConfig.model === id || modelConfig.model.startsWith(id)
|
|
2431
|
+
);
|
|
2340
2432
|
const supportsThinking = isAnthropic || isResponsesAPI;
|
|
2341
2433
|
const providerOpts = {
|
|
2342
2434
|
...isAnthropic && {
|
|
@@ -2446,7 +2538,8 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2446
2538
|
return {
|
|
2447
2539
|
id: (0, import_crypto2.randomUUID)(),
|
|
2448
2540
|
stepNumber: i + 1,
|
|
2449
|
-
|
|
2541
|
+
turnIndex: i,
|
|
2542
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types7.LLMStepType.TOOL_USE : import_evalforge_types7.LLMStepType.COMPLETION,
|
|
2450
2543
|
model: modelId,
|
|
2451
2544
|
provider,
|
|
2452
2545
|
startedAt: new Date(
|
|
@@ -2473,6 +2566,7 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
|
|
|
2473
2566
|
steps: traceSteps,
|
|
2474
2567
|
summary: {
|
|
2475
2568
|
totalSteps: traceSteps.length,
|
|
2569
|
+
totalTurns: traceSteps.length,
|
|
2476
2570
|
totalDurationMs,
|
|
2477
2571
|
totalTokens: finalTokens,
|
|
2478
2572
|
totalCostUsd,
|
|
@@ -2497,7 +2591,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
2497
2591
|
targetId: traceContext.targetId,
|
|
2498
2592
|
targetName: traceContext.targetName,
|
|
2499
2593
|
stepNumber: 0,
|
|
2500
|
-
type:
|
|
2594
|
+
type: import_evalforge_types7.LiveTraceEventType.PROGRESS,
|
|
2501
2595
|
outputPreview: "Starting Simple Agent execution...",
|
|
2502
2596
|
elapsedMs: Date.now() - startTime,
|
|
2503
2597
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -2521,7 +2615,7 @@ function emitStepEvents(traceContext, steps, startTime) {
|
|
|
2521
2615
|
targetId: traceContext.targetId,
|
|
2522
2616
|
targetName: traceContext.targetName,
|
|
2523
2617
|
stepNumber: i + 1,
|
|
2524
|
-
type: isToolStep ?
|
|
2618
|
+
type: isToolStep ? import_evalforge_types7.LiveTraceEventType.TOOL_USE : import_evalforge_types7.LiveTraceEventType.COMPLETION,
|
|
2525
2619
|
toolName: firstToolCall?.toolName,
|
|
2526
2620
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
2527
2621
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -2544,7 +2638,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
2544
2638
|
targetId: traceContext.targetId,
|
|
2545
2639
|
targetName: traceContext.targetName,
|
|
2546
2640
|
stepNumber,
|
|
2547
|
-
type:
|
|
2641
|
+
type: import_evalforge_types7.LiveTraceEventType.COMPLETION,
|
|
2548
2642
|
outputPreview: "Scenario execution completed",
|
|
2549
2643
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2550
2644
|
isComplete: true
|
|
@@ -3296,17 +3390,11 @@ function extractTemplateFiles(before, after) {
|
|
|
3296
3390
|
}
|
|
3297
3391
|
|
|
3298
3392
|
// src/run-scenario/run-agent-with-context.ts
|
|
3299
|
-
var
|
|
3300
|
-
var DEFAULT_AGENT_COMMAND =
|
|
3393
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
3394
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types8.AgentRunCommand.CLAUDE;
|
|
3301
3395
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
3302
|
-
const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
|
|
3303
|
-
if (!hasEntities) {
|
|
3304
|
-
throw new Error(
|
|
3305
|
-
`Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
3306
|
-
);
|
|
3307
|
-
}
|
|
3308
3396
|
const agent = evalData.agent ?? void 0;
|
|
3309
|
-
const isSDK = agent?.agentType ===
|
|
3397
|
+
const isSDK = agent?.agentType === import_evalforge_types8.AgentType.SDK;
|
|
3310
3398
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
3311
3399
|
const adapter = getAdapter(identifier);
|
|
3312
3400
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -3392,7 +3480,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3392
3480
|
})),
|
|
3393
3481
|
durationMs: partialResult.duration
|
|
3394
3482
|
};
|
|
3395
|
-
const defaultJudgeModel =
|
|
3483
|
+
const defaultJudgeModel = import_evalforge_types9.DEFAULT_JUDGE_MODEL;
|
|
3396
3484
|
const assertionContext = {
|
|
3397
3485
|
workDir,
|
|
3398
3486
|
defaultJudgeModel,
|
|
@@ -3407,10 +3495,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3407
3495
|
assertionContext
|
|
3408
3496
|
) : [];
|
|
3409
3497
|
const passed = assertionResults.filter(
|
|
3410
|
-
(r) => r.status ===
|
|
3498
|
+
(r) => r.status === import_evalforge_types9.AssertionResultStatus.PASSED
|
|
3411
3499
|
).length;
|
|
3412
3500
|
const failed = assertionResults.filter(
|
|
3413
|
-
(r) => r.status ===
|
|
3501
|
+
(r) => r.status === import_evalforge_types9.AssertionResultStatus.FAILED
|
|
3414
3502
|
).length;
|
|
3415
3503
|
const total = assertionResults.length;
|
|
3416
3504
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -3424,7 +3512,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
3424
3512
|
}
|
|
3425
3513
|
|
|
3426
3514
|
// src/error-reporter.ts
|
|
3427
|
-
var
|
|
3515
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
3428
3516
|
function formatError(error, phase, context) {
|
|
3429
3517
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
3430
3518
|
if (error instanceof Error) {
|
|
@@ -3598,13 +3686,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
3598
3686
|
presetId: evalData.evalRun.presetId,
|
|
3599
3687
|
skillIds: evalData.evalRun.skillIds
|
|
3600
3688
|
};
|
|
3601
|
-
|
|
3602
|
-
if (scenarioItems.length > 0 && !hasEntities) {
|
|
3603
|
-
throw new Error(
|
|
3604
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
3605
|
-
);
|
|
3606
|
-
}
|
|
3607
|
-
if (scenarioItems.length > 0 && hasEntities && !agent) {
|
|
3689
|
+
if (scenarioItems.length > 0 && !agent) {
|
|
3608
3690
|
throw new Error(
|
|
3609
3691
|
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
3610
3692
|
);
|
|
@@ -3675,7 +3757,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
3675
3757
|
};
|
|
3676
3758
|
try {
|
|
3677
3759
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
3678
|
-
status:
|
|
3760
|
+
status: import_evalforge_types11.EvalStatus.COMPLETED,
|
|
3679
3761
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
3680
3762
|
});
|
|
3681
3763
|
} catch (updateErr) {
|
|
@@ -3716,7 +3798,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
3716
3798
|
authToken: config.authToken
|
|
3717
3799
|
});
|
|
3718
3800
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
3719
|
-
status:
|
|
3801
|
+
status: import_evalforge_types11.EvalStatus.FAILED,
|
|
3720
3802
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3721
3803
|
jobError,
|
|
3722
3804
|
jobStatus: "FAILED"
|
|
@@ -3739,7 +3821,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
3739
3821
|
authToken
|
|
3740
3822
|
});
|
|
3741
3823
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
3742
|
-
status:
|
|
3824
|
+
status: import_evalforge_types11.EvalStatus.FAILED,
|
|
3743
3825
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3744
3826
|
jobError: `Config load failed, then: ${jobError}`,
|
|
3745
3827
|
jobStatus: "FAILED"
|