@wix/evalforge-evaluator 0.111.0 → 0.112.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types10 = require("@wix/evalforge-types");
27
+ var import_evalforge_types11 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -509,7 +509,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
509
509
  }
510
510
 
511
511
  // src/run-scenario/index.ts
512
- var import_evalforge_types8 = require("@wix/evalforge-types");
512
+ var import_evalforge_types9 = require("@wix/evalforge-types");
513
513
  var import_eval_assertions = require("@wix/eval-assertions");
514
514
 
515
515
  // src/run-scenario/environment.ts
@@ -1261,7 +1261,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
1261
1261
  "Edit",
1262
1262
  "Bash",
1263
1263
  "Glob",
1264
- "Grep"
1264
+ "Grep",
1265
+ "Agent",
1266
+ "WebFetch",
1267
+ "WebSearch"
1265
1268
  ];
1266
1269
  const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
1267
1270
  const queryOptions = {
@@ -1896,13 +1899,15 @@ function extractTotalUsage(result) {
1896
1899
  }
1897
1900
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1898
1901
  const totalCost = usage.costUsd ?? 0;
1899
- const totalStepInputTokens = steps.reduce(
1900
- (sum, s) => sum + s.usage.inputTokens,
1902
+ const effectiveInput = (s) => s.usage.inputTokens + (s.usage.cacheReadTokens ?? 0) + (s.usage.cacheWriteTokens ?? 0);
1903
+ const totalStepEffectiveInput = steps.reduce(
1904
+ (sum, s) => sum + effectiveInput(s),
1901
1905
  0
1902
1906
  );
1903
1907
  const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
1904
- const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
1905
- const traceSteps = steps.map((step, index) => {
1908
+ const authoritativeEffectiveInput = usage.inputTokens + (usage.cacheReadTokens ?? 0) + (usage.cacheWriteTokens ?? 0);
1909
+ const inputTokensDuplicated = authoritativeEffectiveInput > 0 && totalStepEffectiveInput > authoritativeEffectiveInput * 1.2;
1910
+ const traceSteps = steps.flatMap((step, turnIndex) => {
1906
1911
  let stepPromptTokens;
1907
1912
  let stepOutputTokens;
1908
1913
  let proportion;
@@ -1911,34 +1916,128 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1911
1916
  stepPromptTokens = Math.round(usage.inputTokens * proportion);
1912
1917
  stepOutputTokens = Math.round(usage.outputTokens * proportion);
1913
1918
  } else {
1914
- proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
1915
- stepPromptTokens = step.usage.inputTokens;
1919
+ const stepEffective = effectiveInput(step);
1920
+ proportion = totalStepEffectiveInput > 0 ? stepEffective / totalStepEffectiveInput : 0;
1921
+ stepPromptTokens = Math.round(usage.inputTokens * proportion);
1916
1922
  stepOutputTokens = Math.round(usage.outputTokens * proportion);
1917
1923
  }
1918
- const stepTotalTokens = stepPromptTokens + stepOutputTokens;
1919
1924
  const costProportion = proportion;
1920
- const stepType = step.toolCalls?.length ? import_evalforge_types4.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
1921
- return {
1922
- id: (0, import_crypto.randomUUID)(),
1923
- stepNumber: index + 1,
1924
- type: stepType,
1925
- model,
1926
- provider: "anthropic",
1927
- startedAt: step.startedAt.toISOString(),
1928
- durationMs: step.durationMs,
1929
- tokenUsage: {
1930
- prompt: stepPromptTokens,
1931
- completion: stepOutputTokens,
1932
- total: stepTotalTokens
1933
- },
1934
- costUsd: totalCost * costProportion,
1935
- toolName: step.toolCalls?.[0]?.toolName,
1936
- toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
1937
- outputPreview: (step.text || step.thinking)?.slice(0, 200),
1938
- success: step.finishReason !== "error" && !step.hasToolError,
1939
- error: step.finishReason === "error" ? "Generation failed" : step.hasToolError ? step.toolErrorContent ?? "Tool call failed" : void 0
1940
- };
1941
- });
1925
+ const toolCallCount = step.toolCalls?.length ?? 0;
1926
+ const isSuccess = step.finishReason !== "error" && !step.hasToolError;
1927
+ const errorMsg = step.hasToolError ? step.toolErrorContent ?? "Tool call failed" : step.finishReason === "error" ? "Generation failed" : void 0;
1928
+ const subSteps = [];
1929
+ const stepCost = totalCost * costProportion;
1930
+ const hasThinking = !!step.thinking;
1931
+ const hasText = !!step.text;
1932
+ const thinkingSubSteps = hasThinking ? 1 : 0;
1933
+ const toolSubSteps = toolCallCount > 0 ? toolCallCount : 0;
1934
+ const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
1935
+ const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
1936
+ if (hasThinking && (hasText || toolCallCount > 0)) {
1937
+ subSteps.push({
1938
+ id: (0, import_crypto.randomUUID)(),
1939
+ stepNumber: 0,
1940
+ // renumbered below
1941
+ turnIndex,
1942
+ type: import_evalforge_types4.LLMStepType.THINKING,
1943
+ model,
1944
+ provider: "anthropic",
1945
+ startedAt: step.startedAt.toISOString(),
1946
+ durationMs: Math.round(step.durationMs / totalSubSteps),
1947
+ tokenUsage: {
1948
+ prompt: Math.round(stepPromptTokens / totalSubSteps),
1949
+ completion: Math.round(stepOutputTokens / totalSubSteps),
1950
+ total: Math.round(
1951
+ (stepPromptTokens + stepOutputTokens) / totalSubSteps
1952
+ )
1953
+ },
1954
+ costUsd: stepCost / totalSubSteps,
1955
+ outputPreview: step.thinking?.slice(0, 200),
1956
+ success: isSuccess,
1957
+ error: errorMsg
1958
+ });
1959
+ }
1960
+ if (toolCallCount > 0) {
1961
+ for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
1962
+ const tc = step.toolCalls[tcIdx];
1963
+ const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
1964
+ const toolBudgetSteps = toolSubSteps + textSubSteps;
1965
+ const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
1966
+ const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
1967
+ subSteps.push({
1968
+ id: (0, import_crypto.randomUUID)(),
1969
+ stepNumber: 0,
1970
+ turnIndex,
1971
+ type: import_evalforge_types4.LLMStepType.TOOL_USE,
1972
+ model,
1973
+ provider: "anthropic",
1974
+ startedAt: step.startedAt.toISOString(),
1975
+ durationMs: isLast ? step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(step.durationMs * remainingFraction * toolFraction),
1976
+ tokenUsage: {
1977
+ prompt: Math.round(
1978
+ stepPromptTokens * remainingFraction * toolFraction
1979
+ ),
1980
+ completion: Math.round(
1981
+ stepOutputTokens * remainingFraction * toolFraction
1982
+ ),
1983
+ total: Math.round(
1984
+ (stepPromptTokens + stepOutputTokens) * remainingFraction * toolFraction
1985
+ )
1986
+ },
1987
+ costUsd: stepCost * remainingFraction * toolFraction,
1988
+ toolName: tc.toolName,
1989
+ toolArguments: JSON.stringify(tc.args),
1990
+ outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
1991
+ success: isSuccess,
1992
+ error: errorMsg
1993
+ });
1994
+ }
1995
+ }
1996
+ if (hasText && toolCallCount > 0) {
1997
+ subSteps.push({
1998
+ id: (0, import_crypto.randomUUID)(),
1999
+ stepNumber: 0,
2000
+ turnIndex,
2001
+ type: import_evalforge_types4.LLMStepType.COMPLETION,
2002
+ model,
2003
+ provider: "anthropic",
2004
+ startedAt: step.startedAt.toISOString(),
2005
+ durationMs: step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
2006
+ tokenUsage: {
2007
+ prompt: stepPromptTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
2008
+ completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
2009
+ total: stepPromptTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
2010
+ },
2011
+ costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
2012
+ outputPreview: step.text?.slice(0, 200),
2013
+ success: isSuccess,
2014
+ error: errorMsg
2015
+ });
2016
+ }
2017
+ if (subSteps.length === 0) {
2018
+ const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
2019
+ subSteps.push({
2020
+ id: (0, import_crypto.randomUUID)(),
2021
+ stepNumber: 0,
2022
+ turnIndex,
2023
+ type: stepType,
2024
+ model,
2025
+ provider: "anthropic",
2026
+ startedAt: step.startedAt.toISOString(),
2027
+ durationMs: step.durationMs,
2028
+ tokenUsage: {
2029
+ prompt: stepPromptTokens,
2030
+ completion: stepOutputTokens,
2031
+ total: stepPromptTokens + stepOutputTokens
2032
+ },
2033
+ costUsd: stepCost,
2034
+ outputPreview: (step.text || step.thinking)?.slice(0, 200),
2035
+ success: isSuccess,
2036
+ error: errorMsg
2037
+ });
2038
+ }
2039
+ return subSteps;
2040
+ }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
1942
2041
  const finalTokens = {
1943
2042
  prompt: usage.inputTokens,
1944
2043
  completion: usage.outputTokens,
@@ -1960,6 +2059,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1960
2059
  }
1961
2060
  const summary = {
1962
2061
  totalSteps: traceSteps.length,
2062
+ totalTurns: steps.length,
1963
2063
  totalDurationMs,
1964
2064
  totalTokens: finalTokens,
1965
2065
  totalCostUsd: totalCost,
@@ -2049,7 +2149,7 @@ defaultRegistry.register(claudeCodeAdapter);
2049
2149
  var import_ai = require("ai");
2050
2150
  var import_anthropic = require("@ai-sdk/anthropic");
2051
2151
  var import_openai = require("@ai-sdk/openai");
2052
- var import_evalforge_types6 = require("@wix/evalforge-types");
2152
+ var import_evalforge_types7 = require("@wix/evalforge-types");
2053
2153
  var import_crypto2 = require("crypto");
2054
2154
 
2055
2155
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
@@ -2145,48 +2245,35 @@ function extractErrorText(content) {
2145
2245
  }
2146
2246
 
2147
2247
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
2248
+ var import_evalforge_types6 = require("@wix/evalforge-types");
2148
2249
  var PROVIDER_ANTHROPIC = "anthropic";
2149
2250
  var MODEL_PRICING = {
2251
+ // Anthropic — Claude 4.6
2252
+ "claude-sonnet-4-6": { input: 3, output: 15 },
2253
+ "claude-opus-4-6": { input: 15, output: 75 },
2150
2254
  // Anthropic — Claude 4.5
2151
- CLAUDE_4_5_OPUS_1_0: { input: 5, output: 25 },
2152
- CLAUDE_4_5_SONNET_1_0: { input: 3, output: 15 },
2153
- CLAUDE_4_5_HAIKU_1_0: { input: 1, output: 5 },
2154
- // Anthropic — Claude 4 / 4.1
2155
- CLAUDE_4_1_OPUS_1_0: { input: 15, output: 75 },
2156
- CLAUDE_4_OPUS_1_0: { input: 15, output: 75 },
2157
- CLAUDE_4_SONNET_1_0: { input: 3, output: 15 },
2158
- // Anthropic — Claude 3.x
2159
- CLAUDE_3_5_SONNET_2_0: { input: 3, output: 15 },
2160
- CLAUDE_3_5_SONNET_1_0: { input: 3, output: 15 },
2161
- CLAUDE_3_HAIKU_1_0: { input: 0.25, output: 1.25 },
2255
+ "claude-opus-4-5": { input: 5, output: 25 },
2256
+ "claude-sonnet-4-5": { input: 3, output: 15 },
2257
+ "claude-haiku-4-5": { input: 1, output: 5 },
2258
+ // Anthropic — Claude 4
2259
+ "claude-opus-4": { input: 15, output: 75 },
2260
+ "claude-sonnet-4": { input: 3, output: 15 },
2162
2261
  // OpenAI — GPT-5
2163
- GPT_5_2_2025_12_11: { input: 1.75, output: 14 },
2164
- GPT_5_2025_08_07: { input: 1.25, output: 10 },
2165
- GPT_5_MINI_2025_08_07: { input: 0.25, output: 2 },
2166
- GPT_5_NANO_2025_08_07: { input: 0.05, output: 0.4 },
2262
+ "gpt-5": { input: 1.25, output: 10 },
2263
+ "gpt-5-mini": { input: 0.25, output: 2 },
2264
+ "gpt-5-nano": { input: 0.05, output: 0.4 },
2167
2265
  // OpenAI — GPT-4.1
2168
- GPT_4_1_2025_04_14: { input: 2, output: 8 },
2169
- GPT_4_1_MINI_2025_04_14: { input: 0.4, output: 1.6 },
2170
- GPT_4_1_NANO_2025_04_14: { input: 0.1, output: 0.4 },
2266
+ "gpt-4.1": { input: 2, output: 8 },
2267
+ "gpt-4.1-mini": { input: 0.4, output: 1.6 },
2268
+ "gpt-4.1-nano": { input: 0.1, output: 0.4 },
2171
2269
  // OpenAI — GPT-4o
2172
- GPT_4O_2024_05_13: { input: 2.5, output: 10 },
2173
- GPT_4O_2024_08_06: { input: 2.5, output: 10 },
2174
- GPT_4O_2024_11_20: { input: 2.5, output: 10 },
2175
- GPT_4O_MINI_2024_07_18: { input: 0.15, output: 0.6 },
2270
+ "gpt-4o": { input: 2.5, output: 10 },
2271
+ "gpt-4o-mini": { input: 0.15, output: 0.6 },
2176
2272
  // OpenAI — Reasoning
2177
- O3_2025_04_16: { input: 2, output: 8 },
2178
- O4_MINI_2025_04_16: { input: 1.1, output: 4.4 },
2179
- O3_MINI_2025_01_31: { input: 1.1, output: 4.4 },
2180
- O1_2024_12_17: { input: 15, output: 60 },
2181
- O1_MINI: { input: 1.1, output: 4.4 },
2182
- O1_MINI_2024_09_12: { input: 1.1, output: 4.4 },
2183
- O1_PREVIEW: { input: 15, output: 60 },
2184
- O1_PREVIEW_2024_09_12: { input: 15, output: 60 },
2185
- // OpenAI — Legacy
2186
- GPT_4_TURBO_2024_04_09: { input: 10, output: 30 },
2187
- GPT_4_1106_PREVIEW: { input: 10, output: 30 },
2188
- GPT_3_5_TURBO: { input: 0.5, output: 1.5 },
2189
- GPT_3_5_TURBO_0125: { input: 0.5, output: 1.5 }
2273
+ o3: { input: 2, output: 8 },
2274
+ "o4-mini": { input: 1.1, output: 4.4 },
2275
+ "o3-mini": { input: 1.1, output: 4.4 },
2276
+ o1: { input: 15, output: 60 }
2190
2277
  };
2191
2278
  function extractGatewayCost(step, provider) {
2192
2279
  try {
@@ -2205,7 +2292,8 @@ function extractGatewayCost(step, provider) {
2205
2292
  }
2206
2293
  }
2207
2294
  function calculateFromPricing(modelId, tokenUsage) {
2208
- const pricing = MODEL_PRICING[modelId];
2295
+ const normalized = (0, import_evalforge_types6.normalizeModelId)(modelId);
2296
+ const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
2209
2297
  if (!pricing) return 0;
2210
2298
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
2211
2299
  }
@@ -2280,9 +2368,7 @@ var PROVIDER_ANTHROPIC2 = "anthropic";
2280
2368
  var PROVIDER_OPENAI = "openai";
2281
2369
  var DEFAULT_MAX_TOOL_STEPS = 25;
2282
2370
  function createModel(modelId, baseUrl, headers) {
2283
- const isClaudeModel = import_evalforge_types6.AVAILABLE_CLAUDE_MODEL_IDS.includes(
2284
- modelId
2285
- );
2371
+ const isClaudeModel = isClaudeModelId(modelId);
2286
2372
  if (isClaudeModel) {
2287
2373
  const anthropic = (0, import_anthropic.createAnthropic)({
2288
2374
  baseURL: `${baseUrl}/proxy/anthropic`,
@@ -2296,13 +2382,17 @@ function createModel(modelId, baseUrl, headers) {
2296
2382
  apiKey: "proxy-auth",
2297
2383
  headers
2298
2384
  });
2299
- if (import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
2385
+ if ([...import_evalforge_types7.OPENAI_RESPONSES_MODEL_IDS].some(
2386
+ (id) => modelId === id || modelId.startsWith(id)
2387
+ )) {
2300
2388
  return openai.responses(modelId);
2301
2389
  }
2302
2390
  return openai.chat(modelId);
2303
2391
  }
2304
2392
  function isClaudeModelId(modelId) {
2305
- return import_evalforge_types6.AVAILABLE_CLAUDE_MODEL_IDS.includes(modelId);
2393
+ return import_evalforge_types7.AVAILABLE_CLAUDE_MODEL_IDS.some(
2394
+ (id) => modelId === id || modelId.startsWith(id)
2395
+ );
2306
2396
  }
2307
2397
  function extractSkillContent(files) {
2308
2398
  if (!files || files.length === 0) return void 0;
@@ -2336,7 +2426,9 @@ async function executeWithAiSdk(context) {
2336
2426
  }
2337
2427
  try {
2338
2428
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
2339
- const isResponsesAPI = import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
2429
+ const isResponsesAPI = [...import_evalforge_types7.OPENAI_RESPONSES_MODEL_IDS].some(
2430
+ (id) => modelConfig.model === id || modelConfig.model.startsWith(id)
2431
+ );
2340
2432
  const supportsThinking = isAnthropic || isResponsesAPI;
2341
2433
  const providerOpts = {
2342
2434
  ...isAnthropic && {
@@ -2446,7 +2538,8 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
2446
2538
  return {
2447
2539
  id: (0, import_crypto2.randomUUID)(),
2448
2540
  stepNumber: i + 1,
2449
- type: step.toolCalls.length > 0 ? import_evalforge_types6.LLMStepType.TOOL_USE : import_evalforge_types6.LLMStepType.COMPLETION,
2541
+ turnIndex: i,
2542
+ type: step.toolCalls.length > 0 ? import_evalforge_types7.LLMStepType.TOOL_USE : import_evalforge_types7.LLMStepType.COMPLETION,
2450
2543
  model: modelId,
2451
2544
  provider,
2452
2545
  startedAt: new Date(
@@ -2473,6 +2566,7 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
2473
2566
  steps: traceSteps,
2474
2567
  summary: {
2475
2568
  totalSteps: traceSteps.length,
2569
+ totalTurns: traceSteps.length,
2476
2570
  totalDurationMs,
2477
2571
  totalTokens: finalTokens,
2478
2572
  totalCostUsd,
@@ -2497,7 +2591,7 @@ function emitStartEvent(traceContext, startTime) {
2497
2591
  targetId: traceContext.targetId,
2498
2592
  targetName: traceContext.targetName,
2499
2593
  stepNumber: 0,
2500
- type: import_evalforge_types6.LiveTraceEventType.PROGRESS,
2594
+ type: import_evalforge_types7.LiveTraceEventType.PROGRESS,
2501
2595
  outputPreview: "Starting Simple Agent execution...",
2502
2596
  elapsedMs: Date.now() - startTime,
2503
2597
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -2521,7 +2615,7 @@ function emitStepEvents(traceContext, steps, startTime) {
2521
2615
  targetId: traceContext.targetId,
2522
2616
  targetName: traceContext.targetName,
2523
2617
  stepNumber: i + 1,
2524
- type: isToolStep ? import_evalforge_types6.LiveTraceEventType.TOOL_USE : import_evalforge_types6.LiveTraceEventType.COMPLETION,
2618
+ type: isToolStep ? import_evalforge_types7.LiveTraceEventType.TOOL_USE : import_evalforge_types7.LiveTraceEventType.COMPLETION,
2525
2619
  toolName: firstToolCall?.toolName,
2526
2620
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
2527
2621
  outputPreview: step.text?.slice(0, 500),
@@ -2544,7 +2638,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
2544
2638
  targetId: traceContext.targetId,
2545
2639
  targetName: traceContext.targetName,
2546
2640
  stepNumber,
2547
- type: import_evalforge_types6.LiveTraceEventType.COMPLETION,
2641
+ type: import_evalforge_types7.LiveTraceEventType.COMPLETION,
2548
2642
  outputPreview: "Scenario execution completed",
2549
2643
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2550
2644
  isComplete: true
@@ -3296,17 +3390,11 @@ function extractTemplateFiles(before, after) {
3296
3390
  }
3297
3391
 
3298
3392
  // src/run-scenario/run-agent-with-context.ts
3299
- var import_evalforge_types7 = require("@wix/evalforge-types");
3300
- var DEFAULT_AGENT_COMMAND = import_evalforge_types7.AgentRunCommand.CLAUDE;
3393
+ var import_evalforge_types8 = require("@wix/evalforge-types");
3394
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types8.AgentRunCommand.CLAUDE;
3301
3395
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
3302
- const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
3303
- if (!hasEntities) {
3304
- throw new Error(
3305
- `Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
3306
- );
3307
- }
3308
3396
  const agent = evalData.agent ?? void 0;
3309
- const isSDK = agent?.agentType === import_evalforge_types7.AgentType.SDK;
3397
+ const isSDK = agent?.agentType === import_evalforge_types8.AgentType.SDK;
3310
3398
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
3311
3399
  const adapter = getAdapter(identifier);
3312
3400
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -3392,7 +3480,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
3392
3480
  })),
3393
3481
  durationMs: partialResult.duration
3394
3482
  };
3395
- const defaultJudgeModel = import_evalforge_types8.DEFAULT_JUDGE_MODEL;
3483
+ const defaultJudgeModel = import_evalforge_types9.DEFAULT_JUDGE_MODEL;
3396
3484
  const assertionContext = {
3397
3485
  workDir,
3398
3486
  defaultJudgeModel,
@@ -3407,10 +3495,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
3407
3495
  assertionContext
3408
3496
  ) : [];
3409
3497
  const passed = assertionResults.filter(
3410
- (r) => r.status === import_evalforge_types8.AssertionResultStatus.PASSED
3498
+ (r) => r.status === import_evalforge_types9.AssertionResultStatus.PASSED
3411
3499
  ).length;
3412
3500
  const failed = assertionResults.filter(
3413
- (r) => r.status === import_evalforge_types8.AssertionResultStatus.FAILED
3501
+ (r) => r.status === import_evalforge_types9.AssertionResultStatus.FAILED
3414
3502
  ).length;
3415
3503
  const total = assertionResults.length;
3416
3504
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -3424,7 +3512,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
3424
3512
  }
3425
3513
 
3426
3514
  // src/error-reporter.ts
3427
- var import_evalforge_types9 = require("@wix/evalforge-types");
3515
+ var import_evalforge_types10 = require("@wix/evalforge-types");
3428
3516
  function formatError(error, phase, context) {
3429
3517
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
3430
3518
  if (error instanceof Error) {
@@ -3598,13 +3686,7 @@ async function runEvaluation(projectId2, evalRunId2) {
3598
3686
  presetId: evalData.evalRun.presetId,
3599
3687
  skillIds: evalData.evalRun.skillIds
3600
3688
  };
3601
- const hasEntities = skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || evalData.rules.length > 0;
3602
- if (scenarioItems.length > 0 && !hasEntities) {
3603
- throw new Error(
3604
- `[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
3605
- );
3606
- }
3607
- if (scenarioItems.length > 0 && hasEntities && !agent) {
3689
+ if (scenarioItems.length > 0 && !agent) {
3608
3690
  throw new Error(
3609
3691
  `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
3610
3692
  );
@@ -3675,7 +3757,7 @@ async function runEvaluation(projectId2, evalRunId2) {
3675
3757
  };
3676
3758
  try {
3677
3759
  await api.updateEvalRun(projectId2, evalRunId2, {
3678
- status: import_evalforge_types10.EvalStatus.COMPLETED,
3760
+ status: import_evalforge_types11.EvalStatus.COMPLETED,
3679
3761
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
3680
3762
  });
3681
3763
  } catch (updateErr) {
@@ -3716,7 +3798,7 @@ runEvaluation(projectId, evalRunId).then(() => {
3716
3798
  authToken: config.authToken
3717
3799
  });
3718
3800
  await api.updateEvalRun(projectId, evalRunId, {
3719
- status: import_evalforge_types10.EvalStatus.FAILED,
3801
+ status: import_evalforge_types11.EvalStatus.FAILED,
3720
3802
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
3721
3803
  jobError,
3722
3804
  jobStatus: "FAILED"
@@ -3739,7 +3821,7 @@ runEvaluation(projectId, evalRunId).then(() => {
3739
3821
  authToken
3740
3822
  });
3741
3823
  await api.updateEvalRun(projectId, evalRunId, {
3742
- status: import_evalforge_types10.EvalStatus.FAILED,
3824
+ status: import_evalforge_types11.EvalStatus.FAILED,
3743
3825
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
3744
3826
  jobError: `Config load failed, then: ${jobError}`,
3745
3827
  jobStatus: "FAILED"