npm - @wix/evalforge-evaluator - Versions diffs - 0.57.0 → 0.59.0 - Mend

@wix/evalforge-evaluator 0.57.0 → 0.59.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/build/index.js +165 -88
package/build/index.js.map +4 -4
package/build/index.mjs +135 -58
package/build/index.mjs.map +4 -4
package/build/types/api-client.d.ts +3 -1
package/build/types/fetch-evaluation-data.d.ts +24 -2
package/build/types/run-scenario/agents/claude-code/execute.d.ts +1 -1
package/build/types/run-scenario/agents/claude-code/index.d.ts +1 -1
package/build/types/run-scenario/agents/claude-code/types.d.ts +5 -11
package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +12 -0
package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +12 -0
package/build/types/run-scenario/index.d.ts +1 -1
package/build/types/run-scenario/run-agent-with-context.d.ts +4 -6
package/package.json +4 -4

package/build/index.js CHANGED

@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
 ));
 // src/index.ts
-var import_evalforge_types6 = require("@wix/evalforge-types");
+var import_evalforge_types7 = require("@wix/evalforge-types");
 // src/config.ts
 function loadConfig() {
@@ -166,6 +166,12 @@ function createApiClient(serverUrl, options = "") {
     getTemplate(projectId2, id) {
       return fetchJson(`/projects/${projectId2}/templates/${id}`);
     },
+    getMcp(projectId2, id) {
+      return fetchJson(`/projects/${projectId2}/mcps/${id}`);
+    },
+    getSubAgent(projectId2, id) {
+      return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
+    },
     getAssertion(projectId2, id) {
       return fetchJson(`/projects/${projectId2}/assertions/${id}`);
     },
@@ -188,6 +194,16 @@ function createApiClient(serverUrl, options = "") {
 // src/fetch-evaluation-data.ts
 var import_evalforge_types = require("@wix/evalforge-types");
+function parseSkillNamesFromParams(value) {
+  if (typeof value !== "string") {
+    return [];
+  }
+  const parsed = JSON.parse(value);
+  if (Array.isArray(parsed)) {
+    return parsed.map(String);
+  }
+  return [];
+}
 function applyParamsToAssertion(assertion, params) {
   if (!params || Object.keys(params).length === 0) {
     return assertion;
@@ -209,6 +225,12 @@ function applyParamsToAssertion(assertion, params) {
     }
     return { ...assertion, prompt, systemPrompt };
   }
+  if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
+    return {
+      ...assertion,
+      skillNames: parseSkillNamesFromParams(params.skillNames)
+    };
+  }
   return { ...assertion, ...params };
 }
 function resolveSystemAssertion(assertionId, params) {
@@ -218,7 +240,7 @@ function resolveSystemAssertion(assertionId, params) {
     case "skill_was_called":
       baseAssertion = {
         type: "skill_was_called",
-        skillName: params?.skillName ?? ""
+        skillNames: parseSkillNamesFromParams(params?.skillNames)
       };
       break;
     case "build_passed":
@@ -243,38 +265,15 @@ function resolveSystemAssertion(assertionId, params) {
 }
 function customAssertionToAssertion(ca, params) {
   const config = ca.config;
-  let baseAssertion;
-  switch (ca.type) {
-    case "skill_was_called":
-      baseAssertion = {
-        type: "skill_was_called",
-        skillName: config?.skillName ?? ""
-      };
-      break;
-    case "build_passed":
-      baseAssertion = {
-        type: "build_passed",
-        command: config?.command,
-        expectedExitCode: config?.expectedExitCode
-      };
-      break;
-    case "llm_judge":
-      baseAssertion = {
-        type: "llm_judge",
-        prompt: config?.prompt ?? "",
-        systemPrompt: config?.systemPrompt,
-        minScore: config?.minScore,
-        model: config?.model,
-        maxTokens: config?.maxTokens,
-        temperature: config?.temperature
-      };
-      break;
-    default:
-      baseAssertion = {
-        type: "llm_judge",
-        prompt: ""
-      };
-  }
+  const baseAssertion = {
+    type: "llm_judge",
+    prompt: config?.prompt ?? "",
+    systemPrompt: config?.systemPrompt,
+    minScore: config?.minScore,
+    model: config?.model,
+    maxTokens: config?.maxTokens,
+    temperature: config?.temperature
+  };
   return applyParamsToAssertion(baseAssertion, params);
 }
 async function fetchEvaluationData(api, projectId2, evalRunId2) {
@@ -296,6 +295,18 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
       );
     }
   }
+  let mcps = [];
+  if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
+    mcps = await Promise.all(
+      evalRun.mcpIds.map((id) => api.getMcp(projectId2, id))
+    );
+  }
+  let subAgents = [];
+  if (evalRun.subAgentIds && evalRun.subAgentIds.length > 0) {
+    subAgents = await Promise.all(
+      evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
+    );
+  }
   const templateIds = [
     ...new Set(
       scenarios.map((s) => s.templateId).filter((id) => !!id)
@@ -345,12 +356,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
     skills,
     skillsGroup,
     skillsGroupName,
+    mcps,
+    subAgents,
     scenarioItems
   };
 }
 // src/run-scenario/index.ts
-var import_evalforge_types4 = require("@wix/evalforge-types");
+var import_evalforge_types5 = require("@wix/evalforge-types");
 var import_eval_assertions = require("@wix/eval-assertions");
 // src/run-scenario/environment.ts
@@ -6340,16 +6353,61 @@ function getAdapter(runCommand) {
 }
 // src/run-scenario/agents/claude-code/claude-code-adapter.ts
-var import_evalforge_types3 = require("@wix/evalforge-types");
+var import_evalforge_types4 = require("@wix/evalforge-types");
 // src/run-scenario/agents/claude-code/execute.ts
-var import_evalforge_types2 = require("@wix/evalforge-types");
+var import_evalforge_types3 = require("@wix/evalforge-types");
 var import_crypto = require("crypto");
+var import_promises5 = require("fs/promises");
+var import_path7 = require("path");
+// src/run-scenario/agents/claude-code/write-mcp.ts
 var import_promises3 = require("fs/promises");
 var import_path5 = require("path");
+var import_evalforge_types2 = require("@wix/evalforge-types");
+async function writeMcpToFilesystem(cwd, mcps) {
+  if (mcps.length === 0) return;
+  const mcpServers = {};
+  for (const mcp of mcps) {
+    mcpServers[mcp.name] = mcp.config;
+  }
+  const content = JSON.stringify(
+    { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
+    null,
+    2
+  );
+  const filePath = (0, import_path5.join)(cwd, ".mcp.json");
+  await (0, import_promises3.writeFile)(filePath, content, "utf8");
+  console.log(`[MCP] Written to ${filePath}`);
+}
+// src/run-scenario/agents/claude-code/write-sub-agents.ts
+var import_promises4 = require("fs/promises");
+var import_path6 = require("path");
+var AGENTS_DIR = ".claude/agents";
+function toAgentFilename(name2, index, nameCount) {
+  const base = (name2 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
+  const count = nameCount.get(base) ?? 0;
+  nameCount.set(base, count + 1);
+  return count === 0 ? base : `${base}-${count + 1}`;
+}
+async function writeSubAgentsToFilesystem(cwd, subAgents) {
+  if (subAgents.length === 0) return;
+  const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
+  await (0, import_promises4.mkdir)(agentsDir, { recursive: true });
+  const nameCount = /* @__PURE__ */ new Map();
+  for (const [i, agent] of subAgents.entries()) {
+    const filename = toAgentFilename(agent.name, i, nameCount);
+    const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
+    await (0, import_promises4.writeFile)(filePath, agent.subAgentMd, "utf8");
+  }
+  console.log(`[SubAgents] Written to ${agentsDir}`);
+}
+// src/run-scenario/agents/claude-code/execute.ts
 var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
 function calculateStepCost(inputTokens, outputTokens, modelName) {
-  const model = import_evalforge_types2.AVAILABLE_MODELS.find(
+  const model = import_evalforge_types3.AVAILABLE_MODELS.find(
     (m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
     modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
   );
@@ -6363,7 +6421,7 @@ function calculateStepCost(inputTokens, outputTokens, modelName) {
   return inputCost + outputCost;
 }
 function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
-  console.log(`${import_evalforge_types2.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
+  console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
   if (tracePushUrl) {
     pushTraceEvent(tracePushUrl, event, routeHeader, authToken).catch((err) => {
       console.error("[Trace Push] Failed to push trace event:", err);
@@ -6440,23 +6498,23 @@ async function pushTraceEvent(url, event, routeHeader, authToken) {
   }
 }
 function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
-  let type = import_evalforge_types2.LiveTraceEventType.COMPLETION;
+  let type = import_evalforge_types3.LiveTraceEventType.COMPLETION;
   let toolName;
   let toolArgs;
   let outputPreview;
   let filePath;
   for (const block of message.message.content) {
     if (block.type === "tool_use") {
-      type = import_evalforge_types2.LiveTraceEventType.TOOL_USE;
+      type = import_evalforge_types3.LiveTraceEventType.TOOL_USE;
       toolName = block.name;
       toolArgs = JSON.stringify(block.input).slice(0, 500);
       const input = block.input;
       if (input.file_path || input.path || input.target_file) {
         filePath = String(input.file_path || input.path || input.target_file);
         if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
-          type = import_evalforge_types2.LiveTraceEventType.FILE_WRITE;
+          type = import_evalforge_types3.LiveTraceEventType.FILE_WRITE;
         } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
-          type = import_evalforge_types2.LiveTraceEventType.FILE_READ;
+          type = import_evalforge_types3.LiveTraceEventType.FILE_READ;
         }
       }
     } else if (block.type === "text") {
@@ -6514,7 +6572,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
     }
     return {
       ...baseEvent,
-      type: import_evalforge_types2.LiveTraceEventType.USER,
+      type: import_evalforge_types3.LiveTraceEventType.USER,
       outputPreview: outputPreview || "(tool result)"
     };
   }
@@ -6522,7 +6580,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
     const sysMsg = message;
     return {
       ...baseEvent,
-      type: import_evalforge_types2.LiveTraceEventType.SYSTEM,
+      type: import_evalforge_types3.LiveTraceEventType.SYSTEM,
       outputPreview: sysMsg.subtype || "system"
     };
   }
@@ -6531,7 +6589,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
   }
   return {
     ...baseEvent,
-    type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
+    type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
     outputPreview: `Message type: ${message.type}`
   };
 }
@@ -6574,6 +6632,12 @@ async function executeWithClaudeCode(skills, scenario, options) {
   }
   const startTime = /* @__PURE__ */ new Date();
   const allMessages = [];
+  if (options.mcps && options.mcps.length > 0) {
+    await writeMcpToFilesystem(options.cwd, options.mcps);
+  }
+  if (options.subAgents && options.subAgents.length > 0) {
+    await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
+  }
   console.error(
     "[DEBUG-H4] writeSkillsToFilesystem START",
     JSON.stringify({
@@ -6664,15 +6728,24 @@ async function executeWithClaudeCode(skills, scenario, options) {
   const canUseTool = async () => {
     return { behavior: "allow" };
   };
+  const baseAllowedTools = [
+    "Skill",
+    "Read",
+    "Write",
+    "Edit",
+    "Bash",
+    "Glob",
+    "Grep"
+  ];
+  const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
   const queryOptions = {
     env: sdkEnv,
     cwd: options.cwd,
     settingSources: ["project"],
-    allowedTools: ["Skill", "Read", "Write", "Edit", "Bash", "Glob", "Grep"],
+    allowedTools,
     model: options.model || DEFAULT_MODEL,
     maxTurns,
     maxThinkingTokens: options.maxThinkingTokens,
-    mcpServers: options.mcpServers,
     // Use 'default' permission mode with custom canUseTool handler
     // instead of 'bypassPermissions' which fails on root
     permissionMode: "default",
@@ -6700,10 +6773,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
   );
   console.log("[SDK-DEBUG]   settingSources:", queryOptions.settingSources);
   console.log("[SDK-DEBUG]   allowedTools:", queryOptions.allowedTools);
-  console.log(
-    "[SDK-DEBUG]   mcpServers:",
-    queryOptions.mcpServers ? Object.keys(queryOptions.mcpServers) : "none"
-  );
   console.log("[SDK-DEBUG] Calling SDK query()...");
   if (traceContext) {
     const preExecEvent = {
@@ -6713,7 +6782,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
       targetId: traceContext.targetId,
       targetName: traceContext.targetName,
       stepNumber: 0,
-      type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
+      type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
       outputPreview: JSON.stringify({
         event: "pre-sdk-execution",
         model: queryOptions.model,
@@ -6782,7 +6851,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
           targetId: traceContext.targetId,
           targetName: traceContext.targetName,
           stepNumber: traceStepNumber,
-          type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
+          type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
           outputPreview: progressMessage,
           toolName: lastToolName,
           filePath: lastFilePath,
@@ -6839,18 +6908,18 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
           if (traceEvent) {
             lastToolName = traceEvent.toolName;
             lastFilePath = traceEvent.filePath;
-            if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.THINKING) {
+            if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.THINKING) {
               lastAction = "Thinking...";
-            } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.TOOL_USE) {
+            } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.TOOL_USE) {
               lastAction = extractToolActionDescription(
                 traceEvent.toolName,
                 traceEvent.toolArgs
               );
-            } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_WRITE) {
+            } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_WRITE) {
               lastAction = `Writing: ${traceEvent.filePath || "file"}`;
-            } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_READ) {
+            } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_READ) {
               lastAction = `Reading: ${traceEvent.filePath || "file"}`;
-            } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.COMPLETION) {
+            } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.COMPLETION) {
               lastAction = "Processing response...";
             }
             emitTraceEvent(
@@ -7033,7 +7102,7 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
         targetId: traceContext.targetId,
         targetName: traceContext.targetName,
         stepNumber: traceStepNumber + 1,
-        type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
+        type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
         outputPreview: JSON.stringify(
           {
             event: "sdk-execution-failed",
@@ -7072,7 +7141,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
         targetId: traceContext.targetId,
         targetName: traceContext.targetName,
         stepNumber: traceStepNumber + 1,
-        type: import_evalforge_types2.LiveTraceEventType.COMPLETION,
+        type: import_evalforge_types3.LiveTraceEventType.COMPLETION,
         outputPreview: "Scenario execution completed",
         timestamp: (/* @__PURE__ */ new Date()).toISOString(),
         isComplete: true
@@ -7114,10 +7183,10 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
 async function writeSkillsToFilesystem(cwd, skills) {
   for (const skill of skills) {
     const skillName = skill.name;
-    const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
-    await (0, import_promises3.mkdir)(skillDir, { recursive: true });
-    const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
-    await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
+    const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
+    await (0, import_promises5.mkdir)(skillDir, { recursive: true });
+    const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
+    await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
     console.log(`[Skill] Written to ${skillPath}`);
   }
 }
@@ -7250,7 +7319,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
     return {
       id: (0, import_crypto.randomUUID)(),
       stepNumber: index + 1,
-      type: step.toolCalls?.length ? import_evalforge_types2.LLMStepType.TOOL_USE : import_evalforge_types2.LLMStepType.COMPLETION,
+      type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
       model,
       provider: "anthropic",
       startedAt: step.startedAt.toISOString(),
@@ -7321,9 +7390,11 @@ var ClaudeCodeAdapter = class {
       modelConfig,
       aiGatewayUrl,
       aiGatewayHeaders,
-      traceContext
+      traceContext,
+      mcps,
+      subAgents
     } = context;
-    const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
+    const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
     const options = {
       cwd,
       model: modelForSdk,
@@ -7331,7 +7402,9 @@ var ClaudeCodeAdapter = class {
       maxTokens: modelConfig?.maxTokens,
       aiGatewayUrl,
       aiGatewayHeaders,
-      traceContext
+      traceContext,
+      mcps,
+      subAgents
     };
     const { result, llmTrace } = await executeWithClaudeCode(
       skills,
@@ -7358,7 +7431,7 @@ defaultRegistry.register(claudeCodeAdapter);
 // src/run-scenario/file-diff.ts
 var import_fs6 = require("fs");
-var import_path6 = require("path");
+var import_path8 = require("path");
 // ../../node_modules/diff/lib/index.mjs
 function Diff() {
@@ -7534,7 +7607,7 @@ Diff.prototype = {
   tokenize: function tokenize(value) {
     return Array.from(value);
   },
-  join: function join3(chars) {
+  join: function join5(chars) {
     return chars.join("");
   },
   postProcess: function postProcess(changeObjects) {
@@ -7974,8 +8047,8 @@ function snapshotDirectory(dir, baseDir) {
   }
   const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
   for (const entry of entries) {
-    const fullPath = (0, import_path6.join)(dir, entry.name);
-    const relativePath = (0, import_path6.relative)(base, fullPath);
+    const fullPath = (0, import_path8.join)(dir, entry.name);
+    const relativePath = (0, import_path8.relative)(base, fullPath);
     if (shouldIgnore(entry.name)) {
       continue;
     }
@@ -8084,13 +8157,18 @@ function extractTemplateFiles(before, after) {
 // src/run-scenario/run-agent-with-context.ts
 var DEFAULT_AGENT_COMMAND = "claude";
-async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
+async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
+  const skillsGroupId = evalData.evalRun.skillsGroupId;
+  if (!skillsGroupId) {
+    throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
+  }
+  const agent = evalData.codeAgent ?? void 0;
   const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
   const adapter = getAdapter(runCommand);
   const startedAt = (/* @__PURE__ */ new Date()).toISOString();
   const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
   const executionContext = {
-    skills,
+    skills: evalData.skills,
     scenario,
     cwd: workDir || process.cwd(),
     modelConfig: agent?.modelConfig,
@@ -8101,11 +8179,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
       scenarioId: scenario.id,
       scenarioName: scenario.name,
       targetId: skillsGroupId,
-      targetName: skillsGroupName,
+      targetName: evalData.skillsGroupName,
       tracePushUrl: config.tracePushUrl,
       routeHeader: config.routeHeader,
       authToken: config.authToken
-    }
+    },
+    mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
+    subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
   };
   const result = await adapter.execute(executionContext);
   const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -8115,7 +8195,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
   return {
     id: (0, import_crypto2.randomUUID)(),
     targetId: skillsGroupId,
-    targetName: skillsGroupName,
+    targetName: evalData.skillsGroupName,
     scenarioId: scenario.id,
     scenarioName: scenario.name,
     modelConfig: agent?.modelConfig,
@@ -8143,10 +8223,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
     config,
     evalRunId2,
     scenario,
-    evalData.skills,
-    skillsGroupId,
-    evalData.skillsGroupName,
-    evalData.codeAgent ?? void 0,
+    evalData,
     workDir
   );
   const inlineAssertions = scenario.assertions ?? [];
@@ -8178,10 +8255,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
     assertionContext
   ) : [];
   const passed = assertionResults.filter(
-    (r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
+    (r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
   ).length;
   const failed = assertionResults.filter(
-    (r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
+    (r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
   ).length;
   const total = assertionResults.length;
   const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -8195,7 +8272,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
 }
 // src/error-reporter.ts
-var import_evalforge_types5 = require("@wix/evalforge-types");
+var import_evalforge_types6 = require("@wix/evalforge-types");
 function formatError(error, phase, context) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString();
   if (error instanceof Error) {
@@ -8444,7 +8521,7 @@ async function runEvaluation(projectId2, evalRunId2) {
   };
   try {
     await api.updateEvalRun(projectId2, evalRunId2, {
-      status: import_evalforge_types6.EvalStatus.COMPLETED,
+      status: import_evalforge_types7.EvalStatus.COMPLETED,
       completedAt: (/* @__PURE__ */ new Date()).toISOString()
     });
   } catch (updateErr) {
@@ -8485,7 +8562,7 @@ runEvaluation(projectId, evalRunId).then(() => {
       authToken: config.authToken
     });
     await api.updateEvalRun(projectId, evalRunId, {
-      status: import_evalforge_types6.EvalStatus.FAILED,
+      status: import_evalforge_types7.EvalStatus.FAILED,
       completedAt: (/* @__PURE__ */ new Date()).toISOString(),
       jobError,
       jobStatus: "FAILED"
@@ -8508,7 +8585,7 @@ runEvaluation(projectId, evalRunId).then(() => {
           authToken
         });
         await api.updateEvalRun(projectId, evalRunId, {
-          status: import_evalforge_types6.EvalStatus.FAILED,
+          status: import_evalforge_types7.EvalStatus.FAILED,
           completedAt: (/* @__PURE__ */ new Date()).toISOString(),
           jobError: `Config load failed, then: ${jobError}`,
           jobStatus: "FAILED"