@wix/evalforge-evaluator 0.56.0 → 0.58.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types6 = require("@wix/evalforge-types");
27
+ var import_evalforge_types7 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -166,6 +166,12 @@ function createApiClient(serverUrl, options = "") {
166
166
  getTemplate(projectId2, id) {
167
167
  return fetchJson(`/projects/${projectId2}/templates/${id}`);
168
168
  },
169
+ getMcp(projectId2, id) {
170
+ return fetchJson(`/projects/${projectId2}/mcps/${id}`);
171
+ },
172
+ getSubAgent(projectId2, id) {
173
+ return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
174
+ },
169
175
  getAssertion(projectId2, id) {
170
176
  return fetchJson(`/projects/${projectId2}/assertions/${id}`);
171
177
  },
@@ -296,6 +302,18 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
296
302
  );
297
303
  }
298
304
  }
305
+ let mcps = [];
306
+ if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
307
+ mcps = await Promise.all(
308
+ evalRun.mcpIds.map((id) => api.getMcp(projectId2, id))
309
+ );
310
+ }
311
+ let subAgents = [];
312
+ if (evalRun.subAgentIds && evalRun.subAgentIds.length > 0) {
313
+ subAgents = await Promise.all(
314
+ evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
315
+ );
316
+ }
299
317
  const templateIds = [
300
318
  ...new Set(
301
319
  scenarios.map((s) => s.templateId).filter((id) => !!id)
@@ -345,12 +363,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
345
363
  skills,
346
364
  skillsGroup,
347
365
  skillsGroupName,
366
+ mcps,
367
+ subAgents,
348
368
  scenarioItems
349
369
  };
350
370
  }
351
371
 
352
372
  // src/run-scenario/index.ts
353
- var import_evalforge_types4 = require("@wix/evalforge-types");
373
+ var import_evalforge_types5 = require("@wix/evalforge-types");
354
374
  var import_eval_assertions = require("@wix/eval-assertions");
355
375
 
356
376
  // src/run-scenario/environment.ts
@@ -6173,9 +6193,10 @@ function cleanAppleDoubleFiles(dir) {
6173
6193
  }
6174
6194
  }
6175
6195
  async function downloadAndExtractTemplate(template, workDir) {
6176
- if (!(0, import_fs5.existsSync)(workDir)) {
6177
- (0, import_fs5.mkdirSync)(workDir, { recursive: true });
6196
+ if ((0, import_fs5.existsSync)(workDir)) {
6197
+ (0, import_fs5.rmSync)(workDir, { recursive: true });
6178
6198
  }
6199
+ (0, import_fs5.mkdirSync)(workDir, { recursive: true });
6179
6200
  const response = await fetch(template.downloadUrl);
6180
6201
  if (!response.ok) {
6181
6202
  throw new Error(
@@ -6339,16 +6360,61 @@ function getAdapter(runCommand) {
6339
6360
  }
6340
6361
 
6341
6362
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
6342
- var import_evalforge_types3 = require("@wix/evalforge-types");
6363
+ var import_evalforge_types4 = require("@wix/evalforge-types");
6343
6364
 
6344
6365
  // src/run-scenario/agents/claude-code/execute.ts
6345
- var import_evalforge_types2 = require("@wix/evalforge-types");
6366
+ var import_evalforge_types3 = require("@wix/evalforge-types");
6346
6367
  var import_crypto = require("crypto");
6368
+ var import_promises5 = require("fs/promises");
6369
+ var import_path7 = require("path");
6370
+
6371
+ // src/run-scenario/agents/claude-code/write-mcp.ts
6347
6372
  var import_promises3 = require("fs/promises");
6348
6373
  var import_path5 = require("path");
6374
+ var import_evalforge_types2 = require("@wix/evalforge-types");
6375
/**
 * Write the configured MCP servers to `<cwd>/.mcp.json`.
 *
 * The file contains one top-level key (`MCP_SERVERS_JSON_KEY` from
 * `@wix/evalforge-types`) whose value maps each MCP's `name` to its `config`.
 * Nothing is written when the list is empty or missing. When two entries share
 * a name, the later one wins.
 *
 * @param {string} cwd - Directory in which `.mcp.json` is created (an existing file is overwritten).
 * @param {Array<{name: string, config: unknown}>} mcps - MCP definitions to serialize.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function writeMcpToFilesystem(cwd, mcps) {
  if (!mcps || mcps.length === 0) return;
  // Object.fromEntries defines own data properties, so a server whose name is
  // "__proto__" is serialized like any other key instead of silently replacing
  // the map's prototype (and vanishing from the JSON output).
  const mcpServers = Object.fromEntries(
    mcps.map((mcp) => [mcp.name, mcp.config])
  );
  const content = JSON.stringify(
    { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
    null,
    2
  );
  const filePath = (0, import_path5.join)(cwd, ".mcp.json");
  await (0, import_promises3.writeFile)(filePath, content, "utf8");
  console.log(`[MCP] Written to ${filePath}`);
}
6390
+
6391
+ // src/run-scenario/agents/claude-code/write-sub-agents.ts
6392
+ var import_promises4 = require("fs/promises");
6393
+ var import_path6 = require("path");
6394
+ var AGENTS_DIR = ".claude/agents";
6395
/**
 * Derive a unique, filesystem-safe filename stem (no extension) for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become "-", every character outside
 * `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed. When nothing
 * is left, `sub-agent-<index>` is used. Repeated stems get a numeric suffix
 * starting at `-2`.
 *
 * @param {string} name2 - Sub-agent display name; may be empty or missing.
 * @param {number} index - Position of the agent in its list; only used for the fallback stem.
 * @param {Map<string, number>} nameCount - Registry shared across calls and mutated here.
 *   It holds every stem requested so far and every filename already returned.
 * @returns {string} A filename stem that has not been returned before for this registry.
 */
function toAgentFilename(name2, index, nameCount) {
  const slug = String(name2 || "")
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "")
    .replace(/^-+|-+$/g, "");
  const base = slug || `sub-agent-${index}`;
  let count = nameCount.get(base) ?? 0;
  let filename = count === 0 ? base : `${base}-${count + 1}`;
  // A suffixed candidate such as "foo-2" may already belong to an agent that is
  // literally named "foo-2"; keep bumping the suffix until the stem is free so
  // no two agents are written to the same file.
  while (filename !== base && nameCount.has(filename)) {
    count += 1;
    filename = `${base}-${count + 1}`;
  }
  nameCount.set(base, count + 1);
  if (filename !== base) {
    // Reserve the suffixed stem too, so a later agent whose own slug equals it
    // is forced onto a different filename.
    nameCount.set(filename, 1);
  }
  return filename;
}
6401
/**
 * Write one Markdown file per sub-agent into `<cwd>/.claude/agents`.
 *
 * Does nothing for an empty list. Otherwise the directory is created
 * (recursively) and the agents are written one after another, in list order.
 * Each file is named `<stem>.md`, where the stem comes from `toAgentFilename`,
 * and contains the agent's `subAgentMd` text encoded as UTF-8.
 *
 * @param {string} cwd - Working directory that receives the agents directory.
 * @param {Array<{name: string, subAgentMd: string}>} subAgents - Sub-agents to write.
 * @returns {Promise<void>} Resolves after the last file has been written.
 */
async function writeSubAgentsToFilesystem(cwd, subAgents) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = (0, import_path6.join)(cwd, AGENTS_DIR);
  await (0, import_promises4.mkdir)(targetDir, { recursive: true });
  // One registry for the whole batch so repeated names get distinct stems.
  const usedStems = new Map();
  let position = 0;
  for (const subAgent of subAgents) {
    const stem = toAgentFilename(subAgent.name, position, usedStems);
    position += 1;
    const destination = (0, import_path6.join)(targetDir, `${stem}.md`);
    // Written sequentially on purpose: keeps the original ordering of writes.
    await (0, import_promises4.writeFile)(destination, subAgent.subAgentMd, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
6413
+
6414
+ // src/run-scenario/agents/claude-code/execute.ts
6349
6415
  var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
6350
6416
  function calculateStepCost(inputTokens, outputTokens, modelName) {
6351
- const model = import_evalforge_types2.AVAILABLE_MODELS.find(
6417
+ const model = import_evalforge_types3.AVAILABLE_MODELS.find(
6352
6418
  (m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
6353
6419
  modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
6354
6420
  );
@@ -6362,7 +6428,7 @@ function calculateStepCost(inputTokens, outputTokens, modelName) {
6362
6428
  return inputCost + outputCost;
6363
6429
  }
6364
6430
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
6365
- console.log(`${import_evalforge_types2.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
6431
+ console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
6366
6432
  if (tracePushUrl) {
6367
6433
  pushTraceEvent(tracePushUrl, event, routeHeader, authToken).catch((err) => {
6368
6434
  console.error("[Trace Push] Failed to push trace event:", err);
@@ -6439,23 +6505,23 @@ async function pushTraceEvent(url, event, routeHeader, authToken) {
6439
6505
  }
6440
6506
  }
6441
6507
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6442
- let type = import_evalforge_types2.LiveTraceEventType.COMPLETION;
6508
+ let type = import_evalforge_types3.LiveTraceEventType.COMPLETION;
6443
6509
  let toolName;
6444
6510
  let toolArgs;
6445
6511
  let outputPreview;
6446
6512
  let filePath;
6447
6513
  for (const block of message.message.content) {
6448
6514
  if (block.type === "tool_use") {
6449
- type = import_evalforge_types2.LiveTraceEventType.TOOL_USE;
6515
+ type = import_evalforge_types3.LiveTraceEventType.TOOL_USE;
6450
6516
  toolName = block.name;
6451
6517
  toolArgs = JSON.stringify(block.input).slice(0, 500);
6452
6518
  const input = block.input;
6453
6519
  if (input.file_path || input.path || input.target_file) {
6454
6520
  filePath = String(input.file_path || input.path || input.target_file);
6455
6521
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
6456
- type = import_evalforge_types2.LiveTraceEventType.FILE_WRITE;
6522
+ type = import_evalforge_types3.LiveTraceEventType.FILE_WRITE;
6457
6523
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
6458
- type = import_evalforge_types2.LiveTraceEventType.FILE_READ;
6524
+ type = import_evalforge_types3.LiveTraceEventType.FILE_READ;
6459
6525
  }
6460
6526
  }
6461
6527
  } else if (block.type === "text") {
@@ -6513,7 +6579,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6513
6579
  }
6514
6580
  return {
6515
6581
  ...baseEvent,
6516
- type: import_evalforge_types2.LiveTraceEventType.USER,
6582
+ type: import_evalforge_types3.LiveTraceEventType.USER,
6517
6583
  outputPreview: outputPreview || "(tool result)"
6518
6584
  };
6519
6585
  }
@@ -6521,7 +6587,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6521
6587
  const sysMsg = message;
6522
6588
  return {
6523
6589
  ...baseEvent,
6524
- type: import_evalforge_types2.LiveTraceEventType.SYSTEM,
6590
+ type: import_evalforge_types3.LiveTraceEventType.SYSTEM,
6525
6591
  outputPreview: sysMsg.subtype || "system"
6526
6592
  };
6527
6593
  }
@@ -6530,7 +6596,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6530
6596
  }
6531
6597
  return {
6532
6598
  ...baseEvent,
6533
- type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
6599
+ type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
6534
6600
  outputPreview: `Message type: ${message.type}`
6535
6601
  };
6536
6602
  }
@@ -6573,6 +6639,12 @@ async function executeWithClaudeCode(skills, scenario, options) {
6573
6639
  }
6574
6640
  const startTime = /* @__PURE__ */ new Date();
6575
6641
  const allMessages = [];
6642
+ if (options.mcps && options.mcps.length > 0) {
6643
+ await writeMcpToFilesystem(options.cwd, options.mcps);
6644
+ }
6645
+ if (options.subAgents && options.subAgents.length > 0) {
6646
+ await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
6647
+ }
6576
6648
  console.error(
6577
6649
  "[DEBUG-H4] writeSkillsToFilesystem START",
6578
6650
  JSON.stringify({
@@ -6663,15 +6735,24 @@ async function executeWithClaudeCode(skills, scenario, options) {
6663
6735
  const canUseTool = async () => {
6664
6736
  return { behavior: "allow" };
6665
6737
  };
6738
+ const baseAllowedTools = [
6739
+ "Skill",
6740
+ "Read",
6741
+ "Write",
6742
+ "Edit",
6743
+ "Bash",
6744
+ "Glob",
6745
+ "Grep"
6746
+ ];
6747
+ const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
6666
6748
  const queryOptions = {
6667
6749
  env: sdkEnv,
6668
6750
  cwd: options.cwd,
6669
6751
  settingSources: ["project"],
6670
- allowedTools: ["Skill", "Read", "Write", "Edit", "Bash", "Glob", "Grep"],
6752
+ allowedTools,
6671
6753
  model: options.model || DEFAULT_MODEL,
6672
6754
  maxTurns,
6673
6755
  maxThinkingTokens: options.maxThinkingTokens,
6674
- mcpServers: options.mcpServers,
6675
6756
  // Use 'default' permission mode with custom canUseTool handler
6676
6757
  // instead of 'bypassPermissions' which fails on root
6677
6758
  permissionMode: "default",
@@ -6699,10 +6780,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
6699
6780
  );
6700
6781
  console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
6701
6782
  console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
6702
- console.log(
6703
- "[SDK-DEBUG] mcpServers:",
6704
- queryOptions.mcpServers ? Object.keys(queryOptions.mcpServers) : "none"
6705
- );
6706
6783
  console.log("[SDK-DEBUG] Calling SDK query()...");
6707
6784
  if (traceContext) {
6708
6785
  const preExecEvent = {
@@ -6712,7 +6789,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
6712
6789
  targetId: traceContext.targetId,
6713
6790
  targetName: traceContext.targetName,
6714
6791
  stepNumber: 0,
6715
- type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
6792
+ type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
6716
6793
  outputPreview: JSON.stringify({
6717
6794
  event: "pre-sdk-execution",
6718
6795
  model: queryOptions.model,
@@ -6781,7 +6858,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
6781
6858
  targetId: traceContext.targetId,
6782
6859
  targetName: traceContext.targetName,
6783
6860
  stepNumber: traceStepNumber,
6784
- type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
6861
+ type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
6785
6862
  outputPreview: progressMessage,
6786
6863
  toolName: lastToolName,
6787
6864
  filePath: lastFilePath,
@@ -6838,18 +6915,18 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
6838
6915
  if (traceEvent) {
6839
6916
  lastToolName = traceEvent.toolName;
6840
6917
  lastFilePath = traceEvent.filePath;
6841
- if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.THINKING) {
6918
+ if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.THINKING) {
6842
6919
  lastAction = "Thinking...";
6843
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.TOOL_USE) {
6920
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.TOOL_USE) {
6844
6921
  lastAction = extractToolActionDescription(
6845
6922
  traceEvent.toolName,
6846
6923
  traceEvent.toolArgs
6847
6924
  );
6848
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_WRITE) {
6925
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_WRITE) {
6849
6926
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
6850
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_READ) {
6927
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_READ) {
6851
6928
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
6852
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.COMPLETION) {
6929
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.COMPLETION) {
6853
6930
  lastAction = "Processing response...";
6854
6931
  }
6855
6932
  emitTraceEvent(
@@ -7032,7 +7109,7 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
7032
7109
  targetId: traceContext.targetId,
7033
7110
  targetName: traceContext.targetName,
7034
7111
  stepNumber: traceStepNumber + 1,
7035
- type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
7112
+ type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
7036
7113
  outputPreview: JSON.stringify(
7037
7114
  {
7038
7115
  event: "sdk-execution-failed",
@@ -7071,7 +7148,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7071
7148
  targetId: traceContext.targetId,
7072
7149
  targetName: traceContext.targetName,
7073
7150
  stepNumber: traceStepNumber + 1,
7074
- type: import_evalforge_types2.LiveTraceEventType.COMPLETION,
7151
+ type: import_evalforge_types3.LiveTraceEventType.COMPLETION,
7075
7152
  outputPreview: "Scenario execution completed",
7076
7153
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7077
7154
  isComplete: true
@@ -7113,10 +7190,10 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7113
7190
  async function writeSkillsToFilesystem(cwd, skills) {
7114
7191
  for (const skill of skills) {
7115
7192
  const skillName = skill.name;
7116
- const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
7117
- await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7118
- const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
7119
- await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
7193
+ const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
7194
+ await (0, import_promises5.mkdir)(skillDir, { recursive: true });
7195
+ const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
7196
+ await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
7120
7197
  console.log(`[Skill] Written to ${skillPath}`);
7121
7198
  }
7122
7199
  }
@@ -7249,7 +7326,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
7249
7326
  return {
7250
7327
  id: (0, import_crypto.randomUUID)(),
7251
7328
  stepNumber: index + 1,
7252
- type: step.toolCalls?.length ? import_evalforge_types2.LLMStepType.TOOL_USE : import_evalforge_types2.LLMStepType.COMPLETION,
7329
+ type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
7253
7330
  model,
7254
7331
  provider: "anthropic",
7255
7332
  startedAt: step.startedAt.toISOString(),
@@ -7320,9 +7397,11 @@ var ClaudeCodeAdapter = class {
7320
7397
  modelConfig,
7321
7398
  aiGatewayUrl,
7322
7399
  aiGatewayHeaders,
7323
- traceContext
7400
+ traceContext,
7401
+ mcps,
7402
+ subAgents
7324
7403
  } = context;
7325
- const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7404
+ const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7326
7405
  const options = {
7327
7406
  cwd,
7328
7407
  model: modelForSdk,
@@ -7330,7 +7409,9 @@ var ClaudeCodeAdapter = class {
7330
7409
  maxTokens: modelConfig?.maxTokens,
7331
7410
  aiGatewayUrl,
7332
7411
  aiGatewayHeaders,
7333
- traceContext
7412
+ traceContext,
7413
+ mcps,
7414
+ subAgents
7334
7415
  };
7335
7416
  const { result, llmTrace } = await executeWithClaudeCode(
7336
7417
  skills,
@@ -7357,7 +7438,7 @@ defaultRegistry.register(claudeCodeAdapter);
7357
7438
 
7358
7439
  // src/run-scenario/file-diff.ts
7359
7440
  var import_fs6 = require("fs");
7360
- var import_path6 = require("path");
7441
+ var import_path8 = require("path");
7361
7442
 
7362
7443
  // ../../node_modules/diff/lib/index.mjs
7363
7444
  function Diff() {
@@ -7533,7 +7614,7 @@ Diff.prototype = {
7533
7614
  tokenize: function tokenize(value) {
7534
7615
  return Array.from(value);
7535
7616
  },
7536
- join: function join3(chars) {
7617
+ join: function join5(chars) {
7537
7618
  return chars.join("");
7538
7619
  },
7539
7620
  postProcess: function postProcess(changeObjects) {
@@ -7973,8 +8054,8 @@ function snapshotDirectory(dir, baseDir) {
7973
8054
  }
7974
8055
  const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
7975
8056
  for (const entry of entries) {
7976
- const fullPath = (0, import_path6.join)(dir, entry.name);
7977
- const relativePath = (0, import_path6.relative)(base, fullPath);
8057
+ const fullPath = (0, import_path8.join)(dir, entry.name);
8058
+ const relativePath = (0, import_path8.relative)(base, fullPath);
7978
8059
  if (shouldIgnore(entry.name)) {
7979
8060
  continue;
7980
8061
  }
@@ -8083,13 +8164,18 @@ function extractTemplateFiles(before, after) {
8083
8164
 
8084
8165
  // src/run-scenario/run-agent-with-context.ts
8085
8166
  var DEFAULT_AGENT_COMMAND = "claude";
8086
- async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
8167
+ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
8168
+ const skillsGroupId = evalData.evalRun.skillsGroupId;
8169
+ if (!skillsGroupId) {
8170
+ throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
8171
+ }
8172
+ const agent = evalData.codeAgent ?? void 0;
8087
8173
  const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
8088
8174
  const adapter = getAdapter(runCommand);
8089
8175
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
8090
8176
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
8091
8177
  const executionContext = {
8092
- skills,
8178
+ skills: evalData.skills,
8093
8179
  scenario,
8094
8180
  cwd: workDir || process.cwd(),
8095
8181
  modelConfig: agent?.modelConfig,
@@ -8100,11 +8186,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
8100
8186
  scenarioId: scenario.id,
8101
8187
  scenarioName: scenario.name,
8102
8188
  targetId: skillsGroupId,
8103
- targetName: skillsGroupName,
8189
+ targetName: evalData.skillsGroupName,
8104
8190
  tracePushUrl: config.tracePushUrl,
8105
8191
  routeHeader: config.routeHeader,
8106
8192
  authToken: config.authToken
8107
- }
8193
+ },
8194
+ mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
8195
+ subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
8108
8196
  };
8109
8197
  const result = await adapter.execute(executionContext);
8110
8198
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -8114,7 +8202,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
8114
8202
  return {
8115
8203
  id: (0, import_crypto2.randomUUID)(),
8116
8204
  targetId: skillsGroupId,
8117
- targetName: skillsGroupName,
8205
+ targetName: evalData.skillsGroupName,
8118
8206
  scenarioId: scenario.id,
8119
8207
  scenarioName: scenario.name,
8120
8208
  modelConfig: agent?.modelConfig,
@@ -8142,10 +8230,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8142
8230
  config,
8143
8231
  evalRunId2,
8144
8232
  scenario,
8145
- evalData.skills,
8146
- skillsGroupId,
8147
- evalData.skillsGroupName,
8148
- evalData.codeAgent ?? void 0,
8233
+ evalData,
8149
8234
  workDir
8150
8235
  );
8151
8236
  const inlineAssertions = scenario.assertions ?? [];
@@ -8177,10 +8262,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8177
8262
  assertionContext
8178
8263
  ) : [];
8179
8264
  const passed = assertionResults.filter(
8180
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
8265
+ (r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
8181
8266
  ).length;
8182
8267
  const failed = assertionResults.filter(
8183
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
8268
+ (r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
8184
8269
  ).length;
8185
8270
  const total = assertionResults.length;
8186
8271
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -8194,7 +8279,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8194
8279
  }
8195
8280
 
8196
8281
  // src/error-reporter.ts
8197
- var import_evalforge_types5 = require("@wix/evalforge-types");
8282
+ var import_evalforge_types6 = require("@wix/evalforge-types");
8198
8283
  function formatError(error, phase, context) {
8199
8284
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
8200
8285
  if (error instanceof Error) {
@@ -8443,7 +8528,7 @@ async function runEvaluation(projectId2, evalRunId2) {
8443
8528
  };
8444
8529
  try {
8445
8530
  await api.updateEvalRun(projectId2, evalRunId2, {
8446
- status: import_evalforge_types6.EvalStatus.COMPLETED,
8531
+ status: import_evalforge_types7.EvalStatus.COMPLETED,
8447
8532
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
8448
8533
  });
8449
8534
  } catch (updateErr) {
@@ -8484,7 +8569,7 @@ runEvaluation(projectId, evalRunId).then(() => {
8484
8569
  authToken: config.authToken
8485
8570
  });
8486
8571
  await api.updateEvalRun(projectId, evalRunId, {
8487
- status: import_evalforge_types6.EvalStatus.FAILED,
8572
+ status: import_evalforge_types7.EvalStatus.FAILED,
8488
8573
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8489
8574
  jobError,
8490
8575
  jobStatus: "FAILED"
@@ -8507,7 +8592,7 @@ runEvaluation(projectId, evalRunId).then(() => {
8507
8592
  authToken
8508
8593
  });
8509
8594
  await api.updateEvalRun(projectId, evalRunId, {
8510
- status: import_evalforge_types6.EvalStatus.FAILED,
8595
+ status: import_evalforge_types7.EvalStatus.FAILED,
8511
8596
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8512
8597
  jobError: `Config load failed, then: ${jobError}`,
8513
8598
  jobStatus: "FAILED"