@wix/evalforge-evaluator 0.57.0 → 0.59.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types6 = require("@wix/evalforge-types");
27
+ var import_evalforge_types7 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -166,6 +166,12 @@ function createApiClient(serverUrl, options = "") {
166
166
  getTemplate(projectId2, id) {
167
167
  return fetchJson(`/projects/${projectId2}/templates/${id}`);
168
168
  },
169
+ getMcp(projectId2, id) {
170
+ return fetchJson(`/projects/${projectId2}/mcps/${id}`);
171
+ },
172
+ getSubAgent(projectId2, id) {
173
+ return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
174
+ },
169
175
  getAssertion(projectId2, id) {
170
176
  return fetchJson(`/projects/${projectId2}/assertions/${id}`);
171
177
  },
@@ -188,6 +194,16 @@ function createApiClient(serverUrl, options = "") {
188
194
 
189
195
  // src/fetch-evaluation-data.ts
190
196
  var import_evalforge_types = require("@wix/evalforge-types");
197
/**
 * Normalizes the `skillNames` assertion parameter into an array of strings.
 *
 * The parameter is expected to be a JSON-encoded array (e.g. '["a","b"]').
 * An actual array is accepted as well. Anything else yields an empty list.
 *
 * @param {unknown} value - Raw parameter value.
 * @returns {string[]} Skill names, each coerced with `String`; `[]` when the
 *   value is missing, not an array, or not valid JSON.
 */
function parseSkillNamesFromParams(value) {
  if (Array.isArray(value)) {
    return value.map((name) => String(name));
  }
  if (typeof value !== "string") {
    return [];
  }
  let parsed;
  try {
    parsed = JSON.parse(value);
  } catch (err) {
    // A malformed param must not abort resolution of every other assertion;
    // report it and fall back to the same "no skills" result used for other
    // unusable values.
    console.warn(
      `[Assertions] Ignoring skillNames param that is not valid JSON: ${err instanceof Error ? err.message : String(err)}`
    );
    return [];
  }
  return Array.isArray(parsed) ? parsed.map((name) => String(name)) : [];
}
191
207
  function applyParamsToAssertion(assertion, params) {
192
208
  if (!params || Object.keys(params).length === 0) {
193
209
  return assertion;
@@ -209,6 +225,12 @@ function applyParamsToAssertion(assertion, params) {
209
225
  }
210
226
  return { ...assertion, prompt, systemPrompt };
211
227
  }
228
+ if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
229
+ return {
230
+ ...assertion,
231
+ skillNames: parseSkillNamesFromParams(params.skillNames)
232
+ };
233
+ }
212
234
  return { ...assertion, ...params };
213
235
  }
214
236
  function resolveSystemAssertion(assertionId, params) {
@@ -218,7 +240,7 @@ function resolveSystemAssertion(assertionId, params) {
218
240
  case "skill_was_called":
219
241
  baseAssertion = {
220
242
  type: "skill_was_called",
221
- skillName: params?.skillName ?? ""
243
+ skillNames: parseSkillNamesFromParams(params?.skillNames)
222
244
  };
223
245
  break;
224
246
  case "build_passed":
@@ -243,38 +265,15 @@ function resolveSystemAssertion(assertionId, params) {
243
265
  }
244
266
  function customAssertionToAssertion(ca, params) {
245
267
  const config = ca.config;
246
- let baseAssertion;
247
- switch (ca.type) {
248
- case "skill_was_called":
249
- baseAssertion = {
250
- type: "skill_was_called",
251
- skillName: config?.skillName ?? ""
252
- };
253
- break;
254
- case "build_passed":
255
- baseAssertion = {
256
- type: "build_passed",
257
- command: config?.command,
258
- expectedExitCode: config?.expectedExitCode
259
- };
260
- break;
261
- case "llm_judge":
262
- baseAssertion = {
263
- type: "llm_judge",
264
- prompt: config?.prompt ?? "",
265
- systemPrompt: config?.systemPrompt,
266
- minScore: config?.minScore,
267
- model: config?.model,
268
- maxTokens: config?.maxTokens,
269
- temperature: config?.temperature
270
- };
271
- break;
272
- default:
273
- baseAssertion = {
274
- type: "llm_judge",
275
- prompt: ""
276
- };
277
- }
268
+ const baseAssertion = {
269
+ type: "llm_judge",
270
+ prompt: config?.prompt ?? "",
271
+ systemPrompt: config?.systemPrompt,
272
+ minScore: config?.minScore,
273
+ model: config?.model,
274
+ maxTokens: config?.maxTokens,
275
+ temperature: config?.temperature
276
+ };
278
277
  return applyParamsToAssertion(baseAssertion, params);
279
278
  }
280
279
  async function fetchEvaluationData(api, projectId2, evalRunId2) {
@@ -296,6 +295,18 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
296
295
  );
297
296
  }
298
297
  }
298
+ let mcps = [];
299
+ if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
300
+ mcps = await Promise.all(
301
+ evalRun.mcpIds.map((id) => api.getMcp(projectId2, id))
302
+ );
303
+ }
304
+ let subAgents = [];
305
+ if (evalRun.subAgentIds && evalRun.subAgentIds.length > 0) {
306
+ subAgents = await Promise.all(
307
+ evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
308
+ );
309
+ }
299
310
  const templateIds = [
300
311
  ...new Set(
301
312
  scenarios.map((s) => s.templateId).filter((id) => !!id)
@@ -345,12 +356,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
345
356
  skills,
346
357
  skillsGroup,
347
358
  skillsGroupName,
359
+ mcps,
360
+ subAgents,
348
361
  scenarioItems
349
362
  };
350
363
  }
351
364
 
352
365
  // src/run-scenario/index.ts
353
- var import_evalforge_types4 = require("@wix/evalforge-types");
366
+ var import_evalforge_types5 = require("@wix/evalforge-types");
354
367
  var import_eval_assertions = require("@wix/eval-assertions");
355
368
 
356
369
  // src/run-scenario/environment.ts
@@ -6340,16 +6353,61 @@ function getAdapter(runCommand) {
6340
6353
  }
6341
6354
 
6342
6355
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
6343
- var import_evalforge_types3 = require("@wix/evalforge-types");
6356
+ var import_evalforge_types4 = require("@wix/evalforge-types");
6344
6357
 
6345
6358
  // src/run-scenario/agents/claude-code/execute.ts
6346
- var import_evalforge_types2 = require("@wix/evalforge-types");
6359
+ var import_evalforge_types3 = require("@wix/evalforge-types");
6347
6360
  var import_crypto = require("crypto");
6361
+ var import_promises5 = require("fs/promises");
6362
+ var import_path7 = require("path");
6363
+
6364
+ // src/run-scenario/agents/claude-code/write-mcp.ts
6348
6365
  var import_promises3 = require("fs/promises");
6349
6366
  var import_path5 = require("path");
6367
+ var import_evalforge_types2 = require("@wix/evalforge-types");
6368
/**
 * Writes the given MCP server definitions to `<cwd>/.mcp.json`.
 *
 * The file holds a single object whose key is `MCP_SERVERS_JSON_KEY` (from
 * `@wix/evalforge-types`) and whose value maps each MCP's `name` to its
 * `config`. Nothing is written when `mcps` is empty.
 *
 * @param {string} cwd - Directory in which `.mcp.json` is created.
 * @param {Array<{name: string, config: unknown}>} mcps - MCP entities to write.
 * @returns {Promise<void>}
 */
async function writeMcpToFilesystem(cwd, mcps) {
  if (mcps.length === 0) return;
  // Object.fromEntries defines own data properties, so a server named
  // "__proto__" becomes a real key instead of silently replacing the
  // prototype (and disappearing from the serialized output). When names
  // repeat, the last entry wins, as with plain assignment.
  const mcpServers = Object.fromEntries(
    mcps.map((mcp) => [mcp.name, mcp.config])
  );
  const content = JSON.stringify(
    { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
    null,
    2
  );
  const filePath = (0, import_path5.join)(cwd, ".mcp.json");
  await (0, import_promises3.writeFile)(filePath, content, "utf8");
  console.log(`[MCP] Written to ${filePath}`);
}
6383
+
6384
+ // src/run-scenario/agents/claude-code/write-sub-agents.ts
6385
+ var import_promises4 = require("fs/promises");
6386
+ var import_path6 = require("path");
6387
+ var AGENTS_DIR = ".claude/agents";
6388
/**
 * Derives a unique, filesystem-friendly base filename for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become "-", every character
 * outside [a-z0-9-] is dropped, and leading/trailing hyphens are trimmed.
 * If nothing remains, `sub-agent-<index>` is used instead.
 *
 * Repeated names get a numeric suffix ("foo", "foo-2", "foo-3", ...). The
 * suffix keeps advancing until the result has not been issued before, so a
 * generated "foo-2" can never clash with an agent literally named "foo-2".
 *
 * @param {string} name2 - Display name of the sub-agent.
 * @param {number} index - Position of the agent, used for the fallback name.
 * @param {Map<string, number>} nameCount - Shared bookkeeping map; mutated.
 *   Holds a use count per base name and an entry for every issued filename.
 * @returns {string} Filename without extension, unique within `nameCount`.
 */
function toAgentFilename(name2, index, nameCount) {
  const base = String(name2 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
  let uses = nameCount.get(base) ?? 0;
  let candidate;
  do {
    uses += 1;
    candidate = uses === 1 ? base : `${base}-${uses}`;
    // Only suffixed candidates need checking: the bare base is free whenever
    // its use count was zero.
  } while (candidate !== base && nameCount.has(candidate));
  nameCount.set(base, uses);
  if (candidate !== base) {
    // Reserve the suffixed name so a later agent whose own base equals it
    // is pushed to a different filename.
    nameCount.set(candidate, 1);
  }
  return candidate;
}
6394
/**
 * Materializes sub-agent definitions as markdown files under
 * `<cwd>/.claude/agents`, one `<filename>.md` per agent.
 *
 * Filenames come from `toAgentFilename`, which receives a map shared across
 * the whole batch. Files are written one after another, in input order.
 * Does nothing when `subAgents` is empty.
 *
 * @param {string} cwd - Working directory that receives the agents folder.
 * @param {Array<{name: string, subAgentMd: string}>} subAgents - Agents to write.
 * @returns {Promise<void>}
 */
async function writeSubAgentsToFilesystem(cwd, subAgents) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = (0, import_path6.join)(cwd, AGENTS_DIR);
  await (0, import_promises4.mkdir)(targetDir, { recursive: true });
  const usedNames = /* @__PURE__ */ new Map();
  for (let position = 0; position < subAgents.length; position += 1) {
    const subAgent = subAgents[position];
    const stem = toAgentFilename(subAgent.name, position, usedNames);
    const destination = (0, import_path6.join)(targetDir, `${stem}.md`);
    await (0, import_promises4.writeFile)(destination, subAgent.subAgentMd, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
6406
+
6407
+ // src/run-scenario/agents/claude-code/execute.ts
6350
6408
  var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
6351
6409
  function calculateStepCost(inputTokens, outputTokens, modelName) {
6352
- const model = import_evalforge_types2.AVAILABLE_MODELS.find(
6410
+ const model = import_evalforge_types3.AVAILABLE_MODELS.find(
6353
6411
  (m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
6354
6412
  modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
6355
6413
  );
@@ -6363,7 +6421,7 @@ function calculateStepCost(inputTokens, outputTokens, modelName) {
6363
6421
  return inputCost + outputCost;
6364
6422
  }
6365
6423
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
6366
- console.log(`${import_evalforge_types2.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
6424
+ console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
6367
6425
  if (tracePushUrl) {
6368
6426
  pushTraceEvent(tracePushUrl, event, routeHeader, authToken).catch((err) => {
6369
6427
  console.error("[Trace Push] Failed to push trace event:", err);
@@ -6440,23 +6498,23 @@ async function pushTraceEvent(url, event, routeHeader, authToken) {
6440
6498
  }
6441
6499
  }
6442
6500
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6443
- let type = import_evalforge_types2.LiveTraceEventType.COMPLETION;
6501
+ let type = import_evalforge_types3.LiveTraceEventType.COMPLETION;
6444
6502
  let toolName;
6445
6503
  let toolArgs;
6446
6504
  let outputPreview;
6447
6505
  let filePath;
6448
6506
  for (const block of message.message.content) {
6449
6507
  if (block.type === "tool_use") {
6450
- type = import_evalforge_types2.LiveTraceEventType.TOOL_USE;
6508
+ type = import_evalforge_types3.LiveTraceEventType.TOOL_USE;
6451
6509
  toolName = block.name;
6452
6510
  toolArgs = JSON.stringify(block.input).slice(0, 500);
6453
6511
  const input = block.input;
6454
6512
  if (input.file_path || input.path || input.target_file) {
6455
6513
  filePath = String(input.file_path || input.path || input.target_file);
6456
6514
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
6457
- type = import_evalforge_types2.LiveTraceEventType.FILE_WRITE;
6515
+ type = import_evalforge_types3.LiveTraceEventType.FILE_WRITE;
6458
6516
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
6459
- type = import_evalforge_types2.LiveTraceEventType.FILE_READ;
6517
+ type = import_evalforge_types3.LiveTraceEventType.FILE_READ;
6460
6518
  }
6461
6519
  }
6462
6520
  } else if (block.type === "text") {
@@ -6514,7 +6572,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6514
6572
  }
6515
6573
  return {
6516
6574
  ...baseEvent,
6517
- type: import_evalforge_types2.LiveTraceEventType.USER,
6575
+ type: import_evalforge_types3.LiveTraceEventType.USER,
6518
6576
  outputPreview: outputPreview || "(tool result)"
6519
6577
  };
6520
6578
  }
@@ -6522,7 +6580,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6522
6580
  const sysMsg = message;
6523
6581
  return {
6524
6582
  ...baseEvent,
6525
- type: import_evalforge_types2.LiveTraceEventType.SYSTEM,
6583
+ type: import_evalforge_types3.LiveTraceEventType.SYSTEM,
6526
6584
  outputPreview: sysMsg.subtype || "system"
6527
6585
  };
6528
6586
  }
@@ -6531,7 +6589,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6531
6589
  }
6532
6590
  return {
6533
6591
  ...baseEvent,
6534
- type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
6592
+ type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
6535
6593
  outputPreview: `Message type: ${message.type}`
6536
6594
  };
6537
6595
  }
@@ -6574,6 +6632,12 @@ async function executeWithClaudeCode(skills, scenario, options) {
6574
6632
  }
6575
6633
  const startTime = /* @__PURE__ */ new Date();
6576
6634
  const allMessages = [];
6635
+ if (options.mcps && options.mcps.length > 0) {
6636
+ await writeMcpToFilesystem(options.cwd, options.mcps);
6637
+ }
6638
+ if (options.subAgents && options.subAgents.length > 0) {
6639
+ await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
6640
+ }
6577
6641
  console.error(
6578
6642
  "[DEBUG-H4] writeSkillsToFilesystem START",
6579
6643
  JSON.stringify({
@@ -6664,15 +6728,24 @@ async function executeWithClaudeCode(skills, scenario, options) {
6664
6728
  const canUseTool = async () => {
6665
6729
  return { behavior: "allow" };
6666
6730
  };
6731
+ const baseAllowedTools = [
6732
+ "Skill",
6733
+ "Read",
6734
+ "Write",
6735
+ "Edit",
6736
+ "Bash",
6737
+ "Glob",
6738
+ "Grep"
6739
+ ];
6740
+ const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
6667
6741
  const queryOptions = {
6668
6742
  env: sdkEnv,
6669
6743
  cwd: options.cwd,
6670
6744
  settingSources: ["project"],
6671
- allowedTools: ["Skill", "Read", "Write", "Edit", "Bash", "Glob", "Grep"],
6745
+ allowedTools,
6672
6746
  model: options.model || DEFAULT_MODEL,
6673
6747
  maxTurns,
6674
6748
  maxThinkingTokens: options.maxThinkingTokens,
6675
- mcpServers: options.mcpServers,
6676
6749
  // Use 'default' permission mode with custom canUseTool handler
6677
6750
  // instead of 'bypassPermissions' which fails on root
6678
6751
  permissionMode: "default",
@@ -6700,10 +6773,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
6700
6773
  );
6701
6774
  console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
6702
6775
  console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
6703
- console.log(
6704
- "[SDK-DEBUG] mcpServers:",
6705
- queryOptions.mcpServers ? Object.keys(queryOptions.mcpServers) : "none"
6706
- );
6707
6776
  console.log("[SDK-DEBUG] Calling SDK query()...");
6708
6777
  if (traceContext) {
6709
6778
  const preExecEvent = {
@@ -6713,7 +6782,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
6713
6782
  targetId: traceContext.targetId,
6714
6783
  targetName: traceContext.targetName,
6715
6784
  stepNumber: 0,
6716
- type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
6785
+ type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
6717
6786
  outputPreview: JSON.stringify({
6718
6787
  event: "pre-sdk-execution",
6719
6788
  model: queryOptions.model,
@@ -6782,7 +6851,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
6782
6851
  targetId: traceContext.targetId,
6783
6852
  targetName: traceContext.targetName,
6784
6853
  stepNumber: traceStepNumber,
6785
- type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
6854
+ type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
6786
6855
  outputPreview: progressMessage,
6787
6856
  toolName: lastToolName,
6788
6857
  filePath: lastFilePath,
@@ -6839,18 +6908,18 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
6839
6908
  if (traceEvent) {
6840
6909
  lastToolName = traceEvent.toolName;
6841
6910
  lastFilePath = traceEvent.filePath;
6842
- if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.THINKING) {
6911
+ if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.THINKING) {
6843
6912
  lastAction = "Thinking...";
6844
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.TOOL_USE) {
6913
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.TOOL_USE) {
6845
6914
  lastAction = extractToolActionDescription(
6846
6915
  traceEvent.toolName,
6847
6916
  traceEvent.toolArgs
6848
6917
  );
6849
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_WRITE) {
6918
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_WRITE) {
6850
6919
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
6851
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_READ) {
6920
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_READ) {
6852
6921
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
6853
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.COMPLETION) {
6922
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.COMPLETION) {
6854
6923
  lastAction = "Processing response...";
6855
6924
  }
6856
6925
  emitTraceEvent(
@@ -7033,7 +7102,7 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
7033
7102
  targetId: traceContext.targetId,
7034
7103
  targetName: traceContext.targetName,
7035
7104
  stepNumber: traceStepNumber + 1,
7036
- type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
7105
+ type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
7037
7106
  outputPreview: JSON.stringify(
7038
7107
  {
7039
7108
  event: "sdk-execution-failed",
@@ -7072,7 +7141,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7072
7141
  targetId: traceContext.targetId,
7073
7142
  targetName: traceContext.targetName,
7074
7143
  stepNumber: traceStepNumber + 1,
7075
- type: import_evalforge_types2.LiveTraceEventType.COMPLETION,
7144
+ type: import_evalforge_types3.LiveTraceEventType.COMPLETION,
7076
7145
  outputPreview: "Scenario execution completed",
7077
7146
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7078
7147
  isComplete: true
@@ -7114,10 +7183,10 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7114
7183
  async function writeSkillsToFilesystem(cwd, skills) {
7115
7184
  for (const skill of skills) {
7116
7185
  const skillName = skill.name;
7117
- const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
7118
- await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7119
- const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
7120
- await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
7186
+ const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
7187
+ await (0, import_promises5.mkdir)(skillDir, { recursive: true });
7188
+ const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
7189
+ await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
7121
7190
  console.log(`[Skill] Written to ${skillPath}`);
7122
7191
  }
7123
7192
  }
@@ -7250,7 +7319,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
7250
7319
  return {
7251
7320
  id: (0, import_crypto.randomUUID)(),
7252
7321
  stepNumber: index + 1,
7253
- type: step.toolCalls?.length ? import_evalforge_types2.LLMStepType.TOOL_USE : import_evalforge_types2.LLMStepType.COMPLETION,
7322
+ type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
7254
7323
  model,
7255
7324
  provider: "anthropic",
7256
7325
  startedAt: step.startedAt.toISOString(),
@@ -7321,9 +7390,11 @@ var ClaudeCodeAdapter = class {
7321
7390
  modelConfig,
7322
7391
  aiGatewayUrl,
7323
7392
  aiGatewayHeaders,
7324
- traceContext
7393
+ traceContext,
7394
+ mcps,
7395
+ subAgents
7325
7396
  } = context;
7326
- const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7397
+ const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7327
7398
  const options = {
7328
7399
  cwd,
7329
7400
  model: modelForSdk,
@@ -7331,7 +7402,9 @@ var ClaudeCodeAdapter = class {
7331
7402
  maxTokens: modelConfig?.maxTokens,
7332
7403
  aiGatewayUrl,
7333
7404
  aiGatewayHeaders,
7334
- traceContext
7405
+ traceContext,
7406
+ mcps,
7407
+ subAgents
7335
7408
  };
7336
7409
  const { result, llmTrace } = await executeWithClaudeCode(
7337
7410
  skills,
@@ -7358,7 +7431,7 @@ defaultRegistry.register(claudeCodeAdapter);
7358
7431
 
7359
7432
  // src/run-scenario/file-diff.ts
7360
7433
  var import_fs6 = require("fs");
7361
- var import_path6 = require("path");
7434
+ var import_path8 = require("path");
7362
7435
 
7363
7436
  // ../../node_modules/diff/lib/index.mjs
7364
7437
  function Diff() {
@@ -7534,7 +7607,7 @@ Diff.prototype = {
7534
7607
  tokenize: function tokenize(value) {
7535
7608
  return Array.from(value);
7536
7609
  },
7537
- join: function join3(chars) {
7610
+ join: function join5(chars) {
7538
7611
  return chars.join("");
7539
7612
  },
7540
7613
  postProcess: function postProcess(changeObjects) {
@@ -7974,8 +8047,8 @@ function snapshotDirectory(dir, baseDir) {
7974
8047
  }
7975
8048
  const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
7976
8049
  for (const entry of entries) {
7977
- const fullPath = (0, import_path6.join)(dir, entry.name);
7978
- const relativePath = (0, import_path6.relative)(base, fullPath);
8050
+ const fullPath = (0, import_path8.join)(dir, entry.name);
8051
+ const relativePath = (0, import_path8.relative)(base, fullPath);
7979
8052
  if (shouldIgnore(entry.name)) {
7980
8053
  continue;
7981
8054
  }
@@ -8084,13 +8157,18 @@ function extractTemplateFiles(before, after) {
8084
8157
 
8085
8158
  // src/run-scenario/run-agent-with-context.ts
8086
8159
  var DEFAULT_AGENT_COMMAND = "claude";
8087
- async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
8160
+ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
8161
+ const skillsGroupId = evalData.evalRun.skillsGroupId;
8162
+ if (!skillsGroupId) {
8163
+ throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
8164
+ }
8165
+ const agent = evalData.codeAgent ?? void 0;
8088
8166
  const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
8089
8167
  const adapter = getAdapter(runCommand);
8090
8168
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
8091
8169
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
8092
8170
  const executionContext = {
8093
- skills,
8171
+ skills: evalData.skills,
8094
8172
  scenario,
8095
8173
  cwd: workDir || process.cwd(),
8096
8174
  modelConfig: agent?.modelConfig,
@@ -8101,11 +8179,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
8101
8179
  scenarioId: scenario.id,
8102
8180
  scenarioName: scenario.name,
8103
8181
  targetId: skillsGroupId,
8104
- targetName: skillsGroupName,
8182
+ targetName: evalData.skillsGroupName,
8105
8183
  tracePushUrl: config.tracePushUrl,
8106
8184
  routeHeader: config.routeHeader,
8107
8185
  authToken: config.authToken
8108
- }
8186
+ },
8187
+ mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
8188
+ subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
8109
8189
  };
8110
8190
  const result = await adapter.execute(executionContext);
8111
8191
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -8115,7 +8195,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
8115
8195
  return {
8116
8196
  id: (0, import_crypto2.randomUUID)(),
8117
8197
  targetId: skillsGroupId,
8118
- targetName: skillsGroupName,
8198
+ targetName: evalData.skillsGroupName,
8119
8199
  scenarioId: scenario.id,
8120
8200
  scenarioName: scenario.name,
8121
8201
  modelConfig: agent?.modelConfig,
@@ -8143,10 +8223,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8143
8223
  config,
8144
8224
  evalRunId2,
8145
8225
  scenario,
8146
- evalData.skills,
8147
- skillsGroupId,
8148
- evalData.skillsGroupName,
8149
- evalData.codeAgent ?? void 0,
8226
+ evalData,
8150
8227
  workDir
8151
8228
  );
8152
8229
  const inlineAssertions = scenario.assertions ?? [];
@@ -8178,10 +8255,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8178
8255
  assertionContext
8179
8256
  ) : [];
8180
8257
  const passed = assertionResults.filter(
8181
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
8258
+ (r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
8182
8259
  ).length;
8183
8260
  const failed = assertionResults.filter(
8184
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
8261
+ (r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
8185
8262
  ).length;
8186
8263
  const total = assertionResults.length;
8187
8264
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -8195,7 +8272,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8195
8272
  }
8196
8273
 
8197
8274
  // src/error-reporter.ts
8198
- var import_evalforge_types5 = require("@wix/evalforge-types");
8275
+ var import_evalforge_types6 = require("@wix/evalforge-types");
8199
8276
  function formatError(error, phase, context) {
8200
8277
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
8201
8278
  if (error instanceof Error) {
@@ -8444,7 +8521,7 @@ async function runEvaluation(projectId2, evalRunId2) {
8444
8521
  };
8445
8522
  try {
8446
8523
  await api.updateEvalRun(projectId2, evalRunId2, {
8447
- status: import_evalforge_types6.EvalStatus.COMPLETED,
8524
+ status: import_evalforge_types7.EvalStatus.COMPLETED,
8448
8525
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
8449
8526
  });
8450
8527
  } catch (updateErr) {
@@ -8485,7 +8562,7 @@ runEvaluation(projectId, evalRunId).then(() => {
8485
8562
  authToken: config.authToken
8486
8563
  });
8487
8564
  await api.updateEvalRun(projectId, evalRunId, {
8488
- status: import_evalforge_types6.EvalStatus.FAILED,
8565
+ status: import_evalforge_types7.EvalStatus.FAILED,
8489
8566
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8490
8567
  jobError,
8491
8568
  jobStatus: "FAILED"
@@ -8508,7 +8585,7 @@ runEvaluation(projectId, evalRunId).then(() => {
8508
8585
  authToken
8509
8586
  });
8510
8587
  await api.updateEvalRun(projectId, evalRunId, {
8511
- status: import_evalforge_types6.EvalStatus.FAILED,
8588
+ status: import_evalforge_types7.EvalStatus.FAILED,
8512
8589
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8513
8590
  jobError: `Config load failed, then: ${jobError}`,
8514
8591
  jobStatus: "FAILED"