@wix/evalforge-evaluator 0.57.0 → 0.58.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types6 = require("@wix/evalforge-types");
27
+ var import_evalforge_types7 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -166,6 +166,12 @@ function createApiClient(serverUrl, options = "") {
166
166
  getTemplate(projectId2, id) {
167
167
  return fetchJson(`/projects/${projectId2}/templates/${id}`);
168
168
  },
169
+ getMcp(projectId2, id) {
170
+ return fetchJson(`/projects/${projectId2}/mcps/${id}`);
171
+ },
172
+ getSubAgent(projectId2, id) {
173
+ return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
174
+ },
169
175
  getAssertion(projectId2, id) {
170
176
  return fetchJson(`/projects/${projectId2}/assertions/${id}`);
171
177
  },
@@ -296,6 +302,18 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
296
302
  );
297
303
  }
298
304
  }
305
+ let mcps = [];
306
+ if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
307
+ mcps = await Promise.all(
308
+ evalRun.mcpIds.map((id) => api.getMcp(projectId2, id))
309
+ );
310
+ }
311
+ let subAgents = [];
312
+ if (evalRun.subAgentIds && evalRun.subAgentIds.length > 0) {
313
+ subAgents = await Promise.all(
314
+ evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
315
+ );
316
+ }
299
317
  const templateIds = [
300
318
  ...new Set(
301
319
  scenarios.map((s) => s.templateId).filter((id) => !!id)
@@ -345,12 +363,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
345
363
  skills,
346
364
  skillsGroup,
347
365
  skillsGroupName,
366
+ mcps,
367
+ subAgents,
348
368
  scenarioItems
349
369
  };
350
370
  }
351
371
 
352
372
  // src/run-scenario/index.ts
353
- var import_evalforge_types4 = require("@wix/evalforge-types");
373
+ var import_evalforge_types5 = require("@wix/evalforge-types");
354
374
  var import_eval_assertions = require("@wix/eval-assertions");
355
375
 
356
376
  // src/run-scenario/environment.ts
@@ -6340,16 +6360,61 @@ function getAdapter(runCommand) {
6340
6360
  }
6341
6361
 
6342
6362
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
6343
- var import_evalforge_types3 = require("@wix/evalforge-types");
6363
+ var import_evalforge_types4 = require("@wix/evalforge-types");
6344
6364
 
6345
6365
  // src/run-scenario/agents/claude-code/execute.ts
6346
- var import_evalforge_types2 = require("@wix/evalforge-types");
6366
+ var import_evalforge_types3 = require("@wix/evalforge-types");
6347
6367
  var import_crypto = require("crypto");
6368
+ var import_promises5 = require("fs/promises");
6369
+ var import_path7 = require("path");
6370
+
6371
+ // src/run-scenario/agents/claude-code/write-mcp.ts
6348
6372
  var import_promises3 = require("fs/promises");
6349
6373
  var import_path5 = require("path");
6374
+ var import_evalforge_types2 = require("@wix/evalforge-types");
6375
/**
 * Writes the MCP server definitions of an eval run to `<cwd>/.mcp.json`.
 *
 * The file content is a single JSON object whose only key is
 * `MCP_SERVERS_JSON_KEY` (from `@wix/evalforge-types`) and whose value maps
 * each `mcp.name` to its `mcp.config`, pretty-printed with 2-space indentation.
 * When two entries share a name, the later one wins.
 *
 * @param {string} cwd - Directory in which `.mcp.json` is created.
 * @param {Array<{name: string, config: unknown}>} mcps - MCP entities to write.
 * @returns {Promise<void>} Resolves once the file is written; resolves
 *   immediately without touching the filesystem when `mcps` is empty.
 */
async function writeMcpToFilesystem(cwd, mcps) {
  if (mcps.length === 0) return;
  // Object.fromEntries defines own properties, so a server that happens to be
  // named "__proto__" is serialized instead of silently replacing the
  // prototype of the map (which plain `obj[name] = value` would do).
  const mcpServers = Object.fromEntries(
    mcps.map((mcp) => [mcp.name, mcp.config])
  );
  const content = JSON.stringify(
    { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
    null,
    2
  );
  const filePath = (0, import_path5.join)(cwd, ".mcp.json");
  await (0, import_promises3.writeFile)(filePath, content, "utf8");
  console.log(`[MCP] Written to ${filePath}`);
}
6390
+
6391
+ // src/run-scenario/agents/claude-code/write-sub-agents.ts
6392
+ var import_promises4 = require("fs/promises");
6393
+ var import_path6 = require("path");
6394
+ var AGENTS_DIR = ".claude/agents";
6395
/**
 * Derives a unique, filesystem-safe file name (without extension) for a sub-agent.
 *
 * The display name is lower-cased, whitespace runs become "-", every character
 * outside `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed. When
 * nothing is left, `sub-agent-<index>` is used as the base instead.
 *
 * @param {string} name2 - Display name of the sub-agent (may be empty or nullish).
 * @param {number} index - Position of the agent in its list; only used for the fallback base.
 * @param {Map<string, number>} nameCount - Registry shared across calls and mutated here.
 *   It maps each base slug to how many names were derived from it and also
 *   records every name that has been handed out.
 * @returns {string} The base slug on first use, otherwise `<base>-<n>` with n >= 2.
 */
function toAgentFilename(name2, index, nameCount) {
  const slug = (name2 || "")
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "")
    .replace(/^-+|-+$/g, "");
  const base = slug || `sub-agent-${index}`;
  let count = nameCount.get(base) ?? 0;
  let candidate = count === 0 ? base : `${base}-${count + 1}`;
  // A suffixed name can already belong to an agent whose own slug is, e.g.,
  // "foo-2"; keep bumping the suffix until the name is free so that no two
  // agents end up writing to the same file.
  while (count > 0 && nameCount.has(candidate)) {
    count += 1;
    candidate = `${base}-${count + 1}`;
  }
  nameCount.set(base, count + 1);
  if (candidate !== base) {
    // Reserve the suffixed name too, so a later agent with this exact slug
    // gets its own suffix instead of reusing it.
    nameCount.set(candidate, 1);
  }
  return candidate;
}
6401
/**
 * Materialises every sub-agent as a Markdown file inside `<cwd>/.claude/agents`.
 *
 * The directory is created recursively, then the agents are written one after
 * another in list order. Each file is named `<filename>.md`, where the
 * filename comes from `toAgentFilename` using a name registry that is local
 * to this call, and the file body is the agent's `subAgentMd`.
 *
 * @param {string} cwd - Working directory that receives the `.claude/agents` folder.
 * @param {Array<{name: string, subAgentMd: string}>} subAgents - Sub-agents to write.
 * @returns {Promise<void>} Resolves after all files are written; does nothing
 *   when `subAgents` is empty.
 */
async function writeSubAgentsToFilesystem(cwd, subAgents) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = (0, import_path6.join)(cwd, AGENTS_DIR);
  await (0, import_promises4.mkdir)(targetDir, { recursive: true });
  const usedNames = /* @__PURE__ */ new Map();
  let position = 0;
  for (const subAgent of subAgents) {
    const stem = toAgentFilename(subAgent.name, position, usedNames);
    position += 1;
    const destination = (0, import_path6.join)(targetDir, `${stem}.md`);
    await (0, import_promises4.writeFile)(destination, subAgent.subAgentMd, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
6413
+
6414
+ // src/run-scenario/agents/claude-code/execute.ts
6350
6415
  var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
6351
6416
  function calculateStepCost(inputTokens, outputTokens, modelName) {
6352
- const model = import_evalforge_types2.AVAILABLE_MODELS.find(
6417
+ const model = import_evalforge_types3.AVAILABLE_MODELS.find(
6353
6418
  (m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
6354
6419
  modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
6355
6420
  );
@@ -6363,7 +6428,7 @@ function calculateStepCost(inputTokens, outputTokens, modelName) {
6363
6428
  return inputCost + outputCost;
6364
6429
  }
6365
6430
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
6366
- console.log(`${import_evalforge_types2.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
6431
+ console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
6367
6432
  if (tracePushUrl) {
6368
6433
  pushTraceEvent(tracePushUrl, event, routeHeader, authToken).catch((err) => {
6369
6434
  console.error("[Trace Push] Failed to push trace event:", err);
@@ -6440,23 +6505,23 @@ async function pushTraceEvent(url, event, routeHeader, authToken) {
6440
6505
  }
6441
6506
  }
6442
6507
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6443
- let type = import_evalforge_types2.LiveTraceEventType.COMPLETION;
6508
+ let type = import_evalforge_types3.LiveTraceEventType.COMPLETION;
6444
6509
  let toolName;
6445
6510
  let toolArgs;
6446
6511
  let outputPreview;
6447
6512
  let filePath;
6448
6513
  for (const block of message.message.content) {
6449
6514
  if (block.type === "tool_use") {
6450
- type = import_evalforge_types2.LiveTraceEventType.TOOL_USE;
6515
+ type = import_evalforge_types3.LiveTraceEventType.TOOL_USE;
6451
6516
  toolName = block.name;
6452
6517
  toolArgs = JSON.stringify(block.input).slice(0, 500);
6453
6518
  const input = block.input;
6454
6519
  if (input.file_path || input.path || input.target_file) {
6455
6520
  filePath = String(input.file_path || input.path || input.target_file);
6456
6521
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
6457
- type = import_evalforge_types2.LiveTraceEventType.FILE_WRITE;
6522
+ type = import_evalforge_types3.LiveTraceEventType.FILE_WRITE;
6458
6523
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
6459
- type = import_evalforge_types2.LiveTraceEventType.FILE_READ;
6524
+ type = import_evalforge_types3.LiveTraceEventType.FILE_READ;
6460
6525
  }
6461
6526
  }
6462
6527
  } else if (block.type === "text") {
@@ -6514,7 +6579,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6514
6579
  }
6515
6580
  return {
6516
6581
  ...baseEvent,
6517
- type: import_evalforge_types2.LiveTraceEventType.USER,
6582
+ type: import_evalforge_types3.LiveTraceEventType.USER,
6518
6583
  outputPreview: outputPreview || "(tool result)"
6519
6584
  };
6520
6585
  }
@@ -6522,7 +6587,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6522
6587
  const sysMsg = message;
6523
6588
  return {
6524
6589
  ...baseEvent,
6525
- type: import_evalforge_types2.LiveTraceEventType.SYSTEM,
6590
+ type: import_evalforge_types3.LiveTraceEventType.SYSTEM,
6526
6591
  outputPreview: sysMsg.subtype || "system"
6527
6592
  };
6528
6593
  }
@@ -6531,7 +6596,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6531
6596
  }
6532
6597
  return {
6533
6598
  ...baseEvent,
6534
- type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
6599
+ type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
6535
6600
  outputPreview: `Message type: ${message.type}`
6536
6601
  };
6537
6602
  }
@@ -6574,6 +6639,12 @@ async function executeWithClaudeCode(skills, scenario, options) {
6574
6639
  }
6575
6640
  const startTime = /* @__PURE__ */ new Date();
6576
6641
  const allMessages = [];
6642
+ if (options.mcps && options.mcps.length > 0) {
6643
+ await writeMcpToFilesystem(options.cwd, options.mcps);
6644
+ }
6645
+ if (options.subAgents && options.subAgents.length > 0) {
6646
+ await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
6647
+ }
6577
6648
  console.error(
6578
6649
  "[DEBUG-H4] writeSkillsToFilesystem START",
6579
6650
  JSON.stringify({
@@ -6664,15 +6735,24 @@ async function executeWithClaudeCode(skills, scenario, options) {
6664
6735
  const canUseTool = async () => {
6665
6736
  return { behavior: "allow" };
6666
6737
  };
6738
+ const baseAllowedTools = [
6739
+ "Skill",
6740
+ "Read",
6741
+ "Write",
6742
+ "Edit",
6743
+ "Bash",
6744
+ "Glob",
6745
+ "Grep"
6746
+ ];
6747
+ const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
6667
6748
  const queryOptions = {
6668
6749
  env: sdkEnv,
6669
6750
  cwd: options.cwd,
6670
6751
  settingSources: ["project"],
6671
- allowedTools: ["Skill", "Read", "Write", "Edit", "Bash", "Glob", "Grep"],
6752
+ allowedTools,
6672
6753
  model: options.model || DEFAULT_MODEL,
6673
6754
  maxTurns,
6674
6755
  maxThinkingTokens: options.maxThinkingTokens,
6675
- mcpServers: options.mcpServers,
6676
6756
  // Use 'default' permission mode with custom canUseTool handler
6677
6757
  // instead of 'bypassPermissions' which fails on root
6678
6758
  permissionMode: "default",
@@ -6700,10 +6780,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
6700
6780
  );
6701
6781
  console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
6702
6782
  console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
6703
- console.log(
6704
- "[SDK-DEBUG] mcpServers:",
6705
- queryOptions.mcpServers ? Object.keys(queryOptions.mcpServers) : "none"
6706
- );
6707
6783
  console.log("[SDK-DEBUG] Calling SDK query()...");
6708
6784
  if (traceContext) {
6709
6785
  const preExecEvent = {
@@ -6713,7 +6789,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
6713
6789
  targetId: traceContext.targetId,
6714
6790
  targetName: traceContext.targetName,
6715
6791
  stepNumber: 0,
6716
- type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
6792
+ type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
6717
6793
  outputPreview: JSON.stringify({
6718
6794
  event: "pre-sdk-execution",
6719
6795
  model: queryOptions.model,
@@ -6782,7 +6858,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
6782
6858
  targetId: traceContext.targetId,
6783
6859
  targetName: traceContext.targetName,
6784
6860
  stepNumber: traceStepNumber,
6785
- type: import_evalforge_types2.LiveTraceEventType.PROGRESS,
6861
+ type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
6786
6862
  outputPreview: progressMessage,
6787
6863
  toolName: lastToolName,
6788
6864
  filePath: lastFilePath,
@@ -6839,18 +6915,18 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
6839
6915
  if (traceEvent) {
6840
6916
  lastToolName = traceEvent.toolName;
6841
6917
  lastFilePath = traceEvent.filePath;
6842
- if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.THINKING) {
6918
+ if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.THINKING) {
6843
6919
  lastAction = "Thinking...";
6844
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.TOOL_USE) {
6920
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.TOOL_USE) {
6845
6921
  lastAction = extractToolActionDescription(
6846
6922
  traceEvent.toolName,
6847
6923
  traceEvent.toolArgs
6848
6924
  );
6849
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_WRITE) {
6925
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_WRITE) {
6850
6926
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
6851
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.FILE_READ) {
6927
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_READ) {
6852
6928
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
6853
- } else if (traceEvent.type === import_evalforge_types2.LiveTraceEventType.COMPLETION) {
6929
+ } else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.COMPLETION) {
6854
6930
  lastAction = "Processing response...";
6855
6931
  }
6856
6932
  emitTraceEvent(
@@ -7033,7 +7109,7 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
7033
7109
  targetId: traceContext.targetId,
7034
7110
  targetName: traceContext.targetName,
7035
7111
  stepNumber: traceStepNumber + 1,
7036
- type: import_evalforge_types2.LiveTraceEventType.DIAGNOSTIC,
7112
+ type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
7037
7113
  outputPreview: JSON.stringify(
7038
7114
  {
7039
7115
  event: "sdk-execution-failed",
@@ -7072,7 +7148,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7072
7148
  targetId: traceContext.targetId,
7073
7149
  targetName: traceContext.targetName,
7074
7150
  stepNumber: traceStepNumber + 1,
7075
- type: import_evalforge_types2.LiveTraceEventType.COMPLETION,
7151
+ type: import_evalforge_types3.LiveTraceEventType.COMPLETION,
7076
7152
  outputPreview: "Scenario execution completed",
7077
7153
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7078
7154
  isComplete: true
@@ -7114,10 +7190,10 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7114
7190
  // Writes every skill to `<cwd>/.claude/skills/<skill.name>/SKILL.md`:
  // the skill directory is created recursively, `skill.skillMd` is written as
  // UTF-8, and the resulting path is logged. Skills are processed sequentially.
  // In this hunk the removed (-) and added (+) statements differ only in the
  // bundler alias used (import_path5/import_promises3 -> import_path7/import_promises5).
  async function writeSkillsToFilesystem(cwd, skills) {
7115
7191
  for (const skill of skills) {
7116
7192
  const skillName = skill.name;
7117
- const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
7118
- await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7119
- const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
7120
- await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
7193
+ const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
7194
+ await (0, import_promises5.mkdir)(skillDir, { recursive: true });
7195
+ const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
7196
+ await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
7121
7197
  console.log(`[Skill] Written to ${skillPath}`);
7122
7198
  }
7123
7199
  }
@@ -7250,7 +7326,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
7250
7326
  return {
7251
7327
  id: (0, import_crypto.randomUUID)(),
7252
7328
  stepNumber: index + 1,
7253
- type: step.toolCalls?.length ? import_evalforge_types2.LLMStepType.TOOL_USE : import_evalforge_types2.LLMStepType.COMPLETION,
7329
+ type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
7254
7330
  model,
7255
7331
  provider: "anthropic",
7256
7332
  startedAt: step.startedAt.toISOString(),
@@ -7321,9 +7397,11 @@ var ClaudeCodeAdapter = class {
7321
7397
  modelConfig,
7322
7398
  aiGatewayUrl,
7323
7399
  aiGatewayHeaders,
7324
- traceContext
7400
+ traceContext,
7401
+ mcps,
7402
+ subAgents
7325
7403
  } = context;
7326
- const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7404
+ const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7327
7405
  const options = {
7328
7406
  cwd,
7329
7407
  model: modelForSdk,
@@ -7331,7 +7409,9 @@ var ClaudeCodeAdapter = class {
7331
7409
  maxTokens: modelConfig?.maxTokens,
7332
7410
  aiGatewayUrl,
7333
7411
  aiGatewayHeaders,
7334
- traceContext
7412
+ traceContext,
7413
+ mcps,
7414
+ subAgents
7335
7415
  };
7336
7416
  const { result, llmTrace } = await executeWithClaudeCode(
7337
7417
  skills,
@@ -7358,7 +7438,7 @@ defaultRegistry.register(claudeCodeAdapter);
7358
7438
 
7359
7439
  // src/run-scenario/file-diff.ts
7360
7440
  var import_fs6 = require("fs");
7361
- var import_path6 = require("path");
7441
+ var import_path8 = require("path");
7362
7442
 
7363
7443
  // ../../node_modules/diff/lib/index.mjs
7364
7444
  function Diff() {
@@ -7534,7 +7614,7 @@ Diff.prototype = {
7534
7614
  tokenize: function tokenize(value) {
7535
7615
  return Array.from(value);
7536
7616
  },
7537
- join: function join3(chars) {
7617
+ join: function join5(chars) {
7538
7618
  return chars.join("");
7539
7619
  },
7540
7620
  postProcess: function postProcess(changeObjects) {
@@ -7974,8 +8054,8 @@ function snapshotDirectory(dir, baseDir) {
7974
8054
  }
7975
8055
  const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
7976
8056
  for (const entry of entries) {
7977
- const fullPath = (0, import_path6.join)(dir, entry.name);
7978
- const relativePath = (0, import_path6.relative)(base, fullPath);
8057
+ const fullPath = (0, import_path8.join)(dir, entry.name);
8058
+ const relativePath = (0, import_path8.relative)(base, fullPath);
7979
8059
  if (shouldIgnore(entry.name)) {
7980
8060
  continue;
7981
8061
  }
@@ -8084,13 +8164,18 @@ function extractTemplateFiles(before, after) {
8084
8164
 
8085
8165
  // src/run-scenario/run-agent-with-context.ts
8086
8166
  var DEFAULT_AGENT_COMMAND = "claude";
8087
- async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
8167
+ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
8168
+ const skillsGroupId = evalData.evalRun.skillsGroupId;
8169
+ if (!skillsGroupId) {
8170
+ throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
8171
+ }
8172
+ const agent = evalData.codeAgent ?? void 0;
8088
8173
  const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
8089
8174
  const adapter = getAdapter(runCommand);
8090
8175
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
8091
8176
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
8092
8177
  const executionContext = {
8093
- skills,
8178
+ skills: evalData.skills,
8094
8179
  scenario,
8095
8180
  cwd: workDir || process.cwd(),
8096
8181
  modelConfig: agent?.modelConfig,
@@ -8101,11 +8186,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
8101
8186
  scenarioId: scenario.id,
8102
8187
  scenarioName: scenario.name,
8103
8188
  targetId: skillsGroupId,
8104
- targetName: skillsGroupName,
8189
+ targetName: evalData.skillsGroupName,
8105
8190
  tracePushUrl: config.tracePushUrl,
8106
8191
  routeHeader: config.routeHeader,
8107
8192
  authToken: config.authToken
8108
- }
8193
+ },
8194
+ mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
8195
+ subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
8109
8196
  };
8110
8197
  const result = await adapter.execute(executionContext);
8111
8198
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -8115,7 +8202,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
8115
8202
  return {
8116
8203
  id: (0, import_crypto2.randomUUID)(),
8117
8204
  targetId: skillsGroupId,
8118
- targetName: skillsGroupName,
8205
+ targetName: evalData.skillsGroupName,
8119
8206
  scenarioId: scenario.id,
8120
8207
  scenarioName: scenario.name,
8121
8208
  modelConfig: agent?.modelConfig,
@@ -8143,10 +8230,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8143
8230
  config,
8144
8231
  evalRunId2,
8145
8232
  scenario,
8146
- evalData.skills,
8147
- skillsGroupId,
8148
- evalData.skillsGroupName,
8149
- evalData.codeAgent ?? void 0,
8233
+ evalData,
8150
8234
  workDir
8151
8235
  );
8152
8236
  const inlineAssertions = scenario.assertions ?? [];
@@ -8178,10 +8262,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8178
8262
  assertionContext
8179
8263
  ) : [];
8180
8264
  const passed = assertionResults.filter(
8181
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
8265
+ (r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
8182
8266
  ).length;
8183
8267
  const failed = assertionResults.filter(
8184
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
8268
+ (r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
8185
8269
  ).length;
8186
8270
  const total = assertionResults.length;
8187
8271
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -8195,7 +8279,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
8195
8279
  }
8196
8280
 
8197
8281
  // src/error-reporter.ts
8198
- var import_evalforge_types5 = require("@wix/evalforge-types");
8282
+ var import_evalforge_types6 = require("@wix/evalforge-types");
8199
8283
  function formatError(error, phase, context) {
8200
8284
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
8201
8285
  if (error instanceof Error) {
@@ -8444,7 +8528,7 @@ async function runEvaluation(projectId2, evalRunId2) {
8444
8528
  };
8445
8529
  try {
8446
8530
  await api.updateEvalRun(projectId2, evalRunId2, {
8447
- status: import_evalforge_types6.EvalStatus.COMPLETED,
8531
+ status: import_evalforge_types7.EvalStatus.COMPLETED,
8448
8532
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
8449
8533
  });
8450
8534
  } catch (updateErr) {
@@ -8485,7 +8569,7 @@ runEvaluation(projectId, evalRunId).then(() => {
8485
8569
  authToken: config.authToken
8486
8570
  });
8487
8571
  await api.updateEvalRun(projectId, evalRunId, {
8488
- status: import_evalforge_types6.EvalStatus.FAILED,
8572
+ status: import_evalforge_types7.EvalStatus.FAILED,
8489
8573
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8490
8574
  jobError,
8491
8575
  jobStatus: "FAILED"
@@ -8508,7 +8592,7 @@ runEvaluation(projectId, evalRunId).then(() => {
8508
8592
  authToken
8509
8593
  });
8510
8594
  await api.updateEvalRun(projectId, evalRunId, {
8511
- status: import_evalforge_types6.EvalStatus.FAILED,
8595
+ status: import_evalforge_types7.EvalStatus.FAILED,
8512
8596
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8513
8597
  jobError: `Config load failed, then: ${jobError}`,
8514
8598
  jobStatus: "FAILED"