@wix/evalforge-evaluator 0.182.0 → 0.184.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
5226
5226
  });
5227
5227
 
5228
5228
  // src/index.ts
5229
- var import_evalforge_types15 = require("@wix/evalforge-types");
5229
+ var import_evalforge_types16 = require("@wix/evalforge-types");
5230
5230
 
5231
5231
  // src/config.ts
5232
5232
  function loadConfig() {
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
7115
7115
  }
7116
7116
 
7117
7117
  // src/run-scenario/index.ts
7118
- var import_evalforge_types13 = require("@wix/evalforge-types");
7118
+ var import_evalforge_types14 = require("@wix/evalforge-types");
7119
7119
  var import_eval_assertions = require("@wix/eval-assertions");
7120
7120
 
7121
7121
  // src/run-scenario/environment.ts
@@ -7451,50 +7451,122 @@ function getAdapter(identifier) {
7451
7451
  }
7452
7452
 
7453
7453
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
7454
- var import_evalforge_types5 = require("@wix/evalforge-types");
7454
+ var import_evalforge_types6 = require("@wix/evalforge-types");
7455
7455
 
7456
7456
  // src/run-scenario/agents/claude-code/execute.ts
7457
- var import_evalforge_types4 = require("@wix/evalforge-types");
7457
+ var import_evalforge_types5 = require("@wix/evalforge-types");
7458
7458
 
7459
7459
  // src/run-scenario/agents/claude-code/write-skills.ts
7460
7460
  var import_promises3 = require("fs/promises");
7461
7461
  var import_path4 = require("path");
7462
+
7463
+ // src/run-scenario/agents/shared/resolve-capability-content.ts
7462
7464
  var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
7463
- async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7465
+ var import_evalforge_types2 = require("@wix/evalforge-types");
7466
+ var USER_AGENT = "EvalForge-Evaluator";
7467
+ async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7468
+ const version = skill.latestVersion;
7469
+ if (version?.files && version.files.length > 0) {
7470
+ console.log(
7471
+ `[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
7472
+ );
7473
+ return version.files;
7474
+ }
7475
+ if (skill.source) {
7476
+ const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
7477
+ console.log(
7478
+ `[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
7479
+ );
7480
+ return files;
7481
+ }
7482
+ throw new Error(`Skill ${skill.name} has no files and no source configured`);
7483
+ }
7484
+ async function fetchSourceFile(label, noun, name, source, fetchFn) {
7485
+ try {
7486
+ const content = await fetchFn(source, { userAgent: USER_AGENT });
7487
+ console.log(
7488
+ `[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
7489
+ );
7490
+ return content;
7491
+ } catch (error) {
7492
+ const message = error instanceof Error ? error.message : "Unknown error";
7493
+ console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
7494
+ throw new Error(
7495
+ `Failed to fetch ${noun} "${name}" from GitHub: ${message}`
7496
+ );
7497
+ }
7498
+ }
7499
+ async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7500
+ if (agent.source) {
7501
+ return fetchSourceFile(
7502
+ "SubAgents",
7503
+ "sub-agent",
7504
+ agent.name,
7505
+ agent.source,
7506
+ fetchFn
7507
+ );
7508
+ }
7509
+ if (!agent.subAgentMd) {
7510
+ console.warn(
7511
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7512
+ );
7513
+ }
7514
+ return agent.subAgentMd;
7515
+ }
7516
+ async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7517
+ if (!rule.source) {
7518
+ return rule.content;
7519
+ }
7520
+ return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
7521
+ }
7522
+ async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7523
+ if (!mcp.source) {
7524
+ return mcp.config;
7525
+ }
7526
+ const raw = await fetchSourceFile(
7527
+ "MCP",
7528
+ "MCP",
7529
+ mcp.name,
7530
+ mcp.source,
7531
+ fetchFn
7532
+ );
7533
+ let parsed;
7534
+ try {
7535
+ parsed = JSON.parse(raw);
7536
+ } catch (error) {
7537
+ const message = error instanceof Error ? error.message : "Unknown error";
7538
+ throw new Error(
7539
+ `MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
7540
+ );
7541
+ }
7542
+ if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
7543
+ throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
7544
+ }
7545
+ const obj = parsed;
7546
+ const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
7547
+ if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
7548
+ return servers;
7549
+ }
7550
+ return obj;
7551
+ }
7552
+
7553
+ // src/run-scenario/agents/claude-code/write-skills.ts
7554
+ async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
7464
7555
  await Promise.all(
7465
7556
  skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
7466
7557
  );
7467
7558
  }
7468
- async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7469
- const skillName = skill.name;
7470
- const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
7559
+ async function writeSkillToFilesystem(cwd, skill, fetchFn) {
7560
+ const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skill.name);
7471
7561
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7472
- const version = skill.latestVersion;
7473
- if (version?.files && version.files.length > 0) {
7474
- await writeFilesToDirectory(skillDir, version.files);
7475
- console.log(
7476
- `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
7562
+ try {
7563
+ const files = await resolveSkillFiles(skill, fetchFn);
7564
+ await writeFilesToDirectory(skillDir, files);
7565
+ } catch (error) {
7566
+ const message = error instanceof Error ? error.message : "Unknown error";
7567
+ throw new Error(
7568
+ `Failed to write skill ${skill.name} to filesystem: ${message}`
7477
7569
  );
7478
- } else if (skill.source) {
7479
- try {
7480
- const files = await fetchFn(skill.source, {
7481
- userAgent: "EvalForge-Evaluator"
7482
- });
7483
- await writeFilesToDirectory(skillDir, files);
7484
- console.log(
7485
- `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
7486
- );
7487
- } catch (error) {
7488
- const message = error instanceof Error ? error.message : "Unknown error";
7489
- console.error(
7490
- `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
7491
- );
7492
- throw new Error(
7493
- `Failed to write skill ${skillName} to filesystem: ${message}`
7494
- );
7495
- }
7496
- } else {
7497
- throw new Error(`Skill ${skillName} has no files and no source configured`);
7498
7570
  }
7499
7571
  }
7500
7572
 
@@ -7512,7 +7584,7 @@ var import_crypto2 = require("crypto");
7512
7584
  // src/run-scenario/agents/claude-code/write-mcp.ts
7513
7585
  var import_promises5 = require("fs/promises");
7514
7586
  var import_path6 = require("path");
7515
- var import_evalforge_types2 = require("@wix/evalforge-types");
7587
+ var import_evalforge_types3 = require("@wix/evalforge-types");
7516
7588
 
7517
7589
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
7518
7590
  var import_promises4 = require("fs/promises");
@@ -7557,11 +7629,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
7557
7629
  }
7558
7630
 
7559
7631
  // src/run-scenario/agents/claude-code/write-mcp.ts
7560
- async function writeMcpToFilesystem(cwd, mcps) {
7632
+ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7561
7633
  if (mcps.length === 0) return;
7562
7634
  const mcpServers = {};
7563
7635
  for (const mcp of mcps) {
7564
- const config = mcp.config;
7636
+ const config = await resolveMcpConfig(mcp, fetchFn);
7565
7637
  for (const [key, value] of Object.entries(config)) {
7566
7638
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
7567
7639
  throw new Error(
@@ -7573,7 +7645,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
7573
7645
  }
7574
7646
  const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
7575
7647
  const content = JSON.stringify(
7576
- { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
7648
+ { [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
7577
7649
  null,
7578
7650
  2
7579
7651
  );
@@ -7585,7 +7657,6 @@ async function writeMcpToFilesystem(cwd, mcps) {
7585
7657
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
7586
7658
  var import_promises6 = require("fs/promises");
7587
7659
  var import_path7 = require("path");
7588
- var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
7589
7660
  var AGENTS_DIR = ".claude/agents";
7590
7661
  function toAgentFilename(name, index, nameCount) {
7591
7662
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -7593,34 +7664,7 @@ function toAgentFilename(name, index, nameCount) {
7593
7664
  nameCount.set(base, count + 1);
7594
7665
  return count === 0 ? base : `${base}-${count + 1}`;
7595
7666
  }
7596
- async function resolveSubAgentContent(agent, fetchFn) {
7597
- if (agent.source) {
7598
- try {
7599
- const content = await fetchFn(agent.source, {
7600
- userAgent: "EvalForge-Evaluator"
7601
- });
7602
- console.log(
7603
- `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
7604
- );
7605
- return content;
7606
- } catch (error) {
7607
- const message = error instanceof Error ? error.message : "Unknown error";
7608
- console.error(
7609
- `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
7610
- );
7611
- throw new Error(
7612
- `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
7613
- );
7614
- }
7615
- }
7616
- if (!agent.subAgentMd) {
7617
- console.warn(
7618
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7619
- );
7620
- }
7621
- return agent.subAgentMd;
7622
- }
7623
- async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
7667
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7624
7668
  if (subAgents.length === 0) return;
7625
7669
  const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
7626
7670
  await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
@@ -7628,7 +7672,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
7628
7672
  for (const [i, agent] of subAgents.entries()) {
7629
7673
  const filename = toAgentFilename(agent.name, i, nameCount);
7630
7674
  const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
7631
- const content = await resolveSubAgentContent(agent, fetchFn);
7675
+ const content = await resolveSubAgentMd(agent, fetchFn);
7632
7676
  await (0, import_promises6.writeFile)(filePath, content, "utf8");
7633
7677
  }
7634
7678
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -7678,18 +7722,19 @@ function validateGenericDirectory(dir, cwd) {
7678
7722
  }
7679
7723
  return trimmed;
7680
7724
  }
7681
- async function writeRulesToFilesystem(cwd, rules) {
7725
+ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7682
7726
  if (rules.length === 0) return;
7683
7727
  const nameCount = /* @__PURE__ */ new Map();
7684
7728
  let hasCursorRules = false;
7685
7729
  for (const [i, rule] of rules.entries()) {
7730
+ const content = await resolveRuleText(rule, fetchFn);
7686
7731
  switch (rule.ruleType) {
7687
7732
  case "claude-md": {
7688
- await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
7733
+ await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
7689
7734
  break;
7690
7735
  }
7691
7736
  case "agents-md": {
7692
- await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
7737
+ await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
7693
7738
  break;
7694
7739
  }
7695
7740
  case "cursor-rule": {
@@ -7699,7 +7744,7 @@ async function writeRulesToFilesystem(cwd, rules) {
7699
7744
  }
7700
7745
  const filename = toRuleFilename(rule.name, i, nameCount);
7701
7746
  const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
7702
- await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
7747
+ await (0, import_promises7.writeFile)(filePath, content, "utf8");
7703
7748
  break;
7704
7749
  }
7705
7750
  case "generic": {
@@ -7710,7 +7755,7 @@ async function writeRulesToFilesystem(cwd, rules) {
7710
7755
  const dirPath = (0, import_path8.join)(cwd, directory);
7711
7756
  await (0, import_promises7.mkdir)(dirPath, { recursive: true });
7712
7757
  const filename = toRuleFilename(rule.name, i, nameCount);
7713
- await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
7758
+ await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
7714
7759
  break;
7715
7760
  }
7716
7761
  default: {
@@ -7800,14 +7845,14 @@ function buildConversation(timestampedMessages) {
7800
7845
  }
7801
7846
 
7802
7847
  // src/run-scenario/agents/shared/trace-emit.ts
7803
- var import_evalforge_types3 = require("@wix/evalforge-types");
7848
+ var import_evalforge_types4 = require("@wix/evalforge-types");
7804
7849
  function emitTraceEvent(event, pushEvent) {
7805
- console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7850
+ console.log(`${import_evalforge_types4.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7806
7851
  pushEvent?.(event);
7807
7852
  }
7808
7853
 
7809
7854
  // src/run-scenario/agents/claude-code/execute.ts
7810
- var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7855
+ var DEFAULT_MODEL = import_evalforge_types5.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7811
7856
  async function* buildPromptStream(triggerPrompt, images) {
7812
7857
  yield {
7813
7858
  type: "user",
@@ -7872,7 +7917,7 @@ function extractToolActionDescription(toolName, toolArgs) {
7872
7917
  return `Using ${toolName}...`;
7873
7918
  }
7874
7919
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7875
- let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7920
+ let type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7876
7921
  let toolName;
7877
7922
  let toolArgs;
7878
7923
  let outputPreview;
@@ -7880,28 +7925,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7880
7925
  let thinking;
7881
7926
  for (const block of message.message.content) {
7882
7927
  if (block.type === "tool_use") {
7883
- type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
7928
+ type = import_evalforge_types5.LiveTraceEventType.TOOL_USE;
7884
7929
  toolName = block.name;
7885
7930
  toolArgs = JSON.stringify(block.input).slice(0, 500);
7886
7931
  const input = block.input;
7887
7932
  if (input.file_path || input.path || input.target_file) {
7888
7933
  filePath = String(input.file_path || input.path || input.target_file);
7889
7934
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
7890
- type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
7935
+ type = import_evalforge_types5.LiveTraceEventType.FILE_WRITE;
7891
7936
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
7892
- type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
7937
+ type = import_evalforge_types5.LiveTraceEventType.FILE_READ;
7893
7938
  }
7894
7939
  }
7895
7940
  } else if (block.type === "text") {
7896
7941
  outputPreview = block.text.slice(0, 500);
7897
7942
  if (!toolName) {
7898
- type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7943
+ type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7899
7944
  }
7900
7945
  } else if (block.type === "thinking") {
7901
7946
  const thinkingBlock = block;
7902
7947
  thinking = thinkingBlock.thinking.slice(0, 500);
7903
7948
  if (!outputPreview && !toolName) {
7904
- type = import_evalforge_types4.LiveTraceEventType.THINKING;
7949
+ type = import_evalforge_types5.LiveTraceEventType.THINKING;
7905
7950
  }
7906
7951
  }
7907
7952
  }
@@ -7967,7 +8012,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
7967
8012
  }
7968
8013
  return {
7969
8014
  ...baseEvent,
7970
- type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
8015
+ type: import_evalforge_types5.LiveTraceEventType.TOOL_RESULT,
7971
8016
  outputPreview: outputPreview || "(tool result)"
7972
8017
  };
7973
8018
  }
@@ -7975,7 +8020,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
7975
8020
  const sysMsg = message;
7976
8021
  return {
7977
8022
  ...baseEvent,
7978
- type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
8023
+ type: import_evalforge_types5.LiveTraceEventType.SYSTEM,
7979
8024
  outputPreview: sysMsg.subtype || "system"
7980
8025
  };
7981
8026
  }
@@ -7984,7 +8029,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
7984
8029
  }
7985
8030
  return {
7986
8031
  ...baseEvent,
7987
- type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8032
+ type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
7988
8033
  outputPreview: `Message type: ${message.type}`
7989
8034
  };
7990
8035
  }
@@ -8086,7 +8131,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8086
8131
  queryOptions.systemPrompt = {
8087
8132
  type: "preset",
8088
8133
  preset: "claude_code",
8089
- append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8134
+ append: import_evalforge_types5.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8090
8135
  };
8091
8136
  }
8092
8137
  if (options.temperature !== void 0) {
@@ -8121,7 +8166,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8121
8166
  targetId: traceContext.targetId,
8122
8167
  targetName: traceContext.targetName,
8123
8168
  stepNumber: 0,
8124
- type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8169
+ type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8125
8170
  outputPreview: JSON.stringify({
8126
8171
  event: "pre-sdk-execution",
8127
8172
  model: queryOptions.model,
@@ -8185,7 +8230,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8185
8230
  targetId: traceContext.targetId,
8186
8231
  targetName: traceContext.targetName,
8187
8232
  stepNumber: traceStepNumber,
8188
- type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8233
+ type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
8189
8234
  outputPreview: progressMessage,
8190
8235
  toolName: lastToolName,
8191
8236
  filePath: lastFilePath,
@@ -8222,18 +8267,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
8222
8267
  if (traceEvent) {
8223
8268
  lastToolName = traceEvent.toolName;
8224
8269
  lastFilePath = traceEvent.filePath;
8225
- if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
8270
+ if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.THINKING) {
8226
8271
  lastAction = "Thinking...";
8227
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
8272
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.TOOL_USE) {
8228
8273
  lastAction = extractToolActionDescription(
8229
8274
  traceEvent.toolName,
8230
8275
  traceEvent.toolArgs
8231
8276
  );
8232
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
8277
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_WRITE) {
8233
8278
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
8234
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
8279
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_READ) {
8235
8280
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
8236
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
8281
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.COMPLETION) {
8237
8282
  lastAction = "Processing response...";
8238
8283
  }
8239
8284
  emitTraceEvent(traceEvent, traceContext.pushEvent);
@@ -8411,7 +8456,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8411
8456
  targetId: traceContext.targetId,
8412
8457
  targetName: traceContext.targetName,
8413
8458
  stepNumber: traceStepNumber + 1,
8414
- type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8459
+ type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8415
8460
  outputPreview: JSON.stringify(
8416
8461
  {
8417
8462
  event: "sdk-execution-failed",
@@ -8445,7 +8490,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
8445
8490
  targetId: traceContext.targetId,
8446
8491
  targetName: traceContext.targetName,
8447
8492
  stepNumber: traceStepNumber + 1,
8448
- type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
8493
+ type: import_evalforge_types5.LiveTraceEventType.COMPLETION,
8449
8494
  outputPreview: "Scenario execution completed",
8450
8495
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
8451
8496
  isComplete: true
@@ -8625,9 +8670,12 @@ function processMessages(timestampedMessages, startTime, endTime) {
8625
8670
  if (!step.toolCalls) continue;
8626
8671
  for (const tc of step.toolCalls) {
8627
8672
  if (tc.toolUseId && toolResultErrors.has(tc.toolUseId)) {
8628
- step.hasToolError = true;
8629
- step.toolErrorContent = toolResultErrors.get(tc.toolUseId);
8630
- break;
8673
+ tc.isError = true;
8674
+ tc.errorContent = toolResultErrors.get(tc.toolUseId);
8675
+ if (!step.hasToolError) {
8676
+ step.hasToolError = true;
8677
+ step.toolErrorContent = tc.errorContent;
8678
+ }
8631
8679
  }
8632
8680
  }
8633
8681
  }
@@ -8717,7 +8765,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8717
8765
  stepNumber: 0,
8718
8766
  // renumbered below
8719
8767
  turnIndex,
8720
- type: import_evalforge_types4.LLMStepType.THINKING,
8768
+ type: import_evalforge_types5.LLMStepType.THINKING,
8721
8769
  model,
8722
8770
  provider: "anthropic",
8723
8771
  startedAt: step.startedAt.toISOString(),
@@ -8731,8 +8779,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8731
8779
  },
8732
8780
  costUsd: stepCost / totalSubSteps,
8733
8781
  outputPreview: step.thinking?.slice(0, 200),
8734
- success: isSuccess,
8735
- error: errorMsg
8782
+ success: true,
8783
+ error: void 0
8736
8784
  });
8737
8785
  }
8738
8786
  if (toolCallCount > 0) {
@@ -8742,11 +8790,13 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8742
8790
  const toolBudgetSteps = toolSubSteps + textSubSteps;
8743
8791
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
8744
8792
  const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
8793
+ const toolSuccess = !tc.isError;
8794
+ const toolError = tc.isError ? tc.errorContent ?? "Tool call failed" : void 0;
8745
8795
  subSteps.push({
8746
8796
  id: (0, import_crypto2.randomUUID)(),
8747
8797
  stepNumber: 0,
8748
8798
  turnIndex,
8749
- type: import_evalforge_types4.LLMStepType.TOOL_USE,
8799
+ type: import_evalforge_types5.LLMStepType.TOOL_USE,
8750
8800
  model,
8751
8801
  provider: "anthropic",
8752
8802
  startedAt: step.startedAt.toISOString(),
@@ -8766,8 +8816,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8766
8816
  toolName: tc.toolName,
8767
8817
  toolArguments: JSON.stringify(tc.args),
8768
8818
  outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
8769
- success: isSuccess,
8770
- error: errorMsg
8819
+ success: toolSuccess,
8820
+ error: toolError
8771
8821
  });
8772
8822
  }
8773
8823
  }
@@ -8776,7 +8826,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8776
8826
  id: (0, import_crypto2.randomUUID)(),
8777
8827
  stepNumber: 0,
8778
8828
  turnIndex,
8779
- type: import_evalforge_types4.LLMStepType.COMPLETION,
8829
+ type: import_evalforge_types5.LLMStepType.COMPLETION,
8780
8830
  model,
8781
8831
  provider: "anthropic",
8782
8832
  startedAt: step.startedAt.toISOString(),
@@ -8788,12 +8838,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8788
8838
  },
8789
8839
  costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
8790
8840
  outputPreview: step.text?.slice(0, 200),
8791
- success: isSuccess,
8792
- error: errorMsg
8841
+ success: true,
8842
+ error: void 0
8793
8843
  });
8794
8844
  }
8795
8845
  if (subSteps.length === 0) {
8796
- const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
8846
+ const stepType = hasThinking && !hasText ? import_evalforge_types5.LLMStepType.THINKING : import_evalforge_types5.LLMStepType.COMPLETION;
8797
8847
  subSteps.push({
8798
8848
  id: (0, import_crypto2.randomUUID)(),
8799
8849
  stepNumber: 0,
@@ -8863,7 +8913,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8863
8913
  var ClaudeCodeAdapter = class {
8864
8914
  id = "claude-code";
8865
8915
  name = "Claude Code";
8866
- supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
8916
+ supportedCommands = [import_evalforge_types6.AgentRunCommand.CLAUDE];
8867
8917
  /**
8868
8918
  * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
8869
8919
  * before the baseline snapshot is taken.
@@ -8895,9 +8945,9 @@ var ClaudeCodeAdapter = class {
8895
8945
  rules,
8896
8946
  systemPrompt
8897
8947
  } = context;
8898
- const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8948
+ const typed = config ? import_evalforge_types6.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8899
8949
  const cfg = typed?.success ? typed.data : void 0;
8900
- const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
8950
+ const schemaKeys = new Set(Object.keys(import_evalforge_types6.ClaudeCodeConfigSchema.shape));
8901
8951
  const extras = {};
8902
8952
  if (config) {
8903
8953
  for (const [key, value] of Object.entries(config)) {
@@ -8952,11 +9002,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
8952
9002
  defaultRegistry.register(claudeCodeAdapter);
8953
9003
 
8954
9004
  // src/run-scenario/agents/opencode/opencode-adapter.ts
8955
- var import_evalforge_types9 = require("@wix/evalforge-types");
9005
+ var import_evalforge_types10 = require("@wix/evalforge-types");
8956
9006
 
8957
9007
  // src/run-scenario/agents/opencode/execute.ts
8958
9008
  var import_child_process2 = require("child_process");
8959
- var import_evalforge_types8 = require("@wix/evalforge-types");
9009
+ var import_evalforge_types9 = require("@wix/evalforge-types");
8960
9010
 
8961
9011
  // src/run-scenario/agents/opencode/types.ts
8962
9012
  function tryParseJson(text) {
@@ -8970,49 +9020,28 @@ function tryParseJson(text) {
8970
9020
  // src/run-scenario/agents/opencode/write-skills.ts
8971
9021
  var import_promises8 = require("fs/promises");
8972
9022
  var import_path9 = require("path");
8973
- var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
8974
- async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
9023
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
8975
9024
  await Promise.all(
8976
9025
  skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
8977
9026
  );
8978
9027
  }
8979
9028
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
8980
- const skillName = skill.name;
8981
- const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
9029
+ const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skill.name);
8982
9030
  await (0, import_promises8.mkdir)(skillDir, { recursive: true });
8983
- const version = skill.latestVersion;
8984
- if (version?.files && version.files.length > 0) {
8985
- await writeFilesToDirectory(skillDir, version.files);
8986
- console.log(
8987
- `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
9031
+ try {
9032
+ const files = await resolveSkillFiles(skill, fetchFn);
9033
+ await writeFilesToDirectory(skillDir, files);
9034
+ } catch (error) {
9035
+ const message = error instanceof Error ? error.message : "Unknown error";
9036
+ throw new Error(
9037
+ `Failed to write skill ${skill.name} to filesystem: ${message}`
8988
9038
  );
8989
- } else if (skill.source) {
8990
- try {
8991
- const files = await fetchFn(skill.source, {
8992
- userAgent: "EvalForge-Evaluator"
8993
- });
8994
- await writeFilesToDirectory(skillDir, files);
8995
- console.log(
8996
- `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
8997
- );
8998
- } catch (error) {
8999
- const message = error instanceof Error ? error.message : "Unknown error";
9000
- console.error(
9001
- `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
9002
- );
9003
- throw new Error(
9004
- `Failed to write skill ${skillName} to filesystem: ${message}`
9005
- );
9006
- }
9007
- } else {
9008
- throw new Error(`Skill ${skillName} has no files and no source configured`);
9009
9039
  }
9010
9040
  }
9011
9041
 
9012
9042
  // src/run-scenario/agents/opencode/write-sub-agents.ts
9013
9043
  var import_promises9 = require("fs/promises");
9014
9044
  var import_path10 = require("path");
9015
- var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
9016
9045
  var AGENTS_DIR2 = ".opencode/agents";
9017
9046
  function toAgentFilename2(name, index, nameCount) {
9018
9047
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -9020,34 +9049,7 @@ function toAgentFilename2(name, index, nameCount) {
9020
9049
  nameCount.set(base, count + 1);
9021
9050
  return count === 0 ? base : `${base}-${count + 1}`;
9022
9051
  }
9023
- async function resolveSubAgentContent2(agent, fetchFn) {
9024
- if (agent.source) {
9025
- try {
9026
- const content = await fetchFn(agent.source, {
9027
- userAgent: "EvalForge-Evaluator"
9028
- });
9029
- console.log(
9030
- `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
9031
- );
9032
- return content;
9033
- } catch (error) {
9034
- const message = error instanceof Error ? error.message : "Unknown error";
9035
- console.error(
9036
- `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
9037
- );
9038
- throw new Error(
9039
- `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
9040
- );
9041
- }
9042
- }
9043
- if (!agent.subAgentMd) {
9044
- console.warn(
9045
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
9046
- );
9047
- }
9048
- return agent.subAgentMd;
9049
- }
9050
- async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
9052
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9051
9053
  if (subAgents.length === 0) return;
9052
9054
  const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
9053
9055
  await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
@@ -9055,7 +9057,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
9055
9057
  for (const [i, agent] of subAgents.entries()) {
9056
9058
  const filename = toAgentFilename2(agent.name, i, nameCount);
9057
9059
  const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
9058
- const content = await resolveSubAgentContent2(agent, fetchFn);
9060
+ const content = await resolveSubAgentMd(agent, fetchFn);
9059
9061
  await (0, import_promises9.writeFile)(filePath, content, "utf8");
9060
9062
  }
9061
9063
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -9063,8 +9065,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
9063
9065
 
9064
9066
  // src/run-scenario/agents/opencode/config.ts
9065
9067
  var import_os3 = require("os");
9066
- var import_evalforge_types6 = require("@wix/evalforge-types");
9067
- var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9068
+ var import_evalforge_types7 = require("@wix/evalforge-types");
9069
+ var DEFAULT_MODEL2 = `${import_evalforge_types7.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9068
9070
  var OPENCODE_MODEL_ALIASES = {
9069
9071
  "claude-sonnet-4": "claude-sonnet-4-0",
9070
9072
  "claude-opus-4": "claude-opus-4-0"
@@ -9080,10 +9082,10 @@ function parseModel(model) {
9080
9082
  };
9081
9083
  }
9082
9084
  const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
9083
- const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
9085
+ const isOpenAI = import_evalforge_types7.AVAILABLE_OPENAI_MODEL_IDS.includes(
9084
9086
  model
9085
9087
  );
9086
- const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
9088
+ const isGemini = import_evalforge_types7.AVAILABLE_GEMINI_MODEL_IDS.includes(
9087
9089
  model
9088
9090
  );
9089
9091
  if (isGemini) return { providerID: "google", modelID };
@@ -9152,7 +9154,7 @@ async function buildOpenCodeEnv(options) {
9152
9154
  if (options.mcps && options.mcps.length > 0) {
9153
9155
  const mcpServers = {};
9154
9156
  for (const mcpEntity of options.mcps) {
9155
- const entityConfig = mcpEntity.config;
9157
+ const entityConfig = await resolveMcpConfig(mcpEntity);
9156
9158
  for (const [key, value] of Object.entries(entityConfig)) {
9157
9159
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
9158
9160
  throw new Error(
@@ -9177,7 +9179,7 @@ async function buildOpenCodeEnv(options) {
9177
9179
  if (options.maxTurns != null && options.maxTurns > 0) {
9178
9180
  agentOverrides.maxSteps = options.maxTurns;
9179
9181
  }
9180
- const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9182
+ const parsed = options.config ? import_evalforge_types7.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9181
9183
  const configPermission = parsed?.success ? parsed.data.permission : void 0;
9182
9184
  const defaultPermission = {
9183
9185
  "*": "allow"
@@ -9219,7 +9221,7 @@ async function buildOpenCodeEnv(options) {
9219
9221
  }
9220
9222
 
9221
9223
  // src/run-scenario/agents/opencode/build-trace.ts
9222
- var import_evalforge_types7 = require("@wix/evalforge-types");
9224
+ var import_evalforge_types8 = require("@wix/evalforge-types");
9223
9225
  var import_crypto3 = require("crypto");
9224
9226
  function toCanonicalModelId(modelId) {
9225
9227
  const slashIndex = modelId.indexOf("/");
@@ -9299,7 +9301,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9299
9301
  id: (0, import_crypto3.randomUUID)(),
9300
9302
  stepNumber: 0,
9301
9303
  turnIndex,
9302
- type: import_evalforge_types7.LLMStepType.THINKING,
9304
+ type: import_evalforge_types8.LLMStepType.THINKING,
9303
9305
  model: stepModel,
9304
9306
  provider: stepProvider,
9305
9307
  startedAt,
@@ -9328,7 +9330,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9328
9330
  id: (0, import_crypto3.randomUUID)(),
9329
9331
  stepNumber: 0,
9330
9332
  turnIndex,
9331
- type: import_evalforge_types7.LLMStepType.TOOL_USE,
9333
+ type: import_evalforge_types8.LLMStepType.TOOL_USE,
9332
9334
  model: stepModel,
9333
9335
  provider: stepProvider,
9334
9336
  startedAt,
@@ -9358,7 +9360,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9358
9360
  id: (0, import_crypto3.randomUUID)(),
9359
9361
  stepNumber: 0,
9360
9362
  turnIndex,
9361
- type: import_evalforge_types7.LLMStepType.COMPLETION,
9363
+ type: import_evalforge_types8.LLMStepType.COMPLETION,
9362
9364
  model: stepModel,
9363
9365
  provider: stepProvider,
9364
9366
  startedAt,
@@ -9375,7 +9377,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9375
9377
  });
9376
9378
  }
9377
9379
  if (subSteps.length === 0) {
9378
- const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
9380
+ const stepType = hasThinking && !hasText ? import_evalforge_types8.LLMStepType.THINKING : import_evalforge_types8.LLMStepType.COMPLETION;
9379
9381
  subSteps.push({
9380
9382
  id: (0, import_crypto3.randomUUID)(),
9381
9383
  stepNumber: 0,
@@ -9576,14 +9578,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9576
9578
  const te = evt;
9577
9579
  return {
9578
9580
  ...base,
9579
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
9581
+ type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
9580
9582
  outputPreview: te.part.text.slice(0, 500)
9581
9583
  };
9582
9584
  }
9583
9585
  case "reasoning":
9584
9586
  return {
9585
9587
  ...base,
9586
- type: import_evalforge_types8.LiveTraceEventType.THINKING,
9588
+ type: import_evalforge_types9.LiveTraceEventType.THINKING,
9587
9589
  thinking: evt.part.text.slice(0, 500)
9588
9590
  };
9589
9591
  case "tool_use": {
@@ -9591,15 +9593,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9591
9593
  const toolName = tu.part.tool;
9592
9594
  const args = tu.part.state.input;
9593
9595
  const toolArgs = JSON.stringify(args).slice(0, 500);
9594
- let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
9596
+ let type = import_evalforge_types9.LiveTraceEventType.TOOL_USE;
9595
9597
  let filePath;
9596
9598
  if (args) {
9597
9599
  if (args.file_path || args.path || args.target_file) {
9598
9600
  filePath = String(args.file_path || args.path || args.target_file);
9599
9601
  if (/write|edit/i.test(toolName)) {
9600
- type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
9602
+ type = import_evalforge_types9.LiveTraceEventType.FILE_WRITE;
9601
9603
  } else if (/read|view/i.test(toolName)) {
9602
- type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
9604
+ type = import_evalforge_types9.LiveTraceEventType.FILE_READ;
9603
9605
  }
9604
9606
  }
9605
9607
  }
@@ -9608,7 +9610,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9608
9610
  case "step_finish":
9609
9611
  return {
9610
9612
  ...base,
9611
- type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9613
+ type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9612
9614
  outputPreview: "Step completed"
9613
9615
  };
9614
9616
  default:
@@ -9639,7 +9641,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
9639
9641
  } else if (options.systemPrompt != null) {
9640
9642
  systemPrompt = options.systemPrompt;
9641
9643
  } else {
9642
- systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9644
+ systemPrompt = import_evalforge_types9.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9643
9645
  }
9644
9646
  if (systemPrompt) {
9645
9647
  await writeSystemPromptRule(cwd, systemPrompt);
@@ -9831,7 +9833,7 @@ function spawnOpenCodeProcess(opts) {
9831
9833
  targetId: traceContext.targetId,
9832
9834
  targetName: traceContext.targetName,
9833
9835
  stepNumber: traceStepNumber,
9834
- type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9836
+ type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9835
9837
  outputPreview: progressMessage,
9836
9838
  toolName: lastToolName,
9837
9839
  filePath: lastFilePath,
@@ -9865,18 +9867,18 @@ function spawnOpenCodeProcess(opts) {
9865
9867
  if (traceEvt) {
9866
9868
  lastToolName = traceEvt.toolName;
9867
9869
  lastFilePath = traceEvt.filePath;
9868
- if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
9870
+ if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.THINKING) {
9869
9871
  lastAction = "Thinking...";
9870
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
9872
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.TOOL_USE) {
9871
9873
  lastAction = extractToolAction(
9872
9874
  traceEvt.toolName ?? "",
9873
9875
  void 0
9874
9876
  );
9875
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
9877
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_WRITE) {
9876
9878
  lastAction = `Writing: ${traceEvt.filePath || "file"}`;
9877
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
9879
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_READ) {
9878
9880
  lastAction = `Reading: ${traceEvt.filePath || "file"}`;
9879
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
9881
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.COMPLETION) {
9880
9882
  lastAction = "Processing response...";
9881
9883
  }
9882
9884
  emitTraceEvent(traceEvt, traceContext.pushEvent);
@@ -9958,7 +9960,7 @@ async function executeWithOpenCode(skills, scenario, options) {
9958
9960
  targetId: traceContext.targetId,
9959
9961
  targetName: traceContext.targetName,
9960
9962
  stepNumber: 0,
9961
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
9963
+ type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
9962
9964
  outputPreview: JSON.stringify({
9963
9965
  event: "pre-cli-execution",
9964
9966
  model: `${providerID}/${modelID}`,
@@ -10012,7 +10014,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10012
10014
  targetId: traceContext.targetId,
10013
10015
  targetName: traceContext.targetName,
10014
10016
  stepNumber: traceStepNumber + 1,
10015
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10017
+ type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10016
10018
  outputPreview: JSON.stringify({
10017
10019
  event: "idle-timeout-retry",
10018
10020
  attempt,
@@ -10056,7 +10058,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10056
10058
  targetId: traceContext.targetId,
10057
10059
  targetName: traceContext.targetName,
10058
10060
  stepNumber: traceStepNumber + 1,
10059
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10061
+ type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10060
10062
  outputPreview: JSON.stringify({
10061
10063
  event: "cli-execution-failed",
10062
10064
  error: lastAttemptResult.error?.message ?? "Unknown error",
@@ -10111,7 +10113,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10111
10113
  targetId: traceContext.targetId,
10112
10114
  targetName: traceContext.targetName,
10113
10115
  stepNumber: traceStepNumber + 1,
10114
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
10116
+ type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
10115
10117
  outputPreview: "Scenario execution completed",
10116
10118
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10117
10119
  isComplete: true
@@ -10148,7 +10150,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10148
10150
  var OpenCodeAdapter = class {
10149
10151
  id = "opencode";
10150
10152
  name = "OpenCode";
10151
- supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
10153
+ supportedCommands = [import_evalforge_types10.AgentRunCommand.OPENCODE];
10152
10154
  async prepareEnvironment(context) {
10153
10155
  await prepareOpenCodeEnvironment(context.cwd, context.skills, {
10154
10156
  mcps: context.mcps,
@@ -10171,7 +10173,7 @@ var OpenCodeAdapter = class {
10171
10173
  rules,
10172
10174
  systemPrompt
10173
10175
  } = context;
10174
- const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10176
+ const typed = config ? import_evalforge_types10.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10175
10177
  const cfg = typed?.success ? typed.data : void 0;
10176
10178
  const rawMaxTurns = cfg?.maxTurns;
10177
10179
  const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
@@ -10221,7 +10223,7 @@ var import_ai = require("ai");
10221
10223
  var import_anthropic = require("@ai-sdk/anthropic");
10222
10224
  var import_google = require("@ai-sdk/google");
10223
10225
  var import_openai = require("@ai-sdk/openai");
10224
- var import_evalforge_types11 = require("@wix/evalforge-types");
10226
+ var import_evalforge_types12 = require("@wix/evalforge-types");
10225
10227
  var import_crypto4 = require("crypto");
10226
10228
 
10227
10229
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
@@ -10318,7 +10320,7 @@ function extractErrorText(content) {
10318
10320
  }
10319
10321
 
10320
10322
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
10321
- var import_evalforge_types10 = require("@wix/evalforge-types");
10323
+ var import_evalforge_types11 = require("@wix/evalforge-types");
10322
10324
  var PROVIDER_ANTHROPIC = "anthropic";
10323
10325
  var PROVIDER_GEMINI = "gemini";
10324
10326
  var MODEL_PRICING = {
@@ -10387,7 +10389,7 @@ function extractGatewayCost(step, provider) {
10387
10389
  }
10388
10390
  }
10389
10391
  function calculateFromPricing(modelId, tokenUsage) {
10390
- const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
10392
+ const normalized = (0, import_evalforge_types11.normalizeModelId)(modelId);
10391
10393
  const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
10392
10394
  if (!pricing) return 0;
10393
10395
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
@@ -10480,7 +10482,7 @@ function createModel(modelId, baseUrl, headers) {
10480
10482
  apiKey: "proxy-auth",
10481
10483
  headers
10482
10484
  });
10483
- if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10485
+ if ([...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10484
10486
  (id) => modelId === id || modelId.startsWith(id)
10485
10487
  )) {
10486
10488
  return openai.responses(modelId);
@@ -10488,12 +10490,12 @@ function createModel(modelId, baseUrl, headers) {
10488
10490
  return openai.chat(modelId);
10489
10491
  }
10490
10492
  function isClaudeModelId(modelId) {
10491
- return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
10493
+ return import_evalforge_types12.AVAILABLE_CLAUDE_MODEL_IDS.some(
10492
10494
  (id) => modelId === id || modelId.startsWith(id)
10493
10495
  );
10494
10496
  }
10495
10497
  function isGeminiModelId(modelId) {
10496
- return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
10498
+ return import_evalforge_types12.AVAILABLE_GEMINI_MODEL_IDS.some(
10497
10499
  (id) => modelId === id || modelId.startsWith(id)
10498
10500
  );
10499
10501
  }
@@ -10513,9 +10515,9 @@ async function executeWithAiSdk(context) {
10513
10515
  mcps,
10514
10516
  traceContext
10515
10517
  } = context;
10516
- const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10518
+ const typed = config ? import_evalforge_types12.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10517
10519
  const cfg = typed?.success ? typed.data : void 0;
10518
- const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
10520
+ const schemaKeys = new Set(Object.keys(import_evalforge_types12.SimpleAgentConfigSchema.shape));
10519
10521
  const configExtras = {};
10520
10522
  if (config) {
10521
10523
  for (const [key, value] of Object.entries(config)) {
@@ -10552,11 +10554,11 @@ async function executeWithAiSdk(context) {
10552
10554
  }, SDK_TIMEOUT_MS);
10553
10555
  try {
10554
10556
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
10555
- const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10557
+ const isResponsesAPI = [...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10556
10558
  (id) => modelId === id || modelId.startsWith(id)
10557
10559
  );
10558
10560
  const isGemini = provider === PROVIDER_GEMINI2;
10559
- const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
10561
+ const isGeminiThinking = isGemini && import_evalforge_types12.GEMINI_THINKING_MODEL_IDS.has(modelId);
10560
10562
  const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
10561
10563
  const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
10562
10564
  const reasoningEffort = cfg.reasoningEffort ?? "high";
@@ -10635,7 +10637,7 @@ async function executeWithAiSdk(context) {
10635
10637
  targetId: traceContext.targetId,
10636
10638
  targetName: traceContext.targetName,
10637
10639
  stepNumber: stepTimestamps.length,
10638
- type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
10640
+ type: isToolStep ? import_evalforge_types12.LiveTraceEventType.TOOL_USE : import_evalforge_types12.LiveTraceEventType.COMPLETION,
10639
10641
  toolName: firstToolCall?.toolName,
10640
10642
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
10641
10643
  outputPreview: step.text?.slice(0, 500),
@@ -10840,7 +10842,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
10840
10842
  id: (0, import_crypto4.randomUUID)(),
10841
10843
  stepNumber: i + 1,
10842
10844
  turnIndex: i,
10843
- type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
10845
+ type: step.toolCalls.length > 0 ? import_evalforge_types12.LLMStepType.TOOL_USE : import_evalforge_types12.LLMStepType.COMPLETION,
10844
10846
  model: modelId,
10845
10847
  provider,
10846
10848
  startedAt: new Date(stepStartedAt).toISOString(),
@@ -10890,7 +10892,7 @@ function emitStartEvent(traceContext, startTime) {
10890
10892
  targetId: traceContext.targetId,
10891
10893
  targetName: traceContext.targetName,
10892
10894
  stepNumber: 0,
10893
- type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
10895
+ type: import_evalforge_types12.LiveTraceEventType.PROGRESS,
10894
10896
  outputPreview: "Starting Simple Agent execution...",
10895
10897
  elapsedMs: Date.now() - startTime,
10896
10898
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -10908,7 +10910,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
10908
10910
  targetId: traceContext.targetId,
10909
10911
  targetName: traceContext.targetName,
10910
10912
  stepNumber,
10911
- type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
10913
+ type: import_evalforge_types12.LiveTraceEventType.COMPLETION,
10912
10914
  outputPreview: "Scenario execution completed",
10913
10915
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10914
10916
  isComplete: true
@@ -11678,11 +11680,11 @@ function substituteVariables(prompt, variables) {
11678
11680
  }
11679
11681
 
11680
11682
  // src/run-scenario/run-agent-with-context.ts
11681
- var import_evalforge_types12 = require("@wix/evalforge-types");
11682
- var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
11683
+ var import_evalforge_types13 = require("@wix/evalforge-types");
11684
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types13.AgentRunCommand.CLAUDE;
11683
11685
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
11684
11686
  const agent = evalData.agent ?? void 0;
11685
- const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
11687
+ const isSDK = agent?.agentType === import_evalforge_types13.AgentType.SDK;
11686
11688
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
11687
11689
  const adapter = getAdapter(identifier);
11688
11690
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -11767,14 +11769,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11767
11769
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
11768
11770
  if (template) {
11769
11771
  console.log(
11770
- (0, import_evalforge_types13.formatTraceEventLine)({
11772
+ (0, import_evalforge_types14.formatTraceEventLine)({
11771
11773
  evalRunId: evalRunId2,
11772
11774
  scenarioId: scenario.id,
11773
11775
  scenarioName: scenario.name,
11774
11776
  targetId,
11775
11777
  targetName,
11776
11778
  stepNumber: 0,
11777
- type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
11779
+ type: import_evalforge_types14.LiveTraceEventType.PROGRESS,
11778
11780
  outputPreview: "Setting up environment (installing dependencies)...",
11779
11781
  elapsedMs: 0,
11780
11782
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -11814,7 +11816,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11814
11816
  })),
11815
11817
  durationMs: partialResult.duration
11816
11818
  };
11817
- const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
11819
+ const defaultJudgeModel = import_evalforge_types14.DEFAULT_JUDGE_MODEL;
11818
11820
  const assertionContext = {
11819
11821
  workDir,
11820
11822
  defaultJudgeModel,
@@ -11829,10 +11831,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11829
11831
  assertionContext
11830
11832
  ) : [];
11831
11833
  const passed = assertionResults.filter(
11832
- (r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
11834
+ (r) => r.status === import_evalforge_types14.AssertionResultStatus.PASSED
11833
11835
  ).length;
11834
11836
  const failed = assertionResults.filter(
11835
- (r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
11837
+ (r) => r.status === import_evalforge_types14.AssertionResultStatus.FAILED
11836
11838
  ).length;
11837
11839
  const total = assertionResults.length;
11838
11840
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -11908,7 +11910,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
11908
11910
  }
11909
11911
 
11910
11912
  // src/error-reporter.ts
11911
- var import_evalforge_types14 = require("@wix/evalforge-types");
11913
+ var import_evalforge_types15 = require("@wix/evalforge-types");
11912
11914
  function formatError(error, phase, context) {
11913
11915
  const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
11914
11916
  if (error instanceof Error) {
@@ -12151,7 +12153,7 @@ async function runEvaluation(projectId2, evalRunId2) {
12151
12153
  totalExecutions
12152
12154
  };
12153
12155
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
12154
- const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
12156
+ const finalStatus = allFailed ? import_evalforge_types16.EvalStatus.FAILED : import_evalforge_types16.EvalStatus.COMPLETED;
12155
12157
  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
12156
12158
  firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
12157
12159
  ) : void 0;
@@ -12205,7 +12207,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12205
12207
  grpcAuthToken: config.grpcAuthToken
12206
12208
  });
12207
12209
  await api.updateEvalRun(projectId, evalRunId, {
12208
- status: import_evalforge_types15.EvalStatus.FAILED,
12210
+ status: import_evalforge_types16.EvalStatus.FAILED,
12209
12211
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12210
12212
  jobError,
12211
12213
  jobStatus: "FAILED"
@@ -12230,7 +12232,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12230
12232
  grpcAuthToken
12231
12233
  });
12232
12234
  await api.updateEvalRun(projectId, evalRunId, {
12233
- status: import_evalforge_types15.EvalStatus.FAILED,
12235
+ status: import_evalforge_types16.EvalStatus.FAILED,
12234
12236
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12235
12237
  jobError: `Config load failed, then: ${jobError}`,
12236
12238
  jobStatus: "FAILED"