@wix/evalforge-evaluator 0.184.0 → 0.186.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
5226
5226
  });
5227
5227
 
5228
5228
  // src/index.ts
5229
- var import_evalforge_types16 = require("@wix/evalforge-types");
5229
+ var import_evalforge_types15 = require("@wix/evalforge-types");
5230
5230
 
5231
5231
  // src/config.ts
5232
5232
  function loadConfig() {
@@ -6816,21 +6816,37 @@ function createApiClient(serverUrl, options = "") {
6816
6816
  // The legacy REST endpoint enriched the capability with its latest version
6817
6817
  // server-side; ambassador's GetCapability returns the bare entity, so we
6818
6818
  // compose it with GetLatestCapabilityVersion in parallel here.
6819
+ //
6820
+ // The latest-version fetch is BEST-EFFORT: a failure must not drop the whole
6821
+ // capability. Otherwise one broken snapshot fetch makes the capability (e.g.
6822
+ // an MCP) silently vanish from the run. Runs that pin a version still resolve
6823
+ // their content via getCapabilityVersion downstream.
6819
6824
  async getCapability(projectId2, id) {
6820
- const [capRes, versionRes] = await Promise.all([
6825
+ const [capResult, versionResult] = await Promise.allSettled([
6821
6826
  httpClient.request(getCapability({ projectId: projectId2, capabilityId: id })),
6822
6827
  httpClient.request(
6823
6828
  getLatestCapabilityVersion({ projectId: projectId2, capabilityId: id })
6824
6829
  )
6825
6830
  ]);
6826
- const capability = capRes.data.capability;
6831
+ if (capResult.status === "rejected") {
6832
+ throw capResult.reason;
6833
+ }
6834
+ const capability = capResult.value.data.capability;
6827
6835
  if (!capability) {
6828
6836
  throw new Error(`Capability ${id} not found in project ${projectId2}`);
6829
6837
  }
6830
- const latestVersion = versionRes.data.capabilityVersion ? capabilityVersionFromProto(
6831
- versionRes.data.capabilityVersion,
6832
- projectId2
6833
- ) : void 0;
6838
+ let latestVersion;
6839
+ if (versionResult.status === "fulfilled" && versionResult.value.data.capabilityVersion) {
6840
+ latestVersion = capabilityVersionFromProto(
6841
+ versionResult.value.data.capabilityVersion,
6842
+ projectId2
6843
+ );
6844
+ } else if (versionResult.status === "rejected") {
6845
+ const reason = versionResult.reason instanceof Error ? versionResult.reason.message : String(versionResult.reason);
6846
+ console.warn(
6847
+ `[Capabilities] getLatestCapabilityVersion(${id}) failed; loading capability without a snapshot (pinned versions still resolve): ${reason}`
6848
+ );
6849
+ }
6834
6850
  return { ...capabilityFromProto(capability), latestVersion };
6835
6851
  },
6836
6852
  async getCapabilityVersion(projectId2, capabilityId, versionId) {
@@ -7115,7 +7131,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
7115
7131
  }
7116
7132
 
7117
7133
  // src/run-scenario/index.ts
7118
- var import_evalforge_types14 = require("@wix/evalforge-types");
7134
+ var import_evalforge_types13 = require("@wix/evalforge-types");
7119
7135
  var import_eval_assertions = require("@wix/eval-assertions");
7120
7136
 
7121
7137
  // src/run-scenario/environment.ts
@@ -7451,122 +7467,50 @@ function getAdapter(identifier) {
7451
7467
  }
7452
7468
 
7453
7469
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
7454
- var import_evalforge_types6 = require("@wix/evalforge-types");
7470
+ var import_evalforge_types5 = require("@wix/evalforge-types");
7455
7471
 
7456
7472
  // src/run-scenario/agents/claude-code/execute.ts
7457
- var import_evalforge_types5 = require("@wix/evalforge-types");
7473
+ var import_evalforge_types4 = require("@wix/evalforge-types");
7458
7474
 
7459
7475
  // src/run-scenario/agents/claude-code/write-skills.ts
7460
7476
  var import_promises3 = require("fs/promises");
7461
7477
  var import_path4 = require("path");
7462
-
7463
- // src/run-scenario/agents/shared/resolve-capability-content.ts
7464
7478
  var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
7465
- var import_evalforge_types2 = require("@wix/evalforge-types");
7466
- var USER_AGENT = "EvalForge-Evaluator";
7467
- async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7468
- const version = skill.latestVersion;
7469
- if (version?.files && version.files.length > 0) {
7470
- console.log(
7471
- `[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
7472
- );
7473
- return version.files;
7474
- }
7475
- if (skill.source) {
7476
- const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
7477
- console.log(
7478
- `[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
7479
- );
7480
- return files;
7481
- }
7482
- throw new Error(`Skill ${skill.name} has no files and no source configured`);
7483
- }
7484
- async function fetchSourceFile(label, noun, name, source, fetchFn) {
7485
- try {
7486
- const content = await fetchFn(source, { userAgent: USER_AGENT });
7487
- console.log(
7488
- `[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
7489
- );
7490
- return content;
7491
- } catch (error) {
7492
- const message = error instanceof Error ? error.message : "Unknown error";
7493
- console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
7494
- throw new Error(
7495
- `Failed to fetch ${noun} "${name}" from GitHub: ${message}`
7496
- );
7497
- }
7498
- }
7499
- async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7500
- if (agent.source) {
7501
- return fetchSourceFile(
7502
- "SubAgents",
7503
- "sub-agent",
7504
- agent.name,
7505
- agent.source,
7506
- fetchFn
7507
- );
7508
- }
7509
- if (!agent.subAgentMd) {
7510
- console.warn(
7511
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7512
- );
7513
- }
7514
- return agent.subAgentMd;
7515
- }
7516
- async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7517
- if (!rule.source) {
7518
- return rule.content;
7519
- }
7520
- return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
7521
- }
7522
- async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7523
- if (!mcp.source) {
7524
- return mcp.config;
7525
- }
7526
- const raw = await fetchSourceFile(
7527
- "MCP",
7528
- "MCP",
7529
- mcp.name,
7530
- mcp.source,
7531
- fetchFn
7532
- );
7533
- let parsed;
7534
- try {
7535
- parsed = JSON.parse(raw);
7536
- } catch (error) {
7537
- const message = error instanceof Error ? error.message : "Unknown error";
7538
- throw new Error(
7539
- `MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
7540
- );
7541
- }
7542
- if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
7543
- throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
7544
- }
7545
- const obj = parsed;
7546
- const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
7547
- if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
7548
- return servers;
7549
- }
7550
- return obj;
7551
- }
7552
-
7553
- // src/run-scenario/agents/claude-code/write-skills.ts
7554
- async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
7479
+ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7555
7480
  await Promise.all(
7556
7481
  skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
7557
7482
  );
7558
7483
  }
7559
- async function writeSkillToFilesystem(cwd, skill, fetchFn) {
7560
- const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skill.name);
7484
+ async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7485
+ const skillName = skill.name;
7486
+ const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
7561
7487
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7562
- try {
7563
- const files = await resolveSkillFiles(skill, fetchFn);
7564
- await writeFilesToDirectory(skillDir, files);
7565
- } catch (error) {
7566
- const message = error instanceof Error ? error.message : "Unknown error";
7567
- throw new Error(
7568
- `Failed to write skill ${skill.name} to filesystem: ${message}`
7488
+ const version = skill.latestVersion;
7489
+ if (version?.files && version.files.length > 0) {
7490
+ await writeFilesToDirectory(skillDir, version.files);
7491
+ console.log(
7492
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
7569
7493
  );
7494
+ } else if (skill.source) {
7495
+ try {
7496
+ const files = await fetchFn(skill.source, {
7497
+ userAgent: "EvalForge-Evaluator"
7498
+ });
7499
+ await writeFilesToDirectory(skillDir, files);
7500
+ console.log(
7501
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
7502
+ );
7503
+ } catch (error) {
7504
+ const message = error instanceof Error ? error.message : "Unknown error";
7505
+ console.error(
7506
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
7507
+ );
7508
+ throw new Error(
7509
+ `Failed to write skill ${skillName} to filesystem: ${message}`
7510
+ );
7511
+ }
7512
+ } else {
7513
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
7570
7514
  }
7571
7515
  }
7572
7516
 
@@ -7584,7 +7528,7 @@ var import_crypto2 = require("crypto");
7584
7528
  // src/run-scenario/agents/claude-code/write-mcp.ts
7585
7529
  var import_promises5 = require("fs/promises");
7586
7530
  var import_path6 = require("path");
7587
- var import_evalforge_types3 = require("@wix/evalforge-types");
7531
+ var import_evalforge_types2 = require("@wix/evalforge-types");
7588
7532
 
7589
7533
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
7590
7534
  var import_promises4 = require("fs/promises");
@@ -7629,11 +7573,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
7629
7573
  }
7630
7574
 
7631
7575
  // src/run-scenario/agents/claude-code/write-mcp.ts
7632
- async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7576
+ async function writeMcpToFilesystem(cwd, mcps) {
7633
7577
  if (mcps.length === 0) return;
7634
7578
  const mcpServers = {};
7635
7579
  for (const mcp of mcps) {
7636
- const config = await resolveMcpConfig(mcp, fetchFn);
7580
+ const config = mcp.config;
7637
7581
  for (const [key, value] of Object.entries(config)) {
7638
7582
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
7639
7583
  throw new Error(
@@ -7645,7 +7589,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7645
7589
  }
7646
7590
  const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
7647
7591
  const content = JSON.stringify(
7648
- { [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
7592
+ { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
7649
7593
  null,
7650
7594
  2
7651
7595
  );
@@ -7657,6 +7601,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7657
7601
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
7658
7602
  var import_promises6 = require("fs/promises");
7659
7603
  var import_path7 = require("path");
7604
+ var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
7660
7605
  var AGENTS_DIR = ".claude/agents";
7661
7606
  function toAgentFilename(name, index, nameCount) {
7662
7607
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -7664,7 +7609,34 @@ function toAgentFilename(name, index, nameCount) {
7664
7609
  nameCount.set(base, count + 1);
7665
7610
  return count === 0 ? base : `${base}-${count + 1}`;
7666
7611
  }
7667
- async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7612
+ async function resolveSubAgentContent(agent, fetchFn) {
7613
+ if (agent.source) {
7614
+ try {
7615
+ const content = await fetchFn(agent.source, {
7616
+ userAgent: "EvalForge-Evaluator"
7617
+ });
7618
+ console.log(
7619
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
7620
+ );
7621
+ return content;
7622
+ } catch (error) {
7623
+ const message = error instanceof Error ? error.message : "Unknown error";
7624
+ console.error(
7625
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
7626
+ );
7627
+ throw new Error(
7628
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
7629
+ );
7630
+ }
7631
+ }
7632
+ if (!agent.subAgentMd) {
7633
+ console.warn(
7634
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7635
+ );
7636
+ }
7637
+ return agent.subAgentMd;
7638
+ }
7639
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
7668
7640
  if (subAgents.length === 0) return;
7669
7641
  const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
7670
7642
  await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
@@ -7672,7 +7644,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7672
7644
  for (const [i, agent] of subAgents.entries()) {
7673
7645
  const filename = toAgentFilename(agent.name, i, nameCount);
7674
7646
  const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
7675
- const content = await resolveSubAgentMd(agent, fetchFn);
7647
+ const content = await resolveSubAgentContent(agent, fetchFn);
7676
7648
  await (0, import_promises6.writeFile)(filePath, content, "utf8");
7677
7649
  }
7678
7650
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -7722,19 +7694,18 @@ function validateGenericDirectory(dir, cwd) {
7722
7694
  }
7723
7695
  return trimmed;
7724
7696
  }
7725
- async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7697
+ async function writeRulesToFilesystem(cwd, rules) {
7726
7698
  if (rules.length === 0) return;
7727
7699
  const nameCount = /* @__PURE__ */ new Map();
7728
7700
  let hasCursorRules = false;
7729
7701
  for (const [i, rule] of rules.entries()) {
7730
- const content = await resolveRuleText(rule, fetchFn);
7731
7702
  switch (rule.ruleType) {
7732
7703
  case "claude-md": {
7733
- await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
7704
+ await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
7734
7705
  break;
7735
7706
  }
7736
7707
  case "agents-md": {
7737
- await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
7708
+ await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
7738
7709
  break;
7739
7710
  }
7740
7711
  case "cursor-rule": {
@@ -7744,7 +7715,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7744
7715
  }
7745
7716
  const filename = toRuleFilename(rule.name, i, nameCount);
7746
7717
  const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
7747
- await (0, import_promises7.writeFile)(filePath, content, "utf8");
7718
+ await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
7748
7719
  break;
7749
7720
  }
7750
7721
  case "generic": {
@@ -7755,7 +7726,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7755
7726
  const dirPath = (0, import_path8.join)(cwd, directory);
7756
7727
  await (0, import_promises7.mkdir)(dirPath, { recursive: true });
7757
7728
  const filename = toRuleFilename(rule.name, i, nameCount);
7758
- await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
7729
+ await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
7759
7730
  break;
7760
7731
  }
7761
7732
  default: {
@@ -7845,14 +7816,14 @@ function buildConversation(timestampedMessages) {
7845
7816
  }
7846
7817
 
7847
7818
  // src/run-scenario/agents/shared/trace-emit.ts
7848
- var import_evalforge_types4 = require("@wix/evalforge-types");
7819
+ var import_evalforge_types3 = require("@wix/evalforge-types");
7849
7820
  function emitTraceEvent(event, pushEvent) {
7850
- console.log(`${import_evalforge_types4.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7821
+ console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7851
7822
  pushEvent?.(event);
7852
7823
  }
7853
7824
 
7854
7825
  // src/run-scenario/agents/claude-code/execute.ts
7855
- var DEFAULT_MODEL = import_evalforge_types5.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7826
+ var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7856
7827
  async function* buildPromptStream(triggerPrompt, images) {
7857
7828
  yield {
7858
7829
  type: "user",
@@ -7917,7 +7888,7 @@ function extractToolActionDescription(toolName, toolArgs) {
7917
7888
  return `Using ${toolName}...`;
7918
7889
  }
7919
7890
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7920
- let type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7891
+ let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7921
7892
  let toolName;
7922
7893
  let toolArgs;
7923
7894
  let outputPreview;
@@ -7925,28 +7896,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7925
7896
  let thinking;
7926
7897
  for (const block of message.message.content) {
7927
7898
  if (block.type === "tool_use") {
7928
- type = import_evalforge_types5.LiveTraceEventType.TOOL_USE;
7899
+ type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
7929
7900
  toolName = block.name;
7930
7901
  toolArgs = JSON.stringify(block.input).slice(0, 500);
7931
7902
  const input = block.input;
7932
7903
  if (input.file_path || input.path || input.target_file) {
7933
7904
  filePath = String(input.file_path || input.path || input.target_file);
7934
7905
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
7935
- type = import_evalforge_types5.LiveTraceEventType.FILE_WRITE;
7906
+ type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
7936
7907
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
7937
- type = import_evalforge_types5.LiveTraceEventType.FILE_READ;
7908
+ type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
7938
7909
  }
7939
7910
  }
7940
7911
  } else if (block.type === "text") {
7941
7912
  outputPreview = block.text.slice(0, 500);
7942
7913
  if (!toolName) {
7943
- type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7914
+ type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7944
7915
  }
7945
7916
  } else if (block.type === "thinking") {
7946
7917
  const thinkingBlock = block;
7947
7918
  thinking = thinkingBlock.thinking.slice(0, 500);
7948
7919
  if (!outputPreview && !toolName) {
7949
- type = import_evalforge_types5.LiveTraceEventType.THINKING;
7920
+ type = import_evalforge_types4.LiveTraceEventType.THINKING;
7950
7921
  }
7951
7922
  }
7952
7923
  }
@@ -8012,7 +7983,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8012
7983
  }
8013
7984
  return {
8014
7985
  ...baseEvent,
8015
- type: import_evalforge_types5.LiveTraceEventType.TOOL_RESULT,
7986
+ type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
8016
7987
  outputPreview: outputPreview || "(tool result)"
8017
7988
  };
8018
7989
  }
@@ -8020,7 +7991,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8020
7991
  const sysMsg = message;
8021
7992
  return {
8022
7993
  ...baseEvent,
8023
- type: import_evalforge_types5.LiveTraceEventType.SYSTEM,
7994
+ type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
8024
7995
  outputPreview: sysMsg.subtype || "system"
8025
7996
  };
8026
7997
  }
@@ -8029,7 +8000,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8029
8000
  }
8030
8001
  return {
8031
8002
  ...baseEvent,
8032
- type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
8003
+ type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8033
8004
  outputPreview: `Message type: ${message.type}`
8034
8005
  };
8035
8006
  }
@@ -8131,7 +8102,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8131
8102
  queryOptions.systemPrompt = {
8132
8103
  type: "preset",
8133
8104
  preset: "claude_code",
8134
- append: import_evalforge_types5.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8105
+ append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8135
8106
  };
8136
8107
  }
8137
8108
  if (options.temperature !== void 0) {
@@ -8166,7 +8137,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8166
8137
  targetId: traceContext.targetId,
8167
8138
  targetName: traceContext.targetName,
8168
8139
  stepNumber: 0,
8169
- type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8140
+ type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8170
8141
  outputPreview: JSON.stringify({
8171
8142
  event: "pre-sdk-execution",
8172
8143
  model: queryOptions.model,
@@ -8230,7 +8201,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8230
8201
  targetId: traceContext.targetId,
8231
8202
  targetName: traceContext.targetName,
8232
8203
  stepNumber: traceStepNumber,
8233
- type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
8204
+ type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8234
8205
  outputPreview: progressMessage,
8235
8206
  toolName: lastToolName,
8236
8207
  filePath: lastFilePath,
@@ -8267,18 +8238,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
8267
8238
  if (traceEvent) {
8268
8239
  lastToolName = traceEvent.toolName;
8269
8240
  lastFilePath = traceEvent.filePath;
8270
- if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.THINKING) {
8241
+ if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
8271
8242
  lastAction = "Thinking...";
8272
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.TOOL_USE) {
8243
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
8273
8244
  lastAction = extractToolActionDescription(
8274
8245
  traceEvent.toolName,
8275
8246
  traceEvent.toolArgs
8276
8247
  );
8277
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_WRITE) {
8248
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
8278
8249
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
8279
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_READ) {
8250
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
8280
8251
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
8281
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.COMPLETION) {
8252
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
8282
8253
  lastAction = "Processing response...";
8283
8254
  }
8284
8255
  emitTraceEvent(traceEvent, traceContext.pushEvent);
@@ -8456,7 +8427,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8456
8427
  targetId: traceContext.targetId,
8457
8428
  targetName: traceContext.targetName,
8458
8429
  stepNumber: traceStepNumber + 1,
8459
- type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8430
+ type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8460
8431
  outputPreview: JSON.stringify(
8461
8432
  {
8462
8433
  event: "sdk-execution-failed",
@@ -8490,7 +8461,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
8490
8461
  targetId: traceContext.targetId,
8491
8462
  targetName: traceContext.targetName,
8492
8463
  stepNumber: traceStepNumber + 1,
8493
- type: import_evalforge_types5.LiveTraceEventType.COMPLETION,
8464
+ type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
8494
8465
  outputPreview: "Scenario execution completed",
8495
8466
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
8496
8467
  isComplete: true
@@ -8765,7 +8736,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8765
8736
  stepNumber: 0,
8766
8737
  // renumbered below
8767
8738
  turnIndex,
8768
- type: import_evalforge_types5.LLMStepType.THINKING,
8739
+ type: import_evalforge_types4.LLMStepType.THINKING,
8769
8740
  model,
8770
8741
  provider: "anthropic",
8771
8742
  startedAt: step.startedAt.toISOString(),
@@ -8796,7 +8767,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8796
8767
  id: (0, import_crypto2.randomUUID)(),
8797
8768
  stepNumber: 0,
8798
8769
  turnIndex,
8799
- type: import_evalforge_types5.LLMStepType.TOOL_USE,
8770
+ type: import_evalforge_types4.LLMStepType.TOOL_USE,
8800
8771
  model,
8801
8772
  provider: "anthropic",
8802
8773
  startedAt: step.startedAt.toISOString(),
@@ -8826,7 +8797,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8826
8797
  id: (0, import_crypto2.randomUUID)(),
8827
8798
  stepNumber: 0,
8828
8799
  turnIndex,
8829
- type: import_evalforge_types5.LLMStepType.COMPLETION,
8800
+ type: import_evalforge_types4.LLMStepType.COMPLETION,
8830
8801
  model,
8831
8802
  provider: "anthropic",
8832
8803
  startedAt: step.startedAt.toISOString(),
@@ -8843,7 +8814,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8843
8814
  });
8844
8815
  }
8845
8816
  if (subSteps.length === 0) {
8846
- const stepType = hasThinking && !hasText ? import_evalforge_types5.LLMStepType.THINKING : import_evalforge_types5.LLMStepType.COMPLETION;
8817
+ const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
8847
8818
  subSteps.push({
8848
8819
  id: (0, import_crypto2.randomUUID)(),
8849
8820
  stepNumber: 0,
@@ -8913,7 +8884,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8913
8884
  var ClaudeCodeAdapter = class {
8914
8885
  id = "claude-code";
8915
8886
  name = "Claude Code";
8916
- supportedCommands = [import_evalforge_types6.AgentRunCommand.CLAUDE];
8887
+ supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
8917
8888
  /**
8918
8889
  * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
8919
8890
  * before the baseline snapshot is taken.
@@ -8945,9 +8916,9 @@ var ClaudeCodeAdapter = class {
8945
8916
  rules,
8946
8917
  systemPrompt
8947
8918
  } = context;
8948
- const typed = config ? import_evalforge_types6.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8919
+ const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8949
8920
  const cfg = typed?.success ? typed.data : void 0;
8950
- const schemaKeys = new Set(Object.keys(import_evalforge_types6.ClaudeCodeConfigSchema.shape));
8921
+ const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
8951
8922
  const extras = {};
8952
8923
  if (config) {
8953
8924
  for (const [key, value] of Object.entries(config)) {
@@ -9002,11 +8973,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
9002
8973
  defaultRegistry.register(claudeCodeAdapter);
9003
8974
 
9004
8975
  // src/run-scenario/agents/opencode/opencode-adapter.ts
9005
- var import_evalforge_types10 = require("@wix/evalforge-types");
8976
+ var import_evalforge_types9 = require("@wix/evalforge-types");
9006
8977
 
9007
8978
  // src/run-scenario/agents/opencode/execute.ts
9008
8979
  var import_child_process2 = require("child_process");
9009
- var import_evalforge_types9 = require("@wix/evalforge-types");
8980
+ var import_evalforge_types8 = require("@wix/evalforge-types");
9010
8981
 
9011
8982
  // src/run-scenario/agents/opencode/types.ts
9012
8983
  function tryParseJson(text) {
@@ -9020,28 +8991,49 @@ function tryParseJson(text) {
9020
8991
  // src/run-scenario/agents/opencode/write-skills.ts
9021
8992
  var import_promises8 = require("fs/promises");
9022
8993
  var import_path9 = require("path");
9023
- async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
8994
+ var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
8995
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
9024
8996
  await Promise.all(
9025
8997
  skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
9026
8998
  );
9027
8999
  }
9028
9000
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
9029
- const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skill.name);
9001
+ const skillName = skill.name;
9002
+ const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
9030
9003
  await (0, import_promises8.mkdir)(skillDir, { recursive: true });
9031
- try {
9032
- const files = await resolveSkillFiles(skill, fetchFn);
9033
- await writeFilesToDirectory(skillDir, files);
9034
- } catch (error) {
9035
- const message = error instanceof Error ? error.message : "Unknown error";
9036
- throw new Error(
9037
- `Failed to write skill ${skill.name} to filesystem: ${message}`
9004
+ const version = skill.latestVersion;
9005
+ if (version?.files && version.files.length > 0) {
9006
+ await writeFilesToDirectory(skillDir, version.files);
9007
+ console.log(
9008
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
9038
9009
  );
9010
+ } else if (skill.source) {
9011
+ try {
9012
+ const files = await fetchFn(skill.source, {
9013
+ userAgent: "EvalForge-Evaluator"
9014
+ });
9015
+ await writeFilesToDirectory(skillDir, files);
9016
+ console.log(
9017
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
9018
+ );
9019
+ } catch (error) {
9020
+ const message = error instanceof Error ? error.message : "Unknown error";
9021
+ console.error(
9022
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
9023
+ );
9024
+ throw new Error(
9025
+ `Failed to write skill ${skillName} to filesystem: ${message}`
9026
+ );
9027
+ }
9028
+ } else {
9029
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
9039
9030
  }
9040
9031
  }
9041
9032
 
9042
9033
  // src/run-scenario/agents/opencode/write-sub-agents.ts
9043
9034
  var import_promises9 = require("fs/promises");
9044
9035
  var import_path10 = require("path");
9036
+ var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
9045
9037
  var AGENTS_DIR2 = ".opencode/agents";
9046
9038
  function toAgentFilename2(name, index, nameCount) {
9047
9039
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -9049,7 +9041,34 @@ function toAgentFilename2(name, index, nameCount) {
9049
9041
  nameCount.set(base, count + 1);
9050
9042
  return count === 0 ? base : `${base}-${count + 1}`;
9051
9043
  }
9052
- async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9044
/**
 * Resolves the content that should be written to a sub-agent's file.
 *
 * If `agent.source` is set, the content is obtained by calling `fetchFn`
 * and any inline `agent.subAgentMd` is ignored. Otherwise the inline
 * `agent.subAgentMd` is used.
 *
 * @param {object} agent - Sub-agent definition. Reads `name`, `subAgentMd`
 *   and, when present, `source` (`owner`, `repo`, `path`, `ref` are used
 *   for the success log line).
 * @param {Function} fetchFn - Async fetcher, invoked as
 *   `fetchFn(agent.source, { userAgent: "EvalForge-Evaluator" })`.
 * @returns {Promise<string>} Whatever `fetchFn` resolves to for sourced
 *   agents (expected to be the file text); otherwise the inline content,
 *   or `""` when the inline content is missing.
 * @throws {Error} When `fetchFn` rejects; the original error is attached
 *   as `cause`.
 */
async function resolveSubAgentContent2(agent, fetchFn) {
  if (agent.source) {
    try {
      const content = await fetchFn(agent.source, {
        userAgent: "EvalForge-Evaluator"
      });
      console.log(
        `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
      );
      return content;
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
      );
      // Keep the original error reachable so its stack is not lost.
      throw new Error(
        `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`,
        { cause: error }
      );
    }
  }
  if (!agent.subAgentMd) {
    console.warn(
      `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
    );
  }
  // The caller hands this value to writeFile, which rejects undefined/null
  // data. Fall back to an empty string so the file really is written blank,
  // as the warning above promises.
  return agent.subAgentMd ?? "";
}
9071
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
9053
9072
  if (subAgents.length === 0) return;
9054
9073
  const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
9055
9074
  await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
@@ -9057,7 +9076,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9057
9076
  for (const [i, agent] of subAgents.entries()) {
9058
9077
  const filename = toAgentFilename2(agent.name, i, nameCount);
9059
9078
  const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
9060
- const content = await resolveSubAgentMd(agent, fetchFn);
9079
+ const content = await resolveSubAgentContent2(agent, fetchFn);
9061
9080
  await (0, import_promises9.writeFile)(filePath, content, "utf8");
9062
9081
  }
9063
9082
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -9065,8 +9084,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9065
9084
 
9066
9085
  // src/run-scenario/agents/opencode/config.ts
9067
9086
  var import_os3 = require("os");
9068
- var import_evalforge_types7 = require("@wix/evalforge-types");
9069
- var DEFAULT_MODEL2 = `${import_evalforge_types7.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9087
+ var import_evalforge_types6 = require("@wix/evalforge-types");
9088
+ var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9070
9089
  var OPENCODE_MODEL_ALIASES = {
9071
9090
  "claude-sonnet-4": "claude-sonnet-4-0",
9072
9091
  "claude-opus-4": "claude-opus-4-0"
@@ -9082,10 +9101,10 @@ function parseModel(model) {
9082
9101
  };
9083
9102
  }
9084
9103
  const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
9085
- const isOpenAI = import_evalforge_types7.AVAILABLE_OPENAI_MODEL_IDS.includes(
9104
+ const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
9086
9105
  model
9087
9106
  );
9088
- const isGemini = import_evalforge_types7.AVAILABLE_GEMINI_MODEL_IDS.includes(
9107
+ const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
9089
9108
  model
9090
9109
  );
9091
9110
  if (isGemini) return { providerID: "google", modelID };
@@ -9154,7 +9173,7 @@ async function buildOpenCodeEnv(options) {
9154
9173
  if (options.mcps && options.mcps.length > 0) {
9155
9174
  const mcpServers = {};
9156
9175
  for (const mcpEntity of options.mcps) {
9157
- const entityConfig = await resolveMcpConfig(mcpEntity);
9176
+ const entityConfig = mcpEntity.config;
9158
9177
  for (const [key, value] of Object.entries(entityConfig)) {
9159
9178
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
9160
9179
  throw new Error(
@@ -9179,7 +9198,7 @@ async function buildOpenCodeEnv(options) {
9179
9198
  if (options.maxTurns != null && options.maxTurns > 0) {
9180
9199
  agentOverrides.maxSteps = options.maxTurns;
9181
9200
  }
9182
- const parsed = options.config ? import_evalforge_types7.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9201
+ const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9183
9202
  const configPermission = parsed?.success ? parsed.data.permission : void 0;
9184
9203
  const defaultPermission = {
9185
9204
  "*": "allow"
@@ -9221,7 +9240,7 @@ async function buildOpenCodeEnv(options) {
9221
9240
  }
9222
9241
 
9223
9242
  // src/run-scenario/agents/opencode/build-trace.ts
9224
- var import_evalforge_types8 = require("@wix/evalforge-types");
9243
+ var import_evalforge_types7 = require("@wix/evalforge-types");
9225
9244
  var import_crypto3 = require("crypto");
9226
9245
  function toCanonicalModelId(modelId) {
9227
9246
  const slashIndex = modelId.indexOf("/");
@@ -9301,7 +9320,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9301
9320
  id: (0, import_crypto3.randomUUID)(),
9302
9321
  stepNumber: 0,
9303
9322
  turnIndex,
9304
- type: import_evalforge_types8.LLMStepType.THINKING,
9323
+ type: import_evalforge_types7.LLMStepType.THINKING,
9305
9324
  model: stepModel,
9306
9325
  provider: stepProvider,
9307
9326
  startedAt,
@@ -9330,7 +9349,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9330
9349
  id: (0, import_crypto3.randomUUID)(),
9331
9350
  stepNumber: 0,
9332
9351
  turnIndex,
9333
- type: import_evalforge_types8.LLMStepType.TOOL_USE,
9352
+ type: import_evalforge_types7.LLMStepType.TOOL_USE,
9334
9353
  model: stepModel,
9335
9354
  provider: stepProvider,
9336
9355
  startedAt,
@@ -9360,7 +9379,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9360
9379
  id: (0, import_crypto3.randomUUID)(),
9361
9380
  stepNumber: 0,
9362
9381
  turnIndex,
9363
- type: import_evalforge_types8.LLMStepType.COMPLETION,
9382
+ type: import_evalforge_types7.LLMStepType.COMPLETION,
9364
9383
  model: stepModel,
9365
9384
  provider: stepProvider,
9366
9385
  startedAt,
@@ -9377,7 +9396,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9377
9396
  });
9378
9397
  }
9379
9398
  if (subSteps.length === 0) {
9380
- const stepType = hasThinking && !hasText ? import_evalforge_types8.LLMStepType.THINKING : import_evalforge_types8.LLMStepType.COMPLETION;
9399
+ const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
9381
9400
  subSteps.push({
9382
9401
  id: (0, import_crypto3.randomUUID)(),
9383
9402
  stepNumber: 0,
@@ -9578,14 +9597,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9578
9597
  const te = evt;
9579
9598
  return {
9580
9599
  ...base,
9581
- type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
9600
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
9582
9601
  outputPreview: te.part.text.slice(0, 500)
9583
9602
  };
9584
9603
  }
9585
9604
  case "reasoning":
9586
9605
  return {
9587
9606
  ...base,
9588
- type: import_evalforge_types9.LiveTraceEventType.THINKING,
9607
+ type: import_evalforge_types8.LiveTraceEventType.THINKING,
9589
9608
  thinking: evt.part.text.slice(0, 500)
9590
9609
  };
9591
9610
  case "tool_use": {
@@ -9593,15 +9612,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9593
9612
  const toolName = tu.part.tool;
9594
9613
  const args = tu.part.state.input;
9595
9614
  const toolArgs = JSON.stringify(args).slice(0, 500);
9596
- let type = import_evalforge_types9.LiveTraceEventType.TOOL_USE;
9615
+ let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
9597
9616
  let filePath;
9598
9617
  if (args) {
9599
9618
  if (args.file_path || args.path || args.target_file) {
9600
9619
  filePath = String(args.file_path || args.path || args.target_file);
9601
9620
  if (/write|edit/i.test(toolName)) {
9602
- type = import_evalforge_types9.LiveTraceEventType.FILE_WRITE;
9621
+ type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
9603
9622
  } else if (/read|view/i.test(toolName)) {
9604
- type = import_evalforge_types9.LiveTraceEventType.FILE_READ;
9623
+ type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
9605
9624
  }
9606
9625
  }
9607
9626
  }
@@ -9610,7 +9629,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9610
9629
  case "step_finish":
9611
9630
  return {
9612
9631
  ...base,
9613
- type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9632
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9614
9633
  outputPreview: "Step completed"
9615
9634
  };
9616
9635
  default:
@@ -9641,7 +9660,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
9641
9660
  } else if (options.systemPrompt != null) {
9642
9661
  systemPrompt = options.systemPrompt;
9643
9662
  } else {
9644
- systemPrompt = import_evalforge_types9.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9663
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9645
9664
  }
9646
9665
  if (systemPrompt) {
9647
9666
  await writeSystemPromptRule(cwd, systemPrompt);
@@ -9833,7 +9852,7 @@ function spawnOpenCodeProcess(opts) {
9833
9852
  targetId: traceContext.targetId,
9834
9853
  targetName: traceContext.targetName,
9835
9854
  stepNumber: traceStepNumber,
9836
- type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9855
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9837
9856
  outputPreview: progressMessage,
9838
9857
  toolName: lastToolName,
9839
9858
  filePath: lastFilePath,
@@ -9867,18 +9886,18 @@ function spawnOpenCodeProcess(opts) {
9867
9886
  if (traceEvt) {
9868
9887
  lastToolName = traceEvt.toolName;
9869
9888
  lastFilePath = traceEvt.filePath;
9870
- if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.THINKING) {
9889
+ if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
9871
9890
  lastAction = "Thinking...";
9872
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.TOOL_USE) {
9891
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
9873
9892
  lastAction = extractToolAction(
9874
9893
  traceEvt.toolName ?? "",
9875
9894
  void 0
9876
9895
  );
9877
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_WRITE) {
9896
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
9878
9897
  lastAction = `Writing: ${traceEvt.filePath || "file"}`;
9879
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_READ) {
9898
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
9880
9899
  lastAction = `Reading: ${traceEvt.filePath || "file"}`;
9881
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.COMPLETION) {
9900
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
9882
9901
  lastAction = "Processing response...";
9883
9902
  }
9884
9903
  emitTraceEvent(traceEvt, traceContext.pushEvent);
@@ -9960,7 +9979,7 @@ async function executeWithOpenCode(skills, scenario, options) {
9960
9979
  targetId: traceContext.targetId,
9961
9980
  targetName: traceContext.targetName,
9962
9981
  stepNumber: 0,
9963
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
9982
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
9964
9983
  outputPreview: JSON.stringify({
9965
9984
  event: "pre-cli-execution",
9966
9985
  model: `${providerID}/${modelID}`,
@@ -10014,7 +10033,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10014
10033
  targetId: traceContext.targetId,
10015
10034
  targetName: traceContext.targetName,
10016
10035
  stepNumber: traceStepNumber + 1,
10017
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10036
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10018
10037
  outputPreview: JSON.stringify({
10019
10038
  event: "idle-timeout-retry",
10020
10039
  attempt,
@@ -10058,7 +10077,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10058
10077
  targetId: traceContext.targetId,
10059
10078
  targetName: traceContext.targetName,
10060
10079
  stepNumber: traceStepNumber + 1,
10061
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10080
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10062
10081
  outputPreview: JSON.stringify({
10063
10082
  event: "cli-execution-failed",
10064
10083
  error: lastAttemptResult.error?.message ?? "Unknown error",
@@ -10113,7 +10132,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10113
10132
  targetId: traceContext.targetId,
10114
10133
  targetName: traceContext.targetName,
10115
10134
  stepNumber: traceStepNumber + 1,
10116
- type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
10135
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
10117
10136
  outputPreview: "Scenario execution completed",
10118
10137
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10119
10138
  isComplete: true
@@ -10150,7 +10169,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10150
10169
  var OpenCodeAdapter = class {
10151
10170
  id = "opencode";
10152
10171
  name = "OpenCode";
10153
- supportedCommands = [import_evalforge_types10.AgentRunCommand.OPENCODE];
10172
+ supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
10154
10173
  async prepareEnvironment(context) {
10155
10174
  await prepareOpenCodeEnvironment(context.cwd, context.skills, {
10156
10175
  mcps: context.mcps,
@@ -10173,7 +10192,7 @@ var OpenCodeAdapter = class {
10173
10192
  rules,
10174
10193
  systemPrompt
10175
10194
  } = context;
10176
- const typed = config ? import_evalforge_types10.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10195
+ const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10177
10196
  const cfg = typed?.success ? typed.data : void 0;
10178
10197
  const rawMaxTurns = cfg?.maxTurns;
10179
10198
  const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
@@ -10223,7 +10242,7 @@ var import_ai = require("ai");
10223
10242
  var import_anthropic = require("@ai-sdk/anthropic");
10224
10243
  var import_google = require("@ai-sdk/google");
10225
10244
  var import_openai = require("@ai-sdk/openai");
10226
- var import_evalforge_types12 = require("@wix/evalforge-types");
10245
+ var import_evalforge_types11 = require("@wix/evalforge-types");
10227
10246
  var import_crypto4 = require("crypto");
10228
10247
 
10229
10248
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
@@ -10320,7 +10339,7 @@ function extractErrorText(content) {
10320
10339
  }
10321
10340
 
10322
10341
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
10323
- var import_evalforge_types11 = require("@wix/evalforge-types");
10342
+ var import_evalforge_types10 = require("@wix/evalforge-types");
10324
10343
  var PROVIDER_ANTHROPIC = "anthropic";
10325
10344
  var PROVIDER_GEMINI = "gemini";
10326
10345
  var MODEL_PRICING = {
@@ -10389,7 +10408,7 @@ function extractGatewayCost(step, provider) {
10389
10408
  }
10390
10409
  }
10391
10410
  function calculateFromPricing(modelId, tokenUsage) {
10392
- const normalized = (0, import_evalforge_types11.normalizeModelId)(modelId);
10411
+ const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
10393
10412
  const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
10394
10413
  if (!pricing) return 0;
10395
10414
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
@@ -10482,7 +10501,7 @@ function createModel(modelId, baseUrl, headers) {
10482
10501
  apiKey: "proxy-auth",
10483
10502
  headers
10484
10503
  });
10485
- if ([...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10504
+ if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10486
10505
  (id) => modelId === id || modelId.startsWith(id)
10487
10506
  )) {
10488
10507
  return openai.responses(modelId);
@@ -10490,12 +10509,12 @@ function createModel(modelId, baseUrl, headers) {
10490
10509
  return openai.chat(modelId);
10491
10510
  }
10492
10511
  function isClaudeModelId(modelId) {
10493
- return import_evalforge_types12.AVAILABLE_CLAUDE_MODEL_IDS.some(
10512
+ return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
10494
10513
  (id) => modelId === id || modelId.startsWith(id)
10495
10514
  );
10496
10515
  }
10497
10516
  function isGeminiModelId(modelId) {
10498
- return import_evalforge_types12.AVAILABLE_GEMINI_MODEL_IDS.some(
10517
+ return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
10499
10518
  (id) => modelId === id || modelId.startsWith(id)
10500
10519
  );
10501
10520
  }
@@ -10515,9 +10534,9 @@ async function executeWithAiSdk(context) {
10515
10534
  mcps,
10516
10535
  traceContext
10517
10536
  } = context;
10518
- const typed = config ? import_evalforge_types12.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10537
+ const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10519
10538
  const cfg = typed?.success ? typed.data : void 0;
10520
- const schemaKeys = new Set(Object.keys(import_evalforge_types12.SimpleAgentConfigSchema.shape));
10539
+ const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
10521
10540
  const configExtras = {};
10522
10541
  if (config) {
10523
10542
  for (const [key, value] of Object.entries(config)) {
@@ -10554,11 +10573,11 @@ async function executeWithAiSdk(context) {
10554
10573
  }, SDK_TIMEOUT_MS);
10555
10574
  try {
10556
10575
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
10557
- const isResponsesAPI = [...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10576
+ const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10558
10577
  (id) => modelId === id || modelId.startsWith(id)
10559
10578
  );
10560
10579
  const isGemini = provider === PROVIDER_GEMINI2;
10561
- const isGeminiThinking = isGemini && import_evalforge_types12.GEMINI_THINKING_MODEL_IDS.has(modelId);
10580
+ const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
10562
10581
  const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
10563
10582
  const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
10564
10583
  const reasoningEffort = cfg.reasoningEffort ?? "high";
@@ -10637,7 +10656,7 @@ async function executeWithAiSdk(context) {
10637
10656
  targetId: traceContext.targetId,
10638
10657
  targetName: traceContext.targetName,
10639
10658
  stepNumber: stepTimestamps.length,
10640
- type: isToolStep ? import_evalforge_types12.LiveTraceEventType.TOOL_USE : import_evalforge_types12.LiveTraceEventType.COMPLETION,
10659
+ type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
10641
10660
  toolName: firstToolCall?.toolName,
10642
10661
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
10643
10662
  outputPreview: step.text?.slice(0, 500),
@@ -10842,7 +10861,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
10842
10861
  id: (0, import_crypto4.randomUUID)(),
10843
10862
  stepNumber: i + 1,
10844
10863
  turnIndex: i,
10845
- type: step.toolCalls.length > 0 ? import_evalforge_types12.LLMStepType.TOOL_USE : import_evalforge_types12.LLMStepType.COMPLETION,
10864
+ type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
10846
10865
  model: modelId,
10847
10866
  provider,
10848
10867
  startedAt: new Date(stepStartedAt).toISOString(),
@@ -10892,7 +10911,7 @@ function emitStartEvent(traceContext, startTime) {
10892
10911
  targetId: traceContext.targetId,
10893
10912
  targetName: traceContext.targetName,
10894
10913
  stepNumber: 0,
10895
- type: import_evalforge_types12.LiveTraceEventType.PROGRESS,
10914
+ type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
10896
10915
  outputPreview: "Starting Simple Agent execution...",
10897
10916
  elapsedMs: Date.now() - startTime,
10898
10917
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -10910,7 +10929,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
10910
10929
  targetId: traceContext.targetId,
10911
10930
  targetName: traceContext.targetName,
10912
10931
  stepNumber,
10913
- type: import_evalforge_types12.LiveTraceEventType.COMPLETION,
10932
+ type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
10914
10933
  outputPreview: "Scenario execution completed",
10915
10934
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10916
10935
  isComplete: true
@@ -11680,11 +11699,11 @@ function substituteVariables(prompt, variables) {
11680
11699
  }
11681
11700
 
11682
11701
  // src/run-scenario/run-agent-with-context.ts
11683
- var import_evalforge_types13 = require("@wix/evalforge-types");
11684
- var DEFAULT_AGENT_COMMAND = import_evalforge_types13.AgentRunCommand.CLAUDE;
11702
+ var import_evalforge_types12 = require("@wix/evalforge-types");
11703
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
11685
11704
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
11686
11705
  const agent = evalData.agent ?? void 0;
11687
- const isSDK = agent?.agentType === import_evalforge_types13.AgentType.SDK;
11706
+ const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
11688
11707
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
11689
11708
  const adapter = getAdapter(identifier);
11690
11709
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -11769,14 +11788,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11769
11788
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
11770
11789
  if (template) {
11771
11790
  console.log(
11772
- (0, import_evalforge_types14.formatTraceEventLine)({
11791
+ (0, import_evalforge_types13.formatTraceEventLine)({
11773
11792
  evalRunId: evalRunId2,
11774
11793
  scenarioId: scenario.id,
11775
11794
  scenarioName: scenario.name,
11776
11795
  targetId,
11777
11796
  targetName,
11778
11797
  stepNumber: 0,
11779
- type: import_evalforge_types14.LiveTraceEventType.PROGRESS,
11798
+ type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
11780
11799
  outputPreview: "Setting up environment (installing dependencies)...",
11781
11800
  elapsedMs: 0,
11782
11801
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -11816,7 +11835,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11816
11835
  })),
11817
11836
  durationMs: partialResult.duration
11818
11837
  };
11819
- const defaultJudgeModel = import_evalforge_types14.DEFAULT_JUDGE_MODEL;
11838
+ const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
11820
11839
  const assertionContext = {
11821
11840
  workDir,
11822
11841
  defaultJudgeModel,
@@ -11831,10 +11850,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11831
11850
  assertionContext
11832
11851
  ) : [];
11833
11852
  const passed = assertionResults.filter(
11834
- (r) => r.status === import_evalforge_types14.AssertionResultStatus.PASSED
11853
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
11835
11854
  ).length;
11836
11855
  const failed = assertionResults.filter(
11837
- (r) => r.status === import_evalforge_types14.AssertionResultStatus.FAILED
11856
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
11838
11857
  ).length;
11839
11858
  const total = assertionResults.length;
11840
11859
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -11910,7 +11929,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
11910
11929
  }
11911
11930
 
11912
11931
  // src/error-reporter.ts
11913
- var import_evalforge_types15 = require("@wix/evalforge-types");
11932
+ var import_evalforge_types14 = require("@wix/evalforge-types");
11914
11933
  function formatError(error, phase, context) {
11915
11934
  const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
11916
11935
  if (error instanceof Error) {
@@ -12153,7 +12172,7 @@ async function runEvaluation(projectId2, evalRunId2) {
12153
12172
  totalExecutions
12154
12173
  };
12155
12174
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
12156
- const finalStatus = allFailed ? import_evalforge_types16.EvalStatus.FAILED : import_evalforge_types16.EvalStatus.COMPLETED;
12175
+ const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
12157
12176
  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
12158
12177
  firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
12159
12178
  ) : void 0;
@@ -12207,7 +12226,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12207
12226
  grpcAuthToken: config.grpcAuthToken
12208
12227
  });
12209
12228
  await api.updateEvalRun(projectId, evalRunId, {
12210
- status: import_evalforge_types16.EvalStatus.FAILED,
12229
+ status: import_evalforge_types15.EvalStatus.FAILED,
12211
12230
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12212
12231
  jobError,
12213
12232
  jobStatus: "FAILED"
@@ -12232,7 +12251,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12232
12251
  grpcAuthToken
12233
12252
  });
12234
12253
  await api.updateEvalRun(projectId, evalRunId, {
12235
- status: import_evalforge_types16.EvalStatus.FAILED,
12254
+ status: import_evalforge_types15.EvalStatus.FAILED,
12236
12255
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12237
12256
  jobError: `Config load failed, then: ${jobError}`,
12238
12257
  jobStatus: "FAILED"