@wix/evalforge-evaluator 0.182.0 → 0.183.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
5226
5226
  });
5227
5227
 
5228
5228
  // src/index.ts
5229
- var import_evalforge_types15 = require("@wix/evalforge-types");
5229
+ var import_evalforge_types16 = require("@wix/evalforge-types");
5230
5230
 
5231
5231
  // src/config.ts
5232
5232
  function loadConfig() {
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
7115
7115
  }
7116
7116
 
7117
7117
  // src/run-scenario/index.ts
7118
- var import_evalforge_types13 = require("@wix/evalforge-types");
7118
+ var import_evalforge_types14 = require("@wix/evalforge-types");
7119
7119
  var import_eval_assertions = require("@wix/eval-assertions");
7120
7120
 
7121
7121
  // src/run-scenario/environment.ts
@@ -7451,50 +7451,122 @@ function getAdapter(identifier) {
7451
7451
  }
7452
7452
 
7453
7453
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
7454
- var import_evalforge_types5 = require("@wix/evalforge-types");
7454
+ var import_evalforge_types6 = require("@wix/evalforge-types");
7455
7455
 
7456
7456
  // src/run-scenario/agents/claude-code/execute.ts
7457
- var import_evalforge_types4 = require("@wix/evalforge-types");
7457
+ var import_evalforge_types5 = require("@wix/evalforge-types");
7458
7458
 
7459
7459
  // src/run-scenario/agents/claude-code/write-skills.ts
7460
7460
  var import_promises3 = require("fs/promises");
7461
7461
  var import_path4 = require("path");
7462
+
7463
+ // src/run-scenario/agents/shared/resolve-capability-content.ts
7462
7464
  var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
7463
- async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7465
+ var import_evalforge_types2 = require("@wix/evalforge-types");
7466
+ var USER_AGENT = "EvalForge-Evaluator";
7467
+ async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7468
+ const version = skill.latestVersion;
7469
+ if (version?.files && version.files.length > 0) {
7470
+ console.log(
7471
+ `[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
7472
+ );
7473
+ return version.files;
7474
+ }
7475
+ if (skill.source) {
7476
+ const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
7477
+ console.log(
7478
+ `[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
7479
+ );
7480
+ return files;
7481
+ }
7482
+ throw new Error(`Skill ${skill.name} has no files and no source configured`);
7483
+ }
7484
+ async function fetchSourceFile(label, noun, name, source, fetchFn) {
7485
+ try {
7486
+ const content = await fetchFn(source, { userAgent: USER_AGENT });
7487
+ console.log(
7488
+ `[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
7489
+ );
7490
+ return content;
7491
+ } catch (error) {
7492
+ const message = error instanceof Error ? error.message : "Unknown error";
7493
+ console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
7494
+ throw new Error(
7495
+ `Failed to fetch ${noun} "${name}" from GitHub: ${message}`
7496
+ );
7497
+ }
7498
+ }
7499
+ async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7500
+ if (agent.source) {
7501
+ return fetchSourceFile(
7502
+ "SubAgents",
7503
+ "sub-agent",
7504
+ agent.name,
7505
+ agent.source,
7506
+ fetchFn
7507
+ );
7508
+ }
7509
+ if (!agent.subAgentMd) {
7510
+ console.warn(
7511
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7512
+ );
7513
+ }
7514
+ return agent.subAgentMd;
7515
+ }
7516
+ async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7517
+ if (!rule.source) {
7518
+ return rule.content;
7519
+ }
7520
+ return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
7521
+ }
7522
+ async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7523
+ if (!mcp.source) {
7524
+ return mcp.config;
7525
+ }
7526
+ const raw = await fetchSourceFile(
7527
+ "MCP",
7528
+ "MCP",
7529
+ mcp.name,
7530
+ mcp.source,
7531
+ fetchFn
7532
+ );
7533
+ let parsed;
7534
+ try {
7535
+ parsed = JSON.parse(raw);
7536
+ } catch (error) {
7537
+ const message = error instanceof Error ? error.message : "Unknown error";
7538
+ throw new Error(
7539
+ `MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
7540
+ );
7541
+ }
7542
+ if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
7543
+ throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
7544
+ }
7545
+ const obj = parsed;
7546
+ const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
7547
+ if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
7548
+ return servers;
7549
+ }
7550
+ return obj;
7551
+ }
7552
+
7553
+ // src/run-scenario/agents/claude-code/write-skills.ts
7554
+ async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
7464
7555
  await Promise.all(
7465
7556
  skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
7466
7557
  );
7467
7558
  }
7468
- async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7469
- const skillName = skill.name;
7470
- const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
7559
+ async function writeSkillToFilesystem(cwd, skill, fetchFn) {
7560
+ const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skill.name);
7471
7561
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7472
- const version = skill.latestVersion;
7473
- if (version?.files && version.files.length > 0) {
7474
- await writeFilesToDirectory(skillDir, version.files);
7475
- console.log(
7476
- `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
7562
+ try {
7563
+ const files = await resolveSkillFiles(skill, fetchFn);
7564
+ await writeFilesToDirectory(skillDir, files);
7565
+ } catch (error) {
7566
+ const message = error instanceof Error ? error.message : "Unknown error";
7567
+ throw new Error(
7568
+ `Failed to write skill ${skill.name} to filesystem: ${message}`
7477
7569
  );
7478
- } else if (skill.source) {
7479
- try {
7480
- const files = await fetchFn(skill.source, {
7481
- userAgent: "EvalForge-Evaluator"
7482
- });
7483
- await writeFilesToDirectory(skillDir, files);
7484
- console.log(
7485
- `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
7486
- );
7487
- } catch (error) {
7488
- const message = error instanceof Error ? error.message : "Unknown error";
7489
- console.error(
7490
- `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
7491
- );
7492
- throw new Error(
7493
- `Failed to write skill ${skillName} to filesystem: ${message}`
7494
- );
7495
- }
7496
- } else {
7497
- throw new Error(`Skill ${skillName} has no files and no source configured`);
7498
7570
  }
7499
7571
  }
7500
7572
 
@@ -7512,7 +7584,7 @@ var import_crypto2 = require("crypto");
7512
7584
  // src/run-scenario/agents/claude-code/write-mcp.ts
7513
7585
  var import_promises5 = require("fs/promises");
7514
7586
  var import_path6 = require("path");
7515
- var import_evalforge_types2 = require("@wix/evalforge-types");
7587
+ var import_evalforge_types3 = require("@wix/evalforge-types");
7516
7588
 
7517
7589
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
7518
7590
  var import_promises4 = require("fs/promises");
@@ -7557,11 +7629,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
7557
7629
  }
7558
7630
 
7559
7631
  // src/run-scenario/agents/claude-code/write-mcp.ts
7560
- async function writeMcpToFilesystem(cwd, mcps) {
7632
+ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7561
7633
  if (mcps.length === 0) return;
7562
7634
  const mcpServers = {};
7563
7635
  for (const mcp of mcps) {
7564
- const config = mcp.config;
7636
+ const config = await resolveMcpConfig(mcp, fetchFn);
7565
7637
  for (const [key, value] of Object.entries(config)) {
7566
7638
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
7567
7639
  throw new Error(
@@ -7573,7 +7645,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
7573
7645
  }
7574
7646
  const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
7575
7647
  const content = JSON.stringify(
7576
- { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
7648
+ { [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
7577
7649
  null,
7578
7650
  2
7579
7651
  );
@@ -7585,7 +7657,6 @@ async function writeMcpToFilesystem(cwd, mcps) {
7585
7657
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
7586
7658
  var import_promises6 = require("fs/promises");
7587
7659
  var import_path7 = require("path");
7588
- var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
7589
7660
  var AGENTS_DIR = ".claude/agents";
7590
7661
  function toAgentFilename(name, index, nameCount) {
7591
7662
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -7593,34 +7664,7 @@ function toAgentFilename(name, index, nameCount) {
7593
7664
  nameCount.set(base, count + 1);
7594
7665
  return count === 0 ? base : `${base}-${count + 1}`;
7595
7666
  }
7596
- async function resolveSubAgentContent(agent, fetchFn) {
7597
- if (agent.source) {
7598
- try {
7599
- const content = await fetchFn(agent.source, {
7600
- userAgent: "EvalForge-Evaluator"
7601
- });
7602
- console.log(
7603
- `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
7604
- );
7605
- return content;
7606
- } catch (error) {
7607
- const message = error instanceof Error ? error.message : "Unknown error";
7608
- console.error(
7609
- `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
7610
- );
7611
- throw new Error(
7612
- `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
7613
- );
7614
- }
7615
- }
7616
- if (!agent.subAgentMd) {
7617
- console.warn(
7618
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7619
- );
7620
- }
7621
- return agent.subAgentMd;
7622
- }
7623
- async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
7667
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7624
7668
  if (subAgents.length === 0) return;
7625
7669
  const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
7626
7670
  await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
@@ -7628,7 +7672,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
7628
7672
  for (const [i, agent] of subAgents.entries()) {
7629
7673
  const filename = toAgentFilename(agent.name, i, nameCount);
7630
7674
  const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
7631
- const content = await resolveSubAgentContent(agent, fetchFn);
7675
+ const content = await resolveSubAgentMd(agent, fetchFn);
7632
7676
  await (0, import_promises6.writeFile)(filePath, content, "utf8");
7633
7677
  }
7634
7678
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -7678,18 +7722,19 @@ function validateGenericDirectory(dir, cwd) {
7678
7722
  }
7679
7723
  return trimmed;
7680
7724
  }
7681
- async function writeRulesToFilesystem(cwd, rules) {
7725
+ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7682
7726
  if (rules.length === 0) return;
7683
7727
  const nameCount = /* @__PURE__ */ new Map();
7684
7728
  let hasCursorRules = false;
7685
7729
  for (const [i, rule] of rules.entries()) {
7730
+ const content = await resolveRuleText(rule, fetchFn);
7686
7731
  switch (rule.ruleType) {
7687
7732
  case "claude-md": {
7688
- await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
7733
+ await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
7689
7734
  break;
7690
7735
  }
7691
7736
  case "agents-md": {
7692
- await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
7737
+ await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
7693
7738
  break;
7694
7739
  }
7695
7740
  case "cursor-rule": {
@@ -7699,7 +7744,7 @@ async function writeRulesToFilesystem(cwd, rules) {
7699
7744
  }
7700
7745
  const filename = toRuleFilename(rule.name, i, nameCount);
7701
7746
  const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
7702
- await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
7747
+ await (0, import_promises7.writeFile)(filePath, content, "utf8");
7703
7748
  break;
7704
7749
  }
7705
7750
  case "generic": {
@@ -7710,7 +7755,7 @@ async function writeRulesToFilesystem(cwd, rules) {
7710
7755
  const dirPath = (0, import_path8.join)(cwd, directory);
7711
7756
  await (0, import_promises7.mkdir)(dirPath, { recursive: true });
7712
7757
  const filename = toRuleFilename(rule.name, i, nameCount);
7713
- await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
7758
+ await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
7714
7759
  break;
7715
7760
  }
7716
7761
  default: {
@@ -7800,14 +7845,14 @@ function buildConversation(timestampedMessages) {
7800
7845
  }
7801
7846
 
7802
7847
  // src/run-scenario/agents/shared/trace-emit.ts
7803
- var import_evalforge_types3 = require("@wix/evalforge-types");
7848
+ var import_evalforge_types4 = require("@wix/evalforge-types");
7804
7849
  function emitTraceEvent(event, pushEvent) {
7805
- console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7850
+ console.log(`${import_evalforge_types4.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7806
7851
  pushEvent?.(event);
7807
7852
  }
7808
7853
 
7809
7854
  // src/run-scenario/agents/claude-code/execute.ts
7810
- var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7855
+ var DEFAULT_MODEL = import_evalforge_types5.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7811
7856
  async function* buildPromptStream(triggerPrompt, images) {
7812
7857
  yield {
7813
7858
  type: "user",
@@ -7872,7 +7917,7 @@ function extractToolActionDescription(toolName, toolArgs) {
7872
7917
  return `Using ${toolName}...`;
7873
7918
  }
7874
7919
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7875
- let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7920
+ let type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7876
7921
  let toolName;
7877
7922
  let toolArgs;
7878
7923
  let outputPreview;
@@ -7880,28 +7925,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7880
7925
  let thinking;
7881
7926
  for (const block of message.message.content) {
7882
7927
  if (block.type === "tool_use") {
7883
- type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
7928
+ type = import_evalforge_types5.LiveTraceEventType.TOOL_USE;
7884
7929
  toolName = block.name;
7885
7930
  toolArgs = JSON.stringify(block.input).slice(0, 500);
7886
7931
  const input = block.input;
7887
7932
  if (input.file_path || input.path || input.target_file) {
7888
7933
  filePath = String(input.file_path || input.path || input.target_file);
7889
7934
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
7890
- type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
7935
+ type = import_evalforge_types5.LiveTraceEventType.FILE_WRITE;
7891
7936
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
7892
- type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
7937
+ type = import_evalforge_types5.LiveTraceEventType.FILE_READ;
7893
7938
  }
7894
7939
  }
7895
7940
  } else if (block.type === "text") {
7896
7941
  outputPreview = block.text.slice(0, 500);
7897
7942
  if (!toolName) {
7898
- type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7943
+ type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7899
7944
  }
7900
7945
  } else if (block.type === "thinking") {
7901
7946
  const thinkingBlock = block;
7902
7947
  thinking = thinkingBlock.thinking.slice(0, 500);
7903
7948
  if (!outputPreview && !toolName) {
7904
- type = import_evalforge_types4.LiveTraceEventType.THINKING;
7949
+ type = import_evalforge_types5.LiveTraceEventType.THINKING;
7905
7950
  }
7906
7951
  }
7907
7952
  }
@@ -7967,7 +8012,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
7967
8012
  }
7968
8013
  return {
7969
8014
  ...baseEvent,
7970
- type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
8015
+ type: import_evalforge_types5.LiveTraceEventType.TOOL_RESULT,
7971
8016
  outputPreview: outputPreview || "(tool result)"
7972
8017
  };
7973
8018
  }
@@ -7975,7 +8020,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
7975
8020
  const sysMsg = message;
7976
8021
  return {
7977
8022
  ...baseEvent,
7978
- type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
8023
+ type: import_evalforge_types5.LiveTraceEventType.SYSTEM,
7979
8024
  outputPreview: sysMsg.subtype || "system"
7980
8025
  };
7981
8026
  }
@@ -7984,7 +8029,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
7984
8029
  }
7985
8030
  return {
7986
8031
  ...baseEvent,
7987
- type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8032
+ type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
7988
8033
  outputPreview: `Message type: ${message.type}`
7989
8034
  };
7990
8035
  }
@@ -8086,7 +8131,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8086
8131
  queryOptions.systemPrompt = {
8087
8132
  type: "preset",
8088
8133
  preset: "claude_code",
8089
- append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8134
+ append: import_evalforge_types5.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8090
8135
  };
8091
8136
  }
8092
8137
  if (options.temperature !== void 0) {
@@ -8121,7 +8166,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8121
8166
  targetId: traceContext.targetId,
8122
8167
  targetName: traceContext.targetName,
8123
8168
  stepNumber: 0,
8124
- type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8169
+ type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8125
8170
  outputPreview: JSON.stringify({
8126
8171
  event: "pre-sdk-execution",
8127
8172
  model: queryOptions.model,
@@ -8185,7 +8230,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8185
8230
  targetId: traceContext.targetId,
8186
8231
  targetName: traceContext.targetName,
8187
8232
  stepNumber: traceStepNumber,
8188
- type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8233
+ type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
8189
8234
  outputPreview: progressMessage,
8190
8235
  toolName: lastToolName,
8191
8236
  filePath: lastFilePath,
@@ -8222,18 +8267,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
8222
8267
  if (traceEvent) {
8223
8268
  lastToolName = traceEvent.toolName;
8224
8269
  lastFilePath = traceEvent.filePath;
8225
- if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
8270
+ if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.THINKING) {
8226
8271
  lastAction = "Thinking...";
8227
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
8272
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.TOOL_USE) {
8228
8273
  lastAction = extractToolActionDescription(
8229
8274
  traceEvent.toolName,
8230
8275
  traceEvent.toolArgs
8231
8276
  );
8232
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
8277
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_WRITE) {
8233
8278
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
8234
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
8279
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_READ) {
8235
8280
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
8236
- } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
8281
+ } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.COMPLETION) {
8237
8282
  lastAction = "Processing response...";
8238
8283
  }
8239
8284
  emitTraceEvent(traceEvent, traceContext.pushEvent);
@@ -8411,7 +8456,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8411
8456
  targetId: traceContext.targetId,
8412
8457
  targetName: traceContext.targetName,
8413
8458
  stepNumber: traceStepNumber + 1,
8414
- type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8459
+ type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8415
8460
  outputPreview: JSON.stringify(
8416
8461
  {
8417
8462
  event: "sdk-execution-failed",
@@ -8445,7 +8490,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
8445
8490
  targetId: traceContext.targetId,
8446
8491
  targetName: traceContext.targetName,
8447
8492
  stepNumber: traceStepNumber + 1,
8448
- type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
8493
+ type: import_evalforge_types5.LiveTraceEventType.COMPLETION,
8449
8494
  outputPreview: "Scenario execution completed",
8450
8495
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
8451
8496
  isComplete: true
@@ -8717,7 +8762,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8717
8762
  stepNumber: 0,
8718
8763
  // renumbered below
8719
8764
  turnIndex,
8720
- type: import_evalforge_types4.LLMStepType.THINKING,
8765
+ type: import_evalforge_types5.LLMStepType.THINKING,
8721
8766
  model,
8722
8767
  provider: "anthropic",
8723
8768
  startedAt: step.startedAt.toISOString(),
@@ -8746,7 +8791,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8746
8791
  id: (0, import_crypto2.randomUUID)(),
8747
8792
  stepNumber: 0,
8748
8793
  turnIndex,
8749
- type: import_evalforge_types4.LLMStepType.TOOL_USE,
8794
+ type: import_evalforge_types5.LLMStepType.TOOL_USE,
8750
8795
  model,
8751
8796
  provider: "anthropic",
8752
8797
  startedAt: step.startedAt.toISOString(),
@@ -8776,7 +8821,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8776
8821
  id: (0, import_crypto2.randomUUID)(),
8777
8822
  stepNumber: 0,
8778
8823
  turnIndex,
8779
- type: import_evalforge_types4.LLMStepType.COMPLETION,
8824
+ type: import_evalforge_types5.LLMStepType.COMPLETION,
8780
8825
  model,
8781
8826
  provider: "anthropic",
8782
8827
  startedAt: step.startedAt.toISOString(),
@@ -8793,7 +8838,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8793
8838
  });
8794
8839
  }
8795
8840
  if (subSteps.length === 0) {
8796
- const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
8841
+ const stepType = hasThinking && !hasText ? import_evalforge_types5.LLMStepType.THINKING : import_evalforge_types5.LLMStepType.COMPLETION;
8797
8842
  subSteps.push({
8798
8843
  id: (0, import_crypto2.randomUUID)(),
8799
8844
  stepNumber: 0,
@@ -8863,7 +8908,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8863
8908
  var ClaudeCodeAdapter = class {
8864
8909
  id = "claude-code";
8865
8910
  name = "Claude Code";
8866
- supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
8911
+ supportedCommands = [import_evalforge_types6.AgentRunCommand.CLAUDE];
8867
8912
  /**
8868
8913
  * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
8869
8914
  * before the baseline snapshot is taken.
@@ -8895,9 +8940,9 @@ var ClaudeCodeAdapter = class {
8895
8940
  rules,
8896
8941
  systemPrompt
8897
8942
  } = context;
8898
- const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8943
+ const typed = config ? import_evalforge_types6.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8899
8944
  const cfg = typed?.success ? typed.data : void 0;
8900
- const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
8945
+ const schemaKeys = new Set(Object.keys(import_evalforge_types6.ClaudeCodeConfigSchema.shape));
8901
8946
  const extras = {};
8902
8947
  if (config) {
8903
8948
  for (const [key, value] of Object.entries(config)) {
@@ -8952,11 +8997,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
8952
8997
  defaultRegistry.register(claudeCodeAdapter);
8953
8998
 
8954
8999
  // src/run-scenario/agents/opencode/opencode-adapter.ts
8955
- var import_evalforge_types9 = require("@wix/evalforge-types");
9000
+ var import_evalforge_types10 = require("@wix/evalforge-types");
8956
9001
 
8957
9002
  // src/run-scenario/agents/opencode/execute.ts
8958
9003
  var import_child_process2 = require("child_process");
8959
- var import_evalforge_types8 = require("@wix/evalforge-types");
9004
+ var import_evalforge_types9 = require("@wix/evalforge-types");
8960
9005
 
8961
9006
  // src/run-scenario/agents/opencode/types.ts
8962
9007
  function tryParseJson(text) {
@@ -8970,49 +9015,28 @@ function tryParseJson(text) {
8970
9015
  // src/run-scenario/agents/opencode/write-skills.ts
8971
9016
  var import_promises8 = require("fs/promises");
8972
9017
  var import_path9 = require("path");
8973
- var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
8974
- async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
9018
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
8975
9019
  await Promise.all(
8976
9020
  skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
8977
9021
  );
8978
9022
  }
8979
9023
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
8980
- const skillName = skill.name;
8981
- const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
9024
+ const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skill.name);
8982
9025
  await (0, import_promises8.mkdir)(skillDir, { recursive: true });
8983
- const version = skill.latestVersion;
8984
- if (version?.files && version.files.length > 0) {
8985
- await writeFilesToDirectory(skillDir, version.files);
8986
- console.log(
8987
- `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
9026
+ try {
9027
+ const files = await resolveSkillFiles(skill, fetchFn);
9028
+ await writeFilesToDirectory(skillDir, files);
9029
+ } catch (error) {
9030
+ const message = error instanceof Error ? error.message : "Unknown error";
9031
+ throw new Error(
9032
+ `Failed to write skill ${skill.name} to filesystem: ${message}`
8988
9033
  );
8989
- } else if (skill.source) {
8990
- try {
8991
- const files = await fetchFn(skill.source, {
8992
- userAgent: "EvalForge-Evaluator"
8993
- });
8994
- await writeFilesToDirectory(skillDir, files);
8995
- console.log(
8996
- `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
8997
- );
8998
- } catch (error) {
8999
- const message = error instanceof Error ? error.message : "Unknown error";
9000
- console.error(
9001
- `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
9002
- );
9003
- throw new Error(
9004
- `Failed to write skill ${skillName} to filesystem: ${message}`
9005
- );
9006
- }
9007
- } else {
9008
- throw new Error(`Skill ${skillName} has no files and no source configured`);
9009
9034
  }
9010
9035
  }
9011
9036
 
9012
9037
  // src/run-scenario/agents/opencode/write-sub-agents.ts
9013
9038
  var import_promises9 = require("fs/promises");
9014
9039
  var import_path10 = require("path");
9015
- var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
9016
9040
  var AGENTS_DIR2 = ".opencode/agents";
9017
9041
  function toAgentFilename2(name, index, nameCount) {
9018
9042
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -9020,34 +9044,7 @@ function toAgentFilename2(name, index, nameCount) {
9020
9044
  nameCount.set(base, count + 1);
9021
9045
  return count === 0 ? base : `${base}-${count + 1}`;
9022
9046
  }
9023
- async function resolveSubAgentContent2(agent, fetchFn) {
9024
- if (agent.source) {
9025
- try {
9026
- const content = await fetchFn(agent.source, {
9027
- userAgent: "EvalForge-Evaluator"
9028
- });
9029
- console.log(
9030
- `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
9031
- );
9032
- return content;
9033
- } catch (error) {
9034
- const message = error instanceof Error ? error.message : "Unknown error";
9035
- console.error(
9036
- `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
9037
- );
9038
- throw new Error(
9039
- `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
9040
- );
9041
- }
9042
- }
9043
- if (!agent.subAgentMd) {
9044
- console.warn(
9045
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
9046
- );
9047
- }
9048
- return agent.subAgentMd;
9049
- }
9050
- async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
9047
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9051
9048
  if (subAgents.length === 0) return;
9052
9049
  const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
9053
9050
  await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
@@ -9055,7 +9052,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
9055
9052
  for (const [i, agent] of subAgents.entries()) {
9056
9053
  const filename = toAgentFilename2(agent.name, i, nameCount);
9057
9054
  const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
9058
- const content = await resolveSubAgentContent2(agent, fetchFn);
9055
+ const content = await resolveSubAgentMd(agent, fetchFn);
9059
9056
  await (0, import_promises9.writeFile)(filePath, content, "utf8");
9060
9057
  }
9061
9058
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -9063,8 +9060,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
9063
9060
 
9064
9061
  // src/run-scenario/agents/opencode/config.ts
9065
9062
  var import_os3 = require("os");
9066
- var import_evalforge_types6 = require("@wix/evalforge-types");
9067
- var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9063
+ var import_evalforge_types7 = require("@wix/evalforge-types");
9064
+ var DEFAULT_MODEL2 = `${import_evalforge_types7.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9068
9065
  var OPENCODE_MODEL_ALIASES = {
9069
9066
  "claude-sonnet-4": "claude-sonnet-4-0",
9070
9067
  "claude-opus-4": "claude-opus-4-0"
@@ -9080,10 +9077,10 @@ function parseModel(model) {
9080
9077
  };
9081
9078
  }
9082
9079
  const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
9083
- const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
9080
+ const isOpenAI = import_evalforge_types7.AVAILABLE_OPENAI_MODEL_IDS.includes(
9084
9081
  model
9085
9082
  );
9086
- const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
9083
+ const isGemini = import_evalforge_types7.AVAILABLE_GEMINI_MODEL_IDS.includes(
9087
9084
  model
9088
9085
  );
9089
9086
  if (isGemini) return { providerID: "google", modelID };
@@ -9152,7 +9149,7 @@ async function buildOpenCodeEnv(options) {
9152
9149
  if (options.mcps && options.mcps.length > 0) {
9153
9150
  const mcpServers = {};
9154
9151
  for (const mcpEntity of options.mcps) {
9155
- const entityConfig = mcpEntity.config;
9152
+ const entityConfig = await resolveMcpConfig(mcpEntity);
9156
9153
  for (const [key, value] of Object.entries(entityConfig)) {
9157
9154
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
9158
9155
  throw new Error(
@@ -9177,7 +9174,7 @@ async function buildOpenCodeEnv(options) {
9177
9174
  if (options.maxTurns != null && options.maxTurns > 0) {
9178
9175
  agentOverrides.maxSteps = options.maxTurns;
9179
9176
  }
9180
- const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9177
+ const parsed = options.config ? import_evalforge_types7.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9181
9178
  const configPermission = parsed?.success ? parsed.data.permission : void 0;
9182
9179
  const defaultPermission = {
9183
9180
  "*": "allow"
@@ -9219,7 +9216,7 @@ async function buildOpenCodeEnv(options) {
9219
9216
  }
9220
9217
 
9221
9218
  // src/run-scenario/agents/opencode/build-trace.ts
9222
- var import_evalforge_types7 = require("@wix/evalforge-types");
9219
+ var import_evalforge_types8 = require("@wix/evalforge-types");
9223
9220
  var import_crypto3 = require("crypto");
9224
9221
  function toCanonicalModelId(modelId) {
9225
9222
  const slashIndex = modelId.indexOf("/");
@@ -9299,7 +9296,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9299
9296
  id: (0, import_crypto3.randomUUID)(),
9300
9297
  stepNumber: 0,
9301
9298
  turnIndex,
9302
- type: import_evalforge_types7.LLMStepType.THINKING,
9299
+ type: import_evalforge_types8.LLMStepType.THINKING,
9303
9300
  model: stepModel,
9304
9301
  provider: stepProvider,
9305
9302
  startedAt,
@@ -9328,7 +9325,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9328
9325
  id: (0, import_crypto3.randomUUID)(),
9329
9326
  stepNumber: 0,
9330
9327
  turnIndex,
9331
- type: import_evalforge_types7.LLMStepType.TOOL_USE,
9328
+ type: import_evalforge_types8.LLMStepType.TOOL_USE,
9332
9329
  model: stepModel,
9333
9330
  provider: stepProvider,
9334
9331
  startedAt,
@@ -9358,7 +9355,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9358
9355
  id: (0, import_crypto3.randomUUID)(),
9359
9356
  stepNumber: 0,
9360
9357
  turnIndex,
9361
- type: import_evalforge_types7.LLMStepType.COMPLETION,
9358
+ type: import_evalforge_types8.LLMStepType.COMPLETION,
9362
9359
  model: stepModel,
9363
9360
  provider: stepProvider,
9364
9361
  startedAt,
@@ -9375,7 +9372,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9375
9372
  });
9376
9373
  }
9377
9374
  if (subSteps.length === 0) {
9378
- const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
9375
+ const stepType = hasThinking && !hasText ? import_evalforge_types8.LLMStepType.THINKING : import_evalforge_types8.LLMStepType.COMPLETION;
9379
9376
  subSteps.push({
9380
9377
  id: (0, import_crypto3.randomUUID)(),
9381
9378
  stepNumber: 0,
@@ -9576,14 +9573,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9576
9573
  const te = evt;
9577
9574
  return {
9578
9575
  ...base,
9579
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
9576
+ type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
9580
9577
  outputPreview: te.part.text.slice(0, 500)
9581
9578
  };
9582
9579
  }
9583
9580
  case "reasoning":
9584
9581
  return {
9585
9582
  ...base,
9586
- type: import_evalforge_types8.LiveTraceEventType.THINKING,
9583
+ type: import_evalforge_types9.LiveTraceEventType.THINKING,
9587
9584
  thinking: evt.part.text.slice(0, 500)
9588
9585
  };
9589
9586
  case "tool_use": {
@@ -9591,15 +9588,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9591
9588
  const toolName = tu.part.tool;
9592
9589
  const args = tu.part.state.input;
9593
9590
  const toolArgs = JSON.stringify(args).slice(0, 500);
9594
- let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
9591
+ let type = import_evalforge_types9.LiveTraceEventType.TOOL_USE;
9595
9592
  let filePath;
9596
9593
  if (args) {
9597
9594
  if (args.file_path || args.path || args.target_file) {
9598
9595
  filePath = String(args.file_path || args.path || args.target_file);
9599
9596
  if (/write|edit/i.test(toolName)) {
9600
- type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
9597
+ type = import_evalforge_types9.LiveTraceEventType.FILE_WRITE;
9601
9598
  } else if (/read|view/i.test(toolName)) {
9602
- type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
9599
+ type = import_evalforge_types9.LiveTraceEventType.FILE_READ;
9603
9600
  }
9604
9601
  }
9605
9602
  }
@@ -9608,7 +9605,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9608
9605
  case "step_finish":
9609
9606
  return {
9610
9607
  ...base,
9611
- type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9608
+ type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9612
9609
  outputPreview: "Step completed"
9613
9610
  };
9614
9611
  default:
@@ -9639,7 +9636,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
9639
9636
  } else if (options.systemPrompt != null) {
9640
9637
  systemPrompt = options.systemPrompt;
9641
9638
  } else {
9642
- systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9639
+ systemPrompt = import_evalforge_types9.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9643
9640
  }
9644
9641
  if (systemPrompt) {
9645
9642
  await writeSystemPromptRule(cwd, systemPrompt);
@@ -9831,7 +9828,7 @@ function spawnOpenCodeProcess(opts) {
9831
9828
  targetId: traceContext.targetId,
9832
9829
  targetName: traceContext.targetName,
9833
9830
  stepNumber: traceStepNumber,
9834
- type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9831
+ type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9835
9832
  outputPreview: progressMessage,
9836
9833
  toolName: lastToolName,
9837
9834
  filePath: lastFilePath,
@@ -9865,18 +9862,18 @@ function spawnOpenCodeProcess(opts) {
9865
9862
  if (traceEvt) {
9866
9863
  lastToolName = traceEvt.toolName;
9867
9864
  lastFilePath = traceEvt.filePath;
9868
- if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
9865
+ if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.THINKING) {
9869
9866
  lastAction = "Thinking...";
9870
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
9867
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.TOOL_USE) {
9871
9868
  lastAction = extractToolAction(
9872
9869
  traceEvt.toolName ?? "",
9873
9870
  void 0
9874
9871
  );
9875
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
9872
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_WRITE) {
9876
9873
  lastAction = `Writing: ${traceEvt.filePath || "file"}`;
9877
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
9874
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_READ) {
9878
9875
  lastAction = `Reading: ${traceEvt.filePath || "file"}`;
9879
- } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
9876
+ } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.COMPLETION) {
9880
9877
  lastAction = "Processing response...";
9881
9878
  }
9882
9879
  emitTraceEvent(traceEvt, traceContext.pushEvent);
@@ -9958,7 +9955,7 @@ async function executeWithOpenCode(skills, scenario, options) {
9958
9955
  targetId: traceContext.targetId,
9959
9956
  targetName: traceContext.targetName,
9960
9957
  stepNumber: 0,
9961
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
9958
+ type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
9962
9959
  outputPreview: JSON.stringify({
9963
9960
  event: "pre-cli-execution",
9964
9961
  model: `${providerID}/${modelID}`,
@@ -10012,7 +10009,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10012
10009
  targetId: traceContext.targetId,
10013
10010
  targetName: traceContext.targetName,
10014
10011
  stepNumber: traceStepNumber + 1,
10015
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10012
+ type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10016
10013
  outputPreview: JSON.stringify({
10017
10014
  event: "idle-timeout-retry",
10018
10015
  attempt,
@@ -10056,7 +10053,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10056
10053
  targetId: traceContext.targetId,
10057
10054
  targetName: traceContext.targetName,
10058
10055
  stepNumber: traceStepNumber + 1,
10059
- type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10056
+ type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10060
10057
  outputPreview: JSON.stringify({
10061
10058
  event: "cli-execution-failed",
10062
10059
  error: lastAttemptResult.error?.message ?? "Unknown error",
@@ -10111,7 +10108,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10111
10108
  targetId: traceContext.targetId,
10112
10109
  targetName: traceContext.targetName,
10113
10110
  stepNumber: traceStepNumber + 1,
10114
- type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
10111
+ type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
10115
10112
  outputPreview: "Scenario execution completed",
10116
10113
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10117
10114
  isComplete: true
@@ -10148,7 +10145,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10148
10145
  var OpenCodeAdapter = class {
10149
10146
  id = "opencode";
10150
10147
  name = "OpenCode";
10151
- supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
10148
+ supportedCommands = [import_evalforge_types10.AgentRunCommand.OPENCODE];
10152
10149
  async prepareEnvironment(context) {
10153
10150
  await prepareOpenCodeEnvironment(context.cwd, context.skills, {
10154
10151
  mcps: context.mcps,
@@ -10171,7 +10168,7 @@ var OpenCodeAdapter = class {
10171
10168
  rules,
10172
10169
  systemPrompt
10173
10170
  } = context;
10174
- const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10171
+ const typed = config ? import_evalforge_types10.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10175
10172
  const cfg = typed?.success ? typed.data : void 0;
10176
10173
  const rawMaxTurns = cfg?.maxTurns;
10177
10174
  const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
@@ -10221,7 +10218,7 @@ var import_ai = require("ai");
10221
10218
  var import_anthropic = require("@ai-sdk/anthropic");
10222
10219
  var import_google = require("@ai-sdk/google");
10223
10220
  var import_openai = require("@ai-sdk/openai");
10224
- var import_evalforge_types11 = require("@wix/evalforge-types");
10221
+ var import_evalforge_types12 = require("@wix/evalforge-types");
10225
10222
  var import_crypto4 = require("crypto");
10226
10223
 
10227
10224
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
@@ -10318,7 +10315,7 @@ function extractErrorText(content) {
10318
10315
  }
10319
10316
 
10320
10317
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
10321
- var import_evalforge_types10 = require("@wix/evalforge-types");
10318
+ var import_evalforge_types11 = require("@wix/evalforge-types");
10322
10319
  var PROVIDER_ANTHROPIC = "anthropic";
10323
10320
  var PROVIDER_GEMINI = "gemini";
10324
10321
  var MODEL_PRICING = {
@@ -10387,7 +10384,7 @@ function extractGatewayCost(step, provider) {
10387
10384
  }
10388
10385
  }
10389
10386
  function calculateFromPricing(modelId, tokenUsage) {
10390
- const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
10387
+ const normalized = (0, import_evalforge_types11.normalizeModelId)(modelId);
10391
10388
  const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
10392
10389
  if (!pricing) return 0;
10393
10390
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
@@ -10480,7 +10477,7 @@ function createModel(modelId, baseUrl, headers) {
10480
10477
  apiKey: "proxy-auth",
10481
10478
  headers
10482
10479
  });
10483
- if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10480
+ if ([...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10484
10481
  (id) => modelId === id || modelId.startsWith(id)
10485
10482
  )) {
10486
10483
  return openai.responses(modelId);
@@ -10488,12 +10485,12 @@ function createModel(modelId, baseUrl, headers) {
10488
10485
  return openai.chat(modelId);
10489
10486
  }
10490
10487
  function isClaudeModelId(modelId) {
10491
- return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
10488
+ return import_evalforge_types12.AVAILABLE_CLAUDE_MODEL_IDS.some(
10492
10489
  (id) => modelId === id || modelId.startsWith(id)
10493
10490
  );
10494
10491
  }
10495
10492
  function isGeminiModelId(modelId) {
10496
- return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
10493
+ return import_evalforge_types12.AVAILABLE_GEMINI_MODEL_IDS.some(
10497
10494
  (id) => modelId === id || modelId.startsWith(id)
10498
10495
  );
10499
10496
  }
@@ -10513,9 +10510,9 @@ async function executeWithAiSdk(context) {
10513
10510
  mcps,
10514
10511
  traceContext
10515
10512
  } = context;
10516
- const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10513
+ const typed = config ? import_evalforge_types12.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10517
10514
  const cfg = typed?.success ? typed.data : void 0;
10518
- const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
10515
+ const schemaKeys = new Set(Object.keys(import_evalforge_types12.SimpleAgentConfigSchema.shape));
10519
10516
  const configExtras = {};
10520
10517
  if (config) {
10521
10518
  for (const [key, value] of Object.entries(config)) {
@@ -10552,11 +10549,11 @@ async function executeWithAiSdk(context) {
10552
10549
  }, SDK_TIMEOUT_MS);
10553
10550
  try {
10554
10551
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
10555
- const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10552
+ const isResponsesAPI = [...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10556
10553
  (id) => modelId === id || modelId.startsWith(id)
10557
10554
  );
10558
10555
  const isGemini = provider === PROVIDER_GEMINI2;
10559
- const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
10556
+ const isGeminiThinking = isGemini && import_evalforge_types12.GEMINI_THINKING_MODEL_IDS.has(modelId);
10560
10557
  const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
10561
10558
  const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
10562
10559
  const reasoningEffort = cfg.reasoningEffort ?? "high";
@@ -10635,7 +10632,7 @@ async function executeWithAiSdk(context) {
10635
10632
  targetId: traceContext.targetId,
10636
10633
  targetName: traceContext.targetName,
10637
10634
  stepNumber: stepTimestamps.length,
10638
- type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
10635
+ type: isToolStep ? import_evalforge_types12.LiveTraceEventType.TOOL_USE : import_evalforge_types12.LiveTraceEventType.COMPLETION,
10639
10636
  toolName: firstToolCall?.toolName,
10640
10637
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
10641
10638
  outputPreview: step.text?.slice(0, 500),
@@ -10840,7 +10837,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
10840
10837
  id: (0, import_crypto4.randomUUID)(),
10841
10838
  stepNumber: i + 1,
10842
10839
  turnIndex: i,
10843
- type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
10840
+ type: step.toolCalls.length > 0 ? import_evalforge_types12.LLMStepType.TOOL_USE : import_evalforge_types12.LLMStepType.COMPLETION,
10844
10841
  model: modelId,
10845
10842
  provider,
10846
10843
  startedAt: new Date(stepStartedAt).toISOString(),
@@ -10890,7 +10887,7 @@ function emitStartEvent(traceContext, startTime) {
10890
10887
  targetId: traceContext.targetId,
10891
10888
  targetName: traceContext.targetName,
10892
10889
  stepNumber: 0,
10893
- type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
10890
+ type: import_evalforge_types12.LiveTraceEventType.PROGRESS,
10894
10891
  outputPreview: "Starting Simple Agent execution...",
10895
10892
  elapsedMs: Date.now() - startTime,
10896
10893
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -10908,7 +10905,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
10908
10905
  targetId: traceContext.targetId,
10909
10906
  targetName: traceContext.targetName,
10910
10907
  stepNumber,
10911
- type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
10908
+ type: import_evalforge_types12.LiveTraceEventType.COMPLETION,
10912
10909
  outputPreview: "Scenario execution completed",
10913
10910
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10914
10911
  isComplete: true
@@ -11678,11 +11675,11 @@ function substituteVariables(prompt, variables) {
11678
11675
  }
11679
11676
 
11680
11677
  // src/run-scenario/run-agent-with-context.ts
11681
- var import_evalforge_types12 = require("@wix/evalforge-types");
11682
- var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
11678
+ var import_evalforge_types13 = require("@wix/evalforge-types");
11679
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types13.AgentRunCommand.CLAUDE;
11683
11680
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
11684
11681
  const agent = evalData.agent ?? void 0;
11685
- const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
11682
+ const isSDK = agent?.agentType === import_evalforge_types13.AgentType.SDK;
11686
11683
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
11687
11684
  const adapter = getAdapter(identifier);
11688
11685
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -11767,14 +11764,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11767
11764
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
11768
11765
  if (template) {
11769
11766
  console.log(
11770
- (0, import_evalforge_types13.formatTraceEventLine)({
11767
+ (0, import_evalforge_types14.formatTraceEventLine)({
11771
11768
  evalRunId: evalRunId2,
11772
11769
  scenarioId: scenario.id,
11773
11770
  scenarioName: scenario.name,
11774
11771
  targetId,
11775
11772
  targetName,
11776
11773
  stepNumber: 0,
11777
- type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
11774
+ type: import_evalforge_types14.LiveTraceEventType.PROGRESS,
11778
11775
  outputPreview: "Setting up environment (installing dependencies)...",
11779
11776
  elapsedMs: 0,
11780
11777
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -11814,7 +11811,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11814
11811
  })),
11815
11812
  durationMs: partialResult.duration
11816
11813
  };
11817
- const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
11814
+ const defaultJudgeModel = import_evalforge_types14.DEFAULT_JUDGE_MODEL;
11818
11815
  const assertionContext = {
11819
11816
  workDir,
11820
11817
  defaultJudgeModel,
@@ -11829,10 +11826,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11829
11826
  assertionContext
11830
11827
  ) : [];
11831
11828
  const passed = assertionResults.filter(
11832
- (r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
11829
+ (r) => r.status === import_evalforge_types14.AssertionResultStatus.PASSED
11833
11830
  ).length;
11834
11831
  const failed = assertionResults.filter(
11835
- (r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
11832
+ (r) => r.status === import_evalforge_types14.AssertionResultStatus.FAILED
11836
11833
  ).length;
11837
11834
  const total = assertionResults.length;
11838
11835
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -11908,7 +11905,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
11908
11905
  }
11909
11906
 
11910
11907
  // src/error-reporter.ts
11911
- var import_evalforge_types14 = require("@wix/evalforge-types");
11908
+ var import_evalforge_types15 = require("@wix/evalforge-types");
11912
11909
  function formatError(error, phase, context) {
11913
11910
  const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
11914
11911
  if (error instanceof Error) {
@@ -12151,7 +12148,7 @@ async function runEvaluation(projectId2, evalRunId2) {
12151
12148
  totalExecutions
12152
12149
  };
12153
12150
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
12154
- const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
12151
+ const finalStatus = allFailed ? import_evalforge_types16.EvalStatus.FAILED : import_evalforge_types16.EvalStatus.COMPLETED;
12155
12152
  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
12156
12153
  firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
12157
12154
  ) : void 0;
@@ -12205,7 +12202,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12205
12202
  grpcAuthToken: config.grpcAuthToken
12206
12203
  });
12207
12204
  await api.updateEvalRun(projectId, evalRunId, {
12208
- status: import_evalforge_types15.EvalStatus.FAILED,
12205
+ status: import_evalforge_types16.EvalStatus.FAILED,
12209
12206
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12210
12207
  jobError,
12211
12208
  jobStatus: "FAILED"
@@ -12230,7 +12227,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12230
12227
  grpcAuthToken
12231
12228
  });
12232
12229
  await api.updateEvalRun(projectId, evalRunId, {
12233
- status: import_evalforge_types15.EvalStatus.FAILED,
12230
+ status: import_evalforge_types16.EvalStatus.FAILED,
12234
12231
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12235
12232
  jobError: `Config load failed, then: ${jobError}`,
12236
12233
  jobStatus: "FAILED"