@wix/evalforge-evaluator 0.184.0 → 0.185.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
5226
5226
  });
5227
5227
 
5228
5228
  // src/index.ts
5229
- var import_evalforge_types16 = require("@wix/evalforge-types");
5229
+ var import_evalforge_types15 = require("@wix/evalforge-types");
5230
5230
 
5231
5231
  // src/config.ts
5232
5232
  function loadConfig() {
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
7115
7115
  }
7116
7116
 
7117
7117
  // src/run-scenario/index.ts
7118
- var import_evalforge_types14 = require("@wix/evalforge-types");
7118
+ var import_evalforge_types13 = require("@wix/evalforge-types");
7119
7119
  var import_eval_assertions = require("@wix/eval-assertions");
7120
7120
 
7121
7121
  // src/run-scenario/environment.ts
@@ -7451,122 +7451,50 @@ function getAdapter(identifier) {
7451
7451
  }
7452
7452
 
7453
7453
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
7454
- var import_evalforge_types6 = require("@wix/evalforge-types");
7454
+ var import_evalforge_types5 = require("@wix/evalforge-types");
7455
7455
 
7456
7456
  // src/run-scenario/agents/claude-code/execute.ts
7457
- var import_evalforge_types5 = require("@wix/evalforge-types");
7457
+ var import_evalforge_types4 = require("@wix/evalforge-types");
7458
7458
 
7459
7459
  // src/run-scenario/agents/claude-code/write-skills.ts
7460
7460
  var import_promises3 = require("fs/promises");
7461
7461
  var import_path4 = require("path");
7462
-
7463
- // src/run-scenario/agents/shared/resolve-capability-content.ts
7464
7462
  var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
7465
- var import_evalforge_types2 = require("@wix/evalforge-types");
7466
- var USER_AGENT = "EvalForge-Evaluator";
7467
- async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7468
- const version = skill.latestVersion;
7469
- if (version?.files && version.files.length > 0) {
7470
- console.log(
7471
- `[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
7472
- );
7473
- return version.files;
7474
- }
7475
- if (skill.source) {
7476
- const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
7477
- console.log(
7478
- `[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
7479
- );
7480
- return files;
7481
- }
7482
- throw new Error(`Skill ${skill.name} has no files and no source configured`);
7483
- }
7484
- async function fetchSourceFile(label, noun, name, source, fetchFn) {
7485
- try {
7486
- const content = await fetchFn(source, { userAgent: USER_AGENT });
7487
- console.log(
7488
- `[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
7489
- );
7490
- return content;
7491
- } catch (error) {
7492
- const message = error instanceof Error ? error.message : "Unknown error";
7493
- console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
7494
- throw new Error(
7495
- `Failed to fetch ${noun} "${name}" from GitHub: ${message}`
7496
- );
7497
- }
7498
- }
7499
- async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7500
- if (agent.source) {
7501
- return fetchSourceFile(
7502
- "SubAgents",
7503
- "sub-agent",
7504
- agent.name,
7505
- agent.source,
7506
- fetchFn
7507
- );
7508
- }
7509
- if (!agent.subAgentMd) {
7510
- console.warn(
7511
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7512
- );
7513
- }
7514
- return agent.subAgentMd;
7515
- }
7516
- async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7517
- if (!rule.source) {
7518
- return rule.content;
7519
- }
7520
- return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
7521
- }
7522
- async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7523
- if (!mcp.source) {
7524
- return mcp.config;
7525
- }
7526
- const raw = await fetchSourceFile(
7527
- "MCP",
7528
- "MCP",
7529
- mcp.name,
7530
- mcp.source,
7531
- fetchFn
7532
- );
7533
- let parsed;
7534
- try {
7535
- parsed = JSON.parse(raw);
7536
- } catch (error) {
7537
- const message = error instanceof Error ? error.message : "Unknown error";
7538
- throw new Error(
7539
- `MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
7540
- );
7541
- }
7542
- if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
7543
- throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
7544
- }
7545
- const obj = parsed;
7546
- const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
7547
- if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
7548
- return servers;
7549
- }
7550
- return obj;
7551
- }
7552
-
7553
- // src/run-scenario/agents/claude-code/write-skills.ts
7554
- async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
7463
+ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7555
7464
  await Promise.all(
7556
7465
  skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
7557
7466
  );
7558
7467
  }
7559
- async function writeSkillToFilesystem(cwd, skill, fetchFn) {
7560
- const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skill.name);
7468
+ async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7469
+ const skillName = skill.name;
7470
+ const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
7561
7471
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7562
- try {
7563
- const files = await resolveSkillFiles(skill, fetchFn);
7564
- await writeFilesToDirectory(skillDir, files);
7565
- } catch (error) {
7566
- const message = error instanceof Error ? error.message : "Unknown error";
7567
- throw new Error(
7568
- `Failed to write skill ${skill.name} to filesystem: ${message}`
7472
+ const version = skill.latestVersion;
7473
+ if (version?.files && version.files.length > 0) {
7474
+ await writeFilesToDirectory(skillDir, version.files);
7475
+ console.log(
7476
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
7569
7477
  );
7478
+ } else if (skill.source) {
7479
+ try {
7480
+ const files = await fetchFn(skill.source, {
7481
+ userAgent: "EvalForge-Evaluator"
7482
+ });
7483
+ await writeFilesToDirectory(skillDir, files);
7484
+ console.log(
7485
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
7486
+ );
7487
+ } catch (error) {
7488
+ const message = error instanceof Error ? error.message : "Unknown error";
7489
+ console.error(
7490
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
7491
+ );
7492
+ throw new Error(
7493
+ `Failed to write skill ${skillName} to filesystem: ${message}`
7494
+ );
7495
+ }
7496
+ } else {
7497
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
7570
7498
  }
7571
7499
  }
7572
7500
 
@@ -7584,7 +7512,7 @@ var import_crypto2 = require("crypto");
7584
7512
  // src/run-scenario/agents/claude-code/write-mcp.ts
7585
7513
  var import_promises5 = require("fs/promises");
7586
7514
  var import_path6 = require("path");
7587
- var import_evalforge_types3 = require("@wix/evalforge-types");
7515
+ var import_evalforge_types2 = require("@wix/evalforge-types");
7588
7516
 
7589
7517
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
7590
7518
  var import_promises4 = require("fs/promises");
@@ -7629,11 +7557,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
7629
7557
  }
7630
7558
 
7631
7559
  // src/run-scenario/agents/claude-code/write-mcp.ts
7632
- async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7560
+ async function writeMcpToFilesystem(cwd, mcps) {
7633
7561
  if (mcps.length === 0) return;
7634
7562
  const mcpServers = {};
7635
7563
  for (const mcp of mcps) {
7636
- const config = await resolveMcpConfig(mcp, fetchFn);
7564
+ const config = mcp.config;
7637
7565
  for (const [key, value] of Object.entries(config)) {
7638
7566
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
7639
7567
  throw new Error(
@@ -7645,7 +7573,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7645
7573
  }
7646
7574
  const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
7647
7575
  const content = JSON.stringify(
7648
- { [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
7576
+ { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
7649
7577
  null,
7650
7578
  2
7651
7579
  );
@@ -7657,6 +7585,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7657
7585
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
7658
7586
  var import_promises6 = require("fs/promises");
7659
7587
  var import_path7 = require("path");
7588
+ var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
7660
7589
  var AGENTS_DIR = ".claude/agents";
7661
7590
  function toAgentFilename(name, index, nameCount) {
7662
7591
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -7664,7 +7593,34 @@ function toAgentFilename(name, index, nameCount) {
7664
7593
  nameCount.set(base, count + 1);
7665
7594
  return count === 0 ? base : `${base}-${count + 1}`;
7666
7595
  }
7667
- async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7596
+ async function resolveSubAgentContent(agent, fetchFn) {
7597
+ if (agent.source) {
7598
+ try {
7599
+ const content = await fetchFn(agent.source, {
7600
+ userAgent: "EvalForge-Evaluator"
7601
+ });
7602
+ console.log(
7603
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
7604
+ );
7605
+ return content;
7606
+ } catch (error) {
7607
+ const message = error instanceof Error ? error.message : "Unknown error";
7608
+ console.error(
7609
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
7610
+ );
7611
+ throw new Error(
7612
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
7613
+ );
7614
+ }
7615
+ }
7616
+ if (!agent.subAgentMd) {
7617
+ console.warn(
7618
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7619
+ );
7620
+ }
7621
+ return agent.subAgentMd;
7622
+ }
7623
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
7668
7624
  if (subAgents.length === 0) return;
7669
7625
  const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
7670
7626
  await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
@@ -7672,7 +7628,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7672
7628
  for (const [i, agent] of subAgents.entries()) {
7673
7629
  const filename = toAgentFilename(agent.name, i, nameCount);
7674
7630
  const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
7675
- const content = await resolveSubAgentMd(agent, fetchFn);
7631
+ const content = await resolveSubAgentContent(agent, fetchFn);
7676
7632
  await (0, import_promises6.writeFile)(filePath, content, "utf8");
7677
7633
  }
7678
7634
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -7722,19 +7678,18 @@ function validateGenericDirectory(dir, cwd) {
7722
7678
  }
7723
7679
  return trimmed;
7724
7680
  }
7725
- async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7681
+ async function writeRulesToFilesystem(cwd, rules) {
7726
7682
  if (rules.length === 0) return;
7727
7683
  const nameCount = /* @__PURE__ */ new Map();
7728
7684
  let hasCursorRules = false;
7729
7685
  for (const [i, rule] of rules.entries()) {
7730
- const content = await resolveRuleText(rule, fetchFn);
7731
7686
  switch (rule.ruleType) {
7732
7687
  case "claude-md": {
7733
- await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
7688
+ await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
7734
7689
  break;
7735
7690
  }
7736
7691
  case "agents-md": {
7737
- await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
7692
+ await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
7738
7693
  break;
7739
7694
  }
7740
7695
  case "cursor-rule": {
@@ -7744,7 +7699,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7744
7699
  }
7745
7700
  const filename = toRuleFilename(rule.name, i, nameCount);
7746
7701
  const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
7747
- await (0, import_promises7.writeFile)(filePath, content, "utf8");
7702
+ await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
7748
7703
  break;
7749
7704
  }
7750
7705
  case "generic": {
@@ -7755,7 +7710,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7755
7710
  const dirPath = (0, import_path8.join)(cwd, directory);
7756
7711
  await (0, import_promises7.mkdir)(dirPath, { recursive: true });
7757
7712
  const filename = toRuleFilename(rule.name, i, nameCount);
7758
- await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
7713
+ await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
7759
7714
  break;
7760
7715
  }
7761
7716
  default: {
@@ -7845,14 +7800,14 @@ function buildConversation(timestampedMessages) {
7845
7800
  }
7846
7801
 
7847
7802
  // src/run-scenario/agents/shared/trace-emit.ts
7848
- var import_evalforge_types4 = require("@wix/evalforge-types");
7803
+ var import_evalforge_types3 = require("@wix/evalforge-types");
7849
7804
  function emitTraceEvent(event, pushEvent) {
7850
- console.log(`${import_evalforge_types4.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7805
+ console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7851
7806
  pushEvent?.(event);
7852
7807
  }
7853
7808
 
7854
7809
  // src/run-scenario/agents/claude-code/execute.ts
7855
- var DEFAULT_MODEL = import_evalforge_types5.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7810
+ var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7856
7811
  async function* buildPromptStream(triggerPrompt, images) {
7857
7812
  yield {
7858
7813
  type: "user",
@@ -7917,7 +7872,7 @@ function extractToolActionDescription(toolName, toolArgs) {
7917
7872
  return `Using ${toolName}...`;
7918
7873
  }
7919
7874
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7920
- let type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7875
+ let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7921
7876
  let toolName;
7922
7877
  let toolArgs;
7923
7878
  let outputPreview;
@@ -7925,28 +7880,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7925
7880
  let thinking;
7926
7881
  for (const block of message.message.content) {
7927
7882
  if (block.type === "tool_use") {
7928
- type = import_evalforge_types5.LiveTraceEventType.TOOL_USE;
7883
+ type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
7929
7884
  toolName = block.name;
7930
7885
  toolArgs = JSON.stringify(block.input).slice(0, 500);
7931
7886
  const input = block.input;
7932
7887
  if (input.file_path || input.path || input.target_file) {
7933
7888
  filePath = String(input.file_path || input.path || input.target_file);
7934
7889
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
7935
- type = import_evalforge_types5.LiveTraceEventType.FILE_WRITE;
7890
+ type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
7936
7891
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
7937
- type = import_evalforge_types5.LiveTraceEventType.FILE_READ;
7892
+ type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
7938
7893
  }
7939
7894
  }
7940
7895
  } else if (block.type === "text") {
7941
7896
  outputPreview = block.text.slice(0, 500);
7942
7897
  if (!toolName) {
7943
- type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7898
+ type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7944
7899
  }
7945
7900
  } else if (block.type === "thinking") {
7946
7901
  const thinkingBlock = block;
7947
7902
  thinking = thinkingBlock.thinking.slice(0, 500);
7948
7903
  if (!outputPreview && !toolName) {
7949
- type = import_evalforge_types5.LiveTraceEventType.THINKING;
7904
+ type = import_evalforge_types4.LiveTraceEventType.THINKING;
7950
7905
  }
7951
7906
  }
7952
7907
  }
@@ -8012,7 +7967,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8012
7967
  }
8013
7968
  return {
8014
7969
  ...baseEvent,
8015
- type: import_evalforge_types5.LiveTraceEventType.TOOL_RESULT,
7970
+ type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
8016
7971
  outputPreview: outputPreview || "(tool result)"
8017
7972
  };
8018
7973
  }
@@ -8020,7 +7975,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8020
7975
  const sysMsg = message;
8021
7976
  return {
8022
7977
  ...baseEvent,
8023
- type: import_evalforge_types5.LiveTraceEventType.SYSTEM,
7978
+ type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
8024
7979
  outputPreview: sysMsg.subtype || "system"
8025
7980
  };
8026
7981
  }
@@ -8029,7 +7984,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8029
7984
  }
8030
7985
  return {
8031
7986
  ...baseEvent,
8032
- type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
7987
+ type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8033
7988
  outputPreview: `Message type: ${message.type}`
8034
7989
  };
8035
7990
  }
@@ -8131,7 +8086,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8131
8086
  queryOptions.systemPrompt = {
8132
8087
  type: "preset",
8133
8088
  preset: "claude_code",
8134
- append: import_evalforge_types5.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8089
+ append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8135
8090
  };
8136
8091
  }
8137
8092
  if (options.temperature !== void 0) {
@@ -8166,7 +8121,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8166
8121
  targetId: traceContext.targetId,
8167
8122
  targetName: traceContext.targetName,
8168
8123
  stepNumber: 0,
8169
- type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8124
+ type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8170
8125
  outputPreview: JSON.stringify({
8171
8126
  event: "pre-sdk-execution",
8172
8127
  model: queryOptions.model,
@@ -8230,7 +8185,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8230
8185
  targetId: traceContext.targetId,
8231
8186
  targetName: traceContext.targetName,
8232
8187
  stepNumber: traceStepNumber,
8233
- type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
8188
+ type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8234
8189
  outputPreview: progressMessage,
8235
8190
  toolName: lastToolName,
8236
8191
  filePath: lastFilePath,
@@ -8267,18 +8222,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
8267
8222
  if (traceEvent) {
8268
8223
  lastToolName = traceEvent.toolName;
8269
8224
  lastFilePath = traceEvent.filePath;
8270
- if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.THINKING) {
8225
+ if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
8271
8226
  lastAction = "Thinking...";
8272
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.TOOL_USE) {
8227
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
8273
8228
  lastAction = extractToolActionDescription(
8274
8229
  traceEvent.toolName,
8275
8230
  traceEvent.toolArgs
8276
8231
  );
8277
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_WRITE) {
8232
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
8278
8233
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
8279
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_READ) {
8234
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
8280
8235
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
8281
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.COMPLETION) {
8236
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
8282
8237
  lastAction = "Processing response...";
8283
8238
  }
8284
8239
  emitTraceEvent(traceEvent, traceContext.pushEvent);
@@ -8456,7 +8411,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8456
8411
  targetId: traceContext.targetId,
8457
8412
  targetName: traceContext.targetName,
8458
8413
  stepNumber: traceStepNumber + 1,
8459
- type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8414
+ type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8460
8415
  outputPreview: JSON.stringify(
8461
8416
  {
8462
8417
  event: "sdk-execution-failed",
@@ -8490,7 +8445,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
8490
8445
  targetId: traceContext.targetId,
8491
8446
  targetName: traceContext.targetName,
8492
8447
  stepNumber: traceStepNumber + 1,
8493
- type: import_evalforge_types5.LiveTraceEventType.COMPLETION,
8448
+ type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
8494
8449
  outputPreview: "Scenario execution completed",
8495
8450
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
8496
8451
  isComplete: true
@@ -8765,7 +8720,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8765
8720
  stepNumber: 0,
8766
8721
  // renumbered below
8767
8722
  turnIndex,
8768
- type: import_evalforge_types5.LLMStepType.THINKING,
8723
+ type: import_evalforge_types4.LLMStepType.THINKING,
8769
8724
  model,
8770
8725
  provider: "anthropic",
8771
8726
  startedAt: step.startedAt.toISOString(),
@@ -8796,7 +8751,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8796
8751
  id: (0, import_crypto2.randomUUID)(),
8797
8752
  stepNumber: 0,
8798
8753
  turnIndex,
8799
- type: import_evalforge_types5.LLMStepType.TOOL_USE,
8754
+ type: import_evalforge_types4.LLMStepType.TOOL_USE,
8800
8755
  model,
8801
8756
  provider: "anthropic",
8802
8757
  startedAt: step.startedAt.toISOString(),
@@ -8826,7 +8781,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8826
8781
  id: (0, import_crypto2.randomUUID)(),
8827
8782
  stepNumber: 0,
8828
8783
  turnIndex,
8829
- type: import_evalforge_types5.LLMStepType.COMPLETION,
8784
+ type: import_evalforge_types4.LLMStepType.COMPLETION,
8830
8785
  model,
8831
8786
  provider: "anthropic",
8832
8787
  startedAt: step.startedAt.toISOString(),
@@ -8843,7 +8798,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8843
8798
  });
8844
8799
  }
8845
8800
  if (subSteps.length === 0) {
8846
- const stepType = hasThinking && !hasText ? import_evalforge_types5.LLMStepType.THINKING : import_evalforge_types5.LLMStepType.COMPLETION;
8801
+ const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
8847
8802
  subSteps.push({
8848
8803
  id: (0, import_crypto2.randomUUID)(),
8849
8804
  stepNumber: 0,
@@ -8913,7 +8868,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8913
8868
  var ClaudeCodeAdapter = class {
8914
8869
  id = "claude-code";
8915
8870
  name = "Claude Code";
8916
- supportedCommands = [import_evalforge_types6.AgentRunCommand.CLAUDE];
8871
+ supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
8917
8872
  /**
8918
8873
  * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
8919
8874
  * before the baseline snapshot is taken.
@@ -8945,9 +8900,9 @@ var ClaudeCodeAdapter = class {
8945
8900
  rules,
8946
8901
  systemPrompt
8947
8902
  } = context;
8948
- const typed = config ? import_evalforge_types6.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8903
+ const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8949
8904
  const cfg = typed?.success ? typed.data : void 0;
8950
- const schemaKeys = new Set(Object.keys(import_evalforge_types6.ClaudeCodeConfigSchema.shape));
8905
+ const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
8951
8906
  const extras = {};
8952
8907
  if (config) {
8953
8908
  for (const [key, value] of Object.entries(config)) {
@@ -9002,11 +8957,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
9002
8957
  defaultRegistry.register(claudeCodeAdapter);
9003
8958
 
9004
8959
  // src/run-scenario/agents/opencode/opencode-adapter.ts
9005
- var import_evalforge_types10 = require("@wix/evalforge-types");
8960
+ var import_evalforge_types9 = require("@wix/evalforge-types");
9006
8961
 
9007
8962
  // src/run-scenario/agents/opencode/execute.ts
9008
8963
  var import_child_process2 = require("child_process");
9009
- var import_evalforge_types9 = require("@wix/evalforge-types");
8964
+ var import_evalforge_types8 = require("@wix/evalforge-types");
9010
8965
 
9011
8966
  // src/run-scenario/agents/opencode/types.ts
9012
8967
  function tryParseJson(text) {
@@ -9020,28 +8975,49 @@ function tryParseJson(text) {
9020
8975
  // src/run-scenario/agents/opencode/write-skills.ts
9021
8976
  var import_promises8 = require("fs/promises");
9022
8977
  var import_path9 = require("path");
9023
- async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
8978
+ var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
8979
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
9024
8980
  await Promise.all(
9025
8981
  skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
9026
8982
  );
9027
8983
  }
9028
8984
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
9029
- const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skill.name);
8985
+ const skillName = skill.name;
8986
+ const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
9030
8987
  await (0, import_promises8.mkdir)(skillDir, { recursive: true });
9031
- try {
9032
- const files = await resolveSkillFiles(skill, fetchFn);
9033
- await writeFilesToDirectory(skillDir, files);
9034
- } catch (error) {
9035
- const message = error instanceof Error ? error.message : "Unknown error";
9036
- throw new Error(
9037
- `Failed to write skill ${skill.name} to filesystem: ${message}`
8988
+ const version = skill.latestVersion;
8989
+ if (version?.files && version.files.length > 0) {
8990
+ await writeFilesToDirectory(skillDir, version.files);
8991
+ console.log(
8992
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
9038
8993
  );
8994
+ } else if (skill.source) {
8995
+ try {
8996
+ const files = await fetchFn(skill.source, {
8997
+ userAgent: "EvalForge-Evaluator"
8998
+ });
8999
+ await writeFilesToDirectory(skillDir, files);
9000
+ console.log(
9001
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
9002
+ );
9003
+ } catch (error) {
9004
+ const message = error instanceof Error ? error.message : "Unknown error";
9005
+ console.error(
9006
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
9007
+ );
9008
+ throw new Error(
9009
+ `Failed to write skill ${skillName} to filesystem: ${message}`
9010
+ );
9011
+ }
9012
+ } else {
9013
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
9039
9014
  }
9040
9015
  }
9041
9016
 
9042
9017
  // src/run-scenario/agents/opencode/write-sub-agents.ts
9043
9018
  var import_promises9 = require("fs/promises");
9044
9019
  var import_path10 = require("path");
9020
+ var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
9045
9021
  var AGENTS_DIR2 = ".opencode/agents";
9046
9022
  function toAgentFilename2(name, index, nameCount) {
9047
9023
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -9049,7 +9025,34 @@ function toAgentFilename2(name, index, nameCount) {
9049
9025
  nameCount.set(base, count + 1);
9050
9026
  return count === 0 ? base : `${base}-${count + 1}`;
9051
9027
  }
9052
- async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9028
+ async function resolveSubAgentContent2(agent, fetchFn) {
9029
+ if (agent.source) {
9030
+ try {
9031
+ const content = await fetchFn(agent.source, {
9032
+ userAgent: "EvalForge-Evaluator"
9033
+ });
9034
+ console.log(
9035
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
9036
+ );
9037
+ return content;
9038
+ } catch (error) {
9039
+ const message = error instanceof Error ? error.message : "Unknown error";
9040
+ console.error(
9041
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
9042
+ );
9043
+ throw new Error(
9044
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
9045
+ );
9046
+ }
9047
+ }
9048
+ if (!agent.subAgentMd) {
9049
+ console.warn(
9050
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
9051
+ );
9052
+ }
9053
+ return agent.subAgentMd;
9054
+ }
9055
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
9053
9056
  if (subAgents.length === 0) return;
9054
9057
  const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
9055
9058
  await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
@@ -9057,7 +9060,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9057
9060
  for (const [i, agent] of subAgents.entries()) {
9058
9061
  const filename = toAgentFilename2(agent.name, i, nameCount);
9059
9062
  const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
9060
- const content = await resolveSubAgentMd(agent, fetchFn);
9063
+ const content = await resolveSubAgentContent2(agent, fetchFn);
9061
9064
  await (0, import_promises9.writeFile)(filePath, content, "utf8");
9062
9065
  }
9063
9066
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -9065,8 +9068,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9065
9068
 
9066
9069
  // src/run-scenario/agents/opencode/config.ts
9067
9070
  var import_os3 = require("os");
9068
- var import_evalforge_types7 = require("@wix/evalforge-types");
9069
- var DEFAULT_MODEL2 = `${import_evalforge_types7.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9071
+ var import_evalforge_types6 = require("@wix/evalforge-types");
9072
+ var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9070
9073
  var OPENCODE_MODEL_ALIASES = {
9071
9074
  "claude-sonnet-4": "claude-sonnet-4-0",
9072
9075
  "claude-opus-4": "claude-opus-4-0"
@@ -9082,10 +9085,10 @@ function parseModel(model) {
9082
9085
  };
9083
9086
  }
9084
9087
  const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
9085
- const isOpenAI = import_evalforge_types7.AVAILABLE_OPENAI_MODEL_IDS.includes(
9088
+ const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
9086
9089
  model
9087
9090
  );
9088
- const isGemini = import_evalforge_types7.AVAILABLE_GEMINI_MODEL_IDS.includes(
9091
+ const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
9089
9092
  model
9090
9093
  );
9091
9094
  if (isGemini) return { providerID: "google", modelID };
@@ -9154,7 +9157,7 @@ async function buildOpenCodeEnv(options) {
9154
9157
  if (options.mcps && options.mcps.length > 0) {
9155
9158
  const mcpServers = {};
9156
9159
  for (const mcpEntity of options.mcps) {
9157
- const entityConfig = await resolveMcpConfig(mcpEntity);
9160
+ const entityConfig = mcpEntity.config;
9158
9161
  for (const [key, value] of Object.entries(entityConfig)) {
9159
9162
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
9160
9163
  throw new Error(
@@ -9179,7 +9182,7 @@ async function buildOpenCodeEnv(options) {
9179
9182
  if (options.maxTurns != null && options.maxTurns > 0) {
9180
9183
  agentOverrides.maxSteps = options.maxTurns;
9181
9184
  }
9182
- const parsed = options.config ? import_evalforge_types7.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9185
+ const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9183
9186
  const configPermission = parsed?.success ? parsed.data.permission : void 0;
9184
9187
  const defaultPermission = {
9185
9188
  "*": "allow"
@@ -9221,7 +9224,7 @@ async function buildOpenCodeEnv(options) {
9221
9224
  }
9222
9225
 
9223
9226
  // src/run-scenario/agents/opencode/build-trace.ts
9224
- var import_evalforge_types8 = require("@wix/evalforge-types");
9227
+ var import_evalforge_types7 = require("@wix/evalforge-types");
9225
9228
  var import_crypto3 = require("crypto");
9226
9229
  function toCanonicalModelId(modelId) {
9227
9230
  const slashIndex = modelId.indexOf("/");
@@ -9301,7 +9304,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9301
9304
  id: (0, import_crypto3.randomUUID)(),
9302
9305
  stepNumber: 0,
9303
9306
  turnIndex,
9304
- type: import_evalforge_types8.LLMStepType.THINKING,
9307
+ type: import_evalforge_types7.LLMStepType.THINKING,
9305
9308
  model: stepModel,
9306
9309
  provider: stepProvider,
9307
9310
  startedAt,
@@ -9330,7 +9333,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9330
9333
  id: (0, import_crypto3.randomUUID)(),
9331
9334
  stepNumber: 0,
9332
9335
  turnIndex,
9333
- type: import_evalforge_types8.LLMStepType.TOOL_USE,
9336
+ type: import_evalforge_types7.LLMStepType.TOOL_USE,
9334
9337
  model: stepModel,
9335
9338
  provider: stepProvider,
9336
9339
  startedAt,
@@ -9360,7 +9363,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9360
9363
  id: (0, import_crypto3.randomUUID)(),
9361
9364
  stepNumber: 0,
9362
9365
  turnIndex,
9363
- type: import_evalforge_types8.LLMStepType.COMPLETION,
9366
+ type: import_evalforge_types7.LLMStepType.COMPLETION,
9364
9367
  model: stepModel,
9365
9368
  provider: stepProvider,
9366
9369
  startedAt,
@@ -9377,7 +9380,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9377
9380
  });
9378
9381
  }
9379
9382
  if (subSteps.length === 0) {
9380
- const stepType = hasThinking && !hasText ? import_evalforge_types8.LLMStepType.THINKING : import_evalforge_types8.LLMStepType.COMPLETION;
9383
+ const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
9381
9384
  subSteps.push({
9382
9385
  id: (0, import_crypto3.randomUUID)(),
9383
9386
  stepNumber: 0,
@@ -9578,14 +9581,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9578
9581
  const te = evt;
9579
9582
  return {
9580
9583
  ...base,
9581
- type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
9584
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
9582
9585
  outputPreview: te.part.text.slice(0, 500)
9583
9586
  };
9584
9587
  }
9585
9588
  case "reasoning":
9586
9589
  return {
9587
9590
  ...base,
9588
- type: import_evalforge_types9.LiveTraceEventType.THINKING,
9591
+ type: import_evalforge_types8.LiveTraceEventType.THINKING,
9589
9592
  thinking: evt.part.text.slice(0, 500)
9590
9593
  };
9591
9594
  case "tool_use": {
@@ -9593,15 +9596,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9593
9596
  const toolName = tu.part.tool;
9594
9597
  const args = tu.part.state.input;
9595
9598
  const toolArgs = JSON.stringify(args).slice(0, 500);
9596
- let type = import_evalforge_types9.LiveTraceEventType.TOOL_USE;
9599
+ let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
9597
9600
  let filePath;
9598
9601
  if (args) {
9599
9602
  if (args.file_path || args.path || args.target_file) {
9600
9603
  filePath = String(args.file_path || args.path || args.target_file);
9601
9604
  if (/write|edit/i.test(toolName)) {
9602
- type = import_evalforge_types9.LiveTraceEventType.FILE_WRITE;
9605
+ type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
9603
9606
  } else if (/read|view/i.test(toolName)) {
9604
- type = import_evalforge_types9.LiveTraceEventType.FILE_READ;
9607
+ type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
9605
9608
  }
9606
9609
  }
9607
9610
  }
@@ -9610,7 +9613,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9610
9613
  case "step_finish":
9611
9614
  return {
9612
9615
  ...base,
9613
- type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9616
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9614
9617
  outputPreview: "Step completed"
9615
9618
  };
9616
9619
  default:
@@ -9641,7 +9644,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
9641
9644
  } else if (options.systemPrompt != null) {
9642
9645
  systemPrompt = options.systemPrompt;
9643
9646
  } else {
9644
- systemPrompt = import_evalforge_types9.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9647
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9645
9648
  }
9646
9649
  if (systemPrompt) {
9647
9650
  await writeSystemPromptRule(cwd, systemPrompt);
@@ -9833,7 +9836,7 @@ function spawnOpenCodeProcess(opts) {
9833
9836
  targetId: traceContext.targetId,
9834
9837
  targetName: traceContext.targetName,
9835
9838
  stepNumber: traceStepNumber,
9836
- type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9839
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9837
9840
  outputPreview: progressMessage,
9838
9841
  toolName: lastToolName,
9839
9842
  filePath: lastFilePath,
@@ -9867,18 +9870,18 @@ function spawnOpenCodeProcess(opts) {
9867
9870
  if (traceEvt) {
9868
9871
  lastToolName = traceEvt.toolName;
9869
9872
  lastFilePath = traceEvt.filePath;
9870
- if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.THINKING) {
9873
+ if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
9871
9874
  lastAction = "Thinking...";
9872
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.TOOL_USE) {
9875
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
9873
9876
  lastAction = extractToolAction(
9874
9877
  traceEvt.toolName ?? "",
9875
9878
  void 0
9876
9879
  );
9877
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_WRITE) {
9880
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
9878
9881
  lastAction = `Writing: ${traceEvt.filePath || "file"}`;
9879
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_READ) {
9882
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
9880
9883
  lastAction = `Reading: ${traceEvt.filePath || "file"}`;
9881
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.COMPLETION) {
9884
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
9882
9885
  lastAction = "Processing response...";
9883
9886
  }
9884
9887
  emitTraceEvent(traceEvt, traceContext.pushEvent);
@@ -9960,7 +9963,7 @@ async function executeWithOpenCode(skills, scenario, options) {
9960
9963
  targetId: traceContext.targetId,
9961
9964
  targetName: traceContext.targetName,
9962
9965
  stepNumber: 0,
9963
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
9966
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
9964
9967
  outputPreview: JSON.stringify({
9965
9968
  event: "pre-cli-execution",
9966
9969
  model: `${providerID}/${modelID}`,
@@ -10014,7 +10017,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10014
10017
  targetId: traceContext.targetId,
10015
10018
  targetName: traceContext.targetName,
10016
10019
  stepNumber: traceStepNumber + 1,
10017
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10020
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10018
10021
  outputPreview: JSON.stringify({
10019
10022
  event: "idle-timeout-retry",
10020
10023
  attempt,
@@ -10058,7 +10061,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10058
10061
  targetId: traceContext.targetId,
10059
10062
  targetName: traceContext.targetName,
10060
10063
  stepNumber: traceStepNumber + 1,
10061
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10064
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10062
10065
  outputPreview: JSON.stringify({
10063
10066
  event: "cli-execution-failed",
10064
10067
  error: lastAttemptResult.error?.message ?? "Unknown error",
@@ -10113,7 +10116,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10113
10116
  targetId: traceContext.targetId,
10114
10117
  targetName: traceContext.targetName,
10115
10118
  stepNumber: traceStepNumber + 1,
10116
- type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
10119
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
10117
10120
  outputPreview: "Scenario execution completed",
10118
10121
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10119
10122
  isComplete: true
@@ -10150,7 +10153,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10150
10153
  var OpenCodeAdapter = class {
10151
10154
  id = "opencode";
10152
10155
  name = "OpenCode";
10153
- supportedCommands = [import_evalforge_types10.AgentRunCommand.OPENCODE];
10156
+ supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
10154
10157
  async prepareEnvironment(context) {
10155
10158
  await prepareOpenCodeEnvironment(context.cwd, context.skills, {
10156
10159
  mcps: context.mcps,
@@ -10173,7 +10176,7 @@ var OpenCodeAdapter = class {
10173
10176
  rules,
10174
10177
  systemPrompt
10175
10178
  } = context;
10176
- const typed = config ? import_evalforge_types10.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10179
+ const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10177
10180
  const cfg = typed?.success ? typed.data : void 0;
10178
10181
  const rawMaxTurns = cfg?.maxTurns;
10179
10182
  const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
@@ -10223,7 +10226,7 @@ var import_ai = require("ai");
10223
10226
  var import_anthropic = require("@ai-sdk/anthropic");
10224
10227
  var import_google = require("@ai-sdk/google");
10225
10228
  var import_openai = require("@ai-sdk/openai");
10226
- var import_evalforge_types12 = require("@wix/evalforge-types");
10229
+ var import_evalforge_types11 = require("@wix/evalforge-types");
10227
10230
  var import_crypto4 = require("crypto");
10228
10231
 
10229
10232
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
@@ -10320,7 +10323,7 @@ function extractErrorText(content) {
10320
10323
  }
10321
10324
 
10322
10325
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
10323
- var import_evalforge_types11 = require("@wix/evalforge-types");
10326
+ var import_evalforge_types10 = require("@wix/evalforge-types");
10324
10327
  var PROVIDER_ANTHROPIC = "anthropic";
10325
10328
  var PROVIDER_GEMINI = "gemini";
10326
10329
  var MODEL_PRICING = {
@@ -10389,7 +10392,7 @@ function extractGatewayCost(step, provider) {
10389
10392
  }
10390
10393
  }
10391
10394
  function calculateFromPricing(modelId, tokenUsage) {
10392
- const normalized = (0, import_evalforge_types11.normalizeModelId)(modelId);
10395
+ const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
10393
10396
  const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
10394
10397
  if (!pricing) return 0;
10395
10398
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
@@ -10482,7 +10485,7 @@ function createModel(modelId, baseUrl, headers) {
10482
10485
  apiKey: "proxy-auth",
10483
10486
  headers
10484
10487
  });
10485
- if ([...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10488
+ if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10486
10489
  (id) => modelId === id || modelId.startsWith(id)
10487
10490
  )) {
10488
10491
  return openai.responses(modelId);
@@ -10490,12 +10493,12 @@ function createModel(modelId, baseUrl, headers) {
10490
10493
  return openai.chat(modelId);
10491
10494
  }
10492
10495
  function isClaudeModelId(modelId) {
10493
- return import_evalforge_types12.AVAILABLE_CLAUDE_MODEL_IDS.some(
10496
+ return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
10494
10497
  (id) => modelId === id || modelId.startsWith(id)
10495
10498
  );
10496
10499
  }
10497
10500
  function isGeminiModelId(modelId) {
10498
- return import_evalforge_types12.AVAILABLE_GEMINI_MODEL_IDS.some(
10501
+ return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
10499
10502
  (id) => modelId === id || modelId.startsWith(id)
10500
10503
  );
10501
10504
  }
@@ -10515,9 +10518,9 @@ async function executeWithAiSdk(context) {
10515
10518
  mcps,
10516
10519
  traceContext
10517
10520
  } = context;
10518
- const typed = config ? import_evalforge_types12.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10521
+ const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10519
10522
  const cfg = typed?.success ? typed.data : void 0;
10520
- const schemaKeys = new Set(Object.keys(import_evalforge_types12.SimpleAgentConfigSchema.shape));
10523
+ const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
10521
10524
  const configExtras = {};
10522
10525
  if (config) {
10523
10526
  for (const [key, value] of Object.entries(config)) {
@@ -10554,11 +10557,11 @@ async function executeWithAiSdk(context) {
10554
10557
  }, SDK_TIMEOUT_MS);
10555
10558
  try {
10556
10559
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
10557
- const isResponsesAPI = [...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10560
+ const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10558
10561
  (id) => modelId === id || modelId.startsWith(id)
10559
10562
  );
10560
10563
  const isGemini = provider === PROVIDER_GEMINI2;
10561
- const isGeminiThinking = isGemini && import_evalforge_types12.GEMINI_THINKING_MODEL_IDS.has(modelId);
10564
+ const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
10562
10565
  const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
10563
10566
  const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
10564
10567
  const reasoningEffort = cfg.reasoningEffort ?? "high";
@@ -10637,7 +10640,7 @@ async function executeWithAiSdk(context) {
10637
10640
  targetId: traceContext.targetId,
10638
10641
  targetName: traceContext.targetName,
10639
10642
  stepNumber: stepTimestamps.length,
10640
- type: isToolStep ? import_evalforge_types12.LiveTraceEventType.TOOL_USE : import_evalforge_types12.LiveTraceEventType.COMPLETION,
10643
+ type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
10641
10644
  toolName: firstToolCall?.toolName,
10642
10645
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
10643
10646
  outputPreview: step.text?.slice(0, 500),
@@ -10842,7 +10845,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
10842
10845
  id: (0, import_crypto4.randomUUID)(),
10843
10846
  stepNumber: i + 1,
10844
10847
  turnIndex: i,
10845
- type: step.toolCalls.length > 0 ? import_evalforge_types12.LLMStepType.TOOL_USE : import_evalforge_types12.LLMStepType.COMPLETION,
10848
+ type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
10846
10849
  model: modelId,
10847
10850
  provider,
10848
10851
  startedAt: new Date(stepStartedAt).toISOString(),
@@ -10892,7 +10895,7 @@ function emitStartEvent(traceContext, startTime) {
10892
10895
  targetId: traceContext.targetId,
10893
10896
  targetName: traceContext.targetName,
10894
10897
  stepNumber: 0,
10895
- type: import_evalforge_types12.LiveTraceEventType.PROGRESS,
10898
+ type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
10896
10899
  outputPreview: "Starting Simple Agent execution...",
10897
10900
  elapsedMs: Date.now() - startTime,
10898
10901
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -10910,7 +10913,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
10910
10913
  targetId: traceContext.targetId,
10911
10914
  targetName: traceContext.targetName,
10912
10915
  stepNumber,
10913
- type: import_evalforge_types12.LiveTraceEventType.COMPLETION,
10916
+ type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
10914
10917
  outputPreview: "Scenario execution completed",
10915
10918
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10916
10919
  isComplete: true
@@ -11680,11 +11683,11 @@ function substituteVariables(prompt, variables) {
11680
11683
  }
11681
11684
 
11682
11685
  // src/run-scenario/run-agent-with-context.ts
11683
- var import_evalforge_types13 = require("@wix/evalforge-types");
11684
- var DEFAULT_AGENT_COMMAND = import_evalforge_types13.AgentRunCommand.CLAUDE;
11686
+ var import_evalforge_types12 = require("@wix/evalforge-types");
11687
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
11685
11688
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
11686
11689
  const agent = evalData.agent ?? void 0;
11687
- const isSDK = agent?.agentType === import_evalforge_types13.AgentType.SDK;
11690
+ const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
11688
11691
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
11689
11692
  const adapter = getAdapter(identifier);
11690
11693
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -11769,14 +11772,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11769
11772
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
11770
11773
  if (template) {
11771
11774
  console.log(
11772
- (0, import_evalforge_types14.formatTraceEventLine)({
11775
+ (0, import_evalforge_types13.formatTraceEventLine)({
11773
11776
  evalRunId: evalRunId2,
11774
11777
  scenarioId: scenario.id,
11775
11778
  scenarioName: scenario.name,
11776
11779
  targetId,
11777
11780
  targetName,
11778
11781
  stepNumber: 0,
11779
- type: import_evalforge_types14.LiveTraceEventType.PROGRESS,
11782
+ type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
11780
11783
  outputPreview: "Setting up environment (installing dependencies)...",
11781
11784
  elapsedMs: 0,
11782
11785
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -11816,7 +11819,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11816
11819
  })),
11817
11820
  durationMs: partialResult.duration
11818
11821
  };
11819
- const defaultJudgeModel = import_evalforge_types14.DEFAULT_JUDGE_MODEL;
11822
+ const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
11820
11823
  const assertionContext = {
11821
11824
  workDir,
11822
11825
  defaultJudgeModel,
@@ -11831,10 +11834,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11831
11834
  assertionContext
11832
11835
  ) : [];
11833
11836
  const passed = assertionResults.filter(
11834
- (r) => r.status === import_evalforge_types14.AssertionResultStatus.PASSED
11837
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
11835
11838
  ).length;
11836
11839
  const failed = assertionResults.filter(
11837
- (r) => r.status === import_evalforge_types14.AssertionResultStatus.FAILED
11840
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
11838
11841
  ).length;
11839
11842
  const total = assertionResults.length;
11840
11843
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -11910,7 +11913,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
11910
11913
  }
11911
11914
 
11912
11915
  // src/error-reporter.ts
11913
- var import_evalforge_types15 = require("@wix/evalforge-types");
11916
+ var import_evalforge_types14 = require("@wix/evalforge-types");
11914
11917
  function formatError(error, phase, context) {
11915
11918
  const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
11916
11919
  if (error instanceof Error) {
@@ -12153,7 +12156,7 @@ async function runEvaluation(projectId2, evalRunId2) {
12153
12156
  totalExecutions
12154
12157
  };
12155
12158
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
12156
- const finalStatus = allFailed ? import_evalforge_types16.EvalStatus.FAILED : import_evalforge_types16.EvalStatus.COMPLETED;
12159
+ const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
12157
12160
  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
12158
12161
  firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
12159
12162
  ) : void 0;
@@ -12207,7 +12210,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12207
12210
  grpcAuthToken: config.grpcAuthToken
12208
12211
  });
12209
12212
  await api.updateEvalRun(projectId, evalRunId, {
12210
- status: import_evalforge_types16.EvalStatus.FAILED,
12213
+ status: import_evalforge_types15.EvalStatus.FAILED,
12211
12214
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12212
12215
  jobError,
12213
12216
  jobStatus: "FAILED"
@@ -12232,7 +12235,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12232
12235
  grpcAuthToken
12233
12236
  });
12234
12237
  await api.updateEvalRun(projectId, evalRunId, {
12235
- status: import_evalforge_types16.EvalStatus.FAILED,
12238
+ status: import_evalforge_types15.EvalStatus.FAILED,
12236
12239
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12237
12240
  jobError: `Config load failed, then: ${jobError}`,
12238
12241
  jobStatus: "FAILED"