@wix/evalforge-evaluator 0.183.0 → 0.185.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
5226
5226
  });
5227
5227
 
5228
5228
  // src/index.ts
5229
- var import_evalforge_types16 = require("@wix/evalforge-types");
5229
+ var import_evalforge_types15 = require("@wix/evalforge-types");
5230
5230
 
5231
5231
  // src/config.ts
5232
5232
  function loadConfig() {
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
7115
7115
  }
7116
7116
 
7117
7117
  // src/run-scenario/index.ts
7118
- var import_evalforge_types14 = require("@wix/evalforge-types");
7118
+ var import_evalforge_types13 = require("@wix/evalforge-types");
7119
7119
  var import_eval_assertions = require("@wix/eval-assertions");
7120
7120
 
7121
7121
  // src/run-scenario/environment.ts
@@ -7451,122 +7451,50 @@ function getAdapter(identifier) {
7451
7451
  }
7452
7452
 
7453
7453
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
7454
- var import_evalforge_types6 = require("@wix/evalforge-types");
7454
+ var import_evalforge_types5 = require("@wix/evalforge-types");
7455
7455
 
7456
7456
  // src/run-scenario/agents/claude-code/execute.ts
7457
- var import_evalforge_types5 = require("@wix/evalforge-types");
7457
+ var import_evalforge_types4 = require("@wix/evalforge-types");
7458
7458
 
7459
7459
  // src/run-scenario/agents/claude-code/write-skills.ts
7460
7460
  var import_promises3 = require("fs/promises");
7461
7461
  var import_path4 = require("path");
7462
-
7463
- // src/run-scenario/agents/shared/resolve-capability-content.ts
7464
7462
  var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
7465
- var import_evalforge_types2 = require("@wix/evalforge-types");
7466
- var USER_AGENT = "EvalForge-Evaluator";
7467
- async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7468
- const version = skill.latestVersion;
7469
- if (version?.files && version.files.length > 0) {
7470
- console.log(
7471
- `[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
7472
- );
7473
- return version.files;
7474
- }
7475
- if (skill.source) {
7476
- const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
7477
- console.log(
7478
- `[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
7479
- );
7480
- return files;
7481
- }
7482
- throw new Error(`Skill ${skill.name} has no files and no source configured`);
7483
- }
7484
- async function fetchSourceFile(label, noun, name, source, fetchFn) {
7485
- try {
7486
- const content = await fetchFn(source, { userAgent: USER_AGENT });
7487
- console.log(
7488
- `[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
7489
- );
7490
- return content;
7491
- } catch (error) {
7492
- const message = error instanceof Error ? error.message : "Unknown error";
7493
- console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
7494
- throw new Error(
7495
- `Failed to fetch ${noun} "${name}" from GitHub: ${message}`
7496
- );
7497
- }
7498
- }
7499
- async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7500
- if (agent.source) {
7501
- return fetchSourceFile(
7502
- "SubAgents",
7503
- "sub-agent",
7504
- agent.name,
7505
- agent.source,
7506
- fetchFn
7507
- );
7508
- }
7509
- if (!agent.subAgentMd) {
7510
- console.warn(
7511
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7512
- );
7513
- }
7514
- return agent.subAgentMd;
7515
- }
7516
- async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7517
- if (!rule.source) {
7518
- return rule.content;
7519
- }
7520
- return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
7521
- }
7522
- async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
7523
- if (!mcp.source) {
7524
- return mcp.config;
7525
- }
7526
- const raw = await fetchSourceFile(
7527
- "MCP",
7528
- "MCP",
7529
- mcp.name,
7530
- mcp.source,
7531
- fetchFn
7532
- );
7533
- let parsed;
7534
- try {
7535
- parsed = JSON.parse(raw);
7536
- } catch (error) {
7537
- const message = error instanceof Error ? error.message : "Unknown error";
7538
- throw new Error(
7539
- `MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
7540
- );
7541
- }
7542
- if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
7543
- throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
7544
- }
7545
- const obj = parsed;
7546
- const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
7547
- if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
7548
- return servers;
7549
- }
7550
- return obj;
7551
- }
7552
-
7553
- // src/run-scenario/agents/claude-code/write-skills.ts
7554
- async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
7463
+ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7555
7464
  await Promise.all(
7556
7465
  skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
7557
7466
  );
7558
7467
  }
7559
- async function writeSkillToFilesystem(cwd, skill, fetchFn) {
7560
- const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skill.name);
7468
+ async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
7469
+ const skillName = skill.name;
7470
+ const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
7561
7471
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7562
- try {
7563
- const files = await resolveSkillFiles(skill, fetchFn);
7564
- await writeFilesToDirectory(skillDir, files);
7565
- } catch (error) {
7566
- const message = error instanceof Error ? error.message : "Unknown error";
7567
- throw new Error(
7568
- `Failed to write skill ${skill.name} to filesystem: ${message}`
7472
+ const version = skill.latestVersion;
7473
+ if (version?.files && version.files.length > 0) {
7474
+ await writeFilesToDirectory(skillDir, version.files);
7475
+ console.log(
7476
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
7569
7477
  );
7478
+ } else if (skill.source) {
7479
+ try {
7480
+ const files = await fetchFn(skill.source, {
7481
+ userAgent: "EvalForge-Evaluator"
7482
+ });
7483
+ await writeFilesToDirectory(skillDir, files);
7484
+ console.log(
7485
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
7486
+ );
7487
+ } catch (error) {
7488
+ const message = error instanceof Error ? error.message : "Unknown error";
7489
+ console.error(
7490
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
7491
+ );
7492
+ throw new Error(
7493
+ `Failed to write skill ${skillName} to filesystem: ${message}`
7494
+ );
7495
+ }
7496
+ } else {
7497
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
7570
7498
  }
7571
7499
  }
7572
7500
 
@@ -7584,7 +7512,7 @@ var import_crypto2 = require("crypto");
7584
7512
  // src/run-scenario/agents/claude-code/write-mcp.ts
7585
7513
  var import_promises5 = require("fs/promises");
7586
7514
  var import_path6 = require("path");
7587
- var import_evalforge_types3 = require("@wix/evalforge-types");
7515
+ var import_evalforge_types2 = require("@wix/evalforge-types");
7588
7516
 
7589
7517
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
7590
7518
  var import_promises4 = require("fs/promises");
@@ -7629,11 +7557,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
7629
7557
  }
7630
7558
 
7631
7559
  // src/run-scenario/agents/claude-code/write-mcp.ts
7632
- async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7560
+ async function writeMcpToFilesystem(cwd, mcps) {
7633
7561
  if (mcps.length === 0) return;
7634
7562
  const mcpServers = {};
7635
7563
  for (const mcp of mcps) {
7636
- const config = await resolveMcpConfig(mcp, fetchFn);
7564
+ const config = mcp.config;
7637
7565
  for (const [key, value] of Object.entries(config)) {
7638
7566
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
7639
7567
  throw new Error(
@@ -7645,7 +7573,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7645
7573
  }
7646
7574
  const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
7647
7575
  const content = JSON.stringify(
7648
- { [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
7576
+ { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
7649
7577
  null,
7650
7578
  2
7651
7579
  );
@@ -7657,6 +7585,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7657
7585
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
7658
7586
  var import_promises6 = require("fs/promises");
7659
7587
  var import_path7 = require("path");
7588
+ var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
7660
7589
  var AGENTS_DIR = ".claude/agents";
7661
7590
  function toAgentFilename(name, index, nameCount) {
7662
7591
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -7664,7 +7593,34 @@ function toAgentFilename(name, index, nameCount) {
7664
7593
  nameCount.set(base, count + 1);
7665
7594
  return count === 0 ? base : `${base}-${count + 1}`;
7666
7595
  }
7667
- async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7596
+ async function resolveSubAgentContent(agent, fetchFn) {
7597
+ if (agent.source) {
7598
+ try {
7599
+ const content = await fetchFn(agent.source, {
7600
+ userAgent: "EvalForge-Evaluator"
7601
+ });
7602
+ console.log(
7603
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
7604
+ );
7605
+ return content;
7606
+ } catch (error) {
7607
+ const message = error instanceof Error ? error.message : "Unknown error";
7608
+ console.error(
7609
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
7610
+ );
7611
+ throw new Error(
7612
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
7613
+ );
7614
+ }
7615
+ }
7616
+ if (!agent.subAgentMd) {
7617
+ console.warn(
7618
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7619
+ );
7620
+ }
7621
+ return agent.subAgentMd;
7622
+ }
7623
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
7668
7624
  if (subAgents.length === 0) return;
7669
7625
  const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
7670
7626
  await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
@@ -7672,7 +7628,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7672
7628
  for (const [i, agent] of subAgents.entries()) {
7673
7629
  const filename = toAgentFilename(agent.name, i, nameCount);
7674
7630
  const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
7675
- const content = await resolveSubAgentMd(agent, fetchFn);
7631
+ const content = await resolveSubAgentContent(agent, fetchFn);
7676
7632
  await (0, import_promises6.writeFile)(filePath, content, "utf8");
7677
7633
  }
7678
7634
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -7722,19 +7678,18 @@ function validateGenericDirectory(dir, cwd) {
7722
7678
  }
7723
7679
  return trimmed;
7724
7680
  }
7725
- async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7681
+ async function writeRulesToFilesystem(cwd, rules) {
7726
7682
  if (rules.length === 0) return;
7727
7683
  const nameCount = /* @__PURE__ */ new Map();
7728
7684
  let hasCursorRules = false;
7729
7685
  for (const [i, rule] of rules.entries()) {
7730
- const content = await resolveRuleText(rule, fetchFn);
7731
7686
  switch (rule.ruleType) {
7732
7687
  case "claude-md": {
7733
- await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
7688
+ await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
7734
7689
  break;
7735
7690
  }
7736
7691
  case "agents-md": {
7737
- await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
7692
+ await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
7738
7693
  break;
7739
7694
  }
7740
7695
  case "cursor-rule": {
@@ -7744,7 +7699,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7744
7699
  }
7745
7700
  const filename = toRuleFilename(rule.name, i, nameCount);
7746
7701
  const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
7747
- await (0, import_promises7.writeFile)(filePath, content, "utf8");
7702
+ await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
7748
7703
  break;
7749
7704
  }
7750
7705
  case "generic": {
@@ -7755,7 +7710,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7755
7710
  const dirPath = (0, import_path8.join)(cwd, directory);
7756
7711
  await (0, import_promises7.mkdir)(dirPath, { recursive: true });
7757
7712
  const filename = toRuleFilename(rule.name, i, nameCount);
7758
- await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
7713
+ await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
7759
7714
  break;
7760
7715
  }
7761
7716
  default: {
@@ -7845,14 +7800,14 @@ function buildConversation(timestampedMessages) {
7845
7800
  }
7846
7801
 
7847
7802
  // src/run-scenario/agents/shared/trace-emit.ts
7848
- var import_evalforge_types4 = require("@wix/evalforge-types");
7803
+ var import_evalforge_types3 = require("@wix/evalforge-types");
7849
7804
  function emitTraceEvent(event, pushEvent) {
7850
- console.log(`${import_evalforge_types4.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7805
+ console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
7851
7806
  pushEvent?.(event);
7852
7807
  }
7853
7808
 
7854
7809
  // src/run-scenario/agents/claude-code/execute.ts
7855
- var DEFAULT_MODEL = import_evalforge_types5.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7810
+ var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
7856
7811
  async function* buildPromptStream(triggerPrompt, images) {
7857
7812
  yield {
7858
7813
  type: "user",
@@ -7917,7 +7872,7 @@ function extractToolActionDescription(toolName, toolArgs) {
7917
7872
  return `Using ${toolName}...`;
7918
7873
  }
7919
7874
  function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7920
- let type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7875
+ let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7921
7876
  let toolName;
7922
7877
  let toolArgs;
7923
7878
  let outputPreview;
@@ -7925,28 +7880,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
7925
7880
  let thinking;
7926
7881
  for (const block of message.message.content) {
7927
7882
  if (block.type === "tool_use") {
7928
- type = import_evalforge_types5.LiveTraceEventType.TOOL_USE;
7883
+ type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
7929
7884
  toolName = block.name;
7930
7885
  toolArgs = JSON.stringify(block.input).slice(0, 500);
7931
7886
  const input = block.input;
7932
7887
  if (input.file_path || input.path || input.target_file) {
7933
7888
  filePath = String(input.file_path || input.path || input.target_file);
7934
7889
  if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
7935
- type = import_evalforge_types5.LiveTraceEventType.FILE_WRITE;
7890
+ type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
7936
7891
  } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
7937
- type = import_evalforge_types5.LiveTraceEventType.FILE_READ;
7892
+ type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
7938
7893
  }
7939
7894
  }
7940
7895
  } else if (block.type === "text") {
7941
7896
  outputPreview = block.text.slice(0, 500);
7942
7897
  if (!toolName) {
7943
- type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
7898
+ type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
7944
7899
  }
7945
7900
  } else if (block.type === "thinking") {
7946
7901
  const thinkingBlock = block;
7947
7902
  thinking = thinkingBlock.thinking.slice(0, 500);
7948
7903
  if (!outputPreview && !toolName) {
7949
- type = import_evalforge_types5.LiveTraceEventType.THINKING;
7904
+ type = import_evalforge_types4.LiveTraceEventType.THINKING;
7950
7905
  }
7951
7906
  }
7952
7907
  }
@@ -8012,7 +7967,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8012
7967
  }
8013
7968
  return {
8014
7969
  ...baseEvent,
8015
- type: import_evalforge_types5.LiveTraceEventType.TOOL_RESULT,
7970
+ type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
8016
7971
  outputPreview: outputPreview || "(tool result)"
8017
7972
  };
8018
7973
  }
@@ -8020,7 +7975,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8020
7975
  const sysMsg = message;
8021
7976
  return {
8022
7977
  ...baseEvent,
8023
- type: import_evalforge_types5.LiveTraceEventType.SYSTEM,
7978
+ type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
8024
7979
  outputPreview: sysMsg.subtype || "system"
8025
7980
  };
8026
7981
  }
@@ -8029,7 +7984,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
8029
7984
  }
8030
7985
  return {
8031
7986
  ...baseEvent,
8032
- type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
7987
+ type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8033
7988
  outputPreview: `Message type: ${message.type}`
8034
7989
  };
8035
7990
  }
@@ -8131,7 +8086,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8131
8086
  queryOptions.systemPrompt = {
8132
8087
  type: "preset",
8133
8088
  preset: "claude_code",
8134
- append: import_evalforge_types5.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8089
+ append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
8135
8090
  };
8136
8091
  }
8137
8092
  if (options.temperature !== void 0) {
@@ -8166,7 +8121,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8166
8121
  targetId: traceContext.targetId,
8167
8122
  targetName: traceContext.targetName,
8168
8123
  stepNumber: 0,
8169
- type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8124
+ type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8170
8125
  outputPreview: JSON.stringify({
8171
8126
  event: "pre-sdk-execution",
8172
8127
  model: queryOptions.model,
@@ -8230,7 +8185,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8230
8185
  targetId: traceContext.targetId,
8231
8186
  targetName: traceContext.targetName,
8232
8187
  stepNumber: traceStepNumber,
8233
- type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
8188
+ type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
8234
8189
  outputPreview: progressMessage,
8235
8190
  toolName: lastToolName,
8236
8191
  filePath: lastFilePath,
@@ -8267,18 +8222,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
8267
8222
  if (traceEvent) {
8268
8223
  lastToolName = traceEvent.toolName;
8269
8224
  lastFilePath = traceEvent.filePath;
8270
- if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.THINKING) {
8225
+ if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
8271
8226
  lastAction = "Thinking...";
8272
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.TOOL_USE) {
8227
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
8273
8228
  lastAction = extractToolActionDescription(
8274
8229
  traceEvent.toolName,
8275
8230
  traceEvent.toolArgs
8276
8231
  );
8277
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_WRITE) {
8232
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
8278
8233
  lastAction = `Writing: ${traceEvent.filePath || "file"}`;
8279
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_READ) {
8234
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
8280
8235
  lastAction = `Reading: ${traceEvent.filePath || "file"}`;
8281
- } else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.COMPLETION) {
8236
+ } else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
8282
8237
  lastAction = "Processing response...";
8283
8238
  }
8284
8239
  emitTraceEvent(traceEvent, traceContext.pushEvent);
@@ -8456,7 +8411,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
8456
8411
  targetId: traceContext.targetId,
8457
8412
  targetName: traceContext.targetName,
8458
8413
  stepNumber: traceStepNumber + 1,
8459
- type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
8414
+ type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
8460
8415
  outputPreview: JSON.stringify(
8461
8416
  {
8462
8417
  event: "sdk-execution-failed",
@@ -8490,7 +8445,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
8490
8445
  targetId: traceContext.targetId,
8491
8446
  targetName: traceContext.targetName,
8492
8447
  stepNumber: traceStepNumber + 1,
8493
- type: import_evalforge_types5.LiveTraceEventType.COMPLETION,
8448
+ type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
8494
8449
  outputPreview: "Scenario execution completed",
8495
8450
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
8496
8451
  isComplete: true
@@ -8670,9 +8625,12 @@ function processMessages(timestampedMessages, startTime, endTime) {
8670
8625
  if (!step.toolCalls) continue;
8671
8626
  for (const tc of step.toolCalls) {
8672
8627
  if (tc.toolUseId && toolResultErrors.has(tc.toolUseId)) {
8673
- step.hasToolError = true;
8674
- step.toolErrorContent = toolResultErrors.get(tc.toolUseId);
8675
- break;
8628
+ tc.isError = true;
8629
+ tc.errorContent = toolResultErrors.get(tc.toolUseId);
8630
+ if (!step.hasToolError) {
8631
+ step.hasToolError = true;
8632
+ step.toolErrorContent = tc.errorContent;
8633
+ }
8676
8634
  }
8677
8635
  }
8678
8636
  }
@@ -8762,7 +8720,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8762
8720
  stepNumber: 0,
8763
8721
  // renumbered below
8764
8722
  turnIndex,
8765
- type: import_evalforge_types5.LLMStepType.THINKING,
8723
+ type: import_evalforge_types4.LLMStepType.THINKING,
8766
8724
  model,
8767
8725
  provider: "anthropic",
8768
8726
  startedAt: step.startedAt.toISOString(),
@@ -8776,8 +8734,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8776
8734
  },
8777
8735
  costUsd: stepCost / totalSubSteps,
8778
8736
  outputPreview: step.thinking?.slice(0, 200),
8779
- success: isSuccess,
8780
- error: errorMsg
8737
+ success: true,
8738
+ error: void 0
8781
8739
  });
8782
8740
  }
8783
8741
  if (toolCallCount > 0) {
@@ -8787,11 +8745,13 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8787
8745
  const toolBudgetSteps = toolSubSteps + textSubSteps;
8788
8746
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
8789
8747
  const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
8748
+ const toolSuccess = !tc.isError;
8749
+ const toolError = tc.isError ? tc.errorContent ?? "Tool call failed" : void 0;
8790
8750
  subSteps.push({
8791
8751
  id: (0, import_crypto2.randomUUID)(),
8792
8752
  stepNumber: 0,
8793
8753
  turnIndex,
8794
- type: import_evalforge_types5.LLMStepType.TOOL_USE,
8754
+ type: import_evalforge_types4.LLMStepType.TOOL_USE,
8795
8755
  model,
8796
8756
  provider: "anthropic",
8797
8757
  startedAt: step.startedAt.toISOString(),
@@ -8811,8 +8771,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8811
8771
  toolName: tc.toolName,
8812
8772
  toolArguments: JSON.stringify(tc.args),
8813
8773
  outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
8814
- success: isSuccess,
8815
- error: errorMsg
8774
+ success: toolSuccess,
8775
+ error: toolError
8816
8776
  });
8817
8777
  }
8818
8778
  }
@@ -8821,7 +8781,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8821
8781
  id: (0, import_crypto2.randomUUID)(),
8822
8782
  stepNumber: 0,
8823
8783
  turnIndex,
8824
- type: import_evalforge_types5.LLMStepType.COMPLETION,
8784
+ type: import_evalforge_types4.LLMStepType.COMPLETION,
8825
8785
  model,
8826
8786
  provider: "anthropic",
8827
8787
  startedAt: step.startedAt.toISOString(),
@@ -8833,12 +8793,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8833
8793
  },
8834
8794
  costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
8835
8795
  outputPreview: step.text?.slice(0, 200),
8836
- success: isSuccess,
8837
- error: errorMsg
8796
+ success: true,
8797
+ error: void 0
8838
8798
  });
8839
8799
  }
8840
8800
  if (subSteps.length === 0) {
8841
- const stepType = hasThinking && !hasText ? import_evalforge_types5.LLMStepType.THINKING : import_evalforge_types5.LLMStepType.COMPLETION;
8801
+ const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
8842
8802
  subSteps.push({
8843
8803
  id: (0, import_crypto2.randomUUID)(),
8844
8804
  stepNumber: 0,
@@ -8908,7 +8868,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8908
8868
  var ClaudeCodeAdapter = class {
8909
8869
  id = "claude-code";
8910
8870
  name = "Claude Code";
8911
- supportedCommands = [import_evalforge_types6.AgentRunCommand.CLAUDE];
8871
+ supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
8912
8872
  /**
8913
8873
  * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
8914
8874
  * before the baseline snapshot is taken.
@@ -8940,9 +8900,9 @@ var ClaudeCodeAdapter = class {
8940
8900
  rules,
8941
8901
  systemPrompt
8942
8902
  } = context;
8943
- const typed = config ? import_evalforge_types6.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8903
+ const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
8944
8904
  const cfg = typed?.success ? typed.data : void 0;
8945
- const schemaKeys = new Set(Object.keys(import_evalforge_types6.ClaudeCodeConfigSchema.shape));
8905
+ const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
8946
8906
  const extras = {};
8947
8907
  if (config) {
8948
8908
  for (const [key, value] of Object.entries(config)) {
@@ -8997,11 +8957,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
8997
8957
  defaultRegistry.register(claudeCodeAdapter);
8998
8958
 
8999
8959
  // src/run-scenario/agents/opencode/opencode-adapter.ts
9000
- var import_evalforge_types10 = require("@wix/evalforge-types");
8960
+ var import_evalforge_types9 = require("@wix/evalforge-types");
9001
8961
 
9002
8962
  // src/run-scenario/agents/opencode/execute.ts
9003
8963
  var import_child_process2 = require("child_process");
9004
- var import_evalforge_types9 = require("@wix/evalforge-types");
8964
+ var import_evalforge_types8 = require("@wix/evalforge-types");
9005
8965
 
9006
8966
  // src/run-scenario/agents/opencode/types.ts
9007
8967
  function tryParseJson(text) {
@@ -9015,28 +8975,49 @@ function tryParseJson(text) {
9015
8975
  // src/run-scenario/agents/opencode/write-skills.ts
9016
8976
  var import_promises8 = require("fs/promises");
9017
8977
  var import_path9 = require("path");
9018
- async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
8978
+ var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
8979
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
9019
8980
  await Promise.all(
9020
8981
  skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
9021
8982
  );
9022
8983
  }
9023
8984
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
9024
- const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skill.name);
8985
+ const skillName = skill.name;
8986
+ const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
9025
8987
  await (0, import_promises8.mkdir)(skillDir, { recursive: true });
9026
- try {
9027
- const files = await resolveSkillFiles(skill, fetchFn);
9028
- await writeFilesToDirectory(skillDir, files);
9029
- } catch (error) {
9030
- const message = error instanceof Error ? error.message : "Unknown error";
9031
- throw new Error(
9032
- `Failed to write skill ${skill.name} to filesystem: ${message}`
8988
+ const version = skill.latestVersion;
8989
+ if (version?.files && version.files.length > 0) {
8990
+ await writeFilesToDirectory(skillDir, version.files);
8991
+ console.log(
8992
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
9033
8993
  );
8994
+ } else if (skill.source) {
8995
+ try {
8996
+ const files = await fetchFn(skill.source, {
8997
+ userAgent: "EvalForge-Evaluator"
8998
+ });
8999
+ await writeFilesToDirectory(skillDir, files);
9000
+ console.log(
9001
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
9002
+ );
9003
+ } catch (error) {
9004
+ const message = error instanceof Error ? error.message : "Unknown error";
9005
+ console.error(
9006
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
9007
+ );
9008
+ throw new Error(
9009
+ `Failed to write skill ${skillName} to filesystem: ${message}`
9010
+ );
9011
+ }
9012
+ } else {
9013
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
9034
9014
  }
9035
9015
  }
9036
9016
 
9037
9017
  // src/run-scenario/agents/opencode/write-sub-agents.ts
9038
9018
  var import_promises9 = require("fs/promises");
9039
9019
  var import_path10 = require("path");
9020
+ var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
9040
9021
  var AGENTS_DIR2 = ".opencode/agents";
9041
9022
  function toAgentFilename2(name, index, nameCount) {
9042
9023
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -9044,7 +9025,34 @@ function toAgentFilename2(name, index, nameCount) {
9044
9025
  nameCount.set(base, count + 1);
9045
9026
  return count === 0 ? base : `${base}-${count + 1}`;
9046
9027
  }
9047
- async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9028
+ async function resolveSubAgentContent2(agent, fetchFn) {
9029
+ if (agent.source) {
9030
+ try {
9031
+ const content = await fetchFn(agent.source, {
9032
+ userAgent: "EvalForge-Evaluator"
9033
+ });
9034
+ console.log(
9035
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
9036
+ );
9037
+ return content;
9038
+ } catch (error) {
9039
+ const message = error instanceof Error ? error.message : "Unknown error";
9040
+ console.error(
9041
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
9042
+ );
9043
+ throw new Error(
9044
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
9045
+ );
9046
+ }
9047
+ }
9048
+ if (!agent.subAgentMd) {
9049
+ console.warn(
9050
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
9051
+ );
9052
+ }
9053
+ return agent.subAgentMd;
9054
+ }
9055
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
9048
9056
  if (subAgents.length === 0) return;
9049
9057
  const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
9050
9058
  await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
@@ -9052,7 +9060,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9052
9060
  for (const [i, agent] of subAgents.entries()) {
9053
9061
  const filename = toAgentFilename2(agent.name, i, nameCount);
9054
9062
  const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
9055
- const content = await resolveSubAgentMd(agent, fetchFn);
9063
+ const content = await resolveSubAgentContent2(agent, fetchFn);
9056
9064
  await (0, import_promises9.writeFile)(filePath, content, "utf8");
9057
9065
  }
9058
9066
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -9060,8 +9068,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9060
9068
 
9061
9069
  // src/run-scenario/agents/opencode/config.ts
9062
9070
  var import_os3 = require("os");
9063
- var import_evalforge_types7 = require("@wix/evalforge-types");
9064
- var DEFAULT_MODEL2 = `${import_evalforge_types7.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9071
+ var import_evalforge_types6 = require("@wix/evalforge-types");
9072
+ var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
9065
9073
  var OPENCODE_MODEL_ALIASES = {
9066
9074
  "claude-sonnet-4": "claude-sonnet-4-0",
9067
9075
  "claude-opus-4": "claude-opus-4-0"
@@ -9077,10 +9085,10 @@ function parseModel(model) {
9077
9085
  };
9078
9086
  }
9079
9087
  const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
9080
- const isOpenAI = import_evalforge_types7.AVAILABLE_OPENAI_MODEL_IDS.includes(
9088
+ const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
9081
9089
  model
9082
9090
  );
9083
- const isGemini = import_evalforge_types7.AVAILABLE_GEMINI_MODEL_IDS.includes(
9091
+ const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
9084
9092
  model
9085
9093
  );
9086
9094
  if (isGemini) return { providerID: "google", modelID };
@@ -9149,7 +9157,7 @@ async function buildOpenCodeEnv(options) {
9149
9157
  if (options.mcps && options.mcps.length > 0) {
9150
9158
  const mcpServers = {};
9151
9159
  for (const mcpEntity of options.mcps) {
9152
- const entityConfig = await resolveMcpConfig(mcpEntity);
9160
+ const entityConfig = mcpEntity.config;
9153
9161
  for (const [key, value] of Object.entries(entityConfig)) {
9154
9162
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
9155
9163
  throw new Error(
@@ -9174,7 +9182,7 @@ async function buildOpenCodeEnv(options) {
9174
9182
  if (options.maxTurns != null && options.maxTurns > 0) {
9175
9183
  agentOverrides.maxSteps = options.maxTurns;
9176
9184
  }
9177
- const parsed = options.config ? import_evalforge_types7.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9185
+ const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
9178
9186
  const configPermission = parsed?.success ? parsed.data.permission : void 0;
9179
9187
  const defaultPermission = {
9180
9188
  "*": "allow"
@@ -9216,7 +9224,7 @@ async function buildOpenCodeEnv(options) {
9216
9224
  }
9217
9225
 
9218
9226
  // src/run-scenario/agents/opencode/build-trace.ts
9219
- var import_evalforge_types8 = require("@wix/evalforge-types");
9227
+ var import_evalforge_types7 = require("@wix/evalforge-types");
9220
9228
  var import_crypto3 = require("crypto");
9221
9229
  function toCanonicalModelId(modelId) {
9222
9230
  const slashIndex = modelId.indexOf("/");
@@ -9296,7 +9304,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9296
9304
  id: (0, import_crypto3.randomUUID)(),
9297
9305
  stepNumber: 0,
9298
9306
  turnIndex,
9299
- type: import_evalforge_types8.LLMStepType.THINKING,
9307
+ type: import_evalforge_types7.LLMStepType.THINKING,
9300
9308
  model: stepModel,
9301
9309
  provider: stepProvider,
9302
9310
  startedAt,
@@ -9325,7 +9333,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9325
9333
  id: (0, import_crypto3.randomUUID)(),
9326
9334
  stepNumber: 0,
9327
9335
  turnIndex,
9328
- type: import_evalforge_types8.LLMStepType.TOOL_USE,
9336
+ type: import_evalforge_types7.LLMStepType.TOOL_USE,
9329
9337
  model: stepModel,
9330
9338
  provider: stepProvider,
9331
9339
  startedAt,
@@ -9355,7 +9363,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9355
9363
  id: (0, import_crypto3.randomUUID)(),
9356
9364
  stepNumber: 0,
9357
9365
  turnIndex,
9358
- type: import_evalforge_types8.LLMStepType.COMPLETION,
9366
+ type: import_evalforge_types7.LLMStepType.COMPLETION,
9359
9367
  model: stepModel,
9360
9368
  provider: stepProvider,
9361
9369
  startedAt,
@@ -9372,7 +9380,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
9372
9380
  });
9373
9381
  }
9374
9382
  if (subSteps.length === 0) {
9375
- const stepType = hasThinking && !hasText ? import_evalforge_types8.LLMStepType.THINKING : import_evalforge_types8.LLMStepType.COMPLETION;
9383
+ const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
9376
9384
  subSteps.push({
9377
9385
  id: (0, import_crypto3.randomUUID)(),
9378
9386
  stepNumber: 0,
@@ -9573,14 +9581,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9573
9581
  const te = evt;
9574
9582
  return {
9575
9583
  ...base,
9576
- type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
9584
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
9577
9585
  outputPreview: te.part.text.slice(0, 500)
9578
9586
  };
9579
9587
  }
9580
9588
  case "reasoning":
9581
9589
  return {
9582
9590
  ...base,
9583
- type: import_evalforge_types9.LiveTraceEventType.THINKING,
9591
+ type: import_evalforge_types8.LiveTraceEventType.THINKING,
9584
9592
  thinking: evt.part.text.slice(0, 500)
9585
9593
  };
9586
9594
  case "tool_use": {
@@ -9588,15 +9596,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9588
9596
  const toolName = tu.part.tool;
9589
9597
  const args = tu.part.state.input;
9590
9598
  const toolArgs = JSON.stringify(args).slice(0, 500);
9591
- let type = import_evalforge_types9.LiveTraceEventType.TOOL_USE;
9599
+ let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
9592
9600
  let filePath;
9593
9601
  if (args) {
9594
9602
  if (args.file_path || args.path || args.target_file) {
9595
9603
  filePath = String(args.file_path || args.path || args.target_file);
9596
9604
  if (/write|edit/i.test(toolName)) {
9597
- type = import_evalforge_types9.LiveTraceEventType.FILE_WRITE;
9605
+ type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
9598
9606
  } else if (/read|view/i.test(toolName)) {
9599
- type = import_evalforge_types9.LiveTraceEventType.FILE_READ;
9607
+ type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
9600
9608
  }
9601
9609
  }
9602
9610
  }
@@ -9605,7 +9613,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
9605
9613
  case "step_finish":
9606
9614
  return {
9607
9615
  ...base,
9608
- type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9616
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9609
9617
  outputPreview: "Step completed"
9610
9618
  };
9611
9619
  default:
@@ -9636,7 +9644,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
9636
9644
  } else if (options.systemPrompt != null) {
9637
9645
  systemPrompt = options.systemPrompt;
9638
9646
  } else {
9639
- systemPrompt = import_evalforge_types9.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9647
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
9640
9648
  }
9641
9649
  if (systemPrompt) {
9642
9650
  await writeSystemPromptRule(cwd, systemPrompt);
@@ -9828,7 +9836,7 @@ function spawnOpenCodeProcess(opts) {
9828
9836
  targetId: traceContext.targetId,
9829
9837
  targetName: traceContext.targetName,
9830
9838
  stepNumber: traceStepNumber,
9831
- type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
9839
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
9832
9840
  outputPreview: progressMessage,
9833
9841
  toolName: lastToolName,
9834
9842
  filePath: lastFilePath,
@@ -9862,18 +9870,18 @@ function spawnOpenCodeProcess(opts) {
9862
9870
  if (traceEvt) {
9863
9871
  lastToolName = traceEvt.toolName;
9864
9872
  lastFilePath = traceEvt.filePath;
9865
- if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.THINKING) {
9873
+ if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
9866
9874
  lastAction = "Thinking...";
9867
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.TOOL_USE) {
9875
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
9868
9876
  lastAction = extractToolAction(
9869
9877
  traceEvt.toolName ?? "",
9870
9878
  void 0
9871
9879
  );
9872
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_WRITE) {
9880
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
9873
9881
  lastAction = `Writing: ${traceEvt.filePath || "file"}`;
9874
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_READ) {
9882
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
9875
9883
  lastAction = `Reading: ${traceEvt.filePath || "file"}`;
9876
- } else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.COMPLETION) {
9884
+ } else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
9877
9885
  lastAction = "Processing response...";
9878
9886
  }
9879
9887
  emitTraceEvent(traceEvt, traceContext.pushEvent);
@@ -9955,7 +9963,7 @@ async function executeWithOpenCode(skills, scenario, options) {
9955
9963
  targetId: traceContext.targetId,
9956
9964
  targetName: traceContext.targetName,
9957
9965
  stepNumber: 0,
9958
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
9966
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
9959
9967
  outputPreview: JSON.stringify({
9960
9968
  event: "pre-cli-execution",
9961
9969
  model: `${providerID}/${modelID}`,
@@ -10009,7 +10017,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10009
10017
  targetId: traceContext.targetId,
10010
10018
  targetName: traceContext.targetName,
10011
10019
  stepNumber: traceStepNumber + 1,
10012
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10020
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10013
10021
  outputPreview: JSON.stringify({
10014
10022
  event: "idle-timeout-retry",
10015
10023
  attempt,
@@ -10053,7 +10061,7 @@ async function executeWithOpenCode(skills, scenario, options) {
10053
10061
  targetId: traceContext.targetId,
10054
10062
  targetName: traceContext.targetName,
10055
10063
  stepNumber: traceStepNumber + 1,
10056
- type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
10064
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
10057
10065
  outputPreview: JSON.stringify({
10058
10066
  event: "cli-execution-failed",
10059
10067
  error: lastAttemptResult.error?.message ?? "Unknown error",
@@ -10108,7 +10116,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10108
10116
  targetId: traceContext.targetId,
10109
10117
  targetName: traceContext.targetName,
10110
10118
  stepNumber: traceStepNumber + 1,
10111
- type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
10119
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
10112
10120
  outputPreview: "Scenario execution completed",
10113
10121
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10114
10122
  isComplete: true
@@ -10145,7 +10153,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
10145
10153
  var OpenCodeAdapter = class {
10146
10154
  id = "opencode";
10147
10155
  name = "OpenCode";
10148
- supportedCommands = [import_evalforge_types10.AgentRunCommand.OPENCODE];
10156
+ supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
10149
10157
  async prepareEnvironment(context) {
10150
10158
  await prepareOpenCodeEnvironment(context.cwd, context.skills, {
10151
10159
  mcps: context.mcps,
@@ -10168,7 +10176,7 @@ var OpenCodeAdapter = class {
10168
10176
  rules,
10169
10177
  systemPrompt
10170
10178
  } = context;
10171
- const typed = config ? import_evalforge_types10.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10179
+ const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
10172
10180
  const cfg = typed?.success ? typed.data : void 0;
10173
10181
  const rawMaxTurns = cfg?.maxTurns;
10174
10182
  const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
@@ -10218,7 +10226,7 @@ var import_ai = require("ai");
10218
10226
  var import_anthropic = require("@ai-sdk/anthropic");
10219
10227
  var import_google = require("@ai-sdk/google");
10220
10228
  var import_openai = require("@ai-sdk/openai");
10221
- var import_evalforge_types12 = require("@wix/evalforge-types");
10229
+ var import_evalforge_types11 = require("@wix/evalforge-types");
10222
10230
  var import_crypto4 = require("crypto");
10223
10231
 
10224
10232
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
@@ -10315,7 +10323,7 @@ function extractErrorText(content) {
10315
10323
  }
10316
10324
 
10317
10325
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
10318
- var import_evalforge_types11 = require("@wix/evalforge-types");
10326
+ var import_evalforge_types10 = require("@wix/evalforge-types");
10319
10327
  var PROVIDER_ANTHROPIC = "anthropic";
10320
10328
  var PROVIDER_GEMINI = "gemini";
10321
10329
  var MODEL_PRICING = {
@@ -10384,7 +10392,7 @@ function extractGatewayCost(step, provider) {
10384
10392
  }
10385
10393
  }
10386
10394
  function calculateFromPricing(modelId, tokenUsage) {
10387
- const normalized = (0, import_evalforge_types11.normalizeModelId)(modelId);
10395
+ const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
10388
10396
  const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
10389
10397
  if (!pricing) return 0;
10390
10398
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
@@ -10477,7 +10485,7 @@ function createModel(modelId, baseUrl, headers) {
10477
10485
  apiKey: "proxy-auth",
10478
10486
  headers
10479
10487
  });
10480
- if ([...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10488
+ if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10481
10489
  (id) => modelId === id || modelId.startsWith(id)
10482
10490
  )) {
10483
10491
  return openai.responses(modelId);
@@ -10485,12 +10493,12 @@ function createModel(modelId, baseUrl, headers) {
10485
10493
  return openai.chat(modelId);
10486
10494
  }
10487
10495
  function isClaudeModelId(modelId) {
10488
- return import_evalforge_types12.AVAILABLE_CLAUDE_MODEL_IDS.some(
10496
+ return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
10489
10497
  (id) => modelId === id || modelId.startsWith(id)
10490
10498
  );
10491
10499
  }
10492
10500
  function isGeminiModelId(modelId) {
10493
- return import_evalforge_types12.AVAILABLE_GEMINI_MODEL_IDS.some(
10501
+ return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
10494
10502
  (id) => modelId === id || modelId.startsWith(id)
10495
10503
  );
10496
10504
  }
@@ -10510,9 +10518,9 @@ async function executeWithAiSdk(context) {
10510
10518
  mcps,
10511
10519
  traceContext
10512
10520
  } = context;
10513
- const typed = config ? import_evalforge_types12.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10521
+ const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
10514
10522
  const cfg = typed?.success ? typed.data : void 0;
10515
- const schemaKeys = new Set(Object.keys(import_evalforge_types12.SimpleAgentConfigSchema.shape));
10523
+ const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
10516
10524
  const configExtras = {};
10517
10525
  if (config) {
10518
10526
  for (const [key, value] of Object.entries(config)) {
@@ -10549,11 +10557,11 @@ async function executeWithAiSdk(context) {
10549
10557
  }, SDK_TIMEOUT_MS);
10550
10558
  try {
10551
10559
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
10552
- const isResponsesAPI = [...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
10560
+ const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
10553
10561
  (id) => modelId === id || modelId.startsWith(id)
10554
10562
  );
10555
10563
  const isGemini = provider === PROVIDER_GEMINI2;
10556
- const isGeminiThinking = isGemini && import_evalforge_types12.GEMINI_THINKING_MODEL_IDS.has(modelId);
10564
+ const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
10557
10565
  const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
10558
10566
  const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
10559
10567
  const reasoningEffort = cfg.reasoningEffort ?? "high";
@@ -10632,7 +10640,7 @@ async function executeWithAiSdk(context) {
10632
10640
  targetId: traceContext.targetId,
10633
10641
  targetName: traceContext.targetName,
10634
10642
  stepNumber: stepTimestamps.length,
10635
- type: isToolStep ? import_evalforge_types12.LiveTraceEventType.TOOL_USE : import_evalforge_types12.LiveTraceEventType.COMPLETION,
10643
+ type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
10636
10644
  toolName: firstToolCall?.toolName,
10637
10645
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
10638
10646
  outputPreview: step.text?.slice(0, 500),
@@ -10837,7 +10845,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
10837
10845
  id: (0, import_crypto4.randomUUID)(),
10838
10846
  stepNumber: i + 1,
10839
10847
  turnIndex: i,
10840
- type: step.toolCalls.length > 0 ? import_evalforge_types12.LLMStepType.TOOL_USE : import_evalforge_types12.LLMStepType.COMPLETION,
10848
+ type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
10841
10849
  model: modelId,
10842
10850
  provider,
10843
10851
  startedAt: new Date(stepStartedAt).toISOString(),
@@ -10887,7 +10895,7 @@ function emitStartEvent(traceContext, startTime) {
10887
10895
  targetId: traceContext.targetId,
10888
10896
  targetName: traceContext.targetName,
10889
10897
  stepNumber: 0,
10890
- type: import_evalforge_types12.LiveTraceEventType.PROGRESS,
10898
+ type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
10891
10899
  outputPreview: "Starting Simple Agent execution...",
10892
10900
  elapsedMs: Date.now() - startTime,
10893
10901
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -10905,7 +10913,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
10905
10913
  targetId: traceContext.targetId,
10906
10914
  targetName: traceContext.targetName,
10907
10915
  stepNumber,
10908
- type: import_evalforge_types12.LiveTraceEventType.COMPLETION,
10916
+ type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
10909
10917
  outputPreview: "Scenario execution completed",
10910
10918
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
10911
10919
  isComplete: true
@@ -11675,11 +11683,11 @@ function substituteVariables(prompt, variables) {
11675
11683
  }
11676
11684
 
11677
11685
  // src/run-scenario/run-agent-with-context.ts
11678
- var import_evalforge_types13 = require("@wix/evalforge-types");
11679
- var DEFAULT_AGENT_COMMAND = import_evalforge_types13.AgentRunCommand.CLAUDE;
11686
+ var import_evalforge_types12 = require("@wix/evalforge-types");
11687
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
11680
11688
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
11681
11689
  const agent = evalData.agent ?? void 0;
11682
- const isSDK = agent?.agentType === import_evalforge_types13.AgentType.SDK;
11690
+ const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
11683
11691
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
11684
11692
  const adapter = getAdapter(identifier);
11685
11693
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -11764,14 +11772,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11764
11772
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
11765
11773
  if (template) {
11766
11774
  console.log(
11767
- (0, import_evalforge_types14.formatTraceEventLine)({
11775
+ (0, import_evalforge_types13.formatTraceEventLine)({
11768
11776
  evalRunId: evalRunId2,
11769
11777
  scenarioId: scenario.id,
11770
11778
  scenarioName: scenario.name,
11771
11779
  targetId,
11772
11780
  targetName,
11773
11781
  stepNumber: 0,
11774
- type: import_evalforge_types14.LiveTraceEventType.PROGRESS,
11782
+ type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
11775
11783
  outputPreview: "Setting up environment (installing dependencies)...",
11776
11784
  elapsedMs: 0,
11777
11785
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -11811,7 +11819,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11811
11819
  })),
11812
11820
  durationMs: partialResult.duration
11813
11821
  };
11814
- const defaultJudgeModel = import_evalforge_types14.DEFAULT_JUDGE_MODEL;
11822
+ const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
11815
11823
  const assertionContext = {
11816
11824
  workDir,
11817
11825
  defaultJudgeModel,
@@ -11826,10 +11834,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
11826
11834
  assertionContext
11827
11835
  ) : [];
11828
11836
  const passed = assertionResults.filter(
11829
- (r) => r.status === import_evalforge_types14.AssertionResultStatus.PASSED
11837
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
11830
11838
  ).length;
11831
11839
  const failed = assertionResults.filter(
11832
- (r) => r.status === import_evalforge_types14.AssertionResultStatus.FAILED
11840
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
11833
11841
  ).length;
11834
11842
  const total = assertionResults.length;
11835
11843
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -11905,7 +11913,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
11905
11913
  }
11906
11914
 
11907
11915
  // src/error-reporter.ts
11908
- var import_evalforge_types15 = require("@wix/evalforge-types");
11916
+ var import_evalforge_types14 = require("@wix/evalforge-types");
11909
11917
  function formatError(error, phase, context) {
11910
11918
  const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
11911
11919
  if (error instanceof Error) {
@@ -12148,7 +12156,7 @@ async function runEvaluation(projectId2, evalRunId2) {
12148
12156
  totalExecutions
12149
12157
  };
12150
12158
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
12151
- const finalStatus = allFailed ? import_evalforge_types16.EvalStatus.FAILED : import_evalforge_types16.EvalStatus.COMPLETED;
12159
+ const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
12152
12160
  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
12153
12161
  firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
12154
12162
  ) : void 0;
@@ -12202,7 +12210,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12202
12210
  grpcAuthToken: config.grpcAuthToken
12203
12211
  });
12204
12212
  await api.updateEvalRun(projectId, evalRunId, {
12205
- status: import_evalforge_types16.EvalStatus.FAILED,
12213
+ status: import_evalforge_types15.EvalStatus.FAILED,
12206
12214
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12207
12215
  jobError,
12208
12216
  jobStatus: "FAILED"
@@ -12227,7 +12235,7 @@ runEvaluation(projectId, evalRunId).then(() => {
12227
12235
  grpcAuthToken
12228
12236
  });
12229
12237
  await api.updateEvalRun(projectId, evalRunId, {
12230
- status: import_evalforge_types16.EvalStatus.FAILED,
12238
+ status: import_evalforge_types15.EvalStatus.FAILED,
12231
12239
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
12232
12240
  jobError: `Config load failed, then: ${jobError}`,
12233
12241
  jobStatus: "FAILED"