@wix/evalforge-evaluator 0.183.0 → 0.185.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -7487,117 +7487,42 @@ import {
7487
7487
  // src/run-scenario/agents/claude-code/write-skills.ts
7488
7488
  import { mkdir as mkdir3 } from "fs/promises";
7489
7489
  import { join } from "path";
7490
-
7491
- // src/run-scenario/agents/shared/resolve-capability-content.ts
7492
- import {
7493
- fetchGitHubFile as fetchGitHubFile2,
7494
- fetchGitHubFolder as fetchGitHubFolder2
7495
- } from "@wix/evalforge-github-client";
7496
- import { MCP_SERVERS_JSON_KEY } from "@wix/evalforge-types";
7497
- var USER_AGENT = "EvalForge-Evaluator";
7498
- async function resolveSkillFiles(skill, fetchFn = fetchGitHubFolder2) {
7499
- const version = skill.latestVersion;
7500
- if (version?.files && version.files.length > 0) {
7501
- console.log(
7502
- `[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
7503
- );
7504
- return version.files;
7505
- }
7506
- if (skill.source) {
7507
- const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
7508
- console.log(
7509
- `[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
7510
- );
7511
- return files;
7512
- }
7513
- throw new Error(`Skill ${skill.name} has no files and no source configured`);
7514
- }
7515
- async function fetchSourceFile(label, noun, name, source, fetchFn) {
7516
- try {
7517
- const content = await fetchFn(source, { userAgent: USER_AGENT });
7518
- console.log(
7519
- `[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
7520
- );
7521
- return content;
7522
- } catch (error) {
7523
- const message = error instanceof Error ? error.message : "Unknown error";
7524
- console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
7525
- throw new Error(
7526
- `Failed to fetch ${noun} "${name}" from GitHub: ${message}`
7527
- );
7528
- }
7529
- }
7530
- async function resolveSubAgentMd(agent, fetchFn = fetchGitHubFile2) {
7531
- if (agent.source) {
7532
- return fetchSourceFile(
7533
- "SubAgents",
7534
- "sub-agent",
7535
- agent.name,
7536
- agent.source,
7537
- fetchFn
7538
- );
7539
- }
7540
- if (!agent.subAgentMd) {
7541
- console.warn(
7542
- `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7543
- );
7544
- }
7545
- return agent.subAgentMd;
7546
- }
7547
- async function resolveRuleText(rule, fetchFn = fetchGitHubFile2) {
7548
- if (!rule.source) {
7549
- return rule.content;
7550
- }
7551
- return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
7552
- }
7553
- async function resolveMcpConfig(mcp, fetchFn = fetchGitHubFile2) {
7554
- if (!mcp.source) {
7555
- return mcp.config;
7556
- }
7557
- const raw = await fetchSourceFile(
7558
- "MCP",
7559
- "MCP",
7560
- mcp.name,
7561
- mcp.source,
7562
- fetchFn
7563
- );
7564
- let parsed;
7565
- try {
7566
- parsed = JSON.parse(raw);
7567
- } catch (error) {
7568
- const message = error instanceof Error ? error.message : "Unknown error";
7569
- throw new Error(
7570
- `MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
7571
- );
7572
- }
7573
- if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
7574
- throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
7575
- }
7576
- const obj = parsed;
7577
- const servers = obj[MCP_SERVERS_JSON_KEY];
7578
- if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
7579
- return servers;
7580
- }
7581
- return obj;
7582
- }
7583
-
7584
- // src/run-scenario/agents/claude-code/write-skills.ts
7585
- async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
7490
+ import { fetchGitHubFolder as fetchGitHubFolder2 } from "@wix/evalforge-github-client";
7491
+ async function writeSkillsToFilesystem(cwd, skills, fetchFn = fetchGitHubFolder2) {
7586
7492
  await Promise.all(
7587
7493
  skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
7588
7494
  );
7589
7495
  }
7590
- async function writeSkillToFilesystem(cwd, skill, fetchFn) {
7591
- const skillDir = join(cwd, ".claude", "skills", skill.name);
7496
+ async function writeSkillToFilesystem(cwd, skill, fetchFn = fetchGitHubFolder2) {
7497
+ const skillName = skill.name;
7498
+ const skillDir = join(cwd, ".claude", "skills", skillName);
7592
7499
  await mkdir3(skillDir, { recursive: true });
7593
- try {
7594
- const files = await resolveSkillFiles(skill, fetchFn);
7595
- await writeFilesToDirectory(skillDir, files);
7596
- } catch (error) {
7597
- const message = error instanceof Error ? error.message : "Unknown error";
7598
- throw new Error(
7599
- `Failed to write skill ${skill.name} to filesystem: ${message}`
7500
+ const version = skill.latestVersion;
7501
+ if (version?.files && version.files.length > 0) {
7502
+ await writeFilesToDirectory(skillDir, version.files);
7503
+ console.log(
7504
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
7600
7505
  );
7506
+ } else if (skill.source) {
7507
+ try {
7508
+ const files = await fetchFn(skill.source, {
7509
+ userAgent: "EvalForge-Evaluator"
7510
+ });
7511
+ await writeFilesToDirectory(skillDir, files);
7512
+ console.log(
7513
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
7514
+ );
7515
+ } catch (error) {
7516
+ const message = error instanceof Error ? error.message : "Unknown error";
7517
+ console.error(
7518
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
7519
+ );
7520
+ throw new Error(
7521
+ `Failed to write skill ${skillName} to filesystem: ${message}`
7522
+ );
7523
+ }
7524
+ } else {
7525
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
7601
7526
  }
7602
7527
  }
7603
7528
 
@@ -7615,7 +7540,7 @@ import { randomUUID } from "crypto";
7615
7540
  // src/run-scenario/agents/claude-code/write-mcp.ts
7616
7541
  import { writeFile as writeFile3 } from "fs/promises";
7617
7542
  import { join as join3 } from "path";
7618
- import { MCP_SERVERS_JSON_KEY as MCP_SERVERS_JSON_KEY2 } from "@wix/evalforge-types";
7543
+ import { MCP_SERVERS_JSON_KEY } from "@wix/evalforge-types";
7619
7544
 
7620
7545
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
7621
7546
  import { readFile } from "fs/promises";
@@ -7660,11 +7585,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
7660
7585
  }
7661
7586
 
7662
7587
  // src/run-scenario/agents/claude-code/write-mcp.ts
7663
- async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7588
+ async function writeMcpToFilesystem(cwd, mcps) {
7664
7589
  if (mcps.length === 0) return;
7665
7590
  const mcpServers = {};
7666
7591
  for (const mcp of mcps) {
7667
- const config = await resolveMcpConfig(mcp, fetchFn);
7592
+ const config = mcp.config;
7668
7593
  for (const [key, value] of Object.entries(config)) {
7669
7594
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
7670
7595
  throw new Error(
@@ -7676,7 +7601,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7676
7601
  }
7677
7602
  const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
7678
7603
  const content = JSON.stringify(
7679
- { [MCP_SERVERS_JSON_KEY2]: resolvedServers },
7604
+ { [MCP_SERVERS_JSON_KEY]: resolvedServers },
7680
7605
  null,
7681
7606
  2
7682
7607
  );
@@ -7688,6 +7613,9 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
7688
7613
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
7689
7614
  import { mkdir as mkdir4, writeFile as writeFile4 } from "fs/promises";
7690
7615
  import { join as join4 } from "path";
7616
+ import {
7617
+ fetchGitHubFile as fetchGitHubFile2
7618
+ } from "@wix/evalforge-github-client";
7691
7619
  var AGENTS_DIR = ".claude/agents";
7692
7620
  function toAgentFilename(name, index, nameCount) {
7693
7621
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -7695,7 +7623,34 @@ function toAgentFilename(name, index, nameCount) {
7695
7623
  nameCount.set(base, count + 1);
7696
7624
  return count === 0 ? base : `${base}-${count + 1}`;
7697
7625
  }
7698
- async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7626
+ async function resolveSubAgentContent(agent, fetchFn) {
7627
+ if (agent.source) {
7628
+ try {
7629
+ const content = await fetchFn(agent.source, {
7630
+ userAgent: "EvalForge-Evaluator"
7631
+ });
7632
+ console.log(
7633
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
7634
+ );
7635
+ return content;
7636
+ } catch (error) {
7637
+ const message = error instanceof Error ? error.message : "Unknown error";
7638
+ console.error(
7639
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
7640
+ );
7641
+ throw new Error(
7642
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
7643
+ );
7644
+ }
7645
+ }
7646
+ if (!agent.subAgentMd) {
7647
+ console.warn(
7648
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
7649
+ );
7650
+ }
7651
+ return agent.subAgentMd;
7652
+ }
7653
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = fetchGitHubFile2) {
7699
7654
  if (subAgents.length === 0) return;
7700
7655
  const agentsDir = join4(cwd, AGENTS_DIR);
7701
7656
  await mkdir4(agentsDir, { recursive: true });
@@ -7703,7 +7658,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
7703
7658
  for (const [i, agent] of subAgents.entries()) {
7704
7659
  const filename = toAgentFilename(agent.name, i, nameCount);
7705
7660
  const filePath = join4(agentsDir, `${filename}.md`);
7706
- const content = await resolveSubAgentMd(agent, fetchFn);
7661
+ const content = await resolveSubAgentContent(agent, fetchFn);
7707
7662
  await writeFile4(filePath, content, "utf8");
7708
7663
  }
7709
7664
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -7753,19 +7708,18 @@ function validateGenericDirectory(dir, cwd) {
7753
7708
  }
7754
7709
  return trimmed;
7755
7710
  }
7756
- async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7711
+ async function writeRulesToFilesystem(cwd, rules) {
7757
7712
  if (rules.length === 0) return;
7758
7713
  const nameCount = /* @__PURE__ */ new Map();
7759
7714
  let hasCursorRules = false;
7760
7715
  for (const [i, rule] of rules.entries()) {
7761
- const content = await resolveRuleText(rule, fetchFn);
7762
7716
  switch (rule.ruleType) {
7763
7717
  case "claude-md": {
7764
- await appendToFile(join5(cwd, "CLAUDE.md"), content);
7718
+ await appendToFile(join5(cwd, "CLAUDE.md"), rule.content);
7765
7719
  break;
7766
7720
  }
7767
7721
  case "agents-md": {
7768
- await appendToFile(join5(cwd, "AGENTS.md"), content);
7722
+ await appendToFile(join5(cwd, "AGENTS.md"), rule.content);
7769
7723
  break;
7770
7724
  }
7771
7725
  case "cursor-rule": {
@@ -7775,7 +7729,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7775
7729
  }
7776
7730
  const filename = toRuleFilename(rule.name, i, nameCount);
7777
7731
  const filePath = join5(cwd, CURSOR_RULES_DIR, `${filename}.md`);
7778
- await writeFile5(filePath, content, "utf8");
7732
+ await writeFile5(filePath, rule.content, "utf8");
7779
7733
  break;
7780
7734
  }
7781
7735
  case "generic": {
@@ -7786,7 +7740,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
7786
7740
  const dirPath = join5(cwd, directory);
7787
7741
  await mkdir5(dirPath, { recursive: true });
7788
7742
  const filename = toRuleFilename(rule.name, i, nameCount);
7789
- await writeFile5(join5(dirPath, `${filename}.md`), content, "utf8");
7743
+ await writeFile5(join5(dirPath, `${filename}.md`), rule.content, "utf8");
7790
7744
  break;
7791
7745
  }
7792
7746
  default: {
@@ -8701,9 +8655,12 @@ function processMessages(timestampedMessages, startTime, endTime) {
8701
8655
  if (!step.toolCalls) continue;
8702
8656
  for (const tc of step.toolCalls) {
8703
8657
  if (tc.toolUseId && toolResultErrors.has(tc.toolUseId)) {
8704
- step.hasToolError = true;
8705
- step.toolErrorContent = toolResultErrors.get(tc.toolUseId);
8706
- break;
8658
+ tc.isError = true;
8659
+ tc.errorContent = toolResultErrors.get(tc.toolUseId);
8660
+ if (!step.hasToolError) {
8661
+ step.hasToolError = true;
8662
+ step.toolErrorContent = tc.errorContent;
8663
+ }
8707
8664
  }
8708
8665
  }
8709
8666
  }
@@ -8807,8 +8764,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8807
8764
  },
8808
8765
  costUsd: stepCost / totalSubSteps,
8809
8766
  outputPreview: step.thinking?.slice(0, 200),
8810
- success: isSuccess,
8811
- error: errorMsg
8767
+ success: true,
8768
+ error: void 0
8812
8769
  });
8813
8770
  }
8814
8771
  if (toolCallCount > 0) {
@@ -8818,6 +8775,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8818
8775
  const toolBudgetSteps = toolSubSteps + textSubSteps;
8819
8776
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
8820
8777
  const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
8778
+ const toolSuccess = !tc.isError;
8779
+ const toolError = tc.isError ? tc.errorContent ?? "Tool call failed" : void 0;
8821
8780
  subSteps.push({
8822
8781
  id: randomUUID(),
8823
8782
  stepNumber: 0,
@@ -8842,8 +8801,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8842
8801
  toolName: tc.toolName,
8843
8802
  toolArguments: JSON.stringify(tc.args),
8844
8803
  outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
8845
- success: isSuccess,
8846
- error: errorMsg
8804
+ success: toolSuccess,
8805
+ error: toolError
8847
8806
  });
8848
8807
  }
8849
8808
  }
@@ -8864,8 +8823,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
8864
8823
  },
8865
8824
  costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
8866
8825
  outputPreview: step.text?.slice(0, 200),
8867
- success: isSuccess,
8868
- error: errorMsg
8826
+ success: true,
8827
+ error: void 0
8869
8828
  });
8870
8829
  }
8871
8830
  if (subSteps.length === 0) {
@@ -9049,28 +9008,51 @@ function tryParseJson(text) {
9049
9008
  // src/run-scenario/agents/opencode/write-skills.ts
9050
9009
  import { mkdir as mkdir6 } from "fs/promises";
9051
9010
  import { join as join6 } from "path";
9052
- async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
9011
+ import { fetchGitHubFolder as fetchGitHubFolder3 } from "@wix/evalforge-github-client";
9012
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = fetchGitHubFolder3) {
9053
9013
  await Promise.all(
9054
9014
  skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
9055
9015
  );
9056
9016
  }
9057
9017
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
9058
- const skillDir = join6(cwd, ".opencode", "skills", skill.name);
9018
+ const skillName = skill.name;
9019
+ const skillDir = join6(cwd, ".opencode", "skills", skillName);
9059
9020
  await mkdir6(skillDir, { recursive: true });
9060
- try {
9061
- const files = await resolveSkillFiles(skill, fetchFn);
9062
- await writeFilesToDirectory(skillDir, files);
9063
- } catch (error) {
9064
- const message = error instanceof Error ? error.message : "Unknown error";
9065
- throw new Error(
9066
- `Failed to write skill ${skill.name} to filesystem: ${message}`
9021
+ const version = skill.latestVersion;
9022
+ if (version?.files && version.files.length > 0) {
9023
+ await writeFilesToDirectory(skillDir, version.files);
9024
+ console.log(
9025
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
9067
9026
  );
9027
+ } else if (skill.source) {
9028
+ try {
9029
+ const files = await fetchFn(skill.source, {
9030
+ userAgent: "EvalForge-Evaluator"
9031
+ });
9032
+ await writeFilesToDirectory(skillDir, files);
9033
+ console.log(
9034
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
9035
+ );
9036
+ } catch (error) {
9037
+ const message = error instanceof Error ? error.message : "Unknown error";
9038
+ console.error(
9039
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
9040
+ );
9041
+ throw new Error(
9042
+ `Failed to write skill ${skillName} to filesystem: ${message}`
9043
+ );
9044
+ }
9045
+ } else {
9046
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
9068
9047
  }
9069
9048
  }
9070
9049
 
9071
9050
  // src/run-scenario/agents/opencode/write-sub-agents.ts
9072
9051
  import { mkdir as mkdir7, writeFile as writeFile6 } from "fs/promises";
9073
9052
  import { join as join7 } from "path";
9053
+ import {
9054
+ fetchGitHubFile as fetchGitHubFile3
9055
+ } from "@wix/evalforge-github-client";
9074
9056
  var AGENTS_DIR2 = ".opencode/agents";
9075
9057
  function toAgentFilename2(name, index, nameCount) {
9076
9058
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -9078,7 +9060,34 @@ function toAgentFilename2(name, index, nameCount) {
9078
9060
  nameCount.set(base, count + 1);
9079
9061
  return count === 0 ? base : `${base}-${count + 1}`;
9080
9062
  }
9081
- async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9063
+ async function resolveSubAgentContent2(agent, fetchFn) {
9064
+ if (agent.source) {
9065
+ try {
9066
+ const content = await fetchFn(agent.source, {
9067
+ userAgent: "EvalForge-Evaluator"
9068
+ });
9069
+ console.log(
9070
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
9071
+ );
9072
+ return content;
9073
+ } catch (error) {
9074
+ const message = error instanceof Error ? error.message : "Unknown error";
9075
+ console.error(
9076
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
9077
+ );
9078
+ throw new Error(
9079
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
9080
+ );
9081
+ }
9082
+ }
9083
+ if (!agent.subAgentMd) {
9084
+ console.warn(
9085
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
9086
+ );
9087
+ }
9088
+ return agent.subAgentMd;
9089
+ }
9090
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHubFile3) {
9082
9091
  if (subAgents.length === 0) return;
9083
9092
  const agentsDir = join7(cwd, AGENTS_DIR2);
9084
9093
  await mkdir7(agentsDir, { recursive: true });
@@ -9086,7 +9095,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
9086
9095
  for (const [i, agent] of subAgents.entries()) {
9087
9096
  const filename = toAgentFilename2(agent.name, i, nameCount);
9088
9097
  const filePath = join7(agentsDir, `${filename}.md`);
9089
- const content = await resolveSubAgentMd(agent, fetchFn);
9098
+ const content = await resolveSubAgentContent2(agent, fetchFn);
9090
9099
  await writeFile6(filePath, content, "utf8");
9091
9100
  }
9092
9101
  console.log(`[SubAgents] Written to ${agentsDir}`);
@@ -9188,7 +9197,7 @@ async function buildOpenCodeEnv(options) {
9188
9197
  if (options.mcps && options.mcps.length > 0) {
9189
9198
  const mcpServers = {};
9190
9199
  for (const mcpEntity of options.mcps) {
9191
- const entityConfig = await resolveMcpConfig(mcpEntity);
9200
+ const entityConfig = mcpEntity.config;
9192
9201
  for (const [key, value] of Object.entries(entityConfig)) {
9193
9202
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
9194
9203
  throw new Error(