@wix/evalforge-evaluator 0.105.0 → 0.107.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -838,9 +838,6 @@ function createApiClient(serverUrl, options = "") {
838
838
  getEvalRun(projectId2, id) {
839
839
  return fetchJson(`/projects/${projectId2}/eval-runs/${id}`);
840
840
  },
841
- getSkillsGroup(projectId2, id) {
842
- return fetchJson(`/projects/${projectId2}/skills-groups/${id}`);
843
- },
844
841
  getScenario(projectId2, id) {
845
842
  return fetchJson(`/projects/${projectId2}/test-scenarios/${id}`);
846
843
  },
@@ -872,6 +869,9 @@ function createApiClient(serverUrl, options = "") {
872
869
  getRule(projectId2, id) {
873
870
  return fetchJson(`/projects/${projectId2}/rules/${id}`);
874
871
  },
872
+ getPreset(projectId2, id) {
873
+ return fetchJson(`/projects/${projectId2}/presets/${id}`);
874
+ },
875
875
  getAssertion(projectId2, id) {
876
876
  return fetchJson(`/projects/${projectId2}/assertions/${id}`);
877
877
  },
@@ -1074,17 +1074,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
1074
1074
  agent = await api.getAgent(projectId2, evalRun.agentId);
1075
1075
  }
1076
1076
  let skills = [];
1077
- let skillsGroup = null;
1078
- if (evalRun.skillsGroupId) {
1079
- skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
1080
- if (skillsGroup.skillIds.length > 0) {
1081
- const fetchResults = await Promise.allSettled(
1082
- skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
1083
- );
1084
- skills = fetchResults.filter(
1085
- (r) => r.status === "fulfilled"
1086
- ).map((r) => r.value).filter((s) => !s.deleted);
1087
- }
1077
+ const resolvedSkillIds = evalRun.skillIds ?? [];
1078
+ if (resolvedSkillIds.length > 0) {
1079
+ const fetchResults = await Promise.allSettled(
1080
+ resolvedSkillIds.map((id) => api.getSkill(projectId2, id))
1081
+ );
1082
+ skills = fetchResults.filter(
1083
+ (r) => r.status === "fulfilled"
1084
+ ).map((r) => r.value).filter((s) => !s.deleted);
1088
1085
  if (evalRun.skillVersions && Object.keys(evalRun.skillVersions).length > 0) {
1089
1086
  skills = await Promise.all(
1090
1087
  skills.map(async (skill) => {
@@ -1169,13 +1166,22 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
1169
1166
  resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
1170
1167
  };
1171
1168
  });
1172
- const skillsGroupName = skillsGroup?.name ?? "";
1169
+ let presetName = "";
1170
+ if (evalRun.presetId) {
1171
+ try {
1172
+ const preset = await api.getPreset(projectId2, evalRun.presetId);
1173
+ presetName = preset.name;
1174
+ } catch {
1175
+ presetName = skills.length > 0 ? skills.map((s) => s.name).join(", ") : "";
1176
+ }
1177
+ } else if (skills.length > 0) {
1178
+ presetName = skills.map((s) => s.name).join(", ");
1179
+ }
1173
1180
  return {
1174
1181
  evalRun,
1175
1182
  agent,
1176
1183
  skills,
1177
- skillsGroup,
1178
- skillsGroupName,
1184
+ presetName,
1179
1185
  mcps,
1180
1186
  subAgents,
1181
1187
  rules,
@@ -1499,6 +1505,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
1499
1505
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
1500
1506
  var import_promises5 = require("fs/promises");
1501
1507
  var import_path6 = require("path");
1508
+ var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
1502
1509
  var AGENTS_DIR = ".claude/agents";
1503
1510
  function toAgentFilename(name26, index, nameCount) {
1504
1511
  const base = (name26 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -1506,7 +1513,34 @@ function toAgentFilename(name26, index, nameCount) {
1506
1513
  nameCount.set(base, count + 1);
1507
1514
  return count === 0 ? base : `${base}-${count + 1}`;
1508
1515
  }
1509
- async function writeSubAgentsToFilesystem(cwd, subAgents) {
1516
+ async function resolveSubAgentContent(agent, fetchFn) {
1517
+ if (agent.source) {
1518
+ try {
1519
+ const content = await fetchFn(agent.source, {
1520
+ userAgent: "EvalForge-Evaluator"
1521
+ });
1522
+ console.log(
1523
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
1524
+ );
1525
+ return content;
1526
+ } catch (error48) {
1527
+ const message = error48 instanceof Error ? error48.message : "Unknown error";
1528
+ console.error(
1529
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
1530
+ );
1531
+ throw new Error(
1532
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
1533
+ );
1534
+ }
1535
+ }
1536
+ if (!agent.subAgentMd) {
1537
+ console.warn(
1538
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
1539
+ );
1540
+ }
1541
+ return agent.subAgentMd;
1542
+ }
1543
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
1510
1544
  if (subAgents.length === 0) return;
1511
1545
  const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
1512
1546
  await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
@@ -1514,7 +1548,8 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
1514
1548
  for (const [i, agent] of subAgents.entries()) {
1515
1549
  const filename = toAgentFilename(agent.name, i, nameCount);
1516
1550
  const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
1517
- await (0, import_promises5.writeFile)(filePath, agent.subAgentMd, "utf8");
1551
+ const content = await resolveSubAgentContent(agent, fetchFn);
1552
+ await (0, import_promises5.writeFile)(filePath, content, "utf8");
1518
1553
  }
1519
1554
  console.log(`[SubAgents] Written to ${agentsDir}`);
1520
1555
  }
@@ -43710,18 +43745,20 @@ function extractTemplateFiles(before, after) {
43710
43745
  var import_evalforge_types7 = require("@wix/evalforge-types");
43711
43746
  var DEFAULT_AGENT_COMMAND = import_evalforge_types7.AgentRunCommand.CLAUDE;
43712
43747
  async function runAgentWithContext(config2, evalRunId2, scenario, evalData, workDir) {
43713
- const skillsGroupId = evalData.evalRun.skillsGroupId;
43748
+ const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
43749
+ if (!hasEntities) {
43750
+ throw new Error(
43751
+ `Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
43752
+ );
43753
+ }
43714
43754
  const agent = evalData.agent ?? void 0;
43715
43755
  const isSDK = agent?.agentType === import_evalforge_types7.AgentType.SDK;
43716
- if (!skillsGroupId) {
43717
- throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
43718
- }
43719
43756
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
43720
43757
  const adapter = getAdapter(identifier);
43721
43758
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
43722
43759
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
43723
- const targetId = skillsGroupId ?? agent?.id ?? evalData.evalRun.id;
43724
- const targetName = evalData.skillsGroupName || agent?.name || "";
43760
+ const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
43761
+ const targetName = evalData.presetName || agent?.name || "";
43725
43762
  const executionContext = {
43726
43763
  skills: evalData.skills,
43727
43764
  scenario,
@@ -43769,7 +43806,7 @@ async function runAgentWithContext(config2, evalRunId2, scenario, evalData, work
43769
43806
 
43770
43807
  // src/run-scenario/index.ts
43771
43808
  async function runScenario(config2, evalRunId2, scenario, evalData, template, resolvedAssertions) {
43772
- const targetId = evalData.evalRun.skillsGroupId ?? evalData.agent?.id ?? evalData.evalRun.id;
43809
+ const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
43773
43810
  const workDir = await prepareWorkingDirectory(
43774
43811
  config2,
43775
43812
  evalRunId2,
@@ -44004,16 +44041,18 @@ async function runEvaluation(projectId2, evalRunId2) {
44004
44041
  skillCount: skills.length,
44005
44042
  hasAgent: !!agent,
44006
44043
  agentId: evalData.evalRun.agentId,
44007
- skillsGroupId: evalData.evalRun.skillsGroupId
44044
+ presetId: evalData.evalRun.presetId,
44045
+ skillIds: evalData.evalRun.skillIds
44008
44046
  };
44009
- if (scenarioItems.length > 0 && skills.length === 0) {
44047
+ const hasEntities = skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || evalData.rules.length > 0;
44048
+ if (scenarioItems.length > 0 && !hasEntities) {
44010
44049
  throw new Error(
44011
- `[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
44050
+ `[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
44012
44051
  );
44013
44052
  }
44014
- if (scenarioItems.length > 0 && skills.length > 0 && !agent) {
44053
+ if (scenarioItems.length > 0 && hasEntities && !agent) {
44015
44054
  throw new Error(
44016
- `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
44055
+ `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
44017
44056
  );
44018
44057
  }
44019
44058
  let completedScenarios = 0;
@@ -44025,16 +44064,16 @@ async function runEvaluation(projectId2, evalRunId2) {
44025
44064
  evalRunId: evalRunId2,
44026
44065
  scenarioId: scenario.id,
44027
44066
  scenarioName: scenario.name,
44028
- skillsGroupId: evalData.evalRun.skillsGroupId,
44029
- skillsGroupName: evalData.skillsGroupName,
44067
+ presetId: evalData.evalRun.presetId,
44068
+ presetName: evalData.presetName,
44030
44069
  agentId: agent?.id,
44031
44070
  agentName: agent?.name,
44032
44071
  progress: `${completedScenarios + 1}/${totalScenarios}`
44033
44072
  };
44034
44073
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
44035
44074
  console.log(
44036
- "[Evaluator] Running scenario with skills group:",
44037
- evalData.skillsGroupName,
44075
+ "[Evaluator] Running scenario with preset:",
44076
+ evalData.presetName,
44038
44077
  skillNames ? `(${skillNames})` : "",
44039
44078
  agent ? `with agent: ${agent.name}` : "",
44040
44079
  `(${completedScenarios + 1}/${totalScenarios})`
@@ -44060,8 +44099,8 @@ async function runEvaluation(projectId2, evalRunId2) {
44060
44099
  const errorMsg = err instanceof Error ? err.message : String(err);
44061
44100
  const errorStack = err instanceof Error ? err.stack : void 0;
44062
44101
  console.error(
44063
- "[Evaluator] Failed to run scenario with skills group:",
44064
- evalData.skillsGroupName,
44102
+ "[Evaluator] Failed to run scenario with preset:",
44103
+ evalData.presetName,
44065
44104
  "Error:",
44066
44105
  errorMsg
44067
44106
  );
@@ -44069,7 +44108,7 @@ async function runEvaluation(projectId2, evalRunId2) {
44069
44108
  console.error("[Evaluator] Stack trace:", errorStack);
44070
44109
  }
44071
44110
  throw new Error(
44072
- `[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
44111
+ `[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
44073
44112
  );
44074
44113
  }
44075
44114
  }