@wix/evalforge-evaluator 0.104.0 → 0.106.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -838,9 +838,6 @@ function createApiClient(serverUrl, options = "") {
838
838
  getEvalRun(projectId2, id) {
839
839
  return fetchJson(`/projects/${projectId2}/eval-runs/${id}`);
840
840
  },
841
- getSkillsGroup(projectId2, id) {
842
- return fetchJson(`/projects/${projectId2}/skills-groups/${id}`);
843
- },
844
841
  getScenario(projectId2, id) {
845
842
  return fetchJson(`/projects/${projectId2}/test-scenarios/${id}`);
846
843
  },
@@ -872,6 +869,9 @@ function createApiClient(serverUrl, options = "") {
872
869
  getRule(projectId2, id) {
873
870
  return fetchJson(`/projects/${projectId2}/rules/${id}`);
874
871
  },
872
+ getPreset(projectId2, id) {
873
+ return fetchJson(`/projects/${projectId2}/presets/${id}`);
874
+ },
875
875
  getAssertion(projectId2, id) {
876
876
  return fetchJson(`/projects/${projectId2}/assertions/${id}`);
877
877
  },
@@ -1067,24 +1067,21 @@ function customAssertionToAssertion(ca, params) {
1067
1067
  async function fetchEvaluationData(api, projectId2, evalRunId2) {
1068
1068
  const evalRun = await api.getEvalRun(projectId2, evalRunId2);
1069
1069
  const scenarios = await Promise.all(
1070
- evalRun.scenarioIds.map((id) => api.getScenario(projectId2, id))
1070
+ (evalRun.scenarioIds ?? []).map((id) => api.getScenario(projectId2, id))
1071
1071
  );
1072
1072
  let agent = null;
1073
1073
  if (evalRun.agentId) {
1074
1074
  agent = await api.getAgent(projectId2, evalRun.agentId);
1075
1075
  }
1076
1076
  let skills = [];
1077
- let skillsGroup = null;
1078
- if (evalRun.skillsGroupId) {
1079
- skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
1080
- if (skillsGroup.skillIds.length > 0) {
1081
- const fetchResults = await Promise.allSettled(
1082
- skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
1083
- );
1084
- skills = fetchResults.filter(
1085
- (r) => r.status === "fulfilled"
1086
- ).map((r) => r.value).filter((s) => !s.deleted);
1087
- }
1077
+ const resolvedSkillIds = evalRun.skillIds ?? [];
1078
+ if (resolvedSkillIds.length > 0) {
1079
+ const fetchResults = await Promise.allSettled(
1080
+ resolvedSkillIds.map((id) => api.getSkill(projectId2, id))
1081
+ );
1082
+ skills = fetchResults.filter(
1083
+ (r) => r.status === "fulfilled"
1084
+ ).map((r) => r.value).filter((s) => !s.deleted);
1088
1085
  if (evalRun.skillVersions && Object.keys(evalRun.skillVersions).length > 0) {
1089
1086
  skills = await Promise.all(
1090
1087
  skills.map(async (skill) => {
@@ -1169,13 +1166,22 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
1169
1166
  resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
1170
1167
  };
1171
1168
  });
1172
- const skillsGroupName = skillsGroup?.name ?? "";
1169
+ let presetName = "";
1170
+ if (evalRun.presetId) {
1171
+ try {
1172
+ const preset = await api.getPreset(projectId2, evalRun.presetId);
1173
+ presetName = preset.name;
1174
+ } catch {
1175
+ presetName = skills.length > 0 ? skills.map((s) => s.name).join(", ") : "";
1176
+ }
1177
+ } else if (skills.length > 0) {
1178
+ presetName = skills.map((s) => s.name).join(", ");
1179
+ }
1173
1180
  return {
1174
1181
  evalRun,
1175
1182
  agent,
1176
1183
  skills,
1177
- skillsGroup,
1178
- skillsGroupName,
1184
+ presetName,
1179
1185
  mcps,
1180
1186
  subAgents,
1181
1187
  rules,
@@ -43710,18 +43716,20 @@ function extractTemplateFiles(before, after) {
43710
43716
  var import_evalforge_types7 = require("@wix/evalforge-types");
43711
43717
  var DEFAULT_AGENT_COMMAND = import_evalforge_types7.AgentRunCommand.CLAUDE;
43712
43718
  async function runAgentWithContext(config2, evalRunId2, scenario, evalData, workDir) {
43713
- const skillsGroupId = evalData.evalRun.skillsGroupId;
43719
+ const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
43720
+ if (!hasEntities) {
43721
+ throw new Error(
43722
+ `Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
43723
+ );
43724
+ }
43714
43725
  const agent = evalData.agent ?? void 0;
43715
43726
  const isSDK = agent?.agentType === import_evalforge_types7.AgentType.SDK;
43716
- if (!skillsGroupId) {
43717
- throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
43718
- }
43719
43727
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
43720
43728
  const adapter = getAdapter(identifier);
43721
43729
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
43722
43730
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
43723
- const targetId = skillsGroupId ?? agent?.id ?? evalData.evalRun.id;
43724
- const targetName = evalData.skillsGroupName || agent?.name || "";
43731
+ const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
43732
+ const targetName = evalData.presetName || agent?.name || "";
43725
43733
  const executionContext = {
43726
43734
  skills: evalData.skills,
43727
43735
  scenario,
@@ -43769,7 +43777,7 @@ async function runAgentWithContext(config2, evalRunId2, scenario, evalData, work
43769
43777
 
43770
43778
  // src/run-scenario/index.ts
43771
43779
  async function runScenario(config2, evalRunId2, scenario, evalData, template, resolvedAssertions) {
43772
- const targetId = evalData.evalRun.skillsGroupId ?? evalData.agent?.id ?? evalData.evalRun.id;
43780
+ const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
43773
43781
  const workDir = await prepareWorkingDirectory(
43774
43782
  config2,
43775
43783
  evalRunId2,
@@ -44004,16 +44012,18 @@ async function runEvaluation(projectId2, evalRunId2) {
44004
44012
  skillCount: skills.length,
44005
44013
  hasAgent: !!agent,
44006
44014
  agentId: evalData.evalRun.agentId,
44007
- skillsGroupId: evalData.evalRun.skillsGroupId
44015
+ presetId: evalData.evalRun.presetId,
44016
+ skillIds: evalData.evalRun.skillIds
44008
44017
  };
44009
- if (scenarioItems.length > 0 && skills.length === 0) {
44018
+ const hasEntities = skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || evalData.rules.length > 0;
44019
+ if (scenarioItems.length > 0 && !hasEntities) {
44010
44020
  throw new Error(
44011
- `[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
44021
+ `[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
44012
44022
  );
44013
44023
  }
44014
- if (scenarioItems.length > 0 && skills.length > 0 && !agent) {
44024
+ if (scenarioItems.length > 0 && hasEntities && !agent) {
44015
44025
  throw new Error(
44016
- `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
44026
+ `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
44017
44027
  );
44018
44028
  }
44019
44029
  let completedScenarios = 0;
@@ -44025,16 +44035,16 @@ async function runEvaluation(projectId2, evalRunId2) {
44025
44035
  evalRunId: evalRunId2,
44026
44036
  scenarioId: scenario.id,
44027
44037
  scenarioName: scenario.name,
44028
- skillsGroupId: evalData.evalRun.skillsGroupId,
44029
- skillsGroupName: evalData.skillsGroupName,
44038
+ presetId: evalData.evalRun.presetId,
44039
+ presetName: evalData.presetName,
44030
44040
  agentId: agent?.id,
44031
44041
  agentName: agent?.name,
44032
44042
  progress: `${completedScenarios + 1}/${totalScenarios}`
44033
44043
  };
44034
44044
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
44035
44045
  console.log(
44036
- "[Evaluator] Running scenario with skills group:",
44037
- evalData.skillsGroupName,
44046
+ "[Evaluator] Running scenario with preset:",
44047
+ evalData.presetName,
44038
44048
  skillNames ? `(${skillNames})` : "",
44039
44049
  agent ? `with agent: ${agent.name}` : "",
44040
44050
  `(${completedScenarios + 1}/${totalScenarios})`
@@ -44060,8 +44070,8 @@ async function runEvaluation(projectId2, evalRunId2) {
44060
44070
  const errorMsg = err instanceof Error ? err.message : String(err);
44061
44071
  const errorStack = err instanceof Error ? err.stack : void 0;
44062
44072
  console.error(
44063
- "[Evaluator] Failed to run scenario with skills group:",
44064
- evalData.skillsGroupName,
44073
+ "[Evaluator] Failed to run scenario with preset:",
44074
+ evalData.presetName,
44065
44075
  "Error:",
44066
44076
  errorMsg
44067
44077
  );
@@ -44069,7 +44079,7 @@ async function runEvaluation(projectId2, evalRunId2) {
44069
44079
  console.error("[Evaluator] Stack trace:", errorStack);
44070
44080
  }
44071
44081
  throw new Error(
44072
- `[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
44082
+ `[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
44073
44083
  );
44074
44084
  }
44075
44085
  }