@wix/evalforge-evaluator 0.104.0 → 0.106.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +47 -37
- package/build/index.js.map +2 -2
- package/build/index.mjs +47 -37
- package/build/index.mjs.map +2 -2
- package/build/types/api-client.d.ts +2 -2
- package/build/types/fetch-evaluation-data.d.ts +6 -7
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +1 -1
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -838,9 +838,6 @@ function createApiClient(serverUrl, options = "") {
|
|
|
838
838
|
getEvalRun(projectId2, id) {
|
|
839
839
|
return fetchJson(`/projects/${projectId2}/eval-runs/${id}`);
|
|
840
840
|
},
|
|
841
|
-
getSkillsGroup(projectId2, id) {
|
|
842
|
-
return fetchJson(`/projects/${projectId2}/skills-groups/${id}`);
|
|
843
|
-
},
|
|
844
841
|
getScenario(projectId2, id) {
|
|
845
842
|
return fetchJson(`/projects/${projectId2}/test-scenarios/${id}`);
|
|
846
843
|
},
|
|
@@ -872,6 +869,9 @@ function createApiClient(serverUrl, options = "") {
|
|
|
872
869
|
getRule(projectId2, id) {
|
|
873
870
|
return fetchJson(`/projects/${projectId2}/rules/${id}`);
|
|
874
871
|
},
|
|
872
|
+
getPreset(projectId2, id) {
|
|
873
|
+
return fetchJson(`/projects/${projectId2}/presets/${id}`);
|
|
874
|
+
},
|
|
875
875
|
getAssertion(projectId2, id) {
|
|
876
876
|
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
877
877
|
},
|
|
@@ -1067,24 +1067,21 @@ function customAssertionToAssertion(ca, params) {
|
|
|
1067
1067
|
async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
1068
1068
|
const evalRun = await api.getEvalRun(projectId2, evalRunId2);
|
|
1069
1069
|
const scenarios = await Promise.all(
|
|
1070
|
-
evalRun.scenarioIds.map((id) => api.getScenario(projectId2, id))
|
|
1070
|
+
(evalRun.scenarioIds ?? []).map((id) => api.getScenario(projectId2, id))
|
|
1071
1071
|
);
|
|
1072
1072
|
let agent = null;
|
|
1073
1073
|
if (evalRun.agentId) {
|
|
1074
1074
|
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
1075
1075
|
}
|
|
1076
1076
|
let skills = [];
|
|
1077
|
-
|
|
1078
|
-
if (
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
)
|
|
1084
|
-
|
|
1085
|
-
(r) => r.status === "fulfilled"
|
|
1086
|
-
).map((r) => r.value).filter((s) => !s.deleted);
|
|
1087
|
-
}
|
|
1077
|
+
const resolvedSkillIds = evalRun.skillIds ?? [];
|
|
1078
|
+
if (resolvedSkillIds.length > 0) {
|
|
1079
|
+
const fetchResults = await Promise.allSettled(
|
|
1080
|
+
resolvedSkillIds.map((id) => api.getSkill(projectId2, id))
|
|
1081
|
+
);
|
|
1082
|
+
skills = fetchResults.filter(
|
|
1083
|
+
(r) => r.status === "fulfilled"
|
|
1084
|
+
).map((r) => r.value).filter((s) => !s.deleted);
|
|
1088
1085
|
if (evalRun.skillVersions && Object.keys(evalRun.skillVersions).length > 0) {
|
|
1089
1086
|
skills = await Promise.all(
|
|
1090
1087
|
skills.map(async (skill) => {
|
|
@@ -1169,13 +1166,22 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
1169
1166
|
resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
|
|
1170
1167
|
};
|
|
1171
1168
|
});
|
|
1172
|
-
|
|
1169
|
+
let presetName = "";
|
|
1170
|
+
if (evalRun.presetId) {
|
|
1171
|
+
try {
|
|
1172
|
+
const preset = await api.getPreset(projectId2, evalRun.presetId);
|
|
1173
|
+
presetName = preset.name;
|
|
1174
|
+
} catch {
|
|
1175
|
+
presetName = skills.length > 0 ? skills.map((s) => s.name).join(", ") : "";
|
|
1176
|
+
}
|
|
1177
|
+
} else if (skills.length > 0) {
|
|
1178
|
+
presetName = skills.map((s) => s.name).join(", ");
|
|
1179
|
+
}
|
|
1173
1180
|
return {
|
|
1174
1181
|
evalRun,
|
|
1175
1182
|
agent,
|
|
1176
1183
|
skills,
|
|
1177
|
-
|
|
1178
|
-
skillsGroupName,
|
|
1184
|
+
presetName,
|
|
1179
1185
|
mcps,
|
|
1180
1186
|
subAgents,
|
|
1181
1187
|
rules,
|
|
@@ -43710,18 +43716,20 @@ function extractTemplateFiles(before, after) {
|
|
|
43710
43716
|
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
43711
43717
|
var DEFAULT_AGENT_COMMAND = import_evalforge_types7.AgentRunCommand.CLAUDE;
|
|
43712
43718
|
async function runAgentWithContext(config2, evalRunId2, scenario, evalData, workDir) {
|
|
43713
|
-
const
|
|
43719
|
+
const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
|
|
43720
|
+
if (!hasEntities) {
|
|
43721
|
+
throw new Error(
|
|
43722
|
+
`Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
43723
|
+
);
|
|
43724
|
+
}
|
|
43714
43725
|
const agent = evalData.agent ?? void 0;
|
|
43715
43726
|
const isSDK = agent?.agentType === import_evalforge_types7.AgentType.SDK;
|
|
43716
|
-
if (!skillsGroupId) {
|
|
43717
|
-
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
43718
|
-
}
|
|
43719
43727
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
43720
43728
|
const adapter = getAdapter(identifier);
|
|
43721
43729
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
43722
43730
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
43723
|
-
const targetId =
|
|
43724
|
-
const targetName = evalData.
|
|
43731
|
+
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
43732
|
+
const targetName = evalData.presetName || agent?.name || "";
|
|
43725
43733
|
const executionContext = {
|
|
43726
43734
|
skills: evalData.skills,
|
|
43727
43735
|
scenario,
|
|
@@ -43769,7 +43777,7 @@ async function runAgentWithContext(config2, evalRunId2, scenario, evalData, work
|
|
|
43769
43777
|
|
|
43770
43778
|
// src/run-scenario/index.ts
|
|
43771
43779
|
async function runScenario(config2, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
43772
|
-
const targetId = evalData.evalRun.
|
|
43780
|
+
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
43773
43781
|
const workDir = await prepareWorkingDirectory(
|
|
43774
43782
|
config2,
|
|
43775
43783
|
evalRunId2,
|
|
@@ -44004,16 +44012,18 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44004
44012
|
skillCount: skills.length,
|
|
44005
44013
|
hasAgent: !!agent,
|
|
44006
44014
|
agentId: evalData.evalRun.agentId,
|
|
44007
|
-
|
|
44015
|
+
presetId: evalData.evalRun.presetId,
|
|
44016
|
+
skillIds: evalData.evalRun.skillIds
|
|
44008
44017
|
};
|
|
44009
|
-
|
|
44018
|
+
const hasEntities = skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || evalData.rules.length > 0;
|
|
44019
|
+
if (scenarioItems.length > 0 && !hasEntities) {
|
|
44010
44020
|
throw new Error(
|
|
44011
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no
|
|
44021
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
44012
44022
|
);
|
|
44013
44023
|
}
|
|
44014
|
-
if (scenarioItems.length > 0 &&
|
|
44024
|
+
if (scenarioItems.length > 0 && hasEntities && !agent) {
|
|
44015
44025
|
throw new Error(
|
|
44016
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for
|
|
44026
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
44017
44027
|
);
|
|
44018
44028
|
}
|
|
44019
44029
|
let completedScenarios = 0;
|
|
@@ -44025,16 +44035,16 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44025
44035
|
evalRunId: evalRunId2,
|
|
44026
44036
|
scenarioId: scenario.id,
|
|
44027
44037
|
scenarioName: scenario.name,
|
|
44028
|
-
|
|
44029
|
-
|
|
44038
|
+
presetId: evalData.evalRun.presetId,
|
|
44039
|
+
presetName: evalData.presetName,
|
|
44030
44040
|
agentId: agent?.id,
|
|
44031
44041
|
agentName: agent?.name,
|
|
44032
44042
|
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
44033
44043
|
};
|
|
44034
44044
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
44035
44045
|
console.log(
|
|
44036
|
-
"[Evaluator] Running scenario with
|
|
44037
|
-
evalData.
|
|
44046
|
+
"[Evaluator] Running scenario with preset:",
|
|
44047
|
+
evalData.presetName,
|
|
44038
44048
|
skillNames ? `(${skillNames})` : "",
|
|
44039
44049
|
agent ? `with agent: ${agent.name}` : "",
|
|
44040
44050
|
`(${completedScenarios + 1}/${totalScenarios})`
|
|
@@ -44060,8 +44070,8 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44060
44070
|
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
44061
44071
|
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
44062
44072
|
console.error(
|
|
44063
|
-
"[Evaluator] Failed to run scenario with
|
|
44064
|
-
evalData.
|
|
44073
|
+
"[Evaluator] Failed to run scenario with preset:",
|
|
44074
|
+
evalData.presetName,
|
|
44065
44075
|
"Error:",
|
|
44066
44076
|
errorMsg
|
|
44067
44077
|
);
|
|
@@ -44069,7 +44079,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44069
44079
|
console.error("[Evaluator] Stack trace:", errorStack);
|
|
44070
44080
|
}
|
|
44071
44081
|
throw new Error(
|
|
44072
|
-
`[${state.currentPhase}] Failed to execute
|
|
44082
|
+
`[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
44073
44083
|
);
|
|
44074
44084
|
}
|
|
44075
44085
|
}
|