@wix/evalforge-evaluator 0.105.0 → 0.107.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +77 -38
- package/build/index.js.map +3 -3
- package/build/index.mjs +79 -38
- package/build/index.mjs.map +3 -3
- package/build/types/api-client.d.ts +2 -2
- package/build/types/fetch-evaluation-data.d.ts +6 -7
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +8 -4
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +1 -1
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -838,9 +838,6 @@ function createApiClient(serverUrl, options = "") {
|
|
|
838
838
|
getEvalRun(projectId2, id) {
|
|
839
839
|
return fetchJson(`/projects/${projectId2}/eval-runs/${id}`);
|
|
840
840
|
},
|
|
841
|
-
getSkillsGroup(projectId2, id) {
|
|
842
|
-
return fetchJson(`/projects/${projectId2}/skills-groups/${id}`);
|
|
843
|
-
},
|
|
844
841
|
getScenario(projectId2, id) {
|
|
845
842
|
return fetchJson(`/projects/${projectId2}/test-scenarios/${id}`);
|
|
846
843
|
},
|
|
@@ -872,6 +869,9 @@ function createApiClient(serverUrl, options = "") {
|
|
|
872
869
|
getRule(projectId2, id) {
|
|
873
870
|
return fetchJson(`/projects/${projectId2}/rules/${id}`);
|
|
874
871
|
},
|
|
872
|
+
getPreset(projectId2, id) {
|
|
873
|
+
return fetchJson(`/projects/${projectId2}/presets/${id}`);
|
|
874
|
+
},
|
|
875
875
|
getAssertion(projectId2, id) {
|
|
876
876
|
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
877
877
|
},
|
|
@@ -1074,17 +1074,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
1074
1074
|
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
1075
1075
|
}
|
|
1076
1076
|
let skills = [];
|
|
1077
|
-
|
|
1078
|
-
if (
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
)
|
|
1084
|
-
|
|
1085
|
-
(r) => r.status === "fulfilled"
|
|
1086
|
-
).map((r) => r.value).filter((s) => !s.deleted);
|
|
1087
|
-
}
|
|
1077
|
+
const resolvedSkillIds = evalRun.skillIds ?? [];
|
|
1078
|
+
if (resolvedSkillIds.length > 0) {
|
|
1079
|
+
const fetchResults = await Promise.allSettled(
|
|
1080
|
+
resolvedSkillIds.map((id) => api.getSkill(projectId2, id))
|
|
1081
|
+
);
|
|
1082
|
+
skills = fetchResults.filter(
|
|
1083
|
+
(r) => r.status === "fulfilled"
|
|
1084
|
+
).map((r) => r.value).filter((s) => !s.deleted);
|
|
1088
1085
|
if (evalRun.skillVersions && Object.keys(evalRun.skillVersions).length > 0) {
|
|
1089
1086
|
skills = await Promise.all(
|
|
1090
1087
|
skills.map(async (skill) => {
|
|
@@ -1169,13 +1166,22 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
1169
1166
|
resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
|
|
1170
1167
|
};
|
|
1171
1168
|
});
|
|
1172
|
-
|
|
1169
|
+
let presetName = "";
|
|
1170
|
+
if (evalRun.presetId) {
|
|
1171
|
+
try {
|
|
1172
|
+
const preset = await api.getPreset(projectId2, evalRun.presetId);
|
|
1173
|
+
presetName = preset.name;
|
|
1174
|
+
} catch {
|
|
1175
|
+
presetName = skills.length > 0 ? skills.map((s) => s.name).join(", ") : "";
|
|
1176
|
+
}
|
|
1177
|
+
} else if (skills.length > 0) {
|
|
1178
|
+
presetName = skills.map((s) => s.name).join(", ");
|
|
1179
|
+
}
|
|
1173
1180
|
return {
|
|
1174
1181
|
evalRun,
|
|
1175
1182
|
agent,
|
|
1176
1183
|
skills,
|
|
1177
|
-
|
|
1178
|
-
skillsGroupName,
|
|
1184
|
+
presetName,
|
|
1179
1185
|
mcps,
|
|
1180
1186
|
subAgents,
|
|
1181
1187
|
rules,
|
|
@@ -1499,6 +1505,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
1499
1505
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
1500
1506
|
var import_promises5 = require("fs/promises");
|
|
1501
1507
|
var import_path6 = require("path");
|
|
1508
|
+
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
1502
1509
|
var AGENTS_DIR = ".claude/agents";
|
|
1503
1510
|
function toAgentFilename(name26, index, nameCount) {
|
|
1504
1511
|
const base = (name26 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -1506,7 +1513,34 @@ function toAgentFilename(name26, index, nameCount) {
|
|
|
1506
1513
|
nameCount.set(base, count + 1);
|
|
1507
1514
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
1508
1515
|
}
|
|
1509
|
-
async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
1516
|
+
async function resolveSubAgentContent(agent, fetchFn) {
|
|
1517
|
+
if (agent.source) {
|
|
1518
|
+
try {
|
|
1519
|
+
const content = await fetchFn(agent.source, {
|
|
1520
|
+
userAgent: "EvalForge-Evaluator"
|
|
1521
|
+
});
|
|
1522
|
+
console.log(
|
|
1523
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
1524
|
+
);
|
|
1525
|
+
return content;
|
|
1526
|
+
} catch (error48) {
|
|
1527
|
+
const message = error48 instanceof Error ? error48.message : "Unknown error";
|
|
1528
|
+
console.error(
|
|
1529
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
1530
|
+
);
|
|
1531
|
+
throw new Error(
|
|
1532
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
1533
|
+
);
|
|
1534
|
+
}
|
|
1535
|
+
}
|
|
1536
|
+
if (!agent.subAgentMd) {
|
|
1537
|
+
console.warn(
|
|
1538
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
1539
|
+
);
|
|
1540
|
+
}
|
|
1541
|
+
return agent.subAgentMd;
|
|
1542
|
+
}
|
|
1543
|
+
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
1510
1544
|
if (subAgents.length === 0) return;
|
|
1511
1545
|
const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
|
|
1512
1546
|
await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
|
|
@@ -1514,7 +1548,8 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
|
1514
1548
|
for (const [i, agent] of subAgents.entries()) {
|
|
1515
1549
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
1516
1550
|
const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
|
|
1517
|
-
await (
|
|
1551
|
+
const content = await resolveSubAgentContent(agent, fetchFn);
|
|
1552
|
+
await (0, import_promises5.writeFile)(filePath, content, "utf8");
|
|
1518
1553
|
}
|
|
1519
1554
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
1520
1555
|
}
|
|
@@ -43710,18 +43745,20 @@ function extractTemplateFiles(before, after) {
|
|
|
43710
43745
|
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
43711
43746
|
var DEFAULT_AGENT_COMMAND = import_evalforge_types7.AgentRunCommand.CLAUDE;
|
|
43712
43747
|
async function runAgentWithContext(config2, evalRunId2, scenario, evalData, workDir) {
|
|
43713
|
-
const
|
|
43748
|
+
const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
|
|
43749
|
+
if (!hasEntities) {
|
|
43750
|
+
throw new Error(
|
|
43751
|
+
`Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
43752
|
+
);
|
|
43753
|
+
}
|
|
43714
43754
|
const agent = evalData.agent ?? void 0;
|
|
43715
43755
|
const isSDK = agent?.agentType === import_evalforge_types7.AgentType.SDK;
|
|
43716
|
-
if (!skillsGroupId) {
|
|
43717
|
-
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
43718
|
-
}
|
|
43719
43756
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
43720
43757
|
const adapter = getAdapter(identifier);
|
|
43721
43758
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
43722
43759
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
43723
|
-
const targetId =
|
|
43724
|
-
const targetName = evalData.
|
|
43760
|
+
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
43761
|
+
const targetName = evalData.presetName || agent?.name || "";
|
|
43725
43762
|
const executionContext = {
|
|
43726
43763
|
skills: evalData.skills,
|
|
43727
43764
|
scenario,
|
|
@@ -43769,7 +43806,7 @@ async function runAgentWithContext(config2, evalRunId2, scenario, evalData, work
|
|
|
43769
43806
|
|
|
43770
43807
|
// src/run-scenario/index.ts
|
|
43771
43808
|
async function runScenario(config2, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
43772
|
-
const targetId = evalData.evalRun.
|
|
43809
|
+
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
43773
43810
|
const workDir = await prepareWorkingDirectory(
|
|
43774
43811
|
config2,
|
|
43775
43812
|
evalRunId2,
|
|
@@ -44004,16 +44041,18 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44004
44041
|
skillCount: skills.length,
|
|
44005
44042
|
hasAgent: !!agent,
|
|
44006
44043
|
agentId: evalData.evalRun.agentId,
|
|
44007
|
-
|
|
44044
|
+
presetId: evalData.evalRun.presetId,
|
|
44045
|
+
skillIds: evalData.evalRun.skillIds
|
|
44008
44046
|
};
|
|
44009
|
-
|
|
44047
|
+
const hasEntities = skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || evalData.rules.length > 0;
|
|
44048
|
+
if (scenarioItems.length > 0 && !hasEntities) {
|
|
44010
44049
|
throw new Error(
|
|
44011
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no
|
|
44050
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
|
|
44012
44051
|
);
|
|
44013
44052
|
}
|
|
44014
|
-
if (scenarioItems.length > 0 &&
|
|
44053
|
+
if (scenarioItems.length > 0 && hasEntities && !agent) {
|
|
44015
44054
|
throw new Error(
|
|
44016
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for
|
|
44055
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
44017
44056
|
);
|
|
44018
44057
|
}
|
|
44019
44058
|
let completedScenarios = 0;
|
|
@@ -44025,16 +44064,16 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44025
44064
|
evalRunId: evalRunId2,
|
|
44026
44065
|
scenarioId: scenario.id,
|
|
44027
44066
|
scenarioName: scenario.name,
|
|
44028
|
-
|
|
44029
|
-
|
|
44067
|
+
presetId: evalData.evalRun.presetId,
|
|
44068
|
+
presetName: evalData.presetName,
|
|
44030
44069
|
agentId: agent?.id,
|
|
44031
44070
|
agentName: agent?.name,
|
|
44032
44071
|
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
44033
44072
|
};
|
|
44034
44073
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
44035
44074
|
console.log(
|
|
44036
|
-
"[Evaluator] Running scenario with
|
|
44037
|
-
evalData.
|
|
44075
|
+
"[Evaluator] Running scenario with preset:",
|
|
44076
|
+
evalData.presetName,
|
|
44038
44077
|
skillNames ? `(${skillNames})` : "",
|
|
44039
44078
|
agent ? `with agent: ${agent.name}` : "",
|
|
44040
44079
|
`(${completedScenarios + 1}/${totalScenarios})`
|
|
@@ -44060,8 +44099,8 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44060
44099
|
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
44061
44100
|
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
44062
44101
|
console.error(
|
|
44063
|
-
"[Evaluator] Failed to run scenario with
|
|
44064
|
-
evalData.
|
|
44102
|
+
"[Evaluator] Failed to run scenario with preset:",
|
|
44103
|
+
evalData.presetName,
|
|
44065
44104
|
"Error:",
|
|
44066
44105
|
errorMsg
|
|
44067
44106
|
);
|
|
@@ -44069,7 +44108,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
44069
44108
|
console.error("[Evaluator] Stack trace:", errorStack);
|
|
44070
44109
|
}
|
|
44071
44110
|
throw new Error(
|
|
44072
|
-
`[${state.currentPhase}] Failed to execute
|
|
44111
|
+
`[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
44073
44112
|
);
|
|
44074
44113
|
}
|
|
44075
44114
|
}
|