@wix/evalforge-evaluator 0.44.0 → 0.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -166,6 +166,9 @@ function createApiClient(serverUrl, options = "") {
|
|
|
166
166
|
getTemplate(projectId2, id) {
|
|
167
167
|
return fetchJson(`/projects/${projectId2}/templates/${id}`);
|
|
168
168
|
},
|
|
169
|
+
getAssertion(projectId2, id) {
|
|
170
|
+
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
171
|
+
},
|
|
169
172
|
addResult(projectId2, evalRunId2, result) {
|
|
170
173
|
return postJson(
|
|
171
174
|
`/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
|
|
@@ -184,6 +187,38 @@ function createApiClient(serverUrl, options = "") {
|
|
|
184
187
|
}
|
|
185
188
|
|
|
186
189
|
// src/fetch-evaluation-data.ts
|
|
190
|
+
// Converts a stored custom-assertion record (as fetched from the API) into the
// runtime assertion shape the evaluator consumes. The record's `config` may be
// absent, so every field read is optional-chained. An unrecognized `type`
// degrades to an empty llm_judge assertion rather than throwing.
function customAssertionToAssertion(ca) {
  const cfg = ca.config;
  if (ca.type === "skill_was_called") {
    return {
      type: "skill_was_called",
      skillName: cfg?.skillName ?? ""
    };
  }
  if (ca.type === "build_passed") {
    return {
      type: "build_passed",
      command: cfg?.command,
      expectedExitCode: cfg?.expectedExitCode
    };
  }
  // "custom" assertions are evaluated via the llm_judge pathway.
  if (ca.type === "llm_judge" || ca.type === "custom") {
    return {
      type: "llm_judge",
      prompt: cfg?.prompt ?? "",
      systemPrompt: cfg?.systemPrompt,
      minScore: cfg?.minScore,
      model: cfg?.model,
      maxTokens: cfg?.maxTokens,
      temperature: cfg?.temperature
    };
  }
  // Fallback for unknown assertion types: a no-op llm_judge with no prompt.
  return {
    type: "llm_judge",
    prompt: ""
  };
}
|
|
187
222
|
async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
188
223
|
const evalRun = await api.getEvalRun(projectId2, evalRunId2);
|
|
189
224
|
const scenarios = await Promise.all(
|
|
@@ -214,10 +249,21 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
214
249
|
templateIds.map((id) => api.getTemplate(projectId2, id))
|
|
215
250
|
) : [];
|
|
216
251
|
const templateMap = new Map(templates.map((t) => [t.id, t]));
|
|
217
|
-
const
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
252
|
+
const assertionIds = [
|
|
253
|
+
...new Set(scenarios.flatMap((s) => s.assertionIds ?? []))
|
|
254
|
+
];
|
|
255
|
+
const assertions = assertionIds.length > 0 ? await Promise.all(
|
|
256
|
+
assertionIds.map((id) => api.getAssertion(projectId2, id))
|
|
257
|
+
) : [];
|
|
258
|
+
const assertionMap = new Map(assertions.map((a) => [a.id, a]));
|
|
259
|
+
const scenarioItems = scenarios.map((scenario) => {
|
|
260
|
+
const resolvedAssertions = (scenario.assertionIds ?? []).map((id) => assertionMap.get(id)).filter((a) => a !== void 0).map(customAssertionToAssertion);
|
|
261
|
+
return {
|
|
262
|
+
scenario,
|
|
263
|
+
template: scenario.templateId ? templateMap.get(scenario.templateId) : void 0,
|
|
264
|
+
resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
|
|
265
|
+
};
|
|
266
|
+
});
|
|
221
267
|
return {
|
|
222
268
|
evalRun,
|
|
223
269
|
codeAgent,
|
|
@@ -8009,7 +8055,7 @@ function getTargetId(target) {
|
|
|
8009
8055
|
return target.agent.id;
|
|
8010
8056
|
}
|
|
8011
8057
|
}
|
|
8012
|
-
async function runScenario(config, evalRunId2, scenario, target, template) {
|
|
8058
|
+
async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
|
|
8013
8059
|
const targetId = getTargetId(target);
|
|
8014
8060
|
const workDir = await prepareWorkingDirectory(
|
|
8015
8061
|
config,
|
|
@@ -8034,7 +8080,11 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
|
|
|
8034
8080
|
partialResult = await callAgent(config, scenario, target.agent, workDir);
|
|
8035
8081
|
break;
|
|
8036
8082
|
}
|
|
8037
|
-
const
|
|
8083
|
+
const inlineAssertions = scenario.assertions ?? [];
|
|
8084
|
+
const assertions = [
|
|
8085
|
+
...inlineAssertions,
|
|
8086
|
+
...resolvedAssertions ?? []
|
|
8087
|
+
];
|
|
8038
8088
|
const evaluationInput = {
|
|
8039
8089
|
outputText: partialResult.outputText,
|
|
8040
8090
|
llmTrace: partialResult.llmTrace,
|
|
@@ -8255,7 +8305,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
8255
8305
|
}
|
|
8256
8306
|
let completedScenarios = 0;
|
|
8257
8307
|
const totalScenarios = scenarioItems.length * skills.length;
|
|
8258
|
-
for (const { scenario, template } of scenarioItems) {
|
|
8308
|
+
for (const { scenario, template, resolvedAssertions } of scenarioItems) {
|
|
8259
8309
|
for (const skill of skills) {
|
|
8260
8310
|
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
8261
8311
|
state.currentContext = {
|
|
@@ -8281,7 +8331,8 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
8281
8331
|
evalRunId2,
|
|
8282
8332
|
scenario,
|
|
8283
8333
|
{ type: "skill", skill, agent: codeAgent ?? void 0 },
|
|
8284
|
-
template
|
|
8334
|
+
template,
|
|
8335
|
+
resolvedAssertions
|
|
8285
8336
|
);
|
|
8286
8337
|
console.log("[Evaluator] Skill completed, adding result");
|
|
8287
8338
|
state.currentPhase = ExecutionPhase.ADD_RESULT;
|