@wix/evalforge-evaluator 0.44.0 → 0.46.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -166,6 +166,9 @@ function createApiClient(serverUrl, options = "") {
166
166
  getTemplate(projectId2, id) {
167
167
  return fetchJson(`/projects/${projectId2}/templates/${id}`);
168
168
  },
169
+ getAssertion(projectId2, id) {
170
+ return fetchJson(`/projects/${projectId2}/assertions/${id}`);
171
+ },
169
172
  addResult(projectId2, evalRunId2, result) {
170
173
  return postJson(
171
174
  `/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
@@ -184,6 +187,38 @@ function createApiClient(serverUrl, options = "") {
184
187
  }
185
188
 
186
189
  // src/fetch-evaluation-data.ts
190
+ function customAssertionToAssertion(ca) {
191
+ const config = ca.config;
192
+ switch (ca.type) {
193
+ case "skill_was_called":
194
+ return {
195
+ type: "skill_was_called",
196
+ skillName: config?.skillName ?? ""
197
+ };
198
+ case "build_passed":
199
+ return {
200
+ type: "build_passed",
201
+ command: config?.command,
202
+ expectedExitCode: config?.expectedExitCode
203
+ };
204
+ case "llm_judge":
205
+ case "custom":
206
+ return {
207
+ type: "llm_judge",
208
+ prompt: config?.prompt ?? "",
209
+ systemPrompt: config?.systemPrompt,
210
+ minScore: config?.minScore,
211
+ model: config?.model,
212
+ maxTokens: config?.maxTokens,
213
+ temperature: config?.temperature
214
+ };
215
+ default:
216
+ return {
217
+ type: "llm_judge",
218
+ prompt: ""
219
+ };
220
+ }
221
+ }
187
222
  async function fetchEvaluationData(api, projectId2, evalRunId2) {
188
223
  const evalRun = await api.getEvalRun(projectId2, evalRunId2);
189
224
  const scenarios = await Promise.all(
@@ -214,10 +249,21 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
214
249
  templateIds.map((id) => api.getTemplate(projectId2, id))
215
250
  ) : [];
216
251
  const templateMap = new Map(templates.map((t) => [t.id, t]));
217
- const scenarioItems = scenarios.map((scenario) => ({
218
- scenario,
219
- template: scenario.templateId ? templateMap.get(scenario.templateId) : void 0
220
- }));
252
+ const assertionIds = [
253
+ ...new Set(scenarios.flatMap((s) => s.assertionIds ?? []))
254
+ ];
255
+ const assertions = assertionIds.length > 0 ? await Promise.all(
256
+ assertionIds.map((id) => api.getAssertion(projectId2, id))
257
+ ) : [];
258
+ const assertionMap = new Map(assertions.map((a) => [a.id, a]));
259
+ const scenarioItems = scenarios.map((scenario) => {
260
+ const resolvedAssertions = (scenario.assertionIds ?? []).map((id) => assertionMap.get(id)).filter((a) => a !== void 0).map(customAssertionToAssertion);
261
+ return {
262
+ scenario,
263
+ template: scenario.templateId ? templateMap.get(scenario.templateId) : void 0,
264
+ resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
265
+ };
266
+ });
221
267
  return {
222
268
  evalRun,
223
269
  codeAgent,
@@ -8009,7 +8055,7 @@ function getTargetId(target) {
8009
8055
  return target.agent.id;
8010
8056
  }
8011
8057
  }
8012
- async function runScenario(config, evalRunId2, scenario, target, template) {
8058
+ async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
8013
8059
  const targetId = getTargetId(target);
8014
8060
  const workDir = await prepareWorkingDirectory(
8015
8061
  config,
@@ -8034,7 +8080,11 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
8034
8080
  partialResult = await callAgent(config, scenario, target.agent, workDir);
8035
8081
  break;
8036
8082
  }
8037
- const assertions = scenario.assertions ?? [];
8083
+ const inlineAssertions = scenario.assertions ?? [];
8084
+ const assertions = [
8085
+ ...inlineAssertions,
8086
+ ...resolvedAssertions ?? []
8087
+ ];
8038
8088
  const evaluationInput = {
8039
8089
  outputText: partialResult.outputText,
8040
8090
  llmTrace: partialResult.llmTrace,
@@ -8255,7 +8305,7 @@ async function runEvaluation(projectId2, evalRunId2) {
8255
8305
  }
8256
8306
  let completedScenarios = 0;
8257
8307
  const totalScenarios = scenarioItems.length * skills.length;
8258
- for (const { scenario, template } of scenarioItems) {
8308
+ for (const { scenario, template, resolvedAssertions } of scenarioItems) {
8259
8309
  for (const skill of skills) {
8260
8310
  state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
8261
8311
  state.currentContext = {
@@ -8281,7 +8331,8 @@ async function runEvaluation(projectId2, evalRunId2) {
8281
8331
  evalRunId2,
8282
8332
  scenario,
8283
8333
  { type: "skill", skill, agent: codeAgent ?? void 0 },
8284
- template
8334
+ template,
8335
+ resolvedAssertions
8285
8336
  );
8286
8337
  console.log("[Evaluator] Skill completed, adding result");
8287
8338
  state.currentPhase = ExecutionPhase.ADD_RESULT;