npm - @wix/evalforge-evaluator - Versions diffs - 0.44.0 → 0.46.0 - Mend

@wix/evalforge-evaluator 0.44.0 → 0.46.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/build/index.js +59 -8
package/build/index.js.map +2 -2
package/build/index.mjs +59 -8
package/build/index.mjs.map +2 -2
package/build/types/api-client.d.ts +2 -1
package/build/types/fetch-evaluation-data.d.ts +4 -2
package/build/types/run-scenario/index.d.ts +3 -1
package/package.json +4 -4

package/build/index.js CHANGED

@@ -166,6 +166,9 @@ function createApiClient(serverUrl, options = "") {
     getTemplate(projectId2, id) {
       return fetchJson(`/projects/${projectId2}/templates/${id}`);
     },
+    getAssertion(projectId2, id) {
+      return fetchJson(`/projects/${projectId2}/assertions/${id}`);
+    },
     addResult(projectId2, evalRunId2, result) {
       return postJson(
         `/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
@@ -184,6 +187,38 @@ function createApiClient(serverUrl, options = "") {
 }
 // src/fetch-evaluation-data.ts
+function customAssertionToAssertion(ca) {
+  const config = ca.config;
+  switch (ca.type) {
+    case "skill_was_called":
+      return {
+        type: "skill_was_called",
+        skillName: config?.skillName ?? ""
+      };
+    case "build_passed":
+      return {
+        type: "build_passed",
+        command: config?.command,
+        expectedExitCode: config?.expectedExitCode
+      };
+    case "llm_judge":
+    case "custom":
+      return {
+        type: "llm_judge",
+        prompt: config?.prompt ?? "",
+        systemPrompt: config?.systemPrompt,
+        minScore: config?.minScore,
+        model: config?.model,
+        maxTokens: config?.maxTokens,
+        temperature: config?.temperature
+      };
+    default:
+      return {
+        type: "llm_judge",
+        prompt: ""
+      };
+  }
+}
 async function fetchEvaluationData(api, projectId2, evalRunId2) {
   const evalRun = await api.getEvalRun(projectId2, evalRunId2);
   const scenarios = await Promise.all(
@@ -214,10 +249,21 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
     templateIds.map((id) => api.getTemplate(projectId2, id))
   ) : [];
   const templateMap = new Map(templates.map((t) => [t.id, t]));
-  const scenarioItems = scenarios.map((scenario) => ({
-    scenario,
-    template: scenario.templateId ? templateMap.get(scenario.templateId) : void 0
-  }));
+  const assertionIds = [
+    ...new Set(scenarios.flatMap((s) => s.assertionIds ?? []))
+  ];
+  const assertions = assertionIds.length > 0 ? await Promise.all(
+    assertionIds.map((id) => api.getAssertion(projectId2, id))
+  ) : [];
+  const assertionMap = new Map(assertions.map((a) => [a.id, a]));
+  const scenarioItems = scenarios.map((scenario) => {
+    const resolvedAssertions = (scenario.assertionIds ?? []).map((id) => assertionMap.get(id)).filter((a) => a !== void 0).map(customAssertionToAssertion);
+    return {
+      scenario,
+      template: scenario.templateId ? templateMap.get(scenario.templateId) : void 0,
+      resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
+    };
+  });
   return {
     evalRun,
     codeAgent,
@@ -8009,7 +8055,7 @@ function getTargetId(target) {
       return target.agent.id;
   }
 }
-async function runScenario(config, evalRunId2, scenario, target, template) {
+async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
   const targetId = getTargetId(target);
   const workDir = await prepareWorkingDirectory(
     config,
@@ -8034,7 +8080,11 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
       partialResult = await callAgent(config, scenario, target.agent, workDir);
       break;
   }
-  const assertions = scenario.assertions ?? [];
+  const inlineAssertions = scenario.assertions ?? [];
+  const assertions = [
+    ...inlineAssertions,
+    ...resolvedAssertions ?? []
+  ];
   const evaluationInput = {
     outputText: partialResult.outputText,
     llmTrace: partialResult.llmTrace,
@@ -8255,7 +8305,7 @@ async function runEvaluation(projectId2, evalRunId2) {
   }
   let completedScenarios = 0;
   const totalScenarios = scenarioItems.length * skills.length;
-  for (const { scenario, template } of scenarioItems) {
+  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
     for (const skill of skills) {
       state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
       state.currentContext = {
@@ -8281,7 +8331,8 @@ async function runEvaluation(projectId2, evalRunId2) {
           evalRunId2,
           scenario,
           { type: "skill", skill, agent: codeAgent ?? void 0 },
-          template
+          template,
+          resolvedAssertions
         );
         console.log("[Evaluator] Skill completed, adding result");
         state.currentPhase = ExecutionPhase.ADD_RESULT;