npm - @wix/evalforge-evaluator - Versions diffs - 0.140.0 → 0.142.0 - Mend

@wix/evalforge-evaluator 0.140.0 → 0.142.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (6)

package/build/index.js +51 -48
package/build/index.js.map +2 -2
package/build/index.mjs +53 -49
package/build/index.mjs.map +2 -2
package/build/types/evaluation-loop.d.ts +1 -0
package/package.json +5 -5

package/build/index.mjs CHANGED Viewed

@@ -183,7 +183,8 @@ function createApiClient(serverUrl, options = "") {
 // src/fetch-evaluation-data.ts
 import {
   isSystemAssertionId,
-  SYSTEM_ASSERTIONS
+  SYSTEM_ASSERTIONS,
+  isAllowedBuildCommandString
 } from "@wix/evalforge-types";
 // src/resolve-placeholders.ts
@@ -267,13 +268,16 @@ function resolveSystemAssertion(assertionId, params) {
         }
       };
       break;
-    case "build_passed":
+    case "build_passed": {
+      const rawCmd = params?.command;
+      const command = typeof rawCmd === "string" && isAllowedBuildCommandString(rawCmd) ? rawCmd.trim() : void 0;
       baseAssertion = {
         type: "build_passed",
-        command: params?.command ?? void 0,
+        ...command !== void 0 && { command },
         expectedExitCode: params?.expectedExitCode ?? void 0
       };
       break;
+    }
     case "time_limit":
       baseAssertion = {
         type: "time_limit",
@@ -4762,6 +4766,7 @@ import { randomUUID as randomUUID5 } from "crypto";
 async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
   const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
   let completedExecutions = 0;
+  let erroredExecutions = 0;
   const totalExecutions = scenarioItems.length * runsPerScenario;
   const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
   const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -4798,6 +4803,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
           iterationIndex
         };
         await callbacks.addResult(errorResult);
+        erroredExecutions++;
       }
       if (scenarioResult !== null) {
         await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -4805,7 +4811,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
       completedExecutions++;
     }
   }
-  return { completedExecutions, totalExecutions };
+  return { completedExecutions, totalExecutions, erroredExecutions };
 }
 // src/error-reporter.ts
@@ -4989,50 +4995,46 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
   const skillNames = evalData.skills.map((s) => s.name).join(", ");
-  const { completedExecutions, totalExecutions } = await runEvaluationLoop(
-    scenarioItems,
-    evalData,
-    {
-      runScenario: (scenario, template, resolvedAssertions) => {
-        state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
-        state.currentContext = {
-          projectId: projectId2,
-          evalRunId: evalRunId2,
-          scenarioId: scenario.id,
-          scenarioName: scenario.name,
-          presetId: evalData.evalRun.presetId,
-          presetName: evalData.presetName,
-          agentId: agent?.id,
-          agentName: agent?.name
-        };
-        return runScenario(
-          config,
-          evalRunId2,
-          scenario,
-          evalData,
-          template,
-          resolvedAssertions
-        );
-      },
-      addResult: async (result) => {
-        state.currentPhase = ExecutionPhase.ADD_RESULT;
-        state.currentContext = {
-          ...state.currentContext,
-          resultId: result.id
-        };
-        await api.addResult(projectId2, evalRunId2, result);
-      },
-      onProgress: (completed, total, iterLabel) => {
-        console.log(
-          "[Evaluator] Running scenario with preset:",
-          evalData.presetName,
-          skillNames ? `(${skillNames})` : "",
-          agent ? `with agent: ${agent.name}` : "",
-          `(${completed}/${total})${iterLabel}`
-        );
-      }
+  const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
+    runScenario: (scenario, template, resolvedAssertions) => {
+      state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
+      state.currentContext = {
+        projectId: projectId2,
+        evalRunId: evalRunId2,
+        scenarioId: scenario.id,
+        scenarioName: scenario.name,
+        presetId: evalData.evalRun.presetId,
+        presetName: evalData.presetName,
+        agentId: agent?.id,
+        agentName: agent?.name
+      };
+      return runScenario(
+        config,
+        evalRunId2,
+        scenario,
+        evalData,
+        template,
+        resolvedAssertions
+      );
+    },
+    addResult: async (result) => {
+      state.currentPhase = ExecutionPhase.ADD_RESULT;
+      state.currentContext = {
+        ...state.currentContext,
+        resultId: result.id
+      };
+      await api.addResult(projectId2, evalRunId2, result);
+    },
+    onProgress: (completed, total, iterLabel) => {
+      console.log(
+        "[Evaluator] Running scenario with preset:",
+        evalData.presetName,
+        skillNames ? `(${skillNames})` : "",
+        agent ? `with agent: ${agent.name}` : "",
+        `(${completed}/${total})${iterLabel}`
+      );
     }
-  );
+  });
   state.currentPhase = ExecutionPhase.UPDATE_STATUS;
   state.currentContext = {
     projectId: projectId2,
@@ -5040,14 +5042,16 @@ async function runEvaluation(projectId2, evalRunId2) {
     completedExecutions,
     totalExecutions
   };
+  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
+  const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
   try {
     await api.updateEvalRun(projectId2, evalRunId2, {
-      status: EvalStatus2.COMPLETED,
+      status: finalStatus,
       completedAt: (/* @__PURE__ */ new Date()).toISOString()
     });
   } catch (updateErr) {
     throw new Error(
-      `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
+      `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
     );
   }
 }