npm - @wix/evalforge-evaluator - Versions diffs - 0.140.0 → 0.142.0 - Mend

@wix/evalforge-evaluator 0.140.0 → 0.142.0

This diff shows the changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (6)

package/build/index.js +51 -48
package/build/index.js.map +2 -2
package/build/index.mjs +53 -49
package/build/index.mjs.map +2 -2
package/build/types/evaluation-loop.d.ts +1 -0
package/package.json +5 -5

package/build/index.js CHANGED Viewed

@@ -287,13 +287,16 @@ function resolveSystemAssertion(assertionId, params) {
         }
       };
       break;
-    case "build_passed":
+    case "build_passed": {
+      const rawCmd = params?.command;
+      const command = typeof rawCmd === "string" && (0, import_evalforge_types.isAllowedBuildCommandString)(rawCmd) ? rawCmd.trim() : void 0;
       baseAssertion = {
         type: "build_passed",
-        command: params?.command ?? void 0,
+        ...command !== void 0 && { command },
         expectedExitCode: params?.expectedExitCode ?? void 0
       };
       break;
+    }
     case "time_limit":
       baseAssertion = {
         type: "time_limit",
@@ -4754,6 +4757,7 @@ var import_crypto5 = require("crypto");
 async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
   const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
   let completedExecutions = 0;
+  let erroredExecutions = 0;
   const totalExecutions = scenarioItems.length * runsPerScenario;
   const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
   const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -4790,6 +4794,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
           iterationIndex
         };
         await callbacks.addResult(errorResult);
+        erroredExecutions++;
       }
       if (scenarioResult !== null) {
         await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -4797,7 +4802,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
       completedExecutions++;
     }
   }
-  return { completedExecutions, totalExecutions };
+  return { completedExecutions, totalExecutions, erroredExecutions };
 }
 // src/error-reporter.ts
@@ -4981,50 +4986,46 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
   const skillNames = evalData.skills.map((s) => s.name).join(", ");
-  const { completedExecutions, totalExecutions } = await runEvaluationLoop(
-    scenarioItems,
-    evalData,
-    {
-      runScenario: (scenario, template, resolvedAssertions) => {
-        state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
-        state.currentContext = {
-          projectId: projectId2,
-          evalRunId: evalRunId2,
-          scenarioId: scenario.id,
-          scenarioName: scenario.name,
-          presetId: evalData.evalRun.presetId,
-          presetName: evalData.presetName,
-          agentId: agent?.id,
-          agentName: agent?.name
-        };
-        return runScenario(
-          config,
-          evalRunId2,
-          scenario,
-          evalData,
-          template,
-          resolvedAssertions
-        );
-      },
-      addResult: async (result) => {
-        state.currentPhase = ExecutionPhase.ADD_RESULT;
-        state.currentContext = {
-          ...state.currentContext,
-          resultId: result.id
-        };
-        await api.addResult(projectId2, evalRunId2, result);
-      },
-      onProgress: (completed, total, iterLabel) => {
-        console.log(
-          "[Evaluator] Running scenario with preset:",
-          evalData.presetName,
-          skillNames ? `(${skillNames})` : "",
-          agent ? `with agent: ${agent.name}` : "",
-          `(${completed}/${total})${iterLabel}`
-        );
-      }
+  const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
+    runScenario: (scenario, template, resolvedAssertions) => {
+      state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
+      state.currentContext = {
+        projectId: projectId2,
+        evalRunId: evalRunId2,
+        scenarioId: scenario.id,
+        scenarioName: scenario.name,
+        presetId: evalData.evalRun.presetId,
+        presetName: evalData.presetName,
+        agentId: agent?.id,
+        agentName: agent?.name
+      };
+      return runScenario(
+        config,
+        evalRunId2,
+        scenario,
+        evalData,
+        template,
+        resolvedAssertions
+      );
+    },
+    addResult: async (result) => {
+      state.currentPhase = ExecutionPhase.ADD_RESULT;
+      state.currentContext = {
+        ...state.currentContext,
+        resultId: result.id
+      };
+      await api.addResult(projectId2, evalRunId2, result);
+    },
+    onProgress: (completed, total, iterLabel) => {
+      console.log(
+        "[Evaluator] Running scenario with preset:",
+        evalData.presetName,
+        skillNames ? `(${skillNames})` : "",
+        agent ? `with agent: ${agent.name}` : "",
+        `(${completed}/${total})${iterLabel}`
+      );
     }
-  );
+  });
   state.currentPhase = ExecutionPhase.UPDATE_STATUS;
   state.currentContext = {
     projectId: projectId2,
@@ -5032,14 +5033,16 @@ async function runEvaluation(projectId2, evalRunId2) {
     completedExecutions,
     totalExecutions
   };
+  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
+  const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
   try {
     await api.updateEvalRun(projectId2, evalRunId2, {
-      status: import_evalforge_types15.EvalStatus.COMPLETED,
+      status: finalStatus,
       completedAt: (/* @__PURE__ */ new Date()).toISOString()
     });
   } catch (updateErr) {
     throw new Error(
-      `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
+      `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
     );
   }
 }