npm - @wix/evalforge-evaluator - Versions diffs - 0.154.0 → 0.155.0 - Mend

@wix/evalforge-evaluator 0.154.0 → 0.155.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/build/index.js +26 -3
package/build/index.js.map +2 -2
package/build/index.mjs +26 -3
package/build/index.mjs.map +2 -2
package/build/types/evaluation-loop.d.ts +6 -0
package/package.json +2 -2

package/build/index.mjs CHANGED Viewed

@@ -4995,6 +4995,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
   const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
   let completedExecutions = 0;
   let erroredExecutions = 0;
+  let firstErrorMessage;
   const totalExecutions = scenarioItems.length * runsPerScenario;
   const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
   const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -5032,6 +5033,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
         };
         await callbacks.addResult(errorResult);
         erroredExecutions++;
+        if (firstErrorMessage === void 0) {
+          firstErrorMessage = errorResult.outputText;
+        }
       }
       if (scenarioResult !== null) {
         await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -5039,7 +5043,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
       completedExecutions++;
     }
   }
-  return { completedExecutions, totalExecutions, erroredExecutions };
+  return {
+    completedExecutions,
+    totalExecutions,
+    erroredExecutions,
+    firstErrorMessage
+  };
 }
 // src/error-reporter.ts
@@ -5223,7 +5232,12 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
   const skillNames = evalData.skills.map((s) => s.name).join(", ");
-  const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
+  const {
+    completedExecutions,
+    totalExecutions,
+    erroredExecutions,
+    firstErrorMessage
+  } = await runEvaluationLoop(scenarioItems, evalData, {
     runScenario: (scenario, template, resolvedAssertions) => {
       state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
       state.currentContext = {
@@ -5272,10 +5286,14 @@ async function runEvaluation(projectId2, evalRunId2) {
   };
   const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
   const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
+  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
+    firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
+  ) : void 0;
   try {
     await api.updateEvalRun(projectId2, evalRunId2, {
       status: finalStatus,
-      completedAt: (/* @__PURE__ */ new Date()).toISOString()
+      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
+      ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
     });
   } catch (updateErr) {
     throw new Error(
@@ -5283,6 +5301,11 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
 }
/**
 * Number of UTF-16 code units of the original message that
 * `truncateForJobError` keeps before appending its truncation marker.
 */
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shortens an error message so it can be stored as a job error.
 *
 * Messages of at most `JOB_ERROR_MAX_LENGTH` UTF-16 code units are returned
 * unchanged. Longer messages are cut to that many code units and suffixed
 * with "\u2026 [truncated]"; the suffix is added on top of the retained
 * prefix, so a truncated result is longer than `JOB_ERROR_MAX_LENGTH`.
 *
 * @param {string} message - Error text to shorten.
 * @returns {string} The original message, or its truncated prefix plus the
 *   truncation marker.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;

  let cut = JOB_ERROR_MAX_LENGTH;
  // `slice` counts UTF-16 code units. If the last retained unit is a high
  // surrogate, the cut would split an astral character (e.g. an emoji) and
  // leave a lone surrogate in the result, so drop that unit as well.
  const lastKeptUnit = message.charCodeAt(cut - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    cut -= 1;
  }

  return `${message.slice(0, cut)}\u2026 [truncated]`;
}
 var projectId = process.argv[2];
 var evalRunId = process.argv[3];
 console.error(