npm - @wix/evalforge-evaluator - Versions diffs - 0.154.0 → 0.156.0 - Mend

@wix/evalforge-evaluator 0.154.0 → 0.156.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/build/index.js +40 -6
package/build/index.js.map +3 -3
package/build/index.mjs +40 -6
package/build/index.mjs.map +3 -3
package/build/types/evaluation-loop.d.ts +6 -0
package/build/types/run-scenario/agents/timeout.d.ts +9 -0
package/package.json +2 -2

package/build/index.js CHANGED Viewed

@@ -664,6 +664,14 @@ async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_git
   }
 }
+// src/run-scenario/agents/timeout.ts
+var UNLIMITED_RUN_TIMEOUT_MS = 60 * 6e4; // 60 * 60,000 ms = 3,600,000 ms (60 minutes); returned when neither limit is supplied
+function resolveTimeoutMs(maxTurns, maxDurationMs) { // Resolves a timeout in ms: explicit duration first, then the 60-minute fallback, then a per-turn budget
+  if (maxDurationMs !== void 0) return maxDurationMs; // explicit duration wins. NOTE(review): unlike the `??` form this replaces, a null maxDurationMs is returned as-is — confirm callers never pass null
+  if (maxTurns === void 0) return UNLIMITED_RUN_TIMEOUT_MS; // no turn limit: fall back to the 60-minute constant
+  return Math.max(3e5, maxTurns * 6e4); // 60,000 ms per turn, with a floor of 300,000 ms (5 minutes)
+}
 // src/run-scenario/agents/claude-code/execute.ts
 var import_crypto = require("crypto");
@@ -1294,7 +1302,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
       traceContext.authToken
     );
   }
-  const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
+  const SDK_TIMEOUT_MS = resolveTimeoutMs(maxTurns, options.maxDurationMs);
   let timeoutHandle;
   let timedOut = false;
   const HEARTBEAT_INTERVAL_MS = 1e4;
@@ -3097,7 +3105,7 @@ async function executeWithOpenCode(skills, scenario, options) {
     model: options.model
   });
   const maxTurns = options.maxTurns || void 0;
-  const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
+  const sdkTimeoutMs = resolveTimeoutMs(maxTurns, options.maxDurationMs);
   const { env, providerID, modelID } = await buildOpenCodeEnv({
     model: options.model,
     temperature: options.temperature,
@@ -3707,7 +3715,10 @@ async function executeWithAiSdk(context) {
     emitStartEvent(traceContext, startTime);
   }
   const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
-  const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
+  const SDK_TIMEOUT_MS = resolveTimeoutMs(
+    effectiveMaxTurns,
+    cfg.maxDurationMs
+  );
   const abortController = new AbortController();
   const timeoutHandle = setTimeout(() => {
     abortController.abort(
@@ -4976,6 +4987,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
   const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
   let completedExecutions = 0;
   let erroredExecutions = 0;
+  let firstErrorMessage;
   const totalExecutions = scenarioItems.length * runsPerScenario;
   const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
   const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -5013,6 +5025,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
         };
         await callbacks.addResult(errorResult);
         erroredExecutions++;
+        if (firstErrorMessage === void 0) {
+          firstErrorMessage = errorResult.outputText;
+        }
       }
       if (scenarioResult !== null) {
         await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -5020,7 +5035,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
       completedExecutions++;
     }
   }
-  return { completedExecutions, totalExecutions, erroredExecutions };
+  return {
+    completedExecutions,
+    totalExecutions,
+    erroredExecutions,
+    firstErrorMessage
+  };
 }
 // src/error-reporter.ts
@@ -5204,7 +5224,12 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
   const skillNames = evalData.skills.map((s) => s.name).join(", ");
-  const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
+  const {
+    completedExecutions,
+    totalExecutions,
+    erroredExecutions,
+    firstErrorMessage
+  } = await runEvaluationLoop(scenarioItems, evalData, {
     runScenario: (scenario, template, resolvedAssertions) => {
       state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
       state.currentContext = {
@@ -5253,10 +5278,14 @@ async function runEvaluation(projectId2, evalRunId2) {
   };
   const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
   const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
+  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
+    firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
+  ) : void 0;
   try {
     await api.updateEvalRun(projectId2, evalRunId2, {
       status: finalStatus,
-      completedAt: (/* @__PURE__ */ new Date()).toISOString()
+      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
+      ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
     });
   } catch (updateErr) {
     throw new Error(
@@ -5264,6 +5293,11 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
 }
+var JOB_ERROR_MAX_LENGTH = 1e3; // at most 1,000 UTF-16 code units of the original message are kept
+function truncateForJobError(message) { // Returns message unchanged when it fits, otherwise its first 1,000 code units followed by a truncation marker
+  if (message.length <= JOB_ERROR_MAX_LENGTH) return message; // assumes message is a string — TODO confirm against callers
+  return `${message.slice(0, JOB_ERROR_MAX_LENGTH)}\u2026 [truncated]`; // NOTE(review): result exceeds JOB_ERROR_MAX_LENGTH because the ellipsis and " [truncated]" are appended after the cut; slice() may also split a surrogate pair at the boundary
+}
 var projectId = process.argv[2];
 var evalRunId = process.argv[3];
 console.error(