@wix/evalforge-evaluator 0.154.0 → 0.156.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -664,6 +664,14 @@ async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_git
664
664
  }
665
665
  }
666
666
 
667
+ // src/run-scenario/agents/timeout.ts
668
// Timeout applied when neither a turn limit nor an explicit duration is given:
// 60 * 60_000 ms = 60 minutes.
var UNLIMITED_RUN_TIMEOUT_MS = 60 * 6e4;

/**
 * Resolve the timeout (in milliseconds) for a single agent run.
 *
 * Precedence:
 *   1. An explicit `maxDurationMs` is returned as-is.
 *   2. With no turn limit, `UNLIMITED_RUN_TIMEOUT_MS` is used.
 *   3. Otherwise one minute (60_000 ms) per turn, with a 5-minute (300_000 ms) floor.
 *
 * `null` is treated the same as `undefined` for both arguments. The call sites
 * this helper replaced used `??`, which fell back on `null`; a strict
 * `!== undefined` check would instead hand `null` back as the timeout value.
 *
 * @param {number | null | undefined} maxTurns - Turn limit, or nullish for "no limit".
 * @param {number | null | undefined} maxDurationMs - Explicit timeout override in ms.
 * @returns {number} Timeout in milliseconds.
 */
function resolveTimeoutMs(maxTurns, maxDurationMs) {
  // `!= null` deliberately matches both null and undefined.
  if (maxDurationMs != null) return maxDurationMs;
  if (maxTurns == null) return UNLIMITED_RUN_TIMEOUT_MS;
  return Math.max(3e5, maxTurns * 6e4);
}
674
+
667
675
  // src/run-scenario/agents/claude-code/execute.ts
668
676
  var import_crypto = require("crypto");
669
677
 
@@ -1294,7 +1302,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
1294
1302
  traceContext.authToken
1295
1303
  );
1296
1304
  }
1297
- const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
1305
+ const SDK_TIMEOUT_MS = resolveTimeoutMs(maxTurns, options.maxDurationMs);
1298
1306
  let timeoutHandle;
1299
1307
  let timedOut = false;
1300
1308
  const HEARTBEAT_INTERVAL_MS = 1e4;
@@ -3097,7 +3105,7 @@ async function executeWithOpenCode(skills, scenario, options) {
3097
3105
  model: options.model
3098
3106
  });
3099
3107
  const maxTurns = options.maxTurns || void 0;
3100
- const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
3108
+ const sdkTimeoutMs = resolveTimeoutMs(maxTurns, options.maxDurationMs);
3101
3109
  const { env, providerID, modelID } = await buildOpenCodeEnv({
3102
3110
  model: options.model,
3103
3111
  temperature: options.temperature,
@@ -3707,7 +3715,10 @@ async function executeWithAiSdk(context) {
3707
3715
  emitStartEvent(traceContext, startTime);
3708
3716
  }
3709
3717
  const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3710
- const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
3718
+ const SDK_TIMEOUT_MS = resolveTimeoutMs(
3719
+ effectiveMaxTurns,
3720
+ cfg.maxDurationMs
3721
+ );
3711
3722
  const abortController = new AbortController();
3712
3723
  const timeoutHandle = setTimeout(() => {
3713
3724
  abortController.abort(
@@ -4976,6 +4987,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4976
4987
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4977
4988
  let completedExecutions = 0;
4978
4989
  let erroredExecutions = 0;
4990
+ let firstErrorMessage;
4979
4991
  const totalExecutions = scenarioItems.length * runsPerScenario;
4980
4992
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4981
4993
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -5013,6 +5025,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5013
5025
  };
5014
5026
  await callbacks.addResult(errorResult);
5015
5027
  erroredExecutions++;
5028
+ if (firstErrorMessage === void 0) {
5029
+ firstErrorMessage = errorResult.outputText;
5030
+ }
5016
5031
  }
5017
5032
  if (scenarioResult !== null) {
5018
5033
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -5020,7 +5035,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5020
5035
  completedExecutions++;
5021
5036
  }
5022
5037
  }
5023
- return { completedExecutions, totalExecutions, erroredExecutions };
5038
+ return {
5039
+ completedExecutions,
5040
+ totalExecutions,
5041
+ erroredExecutions,
5042
+ firstErrorMessage
5043
+ };
5024
5044
  }
5025
5045
 
5026
5046
  // src/error-reporter.ts
@@ -5204,7 +5224,12 @@ async function runEvaluation(projectId2, evalRunId2) {
5204
5224
  );
5205
5225
  }
5206
5226
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
5207
- const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
5227
+ const {
5228
+ completedExecutions,
5229
+ totalExecutions,
5230
+ erroredExecutions,
5231
+ firstErrorMessage
5232
+ } = await runEvaluationLoop(scenarioItems, evalData, {
5208
5233
  runScenario: (scenario, template, resolvedAssertions) => {
5209
5234
  state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
5210
5235
  state.currentContext = {
@@ -5253,10 +5278,14 @@ async function runEvaluation(projectId2, evalRunId2) {
5253
5278
  };
5254
5279
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5255
5280
  const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
5281
+ const jobErrorOnAllFailed = allFailed ? truncateForJobError(
5282
+ firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
5283
+ ) : void 0;
5256
5284
  try {
5257
5285
  await api.updateEvalRun(projectId2, evalRunId2, {
5258
5286
  status: finalStatus,
5259
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
5287
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
5288
+ ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
5260
5289
  });
5261
5290
  } catch (updateErr) {
5262
5291
  throw new Error(
@@ -5264,6 +5293,11 @@ async function runEvaluation(projectId2, evalRunId2) {
5264
5293
  );
5265
5294
  }
5266
5295
  }
5296
// Maximum number of UTF-16 code units of the original message kept in a job error.
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shorten an error message before it is stored as a job error.
 *
 * Messages of at most `JOB_ERROR_MAX_LENGTH` code units are returned unchanged.
 * Longer messages are cut and suffixed with "… [truncated]", so the returned
 * string can be longer than `JOB_ERROR_MAX_LENGTH` by the length of that suffix.
 *
 * The cut never lands in the middle of a surrogate pair: if the last retained
 * code unit is a high surrogate, it is dropped as well so the result does not
 * contain a dangling half of a character.
 *
 * @param {string} message - Error message to shorten.
 * @returns {string} The original message, or its truncated form with a marker.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
  let end = JOB_ERROR_MAX_LENGTH;
  const lastUnit = message.charCodeAt(end - 1);
  // 0xD800–0xDBFF is the high-surrogate range; its partner would be cut off.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return `${message.slice(0, end)}\u2026 [truncated]`;
}
5267
5301
  var projectId = process.argv[2];
5268
5302
  var evalRunId = process.argv[3];
5269
5303
  console.error(