@wix/evalforge-evaluator 0.154.0 → 0.156.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -659,6 +659,14 @@ async function writeSkillToFilesystem(cwd, skill, fetchFn = fetchGitHubFolder2)
659
659
  }
660
660
  }
661
661
 
// src/run-scenario/agents/timeout.ts
/** Ceiling (60 minutes, in ms) used when a run has neither a duration limit nor a turn limit. */
var UNLIMITED_RUN_TIMEOUT_MS = 60 * 6e4;
/**
 * Resolves the timeout, in milliseconds, for a single agent run.
 *
 * Precedence:
 *   1. An explicit `maxDurationMs` is returned unchanged.
 *   2. With no turn limit, the run gets `UNLIMITED_RUN_TIMEOUT_MS`.
 *   3. Otherwise the budget is one minute (6e4 ms) per turn, with a
 *      five-minute (3e5 ms) floor.
 *
 * @param maxTurns Turn limit for the run, or `undefined`/`null` when unlimited.
 * @param maxDurationMs Explicit timeout override, or `undefined`/`null` when unset.
 * @returns The timeout in milliseconds.
 */
function resolveTimeoutMs(maxTurns, maxDurationMs) {
  // Nullish checks (not `!== undefined`): the call sites this helper replaced
  // used `??`, so a `null` coming from a config/JSON payload must count as
  // "unset" instead of being returned as the timeout itself.
  if (maxDurationMs != null) return maxDurationMs;
  // A `null` turn limit would otherwise coerce to 0 in the multiplication
  // below and silently collapse to the five-minute floor.
  if (maxTurns == null) return UNLIMITED_RUN_TIMEOUT_MS;
  return Math.max(3e5, maxTurns * 6e4);
}
669
+
662
670
  // src/run-scenario/agents/claude-code/execute.ts
663
671
  import { randomUUID } from "crypto";
664
672
 
@@ -1291,7 +1299,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
1291
1299
  traceContext.authToken
1292
1300
  );
1293
1301
  }
1294
- const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
1302
+ const SDK_TIMEOUT_MS = resolveTimeoutMs(maxTurns, options.maxDurationMs);
1295
1303
  let timeoutHandle;
1296
1304
  let timedOut = false;
1297
1305
  const HEARTBEAT_INTERVAL_MS = 1e4;
@@ -3104,7 +3112,7 @@ async function executeWithOpenCode(skills, scenario, options) {
3104
3112
  model: options.model
3105
3113
  });
3106
3114
  const maxTurns = options.maxTurns || void 0;
3107
- const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
3115
+ const sdkTimeoutMs = resolveTimeoutMs(maxTurns, options.maxDurationMs);
3108
3116
  const { env, providerID, modelID } = await buildOpenCodeEnv({
3109
3117
  model: options.model,
3110
3118
  temperature: options.temperature,
@@ -3726,7 +3734,10 @@ async function executeWithAiSdk(context) {
3726
3734
  emitStartEvent(traceContext, startTime);
3727
3735
  }
3728
3736
  const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3729
- const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
3737
+ const SDK_TIMEOUT_MS = resolveTimeoutMs(
3738
+ effectiveMaxTurns,
3739
+ cfg.maxDurationMs
3740
+ );
3730
3741
  const abortController = new AbortController();
3731
3742
  const timeoutHandle = setTimeout(() => {
3732
3743
  abortController.abort(
@@ -4995,6 +5006,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4995
5006
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4996
5007
  let completedExecutions = 0;
4997
5008
  let erroredExecutions = 0;
5009
+ let firstErrorMessage;
4998
5010
  const totalExecutions = scenarioItems.length * runsPerScenario;
4999
5011
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
5000
5012
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -5032,6 +5044,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5032
5044
  };
5033
5045
  await callbacks.addResult(errorResult);
5034
5046
  erroredExecutions++;
5047
+ if (firstErrorMessage === void 0) {
5048
+ firstErrorMessage = errorResult.outputText;
5049
+ }
5035
5050
  }
5036
5051
  if (scenarioResult !== null) {
5037
5052
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -5039,7 +5054,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5039
5054
  completedExecutions++;
5040
5055
  }
5041
5056
  }
5042
- return { completedExecutions, totalExecutions, erroredExecutions };
5057
+ return {
5058
+ completedExecutions,
5059
+ totalExecutions,
5060
+ erroredExecutions,
5061
+ firstErrorMessage
5062
+ };
5043
5063
  }
5044
5064
 
5045
5065
  // src/error-reporter.ts
@@ -5223,7 +5243,12 @@ async function runEvaluation(projectId2, evalRunId2) {
5223
5243
  );
5224
5244
  }
5225
5245
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
5226
- const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
5246
+ const {
5247
+ completedExecutions,
5248
+ totalExecutions,
5249
+ erroredExecutions,
5250
+ firstErrorMessage
5251
+ } = await runEvaluationLoop(scenarioItems, evalData, {
5227
5252
  runScenario: (scenario, template, resolvedAssertions) => {
5228
5253
  state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
5229
5254
  state.currentContext = {
@@ -5272,10 +5297,14 @@ async function runEvaluation(projectId2, evalRunId2) {
5272
5297
  };
5273
5298
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5274
5299
  const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
5300
+ const jobErrorOnAllFailed = allFailed ? truncateForJobError(
5301
+ firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
5302
+ ) : void 0;
5275
5303
  try {
5276
5304
  await api.updateEvalRun(projectId2, evalRunId2, {
5277
5305
  status: finalStatus,
5278
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
5306
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
5307
+ ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
5279
5308
  });
5280
5309
  } catch (updateErr) {
5281
5310
  throw new Error(
@@ -5283,6 +5312,11 @@ async function runEvaluation(projectId2, evalRunId2) {
5283
5312
  );
5284
5313
  }
5285
5314
  }
/** Maximum number of UTF-16 code units of the original message kept in a job error. */
var JOB_ERROR_MAX_LENGTH = 1e3;
/**
 * Shortens an error message so it fits the job-error length budget.
 *
 * Messages of at most `JOB_ERROR_MAX_LENGTH` code units are returned
 * unchanged. Longer messages are cut and suffixed with "… [truncated]".
 *
 * @param message The error message to shorten.
 * @returns The original message, or its truncated form with the marker appended.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
  let end = JOB_ERROR_MAX_LENGTH;
  // `slice` counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair (e.g. an emoji). Drop a trailing high
  // surrogate rather than emit a lone, malformed code unit.
  const lastUnit = message.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  return `${message.slice(0, end)}\u2026 [truncated]`;
}
5286
5320
  var projectId = process.argv[2];
5287
5321
  var evalRunId = process.argv[3];
5288
5322
  console.error(