@wix/evalforge-evaluator 0.154.0 → 0.155.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -4995,6 +4995,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4995
4995
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4996
4996
  let completedExecutions = 0;
4997
4997
  let erroredExecutions = 0;
4998
+ let firstErrorMessage;
4998
4999
  const totalExecutions = scenarioItems.length * runsPerScenario;
4999
5000
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
5000
5001
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -5032,6 +5033,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5032
5033
  };
5033
5034
  await callbacks.addResult(errorResult);
5034
5035
  erroredExecutions++;
5036
+ if (firstErrorMessage === void 0) {
5037
+ firstErrorMessage = errorResult.outputText;
5038
+ }
5035
5039
  }
5036
5040
  if (scenarioResult !== null) {
5037
5041
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -5039,7 +5043,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5039
5043
  completedExecutions++;
5040
5044
  }
5041
5045
  }
5042
- return { completedExecutions, totalExecutions, erroredExecutions };
5046
+ return {
5047
+ completedExecutions,
5048
+ totalExecutions,
5049
+ erroredExecutions,
5050
+ firstErrorMessage
5051
+ };
5043
5052
  }
5044
5053
 
5045
5054
  // src/error-reporter.ts
@@ -5223,7 +5232,12 @@ async function runEvaluation(projectId2, evalRunId2) {
5223
5232
  );
5224
5233
  }
5225
5234
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
5226
- const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
5235
+ const {
5236
+ completedExecutions,
5237
+ totalExecutions,
5238
+ erroredExecutions,
5239
+ firstErrorMessage
5240
+ } = await runEvaluationLoop(scenarioItems, evalData, {
5227
5241
  runScenario: (scenario, template, resolvedAssertions) => {
5228
5242
  state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
5229
5243
  state.currentContext = {
@@ -5272,10 +5286,14 @@ async function runEvaluation(projectId2, evalRunId2) {
5272
5286
  };
5273
5287
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5274
5288
  const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
5289
+ const jobErrorOnAllFailed = allFailed ? truncateForJobError(
5290
+ firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
5291
+ ) : void 0;
5275
5292
  try {
5276
5293
  await api.updateEvalRun(projectId2, evalRunId2, {
5277
5294
  status: finalStatus,
5278
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
5295
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
5296
+ ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
5279
5297
  });
5280
5298
  } catch (updateErr) {
5281
5299
  throw new Error(
@@ -5283,6 +5301,11 @@ async function runEvaluation(projectId2, evalRunId2) {
5283
5301
  );
5284
5302
  }
5285
5303
  }
5304
/**
 * Maximum number of UTF-16 code units of the original message that are kept
 * in a job error. The truncation marker is appended on top of this limit.
 * Declared with `var` to match the other module-level bindings in this bundle.
 */
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shortens an error message so it can be stored as a job error.
 *
 * Messages of at most JOB_ERROR_MAX_LENGTH code units are returned unchanged.
 * Longer messages are cut at the limit and suffixed with "… [truncated]".
 *
 * @param {unknown} message - Error text; non-string values are stringified
 *   (null/undefined become an empty string) instead of throwing.
 * @returns {string} The original text, or its truncated form with the marker.
 */
function truncateForJobError(message) {
  // Reporting an error must not itself throw on an unexpected value type.
  const text = typeof message === "string" ? message : String(message ?? "");
  if (text.length <= JOB_ERROR_MAX_LENGTH) return text;

  let end = JOB_ERROR_MAX_LENGTH;
  // slice() counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair. Drop a trailing high surrogate so the result
  // never ends in a lone surrogate (which serializes as malformed text).
  const lastKept = text.charCodeAt(end - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }
  return `${text.slice(0, end)}\u2026 [truncated]`;
}
5286
5309
  var projectId = process.argv[2];
5287
5310
  var evalRunId = process.argv[3];
5288
5311
  console.error(