@wix/evalforge-evaluator 0.154.0 → 0.155.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -4976,6 +4976,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4976
4976
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4977
4977
  let completedExecutions = 0;
4978
4978
  let erroredExecutions = 0;
4979
+ let firstErrorMessage;
4979
4980
  const totalExecutions = scenarioItems.length * runsPerScenario;
4980
4981
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4981
4982
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -5013,6 +5014,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5013
5014
  };
5014
5015
  await callbacks.addResult(errorResult);
5015
5016
  erroredExecutions++;
5017
+ if (firstErrorMessage === void 0) {
5018
+ firstErrorMessage = errorResult.outputText;
5019
+ }
5016
5020
  }
5017
5021
  if (scenarioResult !== null) {
5018
5022
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -5020,7 +5024,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5020
5024
  completedExecutions++;
5021
5025
  }
5022
5026
  }
5023
- return { completedExecutions, totalExecutions, erroredExecutions };
5027
+ return {
5028
+ completedExecutions,
5029
+ totalExecutions,
5030
+ erroredExecutions,
5031
+ firstErrorMessage
5032
+ };
5024
5033
  }
5025
5034
 
5026
5035
  // src/error-reporter.ts
@@ -5204,7 +5213,12 @@ async function runEvaluation(projectId2, evalRunId2) {
5204
5213
  );
5205
5214
  }
5206
5215
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
5207
- const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
5216
+ const {
5217
+ completedExecutions,
5218
+ totalExecutions,
5219
+ erroredExecutions,
5220
+ firstErrorMessage
5221
+ } = await runEvaluationLoop(scenarioItems, evalData, {
5208
5222
  runScenario: (scenario, template, resolvedAssertions) => {
5209
5223
  state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
5210
5224
  state.currentContext = {
@@ -5253,10 +5267,14 @@ async function runEvaluation(projectId2, evalRunId2) {
5253
5267
  };
5254
5268
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5255
5269
  const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
5270
+ const jobErrorOnAllFailed = allFailed ? truncateForJobError(
5271
+ firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
5272
+ ) : void 0;
5256
5273
  try {
5257
5274
  await api.updateEvalRun(projectId2, evalRunId2, {
5258
5275
  status: finalStatus,
5259
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
5276
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
5277
+ ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
5260
5278
  });
5261
5279
  } catch (updateErr) {
5262
5280
  throw new Error(
@@ -5264,6 +5282,11 @@ async function runEvaluation(projectId2, evalRunId2) {
5264
5282
  );
5265
5283
  }
5266
5284
  }
// Maximum number of UTF-16 code units of the original message that are kept
// when a message is truncated by truncateForJobError.
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shortens an error message to a bounded size.
 *
 * Messages of at most JOB_ERROR_MAX_LENGTH UTF-16 code units are returned
 * unchanged. Longer messages are cut to that many code units and suffixed
 * with "… [truncated]", so the returned string can exceed
 * JOB_ERROR_MAX_LENGTH by the length of that marker.
 *
 * @param {string} message - The error text to shorten.
 * @returns {string} The original message, or its truncated prefix plus the marker.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
  let end = JOB_ERROR_MAX_LENGTH;
  // String.prototype.slice works on UTF-16 code units. If the cut would fall
  // between the two halves of a surrogate pair, back off by one unit so the
  // result never ends in a lone high surrogate (which is not valid Unicode
  // and gets mangled or rejected when serialized).
  const lastKept = message.charCodeAt(end - 1);
  const firstDropped = message.charCodeAt(end);
  const splitsPair =
    lastKept >= 0xd800 && lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (splitsPair) {
    end -= 1;
  }
  return `${message.slice(0, end)}\u2026 [truncated]`;
}
5267
5290
  var projectId = process.argv[2];
5268
5291
  var evalRunId = process.argv[3];
5269
5292
  console.error(