@wix/evalforge-evaluator 0.154.0 → 0.155.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +26 -3
- package/build/index.js.map +2 -2
- package/build/index.mjs +26 -3
- package/build/index.mjs.map +2 -2
- package/build/types/evaluation-loop.d.ts +6 -0
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -4995,6 +4995,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4995
4995
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4996
4996
|
let completedExecutions = 0;
|
|
4997
4997
|
let erroredExecutions = 0;
|
|
4998
|
+
let firstErrorMessage;
|
|
4998
4999
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4999
5000
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
5000
5001
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -5032,6 +5033,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5032
5033
|
};
|
|
5033
5034
|
await callbacks.addResult(errorResult);
|
|
5034
5035
|
erroredExecutions++;
|
|
5036
|
+
if (firstErrorMessage === void 0) {
|
|
5037
|
+
firstErrorMessage = errorResult.outputText;
|
|
5038
|
+
}
|
|
5035
5039
|
}
|
|
5036
5040
|
if (scenarioResult !== null) {
|
|
5037
5041
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -5039,7 +5043,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5039
5043
|
completedExecutions++;
|
|
5040
5044
|
}
|
|
5041
5045
|
}
|
|
5042
|
-
return {
|
|
5046
|
+
return {
|
|
5047
|
+
completedExecutions,
|
|
5048
|
+
totalExecutions,
|
|
5049
|
+
erroredExecutions,
|
|
5050
|
+
firstErrorMessage
|
|
5051
|
+
};
|
|
5043
5052
|
}
|
|
5044
5053
|
|
|
5045
5054
|
// src/error-reporter.ts
|
|
@@ -5223,7 +5232,12 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5223
5232
|
);
|
|
5224
5233
|
}
|
|
5225
5234
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
5226
|
-
const {
|
|
5235
|
+
const {
|
|
5236
|
+
completedExecutions,
|
|
5237
|
+
totalExecutions,
|
|
5238
|
+
erroredExecutions,
|
|
5239
|
+
firstErrorMessage
|
|
5240
|
+
} = await runEvaluationLoop(scenarioItems, evalData, {
|
|
5227
5241
|
runScenario: (scenario, template, resolvedAssertions) => {
|
|
5228
5242
|
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
5229
5243
|
state.currentContext = {
|
|
@@ -5272,10 +5286,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5272
5286
|
};
|
|
5273
5287
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5274
5288
|
const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
|
|
5289
|
+
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
5290
|
+
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
5291
|
+
) : void 0;
|
|
5275
5292
|
try {
|
|
5276
5293
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5277
5294
|
status: finalStatus,
|
|
5278
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5295
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5296
|
+
...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
|
|
5279
5297
|
});
|
|
5280
5298
|
} catch (updateErr) {
|
|
5281
5299
|
throw new Error(
|
|
@@ -5283,6 +5301,11 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5283
5301
|
);
|
|
5284
5302
|
}
|
|
5285
5303
|
}
|
|
5304
|
+
var JOB_ERROR_MAX_LENGTH = 1e3;
|
|
5305
|
+
function truncateForJobError(message) {
|
|
5306
|
+
if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
|
|
5307
|
+
return `${message.slice(0, JOB_ERROR_MAX_LENGTH)}\u2026 [truncated]`;
|
|
5308
|
+
}
|
|
5286
5309
|
var projectId = process.argv[2];
|
|
5287
5310
|
var evalRunId = process.argv[3];
|
|
5288
5311
|
console.error(
|