@wix/evalforge-evaluator 0.154.0 → 0.156.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -664,6 +664,14 @@ async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_git
|
|
|
664
664
|
}
|
|
665
665
|
}
|
|
666
666
|
|
|
667
|
+
// src/run-scenario/agents/timeout.ts
|
|
668
|
+
/**
 * Timeout applied when a run has neither an explicit duration nor a turn
 * limit: 60 minutes, expressed in milliseconds.
 */
var UNLIMITED_RUN_TIMEOUT_MS = 60 * 6e4;

/**
 * Resolve the overall timeout (in milliseconds) for an agent run.
 *
 * Precedence:
 *   1. An explicit `maxDurationMs` is returned unchanged.
 *   2. With no turn limit (`maxTurns === undefined`) the run gets
 *      `UNLIMITED_RUN_TIMEOUT_MS`.
 *   3. Otherwise the budget is one minute per turn, never less than
 *      five minutes.
 *
 * @param {number | undefined} maxTurns - Turn limit, or `undefined` for none.
 * @param {number | null | undefined} maxDurationMs - Explicit timeout override.
 * @returns {number} Timeout in milliseconds.
 */
function resolveTimeoutMs(maxTurns, maxDurationMs) {
  // `!= null` rejects both undefined and null. A strict `!== undefined`
  // check would hand `null` back to the caller as if it were a duration.
  // NOTE(review): callers presumably use the result as a timer delay, where
  // null would coerce to 0 — confirm.
  if (maxDurationMs != null) return maxDurationMs;
  if (maxTurns === undefined) return UNLIMITED_RUN_TIMEOUT_MS;
  // One minute (6e4 ms) per turn, with a five-minute (3e5 ms) floor.
  return Math.max(3e5, maxTurns * 6e4);
}
|
|
674
|
+
|
|
667
675
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
668
676
|
var import_crypto = require("crypto");
|
|
669
677
|
|
|
@@ -1294,7 +1302,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1294
1302
|
traceContext.authToken
|
|
1295
1303
|
);
|
|
1296
1304
|
}
|
|
1297
|
-
const SDK_TIMEOUT_MS = options.maxDurationMs
|
|
1305
|
+
const SDK_TIMEOUT_MS = resolveTimeoutMs(maxTurns, options.maxDurationMs);
|
|
1298
1306
|
let timeoutHandle;
|
|
1299
1307
|
let timedOut = false;
|
|
1300
1308
|
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
@@ -3097,7 +3105,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3097
3105
|
model: options.model
|
|
3098
3106
|
});
|
|
3099
3107
|
const maxTurns = options.maxTurns || void 0;
|
|
3100
|
-
const sdkTimeoutMs = options.maxDurationMs
|
|
3108
|
+
const sdkTimeoutMs = resolveTimeoutMs(maxTurns, options.maxDurationMs);
|
|
3101
3109
|
const { env, providerID, modelID } = await buildOpenCodeEnv({
|
|
3102
3110
|
model: options.model,
|
|
3103
3111
|
temperature: options.temperature,
|
|
@@ -3707,7 +3715,10 @@ async function executeWithAiSdk(context) {
|
|
|
3707
3715
|
emitStartEvent(traceContext, startTime);
|
|
3708
3716
|
}
|
|
3709
3717
|
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3710
|
-
const SDK_TIMEOUT_MS =
|
|
3718
|
+
const SDK_TIMEOUT_MS = resolveTimeoutMs(
|
|
3719
|
+
effectiveMaxTurns,
|
|
3720
|
+
cfg.maxDurationMs
|
|
3721
|
+
);
|
|
3711
3722
|
const abortController = new AbortController();
|
|
3712
3723
|
const timeoutHandle = setTimeout(() => {
|
|
3713
3724
|
abortController.abort(
|
|
@@ -4976,6 +4987,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4976
4987
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4977
4988
|
let completedExecutions = 0;
|
|
4978
4989
|
let erroredExecutions = 0;
|
|
4990
|
+
let firstErrorMessage;
|
|
4979
4991
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4980
4992
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
4981
4993
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -5013,6 +5025,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5013
5025
|
};
|
|
5014
5026
|
await callbacks.addResult(errorResult);
|
|
5015
5027
|
erroredExecutions++;
|
|
5028
|
+
if (firstErrorMessage === void 0) {
|
|
5029
|
+
firstErrorMessage = errorResult.outputText;
|
|
5030
|
+
}
|
|
5016
5031
|
}
|
|
5017
5032
|
if (scenarioResult !== null) {
|
|
5018
5033
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -5020,7 +5035,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5020
5035
|
completedExecutions++;
|
|
5021
5036
|
}
|
|
5022
5037
|
}
|
|
5023
|
-
return {
|
|
5038
|
+
return {
|
|
5039
|
+
completedExecutions,
|
|
5040
|
+
totalExecutions,
|
|
5041
|
+
erroredExecutions,
|
|
5042
|
+
firstErrorMessage
|
|
5043
|
+
};
|
|
5024
5044
|
}
|
|
5025
5045
|
|
|
5026
5046
|
// src/error-reporter.ts
|
|
@@ -5204,7 +5224,12 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5204
5224
|
);
|
|
5205
5225
|
}
|
|
5206
5226
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
5207
|
-
const {
|
|
5227
|
+
const {
|
|
5228
|
+
completedExecutions,
|
|
5229
|
+
totalExecutions,
|
|
5230
|
+
erroredExecutions,
|
|
5231
|
+
firstErrorMessage
|
|
5232
|
+
} = await runEvaluationLoop(scenarioItems, evalData, {
|
|
5208
5233
|
runScenario: (scenario, template, resolvedAssertions) => {
|
|
5209
5234
|
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
5210
5235
|
state.currentContext = {
|
|
@@ -5253,10 +5278,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5253
5278
|
};
|
|
5254
5279
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5255
5280
|
const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
|
|
5281
|
+
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
5282
|
+
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
5283
|
+
) : void 0;
|
|
5256
5284
|
try {
|
|
5257
5285
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5258
5286
|
status: finalStatus,
|
|
5259
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5287
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5288
|
+
...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
|
|
5260
5289
|
});
|
|
5261
5290
|
} catch (updateErr) {
|
|
5262
5291
|
throw new Error(
|
|
@@ -5264,6 +5293,11 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5264
5293
|
);
|
|
5265
5294
|
}
|
|
5266
5295
|
}
|
|
5296
|
+
/** Maximum number of UTF-16 code units of the message kept before truncation. */
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shorten an error message for the run's `jobError` field.
 *
 * Messages of at most `JOB_ERROR_MAX_LENGTH` code units are returned
 * unchanged. Longer messages are cut and suffixed with an ellipsis and a
 * " [truncated]" marker, so the result is longer than the limit by the
 * length of that suffix.
 *
 * @param {string} message - Full error message.
 * @returns {string} The message, truncated and marked if it was too long.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
  let cut = JOB_ERROR_MAX_LENGTH;
  // slice() counts UTF-16 code units. If the cut lands between the two
  // halves of a surrogate pair, step back one unit so the output does not
  // end in a lone high surrogate (which is not well-formed Unicode).
  const lastKept = message.charCodeAt(cut - 1);
  const firstDropped = message.charCodeAt(cut);
  const splitsPair =
    lastKept >= 0xd800 &&
    lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 &&
    firstDropped <= 0xdfff;
  if (splitsPair) cut -= 1;
  return `${message.slice(0, cut)}\u2026 [truncated]`;
}
|
|
5267
5301
|
var projectId = process.argv[2];
|
|
5268
5302
|
var evalRunId = process.argv[3];
|
|
5269
5303
|
console.error(
|