@wix/evalforge-evaluator 0.154.0 → 0.156.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs
CHANGED
|
@@ -659,6 +659,14 @@ async function writeSkillToFilesystem(cwd, skill, fetchFn = fetchGitHubFolder2)
|
|
|
659
659
|
}
|
|
660
660
|
}
|
|
661
661
|
|
|
662
|
+
// src/run-scenario/agents/timeout.ts
|
|
663
|
+
// Timeout applied when a run has neither a duration limit nor a turn limit:
// 60 minutes, expressed in milliseconds.
var UNLIMITED_RUN_TIMEOUT_MS = 60 * 6e4;

/**
 * Resolve the wall-clock timeout (in milliseconds) for a single agent run.
 *
 * Precedence:
 *   1. An explicit `maxDurationMs` is returned unchanged.
 *   2. With no usable turn limit, `UNLIMITED_RUN_TIMEOUT_MS` is returned.
 *   3. Otherwise one minute is budgeted per turn, with a five-minute floor.
 *
 * @param {number | null | undefined} maxTurns - Turn limit for the run, if any.
 * @param {number | null | undefined} maxDurationMs - Explicit duration limit, if any.
 * @returns {number} Timeout in milliseconds.
 */
function resolveTimeoutMs(maxTurns, maxDurationMs) {
  const PER_TURN_TIMEOUT_MS = 6e4;
  const MIN_TURN_BASED_TIMEOUT_MS = 3e5;
  // Largest delay a 32-bit timer accepts. Node's setTimeout coerces anything
  // larger to 1 ms, which would abort the run almost immediately.
  // NOTE(review): assumes the result is handed to setTimeout by the callers.
  const MAX_TIMER_DELAY_MS = 2147483647;

  // `!= null` treats null like undefined, so a JSON-sourced `null` limit
  // does not leak out as a null timeout.
  if (maxDurationMs != null) return maxDurationMs;

  // A missing or NaN turn limit cannot yield a meaningful per-turn budget.
  if (maxTurns == null || Number.isNaN(maxTurns)) return UNLIMITED_RUN_TIMEOUT_MS;

  const turnBasedTimeoutMs = Math.max(
    MIN_TURN_BASED_TIMEOUT_MS,
    maxTurns * PER_TURN_TIMEOUT_MS
  );
  return Math.min(turnBasedTimeoutMs, MAX_TIMER_DELAY_MS);
}
|
|
669
|
+
|
|
662
670
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
663
671
|
import { randomUUID } from "crypto";
|
|
664
672
|
|
|
@@ -1291,7 +1299,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1291
1299
|
traceContext.authToken
|
|
1292
1300
|
);
|
|
1293
1301
|
}
|
|
1294
|
-
const SDK_TIMEOUT_MS = options.maxDurationMs
|
|
1302
|
+
const SDK_TIMEOUT_MS = resolveTimeoutMs(maxTurns, options.maxDurationMs);
|
|
1295
1303
|
let timeoutHandle;
|
|
1296
1304
|
let timedOut = false;
|
|
1297
1305
|
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
@@ -3104,7 +3112,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3104
3112
|
model: options.model
|
|
3105
3113
|
});
|
|
3106
3114
|
const maxTurns = options.maxTurns || void 0;
|
|
3107
|
-
const sdkTimeoutMs = options.maxDurationMs
|
|
3115
|
+
const sdkTimeoutMs = resolveTimeoutMs(maxTurns, options.maxDurationMs);
|
|
3108
3116
|
const { env, providerID, modelID } = await buildOpenCodeEnv({
|
|
3109
3117
|
model: options.model,
|
|
3110
3118
|
temperature: options.temperature,
|
|
@@ -3726,7 +3734,10 @@ async function executeWithAiSdk(context) {
|
|
|
3726
3734
|
emitStartEvent(traceContext, startTime);
|
|
3727
3735
|
}
|
|
3728
3736
|
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3729
|
-
const SDK_TIMEOUT_MS =
|
|
3737
|
+
const SDK_TIMEOUT_MS = resolveTimeoutMs(
|
|
3738
|
+
effectiveMaxTurns,
|
|
3739
|
+
cfg.maxDurationMs
|
|
3740
|
+
);
|
|
3730
3741
|
const abortController = new AbortController();
|
|
3731
3742
|
const timeoutHandle = setTimeout(() => {
|
|
3732
3743
|
abortController.abort(
|
|
@@ -4995,6 +5006,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4995
5006
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4996
5007
|
let completedExecutions = 0;
|
|
4997
5008
|
let erroredExecutions = 0;
|
|
5009
|
+
let firstErrorMessage;
|
|
4998
5010
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4999
5011
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
5000
5012
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -5032,6 +5044,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5032
5044
|
};
|
|
5033
5045
|
await callbacks.addResult(errorResult);
|
|
5034
5046
|
erroredExecutions++;
|
|
5047
|
+
if (firstErrorMessage === void 0) {
|
|
5048
|
+
firstErrorMessage = errorResult.outputText;
|
|
5049
|
+
}
|
|
5035
5050
|
}
|
|
5036
5051
|
if (scenarioResult !== null) {
|
|
5037
5052
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -5039,7 +5054,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5039
5054
|
completedExecutions++;
|
|
5040
5055
|
}
|
|
5041
5056
|
}
|
|
5042
|
-
return {
|
|
5057
|
+
return {
|
|
5058
|
+
completedExecutions,
|
|
5059
|
+
totalExecutions,
|
|
5060
|
+
erroredExecutions,
|
|
5061
|
+
firstErrorMessage
|
|
5062
|
+
};
|
|
5043
5063
|
}
|
|
5044
5064
|
|
|
5045
5065
|
// src/error-reporter.ts
|
|
@@ -5223,7 +5243,12 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5223
5243
|
);
|
|
5224
5244
|
}
|
|
5225
5245
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
5226
|
-
const {
|
|
5246
|
+
const {
|
|
5247
|
+
completedExecutions,
|
|
5248
|
+
totalExecutions,
|
|
5249
|
+
erroredExecutions,
|
|
5250
|
+
firstErrorMessage
|
|
5251
|
+
} = await runEvaluationLoop(scenarioItems, evalData, {
|
|
5227
5252
|
runScenario: (scenario, template, resolvedAssertions) => {
|
|
5228
5253
|
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
5229
5254
|
state.currentContext = {
|
|
@@ -5272,10 +5297,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5272
5297
|
};
|
|
5273
5298
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5274
5299
|
const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
|
|
5300
|
+
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
5301
|
+
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
5302
|
+
) : void 0;
|
|
5275
5303
|
try {
|
|
5276
5304
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5277
5305
|
status: finalStatus,
|
|
5278
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5306
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5307
|
+
...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
|
|
5279
5308
|
});
|
|
5280
5309
|
} catch (updateErr) {
|
|
5281
5310
|
throw new Error(
|
|
@@ -5283,6 +5312,11 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5283
5312
|
);
|
|
5284
5313
|
}
|
|
5285
5314
|
}
|
|
5315
|
+
// Upper bound, in UTF-16 code units, on the job error string produced below.
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shorten an error message so it fits within `JOB_ERROR_MAX_LENGTH`.
 *
 * Messages at or under the limit are returned unchanged. Longer messages are
 * cut and suffixed with "… [truncated]"; the suffix is counted against the
 * limit, so the returned string never exceeds `JOB_ERROR_MAX_LENGTH`.
 *
 * @param {string} message - Error text to bound.
 * @returns {string} The original message, or a truncated copy with a marker.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;

  const TRUNCATION_SUFFIX = "\u2026 [truncated]";
  // Reserve room for the suffix so the final string stays within the limit.
  let cutIndex = Math.max(0, JOB_ERROR_MAX_LENGTH - TRUNCATION_SUFFIX.length);

  // If the last kept code unit is a high surrogate, its low surrogate would be
  // cut off; drop it too so the output never contains a lone surrogate.
  if (cutIndex > 0) {
    const lastKeptUnit = message.charCodeAt(cutIndex - 1);
    if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) cutIndex -= 1;
  }

  return `${message.slice(0, cutIndex)}${TRUNCATION_SUFFIX}`;
}
|
|
5286
5320
|
var projectId = process.argv[2];
|
|
5287
5321
|
var evalRunId = process.argv[3];
|
|
5288
5322
|
console.error(
|