@wix/evalforge-evaluator 0.153.0 → 0.155.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs
CHANGED
|
@@ -33,6 +33,8 @@ function loadConfig() {
|
|
|
33
33
|
}
|
|
34
34
|
}
|
|
35
35
|
aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
|
|
36
|
+
aiGatewayHeaders["x-time-budget"] = "300000";
|
|
37
|
+
aiGatewayHeaders["x-wix-time-budget"] = "300000";
|
|
36
38
|
const tracePushUrl = process.env.TRACE_PUSH_URL;
|
|
37
39
|
const routeHeader = process.env.EVAL_ROUTE_HEADER;
|
|
38
40
|
const authToken = process.env.EVAL_AUTH_TOKEN;
|
|
@@ -3393,6 +3395,7 @@ defaultRegistry.register(openCodeAdapter);
|
|
|
3393
3395
|
|
|
3394
3396
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3395
3397
|
import {
|
|
3398
|
+
APICallError,
|
|
3396
3399
|
generateText,
|
|
3397
3400
|
stepCountIs
|
|
3398
3401
|
} from "ai";
|
|
@@ -3722,6 +3725,16 @@ async function executeWithAiSdk(context) {
|
|
|
3722
3725
|
if (traceContext) {
|
|
3723
3726
|
emitStartEvent(traceContext, startTime);
|
|
3724
3727
|
}
|
|
3728
|
+
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3729
|
+
const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
|
|
3730
|
+
const abortController = new AbortController();
|
|
3731
|
+
const timeoutHandle = setTimeout(() => {
|
|
3732
|
+
abortController.abort(
|
|
3733
|
+
new Error(
|
|
3734
|
+
`Simple Agent generateText timed out after ${SDK_TIMEOUT_MS}ms (model=${modelId}, scenario=${scenario.name})`
|
|
3735
|
+
)
|
|
3736
|
+
);
|
|
3737
|
+
}, SDK_TIMEOUT_MS);
|
|
3725
3738
|
try {
|
|
3726
3739
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
3727
3740
|
const isResponsesAPI = [...OPENAI_RESPONSES_MODEL_IDS].some(
|
|
@@ -3762,12 +3775,12 @@ async function executeWithAiSdk(context) {
|
|
|
3762
3775
|
...computedProviderOpts
|
|
3763
3776
|
};
|
|
3764
3777
|
const stepTimestamps = [];
|
|
3765
|
-
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3766
3778
|
const { triggerPromptImages } = context;
|
|
3767
3779
|
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3768
3780
|
const result = await generateText({
|
|
3769
3781
|
...topLevelExtras,
|
|
3770
3782
|
model,
|
|
3783
|
+
abortSignal: abortController.signal,
|
|
3771
3784
|
system: systemPrompt,
|
|
3772
3785
|
...hasImages ? {
|
|
3773
3786
|
messages: [
|
|
@@ -3855,14 +3868,90 @@ async function executeWithAiSdk(context) {
|
|
|
3855
3868
|
};
|
|
3856
3869
|
} catch (err) {
|
|
3857
3870
|
const baseMsg = err instanceof Error ? err.message : String(err);
|
|
3871
|
+
const ctx = extractGatewayErrorContext(err);
|
|
3858
3872
|
throw new Error(
|
|
3859
|
-
`AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}`,
|
|
3873
|
+
`AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}${ctx}`,
|
|
3860
3874
|
{ cause: err }
|
|
3861
3875
|
);
|
|
3862
3876
|
} finally {
|
|
3877
|
+
clearTimeout(timeoutHandle);
|
|
3863
3878
|
await closeMcpClients(clients);
|
|
3864
3879
|
}
|
|
3865
3880
|
}
|
|
3881
|
+
/**
 * Response header names that extractGatewayErrorContext copies into the
 * error message when present. Fields are emitted in this order, and the
 * lookup through getHeader is case-insensitive.
 * Frozen so that no caller can accidentally alter the shared list.
 */
var UPSTREAM_REQUEST_ID_HEADERS = Object.freeze([
  "openai-request-id",
  "x-request-id",
  "cf-ray",
  "x-wix-request-id"
]);

/**
 * Top-level keys read from a JSON response body by getProxyBodyFields.
 * Only non-empty string values are kept. Frozen for the same reason as above.
 */
var PROXY_BODY_FIELDS = Object.freeze([
  "proxy_request_id",
  "upstream_request_id",
  "failure_phase"
]);

/**
 * Deepest nesting level findApiCallError will descend to while following
 * `errors` / `cause` links; bounds the walk on cyclic or very deep chains.
 */
var MAX_ERROR_WALK_DEPTH = 5;
|
|
3893
|
+
/**
 * Searches an error value, and the errors nested inside it, for the first
 * one that APICallError.isInstance() recognises.
 *
 * Nested errors are reached through an `errors` array (visited from the last
 * element to the first) and then through `cause`. The search stops once
 * `depth` exceeds MAX_ERROR_WALK_DEPTH.
 *
 * @param {unknown} err - Value to inspect; may be null/undefined or a non-object.
 * @param {number} [depth=0] - Current nesting level; callers normally omit it.
 * @returns {object|undefined} The matching error, or undefined when none is found.
 */
function findApiCallError(err, depth = 0) {
  if (err == null || depth > MAX_ERROR_WALK_DEPTH) {
    return void 0;
  }
  if (APICallError.isInstance(err)) {
    return err;
  }
  if (typeof err !== "object") {
    return void 0;
  }

  const nested = err.errors;
  if (Array.isArray(nested)) {
    // Walk backwards so the most recently appended entry is examined first.
    let index = nested.length;
    while (index-- > 0) {
      const match = findApiCallError(nested[index], depth + 1);
      if (match) {
        return match;
      }
    }
  }

  const { cause } = err;
  return cause != null ? findApiCallError(cause, depth + 1) : void 0;
}
|
|
3909
|
+
/**
 * Looks up a header value by name, ignoring case.
 *
 * Accepts a plain object of header names to values, and also a `Map` or a
 * Fetch `Headers` instance. `Object.entries()` yields nothing for the latter
 * two, so they are iterated through their own `entries()` instead.
 *
 * @param {object|Map|Headers|null|undefined} headers - Header collection to search.
 * @param {string} name - Header name to find (any casing).
 * @returns {*} The value of the first matching header, or undefined when
 *   `headers` is falsy or no key matches.
 */
function getHeader(headers, name) {
  if (!headers) return void 0;
  const target = name.toLowerCase();
  // `Headers` is a global only on runtimes that ship the Fetch API, hence the typeof guard.
  const isKeyedCollection = headers instanceof Map || typeof Headers !== "undefined" && headers instanceof Headers;
  const entries = isKeyedCollection ? headers.entries() : Object.entries(headers);
  for (const [key, value] of entries) {
    // Map keys are not guaranteed to be strings; skip anything that is not.
    if (typeof key === "string" && key.toLowerCase() === target) return value;
  }
  return void 0;
}
|
|
3917
|
+
/**
 * Extracts the PROXY_BODY_FIELDS entries from a JSON response body.
 *
 * @param {unknown} responseBody - Raw response body; only a non-empty string
 *   is parsed.
 * @returns {Object<string, string>} An object holding each listed field whose
 *   value is a non-empty string. Empty when the body is not a non-empty
 *   string, is not valid JSON, or does not parse to an object.
 */
function getProxyBodyFields(responseBody) {
  const isNonEmptyString = typeof responseBody === "string" && responseBody.length > 0;
  if (!isNonEmptyString) {
    return {};
  }

  let payload;
  try {
    payload = JSON.parse(responseBody);
  } catch {
    // An unparseable body simply contributes no fields.
    return {};
  }
  if (payload == null || typeof payload !== "object") {
    return {};
  }

  const picked = PROXY_BODY_FIELDS
    .map((field) => [field, payload[field]])
    .filter(([, fieldValue]) => typeof fieldValue === "string" && fieldValue.length > 0);
  return Object.fromEntries(picked);
}
|
|
3937
|
+
/**
 * Builds a bracketed diagnostic suffix for an error message from the
 * APICallError found inside `err` (see findApiCallError).
 *
 * The suffix lists, in order: `status=<statusCode>` when the status code is a
 * number, each UPSTREAM_REQUEST_ID_HEADERS response header that has a truthy
 * value, and each PROXY_BODY_FIELDS value found in the response body.
 *
 * @param {unknown} err - The caught error.
 * @returns {string} ` [k=v, k=v]` (with a leading space), or an empty string
 *   when no APICallError is found or it yields no fields.
 */
function extractGatewayErrorContext(err) {
  const apiError = findApiCallError(err);
  if (!apiError) {
    return "";
  }

  const parts = [];
  const { statusCode } = apiError;
  if (typeof statusCode === "number") {
    parts.push(`status=${statusCode}`);
  }

  const { responseHeaders } = apiError;
  for (const headerName of UPSTREAM_REQUEST_ID_HEADERS) {
    const headerValue = getHeader(responseHeaders, headerName);
    if (headerValue) {
      parts.push(`${headerName}=${headerValue}`);
    }
  }

  const proxyFields = getProxyBodyFields(apiError.responseBody);
  for (const fieldName of PROXY_BODY_FIELDS) {
    const fieldValue = proxyFields[fieldName];
    if (fieldValue) {
      parts.push(`${fieldName}=${fieldValue}`);
    }
  }

  if (parts.length === 0) {
    return "";
  }
  return ` [${parts.join(", ")}]`;
}
|
|
3866
3955
|
function composeSystemPrompt(context) {
|
|
3867
3956
|
const parts = [];
|
|
3868
3957
|
if (context.systemPrompt) {
|
|
@@ -4906,6 +4995,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4906
4995
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4907
4996
|
let completedExecutions = 0;
|
|
4908
4997
|
let erroredExecutions = 0;
|
|
4998
|
+
let firstErrorMessage;
|
|
4909
4999
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4910
5000
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
4911
5001
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -4943,6 +5033,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4943
5033
|
};
|
|
4944
5034
|
await callbacks.addResult(errorResult);
|
|
4945
5035
|
erroredExecutions++;
|
|
5036
|
+
if (firstErrorMessage === void 0) {
|
|
5037
|
+
firstErrorMessage = errorResult.outputText;
|
|
5038
|
+
}
|
|
4946
5039
|
}
|
|
4947
5040
|
if (scenarioResult !== null) {
|
|
4948
5041
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -4950,7 +5043,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4950
5043
|
completedExecutions++;
|
|
4951
5044
|
}
|
|
4952
5045
|
}
|
|
4953
|
-
return {
|
|
5046
|
+
return {
|
|
5047
|
+
completedExecutions,
|
|
5048
|
+
totalExecutions,
|
|
5049
|
+
erroredExecutions,
|
|
5050
|
+
firstErrorMessage
|
|
5051
|
+
};
|
|
4954
5052
|
}
|
|
4955
5053
|
|
|
4956
5054
|
// src/error-reporter.ts
|
|
@@ -5134,7 +5232,12 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5134
5232
|
);
|
|
5135
5233
|
}
|
|
5136
5234
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
5137
|
-
const {
|
|
5235
|
+
const {
|
|
5236
|
+
completedExecutions,
|
|
5237
|
+
totalExecutions,
|
|
5238
|
+
erroredExecutions,
|
|
5239
|
+
firstErrorMessage
|
|
5240
|
+
} = await runEvaluationLoop(scenarioItems, evalData, {
|
|
5138
5241
|
runScenario: (scenario, template, resolvedAssertions) => {
|
|
5139
5242
|
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
5140
5243
|
state.currentContext = {
|
|
@@ -5183,10 +5286,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5183
5286
|
};
|
|
5184
5287
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5185
5288
|
const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
|
|
5289
|
+
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
5290
|
+
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
5291
|
+
) : void 0;
|
|
5186
5292
|
try {
|
|
5187
5293
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5188
5294
|
status: finalStatus,
|
|
5189
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5295
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5296
|
+
...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
|
|
5190
5297
|
});
|
|
5191
5298
|
} catch (updateErr) {
|
|
5192
5299
|
throw new Error(
|
|
@@ -5194,6 +5301,11 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5194
5301
|
);
|
|
5195
5302
|
}
|
|
5196
5303
|
}
|
|
5304
|
+
/** Upper bound, in UTF-16 code units, on the string returned by truncateForJobError. */
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shortens an error message so it fits within JOB_ERROR_MAX_LENGTH.
 *
 * Messages that already fit are returned unchanged. Longer ones are cut and
 * given a "… [truncated]" suffix; the suffix is counted against the limit, so
 * the result never exceeds JOB_ERROR_MAX_LENGTH. The cut is moved back by one
 * code unit when it would otherwise separate a surrogate pair.
 *
 * @param {*} message - Message to shorten; non-string values are converted
 *   with String() instead of throwing.
 * @returns {string} The original or truncated message.
 */
function truncateForJobError(message) {
  const suffix = "\u2026 [truncated]";
  const text = typeof message === "string" ? message : String(message);
  if (text.length <= JOB_ERROR_MAX_LENGTH) return text;
  let keep = Math.max(0, JOB_ERROR_MAX_LENGTH - suffix.length);
  if (keep > 0) {
    // A high surrogate (0xD800-0xDBFF) at the cut point would be left without its pair.
    const lastUnit = text.charCodeAt(keep - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) keep -= 1;
  }
  return `${text.slice(0, keep)}${suffix}`;
}
|
|
5197
5309
|
var projectId = process.argv[2];
|
|
5198
5310
|
var evalRunId = process.argv[3];
|
|
5199
5311
|
console.error(
|