@wix/evalforge-evaluator 0.153.0 → 0.155.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -56,6 +56,8 @@ function loadConfig() {
|
|
|
56
56
|
}
|
|
57
57
|
}
|
|
58
58
|
aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
|
|
59
|
+
aiGatewayHeaders["x-time-budget"] = "300000";
|
|
60
|
+
aiGatewayHeaders["x-wix-time-budget"] = "300000";
|
|
59
61
|
const tracePushUrl = process.env.TRACE_PUSH_URL;
|
|
60
62
|
const routeHeader = process.env.EVAL_ROUTE_HEADER;
|
|
61
63
|
const authToken = process.env.EVAL_AUTH_TOKEN;
|
|
@@ -3704,6 +3706,16 @@ async function executeWithAiSdk(context) {
|
|
|
3704
3706
|
if (traceContext) {
|
|
3705
3707
|
emitStartEvent(traceContext, startTime);
|
|
3706
3708
|
}
|
|
3709
|
+
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3710
|
+
const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
|
|
3711
|
+
const abortController = new AbortController();
|
|
3712
|
+
const timeoutHandle = setTimeout(() => {
|
|
3713
|
+
abortController.abort(
|
|
3714
|
+
new Error(
|
|
3715
|
+
`Simple Agent generateText timed out after ${SDK_TIMEOUT_MS}ms (model=${modelId}, scenario=${scenario.name})`
|
|
3716
|
+
)
|
|
3717
|
+
);
|
|
3718
|
+
}, SDK_TIMEOUT_MS);
|
|
3707
3719
|
try {
|
|
3708
3720
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
3709
3721
|
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
@@ -3744,12 +3756,12 @@ async function executeWithAiSdk(context) {
|
|
|
3744
3756
|
...computedProviderOpts
|
|
3745
3757
|
};
|
|
3746
3758
|
const stepTimestamps = [];
|
|
3747
|
-
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3748
3759
|
const { triggerPromptImages } = context;
|
|
3749
3760
|
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3750
3761
|
const result = await (0, import_ai.generateText)({
|
|
3751
3762
|
...topLevelExtras,
|
|
3752
3763
|
model,
|
|
3764
|
+
abortSignal: abortController.signal,
|
|
3753
3765
|
system: systemPrompt,
|
|
3754
3766
|
...hasImages ? {
|
|
3755
3767
|
messages: [
|
|
@@ -3837,14 +3849,90 @@ async function executeWithAiSdk(context) {
|
|
|
3837
3849
|
};
|
|
3838
3850
|
} catch (err) {
|
|
3839
3851
|
const baseMsg = err instanceof Error ? err.message : String(err);
|
|
3852
|
+
const ctx = extractGatewayErrorContext(err);
|
|
3840
3853
|
throw new Error(
|
|
3841
|
-
`AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}`,
|
|
3854
|
+
`AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}${ctx}`,
|
|
3842
3855
|
{ cause: err }
|
|
3843
3856
|
);
|
|
3844
3857
|
} finally {
|
|
3858
|
+
clearTimeout(timeoutHandle);
|
|
3845
3859
|
await closeMcpClients(clients);
|
|
3846
3860
|
}
|
|
3847
3861
|
}
|
|
3862
|
+
/**
 * Response header names that extractGatewayErrorContext looks up on an API
 * call error. Each header that is present is appended to the error suffix as
 * `name=value`, in the order listed here.
 */
var UPSTREAM_REQUEST_ID_HEADERS = ["openai-request-id", "x-request-id", "cf-ray", "x-wix-request-id"];

/**
 * Top-level keys read from a JSON response body by getProxyBodyFields; only
 * non-empty string values are kept. The list order is also the order in which
 * extractGatewayErrorContext emits them.
 */
var PROXY_BODY_FIELDS = ["proxy_request_id", "upstream_request_id", "failure_phase"];
|
|
3873
|
+
// Deepest nesting level (root = 0) that findApiCallError will still inspect.
var MAX_ERROR_WALK_DEPTH = 5;

/**
 * Searches an error and the errors nested inside it for the first value that
 * `import_ai.APICallError.isInstance` accepts.
 *
 * Search order at each level: the value itself, then the entries of its
 * `errors` array from last to first, then its `cause`.
 *
 * @param {unknown} err - Error (or any thrown value) to inspect.
 * @param {number} [depth=0] - Current nesting level; used by the recursion.
 * @returns {object|undefined} The matching error, or undefined when nothing
 *   matches within MAX_ERROR_WALK_DEPTH levels.
 */
function findApiCallError(err, depth = 0) {
  if (err == null) return undefined;
  if (depth > MAX_ERROR_WALK_DEPTH) return undefined;
  if (import_ai.APICallError.isInstance(err)) return err;
  if (typeof err !== "object") return undefined;

  const nestedErrors = err.errors;
  if (Array.isArray(nestedErrors)) {
    // Walk a reversed copy so the last entry is examined first.
    for (const nested of [...nestedErrors].reverse()) {
      const hit = findApiCallError(nested, depth + 1);
      if (hit) return hit;
    }
  }

  // A null/undefined cause falls straight through the first guard above.
  return findApiCallError(err.cause, depth + 1);
}
|
|
3890
|
+
/**
 * Looks up a header value by name, ignoring case.
 *
 * @param {object|null|undefined} headers - Object whose enumerable own keys
 *   are header names.
 * @param {string} name - Header name to find.
 * @returns {*} The value of the first key that matches case-insensitively,
 *   or undefined when `headers` is falsy or no key matches.
 */
function getHeader(headers, name) {
  if (!headers) return undefined;
  const wanted = name.toLowerCase();
  const match = Object.entries(headers).find(([key]) => key.toLowerCase() === wanted);
  return match === undefined ? undefined : match[1];
}
|
|
3898
|
+
/**
 * Extracts the PROXY_BODY_FIELDS entries from a JSON response body.
 *
 * @param {unknown} responseBody - Raw response body; anything other than a
 *   non-empty string yields an empty result.
 * @returns {Object<string, string>} The listed fields whose value in the
 *   parsed body is a non-empty string. Empty when the body is not valid JSON
 *   or does not parse to an object.
 */
function getProxyBodyFields(responseBody) {
  const hasText = typeof responseBody === "string" && responseBody.length > 0;
  if (!hasText) return {};

  let payload;
  try {
    payload = JSON.parse(responseBody);
  } catch {
    // Non-JSON bodies carry no fields to read.
    return {};
  }
  if (payload == null || typeof payload !== "object") return {};

  const kept = PROXY_BODY_FIELDS
    .map((field) => [field, payload[field]])
    .filter(([, value]) => typeof value === "string" && value.length > 0);
  return Object.fromEntries(kept);
}
|
|
3918
|
+
/**
 * Builds a bracketed diagnostic suffix for a gateway failure message.
 *
 * Uses findApiCallError to locate an API call error inside `err`, then lists,
 * in order: `status=<statusCode>` when the status code is a number, each
 * UPSTREAM_REQUEST_ID_HEADERS header that has a truthy value, and each
 * PROXY_BODY_FIELDS entry returned by getProxyBodyFields for the response body.
 *
 * @param {unknown} err - The caught error.
 * @returns {string} A string of the form " [k=v, k=v]" (with a leading space),
 *   or "" when no API call error is found or nothing could be extracted.
 */
function extractGatewayErrorContext(err) {
  const apiError = findApiCallError(err);
  if (!apiError) return "";

  const { statusCode, responseHeaders, responseBody } = apiError;

  const statusParts = typeof statusCode === "number" ? [`status=${statusCode}`] : [];

  const headerParts = UPSTREAM_REQUEST_ID_HEADERS
    .map((header) => [header, getHeader(responseHeaders, header)])
    .filter(([, value]) => value)
    .map(([header, value]) => `${header}=${value}`);

  const bodyFields = getProxyBodyFields(responseBody);
  const bodyParts = PROXY_BODY_FIELDS
    .filter((field) => bodyFields[field])
    .map((field) => `${field}=${bodyFields[field]}`);

  const parts = [...statusParts, ...headerParts, ...bodyParts];
  return parts.length === 0 ? "" : ` [${parts.join(", ")}]`;
}
|
|
3848
3936
|
function composeSystemPrompt(context) {
|
|
3849
3937
|
const parts = [];
|
|
3850
3938
|
if (context.systemPrompt) {
|
|
@@ -4888,6 +4976,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4888
4976
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4889
4977
|
let completedExecutions = 0;
|
|
4890
4978
|
let erroredExecutions = 0;
|
|
4979
|
+
let firstErrorMessage;
|
|
4891
4980
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4892
4981
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
4893
4982
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -4925,6 +5014,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4925
5014
|
};
|
|
4926
5015
|
await callbacks.addResult(errorResult);
|
|
4927
5016
|
erroredExecutions++;
|
|
5017
|
+
if (firstErrorMessage === void 0) {
|
|
5018
|
+
firstErrorMessage = errorResult.outputText;
|
|
5019
|
+
}
|
|
4928
5020
|
}
|
|
4929
5021
|
if (scenarioResult !== null) {
|
|
4930
5022
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -4932,7 +5024,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4932
5024
|
completedExecutions++;
|
|
4933
5025
|
}
|
|
4934
5026
|
}
|
|
4935
|
-
return {
|
|
5027
|
+
return {
|
|
5028
|
+
completedExecutions,
|
|
5029
|
+
totalExecutions,
|
|
5030
|
+
erroredExecutions,
|
|
5031
|
+
firstErrorMessage
|
|
5032
|
+
};
|
|
4936
5033
|
}
|
|
4937
5034
|
|
|
4938
5035
|
// src/error-reporter.ts
|
|
@@ -5116,7 +5213,12 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5116
5213
|
);
|
|
5117
5214
|
}
|
|
5118
5215
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
5119
|
-
const {
|
|
5216
|
+
const {
|
|
5217
|
+
completedExecutions,
|
|
5218
|
+
totalExecutions,
|
|
5219
|
+
erroredExecutions,
|
|
5220
|
+
firstErrorMessage
|
|
5221
|
+
} = await runEvaluationLoop(scenarioItems, evalData, {
|
|
5120
5222
|
runScenario: (scenario, template, resolvedAssertions) => {
|
|
5121
5223
|
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
5122
5224
|
state.currentContext = {
|
|
@@ -5165,10 +5267,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5165
5267
|
};
|
|
5166
5268
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5167
5269
|
const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
|
|
5270
|
+
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
5271
|
+
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
5272
|
+
) : void 0;
|
|
5168
5273
|
try {
|
|
5169
5274
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5170
5275
|
status: finalStatus,
|
|
5171
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5276
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5277
|
+
...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
|
|
5172
5278
|
});
|
|
5173
5279
|
} catch (updateErr) {
|
|
5174
5280
|
throw new Error(
|
|
@@ -5176,6 +5282,11 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5176
5282
|
);
|
|
5177
5283
|
}
|
|
5178
5284
|
}
|
|
5285
|
+
// Maximum number of UTF-16 code units of the original message that are kept.
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shortens an error message before it is stored as a job error.
 *
 * Messages of at most JOB_ERROR_MAX_LENGTH code units are returned unchanged.
 * Longer ones are cut to that length and suffixed with "… [truncated]", so the
 * returned string can be longer than JOB_ERROR_MAX_LENGTH by the suffix.
 *
 * @param {string} message - Full error message.
 * @returns {string} The original or truncated message.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;

  let end = JOB_ERROR_MAX_LENGTH;
  // If the last kept unit is a high surrogate, its low surrogate would be cut
  // off and leave an unpaired surrogate in the stored text. Drop it as well.
  const lastKeptUnit = message.charCodeAt(end - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    end -= 1;
  }
  return `${message.slice(0, end)}\u2026 [truncated]`;
}
|
|
5179
5290
|
var projectId = process.argv[2];
|
|
5180
5291
|
var evalRunId = process.argv[3];
|
|
5181
5292
|
console.error(
|