@wix/evalforge-evaluator 0.153.0 → 0.155.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -33,6 +33,8 @@ function loadConfig() {
33
33
  }
34
34
  }
35
35
  aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
36
+ aiGatewayHeaders["x-time-budget"] = "300000";
37
+ aiGatewayHeaders["x-wix-time-budget"] = "300000";
36
38
  const tracePushUrl = process.env.TRACE_PUSH_URL;
37
39
  const routeHeader = process.env.EVAL_ROUTE_HEADER;
38
40
  const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -3393,6 +3395,7 @@ defaultRegistry.register(openCodeAdapter);
3393
3395
 
3394
3396
  // src/run-scenario/agents/simple-agent/execute.ts
3395
3397
  import {
3398
+ APICallError,
3396
3399
  generateText,
3397
3400
  stepCountIs
3398
3401
  } from "ai";
@@ -3722,6 +3725,16 @@ async function executeWithAiSdk(context) {
3722
3725
  if (traceContext) {
3723
3726
  emitStartEvent(traceContext, startTime);
3724
3727
  }
3728
+ const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3729
+ const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
3730
+ const abortController = new AbortController();
3731
+ const timeoutHandle = setTimeout(() => {
3732
+ abortController.abort(
3733
+ new Error(
3734
+ `Simple Agent generateText timed out after ${SDK_TIMEOUT_MS}ms (model=${modelId}, scenario=${scenario.name})`
3735
+ )
3736
+ );
3737
+ }, SDK_TIMEOUT_MS);
3725
3738
  try {
3726
3739
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
3727
3740
  const isResponsesAPI = [...OPENAI_RESPONSES_MODEL_IDS].some(
@@ -3762,12 +3775,12 @@ async function executeWithAiSdk(context) {
3762
3775
  ...computedProviderOpts
3763
3776
  };
3764
3777
  const stepTimestamps = [];
3765
- const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3766
3778
  const { triggerPromptImages } = context;
3767
3779
  const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
3768
3780
  const result = await generateText({
3769
3781
  ...topLevelExtras,
3770
3782
  model,
3783
+ abortSignal: abortController.signal,
3771
3784
  system: systemPrompt,
3772
3785
  ...hasImages ? {
3773
3786
  messages: [
@@ -3855,14 +3868,90 @@ async function executeWithAiSdk(context) {
3855
3868
  };
3856
3869
  } catch (err) {
3857
3870
  const baseMsg = err instanceof Error ? err.message : String(err);
3871
+ const ctx = extractGatewayErrorContext(err);
3858
3872
  throw new Error(
3859
- `AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}`,
3873
+ `AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}${ctx}`,
3860
3874
  { cause: err }
3861
3875
  );
3862
3876
  } finally {
3877
+ clearTimeout(timeoutHandle);
3863
3878
  await closeMcpClients(clients);
3864
3879
  }
3865
3880
  }
// Lookup tables used by extractGatewayErrorContext / getProxyBodyFields.
// They are declared with `var` like the other top-level bindings in this
// bundle, and frozen so an accidental push/splice elsewhere throws instead of
// silently changing which identifiers end up in error messages.

// Response header names probed (in this order) for a request identifier.
var UPSTREAM_REQUEST_ID_HEADERS = Object.freeze([
  "openai-request-id",
  "x-request-id",
  "cf-ray",
  "x-wix-request-id"
]);

// Top-level keys read from a JSON response body (in this order).
var PROXY_BODY_FIELDS = Object.freeze([
  "proxy_request_id",
  "upstream_request_id",
  "failure_phase"
]);

// Maximum nesting depth findApiCallError follows through `errors` / `cause`.
var MAX_ERROR_WALK_DEPTH = 5;
/**
 * Searches an error value for a nested APICallError.
 *
 * The value itself is checked first. If it is an object, the entries of its
 * `errors` array are searched from last to first, and only then its `cause`.
 * The search gives up once `depth` exceeds MAX_ERROR_WALK_DEPTH.
 *
 * @param {unknown} err - Value to inspect.
 * @param {number} [depth=0] - Current nesting level (used by the recursion).
 * @returns The first APICallError found, or undefined when there is none.
 */
function findApiCallError(err, depth = 0) {
  const tooDeep = depth > MAX_ERROR_WALK_DEPTH;
  if (err === null || err === undefined || tooDeep) {
    return undefined;
  }
  if (APICallError.isInstance(err)) {
    return err;
  }
  if (typeof err !== "object") {
    return undefined;
  }

  const nested = err.errors;
  if (Array.isArray(nested)) {
    // Walk backwards so later entries are preferred over earlier ones.
    let index = nested.length;
    while (index-- > 0) {
      const match = findApiCallError(nested[index], depth + 1);
      if (match) {
        return match;
      }
    }
  }

  const inner = err.cause;
  return inner == null ? undefined : findApiCallError(inner, depth + 1);
}
/**
 * Case-insensitive header lookup.
 *
 * Accepts a plain object of header name -> value, and also any container that
 * exposes an `entries()` iterator of [name, value] pairs (for example a Fetch
 * `Headers` instance or a `Map`). `Object.entries` yields nothing for those
 * containers, so without this branch their headers would never be found.
 *
 * @param {object | undefined | null} headers - Header container to search.
 * @param {string} name - Header name to look for (any casing).
 * @returns The first matching value, or undefined when absent.
 */
function getHeader(headers, name) {
  if (!headers) return void 0;
  const target = name.toLowerCase();
  const pairs = typeof headers.entries === "function"
    ? headers.entries()
    : Object.entries(headers);
  for (const [key, value] of pairs) {
    // String() guards against non-string keys from Map-like containers.
    if (String(key).toLowerCase() === target) return value;
  }
  return void 0;
}
/**
 * Pulls the PROXY_BODY_FIELDS entries out of a JSON response body.
 *
 * @param {unknown} responseBody - Raw response body; only non-empty strings
 *   are considered.
 * @returns {object} A map of field name -> value containing only the fields
 *   whose value is a non-empty string. Empty when the body is missing, is not
 *   valid JSON, or does not parse to an object.
 */
function getProxyBodyFields(responseBody) {
  const hasText = typeof responseBody === "string" && responseBody.length > 0;
  if (!hasText) {
    return {};
  }

  let payload;
  try {
    payload = JSON.parse(responseBody);
  } catch {
    // Non-JSON bodies simply contribute no fields.
    return {};
  }

  const isObjectLike = payload != null && typeof payload === "object";
  if (!isObjectLike) {
    return {};
  }

  return PROXY_BODY_FIELDS.reduce((picked, field) => {
    const candidate = payload[field];
    if (typeof candidate === "string" && candidate.length > 0) {
      picked[field] = candidate;
    }
    return picked;
  }, {});
}
/**
 * Builds a bracketed diagnostic suffix for an error message from the
 * APICallError found inside `err` (via findApiCallError).
 *
 * The suffix lists, in order: the numeric status code, any of the
 * UPSTREAM_REQUEST_ID_HEADERS present in the response headers, and any of the
 * PROXY_BODY_FIELDS present in the JSON response body.
 *
 * @param {unknown} err - The caught error.
 * @returns {string} ` [k=v, ...]` (with a leading space), or "" when no
 *   APICallError is found or it yields no fields.
 */
function extractGatewayErrorContext(err) {
  const apiError = findApiCallError(err);
  if (!apiError) {
    return "";
  }

  const parts = [];
  const addIfPresent = (label, value) => {
    if (value) parts.push(`${label}=${value}`);
  };

  if (typeof apiError.statusCode === "number") {
    parts.push(`status=${apiError.statusCode}`);
  }

  UPSTREAM_REQUEST_ID_HEADERS.forEach((headerName) => {
    addIfPresent(headerName, getHeader(apiError.responseHeaders, headerName));
  });

  const fromBody = getProxyBodyFields(apiError.responseBody);
  PROXY_BODY_FIELDS.forEach((fieldName) => {
    addIfPresent(fieldName, fromBody[fieldName]);
  });

  if (parts.length === 0) {
    return "";
  }
  return ` [${parts.join(", ")}]`;
}
3866
3955
  function composeSystemPrompt(context) {
3867
3956
  const parts = [];
3868
3957
  if (context.systemPrompt) {
@@ -4906,6 +4995,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4906
4995
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4907
4996
  let completedExecutions = 0;
4908
4997
  let erroredExecutions = 0;
4998
+ let firstErrorMessage;
4909
4999
  const totalExecutions = scenarioItems.length * runsPerScenario;
4910
5000
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4911
5001
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -4943,6 +5033,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4943
5033
  };
4944
5034
  await callbacks.addResult(errorResult);
4945
5035
  erroredExecutions++;
5036
+ if (firstErrorMessage === void 0) {
5037
+ firstErrorMessage = errorResult.outputText;
5038
+ }
4946
5039
  }
4947
5040
  if (scenarioResult !== null) {
4948
5041
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -4950,7 +5043,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4950
5043
  completedExecutions++;
4951
5044
  }
4952
5045
  }
4953
- return { completedExecutions, totalExecutions, erroredExecutions };
5046
+ return {
5047
+ completedExecutions,
5048
+ totalExecutions,
5049
+ erroredExecutions,
5050
+ firstErrorMessage
5051
+ };
4954
5052
  }
4955
5053
 
4956
5054
  // src/error-reporter.ts
@@ -5134,7 +5232,12 @@ async function runEvaluation(projectId2, evalRunId2) {
5134
5232
  );
5135
5233
  }
5136
5234
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
5137
- const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
5235
+ const {
5236
+ completedExecutions,
5237
+ totalExecutions,
5238
+ erroredExecutions,
5239
+ firstErrorMessage
5240
+ } = await runEvaluationLoop(scenarioItems, evalData, {
5138
5241
  runScenario: (scenario, template, resolvedAssertions) => {
5139
5242
  state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
5140
5243
  state.currentContext = {
@@ -5183,10 +5286,14 @@ async function runEvaluation(projectId2, evalRunId2) {
5183
5286
  };
5184
5287
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5185
5288
  const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
5289
+ const jobErrorOnAllFailed = allFailed ? truncateForJobError(
5290
+ firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
5291
+ ) : void 0;
5186
5292
  try {
5187
5293
  await api.updateEvalRun(projectId2, evalRunId2, {
5188
5294
  status: finalStatus,
5189
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
5295
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
5296
+ ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
5190
5297
  });
5191
5298
  } catch (updateErr) {
5192
5299
  throw new Error(
@@ -5194,6 +5301,11 @@ async function runEvaluation(projectId2, evalRunId2) {
5194
5301
  );
5195
5302
  }
5196
5303
  }
// Number of UTF-16 code units of the original message kept before truncation.
// NOTE(review): the "… [truncated]" marker is appended on top of this, so the
// returned string can be longer than this value — confirm that is acceptable
// for whatever stores `jobError`.
var JOB_ERROR_MAX_LENGTH = 1e3;

/**
 * Shortens an error message for use as a job error.
 *
 * Messages of at most JOB_ERROR_MAX_LENGTH code units are returned unchanged.
 * Longer messages are cut at that length and suffixed with "… [truncated]".
 * If the cut would land between the two halves of a surrogate pair, the
 * dangling high surrogate is dropped as well so the result never ends in a
 * lone surrogate (which is not valid Unicode text).
 *
 * @param {string} message - Full error message.
 * @returns {string} The message, possibly truncated.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
  let cut = JOB_ERROR_MAX_LENGTH;
  const lastUnit = message.charCodeAt(cut - 1);
  // 0xD800-0xDBFF is the high-surrogate range; its partner would be cut off.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
  return `${message.slice(0, cut)}\u2026 [truncated]`;
}
5197
5309
  var projectId = process.argv[2];
5198
5310
  var evalRunId = process.argv[3];
5199
5311
  console.error(