@wix/evalforge-evaluator 0.153.0 → 0.155.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -56,6 +56,8 @@ function loadConfig() {
56
56
  }
57
57
  }
58
58
  aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
59
+ aiGatewayHeaders["x-time-budget"] = "300000";
60
+ aiGatewayHeaders["x-wix-time-budget"] = "300000";
59
61
  const tracePushUrl = process.env.TRACE_PUSH_URL;
60
62
  const routeHeader = process.env.EVAL_ROUTE_HEADER;
61
63
  const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -3704,6 +3706,16 @@ async function executeWithAiSdk(context) {
3704
3706
  if (traceContext) {
3705
3707
  emitStartEvent(traceContext, startTime);
3706
3708
  }
3709
+ const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3710
+ const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
3711
+ const abortController = new AbortController();
3712
+ const timeoutHandle = setTimeout(() => {
3713
+ abortController.abort(
3714
+ new Error(
3715
+ `Simple Agent generateText timed out after ${SDK_TIMEOUT_MS}ms (model=${modelId}, scenario=${scenario.name})`
3716
+ )
3717
+ );
3718
+ }, SDK_TIMEOUT_MS);
3707
3719
  try {
3708
3720
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
3709
3721
  const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
@@ -3744,12 +3756,12 @@ async function executeWithAiSdk(context) {
3744
3756
  ...computedProviderOpts
3745
3757
  };
3746
3758
  const stepTimestamps = [];
3747
- const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3748
3759
  const { triggerPromptImages } = context;
3749
3760
  const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
3750
3761
  const result = await (0, import_ai.generateText)({
3751
3762
  ...topLevelExtras,
3752
3763
  model,
3764
+ abortSignal: abortController.signal,
3753
3765
  system: systemPrompt,
3754
3766
  ...hasImages ? {
3755
3767
  messages: [
@@ -3837,14 +3849,90 @@ async function executeWithAiSdk(context) {
3837
3849
  };
3838
3850
  } catch (err) {
3839
3851
  const baseMsg = err instanceof Error ? err.message : String(err);
3852
+ const ctx = extractGatewayErrorContext(err);
3840
3853
  throw new Error(
3841
- `AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}`,
3854
+ `AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}${ctx}`,
3842
3855
  { cause: err }
3843
3856
  );
3844
3857
  } finally {
3858
+ clearTimeout(timeoutHandle);
3845
3859
  await closeMcpClients(clients);
3846
3860
  }
3847
3861
  }
3862
/**
 * Response header names that extractGatewayErrorContext looks up on a failed
 * API call and, when present, appends to the error message as `name=value`.
 * The order here is the order the fields appear in the message.
 */
var UPSTREAM_REQUEST_ID_HEADERS = [
  "openai-request-id",
  "x-request-id",
  "cf-ray",
  "x-wix-request-id"
];
/**
 * Top-level keys read from the JSON response body of a failed API call
 * (see getProxyBodyFields); only non-empty string values are reported.
 */
var PROXY_BODY_FIELDS = [
  "proxy_request_id",
  "upstream_request_id",
  "failure_phase"
];
/**
 * Deepest nesting level (via `errors[]` / `cause`) that findApiCallError will
 * inspect before giving up; also bounds the walk on cyclic error chains.
 */
var MAX_ERROR_WALK_DEPTH = 5;
3874
/**
 * Searches an error, and the errors nested inside it, for the first value
 * that `import_ai.APICallError.isInstance` recognises.
 *
 * Search order at each level: the value itself, then the entries of its
 * `errors` array (last entry first), then its `cause`.
 *
 * @param {unknown} err - Value to inspect; `null`/`undefined` and non-objects yield `undefined`.
 * @param {number} [depth=0] - Current nesting level; callers normally omit it.
 * @returns {object|undefined} The matching error, or `undefined` when none is
 *   found within MAX_ERROR_WALK_DEPTH levels.
 */
function findApiCallError(err, depth = 0) {
  // The depth cap also terminates the walk on self-referencing cause chains.
  if (err == null || depth > MAX_ERROR_WALK_DEPTH) return void 0;
  if (import_ai.APICallError.isInstance(err)) return err;
  if (typeof err !== "object") return void 0;

  const nested = err.errors;
  if (Array.isArray(nested)) {
    // Walk from the end so the most recently appended error wins.
    for (let i = nested.length - 1; i >= 0; i--) {
      const match = findApiCallError(nested[i], depth + 1);
      if (match) return match;
    }
  }

  const cause = err.cause;
  return cause != null ? findApiCallError(cause, depth + 1) : void 0;
}
3890
/**
 * Looks up a header value by name, ignoring case, in a plain headers object.
 *
 * @param {Record<string, unknown>|null|undefined} headers - Header map; a falsy value yields `undefined`.
 * @param {string} name - Header name to find.
 * @returns {unknown} The value of the first own enumerable key that matches
 *   `name` case-insensitively, or `undefined` when there is no such key.
 */
function getHeader(headers, name) {
  if (!headers) return void 0;
  const wanted = name.toLowerCase();
  for (const [key, value] of Object.entries(headers)) {
    if (key.toLowerCase() === wanted) return value;
  }
  return void 0;
}
3898
/**
 * Extracts the PROXY_BODY_FIELDS entries from a JSON response body.
 *
 * This is best-effort: any body that is not a non-empty string, does not
 * parse as JSON, or does not parse to an object produces an empty result
 * instead of an error.
 *
 * @param {unknown} responseBody - Raw response body text.
 * @returns {Record<string, string>} Only those PROXY_BODY_FIELDS keys whose
 *   parsed value is a non-empty string.
 */
function getProxyBodyFields(responseBody) {
  if (typeof responseBody !== "string" || responseBody.length === 0) {
    return {};
  }

  let parsed;
  try {
    parsed = JSON.parse(responseBody);
  } catch {
    // Non-JSON bodies are expected here; they carry no fields to report.
    return {};
  }
  if (parsed == null || typeof parsed !== "object") return {};

  const found = {};
  for (const field of PROXY_BODY_FIELDS) {
    const value = parsed[field];
    if (typeof value === "string" && value.length > 0) {
      found[field] = value;
    }
  }
  return found;
}
3918
/**
 * Builds a diagnostic suffix for an error message from the API call error
 * found inside `err` (via findApiCallError).
 *
 * The suffix lists, in this order: `status=<statusCode>` when the status code
 * is a number, each UPSTREAM_REQUEST_ID_HEADERS header that has a truthy
 * value, then each PROXY_BODY_FIELDS value found in the response body.
 *
 * @param {unknown} err - The caught error.
 * @returns {string} A string of the form ` [k=v, k=v]` (with a leading
 *   space), or `""` when no API call error is found or it has nothing to report.
 */
function extractGatewayErrorContext(err) {
  const apiError = findApiCallError(err);
  if (!apiError) return "";

  const parts = [];
  if (typeof apiError.statusCode === "number") {
    parts.push(`status=${apiError.statusCode}`);
  }

  for (const header of UPSTREAM_REQUEST_ID_HEADERS) {
    const headerValue = getHeader(apiError.responseHeaders, header);
    if (headerValue) parts.push(`${header}=${headerValue}`);
  }

  const bodyFields = getProxyBodyFields(apiError.responseBody);
  for (const field of PROXY_BODY_FIELDS) {
    const fieldValue = bodyFields[field];
    if (fieldValue) parts.push(`${field}=${fieldValue}`);
  }

  return parts.length > 0 ? ` [${parts.join(", ")}]` : "";
}
3848
3936
  function composeSystemPrompt(context) {
3849
3937
  const parts = [];
3850
3938
  if (context.systemPrompt) {
@@ -4888,6 +4976,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4888
4976
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4889
4977
  let completedExecutions = 0;
4890
4978
  let erroredExecutions = 0;
4979
+ let firstErrorMessage;
4891
4980
  const totalExecutions = scenarioItems.length * runsPerScenario;
4892
4981
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4893
4982
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -4925,6 +5014,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4925
5014
  };
4926
5015
  await callbacks.addResult(errorResult);
4927
5016
  erroredExecutions++;
5017
+ if (firstErrorMessage === void 0) {
5018
+ firstErrorMessage = errorResult.outputText;
5019
+ }
4928
5020
  }
4929
5021
  if (scenarioResult !== null) {
4930
5022
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -4932,7 +5024,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4932
5024
  completedExecutions++;
4933
5025
  }
4934
5026
  }
4935
- return { completedExecutions, totalExecutions, erroredExecutions };
5027
+ return {
5028
+ completedExecutions,
5029
+ totalExecutions,
5030
+ erroredExecutions,
5031
+ firstErrorMessage
5032
+ };
4936
5033
  }
4937
5034
 
4938
5035
  // src/error-reporter.ts
@@ -5116,7 +5213,12 @@ async function runEvaluation(projectId2, evalRunId2) {
5116
5213
  );
5117
5214
  }
5118
5215
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
5119
- const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
5216
+ const {
5217
+ completedExecutions,
5218
+ totalExecutions,
5219
+ erroredExecutions,
5220
+ firstErrorMessage
5221
+ } = await runEvaluationLoop(scenarioItems, evalData, {
5120
5222
  runScenario: (scenario, template, resolvedAssertions) => {
5121
5223
  state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
5122
5224
  state.currentContext = {
@@ -5165,10 +5267,14 @@ async function runEvaluation(projectId2, evalRunId2) {
5165
5267
  };
5166
5268
  const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5167
5269
  const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
5270
+ const jobErrorOnAllFailed = allFailed ? truncateForJobError(
5271
+ firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
5272
+ ) : void 0;
5168
5273
  try {
5169
5274
  await api.updateEvalRun(projectId2, evalRunId2, {
5170
5275
  status: finalStatus,
5171
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
5276
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
5277
+ ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
5172
5278
  });
5173
5279
  } catch (updateErr) {
5174
5280
  throw new Error(
@@ -5176,6 +5282,11 @@ async function runEvaluation(projectId2, evalRunId2) {
5176
5282
  );
5177
5283
  }
5178
5284
  }
5285
/** Maximum number of UTF-16 code units of the original message kept in a job error. */
var JOB_ERROR_MAX_LENGTH = 1e3;
/**
 * Shortens an error message for storage as a job error.
 *
 * Messages of at most JOB_ERROR_MAX_LENGTH code units are returned unchanged.
 * Longer ones are cut to that length and get a "… [truncated]" marker
 * appended, so the returned string is longer than JOB_ERROR_MAX_LENGTH by the
 * length of the marker.
 *
 * @param {string} message - Full error message.
 * @returns {string} The message, truncated and marked when it was too long.
 */
function truncateForJobError(message) {
  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
  let end = JOB_ERROR_MAX_LENGTH;
  // Do not cut an astral character (e.g. an emoji) in half: if the last kept
  // code unit is a high surrogate, its low surrogate lies past the cut, so
  // drop it rather than emit a lone surrogate.
  const lastKept = message.charCodeAt(end - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }
  return `${message.slice(0, end)}\u2026 [truncated]`;
}
5179
5290
  var projectId = process.argv[2];
5180
5291
  var evalRunId = process.argv[3];
5181
5292
  console.error(