npm - @wix/evalforge-evaluator - Versions diffs - 0.153.0 → 0.155.0 - Mend

@wix/evalforge-evaluator 0.153.0 → 0.155.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/build/index.js +116 -5
package/build/index.js.map +2 -2
package/build/index.mjs +117 -5
package/build/index.mjs.map +2 -2
package/build/types/evaluation-loop.d.ts +6 -0
package/build/types/run-scenario/agents/simple-agent/execute.d.ts +9 -0
package/package.json +2 -2

package/build/index.js CHANGED

@@ -56,6 +56,8 @@ function loadConfig() {
     }
   }
   aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
+  aiGatewayHeaders["x-time-budget"] = "300000";
+  aiGatewayHeaders["x-wix-time-budget"] = "300000";
   const tracePushUrl = process.env.TRACE_PUSH_URL;
   const routeHeader = process.env.EVAL_ROUTE_HEADER;
   const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -3704,6 +3706,16 @@ async function executeWithAiSdk(context) {
   if (traceContext) {
     emitStartEvent(traceContext, startTime);
   }
+  const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
+  const SDK_TIMEOUT_MS = cfg.maxDurationMs ?? Math.max(3e5, (effectiveMaxTurns ?? 25) * 6e4);
+  const abortController = new AbortController();
+  const timeoutHandle = setTimeout(() => {
+    abortController.abort(
+      new Error(
+        `Simple Agent generateText timed out after ${SDK_TIMEOUT_MS}ms (model=${modelId}, scenario=${scenario.name})`
+      )
+    );
+  }, SDK_TIMEOUT_MS);
   try {
     const isAnthropic = provider === PROVIDER_ANTHROPIC2;
     const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
@@ -3744,12 +3756,12 @@ async function executeWithAiSdk(context) {
       ...computedProviderOpts
     };
     const stepTimestamps = [];
-    const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
     const { triggerPromptImages } = context;
     const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
     const result = await (0, import_ai.generateText)({
       ...topLevelExtras,
       model,
+      abortSignal: abortController.signal,
       system: systemPrompt,
       ...hasImages ? {
         messages: [
@@ -3837,14 +3849,90 @@ async function executeWithAiSdk(context) {
     };
   } catch (err) {
     const baseMsg = err instanceof Error ? err.message : String(err);
+    const ctx = extractGatewayErrorContext(err);
     throw new Error(
-      `AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}`,
+      `AI gateway request failed (provider=${provider}, model=${modelId}): ${baseMsg}${ctx}`,
       { cause: err }
     );
   } finally {
+    clearTimeout(timeoutHandle);
     await closeMcpClients(clients);
   }
 }
+var UPSTREAM_REQUEST_ID_HEADERS = [
+  "openai-request-id",
+  "x-request-id",
+  "cf-ray",
+  "x-wix-request-id"
+];
+var PROXY_BODY_FIELDS = [
+  "proxy_request_id",
+  "upstream_request_id",
+  "failure_phase"
+];
+var MAX_ERROR_WALK_DEPTH = 5;
+function findApiCallError(err, depth = 0) {
+  if (err == null || depth > MAX_ERROR_WALK_DEPTH) return void 0;
+  if (import_ai.APICallError.isInstance(err)) return err;
+  if (typeof err === "object") {
+    const errors = err.errors;
+    if (Array.isArray(errors)) {
+      for (let i = errors.length - 1; i >= 0; i--) {
+        const found = findApiCallError(errors[i], depth + 1);
+        if (found) return found;
+      }
+    }
+    const cause = err.cause;
+    if (cause != null) return findApiCallError(cause, depth + 1);
+  }
+  return void 0;
+}
+function getHeader(headers, name) {
+  if (!headers) return void 0;
+  const target = name.toLowerCase();
+  for (const [key, value] of Object.entries(headers)) {
+    if (key.toLowerCase() === target) return value;
+  }
+  return void 0;
+}
+function getProxyBodyFields(responseBody) {
+  if (typeof responseBody !== "string" || responseBody.length === 0) {
+    return {};
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(responseBody);
+  } catch {
+    return {};
+  }
+  if (parsed == null || typeof parsed !== "object") return {};
+  const out = {};
+  for (const name of PROXY_BODY_FIELDS) {
+    const value = parsed[name];
+    if (typeof value === "string" && value.length > 0) {
+      out[name] = value;
+    }
+  }
+  return out;
+}
+function extractGatewayErrorContext(err) {
+  const apiError = findApiCallError(err);
+  if (!apiError) return "";
+  const fields = [];
+  if (typeof apiError.statusCode === "number") {
+    fields.push(`status=${apiError.statusCode}`);
+  }
+  for (const name of UPSTREAM_REQUEST_ID_HEADERS) {
+    const value = getHeader(apiError.responseHeaders, name);
+    if (value) fields.push(`${name}=${value}`);
+  }
+  const bodyFields = getProxyBodyFields(apiError.responseBody);
+  for (const name of PROXY_BODY_FIELDS) {
+    const value = bodyFields[name];
+    if (value) fields.push(`${name}=${value}`);
+  }
+  return fields.length > 0 ? ` [${fields.join(", ")}]` : "";
+}
 function composeSystemPrompt(context) {
   const parts = [];
   if (context.systemPrompt) {
@@ -4888,6 +4976,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
   const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
   let completedExecutions = 0;
   let erroredExecutions = 0;
+  let firstErrorMessage;
   const totalExecutions = scenarioItems.length * runsPerScenario;
   const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
   const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -4925,6 +5014,9 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
         };
         await callbacks.addResult(errorResult);
         erroredExecutions++;
+        if (firstErrorMessage === void 0) {
+          firstErrorMessage = errorResult.outputText;
+        }
       }
       if (scenarioResult !== null) {
         await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -4932,7 +5024,12 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
       completedExecutions++;
     }
   }
-  return { completedExecutions, totalExecutions, erroredExecutions };
+  return {
+    completedExecutions,
+    totalExecutions,
+    erroredExecutions,
+    firstErrorMessage
+  };
 }
 // src/error-reporter.ts
@@ -5116,7 +5213,12 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
   const skillNames = evalData.skills.map((s) => s.name).join(", ");
-  const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
+  const {
+    completedExecutions,
+    totalExecutions,
+    erroredExecutions,
+    firstErrorMessage
+  } = await runEvaluationLoop(scenarioItems, evalData, {
     runScenario: (scenario, template, resolvedAssertions) => {
       state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
       state.currentContext = {
@@ -5165,10 +5267,14 @@ async function runEvaluation(projectId2, evalRunId2) {
   };
   const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
   const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
+  const jobErrorOnAllFailed = allFailed ? truncateForJobError(
+    firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
+  ) : void 0;
   try {
     await api.updateEvalRun(projectId2, evalRunId2, {
       status: finalStatus,
-      completedAt: (/* @__PURE__ */ new Date()).toISOString()
+      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
+      ...allFailed ? { jobError: jobErrorOnAllFailed, jobStatus: "FAILED" } : {}
     });
   } catch (updateErr) {
     throw new Error(
@@ -5176,6 +5282,11 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
 }
+var JOB_ERROR_MAX_LENGTH = 1e3;
+function truncateForJobError(message) {
+  if (message.length <= JOB_ERROR_MAX_LENGTH) return message;
+  return `${message.slice(0, JOB_ERROR_MAX_LENGTH)}\u2026 [truncated]`;
+}
 var projectId = process.argv[2];
 var evalRunId = process.argv[3];
 console.error(