npm - @opengeni/runtime - Versions diffs - 0.2.2 → 0.2.3 - Mend

@opengeni/runtime 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/dist/{chunk-2PO56VAL.js → chunk-KNW7AMQB.js} +11 -4
package/dist/chunk-KNW7AMQB.js.map +1 -0
package/dist/index.d.ts +89 -177
package/dist/index.js +346 -156
package/dist/index.js.map +1 -1
package/dist/sandbox/index.d.ts +6 -4
package/dist/sandbox/index.js +1 -1
package/package.json +3 -3
package/src/context-compaction.ts +217 -348
package/src/image-history.ts +149 -0
package/src/index.ts +129 -34
package/src/sandbox/display-stack.ts +61 -12
package/src/sandbox-computer.ts +36 -9
package/dist/chunk-2PO56VAL.js.map +0 -1

package/src/image-history.ts ADDED Viewed

@@ -0,0 +1,149 @@
+import type { AgentInputItem } from "@openai/agents";
+export const SCREENSHOT_OMITTED_PLACEHOLDER =
+  "[screenshot omitted: an older desktop frame — the full image remains in the session event log]";
+const DATA_IMAGE_BASE64_PATTERN = /data:image\/[a-z0-9.+-]+;base64,[a-z0-9+/=_-]+/i;
+type PathSegment = string | number;
+type ImageOccurrence = {
+  path: PathSegment[];
+  replacement: unknown;
+};
+export type ElideStaleScreenshotsResult<T> = {
+  items: T[];
+  imageCount: number;
+  elidedCount: number;
+};
+export type ElideStaleScreenshotsOptions = {
+  keepLast?: number;
+  placeholder?: string;
+};
+/**
+ * Replace all but the most recent screenshot images in a model-input history
+ * with a short text placeholder.
+ *
+ * Image occurrences are collected in item order; the earliest
+ * `imageCount - keepLast` of them are overwritten and the last `keepLast` are
+ * left untouched. The input array is never mutated: when nothing needs eliding
+ * a shallow copy is returned, otherwise the items are deep-cloned with
+ * `structuredClone` before the replacements are written.
+ *
+ * @param items - History items to scan.
+ * @param options - `keepLast` (default 3; floored, negatives clamp to 0, `NaN`
+ *   falls back to the default) and `placeholder` (defaults to
+ *   `SCREENSHOT_OMITTED_PLACEHOLDER`).
+ * @returns The (possibly cloned) items, the number of image occurrences found,
+ *   and how many of them were replaced.
+ */
+export function elideStaleScreenshotImages<T extends AgentInputItem>(
+  items: readonly T[],
+  options: ElideStaleScreenshotsOptions = {},
+): ElideStaleScreenshotsResult<T> {
+  // A NaN keepLast would propagate through Math.floor/Math.max and surface as
+  // `elidedCount: NaN` (plus a pointless deep clone), so treat it as "not set".
+  const requestedKeepLast = options.keepLast ?? 3;
+  const keepLast = Number.isNaN(requestedKeepLast) ? 3 : Math.max(0, Math.floor(requestedKeepLast));
+  const placeholder = options.placeholder ?? SCREENSHOT_OMITTED_PLACEHOLDER;
+  const occurrences: ImageOccurrence[] = [];
+  for (let i = 0; i < items.length; i += 1) {
+    collectItemImageOccurrences(items[i], [i], placeholder, occurrences);
+  }
+  const elidedCount = Math.max(0, occurrences.length - keepLast);
+  if (elidedCount === 0) {
+    // Nothing to replace: a shallow copy is enough because no item is modified.
+    return { items: items.slice(), imageCount: occurrences.length, elidedCount: 0 };
+  }
+  // Deep-clone before writing so the caller's history objects stay intact.
+  const cloned = structuredClone(items) as T[];
+  for (const occurrence of occurrences.slice(0, elidedCount)) {
+    setPath(cloned, occurrence.path, occurrence.replacement);
+  }
+  return { items: cloned, imageCount: occurrences.length, elidedCount };
+}
+/**
+ * Record the image occurrences found in a single history item.
+ *
+ * User and system messages are skipped outright. Computer-call outputs and
+ * function-call outputs are handed to their dedicated collectors; any other
+ * item type contributes nothing.
+ */
+function collectItemImageOccurrences(
+  item: unknown,
+  path: PathSegment[],
+  placeholder: string,
+  out: ImageOccurrence[],
+): void {
+  if (!isRecord(item)) {
+    return;
+  }
+  const isUserOrSystemMessage =
+    item.type === "message" && (item.role === "user" || item.role === "system");
+  if (isUserOrSystemMessage) {
+    return;
+  }
+  switch (item.type) {
+    case "computer_call_result":
+    case "computer_call_output":
+      collectComputerOutputImages(item, path, placeholder, out);
+      break;
+    case "function_call_result":
+    case "function_call_output":
+      collectToolResultImages(item.output, [...path, "output"], placeholder, out);
+      break;
+    default:
+      break;
+  }
+}
+/**
+ * Record the screenshot carried by a computer-call output item, if any.
+ *
+ * Only an `output` record typed `computer_screenshot` is considered. The keys
+ * `data`, `image_url` and `imageUrl` are probed in that order and at most one
+ * occurrence (the first key holding a base64 image data URL) is recorded; its
+ * replacement is the plain placeholder string.
+ */
+function collectComputerOutputImages(
+  item: Record<string, unknown>,
+  path: PathSegment[],
+  placeholder: string,
+  out: ImageOccurrence[],
+): void {
+  const output = item.output;
+  if (!isRecord(output) || output.type !== "computer_screenshot") {
+    return;
+  }
+  const imageKey = ["data", "image_url", "imageUrl"].find((key) => isImageDataUrl(output[key]));
+  if (imageKey !== undefined) {
+    out.push({ path: [...path, "output", imageKey], replacement: placeholder });
+  }
+}
+/**
+ * Walk a function-call output value and record every image occurrence in it.
+ *
+ * - A string that contains a base64 image data URL is recorded as-is (its
+ *   replacement is the placeholder string).
+ * - Arrays are walked element by element.
+ * - An `input_image` record whose `image`, `imageUrl` or `image_url` holds a
+ *   data URL is recorded as a whole, to be swapped for an `input_text` part.
+ * - Any other record (including an `input_image` without a data URL) is
+ *   descended through its `content`, `text` and `output` keys when present.
+ */
+function collectToolResultImages(
+  value: unknown,
+  path: PathSegment[],
+  placeholder: string,
+  out: ImageOccurrence[],
+): void {
+  if (typeof value === "string") {
+    if (isImageDataUrl(value)) {
+      out.push({ path, replacement: placeholder });
+    }
+    return;
+  }
+  if (Array.isArray(value)) {
+    for (const [index, entry] of value.entries()) {
+      collectToolResultImages(entry, [...path, index], placeholder, out);
+    }
+    return;
+  }
+  if (!isRecord(value)) {
+    return;
+  }
+  const holdsInlineImage =
+    value.type === "input_image" &&
+    ["image", "imageUrl", "image_url"].some((key) => isImageDataUrl(value[key]));
+  if (holdsInlineImage) {
+    // Swap the whole image part for a text part rather than patching one field.
+    out.push({ path, replacement: { type: "input_text", text: placeholder } });
+    return;
+  }
+  for (const key of ["content", "text", "output"]) {
+    if (!(key in value)) {
+      continue;
+    }
+    collectToolResultImages(value[key], [...path, key], placeholder, out);
+  }
+}
+/** Type guard: true when `value` is a string matching DATA_IMAGE_BASE64_PATTERN. */
+function isImageDataUrl(value: unknown): value is string {
+  if (typeof value !== "string") {
+    return false;
+  }
+  return DATA_IMAGE_BASE64_PATTERN.test(value);
+}
+/** Type guard: true for any non-null object that is not an array. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  if (value === null || typeof value !== "object") {
+    return false;
+  }
+  return !Array.isArray(value);
+}
+/**
+ * Assign `value` at the location described by `path` inside `root`, mutating
+ * `root` in place. An empty path is a no-op. Every segment before the last is
+ * followed by indexing into the current container; the path is not validated,
+ * so it must resolve to an existing container.
+ */
+function setPath(root: unknown, path: PathSegment[], value: unknown): void {
+  if (path.length === 0) {
+    return;
+  }
+  const parentSegments = path.slice(0, -1);
+  const leaf = path[path.length - 1]!;
+  let container: unknown = root;
+  for (const segment of parentSegments) {
+    container = Array.isArray(container)
+      ? container[segment as number]
+      : (container as Record<string, unknown>)[segment as string];
+  }
+  if (Array.isArray(container)) {
+    container[leaf as number] = value;
+    return;
+  }
+  (container as Record<string, unknown>)[leaf as string] = value;
+}

package/src/index.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import type { ConfiguredModel, ContextCompactionMode, ModelProviderApi, ResolvedModelProvider, Settings } from "@opengeni/config";
-import { AGENT_INSTRUCTIONS_CORE_PLACEHOLDER, collectSandboxEnvironment, contextServerCompactThreshold, firstPartyMcpBaseUrl, parseExposedPorts, resolveContextCompactionMode, resolveModelProvider, sandboxLifecycleHookIds } from "@opengeni/config";
+import { AGENT_INSTRUCTIONS_CORE_PLACEHOLDER, collectSandboxEnvironment, contextInputBudgetTokens, contextServerCompactThreshold, firstPartyMcpBaseUrl, parseExposedPorts, resolveContextCompactionMode, resolveModelProvider, sandboxLifecycleHookIds } from "@opengeni/config";
 import { CAPABILITY_DESCRIPTORS, isClearedRunStateBlob, signDelegatedAccessToken, type Permission, type ReasoningEffort, type ResourceRef, type SessionEventType, type ToolRef } from "@opengeni/contracts";
 import {
   Agent,
@@ -82,8 +82,17 @@ import { dirname, isAbsolute, join, posix as posixPath, relative } from "node:pa
 import { fileURLToPath } from "node:url";
 import { computerCallNormalizingFetch, normalizeComputerCallActions, sanitizeHistoryItemsForModel } from "./history-sanitizer";
+import { elideStaleScreenshotImages } from "./image-history";
 import { installCodexToolSearch } from "./codex-tool-search";
-import { enforceInputBudget, estimateItemTokens } from "./context-compaction";
+import {
+  CompactionNeededError,
+  SUMMARY_BUFFER_TOKENS,
+  clientCompactionThresholdTokens,
+  enforceInputBudget,
+  estimateItemTokens,
+  estimateTokens,
+  renderCompactionPromptInputForChat,
+} from "./context-compaction";
 import {
   createSandboxClient,
   deserializeSandboxSessionStateEnvelope,
@@ -134,22 +143,34 @@ export type { HistoryItem } from "./history-sanitizer";
 export { OpenAIChatCompletionsModel, OpenAIResponsesModel } from "@openai/agents";
 export {
-  planCompaction,
+  CompactionNeededError,
+  buildCompactionPromptInput,
+  buildCompactionReplacementHistory,
+  clientCompactionThresholdTokens,
+  decideClientCompaction,
   enforceInputBudget,
   buildSummaryItem,
-  buildCompactionMessages,
+  findCompactionNeededError,
   isCompactionSummary,
   isUserMessage,
   findKeepBoundary,
   estimateTokens,
   estimateItemTokens,
-  compactionSummaryText,
-  renderPrefixTranscript,
+  renderCompactionPromptInputForChat,
   COMPACTION_SUMMARY_MARKER,
+  COMPACTION_PROMPT,
+  COMPACT_USER_MESSAGE_MAX_TOKENS,
+  CLIENT_COMPACTION_TRIGGER_FRACTION,
+  SUMMARY_BUFFER_TOKENS,
   SUMMARY_PREFIX,
-  SUMMARY_INSTRUCTIONS,
+  USER_MESSAGE_TRUNCATION_MARKER,
 } from "./context-compaction";
-export type { CompactionItem, CompactionPlan, PlanCompactionInput } from "./context-compaction";
+export type { ClientCompactionDecision, CompactionItem } from "./context-compaction";
+export {
+  elideStaleScreenshotImages,
+  SCREENSHOT_OMITTED_PLACEHOLDER,
+} from "./image-history";
+export type { ElideStaleScreenshotsOptions, ElideStaleScreenshotsResult } from "./image-history";
 ensureReadableStreamFrom();
@@ -500,10 +521,10 @@ export function configureOpenAI(settings: Settings): void {
 /**
  * Run the compaction summarizer as one plain, tool-less, non-streaming model
- * call against the resolved provider. `system`/`user` come from
- * buildCompactionMessages. Returns the trimmed summary text, or null on any
+ * call against the resolved provider. `input` is the active history plus
+ * Codex's checkpoint prompt. Returns the trimmed summary text, or null on any
  * failure (the caller treats a failed summarize as "skip compaction this turn"
- * — never fatal). The call deliberately does NOT request reasoning encryption,
+ * - never fatal). The call deliberately does NOT request reasoning encryption,
  * tools, or server-side compaction; it is a self-contained summarize.
  *
  * Provider-aware: the summary always runs on the SAME provider that serves the
@@ -517,22 +538,19 @@ export function configureOpenAI(settings: Settings): void {
  */
 export async function summarizeForCompaction(
   settings: Settings,
-  messages: { system: string; user: string },
+  input: Array<Record<string, unknown>>,
   options: { client?: OpenAI; api?: ModelProviderApi; maxOutputTokens?: number; model?: string } = {},
 ): Promise<string | null> {
   const client = options.client ?? buildOpenAIClientFromSettings(settings);
   const api = options.api ?? "responses";
   const model = options.model ?? settings.openaiModel;
-  const maxTokens = options.maxOutputTokens ?? settings.contextSummaryMaxTokens;
+  const maxTokens = options.maxOutputTokens ?? SUMMARY_BUFFER_TOKENS;
   try {
     if (api === "chat") {
       const completion = await client.chat.completions.create({
         model,
         max_tokens: maxTokens,
-        messages: [
-          { role: "system", content: messages.system },
-          { role: "user", content: messages.user },
-        ],
+        messages: [{ role: "user", content: renderCompactionPromptInputForChat(input) }],
       } as any);
       const text = (completion as { choices?: Array<{ message?: { content?: unknown } }> }).choices?.[0]?.message?.content;
       const trimmed = typeof text === "string" ? text.trim() : "";
@@ -545,10 +563,7 @@ export async function summarizeForCompaction(
       // built-in path (api "responses"), so gate it on the built-in provider.
       ...(settings.openaiProvider === "azure" ? {} : { store: false }),
       max_output_tokens: maxTokens,
-      input: [
-        { role: "system", content: messages.system },
-        { role: "user", content: messages.user },
-      ],
+      input,
     } as any);
     const text = extractResponseOutputText(response);
     const trimmed = text.trim();
@@ -1573,6 +1588,7 @@ export type RunAgentStreamOptions = {
   sandboxClient?: unknown;
   sandboxEnvironment?: Record<string, string>;
   onRuntimeEvent?: (event: NormalizedRuntimeEvent) => Promise<void> | void;
+  contextCompactionSignalTokens?: () => number | null | undefined;
   // OWNERSHIP INVERSION (P1.2): an externally-owned, already-live sandbox
   // session resolved by the per-turn resume-by-id path. When present,
   // runAgentStream does NOT build (or resume, or discard) a client — it threads
@@ -1603,6 +1619,11 @@ export type RunAgentStreamOptions = {
   callModelInputFilter?: CallModelInputFilter;
 };
+export type ContextRobustnessFilterOptions = {
+  contextCompactionSignalTokens?: () => number | null | undefined;
+  throwOnCompactionNeeded?: boolean;
+};
 // One-shot directive appended to the agent's system prompt on the genesis turn
 // (see buildOpenGeniAgent's genesisTitleHint). Delivered through the
 // authoritative instructions channel so the model reliably obeys; references
@@ -1656,6 +1677,59 @@ export const normalizeComputerCallsFilter: CallModelInputFilter = ({ modelData }
   ) as unknown as AgentInputItem[],
 });
+/**
+ * Build the per-call filter that keeps the model input within bounds.
+ *
+ * On every call the returned filter:
+ *  1. elides older screenshot images via `elideStaleScreenshotImages` (default
+ *     options) and logs a warning when any were elided;
+ *  2. when `modelCallBudgetTokens` yields a budget, runs `enforceInputBudget`
+ *     and logs a warning when history items were trimmed;
+ *  3. when the compaction mode is "client" and
+ *     `options.throwOnCompactionNeeded` is set, compares a token signal against
+ *     the client compaction threshold and throws `CompactionNeededError` when
+ *     the signal is larger. The signal is the positive number returned by
+ *     `options.contextCompactionSignalTokens`, or else an `estimateTokens`
+ *     estimate of the filtered input.
+ *
+ * @throws CompactionNeededError when step 3 finds the signal above threshold.
+ */
+export function contextRobustnessFilterForSettings(
+  settings: Settings,
+  options: ContextRobustnessFilterOptions = {},
+): CallModelInputFilter {
+  const budgetTokens = modelCallBudgetTokens(settings);
+  const isClientMode = resolveContextCompactionMode(settings) === "client";
+  const thresholdTokens = clientCompactionThresholdTokens(settings);
+  return ({ modelData }) => {
+    const elision = elideStaleScreenshotImages(modelData.input);
+    if (elision.elidedCount > 0) {
+      console.warn(
+        `per-call image history policy elided ${elision.elidedCount} older screenshot image(s), keeping the last ${Math.min(3, elision.imageCount)} full image(s)`,
+      );
+    }
+    let input = elision.items;
+    if (budgetTokens !== undefined) {
+      const guarded = enforceInputBudget(
+        input as unknown as Array<Record<string, unknown>>,
+        budgetTokens,
+      );
+      if (guarded.trimmed) {
+        console.warn(
+          `per-call budget guard trimmed ${guarded.droppedCount} oldest history item(s) to fit input budget (${budgetTokens} tokens); the over-budget model call was NOT sent`,
+        );
+        input = guarded.items as unknown as AgentInputItem[];
+      }
+    }
+    // The compaction signal is only checked in client mode, and only on request.
+    if (!isClientMode || !options.throwOnCompactionNeeded) {
+      return { ...modelData, input };
+    }
+    const reported = options.contextCompactionSignalTokens?.();
+    const hasReported = typeof reported === "number" && reported > 0;
+    const signalTokens = hasReported
+      ? reported
+      : estimateTokens(input as unknown as Array<Record<string, unknown>>);
+    if (signalTokens > thresholdTokens) {
+      throw new CompactionNeededError({
+        signalTokens,
+        thresholdTokens,
+        signalSource: hasReported ? "provider" : "estimate",
+      });
+    }
+    return { ...modelData, input };
+  };
+}
+/**
+ * Resolve the hard per-call input budget, in tokens. Returns `undefined` when
+ * the compaction mode is not "client" or the configured budget is not positive.
+ */
+function modelCallBudgetTokens(settings: Settings): number | undefined {
+  const isClientMode = resolveContextCompactionMode(settings) === "client";
+  if (!isClientMode) {
+    return undefined;
+  }
+  const budget = contextInputBudgetTokens(settings);
+  if (budget > 0) {
+    return budget;
+  }
+  return undefined;
+}
 /**
  * Compose a list of callModelInputFilters into one, applied left-to-right so
  * each sees the prior filter's output.
@@ -1674,13 +1748,18 @@ function composeCallModelInputFilters(filters: CallModelInputFilter[]): CallMode
  * The model-input filter applied before every model call. The computer_call
  * action/actions normalizer is ALWAYS on (the Azure endpoint 400s without it);
  * the provider-item-id strip is layered on top when the configured policy
- * selects it.
+ * selects it; the context-robustness guard then elides stale screenshots in
+ * every mode and applies hard budget trimming only on the client-compaction path.
  */
-export function callModelInputFilterForSettings(settings: Settings): CallModelInputFilter | undefined {
+export function callModelInputFilterForSettings(
+  settings: Settings,
+  options: ContextRobustnessFilterOptions = {},
+): CallModelInputFilter | undefined {
   const filters: CallModelInputFilter[] = [normalizeComputerCallsFilter];
   if (settings.openaiProviderItemIds === "strip") {
     filters.push(stripProviderItemIdsFilter);
   }
+  filters.push(contextRobustnessFilterForSettings(settings, options));
   return composeCallModelInputFilters(filters);
 }
@@ -1759,7 +1838,15 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
     // through the client during this run (it is inert for the provided session).
     const decoratedClient = withSandboxLifecycleHooks(resourceClient, ownedHooks, ownedHookContext);
     const ownedFilter = composeCallModelInputFilters(
-      [callModelInputFilterForSettings(settings), overrides.callModelInputFilter].filter(
+      [
+        callModelInputFilterForSettings(settings, {
+          throwOnCompactionNeeded: Boolean(overrides.contextCompactionSignalTokens),
+          ...(overrides.contextCompactionSignalTokens
+            ? { contextCompactionSignalTokens: overrides.contextCompactionSignalTokens }
+            : {}),
+        }),
+        overrides.callModelInputFilter,
+      ].filter(
         (f): f is CallModelInputFilter => Boolean(f),
       ),
     );
@@ -1806,23 +1893,31 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
     ?? (prepared.serializedRunStateForSandbox && client
       ? await restoredSandboxSessionState(await RunState.fromString(agent, prepared.serializedRunStateForSandbox), client)
       : undefined);
-  // Strip provider item ids first, then apply any per-turn filter (genesis
-  // title directive). Composed left-to-right so the directive lands on the
-  // already-id-stripped input. A callModelInputFilter only shapes the per-call
-  // model input, never the persisted run-state history.
+  // Apply the built-in per-call filters (computer-call normalization, optional
+  // provider-id stripping, image/budget guard), then any per-turn filter
+  // (genesis title directive). A callModelInputFilter only shapes the per-call
+  // model input; the SDK persists filtered clones into its session view, while
+  // OpenGeni's durable conversation truth is still reconciled explicitly below.
   const callModelInputFilter = composeCallModelInputFilters(
-    [callModelInputFilterForSettings(settings), overrides.callModelInputFilter].filter(
+    [
+      callModelInputFilterForSettings(settings, {
+        throwOnCompactionNeeded: Boolean(overrides.contextCompactionSignalTokens),
+        ...(overrides.contextCompactionSignalTokens
+          ? { contextCompactionSignalTokens: overrides.contextCompactionSignalTokens }
+          : {}),
+      }),
+      overrides.callModelInputFilter,
+    ].filter(
       (f): f is CallModelInputFilter => Boolean(f),
     ),
   );
   const runOptions: Parameters<typeof run>[2] = {
     stream: true,
     maxTurns: settings.agentMaxModelCallsPerTurn,
-    // Strip provider-assigned item ids from every model call (turn-start
-    // history replay AND mid-turn follow-ups) so requests never depend on the
-    // provider's server-side response store. A stored response can vanish
-    // between two calls of the same turn, failing the run with 400 "Item with
-    // id 'rs_…' not found"; with the ids gone the request is self-contained.
+    // Built-in per-call guard chain: normalize computer calls, optionally strip
+    // provider ids, elide stale screenshots in every mode, and trim to the input
+    // budget on the client-compaction path. This runs for turn-start replay AND
+    // every mid-turn follow-up.
     callModelInputFilter,
   };
   void settings.disableOpenaiTracing;

package/src/sandbox/display-stack.ts CHANGED Viewed

@@ -24,11 +24,17 @@ import { DESKTOP_STREAM_PORT } from "@opengeni/contracts";
 export { DESKTOP_STREAM_PORT };
 export const STREAM_PORT = DESKTOP_STREAM_PORT;
-// The whole-stack launch is bounded by the readiness gates inside the script
-// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS first-boot XFCE/dbus
-// + font-cache warm-up on a cold gVisor box. 60s gives headroom over the spike's
-// observed ~5-10s warm path without masking a genuine wedge.
-export const DISPLAY_STACK_TIMEOUT_MS = 60_000;
+// The whole-stack launch is bounded by the readiness gates inside the up-script
+// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS the PAINTABLE-FRAME
+// gate we append (up to ~30s of scrot probing) PLUS first-boot XFCE/dbus + font-cache
+// warm-up on a cold gVisor box. 90s gives headroom over the spike's observed ~5-10s
+// warm path AND the cold-box paint warm-up without masking a genuine wedge.
+export const DISPLAY_STACK_TIMEOUT_MS = 90_000;
+// PAINTABLE-FRAME gate: poll scrot up to this many times, this many seconds apart,
+// waiting for a non-empty frame before declaring the stack "up" (~30s worst case).
+const PAINT_PROBE_ATTEMPTS = 150;
+const PAINT_PROBE_INTERVAL_S = 0.2;
 /** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
  *  change is a full down -> up restart (a separate op). */
@@ -41,15 +47,25 @@ export type DesktopGeometry = {
 export const DEFAULT_DESKTOP_GEOMETRY: DesktopGeometry = { width: 1280, height: 800, dpi: 96 };
 /** Thrown when a stage of the launch script failed. exitCode 11/12/13 map to
- *  Xvfb / x11vnc / websockify respectively (the stage that died). Degradation is
- *  surfaced as a value to viewers by the caller; this error is for diagnostics. */
+ *  Xvfb / x11vnc / websockify respectively (the stage that died); 14 is the
+ *  PAINTABLE-FRAME gate (ports listening but scrot still yields an empty frame —
+ *  the display is up but not actually painting). Degradation is surfaced as a
+ *  value to viewers by the caller; this error is for diagnostics. */
 export class DisplayStackError extends Error {
   readonly exitCode: number;
-  readonly stage: "xvfb" | "x11vnc" | "websockify" | "unknown";
+  readonly stage: "xvfb" | "x11vnc" | "websockify" | "paint" | "unknown";
   constructor(exitCode: number, output: string) {
     const stage =
-      exitCode === 11 ? "xvfb" : exitCode === 12 ? "x11vnc" : exitCode === 13 ? "websockify" : "unknown";
+      exitCode === 11
+        ? "xvfb"
+        : exitCode === 12
+          ? "x11vnc"
+          : exitCode === 13
+            ? "websockify"
+            : exitCode === 14
+              ? "paint"
+              : "unknown";
     super(`desktop display stack failed at stage "${stage}" (exit ${exitCode})${output ? `:\n${output}` : ""}`);
     this.name = "DisplayStackError";
     this.exitCode = exitCode;
@@ -125,15 +141,41 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
   // flock -w bounds the wait so a wedged holder can't deadlock the caller; the
   // up-script itself ALSO takes the same lock (belt + braces) so this works even
   // against an older image that predates the wrapper.
-  return (
+  //
+  // PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
+  // only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
+  // LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
+  // machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
+  // ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
+  // scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame — which
+  // is exactly the empty screenshot that 400s the model and blanks the human viewer.
+  // We therefore chain a real scrot probe as the completion gate: after the up-script
+  // reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
+  // only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
+  // typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
+  // rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
+  // access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
+  // — an already-up display paints on the first probe). Lives in the runtime-built script
+  // (not the baked image up-script) so it ships with the worker/api, no image rebuild.
+  const bringUp =
     `if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
     `echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
     `else ` +
     `mkdir -p /tmp/opengeni-desktop && ` +
     `flock -w 45 /tmp/opengeni-desktop/up.outer.lock ` +
     `env ${env} opengeni-desktop-up; ` +
-    `fi`
-  );
+    `fi`;
+  const paintProbe =
+    `p=/tmp/opengeni-desktop/paint-probe.png; ` +
+    `for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
+    `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
+    `rm -f "$p"; ` +
+    // NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
+    // caller infers the outcome by string-matching the output — stdout is always captured.
+    `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
+    `sleep ${PAINT_PROBE_INTERVAL_S}; ` +
+    `done`;
+  return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;
 }
 function execResultOutput(result: ExecResultLike | string): string {
@@ -157,6 +199,13 @@ function execResultExitCode(result: ExecResultLike | string): number | null {
 // bare string), we infer success from the OPENGENI_DESKTOP_UP marker and infer
 // the failing stage from the stage-failure message the script prints to stderr.
 function inferExitFromOutput(output: string): number {
+  // Check the PAINTABLE-FRAME failure FIRST: on that path the up-script already
+  // printed OPENGENI_DESKTOP_UP (bring-up succeeded) and THEN the paint gate failed,
+  // so both markers are present — the NOT_PAINTING one is the authoritative outcome.
+  // (Modal is execCommand-only, so this string-inference path is the live one.)
+  if (/OPENGENI_DESKTOP_NOT_PAINTING/.test(output)) {
+    return 14;
+  }
   if (/OPENGENI_DESKTOP_UP\b/.test(output)) {
     return 0;
   }

package/src/sandbox-computer.ts CHANGED Viewed

@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
 const SCROLL_MAX_CLICKS = 15;
 // screenshot() never hands the model an empty image_url (the SDK turns "" into
 // `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
-// a zero-byte frame on the first scrot; bounded retries with a short pause let a
-// momentarily-unpainted-but-live display self-heal before we FAIL LOUD.
-const SCREENSHOT_MAX_ATTEMPTS = 3;
-const SCREENSHOT_RETRY_DELAY_MS = 400;
+// zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
+// + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
+// after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
+// So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
+// a short pause between tries, so that first post-cold / post-swap screenshot self-heals
+// as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
+// that is dead, not merely warming). ~800ms of retries (the prior 3×400ms) was far too
+// short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
+const SCREENSHOT_WARMUP_BUDGET_MS = 30_000;
+const SCREENSHOT_RETRY_DELAY_MS = 750;
 export type SandboxComputerOptions = {
   display?: string; // ":0"
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
   typeDelayMs?: number; // xdotool type --delay (default 12ms)
   readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
   screenshotTmpDir?: string; // "/tmp"
+  // How long screenshot() keeps retrying an empty (still-warming) frame before it
+  // FAILS LOUD, and the pause between tries. Defaults to the cold-boot warm-up budget;
+  // exposed mainly so tests can shrink it (a real caller wants the full budget).
+  screenshotWarmupBudgetMs?: number;
+  screenshotRetryDelayMs?: number;
 };
 // X keysym map for keypress(): model key names → xdotool keysyms.
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
   private readonly typeDelayMs: number;
   private readonly readOnly: boolean;
   private readonly tmp: string;
+  private readonly screenshotWarmupBudgetMs: number;
+  private readonly screenshotRetryDelayMs: number;
   constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
     this.session = session as unknown as ComputerSession;
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
     this.typeDelayMs = opts.typeDelayMs ?? 12;
     this.readOnly = opts.readOnly ?? false;
     this.tmp = opts.screenshotTmpDir ?? "/tmp";
+    this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
+    this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
   }
   /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
     // but momentarily not painting (XFCE/dbus still warming) recovers without
     // failing the turn.
     let lastError: unknown;
-    for (let attempt = 0; attempt < SCREENSHOT_MAX_ATTEMPTS; attempt++) {
+    const deadline = Date.now() + this.screenshotWarmupBudgetMs;
+    let attempt = 0;
+    // Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
+    // first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
+    // is a KNOWN transient during that warm-up — not a reason to fail the turn.
+    while (true) {
       if (attempt > 0) {
-        await new Promise((r) => setTimeout(r, SCREENSHOT_RETRY_DELAY_MS));
+        await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
       }
+      attempt++;
       const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
       try {
         await this.x(`scrot --pointer --overwrite ${f}`);
         const bytes = await this.readScreenshotBytes(f);
         if (bytes.length === 0) {
           // A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
-          // hand the model an empty image_url; throw on the final attempt.
+          // hand the model an empty image_url; throw once the budget is spent.
           throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
         }
         return Buffer.from(bytes).toString("base64");
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
         // screenshot result.
         await this.x(`rm -f ${f}`).catch(() => undefined);
       }
+      // Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
+      if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
+        break;
+      }
     }
-    // Exhausted retries: FAIL LOUD. A clear throw is the only acceptable outcome —
-    // returning "" here would surface to the model as an invalid empty image_url.
+    // Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
+    // outcome — returning "" here would surface to the model as an invalid empty
+    // image_url. Reaching here means the display was still dead after the whole warm-up
+    // budget (~30s by default), not merely warming, so a hard action failure is correct.
     if (lastError instanceof Error) {
       throw lastError;
     }