npm - @opengeni/runtime - Versions diffs - 0.2.3 → 0.3.1 - Mend

@opengeni/runtime 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/{chunk-KNW7AMQB.js → chunk-HGQ252FL.js} +251 -22
package/dist/chunk-HGQ252FL.js.map +1 -0
package/dist/index-CSGkld-v.d.ts +1801 -0
package/dist/index.d.ts +23 -3
package/dist/index.js +238 -39
package/dist/index.js.map +1 -1
package/dist/sandbox/index.d.ts +4 -1738
package/dist/sandbox/index.js +11 -1
package/package.json +3 -3
package/src/history-sanitizer.ts +35 -38
package/src/index.ts +133 -10
package/src/metrics.ts +5 -0
package/src/sandbox/display-stack.ts +69 -13
package/src/sandbox/index.ts +100 -13
package/src/sandbox/providers/modal.ts +225 -0
package/src/sandbox/routing/routing-session.ts +2 -2
package/src/sandbox/selfhosted/session.ts +21 -5
package/src/sandbox-computer.ts +214 -48
package/src/screenshot-error-card.ts +25 -0
package/dist/chunk-KNW7AMQB.js.map +0 -1

package/dist/sandbox/index.js CHANGED Viewed

@@ -63,6 +63,8 @@ import {
   isWorkspaceEscapeError,
   makeActiveBackendResolver,
   mintStreamToken,
+  modalSandboxAttributionEnvironment,
+  modalSandboxAttributionTags,
   negotiateCapabilities,
   negotiateSelfhostedCapabilities,
   offlineAgentError,
@@ -85,12 +87,15 @@ import {
   stopRecording,
   stripExecBanner,
   subjectFor,
+  sweepModalOrphanSandboxes,
+  tagModalSandbox,
   tearDownDisplayStack,
   tearDownTerminalServer,
+  terminateModalSandboxById,
   timeoutAgentError,
   timeoutControlResponse,
   verifyStreamToken
-} from "../chunk-KNW7AMQB.js";
+} from "../chunk-HGQ252FL.js";
 export {
   ActiveBackendUnresolvableError,
   CAPABILITY_DESCRIPTORS,
@@ -156,6 +161,8 @@ export {
   isWorkspaceEscapeError,
   makeActiveBackendResolver,
   mintStreamToken,
+  modalSandboxAttributionEnvironment,
+  modalSandboxAttributionTags,
   negotiateCapabilities,
   negotiateSelfhostedCapabilities,
   offlineAgentError,
@@ -178,8 +185,11 @@ export {
   stopRecording,
   stripExecBanner,
   subjectFor,
+  sweepModalOrphanSandboxes,
+  tagModalSandbox,
   tearDownDisplayStack,
   tearDownTerminalServer,
+  terminateModalSandboxById,
   timeoutAgentError,
   timeoutControlResponse,
   verifyStreamToken

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@opengeni/runtime",
-  "version": "0.2.3",
+  "version": "0.3.1",
   "type": "module",
   "main": "./dist/index.js",
   "module": "./dist/index.js",
@@ -29,8 +29,8 @@
   },
   "dependencies": {
     "@opengeni/agent-proto": "^0.2.1",
-    "@opengeni/config": "^0.2.3",
-    "@opengeni/contracts": "^0.5.0",
+    "@opengeni/config": "^0.2.5",
+    "@opengeni/contracts": "^0.7.0",
     "@openai/agents": "^0.11.6",
     "@openai/agents-extensions": "^0.11.6",
     "modal": "^0.7.4",

package/src/history-sanitizer.ts CHANGED Viewed

@@ -27,6 +27,8 @@
  * filtered, keeping the persisted audit trail intact.
  */
+import { SCREENSHOT_FAILURE_CARD_IMAGE_URL } from "./screenshot-error-card";
 /** A history item is any JSON object; we only inspect a few discriminator fields. */
 export type HistoryItem = Record<string, unknown>;
@@ -594,42 +596,35 @@ export function rewriteComputerCallsToActionsOnly(body: unknown): boolean {
 }
 /**
- * The 1×1 transparent PNG placeholder used by the SDK for tool-approval-rejection
- * screenshots (`TOOL_APPROVAL_REJECTION_SCREENSHOT_DATA_URL` in agents-core
- * `toolExecution.mjs`). We reuse the exact same constant as a backstop for the
- * action-timeout 400: when an action times out the SDK's catch sets output='' and
- * builds `{type:"computer_call_output",output:{type:"computer_screenshot",image_url:""}}`.
- * Azure rejects `image_url:""` with "400 Invalid input[N].output.image_url". This
- * placeholder is a valid data URI the provider accepts, so the turn continues and
- * the model receives the next real screenshot on its following step.
- */
-const EMPTY_IMAGE_URL_PLACEHOLDER =
-  "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
-/**
- * Backstop for the action-timeout 400: walk the `input` array of a serialized
- * Responses request body and replace any `computer_call_output` item whose
- * `output.image_url` is an empty string, null, undefined, or otherwise not a
- * non-empty string with the 1×1 transparent PNG placeholder data URI.
- *
- * WHY THIS IS NEEDED. When a computer ACTION (click/type/scroll/drag) times out
- * at the 15-second yield window `SandboxComputer.x()` throws `ComputerActionError`.
- * The agents-core SDK `toolExecution.mjs` catch block sets `output = ''` and then
- * builds the wire item:
- *
- *   `{type:"computer_call_output", output:{type:"computer_screenshot", image_url:""}}`
- *
- * Azure rejects the whole request with:
- *
- *   `400 Invalid 'input[N].output.image_url'. Expected a valid URL, but got a
- *    value with an invalid format.`
- *
- * Our screenshot() fail-loud guard (which throws on empty frames) only runs when
- * the SDK calls screenshot() on a SUCCESS path — not on this action-error catch
- * path that sets output='' directly. This wire-level rewrite is the only seam that
- * catches both paths regardless of how the empty image_url was produced. It runs
- * in the same `computerCallNormalizingFetch` wrapper, so a single parse/rewrite
- * pass covers both the action/actions-only rewrite and this placeholder injection.
+ * Backstop for the empty `computer_call_output` image_url: walk the `input` array of
+ * a serialized Responses request body and replace any `computer_call_output` item
+ * whose `output.image_url` is empty/missing with a LEGIBLE "screen capture failed"
+ * error card ({@link SCREENSHOT_FAILURE_CARD_IMAGE_URL}).
+ *
+ * WHY A CARD, NOT A BLANK. An empty `image_url` reaches this seam ONLY when the
+ * computer op genuinely FAILED to produce a screen: agents-core's `toolExecution.mjs`
+ * catch sets `output = ''` when the action OR the follow-up `computer.screenshot()`
+ * throws, building `{type:"computer_call_output",output:{type:"computer_screenshot",
+ * image_url:""}}`. Azure then rejects the whole request with
+ * `400 Invalid 'input[N].output.image_url'`. The previous fix substituted a 1×1
+ * TRANSPARENT placeholder to dodge the 400 — but that reaches the model as a
+ * plausible BLANK DESKTOP it confidently reports ("the screen appears blank/empty"),
+ * turning a hard capture FAILURE into a silent, wrong observation. That is the worst
+ * failure mode for computer use, and it is exactly what the 0.1.3 TCC-denied incident
+ * produced. Substituting a legible error card instead makes the failure REACH THE
+ * MODEL as an error (the only channel the hosted `computer_use_preview` protocol has
+ * is the image), so the model stops and tells the user rather than hallucinating.
+ *
+ * WHY THIS IS SAFE (empty = failure, never an intentional blank). Post-af289e3 the
+ * intentional-blank cases carry a NON-empty data URI already: agents-core's
+ * tool-approval-rejection screenshot is its own non-empty 1×1 placeholder, and the
+ * SandboxComputer action-timeout now warn+returns to a REAL screenshot rather than an
+ * empty output. So an EMPTY image_url at this seam is unambiguously a capture/interact
+ * FAILURE — the error card is the correct substitution for every empty case, and this
+ * function never touches a non-empty (real screenshot OR intentional blank) output.
+ *
+ * The failure REASON (permission denied / null image / timeout / display down) is not
+ * on the card; it is logged worker-side by `NativeDesktopComputer.screenshot()`.
  *
  * Mutates `body` in place (the caller has already JSON.parsed a private copy).
  * Returns `true` iff at least one image_url was replaced.
@@ -657,9 +652,11 @@ export function rewriteEmptyComputerCallOutputImageUrls(body: unknown): boolean
     }
     const out = output as Record<string, unknown>;
     const imageUrl = out.image_url;
-    // Replace the image_url when it is not a non-empty string (covers: "", null, undefined, missing).
+    // Replace the image_url when it is not a non-empty string (covers: "", null,
+    // undefined, missing) — an empty output is always a genuine capture failure, so
+    // it becomes the legible error card, never a silent blank.
     if (typeof imageUrl !== "string" || imageUrl.length === 0) {
-      out.image_url = EMPTY_IMAGE_URL_PLACEHOLDER;
+      out.image_url = SCREENSHOT_FAILURE_CARD_IMAGE_URL;
       changed = true;
     }
   }

package/src/index.ts CHANGED Viewed

@@ -27,6 +27,7 @@ import {
   setDefaultOpenAIClient,
   setDefaultOpenAIKey,
   setOpenAIResponsesTransport,
+  setTracingDisabled,
   // Hosted web_search tool factory. Re-exported from @openai/agents-openai via
   // `export * from '@openai/agents-openai'` in @openai/agents' index (0.11.6);
   // it returns a { type: 'hosted_tool', providerData: { type: 'web_search' } }
@@ -101,6 +102,9 @@ import {
   setSelfhostedApplyDiff,
 } from "./sandbox";
 import { computerUse, type ComputerToolMode } from "./sandbox-computer";
+import type { RuntimeMetricsHooks } from "./metrics";
+export type { RuntimeMetricsHooks } from "./metrics";
 // P4.3 computer-use surface (the agent's :0 driver). Re-exported from the barrel
 // so callers (the worker, live proofs) reach SandboxComputer/ComputerUseCapability
@@ -253,6 +257,12 @@ export type SandboxFileDownload = {
   sizeBytes?: number;
 };
+// Module-level registry of the optional metrics hooks. `null` means no hooks are
+// installed; recordModelCallMetric reads it when reporting a model call.
+let runtimeMetricsHooks: RuntimeMetricsHooks | null = null;
+/**
+ * Installs the runtime metrics hooks, replacing whatever was installed before.
+ * Passing `null` or `undefined` clears the registry.
+ *
+ * NOTE: the `configure` callback built by createProductionAgentRuntime forwards
+ * `overrides.metrics` here on every call, so configuring a runtime that was created
+ * without `metrics` also clears hooks installed through a direct call.
+ *
+ * @param hooks - Hooks to install, or `null`/`undefined` to remove the current ones.
+ */
+export function configureRuntimeMetricsHooks(hooks: RuntimeMetricsHooks | null | undefined): void {
+  runtimeMetricsHooks = hooks ?? null;
+}
 export type OpenGeniRuntime = {
   configure: (settings: Settings) => void;
   // Multi-provider per-turn model routing. Returns the resolved provider, its
@@ -270,11 +280,15 @@ export type OpenGeniRuntime = {
 export type ProductionRuntimeOverrides = {
   model?: Model;
   sandboxClient?: unknown;
+  metrics?: RuntimeMetricsHooks;
 };
 export function createProductionAgentRuntime(overrides: ProductionRuntimeOverrides = {}): OpenGeniRuntime {
   return {
-    configure: configureOpenAI,
+    configure: (settings) => {
+      configureRuntimeMetricsHooks(overrides.metrics);
+      configureOpenAI(settings);
+    },
     // A test/override model shadows the registry routing entirely (the scripted
     // model used in worker tests is not in any provider's allow-list), so when
     // one is supplied resolveTurnModel reports "no resolution" and the caller
@@ -301,7 +315,7 @@ export function createProductionAgentRuntime(overrides: ProductionRuntimeOverrid
  * the OpenAI-platform path has only a key (the SDK default client is used via
  * setDefaultOpenAIKey there); the caller then constructs a key-only client.
  */
-export function buildOpenAIClientFromSettings(settings: Settings): OpenAI {
+export function buildOpenAIClientFromSettings(settings: Settings, providerId: string = settings.openaiProvider): OpenAI {
   if (settings.openaiProvider === "azure") {
     const baseURL = settings.azureOpenaiBaseUrl ?? azureDeploymentBaseUrl(settings);
     const apiKey = settings.azureOpenaiApiKey ?? settings.azureOpenaiAdToken ?? "azure-ad-token";
@@ -318,13 +332,14 @@ export function buildOpenAIClientFromSettings(settings: Settings): OpenAI {
       // seam — below the SDK responses converter, which always re-synthesizes BOTH
       // `action` and `actions` (rejected 400 "exactly one of action or actions").
       // See computerCallNormalizingFetch / rewriteComputerCallsToActionsOnly.
-      fetch: computerCallNormalizingFetch(globalThis.fetch),
+      fetch: computerCallNormalizingFetch(instrumentedModelFetch(providerId, globalThis.fetch)),
     });
   }
   return new OpenAI({
     apiKey: settings.openaiApiKey ?? process.env.OPENAI_API_KEY,
     ...(settings.openaiBaseUrl ? { baseURL: settings.openaiBaseUrl } : {}),
     maxRetries: settings.openaiMaxRetries,
+    fetch: instrumentedModelFetch(providerId, globalThis.fetch),
   });
 }
@@ -346,7 +361,7 @@ export function buildProviderClient(provider: ResolvedModelProvider, settings: S
     return cached;
   }
   const client = provider.builtin
-    ? buildOpenAIClientFromSettings(settings)
+    ? buildOpenAIClientFromSettings(settings, provider.id)
     : provider.kind === "codex-subscription"
       // Codex subscription: the static apiKey is a placeholder — the real per-request
       // bearer + ChatGPT-Account-ID, the /responses->/codex/responses rewrite, and the
@@ -358,7 +373,7 @@ export function buildProviderClient(provider: ResolvedModelProvider, settings: S
         apiKey: provider.apiKey ?? "codex-subscription",
         ...(provider.baseUrl ? { baseURL: provider.baseUrl } : {}),
         maxRetries: settings.openaiMaxRetries,
-        fetch: codexSubscriptionFetch(globalThis.fetch),
+        fetch: codexSubscriptionFetch(instrumentedModelFetch(provider.id, globalThis.fetch)),
       })
     // ResolvedModelProvider.apiKey is already the resolved key (configuredProviders
     // ran resolveProviderApiKey at config time, collapsing apiKey/apiKeyEnv), so it
@@ -369,6 +384,7 @@ export function buildProviderClient(provider: ResolvedModelProvider, settings: S
       maxRetries: settings.openaiMaxRetries,
       ...(provider.defaultQuery ? { defaultQuery: provider.defaultQuery } : {}),
       ...(provider.defaultHeaders ? { defaultHeaders: provider.defaultHeaders } : {}),
+      fetch: instrumentedModelFetch(provider.id, globalThis.fetch),
     });
   providerClientCache.set(provider.id, client);
   return client;
@@ -441,7 +457,7 @@ export class MultiProviderModelProvider implements ModelProvider {
   async getModel(modelName?: string): Promise<Model> {
     if (modelName) {
-      const resolved = resolveTurnModel(this.settings, modelName);
+      const resolved = resolveTurnModel(settingsForRunScopedModelResolution(this.settings, modelName), modelName);
       if (resolved) {
         // Fail-loud floor (defense in depth): a `codex/<slug>` id must only ever
         // resolve through the synthetic codex-subscription provider (which installs
@@ -479,6 +495,27 @@ export class MultiProviderModelProvider implements ModelProvider {
   }
 }
+/**
+ * Chooses the settings object that `modelName` should be resolved against.
+ *
+ * `settings` is returned unchanged unless ALL of the following hold:
+ *   - `modelName` equals `settings.openaiModel`,
+ *   - the comma-separated `settings.openaiAllowedModels` contains at least one entry
+ *     other than `modelName`, and
+ *   - `modelName` is not itself an entry of that allow-list.
+ * In that case a shallow copy is returned whose `openaiModel` is the first allow-list
+ * entry that differs from `modelName`. The input object is never mutated.
+ *
+ * @param settings - Settings in effect for the run.
+ * @param modelName - Model id being resolved for this turn.
+ * @returns `settings` itself, or a shallow copy with `openaiModel` replaced.
+ */
+function settingsForRunScopedModelResolution(settings: Settings, modelName: string): Settings {
+  // Only a model name that matches the configured openaiModel can need the override.
+  if (modelName !== settings.openaiModel) {
+    return settings;
+  }
+  const builtinAllowed = splitOpenaiAllowedModels(settings.openaiAllowedModels);
+  // First allow-list entry that is not the requested model; undefined when none exists.
+  const fallbackBuiltin = builtinAllowed.find((id) => id !== modelName);
+  if (!fallbackBuiltin) {
+    return settings;
+  }
+  // The worker sets runSettings.openaiModel to the turn's model. For namespaced
+  // registry ids configuredModels filters the built-in entry out, but a unique
+  // bare registry id would otherwise be claimed by the built-in only because of
+  // that per-turn override. Resolve the run-scoped router against the deployment
+  // allow-list head instead; real built-in models stay in the allow-list.
+  // (When modelName is absent from the list, fallbackBuiltin is the list's first entry.)
+  return builtinAllowed.includes(modelName) ? settings : { ...settings, openaiModel: fallbackBuiltin };
+}
+/**
+ * Splits a comma-separated model list into its entries, trimming surrounding
+ * whitespace from each one and dropping entries that are empty after trimming.
+ * The original order is preserved.
+ *
+ * @param value - Comma-separated list, e.g. the `openaiAllowedModels` setting.
+ * @returns The trimmed, non-empty entries in input order.
+ */
+function splitOpenaiAllowedModels(value: string): string[] {
+  return value.split(",").map((item) => item.trim()).filter(Boolean);
+}
 /**
  * A `codex/<slug>` turn reached the model router but the workspace has no active
  * Codex subscription connected (the worker overlay never injected the synthetic
@@ -500,6 +537,7 @@ export class CodexSubscriptionUnavailableError extends Error {
 export function configureOpenAI(settings: Settings): void {
   setOpenAIResponsesTransport(settings.openaiResponsesTransport);
+  setTracingDisabled(settings.disableOpenaiTracing || !settings.observabilityOtlpEndpoint);
   // Install the registry-aware router as the process default model provider so a
   // model name re-resolved on the SandboxAgent/Modal path (where a Model instance
   // does not survive) routes to its provider instead of the built-in client.
@@ -519,6 +557,51 @@ export function configureOpenAI(settings: Settings): void {
   setDefaultModelProvider(router);
 }
+/**
+ * Wraps a fetch implementation so that model-call requests are timed and reported
+ * through recordModelCallMetric.
+ *
+ * Requests whose URL is not recognised by isModelCallFetch are forwarded to `inner`
+ * untouched and are not measured. For recognised requests the elapsed time runs from
+ * just before `inner` is invoked until its promise settles; the outcome is
+ * "completed" when `response.ok` is true and "failed" otherwise, including when
+ * `inner` rejects. A rejection is rethrown unchanged after it has been recorded.
+ *
+ * @param provider - Label passed through to the metric as `provider`.
+ * @param inner - The fetch implementation to delegate to.
+ * @returns A fetch-compatible function with the same request/response behaviour as `inner`.
+ */
+function instrumentedModelFetch(provider: string, inner: typeof fetch): typeof fetch {
+  return (async (input: Parameters<typeof fetch>[0], init?: Parameters<typeof fetch>[1]) => {
+    // Non-model traffic passes straight through without timing.
+    if (!isModelCallFetch(input)) {
+      return await inner(input, init);
+    }
+    const started = performance.now();
+    try {
+      const response = await inner(input, init);
+      // A response with a non-ok status is reported as a failed call but still returned.
+      recordModelCallMetric(provider, response.ok ? "completed" : "failed", started);
+      return response;
+    } catch (error) {
+      // A rejected fetch is a failed call; record it and propagate the error.
+      recordModelCallMetric(provider, "failed", started);
+      throw error;
+    }
+  }) as typeof fetch;
+}
+/**
+ * Reports whether a fetch target looks like a model-call endpoint, i.e. its path
+ * ends in `/responses`, `/chat/completions` or `/codex/responses`.
+ *
+ * @param input - The first fetch argument: a string, a `URL`, or an object exposing a
+ *   string `url` property (presumably a `Request` — confirm against callers).
+ * @returns `true` for a recognised model-call path; `false` otherwise, including when
+ *   no non-empty URL string can be extracted from `input`.
+ */
+function isModelCallFetch(input: Parameters<typeof fetch>[0]): boolean {
+  // Normalise the three accepted input shapes to a raw URL string.
+  const rawUrl = typeof input === "string"
+    ? input
+    : input instanceof URL
+      ? input.toString()
+      : (input as { url?: unknown }).url;
+  if (typeof rawUrl !== "string" || rawUrl.length === 0) {
+    return false;
+  }
+  try {
+    // The placeholder base lets relative URLs parse; only the pathname is inspected,
+    // so any query string or fragment is ignored.
+    const pathname = new URL(rawUrl, "http://opengeni.local").pathname;
+    // Any path ending in "/codex/responses" also ends in "/responses", so the third
+    // test is already covered by the first.
+    return pathname.endsWith("/responses")
+      || pathname.endsWith("/chat/completions")
+      || pathname.endsWith("/codex/responses");
+  } catch {
+    // URL parsing failed: fall back to matching the same suffixes on the raw string,
+    // allowing a trailing query string.
+    return /\/(?:codex\/)?responses(?:\?|$)|\/chat\/completions(?:\?|$)/.test(rawUrl);
+  }
+}
+/**
+ * Reports one finished model call to the installed `onModelCall` hook, if there is one.
+ *
+ * @param provider - Provider label to report.
+ * @param outcome - Whether the call is counted as "completed" or "failed".
+ * @param started - `performance.now()` reading (milliseconds) taken when the call began.
+ */
+function recordModelCallMetric(provider: string, outcome: "completed" | "failed", started: number): void {
+  // Convert elapsed milliseconds to seconds, clamped so the value is never negative.
+  const durationSeconds = Math.max(0, (performance.now() - started) / 1000);
+  try {
+    // No-op when no hooks are installed or the hook set has no onModelCall.
+    runtimeMetricsHooks?.onModelCall?.({ provider, outcome, durationSeconds });
+  } catch {
+    // Metrics emission must never affect a model call.
+  }
+}
 /**
  * Run the compaction summarizer as one plain, tool-less, non-streaming model
  * call against the resolved provider. `input` is the active history plus
@@ -711,6 +794,14 @@ export type BuildAgentOptions = {
   // restyle the persona but never drop the goal-loop contract or environment
   // block.
   instructionsTemplate?: string;
+  // Per-SESSION persona/system instructions (the per-agent-type prompt lever an
+  // embedding host supplies at session create). Composed AFTER the workspace
+  // instructionsTemplate + the non-bypassable CORE, so it refines the workspace
+  // persona for this one session without dropping the goal-loop/environment
+  // contract. Rides the SAME instructions channel (system-level) — NEVER a user/
+  // timeline message. Omitted ⇒ the composed instructions are byte-identical to
+  // a workspace-only persona.
+  sessionInstructions?: string;
   // Skills delivered by enabled capability packs. They join the bundled
   // skills in the sandbox skill index (mounted under .agents/) so
   // skills/<name> references resolve like any other indexed skill.
@@ -793,6 +884,27 @@ export function composeAgentInstructions(template: string, workspaceEnvironment?
   return core ? `${template} ${core}` : template;
 }
+/**
+ * Appends the per-session persona instructions to the already-composed
+ * (workspace + CORE) instructions, joined by " " — exactly the join used
+ * throughout the persona composition. The session slice is intentionally LAST
+ * (session-specific refinement of the workspace persona). An absent/blank value
+ * is a no-op that returns the composed string byte-for-byte.
+ *
+ * The session value is trimmed before it is appended, so leading and trailing
+ * whitespace in `sessionInstructions` never reaches the result.
+ *
+ * @param composed - The instructions composed so far.
+ * @param sessionInstructions - Optional per-session instructions to append.
+ * @returns `composed` unchanged, or `composed`, a single space, and the trimmed
+ *   session instructions.
+ */
+export function appendSessionInstructions(composed: string, sessionInstructions?: string): string {
+  const trimmed = sessionInstructions?.trim();
+  // An undefined or whitespace-only value yields a falsy `trimmed` and is skipped.
+  return trimmed ? `${composed} ${trimmed}` : composed;
+}
+/**
+ * Appends the one-shot genesis title directive (genesis turn only), joined by
+ * " " and always LAST so a white-label persona template or a per-session
+ * instruction can't drop it. A no-op when the hint is absent.
+ *
+ * @param instructions - The instructions composed so far.
+ * @param genesisTitleHint - When truthy, the directive is appended; when falsy or
+ *   omitted, `instructions` is returned unchanged.
+ * @returns `instructions`, optionally followed by a space and GENESIS_TITLE_DIRECTIVE.
+ */
+export function appendGenesisTitleDirective(instructions: string, genesisTitleHint?: boolean): string {
+  return genesisTitleHint ? `${instructions} ${GENESIS_TITLE_DIRECTIVE}` : instructions;
+}
 const agentFileDownloads = new WeakMap<object, SandboxFileDownload[]>();
 const agentRepositoryCloneHooks = new WeakMap<object, SandboxLifecycleHook[]>();
 // TOKEN-BROKER (B1): the per-turn git token seed, stashed alongside the agent's
@@ -837,9 +949,21 @@ export function buildOpenGeniAgent(settings: Settings, resources: ResourceRef[],
     // ownership + workspace-environment block) at the {{core}} marker, or
     // appends it when the template omits the marker. With the default template
     // and no environment this is byte-identical to the historical preamble.
-    instructions: options.genesisTitleHint
-      ? `${composeAgentInstructions(options.instructionsTemplate ?? settings.agentInstructionsTemplate, options.workspaceEnvironment)} ${GENESIS_TITLE_DIRECTIVE}`
-      : composeAgentInstructions(options.instructionsTemplate ?? settings.agentInstructionsTemplate, options.workspaceEnvironment),
+    // Persona composition order (all one system-level instructions string):
+    //   1. workspace instructionsTemplate (or deployment default) with the
+    //      non-bypassable CORE substituted at {{core}} — composeAgentInstructions,
+    //   2. + the per-session persona instructions (session-specific, LAST so it
+    //      refines the workspace persona),
+    //   3. + the one-shot genesis title directive (genesis turn only).
+    // With no session instructions and no genesis hint this is byte-identical to
+    // the historical composed instructions.
+    instructions: appendGenesisTitleDirective(
+      appendSessionInstructions(
+        composeAgentInstructions(options.instructionsTemplate ?? settings.agentInstructionsTemplate, options.workspaceEnvironment),
+        options.sessionInstructions,
+      ),
+      options.genesisTitleHint,
+    ),
     modelSettings: {
       reasoning: { effort: options.reasoningEffort ?? settings.openaiReasoningEffort, summary: "detailed" },
       // Server-side compaction (OpenAI platform) requires store=false: the
@@ -1920,7 +2044,6 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
     // every mid-turn follow-up.
     callModelInputFilter,
   };
-  void settings.disableOpenaiTracing;
   if (client) {
     runOptions.sandbox = {
       client,

package/src/metrics.ts ADDED Viewed

@@ -0,0 +1,5 @@
+/**
+ * Optional callbacks a host can supply to receive runtime metrics. Every hook is
+ * optional; an omitted hook means that event is simply not reported.
+ */
+export type RuntimeMetricsHooks = {
+  // One model call finished: the provider label, whether it counted as completed or
+  // failed, and how long it took in seconds.
+  onModelCall?: (input: { provider: string; outcome: "completed" | "failed"; durationSeconds: number }) => void;
+  // Same shape keyed by sandbox backend. NOTE(review): presumably reported once per
+  // sandbox creation attempt — the call site is not in this file; confirm there.
+  onSandboxCreate?: (input: { backend: string; outcome: "completed" | "failed"; durationSeconds: number }) => void;
+  // Carries only the backend name. NOTE(review): presumably fired when waiting for a
+  // sandbox to warm up times out — confirm against the call site.
+  onSandboxWarmingTimeout?: (input: { backend: string }) => void;
+};

package/src/sandbox/display-stack.ts CHANGED Viewed

@@ -32,10 +32,50 @@ export const STREAM_PORT = DESKTOP_STREAM_PORT;
 export const DISPLAY_STACK_TIMEOUT_MS = 90_000;
 // PAINTABLE-FRAME gate: poll scrot up to this many times, this many seconds apart,
-// waiting for a non-empty frame before declaring the stack "up" (~30s worst case).
+// waiting for an actually-PAINTED frame before declaring the stack "up" (~30s worst case).
 const PAINT_PROBE_ATTEMPTS = 150;
 const PAINT_PROBE_INTERVAL_S = 0.2;
+// The paint FLOOR (bytes): a scrot at/above this size is a real painted desktop; below
+// it, the root is still unpainted and the frame would read as "blank" to the model.
+//
+// WHY A SIZE FLOOR, NOT NON-EMPTINESS (the bug this fixes): the old gate only checked
+// `[ -s frame.png ]` (non-empty). But an UNPAINTED root is never zero-byte — a fresh
+// Xvfb draws either the `-retro` weave stipple or (with `-retro` dropped) solid black,
+// and scrot happily encodes that as a small-but-non-empty PNG. So the old gate passed
+// the instant the VNC ports bound — MEASURED at ~1.4s (fast runc host) to several
+// seconds (cold gVisor) BEFORE xfdesktop finishes its first wallpaper paint — handing
+// the model the pre-paint frame. That pre-paint frame is exactly the "blank/black"
+// screenshot that 400s the model and blanks the human viewer.
+//
+// The sizes are unambiguous and were measured on the canonical desktop image (1280x800)
+// under runc — both the current staging image and a fresh local build:
+//   painted XFCE desktop (blue-gradient wallpaper + panel + icons): ~210-222 KB
+//   `-retro` stipple root (unpainted, current image):                ~17 KB
+//   solid-black root (unpainted, after we drop `-retro`):            ~13.5 KB
+// 60 KB sits ~3.5x above every unpainted state and ~3.5x below a real paint — a wide,
+// unambiguous margin. It holds against BOTH the currently-deployed `-retro` image and
+// the `-retro`-dropped image this change ships, so the runtime gate is correct before
+// AND after the image rebuild lands. (Assumes the default ~1280x800 geometry; a larger
+// framebuffer only scales the painted frame further above the floor.)
+const PAINT_MIN_BYTES = 60_000;
+// SETTLE gate (the gVisor staged-paint fix): crossing the 60 KB floor is necessary but
+// NOT sufficient. On a fast runc host the paint is atomic (black 13.5 KB -> full 209 KB
+// in one step, panel + icons included). On a STONE-COLD gVisor Modal box it is STAGED:
+// the wallpaper gradient paints and crosses 60 KB a beat BEFORE xfdesktop draws the
+// panel / launcher icons / logo. A screenshot in that window shows a bare teal wallpaper
+// with no panel — which the model correctly reports as "graphical, but the desktop
+// hasn't fully loaded" (VERIFIED live on staging: a cold-box turn's first agent
+// screenshot caught exactly this). So the gate additionally waits for the frame to
+// SETTLE: two consecutive probes both above the floor whose byte-sizes agree within
+// PAINT_SETTLE_DELTA_BYTES. A still-painting desktop grows between probes; a fully
+// rendered, static one is byte-stable (scrot -o omits the cursor, and the clock is
+// minute-precision, so consecutive captures of a settled desktop are near-identical).
+// This makes ensureDisplayStack block until the FULL desktop is up, so the turn's first
+// screenshot — which runs AFTER this gate — sees the panel, not a bare wallpaper.
+const PAINT_SETTLE_DELTA_BYTES = 2_000;
 /** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
  *  change is a full down -> up restart (a separate op). */
 export type DesktopGeometry = {
@@ -145,18 +185,25 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
   // PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
   // only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
   // LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
-  // machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
-  // ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
-  // scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame — which
-  // is exactly the empty screenshot that 400s the model and blanks the human viewer.
+  // machine→sandbox swap-recovery turn always hits one), Xvfb answers and the VNC ports
+  // bind ~1.4s (fast host) to several seconds BEFORE xfdesktop finishes its first
+  // wallpaper paint. In that window a scrot yields a small UNPAINTED frame (the -retro
+  // stipple or a solid-black root) — never zero-byte — which is exactly the "blank/black"
+  // screenshot that 400s the model and blanks the human viewer. (VERIFIED locally: the
+  // real xfdesktop backdrop window maps at full 1280x800 the whole time; the render is
+  // never structurally broken — it is purely this pre-paint capture race.)
+  //
   // We therefore chain a real scrot probe as the completion gate: after the up-script
-  // reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
-  // only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
-  // typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
-  // rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
+  // reports success, poll scrot until it produces an actually-PAINTED frame — a PNG at or
+  // above PAINT_MIN_BYTES, not merely NON-EMPTY (the old `[ -s ]` check passed on the
+  // ~17 KB pre-paint stipple immediately; that WAS the bug) — bounded ~30s, and only THEN
+  // let the command exit 0. If it never paints we exit 14 so the caller sees a typed
+  // DisplayStackError("paint") — an HONEST failure the worker can degrade + log, rather
+  // than a false "up" that hands the model an unpainted image. `-ac` on Xvfb disables
   // access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
   // — an already-up display paints on the first probe). Lives in the runtime-built script
-  // (not the baked image up-script) so it ships with the worker/api, no image rebuild.
+  // (not the baked image up-script) so it ships with the worker/api, no image rebuild —
+  // and its size floor holds against the currently-deployed image too.
   const bringUp =
     `if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
     `echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
@@ -166,13 +213,22 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
     `env ${env} opengeni-desktop-up; ` +
     `fi`;
   const paintProbe =
-    `p=/tmp/opengeni-desktop/paint-probe.png; ` +
+    `p=/tmp/opengeni-desktop/paint-probe.png; prev=0; ` +
     `for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
-    `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
+    // Capture, then measure the PNG byte-size. `wc -c < "$p"` yields a bare integer; a
+    // failed scrot leaves sz=0. A frame at/above PAINT_MIN_BYTES is a real painted desktop.
+    `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1; then sz=$(wc -c < "$p" 2>/dev/null || echo 0); else sz=0; fi; ` +
     `rm -f "$p"; ` +
+    // SETTLE: accept only when THIS probe AND the PREVIOUS one are both above the floor
+    // and their sizes agree within PAINT_SETTLE_DELTA_BYTES — i.e., the paint has stopped
+    // growing (the full desktop, panel + icons included, is up), not merely crossed the
+    // floor mid-paint on a staged gVisor boot. ($sz/$prev/$d are bare shell — no ${}
+    // braces — so JS leaves them for bash; ${PAINT_*} ARE JS constants and interpolate.)
+    `if [ "$sz" -ge ${PAINT_MIN_BYTES} ] && [ "$prev" -ge ${PAINT_MIN_BYTES} ]; then d=$((sz-prev)); [ "$d" -lt 0 ] && d=$((0-d)); [ "$d" -le ${PAINT_SETTLE_DELTA_BYTES} ] && break; fi; ` +
+    `prev=$sz; ` +
     // NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
     // caller infers the outcome by string-matching the output — stdout is always captured.
-    `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
+    `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot below ${PAINT_MIN_BYTES}B or unsettled after warmup (last=$sz)"; exit 14; fi; ` +
     `sleep ${PAINT_PROBE_INTERVAL_S}; ` +
     `done`;
   return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;