npm - @opengeni/runtime - Versions diffs - 0.2.1 → 0.2.3 - Mend

@opengeni/runtime 0.2.1 → 0.2.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-2PO56VAL.js → chunk-KNW7AMQB.js} +11 -4
package/dist/chunk-KNW7AMQB.js.map +1 -0
package/dist/index.d.ts +113 -177
package/dist/index.js +371 -171
package/dist/index.js.map +1 -1
package/dist/sandbox/index.d.ts +6 -4
package/dist/sandbox/index.js +1 -1
package/package.json +5 -5
package/src/context-compaction.ts +217 -348
package/src/image-history.ts +149 -0
package/src/index.ts +184 -60
package/src/sandbox/display-stack.ts +61 -12
package/src/sandbox-computer.ts +90 -18
package/dist/chunk-2PO56VAL.js.map +0 -1

package/src/sandbox/display-stack.ts CHANGED

@@ -24,11 +24,17 @@ import { DESKTOP_STREAM_PORT } from "@opengeni/contracts";
 export { DESKTOP_STREAM_PORT };
 export const STREAM_PORT = DESKTOP_STREAM_PORT;
-// The whole-stack launch is bounded by the readiness gates inside the script
-// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS first-boot XFCE/dbus
-// + font-cache warm-up on a cold gVisor box. 60s gives headroom over the spike's
-// observed ~5-10s warm path without masking a genuine wedge.
-export const DISPLAY_STACK_TIMEOUT_MS = 60_000;
+// The whole-stack launch is bounded by the readiness gates inside the up-script
+// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS the PAINTABLE-FRAME
+// gate we append (up to ~30s of scrot probing) PLUS first-boot XFCE/dbus + font-cache
+// warm-up on a cold gVisor box. 90s gives headroom over the spike's observed ~5-10s
+// warm path AND the cold-box paint warm-up without masking a genuine wedge.
+export const DISPLAY_STACK_TIMEOUT_MS = 90_000;
+// PAINTABLE-FRAME gate: poll scrot up to this many times, this many seconds apart,
+// waiting for a non-empty frame before declaring the stack "up" (~30s worst case).
+const PAINT_PROBE_ATTEMPTS = 150;
+const PAINT_PROBE_INTERVAL_S = 0.2;
 /** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
  *  change is a full down -> up restart (a separate op). */
@@ -41,15 +47,25 @@ export type DesktopGeometry = {
 export const DEFAULT_DESKTOP_GEOMETRY: DesktopGeometry = { width: 1280, height: 800, dpi: 96 };
 /** Thrown when a stage of the launch script failed. exitCode 11/12/13 map to
- *  Xvfb / x11vnc / websockify respectively (the stage that died). Degradation is
- *  surfaced as a value to viewers by the caller; this error is for diagnostics. */
+ *  Xvfb / x11vnc / websockify respectively (the stage that died); 14 is the
+ *  PAINTABLE-FRAME gate (ports listening but scrot still yields an empty frame —
+ *  the display is up but not actually painting). Degradation is surfaced as a
+ *  value to viewers by the caller; this error is for diagnostics. */
 export class DisplayStackError extends Error {
   readonly exitCode: number;
-  readonly stage: "xvfb" | "x11vnc" | "websockify" | "unknown";
+  readonly stage: "xvfb" | "x11vnc" | "websockify" | "paint" | "unknown";
   constructor(exitCode: number, output: string) {
     const stage =
-      exitCode === 11 ? "xvfb" : exitCode === 12 ? "x11vnc" : exitCode === 13 ? "websockify" : "unknown";
+      exitCode === 11
+        ? "xvfb"
+        : exitCode === 12
+          ? "x11vnc"
+          : exitCode === 13
+            ? "websockify"
+            : exitCode === 14
+              ? "paint"
+              : "unknown";
     super(`desktop display stack failed at stage "${stage}" (exit ${exitCode})${output ? `:\n${output}` : ""}`);
     this.name = "DisplayStackError";
     this.exitCode = exitCode;
@@ -125,15 +141,41 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
   // flock -w bounds the wait so a wedged holder can't deadlock the caller; the
   // up-script itself ALSO takes the same lock (belt + braces) so this works even
   // against an older image that predates the wrapper.
-  return (
+  //
+  // PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
+  // only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
+  // LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
+  // machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
+  // ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
+  // scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame — which
+  // is exactly the empty screenshot that 400s the model and blanks the human viewer.
+  // We therefore chain a real scrot probe as the completion gate: after the up-script
+  // reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
+  // only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
+  // typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
+  // rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
+  // access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
+  // — an already-up display paints on the first probe). Lives in the runtime-built script
+  // (not the baked image up-script) so it ships with the worker/api, no image rebuild.
+  const bringUp =
     `if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
     `echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
     `else ` +
     `mkdir -p /tmp/opengeni-desktop && ` +
     `flock -w 45 /tmp/opengeni-desktop/up.outer.lock ` +
     `env ${env} opengeni-desktop-up; ` +
-    `fi`
-  );
+    `fi`;
+  const paintProbe =
+    `p=/tmp/opengeni-desktop/paint-probe.png; ` +
+    `for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
+    `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
+    `rm -f "$p"; ` +
+    // NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
+    // caller infers the outcome by string-matching the output — stdout is always captured.
+    `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
+    `sleep ${PAINT_PROBE_INTERVAL_S}; ` +
+    `done`;
+  return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;
 }
 function execResultOutput(result: ExecResultLike | string): string {
@@ -157,6 +199,13 @@ function execResultExitCode(result: ExecResultLike | string): number | null {
 // bare string), we infer success from the OPENGENI_DESKTOP_UP marker and infer
 // the failing stage from the stage-failure message the script prints to stderr.
 function inferExitFromOutput(output: string): number {
+  // Check the PAINTABLE-FRAME failure FIRST: on that path the up-script already
+  // printed OPENGENI_DESKTOP_UP (bring-up succeeded) and THEN the paint gate failed,
+  // so both markers are present — the NOT_PAINTING one is the authoritative outcome.
+  // (Modal is execCommand-only, so this string-inference path is the live one.)
+  if (/OPENGENI_DESKTOP_NOT_PAINTING/.test(output)) {
+    return 14;
+  }
   if (/OPENGENI_DESKTOP_UP\b/.test(output)) {
     return 0;
   }

package/src/sandbox-computer.ts CHANGED

@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
 const SCROLL_MAX_CLICKS = 15;
 // screenshot() never hands the model an empty image_url (the SDK turns "" into
 // `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
-// a zero-byte frame on the first scrot; bounded retries with a short pause let a
-// momentarily-unpainted-but-live display self-heal before we FAIL LOUD.
-const SCREENSHOT_MAX_ATTEMPTS = 3;
-const SCREENSHOT_RETRY_DELAY_MS = 400;
+// zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
+// + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
+// after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
+// So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
+// a short pause between tries, so that first post-cold / post-swap screenshot self-heals
+// as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
+// that is dead, not merely warming). ~800ms of retries (the prior 3×400ms) was far too
+// short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
+const SCREENSHOT_WARMUP_BUDGET_MS = 30_000;
+const SCREENSHOT_RETRY_DELAY_MS = 750;
 export type SandboxComputerOptions = {
   display?: string; // ":0"
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
   typeDelayMs?: number; // xdotool type --delay (default 12ms)
   readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
   screenshotTmpDir?: string; // "/tmp"
+  // How long screenshot() keeps retrying an empty (still-warming) frame before it
+  // FAILS LOUD, and the pause between tries. Defaults to the cold-boot warm-up budget;
+  // exposed mainly so tests can shrink it (a real caller wants the full budget).
+  screenshotWarmupBudgetMs?: number;
+  screenshotRetryDelayMs?: number;
 };
 // X keysym map for keypress(): model key names → xdotool keysyms.
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
   private readonly typeDelayMs: number;
   private readonly readOnly: boolean;
   private readonly tmp: string;
+  private readonly screenshotWarmupBudgetMs: number;
+  private readonly screenshotRetryDelayMs: number;
   constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
     this.session = session as unknown as ComputerSession;
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
     this.typeDelayMs = opts.typeDelayMs ?? 12;
     this.readOnly = opts.readOnly ?? false;
     this.tmp = opts.screenshotTmpDir ?? "/tmp";
+    this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
+    this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
   }
   /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
     // but momentarily not painting (XFCE/dbus still warming) recovers without
     // failing the turn.
     let lastError: unknown;
-    for (let attempt = 0; attempt < SCREENSHOT_MAX_ATTEMPTS; attempt++) {
+    const deadline = Date.now() + this.screenshotWarmupBudgetMs;
+    let attempt = 0;
+    // Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
+    // first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
+    // is a KNOWN transient during that warm-up — not a reason to fail the turn.
+    while (true) {
       if (attempt > 0) {
-        await new Promise((r) => setTimeout(r, SCREENSHOT_RETRY_DELAY_MS));
+        await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
       }
+      attempt++;
       const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
       try {
         await this.x(`scrot --pointer --overwrite ${f}`);
         const bytes = await this.readScreenshotBytes(f);
         if (bytes.length === 0) {
           // A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
-          // hand the model an empty image_url; throw on the final attempt.
+          // hand the model an empty image_url; throw once the budget is spent.
           throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
         }
         return Buffer.from(bytes).toString("base64");
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
         // screenshot result.
         await this.x(`rm -f ${f}`).catch(() => undefined);
       }
+      // Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
+      if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
+        break;
+      }
     }
-    // Exhausted retries: FAIL LOUD. A clear throw is the only acceptable outcome —
-    // returning "" here would surface to the model as an invalid empty image_url.
+    // Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
+    // outcome — returning "" here would surface to the model as an invalid empty
+    // image_url. Reaching here means the display was still dead after ~30s, not merely
+    // warming, so a hard action failure is correct.
     if (lastError instanceof Error) {
       throw lastError;
     }
@@ -761,6 +788,25 @@ export function computerFunctionTools(
 // ── The capability (the SDK seam) ────────────────────────────────────────────
+/**
+ * EXPLICIT tool-transport selection, decided by the caller that knows the
+ * provider's true wire identity (the worker's model resolution — see agent-turn.ts),
+ * NOT inferred from the bound model instance's constructor name. This is the
+ * HARDENING seam: `supportsStructuredToolOutputTransport` string-sniffs the
+ * constructor for "ChatCompletions", which a wrapped / proxied / minified model
+ * instance would defeat — silently handing a chat-completions provider the HOSTED
+ * `computer_use_preview` tool it 400s on every turn. When `toolMode` is set, tools()
+ * OBEYS it and never consults the sniff:
+ *   • "hosted"         → the single hosted `computer_use_preview` tool (Responses backends).
+ *   • "function-image" → the FUNCTION `computer_*` tools with screenshots delivered as a
+ *                        structured `{type:'image'}` output (the codex/ChatGPT backend,
+ *                        which rejects hosted tool types but SEES structured image results).
+ *   • "function-text"  → the FUNCTION tools with screenshots rendered as a text
+ *                        `data:…;base64` URL (chat-completions providers, which can't read
+ *                        structured image tool results).
+ */
+export type ComputerToolMode = "hosted" | "function-image" | "function-text";
 export type ComputerUseArgs = {
   dimensions?: [number, number];
   readOnly?: boolean;
@@ -771,8 +817,14 @@ export type ComputerUseArgs = {
   // `input_image` content item inside the function_call_output) instead of the text
   // data-URL string. Only the codex/ChatGPT backend can read structured image tool
   // results; chat-completions providers cannot, so this stays OFF (text rendering)
-  // by default and is turned on only on the codex path (see index.ts).
+  // by default and is turned on only on the codex path (see index.ts). Ignored when
+  // `toolMode` is set (the mode carries its own image-delivery choice).
   imageFunctionResults?: boolean;
+  // EXPLICIT transport selection (see {@link ComputerToolMode}). When present, tools()
+  // obeys it directly — the constructor-name sniff is NOT consulted. When ABSENT, the
+  // legacy sniff behaviour is preserved byte-for-byte (back-compat for any embedder
+  // that constructs the capability without threading a mode).
+  toolMode?: ComputerToolMode;
 };
 export function computerUse(args: ComputerUseArgs = {}): ComputerUseCapability {
@@ -820,16 +872,36 @@ export class ComputerUseCapability extends Capability {
           // The SDK base exposes the bound runAs as a protected field.
           ...(typeof this._runAs === "string" ? { runAs: this._runAs } : {}),
         });
-    // Structured transport keeps the HOSTED computer tool (unchanged); the codex /
-    // text backend gets the FUNCTION tools it can actually call.
+    // HARDENING: when the caller declares an EXPLICIT toolMode, obey it and NEVER
+    // consult `supportsStructuredToolOutputTransport` — tool selection must not
+    // depend on the model instance's constructor name (a wrapped/proxied/minified
+    // instance would defeat the "ChatCompletions" string-sniff and silently hand a
+    // chat-completions provider the hosted tool it 400s on). The mode is decided by
+    // the worker, where provider identity is authoritative (see agent-turn.ts).
+    switch (this.args.toolMode) {
+      case "hosted":
+        return [this.hostedComputerTool(computer)];
+      case "function-image":
+        return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, true);
+      case "function-text":
+        return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, false);
+      case undefined:
+        break; // fall through to the legacy sniff (back-compat), preserved byte-for-byte
+    }
+    // Legacy (no toolMode): structured transport keeps the HOSTED computer tool
+    // (unchanged); the codex / text backend gets the FUNCTION tools it can call.
     if (supportsStructuredToolOutputTransport(this._modelInstance)) {
-      return [
-        computerTool({
-          computer,
-          ...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
-        }) as unknown as Tool<unknown>,
-      ];
+      return [this.hostedComputerTool(computer)];
     }
     return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, this.args.imageFunctionResults ?? false);
   }
+  /** The single HOSTED `computer_use_preview` tool bound to `computer` — identical
+   *  construction for the explicit "hosted" mode and the legacy structured-sniff path. */
+  private hostedComputerTool(computer: Computer): Tool<unknown> {
+    return computerTool({
+      computer,
+      ...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
+    }) as unknown as Tool<unknown>;
+  }
 }