@opengeni/runtime 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,11 +24,17 @@ import { DESKTOP_STREAM_PORT } from "@opengeni/contracts";
24
24
  export { DESKTOP_STREAM_PORT };
25
25
  export const STREAM_PORT = DESKTOP_STREAM_PORT;
26
26
 
27
- // The whole-stack launch is bounded by the readiness gates inside the script
28
- // (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS first-boot XFCE/dbus
29
- // + font-cache warm-up on a cold gVisor box. 60s gives headroom over the spike's
30
- // observed ~5-10s warm path without masking a genuine wedge.
31
- export const DISPLAY_STACK_TIMEOUT_MS = 60_000;
27
+ // The whole-stack launch is bounded by the readiness gates inside the up-script
28
+ // (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS the PAINTABLE-FRAME
29
+ // gate we append (up to ~30s of scrot probing) PLUS first-boot XFCE/dbus + font-cache
30
+ // warm-up on a cold gVisor box. 90s gives headroom over the spike's observed ~5-10s
31
+ // warm path AND the cold-box paint warm-up without masking a genuine wedge.
32
+ export const DISPLAY_STACK_TIMEOUT_MS = 90_000;
33
+
34
+ // PAINTABLE-FRAME gate: poll scrot up to this many times, this many seconds apart,
35
+ // waiting for a non-empty frame before declaring the stack "up" (~30s worst case).
36
+ const PAINT_PROBE_ATTEMPTS = 150;
37
+ const PAINT_PROBE_INTERVAL_S = 0.2;
32
38
 
33
39
  /** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
34
40
  * change is a full down -> up restart (a separate op). */
@@ -41,15 +47,25 @@ export type DesktopGeometry = {
41
47
  export const DEFAULT_DESKTOP_GEOMETRY: DesktopGeometry = { width: 1280, height: 800, dpi: 96 };
42
48
 
43
49
  /** Thrown when a stage of the launch script failed. exitCode 11/12/13 map to
44
- * Xvfb / x11vnc / websockify respectively (the stage that died). Degradation is
45
- * surfaced as a value to viewers by the caller; this error is for diagnostics. */
50
+ * Xvfb / x11vnc / websockify respectively (the stage that died); 14 is the
51
+ * PAINTABLE-FRAME gate (ports listening but scrot still yields an empty frame —
52
+ * the display is up but not actually painting). Degradation is surfaced as a
53
+ * value to viewers by the caller; this error is for diagnostics. */
46
54
  export class DisplayStackError extends Error {
47
55
  readonly exitCode: number;
48
- readonly stage: "xvfb" | "x11vnc" | "websockify" | "unknown";
56
+ readonly stage: "xvfb" | "x11vnc" | "websockify" | "paint" | "unknown";
49
57
 
50
58
  constructor(exitCode: number, output: string) {
51
59
  const stage =
52
- exitCode === 11 ? "xvfb" : exitCode === 12 ? "x11vnc" : exitCode === 13 ? "websockify" : "unknown";
60
+ exitCode === 11
61
+ ? "xvfb"
62
+ : exitCode === 12
63
+ ? "x11vnc"
64
+ : exitCode === 13
65
+ ? "websockify"
66
+ : exitCode === 14
67
+ ? "paint"
68
+ : "unknown";
53
69
  super(`desktop display stack failed at stage "${stage}" (exit ${exitCode})${output ? `:\n${output}` : ""}`);
54
70
  this.name = "DisplayStackError";
55
71
  this.exitCode = exitCode;
@@ -125,15 +141,41 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
125
141
  // flock -w bounds the wait so a wedged holder can't deadlock the caller; the
126
142
  // up-script itself ALSO takes the same lock (belt + braces) so this works even
127
143
  // against an older image that predates the wrapper.
128
- return (
144
+ //
145
+ // PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
146
+ // only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
147
+ // LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
148
+ // machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
149
+ // ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
150
+ // scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame — which
151
+ // is exactly the empty screenshot that 400s the model and blanks the human viewer.
152
+ // We therefore chain a real scrot probe as the completion gate: after the up-script
153
+ // reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
154
+ // only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
155
+ // typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
156
+ // rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
157
+ // access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
158
+ // — an already-up display paints on the first probe). Lives in the runtime-built script
159
+ // (not the baked image up-script) so it ships with the worker/api, no image rebuild.
160
+ const bringUp =
129
161
  `if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
130
162
  `echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
131
163
  `else ` +
132
164
  `mkdir -p /tmp/opengeni-desktop && ` +
133
165
  `flock -w 45 /tmp/opengeni-desktop/up.outer.lock ` +
134
166
  `env ${env} opengeni-desktop-up; ` +
135
- `fi`
136
- );
167
+ `fi`;
168
+ const paintProbe =
169
+ `p=/tmp/opengeni-desktop/paint-probe.png; ` +
170
+ `for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
171
+ `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
172
+ `rm -f "$p"; ` +
173
+ // NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
174
+ // caller infers the outcome by string-matching the output — stdout is always captured.
175
+ `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
176
+ `sleep ${PAINT_PROBE_INTERVAL_S}; ` +
177
+ `done`;
178
+ return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;
137
179
  }
138
180
 
139
181
  function execResultOutput(result: ExecResultLike | string): string {
@@ -157,6 +199,13 @@ function execResultExitCode(result: ExecResultLike | string): number | null {
157
199
  // bare string), we infer success from the OPENGENI_DESKTOP_UP marker and infer
158
200
  // the failing stage from the stage-failure message the script prints to stderr.
159
201
  function inferExitFromOutput(output: string): number {
202
+ // Check the PAINTABLE-FRAME failure FIRST: on that path the up-script already
203
+ // printed OPENGENI_DESKTOP_UP (bring-up succeeded) and THEN the paint gate failed,
204
+ // so both markers are present — the NOT_PAINTING one is the authoritative outcome.
205
+ // (Modal is execCommand-only, so this string-inference path is the live one.)
206
+ if (/OPENGENI_DESKTOP_NOT_PAINTING/.test(output)) {
207
+ return 14;
208
+ }
160
209
  if (/OPENGENI_DESKTOP_UP\b/.test(output)) {
161
210
  return 0;
162
211
  }
@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
67
67
  const SCROLL_MAX_CLICKS = 15;
68
68
  // screenshot() never hands the model an empty image_url (the SDK turns "" into
69
69
  // `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
70
- // a zero-byte frame on the first scrot; bounded retries with a short pause let a
71
- // momentarily-unpainted-but-live display self-heal before we FAIL LOUD.
72
- const SCREENSHOT_MAX_ATTEMPTS = 3;
73
- const SCREENSHOT_RETRY_DELAY_MS = 400;
70
+ // zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
71
+ // + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
72
+ // after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
73
+ // So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
74
+ // a short pause between tries, so that first post-cold / post-swap screenshot self-heals
75
+ // as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
76
+ // that is dead, not merely warming). ~800ms of retries (the prior 3 attempts, 400ms apart) was far too
77
+ // short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
78
+ const SCREENSHOT_WARMUP_BUDGET_MS = 30_000;
79
+ const SCREENSHOT_RETRY_DELAY_MS = 750;
74
80
 
75
81
  export type SandboxComputerOptions = {
76
82
  display?: string; // ":0"
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
79
85
  typeDelayMs?: number; // xdotool type --delay (default 12ms)
80
86
  readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
81
87
  screenshotTmpDir?: string; // "/tmp"
88
+ // How long screenshot() keeps retrying an empty (still-warming) frame before it
89
+ // FAILS LOUD, and the pause between tries. Defaults to the cold-boot warm-up budget;
90
+ // exposed mainly so tests can shrink it (a real caller wants the full budget).
91
+ screenshotWarmupBudgetMs?: number;
92
+ screenshotRetryDelayMs?: number;
82
93
  };
83
94
 
84
95
  // X keysym map for keypress(): model key names → xdotool keysyms.
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
144
155
  private readonly typeDelayMs: number;
145
156
  private readonly readOnly: boolean;
146
157
  private readonly tmp: string;
158
+ private readonly screenshotWarmupBudgetMs: number;
159
+ private readonly screenshotRetryDelayMs: number;
147
160
 
148
161
  constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
149
162
  this.session = session as unknown as ComputerSession;
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
155
168
  this.typeDelayMs = opts.typeDelayMs ?? 12;
156
169
  this.readOnly = opts.readOnly ?? false;
157
170
  this.tmp = opts.screenshotTmpDir ?? "/tmp";
171
+ this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
172
+ this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
158
173
  }
159
174
 
160
175
  /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
231
246
  // but momentarily not painting (XFCE/dbus still warming) recovers without
232
247
  // failing the turn.
233
248
  let lastError: unknown;
234
- for (let attempt = 0; attempt < SCREENSHOT_MAX_ATTEMPTS; attempt++) {
249
+ const deadline = Date.now() + this.screenshotWarmupBudgetMs;
250
+ let attempt = 0;
251
+ // Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
252
+ // first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
253
+ // is a KNOWN transient during that warm-up — not a reason to fail the turn.
254
+ while (true) {
235
255
  if (attempt > 0) {
236
- await new Promise((r) => setTimeout(r, SCREENSHOT_RETRY_DELAY_MS));
256
+ await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
237
257
  }
258
+ attempt++;
238
259
  const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
239
260
  try {
240
261
  await this.x(`scrot --pointer --overwrite ${f}`);
241
262
  const bytes = await this.readScreenshotBytes(f);
242
263
  if (bytes.length === 0) {
243
264
  // A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
244
- // hand the model an empty image_url; throw on the final attempt.
265
+ // hand the model an empty image_url; throw once the budget is spent.
245
266
  throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
246
267
  }
247
268
  return Buffer.from(bytes).toString("base64");
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
252
273
  // screenshot result.
253
274
  await this.x(`rm -f ${f}`).catch(() => undefined);
254
275
  }
276
+ // Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
277
+ if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
278
+ break;
279
+ }
255
280
  }
256
- // Exhausted retries: FAIL LOUD. A clear throw is the only acceptable outcome —
257
- // returning "" here would surface to the model as an invalid empty image_url.
281
+ // Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
282
+ // outcome — returning "" here would surface to the model as an invalid empty
283
+ // image_url. Reaching here means the display was still dead after ~30s, not merely
284
+ // warming, so a hard action failure is correct.
258
285
  if (lastError instanceof Error) {
259
286
  throw lastError;
260
287
  }
@@ -761,6 +788,25 @@ export function computerFunctionTools(
761
788
 
762
789
  // ── The capability (the SDK seam) ────────────────────────────────────────────
763
790
 
791
+ /**
792
+ * EXPLICIT tool-transport selection, decided by the caller that knows the
793
+ * provider's true wire identity (the worker's model resolution — see agent-turn.ts),
794
+ * NOT inferred from the bound model instance's constructor name. This is the
795
+ * HARDENING seam: `supportsStructuredToolOutputTransport` string-sniffs the
796
+ * constructor for "ChatCompletions", which a wrapped / proxied / minified model
797
+ * instance would defeat — silently handing a chat-completions provider the HOSTED
798
+ * `computer_use_preview` tool it 400s on every turn. When `toolMode` is set, tools()
799
+ * OBEYS it and never consults the sniff:
800
+ * • "hosted" → the single hosted `computer_use_preview` tool (Responses backends).
801
+ * • "function-image" → the FUNCTION `computer_*` tools with screenshots delivered as a
802
+ * structured `{type:'image'}` output (the codex/ChatGPT backend,
803
+ * which rejects hosted tool types but SEES structured image results).
804
+ * • "function-text" → the FUNCTION tools with screenshots rendered as a text
805
+ * `data:…;base64` URL (chat-completions providers, which can't read
806
+ * structured image tool results).
807
+ */
808
+ export type ComputerToolMode = "hosted" | "function-image" | "function-text";
809
+
764
810
  export type ComputerUseArgs = {
765
811
  dimensions?: [number, number];
766
812
  readOnly?: boolean;
@@ -771,8 +817,14 @@ export type ComputerUseArgs = {
771
817
  // `input_image` content item inside the function_call_output) instead of the text
772
818
  // data-URL string. Only the codex/ChatGPT backend can read structured image tool
773
819
  // results; chat-completions providers cannot, so this stays OFF (text rendering)
774
- // by default and is turned on only on the codex path (see index.ts).
820
+ // by default and is turned on only on the codex path (see index.ts). Ignored when
821
+ // `toolMode` is set (the mode carries its own image-delivery choice).
775
822
  imageFunctionResults?: boolean;
823
+ // EXPLICIT transport selection (see {@link ComputerToolMode}). When present, tools()
824
+ // obeys it directly — the constructor-name sniff is NOT consulted. When ABSENT, the
825
+ // legacy sniff behaviour is preserved byte-for-byte (back-compat for any embedder
826
+ // that constructs the capability without threading a mode).
827
+ toolMode?: ComputerToolMode;
776
828
  };
777
829
 
778
830
  export function computerUse(args: ComputerUseArgs = {}): ComputerUseCapability {
@@ -820,16 +872,36 @@ export class ComputerUseCapability extends Capability {
820
872
  // The SDK base exposes the bound runAs as a protected field.
821
873
  ...(typeof this._runAs === "string" ? { runAs: this._runAs } : {}),
822
874
  });
823
- // Structured transport keeps the HOSTED computer tool (unchanged); the codex /
824
- // text backend gets the FUNCTION tools it can actually call.
875
+ // HARDENING: when the caller declares an EXPLICIT toolMode, obey it and NEVER
876
+ // consult `supportsStructuredToolOutputTransport` — tool selection must not
877
+ // depend on the model instance's constructor name (a wrapped/proxied/minified
878
+ // instance would defeat the "ChatCompletions" string-sniff and silently hand a
879
+ // chat-completions provider the hosted tool it 400s on). The mode is decided by
880
+ // the worker, where provider identity is authoritative (see agent-turn.ts).
881
+ switch (this.args.toolMode) {
882
+ case "hosted":
883
+ return [this.hostedComputerTool(computer)];
884
+ case "function-image":
885
+ return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, true);
886
+ case "function-text":
887
+ return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, false);
888
+ case undefined:
889
+ break; // fall through to the legacy sniff (back-compat), preserved byte-for-byte
890
+ }
891
+ // Legacy (no toolMode): structured transport keeps the HOSTED computer tool
892
+ // (unchanged); the codex / text backend gets the FUNCTION tools it can call.
825
893
  if (supportsStructuredToolOutputTransport(this._modelInstance)) {
826
- return [
827
- computerTool({
828
- computer,
829
- ...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
830
- }) as unknown as Tool<unknown>,
831
- ];
894
+ return [this.hostedComputerTool(computer)];
832
895
  }
833
896
  return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, this.args.imageFunctionResults ?? false);
834
897
  }
898
+
899
+ /** The single HOSTED `computer_use_preview` tool bound to `computer` — identical
900
+ * construction for the explicit "hosted" mode and the legacy structured-sniff path. */
901
+ private hostedComputerTool(computer: Computer): Tool<unknown> {
902
+ return computerTool({
903
+ computer,
904
+ ...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
905
+ }) as unknown as Tool<unknown>;
906
+ }
835
907
  }