@opengeni/runtime 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
67
67
  const SCROLL_MAX_CLICKS = 15;
68
68
  // screenshot() never hands the model an empty image_url (the SDK turns "" into
69
69
  // `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
70
- // a zero-byte frame on the first scrot; bounded retries with a short pause let a
71
- // momentarily-unpainted-but-live display self-heal before we FAIL LOUD.
72
- const SCREENSHOT_MAX_ATTEMPTS = 3;
73
- const SCREENSHOT_RETRY_DELAY_MS = 400;
70
+ // zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
71
+ // + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
72
+ // after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
73
+ // So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
74
+ // a short pause between tries, so that first post-cold / post-swap screenshot self-heals
75
+ // as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
76
+ // that is dead, not merely warming). ~800ms of retries (the prior 3×400ms) was far too
77
+ // short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
78
+ const SCREENSHOT_WARMUP_BUDGET_MS = 30_000;
79
+ const SCREENSHOT_RETRY_DELAY_MS = 750;
74
80
 
75
81
  export type SandboxComputerOptions = {
76
82
  display?: string; // ":0"
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
79
85
  typeDelayMs?: number; // xdotool type --delay (default 12ms)
80
86
  readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
81
87
  screenshotTmpDir?: string; // "/tmp"
88
+ // How long screenshot() keeps retrying an empty (still-warming) frame before it
89
+ // FAILS LOUD, and the pause between tries. Defaults to the cold-boot warm-up budget;
90
+ // exposed mainly so tests can shrink it (a real caller wants the full budget).
91
+ screenshotWarmupBudgetMs?: number;
92
+ screenshotRetryDelayMs?: number;
82
93
  };
83
94
 
84
95
  // X keysym map for keypress(): model key names → xdotool keysyms.
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
144
155
  private readonly typeDelayMs: number;
145
156
  private readonly readOnly: boolean;
146
157
  private readonly tmp: string;
158
+ private readonly screenshotWarmupBudgetMs: number;
159
+ private readonly screenshotRetryDelayMs: number;
147
160
 
148
161
  constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
149
162
  this.session = session as unknown as ComputerSession;
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
155
168
  this.typeDelayMs = opts.typeDelayMs ?? 12;
156
169
  this.readOnly = opts.readOnly ?? false;
157
170
  this.tmp = opts.screenshotTmpDir ?? "/tmp";
171
+ this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
172
+ this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
158
173
  }
159
174
 
160
175
  /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
231
246
  // but momentarily not painting (XFCE/dbus still warming) recovers without
232
247
  // failing the turn.
233
248
  let lastError: unknown;
234
- for (let attempt = 0; attempt < SCREENSHOT_MAX_ATTEMPTS; attempt++) {
249
+ const deadline = Date.now() + this.screenshotWarmupBudgetMs;
250
+ let attempt = 0;
251
+ // Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
252
+ // first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
253
+ // is a KNOWN transient during that warm-up — not a reason to fail the turn.
254
+ while (true) {
235
255
  if (attempt > 0) {
236
- await new Promise((r) => setTimeout(r, SCREENSHOT_RETRY_DELAY_MS));
256
+ await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
237
257
  }
258
+ attempt++;
238
259
  const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
239
260
  try {
240
261
  await this.x(`scrot --pointer --overwrite ${f}`);
241
262
  const bytes = await this.readScreenshotBytes(f);
242
263
  if (bytes.length === 0) {
243
264
  // A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
244
- // hand the model an empty image_url; throw on the final attempt.
265
+ // hand the model an empty image_url; throw once the budget is spent.
245
266
  throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
246
267
  }
247
268
  return Buffer.from(bytes).toString("base64");
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
252
273
  // screenshot result.
253
274
  await this.x(`rm -f ${f}`).catch(() => undefined);
254
275
  }
276
+ // Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
277
+ if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
278
+ break;
279
+ }
255
280
  }
256
- // Exhausted retries: FAIL LOUD. A clear throw is the only acceptable outcome —
257
- // returning "" here would surface to the model as an invalid empty image_url.
281
+ // Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
282
+ // outcome — returning "" here would surface to the model as an invalid empty
283
+ // image_url. Reaching here means the display was still dead after ~30s, not merely
284
+ // warming, so a hard action failure is correct.
258
285
  if (lastError instanceof Error) {
259
286
  throw lastError;
260
287
  }
@@ -358,7 +385,11 @@ export class SandboxComputer implements Computer {
358
385
  * leaf; the duck-typed `isNativeDesktopSession` probe (below) selects on it. */
359
386
  export type NativeDesktopSession = {
360
387
  desktopInput(event: DesktopInputRequest["event"]): Promise<void>;
361
- screenshot(): Promise<{ png: Uint8Array; width: number; height: number }>;
388
+ // `nativeWidth`/`nativeHeight` are the PRE-DOWNSCALE capture geometry (equal to
389
+ // width/height when the agent did not have to shrink the PNG to fit the transport
390
+ // budget). NativeDesktopComputer scales model clicks from the ENCODED pixel space
391
+ // back to native pixels using their ratio before injecting.
392
+ screenshot(): Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
362
393
  };
363
394
 
364
395
  /** Model `Button` → wire `PointerButton`. The proto has no back/forward button, so
@@ -397,6 +428,15 @@ export class NativeDesktopComputer implements Computer {
397
428
  readonly dimensions: [number, number];
398
429
  private session: NativeDesktopSession;
399
430
  private readonly readOnly: boolean;
431
+ // The ENCODED vs NATIVE geometry of the MOST RECENT screenshot the model saw. The
432
+ // model computes click coordinates in the encoded-pixel space of that screenshot;
433
+ // when the agent downscaled the PNG to fit the transport budget, encoded < native,
434
+ // so we scale coordinates back up to native pixels before injecting (the agent's
435
+ // native inject — macOS CGEvent / Linux XTEST — expects native-pixel coordinates,
436
+ // exactly as it received them pre-downscale). Null until the first screenshot;
437
+ // equal encoded==native (or absent) ⇒ scale factor 1.0 ⇒ byte-identical behavior.
438
+ private lastEncoded: [number, number] | null = null;
439
+ private lastNative: [number, number] | null = null;
400
440
 
401
441
  constructor(session: NativeDesktopSession, opts: NativeDesktopComputerOptions = {}) {
402
442
  this.session = session;
@@ -414,17 +454,34 @@ export class NativeDesktopComputer implements Computer {
414
454
  if (this.readOnly) throw new ComputerReadOnlyError();
415
455
  }
416
456
 
457
+ /** Scale a coordinate the model expressed in the MOST RECENT screenshot's
458
+ * ENCODED pixel space back to NATIVE pixels. When the last frame was not
459
+ * downscaled (encoded == native), or no screenshot has been taken yet, this is a
460
+ * 1:1 identity — the byte-identical current behavior. The agent then applies its
461
+ * own platform mapping (macOS divides native pixels by the backing scale to reach
462
+ * CGEvent points; Linux XTEST is 1:1) exactly as it did pre-downscale. */
463
+ private toNative(x: number, y: number): { x: number; y: number } {
464
+ const enc = this.lastEncoded;
465
+ const nat = this.lastNative;
466
+ if (!enc || !nat || enc[0] <= 0 || enc[1] <= 0) return { x, y };
467
+ if (enc[0] === nat[0] && enc[1] === nat[1]) return { x, y };
468
+ return {
469
+ x: Math.round((x * nat[0]) / enc[0]),
470
+ y: Math.round((y * nat[1]) / enc[1]),
471
+ };
472
+ }
473
+
417
474
  private async pointer(x: number, y: number, action: PointerAction, button: PointerButton): Promise<void> {
418
- // COORDINATE SEAM — TODO(verify e2e on macOS): the model computes x/y against the
419
- // pixels of the screenshot it just saw, and the agent's macOS CGEvent inject
420
- // treats x/y as raw screen coordinates. On a Retina Mac, ScreenCaptureKit may
421
- // capture at the logical POINT space while CGEvent expects logical points — a
422
- // potential mismatch between the coords the model derives and the coords the
423
- // inject applies. This MUST be measured on a real Retina Mac (compare the
424
- // screenshot's reported width/height against the logical display bounds) before
425
- // any DPR scaling is added. Do NOT add scaling speculatively. Self-hosted Linux
426
- // (XTEST/x11) is 1:1 and unaffected.
427
- await this.session.desktopInput({ $case: "pointer", pointer: { x, y, action, button } });
475
+ // COORDINATE SEAM: the model computes x/y against the pixels of the screenshot it
476
+ // just saw — which the agent may have DOWNSCALED to fit the transport's max
477
+ // payload (a full-res Retina/busy screen exceeds NATS's 1 MiB default). We scale
478
+ // those encoded-pixel coordinates back to native pixels here, using the native
479
+ // geometry the last screenshot reported, so the agent's native inject lands the
480
+ // click where the model intended. When no downscale occurred the factor is 1.0
481
+ // and the coordinates pass through unchanged. Self-hosted Linux (XTEST/x11) and
482
+ // any non-downscaled frame are 1:1 and unaffected.
483
+ const n = this.toNative(x, y);
484
+ await this.session.desktopInput({ $case: "pointer", pointer: { x: n.x, y: n.y, action, button } });
428
485
  }
429
486
 
430
487
  async screenshot(): Promise<string> {
@@ -434,10 +491,14 @@ export class NativeDesktopComputer implements Computer {
434
491
  // missing/empty frame is therefore a THROW, never a silent "". Native capture
435
492
  // (ScreenCaptureKit / x11) does not have the cold-scrot warm-up the xdotool path
436
493
  // retries around, so a single capture + a hard empty-guard is sufficient.
437
- const { png } = await this.session.screenshot();
494
+ const { png, width, height, nativeWidth, nativeHeight } = await this.session.screenshot();
438
495
  if (png.length === 0) {
439
496
  throw new ComputerUnavailableError("native desktop screenshot returned an empty frame (display not up?)");
440
497
  }
498
+ // Record the encoded (what the model sees) vs native geometry of THIS frame so
499
+ // the next click/move/scroll/drag scales its coordinates back to native pixels.
500
+ this.lastEncoded = [width, height];
501
+ this.lastNative = [nativeWidth || width, nativeHeight || height];
441
502
  return Buffer.from(png).toString("base64");
442
503
  }
443
504
 
@@ -455,11 +516,12 @@ export class NativeDesktopComputer implements Computer {
455
516
  }
456
517
  async scroll(x: number, y: number, sx: number, sy: number) {
457
518
  this.guardWrite();
458
- // The model's scroll deltas are PIXELS — forward them straight to the agent as a
459
- // ScrollEvent{x,y,deltaX,deltaY} and let the native inject translate to wheel
460
- // events per platform. No xdotool "notch" quantization here (that is an
461
- // xdotool-specific artifact); the agent owns the platform-appropriate scaling.
462
- await this.session.desktopInput({ $case: "scroll", scroll: { x, y, deltaX: sx, deltaY: sy } });
519
+ // The scroll ANCHOR (x,y) is a screenshot-pixel position — scale to native like a
520
+ // click. The deltas (sx,sy) are relative scroll AMOUNTS, not positions; the agent
521
+ // owns the platform-appropriate wheel translation, so they pass through unscaled
522
+ // (no xdotool "notch" quantization here — that is an xdotool-specific artifact).
523
+ const n = this.toNative(x, y);
524
+ await this.session.desktopInput({ $case: "scroll", scroll: { x: n.x, y: n.y, deltaX: sx, deltaY: sy } });
463
525
  }
464
526
  async type(text: string) {
465
527
  this.guardWrite();