@opengeni/runtime 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-2PO56VAL.js → chunk-D5KU3QUC.js} +240 -23
- package/dist/chunk-D5KU3QUC.js.map +1 -0
- package/dist/index.d.ts +106 -178
- package/dist/index.js +427 -161
- package/dist/index.js.map +1 -1
- package/dist/sandbox/index.d.ts +54 -6
- package/dist/sandbox/index.js +11 -1
- package/package.json +3 -3
- package/src/context-compaction.ts +217 -348
- package/src/image-history.ts +149 -0
- package/src/index.ts +195 -38
- package/src/sandbox/display-stack.ts +96 -12
- package/src/sandbox/index.ts +72 -12
- package/src/sandbox/providers/modal.ts +225 -0
- package/src/sandbox/routing/routing-session.ts +2 -2
- package/src/sandbox/selfhosted/session.ts +21 -5
- package/src/sandbox-computer.ts +88 -26
- package/dist/chunk-2PO56VAL.js.map +0 -1
package/src/sandbox-computer.ts
CHANGED
|
@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
|
|
|
67
67
|
const SCROLL_MAX_CLICKS = 15;
|
|
68
68
|
// screenshot() never hands the model an empty image_url (the SDK turns "" into
|
|
69
69
|
// `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
|
|
70
|
-
//
|
|
71
|
-
//
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
// zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
|
|
71
|
+
// + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
|
|
72
|
+
// after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
|
|
73
|
+
// So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
|
|
74
|
+
// a short pause between tries, so that first post-cold / post-swap screenshot self-heals
|
|
75
|
+
// as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
|
|
76
|
+
// that is dead, not merely warming). ~800ms of retries (the prior 3×400ms) was far too
|
|
77
|
+
// short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
|
|
78
|
+
const SCREENSHOT_WARMUP_BUDGET_MS = 30_000;
|
|
79
|
+
const SCREENSHOT_RETRY_DELAY_MS = 750;
|
|
74
80
|
|
|
75
81
|
export type SandboxComputerOptions = {
|
|
76
82
|
display?: string; // ":0"
|
|
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
|
|
|
79
85
|
typeDelayMs?: number; // xdotool type --delay (default 12ms)
|
|
80
86
|
readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
|
|
81
87
|
screenshotTmpDir?: string; // "/tmp"
|
|
88
|
+
// How long screenshot() keeps retrying an empty (still-warming) frame before it
|
|
89
|
+
// FAILS LOUD, and the pause between tries. Defaults: the 30s cold-boot warm-up budget and a 750ms pause;
|
|
90
|
+
// exposed mainly so tests can shrink it (a real caller wants the full budget).
|
|
91
|
+
screenshotWarmupBudgetMs?: number;
|
|
92
|
+
screenshotRetryDelayMs?: number;
|
|
82
93
|
};
|
|
83
94
|
|
|
84
95
|
// X keysym map for keypress(): model key names → xdotool keysyms.
|
|
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
|
|
|
144
155
|
private readonly typeDelayMs: number;
|
|
145
156
|
private readonly readOnly: boolean;
|
|
146
157
|
private readonly tmp: string;
|
|
158
|
+
private readonly screenshotWarmupBudgetMs: number;
|
|
159
|
+
private readonly screenshotRetryDelayMs: number;
|
|
147
160
|
|
|
148
161
|
constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
|
|
149
162
|
this.session = session as unknown as ComputerSession;
|
|
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
|
|
|
155
168
|
this.typeDelayMs = opts.typeDelayMs ?? 12;
|
|
156
169
|
this.readOnly = opts.readOnly ?? false;
|
|
157
170
|
this.tmp = opts.screenshotTmpDir ?? "/tmp";
|
|
171
|
+
this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
|
|
172
|
+
this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
|
|
158
173
|
}
|
|
159
174
|
|
|
160
175
|
/** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
|
|
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
|
|
|
231
246
|
// but momentarily not painting (XFCE/dbus still warming) recovers without
|
|
232
247
|
// failing the turn.
|
|
233
248
|
let lastError: unknown;
|
|
234
|
-
|
|
249
|
+
const deadline = Date.now() + this.screenshotWarmupBudgetMs;
|
|
250
|
+
let attempt = 0;
|
|
251
|
+
// Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
|
|
252
|
+
// first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
|
|
253
|
+
// is a KNOWN transient during that warm-up — not a reason to fail the turn.
|
|
254
|
+
while (true) {
|
|
235
255
|
if (attempt > 0) {
|
|
236
|
-
await new Promise((r) => setTimeout(r,
|
|
256
|
+
await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
|
|
237
257
|
}
|
|
258
|
+
attempt++;
|
|
238
259
|
const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
|
|
239
260
|
try {
|
|
240
261
|
await this.x(`scrot --pointer --overwrite ${f}`);
|
|
241
262
|
const bytes = await this.readScreenshotBytes(f);
|
|
242
263
|
if (bytes.length === 0) {
|
|
243
264
|
// A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
|
|
244
|
-
// hand the model an empty image_url; throw
|
|
265
|
+
// hand the model an empty image_url; throw once the budget is spent.
|
|
245
266
|
throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
|
|
246
267
|
}
|
|
247
268
|
return Buffer.from(bytes).toString("base64");
|
|
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
|
|
|
252
273
|
// screenshot result.
|
|
253
274
|
await this.x(`rm -f ${f}`).catch(() => undefined);
|
|
254
275
|
}
|
|
276
|
+
// Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
|
|
277
|
+
if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
|
|
278
|
+
break;
|
|
279
|
+
}
|
|
255
280
|
}
|
|
256
|
-
// Exhausted
|
|
257
|
-
// returning "" here would surface to the model as an invalid empty
|
|
281
|
+
// Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
|
|
282
|
+
// outcome — returning "" here would surface to the model as an invalid empty
|
|
283
|
+
// image_url. Reaching here means the display was still dead after the whole warm-up budget (~30s by default), not merely
|
|
284
|
+
// warming, so a hard action failure is correct.
|
|
258
285
|
if (lastError instanceof Error) {
|
|
259
286
|
throw lastError;
|
|
260
287
|
}
|
|
@@ -358,7 +385,11 @@ export class SandboxComputer implements Computer {
|
|
|
358
385
|
* leaf; the duck-typed `isNativeDesktopSession` probe (below) selects on it. */
|
|
359
386
|
export type NativeDesktopSession = {
|
|
360
387
|
desktopInput(event: DesktopInputRequest["event"]): Promise<void>;
|
|
361
|
-
|
|
388
|
+
// `nativeWidth`/`nativeHeight` are the PRE-DOWNSCALE capture geometry (equal to
|
|
389
|
+
// width/height when the agent did not have to shrink the PNG to fit the transport
|
|
390
|
+
// budget). NativeDesktopComputer scales model clicks from the ENCODED pixel space
|
|
391
|
+
// back to native pixels using their ratio before injecting.
|
|
392
|
+
screenshot(): Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
|
|
362
393
|
};
|
|
363
394
|
|
|
364
395
|
/** Model `Button` → wire `PointerButton`. The proto has no back/forward button, so
|
|
@@ -397,6 +428,15 @@ export class NativeDesktopComputer implements Computer {
|
|
|
397
428
|
readonly dimensions: [number, number];
|
|
398
429
|
private session: NativeDesktopSession;
|
|
399
430
|
private readonly readOnly: boolean;
|
|
431
|
+
// The ENCODED vs NATIVE geometry of the MOST RECENT screenshot the model saw. The
|
|
432
|
+
// model computes click coordinates in the encoded-pixel space of that screenshot;
|
|
433
|
+
// when the agent downscaled the PNG to fit the transport budget, encoded < native,
|
|
434
|
+
// so we scale coordinates back up to native pixels before injecting (the agent's
|
|
435
|
+
// native inject — macOS CGEvent / Linux XTEST — expects native-pixel coordinates,
|
|
436
|
+
// exactly as it received them pre-downscale). Null until the first screenshot;
|
|
437
|
+
// equal encoded==native (or absent) ⇒ scale factor 1.0 ⇒ byte-identical behavior.
|
|
438
|
+
private lastEncoded: [number, number] | null = null;
|
|
439
|
+
private lastNative: [number, number] | null = null;
|
|
400
440
|
|
|
401
441
|
constructor(session: NativeDesktopSession, opts: NativeDesktopComputerOptions = {}) {
|
|
402
442
|
this.session = session;
|
|
@@ -414,17 +454,34 @@ export class NativeDesktopComputer implements Computer {
|
|
|
414
454
|
if (this.readOnly) throw new ComputerReadOnlyError();
|
|
415
455
|
}
|
|
416
456
|
|
|
457
|
+
/** Scale a coordinate the model expressed in the MOST RECENT screenshot's
|
|
458
|
+
* ENCODED pixel space back to NATIVE pixels. When the last frame was not
|
|
459
|
+
* downscaled (encoded == native), or no screenshot has been taken yet, this is a
|
|
460
|
+
* 1:1 identity — the byte-identical current behavior. The agent then applies its
|
|
461
|
+
* own platform mapping (macOS divides native pixels by the backing scale to reach
|
|
462
|
+
* CGEvent points; Linux XTEST is 1:1) exactly as it did pre-downscale. */
|
|
463
|
+
private toNative(x: number, y: number): { x: number; y: number } {
|
|
464
|
+
const enc = this.lastEncoded;
|
|
465
|
+
const nat = this.lastNative;
|
|
466
|
+
if (!enc || !nat || enc[0] <= 0 || enc[1] <= 0) return { x, y };
|
|
467
|
+
if (enc[0] === nat[0] && enc[1] === nat[1]) return { x, y };
|
|
468
|
+
return {
|
|
469
|
+
x: Math.round((x * nat[0]) / enc[0]),
|
|
470
|
+
y: Math.round((y * nat[1]) / enc[1]),
|
|
471
|
+
};
|
|
472
|
+
}
|
|
473
|
+
|
|
417
474
|
private async pointer(x: number, y: number, action: PointerAction, button: PointerButton): Promise<void> {
|
|
418
|
-
// COORDINATE SEAM
|
|
419
|
-
//
|
|
420
|
-
//
|
|
421
|
-
//
|
|
422
|
-
//
|
|
423
|
-
//
|
|
424
|
-
//
|
|
425
|
-
// any
|
|
426
|
-
|
|
427
|
-
await this.session.desktopInput({ $case: "pointer", pointer: { x, y, action, button } });
|
|
475
|
+
// COORDINATE SEAM: the model computes x/y against the pixels of the screenshot it
|
|
476
|
+
// just saw — which the agent may have DOWNSCALED to fit the transport's max
|
|
477
|
+
// payload (a full-res Retina/busy screen exceeds NATS's 1 MiB default). We scale
|
|
478
|
+
// those encoded-pixel coordinates back to native pixels here, using the native
|
|
479
|
+
// geometry the last screenshot reported, so the agent's native inject lands the
|
|
480
|
+
// click where the model intended. When no downscale occurred the factor is 1.0
|
|
481
|
+
// and the coordinates pass through unchanged. Self-hosted Linux (XTEST/x11) and
|
|
482
|
+
// any non-downscaled frame are 1:1 and unaffected.
|
|
483
|
+
const n = this.toNative(x, y);
|
|
484
|
+
await this.session.desktopInput({ $case: "pointer", pointer: { x: n.x, y: n.y, action, button } });
|
|
428
485
|
}
|
|
429
486
|
|
|
430
487
|
async screenshot(): Promise<string> {
|
|
@@ -434,10 +491,14 @@ export class NativeDesktopComputer implements Computer {
|
|
|
434
491
|
// missing/empty frame is therefore a THROW, never a silent "". Native capture
|
|
435
492
|
// (ScreenCaptureKit / x11) does not have the cold-scrot warm-up the xdotool path
|
|
436
493
|
// retries around, so a single capture + a hard empty-guard is sufficient.
|
|
437
|
-
const { png } = await this.session.screenshot();
|
|
494
|
+
const { png, width, height, nativeWidth, nativeHeight } = await this.session.screenshot();
|
|
438
495
|
if (png.length === 0) {
|
|
439
496
|
throw new ComputerUnavailableError("native desktop screenshot returned an empty frame (display not up?)");
|
|
440
497
|
}
|
|
498
|
+
// Record the encoded (what the model sees) vs native geometry of THIS frame so
|
|
499
|
+
// the next click/move/scroll/drag scales its coordinates back to native pixels.
|
|
500
|
+
this.lastEncoded = [width, height];
|
|
501
|
+
this.lastNative = [nativeWidth || width, nativeHeight || height];
|
|
441
502
|
return Buffer.from(png).toString("base64");
|
|
442
503
|
}
|
|
443
504
|
|
|
@@ -455,11 +516,12 @@ export class NativeDesktopComputer implements Computer {
|
|
|
455
516
|
}
|
|
456
517
|
async scroll(x: number, y: number, sx: number, sy: number) {
|
|
457
518
|
this.guardWrite();
|
|
458
|
-
// The
|
|
459
|
-
//
|
|
460
|
-
//
|
|
461
|
-
// xdotool
|
|
462
|
-
|
|
519
|
+
// The scroll ANCHOR (x,y) is a screenshot-pixel position → scale to native like a
|
|
520
|
+
// click. The deltas (sx,sy) are relative scroll AMOUNTS, not positions; the agent
|
|
521
|
+
// owns the platform-appropriate wheel translation, so they pass through unscaled
|
|
522
|
+
// (no xdotool "notch" quantization here — that is an xdotool-specific artifact).
|
|
523
|
+
const n = this.toNative(x, y);
|
|
524
|
+
await this.session.desktopInput({ $case: "scroll", scroll: { x: n.x, y: n.y, deltaX: sx, deltaY: sy } });
|
|
463
525
|
}
|
|
464
526
|
async type(text: string) {
|
|
465
527
|
this.guardWrite();
|