@opengeni/runtime 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,53 @@
1
1
  import { ModalImageSelector, ModalSandboxClient } from "@openai/agents-extensions/sandbox/modal";
2
2
  import { effectiveModalIdleTimeoutSeconds } from "@opengeni/config";
3
+ import type { Settings } from "@opengeni/config";
3
4
  import { CAPABILITY_DESCRIPTORS } from "../capabilities";
4
5
  import { SandboxConfigError } from "../errors";
5
6
  import type { ProviderRegistration } from "./types";
6
7
 
8
// Default cap on the number of sandboxes a single sweepModalOrphanSandboxes call
// terminates; used when `options.maxTerminations` is not supplied.
+ const MODAL_ORPHAN_SWEEP_LIMIT = 50;
9
// Default minimum age (30 minutes, expressed in milliseconds) before a sandbox that
// lacks the full set of attribution tags is treated as an orphan; used when
// `options.unattributedGraceMs` is not supplied.
+ const MODAL_UNATTRIBUTED_ORPHAN_GRACE_MS = 30 * 60_000;
10
+
11
/**
 * The three identifiers used to attribute a Modal sandbox: the lease, the workspace
 * and the sandbox group. The same shape feeds both the environment-variable map and
 * the tag map built elsewhere in this file.
 */
+ export type ModalSandboxAttribution = {
12
+ leaseId: string;
13
+ workspaceId: string;
14
+ sandboxGroupId: string;
15
+ };
16
+
17
/**
 * Attribution of a lease the caller reports as live, as passed to
 * sweepModalOrphanSandboxes.
 */
+ export type LiveModalSandboxLeaseAttribution = ModalSandboxAttribution & {
18
// Compared against the listed sandbox id during the sweep; a null (or empty) value
// disables that comparison, so the lease protects any sandbox carrying its three ids.
+ instanceId: string | null;
19
// Not read by any code visible in this diff.
+ liveness?: string;
20
+ };
21
+
22
/** Record of one sandbox that the orphan sweep terminated. */
+ export type ModalOrphanSweepTermination = {
23
+ sandboxId: string;
24
// "stale_attribution": all three attribution tags were present but no live lease
// matched them, or the matching lease named a different instance id.
// "unattributed": at least one attribution tag was missing and the sandbox was at
// least as old as the grace period.
+ reason: "stale_attribution" | "unattributed";
25
// Tags read from the listing entry, flattened to name -> value.
+ tags: Record<string, string>;
26
+ };
27
+
28
/** Summary returned by sweepModalOrphanSandboxes. */
+ export type ModalOrphanSweepResult = {
29
// Number of listing entries inspected.
+ examined: number;
30
// One record per sandbox whose terminate call resolved.
+ terminated: ModalOrphanSweepTermination[];
31
// Entries left alone because they were not judged orphans, plus entries whose
// termination attempt threw.
+ skipped: number;
32
+ };
33
+
34
/**
 * Maps an attribution onto three OPENGENI_* environment variable names.
 *
 * @param input - Lease, workspace and sandbox group ids.
 * @returns A new record with one entry per id. Where these variables are consumed is
 *   not visible in this diff.
 */
+ export function modalSandboxAttributionEnvironment(input: ModalSandboxAttribution): Record<string, string> {
35
+ return {
36
+ OPENGENI_SANDBOX_LEASE_ID: input.leaseId,
37
+ OPENGENI_SANDBOX_GROUP_ID: input.sandboxGroupId,
38
+ OPENGENI_WORKSPACE_ID: input.workspaceId,
39
+ };
40
+ }
41
+
42
/**
 * Builds the tag set applied to a Modal sandbox: a constant `opengeni: "true"` marker
 * plus one tag per attribution id. sweepModalOrphanSandboxes reads the three id tags
 * back by these exact names, so the names here and there must stay in sync.
 *
 * @param input - Lease, workspace and sandbox group ids.
 * @returns A new record of tag name to tag value.
 */
+ export function modalSandboxAttributionTags(input: ModalSandboxAttribution): Record<string, string> {
43
+ return {
44
+ opengeni: "true",
45
+ opengeni_lease_id: input.leaseId,
46
+ opengeni_workspace_id: input.workspaceId,
47
+ opengeni_sandbox_group_id: input.sandboxGroupId,
48
+ };
49
+ }
50
+
7
51
  export const modalProvider: ProviderRegistration = {
8
52
  backend: "modal",
9
53
  descriptor: CAPABILITY_DESCRIPTORS.modal,
@@ -23,8 +67,13 @@ export const modalProvider: ProviderRegistration = {
23
67
  const options: NonNullable<ConstructorParameters<typeof ModalSandboxClient>[0]> = {
24
68
  appName: settings.modalAppName,
25
69
  timeoutMs: settings.modalTimeoutSeconds * 1000,
70
+ sandboxCreateTimeoutS: Math.ceil(settings.sandboxWarmingTimeoutMs / 1000),
26
71
  exposedPorts,
27
72
  env: environment,
73
+ // The Modal JS SDK's sandbox default command already sleeps until timeout
74
+ // or explicit termination. Do not let the Agents extension stamp a separate
75
+ // hardcoded sleep command; OPENGENI_MODAL_TIMEOUT_SECONDS owns lifetime.
76
+ useSleepCmd: false,
28
77
  };
29
78
  // gap-fill (module 03 §4.1): these SDK options were previously unmapped.
30
79
  // ALWAYS pin idleTimeoutMs (sandbox-file-persistence): an UNSET idle timeout
@@ -53,3 +102,179 @@ export const modalProvider: ProviderRegistration = {
53
102
  return new ModalSandboxClient(options);
54
103
  },
55
104
  };
105
+
106
// Type of the `modal` package's module namespace, obtained without a runtime import.
+ type ModalModule = typeof import("modal");
107
// Instance type of the `ModalClient` class exported by that module.
+ type ModalClientLike = InstanceType<ModalModule["ModalClient"]>;
108
+
109
/**
 * Builds the ModalClient constructor options from settings. Each option is included
 * only when its setting is truthy, so unset values are omitted rather than passed as
 * undefined.
 *
 * NOTE(review): `modalTimeoutSeconds` is also passed as the sandbox client's
 * `timeoutMs` in modalProvider; confirm the same value is intended for the
 * ModalClient timeout here.
 *
 * @param settings - Source of the token id/secret, environment and timeout.
 * @returns The options object handed to `new ModalClient(...)`.
 */
+ function modalClientOptions(settings: Settings): ConstructorParameters<ModalModule["ModalClient"]>[0] {
110
+ return {
111
+ ...(settings.modalTokenId ? { tokenId: settings.modalTokenId } : {}),
112
+ ...(settings.modalTokenSecret ? { tokenSecret: settings.modalTokenSecret } : {}),
113
+ ...(settings.modalEnvironment ? { environment: settings.modalEnvironment } : {}),
114
// Seconds to milliseconds.
+ ...(settings.modalTimeoutSeconds ? { timeoutMs: settings.modalTimeoutSeconds * 1000 } : {}),
115
+ };
116
+ }
117
+
118
/**
 * Loads the `modal` package with a dynamic import and constructs a ModalClient from
 * modalClientOptions(settings). The callers in this file close the returned client in
 * a `finally` block.
 *
 * @param settings - Source of the client options.
 * @returns A new ModalClient instance.
 */
+ async function createModalClient(settings: Settings): Promise<ModalClientLike> {
119
+ const modal = await import("modal");
120
+ return new modal.ModalClient(modalClientOptions(settings));
121
+ }
122
+
123
/**
 * Applies the attribution tags to the Modal sandbox with the given id.
 *
 * @param settings - Used to build a short-lived Modal client.
 * @param sandboxId - Id of the sandbox to tag.
 * @param attribution - Ids turned into tags by modalSandboxAttributionTags.
 * @returns false when `sandboxId` is empty (no client is created); true once
 *   `setTags` has resolved.
 * @throws Whatever the sandbox lookup or `setTags` throws; nothing is caught here.
 */
+ export async function tagModalSandbox(
124
+ settings: Settings,
125
+ sandboxId: string,
126
+ attribution: ModalSandboxAttribution,
127
+ ): Promise<boolean> {
128
+ if (!sandboxId) {
129
+ return false;
130
+ }
131
+ const modal = await createModalClient(settings);
132
+ try {
133
+ const sandbox = await modal.sandboxes.fromId(sandboxId);
134
+ await sandbox.setTags(modalSandboxAttributionTags(attribution));
135
+ return true;
136
+ } finally {
137
// Close the client on both the success and the error path.
+ modal.close();
138
+ }
139
+ }
140
+
141
/**
 * Terminates the Modal sandbox with the given id.
 *
 * @param settings - Used to build a short-lived Modal client.
 * @param sandboxId - Id of the sandbox to terminate.
 * @returns true in every non-throwing case: immediately when `sandboxId` is empty
 *   (nothing to terminate, no client is created), otherwise once `terminate` has
 *   resolved. The visible code never returns false.
 * @throws Whatever the sandbox lookup or `terminate` throws; nothing is caught here.
 */
+ export async function terminateModalSandboxById(settings: Settings, sandboxId: string): Promise<boolean> {
142
+ if (!sandboxId) {
143
+ return true;
144
+ }
145
+ const modal = await createModalClient(settings);
146
+ try {
147
+ const sandbox = await modal.sandboxes.fromId(sandboxId);
148
+ await sandbox.terminate();
149
+ return true;
150
+ } finally {
151
// Close the client on both the success and the error path.
+ modal.close();
152
+ }
153
+ }
154
+
155
// The fields of a sandbox listing entry that the orphan sweep reads.
+ type ModalSandboxInfo = {
156
+ id: string;
157
// Creation time; sandboxCreatedAtMs accepts either epoch seconds or epoch milliseconds.
+ createdAt?: number;
158
// Tag pairs; entries with a non-string name or value are ignored by tagsFromInfo.
+ tags?: Array<{ tagName?: string; tagValue?: string }>;
159
+ };
160
+
161
// ModalClientLike widened with a `cpClient.sandboxList` method. The orphan sweep casts
// its client to this type in order to call the listing endpoint.
// NOTE(review): this only asserts the shape at compile time; presumably `cpClient` is
// an SDK member not exposed in the public typings — confirm against the installed
// `modal` version.
+ type ModalCpListClient = ModalClientLike & {
162
+ cpClient: {
163
+ sandboxList(input: {
164
+ appId?: string;
165
+ beforeTimestamp?: number;
166
+ environmentName?: string;
167
+ includeFinished?: boolean;
168
+ tags?: Array<{ tagName: string; tagValue: string }>;
169
+ }): Promise<{ sandboxes?: ModalSandboxInfo[] }>;
170
+ };
171
+ };
172
+
173
/**
 * Flattens a listing entry's tag pairs into a name -> value record.
 *
 * @param info - Listing entry; a missing `tags` array is treated as empty.
 * @returns A new record. Pairs whose name or value is not a string are dropped, and a
 *   repeated name keeps the value of its last occurrence.
 */
+ function tagsFromInfo(info: ModalSandboxInfo): Record<string, string> {
174
+ const tags: Record<string, string> = {};
175
+ for (const tag of info.tags ?? []) {
176
+ if (typeof tag.tagName === "string" && typeof tag.tagValue === "string") {
177
+ tags[tag.tagName] = tag.tagValue;
178
+ }
179
+ }
180
+ return tags;
181
+ }
182
+
183
/**
 * Normalises a listing entry's creation time to epoch milliseconds.
 *
 * @param info - Listing entry.
 * @returns The creation time in whole milliseconds, or null when `createdAt` is
 *   missing, not a finite number, or not positive.
 */
+ function sandboxCreatedAtMs(info: ModalSandboxInfo): number | null {
184
+ if (typeof info.createdAt !== "number" || !Number.isFinite(info.createdAt) || info.createdAt <= 0) {
185
+ return null;
186
+ }
187
+ // Modal protobuf timestamps in this SDK are seconds as doubles.
188
// Values below 10_000_000_000 are read as seconds and converted; anything larger is
// taken to be milliseconds already.
+ return info.createdAt < 10_000_000_000 ? Math.floor(info.createdAt * 1000) : Math.floor(info.createdAt);
189
+ }
190
+
191
/**
 * Builds the lookup key used to match listed sandboxes against live leases: the
 * workspace id, sandbox group id and lease id joined with ":".
 *
 * NOTE(review): ids that themselves contain ":" could produce colliding keys —
 * presumably the id format rules that out; confirm.
 */
+ function attributionKey(input: Pick<ModalSandboxAttribution, "leaseId" | "workspaceId" | "sandboxGroupId">): string {
192
+ return `${input.workspaceId}:${input.sandboxGroupId}:${input.leaseId}`;
193
+ }
194
+
195
/**
 * Lists the non-finished sandboxes of the configured Modal app and terminates those
 * judged to be orphans.
 *
 * A sandbox carrying all three attribution tags (lease, workspace, sandbox group) is
 * terminated as "stale_attribution" when no entry of `liveLeases` has the same three
 * ids, or when the matching lease has a truthy `instanceId` that differs from the
 * sandbox id. A sandbox missing any of those tags is terminated as "unattributed"
 * once it is at least `unattributedGraceMs` old; when its creation time cannot be
 * determined it is left alone.
 *
 * @param settings - Supplies `modalAppName`, the optional `modalEnvironment`, and the
 *   client options used when `options.client` is not given.
 * @param liveLeases - Leases to protect, indexed by workspace, sandbox group and lease id.
 * @param options - `now` overrides the clock (defaults to `Date.now()`);
 *   `maxTerminations` caps terminations per call (default 50); `unattributedGraceMs`
 *   sets the grace period for untagged sandboxes (default 30 minutes); `client`
 *   injects an existing Modal client, which this function does not close.
 * @returns Counts of listing entries examined and skipped, plus one record per
 *   terminated sandbox. A termination attempt that throws is counted as skipped and
 *   its error is discarded.
 * @throws Whatever the app lookup or the listing call throws; only per-sandbox
 *   termination errors are caught.
 */
+ export async function sweepModalOrphanSandboxes(
196
+ settings: Settings,
197
+ liveLeases: LiveModalSandboxLeaseAttribution[],
198
+ options: {
199
+ now?: Date;
200
+ maxTerminations?: number;
201
+ unattributedGraceMs?: number;
202
+ client?: ModalClientLike;
203
+ } = {},
204
+ ): Promise<ModalOrphanSweepResult> {
205
+ const nowMs = options.now?.getTime() ?? Date.now();
206
+ const maxTerminations = options.maxTerminations ?? MODAL_ORPHAN_SWEEP_LIMIT;
207
+ const unattributedGraceMs = options.unattributedGraceMs ?? MODAL_UNATTRIBUTED_ORPHAN_GRACE_MS;
208
+ const liveByAttribution = new Map(liveLeases.map((lease) => [attributionKey(lease), lease]));
209
// Create a client only when the caller did not inject one; only that owned client
// is closed in the `finally` below.
+ const ownedClient = options.client ? null : await createModalClient(settings);
210
// NOTE(review): the cast assumes the client exposes `cpClient.sandboxList` at
// runtime — confirm against the installed `modal` SDK.
+ const modal = (options.client ?? ownedClient)! as ModalCpListClient;
211
+ try {
212
+ const app = await modal.apps.fromName(settings.modalAppName, {
213
+ createIfMissing: false,
214
+ ...(settings.modalEnvironment ? { environment: settings.modalEnvironment } : {}),
215
+ });
216
+ const appId = app.appId;
217
// Without an app id there is nothing to list.
+ if (!appId) {
218
+ return { examined: 0, terminated: [], skipped: 0 };
219
+ }
220
+
221
+ let examined = 0;
222
+ let skipped = 0;
223
+ const terminated: ModalOrphanSweepTermination[] = [];
224
// Paging cursor: the creation timestamp of the last entry of the previous page.
+ let beforeTimestamp: number | undefined;
225
+ while (terminated.length < maxTerminations) {
226
+ const response = await modal.cpClient.sandboxList({
227
+ appId,
228
+ ...(beforeTimestamp !== undefined ? { beforeTimestamp } : {}),
229
+ includeFinished: false,
230
+ ...(settings.modalEnvironment ? { environmentName: settings.modalEnvironment } : {}),
231
+ tags: [],
232
+ });
233
+ const sandboxes = response.sandboxes ?? [];
234
// An empty page ends the sweep.
+ if (sandboxes.length === 0) {
235
+ break;
236
+ }
237
+ for (const info of sandboxes) {
238
+ examined += 1;
239
+ const tags = tagsFromInfo(info);
240
+ const leaseId = tags.opengeni_lease_id;
241
+ const workspaceId = tags.opengeni_workspace_id;
242
+ const sandboxGroupId = tags.opengeni_sandbox_group_id;
243
+ let reason: ModalOrphanSweepTermination["reason"] | null = null;
244
+ if (leaseId && workspaceId && sandboxGroupId) {
245
+ const live = liveByAttribution.get(attributionKey({ leaseId, workspaceId, sandboxGroupId }));
246
// Orphaned when no live lease matches, or the matching lease is bound to a
// different sandbox instance. No grace period applies on this branch.
+ if (!live || (live.instanceId && live.instanceId !== info.id)) {
247
+ reason = "stale_attribution";
248
+ }
249
+ } else {
250
+ const createdAtMs = sandboxCreatedAtMs(info);
251
// An unknown creation time (null) never qualifies, so such sandboxes are kept.
+ if (createdAtMs !== null && nowMs - createdAtMs >= unattributedGraceMs) {
252
+ reason = "unattributed";
253
+ }
254
+ }
255
+
256
+ if (!reason) {
257
+ skipped += 1;
258
+ continue;
259
+ }
260
// Best effort: a failed termination is counted as skipped and the sweep continues.
+ try {
261
+ const sandbox = await modal.sandboxes.fromId(info.id);
262
+ await sandbox.terminate();
263
+ terminated.push({ sandboxId: info.id, reason, tags });
264
+ } catch {
265
+ skipped += 1;
266
+ }
267
// Stop mid-page once the cap is reached; the outer loop condition then ends the sweep.
+ if (terminated.length >= maxTerminations) {
268
+ break;
269
+ }
270
+ }
271
// NOTE(review): assumes the listing treats `beforeTimestamp` as exclusive and that
// the last entry of a page is its oldest; otherwise a page could be requested
// again — confirm against the listing API.
+ beforeTimestamp = sandboxes[sandboxes.length - 1]?.createdAt;
272
// Without a timestamp on the last entry there is no cursor to continue from.
+ if (beforeTimestamp === undefined) {
273
+ break;
274
+ }
275
+ }
276
+ return { examined, terminated, skipped };
277
+ } finally {
278
// An injected client is left open for its owner.
+ ownedClient?.close();
279
+ }
280
+ }
@@ -74,7 +74,7 @@ export interface RoutableBackendSession {
74
74
  // `event` is kept `unknown` (mirroring the interface's structural style + avoiding
75
75
  // a proto import into the leaf); the SelfhostedSession takes `DesktopInputRequest["event"]`.
76
76
  desktopInput?(event: unknown): Promise<void>;
77
- screenshot?(): Promise<{ png: Uint8Array; width: number; height: number }>;
77
+ screenshot?(): Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
78
78
  }
79
79
 
80
80
  /** The resolved active backend for an epoch: the live session + the sandbox id it
@@ -189,7 +189,7 @@ export class RoutingSandboxSession implements RoutableBackendSession {
189
189
  // that cannot serve them. So the constructor assigns them ONLY when the
190
190
  // construction-time default backend actually implements the native surface (below).
191
191
  desktopInput?: (event: unknown) => Promise<void>;
192
- screenshot?: () => Promise<{ png: Uint8Array; width: number; height: number }>;
192
+ screenshot?: () => Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
193
193
 
194
194
  constructor(deps: RoutingSandboxSessionDeps) {
195
195
  this.deps = deps;
@@ -545,16 +545,32 @@ export class SelfhostedSession {
545
545
  /** Computer-use VIEW op: capture a single PNG screenshot of the machine's desktop
546
546
  * plus its geometry (via ScreenCaptureKit / x11). NOT consent-gated (a view op —
547
547
  * the view/control decoupling), so it works with a display but no screen-control
548
- * consent. Returns the raw encoded bytes + width/height. */
549
- async screenshot(): Promise<{ png: Uint8Array; width: number; height: number }> {
548
+ * consent. Returns the raw encoded bytes + the ENCODED width/height, plus the
549
+ * NATIVE (pre-downscale) geometry: when the agent had to downscale the PNG to fit
550
+ * the transport's max payload, `nativeWidth`/`nativeHeight` carry the original
551
+ * capture size so the computer-use layer can scale model clicks (in encoded-pixel
552
+ * space) back to native pixels. An older agent leaves them 0 → read as "same as
553
+ * width/height" (no downscale). */
554
+ async screenshot(): Promise<{
555
+ png: Uint8Array;
556
+ width: number;
557
+ height: number;
558
+ nativeWidth: number;
559
+ nativeHeight: number;
560
+ }> {
550
561
  const result = await this.call({ $case: "desktopScreenshot", desktopScreenshot: {} });
551
562
  if (result.$case !== "desktopScreenshot") {
552
563
  throw new Error(`selfhosted screenshot: unexpected result ${result.$case}`);
553
564
  }
565
+ const s = result.desktopScreenshot;
566
+ // Back-compat: an agent predating the native-geometry fields sends 0 → treat the
567
+ // encoded geometry AS the native geometry (scale factor 1.0, no coordinate shift).
554
568
  return {
555
- png: result.desktopScreenshot.png,
556
- width: result.desktopScreenshot.width,
557
- height: result.desktopScreenshot.height,
569
+ png: s.png,
570
+ width: s.width,
571
+ height: s.height,
572
+ nativeWidth: s.nativeWidth || s.width,
573
+ nativeHeight: s.nativeHeight || s.height,
558
574
  };
559
575
  }
560
576
 
@@ -385,7 +385,11 @@ export class SandboxComputer implements Computer {
385
385
  * leaf; the duck-typed `isNativeDesktopSession` probe (below) selects on it. */
386
386
  export type NativeDesktopSession = {
387
387
  desktopInput(event: DesktopInputRequest["event"]): Promise<void>;
388
- screenshot(): Promise<{ png: Uint8Array; width: number; height: number }>;
388
+ // `nativeWidth`/`nativeHeight` are the PRE-DOWNSCALE capture geometry (equal to
389
+ // width/height when the agent did not have to shrink the PNG to fit the transport
390
+ // budget). NativeDesktopComputer scales model clicks from the ENCODED pixel space
391
+ // back to native pixels using their ratio before injecting.
392
+ screenshot(): Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
389
393
  };
390
394
 
391
395
  /** Model `Button` → wire `PointerButton`. The proto has no back/forward button, so
@@ -424,6 +428,15 @@ export class NativeDesktopComputer implements Computer {
424
428
  readonly dimensions: [number, number];
425
429
  private session: NativeDesktopSession;
426
430
  private readonly readOnly: boolean;
431
+ // The ENCODED vs NATIVE geometry of the MOST RECENT screenshot the model saw. The
432
+ // model computes click coordinates in the encoded-pixel space of that screenshot;
433
+ // when the agent downscaled the PNG to fit the transport budget, encoded < native,
434
+ // so we scale coordinates back up to native pixels before injecting (the agent's
435
+ // native inject — macOS CGEvent / Linux XTEST — expects native-pixel coordinates,
436
+ // exactly as it received them pre-downscale). Null until the first screenshot;
437
+ // equal encoded==native (or absent) ⇒ scale factor 1.0 ⇒ byte-identical behavior.
438
+ private lastEncoded: [number, number] | null = null;
439
+ private lastNative: [number, number] | null = null;
427
440
 
428
441
  constructor(session: NativeDesktopSession, opts: NativeDesktopComputerOptions = {}) {
429
442
  this.session = session;
@@ -441,17 +454,34 @@ export class NativeDesktopComputer implements Computer {
441
454
  if (this.readOnly) throw new ComputerReadOnlyError();
442
455
  }
443
456
 
457
+ /** Scale a coordinate the model expressed in the MOST RECENT screenshot's
458
+ * ENCODED pixel space back to NATIVE pixels. When the last frame was not
459
+ * downscaled (encoded == native), or no screenshot has been taken yet, this is a
460
+ * 1:1 identity — the byte-identical current behavior. The agent then applies its
461
+ * own platform mapping (macOS divides native pixels by the backing scale to reach
462
+ * CGEvent points; Linux XTEST is 1:1) exactly as it did pre-downscale. */
463
+ private toNative(x: number, y: number): { x: number; y: number } {
464
+ const enc = this.lastEncoded;
465
+ const nat = this.lastNative;
466
// No geometry recorded yet, or a non-positive encoded size that cannot serve as a
// divisor below: pass the coordinates through unchanged.
+ if (!enc || !nat || enc[0] <= 0 || enc[1] <= 0) return { x, y };
467
// Encoded and native sizes are equal, so the scale factor is exactly 1.
+ if (enc[0] === nat[0] && enc[1] === nat[1]) return { x, y };
468
// Scale each axis independently by native/encoded and round to a whole pixel.
+ return {
469
+ x: Math.round((x * nat[0]) / enc[0]),
470
+ y: Math.round((y * nat[1]) / enc[1]),
471
+ };
472
+ }
473
+
444
474
  private async pointer(x: number, y: number, action: PointerAction, button: PointerButton): Promise<void> {
445
- // COORDINATE SEAM — TODO(verify e2e on macOS): the model computes x/y against the
446
- // pixels of the screenshot it just saw, and the agent's macOS CGEvent inject
447
- // treats x/y as raw screen coordinates. On a Retina Mac, ScreenCaptureKit may
448
- // capture at the logical POINT space while CGEvent expects logical points — a
449
- // potential mismatch between the coords the model derives and the coords the
450
- // inject applies. This MUST be measured on a real Retina Mac (compare the
451
- // screenshot's reported width/height against the logical display bounds) before
452
- // any DPR scaling is added. Do NOT add scaling speculatively. Self-hosted Linux
453
- // (XTEST/x11) is 1:1 and unaffected.
454
- await this.session.desktopInput({ $case: "pointer", pointer: { x, y, action, button } });
475
+ // COORDINATE SEAM: the model computes x/y against the pixels of the screenshot it
476
+ // just saw — which the agent may have DOWNSCALED to fit the transport's max
477
+ // payload (a full-res Retina/busy screen exceeds NATS's 1 MiB default). We scale
478
+ // those encoded-pixel coordinates back to native pixels here, using the native
479
+ // geometry the last screenshot reported, so the agent's native inject lands the
480
+ // click where the model intended. When no downscale occurred the factor is 1.0
481
+ // and the coordinates pass through unchanged. Self-hosted Linux (XTEST/x11) and
482
+ // any non-downscaled frame are 1:1 and unaffected.
483
+ const n = this.toNative(x, y);
484
+ await this.session.desktopInput({ $case: "pointer", pointer: { x: n.x, y: n.y, action, button } });
455
485
  }
456
486
 
457
487
  async screenshot(): Promise<string> {
@@ -461,10 +491,14 @@ export class NativeDesktopComputer implements Computer {
461
491
  // missing/empty frame is therefore a THROW, never a silent "". Native capture
462
492
  // (ScreenCaptureKit / x11) does not have the cold-scrot warm-up the xdotool path
463
493
  // retries around, so a single capture + a hard empty-guard is sufficient.
464
- const { png } = await this.session.screenshot();
494
+ const { png, width, height, nativeWidth, nativeHeight } = await this.session.screenshot();
465
495
  if (png.length === 0) {
466
496
  throw new ComputerUnavailableError("native desktop screenshot returned an empty frame (display not up?)");
467
497
  }
498
+ // Record the encoded (what the model sees) vs native geometry of THIS frame so
499
+ // the next click/move/scroll/drag scales its coordinates back to native pixels.
500
+ this.lastEncoded = [width, height];
501
+ this.lastNative = [nativeWidth || width, nativeHeight || height];
468
502
  return Buffer.from(png).toString("base64");
469
503
  }
470
504
 
@@ -482,11 +516,12 @@ export class NativeDesktopComputer implements Computer {
482
516
  }
483
517
  async scroll(x: number, y: number, sx: number, sy: number) {
484
518
  this.guardWrite();
485
- // The model's scroll deltas are PIXELS — forward them straight to the agent as a
486
- // ScrollEvent{x,y,deltaX,deltaY} and let the native inject translate to wheel
487
- // events per platform. No xdotool "notch" quantization here (that is an
488
- // xdotool-specific artifact); the agent owns the platform-appropriate scaling.
489
- await this.session.desktopInput({ $case: "scroll", scroll: { x, y, deltaX: sx, deltaY: sy } });
519
+ // The scroll ANCHOR (x,y) is a screenshot-pixel position — scale to native like a
520
+ // click. The deltas (sx,sy) are relative scroll AMOUNTS, not positions; the agent
521
+ // owns the platform-appropriate wheel translation, so they pass through unscaled
522
+ // (no xdotool "notch" quantization here — that is an xdotool-specific artifact).
523
+ const n = this.toNative(x, y);
524
+ await this.session.desktopInput({ $case: "scroll", scroll: { x: n.x, y: n.y, deltaX: sx, deltaY: sy } });
490
525
  }
491
526
  async type(text: string) {
492
527
  this.guardWrite();