@opengeni/runtime 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-KNW7AMQB.js → chunk-D5KU3QUC.js} +231 -21
- package/dist/chunk-D5KU3QUC.js.map +1 -0
- package/dist/index.d.ts +18 -2
- package/dist/index.js +82 -6
- package/dist/index.js.map +1 -1
- package/dist/sandbox/index.d.ts +48 -2
- package/dist/sandbox/index.js +11 -1
- package/package.json +3 -3
- package/src/index.ts +66 -4
- package/src/sandbox/display-stack.ts +47 -12
- package/src/sandbox/index.ts +72 -12
- package/src/sandbox/providers/modal.ts +225 -0
- package/src/sandbox/routing/routing-session.ts +2 -2
- package/src/sandbox/selfhosted/session.ts +21 -5
- package/src/sandbox-computer.ts +52 -17
- package/dist/chunk-KNW7AMQB.js.map +0 -1
|
@@ -1,9 +1,53 @@
|
|
|
1
1
|
import { ModalImageSelector, ModalSandboxClient } from "@openai/agents-extensions/sandbox/modal";
|
|
2
2
|
import { effectiveModalIdleTimeoutSeconds } from "@opengeni/config";
|
|
3
|
+
import type { Settings } from "@opengeni/config";
|
|
3
4
|
import { CAPABILITY_DESCRIPTORS } from "../capabilities";
|
|
4
5
|
import { SandboxConfigError } from "../errors";
|
|
5
6
|
import type { ProviderRegistration } from "./types";
|
|
6
7
|
|
|
8
|
+
/** Default cap on how many sandboxes one orphan sweep terminates when the caller passes no `maxTerminations`. */
const MODAL_ORPHAN_SWEEP_LIMIT = 50;
/** Default minimum age, in milliseconds (30 minutes), before a sandbox lacking attribution tags is swept. */
const MODAL_UNATTRIBUTED_ORPHAN_GRACE_MS = 30 * 60 * 1000;
|
|
10
|
+
|
|
11
|
+
/**
 * The three identifiers that attribute a Modal sandbox to an OpenGeni lease.
 * They are written out as environment variables and as sandbox tags by the
 * `modalSandboxAttribution*` helpers.
 */
export type ModalSandboxAttribution = {
  /** Lease identifier (`OPENGENI_SANDBOX_LEASE_ID` / `opengeni_lease_id`). */
  leaseId: string;
  /** Workspace identifier (`OPENGENI_WORKSPACE_ID` / `opengeni_workspace_id`). */
  workspaceId: string;
  /** Sandbox group identifier (`OPENGENI_SANDBOX_GROUP_ID` / `opengeni_sandbox_group_id`). */
  sandboxGroupId: string;
};
|
|
16
|
+
|
|
17
|
+
/**
 * Attribution of a lease that is still considered live, as supplied to the
 * orphan sweep so it can tell which tagged sandboxes are still accounted for.
 */
export type LiveModalSandboxLeaseAttribution = ModalSandboxAttribution & {
  /**
   * Sandbox id the lease currently points at, or `null` when none is recorded.
   * A non-null value that differs from a listed sandbox's id marks that sandbox as stale.
   */
  instanceId: string | null;
  /** Optional liveness label. NOTE(review): not read by the sweep shown here — confirm its consumers. */
  liveness?: string;
};
|
|
21
|
+
|
|
22
|
+
/** Record of one sandbox that an orphan sweep terminated. */
export type ModalOrphanSweepTermination = {
  /** Id of the terminated Modal sandbox. */
  sandboxId: string;
  /**
   * Why it was terminated: `stale_attribution` when its attribution tags match no live
   * lease (or a lease bound to another sandbox), `unattributed` when tags were missing
   * and it had outlived the grace period.
   */
  reason: "stale_attribution" | "unattributed";
  /** The sandbox's tags as a name → value map, as seen when it was listed. */
  tags: Record<string, string>;
};
|
|
27
|
+
|
|
28
|
+
/** Summary returned by an orphan sweep. */
export type ModalOrphanSweepResult = {
  /** Number of listed sandboxes the sweep looked at. */
  examined: number;
  /** One entry per sandbox that was successfully terminated. */
  terminated: ModalOrphanSweepTermination[];
  /** Sandboxes left alone, plus those whose termination attempt threw. */
  skipped: number;
};
|
|
33
|
+
|
|
34
|
+
/**
 * Maps a sandbox attribution onto the `OPENGENI_*` environment variable names.
 *
 * @param input Lease, group and workspace identifiers.
 * @returns An object with exactly the three attribution environment variables.
 */
export function modalSandboxAttributionEnvironment(input: ModalSandboxAttribution): Record<string, string> {
  const { leaseId, sandboxGroupId, workspaceId } = input;
  const environment: Record<string, string> = {
    OPENGENI_SANDBOX_LEASE_ID: leaseId,
    OPENGENI_SANDBOX_GROUP_ID: sandboxGroupId,
    OPENGENI_WORKSPACE_ID: workspaceId,
  };
  return environment;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Builds the tag set stamped on a Modal sandbox: a constant `opengeni: "true"` marker
 * plus the lease, workspace and sandbox-group identifiers.
 *
 * @param input Lease, workspace and group identifiers.
 * @returns Tag name → tag value map.
 */
export function modalSandboxAttributionTags(input: ModalSandboxAttribution): Record<string, string> {
  const { leaseId, workspaceId, sandboxGroupId } = input;
  const tags: Record<string, string> = {
    opengeni: "true",
    opengeni_lease_id: leaseId,
    opengeni_workspace_id: workspaceId,
    opengeni_sandbox_group_id: sandboxGroupId,
  };
  return tags;
}
|
|
50
|
+
|
|
7
51
|
export const modalProvider: ProviderRegistration = {
|
|
8
52
|
backend: "modal",
|
|
9
53
|
descriptor: CAPABILITY_DESCRIPTORS.modal,
|
|
@@ -23,8 +67,13 @@ export const modalProvider: ProviderRegistration = {
|
|
|
23
67
|
const options: NonNullable<ConstructorParameters<typeof ModalSandboxClient>[0]> = {
|
|
24
68
|
appName: settings.modalAppName,
|
|
25
69
|
timeoutMs: settings.modalTimeoutSeconds * 1000,
|
|
70
|
+
sandboxCreateTimeoutS: Math.ceil(settings.sandboxWarmingTimeoutMs / 1000),
|
|
26
71
|
exposedPorts,
|
|
27
72
|
env: environment,
|
|
73
|
+
// The Modal JS SDK's sandbox default command already sleeps until timeout
|
|
74
|
+
// or explicit termination. Do not let the Agents extension stamp a separate
|
|
75
|
+
// hardcoded sleep command; OPENGENI_MODAL_TIMEOUT_SECONDS owns lifetime.
|
|
76
|
+
useSleepCmd: false,
|
|
28
77
|
};
|
|
29
78
|
// gap-fill (module 03 §4.1): these SDK options were previously unmapped.
|
|
30
79
|
// ALWAYS pin idleTimeoutMs (sandbox-file-persistence): an UNSET idle timeout
|
|
@@ -53,3 +102,179 @@ export const modalProvider: ProviderRegistration = {
|
|
|
53
102
|
return new ModalSandboxClient(options);
|
|
54
103
|
},
|
|
55
104
|
};
|
|
105
|
+
|
|
106
|
+
/** Shape of the `modal` SDK module, which this file loads lazily via `import("modal")`. */
type ModalModule = typeof import("modal");
/** Instance type of the SDK's `ModalClient` class. */
type ModalClientLike = InstanceType<ModalModule["ModalClient"]>;
|
|
108
|
+
|
|
109
|
+
/**
 * Translates settings into `ModalClient` constructor options.
 * Each option is included only when the corresponding setting is truthy, so an
 * empty string or a timeout of 0 leaves that option out entirely.
 *
 * @param settings Runtime settings carrying the Modal credentials, environment and timeout.
 * @returns Options object; `timeoutMs` is `modalTimeoutSeconds` converted to milliseconds.
 */
function modalClientOptions(settings: Settings): ConstructorParameters<ModalModule["ModalClient"]>[0] {
  const { modalTokenId, modalTokenSecret, modalEnvironment, modalTimeoutSeconds } = settings;
  return {
    ...(modalTokenId ? { tokenId: modalTokenId } : {}),
    ...(modalTokenSecret ? { tokenSecret: modalTokenSecret } : {}),
    ...(modalEnvironment ? { environment: modalEnvironment } : {}),
    ...(modalTimeoutSeconds ? { timeoutMs: modalTimeoutSeconds * 1000 } : {}),
  };
}
|
|
117
|
+
|
|
118
|
+
/**
 * Lazily loads the `modal` SDK and constructs a client from the given settings.
 * The caller is responsible for closing the returned client.
 */
async function createModalClient(settings: Settings): Promise<ModalClientLike> {
  const { ModalClient } = await import("modal");
  const clientOptions = modalClientOptions(settings);
  return new ModalClient(clientOptions);
}
|
|
122
|
+
|
|
123
|
+
/**
 * Stamps the OpenGeni attribution tags onto an existing Modal sandbox.
 *
 * @param settings Settings used to build a short-lived Modal client.
 * @param sandboxId Id of the sandbox to tag; an empty id is a no-op.
 * @param attribution Lease/workspace/group identifiers to write as tags.
 * @returns `false` when `sandboxId` is empty, `true` once the tags were set.
 *          Errors from the lookup or from `setTags` propagate to the caller.
 */
export async function tagModalSandbox(
  settings: Settings,
  sandboxId: string,
  attribution: ModalSandboxAttribution,
): Promise<boolean> {
  if (!sandboxId) return false;

  const client = await createModalClient(settings);
  try {
    const target = await client.sandboxes.fromId(sandboxId);
    const tags = modalSandboxAttributionTags(attribution);
    await target.setTags(tags);
    return true;
  } finally {
    // The client was created for this call only, so always release it.
    client.close();
  }
}
|
|
140
|
+
|
|
141
|
+
/**
 * Terminates a Modal sandbox identified by id.
 *
 * @param settings Settings used to build a short-lived Modal client.
 * @param sandboxId Id of the sandbox to terminate; an empty id is treated as already done.
 * @returns `true` both for an empty id and after a successful terminate call.
 *          Errors from the lookup or from `terminate` propagate to the caller.
 */
export async function terminateModalSandboxById(settings: Settings, sandboxId: string): Promise<boolean> {
  if (!sandboxId) return true;

  const client = await createModalClient(settings);
  try {
    const target = await client.sandboxes.fromId(sandboxId);
    await target.terminate();
    return true;
  } finally {
    // The client was created for this call only, so always release it.
    client.close();
  }
}
|
|
154
|
+
|
|
155
|
+
/** The subset of a listed sandbox's fields that the orphan sweep reads. */
type ModalSandboxInfo = {
  /** Sandbox id, usable with `sandboxes.fromId`. */
  id: string;
  /** Creation timestamp; see `sandboxCreatedAtMs` for how seconds vs. milliseconds are told apart. */
  createdAt?: number;
  /** Raw tag entries; entries missing a string name or value are ignored when flattened. */
  tags?: { tagName?: string; tagValue?: string }[];
};
|
|
160
|
+
|
|
161
|
+
/**
 * A Modal client viewed together with the `cpClient.sandboxList` call the sweep relies on.
 * NOTE(review): the sweep reaches this member through a type cast, so it is presumably not
 * part of the SDK's declared client surface — confirm against the installed `modal` version.
 */
type ModalCpListClient = ModalClientLike & {
  cpClient: {
    /** Lists sandboxes, optionally filtered by app, environment, tags and a creation-time cursor. */
    sandboxList(input: {
      appId?: string;
      beforeTimestamp?: number;
      environmentName?: string;
      includeFinished?: boolean;
      tags?: { tagName: string; tagValue: string }[];
    }): Promise<{ sandboxes?: ModalSandboxInfo[] }>;
  };
};
|
|
172
|
+
|
|
173
|
+
/**
 * Flattens a listed sandbox's tag entries into a name → value map.
 * Entries whose name or value is not a string are dropped; a later entry with
 * the same name overwrites an earlier one.
 */
function tagsFromInfo(info: ModalSandboxInfo): Record<string, string> {
  const collected: Record<string, string> = {};
  const entries = info.tags ?? [];
  entries.forEach((entry) => {
    const name = entry.tagName;
    const value = entry.tagValue;
    if (typeof name !== "string" || typeof value !== "string") {
      return;
    }
    collected[name] = value;
  });
  return collected;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Normalizes a listed sandbox's `createdAt` to whole epoch milliseconds.
 *
 * @returns `null` when `createdAt` is missing, non-finite, zero or negative;
 *          otherwise the timestamp in milliseconds, rounded down.
 */
function sandboxCreatedAtMs(info: ModalSandboxInfo): number | null {
  const raw = info.createdAt;
  if (typeof raw !== "number") return null;
  if (!Number.isFinite(raw) || raw <= 0) return null;

  // Modal protobuf timestamps in this SDK are seconds as doubles. Anything below
  // 1e10 is read as seconds; larger values are taken to already be milliseconds.
  const secondsCeiling = 10_000_000_000;
  const millis = raw < secondsCeiling ? raw * 1000 : raw;
  return Math.floor(millis);
}
|
|
190
|
+
|
|
191
|
+
/** Builds the `workspace:group:lease` lookup key used to match sandbox tags against live leases. */
function attributionKey(input: Pick<ModalSandboxAttribution, "leaseId" | "workspaceId" | "sandboxGroupId">): string {
  const { workspaceId, sandboxGroupId, leaseId } = input;
  return `${workspaceId}:${sandboxGroupId}:${leaseId}`;
}
|
|
194
|
+
|
|
195
|
+
/**
 * Terminates running Modal sandboxes in the configured app that no live lease accounts for.
 *
 * Running sandboxes are listed page by page, paging backwards on `createdAt`, and each
 * one is classified:
 * - `stale_attribution`: it carries all three OpenGeni attribution tags, but either no live
 *   lease matches them or the matching lease records a different, non-null `instanceId`.
 * - `unattributed`: at least one attribution tag is missing and the sandbox is at least
 *   `unattributedGraceMs` old. A sandbox without a usable creation time is never swept
 *   on this basis.
 *
 * Termination is best-effort: when looking up or terminating a sandbox throws, the sandbox
 * is counted as skipped and the sweep moves on. Errors from resolving the app or from
 * listing sandboxes propagate to the caller.
 *
 * @param settings Runtime settings (Modal app name, environment, credentials).
 * @param liveLeases Leases that are still live; sandboxes matching them are kept.
 * @param options.now Clock override used for the grace-period comparison (defaults to now).
 * @param options.maxTerminations Stop after this many successful terminations.
 * @param options.unattributedGraceMs Minimum age before an untagged sandbox is swept.
 * @param options.client Pre-built Modal client; when given it is used as-is and NOT closed here.
 * @returns How many sandboxes were examined, which were terminated, and how many were skipped.
 */
export async function sweepModalOrphanSandboxes(
  settings: Settings,
  liveLeases: LiveModalSandboxLeaseAttribution[],
  options: {
    now?: Date;
    maxTerminations?: number;
    unattributedGraceMs?: number;
    client?: ModalClientLike;
  } = {},
): Promise<ModalOrphanSweepResult> {
  const nowMs = options.now?.getTime() ?? Date.now();
  const maxTerminations = options.maxTerminations ?? MODAL_ORPHAN_SWEEP_LIMIT;
  const unattributedGraceMs = options.unattributedGraceMs ?? MODAL_UNATTRIBUTED_ORPHAN_GRACE_MS;
  const liveByAttribution = new Map(liveLeases.map((lease) => [attributionKey(lease), lease]));

  // Only a client created here is closed here; an injected client stays under the caller's control.
  let ownedClient: ModalClientLike | undefined;
  let client = options.client;
  if (!client) {
    ownedClient = await createModalClient(settings);
    client = ownedClient;
  }
  // `cpClient.sandboxList` is reached through a widened view of the client type.
  const modal = client as ModalCpListClient;

  try {
    const app = await modal.apps.fromName(settings.modalAppName, {
      createIfMissing: false,
      ...(settings.modalEnvironment ? { environment: settings.modalEnvironment } : {}),
    });
    const appId = app.appId;
    if (!appId) {
      return { examined: 0, terminated: [], skipped: 0 };
    }

    let examined = 0;
    let skipped = 0;
    const terminated: ModalOrphanSweepTermination[] = [];
    // Ids already handled, so a sandbox that shows up on two pages is neither counted
    // twice nor terminated twice.
    const seenIds = new Set<string>();
    let beforeTimestamp: number | undefined;

    while (terminated.length < maxTerminations) {
      const response = await modal.cpClient.sandboxList({
        appId,
        ...(beforeTimestamp !== undefined ? { beforeTimestamp } : {}),
        includeFinished: false,
        ...(settings.modalEnvironment ? { environmentName: settings.modalEnvironment } : {}),
        tags: [],
      });
      const sandboxes = response.sandboxes ?? [];
      if (sandboxes.length === 0) {
        break;
      }

      for (const info of sandboxes) {
        if (terminated.length >= maxTerminations) {
          break;
        }
        if (seenIds.has(info.id)) {
          continue;
        }
        seenIds.add(info.id);
        examined += 1;

        const tags = tagsFromInfo(info);
        const leaseId = tags.opengeni_lease_id;
        const workspaceId = tags.opengeni_workspace_id;
        const sandboxGroupId = tags.opengeni_sandbox_group_id;

        let reason: ModalOrphanSweepTermination["reason"] | null = null;
        if (leaseId && workspaceId && sandboxGroupId) {
          const live = liveByAttribution.get(attributionKey({ leaseId, workspaceId, sandboxGroupId }));
          // A lease with a null instanceId has no recorded sandbox yet, so it cannot contradict this one.
          if (!live || (live.instanceId && live.instanceId !== info.id)) {
            reason = "stale_attribution";
          }
        } else {
          const createdAtMs = sandboxCreatedAtMs(info);
          if (createdAtMs !== null && nowMs - createdAtMs >= unattributedGraceMs) {
            reason = "unattributed";
          }
        }

        if (!reason) {
          skipped += 1;
          continue;
        }
        try {
          const sandbox = await modal.sandboxes.fromId(info.id);
          await sandbox.terminate();
          terminated.push({ sandboxId: info.id, reason, tags });
        } catch {
          // Best-effort: a sandbox that cannot be looked up or terminated is left for a later sweep.
          skipped += 1;
        }
      }

      // Advance the cursor to the last listed sandbox's creation time. Stop instead of
      // re-requesting when the cursor is unusable (missing, non-finite, or <= 0, which a
      // list request cannot distinguish from "no cursor") or fails to move strictly
      // backwards — otherwise the same page would be fetched forever.
      const pageCursor = sandboxes[sandboxes.length - 1]?.createdAt;
      if (typeof pageCursor !== "number" || !Number.isFinite(pageCursor) || pageCursor <= 0) {
        break;
      }
      if (beforeTimestamp !== undefined && pageCursor >= beforeTimestamp) {
        break;
      }
      beforeTimestamp = pageCursor;
    }
    return { examined, terminated, skipped };
  } finally {
    ownedClient?.close();
  }
}
|
|
@@ -74,7 +74,7 @@ export interface RoutableBackendSession {
|
|
|
74
74
|
// `event` is kept `unknown` (mirroring the interface's structural style + avoiding
|
|
75
75
|
// a proto import into the leaf); the SelfhostedSession takes `DesktopInputRequest["event"]`.
|
|
76
76
|
desktopInput?(event: unknown): Promise<void>;
|
|
77
|
-
screenshot?(): Promise<{ png: Uint8Array; width: number; height: number }>;
|
|
77
|
+
screenshot?(): Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
|
|
78
78
|
}
|
|
79
79
|
|
|
80
80
|
/** The resolved active backend for an epoch: the live session + the sandbox id it
|
|
@@ -189,7 +189,7 @@ export class RoutingSandboxSession implements RoutableBackendSession {
|
|
|
189
189
|
// that cannot serve them. So the constructor assigns them ONLY when the
|
|
190
190
|
// construction-time default backend actually implements the native surface (below).
|
|
191
191
|
desktopInput?: (event: unknown) => Promise<void>;
|
|
192
|
-
screenshot?: () => Promise<{ png: Uint8Array; width: number; height: number }>;
|
|
192
|
+
screenshot?: () => Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
|
|
193
193
|
|
|
194
194
|
constructor(deps: RoutingSandboxSessionDeps) {
|
|
195
195
|
this.deps = deps;
|
|
@@ -545,16 +545,32 @@ export class SelfhostedSession {
|
|
|
545
545
|
/** Computer-use VIEW op: capture a single PNG screenshot of the machine's desktop
|
|
546
546
|
* plus its geometry (via ScreenCaptureKit / x11). NOT consent-gated (a view op —
|
|
547
547
|
* the view/control decoupling), so it works with a display but no screen-control
|
|
548
|
-
* consent. Returns the raw encoded bytes + width/height
|
|
549
|
-
|
|
548
|
+
* consent. Returns the raw encoded bytes + the ENCODED width/height, plus the
|
|
549
|
+
* NATIVE (pre-downscale) geometry: when the agent had to downscale the PNG to fit
|
|
550
|
+
* the transport's max payload, `nativeWidth`/`nativeHeight` carry the original
|
|
551
|
+
* capture size so the computer-use layer can scale model clicks (in encoded-pixel
|
|
552
|
+
* space) back to native pixels. An older agent leaves them 0 → read as "same as
|
|
553
|
+
* width/height" (no downscale). */
|
|
554
|
+
async screenshot(): Promise<{
|
|
555
|
+
png: Uint8Array;
|
|
556
|
+
width: number;
|
|
557
|
+
height: number;
|
|
558
|
+
nativeWidth: number;
|
|
559
|
+
nativeHeight: number;
|
|
560
|
+
}> {
|
|
550
561
|
const result = await this.call({ $case: "desktopScreenshot", desktopScreenshot: {} });
|
|
551
562
|
if (result.$case !== "desktopScreenshot") {
|
|
552
563
|
throw new Error(`selfhosted screenshot: unexpected result ${result.$case}`);
|
|
553
564
|
}
|
|
565
|
+
const s = result.desktopScreenshot;
|
|
566
|
+
// Back-compat: an agent predating the native-geometry fields sends 0 → treat the
|
|
567
|
+
// encoded geometry AS the native geometry (scale factor 1.0, no coordinate shift).
|
|
554
568
|
return {
|
|
555
|
-
png:
|
|
556
|
-
width:
|
|
557
|
-
height:
|
|
569
|
+
png: s.png,
|
|
570
|
+
width: s.width,
|
|
571
|
+
height: s.height,
|
|
572
|
+
nativeWidth: s.nativeWidth || s.width,
|
|
573
|
+
nativeHeight: s.nativeHeight || s.height,
|
|
558
574
|
};
|
|
559
575
|
}
|
|
560
576
|
|
package/src/sandbox-computer.ts
CHANGED
|
@@ -385,7 +385,11 @@ export class SandboxComputer implements Computer {
|
|
|
385
385
|
* leaf; the duck-typed `isNativeDesktopSession` probe (below) selects on it. */
|
|
386
386
|
export type NativeDesktopSession = {
|
|
387
387
|
desktopInput(event: DesktopInputRequest["event"]): Promise<void>;
|
|
388
|
-
|
|
388
|
+
// `nativeWidth`/`nativeHeight` are the PRE-DOWNSCALE capture geometry (equal to
|
|
389
|
+
// width/height when the agent did not have to shrink the PNG to fit the transport
|
|
390
|
+
// budget). NativeDesktopComputer scales model clicks from the ENCODED pixel space
|
|
391
|
+
// back to native pixels using their ratio before injecting.
|
|
392
|
+
screenshot(): Promise<{ png: Uint8Array; width: number; height: number; nativeWidth: number; nativeHeight: number }>;
|
|
389
393
|
};
|
|
390
394
|
|
|
391
395
|
/** Model `Button` → wire `PointerButton`. The proto has no back/forward button, so
|
|
@@ -424,6 +428,15 @@ export class NativeDesktopComputer implements Computer {
|
|
|
424
428
|
readonly dimensions: [number, number];
|
|
425
429
|
private session: NativeDesktopSession;
|
|
426
430
|
private readonly readOnly: boolean;
|
|
431
|
+
// The ENCODED vs NATIVE geometry of the MOST RECENT screenshot the model saw. The
|
|
432
|
+
// model computes click coordinates in the encoded-pixel space of that screenshot;
|
|
433
|
+
// when the agent downscaled the PNG to fit the transport budget, encoded < native,
|
|
434
|
+
// so we scale coordinates back up to native pixels before injecting (the agent's
|
|
435
|
+
// native inject — macOS CGEvent / Linux XTEST — expects native-pixel coordinates,
|
|
436
|
+
// exactly as it received them pre-downscale). Null until the first screenshot;
|
|
437
|
+
// equal encoded==native (or absent) ⇒ scale factor 1.0 ⇒ byte-identical behavior.
|
|
438
|
+
private lastEncoded: [number, number] | null = null;
|
|
439
|
+
private lastNative: [number, number] | null = null;
|
|
427
440
|
|
|
428
441
|
constructor(session: NativeDesktopSession, opts: NativeDesktopComputerOptions = {}) {
|
|
429
442
|
this.session = session;
|
|
@@ -441,17 +454,34 @@ export class NativeDesktopComputer implements Computer {
|
|
|
441
454
|
if (this.readOnly) throw new ComputerReadOnlyError();
|
|
442
455
|
}
|
|
443
456
|
|
|
457
|
+
/** Converts a point given in the ENCODED pixel space of the most recent screenshot
 * into NATIVE pixels, using the encoded/native sizes recorded for that frame.
 * The point is returned unchanged when no frame geometry has been recorded yet,
 * when the recorded encoded size is not positive, or when encoded and native
 * sizes are equal. Otherwise each axis is scaled by native/encoded and rounded
 * to the nearest integer. */
private toNative(x: number, y: number): { x: number; y: number } {
  const encodedSize = this.lastEncoded;
  const nativeSize = this.lastNative;
  if (!encodedSize || !nativeSize) return { x, y };

  const [encodedWidth, encodedHeight] = encodedSize;
  const [nativeWidth, nativeHeight] = nativeSize;
  if (encodedWidth <= 0 || encodedHeight <= 0) return { x, y };

  const sameGeometry = encodedWidth === nativeWidth && encodedHeight === nativeHeight;
  if (sameGeometry) return { x, y };

  const scaledX = Math.round((x * nativeWidth) / encodedWidth);
  const scaledY = Math.round((y * nativeHeight) / encodedHeight);
  return { x: scaledX, y: scaledY };
}
|
|
473
|
+
|
|
444
474
|
private async pointer(x: number, y: number, action: PointerAction, button: PointerButton): Promise<void> {
|
|
445
|
-
// COORDINATE SEAM
|
|
446
|
-
//
|
|
447
|
-
//
|
|
448
|
-
//
|
|
449
|
-
//
|
|
450
|
-
//
|
|
451
|
-
//
|
|
452
|
-
// any
|
|
453
|
-
|
|
454
|
-
await this.session.desktopInput({ $case: "pointer", pointer: { x, y, action, button } });
|
|
475
|
+
// COORDINATE SEAM: the model computes x/y against the pixels of the screenshot it
|
|
476
|
+
// just saw — which the agent may have DOWNSCALED to fit the transport's max
|
|
477
|
+
// payload (a full-res Retina/busy screen exceeds NATS's 1 MiB default). We scale
|
|
478
|
+
// those encoded-pixel coordinates back to native pixels here, using the native
|
|
479
|
+
// geometry the last screenshot reported, so the agent's native inject lands the
|
|
480
|
+
// click where the model intended. When no downscale occurred the factor is 1.0
|
|
481
|
+
// and the coordinates pass through unchanged. Self-hosted Linux (XTEST/x11) and
|
|
482
|
+
// any non-downscaled frame are 1:1 and unaffected.
|
|
483
|
+
const n = this.toNative(x, y);
|
|
484
|
+
await this.session.desktopInput({ $case: "pointer", pointer: { x: n.x, y: n.y, action, button } });
|
|
455
485
|
}
|
|
456
486
|
|
|
457
487
|
async screenshot(): Promise<string> {
|
|
@@ -461,10 +491,14 @@ export class NativeDesktopComputer implements Computer {
|
|
|
461
491
|
// missing/empty frame is therefore a THROW, never a silent "". Native capture
|
|
462
492
|
// (ScreenCaptureKit / x11) does not have the cold-scrot warm-up the xdotool path
|
|
463
493
|
// retries around, so a single capture + a hard empty-guard is sufficient.
|
|
464
|
-
const { png } = await this.session.screenshot();
|
|
494
|
+
const { png, width, height, nativeWidth, nativeHeight } = await this.session.screenshot();
|
|
465
495
|
if (png.length === 0) {
|
|
466
496
|
throw new ComputerUnavailableError("native desktop screenshot returned an empty frame (display not up?)");
|
|
467
497
|
}
|
|
498
|
+
// Record the encoded (what the model sees) vs native geometry of THIS frame so
|
|
499
|
+
// the next click/move/scroll/drag scales its coordinates back to native pixels.
|
|
500
|
+
this.lastEncoded = [width, height];
|
|
501
|
+
this.lastNative = [nativeWidth || width, nativeHeight || height];
|
|
468
502
|
return Buffer.from(png).toString("base64");
|
|
469
503
|
}
|
|
470
504
|
|
|
@@ -482,11 +516,12 @@ export class NativeDesktopComputer implements Computer {
|
|
|
482
516
|
}
|
|
483
517
|
async scroll(x: number, y: number, sx: number, sy: number) {
|
|
484
518
|
this.guardWrite();
|
|
485
|
-
// The
|
|
486
|
-
//
|
|
487
|
-
//
|
|
488
|
-
// xdotool
|
|
489
|
-
|
|
519
|
+
// The scroll ANCHOR (x,y) is a screenshot-pixel position → scale to native like a
|
|
520
|
+
// click. The deltas (sx,sy) are relative scroll AMOUNTS, not positions; the agent
|
|
521
|
+
// owns the platform-appropriate wheel translation, so they pass through unscaled
|
|
522
|
+
// (no xdotool "notch" quantization here — that is an xdotool-specific artifact).
|
|
523
|
+
const n = this.toNative(x, y);
|
|
524
|
+
await this.session.desktopInput({ $case: "scroll", scroll: { x: n.x, y: n.y, deltaX: sx, deltaY: sy } });
|
|
490
525
|
}
|
|
491
526
|
async type(text: string) {
|
|
492
527
|
this.guardWrite();
|