@opengeni/runtime 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/dist/chunk-2PO56VAL.js +3478 -0
  2. package/dist/chunk-2PO56VAL.js.map +1 -0
  3. package/dist/index.d.ts +912 -0
  4. package/dist/index.js +3663 -0
  5. package/dist/index.js.map +1 -0
  6. package/dist/sandbox/index.d.ts +1738 -0
  7. package/dist/sandbox/index.js +187 -0
  8. package/dist/sandbox/index.js.map +1 -0
  9. package/package.json +49 -0
  10. package/src/bundled_hashicorp_terraform_skills/LICENSE +373 -0
  11. package/src/bundled_hashicorp_terraform_skills/README.md +18 -0
  12. package/src/bundled_hashicorp_terraform_skills/UPSTREAM_GIT_SHA +1 -0
  13. package/src/bundled_hashicorp_terraform_skills/azure-verified-modules/SKILL.md +613 -0
  14. package/src/bundled_hashicorp_terraform_skills/checkov/SKILL.md +43 -0
  15. package/src/bundled_hashicorp_terraform_skills/refactor-module/SKILL.md +538 -0
  16. package/src/bundled_hashicorp_terraform_skills/social-media-marketing/SKILL.md +35 -0
  17. package/src/bundled_hashicorp_terraform_skills/terraform-search-import/SKILL.md +372 -0
  18. package/src/bundled_hashicorp_terraform_skills/terraform-search-import/references/MANUAL-IMPORT.md +113 -0
  19. package/src/bundled_hashicorp_terraform_skills/terraform-search-import/scripts/list_resources.sh +38 -0
  20. package/src/bundled_hashicorp_terraform_skills/terraform-stacks/SKILL.md +480 -0
  21. package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/api-monitoring.md +543 -0
  22. package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/component-blocks.md +476 -0
  23. package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/deployment-blocks.md +391 -0
  24. package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/examples.md +1529 -0
  25. package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/linked-stacks.md +187 -0
  26. package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/troubleshooting.md +671 -0
  27. package/src/bundled_hashicorp_terraform_skills/terraform-style-guide/SKILL.md +353 -0
  28. package/src/bundled_hashicorp_terraform_skills/terraform-test/SKILL.md +451 -0
  29. package/src/bundled_hashicorp_terraform_skills/terraform-test/references/CI_CD.md +80 -0
  30. package/src/bundled_hashicorp_terraform_skills/terraform-test/references/EXAMPLES.md +314 -0
  31. package/src/bundled_hashicorp_terraform_skills/terraform-test/references/MOCK_PROVIDERS.md +171 -0
  32. package/src/codex-tool-search.ts +267 -0
  33. package/src/context-compaction.ts +538 -0
  34. package/src/history-sanitizer.ts +719 -0
  35. package/src/index.ts +3299 -0
  36. package/src/sandbox/capabilities.ts +69 -0
  37. package/src/sandbox/channel-a.ts +1031 -0
  38. package/src/sandbox/display-stack.ts +231 -0
  39. package/src/sandbox/errors.ts +34 -0
  40. package/src/sandbox/index.ts +832 -0
  41. package/src/sandbox/providers/blaxel.ts +35 -0
  42. package/src/sandbox/providers/cloudflare.ts +24 -0
  43. package/src/sandbox/providers/daytona.ts +34 -0
  44. package/src/sandbox/providers/docker.ts +17 -0
  45. package/src/sandbox/providers/e2b.ts +36 -0
  46. package/src/sandbox/providers/index.ts +107 -0
  47. package/src/sandbox/providers/local.ts +13 -0
  48. package/src/sandbox/providers/modal.ts +55 -0
  49. package/src/sandbox/providers/none.ts +13 -0
  50. package/src/sandbox/providers/runloop.ts +32 -0
  51. package/src/sandbox/providers/selfhosted.ts +96 -0
  52. package/src/sandbox/providers/types.ts +38 -0
  53. package/src/sandbox/providers/vercel.ts +29 -0
  54. package/src/sandbox/recording.ts +286 -0
  55. package/src/sandbox/routing/backend-resolver.ts +189 -0
  56. package/src/sandbox/routing/routing-session.ts +455 -0
  57. package/src/sandbox/select.ts +371 -0
  58. package/src/sandbox/selfhosted/capabilities.ts +255 -0
  59. package/src/sandbox/selfhosted/control-rpc.ts +351 -0
  60. package/src/sandbox/selfhosted/session.ts +930 -0
  61. package/src/sandbox/selfhosted/testing.ts +230 -0
  62. package/src/sandbox/stream-port.ts +185 -0
  63. package/src/sandbox/stream-token.ts +90 -0
  64. package/src/sandbox/terminal-server.ts +203 -0
  65. package/src/sandbox-computer.ts +835 -0
@@ -0,0 +1,835 @@
1
+ // packages/runtime/src/sandbox-computer.ts — the agent computer-use surface (P4.3).
2
+ //
3
+ // A `Computer` impl backed by xdotool (mouse/keyboard/move/click/type/key) +
4
+ // scrot (screenshots), issued through the SAME externally-owned `session` the
5
+ // human watches over Channel B. The agent and the human share ONE :0 display —
6
+ // zero projection: ffmpeg reads exactly the pixels xdotool draws. Exposed to the
7
+ // Agents SDK as a `computerTool` carried by `ComputerUseCapability`, pushed into
8
+ // `buildAgentCapabilities` when `computerUseEnabled && desktopCapableBackend`.
9
+ //
10
+ // This file lives OUTSIDE the @opengeni/runtime/sandbox agent-loop-free leaf
11
+ // (it imports `computerTool` from the @openai/agents root, which the leaf forbids)
12
+ // and is wired into the agent-loop barrel (packages/runtime/src/index.ts).
13
+ //
14
+ // ── Adversarial-review fixes folded in (module 05 §Adversarial) ──────────────
15
+ // F1 exec is OPTIONAL on every provider (Modal has only execCommand) — the
16
+ // primitive dual-paths `session.exec ?? session.execCommand`.
17
+ // F2 execCommand returns a FORMATTED STRING with a metadata preamble, not raw
18
+ // stdout — screenshots read the PNG by running `base64 <path>` over the SAME
19
+ // command primitive and stripping the banner (NOT `session.readFile`: Modal's
20
+ // readFile path-validates against the /workspace root and THROWS
21
+ // "Sandbox path /tmp/…png escapes the workspace root", so the /tmp scrot can
22
+ // never be read → empty frame → `image_url: ''` → model 400). This mirrors
23
+ // recording.ts/channel-a fsReadViaExec. Exit codes come from the established
24
+ // `sandboxCommandExitCode` parser, not a `.exitCode` field.
25
+ // F3 exec/execCommand YIELDS (does not wait) — a `sandboxCommandStillRunning`
26
+ // result is logged as a warning and the action RETURNS (it is not thrown), so the
27
+ // follow-up screenshot shows the real frame; input commands complete well under the yield window.
28
+ // F4 import paths: `computerTool`/`Computer` from `@openai/agents` (root, via
29
+ // the agents-core star re-export); `Capability` from `@openai/agents/sandbox`.
30
+ // `requireBoundSession` is NOT re-exported there, so it is inlined below. `Button` is NOT exported — the union is inlined.
31
+ // F5 scroll deltas are model PIXELS (often hundreds) — divided by a notch step
32
+ // and clamped, NOT used as literal wheel-click `--repeat` counts.
33
+
34
+ import { computerTool, tool, type Computer, type Tool } from "@openai/agents";
35
+ import { Capability, type SandboxSessionLike } from "@openai/agents/sandbox";
36
+ import { KeyAction, PointerAction, PointerButton, type DesktopInputRequest } from "@opengeni/agent-proto";
37
+
38
+ import { sandboxCommandExitCode, sandboxCommandOutput, sandboxCommandStillRunning } from "./index";
39
+ // `stripExecBanner` is the SAME pure helper recording.ts uses to recover the raw
40
+ // command body from Modal's execCommand banner ("…Output:\n<body>"). Imported from
41
+ // the agent-loop-free leaf (importing a pure parser FROM the leaf is allowed — the
42
+ // leaf boundary only forbids the leaf importing the agent loop, not the reverse).
43
+ import { stripExecBanner } from "./sandbox";
44
+
/**
 * Inlined bound-session guard.
 *
 * Per the original note in this file, the SDK's own `requireBoundSession` lives in
 * `@openai/agents-core/sandbox/capabilities/base` and is not re-exported from the
 * public `@openai/agents/sandbox` barrel, so the trivial check is reproduced here.
 *
 * @param capabilityType Name of the capability, used only in the error message.
 * @param session The session the capability was bound to, if any.
 * @returns The same session, now known to be defined.
 * @throws ComputerUnavailableError when no session has been bound yet.
 */
function requireBoundSession(capabilityType: string, session?: SandboxSessionLike): SandboxSessionLike {
  if (session) {
    return session;
  }
  throw new ComputerUnavailableError(`capability "${capabilityType}" used before bind(session)`);
}
54
+
55
+ // `Button` is intentionally NOT imported (it is not a public export, F4) — the
56
+ // union is inlined and kept in lockstep with @openai/agents-core/computer.d.ts.
57
+ type ComputerButton = "left" | "right" | "wheel" | "back" | "forward";
58
+
59
+ const DEFAULT_DISPLAY = ":0";
60
+ const DEFAULT_DIMENSIONS: [number, number] = [1280, 800];
61
+ // Commands must complete well under this (F3): xdotool/scrot of a 1280x800 PNG is
62
+ // sub-second; the wait gives headroom on a cold gVisor box without masking a wedge.
63
+ const ACTION_YIELD_MS = 15_000;
64
+ // Model scroll deltas are pixels (F5); one wheel "notch" ≈ this many pixels. e2b
65
+ // uses a similar divisor. Clamp keeps a runaway delta from spamming the wheel.
66
+ const SCROLL_NOTCH_PIXELS = 100;
67
+ const SCROLL_MAX_CLICKS = 15;
68
+ // screenshot() never hands the model an empty image_url (the SDK turns "" into
69
+ // `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
70
+ // a zero-byte frame on the first scrot; bounded retries with a short pause let a
71
+ // momentarily-unpainted-but-live display self-heal before we FAIL LOUD.
72
+ const SCREENSHOT_MAX_ATTEMPTS = 3;
73
+ const SCREENSHOT_RETRY_DELAY_MS = 400;
74
+
75
+ export type SandboxComputerOptions = {
76
+ display?: string; // ":0"
77
+ dimensions?: [number, number]; // must match the Xvfb geometry
78
+ runAs?: string; // provider runAs (modal/docker: "sandbox"); undefined otherwise
79
+ typeDelayMs?: number; // xdotool type --delay (default 12ms)
80
+ readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
81
+ screenshotTmpDir?: string; // "/tmp"
82
+ };
83
+
// X keysym map for keypress(): model key names (lower-cased) → xdotool keysyms.
const KEYSYM: Record<string, string> = {
  ctrl: "ctrl", control: "ctrl", alt: "alt", option: "alt", shift: "shift",
  cmd: "super", meta: "super", win: "super", super: "super",
  enter: "Return", return: "Return", tab: "Tab", esc: "Escape", escape: "Escape",
  backspace: "BackSpace", delete: "Delete", space: "space",
  up: "Up", down: "Down", left: "Left", right: "Right",
  pageup: "Prior", pagedown: "Next", home: "Home", end: "End",
};

/**
 * Translate one model-supplied key name into the keysym handed to `xdotool key`.
 *
 * Resolution order:
 *  1. a case-insensitive hit in `KEYSYM`;
 *  2. function keys `f1`..`f12` → upper-cased (`F1`..`F12`);
 *  3. a single character → lower-cased;
 *  4. anything else is passed through unchanged.
 *
 * @param k Key name as emitted by the model.
 * @returns The keysym string for xdotool. Always a string.
 */
function toKeysym(k: string): string {
  const low = k.toLowerCase();
  // Own-property check: a bare `KEYSYM[low]` would also resolve inherited
  // Object.prototype members ("constructor", "__proto__") and return a non-string.
  const mapped = Object.prototype.hasOwnProperty.call(KEYSYM, low) ? KEYSYM[low] : undefined;
  if (mapped !== undefined) return mapped;
  if (/^f([1-9]|1[0-2])$/.test(low)) return low.toUpperCase();
  return low.length === 1 ? low : k;
}
99
+ const BUTTON_NUM: Record<ComputerButton, number> = { left: 1, wheel: 2, right: 3, back: 8, forward: 9 };
100
+
101
+ // The structural slice of a provider session computer-use drives. exec and
102
+ // execCommand are optional because the SDK's SandboxSessionLike leaves them
103
+ // optional (Modal implements execCommand, not exec — F1). readFile is intentionally
104
+ // NOT in this type: screenshots read the /tmp PNG via `base64 <path>` over
105
+ // exec/execCommand (readFile path-validates against /workspace and rejects /tmp).
106
+ type ExecResultLike = { output?: string; stdout?: string; stderr?: string; exitCode?: number | null; sessionId?: number };
107
+ type ComputerSession = {
108
+ exec?: (args: { cmd: string; runAs?: string; yieldTimeMs?: number; maxOutputTokens?: number }) => Promise<ExecResultLike>;
109
+ execCommand?: (args: { cmd: string; runAs?: string; yieldTimeMs?: number; maxOutputTokens?: number }) => Promise<string>;
110
+ };
111
+
/**
 * Raised when the computer cannot be driven at all: the session exposes neither
 * `exec` nor `execCommand`, no session is bound, or no usable frame could be captured.
 */
export class ComputerUnavailableError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "ComputerUnavailableError";
  }
}
/** Raised by every write action (click, type, scroll, …) when the computer is read-only. */
export class ComputerReadOnlyError extends Error {
  constructor() {
    super("computer-use is read-only — write actions are disabled");
    this.name = "ComputerReadOnlyError";
  }
}
/**
 * Raised when a command issued for a computer action exits with a non-zero code.
 * Carries the command line, its exit code, and the captured output for diagnostics.
 */
export class ComputerActionError extends Error {
  public cmd: string;
  public exitCode: number;
  public stderr: string;

  constructor(cmd: string, exitCode: number, stderr: string) {
    // The captured output is appended on its own line only when there is any.
    const detail = stderr ? `\n${stderr}` : "";
    super(`computer action failed (${exitCode}): ${cmd}${detail}`);
    this.cmd = cmd;
    this.exitCode = exitCode;
    this.stderr = stderr;
    this.name = "ComputerActionError";
  }
}
128
+
/**
 * The `Computer` the agent drives via xdotool (input) and scrot (capture).
 *
 * Every action issues ONE shell line through the externally-owned session
 * (`exec`, falling back to `execCommand`), prefixed with `DISPLAY=<display>`.
 * `screenshot()` scrots to a file under the configured tmp dir and reads the bytes
 * back by running `base64 <path>` over the same command primitive — NOT
 * `session.readFile`, which (per the notes in this file) path-validates against
 * the workspace root on Modal and rejects /tmp.
 */
export class SandboxComputer implements Computer {
  readonly environment = "ubuntu" as const;
  readonly dimensions: [number, number];
  private session: ComputerSession;
  private readonly display: string;
  private readonly runAs?: string;
  private readonly typeDelayMs: number;
  private readonly readOnly: boolean;
  private readonly tmp: string;

  /**
   * @param session The externally-owned sandbox session commands are issued through.
   * @param opts Display, geometry, runAs, typing delay, read-only flag and tmp dir.
   */
  constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
    this.session = session as unknown as ComputerSession;
    this.display = opts.display ?? DEFAULT_DISPLAY;
    this.dimensions = opts.dimensions ?? DEFAULT_DIMENSIONS;
    if (opts.runAs !== undefined) {
      this.runAs = opts.runAs;
    }
    this.typeDelayMs = opts.typeDelayMs ?? 12;
    this.readOnly = opts.readOnly ?? false;
    this.tmp = opts.screenshotTmpDir ?? "/tmp";
  }

  /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
  rebind(session: SandboxSessionLike) {
    this.session = session as unknown as ComputerSession;
  }

  /**
   * The single command primitive: runs `cmd` on the bound display via `exec`, or
   * `execCommand` when `exec` is absent, and returns the command output body.
   *
   * - A non-zero exit code throws `ComputerActionError`.
   * - A command still running after the yield window is NOT thrown: it is logged
   *   with `console.warn` and whatever output exists is returned, so the SDK goes
   *   on to take a real screenshot instead of building an empty image for the model.
   *
   * @throws ComputerUnavailableError when the session has neither exec nor execCommand.
   * @throws ComputerActionError when the command exits non-zero.
   */
  private async x(cmd: string): Promise<string> {
    const args = {
      cmd: `DISPLAY=${this.display} ${cmd}`,
      ...(this.runAs ? { runAs: this.runAs } : {}),
      yieldTimeMs: ACTION_YIELD_MS,
      maxOutputTokens: 4_000,
    };
    let result: ExecResultLike | string;
    if (typeof this.session.exec === "function") {
      result = await this.session.exec(args);
    } else if (typeof this.session.execCommand === "function") {
      result = await this.session.execCommand(args);
    } else {
      throw new ComputerUnavailableError("session cannot run commands (no exec/execCommand)");
    }
    const output = sandboxCommandOutput(result);
    if (sandboxCommandStillRunning(result)) {
      // F3: the command exceeded the yield window. WARN AND RETURN rather than
      // throw, so the caller proceeds to screenshot() and the model sees the REAL
      // current frame. screenshot() keeps its own fail-loud + retry contract.
      console.warn(
        `[SandboxComputer] action command did not finish before the ${ACTION_YIELD_MS}ms yield window — proceeding to screenshot: ${cmd}`,
      );
      return output;
    }
    const exitCode = sandboxCommandExitCode(result);
    if (exitCode !== null && exitCode !== 0) {
      throw new ComputerActionError(cmd, exitCode, output);
    }
    return output;
  }

  /** Throws `ComputerReadOnlyError` when this computer was created read-only. */
  private guardWrite() {
    if (this.readOnly) throw new ComputerReadOnlyError();
  }

  /** POSIX single-quote a string for safe interpolation into a shell line. */
  private shq(s: string): string {
    return `'${s.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Capture the display and return the PNG as raw base64 (no data-URL prefix).
   *
   * CRITICAL CONTRACT: this NEVER returns an empty string — per the notes in this
   * file the SDK wraps the value as `data:image/png;base64,${output}` and an empty
   * value becomes an invalid `image_url` that fails the model turn. Up to
   * `SCREENSHOT_MAX_ATTEMPTS` captures are tried, pausing `SCREENSHOT_RETRY_DELAY_MS`
   * between them, so a display that is up but momentarily not painting can recover.
   *
   * @throws The last captured error (or `ComputerUnavailableError`) once all attempts fail.
   */
  async screenshot(): Promise<string> {
    let lastError: unknown;
    for (let attempt = 0; attempt < SCREENSHOT_MAX_ATTEMPTS; attempt++) {
      if (attempt > 0) {
        await new Promise((r) => setTimeout(r, SCREENSHOT_RETRY_DELAY_MS));
      }
      // Unique file per attempt so concurrent/retried captures never collide.
      const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
      try {
        await this.x(`scrot --pointer --overwrite ${f}`);
        const bytes = await this.readScreenshotBytes(f);
        if (bytes.length === 0) {
          // A zero-byte frame is a failure to retry, never something to hand the model.
          throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
        }
        return Buffer.from(bytes).toString("base64");
      } catch (error) {
        lastError = error;
      } finally {
        // Best-effort cleanup on every attempt (success OR failure); never mask the
        // screenshot result.
        await this.x(`rm -f ${f}`).catch(() => undefined);
      }
    }
    // Exhausted retries: FAIL LOUD rather than return "".
    if (lastError instanceof Error) {
      throw lastError;
    }
    throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
  }

  /**
   * Read a file's bytes by running `base64 <path>` through exec/execCommand.
   *
   * The raw result is used instead of `this.x()`: for the execCommand string form
   * the banner is stripped explicitly with `stripExecBanner` so the base64 payload
   * survives. Output truncation is disabled (`maxOutputTokens: null`) so a
   * full-screen PNG is never clipped.
   *
   * A read that is still running after the yield window, or that exits non-zero,
   * is reported as an error instead of decoding partial output or an error message
   * as if it were image data; the caller's retry loop handles the throw.
   *
   * @returns The decoded bytes; an empty array when the command produced no output.
   * @throws ComputerUnavailableError when no command primitive exists or the read did not finish.
   * @throws ComputerActionError when `base64` exits non-zero.
   */
  private async readScreenshotBytes(path: string): Promise<Uint8Array> {
    const cmd = `base64 ${path}`;
    const args = {
      cmd: `DISPLAY=${this.display} ${cmd}`,
      ...(this.runAs ? { runAs: this.runAs } : {}),
      yieldTimeMs: ACTION_YIELD_MS,
      // null disables the provider's output truncation so the base64 is never clipped.
      maxOutputTokens: null as unknown as number,
    };
    let result: ExecResultLike | string;
    let raw: string;
    if (typeof this.session.exec === "function") {
      // The exec-object path exposes a structured stdout/output body.
      result = await this.session.exec(args);
      raw = sandboxCommandOutput(result);
    } else if (typeof this.session.execCommand === "function") {
      // execCommand returns the formatted STRING — strip the banner to recover the body.
      const formatted = await this.session.execCommand(args);
      result = formatted;
      raw = stripExecBanner(formatted);
    } else {
      throw new ComputerUnavailableError("session cannot run commands (no exec/execCommand) — screenshots unavailable");
    }
    if (sandboxCommandStillRunning(result)) {
      // Partial base64 would decode to a truncated image; treat as a failed read.
      throw new ComputerUnavailableError(`screenshot read did not finish before the ${ACTION_YIELD_MS}ms yield window`);
    }
    const exitCode = sandboxCommandExitCode(result);
    if (exitCode !== null && exitCode !== 0) {
      throw new ComputerActionError(cmd, exitCode, raw);
    }
    const b64 = raw.replace(/\s+/g, "");
    if (b64.length === 0) return new Uint8Array();
    return Uint8Array.from(Buffer.from(b64, "base64"));
  }

  /** Move to (xp, yp) and click the given button (unknown buttons fall back to left). */
  async click(xp: number, yp: number, button: ComputerButton) {
    this.guardWrite();
    await this.x(`xdotool mousemove --sync ${xp} ${yp} click ${BUTTON_NUM[button] ?? 1}`);
  }

  /** Move to (xp, yp) and double-click the left button. */
  async doubleClick(xp: number, yp: number) {
    this.guardWrite();
    await this.x(`xdotool mousemove --sync ${xp} ${yp} click --repeat 2 --delay 60 1`);
  }

  /** Move the pointer to (xp, yp). */
  async move(xp: number, yp: number) {
    this.guardWrite();
    await this.x(`xdotool mousemove --sync ${xp} ${yp}`);
  }

  /**
   * Scroll at (xp, yp). `sx`/`sy` are model PIXEL deltas (F5): each is converted to
   * wheel notches (`SCROLL_NOTCH_PIXELS` per notch) and clamped to
   * `SCROLL_MAX_CLICKS`. Any non-zero delta scrolls at least one notch, so a small
   * requested scroll is never silently dropped.
   */
  async scroll(xp: number, yp: number, sx: number, sy: number) {
    this.guardWrite();
    const notches = (px: number): number => {
      if (!px) return 0; // 0 (or NaN): no scroll on this axis
      const wanted = Math.round(Math.abs(px) / SCROLL_NOTCH_PIXELS);
      return Math.min(SCROLL_MAX_CLICKS, Math.max(1, wanted));
    };
    // X wheel buttons: 4/5 = up/down, 6/7 = left/right.
    const vBtn = sy < 0 ? 4 : 5;
    const hBtn = sx < 0 ? 6 : 7;
    const vN = notches(sy);
    const hN = notches(sx);
    let cmd = `xdotool mousemove --sync ${xp} ${yp}`;
    if (vN) cmd += ` click --repeat ${vN} ${vBtn}`;
    if (hN) cmd += ` click --repeat ${hN} ${hBtn}`;
    await this.x(cmd);
  }

  /** Type `text` verbatim (shell-quoted) with the configured per-key delay. */
  async type(text: string) {
    this.guardWrite();
    await this.x(`xdotool type --delay ${this.typeDelayMs} -- ${this.shq(text)}`);
  }

  /** Press a chord: each key is mapped with `toKeysym` and joined with "+". */
  async keypress(keys: string[]) {
    this.guardWrite();
    const combo = keys.map(toKeysym).join("+");
    await this.x(`xdotool key -- ${this.shq(combo)}`);
  }

  /** Left-button drag through `path` in one xdotool invocation; no-op for an empty path. */
  async drag(path: [number, number][]) {
    this.guardWrite();
    if (path.length === 0) return;
    const [sx0, sy0] = path[0]!;
    let cmd = `xdotool mousemove --sync ${sx0} ${sy0} mousedown 1`;
    for (const [px, py] of path.slice(1)) cmd += ` mousemove --sync ${px} ${py}`;
    cmd += ` mouseup 1`;
    await this.x(cmd);
  }

  /** Pause for one second. */
  async wait() {
    await new Promise((r) => setTimeout(r, 1000));
  }
}
344
+
345
+ // ── The native-desktop computer (self-hosted / macOS) ────────────────────────
346
+ //
347
+ // `SandboxComputer` drives the desktop by shelling out to xdotool/scrot over the
348
+ // session's `exec` — which needs those X utilities installed in the box image and
349
+ // only works under X11. A SELF-HOSTED machine (macOS OR bring-your-own Linux) has
350
+ // neither guarantee, so it drives the desktop NATIVELY over the control plane: the
351
+ // Rust agent injects input via CGEvent (macOS) / XTEST (Linux) and captures via
352
+ // ScreenCaptureKit / x11, exposed as the two `SelfhostedSession` ops below. No
353
+ // xdotool/scrot dependency; works on macOS.
354
+
355
+ /** The structural slice of a self-hosted session the native computer drives — the
356
+ * two control-plane ops added in session.ts. Kept structural (NOT an import of
357
+ * `SelfhostedSession`) so this agent-loop file never hard-couples to the sandbox
358
+ * leaf; the duck-typed `isNativeDesktopSession` probe (below) selects on it. */
359
+ export type NativeDesktopSession = {
360
+ desktopInput(event: DesktopInputRequest["event"]): Promise<void>;
361
+ screenshot(): Promise<{ png: Uint8Array; width: number; height: number }>;
362
+ };
363
+
364
+ /** Model `Button` → wire `PointerButton`. The proto has no back/forward button, so
365
+ * those degrade to UNSPECIFIED (the agent ignores an unmapped button rather than
366
+ * mis-clicking). A total record so indexing is exhaustive. */
367
+ const POINTER_BUTTON: Record<ComputerButton, PointerButton> = {
368
+ left: PointerButton.POINTER_BUTTON_LEFT,
369
+ right: PointerButton.POINTER_BUTTON_RIGHT,
370
+ wheel: PointerButton.POINTER_BUTTON_MIDDLE,
371
+ back: PointerButton.POINTER_BUTTON_UNSPECIFIED,
372
+ forward: PointerButton.POINTER_BUTTON_UNSPECIFIED,
373
+ };
374
+
375
+ export type NativeDesktopComputerOptions = {
376
+ dimensions?: [number, number]; // the display geometry (must match the capture size)
377
+ environment?: NonNullable<Computer["environment"]>; // "ubuntu" (default) | "mac" | ...; model uses it for OS key conventions
378
+ readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
379
+ };
380
+
381
+ /**
382
+ * A `Computer` that drives a SELF-HOSTED machine's OWN desktop NATIVELY over the
383
+ * control plane (`desktopInput` inject + `screenshot` capture on the bound
384
+ * `SelfhostedSession`) instead of xdotool/scrot over `exec`. Consent + epoch are
385
+ * enforced AGENT-side, so an unconsented inject surfaces the session's mapped
386
+ * control error.
387
+ *
388
+ * screenshot() returns raw base64 with NO data-URL prefix — the EXACT contract of
389
+ * `SandboxComputer.screenshot`: the Agents SDK wraps it as
390
+ * `data:image/png;base64,${output}`, so an empty string would become
391
+ * `image_url: ''` and 400 the model turn. An empty PNG therefore THROWS
392
+ * `ComputerUnavailableError` (mirroring SandboxComputer's empty-guard) — the model
393
+ * never receives an empty image_url.
394
+ */
395
+ export class NativeDesktopComputer implements Computer {
396
+ readonly environment: NonNullable<Computer["environment"]>;
397
+ readonly dimensions: [number, number];
398
+ private session: NativeDesktopSession;
399
+ private readonly readOnly: boolean;
400
+
401
+ constructor(session: NativeDesktopSession, opts: NativeDesktopComputerOptions = {}) {
402
+ this.session = session;
403
+ this.dimensions = opts.dimensions ?? DEFAULT_DIMENSIONS;
404
+ // Default "ubuntu" (self-hosted Linux is the near-term target); a macOS session
405
+ // should pass "mac" so the model uses ⌘-based shortcuts — see the coordinate TODO.
406
+ this.environment = opts.environment ?? "ubuntu";
407
+ this.readOnly = opts.readOnly ?? false;
408
+ }
409
+
410
+ /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
411
+ rebind(session: NativeDesktopSession) { this.session = session; }
412
+
413
+ private guardWrite() {
414
+ if (this.readOnly) throw new ComputerReadOnlyError();
415
+ }
416
+
417
+ private async pointer(x: number, y: number, action: PointerAction, button: PointerButton): Promise<void> {
418
+ // COORDINATE SEAM — TODO(verify e2e on macOS): the model computes x/y against the
419
+ // pixels of the screenshot it just saw, and the agent's macOS CGEvent inject
420
+ // treats x/y as raw screen coordinates. On a Retina Mac, ScreenCaptureKit may
421
+ // capture at 2× the logical POINT space while CGEvent expects logical points — a
422
+ // potential 2× mismatch between the coords the model derives and the coords the
423
+ // inject applies. This MUST be measured on a real Retina Mac (compare the
424
+ // screenshot's reported width/height against the logical display bounds) before
425
+ // any DPR scaling is added. Do NOT add scaling speculatively. Self-hosted Linux
426
+ // (XTEST/x11) is 1:1 and unaffected.
427
+ await this.session.desktopInput({ $case: "pointer", pointer: { x, y, action, button } });
428
+ }
429
+
430
+ async screenshot(): Promise<string> {
431
+ // CRITICAL CONTRACT (mirrors SandboxComputer.screenshot): NEVER return "". The
432
+ // Agents SDK builds the model image as `data:image/png;base64,${output}`; an
433
+ // empty output → `image_url: ''` → the model API 400s and kills the turn. A
434
+ // missing/empty frame is therefore a THROW, never a silent "". Native capture
435
+ // (ScreenCaptureKit / x11) does not have the cold-scrot warm-up the xdotool path
436
+ // retries around, so a single capture + a hard empty-guard is sufficient.
437
+ const { png } = await this.session.screenshot();
438
+ if (png.length === 0) {
439
+ throw new ComputerUnavailableError("native desktop screenshot returned an empty frame (display not up?)");
440
+ }
441
+ return Buffer.from(png).toString("base64");
442
+ }
443
+
444
+ async click(x: number, y: number, button: ComputerButton) {
445
+ this.guardWrite();
446
+ await this.pointer(x, y, PointerAction.POINTER_ACTION_CLICK, POINTER_BUTTON[button] ?? PointerButton.POINTER_BUTTON_LEFT);
447
+ }
448
+ async doubleClick(x: number, y: number) {
449
+ this.guardWrite();
450
+ await this.pointer(x, y, PointerAction.POINTER_ACTION_DOUBLE_CLICK, PointerButton.POINTER_BUTTON_LEFT);
451
+ }
452
+ async move(x: number, y: number) {
453
+ this.guardWrite();
454
+ await this.pointer(x, y, PointerAction.POINTER_ACTION_MOVE, PointerButton.POINTER_BUTTON_UNSPECIFIED);
455
+ }
456
+ async scroll(x: number, y: number, sx: number, sy: number) {
457
+ this.guardWrite();
458
+ // The model's scroll deltas are PIXELS — forward them straight to the agent as a
459
+ // ScrollEvent{x,y,deltaX,deltaY} and let the native inject translate to wheel
460
+ // events per platform. No xdotool "notch" quantization here (that is an
461
+ // xdotool-specific artifact); the agent owns the platform-appropriate scaling.
462
+ await this.session.desktopInput({ $case: "scroll", scroll: { x, y, deltaX: sx, deltaY: sy } });
463
+ }
464
+ async type(text: string) {
465
+ this.guardWrite();
466
+ // A literal text burst: isText:true tells the agent to type the string verbatim
467
+ // (Unicode-aware) rather than interpret it as a key name.
468
+ await this.session.desktopInput({ $case: "key", key: { key: text, isText: true, action: KeyAction.KEY_ACTION_PRESS } });
469
+ }
470
+ async keypress(keys: string[]) {
471
+ this.guardWrite();
472
+ // A chord ("ctrl+c") as ONE non-text KeyEvent (isText:false ⇒ interpret as key
473
+ // names). We send the model's PLATFORM-INDEPENDENT key names joined with "+" —
474
+ // NOT xdotool X keysyms (SandboxComputer's toKeysym maps to "Return"/"super"/
475
+ // "Prior", which are X-specific and wrong for the macOS CGEvent path). The agent
476
+ // owns the per-platform key-name → keycode mapping (the KeyEvent.key contract).
477
+ await this.session.desktopInput({ $case: "key", key: { key: keys.join("+"), isText: false, action: KeyAction.KEY_ACTION_PRESS } });
478
+ }
479
/**
 * Drag with the left button along `path`: press at the first point, move through
 * every following waypoint with the button held, and release at the last point.
 * The agent tracks button state across the DOWN → MOVE… → UP sequence.
 *
 * An empty path is a no-op; a single-point path is a press and release in place.
 *
 * If a MOVE fails after the button was pressed, a best-effort UP is sent at the
 * last position reached so the button is not left held for later actions, and
 * the original error is then rethrown unchanged.
 *
 * @param path Ordered `[x, y]` waypoints in screenshot pixels.
 */
async drag(path: [number, number][]) {
  this.guardWrite();
  if (path.length === 0) return;

  const [startX, startY] = path[0]!;
  await this.pointer(startX, startY, PointerAction.POINTER_ACTION_DOWN, PointerButton.POINTER_BUTTON_LEFT);

  // Last position the pointer is known to have reached with the button held.
  let lastX = startX;
  let lastY = startY;
  try {
    for (const [px, py] of path.slice(1)) {
      await this.pointer(px, py, PointerAction.POINTER_ACTION_MOVE, PointerButton.POINTER_BUTTON_LEFT);
      lastX = px;
      lastY = py;
    }
  } catch (error) {
    // Best-effort release: never leave the left button stuck down. A failure of
    // the release itself is deliberately ignored so the MOVE error is the one
    // the caller sees.
    try {
      await this.pointer(lastX, lastY, PointerAction.POINTER_ACTION_UP, PointerButton.POINTER_BUTTON_LEFT);
    } catch {
      // ignored on purpose — the original error is rethrown below
    }
    throw error;
  }

  // On success (lastX, lastY) is the final waypoint, so this releases there.
  await this.pointer(lastX, lastY, PointerAction.POINTER_ACTION_UP, PointerButton.POINTER_BUTTON_LEFT);
}
492
/** Pause for one second before resolving; performs no desktop input. */
async wait() {
  const pauseMs = 1000;
  await new Promise((resume) => setTimeout(resume, pauseMs));
}
495
+ }
496
+
497
/**
 * Backend-aware SELECTION discriminator.
 *
 * A SELF-HOSTED session exposes the two native control-plane operations
 * (`desktopInput` and `screenshot`); a MODAL session does not (it drives the
 * desktop via xdotool/scrot over `exec`). The check is duck-typed on those two
 * methods so this file does not need to hard-import `SelfhostedSession` (avoiding
 * an agent-loop ↔ sandbox-leaf import coupling), and any backend that later grows
 * native inject/capture is picked up automatically.
 *
 * @param session The bound sandbox session to inspect.
 * @returns `true` when both `desktopInput` and `screenshot` are functions.
 */
export function isNativeDesktopSession(session: SandboxSessionLike): session is SandboxSessionLike & NativeDesktopSession {
  const candidate = session as Partial<NativeDesktopSession>;
  if (typeof candidate.desktopInput !== "function") return false;
  return typeof candidate.screenshot === "function";
}
509
+
510
+ // ── Function-transport (codex / text backend) computer tools ─────────────────
511
+ //
512
+ // The SDK emits computer-use ONLY as the HOSTED `computer_use_preview` tool, which
513
+ // the codex / ChatGPT backend rejects (it accepts only function/custom/web_search
514
+ // tool types) — so on codex the hosted tool is unusable and the agent has nothing
515
+ // to drive the desktop with. We mirror EXACTLY how the SDK's filesystem capability
516
+ // degrades `view_image` for the text transport: when the bound model does NOT
517
+ // support the structured tool-output transport, emit a set of FUNCTION tools that
518
+ // route to the SAME bound `Computer`, and hand the model the screen by rendering
519
+ // the screenshot image-output as a text-transport data URL — the identical two-step
520
+ // `imageOutputFromBytes` → `renderImageForTextTransport` the SDK's text `view_image`
521
+ // uses. Those three helpers are NOT public exports of `@openai/agents` /
522
+ // `@openai/agents/sandbox` (they live in the SDK's private capabilities/transport +
523
+ // shared/media modules, unreachable via the package `exports` map), so — mirroring
524
+ // selfhosted/session.ts's local `sniffImageMediaType` — the three tiny pure helpers
525
+ // are reimplemented here in lockstep with the SDK.
526
+
527
/**
 * Structured image tool output, mirroring the SDK's `ToolOutputImage`
 * (@openai/agents-core shared/media): raw image bytes plus their media type.
 */
type ToolOutputImage = {
  type: "image";
  image: {
    data: Uint8Array;
    mediaType: string;
  };
};
529
+
530
/**
 * Magic-byte signatures checked in order. Each entry lists the byte runs (and
 * the offsets they must appear at) that identify one media type.
 */
const SCREENSHOT_MAGIC_SIGNATURES: ReadonlyArray<{
  mediaType: string;
  runs: ReadonlyArray<{ offset: number; bytes: readonly number[] }>;
}> = [
  { mediaType: "image/png", runs: [{ offset: 0, bytes: [0x89, 0x50, 0x4e, 0x47] }] },
  { mediaType: "image/jpeg", runs: [{ offset: 0, bytes: [0xff, 0xd8, 0xff] }] },
  { mediaType: "image/gif", runs: [{ offset: 0, bytes: [0x47, 0x49, 0x46, 0x38] }] },
  {
    // "RIFF" at 0 and "WEBP" at 8.
    mediaType: "image/webp",
    runs: [
      { offset: 0, bytes: [0x52, 0x49, 0x46, 0x46] },
      { offset: 8, bytes: [0x57, 0x45, 0x42, 0x50] },
    ],
  },
];

/**
 * Sniff an image media type from its leading magic bytes, kept in lockstep with
 * the SDK's `sniffImageMediaType` (shared/media).
 *
 * Screenshots are always PNG (scrot / ScreenCaptureKit / x11), so a header that
 * matches no known signature — including input too short to match — falls back
 * to "image/png" rather than failing the frame.
 *
 * @param bytes Raw image bytes.
 * @returns "image/png", "image/jpeg", "image/gif" or "image/webp".
 */
function sniffScreenshotMediaType(bytes: Uint8Array): string {
  for (const signature of SCREENSHOT_MAGIC_SIGNATURES) {
    const matches = signature.runs.every((run) =>
      run.bytes.every((expected, index) => bytes[run.offset + index] === expected),
    );
    if (matches) return signature.mediaType;
  }
  return "image/png";
}
543
+
544
/**
 * Wrap raw screenshot bytes in the SDK's structured `ToolOutputImage` shape —
 * the same `{type:'image', image:{data,mediaType}}` the SDK's
 * `imageOutputFromBytes` produces.
 *
 * @param bytes Raw screenshot bytes; they are copied, not aliased.
 * @returns The image output with a media type sniffed from the bytes.
 */
function imageOutputFromScreenshotBytes(bytes: Uint8Array): ToolOutputImage {
  const data = Uint8Array.from(bytes);
  const mediaType = sniffScreenshotMediaType(bytes);
  return { type: "image", image: { data, mediaType } };
}
549
+
550
/**
 * Render a tool output for the text transport, in lockstep with the SDK's
 * private `renderImageForTextTransport` (capabilities/filesystem).
 *
 * A string is passed through untouched. An image output becomes a
 * `data:<mediaType>;base64,…` URL — the exact form the text-backend `view_image`
 * hands the model. A non-string media type degrades to
 * "application/octet-stream".
 *
 * @param output Either an already-rendered string or an image tool output.
 * @returns The text-transport representation.
 */
function renderImageForTextTransport(output: ToolOutputImage | string): string {
  if (typeof output !== "string") {
    const { data, mediaType: declaredType } = output.image;
    const mediaType = typeof declaredType === "string" ? declaredType : "application/octet-stream";
    const payload = Buffer.from(data).toString("base64");
    return `data:${mediaType};base64,${payload}`;
  }
  return output;
}
560
+
561
/**
 * Whether the bound model supports the structured tool-output transport, in
 * lockstep with the SDK's private `supportsStructuredToolOutputTransport`
 * (capabilities/transport).
 *
 * A ChatCompletions-family model (detected by its constructor name) and an
 * UNBOUND model (any falsy value) do NOT, so they get the function tools; every
 * other model keeps the hosted `computer_use_preview` tool. The codex neutralize
 * trick in index.ts drops `_modelInstance`, so this returns false there and the
 * function tools win.
 *
 * @param modelInstance The model instance captured by the capability, if any.
 * @returns `false` for falsy input or a constructor name containing
 *   "ChatCompletions"; `true` otherwise.
 */
function supportsStructuredToolOutputTransport(modelInstance: unknown): boolean {
  if (!modelInstance) return false;

  let className = "";
  if (typeof modelInstance === "object") {
    const ctor = (modelInstance as { constructor?: unknown }).constructor;
    if (typeof ctor === "function") {
      className = (ctor as { name?: string }).name ?? "";
    }
  }
  return !className.includes("ChatCompletions");
}
575
+
576
/**
 * Model-readable reply the write tools return when computer-use is read-only,
 * pointing the model at the one tool that still works (`computer_screenshot`).
 */
const COMPUTER_READ_ONLY_MESSAGE =
  "computer-use is read-only for this session — click, double_click, move, scroll, type, keypress, and drag are disabled. Call computer_screenshot to observe the desktop.";
578
+
579
/**
 * The `x` / `y` JSON-schema properties shared by every pointer tool.
 *
 * Written as raw JSON schema rather than zod (zod is not a @opengeni/runtime
 * dependency) and used with `strict:false`, mirroring the SDK's own
 * `apply_patch` function-tool schema style.
 */
const COORD_PROPS = {
  x: {
    type: "integer",
    description: "X coordinate in the pixels of the most recent computer_screenshot",
  },
  y: {
    type: "integer",
    description: "Y coordinate in the pixels of the most recent computer_screenshot",
  },
} as const;
586
+
587
/**
 * Build a closed JSON-schema object type (`additionalProperties: false`).
 *
 * @param properties Property name → JSON-schema fragment (stored by reference).
 * @param required Names of the properties that must be present.
 * @returns The `{ type: "object", … }` schema.
 */
function objectSchema(properties: Record<string, unknown>, required: string[]): Record<string, unknown> {
  const schema: Record<string, unknown> = { type: "object" };
  schema["properties"] = properties;
  schema["required"] = required;
  schema["additionalProperties"] = false;
  return schema;
}
590
+
591
/**
 * The FUNCTION-transport computer tools for the codex / text backend, each routing
 * to the SAME bound `Computer` the hosted `computer_use_preview` tool would drive.
 *
 * `computer_screenshot` hands the model the desktop in one of two ways, selected
 * by `imageFunctionResults`:
 *  • false (chat-completions providers, the default) → the text-transport
 *    `data:image/png;base64,…` URL (imageOutputFromBytes → renderImageForTextTransport,
 *    the SDK's text `view_image` path) — those backends can't read structured image
 *    tool results.
 *  • true (the codex/ChatGPT backend) → the structured `{type:'image'}` tool output,
 *    which agents-core normalizes into an `input_image` content item inside the
 *    function_call_output — the codex /responses backend accepts and SEES it (a
 *    text data-URL there is just unreadable text). See index.ts for why it's on there.
 *
 * Write tools return a concise confirmation. When read-only they return
 * {@link COMPUTER_READ_ONLY_MESSAGE} instead of throwing. Every tool is declared
 * `strict: false`, so arguments reach `execute` unvalidated; they are therefore
 * checked here, and malformed arguments, action errors and screenshot errors are
 * all returned as a string so a bad call never kills the turn.
 *
 * Exported so it can be unit-tested against a fake `Computer`.
 *
 * @param computer The bound computer every tool drives.
 * @param readOnly When true, every write tool answers with the read-only message.
 * @param needsApproval Optional approval setting copied onto each write tool.
 * @param imageFunctionResults Return screenshots as structured image output
 *   instead of a text data URL.
 * @returns The screenshot tool followed by the seven write tools.
 */
export function computerFunctionTools(
  computer: Computer,
  readOnly: boolean,
  needsApproval?: ComputerUseArgs["needsApproval"],
  imageFunctionResults = false,
): Tool<unknown>[] {
  const approval = needsApproval !== undefined ? { needsApproval: needsApproval as never } : {};

  const describeError = (error: unknown): string => (error instanceof Error ? error.message : String(error));

  // Run a WRITE action and return its confirmation. Argument validation happens
  // INSIDE `perform`, so read-only, bad arguments and action failures all surface
  // as a model-readable string rather than an uncaught throw (an uncaught throw
  // becomes a tool error the backend may 400 on, or kills the step).
  const write = async (perform: () => string | Promise<string>): Promise<string> => {
    if (readOnly) return COMPUTER_READ_ONLY_MESSAGE;
    try {
      return await perform();
    } catch (error) {
      if (error instanceof ComputerReadOnlyError) return COMPUTER_READ_ONLY_MESSAGE;
      return `computer action failed: ${describeError(error)}`;
    }
  };

  // ── Argument validation (strict:false ⇒ the SDK does not enforce the schema) ──

  const asArgs = (input: unknown): Record<string, unknown> => {
    if (typeof input !== "object" || input === null || Array.isArray(input)) {
      throw new TypeError("invalid arguments: expected a JSON object");
    }
    return input as Record<string, unknown>;
  };

  const finiteNumber = (args: Record<string, unknown>, name: string): number => {
    const value = args[name];
    if (typeof value !== "number" || !Number.isFinite(value)) {
      throw new TypeError(`invalid arguments: "${name}" must be a finite number`);
    }
    return value;
  };

  const MOUSE_BUTTONS: readonly string[] = ["left", "right", "wheel", "back", "forward"];
  const mouseButton = (args: Record<string, unknown>): ComputerButton => {
    const value = args["button"] ?? "left";
    if (typeof value !== "string" || !MOUSE_BUTTONS.includes(value)) {
      throw new TypeError(`invalid arguments: "button" must be one of ${MOUSE_BUTTONS.join("|")}`);
    }
    return value as ComputerButton;
  };

  const stringArg = (args: Record<string, unknown>, name: string): string => {
    const value = args[name];
    if (typeof value !== "string") {
      throw new TypeError(`invalid arguments: "${name}" must be a string`);
    }
    return value;
  };

  const stringList = (args: Record<string, unknown>, name: string): string[] => {
    const value = args[name];
    if (!Array.isArray(value) || !value.every((item): item is string => typeof item === "string")) {
      throw new TypeError(`invalid arguments: "${name}" must be an array of strings`);
    }
    return value;
  };

  const pointList = (args: Record<string, unknown>, name: string): [number, number][] => {
    const value = args[name];
    if (!Array.isArray(value)) {
      throw new TypeError(`invalid arguments: "${name}" must be an array of {x, y} points`);
    }
    return value.map((point): [number, number] => {
      const coords = asArgs(point);
      return [finiteNumber(coords, "x"), finiteNumber(coords, "y")];
    });
  };

  return [
    tool({
      name: "computer_screenshot",
      description:
        "Capture the current desktop and return it as an image. Call this FIRST and again after each action — all coordinates for click/move/scroll/drag are pixels of the most recent screenshot.",
      parameters: objectSchema({}, []) as never,
      strict: false,
      execute: async () => {
        try {
          // screenshot() returns raw base64 PNG and NEVER an empty string (it throws
          // instead), so the model can't receive an empty image_url.
          const b64 = await computer.screenshot();
          const bytes = Uint8Array.from(Buffer.from(b64, "base64"));
          const image = imageOutputFromScreenshotBytes(bytes);
          // On the codex backend return the structured image output so the model
          // SEES the desktop (agents-core normalizes {type:'image'} → an input_image
          // data-URL content item in the function_call_output); chat-completions
          // providers get the text data-URL string they expect.
          return imageFunctionResults ? image : renderImageForTextTransport(image);
        } catch (error) {
          // Same policy as the write tools: a failed capture is reported to the
          // model as text instead of escaping as an uncaught tool error.
          return `computer screenshot failed: ${describeError(error)}`;
        }
      },
    }),
    tool({
      name: "computer_click",
      description:
        "Click the mouse at (x, y). `button` is one of left|right|wheel|back|forward (default left). Take a computer_screenshot first to find coordinates.",
      parameters: objectSchema(
        { ...COORD_PROPS, button: { type: "string", enum: ["left", "right", "wheel", "back", "forward"], description: "Mouse button; defaults to left" } },
        ["x", "y"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const args = asArgs(input);
          const x = finiteNumber(args, "x");
          const y = finiteNumber(args, "y");
          const button = mouseButton(args);
          await computer.click(x, y, button);
          return `clicked ${button} at (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_double_click",
      description: "Double-click the left mouse button at (x, y). Take a computer_screenshot first to find coordinates.",
      parameters: objectSchema({ ...COORD_PROPS }, ["x", "y"]) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const args = asArgs(input);
          const x = finiteNumber(args, "x");
          const y = finiteNumber(args, "y");
          await computer.doubleClick(x, y);
          return `double-clicked at (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_move",
      description: "Move the mouse cursor to (x, y) without clicking.",
      parameters: objectSchema({ ...COORD_PROPS }, ["x", "y"]) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const args = asArgs(input);
          const x = finiteNumber(args, "x");
          const y = finiteNumber(args, "y");
          await computer.move(x, y);
          return `moved to (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_scroll",
      description:
        "Scroll at (x, y) by scroll_x / scroll_y pixels (positive scroll_y scrolls down, negative up; positive scroll_x scrolls right).",
      parameters: objectSchema(
        {
          ...COORD_PROPS,
          scroll_x: { type: "integer", description: "Horizontal scroll amount in pixels (positive = right)" },
          scroll_y: { type: "integer", description: "Vertical scroll amount in pixels (positive = down)" },
        },
        ["x", "y", "scroll_x", "scroll_y"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const args = asArgs(input);
          const x = finiteNumber(args, "x");
          const y = finiteNumber(args, "y");
          const scrollX = finiteNumber(args, "scroll_x");
          const scrollY = finiteNumber(args, "scroll_y");
          await computer.scroll(x, y, scrollX, scrollY);
          return `scrolled (${scrollX}, ${scrollY}) at (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_type",
      description: "Type a literal text string at the current keyboard focus. Click the target field first.",
      parameters: objectSchema({ text: { type: "string", description: "The literal text to type" } }, ["text"]) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const text = stringArg(asArgs(input), "text");
          await computer.type(text);
          return `typed ${text.length} character(s)`;
        }),
    }),
    tool({
      name: "computer_keypress",
      description:
        'Press a key or chord. `keys` is an ordered list pressed together, e.g. ["ctrl","c"] or ["Enter"]. Use key names (ctrl, alt, shift, cmd, enter, tab, esc, arrows…), not characters.',
      parameters: objectSchema(
        { keys: { type: "array", items: { type: "string" }, description: "Keys pressed together as a chord" } },
        ["keys"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const keys = stringList(asArgs(input), "keys");
          await computer.keypress(keys);
          return `pressed ${keys.join("+")}`;
        }),
    }),
    tool({
      name: "computer_drag",
      description:
        "Drag the left mouse button along a path of points. `path` is an ordered list of {x, y} pixels; the button is pressed at the first point, moved through each, and released at the last.",
      parameters: objectSchema(
        {
          path: {
            type: "array",
            description: "Ordered list of points to drag through",
            items: {
              type: "object",
              properties: { x: { type: "integer" }, y: { type: "integer" } },
              required: ["x", "y"],
              additionalProperties: false,
            },
          },
        },
        ["path"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const points = pointList(asArgs(input), "path");
          await computer.drag(points);
          return `dragged through ${points.length} point(s)`;
        }),
    }),
  ] as unknown as Tool<unknown>[];
}
761
+
762
+ // ── The capability (the SDK seam) ────────────────────────────────────────────
763
+
764
/** Options accepted by {@link computerUse} / `ComputerUseCapability`. */
export type ComputerUseArgs = {
  /** Desktop size as a `[number, number]` pair, forwarded to the bound Computer when set. */
  dimensions?: [number, number];
  /** Forwarded to the bound Computer when defined; also makes the function-transport write tools answer with the read-only message. */
  readOnly?: boolean;
  /** Display identifier; only forwarded to the xdotool/scrot-backed `SandboxComputer`. */
  display?: string;
  /** Approval gate handed to the computer tool(s): a fixed boolean or a (possibly async) predicate. */
  needsApproval?: boolean | ((ctx: unknown, action: unknown) => boolean | Promise<boolean>);
  /**
   * Deliver screenshots from the FUNCTION tools as a REAL image the model can see
   * (a structured `{type:'image'}` tool output → agents-core normalizes it to an
   * `input_image` content item inside the function_call_output) instead of the
   * text data-URL string. Only the codex/ChatGPT backend can read structured image
   * tool results; chat-completions providers cannot, so this stays OFF (text
   * rendering) by default and is turned on only on the codex path (see index.ts).
   */
  imageFunctionResults?: boolean;
};
777
+
778
/**
 * Factory for the computer-use capability.
 *
 * @param args Capability options; defaults to an empty object.
 * @returns A new `ComputerUseCapability` constructed from `args`.
 */
export function computerUse(args: ComputerUseArgs = {}): ComputerUseCapability {
  const capability = new ComputerUseCapability(args);
  return capability;
}
781
+
782
/**
 * A `Capability` subclass merged into the agent's tool set by SandboxAgent
 * (`tools = [...agent.tools, ...capability.tools()]`). `bind(session)` hands it
 * the LIVE externally-owned session, so the agent's actions and the viewers'
 * pixels are one display.
 *
 * `tools()` is TRANSPORT-AWARE, mirroring the SDK's `filesystem()` capability
 * (which branches its `view_image` / `apply_patch` on
 * `supportsStructuredToolOutputTransport(this._modelInstance)`):
 *  • structured transport (the Responses/OpenAI backend) → the single HOSTED
 *    `computer_use_preview` tool over a Computer bound to the session.
 *  • text transport (codex / ChatGPT backend — or an unbound model) → a set of
 *    FUNCTION tools ({@link computerFunctionTools}) that route to the SAME
 *    Computer, because the codex backend rejects the hosted computer tool type.
 *
 * The bound model instance is captured by the SDK's
 * `bind().bindRunAs().bindModel()` chain (base `Capability._modelInstance`); the
 * codex path in index.ts neutralizes `bindModel` so `_modelInstance` stays
 * undefined here → the function tools win.
 */
export class ComputerUseCapability extends Capability {
  readonly type = "computer-use";

  constructor(private args: ComputerUseArgs = {}) {
    super();
  }

  /**
   * Build the computer tool(s) for the currently bound session and model.
   *
   * @returns The hosted computer tool alone on the structured transport, or the
   *   function-transport tool set otherwise.
   */
  override tools(): Tool<unknown>[] {
    const session = requireBoundSession("computer-use", this._session);
    const { dimensions, readOnly, display, needsApproval, imageFunctionResults } = this.args;

    // Options both Computer implementations accept. Keys are only set when a
    // value was supplied, so an explicit `undefined` is never passed down.
    const sharedOptions: { dimensions?: [number, number]; readOnly?: boolean } = {};
    if (dimensions) sharedOptions.dimensions = dimensions;
    if (readOnly !== undefined) sharedOptions.readOnly = readOnly;

    // Backend-aware: a SELF-HOSTED session (macOS OR bring-your-own Linux) drives
    // the desktop NATIVELY (CGEvent/XTEST inject + ScreenCaptureKit/x11 capture
    // over the control plane) — no xdotool/scrot on the user's machine required.
    // Everything else (Modal) keeps the xdotool/scrot-over-exec SandboxComputer.
    // `isNativeDesktopSession` is the duck-typed discriminator.
    let computer: Computer;
    if (isNativeDesktopSession(session)) {
      computer = new NativeDesktopComputer(session, sharedOptions);
    } else {
      computer = new SandboxComputer(session, {
        ...sharedOptions,
        ...(display ? { display } : {}),
        // The SDK base exposes the bound runAs as a protected field.
        ...(typeof this._runAs === "string" ? { runAs: this._runAs } : {}),
      });
    }

    // The codex / text backend gets the FUNCTION tools it can actually call…
    if (!supportsStructuredToolOutputTransport(this._modelInstance)) {
      return computerFunctionTools(computer, readOnly ?? false, needsApproval, imageFunctionResults ?? false);
    }

    // …while the structured transport keeps the single HOSTED computer tool.
    const hostedTool = computerTool({
      computer,
      ...(needsApproval !== undefined ? { needsApproval: needsApproval as never } : {}),
    }) as unknown as Tool<unknown>;
    return [hostedTool];
  }
}