@opengeni/runtime 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-2PO56VAL.js +3478 -0
- package/dist/chunk-2PO56VAL.js.map +1 -0
- package/dist/index.d.ts +912 -0
- package/dist/index.js +3663 -0
- package/dist/index.js.map +1 -0
- package/dist/sandbox/index.d.ts +1738 -0
- package/dist/sandbox/index.js +187 -0
- package/dist/sandbox/index.js.map +1 -0
- package/package.json +49 -0
- package/src/bundled_hashicorp_terraform_skills/LICENSE +373 -0
- package/src/bundled_hashicorp_terraform_skills/README.md +18 -0
- package/src/bundled_hashicorp_terraform_skills/UPSTREAM_GIT_SHA +1 -0
- package/src/bundled_hashicorp_terraform_skills/azure-verified-modules/SKILL.md +613 -0
- package/src/bundled_hashicorp_terraform_skills/checkov/SKILL.md +43 -0
- package/src/bundled_hashicorp_terraform_skills/refactor-module/SKILL.md +538 -0
- package/src/bundled_hashicorp_terraform_skills/social-media-marketing/SKILL.md +35 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-search-import/SKILL.md +372 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-search-import/references/MANUAL-IMPORT.md +113 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-search-import/scripts/list_resources.sh +38 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-stacks/SKILL.md +480 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/api-monitoring.md +543 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/component-blocks.md +476 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/deployment-blocks.md +391 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/examples.md +1529 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/linked-stacks.md +187 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-stacks/references/troubleshooting.md +671 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-style-guide/SKILL.md +353 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-test/SKILL.md +451 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-test/references/CI_CD.md +80 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-test/references/EXAMPLES.md +314 -0
- package/src/bundled_hashicorp_terraform_skills/terraform-test/references/MOCK_PROVIDERS.md +171 -0
- package/src/codex-tool-search.ts +267 -0
- package/src/context-compaction.ts +538 -0
- package/src/history-sanitizer.ts +719 -0
- package/src/index.ts +3299 -0
- package/src/sandbox/capabilities.ts +69 -0
- package/src/sandbox/channel-a.ts +1031 -0
- package/src/sandbox/display-stack.ts +231 -0
- package/src/sandbox/errors.ts +34 -0
- package/src/sandbox/index.ts +832 -0
- package/src/sandbox/providers/blaxel.ts +35 -0
- package/src/sandbox/providers/cloudflare.ts +24 -0
- package/src/sandbox/providers/daytona.ts +34 -0
- package/src/sandbox/providers/docker.ts +17 -0
- package/src/sandbox/providers/e2b.ts +36 -0
- package/src/sandbox/providers/index.ts +107 -0
- package/src/sandbox/providers/local.ts +13 -0
- package/src/sandbox/providers/modal.ts +55 -0
- package/src/sandbox/providers/none.ts +13 -0
- package/src/sandbox/providers/runloop.ts +32 -0
- package/src/sandbox/providers/selfhosted.ts +96 -0
- package/src/sandbox/providers/types.ts +38 -0
- package/src/sandbox/providers/vercel.ts +29 -0
- package/src/sandbox/recording.ts +286 -0
- package/src/sandbox/routing/backend-resolver.ts +189 -0
- package/src/sandbox/routing/routing-session.ts +455 -0
- package/src/sandbox/select.ts +371 -0
- package/src/sandbox/selfhosted/capabilities.ts +255 -0
- package/src/sandbox/selfhosted/control-rpc.ts +351 -0
- package/src/sandbox/selfhosted/session.ts +930 -0
- package/src/sandbox/selfhosted/testing.ts +230 -0
- package/src/sandbox/stream-port.ts +185 -0
- package/src/sandbox/stream-token.ts +90 -0
- package/src/sandbox/terminal-server.ts +203 -0
- package/src/sandbox-computer.ts +835 -0
|
@@ -0,0 +1,835 @@
|
|
|
1
|
+
// packages/runtime/src/sandbox-computer.ts — the agent computer-use surface (P4.3).
|
|
2
|
+
//
|
|
3
|
+
// A `Computer` impl backed by xdotool (mouse/keyboard/move/click/type/key) +
|
|
4
|
+
// scrot (screenshots), issued through the SAME externally-owned `session` the
|
|
5
|
+
// human watches over Channel B. The agent and the human share ONE :0 display —
|
|
6
|
+
// zero projection: ffmpeg reads exactly the pixels xdotool draws. Exposed to the
|
|
7
|
+
// Agents SDK as a `computerTool` carried by `ComputerUseCapability`, pushed into
|
|
8
|
+
// `buildAgentCapabilities` when `computerUseEnabled && desktopCapableBackend`.
|
|
9
|
+
//
|
|
10
|
+
// This file lives OUTSIDE the @opengeni/runtime/sandbox agent-loop-free leaf
|
|
11
|
+
// (it imports `computerTool` from the @openai/agents root, which the leaf forbids)
|
|
12
|
+
// and is wired into the agent-loop barrel (packages/runtime/src/index.ts).
|
|
13
|
+
//
|
|
14
|
+
// ── Adversarial-review fixes folded in (module 05 §Adversarial) ──────────────
|
|
15
|
+
// F1 exec is OPTIONAL on every provider (Modal has only execCommand) — the
|
|
16
|
+
// primitive dual-paths `session.exec ?? session.execCommand`.
|
|
17
|
+
// F2 execCommand returns a FORMATTED STRING with a metadata preamble, not raw
|
|
18
|
+
// stdout — screenshots read the PNG by running `base64 <path>` over the SAME
|
|
19
|
+
// command primitive and stripping the banner (NOT `session.readFile`: Modal's
|
|
20
|
+
// readFile path-validates against the /workspace root and THROWS
|
|
21
|
+
// "Sandbox path /tmp/…png escapes the workspace root", so the /tmp scrot can
|
|
22
|
+
// never be read → empty frame → `image_url: ''` → model 400). This mirrors
|
|
23
|
+
// recording.ts/channel-a fsReadViaExec. Exit codes come from the established
|
|
24
|
+
// `sandboxCommandExitCode` parser, not a `.exitCode` field.
|
|
25
|
+
// F3 exec/execCommand YIELDS (does not wait) — a `sandboxCommandStillRunning`
|
|
26
|
+
// result is logged as a warning and returned (NOT thrown) so the follow-up
|
|
27
|
+
// screenshot still runs; the input commands complete well under the yield window.
|
|
28
|
+
// F4 import paths: `computerTool`/`Computer` from `@openai/agents` (root, via
|
|
29
|
+
// the agents-core star re-export); `Capability`/`requireBoundSession` from
|
|
30
|
+
// `@openai/agents/sandbox`. `Button` is NOT exported — the union is inlined.
|
|
31
|
+
// F5 scroll deltas are model PIXELS (often hundreds) — divided by a notch step
|
|
32
|
+
// and clamped, NOT used as literal wheel-click `--repeat` counts.
|
|
33
|
+
|
|
34
|
+
import { computerTool, tool, type Computer, type Tool } from "@openai/agents";
|
|
35
|
+
import { Capability, type SandboxSessionLike } from "@openai/agents/sandbox";
|
|
36
|
+
import { KeyAction, PointerAction, PointerButton, type DesktopInputRequest } from "@opengeni/agent-proto";
|
|
37
|
+
|
|
38
|
+
import { sandboxCommandExitCode, sandboxCommandOutput, sandboxCommandStillRunning } from "./index";
|
|
39
|
+
// `stripExecBanner` is the SAME pure helper recording.ts uses to recover the raw
|
|
40
|
+
// command body from Modal's execCommand banner ("…Output:\n<body>"). Imported from
|
|
41
|
+
// the agent-loop-free leaf (importing a pure parser FROM the leaf is allowed — the
|
|
42
|
+
// leaf boundary only forbids the leaf importing the agent loop, not the reverse).
|
|
43
|
+
import { stripExecBanner } from "./sandbox";
|
|
44
|
+
|
|
45
|
+
// The SDK keeps `requireBoundSession` in @openai/agents-core/sandbox/capabilities/base
// and does not re-export it from the public @openai/agents/sandbox barrel, so the
// same trivial bound-session guard is reproduced locally.
/**
 * Returns the bound session, or fails if the capability has not been bound yet.
 *
 * @param capabilityType - capability name, used only in the error message.
 * @param session - the session handed to the capability by `bind(session)`, if any.
 * @returns the same `session` when it is truthy.
 * @throws ComputerUnavailableError when `session` is missing.
 */
function requireBoundSession(capabilityType: string, session?: SandboxSessionLike): SandboxSessionLike {
  if (session) {
    return session;
  }
  throw new ComputerUnavailableError(`capability "${capabilityType}" used before bind(session)`);
}
|
|
54
|
+
|
|
55
|
+
// `Button` is intentionally NOT imported (it is not a public export, F4) — the
|
|
56
|
+
// union is inlined and kept in lockstep with @openai/agents-core/computer.d.ts.
|
|
57
|
+
type ComputerButton = "left" | "right" | "wheel" | "back" | "forward";
|
|
58
|
+
|
|
59
|
+
const DEFAULT_DISPLAY = ":0";
|
|
60
|
+
const DEFAULT_DIMENSIONS: [number, number] = [1280, 800];
|
|
61
|
+
// Commands must complete well under this (F3): xdotool/scrot of a 1280x800 PNG is
|
|
62
|
+
// sub-second; the wait gives headroom on a cold gVisor box without masking a wedge.
|
|
63
|
+
const ACTION_YIELD_MS = 15_000;
|
|
64
|
+
// Model scroll deltas are pixels (F5); one wheel "notch" ≈ this many pixels. e2b
|
|
65
|
+
// uses a similar divisor. Clamp keeps a runaway delta from spamming the wheel.
|
|
66
|
+
const SCROLL_NOTCH_PIXELS = 100;
|
|
67
|
+
const SCROLL_MAX_CLICKS = 15;
|
|
68
|
+
// screenshot() never hands the model an empty image_url (the SDK turns "" into
|
|
69
|
+
// `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
|
|
70
|
+
// a zero-byte frame on the first scrot; bounded retries with a short pause let a
|
|
71
|
+
// momentarily-unpainted-but-live display self-heal before we FAIL LOUD.
|
|
72
|
+
const SCREENSHOT_MAX_ATTEMPTS = 3;
|
|
73
|
+
const SCREENSHOT_RETRY_DELAY_MS = 400;
|
|
74
|
+
|
|
75
|
+
/** Construction options for `SandboxComputer`; every field is optional. */
export type SandboxComputerOptions = {
  /** X display the commands are prefixed with, e.g. ":0". */
  display?: string;
  /** Screen geometry as [width, height]; must match the Xvfb geometry. */
  dimensions?: [number, number];
  /** Provider runAs user (modal/docker: "sandbox"); leave undefined otherwise. */
  runAs?: string;
  /** Per-keystroke delay passed to `xdotool type --delay` (default 12ms). */
  typeDelayMs?: number;
  /** When true, every WRITE action throws ComputerReadOnlyError. */
  readOnly?: boolean;
  /** Directory screenshots are written to before being read back, e.g. "/tmp". */
  screenshotTmpDir?: string;
};
|
|
83
|
+
|
|
84
|
+
// X keysym map for keypress(): model key names (lower-cased) → xdotool keysyms.
// The DOM-style `arrow*` names are aliases for the plain direction keys; without
// them "ArrowUp" etc. would be handed to xdotool verbatim, which is not an X keysym.
const KEYSYM: Record<string, string> = {
  ctrl: "ctrl", control: "ctrl", alt: "alt", option: "alt", shift: "shift",
  cmd: "super", meta: "super", win: "super", super: "super",
  enter: "Return", return: "Return", tab: "Tab", esc: "Escape", escape: "Escape",
  backspace: "BackSpace", delete: "Delete", space: "space",
  up: "Up", down: "Down", left: "Left", right: "Right",
  arrowup: "Up", arrowdown: "Down", arrowleft: "Left", arrowright: "Right",
  pageup: "Prior", pagedown: "Next", home: "Home", end: "End",
};

/**
 * Translate one model key name into the token passed to `xdotool key`.
 *
 * Resolution order (case-insensitive):
 *  1. an entry in `KEYSYM`;
 *  2. `f1`..`f12` → upper-cased ("F1".."F12");
 *  3. a single character → its lower-cased form;
 *  4. anything else → returned unchanged.
 *
 * @param k - key name as supplied by the model.
 * @returns always a string.
 */
function toKeysym(k: string): string {
  const low = k.toLowerCase();
  // Own-property check: a bare `KEYSYM[low]` would also hit members inherited from
  // Object.prototype ("constructor", "toString", …) and return a function.
  const mapped = Object.prototype.hasOwnProperty.call(KEYSYM, low) ? KEYSYM[low] : undefined;
  if (mapped) return mapped;
  if (/^f([1-9]|1[0-2])$/.test(low)) return low.toUpperCase();
  return low.length === 1 ? low : k;
}
|
|
99
|
+
const BUTTON_NUM: Record<ComputerButton, number> = { left: 1, wheel: 2, right: 3, back: 8, forward: 9 };
|
|
100
|
+
|
|
101
|
+
// The structural slice of a provider session computer-use drives. exec and
|
|
102
|
+
// execCommand are optional because the SDK's SandboxSessionLike leaves them
|
|
103
|
+
// optional (Modal implements execCommand, not exec — F1). readFile is intentionally
|
|
104
|
+
// NOT in this type: screenshots read the /tmp PNG via `base64 <path>` over
|
|
105
|
+
// exec/execCommand (readFile path-validates against /workspace and rejects /tmp).
|
|
106
|
+
type ExecResultLike = { output?: string; stdout?: string; stderr?: string; exitCode?: number | null; sessionId?: number };
|
|
107
|
+
type ComputerSession = {
|
|
108
|
+
exec?: (args: { cmd: string; runAs?: string; yieldTimeMs?: number; maxOutputTokens?: number }) => Promise<ExecResultLike>;
|
|
109
|
+
execCommand?: (args: { cmd: string; runAs?: string; yieldTimeMs?: number; maxOutputTokens?: number }) => Promise<string>;
|
|
110
|
+
};
|
|
111
|
+
|
|
112
|
+
/**
 * Raised when the computer cannot be driven at all: the session offers neither
 * exec nor execCommand, or the display is not up.
 */
export class ComputerUnavailableError extends Error {
  /** @param message - human-readable reason, used verbatim as `Error.message`. */
  constructor(message: string) {
    super(message);
    this.name = "ComputerUnavailableError";
  }
}
|
|
116
|
+
/** Raised when a write action is attempted on a computer configured as read-only. */
export class ComputerReadOnlyError extends Error {
  /** Takes no arguments; the message is fixed. */
  constructor() {
    super("computer-use is read-only — write actions are disabled");
    this.name = "ComputerReadOnlyError";
  }
}
|
|
120
|
+
/**
 * A failed computer action: carries the shell line, its exit code and the text
 * captured from the command.
 *
 * NOTE(review): earlier documentation also listed "command did not finish before
 * the yield window" as a cause; confirm against the current callers before relying
 * on that.
 */
export class ComputerActionError extends Error {
  /** The shell command that failed. */
  public cmd: string;
  /** The exit code reported for the command. */
  public exitCode: number;
  /** Diagnostic text for the failure; appended to the message when non-empty. */
  public stderr: string;

  /**
   * @param cmd - the shell command that failed.
   * @param exitCode - the exit code reported for it.
   * @param stderr - diagnostic text; when non-empty it follows the command on a new line.
   */
  constructor(cmd: string, exitCode: number, stderr: string) {
    const detail = stderr ? `\n${stderr}` : "";
    super(`computer action failed (${exitCode}): ${cmd}${detail}`);
    this.cmd = cmd;
    this.exitCode = exitCode;
    this.stderr = stderr;
    this.name = "ComputerActionError";
  }
}
|
|
128
|
+
|
|
129
|
+
/**
 * The Computer the agent drives. Every action issues ONE shell line through the
 * externally-owned session (exec ?? execCommand, F1), prefixed with the display.
 * screenshot() scrots to a file under the tmp dir and reads the RAW bytes by running
 * `base64 <path>` over the SAME command primitive and stripping the banner — NOT
 * `session.readFile` (Modal's readFile path-validates against /workspace and rejects
 * /tmp with "escapes the workspace root", which would yield an empty frame and 400
 * the model). The base64-over-exec path is /tmp-readable and binary-safe.
 *
 * Values interpolated into shell lines (display, screenshot path, typed text, key
 * chords) are shell-quoted; plain values such as ":0" or "/tmp/x.png" are emitted
 * unquoted so the default command lines are unchanged.
 */
export class SandboxComputer implements Computer {
  /** The eight-byte PNG file signature; used to reject non-image payloads. */
  private static readonly PNG_SIGNATURE: readonly number[] = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a];

  readonly environment = "ubuntu" as const;
  readonly dimensions: [number, number];
  private session: ComputerSession;
  private readonly display: string;
  private readonly runAs?: string;
  private readonly typeDelayMs: number;
  private readonly readOnly: boolean;
  private readonly tmp: string;

  /**
   * @param session - the externally-owned sandbox session commands are issued through.
   * @param opts - optional overrides; unspecified fields fall back to the module defaults.
   */
  constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
    this.session = session as unknown as ComputerSession;
    this.display = opts.display ?? DEFAULT_DISPLAY;
    this.dimensions = opts.dimensions ?? DEFAULT_DIMENSIONS;
    if (opts.runAs !== undefined) {
      this.runAs = opts.runAs;
    }
    this.typeDelayMs = opts.typeDelayMs ?? 12;
    this.readOnly = opts.readOnly ?? false;
    this.tmp = opts.screenshotTmpDir ?? "/tmp";
  }

  /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
  rebind(session: SandboxSessionLike) { this.session = session as unknown as ComputerSession; }

  /**
   * The single command primitive. Dual-paths exec/execCommand (F1), then applies the
   * string-aware parsers (F2/F3).
   *
   * @param cmd - the shell line to run (the DISPLAY prefix is added here).
   * @returns the command OUTPUT body as parsed by `sandboxCommandOutput`.
   * @throws ComputerUnavailableError when the session has neither exec nor execCommand.
   * @throws ComputerActionError when a non-null, nonzero exit code is reported.
   *
   * A "still running" result is NOT thrown: it is logged and the output is returned
   * (see the inline note for why).
   */
  private async x(cmd: string): Promise<string> {
    const args = {
      cmd: `DISPLAY=${this.shArg(this.display)} ${cmd}`,
      ...(this.runAs ? { runAs: this.runAs } : {}),
      yieldTimeMs: ACTION_YIELD_MS,
      maxOutputTokens: 4_000,
    };
    let result: ExecResultLike | string;
    if (typeof this.session.exec === "function") {
      result = await this.session.exec(args);
    } else if (typeof this.session.execCommand === "function") {
      result = await this.session.execCommand(args);
    } else {
      throw new ComputerUnavailableError("session cannot run commands (no exec/execCommand)");
    }
    const output = sandboxCommandOutput(result);
    if (sandboxCommandStillRunning(result)) {
      // F3: the command exceeded the yield window. WARN AND RETURN rather than
      // throw. Throwing here causes the SDK's catch in `_runComputerActionAndScreenshot`
      // to set output='' and build `{image_url:""}` → Azure 400. By returning
      // instead, the SDK proceeds past the action loop and calls computer.screenshot()
      // so the model gets the REAL current frame for its next step.
      //
      // screenshot() still fails loud: if scrot itself times out, the follow-up
      // read yields no valid PNG and the retry loop eventually throws.
      console.warn(
        `[SandboxComputer] action command did not finish before the ${ACTION_YIELD_MS}ms yield window — proceeding to screenshot: ${cmd}`,
      );
      return output;
    }
    const exitCode = sandboxCommandExitCode(result);
    if (exitCode !== null && exitCode !== 0) {
      throw new ComputerActionError(cmd, exitCode, output);
    }
    return output;
  }

  /** Throws ComputerReadOnlyError when this computer was constructed read-only. */
  private guardWrite() {
    if (this.readOnly) throw new ComputerReadOnlyError();
  }

  /** POSIX single-quote `s`, escaping embedded single quotes as `'\''`. */
  private shq(s: string): string {
    return `'${s.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Shell-safe form of a configuration-derived argument (display, file path).
   * Values made only of characters that need no quoting are returned unchanged so
   * default command lines stay as they were; anything else is single-quoted.
   */
  private shArg(s: string): string {
    return /^[A-Za-z0-9_@%+=:,.\/-]+$/.test(s) ? s : this.shq(s);
  }

  /** True when `bytes` begins with the PNG file signature. */
  private static isPng(bytes: Uint8Array): boolean {
    const sig = SandboxComputer.PNG_SIGNATURE;
    return bytes.length >= sig.length && sig.every((b, i) => bytes[i] === b);
  }

  /**
   * Capture the display and return the PNG as raw base64 (no data-URL prefix).
   *
   * CRITICAL CONTRACT: this NEVER returns an empty string. The Agents SDK builds
   * the model-facing image as `data:image/png;base64,${output}` — an empty output
   * becomes `image_url: ''`, which the model API rejects and which kills the turn.
   * An empty or invalid frame is therefore retried (a cold display can yield a
   * zero-byte first frame) up to SCREENSHOT_MAX_ATTEMPTS times with
   * SCREENSHOT_RETRY_DELAY_MS between attempts, and then THROWN.
   *
   * @throws the last error seen once all attempts are exhausted.
   */
  async screenshot(): Promise<string> {
    let lastError: unknown;
    for (let attempt = 0; attempt < SCREENSHOT_MAX_ATTEMPTS; attempt++) {
      if (attempt > 0) {
        await new Promise((r) => setTimeout(r, SCREENSHOT_RETRY_DELAY_MS));
      }
      // Unique name per attempt so concurrent/retried captures never collide.
      const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
      const quoted = this.shArg(f);
      try {
        await this.x(`scrot --pointer --overwrite ${quoted}`);
        const bytes = await this.readScreenshotBytes(f);
        if (bytes.length === 0) {
          // A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
          // hand the model an empty image_url; throw on the final attempt.
          throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
        }
        return Buffer.from(bytes).toString("base64");
      } catch (error) {
        lastError = error;
      } finally {
        // Best-effort cleanup on every attempt (success OR failure); never mask the
        // screenshot result.
        await this.x(`rm -f ${quoted}`).catch(() => undefined);
      }
    }
    // Exhausted retries: FAIL LOUD. Returning "" here would surface to the model as
    // an invalid empty image_url.
    if (lastError instanceof Error) {
      throw lastError;
    }
    throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
  }

  /**
   * Read the screenshot bytes by running `base64 <path>` through the SAME command
   * primitive (exec ?? execCommand) — NOT `session.readFile` and NOT `this.x()`
   * (the latter's `sandboxCommandOutput` parser drops the execCommand STRING body).
   * The raw result is captured, the execCommand banner stripped, whitespace removed,
   * and the remainder base64-decoded. `maxOutputTokens: null` disables output
   * truncation so a full-screen PNG is never clipped.
   *
   * @param path - absolute path of the PNG inside the sandbox.
   * @returns the decoded PNG bytes, or an empty array when the command printed nothing.
   * @throws ComputerUnavailableError when the session cannot run commands, or when
   *   the decoded payload does not start with the PNG signature (for example the
   *   command printed an error message instead of image data).
   */
  private async readScreenshotBytes(path: string): Promise<Uint8Array> {
    const args = {
      cmd: `DISPLAY=${this.shArg(this.display)} base64 ${this.shArg(path)}`,
      ...(this.runAs ? { runAs: this.runAs } : {}),
      yieldTimeMs: ACTION_YIELD_MS,
      // null disables the provider's output truncation so a full-screen PNG's
      // base64 is never clipped (the SDK's truncateOutput passes through on null).
      maxOutputTokens: null as unknown as number,
    };
    let raw: string;
    if (typeof this.session.exec === "function") {
      // The exec-object path exposes a structured stdout/output body.
      raw = sandboxCommandOutput(await this.session.exec(args));
    } else if (typeof this.session.execCommand === "function") {
      // execCommand returns the formatted STRING — strip the banner to recover the
      // base64 body (sandboxCommandOutput would drop it for the string form).
      raw = stripExecBanner(await this.session.execCommand(args));
    } else {
      throw new ComputerUnavailableError("session cannot run commands (no exec/execCommand) — screenshots unavailable");
    }
    const b64 = raw.replace(/\s+/g, "");
    if (b64.length === 0) return new Uint8Array();
    const bytes = Uint8Array.from(Buffer.from(b64, "base64"));
    // Buffer's base64 decoder is lenient: arbitrary text (e.g. "base64: …: No such
    // file") decodes to non-empty garbage. Only a real PNG may reach the model.
    if (!SandboxComputer.isPng(bytes)) {
      throw new ComputerUnavailableError("screenshot payload is not a PNG (scrot/base64 failed?)");
    }
    return bytes;
  }

  /** Move to (xp, yp) and click `button` once (unknown buttons fall back to left). */
  async click(xp: number, yp: number, button: ComputerButton) {
    this.guardWrite();
    await this.x(`xdotool mousemove --sync ${xp} ${yp} click ${BUTTON_NUM[button] ?? 1}`);
  }

  /** Move to (xp, yp) and left-click twice with a 60ms gap. */
  async doubleClick(xp: number, yp: number) {
    this.guardWrite();
    await this.x(`xdotool mousemove --sync ${xp} ${yp} click --repeat 2 --delay 60 1`);
  }

  /** Move the pointer to (xp, yp). Counts as a write action. */
  async move(xp: number, yp: number) {
    this.guardWrite();
    await this.x(`xdotool mousemove --sync ${xp} ${yp}`);
  }

  /**
   * Scroll at (xp, yp). `sx`/`sy` are model PIXEL deltas (F5): each is converted to
   * wheel notches (|delta| / SCROLL_NOTCH_PIXELS, rounded) and capped at
   * SCROLL_MAX_CLICKS. Negative `sy` uses button 4, otherwise 5; negative `sx` uses
   * button 6, otherwise 7. A delta that rounds to zero notches emits no click.
   */
  async scroll(xp: number, yp: number, sx: number, sy: number) {
    this.guardWrite();
    const notches = (px: number): number => Math.min(SCROLL_MAX_CLICKS, Math.max(0, Math.round(Math.abs(px) / SCROLL_NOTCH_PIXELS)));
    const vBtn = sy < 0 ? 4 : 5;
    const hBtn = sx < 0 ? 6 : 7;
    const vN = notches(sy);
    const hN = notches(sx);
    let cmd = `xdotool mousemove --sync ${xp} ${yp}`;
    if (vN) cmd += ` click --repeat ${vN} ${vBtn}`;
    if (hN) cmd += ` click --repeat ${hN} ${hBtn}`;
    await this.x(cmd);
  }

  /** Type `text` literally via `xdotool type`, using the configured per-key delay. */
  async type(text: string) {
    this.guardWrite();
    await this.x(`xdotool type --delay ${this.typeDelayMs} -- ${this.shq(text)}`);
  }

  /** Press `keys` together as one chord; each name is mapped through `toKeysym`. */
  async keypress(keys: string[]) {
    this.guardWrite();
    const combo = keys.map(toKeysym).join("+");
    await this.x(`xdotool key -- ${this.shq(combo)}`);
  }

  /**
   * Drag with the left button along `path`: press at the first point, move through
   * the remaining points, release at the last. An empty path is a no-op.
   */
  async drag(path: [number, number][]) {
    this.guardWrite();
    if (path.length === 0) return;
    const [sx0, sy0] = path[0]!;
    let cmd = `xdotool mousemove --sync ${sx0} ${sy0} mousedown 1`;
    for (const [px, py] of path.slice(1)) cmd += ` mousemove --sync ${px} ${py}`;
    cmd += ` mouseup 1`;
    await this.x(cmd);
  }

  /** Pause for one second; issues no command and is allowed in read-only mode. */
  async wait() {
    await new Promise((r) => setTimeout(r, 1000));
  }
}
|
|
344
|
+
|
|
345
|
+
// ── The native-desktop computer (self-hosted / macOS) ────────────────────────
|
|
346
|
+
//
|
|
347
|
+
// `SandboxComputer` drives the desktop by shelling out to xdotool/scrot over the
|
|
348
|
+
// session's `exec` — which needs those X utilities installed in the box image and
|
|
349
|
+
// only works under X11. A SELF-HOSTED machine (macOS OR bring-your-own Linux) has
|
|
350
|
+
// neither guarantee, so it drives the desktop NATIVELY over the control plane: the
|
|
351
|
+
// Rust agent injects input via CGEvent (macOS) / XTEST (Linux) and captures via
|
|
352
|
+
// ScreenCaptureKit / x11, exposed as the two `SelfhostedSession` ops below. No
|
|
353
|
+
// xdotool/scrot dependency; works on macOS.
|
|
354
|
+
|
|
355
|
+
/** The structural slice of a self-hosted session the native computer drives — the
|
|
356
|
+
* two control-plane ops added in session.ts. Kept structural (NOT an import of
|
|
357
|
+
* `SelfhostedSession`) so this agent-loop file never hard-couples to the sandbox
|
|
358
|
+
* leaf; the duck-typed `isNativeDesktopSession` probe (below) selects on it. */
|
|
359
|
+
export type NativeDesktopSession = {
|
|
360
|
+
desktopInput(event: DesktopInputRequest["event"]): Promise<void>;
|
|
361
|
+
screenshot(): Promise<{ png: Uint8Array; width: number; height: number }>;
|
|
362
|
+
};
|
|
363
|
+
|
|
364
|
+
/** Model `Button` → wire `PointerButton`. The proto has no back/forward button, so
|
|
365
|
+
* those degrade to UNSPECIFIED (the agent ignores an unmapped button rather than
|
|
366
|
+
* mis-clicking). A total record so indexing is exhaustive. */
|
|
367
|
+
const POINTER_BUTTON: Record<ComputerButton, PointerButton> = {
|
|
368
|
+
left: PointerButton.POINTER_BUTTON_LEFT,
|
|
369
|
+
right: PointerButton.POINTER_BUTTON_RIGHT,
|
|
370
|
+
wheel: PointerButton.POINTER_BUTTON_MIDDLE,
|
|
371
|
+
back: PointerButton.POINTER_BUTTON_UNSPECIFIED,
|
|
372
|
+
forward: PointerButton.POINTER_BUTTON_UNSPECIFIED,
|
|
373
|
+
};
|
|
374
|
+
|
|
375
|
+
export type NativeDesktopComputerOptions = {
|
|
376
|
+
dimensions?: [number, number]; // the display geometry (must match the capture size)
|
|
377
|
+
environment?: NonNullable<Computer["environment"]>; // "ubuntu" (default) | "mac" | ...; model uses it for OS key conventions
|
|
378
|
+
readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
|
|
379
|
+
};
|
|
380
|
+
|
|
381
|
+
/**
|
|
382
|
+
* A `Computer` that drives a SELF-HOSTED machine's OWN desktop NATIVELY over the
|
|
383
|
+
* control plane (`desktopInput` inject + `screenshot` capture on the bound
|
|
384
|
+
* `SelfhostedSession`) instead of xdotool/scrot over `exec`. Consent + epoch are
|
|
385
|
+
* enforced AGENT-side, so an unconsented inject surfaces the session's mapped
|
|
386
|
+
* control error.
|
|
387
|
+
*
|
|
388
|
+
* screenshot() returns raw base64 with NO data-URL prefix — the EXACT contract of
|
|
389
|
+
* `SandboxComputer.screenshot`: the Agents SDK wraps it as
|
|
390
|
+
* `data:image/png;base64,${output}`, so an empty string would become
|
|
391
|
+
* `image_url: ''` and 400 the model turn. An empty PNG therefore THROWS
|
|
392
|
+
* `ComputerUnavailableError` (mirroring SandboxComputer's empty-guard) — the model
|
|
393
|
+
* never receives an empty image_url.
|
|
394
|
+
*/
|
|
395
|
+
export class NativeDesktopComputer implements Computer {
|
|
396
|
+
readonly environment: NonNullable<Computer["environment"]>;
|
|
397
|
+
readonly dimensions: [number, number];
|
|
398
|
+
private session: NativeDesktopSession;
|
|
399
|
+
private readonly readOnly: boolean;
|
|
400
|
+
|
|
401
|
+
constructor(session: NativeDesktopSession, opts: NativeDesktopComputerOptions = {}) {
|
|
402
|
+
this.session = session;
|
|
403
|
+
this.dimensions = opts.dimensions ?? DEFAULT_DIMENSIONS;
|
|
404
|
+
// Default "ubuntu" (self-hosted Linux is the near-term target); a macOS session
|
|
405
|
+
// should pass "mac" so the model uses ⌘-based shortcuts — see the coordinate TODO.
|
|
406
|
+
this.environment = opts.environment ?? "ubuntu";
|
|
407
|
+
this.readOnly = opts.readOnly ?? false;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
/** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
|
|
411
|
+
rebind(session: NativeDesktopSession) { this.session = session; }
|
|
412
|
+
|
|
413
|
+
private guardWrite() {
|
|
414
|
+
if (this.readOnly) throw new ComputerReadOnlyError();
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
/**
 * Send one pointer event to the session's `desktopInput`.
 *
 * COORDINATE SEAM — TODO(verify e2e on macOS): the model derives x/y from the
 * pixels of the screenshot it just saw, while the agent's macOS CGEvent inject
 * treats x/y as raw screen coordinates. On a Retina Mac, ScreenCaptureKit may
 * capture at 2× the logical POINT space while CGEvent expects logical points, so
 * the two could disagree by a factor of 2. Measure on a real Retina Mac (compare
 * the screenshot's reported width/height against the logical display bounds)
 * before adding any DPR scaling — do NOT add scaling speculatively. Self-hosted
 * Linux (XTEST/x11) is 1:1 and unaffected.
 *
 * @param x - horizontal coordinate, forwarded unchanged.
 * @param y - vertical coordinate, forwarded unchanged.
 * @param action - the pointer action to perform.
 * @param button - the wire button the action applies to.
 */
private async pointer(x: number, y: number, action: PointerAction, button: PointerButton): Promise<void> {
  const event: DesktopInputRequest["event"] = {
    $case: "pointer",
    pointer: { x, y, action, button },
  };
  await this.session.desktopInput(event);
}
|
|
429
|
+
|
|
430
|
+
/**
 * Capture one frame through the session and return it as raw base64 (no data-URL
 * prefix).
 *
 * CRITICAL CONTRACT: never returns "". The Agents SDK builds the model image as
 * `data:image/png;base64,${output}`, so an empty output would become
 * `image_url: ''` and the model API would 400 and kill the turn. A single capture
 * is taken (no retry loop) and an empty frame is a THROW.
 *
 * @throws ComputerUnavailableError when the captured PNG has zero length.
 */
async screenshot(): Promise<string> {
  const frame = await this.session.screenshot();
  const { png } = frame;
  if (png.length !== 0) {
    return Buffer.from(png).toString("base64");
  }
  throw new ComputerUnavailableError("native desktop screenshot returned an empty frame (display not up?)");
}
|
|
443
|
+
|
|
444
|
+
async click(x: number, y: number, button: ComputerButton) {
|
|
445
|
+
this.guardWrite();
|
|
446
|
+
await this.pointer(x, y, PointerAction.POINTER_ACTION_CLICK, POINTER_BUTTON[button] ?? PointerButton.POINTER_BUTTON_LEFT);
|
|
447
|
+
}
|
|
448
|
+
/**
 * Double-click at (x, y). Always uses the left button; `guardWrite()` runs
 * before any event is sent.
 */
async doubleClick(x: number, y: number) {
  this.guardWrite();
  const action = PointerAction.POINTER_ACTION_DOUBLE_CLICK;
  const button = PointerButton.POINTER_BUTTON_LEFT;
  await this.pointer(x, y, action, button);
}
|
|
452
|
+
/**
 * Move the pointer to (x, y) without pressing anything: the event carries the
 * UNSPECIFIED button. `guardWrite()` runs first.
 */
async move(x: number, y: number) {
  this.guardWrite();
  const action = PointerAction.POINTER_ACTION_MOVE;
  const noButton = PointerButton.POINTER_BUTTON_UNSPECIFIED;
  await this.pointer(x, y, action, noButton);
}
|
|
456
|
+
/**
 * Scroll at (x, y) by (sx, sy).
 *
 * The model's scroll deltas are PIXELS. They are forwarded untouched as a
 * ScrollEvent{x,y,deltaX,deltaY}; the native inject on the agent side translates
 * them to wheel events per platform. There is deliberately no xdotool-style
 * "notch" quantization here: that is an xdotool-specific artifact, and the agent
 * owns the platform-appropriate scaling.
 */
async scroll(x: number, y: number, sx: number, sy: number) {
  this.guardWrite();
  const scroll = { x, y, deltaX: sx, deltaY: sy };
  await this.session.desktopInput({ $case: "scroll", scroll });
}
|
|
464
|
+
/**
 * Type `text` as one literal burst.
 *
 * `isText: true` tells the agent to type the string verbatim (Unicode-aware)
 * instead of interpreting it as a key name.
 */
async type(text: string) {
  this.guardWrite();
  const key = { key: text, isText: true, action: KeyAction.KEY_ACTION_PRESS };
  await this.session.desktopInput({ $case: "key", key });
}
|
|
470
|
+
/**
 * Press a chord (e.g. "ctrl+c") as ONE non-text KeyEvent.
 *
 * `isText: false` means the payload is interpreted as key names. The model's
 * PLATFORM-INDEPENDENT names are joined with "+" and sent as-is. They are NOT
 * mapped to xdotool X keysyms: SandboxComputer's toKeysym yields "Return" /
 * "super" / "Prior", which are X-specific and wrong for the macOS CGEvent path.
 * The agent owns the per-platform key-name → keycode mapping (the KeyEvent.key
 * contract).
 */
async keypress(keys: string[]) {
  this.guardWrite();
  const chord = keys.join("+");
  const key = { key: chord, isText: false, action: KeyAction.KEY_ACTION_PRESS };
  await this.session.desktopInput({ $case: "key", key });
}
|
|
479
|
+
/**
 * Drag the left button along `path`.
 *
 * Sequence: DOWN at the first point, MOVE (button held) through every following
 * waypoint, UP at the last point. The agent tracks button state across the
 * DOWN → MOVE… → UP. An empty path is a no-op; a single-point path presses and
 * releases at that point.
 *
 * If a MOVE fails after the button went down, the button is released
 * (best-effort) at the last point the pointer is known to have reached, so a
 * failed drag does not leave the left button held on the host desktop. The
 * original failure is then rethrown unchanged.
 *
 * @param path ordered [x, y] waypoints in screenshot pixels
 * @throws whatever `guardWrite()` or the underlying pointer call throws
 */
async drag(path: [number, number][]) {
  this.guardWrite();
  if (path.length === 0) return;

  const [startX, startY] = path[0]!;
  await this.pointer(startX, startY, PointerAction.POINTER_ACTION_DOWN, PointerButton.POINTER_BUTTON_LEFT);

  // Last position successfully delivered; this is where the button gets released.
  let lastX = startX;
  let lastY = startY;
  try {
    for (const [px, py] of path.slice(1)) {
      await this.pointer(px, py, PointerAction.POINTER_ACTION_MOVE, PointerButton.POINTER_BUTTON_LEFT);
      lastX = px;
      lastY = py;
    }
  } catch (error) {
    try {
      await this.pointer(lastX, lastY, PointerAction.POINTER_ACTION_UP, PointerButton.POINTER_BUTTON_LEFT);
    } catch {
      // Best-effort release only: the MOVE failure is the error worth reporting.
    }
    throw error;
  }

  // After a clean loop (lastX, lastY) is the final waypoint of the path.
  await this.pointer(lastX, lastY, PointerAction.POINTER_ACTION_UP, PointerButton.POINTER_BUTTON_LEFT);
}
|
|
492
|
+
/** Pause for one second (a 1000 ms timer) before resolving. */
async wait() {
  const pauseMs = 1000;
  await new Promise((resolve) => {
    setTimeout(resolve, pauseMs);
  });
}
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
/**
 * Backend-aware SELECTION discriminator.
 *
 * A SELF-HOSTED session exposes the two native control-plane ops
 * (`desktopInput` + `screenshot`); a MODAL session does not (it drives the
 * desktop via xdotool/scrot over `exec`). The check is duck-typed on exactly
 * those two methods, which keeps this file from hard-importing
 * `SelfhostedSession` (no agent-loop ↔ sandbox-leaf import coupling) and means
 * any backend that later grows native inject/capture is picked up automatically.
 *
 * @returns true when both `desktopInput` and `screenshot` are functions
 */
export function isNativeDesktopSession(session: SandboxSessionLike): session is SandboxSessionLike & NativeDesktopSession {
  const candidate = session as Partial<NativeDesktopSession>;
  if (typeof candidate.desktopInput !== "function") {
    return false;
  }
  return typeof candidate.screenshot === "function";
}
|
|
509
|
+
|
|
510
|
+
// ── Function-transport (codex / text backend) computer tools ─────────────────
|
|
511
|
+
//
|
|
512
|
+
// The SDK emits computer-use ONLY as the HOSTED `computer_use_preview` tool, which
|
|
513
|
+
// the codex / ChatGPT backend rejects (it accepts only function/custom/web_search
|
|
514
|
+
// tool types) — so on codex the hosted tool is unusable and the agent has nothing
|
|
515
|
+
// to drive the desktop with. We mirror EXACTLY how the SDK's filesystem capability
|
|
516
|
+
// degrades `view_image` for the text transport: when the bound model does NOT
|
|
517
|
+
// support the structured tool-output transport, emit a set of FUNCTION tools that
|
|
518
|
+
// route to the SAME bound `Computer`, and hand the model the screen by rendering
|
|
519
|
+
// the screenshot image-output as a text-transport data URL — the identical two-step
|
|
520
|
+
// `imageOutputFromBytes` → `renderImageForTextTransport` the SDK's text `view_image`
|
|
521
|
+
// uses. Those three helpers are NOT public exports of `@openai/agents` /
|
|
522
|
+
// `@openai/agents/sandbox` (they live in the SDK's private capabilities/transport +
|
|
523
|
+
// shared/media modules, unreachable via the package `exports` map), so — mirroring
|
|
524
|
+
// selfhosted/session.ts's local `sniffImageMediaType` — the three tiny pure helpers
|
|
525
|
+
// are reimplemented here in lockstep with the SDK.
|
|
526
|
+
|
|
527
|
+
/**
 * Local mirror of the SDK's tool-output image shape
 * (@openai/agents-core shared/media `ToolOutputImage`): raw image bytes plus
 * their media type, tagged with `type: "image"`.
 */
type ToolOutputImage = {
  type: "image";
  image: {
    data: Uint8Array;
    mediaType: string;
  };
};
|
|
529
|
+
|
|
530
|
+
/**
 * Magic-byte image sniff, kept in lockstep with the SDK's `sniffImageMediaType`
 * (shared/media).
 *
 * Recognizes PNG, JPEG, GIF and WebP (RIFF container with "WEBP" at offset 8).
 * Screenshots are always PNG (scrot / ScreenCaptureKit / x11), so an
 * unrecognized, truncated or empty header defaults to image/png rather than
 * failing the frame.
 *
 * @param bytes raw image bytes
 * @returns the sniffed media type, or "image/png" when nothing matches
 */
function sniffScreenshotMediaType(bytes: Uint8Array): string {
  // True when `signature` appears in `bytes` starting at `offset`; reads past the
  // end yield undefined and therefore never match.
  const matchesAt = (offset: number, signature: readonly number[]): boolean =>
    signature.every((expected, i) => bytes[offset + i] === expected);

  if (matchesAt(0, [0x89, 0x50, 0x4e, 0x47])) {
    return "image/png";
  }
  if (matchesAt(0, [0xff, 0xd8, 0xff])) {
    return "image/jpeg";
  }
  if (matchesAt(0, [0x47, 0x49, 0x46, 0x38])) {
    return "image/gif";
  }
  const isRiff = matchesAt(0, [0x52, 0x49, 0x46, 0x46]);
  if (isRiff && matchesAt(8, [0x57, 0x45, 0x42, 0x50])) {
    return "image/webp";
  }
  return "image/png";
}
|
|
543
|
+
|
|
544
|
+
/**
 * Wrap raw screenshot bytes in the SDK `ToolOutputImage` structure, i.e. the
 * shape the SDK's `imageOutputFromBytes` produces
 * (`{type:'image', image:{data,mediaType}}`).
 *
 * The bytes are copied (`Uint8Array.from`) and the media type is sniffed from
 * the header.
 */
function imageOutputFromScreenshotBytes(bytes: Uint8Array): ToolOutputImage {
  const data = Uint8Array.from(bytes);
  const mediaType = sniffScreenshotMediaType(bytes);
  return {
    type: "image",
    image: { data, mediaType },
  };
}
|
|
549
|
+
|
|
550
|
+
/**
 * Render an image tool-output as a text-transport string, in lockstep with the
 * SDK's private `renderImageForTextTransport` (capabilities/filesystem).
 *
 * A string passes through untouched. An image output always carries `data` as
 * bytes here, so it becomes a `data:<mediaType>;base64,…` URL, the exact form the
 * text-backend `view_image` hands the model. A non-string `mediaType` falls back
 * to "application/octet-stream".
 */
function renderImageForTextTransport(output: ToolOutputImage | string): string {
  if (typeof output !== "string") {
    const { data, mediaType: declaredType } = output.image;
    const mediaType = typeof declaredType === "string" ? declaredType : "application/octet-stream";
    const payload = Buffer.from(data).toString("base64");
    return "data:" + mediaType + ";base64," + payload;
  }
  return output;
}
|
|
560
|
+
|
|
561
|
+
/**
 * Whether the bound model supports the structured tool-output transport, in
 * lockstep with the SDK's private `supportsStructuredToolOutputTransport`
 * (capabilities/transport).
 *
 * A ChatCompletions-family model (constructor name contains "ChatCompletions")
 * and an UNBOUND model (any falsy value) do NOT, so they get the function
 * tools; every other model keeps the hosted `computer_use_preview` tool. The
 * codex neutralize trick in index.ts drops `_modelInstance`, so this returns
 * false there and the function tools win.
 */
function supportsStructuredToolOutputTransport(modelInstance: unknown): boolean {
  if (!modelInstance) {
    return false;
  }

  // Only objects with a callable `constructor` contribute a name; everything
  // else is treated as the empty name, which never matches "ChatCompletions".
  let constructorName = "";
  if (typeof modelInstance === "object") {
    const ctor = (modelInstance as { constructor?: unknown }).constructor;
    if (typeof ctor === "function") {
      constructorName = (ctor as { name?: string }).name ?? "";
    }
  }

  return constructorName.includes("ChatCompletions") === false;
}
|
|
575
|
+
|
|
576
|
+
/** Model-facing reply used in place of performing a write action when computer-use is read-only. */
const COMPUTER_READ_ONLY_MESSAGE = "computer-use is read-only for this session — click, double_click, move, scroll, type, keypress, and drag are disabled. Call computer_screenshot to observe the desktop.";
|
|
578
|
+
|
|
579
|
+
// Shared x/y coordinate properties reused by every pointer tool's schema. These
// are raw JSON schema fragments (NOT zod: zod is not a @opengeni/runtime
// dependency) used with `strict:false`, mirroring the SDK's own `apply_patch`
// function-tool schema style.
const COORD_PROPS = {
  x: {
    type: "integer",
    description: "X coordinate in the pixels of the most recent computer_screenshot",
  },
  y: {
    type: "integer",
    description: "Y coordinate in the pixels of the most recent computer_screenshot",
  },
} as const;
|
|
586
|
+
|
|
587
|
+
/**
 * Build a closed JSON-schema object node.
 *
 * @param properties property name → sub-schema map, used as-is (not copied)
 * @param required names of the properties that must be present
 * @returns `{ type: "object", properties, required, additionalProperties: false }`
 */
function objectSchema(properties: Record<string, unknown>, required: string[]): Record<string, unknown> {
  const schema: Record<string, unknown> = {
    type: "object",
    properties,
    required,
    additionalProperties: false,
  };
  return schema;
}
|
|
590
|
+
|
|
591
|
+
/**
 * The FUNCTION-transport computer tools for the codex / text backend, each routing
 * to the SAME bound `Computer` the hosted `computer_use_preview` tool would drive.
 *
 * `computer_screenshot` hands the model the desktop two ways, selected by
 * `imageFunctionResults`:
 *   • false (chat-completions providers, the default) → the text-transport
 *     `data:image/png;base64,…` URL (imageOutputFromBytes → renderImageForTextTransport,
 *     the SDK's text `view_image` path); those backends can't read structured image
 *     tool results.
 *   • true (the codex/ChatGPT backend) → the structured `{type:'image'}` tool output,
 *     which agents-core normalizes into an `input_image` content item inside the
 *     function_call_output. The codex /responses backend accepts and SEES it (a text
 *     data-URL there is just unreadable text). See index.ts for why it's on there.
 *
 * Write tools return a concise confirmation. When read-only they return
 * {@link COMPUTER_READ_ONLY_MESSAGE} instead of throwing, and any action error is
 * returned as a string so a failed action never kills the turn. Because the schemas
 * are `strict:false`, the model's arguments are NOT trusted: argument validation and
 * confirmation formatting both run inside the guarded section, so a malformed call
 * (missing `keys`, non-numeric `x`, …) also comes back as a "computer action failed"
 * string rather than an uncaught TypeError.
 *
 * Exported so it can be unit-tested against a fake `Computer`.
 *
 * @param computer the bound Computer every tool routes to
 * @param readOnly when true, every write tool answers with the read-only message
 * @param needsApproval optional approval gate attached to every write tool
 * @param imageFunctionResults return screenshots as structured image output (see above)
 * @returns the screenshot tool followed by the seven write tools
 */
export function computerFunctionTools(
  computer: Computer,
  readOnly: boolean,
  needsApproval?: ComputerUseArgs["needsApproval"],
  imageFunctionResults = false,
): Tool<unknown>[] {
  const approval = needsApproval !== undefined ? { needsApproval: needsApproval as never } : {};

  // Run a WRITE action and resolve to its confirmation. Read-only sessions and any
  // failure (bad arguments included) surface as a model-readable string rather than
  // an uncaught throw (an uncaught throw becomes a tool error the backend may 400
  // on, or kills the step).
  const write = async (perform: () => Promise<string>): Promise<string> => {
    if (readOnly) return COMPUTER_READ_ONLY_MESSAGE;
    try {
      return await perform();
    } catch (error) {
      if (error instanceof ComputerReadOnlyError) return COMPUTER_READ_ONLY_MESSAGE;
      return `computer action failed: ${error instanceof Error ? error.message : String(error)}`;
    }
  };

  // View the raw tool input as a property bag; anything that is not an object
  // reads as empty so the field validators below report what is missing.
  const fields = (input: unknown): Record<string, unknown> =>
    typeof input === "object" && input !== null ? (input as Record<string, unknown>) : {};

  // Require a finite number for the named argument.
  const finite = (value: unknown, name: string): number => {
    if (typeof value !== "number" || !Number.isFinite(value)) {
      throw new TypeError(`\`${name}\` must be a finite number`);
    }
    return value;
  };

  // Extract validated {x, y} coordinates from a tool input or a path element.
  const point = (input: unknown): { x: number; y: number } => {
    const bag = fields(input);
    return { x: finite(bag["x"], "x"), y: finite(bag["y"], "y") };
  };

  return [
    tool({
      name: "computer_screenshot",
      description:
        "Capture the current desktop and return it as an image. Call this FIRST and again after each action — all coordinates for click/move/scroll/drag are pixels of the most recent screenshot.",
      parameters: objectSchema({}, []) as never,
      strict: false,
      execute: async () => {
        // screenshot() returns raw base64 PNG and NEVER an empty string (it throws
        // instead), so the model can't receive an empty image_url.
        const b64 = await computer.screenshot();
        const bytes = Uint8Array.from(Buffer.from(b64, "base64"));
        const image = imageOutputFromScreenshotBytes(bytes);
        // On the codex backend return the structured image output so the model SEES
        // the desktop (agents-core normalizes {type:'image'} → an input_image data-URL
        // content item in the function_call_output); chat-completions providers get
        // the text data-URL string they expect.
        return imageFunctionResults ? image : renderImageForTextTransport(image);
      },
    }),
    tool({
      name: "computer_click",
      description:
        "Click the mouse at (x, y). `button` is one of left|right|wheel|back|forward (default left). Take a computer_screenshot first to find coordinates.",
      parameters: objectSchema(
        { ...COORD_PROPS, button: { type: "string", enum: ["left", "right", "wheel", "back", "forward"], description: "Mouse button; defaults to left" } },
        ["x", "y"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const { x, y } = point(input);
          const button = (fields(input)["button"] ?? "left") as ComputerButton;
          await computer.click(x, y, button);
          return `clicked ${button} at (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_double_click",
      description: "Double-click the left mouse button at (x, y). Take a computer_screenshot first to find coordinates.",
      parameters: objectSchema({ ...COORD_PROPS }, ["x", "y"]) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const { x, y } = point(input);
          await computer.doubleClick(x, y);
          return `double-clicked at (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_move",
      description: "Move the mouse cursor to (x, y) without clicking.",
      parameters: objectSchema({ ...COORD_PROPS }, ["x", "y"]) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const { x, y } = point(input);
          await computer.move(x, y);
          return `moved to (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_scroll",
      description:
        "Scroll at (x, y) by scroll_x / scroll_y pixels (positive scroll_y scrolls down, negative up; positive scroll_x scrolls right).",
      parameters: objectSchema(
        {
          ...COORD_PROPS,
          scroll_x: { type: "integer", description: "Horizontal scroll amount in pixels (positive = right)" },
          scroll_y: { type: "integer", description: "Vertical scroll amount in pixels (positive = down)" },
        },
        ["x", "y", "scroll_x", "scroll_y"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const { x, y } = point(input);
          const bag = fields(input);
          const scrollX = finite(bag["scroll_x"], "scroll_x");
          const scrollY = finite(bag["scroll_y"], "scroll_y");
          await computer.scroll(x, y, scrollX, scrollY);
          return `scrolled (${scrollX}, ${scrollY}) at (${x}, ${y})`;
        }),
    }),
    tool({
      name: "computer_type",
      description: "Type a literal text string at the current keyboard focus. Click the target field first.",
      parameters: objectSchema({ text: { type: "string", description: "The literal text to type" } }, ["text"]) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const text = fields(input)["text"];
          if (typeof text !== "string") {
            throw new TypeError("`text` must be a string");
          }
          await computer.type(text);
          return `typed ${text.length} character(s)`;
        }),
    }),
    tool({
      name: "computer_keypress",
      description:
        'Press a key or chord. `keys` is an ordered list pressed together, e.g. ["ctrl","c"] or ["Enter"]. Use key names (ctrl, alt, shift, cmd, enter, tab, esc, arrows…), not characters.',
      parameters: objectSchema(
        { keys: { type: "array", items: { type: "string" }, description: "Keys pressed together as a chord" } },
        ["keys"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const keys = fields(input)["keys"];
          if (!Array.isArray(keys) || keys.some((key) => typeof key !== "string")) {
            throw new TypeError("`keys` must be an array of key-name strings");
          }
          const chord = keys as string[];
          await computer.keypress(chord);
          return `pressed ${chord.join("+")}`;
        }),
    }),
    tool({
      name: "computer_drag",
      description:
        "Drag the left mouse button along a path of points. `path` is an ordered list of {x, y} pixels; the button is pressed at the first point, moved through each, and released at the last.",
      parameters: objectSchema(
        {
          path: {
            type: "array",
            description: "Ordered list of points to drag through",
            items: {
              type: "object",
              properties: { x: { type: "integer" }, y: { type: "integer" } },
              required: ["x", "y"],
              additionalProperties: false,
            },
          },
        },
        ["path"],
      ) as never,
      strict: false,
      ...approval,
      execute: async (input) =>
        write(async () => {
          const path = fields(input)["path"];
          if (!Array.isArray(path)) {
            throw new TypeError("`path` must be an array of {x, y} points");
          }
          const points = path.map((entry): [number, number] => {
            const { x, y } = point(entry);
            return [x, y];
          });
          await computer.drag(points);
          return `dragged through ${points.length} point(s)`;
        }),
    }),
  ] as unknown as Tool<unknown>[];
}
|
|
761
|
+
|
|
762
|
+
// ── The capability (the SDK seam) ────────────────────────────────────────────
|
|
763
|
+
|
|
764
|
+
/** Options accepted by {@link computerUse} / `ComputerUseCapability`. All fields are optional. */
export type ComputerUseArgs = {
  /** Two-number tuple forwarded to the Computer implementation (presumably width × height; confirm against the Computer classes). */
  dimensions?: [number, number];
  /** Forwarded to the Computer implementation; also makes the function-transport write tools answer with the read-only message. */
  readOnly?: boolean;
  /** Display identifier; only the xdotool/scrot SandboxComputer receives it. */
  display?: string;
  /** Approval gate for computer actions: a fixed boolean or a (possibly async) predicate. */
  needsApproval?: boolean | ((ctx: unknown, action: unknown) => boolean | Promise<boolean>);
  /**
   * Deliver screenshots from the FUNCTION tools as a REAL image the model can see
   * (a structured `{type:'image'}` tool output → agents-core normalizes it to an
   * `input_image` content item inside the function_call_output) instead of the text
   * data-URL string. Only the codex/ChatGPT backend can read structured image tool
   * results; chat-completions providers cannot, so this stays OFF (text rendering)
   * by default and is turned on only on the codex path (see index.ts).
   */
  imageFunctionResults?: boolean;
};
|
|
777
|
+
|
|
778
|
+
/**
 * Factory for the computer-use capability.
 *
 * @param args capability options; defaults to `{}` when omitted
 * @returns a new `ComputerUseCapability` constructed with `args`
 */
export function computerUse(args: ComputerUseArgs = {}): ComputerUseCapability {
  const capability = new ComputerUseCapability(args);
  return capability;
}
|
|
781
|
+
|
|
782
|
+
/**
 * A `Capability` subclass merged into the agent's tool set by SandboxAgent
 * (`tools = [...agent.tools, ...capability.tools()]`). `bind(session)` hands it
 * the LIVE externally-owned session, so the agent's actions and the viewers'
 * pixels are one display.
 *
 * `tools()` is TRANSPORT-AWARE, mirroring the SDK's `filesystem()` capability
 * (which branches its `view_image` / `apply_patch` on
 * `supportsStructuredToolOutputTransport(this._modelInstance)`):
 *   • structured transport (the Responses/OpenAI backend) → the single HOSTED
 *     `computer_use_preview` tool over a Computer bound to the session (unchanged).
 *   • text transport (codex / ChatGPT backend, or an unbound model) → a set of
 *     FUNCTION tools ({@link computerFunctionTools}) that route to the SAME Computer,
 *     because the codex backend rejects the hosted computer tool type.
 *
 * The bound model instance is captured by the SDK's `bind().bindRunAs().bindModel()`
 * chain (base `Capability._modelInstance`); the codex path in index.ts neutralizes
 * `bindModel` so `_modelInstance` stays undefined here → the function tools win.
 */
export class ComputerUseCapability extends Capability {
  readonly type = "computer-use";

  constructor(private args: ComputerUseArgs = {}) {
    super();
  }

  /**
   * Build the computer tool(s) for the bound session.
   *
   * `requireBoundSession` is called with the capability name and the current
   * session before anything else (defined outside this view; presumably throws
   * when no session is bound).
   */
  override tools(): Tool<unknown>[] {
    const session = requireBoundSession("computer-use", this._session);
    const { dimensions, readOnly, display, needsApproval, imageFunctionResults } = this.args;

    // Options both Computer implementations accept; a key is present only when set.
    const sharedOptions = {
      ...(dimensions ? { dimensions } : {}),
      ...(readOnly !== undefined ? { readOnly } : {}),
    };

    // Backend-aware: a SELF-HOSTED session (macOS OR bring-your-own Linux) drives the
    // desktop NATIVELY (CGEvent/XTEST inject + ScreenCaptureKit/x11 capture over the
    // control plane), so no xdotool/scrot is required on the user's machine.
    // Everything else (Modal) keeps the xdotool/scrot-over-exec SandboxComputer. See
    // `isNativeDesktopSession` for the duck-typed discriminator.
    let computer: Computer;
    if (isNativeDesktopSession(session)) {
      computer = new NativeDesktopComputer(session, sharedOptions);
    } else {
      computer = new SandboxComputer(session, {
        ...sharedOptions,
        ...(display ? { display } : {}),
        // The SDK base exposes the bound runAs as a protected field.
        ...(typeof this._runAs === "string" ? { runAs: this._runAs } : {}),
      });
    }

    // The codex / text backend gets the FUNCTION tools it can actually call.
    if (!supportsStructuredToolOutputTransport(this._modelInstance)) {
      return computerFunctionTools(computer, readOnly ?? false, needsApproval, imageFunctionResults ?? false);
    }

    // Structured transport keeps the HOSTED computer tool (unchanged).
    const hosted = computerTool({
      computer,
      ...(needsApproval !== undefined ? { needsApproval: needsApproval as never } : {}),
    });
    return [hosted as unknown as Tool<unknown>];
  }
}
|