gclm-code 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +1 -1
  2. package/bin/gc.js +53 -25
  3. package/bin/install-runtime.js +253 -0
  4. package/package.json +10 -5
  5. package/vendor/manifest.json +92 -0
  6. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/package.json +9 -0
  7. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/bridgeClient.ts +1126 -0
  8. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/browserTools.ts +546 -0
  9. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/index.ts +15 -0
  10. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/mcpServer.ts +96 -0
  11. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/mcpSocketClient.ts +493 -0
  12. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/mcpSocketPool.ts +327 -0
  13. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/toolCalls.ts +301 -0
  14. package/vendor/modules/node_modules/@ant/claude-for-chrome-mcp/src/types.ts +134 -0
  15. package/vendor/modules/node_modules/@ant/computer-use-input/package.json +9 -0
  16. package/vendor/modules/node_modules/@ant/computer-use-input/src/driver-jxa.js +341 -0
  17. package/vendor/modules/node_modules/@ant/computer-use-input/src/driver-swift.swift +417 -0
  18. package/vendor/modules/node_modules/@ant/computer-use-input/src/implementation.js +204 -0
  19. package/vendor/modules/node_modules/@ant/computer-use-input/src/index.js +5 -0
  20. package/vendor/modules/node_modules/@ant/computer-use-mcp/package.json +11 -0
  21. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/deniedApps.ts +553 -0
  22. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/imageResize.ts +108 -0
  23. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/index.ts +69 -0
  24. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/keyBlocklist.ts +153 -0
  25. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/mcpServer.ts +313 -0
  26. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/pixelCompare.ts +171 -0
  27. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/sentinelApps.ts +43 -0
  28. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/subGates.ts +19 -0
  29. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/toolCalls.ts +3872 -0
  30. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/tools.ts +706 -0
  31. package/vendor/modules/node_modules/@ant/computer-use-mcp/src/types.ts +635 -0
  32. package/vendor/modules/node_modules/@ant/computer-use-swift/package.json +9 -0
  33. package/vendor/modules/node_modules/@ant/computer-use-swift/src/driver-jxa.js +108 -0
  34. package/vendor/modules/node_modules/@ant/computer-use-swift/src/implementation.js +706 -0
  35. package/vendor/modules/node_modules/@ant/computer-use-swift/src/index.js +7 -0
  36. package/vendor/modules/node_modules/audio-capture-napi/package.json +8 -0
  37. package/vendor/modules/node_modules/audio-capture-napi/src/index.ts +226 -0
  38. package/vendor/modules/node_modules/image-processor-napi/package.json +11 -0
  39. package/vendor/modules/node_modules/image-processor-napi/src/index.ts +396 -0
  40. package/vendor/modules/node_modules/modifiers-napi/package.json +8 -0
  41. package/vendor/modules/node_modules/modifiers-napi/src/index.ts +79 -0
  42. package/vendor/modules/node_modules/url-handler-napi/package.json +8 -0
  43. package/vendor/modules/node_modules/url-handler-napi/src/index.ts +62 -0
@@ -0,0 +1,3872 @@
1
+ /**
2
+ * Tool dispatch. Every security decision from plan §2 is enforced HERE,
3
+ * before any executor method is called.
4
+ *
5
+ * Enforcement order, every call:
6
+ * 1. Kill switch (`adapter.isDisabled()`).
7
+ * 2. Capability-scoped TCC gate (`adapter.ensureOsPermissions(required)`).
8
+ * Input tools request Accessibility, screenshot tools request Screen
9
+ * Recording, and mixed paths request the union. `request_access` /
10
+ * `request_teach_access` only READ the current TCC state and thread it to
11
+ * the renderer so the user can grant OS perms from inside the approval
12
+ * dialog.
13
+ * 3. Tool-specific gates (see dispatch table) — ANY exception in a gate
14
+ * returns a tool error, executor never called.
15
+ * 4. Executor call.
16
+ *
17
+ * For input actions (click/type/key/scroll/drag/move_mouse) the tool-specific
18
+ * gates are, in order:
19
+ * a. `prepareForAction` — hide every non-allowlisted app, then defocus us
20
+ * (battle-tested pre-action sequence from the Vercept acquisition).
21
+ * Sub-gated via `hideBeforeAction`. After this runs the screenshot is
22
+ * TRUE (what the
23
+ * model sees IS what's at each pixel) and we are not keyboard-focused.
24
+ * b. Frontmost gate — branched by actionKind:
25
+ * mouse: frontmost ∈ allowlist ∪ {hostBundleId, Finder} → pass.
26
+ * hostBundleId passes because the executor's
27
+ * `withClickThrough` bracket makes us click-through.
28
+ * keyboard: frontmost ∈ allowlist ∪ {Finder} → pass.
29
+ * hostBundleId → ERROR (safety net — defocus should have
30
+ * moved us off; if it didn't, typing would go into our
31
+ * own chat box).
32
+ * After step (a) this gate fires RARELY — only when something popped
33
+ * up between prepare and action, or the 5-try hide loop gave up.
34
+ * Checked FRESH on every call, not cached across calls.
35
+ *
36
+ * For click variants only, AFTER the above gates but BEFORE the executor call:
37
+ * c. Pixel-validation staleness check (sub-gated).
38
+ */
39
+
40
+ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
41
+ import { randomUUID } from "node:crypto";
42
+
43
+ import { getDefaultTierForApp, getDeniedCategoryForApp, isPolicyDenied } from "./deniedApps.js";
44
+ import type {
45
+ ComputerExecutor,
46
+ DisplayGeometry,
47
+ InstalledApp,
48
+ ScreenshotResult,
49
+ } from "./executor.js";
50
+ import { isSystemKeyCombo } from "./keyBlocklist.js";
51
+ import { validateClickTarget } from "./pixelCompare.js";
52
+ import { SENTINEL_BUNDLE_IDS } from "./sentinelApps.js";
53
+ import type {
54
+ AppGrant,
55
+ ComputerUseHostAdapter,
56
+ ComputerUseOverrides,
57
+ CoordinateMode,
58
+ CuAppPermTier,
59
+ CuGrantFlags,
60
+ CuOsPermissionRequirements,
61
+ CuPermissionRequest,
62
+ CuSubGates,
63
+ CuTeachPermissionRequest,
64
+ Logger,
65
+ ResolvedAppRequest,
66
+ TeachStepRequest,
67
+ } from "./types.js";
68
+
69
/**
 * Bundle id of Finder. The hide loop never hides Finder (hiding it kills the
 * Desktop), which is why Finder always counts as a valid frontmost app.
 */
const FINDER_BUNDLE_ID = "com.apple.finder";

/** OS-permission requirement: neither Accessibility nor Screen Recording. */
const NO_OS_PERMISSIONS = Object.freeze({ accessibility: false, screenRecording: false });

/** OS-permission requirement: Accessibility only. */
const ACCESSIBILITY_ONLY = Object.freeze({ accessibility: true, screenRecording: false });

/** OS-permission requirement: Screen Recording only. */
const SCREEN_RECORDING_ONLY = Object.freeze({ accessibility: false, screenRecording: true });

/** OS-permission requirement: both Accessibility and Screen Recording. */
const ACCESSIBILITY_AND_SCREEN_RECORDING = Object.freeze({ accessibility: true, screenRecording: true });
90
+
91
/**
 * Closed set of error categories reported with the cu_tool_call telemetry
 * event. Only these categorical values are used — never free text, because
 * error messages may contain file paths / app content (PII).
 */
export type CuErrorKind =
  | "allowlist_empty"
  | "tcc_not_granted"
  | "cu_lock_held"
  | "teach_mode_conflict"
  | "teach_mode_not_active"
  | "executor_threw"
  | "capture_failed"
  // No longer emitted (the tiered model replaced hard-deny); kept for schema compat.
  | "app_denied"
  // Malformed tool args (type/shape/range/unknown value).
  | "bad_args"
  // Target app not in the session allowlist (distinct from allowlist_empty).
  | "app_not_granted"
  // App is in the allowlist, but at a tier too low for the action.
  | "tier_insufficient"
  // Tool is callable but the session is not wired for it.
  | "feature_unavailable"
  // Wrong state for the action (call sequence, mouse already held).
  | "state_conflict"
  // Action needs a grant flag (systemKeyCombos, clipboard*) from request_access.
  | "grant_flag_required"
  // Display enumeration failed (platform).
  | "display_error"
  | "other";
112
+
113
/**
 * Telemetry data that handlers attach to a tool result. The host wrapper
 * (serverDef.ts) consumes and strips it before the result is handed to the
 * SDK — the same piggyback pattern used for `screenshot`.
 */
export interface CuCallTelemetry {
  /**
   * request_access / request_teach_access: number of apps NEWLY granted in
   * THIS call. Idempotent re-grants of already-allowed apps are not counted.
   */
  granted_count?: number;

  /** request_access / request_teach_access: apps denied in THIS call. */
  denied_count?: number;

  /** request_access / request_teach_access: apps safety-denied (browser) this call. */
  denied_browser_count?: number;

  /** request_access / request_teach_access: apps safety-denied (terminal) this call. */
  denied_terminal_count?: number;

  /** Categorical error class; only set when the result is an error. */
  error_kind?: CuErrorKind;
}
131
+
132
/**
 * A `CallToolResult` extended with host-only side-channel fields.
 *
 * After a `screenshot` tool call, `bindSessionContext` reads
 * `result.screenshot` and keeps it in a closure cell for the next
 * pixel-validation. MCP clients never see these extra fields: the host
 * wrapper strips them before returning to the SDK.
 */
export type CuCallToolResult = CallToolResult & {
  /** Screenshot payload produced by the call, if any. */
  screenshot?: ScreenshotResult;
  /** Piggybacked telemetry — stripped by the host wrapper before SDK return. */
  telemetry?: CuCallTelemetry;
};
143
+
144
+ // ---------------------------------------------------------------------------
145
+ // Small result helpers (mirror of chrome-mcp's inline `{content, isError}`)
146
+ // ---------------------------------------------------------------------------
147
+
148
/**
 * Build a failed tool result carrying one text message.
 *
 * @param text Message placed in the result's single text content item.
 * @param errorKind Optional telemetry category. When absent (or falsy) the
 *   `telemetry` field is present but `undefined`.
 * @returns A result with `isError: true`.
 */
function errorResult(text: string, errorKind?: CuErrorKind): CuCallToolResult {
  const telemetry: CuCallTelemetry | undefined = errorKind
    ? { error_kind: errorKind }
    : undefined;
  return { content: [{ type: "text", text }], isError: true, telemetry };
}
155
+
156
/**
 * Build a successful tool result carrying one text message.
 *
 * @param text Message placed in the result's single text content item.
 */
function okText(text: string): CuCallToolResult {
  const result: CuCallToolResult = {
    content: [{ type: "text", text }],
  };
  return result;
}
159
+
160
/**
 * Build a successful tool result whose text content is `obj` serialized with
 * `JSON.stringify`.
 *
 * @param obj Value to serialize.
 * @param telemetry Optional telemetry attached to the result as-is (the
 *   field is present but `undefined` when omitted).
 */
function okJson(obj: unknown, telemetry?: CuCallTelemetry): CuCallToolResult {
  const text = JSON.stringify(obj);
  return { content: [{ type: "text", text }], telemetry };
}
166
+
167
+ // ---------------------------------------------------------------------------
168
+ // Arg validation — lightweight, no zod (mirrors chrome-mcp's cast-and-check)
169
+ // ---------------------------------------------------------------------------
170
+
171
/**
 * View raw tool args as a string-keyed record.
 *
 * Any non-null value whose `typeof` is "object" (arrays included) is
 * returned unchanged; everything else yields a fresh empty object.
 */
function asRecord(args: unknown): Record<string, unknown> {
  const isObjectLike = args !== null && typeof args === "object";
  return isObjectLike ? (args as Record<string, unknown>) : {};
}
177
+
178
/**
 * Read `args[key]` as a finite number.
 *
 * @returns The number, or an `Error` (returned, not thrown) when the value
 *   is missing, not a number, or not finite (NaN / ±Infinity).
 */
function requireNumber(
  args: Record<string, unknown>,
  key: string,
): number | Error {
  const candidate = args[key];
  if (typeof candidate === "number" && Number.isFinite(candidate)) {
    return candidate;
  }
  return new Error(`"${key}" must be a finite number.`);
}
188
+
189
/**
 * Read `args[key]` as a string.
 *
 * @returns The string (empty strings are accepted), or an `Error` (returned,
 *   not thrown) when the value is missing or not a string.
 */
function requireString(
  args: Record<string, unknown>,
  key: string,
): string | Error {
  const candidate = args[key];
  return typeof candidate === "string"
    ? candidate
    : new Error(`"${key}" must be a string.`);
}
199
+
200
/**
 * Extract an `[x, y]` pair from `args[paramName]`.
 *
 * The value must be an array of exactly two finite, non-negative numbers.
 * NaN and ±Infinity are rejected (consistent with `requireNumber`) so that
 * non-finite values never reach coordinate scaling or the executor.
 *
 * @param args Tool arguments.
 * @param paramName Key holding the tuple; defaults to `"coordinate"`.
 * @returns The `[x, y]` tuple, or an `Error` (returned, not thrown)
 *   describing the first validation failure.
 */
function extractCoordinate(
  args: Record<string, unknown>,
  paramName: string = "coordinate",
): [number, number] | Error {
  const coord = args[paramName];
  if (coord === undefined) {
    return new Error(`${paramName} is required`);
  }
  if (!Array.isArray(coord) || coord.length !== 2) {
    return new Error(`${paramName} must be an array of length 2`);
  }
  const isValidComponent = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value) && value >= 0;
  const [x, y] = coord;
  if (!isValidComponent(x) || !isValidComponent(y)) {
    return new Error(`${paramName} must be a tuple of non-negative numbers`);
  }
  return [x, y];
}
221
+
222
+ // ---------------------------------------------------------------------------
223
+ // Coordinate scaling
224
+ // ---------------------------------------------------------------------------
225
+
226
/**
 * Map model-space coordinates to the logical points used for input.
 *
 * Three cases:
 * - `normalized_0_100`: the raw values are percentages of the `display`
 *   passed in, offset by that display's origin.
 * - `pixels` with a prior screenshot: the raw values are pixel coords in the
 *   last screenshot's image space. They are scaled by
 *   `displayWidth / width` (and the height equivalent) taken from the
 *   screenshot itself — i.e. the geometry stashed at capture time, not the
 *   fresh `display` — and offset by the screenshot's origin, so the
 *   transform matches what the model actually saw.
 * - `pixels` without a prior screenshot (cold start): degenerate fallback
 *   that divides by `display.scaleFactor` and logs a warning.
 *
 * @param rawX Model-supplied x.
 * @param rawY Model-supplied y.
 * @param mode Coordinate mode the model is using.
 * @param display Current display geometry.
 * @param lastScreenshot Most recent screenshot, if any.
 * @param logger Receives the cold-start warning.
 * @returns Rounded coordinates including the origin offset.
 */
function scaleCoord(
  rawX: number,
  rawY: number,
  mode: CoordinateMode,
  display: DisplayGeometry,
  lastScreenshot: ScreenshotResult | undefined,
  logger: Logger,
): { x: number; y: number } {
  if (mode === "normalized_0_100") {
    // Percentage of the selected display, placed in virtual-screen space.
    const x = Math.round((rawX / 100) * display.width) + display.originX;
    const y = Math.round((rawY / 100) * display.height) + display.originY;
    return { x, y };
  }

  // From here on mode === "pixels".
  if (!lastScreenshot) {
    // The model sent pixel coords without ever taking a screenshot.
    logger.warn(
      "[computer-use] pixels-mode coordinate received with no prior screenshot; " +
        "falling back to /scaleFactor. Click may be off if downsample is active.",
    );
    const x = Math.round(rawX / display.scaleFactor) + display.originX;
    const y = Math.round(rawY / display.scaleFactor) + display.originY;
    return { x, y };
  }

  // Screenshot-px → logical-pt ratios from the capture-time snapshot.
  const horizontalRatio = lastScreenshot.displayWidth / lastScreenshot.width;
  const verticalRatio = lastScreenshot.displayHeight / lastScreenshot.height;
  return {
    x: Math.round(rawX * horizontalRatio) + lastScreenshot.originX,
    y: Math.round(rawY * verticalRatio) + lastScreenshot.originY,
  };
}
285
+
286
/**
 * Express model-space coordinates as 0–100 percentages for pixelCompare.ts.
 *
 * The staleness check works in screenshot-image space; using percentages
 * lets it crop the last and the fresh screenshot at the same relative spot
 * regardless of their absolute dimensions.
 *
 * - `normalized_0_100`: values are already percentages and pass through.
 * - `pixels`: values are divided by the last screenshot's own width/height,
 *   since the model's pixel coords live in that image's coordinate space.
 *   Without a last screenshot, `{ xPct: 0, yPct: 0 }` is returned as a
 *   placeholder (per the original note, validateClickTarget skips the check
 *   in that case, so the value is not used for a crop).
 */
function coordToPercentageForPixelCompare(
  rawX: number,
  rawY: number,
  mode: CoordinateMode,
  lastScreenshot: ScreenshotResult | undefined,
): { xPct: number; yPct: number } {
  if (mode === "normalized_0_100") {
    return { xPct: rawX, yPct: rawY };
  }

  // mode === "pixels"
  if (!lastScreenshot) {
    return { xPct: 0, yPct: 0 };
  }

  const { width, height } = lastScreenshot;
  const xPct = (rawX / width) * 100;
  const yPct = (rawY / height) * 100;
  return { xPct, yPct };
}
320
+
321
+ // ---------------------------------------------------------------------------
322
+ // Shared input-action gates
323
+ // ---------------------------------------------------------------------------
324
+
325
/**
 * Action classes used for tier checks.
 *
 * - `"mouse_position"` — mouse_move only; pure cursor positioning with no
 *   app interaction. Passes at any tier, including `"read"`. Still runs
 *   prepareForAction (hide non-allowed apps).
 * - `"mouse"` — plain left click, double/triple click, scroll, drag-from.
 *   Needs tier `"click"` or `"full"`.
 * - `"mouse_full"` — right/middle click, any click with modifiers, and
 *   drag-drop (the `to` endpoint of left_click_drag). Needs tier `"full"`,
 *   because each of these can escalate a click-tier grant to
 *   keyboard-equivalent input: right-click → context-menu Paste, modifier
 *   chords → keystrokes before the click, drag-drop → text insertion at the
 *   drop point. This is deliberately blunt: same-app drags (scrollbar, panel
 *   resize) onto click-tier apps are rejected too; `scroll` is the
 *   tier-"click" way to scroll.
 * - `"keyboard"` — type, key, hold_key. Needs tier `"full"`.
 */
type CuActionKind = "mouse_position" | "mouse" | "mouse_full" | "keyboard";

/**
 * Decide whether a grant tier covers an action class.
 *
 * @param grantTier Tier of the grant; `undefined` is treated as `"full"`.
 * @param actionKind Action class being attempted.
 * @returns `true` when the tier is sufficient for the action.
 */
function tierSatisfies(
  grantTier: CuAppPermTier | undefined,
  actionKind: CuActionKind,
): boolean {
  const effectiveTier = grantTier ?? "full";
  switch (actionKind) {
    case "mouse_position":
      return true;
    case "keyboard":
    case "mouse_full":
      return effectiveTier === "full";
    default:
      // "mouse"
      return effectiveTier === "click" || effectiveTier === "full";
  }
}
356
+
357
// Suffix appended to every tier_insufficient error. The model may try to get
// around the tier gate through other channels (osascript, System Events,
// cliclick via Bash); this sentence forbids that explicitly. It begins with
// a space so it can be concatenated directly after the preceding sentence.
const TIER_ANTI_SUBVERSION = [
  " Do not attempt to work around this restriction — never use AppleScript,",
  "System Events, shell commands, or any other method to send clicks or",
  "keystrokes to this app.",
].join(" ");
364
+
365
+ // ---------------------------------------------------------------------------
366
+ // Clipboard guard — stash+clear while a click-tier app is frontmost
367
+ // ---------------------------------------------------------------------------
368
+ //
369
+ // Threat: tier "click" blocks type/key/right-click-Paste, but a click-tier
370
+ // terminal/IDE may have a UI Paste button that's plain-left-clickable. If the
371
+ // clipboard holds `rm -rf /` — from the user, from a prior full-tier paste,
372
+ // OR from the agent's own write_clipboard call (which doesn't route through
373
+ // runInputActionGates) — a left_click on that button injects it.
374
+ //
375
+ // Mitigation: stash the user's clipboard on first entry to click-tier, then
376
+ // RE-CLEAR before every input action while click-tier stays frontmost. The
377
+ // re-clear is the load-bearing part — a stash-on-transition-only design
378
+ // leaves a gap between an agent write_clipboard and the next left_click.
379
+ // When frontmost becomes anything else, restore. Turn-end restore is inlined
380
+ // in the host's result-handler + leavingRunning (same dual-location as
381
+ // cuHiddenDuringTurn unhide) — reads `session.cuClipboardStash` directly and
382
+ // writes via Electron's `clipboard.writeText`, so no nest-only import.
383
+ //
384
+ // State lives on the session (via `overrides.getClipboardStash` /
385
+ // `onClipboardStashChanged`), not module-level. The CU lock still guarantees
386
+ // one session at a time, but session-scoped state means the host's turn-end
387
+ // restore doesn't need to reach back into this package.
388
+
389
/**
 * Keep the clipboard stash in step with whether a click-tier app is the
 * current target.
 *
 * - Click-tier target: if nothing is stashed yet, read the clipboard and
 *   hand it to `overrides.onClipboardStashChanged` (or hand over `""` if the
 *   read fails). Then ALWAYS overwrite the clipboard with `""` — on every
 *   call, not only the first — so a clipboard write made between two actions
 *   is wiped before the next click lands.
 * - Any other target: if something is stashed, write it back to the
 *   clipboard and clear the stash; with nothing stashed this is a no-op.
 *
 * All clipboard failures are swallowed on purpose (best effort).
 *
 * @param adapter Host adapter; only `executor.readClipboard` and
 *   `executor.writeClipboard` are used.
 * @param overrides Provides the optional `getClipboardStash` /
 *   `onClipboardStashChanged` accessors for the session-held stash.
 * @param frontmostIsClickTier Whether the app being acted on is tier "click".
 */
async function syncClipboardStash(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  frontmostIsClickTier: boolean,
): Promise<void> {
  const stashed = overrides.getClipboardStash?.();

  if (frontmostIsClickTier) {
    // First entry to click-tier: capture the user's clipboard exactly once.
    if (stashed === undefined) {
      try {
        const userClipboard = await adapter.executor.readClipboard();
        overrides.onClipboardStashChanged?.(userClipboard);
      } catch {
        // Read failed — record an empty sentinel so the stash is not retried
        // on the next action; the later restore is then a harmless
        // writeClipboard("").
        overrides.onClipboardStashChanged?.("");
      }
    }
    // Load-bearing re-clear, performed on every click-tier action.
    try {
      await adapter.executor.writeClipboard("");
    } catch {
      // Transient pasteboard failure. The tier-"click" right-click/modifier
      // block still holds; this guard is a net, not a promise.
    }
    return;
  }

  // Not click-tier: restore whatever was stashed, if anything.
  if (stashed === undefined) return;
  try {
    await adapter.executor.writeClipboard(stashed);
    // Drop the stash only after the write succeeded, so a transient
    // pasteboard failure cannot lose the user's clipboard for good.
    overrides.onClipboardStashChanged?.(undefined);
  } catch {
    // Best effort — the stash is kept and the next non-click action retries.
  }
}
431
+
432
/**
 * Shared pre-flight for every input action (click/type/key/scroll/drag/
 * move_mouse); runs before the executor performs the action.
 *
 * Sequence:
 *  1. If `subGates.hideBeforeAction` is on, call `prepareForAction` with the
 *     allowlisted bundle ids (hide non-allowlisted apps + defocus us) and
 *     report any hidden apps through `overrides.onAppsHidden`.
 *  2. Query the frontmost app — fresh on every call — and look up its tier
 *     in the allowlist.
 *  3. If `subGates.clipboardGuard` is on, sync the clipboard stash according
 *     to whether the frontmost app is tier "click". This runs per action.
 *  4. Decide:
 *     - no frontmost app → pass;
 *     - frontmost is allowlisted → pass if its tier covers `actionKind`,
 *       otherwise a `tier_insufficient` error tailored to the tier;
 *     - frontmost is Finder → pass;
 *     - frontmost is the host app → pass for mouse-class actions,
 *       `state_conflict` error for keyboard;
 *     - anything else → `app_not_granted` error.
 *
 * @returns `null` when the action may proceed, otherwise the error result.
 *   Exceptions are not caught here; the original note says they are caught
 *   by handleToolCall's outer try (not visible in this section).
 */
async function runInputActionGates(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
  actionKind: CuActionKind,
): Promise<CuCallToolResult | null> {
  // Step 1 — hide + defocus. Once this has run, the frontmost gate below is
  // an edge-case detector (something popped up between prepare and action)
  // rather than a normal-path blocker. Apps at every grant tier stay visible.
  if (subGates.hideBeforeAction) {
    const allowedBundleIds = overrides.allowedApps.map((app) => app.bundleId);
    const hiddenApps = await adapter.executor.prepareForAction(
      allowedBundleIds,
      overrides.selectedDisplayId,
    );
    // Only notify when something was actually hidden, so the callback is not
    // invoked on every action.
    if (hiddenApps.length > 0) {
      overrides.onAppsHidden?.(hiddenApps);
    }
  }

  // Step 2 — frontmost app and its tier.
  const frontmost = await adapter.executor.getFrontmostApp();
  const grantedTiers = new Map(
    overrides.allowedApps.map((app) => [app.bundleId, app.tier] as const),
  );
  // The original note states every grant has a concrete tier by now (tier
  // backfill in handleToolCall), so `undefined` here means "not allowlisted".
  const frontmostTier = frontmost
    ? grantedTiers.get(frontmost.bundleId)
    : undefined;

  // Step 3 — clipboard guard. Placed here so that only input actions trigger
  // a sync.
  if (subGates.clipboardGuard) {
    await syncClipboardStash(adapter, overrides, frontmostTier === "click");
  }

  // Step 4 — decision.
  if (!frontmost) {
    // No frontmost app at all (rare). Let the action through.
    return null;
  }

  const { hostBundleId } = adapter.executor.capabilities;

  if (frontmostTier !== undefined) {
    if (tierSatisfies(frontmostTier, actionKind)) {
      return null;
    }

    if (frontmostTier === "read") {
      // Tier "read" is shared by several categories; look the category up
      // again so the Claude-in-Chrome hint is only shown for real browsers.
      const category = getDeniedCategoryForApp(
        frontmost.bundleId,
        frontmost.displayName,
      );
      const guidance =
        category === "browser"
          ? " Use the Claude-in-Chrome MCP for browser interaction (tools " +
            "named `mcp__Claude_in_Chrome__*`; load via ToolSearch if " +
            "deferred)."
          : " No interaction is permitted; ask the user to take any " +
            "actions in this app themselves.";
      return errorResult(
        `"${frontmost.displayName}" is granted at tier "read" — ` +
          `visible in screenshots only, no clicks or typing.` +
          guidance +
          TIER_ANTI_SUBVERSION,
        "tier_insufficient",
      );
    }

    // Remaining case: tier "click" (tier "full" always satisfies).
    const clickTierPrefix = `"${frontmost.displayName}" is granted at tier "click" — `;
    if (actionKind === "keyboard") {
      return errorResult(
        clickTierPrefix +
          `typing, key presses, and paste require tier "full". The keys ` +
          `would go to this app's text fields or integrated terminal. To ` +
          `type into a different app, click it first to bring it forward. ` +
          `For shell commands, use the Bash tool.` +
          TIER_ANTI_SUBVERSION,
        "tier_insufficient",
      );
    }
    // Only "mouse_full" reaches this point ("mouse" and "mouse_position"
    // are satisfied at tier "click").
    return errorResult(
      clickTierPrefix +
        `right-click, middle-click, and clicks with modifier keys require ` +
        `tier "full". Right-click opens a context menu with Paste/Cut, and ` +
        `modifier chords fire as keystrokes before the click. Plain ` +
        `left_click is allowed here.` +
        TIER_ANTI_SUBVERSION,
      "tier_insufficient",
    );
  }

  // Not allowlisted. Finder is never hidden and is always acceptable.
  if (frontmost.bundleId === FINDER_BUNDLE_ID) {
    return null;
  }

  if (frontmost.bundleId === hostBundleId) {
    if (actionKind === "keyboard") {
      // Safety net: the pre-action defocus should have moved focus away. If
      // the host still has focus, typing would land in its own chat box.
      return errorResult(
        "Gclm Code's own window still has keyboard focus. This should not happen " +
          "after the pre-action defocus. Click on the target application first.",
        "state_conflict",
      );
    }
    // Mouse-class actions pass: the executor's withClickThrough bracket
    // makes the host window click-through.
    return null;
  }

  // Neither allowlisted, nor the host, nor Finder. Rare after the hide loop:
  // something appeared between prepare and action, or the hide loop gave up.
  return errorResult(
    `"${frontmost.displayName}" is not in the allowed applications and is ` +
      `currently in front. Take a new screenshot — it may have appeared ` +
      `since your last one.`,
    "app_not_granted",
  );
}
561
+
562
/**
 * Hit-test gate: block a mouse action when the window under (x, y) belongs
 * to an app whose tier does not cover it.
 *
 * This covers the case where the frontmost app is fine (so
 * `runInputActionGates` passes) but the click would actually land on an
 * overlapping window of a lower-tier or non-allowlisted app.
 *
 * Intended to run after `scaleCoord` (it needs global coordinates) and
 * before the executor call.
 *
 * Outcomes:
 * - `appUnderPoint` returns nothing (desktop, nothing under the point, or a
 *   platform without hit-testing) → pass;
 * - target is Finder → pass;
 * - target not in the allowlist → `app_not_granted` error;
 * - target tier covers `actionKind` → pass;
 * - otherwise → `tier_insufficient` error.
 *
 * When `subGates.clipboardGuard` is on and the target is tier "click", the
 * clipboard is stashed + cleared before the verdict, because the
 * frontmost-based sync in `runInputActionGates` misses clicks that land on a
 * NON-frontmost click-tier window.
 *
 * @param x Global x coordinate of the action.
 * @param y Global y coordinate of the action.
 * @returns `null` on pass, otherwise the error result.
 */
async function runHitTestGate(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
  x: number,
  y: number,
  actionKind: CuActionKind,
): Promise<CuCallToolResult | null> {
  const target = await adapter.executor.appUnderPoint(x, y);
  if (!target) {
    return null;
  }

  // Finder (desktop, file dialogs) is always clickable — the same exemption
  // runInputActionGates applies. Per the original note, the host's own
  // overlay is already filtered out by the backend.
  if (target.bundleId === FINDER_BUNDLE_ID) {
    return null;
  }

  const grantedTiers = new Map(
    overrides.allowedApps.map((app) => [app.bundleId, app.tier] as const),
  );
  const landsOn = `Click at these coordinates would land on "${target.displayName}", `;

  if (!grantedTiers.has(target.bundleId)) {
    // Not allowlisted, and not frontmost either (the frontmost check would
    // have caught it): a window appeared between screenshot and click, or a
    // background app's window overlaps the intended target.
    return errorResult(
      landsOn +
        `which is not in the allowed applications. Take a fresh screenshot ` +
        `to see the current window layout.`,
      "app_not_granted",
    );
  }

  const targetTier = grantedTiers.get(target.bundleId);

  // Re-sync the clipboard guard by the hit-test target's tier, regardless of
  // which app is frontmost.
  if (subGates.clipboardGuard && targetTier === "click") {
    await syncClipboardStash(adapter, overrides, true);
  }

  if (tierSatisfies(targetTier, actionKind)) {
    return null;
  }

  // Allowlisted, but the tier is too low. Per the original note this gate is
  // only invoked for mouse-class actions; the branch below handles
  // mouse_full on a click-tier target, and everything else falls through to
  // the tier "read" message.
  if (actionKind === "mouse_full" && targetTier === "click") {
    return errorResult(
      landsOn +
        `which is granted at tier "click" — right-click, middle-click, and ` +
        `clicks with modifier keys require tier "full" (they can Paste via ` +
        `the context menu or fire modifier-chord keystrokes). Plain ` +
        `left_click is allowed here.` +
        TIER_ANTI_SUBVERSION,
      "tier_insufficient",
    );
  }

  const category = getDeniedCategoryForApp(target.bundleId, target.displayName);
  const guidance =
    category === "browser"
      ? "Use the Claude-in-Chrome MCP for browser interaction."
      : "Ask the user to take any actions in this app themselves.";
  return errorResult(
    landsOn +
      `which is granted at tier "read" (screenshots only, no interaction). ` +
      guidance +
      TIER_ANTI_SUBVERSION,
    "tier_insufficient",
  );
}
646
+
647
+ // ---------------------------------------------------------------------------
648
+ // Screenshot helpers
649
+ // ---------------------------------------------------------------------------
650
+
651
/**
 * Smallest decoded screenshot size, in bytes, that is treated as plausible
 * (§6 item 9). A capture below this threshold is retried exactly once.
 */
const MIN_SCREENSHOT_BYTES = 1024;

/**
 * Estimates how many bytes a base64 string decodes to, without decoding it.
 *
 * @param base64 Base64-encoded payload.
 * @returns Approximate decoded length — precise enough for a threshold check.
 */
function decodedByteLength(base64: string): number {
  // Every 4 encoded characters carry 3 bytes; each trailing "=" removes one.
  let padding = 0;
  if (base64.endsWith("==")) {
    padding = 2;
  } else if (base64.endsWith("=")) {
    padding = 1;
  }
  const unpaddedBytes = Math.floor((base64.length * 3) / 4);
  return unpaddedBytes - padding;
}
662
+
663
/**
 * Captures a screenshot, retrying exactly once when the first capture
 * decodes to fewer than MIN_SCREENSHOT_BYTES bytes. The retry's result is
 * returned without a further size check.
 *
 * @param executor Executor whose `screenshot` method performs the capture.
 * @param allowedBundleIds Forwarded unchanged to `executor.screenshot`.
 * @param logger Receives a warning when the first capture is too small.
 * @param displayId Optional display to capture; forwarded unchanged.
 * @returns The first capture when it is large enough, otherwise the retry.
 */
async function takeScreenshotWithRetry(
  executor: ComputerExecutor,
  allowedBundleIds: string[],
  logger: ComputerUseHostAdapter["logger"],
  displayId?: number,
): Promise<ScreenshotResult> {
  const capture = () => executor.screenshot({ allowedBundleIds, displayId });

  const firstShot = await capture();
  const firstSize = decodedByteLength(firstShot.base64);
  if (firstSize >= MIN_SCREENSHOT_BYTES) {
    return firstShot;
  }

  logger.warn(
    `[computer-use] screenshot implausibly small (${firstSize} bytes decoded), retrying once`,
  );
  return await capture();
}
678
+
679
+ // ---------------------------------------------------------------------------
680
+ // Grapheme iteration — §6 item 7, ported from the Vercept acquisition
681
+ // ---------------------------------------------------------------------------
682
+
683
const INTER_GRAPHEME_SLEEP_MS = 8; // §6 item 4 — 125 Hz USB polling

/**
 * Splits text into user-perceived characters (grapheme clusters).
 *
 * Uses `Intl.Segmenter` when the runtime provides it. If it is missing or
 * throws, falls back to code-point iteration, which keeps surrogate pairs
 * intact but breaks ZWJ sequences apart.
 *
 * @param text Text to split.
 * @returns The segments in order; an empty array for an empty string.
 */
function segmentGraphemes(text: string): string[] {
  // Structural stand-in for Intl.Segmenter so this compiles against TS libs
  // that do not declare it.
  type SegmenterCtor = new (
    locale?: string,
    options?: { granularity: "grapheme" | "word" | "sentence" },
  ) => { segment: (s: string) => Iterable<{ segment: string }> };

  try {
    // Node 18+ ships Intl.Segmenter; the guard protects stripped-down runtimes.
    const { Segmenter } = Intl as typeof Intl & { Segmenter?: SegmenterCtor };
    if (typeof Segmenter === "function") {
      const segmenter = new Segmenter(undefined, { granularity: "grapheme" });
      const graphemes: string[] = [];
      for (const piece of segmenter.segment(text)) {
        graphemes.push(piece.segment);
      }
      return graphemes;
    }
  } catch {
    // Segmenter unusable — use the code-point fallback below.
  }
  return [...text];
}
707
+
708
/** Returns a promise that resolves with no value after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
711
+
712
/**
 * Breaks a "+"-separated chord such as "ctrl+shift" into its key names.
 * Whitespace around each name is trimmed and empty segments are dropped.
 * Same parsing as the `key` tool / executor.key /
 * keyBlocklist.normalizeKeySequence.
 *
 * @param text Chord string to split.
 * @returns Key names in the order they appear.
 */
function parseKeyChord(text: string): string[] {
  const keys: string[] = [];
  for (const segment of text.split("+")) {
    const key = segment.trim();
    if (key) {
      keys.push(key);
    }
  }
  return keys;
}
722
+
723
+ // ---------------------------------------------------------------------------
724
+ // left_mouse_down / left_mouse_up held-state tracking
725
+ // ---------------------------------------------------------------------------
726
+
727
/**
 * Whether a left_mouse_down is currently outstanding. A second down while
 * this is set is an error; an up while it is clear is not.
 *
 * Module-level, but cleared on every lock acquire (handleToolCall →
 * acquireCuLock branch) so a session interrupted mid-drag (overlay stop
 * during left_mouse_down) cannot leave it set for the next lock holder.
 * Interleaved sessions would still share it within one lock cycle, but the
 * lock allows at most one session to use CU at a time, so the per-turn
 * reset is the correctness boundary.
 */
let mouseButtonHeld = false;

/**
 * Whether a mouse_move happened between left_mouse_down and left_mouse_up.
 * When still false at mouse-up, the decomposed sequence is a click-release
 * rather than a drop, so the hit-test runs at "mouse", not "mouse_full".
 */
let mouseMoved = false;

/**
 * Clears the cross-call drag flags. Called from Gate-3 on lock-acquire and
 * from `bindSessionContext` in mcpServer.ts so a new lock holder never
 * inherits a previous session's mid-drag state.
 */
export function resetMouseButtonHeld(): void {
  mouseMoved = false;
  mouseButtonHeld = false;
}
750
+
751
/**
 * Releases the OS mouse button when a left_mouse_down was never followed by
 * its left_mouse_up, then clears both drag flags. Does nothing when no
 * button is recorded as held, so callers need not check first.
 *
 * @param adapter Host adapter whose executor performs the mouse-up.
 */
async function releaseHeldMouse(
  adapter: ComputerUseHostAdapter,
): Promise<void> {
  if (mouseButtonHeld) {
    await adapter.executor.mouseUp();
    // Flags are cleared only after the mouse-up call has completed.
    mouseButtonHeld = false;
    mouseMoved = false;
  }
}
762
+
763
/**
 * Reports whether a tool checks the CU lock without acquiring it.
 *
 * `request_access` and `list_granted_applications` run the CHECK (so a
 * blocked session never shows an approval dialog for access it cannot use)
 * but postpone the ACQUIRE — the enter-CU notification/overlay only fires on
 * the first action tool.
 *
 * `request_teach_access` is deliberately excluded: approving teach mode
 * hides the main window, and the lock must already be held by then. The
 * Gate-3 block in `handleToolCall` has the full explanation.
 *
 * Exported for `bindSessionContext` in mcpServer.ts so the async lock gate
 * and the sync one agree on the same set of tools.
 */
export function defersLockAcquire(toolName: string): boolean {
  switch (toolName) {
    case "request_access":
    case "list_granted_applications":
      return true;
    default:
      return false;
  }
}
782
+
783
+ // ---------------------------------------------------------------------------
784
+ // request_access helpers
785
+ // ---------------------------------------------------------------------------
786
+
787
/**
 * Reverse-DNS-ish shape: starts with an alphanumeric, contains at least one
 * dot that is followed by an alphanumeric, and otherwise uses only word
 * characters, dots and hyphens (so no spaces or slashes).
 */
const REVERSE_DNS_RE = /^[A-Za-z0-9][\w.-]*\.[A-Za-z0-9][\w.-]*$/;

/**
 * Tells whether a requested app string has the shape of a raw bundle ID, so
 * it can be looked up by ID instead of by display name.
 */
function looksLikeBundleId(s: string): boolean {
  if (s.includes(" ")) {
    return false;
  }
  return REVERSE_DNS_RE.test(s);
}
794
+
795
/**
 * Maps each requested app string onto an installed app, when one matches.
 *
 * A string shaped like a bundle ID is looked up by exact bundle ID first;
 * anything still unmatched is looked up by case-insensitive display name.
 *
 * @param requestedNames App names or bundle IDs as supplied to the tool.
 * @param installed Installed apps to resolve against.
 * @param alreadyGrantedBundleIds Bundle IDs that already hold a grant.
 * @returns One entry per requested name, in the same order.
 */
function resolveRequestedApps(
  requestedNames: string[],
  installed: InstalledApp[],
  alreadyGrantedBundleIds: ReadonlySet<string>,
): ResolvedAppRequest[] {
  const appsByBundleId = new Map<string, InstalledApp>(
    installed.map((app): [string, InstalledApp] => [app.bundleId, app]),
  );
  // Last write wins on collisions. Ambiguous-name handling (multiple
  // candidates in the dialog) is plan-documented but deferred — the
  // InstalledApps enumerator dedupes by bundle ID, so true display-name
  // collisions are rare. TODO(chicago, post-P1): surface all candidates.
  const appsByLowerName = new Map<string, InstalledApp>(
    installed.map((app): [string, InstalledApp] => [
      app.displayName.toLowerCase(),
      app,
    ]),
  );

  return requestedNames.map((requested): ResolvedAppRequest => {
    const bundleIdShaped = looksLikeBundleId(requested);
    const resolved =
      (bundleIdShaped ? appsByBundleId.get(requested) : undefined) ??
      appsByLowerName.get(requested.toLowerCase());
    const bundleId = resolved?.bundleId;

    // An unresolved string that looks like a bundle ID is still used for the
    // tier lookup (e.g. "company.thebrowser.Browser" with Arc not installed —
    // the reverse-DNS string won't match any display-name substring).
    const tierLookupBundleId =
      bundleId ?? (bundleIdShaped ? requested : undefined);
    const tierLookupName = resolved?.displayName ?? requested;

    return {
      requestedName: requested,
      resolved,
      isSentinel: bundleId ? SENTINEL_BUNDLE_IDS.has(bundleId) : false,
      alreadyGranted: bundleId ? alreadyGrantedBundleIds.has(bundleId) : false,
      proposedTier: getDefaultTierForApp(tierLookupBundleId, tierLookupName),
    };
  });
}
837
+
838
+ // ---------------------------------------------------------------------------
839
+ // Individual tool handlers
840
+ // ---------------------------------------------------------------------------
841
+
842
/**
 * Handles the `request_access` tool call.
 *
 * Flow: bail out when no permission handler is wired or teach mode is
 * active; validate `reason`; when `tccState` is supplied, show the TCC panel
 * and report the re-checked OS permission state; otherwise resolve the
 * requested apps, show the approval dialog when there is something to
 * approve, and return the resulting grants plus guidance.
 *
 * @param adapter Host adapter (executor, logger, OS-permission checks).
 * @param args Raw tool arguments: `reason`, `apps`, and optional boolean
 *   `clipboardRead` / `clipboardWrite` / `systemKeyCombos` flags.
 * @param overrides Per-session state and callbacks.
 * @param tccState Pre-dialog snapshot of the macOS permissions; when present
 *   the TCC panel is requested instead of the app list.
 * @returns An error result for the guard, validation and TCC paths, else an
 *   `okJson` result describing granted and denied apps.
 */
async function handleRequestAccess(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  tccState: { accessibility: boolean; screenRecording: boolean } | undefined,
): Promise<CuCallToolResult> {
  if (!overrides.onPermissionRequest) {
    return errorResult(
      "This session was not wired with a permission handler. Computer control is not available here.",
      "feature_unavailable",
    );
  }

  // Teach mode hides the main window; permission dialogs render in that
  // window. Without this, handleToolPermission blocks on an invisible
  // prompt and the overlay spins forever. Tell the model to exit teach
  // mode, request access, then re-enter.
  if (overrides.getTeachModeActive?.()) {
    return errorResult(
      "Cannot request additional permissions during teach mode — the permission dialog would be hidden. End teach mode (finish the tour or let the turn complete), then call request_access, then start a new tour.",
      "teach_mode_conflict",
    );
  }

  const reason = requireString(args, "reason");
  if (reason instanceof Error) return errorResult(reason.message, "bad_args");

  // TCC-ungranted branch. The renderer shows a toggle panel INSTEAD OF the
  // app list when `tccState` is present on the request, so we skip app
  // resolution entirely (listInstalledApps() may fail without Screen
  // Recording anyway). The user grants the OS perms from inside the dialog,
  // then clicks "Ask again" — both buttons resolve with deny by design
  // (ComputerUseApproval.tsx) so the model re-calls request_access and
  // gets the app list on the next call.
  if (tccState) {
    const req: CuPermissionRequest = {
      requestId: randomUUID(),
      reason,
      apps: [],
      requestedFlags: {},
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      tccState,
    };
    await overrides.onPermissionRequest(req);

    // Re-check: the user may have granted in System Settings while the
    // dialog was up. The `tccState` arg is a pre-dialog snapshot — reading
    // it here would tell the model "not yet granted" even after the user
    // granted, and the model waits for confirmation instead of retrying.
    // The renderer's TCC panel already live-polls (computerUseTccStore);
    // this is the same re-check on the tool-result side.
    const recheck = await adapter.ensureOsPermissions(
      ACCESSIBILITY_AND_SCREEN_RECORDING,
      { requestMissing: false },
    );
    if (recheck.granted) {
      return errorResult(
        "macOS Accessibility and Screen Recording are now both granted. " +
          "Call request_access again immediately — the next call will show " +
          "the app selection list.",
      );
    }

    const missing: string[] = [];
    if (!recheck.accessibility) missing.push("Accessibility");
    if (!recheck.screenRecording) missing.push("Screen Recording");
    return errorResult(
      `macOS ${missing.join(" and ")} permission(s) not yet granted. ` +
        `The permission panel has been shown. Once the user grants the ` +
        `missing permission(s), call request_access again.`,
      "tcc_not_granted",
    );
  }

  const rawApps = args.apps;
  if (!Array.isArray(rawApps) || !rawApps.every((a) => typeof a === "string")) {
    return errorResult('"apps" must be an array of strings.', "bad_args");
  }
  const apps = rawApps as string[];

  // Only flags the caller passed as real booleans are forwarded; anything
  // else is treated as "not requested".
  const requestedFlags: Partial<CuGrantFlags> = {};
  if (typeof args.clipboardRead === "boolean") {
    requestedFlags.clipboardRead = args.clipboardRead;
  }
  if (typeof args.clipboardWrite === "boolean") {
    requestedFlags.clipboardWrite = args.clipboardWrite;
  }
  if (typeof args.systemKeyCombos === "boolean") {
    requestedFlags.systemKeyCombos = args.systemKeyCombos;
  }

  const {
    needDialog,
    skipDialogGrants,
    willHide,
    tieredApps,
    userDenied,
    policyDenied,
  } = await buildAccessRequest(
    adapter,
    apps,
    overrides.allowedApps,
    new Set(overrides.userDeniedBundleIds),
    overrides.selectedDisplayId,
  );

  let dialogGranted: AppGrant[] = [];
  let dialogDenied: Array<{
    bundleId: string;
    reason: "user_denied" | "not_installed";
  }> = [];

  // Skip the dialog entirely when there is nothing new to approve: no
  // un-granted apps and no grant flags requested.
  if (needDialog.length > 0 || Object.keys(requestedFlags).length > 0) {
    const req: CuPermissionRequest = {
      requestId: randomUUID(),
      reason,
      apps: needDialog,
      requestedFlags,
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      // Undefined when empty so the renderer skips the section cleanly.
      ...(willHide.length > 0 && {
        willHide,
        autoUnhideEnabled: adapter.getAutoUnhideEnabled(),
      }),
    };
    const response = await overrides.onPermissionRequest(req);
    // The response's flags are not part of the tool result, so they are not
    // read here.
    dialogGranted = response.granted;
    dialogDenied = response.denied;
  }

  // Do NOT return display geometry or coordinateMode. See COORDINATES.md
  // ("Never give the model a number that invites rescaling"). scaleCoord
  // already transforms server-side; the coordinate convention is baked into
  // the tool param descriptions at server-construction time.
  const allGranted = [...skipDialogGrants, ...dialogGranted];
  // Filter tieredApps to what was actually granted — if the user unchecked
  // Chrome in the dialog, don't explain Chrome's tier.
  const grantedBundleIds = new Set(allGranted.map((g) => g.bundleId));
  const grantedTieredApps = tieredApps.filter((t) =>
    grantedBundleIds.has(t.bundleId),
  );
  // Best-effort — grants are already persisted by wrappedPermissionHandler;
  // a listDisplays/findWindowDisplays failure (monitor hot-unplug, NAPI
  // error) must not tank the grant response. Same discipline as
  // buildMonitorNote's listDisplays try/catch.
  let windowLocations: Awaited<ReturnType<typeof buildWindowLocations>> = [];
  try {
    windowLocations = await buildWindowLocations(adapter, allGranted);
  } catch (e) {
    adapter.logger.warn(
      `[computer-use] buildWindowLocations failed: ${String(e)}`,
    );
  }
  return okJson(
    {
      granted: allGranted,
      denied: dialogDenied,
      // Policy blocklist — precedes userDenied in precedence and response
      // order. No escape hatch; the agent is told to find another approach.
      ...(policyDenied.length > 0 && {
        policyDenied: {
          apps: policyDenied,
          guidance: buildPolicyDeniedGuidance(policyDenied),
        },
      }),
      // User-configured auto-deny — stripped before the dialog; this is the
      // agent's only signal that these apps exist but are user-blocked.
      ...(userDenied.length > 0 && {
        userDenied: {
          apps: userDenied,
          guidance: buildUserDeniedGuidance(userDenied),
        },
      }),
      // Upfront guidance so the model knows what each tier allows BEFORE
      // hitting the gate. Only included when something was tier-restricted.
      ...(grantedTieredApps.length > 0 && {
        tierGuidance: buildTierGuidanceMessage(grantedTieredApps),
      }),
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      // Where each granted app currently has open windows, across monitors.
      // Omitted when the app isn't running or has no normal windows.
      ...(windowLocations.length > 0 ? { windowLocations } : {}),
    },
    {
      // dialogGranted only — skipDialogGrants are idempotent re-grants of
      // apps already in the allowlist (no user action, dialog skips them).
      // Matching denied_count's this-call-only semantics.
      granted_count: dialogGranted.length,
      denied_count: dialogDenied.length,
      ...tierAssignmentTelemetry(grantedTieredApps),
    },
  );
}
1037
+
1038
/**
 * Lists, for every granted app that has open windows, the displays those
 * windows are on.
 *
 * Returns an empty array when nothing is granted or when at most one display
 * is reported (there is no multi-monitor signal to give). Apps for which no
 * display IDs are reported are left out.
 *
 * @param adapter Host adapter whose executor lists displays and windows.
 * @param granted Grants to report on.
 * @returns One entry per granted app with windows, in `granted` order.
 */
async function buildWindowLocations(
  adapter: ComputerUseHostAdapter,
  granted: AppGrant[],
): Promise<
  Array<{
    bundleId: string;
    displayName: string;
    displays: Array<{ id: number; label?: string; isPrimary?: boolean }>;
  }>
> {
  if (granted.length === 0) return [];

  const allDisplays = await adapter.executor.listDisplays();
  if (allDisplays.length <= 1) return [];

  const windowLocs = await adapter.executor.findWindowDisplays(
    granted.map((grant) => grant.bundleId),
  );
  const displayInfoById = new Map(allDisplays.map((d) => [d.displayId, d]));
  const displayIdsByBundle = new Map(
    windowLocs.map((w) => [w.bundleId, w.displayIds]),
  );

  return granted.flatMap((grant) => {
    const ids = displayIdsByBundle.get(grant.bundleId);
    if (!ids || ids.length === 0) return [];
    return [
      {
        bundleId: grant.bundleId,
        displayName: grant.displayName,
        // A display ID missing from the listing still yields an entry, with
        // label and isPrimary left undefined.
        displays: ids.map((id) => {
          const info = displayInfoById.get(id);
          return { id, label: info?.label, isPrimary: info?.isPrimary };
        }),
      },
    ];
  });
}
1078
+
1079
/** An app assigned a restricted tier (not `"full"`). Used to build the
 *  guidance message telling the model what it can/can't do. */
interface TieredApp {
  bundleId: string;
  displayName: string;
  /** Never `"full"` — only restricted tiers are collected. */
  tier: "read" | "click";
}

/** Everything `buildAccessRequest` derives from one list of requested apps. */
interface AccessRequestParts {
  /** Requests without an existing grant — these are shown in the dialog. */
  needDialog: ResolvedAppRequest[];
  /** Grants for requested apps that were already granted (dialog skipped). */
  skipDialogGrants: AppGrant[];
  /** Preview of the apps `executor.previewHideSet` reports for this request. */
  willHide: Array<{ bundleId: string; displayName: string }>;
  /** Resolved apps with `proposedTier !== "full"` — for the guidance text.
   *  Unresolved apps are omitted (they go to `denied` with `not_installed`). */
  tieredApps: TieredApp[];
  /** Apps stripped by the user's Settings auto-deny list. Surfaced in the
   *  response with guidance; never reach the dialog. */
  userDenied: Array<{ requestedName: string; displayName: string }>;
  /** Apps stripped by the baked-in policy blocklist (streaming/music/ebooks,
   *  etc. — `deniedApps.isPolicyDenied`). Precedence over userDenied. */
  policyDenied: Array<{ requestedName: string; displayName: string }>;
}

/**
 * Shared app-resolution + partition + hide-preview pipeline. Extracted from
 * `handleRequestAccess` so `handleRequestTeachAccess` can call the same path.
 *
 * Does the full app-name→InstalledApp resolution, assigns each a tier
 * (browser→"read", terminal/IDE→"click", else "full" — see deniedApps.ts),
 * splits into already-granted (skip the dialog, preserve grantedAt+tier) vs
 * need-dialog, and computes the willHide preview. Unlike the previous
 * hard-deny model, ALL apps that survive the policy and user deny lists
 * proceed to the dialog; the tier just constrains what actions are allowed
 * once granted.
 *
 * @param adapter Host adapter whose executor lists apps, loads icons and
 *   previews the hide set.
 * @param apps Requested app names or bundle IDs.
 * @param allowedApps Grants that already exist for this session.
 * @param userDeniedBundleIds Bundle IDs on the user's auto-deny list.
 * @param selectedDisplayId Optional display forwarded to `previewHideSet`.
 * @returns The partitioned request; see `AccessRequestParts`.
 */
async function buildAccessRequest(
  adapter: ComputerUseHostAdapter,
  apps: string[],
  allowedApps: AppGrant[],
  userDeniedBundleIds: ReadonlySet<string>,
  selectedDisplayId?: number,
): Promise<AccessRequestParts> {
  const alreadyGranted = new Set(allowedApps.map((g) => g.bundleId));
  const installed = await adapter.executor.listInstalledApps();
  const resolved = resolveRequestedApps(apps, installed, alreadyGranted);

  // Policy-level auto-deny (baked-in, not user-configurable). Stripped
  // before userDenied — checks bundle ID AND display name (covers
  // unresolved requests). Precedence: policy > user setting > tier.
  const policyDenied: Array<{ requestedName: string; displayName: string }> =
    [];
  const afterPolicy: typeof resolved = [];
  for (const r of resolved) {
    const displayName = r.resolved?.displayName ?? r.requestedName;
    if (isPolicyDenied(r.resolved?.bundleId, displayName)) {
      policyDenied.push({ requestedName: r.requestedName, displayName });
    } else {
      afterPolicy.push(r);
    }
  }

  // User-configured auto-deny (Settings → Desktop app → Computer Use).
  // Stripped BEFORE tier assignment — these never reach the dialog
  // regardless of category. Bundle-ID match only (the Settings UI picks from
  // installed apps, which always have a bundle ID). Unresolved requests pass
  // through to the tier system; the user can't preemptively deny an app that
  // isn't installed.
  const userDenied: Array<{ requestedName: string; displayName: string }> = [];
  const surviving: typeof afterPolicy = [];
  for (const r of afterPolicy) {
    if (r.resolved && userDeniedBundleIds.has(r.resolved.bundleId)) {
      userDenied.push({
        requestedName: r.requestedName,
        displayName: r.resolved.displayName,
      });
    } else {
      surviving.push(r);
    }
  }

  // Collect resolved apps with a restricted tier for the guidance message.
  // Unresolved apps with a restricted tier (e.g. model asks for "Chrome" but
  // it's not installed) are omitted — they'll end up in the `denied` list
  // with reason "not_installed" and the model will see that instead.
  // NOTE(review): already-granted apps are classified by `proposedTier` here,
  // while the grant returned below keeps its stored tier — confirm the
  // guidance is not meant to follow the stored tier.
  const tieredApps: TieredApp[] = [];
  for (const r of surviving) {
    if (r.proposedTier === "full" || !r.resolved) continue;
    tieredApps.push({
      bundleId: r.resolved.bundleId,
      displayName: r.resolved.displayName,
      tier: r.proposedTier,
    });
  }

  // Idempotence: apps that are already granted skip the dialog and are
  // merged into the `granted` response. Existing grants keep their tier
  // (which may differ from the current proposedTier if policy changed).
  const skipDialog = surviving.filter((r) => r.alreadyGranted);
  const needDialog = surviving.filter((r) => !r.alreadyGranted);

  // Populate icons only for what the dialog will actually show. Sequential
  // awaits are fine — the computer-use backend is cached (listInstalledApps
  // above loaded it), each N-API call is synchronous, and the darwin executor
  // memoizes by path. Failures leave iconDataUrl undefined; renderer falls
  // back to a grey box.
  for (const r of needDialog) {
    if (!r.resolved) continue;
    try {
      r.resolved.iconDataUrl = await adapter.executor.getAppIcon(
        r.resolved.path,
      );
    } catch {
      // leave undefined
    }
  }

  // Index existing grants once instead of scanning `allowedApps` per app.
  // The first grant for a bundle ID wins, matching a linear first-match scan.
  const existingGrantByBundleId = new Map<string, AppGrant>();
  for (const grant of allowedApps) {
    if (!existingGrantByBundleId.has(grant.bundleId)) {
      existingGrantByBundleId.set(grant.bundleId, grant);
    }
  }

  const now = Date.now();
  const skipDialogGrants: AppGrant[] = skipDialog
    .filter((r) => r.resolved)
    .map((r) => {
      // Reuse the existing grant (preserving grantedAt + tier) rather than
      // synthesizing a new one — keeps Settings-page "Granted 3m ago" honest.
      const existing = existingGrantByBundleId.get(r.resolved!.bundleId);
      return (
        existing ?? {
          bundleId: r.resolved!.bundleId,
          displayName: r.resolved!.displayName,
          grantedAt: now,
          tier: r.proposedTier,
        }
      );
    });

  // Preview what will be hidden if the user approves exactly the requested
  // set plus what they already have. All tiers are visible, so everything
  // resolved goes in the exempt set.
  const exemptForPreview = [
    ...allowedApps.map((a) => a.bundleId),
    ...surviving.filter((r) => r.resolved).map((r) => r.resolved!.bundleId),
  ];
  const willHide = await adapter.executor.previewHideSet(
    exemptForPreview,
    selectedDisplayId,
  );

  return {
    needDialog,
    skipDialogGrants,
    willHide,
    tieredApps,
    userDenied,
    policyDenied,
  };
}
1235
+
1236
/**
 * Builds the guidance text for apps granted at a restricted tier. It is
 * returned inline in the okJson response so the model learns upfront what
 * each app allows, rather than by running into the tier gate.
 *
 * @param tiered Apps granted at tier "read" or "click".
 * @returns One paragraph per non-empty group (read-tier browsers, other
 *   read-tier apps, click-tier apps) separated by blank lines and followed
 *   by TIER_ANTI_SUBVERSION, or "" when no group has any app.
 */
function buildTierGuidanceMessage(tiered: TieredApp[]): string {
  // tier "read" is not category-unique — browsers get the CiC hint while
  // everything else at that tier (e.g. trading platforms) gets "ask the user".
  const readBrowsers: TieredApp[] = [];
  const readOther: TieredApp[] = [];
  const clickTier: TieredApp[] = [];
  for (const app of tiered) {
    if (app.tier === "click") {
      clickTier.push(app);
    } else if (app.tier === "read") {
      const isBrowser =
        getDeniedCategoryForApp(app.bundleId, app.displayName) === "browser";
      (isBrowser ? readBrowsers : readOther).push(app);
    }
  }

  const quoteNames = (group: TieredApp[]): string =>
    group.map((app) => `"${app.displayName}"`).join(", ");

  const parts: string[] = [];

  if (readBrowsers.length > 0) {
    const single = readBrowsers.length === 1;
    parts.push(
      [
        `${quoteNames(readBrowsers)} ${single ? "is a browser" : "are browsers"} — ` +
          `granted at tier "read" (visible in screenshots only; no clicks or typing).`,
        `You can read what's on screen but cannot navigate, click, ` +
          `or type into ${single ? "it" : "them"}.`,
        "For browser interaction, use the Claude-in-Chrome MCP (tools named " +
          "`mcp__Claude_in_Chrome__*`; load via ToolSearch if deferred).",
      ].join(" "),
    );
  }

  if (readOther.length > 0) {
    const single = readOther.length === 1;
    parts.push(
      [
        `${quoteNames(readOther)} ${single ? "is" : "are"} granted at tier ` +
          `"read" (visible in screenshots only; no clicks or typing).`,
        `You can read what's on screen but cannot interact.`,
        `Ask the user to take any actions in ` +
          `${single ? "this app" : "these apps"} themselves.`,
      ].join(" "),
    );
  }

  if (clickTier.length > 0) {
    const single = clickTier.length === 1;
    parts.push(
      [
        `${quoteNames(clickTier)} ${single ? "has" : "have"} terminal or IDE ` +
          `capabilities — granted at tier "click" (visible + plain left-click ` +
          `only; NO typing, key presses, right-click, modifier-clicks, or drag-drop).`,
        `You can click buttons and scroll output, but ` +
          `${single ? "its" : "their"} integrated terminal and ` +
          `editor are off-limits to keyboard input.`,
        `Right-click (context-menu Paste) and dragging text onto ` +
          `${single ? "it" : "them"} require tier "full".`,
        `For shell commands, use the Bash tool.`,
      ].join(" "),
    );
  }

  if (parts.length === 0) return "";
  // Same anti-subversion clause the gate errors carry — said upfront so the
  // model doesn't reach for osascript/cliclick after seeing "no clicks/typing".
  return parts.join("\n\n") + TIER_ANTI_SUBVERSION;
}
1300
+
1301
/**
 * Builds the guidance text for apps removed by the user's Settings auto-deny
 * list. It is returned inline in the okJson response so the agent learns
 * that (a) request_access auto-denies the app and (b) the way forward is to
 * ask the human to edit Settings, not to retry or reword the request.
 *
 * @param userDenied Apps stripped by the auto-deny list.
 * @returns A single guidance paragraph naming every app.
 */
function buildUserDeniedGuidance(
  userDenied: Array<{ requestedName: string; displayName: string }>,
): string {
  const single = userDenied.length === 1;
  const quotedNames = userDenied
    .map((entry) => `"${entry.displayName}"`)
    .join(", ");
  const verb = single ? "is" : "are";
  const appsNoun = single ? "this app" : "these apps";
  const pronoun = single ? "it" : "them";

  return [
    `${quotedNames} ${verb} in the user's auto-deny list ` +
      `(Settings → Desktop app (General) → Computer Use → Denied apps).`,
    `Requests for ${appsNoun} are automatically denied.`,
    `If you need access for this task, ask the user to remove ${pronoun} ` +
      `from their deny list in Settings — you cannot request this through the tool.`,
  ].join(" ");
}
1321
+
1322
/**
 * Builds the guidance text for apps blocked by the baked-in policy list,
 * which is not user-editable. Unlike the user auto-deny guidance, it offers
 * no escape hatch: the agent is told to find another approach.
 *
 * @param policyDenied Apps stripped by the policy blocklist.
 * @returns A single guidance paragraph naming every app.
 */
function buildPolicyDeniedGuidance(
  policyDenied: Array<{ requestedName: string; displayName: string }>,
): string {
  const single = policyDenied.length === 1;
  const quotedNames = policyDenied
    .map((entry) => `"${entry.displayName}"`)
    .join(", ");
  const verb = single ? "is" : "are";
  const appsNoun = single ? "this app" : "these apps";

  return [
    `${quotedNames} ${verb} blocked by policy for computer use.`,
    `Requests for ${appsNoun} are automatically denied regardless of what ` +
      `the user has approved.`,
    `There is no Settings override.`,
    `Inform the user that you cannot access ${appsNoun} and suggest an ` +
      `alternative approach if one exists.`,
    `Do not try to directly subvert this block regardless of the user's request.`,
  ].join(" ");
}
1342
+
1343
/**
 * Telemetry helper — counts restricted-tier apps by tier. The `denied_*`
 * field names are kept for schema compatibility; dashboards should read
 * them as "assigned a non-full tier".
 *
 * @param tiered Apps granted at a restricted tier.
 * @returns An object carrying only the counts that are greater than zero.
 */
function tierAssignmentTelemetry(
  tiered: TieredApp[],
): Pick<CuCallTelemetry, "denied_browser_count" | "denied_terminal_count"> {
  let readCount = 0;
  let clickCount = 0;
  for (const { tier } of tiered) {
    if (tier === "read") {
      readCount += 1;
    } else if (tier === "click") {
      clickCount += 1;
    }
  }

  const telemetry: Pick<
    CuCallTelemetry,
    "denied_browser_count" | "denied_terminal_count"
  > = {};
  // `denied_browser_count` covers ALL tier-"read" grants (browsers +
  // trading). The name was already legacy-only before trading existed
  // (dashboards read it as "non-full tier"), so no new column was added.
  if (readCount > 0) telemetry.denied_browser_count = readCount;
  if (clickCount > 0) telemetry.denied_terminal_count = clickCount;
  return telemetry;
}
1360
+
1361
/**
 * Teach-mode sibling of `handleRequestAccess`. Resolves the requested apps,
 * routes approval through `overrides.onTeachPermissionRequest`, and fires
 * `overrides.onTeachModeActivated` when the user consents and at least one
 * app ends up granted.
 *
 * Differences from `request_access`:
 *  - No grant-flag checkboxes (clipboard/systemKeys); the teach tool schema
 *    omits those fields.
 *  - The dialog is shown even when every requested app is already granted.
 *    Teach mode hides the main window, so idempotent app grants do not imply
 *    consent to being guided.
 *
 * @param adapter   Host adapter (executor capabilities, OS permission checks).
 * @param args      Raw tool arguments; reads `reason` and `apps`.
 * @param overrides Per-session callbacks and state.
 * @param tccState  When defined, the TCC permission panel is shown instead of
 *                  the app-selection dialog and the call ends with an error
 *                  result telling the model to call again.
 * @returns An error result for unavailable / conflicting / bad-argument / TCC
 *          cases; otherwise a JSON result with `granted`, `denied`, optional
 *          denial and tier guidance, `teachModeActive` and
 *          `screenshotFiltering`.
 */
async function handleRequestTeachAccess(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  tccState: { accessibility: boolean; screenRecording: boolean } | undefined,
): Promise<CuCallToolResult> {
  if (!overrides.onTeachPermissionRequest) {
    return errorResult(
      "Teach mode is not available in this session.",
      "feature_unavailable",
    );
  }

  // The dialog renders in the main window, which is hidden while a tour is
  // running. The model may plausibly re-call this tool mid-tour to add an
  // app, so reject that explicitly instead of showing an invisible dialog.
  if (overrides.getTeachModeActive?.()) {
    return errorResult(
      "Teach mode is already active. To add more apps, end the current tour first, then call request_teach_access again with the full app list.",
      "teach_mode_conflict",
    );
  }

  const reason = requireString(args, "reason");
  if (reason instanceof Error) {
    return errorResult(reason.message, "bad_args");
  }

  // TCC not yet granted: show the permission panel (no app list), then
  // re-check so the reply reflects what the user did while it was up.
  if (tccState) {
    const tccRequest: CuTeachPermissionRequest = {
      requestId: randomUUID(),
      reason,
      apps: [],
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      tccState,
    };
    await overrides.onTeachPermissionRequest(tccRequest);

    const recheck = await adapter.ensureOsPermissions(
      ACCESSIBILITY_AND_SCREEN_RECORDING,
      { requestMissing: false },
    );
    if (recheck.granted) {
      return errorResult(
        "macOS Accessibility and Screen Recording are now both granted. " +
          "Call request_teach_access again immediately — the next call will " +
          "show the app selection list.",
      );
    }

    const stillMissing: string[] = [
      ...(recheck.accessibility ? [] : ["Accessibility"]),
      ...(recheck.screenRecording ? [] : ["Screen Recording"]),
    ];
    return errorResult(
      `macOS ${stillMissing.join(" and ")} permission(s) not yet granted. ` +
        `The permission panel has been shown. Once the user grants the ` +
        `missing permission(s), call request_teach_access again.`,
      "tcc_not_granted",
    );
  }

  const rawApps = args.apps;
  const appsAreStrings =
    Array.isArray(rawApps) &&
    rawApps.every((entry) => typeof entry === "string");
  if (!appsAreStrings) {
    return errorResult('"apps" must be an array of strings.', "bad_args");
  }
  const apps = rawApps as string[];

  const access = await buildAccessRequest(
    adapter,
    apps,
    overrides.allowedApps,
    new Set(overrides.userDeniedBundleIds),
    overrides.selectedDisplayId,
  );
  const { needDialog, skipDialogGrants, willHide, tieredApps } = access;
  const { userDenied, policyDenied } = access;

  // Shared by both JSON replies below. Kept as a closure so the guidance
  // builders still run at the point where each reply is assembled.
  const denialFields = () => ({
    ...(policyDenied.length > 0
      ? {
          policyDenied: {
            apps: policyDenied,
            guidance: buildPolicyDeniedGuidance(policyDenied),
          },
        }
      : {}),
    ...(userDenied.length > 0
      ? {
          userDenied: {
            apps: userDenied,
            guidance: buildUserDeniedGuidance(userDenied),
          },
        }
      : {}),
  });

  // Nothing to ask about and nothing pre-granted (everything was denied or
  // unresolvable): skip the dialog. Otherwise the user would see an empty
  // approval dialog whose Allow and Deny buttons produce the same result.
  const nothingToShow =
    needDialog.length === 0 && skipDialogGrants.length === 0;
  if (nothingToShow) {
    return okJson(
      {
        granted: [],
        denied: [],
        ...denialFields(),
        teachModeActive: false,
        screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      },
      { granted_count: 0, denied_count: 0 },
    );
  }

  const dialogRequest: CuTeachPermissionRequest = {
    requestId: randomUUID(),
    reason,
    apps: needDialog,
    screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
    ...(willHide.length > 0
      ? { willHide, autoUnhideEnabled: adapter.getAutoUnhideEnabled() }
      : {}),
  };
  const response = await overrides.onTeachPermissionRequest(dialogRequest);

  const granted = [...skipDialogGrants, ...response.granted];

  // Activation is gated on explicit consent to THIS dialog, not merely on
  // the merged grant list being non-empty: skipDialogGrants are pre-existing
  // grants, so without the userConsented check a Deny would still activate
  // teach mode whenever any requested app had been granted earlier.
  const teachModeActive =
    response.userConsented === true && granted.length > 0;
  if (teachModeActive) {
    overrides.onTeachModeActivated?.();
  }

  const grantedIds = new Set(granted.map((grant) => grant.bundleId));
  const grantedTieredApps = tieredApps.filter((tieredApp) =>
    grantedIds.has(tieredApp.bundleId),
  );

  const payload = {
    granted,
    denied: response.denied,
    ...denialFields(),
    ...(grantedTieredApps.length > 0
      ? { tierGuidance: buildTierGuidanceMessage(grantedTieredApps) }
      : {}),
    teachModeActive,
    screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
  };
  const telemetry = {
    // Counts only what this dialog granted; skipDialogGrants are idempotent
    // re-grants and are deliberately excluded.
    granted_count: response.granted.length,
    denied_count: response.denied.length,
    ...tierAssignmentTelemetry(grantedTieredApps),
  };
  return okJson(payload, telemetry);
}
1548
+
1549
+ // ---------------------------------------------------------------------------
1550
+ // teach_step + teach_batch — shared step primitives
1551
+ // ---------------------------------------------------------------------------
1552
+
1553
/**
 * One teach step after validation. Produced by `validateTeachStepArgs` and
 * consumed by `executeTeachStep`.
 */
interface ValidatedTeachStep {
  /** Tooltip explanation text (the raw `explanation` argument). */
  explanation: string;
  /** Preview text for the upcoming step (the raw `next_preview` argument). */
  nextPreview: string;
  /** Tooltip anchor already scaled to logical points; unset when no anchor was given. */
  anchorLogical: TeachStepRequest["anchorLogical"];
  /** Actions to run after Next; each has a string `action` that is in BATCHABLE_ACTIONS. */
  actions: Record<string, unknown>[];
}
1560
+
1561
/**
 * Validates one raw teach-step record and scales its optional anchor.
 *
 * Every error message is prefixed with `label`, so `teach_batch` can report
 * `steps[2]: actions[0] …` rather than an ambiguous `actions[0] …`.
 *
 * Coordinates: the model sends the anchor in image-pixel space (the same
 * space as click coordinates) and `scaleCoord` converts it to logical points
 * against `overrides.lastScreenshot`. In a batch, `lastScreenshot` keeps its
 * pre-call value for every step, so anchors for step 2+ must point at
 * elements the model predicts will be at those coordinates after the earlier
 * steps have run.
 *
 * @param raw       Raw step record from the tool arguments.
 * @param adapter   Host adapter; used for display size and logging.
 * @param overrides Session state (display id, coordinate mode, last screenshot).
 * @param label     Prefix for error messages.
 * @returns The validated step, or an `Error` describing the first problem.
 */
async function validateTeachStepArgs(
  raw: Record<string, unknown>,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  label: string,
): Promise<ValidatedTeachStep | Error> {
  const fail = (detail: string): Error => new Error(`${label}: ${detail}`);

  const explanation = requireString(raw, "explanation");
  if (explanation instanceof Error) {
    return fail(explanation.message);
  }
  const nextPreview = requireString(raw, "next_preview");
  if (nextPreview instanceof Error) {
    return fail(nextPreview.message);
  }

  const actions = raw.actions;
  if (!Array.isArray(actions)) {
    return fail(`"actions" must be an array (empty is allowed).`);
  }
  for (let idx = 0; idx < actions.length; idx += 1) {
    const candidate: unknown = actions[idx];
    if (candidate === null || typeof candidate !== "object") {
      return fail(`actions[${idx}] must be an object`);
    }
    const actionName = (candidate as Record<string, unknown>).action;
    if (typeof actionName !== "string") {
      return fail(`actions[${idx}].action must be a string`);
    }
    if (!BATCHABLE_ACTIONS.has(actionName)) {
      return fail(
        `actions[${idx}].action="${actionName}" is not allowed. ` +
          `Allowed: ${[...BATCHABLE_ACTIONS].join(", ")}.`,
      );
    }
  }

  // An omitted anchor leaves anchorLogical undefined.
  let anchorLogical: TeachStepRequest["anchorLogical"];
  const anchor = raw.anchor;
  if (anchor !== undefined) {
    const isFinitePair =
      Array.isArray(anchor) &&
      anchor.length === 2 &&
      typeof anchor[0] === "number" &&
      typeof anchor[1] === "number" &&
      Number.isFinite(anchor[0]) &&
      Number.isFinite(anchor[1]);
    if (!isFinitePair) {
      return fail(`"anchor" must be a [x, y] number tuple or omitted.`);
    }
    const [imageX, imageY] = anchor as [number, number];
    const display = await adapter.executor.getDisplaySize(
      overrides.selectedDisplayId,
    );
    anchorLogical = scaleCoord(
      imageX,
      imageY,
      overrides.coordinateMode,
      display,
      overrides.lastScreenshot,
      adapter.logger,
    );
  }

  return {
    explanation,
    nextPreview,
    anchorLogical,
    actions: actions as Array<Record<string, unknown>>,
  };
}
1646
+
1647
/**
 * Result of showing one tooltip and (on Next) running its actions.
 *
 *  - `exit`: the user clicked Exit, or the session was aborted mid-step.
 *  - `ok`: every action ran; `results` has one entry per action (empty for a
 *    read-only step).
 *  - `action_error`: an action failed and the rest were not attempted.
 */
type TeachStepOutcome =
  | { kind: "exit" }
  | { kind: "ok"; results: BatchActionResult[] }
  | {
      kind: "action_error";
      /** Number of actions that completed before the failing one. */
      executed: number;
      /** The failing action's result. */
      failed: BatchActionResult;
      /** Number of actions after the failing one that were skipped. */
      remaining: number;
      /**
       * Telemetry from the inner action (error_kind), forwarded so the caller
       * can hand it to okJson and keep cu_tool_call accurate when the failure
       * happened inside a batch.
       */
      telemetry: CuCallTelemetry | undefined;
    };
1661
+
1662
/**
 * Shows one tooltip, waits for the user to pick Next or Exit, and on Next
 * runs the step's actions.
 *
 * Action execution mirrors `handleComputerBatch`:
 *  - `prepareForAction` runs once per step (clicking Next is consent to the
 *    whole sequence), then the per-action hide is switched off.
 *  - Pixel validation is off (the sequence is committed).
 *  - Execution stops at the first failing action and reports partial results.
 *
 * A step with no actions is valid ("read this, click Next to continue").
 * Callers must ensure `overrides.onTeachStep` is set before calling.
 *
 * @param step      Validated step to present and run.
 * @param adapter   Host adapter providing the executor.
 * @param overrides Session callbacks and state.
 * @param subGates  Feature sub-gates; a per-step copy is derived from them.
 * @returns `exit`, `ok` with per-action results, or `action_error`.
 */
async function executeTeachStep(
  step: ValidatedTeachStep,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<TeachStepOutcome> {
  const { explanation, nextPreview, anchorLogical, actions } = step;

  // Blocks until the user answers the tooltip. The non-null assertion is
  // safe because both callers check overrides.onTeachStep first.
  const userChoice = await overrides.onTeachStep!({
    explanation,
    nextPreview,
    anchorLogical,
  });

  if (userChoice.action === "exit") {
    // The host's Exit handler also stops the session; the caller decides
    // what to put in the transcript. An earlier step's left_mouse_down may
    // have left the OS button held, so release it before returning.
    await releaseHeldMouse(adapter);
    return { kind: "exit" };
  }

  // Next was clicked: flip the overlay to its spinner before driving input.
  overrides.onTeachWorking?.();

  if (actions.length === 0) {
    return { kind: "ok", results: [] };
  }

  if (subGates.hideBeforeAction) {
    const allowedIds = overrides.allowedApps.map((app) => app.bundleId);
    const hiddenApps = await adapter.executor.prepareForAction(
      allowedIds,
      overrides.selectedDisplayId,
    );
    if (hiddenApps.length > 0) {
      overrides.onAppsHidden?.(hiddenApps);
    }
  }

  const stepSubGates: CuSubGates = {
    ...subGates,
    hideBeforeAction: false,
    pixelValidation: false,
    // Anchors were computed against the display selected at batch start; a
    // mid-batch display switch would break tooltip positioning.
    autoTargetDisplay: false,
  };

  const completed: BatchActionResult[] = [];
  for (let idx = 0; idx < actions.length; idx += 1) {
    // Exit stops the session, so an abort seen here is the same exit path,
    // just caught between actions rather than at the tooltip await.
    if (overrides.isAborted?.()) {
      await releaseHeldMouse(adapter);
      return { kind: "exit" };
    }
    // Short settle delay between consecutive actions.
    if (idx > 0) {
      await sleep(10);
    }

    const spec = actions[idx];
    const actionName = spec.action as string;

    // Any piggybacked screenshot is discarded so click coordinates stay
    // anchored to the screenshot taken before teach_step/teach_batch.
    const { screenshot: _ignored, ...inner } = await dispatchAction(
      actionName,
      spec,
      adapter,
      overrides,
      stepSubGates,
    );

    const output = firstTextContent(inner);
    const actionResult = { action: actionName, ok: !inner.isError, output };
    completed.push(actionResult);

    if (inner.isError) {
      await releaseHeldMouse(adapter);
      return {
        kind: "action_error",
        executed: completed.length - 1,
        failed: actionResult,
        remaining: actions.length - completed.length,
        telemetry: inner.telemetry,
      };
    }
  }

  return { kind: "ok", results: completed };
}
1764
+
1765
/**
 * Attaches a fresh screenshot to a teach result, saving the model a separate
 * screenshot call before its next teach_step.
 *
 * `handleScreenshot` is invoked as-is, so it applies its own hide/prepare
 * logic; the actions may have opened something outside the allowlist. The
 * returned `screenshot` field is what the caller stashes as `lastScreenshot`,
 * so the next teach_step anchor scales against the image the model is now
 * looking at.
 *
 * @param resultJson JSON-serialisable step/batch result.
 * @param adapter    Host adapter.
 * @param overrides  Session callbacks and state.
 * @param subGates   Feature sub-gates passed through to `handleScreenshot`.
 * @returns The JSON result followed by the screenshot content, or just the
 *          JSON result when the screenshot failed.
 */
async function appendTeachScreenshot(
  resultJson: unknown,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const capture = await handleScreenshot(adapter, overrides, subGates);

  // A failed capture must not fail the step: return the JSON alone and let
  // the model call screenshot itself to see the real error.
  if (capture.isError) {
    return okJson(resultJson);
  }

  const jsonPart = { type: "text", text: JSON.stringify(resultJson) };
  return {
    // handleScreenshot's content is [maybeMonitorNote, maybeHiddenNote,
    // image]; all of it is forwarded after the JSON text.
    content: [jsonPart, ...capture.content],
    // Stashed by the caller as lastScreenshot.
    screenshot: capture.screenshot,
  };
}
1800
+
1801
/**
 * Tool handler for a single guided-tour step: shows one tooltip, blocks until
 * the user clicks Next or Exit, and on Next runs `actions[]` with
 * `computer_batch` semantics.
 *
 * @param adapter   Host adapter.
 * @param args      Raw tool arguments for the step.
 * @param overrides Session callbacks and state; `onTeachStep` must be set.
 * @param subGates  Feature sub-gates.
 * @returns `{exited: true}` on Exit, `{executed, failed, remaining}` on an
 *          action error, otherwise `{executed, results}` (with a screenshot
 *          when any action ran), or an error result for bad input.
 */
async function handleTeachStep(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.onTeachStep) {
    return errorResult(
      "Teach mode is not active. Call request_teach_access first.",
      "teach_mode_not_active",
    );
  }

  const step = await validateTeachStepArgs(
    args,
    adapter,
    overrides,
    "teach_step",
  );
  if (step instanceof Error) {
    return errorResult(step.message, "bad_args");
  }

  const outcome = await executeTeachStep(step, adapter, overrides, subGates);

  switch (outcome.kind) {
    case "exit":
      return okJson({ exited: true });

    case "action_error": {
      const { executed, failed, remaining, telemetry } = outcome;
      return okJson({ executed, failed, remaining }, telemetry);
    }

    default: {
      // kind === "ok". With no actions the screen did not change, so the
      // model's existing screenshot is still accurate and none is attached.
      if (step.actions.length === 0) {
        return okJson({ executed: 0, results: [] });
      }
      const summary = {
        executed: outcome.results.length,
        results: outcome.results,
      };
      return appendTeachScreenshot(summary, adapter, overrides, subGates);
    }
  }
}
1854
+
1855
/**
 * Tool handler that queues a whole guided tour in one call. Like
 * `computer_batch`, N steps cost one model round trip instead of N; each step
 * still blocks on its own Next click, so the user paces the tour.
 *
 * All steps are validated before any tooltip is shown, so a mistake in a late
 * step cannot surface after the user has clicked through the earlier ones.
 *
 * Every anchor is scaled against the pre-call `lastScreenshot`. Steps 2+
 * should therefore omit the anchor or target elements the model predicts
 * will not have moved.
 *
 * Result shapes:
 *   {exited: true, stepsCompleted: N}                     user clicked Exit
 *   {stepsCompleted, stepFailed, executed, failed, …}     action error at step N
 *   {stepsCompleted, results: [...]} (+ screenshot)       all steps ran
 *
 * @param adapter   Host adapter.
 * @param args      Raw tool arguments; reads `steps`.
 * @param overrides Session callbacks and state; `onTeachStep` must be set.
 * @param subGates  Feature sub-gates.
 * @returns One of the result shapes above, or an error result for bad input.
 */
async function handleTeachBatch(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.onTeachStep) {
    return errorResult(
      "Teach mode is not active. Call request_teach_access first.",
      "teach_mode_not_active",
    );
  }

  const rawSteps = args.steps;
  if (!Array.isArray(rawSteps) || rawSteps.length === 0) {
    return errorResult('"steps" must be a non-empty array.', "bad_args");
  }

  // Phase 1: validate everything before showing the first tooltip.
  const steps: ValidatedTeachStep[] = [];
  for (let idx = 0; idx < rawSteps.length; idx += 1) {
    const candidate: unknown = rawSteps[idx];
    if (candidate === null || typeof candidate !== "object") {
      return errorResult(`steps[${idx}] must be an object`, "bad_args");
    }
    const validated = await validateTeachStepArgs(
      candidate as Record<string, unknown>,
      adapter,
      overrides,
      `steps[${idx}]`,
    );
    if (validated instanceof Error) {
      return errorResult(validated.message, "bad_args");
    }
    steps.push(validated);
  }

  // Phase 2: run the steps in order, stopping at the first Exit or failure.
  const allResults: BatchActionResult[][] = [];
  for (let idx = 0; idx < steps.length; idx += 1) {
    const outcome = await executeTeachStep(
      steps[idx],
      adapter,
      overrides,
      subGates,
    );

    if (outcome.kind === "exit") {
      return okJson({ exited: true, stepsCompleted: idx });
    }
    if (outcome.kind === "action_error") {
      const { executed, failed, remaining, telemetry } = outcome;
      return okJson(
        {
          stepsCompleted: idx,
          stepFailed: idx,
          executed,
          failed,
          remaining,
          results: allResults,
        },
        telemetry,
      );
    }
    allResults.push(outcome.results);
  }

  // Attach a final screenshot only when some step ran actions; otherwise the
  // screen is treated as unchanged.
  const resultJson = { stepsCompleted: steps.length, results: allResults };
  const anyActionsRan = steps.some((s) => s.actions.length > 0);
  return anyActionsRan
    ? appendTeachScreenshot(resultJson, adapter, overrides, subGates)
    : okJson(resultJson);
}
1938
+
1939
/**
 * Builds the note that tells the model which apps were hidden before a
 * screenshot (because they are not in the session allowlist) and how to add
 * them.
 *
 * @param adapter             Host adapter; used to list running apps so
 *                            bundle ids can be shown as display names.
 * @param hiddenSinceLastSeen Bundle ids hidden since the model last saw the
 *                            screen.
 * @returns The note text, or `undefined` when nothing was hidden.
 */
async function buildHiddenNote(
  adapter: ComputerUseHostAdapter,
  hiddenSinceLastSeen: string[],
): Promise<string | undefined> {
  if (hiddenSinceLastSeen.length === 0) {
    return undefined;
  }

  const running = await adapter.executor.listRunningApps();
  const displayNameById = new Map(
    running.map((app) => [app.bundleId, app.displayName]),
  );

  // Fall back to the raw bundle id when the app is no longer listed.
  const quotedNames = hiddenSinceLastSeen
    .map((bundleId) => displayNameById.get(bundleId) ?? bundleId)
    .map((name) => `"${name}"`)
    .join(", ");

  const single = hiddenSinceLastSeen.length === 1;
  const verb = single ? "was" : "were";
  const openTarget = single ? "it" : "one of them";
  const addTarget = single ? "it" : "them";

  return `${quotedNames} ${verb} open and got hidden before this screenshot (not in the session allowlist). If a previous action was meant to open ${openTarget}, that's why you don't see it — call request_access to add ${addTarget} to the allowlist.`;
}
1961
+
1962
/**
 * Assigns a human-readable, unique label to each display.
 *
 * A display's own `label` is used when present, otherwise `display N` (N is
 * the display id). Displays that share a label (for example a matched pair of
 * external monitors) are told apart with a ` (2)`, ` (3)`, … suffix. Both
 * buildMonitorNote and handleSwitchDisplay use this, so the name the model
 * sees in a screenshot note is the name it can pass back to switch_display.
 *
 * Displays are processed in ascending `displayId` order so the suffix a
 * display receives does not depend on the order in which the backend lists
 * displays.
 *
 * A generated label is never allowed to collide with a label that is already
 * taken. Without this, two displays named "A" plus one literally named
 * "A (2)" would produce two entries labelled "A (2)", making the label
 * ambiguous. On collision the suffix number is bumped until the label is
 * free.
 *
 * @param displays Displays to label.
 * @returns Map from `displayId` to its unique label.
 */
function uniqueDisplayLabels(
  displays: readonly DisplayGeometry[],
): Map<number, string> {
  const sorted = [...displays].sort((a, b) => a.displayId - b.displayId);
  const counts = new Map<string, number>();
  const used = new Set<string>();
  const out = new Map<number, string>();

  for (const d of sorted) {
    const base = d.label ?? `display ${d.displayId}`;
    let n = (counts.get(base) ?? 0) + 1;
    let label = n === 1 ? base : `${base} (${n})`;
    // Skip suffixes already claimed, e.g. by a display whose real name
    // happens to look like a generated one.
    while (used.has(label)) {
      n += 1;
      label = `${base} (${n})`;
    }
    counts.set(base, n);
    used.add(label);
    out.set(d.displayId, label);
  }
  return out;
}
1987
+
1988
/**
 * Builds the monitor-context note attached to a screenshot. It names the
 * monitor the screenshot was taken on, lists the other attached monitors,
 * and says so when the monitor differs from the previous screenshot's.
 *
 * A note is produced only when at least two displays are attached AND this
 * is the first screenshot or the display changed. Single-monitor setups and
 * repeated screenshots of the same monitor get no note.
 *
 * @param adapter          Host adapter; used to list displays and log.
 * @param shotDisplayId    Display the current screenshot was taken on.
 * @param lastDisplayId    Display of the previous screenshot; `undefined` or
 *                         `0` means there is no usable previous display.
 * @param canSwitchDisplay Whether to mention `switch_display` in the note.
 * @returns The note text, or `undefined` when no note applies.
 */
async function buildMonitorNote(
  adapter: ComputerUseHostAdapter,
  shotDisplayId: number,
  lastDisplayId: number | undefined,
  canSwitchDisplay: boolean,
): Promise<string | undefined> {
  // The note is optional context, so a listDisplays failure (e.g. during a
  // monitor hot-unplug) is logged and swallowed instead of failing the
  // screenshot.
  let displays;
  try {
    displays = await adapter.executor.listDisplays();
  } catch (e) {
    adapter.logger.warn(`[computer-use] listDisplays failed: ${String(e)}`);
    return undefined;
  }
  if (displays.length < 2) {
    return undefined;
  }

  const labels = uniqueDisplayLabels(displays);
  const nameOf = (id: number): string => labels.get(id) ?? `display ${id}`;

  const current = nameOf(shotDisplayId);

  const quotedOthers: string[] = [];
  for (const d of displays) {
    if (d.displayId !== shotDisplayId) {
      quotedOthers.push(`"${nameOf(d.displayId)}"`);
    }
  }

  let othersList = "";
  if (quotedOthers.length > 0) {
    othersList = ` Other attached monitors: ${quotedOthers.join(", ")}.`;
    if (canSwitchDisplay) {
      othersList += " Use switch_display to capture a different monitor.";
    }
  }

  // 0 is kCGNullDirectDisplay, a sentinel persisted by sessions that predate
  // multi-monitor support; it is treated the same as undefined.
  if (lastDisplayId === undefined || lastDisplayId === 0) {
    return `This screenshot was taken on monitor "${current}".` + othersList;
  }
  if (lastDisplayId === shotDisplayId) {
    return undefined;
  }

  const prev = nameOf(lastDisplayId);
  return (
    `This screenshot was taken on monitor "${current}", which is different ` +
    `from your previous screenshot (taken on "${prev}").` +
    othersList
  );
}
2045
+
2046
+ async function handleScreenshot(
2047
+ adapter: ComputerUseHostAdapter,
2048
+ overrides: ComputerUseOverrides,
2049
+ subGates: CuSubGates,
2050
+ ): Promise<CuCallToolResult> {
2051
+ // §2 — empty allowlist → tool error, no screenshot.
2052
+ if (overrides.allowedApps.length === 0) {
2053
+ return errorResult(
2054
+ "No applications are granted for this session. Call request_access first.",
2055
+ "allowlist_empty",
2056
+ );
2057
+ }
2058
+
2059
+ // Atomic resolve→prepare→capture (one backend call, no scheduler gap).
2060
+ // Off → fall through to separate-calls path below.
2061
+ if (subGates.autoTargetDisplay) {
2062
+ // Model's explicit switch_display pin overrides everything — the backend's
2063
+ // straight cuDisplayInfo(forDisplayID:) passthrough, no chase chain.
2064
+ // Otherwise sticky display: only auto-resolve when the allowed-app
2065
+ // set has changed since the display was last resolved. Prevents the
2066
+ // resolver yanking the display on every screenshot.
2067
+ const allowedBundleIds = overrides.allowedApps.map((a) => a.bundleId);
2068
+ const currentAppSetKey = allowedBundleIds.slice().sort().join(",");
2069
+ const appSetChanged = currentAppSetKey !== overrides.displayResolvedForApps;
2070
+ const autoResolve = !overrides.displayPinnedByModel && appSetChanged;
2071
+
2072
+ const result = await adapter.executor.resolvePrepareCapture({
2073
+ allowedBundleIds,
2074
+ preferredDisplayId: overrides.selectedDisplayId,
2075
+ autoResolve,
2076
+ // Keep the hideBeforeAction sub-gate independently rollable —
2077
+ // atomic path honors the same toggle the non-atomic path checks
2078
+ // at the prepareForAction call site.
2079
+ doHide: subGates.hideBeforeAction,
2080
+ });
2081
+
2082
+ // Non-atomic path's takeScreenshotWithRetry has a MIN_SCREENSHOT_BYTES
2083
+ // check + retry. The atomic call is expensive (resolve+prepare+capture),
2084
+ // so no retry here — just a warning when the result is implausibly
2085
+ // small (transient display state like sleep wake). Skip when
2086
+ // captureError is set (base64 is intentionally empty then).
2087
+ if (
2088
+ result.captureError === undefined &&
2089
+ decodedByteLength(result.base64) < MIN_SCREENSHOT_BYTES
2090
+ ) {
2091
+ adapter.logger.warn(
2092
+ `[computer-use] resolvePrepareCapture result implausibly small (${decodedByteLength(result.base64)} bytes decoded) — possible transient display state`,
2093
+ );
2094
+ }
2095
+
2096
+ // Resolver picked a different display than the session had selected
2097
+ // (host window moved, or allowed app on a different display). Write
2098
+ // the pick back to session so teach overlay positioning and subsequent
2099
+ // non-resolver calls track the same display. Fire-and-forget.
2100
+ if (result.displayId !== overrides.selectedDisplayId) {
2101
+ adapter.logger.debug(
2102
+ `[computer-use] resolver: preferred=${overrides.selectedDisplayId} resolved=${result.displayId}`,
2103
+ );
2104
+ overrides.onResolvedDisplayUpdated?.(result.displayId);
2105
+ }
2106
+ // Record the app set this display was resolved for, so the next
2107
+ // screenshot skips auto-resolve until the set changes again. Gated on
2108
+ // autoResolve (not just appSetChanged) — when pinned, we didn't
2109
+ // actually resolve, so don't update the key.
2110
+ if (autoResolve) {
2111
+ overrides.onDisplayResolvedForApps?.(currentAppSetKey);
2112
+ }
2113
+
2114
+ // Report hidden apps only when the model has already seen the screen.
2115
+ let hiddenSinceLastSeen: string[] = [];
2116
+ if (overrides.lastScreenshot !== undefined) {
2117
+ hiddenSinceLastSeen = result.hidden;
2118
+ }
2119
+ if (result.hidden.length > 0) {
2120
+ overrides.onAppsHidden?.(result.hidden);
2121
+ }
2122
+
2123
+ // Partial-success case: hide succeeded, capture failed (SCK perm
2124
+ // revoked mid-session). onAppsHidden fired above so auto-unhide will
2125
+ // restore hidden apps at turn end. Now surface the error to the model.
2126
+ if (result.captureError !== undefined) {
2127
+ return errorResult(result.captureError, "capture_failed");
2128
+ }
2129
+
2130
+ const hiddenNote = await buildHiddenNote(adapter, hiddenSinceLastSeen);
2131
+
2132
+ // Cherry-pick — don't spread `result` (would leak resolver fields into lastScreenshot).
2133
+ const shot: ScreenshotResult = {
2134
+ base64: result.base64,
2135
+ width: result.width,
2136
+ height: result.height,
2137
+ displayWidth: result.displayWidth,
2138
+ displayHeight: result.displayHeight,
2139
+ displayId: result.displayId,
2140
+ originX: result.originX,
2141
+ originY: result.originY,
2142
+ };
2143
+
2144
+ const monitorNote = await buildMonitorNote(
2145
+ adapter,
2146
+ shot.displayId,
2147
+ overrides.lastScreenshot?.displayId,
2148
+ overrides.onDisplayPinned !== undefined,
2149
+ );
2150
+
2151
+ return {
2152
+ content: [
2153
+ ...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
2154
+ ...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
2155
+ {
2156
+ type: "image",
2157
+ data: shot.base64,
2158
+ mimeType: "image/jpeg",
2159
+ },
2160
+ ],
2161
+ screenshot: shot,
2162
+ };
2163
+ }
2164
+
2165
+ // Same hide+defocus sequence as input actions. Screenshot needs hide too
2166
+ // — if a non-allowlisted app is on top, SCContentFilter would composite it
2167
+ // out, but the pixels BELOW it are what the model would see, and those are
2168
+ // NOT what's actually there. Hiding first makes the screenshot TRUE.
2169
+ let hiddenSinceLastSeen: string[] = [];
2170
+ if (subGates.hideBeforeAction) {
2171
+ const hidden = await adapter.executor.prepareForAction(
2172
+ overrides.allowedApps.map((a) => a.bundleId),
2173
+ overrides.selectedDisplayId,
2174
+ );
2175
+ // "Something appeared since the model last looked." Report whenever:
2176
+ // (a) prepare hid something AND
2177
+ // (b) the model has ALREADY SEEN the screen (lastScreenshot is set).
2178
+ //
2179
+ // (b) is the discriminator that silences the first screenshot's
2180
+ // expected-noise hide. NOT a delta against a cumulative set — that was
2181
+ // the earlier bug: cuHiddenDuringTurn only grows, so once Preview is in
2182
+ // it (from the first screenshot's hide), subsequent re-hides of Preview
2183
+ // delta to zero. The double-click → Preview opens → re-hide → silent
2184
+ // loop never breaks.
2185
+ //
2186
+ // With this check: every re-hide fires. If the model loops "click → file
2187
+ // opens in Preview → screenshot → Preview hidden", it gets told EVERY
2188
+ // time. Eventually it'll request_access for Preview (or give up).
2189
+ //
2190
+ // False positive: user alt-tabs mid-turn → Safari re-hidden → reported.
2191
+ // Rare, and "Safari appeared" is at worst mild noise — far better than
2192
+ // the false-negative of never explaining why the file vanished.
2193
+ if (overrides.lastScreenshot !== undefined) {
2194
+ hiddenSinceLastSeen = hidden;
2195
+ }
2196
+ if (hidden.length > 0) {
2197
+ overrides.onAppsHidden?.(hidden);
2198
+ }
2199
+ }
2200
+
2201
+ const allowedBundleIds = overrides.allowedApps.map((g) => g.bundleId);
2202
+ const shot = await takeScreenshotWithRetry(
2203
+ adapter.executor,
2204
+ allowedBundleIds,
2205
+ adapter.logger,
2206
+ overrides.selectedDisplayId,
2207
+ );
2208
+
2209
+ const hiddenNote = await buildHiddenNote(adapter, hiddenSinceLastSeen);
2210
+
2211
+ const monitorNote = await buildMonitorNote(
2212
+ adapter,
2213
+ shot.displayId,
2214
+ overrides.lastScreenshot?.displayId,
2215
+ overrides.onDisplayPinned !== undefined,
2216
+ );
2217
+
2218
+ return {
2219
+ content: [
2220
+ ...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
2221
+ ...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
2222
+ {
2223
+ type: "image",
2224
+ data: shot.base64,
2225
+ mimeType: "image/jpeg",
2226
+ },
2227
+ ],
2228
+ // Piggybacked for serverDef.ts to stash on InternalServerContext.
2229
+ screenshot: shot,
2230
+ };
2231
+ }
2232
+
2233
/**
 * Zoom tool: returns a JPEG crop of a rectangular region of the last
 * full-screen screenshot.
 *
 * `args.region` is `[x0, y0, x1, y1]` expressed in image pixels of
 * `overrides.lastScreenshot` — the same space the model reads click
 * coordinates from. The region is converted to logical points using the
 * capture-time `displayWidth / width` (and height) ratio and cropped from the
 * display that screenshot was taken on.
 *
 * Invariant: click coordinates always refer to the full-screen screenshot,
 * never to a zoomed image. That is why the returned result deliberately has
 * NO `.screenshot` field — the last-screenshot state must not be replaced by
 * a zoom.
 *
 * @returns an image result on success; `bad_args` for a malformed or
 *   out-of-bounds region; `state_conflict` when no screenshot exists yet.
 */
async function handleZoom(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const rawRegion = args.region;
  const isQuad = Array.isArray(rawRegion) && rawRegion.length === 4;
  if (!Array.isArray(rawRegion) || !isQuad) {
    return errorResult(
      "region must be an array of length 4: [x0, y0, x1, y1]",
      "bad_args",
    );
  }

  const [x0, y0, x1, y1] = rawRegion;
  // `v >= 0` is false for NaN, so NaN is rejected here as well.
  const isNonNegativeNumber = (v: unknown): boolean =>
    typeof v === "number" && v >= 0;
  const corners = [x0, y0, x1, y1];
  if (!corners.every(isNonNegativeNumber)) {
    return errorResult(
      "region values must be non-negative numbers",
      "bad_args",
    );
  }
  if (x1 <= x0) {
    return errorResult("region x1 must be greater than x0", "bad_args");
  }
  if (y1 <= y0) {
    return errorResult("region y1 must be greater than y0", "bad_args");
  }

  // Region coordinates are meaningless without the image they refer to.
  const reference = overrides.lastScreenshot;
  if (!reference) {
    return errorResult(
      "take a screenshot before zooming (region coords are relative to it)",
      "state_conflict",
    );
  }
  const overflowsImage = x1 > reference.width || y1 > reference.height;
  if (overflowsImage) {
    return errorResult(
      `region exceeds screenshot bounds (${reference.width}×${reference.height})`,
      "bad_args",
    );
  }

  // image-px → logical-pt, using the capture-time display/image ratio.
  const scaleX = reference.displayWidth / reference.width;
  const scaleY = reference.displayHeight / reference.height;
  const logicalRegion = {
    x: x0 * scaleX,
    y: y0 * scaleY,
    w: (x1 - x0) * scaleX,
    h: (y1 - y0) * scaleY,
  };

  // Crop from the display the reference screenshot came from, so the zoom
  // matches the image the model is reading coordinates off.
  const bundleIds = overrides.allowedApps.map((grant) => grant.bundleId);
  const crop = await adapter.executor.zoom(
    logicalRegion,
    bundleIds,
    reference.displayId,
  );

  // Image only — no `.screenshot` piggyback (see invariant above).
  return {
    content: [{ type: "image", data: crop.base64, mimeType: "image/jpeg" }],
  };
}
2307
+
2308
/**
 * Shared handler for all five click variants.
 *
 * Sequence: release any stale held button → parse the coordinate → parse and
 * vet the optional modifier chord in `args.text` → input-action gates →
 * optional pixel-validation staleness check → scale the coordinate →
 * hit-test gate → perform the click.
 *
 * @param button which mouse button to click
 * @param count  number of clicks (single / double / triple)
 * @returns "Clicked." on success, a gate's refusal, a `bad_args` /
 *   `grant_flag_required` error, or the pixel-validation warning text.
 */
async function handleClickVariant(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
  button: "left" | "right" | "middle",
  count: 1 | 2 | 3,
): Promise<CuCallToolResult> {
  // A previous left_mouse_down may have left `mouseButtonHeld` set with no
  // matching left_mouse_up. executor.click() presses and releases on its own,
  // so the OS button ends up released while the JS flag would stay true and
  // push later mouse_move calls down the held-button path. Release up front
  // so the flag and the OS agree.
  if (mouseButtonHeld) {
    await adapter.executor.mouseUp();
    mouseButtonHeld = false;
    mouseMoved = false;
  }

  const parsedCoord = extractCoordinate(args);
  if (parsedCoord instanceof Error) {
    return errorResult(parsedCoord.message, "bad_args");
  }
  const [rawX, rawY] = parsedCoord;

  // Optional modifier chord, e.g. left_click(coordinate=[x,y], text="shift").
  // Parsed with the same chord parser as the key tool.
  let modifiers: string[] | undefined;
  const chordText = args.text;
  if (chordText !== undefined) {
    if (typeof chordText !== "string") {
      return errorResult("text must be a string", "bad_args");
    }
    // Same system-shortcut gate as the key tools: a non-modifier key in the
    // chord (the "q" of "cmd+q") would be pressed while the modifier is held
    // and fire the shortcut before the click happens.
    const firesSystemShortcut = isSystemKeyCombo(
      chordText,
      adapter.executor.capabilities.platform,
    );
    if (firesSystemShortcut && !overrides.grantFlags.systemKeyCombos) {
      return errorResult(
        `The modifier chord "${chordText}" would fire a system shortcut. ` +
          "Request the systemKeyCombos grant flag via request_access, or use " +
          "only modifier keys (shift, ctrl, alt, cmd) in the text parameter.",
        "grant_flag_required",
      );
    }
    modifiers = parseKeyChord(chordText);
  }

  // Right/middle clicks and modified clicks are treated as the stronger
  // "mouse_full" action kind. Computed once and used for both gates.
  const hasModifiers = modifiers !== undefined && modifiers.length > 0;
  const clickActionKind: CuActionKind =
    button !== "left" || hasModifiers ? "mouse_full" : "mouse";

  const gateRefusal = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    clickActionKind,
  );
  if (gateRefusal) {
    return gateRefusal;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );

  // Pixel-validation staleness check (sub-gated). Runs after the gates — no
  // point validating a click that is about to be refused — and before the
  // executor call.
  if (subGates.pixelValidation) {
    const { xPct, yPct } = coordToPercentageForPixelCompare(
      rawX,
      rawY,
      overrides.coordinateMode,
      overrides.lastScreenshot,
    );
    // Fresh capture for comparison: same allow-set and same display as the
    // model's last screenshot, so like is compared with like. A capture
    // failure is reported to the validator as `null`.
    const captureFresh = async () => {
      const bundleIds = overrides.allowedApps.map((grant) => grant.bundleId);
      try {
        return await adapter.executor.screenshot({
          allowedBundleIds: bundleIds,
          displayId: overrides.lastScreenshot?.displayId,
        });
      } catch {
        return null;
      }
    };
    const validation = await validateClickTarget(
      adapter.cropRawPatch,
      overrides.lastScreenshot,
      xPct,
      yPct,
      captureFresh,
      adapter.logger,
    );
    if (!validation.valid && validation.warning) {
      // Not an error result: the model is told to re-screenshot.
      return okText(validation.warning);
    }
  }

  const { x, y } = scaleCoord(
    rawX,
    rawY,
    overrides.coordinateMode,
    display,
    overrides.lastScreenshot,
    adapter.logger,
  );

  const hitRefusal = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    x,
    y,
    clickActionKind,
  );
  if (hitRefusal) {
    return hitRefusal;
  }

  await adapter.executor.click(x, y, button, count, modifiers);
  return okText("Clicked.");
}
2439
+
2440
/**
 * Type tool: enters `args.text` into the frontmost app.
 *
 * Two routes:
 *  - Clipboard paste, when the text contains a newline AND the
 *    `clipboardWrite` grant AND the `clipboardPasteMultiline` sub-gate are
 *    all on. The executor is simply asked to type via the clipboard.
 *  - Otherwise grapheme-by-grapheme, sleeping `INTER_GRAPHEME_SLEEP_MS`
 *    BEFORE each grapheme and checking for a user abort at every step.
 *
 * @returns a confirmation text, a `bad_args` error when `text` is missing,
 *   the input-gate refusal, or an abort error naming how far typing got.
 */
async function handleType(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }

  const gateRefusal = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "keyboard",
  );
  if (gateRefusal) {
    return gateRefusal;
  }

  // Multi-line fast path. This handler only routes; the paste mechanics are
  // the executor's business.
  const isMultiline = text.includes("\n");
  if (
    isMultiline &&
    overrides.grantFlags.clipboardWrite &&
    subGates.clipboardPasteMultiline
  ) {
    await adapter.executor.type(text, { viaClipboard: true });
    return okText("Typed (via clipboard).");
  }

  // Iterate grapheme clusters rather than code units so multi-code-point
  // characters (e.g. ZWJ emoji) are sent whole.
  //
  // Newline, carriage return and tab are sent as real key presses instead of
  // text insertion: inserting "\n" as text is not a Return key press (URL
  // bars and terminals ignore it), and the original notes an upstream
  // text-injection bug that types a ghost 'a' for "\n" on macOS. CRLF is a
  // single grapheme cluster, hence the explicit "\r\n" case.
  const graphemes = segmentGraphemes(text);
  let typedSoFar = 0;
  for (const grapheme of graphemes) {
    // Abort check sits inside the loop because this is where a long type
    // call actually spends its time.
    if (overrides.isAborted?.()) {
      return errorResult(
        `Typing aborted after ${typedSoFar} of ${graphemes.length} graphemes (user interrupt).`,
      );
    }
    await sleep(INTER_GRAPHEME_SLEEP_MS);
    switch (grapheme) {
      case "\n":
      case "\r":
      case "\r\n":
        await adapter.executor.key("return");
        break;
      case "\t":
        await adapter.executor.key("tab");
        break;
      default:
        await adapter.executor.type(grapheme, { viaClipboard: false });
        break;
    }
    typedSoFar += 1;
  }
  return okText(`Typed ${graphemes.length} grapheme(s).`);
}
2502
+
2503
/**
 * Key tool: presses the key sequence in `args.text`, optionally repeated
 * `args.repeat` times (a positive integer, at most 100).
 *
 * The system-shortcut blocklist is checked BEFORE the input gates: when a
 * blocked combo is requested while an ungranted app is frontmost, the model
 * should be told to request the flag, not to change focus.
 *
 * @returns "Key pressed." on success, `bad_args` for a missing `text` or an
 *   invalid `repeat`, `grant_flag_required` for a blocked system shortcut,
 *   or the input-gate refusal.
 */
async function handleKey(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const keySequence = requireString(args, "text");
  if (keySequence instanceof Error) {
    // Fixed message on purpose — the helper's own message is not forwarded.
    return errorResult("text is required", "bad_args");
  }

  // `repeat` is optional; when present it must be an integer in [1, 100].
  const requestedRepeat = args.repeat;
  let repeat: number | undefined;
  if (requestedRepeat !== undefined) {
    if (
      typeof requestedRepeat !== "number" ||
      !Number.isInteger(requestedRepeat) ||
      requestedRepeat < 1
    ) {
      return errorResult("repeat must be a positive integer", "bad_args");
    }
    if (requestedRepeat > 100) {
      return errorResult("repeat exceeds maximum of 100", "bad_args");
    }
    repeat = requestedRepeat;
  }

  // Blocklist first, gates second (see function comment).
  const isSystemShortcut = isSystemKeyCombo(
    keySequence,
    adapter.executor.capabilities.platform,
  );
  if (isSystemShortcut && !overrides.grantFlags.systemKeyCombos) {
    return errorResult(
      `"${keySequence}" is a system-level shortcut. Request the \`systemKeyCombos\` grant via request_access to use it.`,
      "grant_flag_required",
    );
  }

  const gateRefusal = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "keyboard",
  );
  if (gateRefusal) {
    return gateRefusal;
  }

  await adapter.executor.key(keySequence, repeat);
  return okText("Key pressed.");
}
2553
+
2554
/**
 * Scroll tool: scrolls at `args.coordinate` by `args.scroll_amount` (an
 * integer from 0 to 100) in `args.scroll_direction`.
 *
 * The direction/amount pair is mapped onto the executor's dx/dy interface:
 * up → negative dy, down → positive dy, left → negative dx, right →
 * positive dx.
 *
 * @returns "Scrolled." on success, `bad_args` for invalid arguments, or a
 *   gate refusal.
 */
async function handleScroll(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const parsedCoord = extractCoordinate(args);
  if (parsedCoord instanceof Error) {
    return errorResult(parsedCoord.message, "bad_args");
  }
  const [rawX, rawY] = parsedCoord;

  const dir = args.scroll_direction;
  const isKnownDirection =
    dir === "up" || dir === "down" || dir === "left" || dir === "right";
  if (!isKnownDirection) {
    return errorResult(
      "scroll_direction must be 'up', 'down', 'left', or 'right'",
      "bad_args",
    );
  }

  const amount = args.scroll_amount;
  if (typeof amount !== "number" || !Number.isInteger(amount) || amount < 0) {
    return errorResult("scroll_amount must be a non-negative int", "bad_args");
  }
  if (amount > 100) {
    return errorResult("scroll_amount exceeds maximum of 100", "bad_args");
  }

  // Exactly one axis receives the (signed) amount; the other stays 0.
  let dx = 0;
  let dy = 0;
  switch (dir) {
    case "left":
      dx = -amount;
      break;
    case "right":
      dx = amount;
      break;
    case "up":
      dy = -amount;
      break;
    case "down":
      dy = amount;
      break;
  }

  const gateRefusal = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (gateRefusal) {
    return gateRefusal;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );
  const { x, y } = scaleCoord(
    rawX,
    rawY,
    overrides.coordinateMode,
    display,
    overrides.lastScreenshot,
    adapter.logger,
  );

  // With the button held, the executor's internal mouse move produces drag
  // events — the same mechanism as the held-button path of mouse_move. The
  // hit-test is therefore raised to "mouse_full" so a scroll cannot be used
  // to drag-drop text onto a click-tier window, and `mouseMoved` is set so
  // the following left_mouse_up is hit-tested as a drop rather than as a
  // click release.
  const hitRefusal = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    x,
    y,
    mouseButtonHeld ? "mouse_full" : "mouse",
  );
  if (hitRefusal) {
    return hitRefusal;
  }
  if (mouseButtonHeld) {
    mouseMoved = true;
  }

  await adapter.executor.scroll(x, y, dx, dy);
  return okText("Scrolled.");
}
2619
+
2620
/**
 * Drag tool: drags to `args.coordinate` (required end point), starting from
 * `args.start_coordinate` when given or from the current cursor position
 * when omitted.
 *
 * Both endpoints are hit-tested before the drag is performed:
 *  - the start point with "mouse" — picking something up is a read;
 *  - the end point with "mouse_full" — dropping text onto a terminal inserts
 *    it as if typed, the same threat as right-click → Paste.
 *
 * @returns "Dragged." on success, `bad_args` for malformed coordinates, or
 *   a gate refusal.
 */
async function handleDrag(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  // executor.drag() presses and releases by itself. If an earlier
  // left_mouse_down left `mouseButtonHeld` set, the flag would stay true
  // across the drag and drift from the OS state. Release first.
  if (mouseButtonHeld) {
    await adapter.executor.mouseUp();
    mouseButtonHeld = false;
    mouseMoved = false;
  }

  const rawTo = extractCoordinate(args, "coordinate");
  if (rawTo instanceof Error) {
    return errorResult(rawTo.message, "bad_args");
  }

  // Left undefined when start_coordinate is omitted → the executor drags
  // from wherever the cursor currently is.
  let rawFrom: [number, number] | undefined;
  if (args.start_coordinate !== undefined) {
    const parsedStart = extractCoordinate(args, "start_coordinate");
    if (parsedStart instanceof Error) {
      return errorResult(parsedStart.message, "bad_args");
    }
    rawFrom = parsedStart;
  }

  const gateRefusal = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (gateRefusal) {
    return gateRefusal;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );
  // Both endpoints go through the identical coordinate transform.
  const toScreenPoint = (point: [number, number]) =>
    scaleCoord(
      point[0],
      point[1],
      overrides.coordinateMode,
      display,
      overrides.lastScreenshot,
      adapter.logger,
    );
  const from = rawFrom === undefined ? undefined : toScreenPoint(rawFrom);
  const to = toScreenPoint(rawTo);

  // With no explicit start, the press happens at the cursor, so the cursor
  // position is what must be hit-tested. Otherwise positioning the cursor
  // first and then dragging would bypass the start-point check.
  const pickupPoint = from ?? (await adapter.executor.getCursorPosition());
  const pickupRefusal = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    pickupPoint.x,
    pickupPoint.y,
    "mouse",
  );
  if (pickupRefusal) {
    return pickupRefusal;
  }

  const dropRefusal = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    to.x,
    to.y,
    "mouse_full",
  );
  if (dropRefusal) {
    return dropRefusal;
  }

  await adapter.executor.drag(from, to);
  return okText("Dragged.");
}
2712
+
2713
/**
 * Mouse-move tool: moves the cursor to `args.coordinate`.
 *
 * Button NOT held: pure positioning. The gates see "mouse_position" and no
 * hit-test runs here (the mouse-down/up handlers hit-test the cursor
 * themselves, which closes the move-then-press decomposition).
 *
 * Button held: the move generates drag events on whatever window is under
 * the cursor, so it counts as interaction. The gates see "mouse" and the
 * destination is hit-tested at "mouse_full", the same level as the atomic
 * drag's drop point.
 *
 * @returns "Moved." on success, `bad_args` for a malformed coordinate, or a
 *   gate refusal.
 */
async function handleMoveMouse(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const parsedCoord = extractCoordinate(args);
  if (parsedCoord instanceof Error) {
    return errorResult(parsedCoord.message, "bad_args");
  }
  const [rawX, rawY] = parsedCoord;

  let gateKind: CuActionKind;
  if (mouseButtonHeld) {
    gateKind = "mouse";
  } else {
    gateKind = "mouse_position";
  }
  const gateRefusal = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    gateKind,
  );
  if (gateRefusal) {
    return gateRefusal;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );
  const { x, y } = scaleCoord(
    rawX,
    rawY,
    overrides.coordinateMode,
    display,
    overrides.lastScreenshot,
    adapter.logger,
  );

  // The flag is read again at each step rather than cached once up front.
  if (mouseButtonHeld) {
    // Dragging onto a click-tier terminal is text injection no matter which
    // primitive (atomic drag or decomposed down/move/up) delivers the events.
    const hitRefusal = await runHitTestGate(
      adapter,
      overrides,
      subGates,
      x,
      y,
      "mouse_full",
    );
    if (hitRefusal) {
      return hitRefusal;
    }
  }

  await adapter.executor.moveMouse(x, y);
  if (mouseButtonHeld) {
    mouseMoved = true;
  }
  return okText("Moved.");
}
2768
+
2769
/**
 * Open-application tool: brings a granted app forward.
 *
 * `args.app` may be a bundle ID or a display name. It is resolved ONLY
 * against the session's allowlist: an exact bundle-ID hit first, otherwise a
 * case-insensitive display-name match. Anything not on the allowlist is
 * refused with `app_not_granted`.
 *
 * No tier gate applies — opening works at any tier, and the click/type
 * gates catch whatever interaction follows.
 *
 * @returns a confirmation text, extended with a switch_display hint when
 *   display pinning is available and two or more displays are listed.
 */
async function handleOpenApplication(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const app = requireString(args, "app");
  if (app instanceof Error) {
    return errorResult(app.message, "bad_args");
  }

  const grantedIds = new Set(
    overrides.allowedApps.map((grant) => grant.bundleId),
  );

  // Matching names against the allowlist itself (instead of every installed
  // app) keeps this path cheap and ties the request to the user's actual
  // grant.
  let targetBundleId: string | undefined;
  if (looksLikeBundleId(app) && grantedIds.has(app)) {
    targetBundleId = app;
  } else {
    const wantedName = app.toLowerCase();
    const grantByName = overrides.allowedApps.find(
      (grant) => grant.displayName.toLowerCase() === wantedName,
    );
    targetBundleId = grantByName?.bundleId;
  }

  if (!targetBundleId || !grantedIds.has(targetBundleId)) {
    return errorResult(
      `"${app}" is not granted for this session. Call request_access first.`,
      "app_not_granted",
    );
  }

  await adapter.executor.openApp(targetBundleId);

  const plainAck = `Opened "${app}".`;

  // The monitor hint is only offered when display pinning exists in this
  // session.
  if (overrides.onDisplayPinned === undefined) {
    return okText(plainAck);
  }

  // On multi-monitor setups the window may land on a monitor the screenshot
  // won't show; steer the model to switch_display before it wastes steps.
  // Enumeration failure is non-fatal — the hint is advisory, so a failure is
  // treated as a single display.
  let monitorCount = 1;
  try {
    monitorCount = (await adapter.executor.listDisplays()).length;
  } catch {
    // hint skipped
  }
  if (monitorCount < 2) {
    return okText(plainAck);
  }

  return okText(
    `Opened "${app}". If it isn't visible in the next screenshot, it may ` +
      `have opened on a different monitor — use switch_display to check.`,
  );
}
2829
+
2830
/**
 * Switch-display tool: pins subsequent work to the monitor named by
 * `args.display`, or returns to automatic selection when the value is
 * "auto" (case-insensitive).
 *
 * The label is resolved against a freshly enumerated display list using
 * `uniqueDisplayLabels`, compared case-insensitively. The outcome is
 * reported through `overrides.onDisplayPinned`: a display ID to pin, or
 * `undefined` for automatic.
 *
 * @returns a confirmation text; `bad_args` for a missing argument, a
 *   single-monitor setup or an unknown label; `feature_unavailable` when
 *   pinning is not offered; `display_error` when enumeration throws.
 */
async function handleSwitchDisplay(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const display = requireString(args, "display");
  if (display instanceof Error) {
    return errorResult(display.message, "bad_args");
  }

  if (!overrides.onDisplayPinned) {
    return errorResult(
      "Display switching is not available in this session.",
      "feature_unavailable",
    );
  }

  const wantedLabel = display.toLowerCase();
  if (wantedLabel === "auto") {
    overrides.onDisplayPinned(undefined);
    return okText(
      "Returned to automatic monitor selection. Call screenshot to continue.",
    );
  }

  // Enumerate fresh on every call so the requested label is matched against
  // what is connected right now.
  let displays;
  try {
    displays = await adapter.executor.listDisplays();
  } catch (e) {
    return errorResult(
      `Failed to enumerate displays: ${String(e)}`,
      "display_error",
    );
  }

  if (displays.length < 2) {
    return errorResult(
      "Only one monitor is connected. There is nothing to switch to.",
      "bad_args",
    );
  }

  const labels = uniqueDisplayLabels(displays);
  const target = displays.find(
    (candidate) =>
      labels.get(candidate.displayId)?.toLowerCase() === wantedLabel,
  );

  if (target) {
    overrides.onDisplayPinned(target.displayId);
    return okText(
      `Switched to monitor "${labels.get(target.displayId)}". Call screenshot to see it.`,
    );
  }

  // Unknown label: list every valid name so the model can retry correctly.
  const available = displays
    .map((candidate) => `"${labels.get(candidate.displayId)}"`)
    .join(", ");
  return errorResult(
    `No monitor named "${display}" is connected. Available monitors: ${available}.`,
    "bad_args",
  );
}
2891
+
2892
/**
 * List-granted-applications tool: reports the session's current grants.
 *
 * @returns a JSON result holding `allowedApps` and `grantFlags` exactly as
 *   they appear on `overrides`.
 */
function handleListGrantedApplications(
  overrides: ComputerUseOverrides,
): CuCallToolResult {
  const { allowedApps, grantFlags } = overrides;
  return okJson({ allowedApps, grantFlags });
}
2900
+
2901
/**
 * Read-clipboard tool: returns the clipboard text as `{ text }`.
 *
 * Requires the `clipboardRead` grant. This tool does not pass through the
 * input-action gates, so when the `clipboardGuard` sub-gate is on the
 * clipboard stash is synchronised here first. The guard may have stashed
 * and cleared the clipboard, so the agent reads the actual (possibly empty)
 * clipboard — the same thing the frontmost app's own Paste would see.
 *
 * @returns a JSON result with the clipboard text, or `grant_flag_required`.
 */
async function handleReadClipboard(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.grantFlags.clipboardRead) {
    return errorResult(
      "Clipboard read is not granted. Request `clipboardRead` via request_access.",
      "grant_flag_required",
    );
  }

  if (subGates.clipboardGuard) {
    const frontmost = await adapter.executor.getFrontmostApp();
    const tierLookup = new Map(
      overrides.allowedApps.map(
        (grant) => [grant.bundleId, grant.tier] as const,
      ),
    );
    // No frontmost app → no tier → treated as not click-tier.
    let frontmostTier;
    if (frontmost) {
      frontmostTier = tierLookup.get(frontmost.bundleId);
    }
    const clickTierFrontmost = frontmostTier === "click";
    await syncClipboardStash(adapter, overrides, clickTierFrontmost);
  }

  const text = await adapter.executor.readClipboard();
  return okJson({ text });
}
2932
+
2933
/**
 * Write-clipboard tool: puts `args.text` on the clipboard.
 *
 * Requires the `clipboardWrite` grant. With the `clipboardGuard` sub-gate
 * on:
 *  - the write is refused while a tier-"click" app is frontmost. The stash
 *    sync on the next action would void the write anyway; refusing gives the
 *    agent a clear signal instead of a silently lost write;
 *  - otherwise the clipboard stash is synchronised before writing, because
 *    this tool does not pass through the input-action gates.
 *
 * @returns "Clipboard written." on success, `grant_flag_required`,
 *   `bad_args`, or `tier_insufficient`.
 */
async function handleWriteClipboard(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.grantFlags.clipboardWrite) {
    return errorResult(
      "Clipboard write is not granted. Request `clipboardWrite` via request_access.",
      "grant_flag_required",
    );
  }

  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }

  if (subGates.clipboardGuard) {
    const frontmost = await adapter.executor.getFrontmostApp();
    const tierLookup = new Map(
      overrides.allowedApps.map(
        (grant) => [grant.bundleId, grant.tier] as const,
      ),
    );
    // No frontmost app → no tier → treated as not click-tier.
    let frontmostTier;
    if (frontmost) {
      frontmostTier = tierLookup.get(frontmost.bundleId);
    }
    const clickTierFrontmost = frontmostTier === "click";

    // Defense in depth against write_clipboard followed by a click on a
    // click-tier app's Paste button.
    if (frontmost && clickTierFrontmost) {
      return errorResult(
        `"${frontmost.displayName}" is a tier-"click" app and currently ` +
          `frontmost. write_clipboard is blocked because the next action ` +
          `would clear the clipboard anyway — a UI Paste button in this ` +
          `app cannot be used to inject text. Bring a tier-"full" app ` +
          `forward before writing to the clipboard.` +
          TIER_ANTI_SUBVERSION,
        "tier_insufficient",
      );
    }

    // Sync so that, after clicking away from a click-tier app, the user's
    // stash is restored before the agent's text lands.
    await syncClipboardStash(adapter, overrides, clickTierFrontmost);
  }

  await adapter.executor.writeClipboard(text);
  return okText("Clipboard written.");
}
2983
+
2984
/**
 * Wait tool: sleeps for `args.duration` seconds.
 *
 * The duration must be a finite, non-negative number no greater than 100;
 * anything else is rejected with `bad_args` (values over 100 are refused,
 * not clamped).
 *
 * No frontmost gate is applied here — waiting sends no input, so there is
 * nothing to protect.
 *
 * @returns "Waited Ns." after sleeping, or a `bad_args` error.
 */
async function handleWait(
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const seconds = args.duration;

  const isFiniteNumber = typeof seconds === "number" && Number.isFinite(seconds);
  if (typeof seconds !== "number" || !isFiniteNumber) {
    return errorResult("duration must be a number", "bad_args");
  }
  if (seconds < 0) {
    return errorResult("duration must be non-negative", "bad_args");
  }
  if (seconds > 100) {
    return errorResult(
      "duration is too long. Duration is in seconds.",
      "bad_args",
    );
  }

  const milliseconds = seconds * 1000;
  await sleep(milliseconds);
  return okText(`Waited ${seconds}s.`);
}
3008
+
3009
/**
 * Cursor-position tool: reports where the cursor is, as JSON annotated with
 * the coordinate space used.
 *
 * With a last screenshot available, the position is converted from logical
 * points to image pixels of that screenshot — the inverse of the click
 * coordinate scaling: subtract the capture-time display origin, then
 * multiply by `screenshotWidth / displayWidth` (and the height equivalent).
 * Capture-time dimensions are used so the result matches what the model
 * would read off that screenshot.
 *
 * Falls back to raw logical points, with an explanatory note, when there is
 * no screenshot yet or when the cursor lies outside the captured display.
 *
 * No frontmost gate — read-only, no input is sent.
 */
async function handleCursorPosition(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const cursor = await adapter.executor.getCursorPosition();
  const reference = overrides.lastScreenshot;

  if (!reference) {
    return okJson({
      x: cursor.x,
      y: cursor.y,
      coordinateSpace: "logical_points",
      note: "take a screenshot first for image-pixel coordinates",
    });
  }

  // Virtual-screen → display-relative, using the capture-time origin.
  const localX = cursor.x - reference.originX;
  const localY = cursor.y - reference.originY;

  // On another monitor the local coordinates go negative or exceed the
  // display size; image pixels would be garbage, so report logical points
  // with a hint instead.
  const offCapturedDisplay =
    localX < 0 ||
    localX > reference.displayWidth ||
    localY < 0 ||
    localY > reference.displayHeight;
  if (offCapturedDisplay) {
    return okJson({
      x: cursor.x,
      y: cursor.y,
      coordinateSpace: "logical_points",
      note: "cursor is on a different monitor than your last screenshot; take a fresh screenshot",
    });
  }

  const x = Math.round(localX * (reference.width / reference.displayWidth));
  const y = Math.round(localY * (reference.height / reference.displayHeight));
  return okJson({ x, y, coordinateSpace: "image_pixels" });
}
3058
+
/**
 * hold_key: holds the chord named by `text` for `duration` seconds.
 *
 * Order of checks:
 *   1. `text` must be a string; `duration` must be a finite number in 0..100
 *      (same bounds as wait), otherwise `bad_args`.
 *   2. System-level shortcuts are refused unless the `systemKeyCombos` grant
 *      flag is set. This runs BEFORE the gates — holding a dangerous combo is
 *      no safer than tapping it.
 *   3. Keyboard input gates (frontmost check) via runInputActionGates.
 *
 * Only then is the chord parsed and handed to the executor, with the duration
 * converted to milliseconds. Per the original notes the executor presses each
 * key, waits, then releases in reverse order.
 */
async function handleHoldKey(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }

  const seconds = args.duration;
  if (!(typeof seconds === "number" && Number.isFinite(seconds))) {
    return errorResult("duration must be a number", "bad_args");
  }
  const rangeProblem =
    seconds < 0
      ? "duration must be non-negative"
      : seconds > 100
        ? "duration is too long. Duration is in seconds."
        : undefined;
  if (rangeProblem !== undefined) {
    return errorResult(rangeProblem, "bad_args");
  }

  // Blocklist first, gates second.
  const needsSystemComboGrant =
    isSystemKeyCombo(text, adapter.executor.capabilities.platform) &&
    !overrides.grantFlags.systemKeyCombos;
  if (needsSystemComboGrant) {
    return errorResult(
      `"${text}" is a system-level shortcut. Request the \`systemKeyCombos\` grant via request_access to use it.`,
      "grant_flag_required",
    );
  }

  const rejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "keyboard",
  );
  if (rejection) {
    return rejection;
  }

  const chord = parseKeyChord(text);
  await adapter.executor.holdKey(chord, seconds * 1000);
  return okText("Key held.");
}
3112
+
/**
 * left_mouse_down: raw button press at the current cursor position (no
 * coordinate argument — position the cursor with mouse_move first).
 *
 * Fails with `state_conflict` when this module already believes the button
 * is held. Otherwise runs the mouse input gates, then a hit-test gate at the
 * cursor location, and only then presses.
 *
 * Why the hit-test: per the original notes, macOS delivers mouseDown to the
 * window under the cursor rather than to the frontmost app. Without it,
 * mouse_move (allowed at any tier) followed by left_mouse_down would amount
 * to a click on a lower-tier window overlapping a fully-granted frontmost
 * app, and since these actions are batchable that bypass would be atomic.
 *
 * On success records `mouseButtonHeld = true` and clears `mouseMoved`.
 */
async function handleLeftMouseDown(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (mouseButtonHeld) {
    return errorResult(
      "mouse button already held, call left_mouse_up first",
      "state_conflict",
    );
  }

  const frontmostRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (frontmostRejection) {
    return frontmostRejection;
  }

  const { x, y } = await adapter.executor.getCursorPosition();
  const hitRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    x,
    y,
    "mouse",
  );
  if (hitRejection) {
    return hitRejection;
  }

  await adapter.executor.mouseDown();
  mouseButtonHeld = true;
  mouseMoved = false;
  return okText("Mouse button pressed.");
}
3153
+
/**
 * left_mouse_up: raw button release at the current cursor position.
 * Releasing when nothing is held is not an error.
 *
 * The button is released on EVERY path, including gate rejections. Leaving
 * it pressed after a rejection would be worse than the single mouseUp: the
 * OS button and `mouseButtonHeld` would both stay set, and any recovery
 * mouse_move would then emit drag events into whatever window is under the
 * cursor — possibly the very window the gate was protecting. The frontmost
 * gate is included because focus can change between press and release.
 *
 * Hit-test strictness depends on whether the cursor moved since the press:
 *   - moved    → this is a drop (a text-injection vector), tested as
 *                "mouse_full", like the destination of left_click_drag;
 *   - unmoved  → this is a plain click release, tested as "mouse", matching
 *                the atomic left_click so a decomposed click behaves the same.
 */
async function handleLeftMouseUp(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  // Shared by the rejection paths and the success path.
  const releaseButton = async (): Promise<void> => {
    await adapter.executor.mouseUp();
    mouseButtonHeld = false;
    mouseMoved = false;
  };

  const frontmostRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (frontmostRejection) {
    await releaseButton();
    return frontmostRejection;
  }

  const { x, y } = await adapter.executor.getCursorPosition();
  const hitRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    x,
    y,
    mouseMoved ? "mouse_full" : "mouse",
  );
  if (hitRejection) {
    await releaseButton();
    return hitRejection;
  }

  await releaseButton();
  return okText("Mouse button released.");
}
3208
+
3209
+ // ---------------------------------------------------------------------------
3210
+ // Batch dispatch
3211
+ // ---------------------------------------------------------------------------
3212
+
/**
 * Action names accepted inside a computer_batch call.
 *
 * Deliberately absent: request_access, open_application, the clipboard tools
 * and list_granted_applications — batching them brings no latency benefit and
 * would complicate the security model.
 *
 * Insertion order matters: the set is spread and joined into the "Allowed:"
 * list of the batch validation error message.
 */
const BATCHABLE_ACTIONS: ReadonlySet<string> = new Set<string>([
  // keyboard / pointer input
  "key",
  "type",
  "mouse_move",
  "left_click",
  "left_click_drag",
  "right_click",
  "middle_click",
  "double_click",
  "triple_click",
  "scroll",
  "hold_key",
  // observation
  "screenshot",
  "cursor_position",
  // raw button control and pacing
  "left_mouse_down",
  "left_mouse_up",
  "wait",
]);
3236
+
/** What a tool call needs from the OS permission layer, as computed by getOsPermissionPlan. */
interface OsPermissionPlan {
  /**
   * Forwarded as the `requestMissing` option of adapter.ensureOsPermissions.
   * getOsPermissionPlan sets it to false for the request_* tools and true for
   * everything else.
   */
  requestMissing: boolean;
  /** The OS permissions the call depends on. */
  required: CuOsPermissionRequirements;
}
3241
+
/**
 * Combines any number of permission requirements into one: a permission is
 * required in the result when at least one input sets it to exactly `true`.
 * With no inputs the result is a fresh copy of NO_OS_PERMISSIONS.
 */
function mergeOsPermissions(
  ...requirements: CuOsPermissionRequirements[]
): CuOsPermissionRequirements {
  let combined: CuOsPermissionRequirements = { ...NO_OS_PERMISSIONS };
  for (const requirement of requirements) {
    combined = {
      accessibility:
        combined.accessibility || requirement.accessibility === true,
      screenRecording:
        combined.screenRecording || requirement.screenRecording === true,
    };
  }
  return combined;
}
3254
+
/**
 * Tells whether a requirement set asks for anything at all, i.e. whether
 * `accessibility` or `screenRecording` is exactly `true`.
 */
function hasRequiredOsPermissions(
  required: CuOsPermissionRequirements,
): boolean {
  if (required.accessibility === true) {
    return true;
  }
  return required.screenRecording === true;
}
3260
+
/**
 * Lists the human-readable names of permissions that are required but not
 * currently granted. "Accessibility" always precedes "Screen Recording".
 *
 * @param required Permissions the action needs.
 * @param state    Permissions currently granted.
 * @returns Zero, one or two labels.
 */
function getMissingOsPermissionLabels(
  required: CuOsPermissionRequirements,
  state: { accessibility: boolean; screenRecording: boolean },
): string[] {
  const candidates = [
    required.accessibility && !state.accessibility
      ? "Accessibility"
      : undefined,
    required.screenRecording && !state.screenRecording
      ? "Screen Recording"
      : undefined,
  ];
  return candidates.filter(
    (label): label is string => label !== undefined,
  );
}
3274
+
/**
 * Renders labels as an English list: "" for none, "A" for one, "A and B" for
 * two, and "A, B, and C" (serial comma) for three or more.
 */
function formatOsPermissionLabels(labels: string[]): string {
  switch (labels.length) {
    case 0:
    case 1:
      return labels[0] ?? "";
    case 2:
      return `${labels[0]} and ${labels[1]}`;
    default: {
      const leading = labels.slice(0, -1).join(", ");
      const last = labels[labels.length - 1];
      return `${leading}, and ${last}`;
    }
  }
}
3284
+
/**
 * Maps a single action name to the OS permissions it depends on.
 *
 * - "screenshot" needs screen recording only.
 * - Click variants need accessibility, plus screen recording when the
 *   `pixelValidation` sub-gate is on.
 * - Other keyboard/pointer input actions need accessibility only.
 * - Everything else — zoom, wait, cursor_position, open_application,
 *   switch_display, list_granted_applications, the clipboard tools, and any
 *   unrecognised name — needs nothing.
 */
function getActionOsPermissions(
  name: string,
  subGates: CuSubGates,
): CuOsPermissionRequirements {
  if (name === "screenshot") {
    return SCREEN_RECORDING_ONLY;
  }

  const clickActions = [
    "left_click",
    "double_click",
    "triple_click",
    "right_click",
    "middle_click",
  ];
  if (clickActions.includes(name)) {
    return subGates.pixelValidation
      ? ACCESSIBILITY_AND_SCREEN_RECORDING
      : ACCESSIBILITY_ONLY;
  }

  const otherInputActions = [
    "type",
    "key",
    "scroll",
    "left_click_drag",
    "mouse_move",
    "hold_key",
    "left_mouse_down",
    "left_mouse_up",
  ];
  if (otherInputActions.includes(name)) {
    return ACCESSIBILITY_ONLY;
  }

  return NO_OS_PERMISSIONS;
}
3322
+
/**
 * Computes the union of OS permissions needed by every entry of a
 * computer_batch call's `actions` array.
 *
 * Tolerant of malformed input: a non-array `actions` counts as empty,
 * non-object entries are ignored, and an entry whose `action` is not a string
 * is looked up under the empty name (which requires nothing).
 */
function getComputerBatchOsPermissions(
  args: Record<string, unknown>,
  subGates: CuSubGates,
): CuOsPermissionRequirements {
  const entries: unknown[] = Array.isArray(args.actions) ? args.actions : [];
  const perAction: CuOsPermissionRequirements[] = [];

  for (const entry of entries) {
    if (typeof entry !== "object" || entry === null) {
      continue;
    }
    const actionName = (entry as Record<string, unknown>).action;
    perAction.push(
      getActionOsPermissions(
        typeof actionName === "string" ? actionName : "",
        subGates,
      ),
    );
  }

  return mergeOsPermissions(...perAction);
}
3341
+
/**
 * Computes the OS permissions needed by one teach step.
 *
 * The step's `actions` are merged the same way a computer_batch's are
 * (malformed entries ignored). Whenever the `actions` array is non-empty —
 * counted by raw length, malformed entries included — screen recording is
 * added on top.
 */
function getTeachStepOsPermissions(
  args: Record<string, unknown>,
  subGates: CuSubGates,
): CuOsPermissionRequirements {
  const entries: unknown[] = Array.isArray(args.actions) ? args.actions : [];
  const perAction: CuOsPermissionRequirements[] = [];

  for (const entry of entries) {
    if (typeof entry !== "object" || entry === null) {
      continue;
    }
    const actionName = (entry as Record<string, unknown>).action;
    perAction.push(
      getActionOsPermissions(
        typeof actionName === "string" ? actionName : "",
        subGates,
      ),
    );
  }

  const actionPermissions = mergeOsPermissions(...perAction);
  if (entries.length === 0) {
    return actionPermissions;
  }
  return mergeOsPermissions(actionPermissions, SCREEN_RECORDING_ONLY);
}
3363
+
/**
 * Computes the OS permissions needed by a teach_batch call: the union of
 * every step's requirements (see getTeachStepOsPermissions), plus screen
 * recording when at least one step carries a non-empty `actions` array.
 *
 * A non-array `steps` counts as empty; non-object steps are ignored.
 */
function getTeachBatchOsPermissions(
  args: Record<string, unknown>,
  subGates: CuSubGates,
): CuOsPermissionRequirements {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;

  const stepList: unknown[] = Array.isArray(args.steps) ? args.steps : [];
  const stepPermissions = mergeOsPermissions(
    ...stepList
      .filter(isRecord)
      .map((step) => getTeachStepOsPermissions(step, subGates)),
  );

  let anyStepHasActions = false;
  for (const step of stepList) {
    if (!isRecord(step)) {
      continue;
    }
    const stepActions = step.actions;
    if (Array.isArray(stepActions) && stepActions.length > 0) {
      anyStepHasActions = true;
      break;
    }
  }

  if (!anyStepHasActions) {
    return stepPermissions;
  }
  return mergeOsPermissions(stepPermissions, SCREEN_RECORDING_ONLY);
}
3388
+
/**
 * Builds the OS permission plan for a tool call.
 *
 * - request_access / request_teach_access: both permissions are considered
 *   required, but `requestMissing` is false.
 * - computer_batch, teach_step, teach_batch: requirements are derived from
 *   the nested actions/steps in `args`.
 * - Any other tool: requirements come from the tool name alone.
 *
 * All non-request tools set `requestMissing` to true.
 */
function getOsPermissionPlan(
  name: string,
  args: Record<string, unknown>,
  subGates: CuSubGates,
): OsPermissionPlan {
  switch (name) {
    case "request_access":
    case "request_teach_access":
      return {
        required: ACCESSIBILITY_AND_SCREEN_RECORDING,
        requestMissing: false,
      };

    case "computer_batch":
      return {
        required: getComputerBatchOsPermissions(args, subGates),
        requestMissing: true,
      };

    case "teach_step":
      return {
        required: getTeachStepOsPermissions(args, subGates),
        requestMissing: true,
      };

    case "teach_batch":
      return {
        required: getTeachBatchOsPermissions(args, subGates),
        requestMissing: true,
      };

    default:
      return {
        required: getActionOsPermissions(name, subGates),
        requestMissing: true,
      };
  }
}
3423
+
/** Outcome of one action executed inside a computer_batch call. */
interface BatchActionResult {
  /** The action name as given in the batch entry. */
  action: string;
  /** True when the inner result was not flagged `isError`. */
  ok: boolean;
  /** Text of the inner result's first content item ("" when it is not text). */
  output: string;
}
3429
+
/**
 * computer_batch: runs `actions: [{action, …}, …]` sequentially within a
 * single tool call, saving one model→API round trip per action.
 *
 * Validation: `actions` must be a non-empty array of objects whose `action`
 * is a string listed in BATCHABLE_ACTIONS. The whole batch is validated
 * before anything executes.
 *
 * Gate semantics:
 *   - Kill-switch and TCC are checked once by handleToolCall before this
 *     handler runs.
 *   - prepareForAction runs once, up front, when `hideBeforeAction` is on.
 *   - Inner actions are dispatched with `hideBeforeAction`, `pixelValidation`
 *     and `autoTargetDisplay` forced off. Pixel comparison against the
 *     pre-batch screenshot would false-positive mid-batch, and click
 *     coordinates must stay anchored to that pre-batch screenshot.
 *   - Per the original notes, the frontmost gate still runs for every inner
 *     action because runInputActionGates checks it unconditionally. It is the
 *     safety net when an earlier action brings a non-allowed app forward.
 *
 * Mid-batch screenshots are allowed, but their `.screenshot` payload is
 * dropped so they never replace the coordinate reference.
 *
 * Termination:
 *   - User abort (`overrides.isAborted`): release any held mouse button and
 *     return an error naming how many actions completed.
 *   - First inner `isError`: stop, release any held mouse button, and return
 *     `{completed, failed, remaining}` together with the inner telemetry so
 *     the failure kind is not lost.
 *   - Inner dispatch THROWS: release any held mouse button (best effort),
 *     then rethrow so the caller's fail-closed handling reports the error.
 *     Without this, an exception between left_mouse_down and left_mouse_up
 *     would leave the OS button pressed.
 *   - Otherwise: `{completed}` with every result.
 */
async function handleComputerBatch(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const actions = args.actions;
  if (!Array.isArray(actions) || actions.length === 0) {
    return errorResult("actions must be a non-empty array", "bad_args");
  }

  // Validate everything first so a malformed entry never leaves the batch
  // half-executed.
  for (const [i, act] of actions.entries()) {
    if (typeof act !== "object" || act === null) {
      return errorResult(`actions[${i}] must be an object`, "bad_args");
    }
    const action = (act as Record<string, unknown>).action;
    if (typeof action !== "string") {
      return errorResult(`actions[${i}].action must be a string`, "bad_args");
    }
    if (!BATCHABLE_ACTIONS.has(action)) {
      return errorResult(
        `actions[${i}].action="${action}" is not allowed in a batch. ` +
          `Allowed: ${[...BATCHABLE_ACTIONS].join(", ")}.`,
        "bad_args",
      );
    }
  }

  // prepareForAction once for the whole batch; inner dispatches skip it via
  // hideBeforeAction:false below.
  if (subGates.hideBeforeAction) {
    const hidden = await adapter.executor.prepareForAction(
      overrides.allowedApps.map((a) => a.bundleId),
      overrides.selectedDisplayId,
    );
    if (hidden.length > 0) {
      overrides.onAppsHidden?.(hidden);
    }
  }

  const batchSubGates: CuSubGates = {
    ...subGates,
    hideBeforeAction: false,
    pixelValidation: false,
    // A display switch mid-batch would be inconsistent with earlier clicks
    // that were scaled against the pre-batch lastScreenshot.
    autoTargetDisplay: false,
  };

  const results: BatchActionResult[] = [];
  for (const [i, act] of actions.entries()) {
    // Per the original notes, a user Stop aborts the host's await but not
    // this loop; without this check the remaining actions would still fire.
    if (overrides.isAborted?.()) {
      await releaseHeldMouse(adapter);
      return errorResult(
        `Batch aborted after ${results.length} of ${actions.length} actions (user interrupt).`,
      );
    }

    // Short settle between steps: synthetic events post instantly, and some
    // apps need a moment to process step N before step N+1 arrives (e.g. a
    // click that opens a menu followed by a click on a menu item).
    if (i > 0) await sleep(10);

    const actionArgs = act as Record<string, unknown>;
    const action = actionArgs.action as string;

    let dispatched: CuCallToolResult;
    try {
      dispatched = await dispatchAction(
        action,
        actionArgs,
        adapter,
        overrides,
        batchSubGates,
      );
    } catch (err) {
      // Same cleanup as the isError path below. The release is best effort:
      // if it fails too, log that and still surface the original exception.
      try {
        await releaseHeldMouse(adapter);
      } catch (releaseErr) {
        adapter.logger.error(
          `[${adapter.serverName}] computer_batch: could not release held mouse after actions[${i}] threw`,
          releaseErr,
        );
      }
      throw err;
    }

    // Drop any piggybacked screenshot; coordinates stay anchored to the
    // pre-batch lastScreenshot.
    const { screenshot: _dropped, ...inner } = dispatched;

    const text = firstTextContent(inner);
    const result = { action, ok: !inner.isError, output: text };
    results.push(result);

    if (inner.isError) {
      // Stop on first error. The failure may land between a mouse-down and
      // its mouse-up, so release before reporting.
      await releaseHeldMouse(adapter);
      return okJson(
        {
          completed: results.slice(0, -1),
          failed: result,
          remaining: actions.length - results.length,
        },
        inner.telemetry,
      );
    }
  }

  return okJson({ completed: results });
}
3570
+
/**
 * Returns the text of the result's first content item, or "" when there is
 * no first item or it is not of type "text". Later items are never consulted.
 */
function firstTextContent(r: CuCallToolResult): string {
  const head = r.content[0];
  if (!head || head.type !== "text") {
    return "";
  }
  return head.text;
}
3575
+
/**
 * Routes an action name to its handler. Shared by handleToolCall and
 * handleComputerBatch, and meant to run only after the kill-switch and TCC
 * gates have passed. request_access never arrives here — handleToolCall
 * handles it directly because it needs the TCC state threaded through.
 *
 * Unknown names produce a `bad_args` error result.
 */
async function dispatchAction(
  name: string,
  a: Record<string, unknown>,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  switch (name) {
    // ── Observation / pacing ────────────────────────────────────────────
    case "screenshot":
      return handleScreenshot(adapter, overrides, subGates);
    case "zoom":
      return handleZoom(adapter, a, overrides);
    case "cursor_position":
      return handleCursorPosition(adapter, overrides);
    case "wait":
      return handleWait(a);

    // ── Clicks: (button, click count) ───────────────────────────────────
    case "left_click":
      return handleClickVariant(adapter, a, overrides, subGates, "left", 1);
    case "double_click":
      return handleClickVariant(adapter, a, overrides, subGates, "left", 2);
    case "triple_click":
      return handleClickVariant(adapter, a, overrides, subGates, "left", 3);
    case "right_click":
      return handleClickVariant(adapter, a, overrides, subGates, "right", 1);
    case "middle_click":
      return handleClickVariant(adapter, a, overrides, subGates, "middle", 1);

    // ── Other pointer input ─────────────────────────────────────────────
    case "mouse_move":
      return handleMoveMouse(adapter, a, overrides, subGates);
    case "left_click_drag":
      return handleDrag(adapter, a, overrides, subGates);
    case "scroll":
      return handleScroll(adapter, a, overrides, subGates);
    case "left_mouse_down":
      return handleLeftMouseDown(adapter, overrides, subGates);
    case "left_mouse_up":
      return handleLeftMouseUp(adapter, overrides, subGates);

    // ── Keyboard input ──────────────────────────────────────────────────
    case "type":
      return handleType(adapter, a, overrides, subGates);
    case "key":
      return handleKey(adapter, a, overrides, subGates);
    case "hold_key":
      return handleHoldKey(adapter, a, overrides, subGates);

    // ── Clipboard ───────────────────────────────────────────────────────
    case "read_clipboard":
      return handleReadClipboard(adapter, overrides, subGates);
    case "write_clipboard":
      return handleWriteClipboard(adapter, a, overrides, subGates);

    // ── Applications / displays ─────────────────────────────────────────
    case "open_application":
      return handleOpenApplication(adapter, a, overrides);
    case "switch_display":
      return handleSwitchDisplay(adapter, a, overrides);
    case "list_granted_applications":
      return handleListGrantedApplications(overrides);

    // ── Batch ───────────────────────────────────────────────────────────
    case "computer_batch":
      return handleComputerBatch(adapter, a, overrides, subGates);

    default:
      return errorResult(`Unknown tool "${name}".`, "bad_args");
  }
}
3658
+
3659
+ // ---------------------------------------------------------------------------
3660
+ // Main dispatch
3661
+ // ---------------------------------------------------------------------------
3662
+
/**
 * Entry point for every computer-use tool call. Normalizes the allowlist,
 * runs the global gates in order, then dispatches to the tool's handler.
 *
 * 0. Allowlist normalization (only when at least one grant needs it):
 *    - drop grants for apps the user has since denied, so a grant from an
 *      earlier session cannot outlive the deny;
 *    - drop grants for policy-denied apps, for the same reason;
 *    - backfill `tier` on legacy grants that predate the field, because an
 *      undefined tier would otherwise be treated as "full" (per the
 *      original notes on tierSatisfies).
 * 1. Kill switch: `adapter.isDisabled()`.
 * 2. TCC: only the OS permissions this call actually needs are ensured.
 *    Ordinary tools fail with `tcc_not_granted`, naming what is missing. The
 *    request_* tools instead carry the current state through to their
 *    handlers so the permission UI can be driven from there.
 * 3. Global lock: at most one session uses computer control at a time.
 *    Every tool is refused while another session holds the lock. Tools for
 *    which `defersLockAcquire` is true check without acquiring; any other
 *    tool acquires a free lock and clears stale mouse-button state left by a
 *    previous holder. This package never releases the lock. When the host
 *    supplies no `checkCuLock`, the gate is skipped.
 * 4. Fail-closed dispatch: any exception thrown by a handler becomes an
 *    `executor_threw` tool error, never an implicit success.
 *
 * The clipboard guard is intentionally NOT run here; per the original notes
 * it runs per action inside the input gates and clipboard handlers.
 *
 * @param adapter      Host adapter (executor, logger, settings, permissions).
 * @param name         Tool name.
 * @param args         Raw tool arguments; coerced with asRecord.
 * @param rawOverrides Per-call overrides as supplied by the host.
 * @returns The tool result; handler failures are reported as error results.
 */
export async function handleToolCall(
  adapter: ComputerUseHostAdapter,
  name: string,
  args: unknown,
  rawOverrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const { logger, serverName } = adapter;

  // ─── Allowlist normalization ─────────────────────────────────────────
  const userDeniedSet = new Set(rawOverrides.userDeniedBundleIds);
  const needsNormalization = rawOverrides.allowedApps.some(
    (app) =>
      app.tier === undefined ||
      userDeniedSet.has(app.bundleId) ||
      isPolicyDenied(app.bundleId, app.displayName),
  );
  const normalizeAllowedApps = () =>
    rawOverrides.allowedApps
      .filter(
        (app) =>
          !userDeniedSet.has(app.bundleId) &&
          !isPolicyDenied(app.bundleId, app.displayName),
      )
      .map((app) =>
        app.tier === undefined
          ? { ...app, tier: getDefaultTierForApp(app.bundleId, app.displayName) }
          : app,
      );
  // When nothing needs fixing, the caller's object is passed through as-is.
  const overrides: ComputerUseOverrides = needsNormalization
    ? { ...rawOverrides, allowedApps: normalizeAllowedApps() }
    : rawOverrides;

  // ─── Gate 1: kill switch ─────────────────────────────────────────────
  if (adapter.isDisabled()) {
    return errorResult(
      "Computer control is disabled in Settings. Enable it and try again.",
      "other",
    );
  }

  const a = asRecord(args);
  // Read fresh on every call so a flag change takes effect mid-session.
  const subGates = adapter.getSubGates();

  // ─── Gate 2: TCC ─────────────────────────────────────────────────────
  const isRequestTool =
    name === "request_access" || name === "request_teach_access";
  const { required, requestMissing } = getOsPermissionPlan(name, a, subGates);
  let tccState:
    | { accessibility: boolean; screenRecording: boolean }
    | undefined;
  if (hasRequiredOsPermissions(required)) {
    const osPerms = await adapter.ensureOsPermissions(required, {
      requestMissing,
    });
    if (!osPerms.granted) {
      if (!isRequestTool) {
        // Report only the subset that is actually missing.
        const missing = getMissingOsPermissionLabels(required, osPerms);
        const labels = formatOsPermissionLabels(missing);
        const plural =
          missing.length === 1 ? "permission is" : "permissions are";
        return errorResult(
          `${labels} ${plural} required for this computer-use action. ` +
            "Call request_access to show the permission panel.",
          "tcc_not_granted",
        );
      }
      // request_* tools: hand the current state to the handler instead.
      tccState = {
        accessibility: osPerms.accessibility,
        screenRecording: osPerms.screenRecording,
      };
    }
  }

  // ─── Gate 3: global CU lock ──────────────────────────────────────────
  const deferAcquire = defersLockAcquire(name);
  const lock = overrides.checkCuLock?.();
  if (lock) {
    const lockIsFree = lock.holder === undefined;
    if (!lockIsFree && !lock.isSelf) {
      return errorResult(
        "Another Gclm Code session is currently using the computer. Wait for " +
          "the user to acknowledge it is finished (stop button in the Gclm Code " +
          "window), or find a non-computer-use approach if one is readily " +
          "apparent.",
        "cu_lock_held",
      );
    }
    if (lockIsFree && !deferAcquire) {
      overrides.acquireCuLock?.();
      // New holder: mouse-button state left by a previous session (e.g. a
      // stop in the middle of a drag) is stale and would cause a spurious
      // "already held" error.
      resetMouseButtonHeld();
    }
    // Otherwise: already held by this session, or free with acquisition
    // deferred to the first action tool — proceed either way.
  }

  logger.silly(
    `[${serverName}] tool=${name} args=${JSON.stringify(a).slice(0, 200)}`,
  );

  // ─── Fail-closed dispatch ────────────────────────────────────────────
  try {
    switch (name) {
      // The request_* tools need tccState; the teach tools are blocking UI
      // tools. None of the four is routed through dispatchAction.
      case "request_access":
        return await handleRequestAccess(adapter, a, overrides, tccState);
      case "request_teach_access":
        return await handleRequestTeachAccess(adapter, a, overrides, tccState);
      case "teach_step":
        return await handleTeachStep(adapter, a, overrides, subGates);
      case "teach_batch":
        return await handleTeachBatch(adapter, a, overrides, subGates);
      default:
        return await dispatchAction(name, a, adapter, overrides, subGates);
    }
  } catch (err) {
    // Whatever threw, the caller sees a tool error.
    const msg = err instanceof Error ? err.message : String(err);
    logger.error(`[${serverName}] tool=${name} threw: ${msg}`, err);
    return errorResult(`Tool "${name}" failed: ${msg}`, "executor_threw");
  }
}
3855
+
/**
 * Internal helpers re-exported under a single object so unit tests can reach
 * them. Not intended as public API.
 */
export const _test = {
  // coordinate helpers
  scaleCoord,
  coordToPercentageForPixelCompare,
  // text helpers
  segmentGraphemes,
  decodedByteLength,
  // access requests and tier guidance
  resolveRequestedApps,
  buildAccessRequest,
  buildTierGuidanceMessage,
  buildUserDeniedGuidance,
  tierSatisfies,
  looksLikeBundleId,
  // argument parsing
  extractCoordinate,
  parseKeyChord,
  // display handling
  buildMonitorNote,
  handleSwitchDisplay,
  uniqueDisplayLabels,
};