nebula-ai-plugin-system 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/browser.ts ADDED
@@ -0,0 +1,713 @@
1
+ import { type ChildProcess, spawn, spawnSync } from 'node:child_process'
2
+ import {
3
+ closeSync,
4
+ existsSync,
5
+ mkdirSync,
6
+ openSync,
7
+ readFileSync,
8
+ readdirSync,
9
+ rmSync,
10
+ statSync,
11
+ } from 'node:fs'
12
+ import { tmpdir } from 'node:os'
13
+ import { delimiter, join } from 'node:path'
14
+ import { type ToolDef, type VisionInferFn, coerceBool, coerceInt, redactEnv } from 'nebula-ai-core'
15
+ import { z } from 'zod'
16
+ import { sniffMimeFromBytes } from './vision'
17
+
18
+ /**
19
+ * Phase 9.4 + Task #74 browser tools. Wraps the `agent-browser` CLI with
20
+ * hermes-grade resilience: PATH-walker for unlinked Homebrew node@N installs,
21
+ * per-session AGENT_BROWSER_SOCKET_DIR (sidesteps macOS 104-byte AF_UNIX
22
+ * limit), stdout/stderr to temp files (avoids daemon-fd pipe deadlock),
23
+ * optional `NEBULA_BROWSER_CDP_URL` override for connecting to a user-supplied
24
+ * CDP endpoint, and on-exit cleanup of the spawned daemon.
25
+ *
26
+ * Defaults to local headless Chromium via `agent-browser --session`. Set
27
+ * `NEBULA_BROWSER_CDP_URL` to opt into CDP override (e.g. qutebrowser proxy,
28
+ * Browserbase websocket).
29
+ */
30
+
/** Injection points shared by every browser.* tool factory in this module. */
interface BrowserDeps {
  /**
   * Explicit path to the agent-browser binary. When unset, the binary is
   * resolved on every call by the lookup ladder in `findAgentBrowser`.
   */
  binPath?: string
  /** Directory the child process is spawned in. Falls back to `process.cwd()`. */
  cwd?: string
  /** Per-command timeout in milliseconds. Falls back to 60000. */
  timeoutMs?: number
}
39
+
/** Outcome of one agent-browser invocation, as handed back to tool callers. */
interface RunResult {
  /** Whether the invocation is considered successful. */
  ok: boolean
  /** Captured process output; absent when the process never produced a result. */
  data?: { stdout: string; stderr?: string; exit_code: number | null }
  /** Human-readable failure reason, set on resolution, spawn, or timeout failures. */
  error?: string
}
45
+
/** Timeout for a single agent-browser command when the caller supplies none. */
const DEFAULT_TIMEOUT_MS = 60_000

/**
 * Conventional binary directories consulted in addition to `$PATH`, both when
 * locating agent-browser and when building the child process's PATH.
 */
const SANE_PATH_DIRS = [
  '/opt/homebrew/bin',
  '/opt/homebrew/sbin',
  '/usr/local/bin',
  '/usr/local/sbin',
  '/usr/bin',
  '/usr/sbin',
  '/bin',
  '/sbin',
]
57
+
// Module-level state, shared by every tool created from this file.
/** Session name passed to `agent-browser --session`; generated on first use. */
let cachedSessionName: string | null = null
/** Directory holding the daemon socket and per-call output files; created on first use. */
let cachedSocketDir: string | null = null
/** Guards against installing the process exit/signal hooks more than once. */
let cleanupRegistered = false
61
+
/**
 * List the `bin` directories of Homebrew entries under `/opt/homebrew/opt`
 * whose name starts with "node" (the plain `node` entry is excluded).
 *
 * @returns Existing `bin` directories, or an empty array when the Homebrew
 *   opt directory is missing or cannot be read.
 */
function discoverHomebrewNodeDirs(): string[] {
  const optRoot = '/opt/homebrew/opt'
  if (!existsSync(optRoot)) return []

  const binDirs: string[] = []
  try {
    for (const entry of readdirSync(optRoot)) {
      if (entry === 'node' || !entry.startsWith('node')) continue
      const binDir = join(optRoot, entry, 'bin')
      if (existsSync(binDir)) binDirs.push(binDir)
    }
  } catch {
    return []
  }
  return binDirs
}
74
+
/**
 * Return the first `<dir>/<name>` that is a regular file, searching `dirs`
 * in order.
 *
 * `statSync` follows symlinks, so a dangling symlink (e.g. a brew link whose
 * target moved during `brew upgrade`) is skipped rather than returned.
 *
 * @param name File name to look for.
 * @param dirs Directories to probe, highest priority first.
 * @returns The matching path, or null when no directory contains the file.
 */
function whichIn(name: string, dirs: string[]): string | null {
  for (const dir of dirs) {
    const candidate = join(dir, name)
    try {
      // throwIfNoEntry:false only silences "no such entry"; other failures
      // (permission denied, symlink loops, a PATH entry that is a file)
      // still throw, so they are caught here and the entry is skipped.
      if (statSync(candidate, { throwIfNoEntry: false })?.isFile()) return candidate
    } catch {
      // Unreadable candidate: treat as "not here" and keep walking.
    }
  }
  return null
}
87
+
/**
 * Resolve the `agent-browser` CLI.
 *
 * Lookup order:
 *   1. `override`, returned as-is without touching the filesystem.
 *   2. `node_modules/.bin/agent-browser`, then
 *      `node_modules/agent-browser/bin/agent-browser.js`, under each candidate
 *      root: the effective cwd, `process.cwd()`, and `<cwd>/nebula`. The last
 *      one covers a daemon that boots from `$HOME` while the workspace tree
 *      lives one level deeper.
 *   3. The same two layouts under Bun's global install directory
 *      (`$HOME/.bun/install/global/node_modules`), which is not on `$PATH`.
 *   4. Every directory on `$PATH`.
 *   5. Homebrew node dirs and the well-known system bin dirs.
 *
 * The result is deliberately NOT cached: resolution is a handful of syscalls,
 * and a cached path can go stale when `brew upgrade` runs in another shell.
 *
 * @param override Explicit binary path supplied by the caller.
 * @param cwdOverride Root to search instead of `process.cwd()`; intended as a
 *   test/registration hook.
 * @returns The resolved path, or null when nothing was found.
 */
function findAgentBrowser(override?: string, cwdOverride?: string): string | null {
  if (override) return override

  // throwIfNoEntry:false only silences a missing entry; permission errors and
  // similar still throw. A probe must never abort the whole ladder, so those
  // are treated as "not a usable file".
  const isFile = (path: string): boolean => {
    try {
      return statSync(path, { throwIfNoEntry: false })?.isFile() ?? false
    } catch {
      return false
    }
  }
  // Both supported install layouts beneath a node_modules directory, in
  // priority order: the .bin shim first, then the package's own entry script.
  const probeNodeModules = (nodeModules: string): string | null => {
    const layouts = [
      join(nodeModules, '.bin', 'agent-browser'),
      join(nodeModules, 'agent-browser', 'bin', 'agent-browser.js'),
    ]
    return layouts.find(isFile) ?? null
  }

  const cwd = cwdOverride ?? process.cwd()
  const roots = Array.from(new Set([cwd, process.cwd(), join(cwd, 'nebula')]))
  for (const root of roots) {
    const local = probeNodeModules(join(root, 'node_modules'))
    if (local) return local
  }

  const homeDir = process.env.HOME
  if (homeDir) {
    const bunGlobal = probeNodeModules(join(homeDir, '.bun', 'install', 'global', 'node_modules'))
    if (bunGlobal) return bunGlobal
  }

  const pathDirs = (process.env.PATH ?? '').split(delimiter).filter(Boolean)
  const inPath = whichIn('agent-browser', pathDirs)
  if (inPath) return inPath

  const extraDirs = [...discoverHomebrewNodeDirs(), ...SANE_PATH_DIRS].filter(d => existsSync(d))
  return whichIn('agent-browser', extraDirs)
}
158
+
/**
 * Report whether the `agent-browser` binary can be located on this machine.
 * Used to gate browser.* tool registration so installs without the binary do
 * not fail on the first browser.* call.
 *
 * @param cwdOverride Root to search instead of `process.cwd()`. The daemon's
 *   cwd is not always the workspace root (a sandbox harness may boot from the
 *   home directory while `node_modules/.bin` lives one level deeper), so the
 *   plugin loader can pass its workspace root here.
 * @returns True when a binary path was resolved.
 */
export function isBrowserAvailable(cwdOverride?: string): boolean {
  const resolved = findAgentBrowser(undefined, cwdOverride)
  return resolved !== null
}
176
+
/**
 * Resolve the `agent-browser` binary and return its path.
 *
 * Intended for plugin loaders: resolve once at registration time and pass the
 * result as `binPath` to each tool factory, so per-call spawns do not repeat a
 * search that would fail when the daemon cwd differs from the workspace root.
 *
 * @param cwdOverride Root to search instead of `process.cwd()`.
 * @returns The resolved path, or null when the binary was not found.
 */
export function findAgentBrowserOrNull(cwdOverride?: string): string | null {
  const resolved = findAgentBrowser(undefined, cwdOverride)
  return resolved
}
186
+
/**
 * Pick the base directory for the per-session socket dir: the fixed `/tmp` on
 * macOS, the OS temp directory everywhere else.
 */
function socketSafeTmpdir(): string {
  return process.platform === 'darwin' ? '/tmp' : tmpdir()
}
191
+
/**
 * Produce `bytes` random bytes, rendered as a lowercase hex string.
 *
 * @param bytes Number of random bytes to draw.
 * @returns A string of `2 * bytes` hex digits.
 */
function randomHex(bytes: number): string {
  const raw = crypto.getRandomValues(new Uint8Array(bytes))
  let hex = ''
  for (const byte of raw) {
    hex += byte.toString(16).padStart(2, '0')
  }
  return hex
}
197
+
/**
 * Return the agent-browser session name, generating and caching an
 * `a_<10 hex digits>` name on first use.
 */
function getSessionName(): string {
  const name = cachedSessionName || `a_${randomHex(5)}`
  cachedSessionName = name
  return name
}
203
+
/**
 * Return the per-session socket directory, creating it on first use.
 *
 * The first call creates `<tmp>/agent-browser-<session>` with mode 0o700,
 * caches the path, and installs the process exit/signal cleanup hooks.
 */
function getSocketDir(): string {
  if (cachedSocketDir) return cachedSocketDir

  const sessionDir = join(socketSafeTmpdir(), `agent-browser-${getSessionName()}`)
  mkdirSync(sessionDir, { recursive: true, mode: 0o700 })
  cachedSocketDir = sessionDir
  registerCleanup()
  return sessionDir
}
212
+
/**
 * Install process hooks, once, that close the agent-browser session and remove
 * the socket directory when this process exits or receives SIGINT/SIGTERM.
 *
 * The signal handlers exit with the conventional 128+signal codes (130/143).
 */
function registerCleanup(): void {
  if (cleanupRegistered) return
  cleanupRegistered = true

  // The signal handlers run cleanup and then call process.exit(), which fires
  // the 'exit' listener as well. Without this latch the blocking close command
  // (up to 5s) would be issued twice on every Ctrl-C / SIGTERM.
  let cleanedUp = false
  const cleanup = () => {
    if (cleanedUp) return
    cleanedUp = true
    try {
      // NOTE(review): resolved without any `binPath` override, so a binary
      // only reachable through a caller-supplied path is not closed here.
      const bin = findAgentBrowser()
      const sess = cachedSessionName
      // With a CDP override there is no locally spawned session to close.
      if (bin && sess && !process.env.NEBULA_BROWSER_CDP_URL) {
        try {
          // spawnSync so the daemon actually receives `close` before we exit.
          // Async + detached drops the message: the parent exits before the
          // child IPC connects to the daemon socket. 5s cap prevents hangs
          // on a frozen daemon.
          spawnSync(bin, ['--session', sess, 'close'], {
            stdio: 'ignore',
            env: cachedSocketDir
              ? { ...process.env, AGENT_BROWSER_SOCKET_DIR: cachedSocketDir }
              : process.env,
            timeout: 5000,
          })
        } catch {}
      }
      if (cachedSocketDir) {
        rmSync(cachedSocketDir, { recursive: true, force: true })
      }
    } catch {}
  }

  process.on('exit', cleanup)
  process.on('SIGINT', () => {
    cleanup()
    process.exit(130)
  })
  process.on('SIGTERM', () => {
    cleanup()
    process.exit(143)
  })
}
250
+
/**
 * Build the environment for a spawned agent-browser process.
 *
 * Starts from the redacted copy of `process.env`, prepends any existing
 * Homebrew-node / well-known bin directories that are missing from PATH, and
 * points the CLI at this session's socket directory.
 *
 * @param socketDir Value for `AGENT_BROWSER_SOCKET_DIR`.
 * @returns The environment object to pass to `spawn`.
 */
function buildBrowserEnv(socketDir: string): NodeJS.ProcessEnv {
  const { env } = redactEnv(process.env as Record<string, string>)
  const existing = (env.PATH ?? '').split(delimiter).filter(Boolean)

  // Collect the missing directories in candidate order and prepend them as a
  // group. Unshifting one at a time inside the loop would reverse the list,
  // putting /sbin ahead of the Homebrew node dirs that should win.
  const seen = new Set(existing)
  const missing: string[] = []
  for (const dir of [...discoverHomebrewNodeDirs(), ...SANE_PATH_DIRS]) {
    if (seen.has(dir) || !existsSync(dir)) continue
    seen.add(dir)
    missing.push(dir)
  }

  return {
    ...env,
    PATH: [...missing, ...existing].join(delimiter),
    AGENT_BROWSER_SOCKET_DIR: socketDir,
  }
}
264
+
/**
 * Read a file as UTF-8 text, yielding an empty string when it cannot be read.
 *
 * @param path File to read.
 * @returns The file contents, or '' on any read error.
 */
function readFileSafe(path: string): string {
  let text = ''
  try {
    text = readFileSync(path, { encoding: 'utf8' })
  } catch {
    // Best-effort read: a missing or unreadable file counts as empty.
  }
  return text
}
272
+
/**
 * Remove a file on a best-effort basis; any failure is ignored.
 *
 * @param path File to delete. A missing path is not an error.
 */
function rmSafe(path: string): void {
  const removeOpts = { force: true }
  try {
    rmSync(path, removeOpts)
  } catch {
    // Deliberately ignored: callers use this for temp-file cleanup only.
  }
}
278
+
/** Optional behavior switches for `runAgentBrowser`. */
interface RunOpts {
  /**
   * Milliseconds to pause, via `agent-browser wait <ms>`, after a successful
   * primary command so page transitions (navigation, JS-handled form submits)
   * settle before the next snapshot. Omit or pass 0 to skip the pause.
   */
  settleAfterMs?: number
}
287
+
/**
 * Run one agent-browser command, optionally followed by a settle wait.
 *
 * The primary command's outcome is what gets reported. When it succeeds and
 * `opts.settleAfterMs` is positive, `agent-browser wait <ms>` runs afterwards
 * (capped at 10 seconds). A failed wait does not invalidate the action that
 * just succeeded, but it is noted at the end of the returned `stderr`.
 *
 * @param command agent-browser sub-command (e.g. `open`, `click`).
 * @param extraArgs Arguments appended after the sub-command.
 * @param deps Binary path / cwd / timeout overrides.
 * @param opts Settle-wait configuration.
 * @returns The primary command's result.
 */
async function runAgentBrowser(
  command: string,
  extraArgs: string[],
  deps: BrowserDeps,
  opts: RunOpts = {},
): Promise<RunResult> {
  const result = await runAgentBrowserOnce(command, extraArgs, deps)

  // `> 0` also rejects NaN and negative values, which must not be forwarded
  // to `wait` as a duration.
  const requestedMs = opts.settleAfterMs ?? 0
  if (!result.ok || !(requestedMs > 0)) return result

  const settleMs = Math.min(requestedMs, 10_000)
  const settle = await runAgentBrowserOnce('wait', [String(settleMs)], deps)
  if (settle.ok || !result.data) return result

  // Best-effort settle: keep the primary result, but make the failed wait
  // visible to the caller instead of silently discarding it.
  const reason = settle.error ?? `exit code ${settle.data?.exit_code ?? 'unknown'}`
  const note = `[settle wait ${settleMs}ms failed: ${reason}]`
  const stderr = result.data.stderr ? `${result.data.stderr}\n${note}` : note
  return { ...result, data: { ...result.data, stderr } }
}
303
+
/**
 * Spawn the agent-browser CLI once and collect its output.
 *
 * stdout/stderr are redirected to temp files inside the session socket
 * directory rather than pipes, then read back (tail-truncated) when the
 * process closes. The backend is `--cdp <url>` when `NEBULA_BROWSER_CDP_URL`
 * is set, otherwise `--session <name>`.
 *
 * @param command agent-browser sub-command.
 * @param extraArgs Arguments appended after the sub-command.
 * @param deps Binary path / cwd / timeout overrides.
 * @returns `ok: true` only for exit code 0 without a timeout; otherwise
 *   `ok: false` with `error` and/or the captured output.
 */
async function runAgentBrowserOnce(
  command: string,
  extraArgs: string[],
  deps: BrowserDeps,
): Promise<RunResult> {
  const bin = findAgentBrowser(deps.binPath)
  if (!bin) {
    return {
      ok: false,
      error:
        'agent-browser CLI not found in node_modules/.bin or PATH. Re-run `nebula upgrade` to repair, or `bun install` in the workspace root if running from source.',
    }
  }

  const notExecutableError =
    'agent-browser binary not executable at resolved path. Re-run `nebula upgrade` (sandbox) or `bun install` (host) to repair the workspace install.'
  const errorText = (err: unknown): string => (err instanceof Error ? err.message : String(err))
  const closeQuietly = (fd: number): void => {
    if (fd < 0) return
    try {
      closeSync(fd)
    } catch {}
  }

  const cdpOverride = process.env.NEBULA_BROWSER_CDP_URL
  const backendArgs = cdpOverride ? ['--cdp', cdpOverride] : ['--session', getSessionName()]

  const socketDir = getSocketDir()
  const sanitizedCmd = command.replace(/[^a-z0-9_-]/gi, '_')
  // Timestamp alone can repeat when the same command is issued concurrently
  // within one millisecond; the random suffix keeps the output files apart.
  const stamp = `${Date.now()}_${Math.random().toString(16).slice(2, 10)}`
  const stdoutPath = join(socketDir, `_stdout_${sanitizedCmd}_${stamp}`)
  const stderrPath = join(socketDir, `_stderr_${sanitizedCmd}_${stamp}`)

  // `bin` is passed to spawn() as a single argv0 (no shell tokenizing), so a
  // path containing spaces is safe.
  const fullArgs = [...backendArgs, command, ...extraArgs]
  const env = buildBrowserEnv(socketDir)

  let stdoutFd = -1
  let stderrFd = -1
  try {
    stdoutFd = openSync(stdoutPath, 'w', 0o600)
    stderrFd = openSync(stderrPath, 'w', 0o600)
  } catch (err) {
    // If the second open failed, the first descriptor and file already exist.
    closeQuietly(stdoutFd)
    rmSafe(stdoutPath)
    rmSafe(stderrPath)
    return { ok: false, error: `failed to open browser temp files: ${errorText(err)}` }
  }

  const timeoutMs = deps.timeoutMs ?? DEFAULT_TIMEOUT_MS
  return await new Promise<RunResult>(resolve => {
    // 'error' and 'close' can both fire for one child; report the first only.
    let settled = false
    const finish = (outcome: RunResult): void => {
      if (settled) return
      settled = true
      resolve(outcome)
    }

    let proc: ChildProcess
    try {
      proc = spawn(bin, fullArgs, {
        cwd: deps.cwd ?? process.cwd(),
        env,
        stdio: ['ignore', stdoutFd, stderrFd],
      })
    } catch (err) {
      closeQuietly(stdoutFd)
      closeQuietly(stderrFd)
      rmSafe(stdoutPath)
      rmSafe(stderrPath)
      const code = (err as NodeJS.ErrnoException).code
      finish({ ok: false, error: code === 'ENOENT' ? notExecutableError : errorText(err) })
      return
    }
    // The child holds its own copies of the descriptors; release ours.
    closeQuietly(stdoutFd)
    closeQuietly(stderrFd)

    let timedOut = false
    const timer = setTimeout(() => {
      timedOut = true
      try {
        proc.kill('SIGKILL')
      } catch {}
    }, timeoutMs)

    proc.on('error', err => {
      clearTimeout(timer)
      rmSafe(stdoutPath)
      rmSafe(stderrPath)
      const code = (err as NodeJS.ErrnoException).code
      finish({ ok: false, error: code === 'ENOENT' ? notExecutableError : err.message })
    })

    proc.on('close', code => {
      clearTimeout(timer)
      // Keep only the tail of each stream to bound the size of the result.
      const stdout = readFileSafe(stdoutPath).slice(-100_000)
      const stderr = readFileSafe(stderrPath).slice(-50_000)
      rmSafe(stdoutPath)
      rmSafe(stderrPath)
      if (timedOut) {
        finish({
          ok: false,
          error: `agent-browser ${command} timed out after ${timeoutMs}ms`,
          data: { stdout, stderr, exit_code: code },
        })
        return
      }
      finish({
        ok: (code ?? 1) === 0,
        data: { stdout, stderr, exit_code: code },
      })
    })
  })
}
424
+
/** Arguments accepted by `browser.navigate`. */
const NavigateSchema = z.object({
  url: z.string().min(1).describe('Absolute URL to navigate to (e.g. https://...).'),
})

/**
 * Build the `browser.navigate` tool: runs `agent-browser open <url>` and then
 * requests a 1500ms settle wait.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserNavigate(deps: BrowserDeps): ToolDef<z.infer<typeof NavigateSchema>> {
  const tool: ToolDef<z.infer<typeof NavigateSchema>> = {
    name: 'browser.navigate',
    description:
      'Open a URL in the agent-browser tab. Returns the new page metadata. Auto-waits 1500ms after navigation so the next browser.snapshot reflects the new page.',
    shouldDefer: true,
    searchHint: 'browser navigate open url page',
    schema: NavigateSchema,
    handler: async ({ url }) => runAgentBrowser('open', [url], deps, { settleAfterMs: 1500 }),
  }
  return tool
}
440
+
/** Arguments accepted by `browser.snapshot`. */
const SnapshotSchema = z.object({
  with_image: coerceBool
    .optional()
    .describe('When true, also captures a screenshot saved alongside the accessibility tree.'),
  cap: coerceBool
    .optional()
    .describe('Cap the snapshot output for compactness. Defaults to true (-c flag).'),
})

/**
 * Build the `browser.snapshot` tool: runs `agent-browser snapshot` with the
 * `-i` and `-c` flags, each of which is passed unless its argument is
 * explicitly `false`.
 *
 * NOTE(review): `-i` is therefore sent when `with_image` is omitted, although
 * the schema text describes it as opt-in ("When true") — confirm which
 * default is intended.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserSnapshot(deps: BrowserDeps): ToolDef<z.infer<typeof SnapshotSchema>> {
  const tool: ToolDef<z.infer<typeof SnapshotSchema>> = {
    name: 'browser.snapshot',
    description:
      'Capture the page accessibility tree with element refs (@e1, @e2, ...). Use refs returned here for click/type/scroll actions. Set with_image=true to also write a screenshot.',
    shouldDefer: true,
    searchHint: 'browser snapshot accessibility tree refs page state',
    schema: SnapshotSchema,
    handler: async args => {
      const cliFlags = [
        ...(args.with_image !== false ? ['-i'] : []),
        ...(args.cap !== false ? ['-c'] : []),
      ]
      return runAgentBrowser('snapshot', cliFlags, deps)
    },
  }
  return tool
}
466
+
/** Arguments accepted by `browser.click`. */
const ClickSchema = z.object({
  selector: z
    .string()
    .min(1)
    .describe(
      "Snapshot ref (e.g. '@e5') from the most recent browser.snapshot — preferred — OR a plain CSS selector ('button.primary', '#submit'). NOT a Playwright-style pseudo-class: ':has-text()', ':has()', ':contains()' are NOT supported and will fail.",
    ),
})

/**
 * Build the `browser.click` tool: runs `agent-browser click <selector>` and
 * then requests a 1200ms settle wait.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserClick(deps: BrowserDeps): ToolDef<z.infer<typeof ClickSchema>> {
  const tool: ToolDef<z.infer<typeof ClickSchema>> = {
    name: 'browser.click',
    description:
      "Click an element. Arg name is `selector` (snapshot @ref like '@e5' or plain CSS like 'button.primary'). Auto-waits 1200ms post-click so any triggered navigation/state change settles before the next snapshot. To click a link by visible text, take a fresh `browser.snapshot` first and pass the @eN ref of the matching node — Playwright pseudo-classes (:has-text, :contains) are not supported.",
    shouldDefer: true,
    searchHint: 'browser click element selector ref',
    schema: ClickSchema,
    handler: async ({ selector }) =>
      runAgentBrowser('click', [selector], deps, { settleAfterMs: 1200 }),
  }
  return tool
}
487
+
/** Arguments accepted by `browser.type`. */
const TypeSchema = z.object({
  selector: z.string().min(1),
  text: z.string().describe('Text to type into the element.'),
})

/**
 * Build the `browser.type` tool: runs `agent-browser type <selector> <text>`
 * and then requests a 600ms settle wait.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserType(deps: BrowserDeps): ToolDef<z.infer<typeof TypeSchema>> {
  const tool: ToolDef<z.infer<typeof TypeSchema>> = {
    name: 'browser.type',
    description:
      'Type text into an element by selector or snapshot ref. Auto-waits 600ms post-type so debounced input handlers settle before the next snapshot.',
    shouldDefer: true,
    searchHint: 'browser type input text fill',
    schema: TypeSchema,
    handler: async ({ selector, text }) =>
      runAgentBrowser('type', [selector, text], deps, { settleAfterMs: 600 }),
  }
  return tool
}
505
+
/** Arguments accepted by `browser.scroll`. */
const ScrollSchema = z.object({
  direction: z
    .enum(['up', 'down', 'left', 'right'])
    .optional()
    .describe(
      "Scroll direction. Defaults to 'down' when omitted. Pass 'up'/'left'/'right' when needed.",
    ),
  pixels: coerceInt
    .refine(n => n > 0, 'pixels must be > 0')
    .optional()
    .describe('Optional scroll distance in pixels. Default 800.'),
  // `amount` is accepted as an alias for `pixels`. Models have been observed
  // emitting `amount=N` for "scroll N pixels"; without the alias the unknown
  // key is stripped, the call succeeds with the default distance, and the
  // tool appears to ignore the request. The handler merges the two spellings,
  // with `pixels` taking precedence.
  amount: coerceInt
    .refine(n => n > 0, 'amount must be > 0')
    .optional()
    .describe('Alias for `pixels`. Prefer `pixels`; `amount` accepted for compatibility.'),
})

/**
 * Build the `browser.scroll` tool: runs `agent-browser scroll <direction>
 * [distance]`. The distance argument is only passed when `pixels` or `amount`
 * was supplied; otherwise the CLI is invoked with the direction alone.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserScroll(deps: BrowserDeps): ToolDef<z.infer<typeof ScrollSchema>> {
  const tool: ToolDef<z.infer<typeof ScrollSchema>> = {
    name: 'browser.scroll',
    description:
      "Scroll the page. Both args are optional: `direction` defaults to 'down' (override with 'up'/'left'/'right'); `pixels` defaults to 800. For 'scroll down N pixels' pass pixels=N. The schema also accepts `amount` as an alias for `pixels` — use either; pixels is preferred.",
    shouldDefer: true,
    searchHint: 'browser scroll page up down',
    schema: ScrollSchema,
    handler: async ({ direction, pixels, amount }) => {
      const heading: string = direction ?? 'down'
      const distance = pixels ?? amount
      const cliArgs = distance ? [heading, String(distance)] : [heading]
      return runAgentBrowser('scroll', cliArgs, deps)
    },
  }
  return tool
}
547
+
/** `browser.back` takes no arguments. */
const BackSchema = z.object({})

/**
 * Build the `browser.back` tool: runs `agent-browser back` and then requests
 * a 1500ms settle wait.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserBack(deps: BrowserDeps): ToolDef<z.infer<typeof BackSchema>> {
  const tool: ToolDef<z.infer<typeof BackSchema>> = {
    name: 'browser.back',
    description:
      'Navigate the browser history back one step. Auto-waits 1500ms for the previous page to render before the next snapshot.',
    shouldDefer: true,
    searchHint: 'browser back history previous page',
    schema: BackSchema,
    handler: async () => runAgentBrowser('back', [], deps, { settleAfterMs: 1500 }),
  }
  return tool
}
561
+
/** Arguments accepted by `browser.press`. */
const PressSchema = z.object({
  key: z.string().min(1).describe("Key to press, e.g. 'Enter', 'Tab', 'Escape', 'Control+a'."),
})

/**
 * Build the `browser.press` tool: runs `agent-browser press <key>` and then
 * requests a 1500ms settle wait.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserPress(deps: BrowserDeps): ToolDef<z.infer<typeof PressSchema>> {
  const tool: ToolDef<z.infer<typeof PressSchema>> = {
    name: 'browser.press',
    description:
      'Send a single key press (Enter, Tab, Escape, Ctrl+A, etc.). Auto-waits 1500ms post-press so a form submit triggered by Enter has time to navigate before the next snapshot.',
    shouldDefer: true,
    searchHint: 'browser press key keyboard',
    schema: PressSchema,
    handler: async ({ key }) => runAgentBrowser('press', [key], deps, { settleAfterMs: 1500 }),
  }
  return tool
}
577
+
/** Arguments accepted by `browser.get_images`. */
const GetImagesSchema = z.object({
  selector: z.string().optional().describe('Optional CSS selector to scope image extraction.'),
  limit: coerceInt
    .refine(n => n > 0 && n <= 200, 'limit must be 1..200')
    .optional()
    .describe('Cap on returned URLs. Default 50.'),
})

/**
 * Build the `browser.get_images` tool: evaluates a script in the page that
 * collects the `src` of every element matching the selector (default `img`),
 * up to `limit` entries, and returns them as a JSON array string.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserGetImages(deps: BrowserDeps): ToolDef<z.infer<typeof GetImagesSchema>> {
  return {
    name: 'browser.get_images',
    description:
      'Extract image URLs from the current page. Optionally scoped to a CSS selector. Returns up to `limit` (default 50) src URLs as a JSON array string.',
    shouldDefer: true,
    searchHint: 'browser images src extract list',
    schema: GetImagesSchema,
    handler: async args => {
      // An empty or whitespace-only selector would make querySelectorAll
      // throw in the page; treat it the same as "not provided".
      const requested = args.selector?.trim()
      const selector = requested ? requested : 'img'
      const limit = args.limit ?? 50
      // The selector is caller-supplied text spliced into evaluated script.
      // JSON.stringify emits a complete, correctly escaped JS string literal
      // (quotes, backslashes, newlines), so the value can neither terminate
      // the literal early nor have its CSS escapes eaten by the JS parser.
      const selectorLiteral = JSON.stringify(selector)
      // agent-browser `get attr` only returns the first match; eval gets all.
      const js = `JSON.stringify(Array.from(document.querySelectorAll(${selectorLiteral})).slice(0, ${limit}).map(i => i.src || i.getAttribute('src') || '').filter(Boolean))`
      return runAgentBrowser('eval', [js], deps)
    },
  }
}
603
+
/** Arguments accepted by `browser.vision`. */
const VisionSchema = z.object({
  prompt: z
    .string()
    .min(1)
    .describe('What you want the vision model to answer/describe about the screenshot.'),
})

/**
 * Build the `browser.vision` tool: captures a screenshot to a temp file, reads
 * it back, and sends the bytes plus the prompt to `deps.visionInfer`.
 *
 * The temp file is removed on every path (failed capture, failed read, and
 * success) before the vision call is made.
 *
 * @param deps Runner overrides plus the vision inference function; when
 *   `visionInfer` is null the tool reports that no provider is configured.
 */
export function makeBrowserVision(
  deps: BrowserDeps & { visionInfer: VisionInferFn | null },
): ToolDef<z.infer<typeof VisionSchema>> {
  // Thrown values are not guaranteed to be Error instances.
  const errorText = (e: unknown): string => (e instanceof Error ? e.message : String(e))

  return {
    name: 'browser.vision',
    description:
      "Capture the current page as a screenshot and send it to the configured vision model with a prompt. Returns the model's reply. Routes to the configured vision provider on Mantle Compute (qwen3-vl-30b on mainnet by default).",
    shouldDefer: true,
    searchHint: 'browser vision screenshot describe ocr image',
    schema: VisionSchema,
    handler: async args => {
      if (!deps.visionInfer) {
        return {
          ok: false,
          error:
            'vision provider not configured. Set `vision.provider` in ~/.nebula/config.ts to a Mantle Compute multimodal provider.',
        }
      }
      // Timestamp + pid can repeat for two calls in the same millisecond; the
      // random suffix keeps concurrent screenshots from sharing a file.
      const suffix = Math.random().toString(16).slice(2, 10)
      const path = join(tmpdir(), `nebula-vision-${Date.now()}-${process.pid}-${suffix}.png`)
      const shot = await runAgentBrowser('screenshot', [path], deps)
      if (!shot.ok) {
        // A failed or timed-out capture may still have left a partial file.
        rmSafe(path)
        return shot
      }
      let bytes: Uint8Array
      try {
        bytes = new Uint8Array(readFileSync(path))
      } catch (e) {
        return { ok: false, error: `screenshot read failed: ${errorText(e)}` }
      } finally {
        rmSafe(path)
      }
      const mediaType = sniffMimeFromBytes(bytes, 'png') ?? 'image/png'
      try {
        const result = await deps.visionInfer({
          images: [{ bytes, mediaType }],
          prompt: args.prompt,
          maxOutputTokens: 1024,
        })
        return {
          ok: true,
          data: {
            content: result.content,
            model: result.model ?? null,
            usage: result.usage,
            finishReason: result.finishReason,
          },
        }
      } catch (e) {
        // Provider errors can be very long; keep the reported text short.
        return { ok: false, error: `vision call failed: ${errorText(e).slice(0, 240)}` }
      }
    },
  }
}
662
+
/** Arguments accepted by `browser.console`. */
const ConsoleSchema = z.object({
  clear: coerceBool.optional().describe('When true, clears console after reading.'),
})

/**
 * Build the `browser.console` tool: runs `agent-browser console`, adding
 * `--clear` when the `clear` argument is truthy.
 *
 * @param deps Binary path / cwd / timeout overrides forwarded to the runner.
 */
export function makeBrowserConsole(deps: BrowserDeps): ToolDef<z.infer<typeof ConsoleSchema>> {
  const tool: ToolDef<z.infer<typeof ConsoleSchema>> = {
    name: 'browser.console',
    description: 'Read accumulated console output (logs, warnings, errors) from the page.',
    shouldDefer: true,
    searchHint: 'browser console logs warnings errors',
    schema: ConsoleSchema,
    handler: async ({ clear }) => {
      const cliFlags: string[] = clear ? ['--clear'] : []
      return runAgentBrowser('console', cliFlags, deps)
    },
  }
  return tool
}
681
+
/**
 * Every tool factory in this module that needs only `BrowserDeps`.
 * `makeBrowserVision` is not listed: it additionally requires a `visionInfer`
 * dependency and has to be constructed separately.
 */
export const ALL_BROWSER_TOOL_FACTORIES = [
  makeBrowserNavigate,
  makeBrowserSnapshot,
  makeBrowserClick,
  makeBrowserType,
  makeBrowserScroll,
  makeBrowserBack,
  makeBrowserPress,
  makeBrowserGetImages,
  makeBrowserConsole,
]
693
+
/**
 * Test-only hooks for the regression suite. `reset()` clears the module-level
 * caches (and deletes the cached socket directory) so a test can stub PATH or
 * the platform without leaking state; the remaining members expose internal
 * helpers for direct assertions.
 */
export const __test = {
  reset(): void {
    const staleSocketDir = cachedSocketDir
    cachedSessionName = null
    cachedSocketDir = null
    cleanupRegistered = false
    if (staleSocketDir) {
      try {
        rmSync(staleSocketDir, { recursive: true, force: true })
      } catch {}
    }
  },
  findAgentBrowser,
  isBrowserAvailable,
  socketSafeTmpdir,
  getSessionName,
  getSocketDir,
  buildBrowserEnv,
}