agent-browser 0.18.0 → 0.20.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +150 -122
  2. package/bin/agent-browser-darwin-arm64 +0 -0
  3. package/bin/agent-browser-darwin-x64 +0 -0
  4. package/bin/agent-browser-linux-arm64 +0 -0
  5. package/bin/agent-browser-linux-x64 +0 -0
  6. package/bin/agent-browser-win32-x64.exe +0 -0
  7. package/package.json +6 -40
  8. package/scripts/postinstall.js +12 -15
  9. package/skills/agent-browser/SKILL.md +37 -32
  10. package/skills/electron/SKILL.md +5 -5
  11. package/dist/action-policy.d.ts +0 -14
  12. package/dist/action-policy.d.ts.map +0 -1
  13. package/dist/action-policy.js +0 -253
  14. package/dist/action-policy.js.map +0 -1
  15. package/dist/actions.d.ts +0 -18
  16. package/dist/actions.d.ts.map +0 -1
  17. package/dist/actions.js +0 -2120
  18. package/dist/actions.js.map +0 -1
  19. package/dist/auth-cli.d.ts +0 -2
  20. package/dist/auth-cli.d.ts.map +0 -1
  21. package/dist/auth-cli.js +0 -97
  22. package/dist/auth-cli.js.map +0 -1
  23. package/dist/auth-vault.d.ts +0 -36
  24. package/dist/auth-vault.d.ts.map +0 -1
  25. package/dist/auth-vault.js +0 -125
  26. package/dist/auth-vault.js.map +0 -1
  27. package/dist/browser.d.ts +0 -592
  28. package/dist/browser.d.ts.map +0 -1
  29. package/dist/browser.js +0 -2190
  30. package/dist/browser.js.map +0 -1
  31. package/dist/confirmation.d.ts +0 -8
  32. package/dist/confirmation.d.ts.map +0 -1
  33. package/dist/confirmation.js +0 -30
  34. package/dist/confirmation.js.map +0 -1
  35. package/dist/daemon.d.ts +0 -71
  36. package/dist/daemon.d.ts.map +0 -1
  37. package/dist/daemon.js +0 -671
  38. package/dist/daemon.js.map +0 -1
  39. package/dist/diff.d.ts +0 -18
  40. package/dist/diff.d.ts.map +0 -1
  41. package/dist/diff.js +0 -271
  42. package/dist/diff.js.map +0 -1
  43. package/dist/domain-filter.d.ts +0 -28
  44. package/dist/domain-filter.d.ts.map +0 -1
  45. package/dist/domain-filter.js +0 -149
  46. package/dist/domain-filter.js.map +0 -1
  47. package/dist/encryption.d.ts +0 -73
  48. package/dist/encryption.d.ts.map +0 -1
  49. package/dist/encryption.js +0 -171
  50. package/dist/encryption.js.map +0 -1
  51. package/dist/inspect-server.d.ts +0 -26
  52. package/dist/inspect-server.d.ts.map +0 -1
  53. package/dist/inspect-server.js +0 -218
  54. package/dist/inspect-server.js.map +0 -1
  55. package/dist/ios-actions.d.ts +0 -11
  56. package/dist/ios-actions.d.ts.map +0 -1
  57. package/dist/ios-actions.js +0 -228
  58. package/dist/ios-actions.js.map +0 -1
  59. package/dist/ios-manager.d.ts +0 -266
  60. package/dist/ios-manager.d.ts.map +0 -1
  61. package/dist/ios-manager.js +0 -1073
  62. package/dist/ios-manager.js.map +0 -1
  63. package/dist/protocol.d.ts +0 -28
  64. package/dist/protocol.d.ts.map +0 -1
  65. package/dist/protocol.js +0 -986
  66. package/dist/protocol.js.map +0 -1
  67. package/dist/snapshot.d.ts +0 -67
  68. package/dist/snapshot.d.ts.map +0 -1
  69. package/dist/snapshot.js +0 -514
  70. package/dist/snapshot.js.map +0 -1
  71. package/dist/state-utils.d.ts +0 -77
  72. package/dist/state-utils.d.ts.map +0 -1
  73. package/dist/state-utils.js +0 -178
  74. package/dist/state-utils.js.map +0 -1
  75. package/dist/stream-server.d.ts +0 -117
  76. package/dist/stream-server.d.ts.map +0 -1
  77. package/dist/stream-server.js +0 -309
  78. package/dist/stream-server.js.map +0 -1
  79. package/dist/types.d.ts +0 -925
  80. package/dist/types.d.ts.map +0 -1
  81. package/dist/types.js +0 -2
  82. package/dist/types.js.map +0 -1
package/README.md CHANGED
@@ -1,51 +1,41 @@
1
1
  # agent-browser
2
2
 
3
- Headless browser automation CLI for AI agents. Fast Rust CLI with Node.js fallback.
3
+ Headless browser automation CLI for AI agents. Fast native Rust CLI.
4
4
 
5
5
  ## Installation
6
6
 
7
7
  ### Global Installation (recommended)
8
8
 
9
- Installs the native Rust binary for maximum performance:
9
+ Installs the native Rust binary:
10
10
 
11
11
  ```bash
12
12
  npm install -g agent-browser
13
- agent-browser install # Download Chromium
13
+ agent-browser install # Download Chrome from Chrome for Testing (first time only)
14
14
  ```
15
15
 
16
- This is the fastest option -- commands run through the native Rust CLI directly with sub-millisecond parsing overhead.
17
-
18
- ### Quick Start (no install)
19
-
20
- Run directly with `npx` if you want to try it without installing globally:
21
-
22
- ```bash
23
- npx agent-browser install # Download Chromium (first time only)
24
- npx agent-browser open example.com
25
- ```
26
-
27
- > **Note:** `npx` routes through Node.js before reaching the Rust CLI, so it is noticeably slower than a global install. For regular use, install globally.
28
-
29
16
  ### Project Installation (local dependency)
30
17
 
31
18
  For projects that want to pin the version in `package.json`:
32
19
 
33
20
  ```bash
34
21
  npm install agent-browser
35
- npx agent-browser install
22
+ agent-browser install
36
23
  ```
37
24
 
38
- Then use via `npx` or `package.json` scripts:
25
+ Then use via `package.json` scripts or by invoking `agent-browser` directly.
26
+
27
+ ### Homebrew (macOS)
39
28
 
40
29
  ```bash
41
- npx agent-browser open example.com
30
+ brew install agent-browser
31
+ agent-browser install # Download Chrome from Chrome for Testing (first time only)
42
32
  ```
43
33
 
44
- ### Homebrew (macOS)
34
+ ### Cargo (Rust)
45
35
 
46
36
  ```bash
47
- brew install agent-browser
48
- agent-browser install # Download Chromium
37
+ cargo install agent-browser
38
+ agent-browser install # Download Chrome from Chrome for Testing (first time only)
49
39
  ```
50
40
 
51
41
  ### From Source
@@ -66,9 +56,13 @@ On Linux, install system dependencies:
66
56
 
67
57
  ```bash
68
58
  agent-browser install --with-deps
69
- # or manually: npx playwright install-deps chromium
70
59
  ```
71
60
 
61
+ ### Requirements
62
+
63
+ - **Chrome** - Run `agent-browser install` to download Chrome from [Chrome for Testing](https://developer.chrome.com/blog/chrome-for-testing/) (Google's official automation channel). No Playwright or Node.js required for the daemon.
64
+ - **Rust** - Only needed when building from source (see From Source above).
65
+
72
66
  ## Quick Start
73
67
 
74
68
  ```bash
@@ -115,6 +109,8 @@ agent-browser drag <src> <tgt> # Drag and drop
115
109
  agent-browser upload <sel> <files> # Upload files
116
110
  agent-browser screenshot [path] # Take screenshot (--full for full page, saves to a temporary directory if no path)
117
111
  agent-browser screenshot --annotate # Annotated screenshot with numbered element labels
112
+ agent-browser screenshot --screenshot-dir ./shots # Save to custom directory
113
+ agent-browser screenshot --screenshot-format jpeg --screenshot-quality 80
118
114
  agent-browser pdf <path> # Save as PDF
119
115
  agent-browser snapshot # Accessibility tree with refs (best for AI)
120
116
  agent-browser eval <js> # Run JavaScript (-b for base64, --stdin for piped input)
@@ -165,6 +161,7 @@ agent-browser find nth <n> <sel> <action> [value] # Nth match
165
161
  **Options:** `--name <name>` (filter role by accessible name), `--exact` (require exact text match)
166
162
 
167
163
  **Examples:**
164
+
168
165
  ```bash
169
166
  agent-browser find role button click --name "Submit"
170
167
  agent-browser find text "Sign In" click
@@ -178,14 +175,27 @@ agent-browser find nth 2 "a" text
178
175
  ```bash
179
176
  agent-browser wait <selector> # Wait for element to be visible
180
177
  agent-browser wait <ms> # Wait for time (milliseconds)
181
- agent-browser wait --text "Welcome" # Wait for text to appear
178
+ agent-browser wait --text "Welcome" # Wait for text to appear (substring match)
182
179
  agent-browser wait --url "**/dash" # Wait for URL pattern
183
180
  agent-browser wait --load networkidle # Wait for load state
184
181
  agent-browser wait --fn "window.ready === true" # Wait for JS condition
182
+
183
+ # Wait for text/element to disappear
184
+ agent-browser wait --fn "!document.body.innerText.includes('Loading...')"
185
+ agent-browser wait "#spinner" --state hidden
185
186
  ```
186
187
 
187
188
  **Load states:** `load`, `domcontentloaded`, `networkidle`
188
189
 
190
+ ### Clipboard
191
+
192
+ ```bash
193
+ agent-browser clipboard read # Read text from clipboard
194
+ agent-browser clipboard write "Hello, World!" # Write text to clipboard
195
+ agent-browser clipboard copy # Copy current selection (Ctrl+C)
196
+ agent-browser clipboard paste # Paste from clipboard (Ctrl+V)
197
+ ```
198
+
189
199
  ### Mouse Control
190
200
 
191
201
  ```bash
@@ -306,7 +316,7 @@ agent-browser reload # Reload page
306
316
  ### Setup
307
317
 
308
318
  ```bash
309
- agent-browser install # Download Chromium browser
319
+ agent-browser install # Download Chrome from Chrome for Testing (Google's official automation channel)
310
320
  agent-browser install --with-deps # Also install system deps (Linux)
311
321
  ```
312
322
 
@@ -375,6 +385,7 @@ agent-browser session
375
385
  ```
376
386
 
377
387
  Each session has its own:
388
+
378
389
  - Browser instance
379
390
  - Cookies and storage
380
391
  - Navigation history
@@ -396,6 +407,7 @@ AGENT_BROWSER_PROFILE=~/.myapp-profile agent-browser open myapp.com
396
407
  ```
397
408
 
398
409
  The profile directory stores:
410
+
399
411
  - Cookies and localStorage
400
412
  - IndexedDB data
401
413
  - Service workers
@@ -432,10 +444,10 @@ export AGENT_BROWSER_ENCRYPTION_KEY=<64-char-hex-key>
432
444
  agent-browser --session-name secure open example.com
433
445
  ```
434
446
 
435
- | Variable | Description |
436
- |----------|-------------|
437
- | `AGENT_BROWSER_SESSION_NAME` | Auto-save/load state persistence name |
438
- | `AGENT_BROWSER_ENCRYPTION_KEY` | 64-char hex key for AES-256-GCM encryption |
447
+ | Variable | Description |
448
+ | --------------------------------- | -------------------------------------------------- |
449
+ | `AGENT_BROWSER_SESSION_NAME` | Auto-save/load state persistence name |
450
+ | `AGENT_BROWSER_ENCRYPTION_KEY` | 64-char hex key for AES-256-GCM encryption |
439
451
  | `AGENT_BROWSER_STATE_EXPIRE_DAYS` | Auto-delete states older than N days (default: 30) |
440
452
 
441
453
  ## Security
@@ -449,14 +461,14 @@ agent-browser includes security features for safe AI agent deployments. All feat
449
461
  - **Action Confirmation** -- Require explicit approval for sensitive action categories: `--confirm-actions eval,download`
450
462
  - **Output Length Limits** -- Prevent context flooding: `--max-output 50000`
451
463
 
452
- | Variable | Description |
453
- |----------|-------------|
454
- | `AGENT_BROWSER_CONTENT_BOUNDARIES` | Wrap page output in boundary markers |
455
- | `AGENT_BROWSER_MAX_OUTPUT` | Max characters for page output |
456
- | `AGENT_BROWSER_ALLOWED_DOMAINS` | Comma-separated allowed domain patterns |
457
- | `AGENT_BROWSER_ACTION_POLICY` | Path to action policy JSON file |
458
- | `AGENT_BROWSER_CONFIRM_ACTIONS` | Action categories requiring confirmation |
459
- | `AGENT_BROWSER_CONFIRM_INTERACTIVE` | Enable interactive confirmation prompts |
464
+ | Variable | Description |
465
+ | ----------------------------------- | ---------------------------------------- |
466
+ | `AGENT_BROWSER_CONTENT_BOUNDARIES` | Wrap page output in boundary markers |
467
+ | `AGENT_BROWSER_MAX_OUTPUT` | Max characters for page output |
468
+ | `AGENT_BROWSER_ALLOWED_DOMAINS` | Comma-separated allowed domain patterns |
469
+ | `AGENT_BROWSER_ACTION_POLICY` | Path to action policy JSON file |
470
+ | `AGENT_BROWSER_CONFIRM_ACTIONS` | Action categories requiring confirmation |
471
+ | `AGENT_BROWSER_CONFIRM_INTERACTIVE` | Enable interactive confirmation prompts |
460
472
 
461
473
  See [Security documentation](https://agent-browser.dev/security) for details.
462
474
 
@@ -474,13 +486,13 @@ agent-browser snapshot -s "#main" # Scope to CSS selector
474
486
  agent-browser snapshot -i -c -d 5 # Combine options
475
487
  ```
476
488
 
477
- | Option | Description |
478
- |--------|-------------|
479
- | `-i, --interactive` | Only show interactive elements (buttons, links, inputs) |
480
- | `-C, --cursor` | Include cursor-interactive elements (cursor:pointer, onclick, tabindex) |
481
- | `-c, --compact` | Remove empty structural elements |
482
- | `-d, --depth <n>` | Limit tree depth |
483
- | `-s, --selector <sel>` | Scope to CSS selector |
489
+ | Option | Description |
490
+ | ---------------------- | ----------------------------------------------------------------------- |
491
+ | `-i, --interactive` | Only show interactive elements (buttons, links, inputs) |
492
+ | `-C, --cursor` | Include cursor-interactive elements (cursor:pointer, onclick, tabindex) |
493
+ | `-c, --compact` | Remove empty structural elements |
494
+ | `-d, --depth <n>` | Limit tree depth |
495
+ | `-s, --selector <sel>` | Scope to CSS selector |
484
496
 
485
497
  The `-C` flag is useful for modern web apps that use custom clickable elements (divs, spans) instead of standard buttons/links.
486
498
 
@@ -488,7 +500,7 @@ The `-C` flag is useful for modern web apps that use custom clickable elements (
488
500
 
489
501
  The `--annotate` flag overlays numbered labels on interactive elements in the screenshot. Each label `[N]` corresponds to ref `@eN`, so the same refs work for both visual and text-based workflows.
490
502
 
491
- In native mode, annotated screenshots are supported on the CDP-backed browser path (`--native` with Chromium/Lightpanda). The Safari/WebDriver backend does not yet support `--annotate`.
503
+ Annotated screenshots are supported on the CDP-backed browser path (Chrome/Lightpanda). The Safari/WebDriver backend does not yet support `--annotate`.
492
504
 
493
505
  ```bash
494
506
  agent-browser screenshot --annotate
@@ -529,6 +541,9 @@ This is useful for multimodal AI models that can reason about visual layout, unl
529
541
  | `--json` | JSON output (for agents) |
530
542
  | `--full, -f` | Full page screenshot |
531
543
  | `--annotate` | Annotated screenshot with numbered element labels (or `AGENT_BROWSER_ANNOTATE` env) |
544
+ | `--screenshot-dir <path>` | Default screenshot output directory (or `AGENT_BROWSER_SCREENSHOT_DIR` env) |
545
+ | `--screenshot-quality <n>` | JPEG quality 0-100 (or `AGENT_BROWSER_SCREENSHOT_QUALITY` env) |
546
+ | `--screenshot-format <fmt>` | Screenshot format: `png`, `jpeg` (or `AGENT_BROWSER_SCREENSHOT_FORMAT` env) |
532
547
  | `--headed` | Show browser window (not headless) (or `AGENT_BROWSER_HEADED` env) |
533
548
  | `--cdp <port\|url>` | Connect via Chrome DevTools Protocol (port or WebSocket URL) |
534
549
  | `--auto-connect` | Auto-discover and connect to running Chrome (or `AGENT_BROWSER_AUTO_CONNECT` env) |
@@ -540,8 +555,7 @@ This is useful for multimodal AI models that can reason about visual layout, unl
540
555
  | `--action-policy <path>` | Path to action policy JSON file (or `AGENT_BROWSER_ACTION_POLICY` env) |
541
556
  | `--confirm-actions <list>` | Action categories requiring confirmation (or `AGENT_BROWSER_CONFIRM_ACTIONS` env) |
542
557
  | `--confirm-interactive` | Interactive confirmation prompts; auto-denies if stdin is not a TTY (or `AGENT_BROWSER_CONFIRM_INTERACTIVE` env) |
543
- | `--engine <name>` | Browser engine: `chrome` (default), `lightpanda`; implies `--native` (or `AGENT_BROWSER_ENGINE` env) |
544
- | `--native` | [Experimental] Use native Rust daemon instead of Node.js (or `AGENT_BROWSER_NATIVE` env) |
558
+ | `--engine <name>` | Browser engine: `chrome` (default), `lightpanda` (or `AGENT_BROWSER_ENGINE` env) |
545
559
  | `--config <path>` | Use a custom config file (or `AGENT_BROWSER_CONFIG` env) |
546
560
  | `--debug` | Debug output |
547
561
 
@@ -585,7 +599,7 @@ Auto-discovered config files that are missing are silently ignored. If `--config
585
599
 
586
600
  ## Default Timeout
587
601
 
588
- The default Playwright timeout for standard operations (clicks, waits, fills, etc.) is 25 seconds. This is intentionally below the CLI's 30-second IPC read timeout so that Playwright returns a proper error instead of the CLI timing out with EAGAIN.
602
+ The default timeout for standard operations (clicks, waits, fills, etc.) is 25 seconds. This is intentionally below the CLI's 30-second IPC read timeout so that the daemon returns a proper error instead of the CLI timing out with EAGAIN.
589
603
 
590
604
  Override the default timeout via environment variable:
591
605
 
@@ -594,11 +608,11 @@ Override the default timeout via environment variable:
594
608
  export AGENT_BROWSER_DEFAULT_TIMEOUT=45000
595
609
  ```
596
610
 
597
- > **Note:** Setting this above 30000 (30s) may cause EAGAIN errors on slow operations because the CLI's read timeout will expire before Playwright responds. The CLI retries transient errors automatically, but response times will increase.
611
+ > **Note:** Setting this above 30000 (30s) may cause EAGAIN errors on slow operations because the CLI's read timeout will expire before the daemon responds. The CLI retries transient errors automatically, but response times will increase.
598
612
 
599
- | Variable | Description |
600
- |----------|-------------|
601
- | `AGENT_BROWSER_DEFAULT_TIMEOUT` | Default Playwright timeout in ms (default: 25000) |
613
+ | Variable | Description |
614
+ | ------------------------------- | ---------------------------------------- |
615
+ | `AGENT_BROWSER_DEFAULT_TIMEOUT` | Default operation timeout in ms (default: 25000) |
602
616
 
603
617
  ## Selectors
604
618
 
@@ -623,6 +637,7 @@ agent-browser hover @e4 # Hover the link
623
637
  ```
624
638
 
625
639
  **Why use refs?**
640
+
626
641
  - **Deterministic**: Ref points to exact element from snapshot
627
642
  - **Fast**: No DOM re-query needed
628
643
  - **AI-friendly**: Snapshot + ref workflow is optimal for LLMs
@@ -723,6 +738,7 @@ agent-browser open other-site.com
723
738
  ```
724
739
 
725
740
  This is useful for:
741
+
726
742
  - **Skipping login flows** - Authenticate via headers instead of UI
727
743
  - **Switching users** - Start new sessions with different auth tokens
728
744
  - **API testing** - Access protected endpoints directly
@@ -744,6 +760,7 @@ agent-browser set headers '{"X-Custom-Header": "value"}'
744
760
  ## Custom Browser Executable
745
761
 
746
762
  Use a custom browser executable instead of the bundled Chromium. This is useful for:
763
+
747
764
  - **Serverless deployment**: Use lightweight Chromium builds like `@sparticuz/chromium` (~50MB vs ~684MB)
748
765
  - **System browsers**: Use an existing Chrome/Chromium installation
749
766
  - **Custom builds**: Use modified browser builds
@@ -804,6 +821,7 @@ agent-browser screenshot report.png
804
821
  ```
805
822
 
806
823
  The `--allow-file-access` flag adds Chromium flags (`--allow-file-access-from-files`, `--allow-file-access`) that allow `file://` URLs to:
824
+
807
825
  - Load and render local files
808
826
  - Access other local files via JavaScript (XHR, fetch)
809
827
  - Load local resources (images, scripts, stylesheets)
@@ -831,10 +849,12 @@ agent-browser --cdp "wss://your-browser-service.com/cdp?token=..." snapshot
831
849
  ```
832
850
 
833
851
  The `--cdp` flag accepts either:
852
+
834
853
  - A port number (e.g., `9222`) for local connections via `http://localhost:{port}`
835
854
  - A full WebSocket URL (e.g., `wss://...` or `ws://...`) for remote browser services
836
855
 
837
856
  This enables control of:
857
+
838
858
  - Electron apps
839
859
  - Chrome/Chromium instances with remote debugging
840
860
  - WebView2 applications
@@ -854,10 +874,12 @@ AGENT_BROWSER_AUTO_CONNECT=1 agent-browser snapshot
854
874
  ```
855
875
 
856
876
  Auto-connect discovers Chrome by:
877
+
857
878
  1. Reading Chrome's `DevToolsActivePort` file from the default user data directory
858
879
  2. Falling back to probing common debugging ports (9222, 9229)
859
880
 
860
881
  This is useful when:
882
+
861
883
  - Chrome 144+ has remote debugging enabled via `chrome://inspect/#remote-debugging` (which uses a dynamic port)
862
884
  - You want a zero-configuration connection to your existing browser
863
885
  - You don't want to track which port Chrome is using
@@ -881,6 +903,7 @@ This starts a WebSocket server on the specified port that streams the browser vi
881
903
  Connect to `ws://localhost:9223` to receive frames and send input:
882
904
 
883
905
  **Receive frames:**
906
+
884
907
  ```json
885
908
  {
886
909
  "type": "frame",
@@ -897,6 +920,7 @@ Connect to `ws://localhost:9223` to receive frames and send input:
897
920
  ```
898
921
 
899
922
  **Send mouse events:**
923
+
900
924
  ```json
901
925
  {
902
926
  "type": "input_mouse",
@@ -909,6 +933,7 @@ Connect to `ws://localhost:9223` to receive frames and send input:
909
933
  ```
910
934
 
911
935
  **Send keyboard events:**
936
+
912
937
  ```json
913
938
  {
914
939
  "type": "input_keyboard",
@@ -919,6 +944,7 @@ Connect to `ws://localhost:9223` to receive frames and send input:
919
944
  ```
920
945
 
921
946
  **Send touch events:**
947
+
922
948
  ```json
923
949
  {
924
950
  "type": "input_touch",
@@ -939,16 +965,19 @@ await browser.launch({ headless: true });
939
965
  await browser.navigate('https://example.com');
940
966
 
941
967
  // Start screencast
942
- await browser.startScreencast((frame) => {
943
- // frame.data is base64-encoded image
944
- // frame.metadata contains viewport info
945
- console.log('Frame received:', frame.metadata.deviceWidth, 'x', frame.metadata.deviceHeight);
946
- }, {
947
- format: 'jpeg',
948
- quality: 80,
949
- maxWidth: 1280,
950
- maxHeight: 720,
951
- });
968
+ await browser.startScreencast(
969
+ (frame) => {
970
+ // frame.data is base64-encoded image
971
+ // frame.metadata contains viewport info
972
+ console.log('Frame received:', frame.metadata.deviceWidth, 'x', frame.metadata.deviceHeight);
973
+ },
974
+ {
975
+ format: 'jpeg',
976
+ quality: 80,
977
+ maxWidth: 1280,
978
+ maxHeight: 720,
979
+ }
980
+ );
952
981
 
953
982
  // Inject mouse events
954
983
  await browser.injectMouseEvent({
@@ -973,61 +1002,22 @@ await browser.stopScreencast();
973
1002
 
974
1003
  agent-browser uses a client-daemon architecture:
975
1004
 
976
- 1. **Rust CLI** (fast native binary) - Parses commands, communicates with daemon
977
- 2. **Node.js Daemon** (default) - Manages Playwright browser instance
978
- 3. **Native Daemon** (experimental, `--native`) - Pure Rust daemon using direct CDP, no Node.js required
979
- 4. **Fallback** - If native binary unavailable, uses Node.js directly
980
-
981
- The daemon starts automatically on first command and persists between commands for fast subsequent operations.
1005
+ 1. **Rust CLI** - Parses commands, communicates with daemon
1006
+ 2. **Rust Daemon** - Pure Rust daemon using direct CDP, no Node.js required
982
1007
 
983
- **Browser Engine:** Uses Chromium by default. The default Node.js daemon also supports Firefox and WebKit via Playwright. The experimental native daemon speaks Chrome DevTools Protocol (CDP) directly and supports Chromium-based browsers and Safari (via WebDriver).
1008
+ The daemon starts automatically on first command and persists between commands for fast subsequent operations. To auto-shutdown the daemon after a period of inactivity, set `AGENT_BROWSER_IDLE_TIMEOUT_MS` (value in milliseconds). When set, the daemon closes the browser and exits after receiving no commands for the specified duration.
984
1009
 
985
- ## Experimental: Native Mode
986
-
987
- The native daemon is a pure Rust implementation that communicates with Chrome directly via CDP, eliminating the Node.js and Playwright dependencies. It is currently **experimental** and opt-in.
988
-
989
- ### Enabling Native Mode
990
-
991
- ```bash
992
- # Via flag
993
- agent-browser --native open example.com
994
-
995
- # Via environment variable (recommended for persistent use)
996
- export AGENT_BROWSER_NATIVE=1
997
- agent-browser open example.com
998
- ```
999
-
1000
- Or add to your config file (`agent-browser.json`):
1001
-
1002
- ```json
1003
- {"native": true}
1004
- ```
1005
-
1006
- ### What's Different
1007
-
1008
- | | Default (Node.js) | Native (`--native`) |
1009
- |---|---|---|
1010
- | **Runtime** | Node.js + Playwright | Pure Rust binary |
1011
- | **Protocol** | Playwright protocol | Direct CDP / WebDriver |
1012
- | **Install size** | Larger (Node.js + npm deps) | Smaller (single binary) |
1013
- | **Browser support** | Chromium, Firefox, WebKit | Chromium, Safari (via WebDriver) |
1014
- | **Stability** | Stable | Experimental |
1015
-
1016
- ### Known Limitations
1017
-
1018
- - Firefox and WebKit are not yet supported (Chromium and Safari only)
1019
- - Some Playwright-specific features (tracing format, HAR export) are not available
1020
- - The native daemon and Node.js daemon share the same session socket, so you cannot run both simultaneously for the same session. Use `agent-browser close` before switching modes.
1010
+ **Browser Engine:** Uses Chrome (from Chrome for Testing) by default. The `--engine` flag selects between `chrome` and `lightpanda`. Supported browsers: Chromium/Chrome (via CDP) and Safari (via WebDriver for iOS).
1021
1011
 
1022
1012
  ## Platforms
1023
1013
 
1024
- | Platform | Binary | Fallback |
1025
- |----------|--------|----------|
1026
- | macOS ARM64 | Native Rust | Node.js |
1027
- | macOS x64 | Native Rust | Node.js |
1028
- | Linux ARM64 | Native Rust | Node.js |
1029
- | Linux x64 | Native Rust | Node.js |
1030
- | Windows x64 | Native Rust | Node.js |
1014
+ | Platform | Binary |
1015
+ | ----------- | ----------- |
1016
+ | macOS ARM64 | Native Rust |
1017
+ | macOS x64 | Native Rust |
1018
+ | Linux ARM64 | Native Rust |
1019
+ | Linux x64 | Native Rust |
1020
+ | Windows x64 | Native Rust |
1031
1021
 
1032
1022
  ## Usage with AI Agents
1033
1023
 
@@ -1071,6 +1061,7 @@ For more consistent results, add to your project or global instructions file:
1071
1061
  Use `agent-browser` for web automation. Run `agent-browser --help` for all commands.
1072
1062
 
1073
1063
  Core workflow:
1064
+
1074
1065
  1. `agent-browser open <url>` - Navigate to page
1075
1066
  2. `agent-browser snapshot -i` - Get interactive elements with refs (@e1, @e2)
1076
1067
  3. `agent-browser click @e1` / `fill @e2 "text"` - Interact using refs
@@ -1122,11 +1113,11 @@ export AGENT_BROWSER_IOS_DEVICE="iPhone 16 Pro"
1122
1113
  agent-browser open https://example.com
1123
1114
  ```
1124
1115
 
1125
- | Variable | Description |
1126
- |----------|-------------|
1127
- | `AGENT_BROWSER_PROVIDER` | Set to `ios` to enable iOS mode |
1116
+ | Variable | Description |
1117
+ | -------------------------- | ----------------------------------------------- |
1118
+ | `AGENT_BROWSER_PROVIDER` | Set to `ios` to enable iOS mode |
1128
1119
  | `AGENT_BROWSER_IOS_DEVICE` | Device name (e.g., "iPhone 16 Pro", "iPad Pro") |
1129
- | `AGENT_BROWSER_IOS_UDID` | Device UDID (alternative to device name) |
1120
+ | `AGENT_BROWSER_IOS_UDID` | Device UDID (alternative to device name) |
1130
1121
 
1131
1122
  **Supported devices:** All iOS Simulators available in Xcode (iPhones, iPads), plus real iOS devices.
1132
1123
 
@@ -1137,6 +1128,7 @@ agent-browser open https://example.com
1137
1128
  Appium also supports real iOS devices connected via USB. This requires additional one-time setup:
1138
1129
 
1139
1130
  **1. Get your device UDID:**
1131
+
1140
1132
  ```bash
1141
1133
  xcrun xctrace list devices
1142
1134
  # or
@@ -1144,6 +1136,7 @@ system_profiler SPUSBDataType | grep -A 5 "iPhone\|iPad"
1144
1136
  ```
1145
1137
 
1146
1138
  **2. Sign WebDriverAgent (one-time):**
1139
+
1147
1140
  ```bash
1148
1141
  # Open the WebDriverAgent Xcode project
1149
1142
  cd ~/.appium/node_modules/appium-xcuitest-driver/node_modules/appium-webdriveragent
@@ -1151,12 +1144,14 @@ open WebDriverAgent.xcodeproj
1151
1144
  ```
1152
1145
 
1153
1146
  In Xcode:
1147
+
1154
1148
  - Select the `WebDriverAgentRunner` target
1155
1149
  - Go to Signing & Capabilities
1156
1150
  - Select your Team (requires Apple Developer account, free tier works)
1157
1151
  - Let Xcode manage signing automatically
1158
1152
 
1159
1153
  **3. Use with agent-browser:**
1154
+
1160
1155
  ```bash
1161
1156
  # Connect device via USB, then:
1162
1157
  agent-browser -p ios --device "<DEVICE_UDID>" open https://example.com
@@ -1166,11 +1161,44 @@ agent-browser -p ios --device "John's iPhone" open https://example.com
1166
1161
  ```
1167
1162
 
1168
1163
  **Real device notes:**
1164
+
1169
1165
  - First run installs WebDriverAgent to the device (may require Trust prompt)
1170
1166
  - Device must be unlocked and connected via USB
1171
1167
  - Slightly slower initial connection than simulator
1172
1168
  - Tests against real Safari performance and behavior
1173
1169
 
1170
+ ### Browserless
1171
+
1172
+ [Browserless](https://browserless.io) provides cloud browser infrastructure with a Sessions API. Use it when running agent-browser in environments where a local browser isn't available.
1173
+
1174
+ To enable Browserless, use the `-p` flag:
1175
+
1176
+ ```bash
1177
+ export BROWSERLESS_API_KEY="your-api-token"
1178
+ agent-browser -p browserless open https://example.com
1179
+ ```
1180
+
1181
+ Or use environment variables for CI/scripts:
1182
+
1183
+ ```bash
1184
+ export AGENT_BROWSER_PROVIDER=browserless
1185
+ export BROWSERLESS_API_KEY="your-api-token"
1186
+ agent-browser open https://example.com
1187
+ ```
1188
+
1189
+ Optional configuration via environment variables:
1190
+
1191
+ | Variable | Description | Default |
1192
+ | -------------------------- | ------------------------------------------------ | --------------------------------------- |
1193
+ | `BROWSERLESS_API_URL` | Base API URL (for custom regions or self-hosted) | `https://production-sfo.browserless.io` |
1194
+ | `BROWSERLESS_BROWSER_TYPE` | Type of browser to use (chromium or chrome) | chromium |
1195
+ | `BROWSERLESS_TTL` | Session TTL in milliseconds | `300000` |
1196
+ | `BROWSERLESS_STEALTH` | Enable stealth mode (`true`/`false`) | `true` |
1197
+
1198
+ When enabled, agent-browser connects to a Browserless cloud session instead of launching a local browser. All commands work identically.
1199
+
1200
+ Get your API token from the [Browserless Dashboard](https://browserless.io).
1201
+
1174
1202
  ### Browserbase
1175
1203
 
1176
1204
  [Browserbase](https://browserbase.com) provides remote browser infrastructure to make deployment of agentic browsing agents easy. Use it when running the agent-browser CLI in an environment where a local browser isn't feasible.
@@ -1238,12 +1266,12 @@ agent-browser open https://example.com
1238
1266
 
1239
1267
  Optional configuration via environment variables:
1240
1268
 
1241
- | Variable | Description | Default |
1242
- |----------|-------------|---------|
1243
- | `KERNEL_HEADLESS` | Run browser in headless mode (`true`/`false`) | `false` |
1244
- | `KERNEL_STEALTH` | Enable stealth mode to avoid bot detection (`true`/`false`) | `true` |
1245
- | `KERNEL_TIMEOUT_SECONDS` | Session timeout in seconds | `300` |
1246
- | `KERNEL_PROFILE_NAME` | Browser profile name for persistent cookies/logins (created if it doesn't exist) | (none) |
1269
+ | Variable | Description | Default |
1270
+ | ------------------------ | -------------------------------------------------------------------------------- | ------- |
1271
+ | `KERNEL_HEADLESS` | Run browser in headless mode (`true`/`false`) | `false` |
1272
+ | `KERNEL_STEALTH` | Enable stealth mode to avoid bot detection (`true`/`false`) | `true` |
1273
+ | `KERNEL_TIMEOUT_SECONDS` | Session timeout in seconds | `300` |
1274
+ | `KERNEL_PROFILE_NAME` | Browser profile name for persistent cookies/logins (created if it doesn't exist) | (none) |
1247
1275
 
1248
1276
  When enabled, agent-browser connects to a Kernel cloud session instead of launching a local browser. All commands work identically.
1249
1277
 
Binary file
Binary file
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,11 +1,9 @@
1
1
  {
2
2
  "name": "agent-browser",
3
- "version": "0.18.0",
3
+ "version": "0.20.0",
4
4
  "description": "Headless browser automation CLI for AI agents",
5
5
  "type": "module",
6
- "main": "dist/daemon.js",
7
6
  "files": [
8
- "dist",
9
7
  "bin",
10
8
  "scripts",
11
9
  "skills"
@@ -17,7 +15,8 @@
17
15
  "browser",
18
16
  "automation",
19
17
  "headless",
20
- "playwright",
18
+ "chrome",
19
+ "cdp",
21
20
  "cli",
22
21
  "agent"
23
22
  ],
@@ -30,55 +29,22 @@
30
29
  "url": "https://github.com/vercel-labs/agent-browser/issues"
31
30
  },
32
31
  "homepage": "https://github.com/vercel-labs/agent-browser#readme",
33
- "dependencies": {
34
- "node-simctl": "^7.4.0",
35
- "playwright-core": "^1.57.0",
36
- "webdriverio": "^9.15.0",
37
- "ws": "^8.19.0",
38
- "zod": "^3.22.4"
39
- },
40
32
  "devDependencies": {
41
- "@anthropic-ai/claude-agent-sdk": "^0.2.52",
42
- "@changesets/cli": "^2.29.8",
43
- "@types/node": "^20.10.0",
44
- "@types/ws": "^8.18.1",
45
- "husky": "^9.1.7",
46
- "lint-staged": "^15.2.11",
47
- "playwright": "^1.57.0",
48
- "prettier": "^3.7.4",
49
- "tsx": "^4.6.0",
50
- "typescript": "^5.3.0",
51
- "vitest": "^4.0.16"
52
- },
53
- "lint-staged": {
54
- "src/**/*.ts": "prettier --write"
33
+ "@changesets/cli": "^2.29.8"
55
34
  },
56
35
  "scripts": {
57
36
  "version:sync": "node scripts/sync-version.js",
58
37
  "version": "npm run version:sync && git add cli/Cargo.toml",
59
- "build": "tsc",
60
38
  "build:native": "npm run version:sync && cargo build --release --manifest-path cli/Cargo.toml && node scripts/copy-native.js",
61
39
  "build:linux": "npm run version:sync && docker compose -f docker/docker-compose.yml run --rm build-linux",
62
40
  "build:macos": "npm run version:sync && (cargo build --release --manifest-path cli/Cargo.toml --target aarch64-apple-darwin & cargo build --release --manifest-path cli/Cargo.toml --target x86_64-apple-darwin & wait) && cp cli/target/aarch64-apple-darwin/release/agent-browser bin/agent-browser-darwin-arm64 && cp cli/target/x86_64-apple-darwin/release/agent-browser bin/agent-browser-darwin-x64",
63
41
  "build:windows": "npm run version:sync && docker compose -f docker/docker-compose.yml run --rm build-windows",
64
42
  "build:all-platforms": "npm run version:sync && (npm run build:linux & npm run build:windows & wait) && npm run build:macos",
65
43
  "build:docker": "docker build -t agent-browser-builder -f docker/Dockerfile.build .",
66
- "release": "npm run version:sync && npm run build && npm run build:all-platforms && npm publish",
67
- "start": "node dist/daemon.js",
68
- "dev": "tsx src/daemon.ts",
69
- "typecheck": "tsc --noEmit",
70
- "format": "prettier --write 'src/**/*.ts'",
71
- "format:check": "prettier --check 'src/**/*.ts'",
72
- "test": "vitest run",
73
- "test:watch": "vitest",
74
- "test:e2e:dogfood": "vitest run test/e2e/dogfood.eval.ts",
75
- "bench": "pnpm build && tsx test/benchmarks/run.ts",
76
- "bench:node": "pnpm build && tsx test/benchmarks/run.ts --node-only",
77
- "bench:native": "pnpm build && tsx test/benchmarks/run.ts --native-only",
78
- "bench:engine": "pnpm build && tsx test/benchmarks/run.ts --engine",
44
+ "release": "npm run version:sync && npm run build:all-platforms && npm publish",
79
45
  "postinstall": "node scripts/postinstall.js",
80
46
  "changeset": "changeset",
81
47
  "ci:version": "changeset version && pnpm run version:sync && pnpm install --no-frozen-lockfile",
82
- "ci:publish": "pnpm run version:sync && pnpm run build && changeset publish"
48
+ "ci:publish": "pnpm run version:sync && changeset publish"
83
49
  }
84
50
  }