@dyyz1993/agent-browser 0.9.2 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. package/dist/__tests__/utils/parseCli.d.ts +1 -0
  2. package/dist/__tests__/utils/parseCli.d.ts.map +1 -1
  3. package/dist/__tests__/utils/parseCli.js +18 -10
  4. package/dist/__tests__/utils/parseCli.js.map +1 -1
  5. package/dist/actions.d.ts.map +1 -1
  6. package/dist/actions.js +63 -3
  7. package/dist/actions.js.map +1 -1
  8. package/dist/browser.d.ts +46 -2
  9. package/dist/browser.d.ts.map +1 -1
  10. package/dist/browser.js +343 -13
  11. package/dist/browser.js.map +1 -1
  12. package/dist/cli/commands.d.ts.map +1 -1
  13. package/dist/cli/commands.js +8 -3
  14. package/dist/cli/commands.js.map +1 -1
  15. package/dist/cli/connection.d.ts.map +1 -1
  16. package/dist/cli/connection.js +39 -1
  17. package/dist/cli/connection.js.map +1 -1
  18. package/dist/cli/help.d.ts.map +1 -1
  19. package/dist/cli/help.js +27 -20
  20. package/dist/cli/help.js.map +1 -1
  21. package/dist/cli/output.d.ts.map +1 -1
  22. package/dist/cli/output.js +5 -0
  23. package/dist/cli/output.js.map +1 -1
  24. package/dist/cli.js +20 -0
  25. package/dist/cli.js.map +1 -1
  26. package/dist/daemon.d.ts.map +1 -1
  27. package/dist/daemon.js +147 -1
  28. package/dist/daemon.js.map +1 -1
  29. package/dist/message-bridge.d.ts.map +1 -1
  30. package/dist/message-bridge.js +22 -4
  31. package/dist/message-bridge.js.map +1 -1
  32. package/dist/openapi.d.ts +22 -0
  33. package/dist/openapi.d.ts.map +1 -0
  34. package/dist/openapi.js +382 -0
  35. package/dist/openapi.js.map +1 -0
  36. package/dist/protocol.d.ts.map +1 -1
  37. package/dist/protocol.js +18 -0
  38. package/dist/protocol.js.map +1 -1
  39. package/dist/recorder/inject.js +61 -134
  40. package/dist/stream-server-standalone.d.ts +10 -0
  41. package/dist/stream-server-standalone.d.ts.map +1 -1
  42. package/dist/stream-server-standalone.js +594 -74
  43. package/dist/stream-server-standalone.js.map +1 -1
  44. package/dist/stream-server.d.ts +67 -2
  45. package/dist/stream-server.d.ts.map +1 -1
  46. package/dist/stream-server.js +371 -51
  47. package/dist/stream-server.js.map +1 -1
  48. package/dist/swagger-ui.d.ts +6 -0
  49. package/dist/swagger-ui.d.ts.map +1 -0
  50. package/dist/swagger-ui.js +51 -0
  51. package/dist/swagger-ui.js.map +1 -0
  52. package/dist/test-live.d.ts +2 -0
  53. package/dist/test-live.d.ts.map +1 -0
  54. package/dist/test-live.js +333 -0
  55. package/dist/test-live.js.map +1 -0
  56. package/dist/types.d.ts +7 -1
  57. package/dist/types.d.ts.map +1 -1
  58. package/dist/types.js.map +1 -1
  59. package/dist/viewer-html.d.ts.map +1 -1
  60. package/dist/viewer-html.js +270 -58
  61. package/dist/viewer-html.js.map +1 -1
  62. package/dist/viewer-script.d.ts +20 -2
  63. package/dist/viewer-script.d.ts.map +1 -1
  64. package/dist/viewer-script.js +911 -154
  65. package/dist/viewer-script.js.map +1 -1
  66. package/package.json +1 -1
  67. package/scripts/postinstall.js +6 -32
  68. package/scripts/test-cli-help.sh +51 -0
  69. package/scripts/verify-form.sh +67 -0
  70. package/scripts/verify-login.sh +65 -0
  71. package/scripts/verify-recording.sh +80 -0
  72. package/scripts/verify-upload.sh +41 -0
  73. package/skills/agent-browser/SKILL.md +297 -160
  74. package/skills/agent-browser/references/commands.md +3 -0
  75. package/skills/agent-browser/references/mobile-viewer.md +188 -0
  76. package/skills/agent-browser/references/network-monitoring.md +232 -0
  77. package/skills/agent-browser/references/recorder.md +319 -0
  78. package/skills/agent-browser/references/viewer-mode.md +148 -0
  79. package/skills/agent-browser/templates/api-interception.sh +3 -1
  80. package/skills/agent-browser/templates/data-extraction.sh +8 -4
  81. package/skills/agent-browser/templates/form-automation.sh +18 -23
  82. package/skills/agent-browser/templates/network-intercept-crawl.sh +256 -0
  83. package/skills/agent-browser/templates/recorder-workflow.sh +51 -0
  84. package/skills/agent-browser/templates/viewer-remote.sh +41 -0
  85. package/dist/__tests__/test-iframe.d.ts +0 -2
  86. package/dist/__tests__/test-iframe.d.ts.map +0 -1
  87. package/dist/__tests__/test-iframe.js +0 -52
  88. package/dist/__tests__/test-iframe.js.map +0 -1
  89. package/dist/cli-new.d.ts +0 -3
  90. package/dist/cli-new.d.ts.map +0 -1
  91. package/dist/cli-new.js +0 -308
  92. package/dist/cli-new.js.map +0 -1
  93. package/dist/cli-old.d.ts +0 -3
  94. package/dist/cli-old.d.ts.map +0 -1
  95. package/dist/cli-old.js +0 -1101
  96. package/dist/cli-old.js.map +0 -1
  97. package/dist/recorder/binding.d.ts +0 -24
  98. package/dist/recorder/binding.d.ts.map +0 -1
  99. package/dist/recorder/binding.js +0 -215
  100. package/dist/recorder/binding.js.map +0 -1
  101. package/dist/recorder/index.d.ts +0 -4
  102. package/dist/recorder/index.d.ts.map +0 -1
  103. package/dist/recorder/index.js +0 -4
  104. package/dist/recorder/index.js.map +0 -1
  105. package/dist/recorder/recorder.d.ts +0 -19
  106. package/dist/recorder/recorder.d.ts.map +0 -1
  107. package/dist/recorder/recorder.js +0 -101
  108. package/dist/recorder/recorder.js.map +0 -1
  109. package/dist/recorder/store.d.ts +0 -22
  110. package/dist/recorder/store.d.ts.map +0 -1
  111. package/dist/recorder/store.js +0 -150
  112. package/dist/recorder/store.js.map +0 -1
  113. package/dist/recorder/types.d.ts +0 -73
  114. package/dist/recorder/types.d.ts.map +0 -1
  115. package/dist/recorder/types.js +0 -5
  116. package/dist/recorder/types.js.map +0 -1
@@ -1,12 +1,12 @@
1
1
  ---
2
2
  name: agent-browser
3
- description: Browser automation CLI for AI agents. Use when the user needs to interact with websites, including navigating pages, filling forms, clicking buttons, taking screenshots, extracting data, testing web apps, or automating any browser task. Triggers include requests to "open a website", "fill out a form", "click a button", "take a screenshot", "scrape data from a page", "test this web app", "login to a site", "automate browser actions", or any task requiring programmatic web interaction.
3
+ description: Browser automation CLI for AI agents. Use when the user needs to interact with websites, including navigating pages, filling forms, clicking buttons, taking screenshots, extracting data, testing web apps, using viewer/streaming mode, controlling a browser remotely from a mobile device, or automating any browser task. Triggers include requests to "open a website", "fill out a form", "click a button", "take a screenshot", "scrape data from a page", "test this web app", "login to a site", "automate browser actions", "view remote browser", "mobile browsing", or any task requiring programmatic web interaction.
4
4
  allowed-tools: Bash(agent-browser:*)
5
5
  ---
6
6
 
7
7
  # Browser Automation with agent-browser
8
8
 
9
- ## Core Workflow
9
+ ## Quick Start
10
10
 
11
11
  Every browser automation follows this pattern:
12
12
 
@@ -27,112 +27,191 @@ agent-browser wait --load networkidle
27
27
  agent-browser snapshot -i # Check result
28
28
  ```
29
29
 
30
- ## Working with Iframes
30
+ ## Essential Commands
31
31
 
32
- Use `--in-frame` to operate inside iframes. The path uses iframe name/id or index:
32
+ ### Navigation
33
33
 
34
34
  ```bash
35
- # Direct iframe by ID or name
36
- agent-browser snapshot --in-frame "#my-iframe"
35
+ agent-browser open <url> # Navigate (aliases: goto, navigate)
36
+ agent-browser back # Go back
37
+ agent-browser forward # Go forward
38
+ agent-browser reload # Reload page
39
+ agent-browser close # Close browser (aliases: quit, exit)
40
+ ```
37
41
 
38
- # Nested iframe using path (name/id or index)
39
- agent-browser snapshot --in-frame "#outer-frame/inner-frame"
42
+ ### Element Interaction
40
43
 
41
- # Example: Click element inside nested cross-origin iframe
42
- agent-browser open https://example.com
43
- agent-browser snapshot --in-frame "#iframe-container"
44
- agent-browser click @e1 --in-frame "#iframe-container/login-frame"
45
- agent-browser fill #username "admin" --in-frame "#iframe-container/login-frame"
46
- agent-browser get value #username --in-frame "#iframe-container/login-frame"
44
+ ```bash
45
+ agent-browser click @e1 # Click element
46
+ agent-browser dblclick @e1 # Double-click
47
+ agent-browser fill @e2 "text" # Clear and type text
48
+ agent-browser type @e2 "text" # Type without clearing
49
+ agent-browser select @e1 "option" # Select dropdown option
50
+ agent-browser check @e1 # Check checkbox
51
+ agent-browser uncheck @e1 # Uncheck checkbox
52
+ agent-browser press Enter # Press key (alias: key)
53
+ agent-browser keydown <key> # Raw key down (release with: agent-browser keyup <key>)
54
+ agent-browser hover @e1 # Hover over element
55
+ agent-browser focus @e1 # Focus element
56
+ agent-browser drag @e1 @e2 # Drag from e1 to e2
57
+ agent-browser upload @e1 "/path" # Upload file
58
+ agent-browser download @e1 "/path" # Download resource
47
59
  ```
48
60
 
49
- ### Frame Path Syntax
61
+ ### Scrolling
50
62
 
51
- The frame path supports:
52
- - **ID/Name**: `#frame-id` or `#frame-name`
53
- - **Index**: `#0`, `#1` (by position)
54
- - **Nested**: `#parent/child/grandchild`
55
-
56
- Examples:
57
- - `#my-iframe` - Single iframe
58
- - `#0` - First iframe
59
- - `#outer-iframe/login-frame` - Nested iframes by name
60
- - `#0/1` - First iframe's second child
63
+ ```bash
64
+ agent-browser scroll down 500 # Scroll pixels
65
+ agent-browser scrollintoview @e1 # Scroll element into view
66
+ ```
61
67
 
62
- ## Essential Commands
68
+ ### Snapshot & Inspection
63
69
 
64
70
  ```bash
65
- # Navigation
66
- agent-browser open <url> # Navigate (aliases: goto, navigate)
67
- agent-browser close # Close browser
68
-
69
- # Snapshot
70
71
  agent-browser snapshot -i # Interactive elements with refs (recommended)
71
- agent-browser snapshot -i -C # Include cursor-interactive elements (divs with onclick, cursor:pointer)
72
+ agent-browser snapshot -i -C # Include cursor-interactive elements
72
73
  agent-browser snapshot -s "#selector" # Scope to CSS selector
73
74
  agent-browser snapshot -s "body" --path # Include xpath and cssPath in refs
74
75
  agent-browser snapshot -s "body" --attrs # Include element attributes in refs
76
+ agent-browser snapshot -i --json # JSON output for parsing
77
+ ```
75
78
 
76
- # Interaction (use @refs from snapshot)
77
- agent-browser click @e1 # Click element
78
- agent-browser fill @e2 "text" # Clear and type text
79
- agent-browser type @e2 "text" # Type without clearing
80
- agent-browser select @e1 "option" # Select dropdown option
81
- agent-browser check @e1 # Check checkbox
82
- agent-browser press Enter # Press key
83
- agent-browser scroll down 500 # Scroll page
79
+ ### Getting Information
84
80
 
85
- # Get information
86
- agent-browser get text @e1 # Get element text
81
+ ```bash
82
+ agent-browser get text @e1 # Get element text content
87
83
  agent-browser get url # Get current URL
88
84
  agent-browser get title # Get page title
85
+ agent-browser get count ".item" # Count matching elements
86
+ agent-browser get box @e1 # Bounding box {x,y,width,height}
87
+ agent-browser get styles @e1 # Computed styles
88
+ agent-browser is visible @e1 # Visibility check
89
+ agent-browser is enabled @e1 # Enabled check
90
+ agent-browser is checked @e1 # Checked state
91
+ ```
92
+
93
+ ### Waiting
89
94
 
90
- # Wait
91
- agent-browser wait @e1 # Wait for element
95
+ ```bash
96
+ agent-browser wait @e1 # Wait for element to appear
92
97
  agent-browser wait --load networkidle # Wait for network idle
93
- agent-browser wait --url "**/page" # Wait for URL pattern
94
- agent-browser wait 2000 # Wait milliseconds
98
+ agent-browser wait --load domcontentloaded # Wait for DOM ready
99
+ agent-browser wait --url "**/page" # Wait for URL pattern match
100
+ agent-browser wait --text "Hello" # Wait for text on page
101
+ agent-browser wait --fn "document.hidden === false" # Wait for JS expression
102
+ agent-browser wait --download # Wait for download to complete
103
+ agent-browser wait 2000 # Wait milliseconds (fixed delay)
104
+ agent-browser wait --request "api/data" # Wait for specific network request (background listener)
105
+ ```
95
106
 
96
- # Capture
107
+ ### Capture
108
+
109
+ ```bash
97
110
  agent-browser screenshot # Screenshot to temp dir
98
111
  agent-browser screenshot --full # Full page screenshot
112
+ agent-browser screenshot output.png # Save to file
99
113
  agent-browser pdf output.pdf # Save as PDF
100
114
  ```
101
115
 
102
- ## Human-like Mouse Movement
116
+ ### Network Monitoring
103
117
 
104
- Enable globally via environment variable to simulate natural mouse trajectories:
118
+ ```bash
119
+ agent-browser network requests # View all network requests
120
+ agent-browser network requests --filter "**/api/**" # Filter by URL pattern
121
+ agent-browser network requests --clear # Clear request history
122
+ agent-browser network requests --capture-response # Capture response bodies
123
+ agent-browser network requests --capture-response --type json # Filter captured by content type
124
+ agent-browser network requests --output ./captures/ # Save captures to directory
125
+ agent-browser network route "**/api/**" --abort # Block requests
126
+ agent-browser network route "**/api/**" --body '{"users": []}' # Mock response
127
+ agent-browser network route "**/api/**" --status 404 # Mock status code
128
+ agent-browser network unroute "**/api/**" # Remove route
129
+ ```
130
+
131
+ See [network-monitoring.md](references/network-monitoring.md) for advanced patterns.
132
+
133
+ ### Tabs & Windows
134
+
135
+ ```bash
136
+ agent-browser tab list # List all tabs
137
+ agent-browser tab new # Open new tab
138
+ agent-browser tab close 2 # Close tab by index
139
+ agent-browser tab switch 0 # Switch to tab
140
+ agent-browser window new # Open new window
141
+ ```
142
+
143
+ ### Dialogs & Alerts
144
+
145
+ ```bash
146
+ agent-browser dialog accept # Accept alert/dialog
147
+ agent-browser dialog dismiss # Dismiss alert/dialog
148
+ ```
149
+
150
+ ### Browser State
151
+
152
+ ```bash
153
+ agent-browser state save auth.json # Save cookies/localStorage/session
154
+ agent-browser state clear # Clear all state
155
+ agent-browser storage session dump # Dump session storage
156
+ agent-browser storage session load # Load session storage
157
+ agent-browser cookies set name value domain # Set cookie
158
+ agent-browser cookies export # Export all cookies
159
+ ```
160
+
161
+ ### Debugging
162
+
163
+ ```bash
164
+ agent-browser console "1+1" # Evaluate JS in browser console
165
+ agent-browser errors # Show recent page errors
166
+ agent-browser highlight @e1 # Highlight element on page
167
+ agent-browser trace start # Start Chrome trace
168
+ agent-browser trace stop ./trace.json # Stop and save trace
169
+ ```
170
+
171
+ ### Session Management
105
172
 
106
173
  ```bash
107
- # Enable human mode (default: arc path type)
108
- export AGENT_BROWSER_HUMAN=1
109
-
110
- # Or specify path type
111
- export AGENT_BROWSER_HUMAN=bezier # Bezier curve with overshoot
112
- export AGENT_BROWSER_HUMAN=arc # Smooth arc (default, most natural)
113
- export AGENT_BROWSER_HUMAN=random # Random path with jitter
114
- export AGENT_BROWSER_HUMAN=linear # Straight line (fastest)
115
-
116
- # All interactions will use human-like movement
117
- agent-browser click @e1
118
- agent-browser fill @e1 "text"
119
- agent-browser type @e1 "text"
120
- agent-browser hover @e1
121
- agent-browser dblclick @e1
122
-
123
- # Wait with mouse wandering (when human mode enabled)
124
- agent-browser wait 3000 # Wanders mouse while waiting
125
-
126
- # Disable human mode
127
- unset AGENT_BROWSER_HUMAN
174
+ agent-browser --session site1 open https://a.com # Named session
175
+ agent-browser --session site2 open https://b.com # Parallel session
176
+ agent-browser session list # List active sessions
177
+ agent-browser connect ws://localhost:9222 # Connect to remote CDP browser
178
+ agent-browser kill # Kill daemon process
179
+ agent-browser config # Show/edit config
180
+ agent-browser config --json # Show config as JSON
128
181
  ```
129
182
 
130
- **Features:**
131
- - Continues from last mouse position for realistic trajectories
132
- - Natural acceleration/deceleration curves
133
- - Randomized delays between movements
134
- - Four trajectory types: `arc` (default), `bezier`, `random`, `linear`
135
- - `wait <ms>` automatically does mouse wandering when enabled
183
+ ## Global Options
184
+
185
+ These flags work with most commands:
186
+
187
+ | Flag | Description |
188
+ | -------------------------- | ---------------------------------------------- |
189
+ | `--session <name>` | Named browser session |
190
+ | `--json` | JSON output format |
191
+ | `--headed` | Show visible browser window |
192
+ | `--cdp <url>` | Connect via Chrome DevTools Protocol directly |
193
+ | `-p/--provider` | Provider: ios, browserbase, kernel, browseruse |
194
+ | `--proxy <url>` | HTTP/SOCKS5 proxy |
195
+ | `--proxy-bypass <rules>` | Proxy bypass rules |
196
+ | `--headers 'K: V'` | Extra HTTP headers per request |
197
+ | `--state <path>` | Restore browser state from file |
198
+ | `--profile <path>` | Chrome profile directory |
199
+ | `--args "<args>"` | Extra Chromium launch arguments |
200
+ | `--user-agent <ua>` | Custom User-Agent string |
201
+ | `--executable-path <path>` | Browser binary path |
202
+ | `--extension <path>` | Load .crx Chrome extension |
203
+ | `--ignore-https-errors` | Ignore HTTPS certificate errors |
204
+ | `--allow-file-access` | Allow file:// URLs |
205
+ | `--timeout <ms>` | Global operation timeout |
206
+ | `--debug` | Verbose debug logging |
207
+
208
+ Examples:
209
+
210
+ ```bash
211
+ agent-browser --proxy http://proxy:8080 open https://example.com
212
+ agent-browser --headed --debug open https://example.com
213
+ agent-browser --user-agent "MyBot/1.0" open https://example.com
214
+ ```
136
215
 
137
216
  ## Common Patterns
138
217
 
@@ -161,7 +240,7 @@ agent-browser click @e3
161
240
  agent-browser wait --url "**/dashboard"
162
241
  agent-browser state save auth.json
163
242
 
164
- # Reuse in future sessions (use --state flag)
243
+ # Reuse in future sessions
165
244
  agent-browser --state auth.json open https://app.example.com/dashboard
166
245
  ```
167
246
 
@@ -170,46 +249,32 @@ agent-browser --state auth.json open https://app.example.com/dashboard
170
249
  ```bash
171
250
  agent-browser open https://example.com/products
172
251
  agent-browser snapshot -i
173
- agent-browser get text @e5 # Get specific element text
174
- agent-browser get text body > page.txt # Get all page text
175
-
176
- # JSON output for parsing
177
- agent-browser snapshot -i --json
178
- agent-browser get text @e1 --json
252
+ agent-browser get text @e5 # Specific element
253
+ agent-browser get text body > page.txt # All page text
254
+ agent-browser snapshot -i --json # JSON for parsing
255
+ agent-browser get text @e1 --json # Element as JSON
179
256
  ```
180
257
 
181
- ### API Interception
258
+ ### API Interception (Passive Capture)
182
259
 
183
- Passively capture API responses without making direct requests. Useful for sites with anti-scraping measures.
260
+ Capture API responses without making direct requests:
184
261
 
185
262
  ```bash
186
- # 1. Open blank page first
187
263
  agent-browser open "about:blank"
188
-
189
- # 2. Start request listener in background
190
264
  (agent-browser wait --request "api/users" --timeout 30000 > response.json) &
191
- WAIT_PID=$!
192
265
  sleep 1
193
-
194
- # 3. Navigate to trigger the API call
195
266
  agent-browser open "https://example.com/user/profile"
196
-
197
- # 4. Wait for response
198
- wait $WAIT_PID
199
-
200
- # 5. Process captured data
267
+ wait $!
201
268
  jq '.body' response.json
202
269
  ```
203
270
 
204
- Example: Capture Douyin user videos
271
+ ### Network Monitoring & API Mocking
272
+
205
273
  ```bash
206
- agent-browser open "about:blank"
207
- (agent-browser wait --request "aweme/post" --timeout 30000 > /tmp/douyin.json) &
208
- sleep 1
209
- agent-browser open "https://www.douyin.com/user/xxx"
210
- sleep 5
211
- wait
212
- jq '.body.aweme_list[:10] | map({id, desc, stats})' /tmp/douyin.json
274
+ agent-browser network requests --filter "**/api/**"
275
+ agent-browser network route "**/api/users" --body '{"users": []}'
276
+ agent-browser network route "**/ads/**" --abort
277
+ agent-browser network unroute "**/api/users"
213
278
  ```
214
279
 
215
280
  ### Parallel Sessions
@@ -217,94 +282,166 @@ jq '.body.aweme_list[:10] | map({id, desc, stats})' /tmp/douyin.json
217
282
  ```bash
218
283
  agent-browser --session site1 open https://site-a.com
219
284
  agent-browser --session site2 open https://site-b.com
220
-
221
285
  agent-browser --session site1 snapshot -i
222
- agent-browser --session site2 snapshot -i
223
-
224
286
  agent-browser session list
225
287
  ```
226
288
 
227
- ### Visual Browser (Debugging)
289
+ ### Local Files (PDFs, HTML)
228
290
 
229
291
  ```bash
230
- agent-browser --headed open https://example.com
231
- agent-browser highlight @e1 # Highlight element
232
- agent-browser record start demo.webm # Record session
292
+ agent-browser --allow-file-access open file:///path/to/doc.pdf
293
+ agent-browser --allow-file-access open file:///path/to/page.html
294
+ agent-browser screenshot output.png
233
295
  ```
234
296
 
235
- ### Local Files (PDFs, HTML)
297
+ ### Working with Iframes
298
+
299
+ Use `--in-frame` to operate inside iframes:
236
300
 
237
301
  ```bash
238
- # Open local files with file:// URLs
239
- agent-browser --allow-file-access open file:///path/to/document.pdf
240
- agent-browser --allow-file-access open file:///path/to/page.html
241
- agent-browser screenshot output.png
302
+ agent-browser snapshot --in-frame "#my-iframe"
303
+ agent-browser snapshot --in-frame "#outer/inner" # Nested path
304
+ agent-browser click @e1 --in-frame "#container/frame"
305
+ agent-browser fill "#user" "admin" --in-frame "#container/login-frame" # Quote CSS selectors: an unquoted # starts a shell comment
306
+ ```
307
+
308
+ Frame path syntax: `#id-or-name`, `#index` (position), `#parent/child` (nested).
309
+
310
+ ### Semantic Locators (Alternative to Refs)
311
+
312
+ When refs are unavailable, use semantic locators:
313
+
314
+ ```bash
315
+ agent-browser find text "Sign In" click
316
+ agent-browser find label "Email" fill "user@test.com"
317
+ agent-browser find role button click --name "Submit"
318
+ agent-browser find placeholder "Search" type "query"
319
+ agent-browser find testid "submit-btn" click
242
320
  ```
243
321
 
244
- ### iOS Simulator (Mobile Safari)
322
+ ### Proxy Configuration
245
323
 
246
324
  ```bash
247
- # List available iOS simulators
248
- agent-browser device list
325
+ agent-browser --proxy http://proxy:8080 open https://example.com
326
+ agent-browser --proxy socks5://proxy:1080 open https://example.com
327
+ agent-browser --proxy http://user:pass@proxy:8080 --proxy-bypass "localhost,*.internal" open https://example.com
328
+ ```
249
329
 
250
- # Launch Safari on a specific device
251
- agent-browser -p ios --device "iPhone 16 Pro" open https://example.com
330
+ ## Advanced Features
252
331
 
253
- # Same workflow as desktop - snapshot, interact, re-snapshot
254
- agent-browser -p ios snapshot -i
255
- agent-browser -p ios click @e1 # Click/tap element
256
- agent-browser -p ios fill @e2 "text"
257
- agent-browser -p ios scroll down 500 # Scroll gesture
332
+ ### Recording & Replaying Workflows
258
333
 
259
- # Take screenshot
260
- agent-browser -p ios screenshot mobile.png
334
+ For test automation and workflow capture:
261
335
 
262
- # Close session (shuts down simulator)
263
- agent-browser -p ios close
336
+ ```bash
337
+ agent-browser recorder start --session my-test
338
+ agent-browser open https://example.com/form
339
+ agent-browser snapshot -i
340
+ agent-browser fill @e1 "user@example.com"
341
+ agent-browser click @e3
342
+ agent-browser recorder stop --output test-workflow.yaml
343
+ agent-browser recorder replay test-workflow.yaml
264
344
  ```
265
345
 
266
- **Requirements:** macOS with Xcode, Appium (`npm install -g appium && appium driver install xcuitest`)
346
+ See [recorder.md](references/recorder.md) for details.
267
347
 
268
- **Real devices:** Works with physical iOS devices if pre-configured. Use `--device "<UDID>"` where UDID is from `xcrun xctrace list devices`.
348
+ ### Human-like Mouse Movement
269
349
 
270
- **Note:** iOS uses standard commands like `click`, `fill`, `scroll` instead of mobile-specific aliases like `tap` or `swipe`.
350
+ Simulate natural mouse trajectories via environment variable:
271
351
 
272
- ## Ref Lifecycle (Important)
352
+ ```bash
353
+ export AGENT_BROWSER_HUMAN=1 # Enable (default: arc path)
354
+ export AGENT_BROWSER_HUMAN=bezier # Bezier curve with overshoot
355
+ export AGENT_BROWSER_HUMAN=random # Random path with jitter
356
+ export AGENT_BROWSER_HUMAN=linear # Straight line (fastest)
357
+
358
+ agent-browser click @e1 # Uses human trajectory
359
+ agent-browser wait 3000 # Mouse wandering while waiting
360
+ unset AGENT_BROWSER_HUMAN # Disable
361
+ ```
273
362
 
274
- Refs (`@e1`, `@e2`, etc.) are invalidated when the page changes. Always re-snapshot after:
363
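+ Features: continues from the last mouse position, natural acceleration/deceleration curves, 4 trajectory types (`arc` (default), `bezier`, `random`, `linear`), and automatic mouse wandering during `wait <ms>`.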
275
364
 
276
- - Clicking links or buttons that navigate
277
- - Form submissions
278
- - Dynamic content loading (dropdowns, modals)
365
+ ### Viewer / Streaming Mode
366
+
367
+ Real-time remote browser visualization with frame streaming over WebSocket.
279
368
 
280
369
  ```bash
281
- agent-browser click @e5 # Navigates to new page
282
- agent-browser snapshot -i # MUST re-snapshot
283
- agent-browser click @e1 # Use new refs
370
+ # Start viewer after opening a page
371
+ agent-browser open https://example.com
372
+ agent-browser viewer # Opens viewer URL in browser
373
+ agent-browser viewer --json # Get connection details as JSON
284
374
  ```
285
375
 
286
- **Important for Shell Scripts:** Refs are session-specific and cannot be used in standalone shell scripts. When converting interactive workflows to scripts, use semantic locators or CSS selectors instead. See [references/snapshot-refs.md](references/snapshot-refs.md#converting-to-shell-scripts) for details.
376
+ **Architecture:** Browser -> Daemon (IPC) -> Standalone Server (:5005) -> Viewer (WebSocket)
377
+
378
+ **Element Crop Mode:** Stream can be cropped to a specific DOM element's bounds. Coordinates auto-map to element-local space.
379
+
380
+ See [viewer-mode.md](references/viewer-mode.md) for architecture details, troubleshooting, and element mode.
381
+
382
+ ### Mobile Remote Control (Touch Devices)
383
+
384
+ When the viewer is opened on a phone/tablet, it automatically enters **mobile mode** with a touch-optimized UI:
385
+
386
+ - **Touchpad**: Bottom-area gesture surface (tap=click, drag=move cursor, long-press=drag, 2-finger=scroll)
387
+ - **Input Panel**: Tap remote input field -> local text input appears -> syncs to remote via `input_fill`
388
+ - **Virtual Keyboard Toolbar**: Tab, Arrows, Enter, Backspace, Escape
389
+ - **IME Support**: Chinese/Japanese composition (pinyin etc.) — intermediate input NOT sent to remote
390
+ - **DeviceMode**: Auto-detects device type, switches UI dynamically on resize/orientationchange/matchMedia
287
391
 
288
- ## Semantic Locators (Alternative to Refs)
392
+ See [mobile-viewer.md](references/mobile-viewer.md) for touchpad gestures, the input panel flow, and the DeviceMode architecture.
289
393
 
290
- When refs are unavailable or unreliable, use semantic locators:
394
+ ### iOS Simulator (Appium)
395
+
396
+ Native iOS automation via Xcode + Appium:
291
397
 
292
398
  ```bash
293
- agent-browser find text "Sign In" click
294
- agent-browser find label "Email" fill "user@test.com"
295
- agent-browser find role button click --name "Submit"
296
- agent-browser find placeholder "Search" type "query"
297
- agent-browser find testid "submit-btn" click
399
+ agent-browser device list # List simulators
400
+ agent-browser -p ios --device "iPhone 16 Pro" open https://example.com
401
+ agent-browser -p ios snapshot -i && agent-browser -p ios click @e1
402
+ agent-browser -p ios close # Shuts down simulator
403
+ ```
404
+
405
+ Requires: macOS with Xcode, plus Appium with the XCUITest driver (`npm install -g appium && appium driver install xcuitest`).
406
+
407
+ Note: Mobile viewer mode (above) works on ANY phone browser via web viewer — no simulator needed.
408
+
409
+ ### Cloud Browser Providers
410
+
411
+ Connect to managed browser services:
412
+
413
+ ```bash
414
+ BROWSERBASE_API_KEY=key agent-browser --provider browserbase open https://example.com
415
+ KERNEL_API_KEY=key agent-browser --provider kernel open https://example.com
416
+ BROWSERUSE_API_KEY=key agent-browser --provider browseruse open https://example.com
417
+ ```
418
+
419
+ Useful for: geo-distributed testing, IP diversity, team sharing, parallel scaling.
420
+
421
+ ## Ref Lifecycle (Important)
422
+
423
+ Refs (`@e1`, `@e2`) are invalidated when the page changes. Always re-snapshot after navigation, form submission, or dynamic content loading:
424
+
425
+ ```bash
426
+ agent-browser click @e5 # Navigates to new page
427
+ agent-browser snapshot -i # MUST re-snapshot
428
+ agent-browser click @e1 # Use new refs
298
429
  ```
299
430
 
300
- ## Deep-Dive Documentation
301
-
302
- | Reference | When to Use |
303
- |-----------|-------------|
304
- | [references/commands.md](references/commands.md) | Full command reference with all options |
305
- | [references/data-extraction.md](references/data-extraction.md) | **Data extraction patterns: DOM, JS variables, API interception, infinite scroll, iframe** |
306
- | [references/snapshot-refs.md](references/snapshot-refs.md) | Ref lifecycle, invalidation rules, troubleshooting |
307
- | [references/session-management.md](references/session-management.md) | Parallel sessions, state persistence, concurrent scraping |
308
- | [references/authentication.md](references/authentication.md) | Login flows, OAuth, 2FA handling, state reuse |
309
- | [references/video-recording.md](references/video-recording.md) | Recording workflows for debugging and documentation |
310
- | [references/proxy-support.md](references/proxy-support.md) | Proxy configuration, geo-testing, rotating proxies |
431
+ Refs are session-specific. For shell scripts, use semantic locators or CSS selectors instead. See [snapshot-refs.md](references/snapshot-refs.md).
432
+
433
+ ## Reference Docs
434
+
435
+ | Reference | Content |
436
+ | --------------------------------------------------------- | ------------------------------------------------------------- |
437
+ | [commands.md](references/commands.md) | Complete command reference with all options |
438
+ | [data-extraction.md](references/data-extraction.md) | DOM, JS variables, API interception, infinite scroll, iframe |
439
+ | [snapshot-refs.md](references/snapshot-refs.md) | Ref lifecycle, invalidation rules, shell script conversion |
440
+ | [session-management.md](references/session-management.md) | Parallel sessions, state persistence, concurrent scraping |
441
+ | [authentication.md](references/authentication.md) | Login flows, OAuth, 2FA handling, state reuse |
442
+ | [video-recording.md](references/video-recording.md) | Video recording for debugging |
443
+ | [recorder.md](references/recorder.md) | Action recording & replay for test automation |
444
+ | [proxy-support.md](references/proxy-support.md) | Proxy config, geo-testing, rotating proxies |
445
+ | [network-monitoring.md](references/network-monitoring.md) | Request monitoring, API mocking, request blocking |
446
+ | [viewer-mode.md](references/viewer-mode.md) | Streaming viewer, element crop, architecture, troubleshooting |
447
+ | [mobile-viewer.md](references/mobile-viewer.md) | Touchpad, input panel, IME/CJK support, DeviceMode |
@@ -363,6 +363,7 @@ agent-browser --full ... # Full page screenshot (-f)
363
363
  agent-browser --cdp <port> ... # Connect via Chrome DevTools Protocol
364
364
  agent-browser -p <provider> ... # Cloud browser provider (--provider)
365
365
  agent-browser --proxy <url> ... # Use proxy server
366
+ agent-browser --proxy-bypass <list> # Comma-separated bypass list for proxy
366
367
  agent-browser --headers <json> ... # HTTP headers scoped to URL's origin
367
368
  agent-browser --executable-path <p> # Custom browser executable
368
369
  agent-browser --extension <path> ... # Load browser extension (repeatable)
@@ -468,4 +469,6 @@ AGENT_BROWSER_PROVIDER="browserbase" # Cloud browser provider
468
469
  AGENT_BROWSER_STREAM_PORT="9223" # WebSocket streaming port
469
470
  AGENT_BROWSER_HOME="/path/to/agent-browser" # Custom install location
470
471
  AGENT_BROWSER_HUMAN="1" # Enable human-like mouse movement (1, bezier, arc, random, linear)
472
+ AGENT_BROWSER_PROXY="http://proxy:8080" # Proxy server URL
473
+ AGENT_BROWSER_PROXY_BYPASS="localhost,*.internal" # Comma-separated hosts that bypass the proxy
471
474
  ```