agent-browser 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -115,6 +115,8 @@ agent-browser drag <src> <tgt> # Drag and drop
115
115
  agent-browser upload <sel> <files> # Upload files
116
116
  agent-browser screenshot [path] # Take screenshot (--full for full page, saves to a temporary directory if no path)
117
117
  agent-browser screenshot --annotate # Annotated screenshot with numbered element labels
118
+ agent-browser screenshot --screenshot-dir ./shots # Save to custom directory
119
+ agent-browser screenshot --screenshot-format jpeg --screenshot-quality 80
118
120
  agent-browser pdf <path> # Save as PDF
119
121
  agent-browser snapshot # Accessibility tree with refs (best for AI)
120
122
  agent-browser eval <js> # Run JavaScript (-b for base64, --stdin for piped input)
@@ -165,6 +167,7 @@ agent-browser find nth <n> <sel> <action> [value] # Nth match
165
167
  **Options:** `--name <name>` (filter role by accessible name), `--exact` (require exact text match)
166
168
 
167
169
  **Examples:**
170
+
168
171
  ```bash
169
172
  agent-browser find role button click --name "Submit"
170
173
  agent-browser find text "Sign In" click
@@ -178,14 +181,27 @@ agent-browser find nth 2 "a" text
178
181
  ```bash
179
182
  agent-browser wait <selector> # Wait for element to be visible
180
183
  agent-browser wait <ms> # Wait for time (milliseconds)
181
- agent-browser wait --text "Welcome" # Wait for text to appear
184
+ agent-browser wait --text "Welcome" # Wait for text to appear (substring match)
182
185
  agent-browser wait --url "**/dash" # Wait for URL pattern
183
186
  agent-browser wait --load networkidle # Wait for load state
184
187
  agent-browser wait --fn "window.ready === true" # Wait for JS condition
188
+
189
+ # Wait for text/element to disappear
190
+ agent-browser wait --fn "!document.body.innerText.includes('Loading...')"
191
+ agent-browser wait "#spinner" --state hidden
185
192
  ```
186
193
 
187
194
  **Load states:** `load`, `domcontentloaded`, `networkidle`
188
195
 
196
+ ### Clipboard
197
+
198
+ ```bash
199
+ agent-browser clipboard read # Read text from clipboard
200
+ agent-browser clipboard write "Hello, World!" # Write text to clipboard
201
+ agent-browser clipboard copy # Copy current selection (Ctrl+C, or Cmd+C on macOS)
202
+ agent-browser clipboard paste # Paste from clipboard (Ctrl+V, or Cmd+V on macOS)
203
+ ```
204
+
189
205
  ### Mouse Control
190
206
 
191
207
  ```bash
@@ -375,6 +391,7 @@ agent-browser session
375
391
  ```
376
392
 
377
393
  Each session has its own:
394
+
378
395
  - Browser instance
379
396
  - Cookies and storage
380
397
  - Navigation history
@@ -396,6 +413,7 @@ AGENT_BROWSER_PROFILE=~/.myapp-profile agent-browser open myapp.com
396
413
  ```
397
414
 
398
415
  The profile directory stores:
416
+
399
417
  - Cookies and localStorage
400
418
  - IndexedDB data
401
419
  - Service workers
@@ -432,10 +450,10 @@ export AGENT_BROWSER_ENCRYPTION_KEY=<64-char-hex-key>
432
450
  agent-browser --session-name secure open example.com
433
451
  ```
434
452
 
435
- | Variable | Description |
436
- |----------|-------------|
437
- | `AGENT_BROWSER_SESSION_NAME` | Auto-save/load state persistence name |
438
- | `AGENT_BROWSER_ENCRYPTION_KEY` | 64-char hex key for AES-256-GCM encryption |
453
+ | Variable | Description |
454
+ | --------------------------------- | -------------------------------------------------- |
455
+ | `AGENT_BROWSER_SESSION_NAME` | Auto-save/load state persistence name |
456
+ | `AGENT_BROWSER_ENCRYPTION_KEY` | 64-char hex key for AES-256-GCM encryption |
439
457
  | `AGENT_BROWSER_STATE_EXPIRE_DAYS` | Auto-delete states older than N days (default: 30) |
440
458
 
441
459
  ## Security
@@ -449,14 +467,14 @@ agent-browser includes security features for safe AI agent deployments. All feat
449
467
  - **Action Confirmation** -- Require explicit approval for sensitive action categories: `--confirm-actions eval,download`
450
468
  - **Output Length Limits** -- Prevent context flooding: `--max-output 50000`
451
469
 
452
- | Variable | Description |
453
- |----------|-------------|
454
- | `AGENT_BROWSER_CONTENT_BOUNDARIES` | Wrap page output in boundary markers |
455
- | `AGENT_BROWSER_MAX_OUTPUT` | Max characters for page output |
456
- | `AGENT_BROWSER_ALLOWED_DOMAINS` | Comma-separated allowed domain patterns |
457
- | `AGENT_BROWSER_ACTION_POLICY` | Path to action policy JSON file |
458
- | `AGENT_BROWSER_CONFIRM_ACTIONS` | Action categories requiring confirmation |
459
- | `AGENT_BROWSER_CONFIRM_INTERACTIVE` | Enable interactive confirmation prompts |
470
+ | Variable | Description |
471
+ | ----------------------------------- | ---------------------------------------- |
472
+ | `AGENT_BROWSER_CONTENT_BOUNDARIES` | Wrap page output in boundary markers |
473
+ | `AGENT_BROWSER_MAX_OUTPUT` | Max characters for page output |
474
+ | `AGENT_BROWSER_ALLOWED_DOMAINS` | Comma-separated allowed domain patterns |
475
+ | `AGENT_BROWSER_ACTION_POLICY` | Path to action policy JSON file |
476
+ | `AGENT_BROWSER_CONFIRM_ACTIONS` | Action categories requiring confirmation |
477
+ | `AGENT_BROWSER_CONFIRM_INTERACTIVE` | Enable interactive confirmation prompts |
460
478
 
461
479
  See [Security documentation](https://agent-browser.dev/security) for details.
462
480
 
@@ -474,13 +492,13 @@ agent-browser snapshot -s "#main" # Scope to CSS selector
474
492
  agent-browser snapshot -i -c -d 5 # Combine options
475
493
  ```
476
494
 
477
- | Option | Description |
478
- |--------|-------------|
479
- | `-i, --interactive` | Only show interactive elements (buttons, links, inputs) |
480
- | `-C, --cursor` | Include cursor-interactive elements (cursor:pointer, onclick, tabindex) |
481
- | `-c, --compact` | Remove empty structural elements |
482
- | `-d, --depth <n>` | Limit tree depth |
483
- | `-s, --selector <sel>` | Scope to CSS selector |
495
+ | Option | Description |
496
+ | ---------------------- | ----------------------------------------------------------------------- |
497
+ | `-i, --interactive` | Only show interactive elements (buttons, links, inputs) |
498
+ | `-C, --cursor` | Include cursor-interactive elements (cursor:pointer, onclick, tabindex) |
499
+ | `-c, --compact` | Remove empty structural elements |
500
+ | `-d, --depth <n>` | Limit tree depth |
501
+ | `-s, --selector <sel>` | Scope to CSS selector |
484
502
 
485
503
  The `-C` flag is useful for modern web apps that use custom clickable elements (divs, spans) instead of standard buttons/links.
486
504
 
@@ -529,6 +547,9 @@ This is useful for multimodal AI models that can reason about visual layout, unl
529
547
  | `--json` | JSON output (for agents) |
530
548
  | `--full, -f` | Full page screenshot |
531
549
  | `--annotate` | Annotated screenshot with numbered element labels (or `AGENT_BROWSER_ANNOTATE` env) |
550
+ | `--screenshot-dir <path>` | Default screenshot output directory (or `AGENT_BROWSER_SCREENSHOT_DIR` env) |
551
+ | `--screenshot-quality <n>` | JPEG quality 0-100 (or `AGENT_BROWSER_SCREENSHOT_QUALITY` env) |
552
+ | `--screenshot-format <fmt>` | Screenshot format: `png`, `jpeg` (or `AGENT_BROWSER_SCREENSHOT_FORMAT` env) |
532
553
  | `--headed` | Show browser window (not headless) (or `AGENT_BROWSER_HEADED` env) |
533
554
  | `--cdp <port\|url>` | Connect via Chrome DevTools Protocol (port or WebSocket URL) |
534
555
  | `--auto-connect` | Auto-discover and connect to running Chrome (or `AGENT_BROWSER_AUTO_CONNECT` env) |
@@ -596,8 +617,8 @@ export AGENT_BROWSER_DEFAULT_TIMEOUT=45000
596
617
 
597
618
  > **Note:** Setting this above 30000 (30s) may cause EAGAIN errors on slow operations because the CLI's read timeout will expire before Playwright responds. The CLI retries transient errors automatically, but response times will increase.
598
619
 
599
- | Variable | Description |
600
- |----------|-------------|
620
+ | Variable | Description |
621
+ | ------------------------------- | ------------------------------------------------- |
601
622
  | `AGENT_BROWSER_DEFAULT_TIMEOUT` | Default Playwright timeout in ms (default: 25000) |
602
623
 
603
624
  ## Selectors
@@ -623,6 +644,7 @@ agent-browser hover @e4 # Hover the link
623
644
  ```
624
645
 
625
646
  **Why use refs?**
647
+
626
648
  - **Deterministic**: Ref points to exact element from snapshot
627
649
  - **Fast**: No DOM re-query needed
628
650
  - **AI-friendly**: Snapshot + ref workflow is optimal for LLMs
@@ -723,6 +745,7 @@ agent-browser open other-site.com
723
745
  ```
724
746
 
725
747
  This is useful for:
748
+
726
749
  - **Skipping login flows** - Authenticate via headers instead of UI
727
750
  - **Switching users** - Start new sessions with different auth tokens
728
751
  - **API testing** - Access protected endpoints directly
@@ -744,6 +767,7 @@ agent-browser set headers '{"X-Custom-Header": "value"}'
744
767
  ## Custom Browser Executable
745
768
 
746
769
  Use a custom browser executable instead of the bundled Chromium. This is useful for:
770
+
747
771
  - **Serverless deployment**: Use lightweight Chromium builds like `@sparticuz/chromium` (~50MB vs ~684MB)
748
772
  - **System browsers**: Use an existing Chrome/Chromium installation
749
773
  - **Custom builds**: Use modified browser builds
@@ -804,6 +828,7 @@ agent-browser screenshot report.png
804
828
  ```
805
829
 
806
830
  The `--allow-file-access` flag adds Chromium flags (`--allow-file-access-from-files`, `--allow-file-access`) that allow `file://` URLs to:
831
+
807
832
  - Load and render local files
808
833
  - Access other local files via JavaScript (XHR, fetch)
809
834
  - Load local resources (images, scripts, stylesheets)
@@ -831,10 +856,12 @@ agent-browser --cdp "wss://your-browser-service.com/cdp?token=..." snapshot
831
856
  ```
832
857
 
833
858
  The `--cdp` flag accepts either:
859
+
834
860
  - A port number (e.g., `9222`) for local connections via `http://localhost:{port}`
835
861
  - A full WebSocket URL (e.g., `wss://...` or `ws://...`) for remote browser services
836
862
 
837
863
  This enables control of:
864
+
838
865
  - Electron apps
839
866
  - Chrome/Chromium instances with remote debugging
840
867
  - WebView2 applications
@@ -854,10 +881,12 @@ AGENT_BROWSER_AUTO_CONNECT=1 agent-browser snapshot
854
881
  ```
855
882
 
856
883
  Auto-connect discovers Chrome by:
884
+
857
885
  1. Reading Chrome's `DevToolsActivePort` file from the default user data directory
858
886
  2. Falling back to probing common debugging ports (9222, 9229)
859
887
 
860
888
  This is useful when:
889
+
861
890
  - Chrome 144+ has remote debugging enabled via `chrome://inspect/#remote-debugging` (which uses a dynamic port)
862
891
  - You want a zero-configuration connection to your existing browser
863
892
  - You don't want to track which port Chrome is using
@@ -881,6 +910,7 @@ This starts a WebSocket server on the specified port that streams the browser vi
881
910
  Connect to `ws://localhost:9223` to receive frames and send input:
882
911
 
883
912
  **Receive frames:**
913
+
884
914
  ```json
885
915
  {
886
916
  "type": "frame",
@@ -897,6 +927,7 @@ Connect to `ws://localhost:9223` to receive frames and send input:
897
927
  ```
898
928
 
899
929
  **Send mouse events:**
930
+
900
931
  ```json
901
932
  {
902
933
  "type": "input_mouse",
@@ -909,6 +940,7 @@ Connect to `ws://localhost:9223` to receive frames and send input:
909
940
  ```
910
941
 
911
942
  **Send keyboard events:**
943
+
912
944
  ```json
913
945
  {
914
946
  "type": "input_keyboard",
@@ -919,6 +951,7 @@ Connect to `ws://localhost:9223` to receive frames and send input:
919
951
  ```
920
952
 
921
953
  **Send touch events:**
954
+
922
955
  ```json
923
956
  {
924
957
  "type": "input_touch",
@@ -939,16 +972,19 @@ await browser.launch({ headless: true });
939
972
  await browser.navigate('https://example.com');
940
973
 
941
974
  // Start screencast
942
- await browser.startScreencast((frame) => {
943
- // frame.data is base64-encoded image
944
- // frame.metadata contains viewport info
945
- console.log('Frame received:', frame.metadata.deviceWidth, 'x', frame.metadata.deviceHeight);
946
- }, {
947
- format: 'jpeg',
948
- quality: 80,
949
- maxWidth: 1280,
950
- maxHeight: 720,
951
- });
975
+ await browser.startScreencast(
976
+ (frame) => {
977
+ // frame.data is base64-encoded image
978
+ // frame.metadata contains viewport info
979
+ console.log('Frame received:', frame.metadata.deviceWidth, 'x', frame.metadata.deviceHeight);
980
+ },
981
+ {
982
+ format: 'jpeg',
983
+ quality: 80,
984
+ maxWidth: 1280,
985
+ maxHeight: 720,
986
+ }
987
+ );
952
988
 
953
989
  // Inject mouse events
954
990
  await browser.injectMouseEvent({
@@ -1000,18 +1036,18 @@ agent-browser open example.com
1000
1036
  Or add to your config file (`agent-browser.json`):
1001
1037
 
1002
1038
  ```json
1003
- {"native": true}
1039
+ { "native": true }
1004
1040
  ```
1005
1041
 
1006
1042
  ### What's Different
1007
1043
 
1008
- | | Default (Node.js) | Native (`--native`) |
1009
- |---|---|---|
1010
- | **Runtime** | Node.js + Playwright | Pure Rust binary |
1011
- | **Protocol** | Playwright protocol | Direct CDP / WebDriver |
1012
- | **Install size** | Larger (Node.js + npm deps) | Smaller (single binary) |
1013
- | **Browser support** | Chromium, Firefox, WebKit | Chromium, Safari (via WebDriver) |
1014
- | **Stability** | Stable | Experimental |
1044
+ | | Default (Node.js) | Native (`--native`) |
1045
+ | ------------------- | --------------------------- | -------------------------------- |
1046
+ | **Runtime** | Node.js + Playwright | Pure Rust binary |
1047
+ | **Protocol** | Playwright protocol | Direct CDP / WebDriver |
1048
+ | **Install size** | Larger (Node.js + npm deps) | Smaller (single binary) |
1049
+ | **Browser support** | Chromium, Firefox, WebKit | Chromium, Safari (via WebDriver) |
1050
+ | **Stability** | Stable | Experimental |
1015
1051
 
1016
1052
  ### Known Limitations
1017
1053
 
@@ -1021,13 +1057,13 @@ Or add to your config file (`agent-browser.json`):
1021
1057
 
1022
1058
  ## Platforms
1023
1059
 
1024
- | Platform | Binary | Fallback |
1025
- |----------|--------|----------|
1026
- | macOS ARM64 | Native Rust | Node.js |
1027
- | macOS x64 | Native Rust | Node.js |
1028
- | Linux ARM64 | Native Rust | Node.js |
1029
- | Linux x64 | Native Rust | Node.js |
1030
- | Windows x64 | Native Rust | Node.js |
1060
+ | Platform | Binary | Fallback |
1061
+ | ----------- | ----------- | -------- |
1062
+ | macOS ARM64 | Native Rust | Node.js |
1063
+ | macOS x64 | Native Rust | Node.js |
1064
+ | Linux ARM64 | Native Rust | Node.js |
1065
+ | Linux x64 | Native Rust | Node.js |
1066
+ | Windows x64 | Native Rust | Node.js |
1031
1067
 
1032
1068
  ## Usage with AI Agents
1033
1069
 
@@ -1071,6 +1107,7 @@ For more consistent results, add to your project or global instructions file:
1071
1107
  Use `agent-browser` for web automation. Run `agent-browser --help` for all commands.
1072
1108
 
1073
1109
  Core workflow:
1110
+
1074
1111
  1. `agent-browser open <url>` - Navigate to page
1075
1112
  2. `agent-browser snapshot -i` - Get interactive elements with refs (@e1, @e2)
1076
1113
  3. `agent-browser click @e1` / `fill @e2 "text"` - Interact using refs
@@ -1122,11 +1159,11 @@ export AGENT_BROWSER_IOS_DEVICE="iPhone 16 Pro"
1122
1159
  agent-browser open https://example.com
1123
1160
  ```
1124
1161
 
1125
- | Variable | Description |
1126
- |----------|-------------|
1127
- | `AGENT_BROWSER_PROVIDER` | Set to `ios` to enable iOS mode |
1162
+ | Variable | Description |
1163
+ | -------------------------- | ----------------------------------------------- |
1164
+ | `AGENT_BROWSER_PROVIDER` | Set to `ios` to enable iOS mode |
1128
1165
  | `AGENT_BROWSER_IOS_DEVICE` | Device name (e.g., "iPhone 16 Pro", "iPad Pro") |
1129
- | `AGENT_BROWSER_IOS_UDID` | Device UDID (alternative to device name) |
1166
+ | `AGENT_BROWSER_IOS_UDID` | Device UDID (alternative to device name) |
1130
1167
 
1131
1168
  **Supported devices:** All iOS Simulators available in Xcode (iPhones, iPads), plus real iOS devices.
1132
1169
 
@@ -1137,6 +1174,7 @@ agent-browser open https://example.com
1137
1174
  Appium also supports real iOS devices connected via USB. This requires additional one-time setup:
1138
1175
 
1139
1176
  **1. Get your device UDID:**
1177
+
1140
1178
  ```bash
1141
1179
  xcrun xctrace list devices
1142
1180
  # or
@@ -1144,6 +1182,7 @@ system_profiler SPUSBDataType | grep -A 5 "iPhone\|iPad"
1144
1182
  ```
1145
1183
 
1146
1184
  **2. Sign WebDriverAgent (one-time):**
1185
+
1147
1186
  ```bash
1148
1187
  # Open the WebDriverAgent Xcode project
1149
1188
  cd ~/.appium/node_modules/appium-xcuitest-driver/node_modules/appium-webdriveragent
@@ -1151,12 +1190,14 @@ open WebDriverAgent.xcodeproj
1151
1190
  ```
1152
1191
 
1153
1192
  In Xcode:
1193
+
1154
1194
  - Select the `WebDriverAgentRunner` target
1155
1195
  - Go to Signing & Capabilities
1156
1196
  - Select your Team (requires Apple Developer account, free tier works)
1157
1197
  - Let Xcode manage signing automatically
1158
1198
 
1159
1199
  **3. Use with agent-browser:**
1200
+
1160
1201
  ```bash
1161
1202
  # Connect device via USB, then:
1162
1203
  agent-browser -p ios --device "<DEVICE_UDID>" open https://example.com
@@ -1166,11 +1207,44 @@ agent-browser -p ios --device "John's iPhone" open https://example.com
1166
1207
  ```
1167
1208
 
1168
1209
  **Real device notes:**
1210
+
1169
1211
  - First run installs WebDriverAgent to the device (may require Trust prompt)
1170
1212
  - Device must be unlocked and connected via USB
1171
1213
  - Slightly slower initial connection than simulator
1172
1214
  - Tests against real Safari performance and behavior
1173
1215
 
1216
+ ### Browserless
1217
+
1218
+ [Browserless](https://browserless.io) provides cloud browser infrastructure with a Sessions API. Use it when running agent-browser in environments where a local browser isn't available.
1219
+
1220
+ To enable Browserless, use the `-p` flag:
1221
+
1222
+ ```bash
1223
+ export BROWSERLESS_API_KEY="your-api-token"
1224
+ agent-browser -p browserless open https://example.com
1225
+ ```
1226
+
1227
+ Or use environment variables for CI/scripts:
1228
+
1229
+ ```bash
1230
+ export AGENT_BROWSER_PROVIDER=browserless
1231
+ export BROWSERLESS_API_KEY="your-api-token"
1232
+ agent-browser open https://example.com
1233
+ ```
1234
+
1235
+ Optional configuration via environment variables:
1236
+
1237
+ | Variable | Description | Default |
1238
+ | -------------------------- | ------------------------------------------------ | --------------------------------------- |
1239
+ | `BROWSERLESS_API_URL` | Base API URL (for custom regions or self-hosted) | `https://production-sfo.browserless.io` |
1240
+ | `BROWSERLESS_BROWSER_TYPE` | Type of browser to use (chromium or chrome) | chromium |
1241
+ | `BROWSERLESS_TTL` | Session TTL in milliseconds | `300000` |
1242
+ | `BROWSERLESS_STEALTH` | Enable stealth mode (`true`/`false`) | `true` |
1243
+
1244
+ When enabled, agent-browser connects to a Browserless cloud session instead of launching a local browser. All commands work identically.
1245
+
1246
+ Get your API token from the [Browserless Dashboard](https://browserless.io).
1247
+
1174
1248
  ### Browserbase
1175
1249
 
1176
1250
  [Browserbase](https://browserbase.com) provides remote browser infrastructure that makes it easy to deploy browsing agents. Use it when running the agent-browser CLI in an environment where a local browser isn't feasible.
@@ -1238,12 +1312,12 @@ agent-browser open https://example.com
1238
1312
 
1239
1313
  Optional configuration via environment variables:
1240
1314
 
1241
- | Variable | Description | Default |
1242
- |----------|-------------|---------|
1243
- | `KERNEL_HEADLESS` | Run browser in headless mode (`true`/`false`) | `false` |
1244
- | `KERNEL_STEALTH` | Enable stealth mode to avoid bot detection (`true`/`false`) | `true` |
1245
- | `KERNEL_TIMEOUT_SECONDS` | Session timeout in seconds | `300` |
1246
- | `KERNEL_PROFILE_NAME` | Browser profile name for persistent cookies/logins (created if it doesn't exist) | (none) |
1315
+ | Variable | Description | Default |
1316
+ | ------------------------ | -------------------------------------------------------------------------------- | ------- |
1317
+ | `KERNEL_HEADLESS` | Run browser in headless mode (`true`/`false`) | `false` |
1318
+ | `KERNEL_STEALTH` | Enable stealth mode to avoid bot detection (`true`/`false`) | `true` |
1319
+ | `KERNEL_TIMEOUT_SECONDS` | Session timeout in seconds | `300` |
1320
+ | `KERNEL_PROFILE_NAME` | Browser profile name for persistent cookies/logins (created if it doesn't exist) | (none) |
1247
1321
 
1248
1322
  When enabled, agent-browser connects to a Kernel cloud session instead of launching a local browser. All commands work identically.
1249
1323
 
Binary file
Binary file
Binary file
Binary file
Binary file
package/dist/actions.js CHANGED
@@ -411,19 +411,11 @@ async function handleLaunch(command, browser) {
411
411
  return successResponse(command.id, { launched: true });
412
412
  }
413
413
  async function handleNavigate(command, browser) {
414
- browser.checkDomainAllowed(command.url);
415
- const page = browser.getPage();
416
- // If headers are provided, set up scoped headers for this origin
417
- if (command.headers && Object.keys(command.headers).length > 0) {
418
- await browser.setScopedHeaders(command.url, command.headers);
419
- }
420
- await page.goto(command.url, {
421
- waitUntil: command.waitUntil ?? 'load',
422
- });
423
- return successResponse(command.id, {
424
- url: page.url(),
425
- title: await page.title(),
414
+ const result = await browser.navigate(command.url, {
415
+ headers: command.headers,
416
+ waitUntil: command.waitUntil,
426
417
  });
418
+ return successResponse(command.id, result);
427
419
  }
428
420
  async function handleClick(command, browser) {
429
421
  // Support both refs (@e1) and regular selectors
@@ -513,7 +505,7 @@ async function handleScreenshot(command, browser) {
513
505
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
514
506
  const random = Math.random().toString(36).substring(2, 8);
515
507
  const filename = `screenshot-${timestamp}-${random}.${ext}`;
516
- const screenshotDir = path.join(getAppDir(), 'tmp', 'screenshots');
508
+ const screenshotDir = command.screenshotDir ?? path.join(getAppDir(), 'tmp', 'screenshots');
517
509
  mkdirSync(screenshotDir, { recursive: true });
518
510
  savePath = path.join(screenshotDir, filename);
519
511
  }
@@ -696,7 +688,10 @@ async function handleEvaluate(command, browser) {
696
688
  }
697
689
  async function handleWait(command, browser) {
698
690
  const page = browser.getPage();
699
- if (command.selector) {
691
+ if (command.text) {
692
+ await page.waitForFunction(`(document.body.innerText || '').includes(${JSON.stringify(command.text)})`, { timeout: command.timeout });
693
+ }
694
+ else if (command.selector) {
700
695
  await page.waitForSelector(command.selector, {
701
696
  state: command.state ?? 'visible',
702
697
  timeout: command.timeout,
@@ -706,7 +701,6 @@ async function handleWait(command, browser) {
706
701
  await page.waitForTimeout(command.timeout);
707
702
  }
708
703
  else {
709
- // Default: wait for load state
710
704
  await page.waitForLoadState('load');
711
705
  }
712
706
  return successResponse(command.id, { waited: true });
@@ -772,7 +766,8 @@ async function handleContent(command, browser) {
772
766
  const page = browser.getPage();
773
767
  let html;
774
768
  if (command.selector) {
775
- html = await page.locator(command.selector).innerHTML();
769
+ const locator = browser.getLocator(command.selector);
770
+ html = await locator.innerHTML();
776
771
  }
777
772
  else {
778
773
  html = await page.content();
@@ -1240,13 +1235,13 @@ async function handleIsChecked(command, browser) {
1240
1235
  return successResponse(command.id, { checked });
1241
1236
  }
1242
1237
  async function handleCount(command, browser) {
1243
- const page = browser.getPage();
1244
- const count = await page.locator(command.selector).count();
1238
+ const locator = browser.getLocator(command.selector);
1239
+ const count = await locator.count();
1245
1240
  return successResponse(command.id, { count });
1246
1241
  }
1247
1242
  async function handleBoundingBox(command, browser) {
1248
- const page = browser.getPage();
1249
- const box = await page.locator(command.selector).boundingBox();
1243
+ const locator = browser.getLocator(command.selector);
1244
+ const box = await locator.boundingBox();
1250
1245
  return successResponse(command.id, { box });
1251
1246
  }
1252
1247
  async function handleStyles(command, browser) {
@@ -1365,8 +1360,6 @@ async function handleStateLoad(command, browser) {
1365
1360
  return errorResponse(command.id, `State file not found: ${command.path}`);
1366
1361
  }
1367
1362
  await browser.launch({
1368
- id: command.id,
1369
- action: 'launch',
1370
1363
  headless: true,
1371
1364
  autoStateFilePath: command.path,
1372
1365
  });
@@ -1534,7 +1527,7 @@ async function handleKeyboard(command, browser) {
1534
1527
  async function handleWheel(command, browser) {
1535
1528
  const page = browser.getPage();
1536
1529
  if (command.selector) {
1537
- const element = page.locator(command.selector);
1530
+ const element = browser.getLocator(command.selector);
1538
1531
  await element.hover();
1539
1532
  }
1540
1533
  await page.mouse.wheel(command.deltaX ?? 0, command.deltaY ?? 0);
@@ -1549,41 +1542,50 @@ async function handleClipboard(command, browser) {
1549
1542
  const page = browser.getPage();
1550
1543
  switch (command.operation) {
1551
1544
  case 'copy':
1552
- await page.keyboard.press('Control+c');
1545
+ await page.keyboard.press('ControlOrMeta+c');
1553
1546
  return successResponse(command.id, { copied: true });
1554
1547
  case 'paste':
1555
- await page.keyboard.press('Control+v');
1548
+ await page.keyboard.press('ControlOrMeta+v');
1556
1549
  return successResponse(command.id, { pasted: true });
1557
- case 'read':
1550
+ case 'read': {
1558
1551
  const text = await page.evaluate('navigator.clipboard.readText()');
1559
1552
  return successResponse(command.id, { text });
1553
+ }
1554
+ case 'write': {
1555
+ if (!command.text) {
1556
+ return errorResponse(command.id, "Missing 'text' parameter for clipboard write");
1557
+ }
1558
+ await page.evaluate(`navigator.clipboard.writeText(${JSON.stringify(command.text)})`);
1559
+ return successResponse(command.id, { written: command.text });
1560
+ }
1560
1561
  default:
1561
1562
  return errorResponse(command.id, 'Unknown clipboard operation');
1562
1563
  }
1563
1564
  }
1564
1565
  async function handleHighlight(command, browser) {
1565
- const page = browser.getPage();
1566
- await page.locator(command.selector).highlight();
1566
+ const locator = browser.getLocator(command.selector);
1567
+ await locator.highlight();
1567
1568
  return successResponse(command.id, { highlighted: true });
1568
1569
  }
1569
1570
  async function handleClear(command, browser) {
1570
- const page = browser.getPage();
1571
- await page.locator(command.selector).clear();
1571
+ const locator = browser.getLocator(command.selector);
1572
+ await locator.clear();
1572
1573
  return successResponse(command.id, { cleared: true });
1573
1574
  }
1574
1575
  async function handleSelectAll(command, browser) {
1575
- const page = browser.getPage();
1576
- await page.locator(command.selector).selectText();
1576
+ const locator = browser.getLocator(command.selector);
1577
+ await locator.selectText();
1577
1578
  return successResponse(command.id, { selected: true });
1578
1579
  }
1579
1580
  async function handleInnerText(command, browser) {
1580
- const page = browser.getPage();
1581
- const text = await page.locator(command.selector).innerText();
1581
+ const locator = browser.getLocator(command.selector);
1582
+ const text = await locator.innerText();
1582
1583
  return successResponse(command.id, { text });
1583
1584
  }
1584
1585
  async function handleInnerHtml(command, browser) {
1585
1586
  const page = browser.getPage();
1586
- const html = await page.locator(command.selector).innerHTML();
1587
+ const locator = browser.getLocator(command.selector);
1588
+ const html = await locator.innerHTML();
1587
1589
  return successResponse(command.id, { html, origin: page.url() });
1588
1590
  }
1589
1591
  async function handleInputValue(command, browser) {
@@ -1593,13 +1595,13 @@ async function handleInputValue(command, browser) {
1593
1595
  return successResponse(command.id, { value, origin: page.url() });
1594
1596
  }
1595
1597
  async function handleSetValue(command, browser) {
1596
- const page = browser.getPage();
1597
- await page.locator(command.selector).fill(command.value);
1598
+ const locator = browser.getLocator(command.selector);
1599
+ await locator.fill(command.value);
1598
1600
  return successResponse(command.id, { set: true });
1599
1601
  }
1600
1602
  async function handleDispatch(command, browser) {
1601
- const page = browser.getPage();
1602
- await page.locator(command.selector).dispatchEvent(command.event, command.eventInit);
1603
+ const locator = browser.getLocator(command.selector);
1604
+ await locator.dispatchEvent(command.event, command.eventInit);
1603
1605
  return successResponse(command.id, { dispatched: command.event });
1604
1606
  }
1605
1607
  async function handleEvalHandle(command, browser) {
@@ -1705,8 +1707,7 @@ async function handleGetByTestId(command, browser) {
1705
1707
  }
1706
1708
  }
1707
1709
  async function handleNth(command, browser) {
1708
- const page = browser.getPage();
1709
- const base = page.locator(command.selector);
1710
+ const base = browser.getLocator(command.selector);
1710
1711
  const locator = command.index === -1 ? base.last() : base.nth(command.index);
1711
1712
  switch (command.subaction) {
1712
1713
  case 'click':
@@ -1816,8 +1817,8 @@ async function handleInsertText(command, browser) {
1816
1817
  return successResponse(command.id, { inserted: true });
1817
1818
  }
1818
1819
  async function handleMultiSelect(command, browser) {
1819
- const page = browser.getPage();
1820
- const selected = await page.locator(command.selector).selectOption(command.values);
1820
+ const locator = browser.getLocator(command.selector);
1821
+ const selected = await locator.selectOption(command.values);
1821
1822
  return successResponse(command.id, { selected });
1822
1823
  }
1823
1824
  async function handleWaitForDownload(command, browser) {
@@ -1965,7 +1966,7 @@ async function handleDiffScreenshot(command, browser) {
1965
1966
  const page = browser.getPage();
1966
1967
  let screenshotBuffer;
1967
1968
  if (command.selector) {
1968
- const locator = browser.getLocatorFromRef(command.selector) || page.locator(command.selector);
1969
+ const locator = browser.getLocator(command.selector);
1969
1970
  screenshotBuffer = await locator.screenshot({ type: 'png' });
1970
1971
  }
1971
1972
  else {