agent-browser 0.4.4 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -18,6 +18,8 @@ git clone https://github.com/vercel-labs/agent-browser
18
18
  cd agent-browser
19
19
  pnpm install
20
20
  pnpm build
21
+ pnpm build:native # Requires Rust (https://rustup.rs)
22
+ pnpm link --global # Makes agent-browser available globally
21
23
  agent-browser install
22
24
  ```
23
25
 
@@ -72,10 +74,11 @@ agent-browser scroll <dir> [px] # Scroll (up/down/left/right)
72
74
  agent-browser scrollintoview <sel> # Scroll element into view (alias: scrollinto)
73
75
  agent-browser drag <src> <tgt> # Drag and drop
74
76
  agent-browser upload <sel> <files> # Upload files
75
- agent-browser screenshot [path] # Take screenshot (--full for full page)
77
+ agent-browser screenshot [path] # Take screenshot (--full for full page; prints base64 PNG to stdout if no path is given)
76
78
  agent-browser pdf <path> # Save as PDF
77
79
  agent-browser snapshot # Accessibility tree with refs (best for AI)
78
80
  agent-browser eval <js> # Run JavaScript
81
+ agent-browser connect <port> # Connect to browser via CDP
79
82
  agent-browser close # Close browser (aliases: quit, exit)
80
83
  ```
81
84
 
@@ -300,6 +303,7 @@ agent-browser snapshot -i -c -d 5 # Combine options
300
303
  | `--name, -n` | Locator name filter |
301
304
  | `--exact` | Exact text match |
302
305
  | `--headed` | Show browser window (not headless) |
306
+ | `--cdp <port>` | Connect via Chrome DevTools Protocol |
303
307
  | `--debug` | Debug output |
304
308
 
305
309
  ## Selectors
@@ -457,6 +461,136 @@ export async function handler() {
457
461
  }
458
462
  ```
459
463
 
464
+ ## CDP Mode
465
+
466
+ Connect to an existing browser via the Chrome DevTools Protocol (CDP):
467
+
468
+ ```bash
469
+ # Start Chrome with: google-chrome --remote-debugging-port=9222
470
+
471
+ # Connect once, then run commands without --cdp
472
+ agent-browser connect 9222
473
+ agent-browser snapshot
474
+ agent-browser tab
475
+ agent-browser close
476
+
477
+ # Or pass --cdp on each command
478
+ agent-browser --cdp 9222 snapshot
479
+ ```
480
+
481
+ This enables control of:
482
+ - Electron apps
483
+ - Chrome/Chromium instances with remote debugging
484
+ - WebView2 applications
485
+ - Any browser exposing a CDP endpoint
486
+
487
+ ## Streaming (Browser Preview)
488
+
489
+ Stream the browser viewport via WebSocket for live preview or "pair browsing" where a human can watch and interact alongside an AI agent.
490
+
491
+ ### Enable Streaming
492
+
493
+ Set the `AGENT_BROWSER_STREAM_PORT` environment variable:
494
+
495
+ ```bash
496
+ AGENT_BROWSER_STREAM_PORT=9223 agent-browser open example.com
497
+ ```
498
+
499
+ This starts a WebSocket server on the specified port that streams the browser viewport and accepts input events.
500
+
501
+ ### WebSocket Protocol
502
+
503
+ Connect to `ws://localhost:9223` to receive frames and send input:
504
+
505
+ **Receive frames:**
506
+ ```json
507
+ {
508
+ "type": "frame",
509
+ "data": "<base64-encoded-jpeg>",
510
+ "metadata": {
511
+ "deviceWidth": 1280,
512
+ "deviceHeight": 720,
513
+ "pageScaleFactor": 1,
514
+ "offsetTop": 0,
515
+ "scrollOffsetX": 0,
516
+ "scrollOffsetY": 0
517
+ }
518
+ }
519
+ ```
520
+
521
+ **Send mouse events:**
522
+ ```json
523
+ {
524
+ "type": "input_mouse",
525
+ "eventType": "mousePressed",
526
+ "x": 100,
527
+ "y": 200,
528
+ "button": "left",
529
+ "clickCount": 1
530
+ }
531
+ ```
532
+
533
+ **Send keyboard events:**
534
+ ```json
535
+ {
536
+ "type": "input_keyboard",
537
+ "eventType": "keyDown",
538
+ "key": "Enter",
539
+ "code": "Enter"
540
+ }
541
+ ```
542
+
543
+ **Send touch events:**
544
+ ```json
545
+ {
546
+ "type": "input_touch",
547
+ "eventType": "touchStart",
548
+ "touchPoints": [{ "x": 100, "y": 200 }]
549
+ }
550
+ ```
551
+
552
+ ### Programmatic API
553
+
554
+ For advanced use, control streaming directly through the `BrowserManager` API instead of the WebSocket protocol:
555
+
556
+ ```typescript
557
+ import { BrowserManager } from 'agent-browser';
558
+
559
+ const browser = new BrowserManager();
560
+ await browser.launch({ headless: true });
561
+ await browser.navigate('https://example.com');
562
+
563
+ // Start screencast
564
+ await browser.startScreencast((frame) => {
565
+ // frame.data is a base64-encoded image
566
+ // frame.metadata contains viewport info
567
+ console.log('Frame received:', frame.metadata.deviceWidth, 'x', frame.metadata.deviceHeight);
568
+ }, {
569
+ format: 'jpeg',
570
+ quality: 80,
571
+ maxWidth: 1280,
572
+ maxHeight: 720,
573
+ });
574
+
575
+ // Inject mouse events
576
+ await browser.injectMouseEvent({
577
+ type: 'mousePressed',
578
+ x: 100,
579
+ y: 200,
580
+ button: 'left',
581
+ });
582
+
583
+ // Inject keyboard events
584
+ await browser.injectKeyboardEvent({
585
+ type: 'keyDown',
586
+ key: 'Enter',
587
+ code: 'Enter',
588
+ });
589
+
590
+ // Stop when done
591
+ await browser.stopScreencast();
592
+ ```
593
+
460
594
  ## Architecture
461
595
 
462
596
  agent-browser uses a client-daemon architecture:
Binary file
Binary file
Binary file
Binary file
Binary file
package/dist/actions.d.ts CHANGED
@@ -1,5 +1,15 @@
1
- import type { BrowserManager } from './browser.js';
1
+ import type { BrowserManager, ScreencastFrame } from './browser.js';
2
2
  import type { Command, Response } from './types.js';
3
+ /**
4
+ * Set the callback for screencast frames
5
+ * This is called by the daemon to set up frame streaming
6
+ */
7
+ export declare function setScreencastFrameCallback(callback: ((frame: ScreencastFrame) => void) | null): void;
8
+ /**
9
+ * Convert Playwright errors to AI-friendly messages
10
+ * @internal Exported for testing
11
+ */
12
+ export declare function toAIFriendlyError(error: unknown, selector: string): Error;
3
13
  /**
4
14
  * Execute a command and return a response
5
15
  */
@@ -1 +1 @@
1
- {"version":3,"file":"actions.d.ts","sourceRoot":"","sources":["../src/actions.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AACnD,OAAO,KAAK,EACV,OAAO,EACP,QAAQ,EAoGT,MAAM,YAAY,CAAC;AAkDpB;;GAEG;AACH,wBAAsB,cAAc,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAiPjG"}
1
+ {"version":3,"file":"actions.d.ts","sourceRoot":"","sources":["../src/actions.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AACpE,OAAO,KAAK,EACV,OAAO,EACP,QAAQ,EAqHT,MAAM,YAAY,CAAC;AAMpB;;;GAGG;AACH,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,CAAC,CAAC,KAAK,EAAE,eAAe,KAAK,IAAI,CAAC,GAAG,IAAI,GAClD,IAAI,CAEN;AAQD;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,GAAG,KAAK,CA6CzE;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAmQjG"}
package/dist/actions.js CHANGED
@@ -1,8 +1,18 @@
1
1
  import { successResponse, errorResponse } from './protocol.js';
2
+ // Callback for screencast frames - will be set by the daemon when streaming is active
3
+ let screencastFrameCallback = null;
4
+ /**
5
+ * Set the callback for screencast frames
6
+ * This is called by the daemon to set up frame streaming
7
+ */
8
+ export function setScreencastFrameCallback(callback) {
9
+ screencastFrameCallback = callback;
10
+ }
2
11
  /**
3
12
  * Convert Playwright errors to AI-friendly messages
13
+ * @internal Exported for testing
4
14
  */
5
- function toAIFriendlyError(error, selector) {
15
+ export function toAIFriendlyError(error, selector) {
6
16
  const message = error instanceof Error ? error.message : String(error);
7
17
  // Handle strict mode violation (multiple elements match)
8
18
  if (message.includes('strict mode violation')) {
@@ -12,17 +22,23 @@ function toAIFriendlyError(error, selector) {
12
22
  return new Error(`Selector "${selector}" matched ${count} elements. ` +
13
23
  `Run 'snapshot' to get updated refs, or use a more specific CSS selector.`);
14
24
  }
15
- // Handle element not found
25
+ // Handle element not interactable (must be checked BEFORE timeout case)
26
+ // This includes cases where an overlay/modal blocks the element
27
+ if (message.includes('intercepts pointer events')) {
28
+ return new Error(`Element "${selector}" is blocked by another element (likely a modal or overlay). ` +
29
+ `Try dismissing any modals/cookie banners first.`);
30
+ }
31
+ // Handle element not visible
32
+ if (message.includes('not visible') && !message.includes('Timeout')) {
33
+ return new Error(`Element "${selector}" is not visible. ` +
34
+ `Try scrolling it into view or check if it's hidden.`);
35
+ }
36
+ // Handle element not found (timeout waiting for element)
16
37
  if (message.includes('waiting for') &&
17
38
  (message.includes('to be visible') || message.includes('Timeout'))) {
18
39
  return new Error(`Element "${selector}" not found or not visible. ` +
19
40
  `Run 'snapshot' to see current page elements.`);
20
41
  }
21
- // Handle element not interactable
22
- if (message.includes('intercepts pointer events') || message.includes('not visible')) {
23
- return new Error(`Element "${selector}" is not interactable (may be hidden or covered). ` +
24
- `Try scrolling it into view or check if a modal/overlay is blocking it.`);
25
- }
26
42
  // Return original error for unknown cases
27
43
  return error instanceof Error ? error : new Error(message);
28
44
  }
@@ -154,6 +170,8 @@ export async function executeCommand(command, browser) {
154
170
  return await handleCount(command, browser);
155
171
  case 'boundingbox':
156
172
  return await handleBoundingBox(command, browser);
173
+ case 'styles':
174
+ return await handleStyles(command, browser);
157
175
  case 'video_start':
158
176
  return await handleVideoStart(command, browser);
159
177
  case 'video_stop':
@@ -260,6 +278,22 @@ export async function executeCommand(command, browser) {
260
278
  return await handleWaitForDownload(command, browser);
261
279
  case 'responsebody':
262
280
  return await handleResponseBody(command, browser);
281
+ case 'screencast_start':
282
+ return await handleScreencastStart(command, browser);
283
+ case 'screencast_stop':
284
+ return await handleScreencastStop(command, browser);
285
+ case 'input_mouse':
286
+ return await handleInputMouse(command, browser);
287
+ case 'input_keyboard':
288
+ return await handleInputKeyboard(command, browser);
289
+ case 'input_touch':
290
+ return await handleInputTouch(command, browser);
291
+ case 'recording_start':
292
+ return await handleRecordingStart(command, browser);
293
+ case 'recording_stop':
294
+ return await handleRecordingStop(command, browser);
295
+ case 'recording_restart':
296
+ return await handleRecordingRestart(command, browser);
263
297
  default: {
264
298
  // TypeScript narrows to never here, but we handle it for safety
265
299
  const unknownCommand = command;
@@ -467,6 +501,11 @@ async function handleClose(command, browser) {
467
501
  }
468
502
  async function handleTabNew(command, browser) {
469
503
  const result = await browser.newTab();
504
+ // Navigate to URL if provided (same pattern as handleNavigate)
505
+ if (command.url) {
506
+ const page = browser.getPage();
507
+ await page.goto(command.url, { waitUntil: 'domcontentloaded' });
508
+ }
470
509
  return successResponse(command.id, result);
471
510
  }
472
511
  async function handleTabList(command, browser) {
@@ -477,7 +516,7 @@ async function handleTabList(command, browser) {
477
516
  });
478
517
  }
479
518
  async function handleTabSwitch(command, browser) {
480
- const result = browser.switchTo(command.index);
519
+ const result = await browser.switchTo(command.index);
481
520
  const page = browser.getPage();
482
521
  return successResponse(command.id, {
483
522
  ...result,
@@ -836,6 +875,50 @@ async function handleBoundingBox(command, browser) {
836
875
  const box = await page.locator(command.selector).boundingBox();
837
876
  return successResponse(command.id, { box });
838
877
  }
878
+ async function handleStyles(command, browser) {
879
+ const page = browser.getPage();
880
+ // Shared extraction logic as a string to be eval'd in browser context
881
+ const extractStylesScript = `(function(el) {
882
+ const s = getComputedStyle(el);
883
+ const r = el.getBoundingClientRect();
884
+ return {
885
+ tag: el.tagName.toLowerCase(),
886
+ text: el.innerText?.trim().slice(0, 80) || null,
887
+ box: {
888
+ x: Math.round(r.x),
889
+ y: Math.round(r.y),
890
+ width: Math.round(r.width),
891
+ height: Math.round(r.height),
892
+ },
893
+ styles: {
894
+ fontSize: s.fontSize,
895
+ fontWeight: s.fontWeight,
896
+ fontFamily: s.fontFamily.split(',')[0].trim().replace(/"/g, ''),
897
+ color: s.color,
898
+ backgroundColor: s.backgroundColor,
899
+ borderRadius: s.borderRadius,
900
+ border: s.border !== 'none' && s.borderWidth !== '0px' ? s.border : null,
901
+ boxShadow: s.boxShadow !== 'none' ? s.boxShadow : null,
902
+ padding: s.padding,
903
+ },
904
+ };
905
+ })`;
906
+ // Check if it's a ref - single element
907
+ if (browser.isRef(command.selector)) {
908
+ const locator = browser.getLocator(command.selector);
909
+ const element = (await locator.evaluate((el, script) => {
910
+ const fn = eval(script);
911
+ return fn(el);
912
+ }, extractStylesScript));
913
+ return successResponse(command.id, { elements: [element] });
914
+ }
915
+ // CSS selector - can match multiple elements
916
+ const elements = (await page.$$eval(command.selector, (els, script) => {
917
+ const fn = eval(script);
918
+ return els.map((el) => fn(el));
919
+ }, extractStylesScript));
920
+ return successResponse(command.id, { elements });
921
+ }
839
922
  // Advanced handlers
840
923
  async function handleVideoStart(command, browser) {
841
924
  // Video recording requires context-level setup at launch
@@ -967,8 +1050,8 @@ async function handleInnerHtml(command, browser) {
967
1050
  return successResponse(command.id, { html });
968
1051
  }
969
1052
  async function handleInputValue(command, browser) {
970
- const page = browser.getPage();
971
- const value = await page.locator(command.selector).inputValue();
1053
+ const locator = browser.getLocator(command.selector);
1054
+ const value = await locator.inputValue();
972
1055
  return successResponse(command.id, { value });
973
1056
  }
974
1057
  async function handleSetValue(command, browser) {
@@ -1233,4 +1316,78 @@ async function handleResponseBody(command, browser) {
1233
1316
  body: parsed,
1234
1317
  });
1235
1318
  }
1319
+ // Screencast and input injection handlers
1320
+ async function handleScreencastStart(command, browser) {
1321
+ if (!screencastFrameCallback) {
1322
+ throw new Error('Screencast frame callback not set. Start the streaming server first.');
1323
+ }
1324
+ await browser.startScreencast(screencastFrameCallback, {
1325
+ format: command.format,
1326
+ quality: command.quality,
1327
+ maxWidth: command.maxWidth,
1328
+ maxHeight: command.maxHeight,
1329
+ everyNthFrame: command.everyNthFrame,
1330
+ });
1331
+ return successResponse(command.id, {
1332
+ started: true,
1333
+ format: command.format ?? 'jpeg',
1334
+ quality: command.quality ?? 80,
1335
+ });
1336
+ }
1337
+ async function handleScreencastStop(command, browser) {
1338
+ await browser.stopScreencast();
1339
+ return successResponse(command.id, { stopped: true });
1340
+ }
1341
+ async function handleInputMouse(command, browser) {
1342
+ await browser.injectMouseEvent({
1343
+ type: command.type,
1344
+ x: command.x,
1345
+ y: command.y,
1346
+ button: command.button,
1347
+ clickCount: command.clickCount,
1348
+ deltaX: command.deltaX,
1349
+ deltaY: command.deltaY,
1350
+ modifiers: command.modifiers,
1351
+ });
1352
+ return successResponse(command.id, { injected: true });
1353
+ }
1354
+ async function handleInputKeyboard(command, browser) {
1355
+ await browser.injectKeyboardEvent({
1356
+ type: command.type,
1357
+ key: command.key,
1358
+ code: command.code,
1359
+ text: command.text,
1360
+ modifiers: command.modifiers,
1361
+ });
1362
+ return successResponse(command.id, { injected: true });
1363
+ }
1364
+ async function handleInputTouch(command, browser) {
1365
+ await browser.injectTouchEvent({
1366
+ type: command.type,
1367
+ touchPoints: command.touchPoints,
1368
+ modifiers: command.modifiers,
1369
+ });
1370
+ return successResponse(command.id, { injected: true });
1371
+ }
1372
+ // Recording handlers (Playwright native video recording)
1373
+ async function handleRecordingStart(command, browser) {
1374
+ await browser.startRecording(command.path, command.url);
1375
+ return successResponse(command.id, {
1376
+ started: true,
1377
+ path: command.path,
1378
+ });
1379
+ }
1380
+ async function handleRecordingStop(command, browser) {
1381
+ const result = await browser.stopRecording();
1382
+ return successResponse(command.id, result);
1383
+ }
1384
+ async function handleRecordingRestart(command, browser) {
1385
+ const result = await browser.restartRecording(command.path, command.url);
1386
+ return successResponse(command.id, {
1387
+ started: true,
1388
+ path: command.path,
1389
+ previousPath: result.previousPath,
1390
+ stopped: result.stopped,
1391
+ });
1392
+ }
1236
1393
  //# sourceMappingURL=actions.js.map