agent-browser 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -112,6 +112,7 @@ agent-browser scrollintoview <sel> # Scroll element into view (alias: scrolli
112
112
  agent-browser drag <src> <tgt> # Drag and drop
113
113
  agent-browser upload <sel> <files> # Upload files
114
114
  agent-browser screenshot [path] # Take screenshot (--full for full page, saves to a temporary directory if no path)
115
+ agent-browser screenshot --annotate # Annotated screenshot with numbered element labels
115
116
  agent-browser pdf <path> # Save as PDF
116
117
  agent-browser snapshot # Accessibility tree with refs (best for AI)
117
118
  agent-browser eval <js> # Run JavaScript (-b for base64, --stdin for piped input)
@@ -253,6 +254,21 @@ agent-browser dialog accept [text] # Accept (with optional prompt text)
253
254
  agent-browser dialog dismiss # Dismiss
254
255
  ```
255
256
 
257
+ ### Diff
258
+
259
+ ```bash
260
+ agent-browser diff snapshot # Compare current vs last snapshot
261
+ agent-browser diff snapshot --baseline before.txt # Compare current vs saved snapshot file
262
+ agent-browser diff snapshot --selector "#main" --compact # Scoped snapshot diff
263
+ agent-browser diff screenshot --baseline before.png # Visual pixel diff against baseline
264
+ agent-browser diff screenshot --baseline b.png -o d.png # Save diff image to custom path
265
+ agent-browser diff screenshot --baseline b.png -t 0.2 # Adjust color threshold (0-1)
266
+ agent-browser diff url https://v1.com https://v2.com # Compare two URLs (snapshot diff)
267
+ agent-browser diff url https://v1.com https://v2.com --screenshot # Also visual diff
268
+ agent-browser diff url https://v1.com https://v2.com --wait-until networkidle # Custom wait strategy
269
+ agent-browser diff url https://v1.com https://v2.com --selector "#main" # Scope to element
270
+ ```
271
+
256
272
  ### Debug
257
273
 
258
274
  ```bash
@@ -401,6 +417,27 @@ agent-browser snapshot -i -c -d 5 # Combine options
401
417
 
402
418
  The `-C` flag is useful for modern web apps that use custom clickable elements (divs, spans) instead of standard buttons/links.
403
419
 
420
+ ## Annotated Screenshots
421
+
422
+ The `--annotate` flag overlays numbered labels on interactive elements in the screenshot. Each label `[N]` corresponds to ref `@eN`, so the same refs work for both visual and text-based workflows.
423
+
424
+ ```bash
425
+ agent-browser screenshot --annotate
426
+ # -> Screenshot saved to /tmp/screenshot-2026-02-17T12-00-00-abc123.png
427
+ # [1] @e1 button "Submit"
428
+ # [2] @e2 link "Home"
429
+ # [3] @e3 textbox "Email"
430
+ ```
431
+
432
+ After an annotated screenshot, refs are cached so you can immediately interact with elements:
433
+
434
+ ```bash
435
+ agent-browser screenshot --annotate ./page.png
436
+ agent-browser click @e2 # Click the "Home" link labeled [2]
437
+ ```
438
+
439
+ This is useful for multimodal AI models that can reason about visual layout, unlabeled icon buttons, canvas elements, or visual state that the text accessibility tree cannot capture.
440
+
404
441
  ## Options
405
442
 
406
443
  | Option | Description |
@@ -422,6 +459,7 @@ The `-C` flag is useful for modern web apps that use custom clickable elements (
422
459
  | `--device <name>` | iOS device name, e.g. "iPhone 15 Pro" (or `AGENT_BROWSER_IOS_DEVICE` env) |
423
460
  | `--json` | JSON output (for agents) |
424
461
  | `--full, -f` | Full page screenshot |
462
+ | `--annotate` | Annotated screenshot with numbered element labels (or `AGENT_BROWSER_ANNOTATE` env) |
425
463
  | `--headed` | Show browser window (not headless) |
426
464
  | `--cdp <port\|url>` | Connect via Chrome DevTools Protocol (port or WebSocket URL) |
427
465
  | `--auto-connect` | Auto-discover and connect to running Chrome (or `AGENT_BROWSER_AUTO_CONNECT` env) |
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -1 +1 @@
1
- {"version":3,"file":"actions.d.ts","sourceRoot":"","sources":["../src/actions.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAUpE,OAAO,KAAK,EACV,OAAO,EACP,QAAQ,EA4HT,MAAM,YAAY,CAAC;AAMpB;;;GAGG;AACH,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,CAAC,CAAC,KAAK,EAAE,eAAe,KAAK,IAAI,CAAC,GAAG,IAAI,GAClD,IAAI,CAEN;AAQD;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,GAAG,KAAK,CAqDzE;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAiRjG"}
1
+ {"version":3,"file":"actions.d.ts","sourceRoot":"","sources":["../src/actions.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAUpE,OAAO,KAAK,EACV,OAAO,EACP,QAAQ,EAmIT,MAAM,YAAY,CAAC;AAQpB;;;GAGG;AACH,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,CAAC,CAAC,KAAK,EAAE,eAAe,KAAK,IAAI,CAAC,GAAG,IAAI,GAClD,IAAI,CAEN;AAQD;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,GAAG,KAAK,CAqDzE;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuRjG"}
package/dist/actions.js CHANGED
@@ -4,6 +4,8 @@ import { mkdirSync } from 'node:fs';
4
4
  import { getAppDir } from './daemon.js';
5
5
  import { getSessionsDir, readStateFile, isValidSessionName, isEncryptedPayload, listStateFiles, cleanupExpiredStates, } from './state-utils.js';
6
6
  import { successResponse, errorResponse } from './protocol.js';
7
+ import { diffSnapshots, diffScreenshots } from './diff.js';
8
+ import { getEnhancedSnapshot } from './snapshot.js';
7
9
  // Callback for screencast frames - will be set by the daemon when streaming is active
8
10
  let screencastFrameCallback = null;
9
11
  /**
@@ -318,6 +320,12 @@ export async function executeCommand(command, browser) {
318
320
  return await handleRecordingStop(command, browser);
319
321
  case 'recording_restart':
320
322
  return await handleRecordingRestart(command, browser);
323
+ case 'diff_snapshot':
324
+ return await handleDiffSnapshot(command, browser);
325
+ case 'diff_screenshot':
326
+ return await handleDiffScreenshot(command, browser);
327
+ case 'diff_url':
328
+ return await handleDiffUrl(command, browser);
321
329
  default: {
322
330
  // TypeScript narrows to never here, but we handle it for safety
323
331
  const unknownCommand = command;
@@ -409,6 +417,12 @@ async function handlePress(command, browser) {
409
417
  }
410
418
  return successResponse(command.id, { pressed: true });
411
419
  }
420
+ const ANNOTATION_OVERLAY_ID = '__agent_browser_annotations__';
421
+ async function removeAnnotationOverlay(page) {
422
+ await page
423
+ .evaluate(`(() => { const el = document.getElementById(${JSON.stringify(ANNOTATION_OVERLAY_ID)}); if (el) el.remove(); })()`)
424
+ .catch(() => { });
425
+ }
412
426
  async function handleScreenshot(command, browser) {
413
427
  const page = browser.getPage();
414
428
  const options = {
@@ -422,6 +436,7 @@ async function handleScreenshot(command, browser) {
422
436
  if (command.selector) {
423
437
  target = browser.getLocator(command.selector);
424
438
  }
439
+ let overlayInjected = false;
425
440
  try {
426
441
  let savePath = command.path;
427
442
  if (!savePath) {
@@ -433,10 +448,150 @@ async function handleScreenshot(command, browser) {
433
448
  mkdirSync(screenshotDir, { recursive: true });
434
449
  savePath = path.join(screenshotDir, filename);
435
450
  }
451
+ let annotations;
452
+ if (command.annotate) {
453
+ const { refs } = await browser.getSnapshot({ interactive: true });
454
+ const entries = Object.entries(refs);
455
+ const results = await Promise.all(entries.map(async ([ref, data]) => {
456
+ try {
457
+ const locator = browser.getLocatorFromRef(ref);
458
+ if (!locator)
459
+ return null;
460
+ const box = await locator.boundingBox();
461
+ if (!box || box.width === 0 || box.height === 0)
462
+ return null;
463
+ const num = parseInt(ref.replace('e', ''), 10);
464
+ return {
465
+ ref,
466
+ number: num,
467
+ role: data.role,
468
+ name: data.name || undefined,
469
+ box: {
470
+ x: Math.round(box.x),
471
+ y: Math.round(box.y),
472
+ width: Math.round(box.width),
473
+ height: Math.round(box.height),
474
+ },
475
+ };
476
+ }
477
+ catch {
478
+ return null;
479
+ }
480
+ }));
481
+ // When a selector is provided the screenshot is cropped to that element,
482
+ // so filter to annotations that overlap the target and shift coordinates.
483
+ let targetBox = null;
484
+ if (command.selector) {
485
+ const raw = await browser.getLocator(command.selector).boundingBox();
486
+ if (raw) {
487
+ targetBox = {
488
+ x: Math.round(raw.x),
489
+ y: Math.round(raw.y),
490
+ width: Math.round(raw.width),
491
+ height: Math.round(raw.height),
492
+ };
493
+ }
494
+ }
495
+ const filtered = results.filter((a) => a !== null);
496
+ // Filter by selector overlap if needed, but keep viewport-relative coords
497
+ // for overlay positioning. Coordinate shifting happens later for metadata only.
498
+ let overlayItems;
499
+ if (targetBox) {
500
+ const tb = targetBox;
501
+ overlayItems = filtered
502
+ .filter((a) => {
503
+ const ax2 = a.box.x + a.box.width;
504
+ const ay2 = a.box.y + a.box.height;
505
+ const bx2 = tb.x + tb.width;
506
+ const by2 = tb.y + tb.height;
507
+ return a.box.x < bx2 && ax2 > tb.x && a.box.y < by2 && ay2 > tb.y;
508
+ })
509
+ .sort((a, b) => a.number - b.number);
510
+ }
511
+ else {
512
+ overlayItems = filtered.sort((a, b) => a.number - b.number);
513
+ }
514
+ if (overlayItems.length > 0) {
515
+ const overlayData = overlayItems.map((a) => ({
516
+ number: a.number,
517
+ x: a.box.x,
518
+ y: a.box.y,
519
+ width: a.box.width,
520
+ height: a.box.height,
521
+ }));
522
+ // Uses position:absolute with document-relative coords so labels render
523
+ // correctly for both viewport and fullPage screenshots, and when the
524
+ // screenshot is scoped to a selector element.
525
+ await page.evaluate(`(() => {
526
+ var items = ${JSON.stringify(overlayData)};
527
+ var id = ${JSON.stringify(ANNOTATION_OVERLAY_ID)};
528
+ var sx = window.scrollX || 0;
529
+ var sy = window.scrollY || 0;
530
+ var c = document.createElement('div');
531
+ c.id = id;
532
+ c.style.cssText = 'position:absolute;top:0;left:0;width:0;height:0;pointer-events:none;z-index:2147483647;';
533
+ for (var i = 0; i < items.length; i++) {
534
+ var it = items[i];
535
+ var dx = it.x + sx;
536
+ var dy = it.y + sy;
537
+ var b = document.createElement('div');
538
+ b.style.cssText = 'position:absolute;left:' + dx + 'px;top:' + dy + 'px;width:' + it.width + 'px;height:' + it.height + 'px;border:2px solid rgba(255,0,0,0.8);box-sizing:border-box;pointer-events:none;';
539
+ var l = document.createElement('div');
540
+ l.textContent = String(it.number);
541
+ var labelTop = dy < 14 ? '2px' : '-14px';
542
+ l.style.cssText = 'position:absolute;top:' + labelTop + ';left:-2px;background:rgba(255,0,0,0.9);color:#fff;font:bold 11px/14px monospace;padding:0 4px;border-radius:2px;white-space:nowrap;';
543
+ b.appendChild(l);
544
+ c.appendChild(b);
545
+ }
546
+ document.documentElement.appendChild(c);
547
+ })()`);
548
+ overlayInjected = true;
549
+ }
550
+ // Build returned annotation metadata with image-relative coordinates.
551
+ // Selector: shift to target-element-relative.
552
+ // fullPage: convert to document-relative (matching fullPage image origin).
553
+ // Default: viewport-relative (unchanged).
554
+ if (targetBox) {
555
+ const tb = targetBox;
556
+ annotations = overlayItems.map((a) => ({
557
+ ...a,
558
+ box: {
559
+ x: a.box.x - tb.x,
560
+ y: a.box.y - tb.y,
561
+ width: a.box.width,
562
+ height: a.box.height,
563
+ },
564
+ }));
565
+ }
566
+ else if (command.fullPage) {
567
+ const scroll = (await page.evaluate(`({x: window.scrollX || 0, y: window.scrollY || 0})`));
568
+ annotations = overlayItems.map((a) => ({
569
+ ...a,
570
+ box: {
571
+ x: a.box.x + scroll.x,
572
+ y: a.box.y + scroll.y,
573
+ width: a.box.width,
574
+ height: a.box.height,
575
+ },
576
+ }));
577
+ }
578
+ else {
579
+ annotations = overlayItems;
580
+ }
581
+ }
436
582
  await target.screenshot({ ...options, path: savePath });
437
- return successResponse(command.id, { path: savePath });
583
+ if (overlayInjected) {
584
+ await removeAnnotationOverlay(page);
585
+ }
586
+ return successResponse(command.id, {
587
+ path: savePath,
588
+ ...(annotations && annotations.length > 0 ? { annotations } : {}),
589
+ });
438
590
  }
439
591
  catch (error) {
592
+ if (overlayInjected) {
593
+ await removeAnnotationOverlay(page);
594
+ }
440
595
  if (command.selector) {
441
596
  throw toAIFriendlyError(error, command.selector);
442
597
  }
@@ -1610,4 +1765,83 @@ async function handleRecordingRestart(command, browser) {
1610
1765
  stopped: result.stopped,
1611
1766
  });
1612
1767
  }
1768
+ // Diff handlers
1769
+ async function handleDiffSnapshot(command, browser) {
1770
+ let before;
1771
+ if (command.baseline) {
1772
+ try {
1773
+ before = fs.readFileSync(command.baseline, 'utf-8');
1774
+ }
1775
+ catch {
1776
+ return errorResponse(command.id, `Cannot read baseline file: ${command.baseline}`);
1777
+ }
1778
+ }
1779
+ else {
1780
+ before = browser.getLastSnapshot();
1781
+ if (!before) {
1782
+ return errorResponse(command.id, 'No previous snapshot in this session. Take a snapshot first, or use --baseline <file>.');
1783
+ }
1784
+ }
1785
+ const page = browser.getPage();
1786
+ const { tree } = await getEnhancedSnapshot(page, {
1787
+ selector: command.selector,
1788
+ compact: command.compact,
1789
+ maxDepth: command.maxDepth,
1790
+ });
1791
+ const after = tree || 'Empty page';
1792
+ const result = diffSnapshots(before, after);
1793
+ browser.setLastSnapshot(after);
1794
+ return successResponse(command.id, result);
1795
+ }
1796
+ async function handleDiffScreenshot(command, browser) {
1797
+ if (!fs.existsSync(command.baseline)) {
1798
+ return errorResponse(command.id, `Baseline file not found: ${command.baseline}`);
1799
+ }
1800
+ const page = browser.getPage();
1801
+ let screenshotBuffer;
1802
+ if (command.selector) {
1803
+ const locator = browser.getLocatorFromRef(command.selector) || page.locator(command.selector);
1804
+ screenshotBuffer = await locator.screenshot({ type: 'png' });
1805
+ }
1806
+ else {
1807
+ screenshotBuffer = await page.screenshot({ fullPage: command.fullPage, type: 'png' });
1808
+ }
1809
+ const baselineBuffer = fs.readFileSync(command.baseline);
1810
+ const ext = path.extname(command.baseline).toLowerCase();
1811
+ const baselineMime = ext === '.jpg' || ext === '.jpeg' ? 'image/jpeg' : 'image/png';
1812
+ const result = await diffScreenshots(page.context(), baselineBuffer, screenshotBuffer, {
1813
+ threshold: command.threshold,
1814
+ outputPath: command.output,
1815
+ baselineMime,
1816
+ });
1817
+ return successResponse(command.id, result);
1818
+ }
1819
+ async function handleDiffUrl(command, browser) {
1820
+ const page = browser.getPage();
1821
+ const waitUntil = command.waitUntil ?? 'load';
1822
+ const snapshotOpts = {
1823
+ selector: command.selector,
1824
+ compact: command.compact,
1825
+ maxDepth: command.maxDepth,
1826
+ };
1827
+ // Capture state of url1
1828
+ await page.goto(command.url1, { waitUntil });
1829
+ const { tree: tree1 } = await getEnhancedSnapshot(page, snapshotOpts);
1830
+ const snapshot1 = tree1 || 'Empty page';
1831
+ let screenshot1;
1832
+ if (command.screenshot) {
1833
+ screenshot1 = await page.screenshot({ fullPage: command.fullPage, type: 'png' });
1834
+ }
1835
+ // Capture state of url2
1836
+ await page.goto(command.url2, { waitUntil });
1837
+ const { tree: tree2 } = await getEnhancedSnapshot(page, snapshotOpts);
1838
+ const snapshot2 = tree2 || 'Empty page';
1839
+ const snapshotDiff = diffSnapshots(snapshot1, snapshot2);
1840
+ const result = { snapshot: snapshotDiff };
1841
+ if (command.screenshot && screenshot1) {
1842
+ const screenshot2 = await page.screenshot({ fullPage: command.fullPage, type: 'png' });
1843
+ result.screenshot = await diffScreenshots(page.context(), screenshot1, screenshot2, {});
1844
+ }
1845
+ return successResponse(command.id, result);
1846
+ }
1613
1847
  //# sourceMappingURL=actions.js.map