agent-browser 0.11.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/bin/agent-browser-darwin-arm64 +0 -0
- package/bin/agent-browser-darwin-x64 +0 -0
- package/bin/agent-browser-linux-arm64 +0 -0
- package/bin/agent-browser-linux-x64 +0 -0
- package/bin/agent-browser-win32-x64.exe +0 -0
- package/dist/actions.d.ts.map +1 -1
- package/dist/actions.js +235 -1
- package/dist/actions.js.map +1 -1
- package/dist/browser.d.ts +8 -0
- package/dist/browser.d.ts.map +1 -1
- package/dist/browser.js +12 -0
- package/dist/browser.js.map +1 -1
- package/dist/diff.d.ts +18 -0
- package/dist/diff.d.ts.map +1 -0
- package/dist/diff.js +271 -0
- package/dist/diff.js.map +1 -0
- package/dist/protocol.d.ts.map +1 -1
- package/dist/protocol.js +31 -0
- package/dist/protocol.js.map +1 -1
- package/dist/types.d.ts +60 -1
- package/dist/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/skills/agent-browser/SKILL.md +53 -0
package/README.md
CHANGED
|
@@ -112,6 +112,7 @@ agent-browser scrollintoview <sel> # Scroll element into view (alias: scrolli
|
|
|
112
112
|
agent-browser drag <src> <tgt> # Drag and drop
|
|
113
113
|
agent-browser upload <sel> <files> # Upload files
|
|
114
114
|
agent-browser screenshot [path] # Take screenshot (--full for full page, saves to a temporary directory if no path)
|
|
115
|
+
agent-browser screenshot --annotate # Annotated screenshot with numbered element labels
|
|
115
116
|
agent-browser pdf <path> # Save as PDF
|
|
116
117
|
agent-browser snapshot # Accessibility tree with refs (best for AI)
|
|
117
118
|
agent-browser eval <js> # Run JavaScript (-b for base64, --stdin for piped input)
|
|
@@ -253,6 +254,21 @@ agent-browser dialog accept [text] # Accept (with optional prompt text)
|
|
|
253
254
|
agent-browser dialog dismiss # Dismiss
|
|
254
255
|
```
|
|
255
256
|
|
|
257
|
+
### Diff
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
agent-browser diff snapshot # Compare current vs last snapshot
|
|
261
|
+
agent-browser diff snapshot --baseline before.txt # Compare current vs saved snapshot file
|
|
262
|
+
agent-browser diff snapshot --selector "#main" --compact # Scoped snapshot diff
|
|
263
|
+
agent-browser diff screenshot --baseline before.png # Visual pixel diff against baseline
|
|
264
|
+
agent-browser diff screenshot --baseline b.png -o d.png # Save diff image to custom path
|
|
265
|
+
agent-browser diff screenshot --baseline b.png -t 0.2 # Adjust color threshold (0-1)
|
|
266
|
+
agent-browser diff url https://v1.com https://v2.com # Compare two URLs (snapshot diff)
|
|
267
|
+
agent-browser diff url https://v1.com https://v2.com --screenshot # Also visual diff
|
|
268
|
+
agent-browser diff url https://v1.com https://v2.com --wait-until networkidle # Custom wait strategy
|
|
269
|
+
agent-browser diff url https://v1.com https://v2.com --selector "#main" # Scope to element
|
|
270
|
+
```
|
|
271
|
+
|
|
256
272
|
### Debug
|
|
257
273
|
|
|
258
274
|
```bash
|
|
@@ -401,6 +417,27 @@ agent-browser snapshot -i -c -d 5 # Combine options
|
|
|
401
417
|
|
|
402
418
|
The `-C` flag is useful for modern web apps that use custom clickable elements (divs, spans) instead of standard buttons/links.
|
|
403
419
|
|
|
420
|
+
## Annotated Screenshots
|
|
421
|
+
|
|
422
|
+
The `--annotate` flag overlays numbered labels on interactive elements in the screenshot. Each label `[N]` corresponds to ref `@eN`, so the same refs work for both visual and text-based workflows.
|
|
423
|
+
|
|
424
|
+
```bash
|
|
425
|
+
agent-browser screenshot --annotate
|
|
426
|
+
# -> Screenshot saved to /tmp/screenshot-2026-02-17T12-00-00-abc123.png
|
|
427
|
+
# [1] @e1 button "Submit"
|
|
428
|
+
# [2] @e2 link "Home"
|
|
429
|
+
# [3] @e3 textbox "Email"
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
After an annotated screenshot, refs are cached so you can immediately interact with elements:
|
|
433
|
+
|
|
434
|
+
```bash
|
|
435
|
+
agent-browser screenshot --annotate ./page.png
|
|
436
|
+
agent-browser click @e2 # Click the "Home" link labeled [2]
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
This is useful for multimodal AI models that can reason about visual layout, unlabeled icon buttons, canvas elements, or visual state that the text accessibility tree cannot capture.
|
|
440
|
+
|
|
404
441
|
## Options
|
|
405
442
|
|
|
406
443
|
| Option | Description |
|
|
@@ -422,6 +459,7 @@ The `-C` flag is useful for modern web apps that use custom clickable elements (
|
|
|
422
459
|
| `--device <name>` | iOS device name, e.g. "iPhone 15 Pro" (or `AGENT_BROWSER_IOS_DEVICE` env) |
|
|
423
460
|
| `--json` | JSON output (for agents) |
|
|
424
461
|
| `--full, -f` | Full page screenshot |
|
|
462
|
+
| `--annotate` | Annotated screenshot with numbered element labels (or `AGENT_BROWSER_ANNOTATE` env) |
|
|
425
463
|
| `--headed` | Show browser window (not headless) |
|
|
426
464
|
| `--cdp <port\|url>` | Connect via Chrome DevTools Protocol (port or WebSocket URL) |
|
|
427
465
|
| `--auto-connect` | Auto-discover and connect to running Chrome (or `AGENT_BROWSER_AUTO_CONNECT` env) |
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/dist/actions.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"actions.d.ts","sourceRoot":"","sources":["../src/actions.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAUpE,OAAO,KAAK,EACV,OAAO,EACP,QAAQ,
|
|
1
|
+
{"version":3,"file":"actions.d.ts","sourceRoot":"","sources":["../src/actions.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAUpE,OAAO,KAAK,EACV,OAAO,EACP,QAAQ,EAmIT,MAAM,YAAY,CAAC;AAQpB;;;GAGG;AACH,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,CAAC,CAAC,KAAK,EAAE,eAAe,KAAK,IAAI,CAAC,GAAG,IAAI,GAClD,IAAI,CAEN;AAQD;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,GAAG,KAAK,CAqDzE;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuRjG"}
|
package/dist/actions.js
CHANGED
|
@@ -4,6 +4,8 @@ import { mkdirSync } from 'node:fs';
|
|
|
4
4
|
import { getAppDir } from './daemon.js';
|
|
5
5
|
import { getSessionsDir, readStateFile, isValidSessionName, isEncryptedPayload, listStateFiles, cleanupExpiredStates, } from './state-utils.js';
|
|
6
6
|
import { successResponse, errorResponse } from './protocol.js';
|
|
7
|
+
import { diffSnapshots, diffScreenshots } from './diff.js';
|
|
8
|
+
import { getEnhancedSnapshot } from './snapshot.js';
|
|
7
9
|
// Callback for screencast frames - will be set by the daemon when streaming is active
|
|
8
10
|
let screencastFrameCallback = null;
|
|
9
11
|
/**
|
|
@@ -318,6 +320,12 @@ export async function executeCommand(command, browser) {
|
|
|
318
320
|
return await handleRecordingStop(command, browser);
|
|
319
321
|
case 'recording_restart':
|
|
320
322
|
return await handleRecordingRestart(command, browser);
|
|
323
|
+
case 'diff_snapshot':
|
|
324
|
+
return await handleDiffSnapshot(command, browser);
|
|
325
|
+
case 'diff_screenshot':
|
|
326
|
+
return await handleDiffScreenshot(command, browser);
|
|
327
|
+
case 'diff_url':
|
|
328
|
+
return await handleDiffUrl(command, browser);
|
|
321
329
|
default: {
|
|
322
330
|
// TypeScript narrows to never here, but we handle it for safety
|
|
323
331
|
const unknownCommand = command;
|
|
@@ -409,6 +417,12 @@ async function handlePress(command, browser) {
|
|
|
409
417
|
}
|
|
410
418
|
return successResponse(command.id, { pressed: true });
|
|
411
419
|
}
|
|
420
|
+
const ANNOTATION_OVERLAY_ID = '__agent_browser_annotations__';
|
|
421
|
+
async function removeAnnotationOverlay(page) {
|
|
422
|
+
await page
|
|
423
|
+
.evaluate(`(() => { const el = document.getElementById(${JSON.stringify(ANNOTATION_OVERLAY_ID)}); if (el) el.remove(); })()`)
|
|
424
|
+
.catch(() => { });
|
|
425
|
+
}
|
|
412
426
|
async function handleScreenshot(command, browser) {
|
|
413
427
|
const page = browser.getPage();
|
|
414
428
|
const options = {
|
|
@@ -422,6 +436,7 @@ async function handleScreenshot(command, browser) {
|
|
|
422
436
|
if (command.selector) {
|
|
423
437
|
target = browser.getLocator(command.selector);
|
|
424
438
|
}
|
|
439
|
+
let overlayInjected = false;
|
|
425
440
|
try {
|
|
426
441
|
let savePath = command.path;
|
|
427
442
|
if (!savePath) {
|
|
@@ -433,10 +448,150 @@ async function handleScreenshot(command, browser) {
|
|
|
433
448
|
mkdirSync(screenshotDir, { recursive: true });
|
|
434
449
|
savePath = path.join(screenshotDir, filename);
|
|
435
450
|
}
|
|
451
|
+
let annotations;
|
|
452
|
+
if (command.annotate) {
|
|
453
|
+
const { refs } = await browser.getSnapshot({ interactive: true });
|
|
454
|
+
const entries = Object.entries(refs);
|
|
455
|
+
const results = await Promise.all(entries.map(async ([ref, data]) => {
|
|
456
|
+
try {
|
|
457
|
+
const locator = browser.getLocatorFromRef(ref);
|
|
458
|
+
if (!locator)
|
|
459
|
+
return null;
|
|
460
|
+
const box = await locator.boundingBox();
|
|
461
|
+
if (!box || box.width === 0 || box.height === 0)
|
|
462
|
+
return null;
|
|
463
|
+
const num = parseInt(ref.replace('e', ''), 10);
|
|
464
|
+
return {
|
|
465
|
+
ref,
|
|
466
|
+
number: num,
|
|
467
|
+
role: data.role,
|
|
468
|
+
name: data.name || undefined,
|
|
469
|
+
box: {
|
|
470
|
+
x: Math.round(box.x),
|
|
471
|
+
y: Math.round(box.y),
|
|
472
|
+
width: Math.round(box.width),
|
|
473
|
+
height: Math.round(box.height),
|
|
474
|
+
},
|
|
475
|
+
};
|
|
476
|
+
}
|
|
477
|
+
catch {
|
|
478
|
+
return null;
|
|
479
|
+
}
|
|
480
|
+
}));
|
|
481
|
+
// When a selector is provided the screenshot is cropped to that element,
|
|
482
|
+
// so filter to annotations that overlap the target and shift coordinates.
|
|
483
|
+
let targetBox = null;
|
|
484
|
+
if (command.selector) {
|
|
485
|
+
const raw = await browser.getLocator(command.selector).boundingBox();
|
|
486
|
+
if (raw) {
|
|
487
|
+
targetBox = {
|
|
488
|
+
x: Math.round(raw.x),
|
|
489
|
+
y: Math.round(raw.y),
|
|
490
|
+
width: Math.round(raw.width),
|
|
491
|
+
height: Math.round(raw.height),
|
|
492
|
+
};
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
const filtered = results.filter((a) => a !== null);
|
|
496
|
+
// Filter by selector overlap if needed, but keep viewport-relative coords
|
|
497
|
+
// for overlay positioning. Coordinate shifting happens later for metadata only.
|
|
498
|
+
let overlayItems;
|
|
499
|
+
if (targetBox) {
|
|
500
|
+
const tb = targetBox;
|
|
501
|
+
overlayItems = filtered
|
|
502
|
+
.filter((a) => {
|
|
503
|
+
const ax2 = a.box.x + a.box.width;
|
|
504
|
+
const ay2 = a.box.y + a.box.height;
|
|
505
|
+
const bx2 = tb.x + tb.width;
|
|
506
|
+
const by2 = tb.y + tb.height;
|
|
507
|
+
return a.box.x < bx2 && ax2 > tb.x && a.box.y < by2 && ay2 > tb.y;
|
|
508
|
+
})
|
|
509
|
+
.sort((a, b) => a.number - b.number);
|
|
510
|
+
}
|
|
511
|
+
else {
|
|
512
|
+
overlayItems = filtered.sort((a, b) => a.number - b.number);
|
|
513
|
+
}
|
|
514
|
+
if (overlayItems.length > 0) {
|
|
515
|
+
const overlayData = overlayItems.map((a) => ({
|
|
516
|
+
number: a.number,
|
|
517
|
+
x: a.box.x,
|
|
518
|
+
y: a.box.y,
|
|
519
|
+
width: a.box.width,
|
|
520
|
+
height: a.box.height,
|
|
521
|
+
}));
|
|
522
|
+
// Uses position:absolute with document-relative coords so labels render
|
|
523
|
+
// correctly for both viewport and fullPage screenshots, and when the
|
|
524
|
+
// screenshot is scoped to a selector element.
|
|
525
|
+
await page.evaluate(`(() => {
|
|
526
|
+
var items = ${JSON.stringify(overlayData)};
|
|
527
|
+
var id = ${JSON.stringify(ANNOTATION_OVERLAY_ID)};
|
|
528
|
+
var sx = window.scrollX || 0;
|
|
529
|
+
var sy = window.scrollY || 0;
|
|
530
|
+
var c = document.createElement('div');
|
|
531
|
+
c.id = id;
|
|
532
|
+
c.style.cssText = 'position:absolute;top:0;left:0;width:0;height:0;pointer-events:none;z-index:2147483647;';
|
|
533
|
+
for (var i = 0; i < items.length; i++) {
|
|
534
|
+
var it = items[i];
|
|
535
|
+
var dx = it.x + sx;
|
|
536
|
+
var dy = it.y + sy;
|
|
537
|
+
var b = document.createElement('div');
|
|
538
|
+
b.style.cssText = 'position:absolute;left:' + dx + 'px;top:' + dy + 'px;width:' + it.width + 'px;height:' + it.height + 'px;border:2px solid rgba(255,0,0,0.8);box-sizing:border-box;pointer-events:none;';
|
|
539
|
+
var l = document.createElement('div');
|
|
540
|
+
l.textContent = String(it.number);
|
|
541
|
+
var labelTop = dy < 14 ? '2px' : '-14px';
|
|
542
|
+
l.style.cssText = 'position:absolute;top:' + labelTop + ';left:-2px;background:rgba(255,0,0,0.9);color:#fff;font:bold 11px/14px monospace;padding:0 4px;border-radius:2px;white-space:nowrap;';
|
|
543
|
+
b.appendChild(l);
|
|
544
|
+
c.appendChild(b);
|
|
545
|
+
}
|
|
546
|
+
document.documentElement.appendChild(c);
|
|
547
|
+
})()`);
|
|
548
|
+
overlayInjected = true;
|
|
549
|
+
}
|
|
550
|
+
// Build returned annotation metadata with image-relative coordinates.
|
|
551
|
+
// Selector: shift to target-element-relative.
|
|
552
|
+
// fullPage: convert to document-relative (matching fullPage image origin).
|
|
553
|
+
// Default: viewport-relative (unchanged).
|
|
554
|
+
if (targetBox) {
|
|
555
|
+
const tb = targetBox;
|
|
556
|
+
annotations = overlayItems.map((a) => ({
|
|
557
|
+
...a,
|
|
558
|
+
box: {
|
|
559
|
+
x: a.box.x - tb.x,
|
|
560
|
+
y: a.box.y - tb.y,
|
|
561
|
+
width: a.box.width,
|
|
562
|
+
height: a.box.height,
|
|
563
|
+
},
|
|
564
|
+
}));
|
|
565
|
+
}
|
|
566
|
+
else if (command.fullPage) {
|
|
567
|
+
const scroll = (await page.evaluate(`({x: window.scrollX || 0, y: window.scrollY || 0})`));
|
|
568
|
+
annotations = overlayItems.map((a) => ({
|
|
569
|
+
...a,
|
|
570
|
+
box: {
|
|
571
|
+
x: a.box.x + scroll.x,
|
|
572
|
+
y: a.box.y + scroll.y,
|
|
573
|
+
width: a.box.width,
|
|
574
|
+
height: a.box.height,
|
|
575
|
+
},
|
|
576
|
+
}));
|
|
577
|
+
}
|
|
578
|
+
else {
|
|
579
|
+
annotations = overlayItems;
|
|
580
|
+
}
|
|
581
|
+
}
|
|
436
582
|
await target.screenshot({ ...options, path: savePath });
|
|
437
|
-
|
|
583
|
+
if (overlayInjected) {
|
|
584
|
+
await removeAnnotationOverlay(page);
|
|
585
|
+
}
|
|
586
|
+
return successResponse(command.id, {
|
|
587
|
+
path: savePath,
|
|
588
|
+
...(annotations && annotations.length > 0 ? { annotations } : {}),
|
|
589
|
+
});
|
|
438
590
|
}
|
|
439
591
|
catch (error) {
|
|
592
|
+
if (overlayInjected) {
|
|
593
|
+
await removeAnnotationOverlay(page);
|
|
594
|
+
}
|
|
440
595
|
if (command.selector) {
|
|
441
596
|
throw toAIFriendlyError(error, command.selector);
|
|
442
597
|
}
|
|
@@ -1610,4 +1765,83 @@ async function handleRecordingRestart(command, browser) {
|
|
|
1610
1765
|
stopped: result.stopped,
|
|
1611
1766
|
});
|
|
1612
1767
|
}
|
|
1768
|
+
// Diff handlers
|
|
1769
|
+
async function handleDiffSnapshot(command, browser) {
|
|
1770
|
+
let before;
|
|
1771
|
+
if (command.baseline) {
|
|
1772
|
+
try {
|
|
1773
|
+
before = fs.readFileSync(command.baseline, 'utf-8');
|
|
1774
|
+
}
|
|
1775
|
+
catch {
|
|
1776
|
+
return errorResponse(command.id, `Cannot read baseline file: ${command.baseline}`);
|
|
1777
|
+
}
|
|
1778
|
+
}
|
|
1779
|
+
else {
|
|
1780
|
+
before = browser.getLastSnapshot();
|
|
1781
|
+
if (!before) {
|
|
1782
|
+
return errorResponse(command.id, 'No previous snapshot in this session. Take a snapshot first, or use --baseline <file>.');
|
|
1783
|
+
}
|
|
1784
|
+
}
|
|
1785
|
+
const page = browser.getPage();
|
|
1786
|
+
const { tree } = await getEnhancedSnapshot(page, {
|
|
1787
|
+
selector: command.selector,
|
|
1788
|
+
compact: command.compact,
|
|
1789
|
+
maxDepth: command.maxDepth,
|
|
1790
|
+
});
|
|
1791
|
+
const after = tree || 'Empty page';
|
|
1792
|
+
const result = diffSnapshots(before, after);
|
|
1793
|
+
browser.setLastSnapshot(after);
|
|
1794
|
+
return successResponse(command.id, result);
|
|
1795
|
+
}
|
|
1796
|
+
async function handleDiffScreenshot(command, browser) {
|
|
1797
|
+
if (!fs.existsSync(command.baseline)) {
|
|
1798
|
+
return errorResponse(command.id, `Baseline file not found: ${command.baseline}`);
|
|
1799
|
+
}
|
|
1800
|
+
const page = browser.getPage();
|
|
1801
|
+
let screenshotBuffer;
|
|
1802
|
+
if (command.selector) {
|
|
1803
|
+
const locator = browser.getLocatorFromRef(command.selector) || page.locator(command.selector);
|
|
1804
|
+
screenshotBuffer = await locator.screenshot({ type: 'png' });
|
|
1805
|
+
}
|
|
1806
|
+
else {
|
|
1807
|
+
screenshotBuffer = await page.screenshot({ fullPage: command.fullPage, type: 'png' });
|
|
1808
|
+
}
|
|
1809
|
+
const baselineBuffer = fs.readFileSync(command.baseline);
|
|
1810
|
+
const ext = path.extname(command.baseline).toLowerCase();
|
|
1811
|
+
const baselineMime = ext === '.jpg' || ext === '.jpeg' ? 'image/jpeg' : 'image/png';
|
|
1812
|
+
const result = await diffScreenshots(page.context(), baselineBuffer, screenshotBuffer, {
|
|
1813
|
+
threshold: command.threshold,
|
|
1814
|
+
outputPath: command.output,
|
|
1815
|
+
baselineMime,
|
|
1816
|
+
});
|
|
1817
|
+
return successResponse(command.id, result);
|
|
1818
|
+
}
|
|
1819
|
+
async function handleDiffUrl(command, browser) {
|
|
1820
|
+
const page = browser.getPage();
|
|
1821
|
+
const waitUntil = command.waitUntil ?? 'load';
|
|
1822
|
+
const snapshotOpts = {
|
|
1823
|
+
selector: command.selector,
|
|
1824
|
+
compact: command.compact,
|
|
1825
|
+
maxDepth: command.maxDepth,
|
|
1826
|
+
};
|
|
1827
|
+
// Capture state of url1
|
|
1828
|
+
await page.goto(command.url1, { waitUntil });
|
|
1829
|
+
const { tree: tree1 } = await getEnhancedSnapshot(page, snapshotOpts);
|
|
1830
|
+
const snapshot1 = tree1 || 'Empty page';
|
|
1831
|
+
let screenshot1;
|
|
1832
|
+
if (command.screenshot) {
|
|
1833
|
+
screenshot1 = await page.screenshot({ fullPage: command.fullPage, type: 'png' });
|
|
1834
|
+
}
|
|
1835
|
+
// Capture state of url2
|
|
1836
|
+
await page.goto(command.url2, { waitUntil });
|
|
1837
|
+
const { tree: tree2 } = await getEnhancedSnapshot(page, snapshotOpts);
|
|
1838
|
+
const snapshot2 = tree2 || 'Empty page';
|
|
1839
|
+
const snapshotDiff = diffSnapshots(snapshot1, snapshot2);
|
|
1840
|
+
const result = { snapshot: snapshotDiff };
|
|
1841
|
+
if (command.screenshot && screenshot1) {
|
|
1842
|
+
const screenshot2 = await page.screenshot({ fullPage: command.fullPage, type: 'png' });
|
|
1843
|
+
result.screenshot = await diffScreenshots(page.context(), screenshot1, screenshot2, {});
|
|
1844
|
+
}
|
|
1845
|
+
return successResponse(command.id, result);
|
|
1846
|
+
}
|
|
1613
1847
|
//# sourceMappingURL=actions.js.map
|