barebrowse 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +78 -0
- package/README.md +9 -4
- package/barebrowse.context.md +34 -5
- package/cli.js +3 -0
- package/mcp-server.js +35 -7
- package/package.json +35 -5
- package/src/auth.js +7 -4
- package/src/bareagent.js +26 -5
- package/src/cdp.js +15 -4
- package/src/chromium.js +7 -2
- package/src/daemon.js +37 -4
- package/src/index.js +28 -1
- package/src/network-idle.js +4 -1
- package/src/prune.js +1 -1
- package/src/readable.js +116 -0
- package/src/session-client.js +1 -1
- package/src/wearehere.d.ts +6 -0
- package/types/aria.d.ts +17 -0
- package/types/auth.d.ts +35 -0
- package/types/bareagent.d.ts +25 -0
- package/types/blocklist.d.ts +21 -0
- package/types/cdp.d.ts +6 -0
- package/types/chromium.d.ts +58 -0
- package/types/consent.d.ts +9 -0
- package/types/daemon.d.ts +10 -0
- package/types/index.d.ts +138 -0
- package/types/interact.d.ts +79 -0
- package/types/network-idle.d.ts +19 -0
- package/types/prune.d.ts +13 -0
- package/types/readable.d.ts +18 -0
- package/types/session-client.d.ts +19 -0
- package/types/stealth.d.ts +14 -0
- package/types/url-guard.d.ts +26 -0
- package/.github/workflows/publish.yml +0 -26
- package/commands/barebrowse/SKILL.md +0 -137
- package/commands/barebrowse.md +0 -136
package/src/daemon.js
CHANGED
|
@@ -11,6 +11,7 @@ import { writeFileSync, mkdirSync, existsSync, readFileSync, unlinkSync } from '
|
|
|
11
11
|
import { randomBytes, timingSafeEqual } from 'node:crypto';
|
|
12
12
|
import { join, resolve } from 'node:path';
|
|
13
13
|
import { connect } from './index.js';
|
|
14
|
+
import { formatReadable } from './readable.js';
|
|
14
15
|
|
|
15
16
|
/** Owner-only file write helper — daemon artifacts can hold authenticated content. */
|
|
16
17
|
function writeFilePrivate(path, data) {
|
|
@@ -191,6 +192,17 @@ export async function runDaemon(opts, outputDir, initialUrl) {
|
|
|
191
192
|
return { ok: true, file };
|
|
192
193
|
},
|
|
193
194
|
|
|
195
|
+
async readable() {
|
|
196
|
+
const r = await page.readable();
|
|
197
|
+
// A non-article page is not an error — surface the hint so the agent
|
|
198
|
+
// knows to fall back to snapshot, rather than failing the command.
|
|
199
|
+
if (!r.ok) return { ok: true, value: r.hint };
|
|
200
|
+
const ts = new Date().toISOString().replace(/[:.]/g, '-');
|
|
201
|
+
const file = join(absDir, `article-${ts}.txt`);
|
|
202
|
+
writeFilePrivate(file, formatReadable(r));
|
|
203
|
+
return { ok: true, file };
|
|
204
|
+
},
|
|
205
|
+
|
|
194
206
|
async screenshot({ format }) {
|
|
195
207
|
const data = await page.screenshot({ format: format || 'png' });
|
|
196
208
|
const ts = new Date().toISOString().replace(/[:.]/g, '-');
|
|
@@ -361,8 +373,11 @@ export async function runDaemon(opts, outputDir, initialUrl) {
|
|
|
361
373
|
// Start HTTP server on random port
|
|
362
374
|
const server = createServer(async (req, res) => {
|
|
363
375
|
if (req.method === 'GET' && req.url === '/status') {
|
|
376
|
+
// Liveness only — no pid. /status is the one pre-auth endpoint, and
|
|
377
|
+
// isAlive() just checks res.ok; the pid clients show comes from
|
|
378
|
+
// session.json (owner-only), so nothing consumes a pid here.
|
|
364
379
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
365
|
-
res.end(JSON.stringify({ ok: true
|
|
380
|
+
res.end(JSON.stringify({ ok: true }));
|
|
366
381
|
return;
|
|
367
382
|
}
|
|
368
383
|
|
|
@@ -380,8 +395,20 @@ export async function runDaemon(opts, outputDir, initialUrl) {
|
|
|
380
395
|
return;
|
|
381
396
|
}
|
|
382
397
|
|
|
398
|
+
// Cap the request body. Post-auth, single local user, but an unbounded
|
|
399
|
+
// `body +=` is a needless memory-DoS foot-gun. 16 MB covers any realistic
|
|
400
|
+
// eval expression / typed text.
|
|
401
|
+
const MAX_BODY = 16 * 1024 * 1024;
|
|
383
402
|
let body = '';
|
|
384
|
-
for await (const chunk of req) body += chunk;
|
|
403
|
+
for await (const chunk of req) {
|
|
404
|
+
body += chunk;
|
|
405
|
+
if (body.length > MAX_BODY) {
|
|
406
|
+
res.writeHead(413, { 'Content-Type': 'application/json' });
|
|
407
|
+
res.end(JSON.stringify({ ok: false, error: 'Request body too large' }));
|
|
408
|
+
req.destroy();
|
|
409
|
+
return;
|
|
410
|
+
}
|
|
411
|
+
}
|
|
385
412
|
|
|
386
413
|
let parsed;
|
|
387
414
|
try {
|
|
@@ -416,11 +443,17 @@ export async function runDaemon(opts, outputDir, initialUrl) {
|
|
|
416
443
|
}
|
|
417
444
|
});
|
|
418
445
|
|
|
419
|
-
|
|
446
|
+
/** @type {Promise<void>} */
|
|
447
|
+
const listening = new Promise((resolve) => {
|
|
420
448
|
server.listen(0, '127.0.0.1', () => resolve());
|
|
421
449
|
});
|
|
450
|
+
await listening;
|
|
422
451
|
|
|
423
|
-
const port = server.address().port;
|
|
452
|
+
const address = server.address();
|
|
453
|
+
if (!address || typeof address === 'string') {
|
|
454
|
+
throw new Error('Daemon server failed to bind to a TCP port');
|
|
455
|
+
}
|
|
456
|
+
const port = address.port;
|
|
424
457
|
|
|
425
458
|
// Write session.json so parent/clients can find us. Owner-only: it carries
|
|
426
459
|
// the auth token that gates /command.
|
package/src/index.js
CHANGED
|
@@ -18,6 +18,7 @@ import { dismissConsent } from './consent.js';
|
|
|
18
18
|
import { applyStealth } from './stealth.js';
|
|
19
19
|
import { DEFAULT_BLOCKLIST } from './blocklist.js';
|
|
20
20
|
import { waitForNetworkIdle } from './network-idle.js';
|
|
21
|
+
import { readable as extractReadable } from './readable.js';
|
|
21
22
|
import { assertNavigable, assertUploadAllowed } from './url-guard.js';
|
|
22
23
|
import { join as pathJoin } from 'node:path';
|
|
23
24
|
import { chmodSync } from 'node:fs';
|
|
@@ -37,6 +38,17 @@ import { chmodSync } from 'node:fs';
|
|
|
37
38
|
* See src/blocklist.js for the default set. Set false to disable.
|
|
38
39
|
* @param {string[]} [opts.blockUrls] - Extra URL glob patterns to block,
|
|
39
40
|
* merged with the default unless blockAds:false.
|
|
41
|
+
* @param {boolean} [opts.allowLocalUrls=false] - Permit navigation to local-
|
|
42
|
+
* resource schemes (file:, view-source:, chrome:, …). Blocked by default.
|
|
43
|
+
* @param {boolean} [opts.blockPrivateNetwork=false] - Reject navigation to
|
|
44
|
+
* loopback / RFC-1918 / link-local / cloud-metadata hosts (SSRF guard).
|
|
45
|
+
* @param {string} [opts.proxy] - Proxy server (e.g. 'http://host:port').
|
|
46
|
+
* @param {string} [opts.binary] - Path to browser binary (auto-detected if omitted).
|
|
47
|
+
* @param {string} [opts.userDataDir] - Browser profile directory.
|
|
48
|
+
* @param {{width: number, height: number}} [opts.viewport] - Viewport dimensions.
|
|
49
|
+
* @param {string} [opts.browser] - Source browser for cookie extraction.
|
|
50
|
+
* @param {boolean} [opts.consent=true] - Auto-dismiss cookie consent dialogs.
|
|
51
|
+
* @param {'act'|'browse'|'navigate'|'full'|'read'} [opts.pruneMode='act'] - Pruning mode.
|
|
40
52
|
* @returns {Promise<string>} ARIA snapshot text
|
|
41
53
|
*/
|
|
42
54
|
export async function browse(url, opts = {}) {
|
|
@@ -169,6 +181,14 @@ export async function browse(url, opts = {}) {
|
|
|
169
181
|
* @param {string} [opts.uploadDir] - When set, upload() rejects any file that
|
|
170
182
|
* does not resolve (symlinks included) inside this directory. Sandboxes the
|
|
171
183
|
* agent's file-upload capability. Default: no restriction.
|
|
184
|
+
* @param {string} [opts.proxy] - Proxy server (e.g. 'http://host:port').
|
|
185
|
+
* @param {string} [opts.binary] - Path to browser binary (auto-detected if omitted).
|
|
186
|
+
* @param {string} [opts.userDataDir] - Browser profile directory.
|
|
187
|
+
* @param {{width: number, height: number}} [opts.viewport] - Viewport dimensions.
|
|
188
|
+
* @param {boolean} [opts.consent=true] - Auto-dismiss cookie consent dialogs.
|
|
189
|
+
* @param {string} [opts.storageState] - Path to a storage-state JSON file
|
|
190
|
+
* (cookies + localStorage) to load before navigation.
|
|
191
|
+
* @param {'act'|'browse'|'navigate'|'full'|'read'} [opts.pruneMode='act'] - Pruning mode.
|
|
172
192
|
* @returns {Promise<object>} Page handle with goto, snapshot, close
|
|
173
193
|
*/
|
|
174
194
|
export async function connect(opts = {}) {
|
|
@@ -189,7 +209,7 @@ export async function connect(opts = {}) {
|
|
|
189
209
|
// Reuse the user's running browser — do not launch, do not own the
|
|
190
210
|
// profile. cleanupBrowser() is a no-op on this shape (process: null,
|
|
191
211
|
// ownedProfileDir: null), which is the whole point.
|
|
192
|
-
browser = await attach({ port: opts.port });
|
|
212
|
+
browser = await attach({ port: opts.port ?? 0 });
|
|
193
213
|
cdp = await createCDP(browser.wsUrl);
|
|
194
214
|
} else if (mode === 'headed') {
|
|
195
215
|
browser = await launch({ ...launchOpts, headed: true });
|
|
@@ -440,6 +460,13 @@ export async function connect(opts = {}) {
|
|
|
440
460
|
return stats + '\n' + hint + warn + out;
|
|
441
461
|
},
|
|
442
462
|
|
|
463
|
+
// Clean article text (Firefox Reader View engine), for reading/summarising
|
|
464
|
+
// — not for interacting. Returns { ok:false, hint } on non-article pages.
|
|
465
|
+
// See readable.js for why this never hard-gates on article detection.
|
|
466
|
+
async readable() {
|
|
467
|
+
return extractReadable(page.session);
|
|
468
|
+
},
|
|
469
|
+
|
|
443
470
|
async click(ref) {
|
|
444
471
|
const entry = refMap.get(ref);
|
|
445
472
|
if (!entry) throw new Error(`No element found for ref "${ref}"`);
|
package/src/network-idle.js
CHANGED
|
@@ -12,12 +12,14 @@
|
|
|
12
12
|
* @param {object} [opts]
|
|
13
13
|
* @param {number} [opts.timeout=30000] - Max wait time before reject
|
|
14
14
|
* @param {number} [opts.idle=500] - Required idle duration before resolve
|
|
15
|
+
* @returns {Promise<void>}
|
|
15
16
|
*/
|
|
16
17
|
export function waitForNetworkIdle(session, opts = {}) {
|
|
17
18
|
const timeout = opts.timeout || 30000;
|
|
18
19
|
const idle = opts.idle || 500;
|
|
19
20
|
|
|
20
|
-
|
|
21
|
+
/** @type {Promise<void>} */
|
|
22
|
+
const settled = new Promise((resolve, reject) => {
|
|
21
23
|
const pending = new Set();
|
|
22
24
|
let timer = null;
|
|
23
25
|
const unsubs = [];
|
|
@@ -59,4 +61,5 @@ export function waitForNetworkIdle(session, opts = {}) {
|
|
|
59
61
|
// Start check immediately (might already be idle)
|
|
60
62
|
check();
|
|
61
63
|
});
|
|
64
|
+
return settled;
|
|
62
65
|
}
|
package/src/prune.js
CHANGED
|
@@ -60,7 +60,7 @@ const SKIP_ROLES = new Set([
|
|
|
60
60
|
*
|
|
61
61
|
* @param {object} tree - Root node from buildTree() (CDP format)
|
|
62
62
|
* @param {object} [options]
|
|
63
|
-
* @param {'act'|'browse'|'navigate'|'full'} [options.mode='act'] - Pruning mode
|
|
63
|
+
* @param {'act'|'browse'|'navigate'|'full'|'read'} [options.mode='act'] - Pruning mode ('read' is an alias for 'browse')
|
|
64
64
|
* @param {string} [options.context=''] - Search context for relevance filtering
|
|
65
65
|
* @returns {object|null} Pruned tree
|
|
66
66
|
*/
|
package/src/readable.js
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* readable.js — extract the main article of a page as clean reading text.
|
|
3
|
+
*
|
|
4
|
+
* Companion to snapshot(): snapshot() yields an *actionable* ARIA tree for
|
|
5
|
+
* clicking/typing; readable() yields the *readable* article (title + body
|
|
6
|
+
* prose, nav/ads/sidebars stripped) for "read/summarise this" tasks, where
|
|
7
|
+
* snapshot() is both noisy and silently lossy on long prose.
|
|
8
|
+
*
|
|
9
|
+
* Runs Mozilla's Readability (the engine behind Firefox Reader View) inside
|
|
10
|
+
* the live page over CDP — so JS-rendered articles work, unlike a raw fetch.
|
|
11
|
+
* `isProbablyReaderable` gives an article-likelihood signal, but it is not
|
|
12
|
+
* reliable on its own (false negatives on minimally-marked-up essays, false
|
|
13
|
+
* positives on link-dense portals), so readable() never hard-gates: it always
|
|
14
|
+
* returns whatever Readability extracted plus an advisory `confidence`. A
|
|
15
|
+
* low-confidence result is the agent's cue to fall back to snapshot().
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { readFileSync } from 'node:fs';
|
|
19
|
+
import { createRequire } from 'node:module';
|
|
20
|
+
|
|
21
|
+
const require = createRequire(import.meta.url);
|
|
22
|
+
|
|
23
|
+
// Read the self-contained browser builds once at module load and inject their
|
|
24
|
+
// source into the page. Both define globals (Readability, isProbablyReaderable)
|
|
25
|
+
// when evaluated in a non-module context; the `if (typeof module ...)` tails are
|
|
26
|
+
// harmless no-ops in the page.
|
|
27
|
+
const READABILITY_SRC = readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf8');
|
|
28
|
+
const READERABLE_SRC = readFileSync(require.resolve('@mozilla/readability/Readability-readerable.js'), 'utf8');
|
|
29
|
+
|
|
30
|
+
/** Below this many characters of extracted text, treat as low confidence. */
|
|
31
|
+
const MIN_ARTICLE_CHARS = 1500;
|
|
32
|
+
|
|
33
|
+
// Fully static — interpolates only the two module-level source constants — so
|
|
34
|
+
// it's built once at load, not rebuilt (~120 KB) on every readable() call.
|
|
35
|
+
const EXTRACT_EXPRESSION = `(() => {
|
|
36
|
+
${READERABLE_SRC}
|
|
37
|
+
${READABILITY_SRC}
|
|
38
|
+
try {
|
|
39
|
+
const readerable = isProbablyReaderable(document);
|
|
40
|
+
// Readability mutates the document it parses — clone so the live page
|
|
41
|
+
// (and any later snapshot()/interaction) is untouched.
|
|
42
|
+
const art = new Readability(document.cloneNode(true)).parse();
|
|
43
|
+
if (!art || !art.textContent || !art.textContent.trim()) {
|
|
44
|
+
return { ok: false, readerable };
|
|
45
|
+
}
|
|
46
|
+
return {
|
|
47
|
+
ok: true,
|
|
48
|
+
readerable,
|
|
49
|
+
title: art.title || '',
|
|
50
|
+
byline: art.byline || '',
|
|
51
|
+
text: art.textContent.trim(),
|
|
52
|
+
length: art.length || art.textContent.length,
|
|
53
|
+
};
|
|
54
|
+
} catch (e) {
|
|
55
|
+
return { ok: false, err: String(e && e.message || e) };
|
|
56
|
+
}
|
|
57
|
+
})()`;
|
|
58
|
+
|
|
59
|
+
/**
 * Render a readable() result as a plain-text block.
 *
 * Successful result: a short header — `title:`, an optional `byline:` line
 * (omitted when the byline is empty), and `confidence:` with the fall-back
 * hint appended in parentheses when one is present — followed by a blank
 * line and the article body.
 *
 * Failed result (`ok` falsy): the hint alone. If the failure object carries
 * no hint (or `r` itself is null/undefined) a generic "no article content"
 * message is returned instead, so the function always yields a string as
 * documented rather than `undefined`.
 *
 * Shared by the MCP, bareagent, and CLI/daemon surfaces so their output
 * can't drift apart.
 *
 * @param {object} r - a readable() result.
 * @returns {string}
 */
export function formatReadable(r) {
  if (!r || !r.ok) {
    // Same wording readable() uses for the "nothing extracted" case.
    return (r && r.hint) || 'no article content found on this page; use snapshot() instead';
  }

  const headerLines = [`title: ${r.title}`];
  if (r.byline) headerLines.push(`byline: ${r.byline}`);
  headerLines.push(`confidence: ${r.confidence}${r.hint ? ` (${r.hint})` : ''}`);

  // Header lines, one blank line, then the body.
  return `${headerLines.join('\n')}\n\n${r.text}`;
}
|
|
73
|
+
|
|
74
|
+
/**
 * Extract the main article from the current page.
 *
 * Evaluates EXTRACT_EXPRESSION in the page via `Runtime.evaluate` and
 * normalises the value it returns. Never throws for "not an article": every
 * non-success path yields `{ ok: false, hint }` so callers can fall back to
 * snapshot().
 *
 * @param {object} session - CDP session-scoped handle (.send()).
 * @returns {Promise<object>} One of:
 *   { ok: false, hint }                                  — no article content
 *                                                          found, or the
 *                                                          extraction failed
 *   { ok: true, title, byline, text, length,
 *     confidence: 'high'|'low', readerable, hint? }      — extracted article
 */
export async function readable(session) {
  const response = await session.send('Runtime.evaluate', {
    expression: EXTRACT_EXPRESSION,
    returnByValue: true,
    awaitPromise: true,
  });

  // The in-page try/catch only wraps the extraction call. An error thrown
  // while evaluating the injected library source itself is reported by CDP as
  // `exceptionDetails`, with no usable `result.value`. Surface it rather than
  // misreporting it as "no article content".
  // NOTE(review): assumes session.send() resolves with the raw CDP payload
  // (including exceptionDetails) instead of rejecting — confirm in cdp.js.
  const thrown = response && response.exceptionDetails;
  if (thrown) {
    const description = (thrown.exception && thrown.exception.description) || thrown.text || 'evaluation error';
    const reason = String(description).split('\n')[0];
    return { ok: false, hint: `readable extraction failed (${reason}); use snapshot()` };
  }

  const r = (response && response.result && response.result.value) || {};

  if (!r.ok) {
    return {
      ok: false,
      hint: r.err
        ? `readable extraction failed (${r.err}); use snapshot()`
        : 'no article content found on this page; use snapshot() instead',
    };
  }

  // Advisory confidence: high only when the reader-view heuristic agrees AND
  // there is a substantial amount of text. Low is not an error — the text is
  // still returned; it just means "verify, or prefer snapshot()".
  const confidence = r.readerable && r.length >= MIN_ARTICLE_CHARS ? 'high' : 'low';
  const out = {
    ok: true,
    title: r.title,
    byline: r.byline,
    text: r.text,
    length: r.length,
    readerable: r.readerable,
    confidence,
  };
  if (confidence === 'low') {
    out.hint = 'low article confidence — this may not be an article; consider snapshot()';
  }
  return out;
}
|
package/src/session-client.js
CHANGED
|
@@ -13,7 +13,7 @@ const SESSION_FILE = 'session.json';
|
|
|
13
13
|
|
|
14
14
|
/**
|
|
15
15
|
* Read session.json from the output directory.
|
|
16
|
-
* @returns {{ port: number, pid: number, startedAt: string } | null}
|
|
16
|
+
* @returns {{ port: number, pid: number, token?: string, startedAt: string } | null}
|
|
17
17
|
*/
|
|
18
18
|
export function readSession(outputDir) {
|
|
19
19
|
const sessionPath = join(resolve(outputDir), SESSION_FILE);
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// Ambient shim for the optional 'wearehere' dependency.
|
|
2
|
+
// It is dynamically imported and may not be installed; this declaration
|
|
3
|
+
// satisfies the typechecker without pulling in a hard dependency.
|
|
4
|
+
declare module 'wearehere' {
|
|
5
|
+
export function assess(...args: any[]): Promise<any>;
|
|
6
|
+
}
|
package/types/aria.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* aria.js — Format ARIA accessibility tree nodes for agent consumption.
|
|
3
|
+
*
|
|
4
|
+
* Takes a nested tree (built from CDP's Accessibility.getFullAXTree)
|
|
5
|
+
* and formats it as readable YAML-like text, similar to Playwright's ariaSnapshot.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Format a nested ARIA tree as readable text output.
|
|
9
|
+
*
|
|
10
|
+
* Output format (one node per line, indented):
|
|
11
|
+
* - role "name" [props] [ref=nodeId]
|
|
12
|
+
*
|
|
13
|
+
* @param {object} node - Tree node { role, name, properties, children, ignored, nodeId }
|
|
14
|
+
* @param {number} [depth=0] - Current indentation depth
|
|
15
|
+
* @returns {string} Formatted ARIA tree text
|
|
16
|
+
*/
|
|
17
|
+
export function formatTree(node: object, depth?: number): string;
|
package/types/auth.d.ts
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract cookies from the user's browser, auto-detecting which browser to use.
|
|
3
|
+
* @param {object} [opts]
|
|
4
|
+
* @param {string} [opts.browser] - 'chromium', 'chrome', 'brave', 'edge', 'firefox', or 'auto'
|
|
5
|
+
* @param {string} [opts.domain] - Filter by domain
|
|
6
|
+
* @returns {Array<object>} Cookies in CDP-compatible format
|
|
7
|
+
*/
|
|
8
|
+
export function extractCookies(opts?: {
|
|
9
|
+
browser?: string | undefined;
|
|
10
|
+
domain?: string | undefined;
|
|
11
|
+
}): Array<object>;
|
|
12
|
+
/**
|
|
13
|
+
* Inject cookies into a CDP session via Network.setCookie.
|
|
14
|
+
* @param {object} session - CDP session handle (from cdp.session())
|
|
15
|
+
* @param {Array<object>} cookies - Cookies from extractCookies()
|
|
16
|
+
*/
|
|
17
|
+
export function injectCookies(session: object, cookies: Array<object>): Promise<void>;
|
|
18
|
+
/**
|
|
19
|
+
* RFC 6265 domain-match: does `host` belong to a cookie declared for
|
|
20
|
+
* `cookieDomain`? Leading dot on the cookie domain is ignored (host-only
|
|
21
|
+
* vs domain cookies are matched the same here, intentionally — we want
|
|
22
|
+
* parent-domain cookies like .google.com to apply to mail.google.com).
|
|
23
|
+
* @param {string} host - target hostname (e.g. 'mail.google.com')
|
|
24
|
+
* @param {string} cookieDomain - cookie's host_key (e.g. '.google.com')
|
|
25
|
+
* @returns {boolean}
|
|
26
|
+
*/
|
|
27
|
+
export function cookieDomainMatch(host: string, cookieDomain: string): boolean;
|
|
28
|
+
/**
|
|
29
|
+
* Extract cookies for a URL and inject them into a CDP session.
|
|
30
|
+
* Convenience function combining extractCookies + injectCookies.
|
|
31
|
+
* @param {object} session - CDP session handle
|
|
32
|
+
* @param {string} url - URL to extract cookies for
|
|
33
|
+
* @param {object} [opts] - Options passed to extractCookies
|
|
34
|
+
*/
|
|
35
|
+
export function authenticate(session: object, url: string, opts?: object): Promise<number>;
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @typedef {object} BrowseTool
|
|
3
|
+
* @property {string} name
|
|
4
|
+
* @property {string} description
|
|
5
|
+
* @property {object} parameters - JSON-schema-shaped parameter spec
|
|
6
|
+
* @property {(args?: any) => Promise<any>} execute
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Create bareagent-compatible browse tools.
|
|
10
|
+
* @param {object} [opts] - Options passed to connect() for session tools
|
|
11
|
+
* @returns {{ tools: Array, close: () => Promise<void> }}
|
|
12
|
+
*/
|
|
13
|
+
export function createBrowseTools(opts?: object): {
|
|
14
|
+
tools: any[];
|
|
15
|
+
close: () => Promise<void>;
|
|
16
|
+
};
|
|
17
|
+
export type BrowseTool = {
|
|
18
|
+
name: string;
|
|
19
|
+
description: string;
|
|
20
|
+
/**
|
|
21
|
+
* - JSON-schema-shaped parameter spec
|
|
22
|
+
*/
|
|
23
|
+
parameters: object;
|
|
24
|
+
execute: (args?: any) => Promise<any>;
|
|
25
|
+
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* blocklist.js — Ad/tracker URL patterns for CDP Network.setBlockedURLs.
|
|
3
|
+
*
|
|
4
|
+
* Curated by real-world frequency, not pulled wholesale from Peter Lowe /
|
|
5
|
+
* EasyList. CDP does linear pattern matching per request, so 3,000-entry
|
|
6
|
+
* lists add ~150ms cumulative cost on a typical page for ~5% extra coverage
|
|
7
|
+
* (long-tail regional networks the agent rarely encounters). The set below
|
|
8
|
+
* is ~120 patterns covering the trackers that actually show up in agent
|
|
9
|
+
* traffic: Google/FB/Amazon/MS/Adobe ad+analytics, the major SaaS analytics
|
|
10
|
+
* stacks (Segment/Amplitude/Mixpanel/HubSpot/Hotjar/FullStory/Heap/Mouseflow),
|
|
11
|
+
* session-replay (LogRocket/Crazy Egg/Optimizely/VWO), content-recommendation
|
|
12
|
+
* (Taboola/Outbrain/Criteo), and the consumer-pixel cluster (LinkedIn/Twitter/
|
|
13
|
+
* TikTok/Snap/Pinterest/Reddit).
|
|
14
|
+
*
|
|
15
|
+
* Patterns are CDP-format globs: '*' matches any character run.
|
|
16
|
+
*
|
|
17
|
+
* To extend at runtime, pass connect({ blockUrls: [...] }) — your patterns
|
|
18
|
+
* are merged with this default. To turn the default off entirely, pass
|
|
19
|
+
* { blockAds: false }.
|
|
20
|
+
*/
|
|
21
|
+
export const DEFAULT_BLOCKLIST: string[];
|
package/types/cdp.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Create a CDP client connected to the given WebSocket URL.
|
|
3
|
+
* @param {string} wsUrl - WebSocket URL (ws://127.0.0.1:PORT/devtools/...)
|
|
4
|
+
* @returns {Promise<object>} CDP client ({ send, on, once, session, close })
|
|
5
|
+
*/
|
|
6
|
+
export function createCDP(wsUrl: string): Promise<object>;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Find the first available Chromium binary on the system.
|
|
3
|
+
* @returns {string} Path to the binary
|
|
4
|
+
* @throws {Error} If no Chromium browser is found
|
|
5
|
+
*/
|
|
6
|
+
export function findBrowser(): string;
|
|
7
|
+
/**
|
|
8
|
+
* Launch a Chromium instance with CDP enabled.
|
|
9
|
+
* @param {object} [opts]
|
|
10
|
+
* @param {string} [opts.binary] - Path to browser binary (auto-detected if omitted)
|
|
11
|
+
* @param {number} [opts.port=0] - CDP port (0 = random available port)
|
|
12
|
+
* @param {string} [opts.userDataDir] - Browser profile directory
|
|
13
|
+
* @param {boolean} [opts.headed=false] - Launch in headed mode (with visible window)
|
|
14
|
+
* @param {string} [opts.proxy] - Proxy server (e.g. 'http://host:port')
|
|
15
|
+
* @returns {Promise<{wsUrl: string, process: import('node:child_process').ChildProcess, port: number}>}
|
|
16
|
+
*/
|
|
17
|
+
export function launch(opts?: {
|
|
18
|
+
binary?: string | undefined;
|
|
19
|
+
port?: number | undefined;
|
|
20
|
+
userDataDir?: string | undefined;
|
|
21
|
+
headed?: boolean | undefined;
|
|
22
|
+
proxy?: string | undefined;
|
|
23
|
+
}): Promise<{
|
|
24
|
+
wsUrl: string;
|
|
25
|
+
process: import("node:child_process").ChildProcess;
|
|
26
|
+
port: number;
|
|
27
|
+
}>;
|
|
28
|
+
/**
|
|
29
|
+
* Kill a launched browser and remove its temp profile dir (if we created one).
|
|
30
|
+
* Waits up to 2s for the process to actually exit before unlinking the dir —
|
|
31
|
+
* Chromium can still hold files briefly after SIGTERM, which races rmSync.
|
|
32
|
+
* Safe to call on partially-failed launches or already-dead processes.
|
|
33
|
+
* @returns {Promise<void>}
|
|
34
|
+
*/
|
|
35
|
+
export function cleanupBrowser(browser: any): Promise<void>;
|
|
36
|
+
/**
|
|
37
|
+
* Get the CDP WebSocket URL for a browser already running with --remote-debugging-port.
|
|
38
|
+
* @param {number} port - The debug port
|
|
39
|
+
* @returns {Promise<string>} WebSocket URL
|
|
40
|
+
*/
|
|
41
|
+
export function getDebugUrl(port: number): Promise<string>;
|
|
42
|
+
/**
|
|
43
|
+
* Attach to a Chromium already running with --remote-debugging-port=<port>.
|
|
44
|
+
* Returns the same shape as launch() but with process: null and
|
|
45
|
+
* ownedProfileDir: null — cleanupBrowser() becomes a no-op so we never
|
|
46
|
+
* kill a browser we did not start or remove a profile we do not own.
|
|
47
|
+
* @param {object} opts
|
|
48
|
+
* @param {number} opts.port - The debug port the running browser is listening on
|
|
49
|
+
* @returns {Promise<{wsUrl: string, process: null, port: number, ownedProfileDir: null}>}
|
|
50
|
+
*/
|
|
51
|
+
export function attach({ port }: {
|
|
52
|
+
port: number;
|
|
53
|
+
}): Promise<{
|
|
54
|
+
wsUrl: string;
|
|
55
|
+
process: null;
|
|
56
|
+
port: number;
|
|
57
|
+
ownedProfileDir: null;
|
|
58
|
+
}>;
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Try to dismiss a cookie consent dialog on the current page.
|
|
3
|
+
* Inspects the ARIA tree for dialog elements with consent-related content,
|
|
4
|
+
* then clicks the "accept" button.
|
|
5
|
+
*
|
|
6
|
+
* @param {object} session - Session-scoped CDP handle
|
|
7
|
+
* @returns {Promise<boolean>} true if a consent dialog was dismissed
|
|
8
|
+
*/
|
|
9
|
+
export function dismissConsent(session: object): Promise<boolean>;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Spawn a detached child process that runs the daemon.
|
|
3
|
+
* Parent polls for session.json, then exits.
|
|
4
|
+
*/
|
|
5
|
+
export function startDaemon(opts: any, outputDir: any, initialUrl: any): Promise<any>;
|
|
6
|
+
/**
|
|
7
|
+
* Run the daemon HTTP server. Called by cli.js --daemon-internal.
|
|
8
|
+
* Holds a connect() session and serves commands over HTTP.
|
|
9
|
+
*/
|
|
10
|
+
export function runDaemon(opts: any, outputDir: any, initialUrl: any): Promise<void>;
|
package/types/index.d.ts
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Browse a URL and return an ARIA snapshot.
|
|
3
|
+
* This is the primary API — URL in, agent-ready snapshot out.
|
|
4
|
+
*
|
|
5
|
+
* @param {string} url - The URL to browse
|
|
6
|
+
* @param {object} [opts]
|
|
7
|
+
* @param {'headless'|'headed'|'hybrid'} [opts.mode='headless'] - Browser mode
|
|
8
|
+
* @param {boolean} [opts.cookies=true] - Inject user's cookies (Phase 2)
|
|
9
|
+
* @param {boolean} [opts.prune=true] - Apply ARIA pruning (Phase 2)
|
|
10
|
+
* @param {number} [opts.timeout=30000] - Navigation timeout in ms
|
|
11
|
+
* @param {boolean} [opts.blockAds=true] - Block ~120 common ad/tracker
|
|
12
|
+
* URL patterns via CDP. Shrinks ARIA snapshots and speeds page loads.
|
|
13
|
+
* See src/blocklist.js for the default set. Set false to disable.
|
|
14
|
+
* @param {string[]} [opts.blockUrls] - Extra URL glob patterns to block,
|
|
15
|
+
* merged with the default unless blockAds:false.
|
|
16
|
+
* @param {boolean} [opts.allowLocalUrls=false] - Permit navigation to local-
|
|
17
|
+
* resource schemes (file:, view-source:, chrome:, …). Blocked by default.
|
|
18
|
+
* @param {boolean} [opts.blockPrivateNetwork=false] - Reject navigation to
|
|
19
|
+
* loopback / RFC-1918 / link-local / cloud-metadata hosts (SSRF guard).
|
|
20
|
+
* @param {string} [opts.proxy] - Proxy server (e.g. 'http://host:port').
|
|
21
|
+
* @param {string} [opts.binary] - Path to browser binary (auto-detected if omitted).
|
|
22
|
+
* @param {string} [opts.userDataDir] - Browser profile directory.
|
|
23
|
+
* @param {{width: number, height: number}} [opts.viewport] - Viewport dimensions.
|
|
24
|
+
* @param {string} [opts.browser] - Source browser for cookie extraction.
|
|
25
|
+
* @param {boolean} [opts.consent=true] - Auto-dismiss cookie consent dialogs.
|
|
26
|
+
* @param {'act'|'browse'|'navigate'|'full'|'read'} [opts.pruneMode='act'] - Pruning mode.
|
|
27
|
+
* @returns {Promise<string>} ARIA snapshot text
|
|
28
|
+
*/
|
|
29
|
+
export function browse(url: string, opts?: {
|
|
30
|
+
mode?: "headless" | "headed" | "hybrid" | undefined;
|
|
31
|
+
cookies?: boolean | undefined;
|
|
32
|
+
prune?: boolean | undefined;
|
|
33
|
+
timeout?: number | undefined;
|
|
34
|
+
blockAds?: boolean | undefined;
|
|
35
|
+
blockUrls?: string[] | undefined;
|
|
36
|
+
allowLocalUrls?: boolean | undefined;
|
|
37
|
+
blockPrivateNetwork?: boolean | undefined;
|
|
38
|
+
proxy?: string | undefined;
|
|
39
|
+
binary?: string | undefined;
|
|
40
|
+
userDataDir?: string | undefined;
|
|
41
|
+
viewport?: {
|
|
42
|
+
width: number;
|
|
43
|
+
height: number;
|
|
44
|
+
} | undefined;
|
|
45
|
+
browser?: string | undefined;
|
|
46
|
+
consent?: boolean | undefined;
|
|
47
|
+
pruneMode?: "act" | "browse" | "navigate" | "full" | "read" | undefined;
|
|
48
|
+
}): Promise<string>;
|
|
49
|
+
/**
|
|
50
|
+
* Connect to a browser for a long-lived interactive session.
|
|
51
|
+
*
|
|
52
|
+
* @param {object} [opts]
|
|
53
|
+
* @param {'headless'|'headed'|'hybrid'} [opts.mode='headless'] - Browser mode
|
|
54
|
+
* @param {number} [opts.port] - Attach to an already-running Chromium at this
|
|
55
|
+
* CDP port instead of launching a new one. The browser keeps running on
|
|
56
|
+
* close(); only the tab we created is torn down. Use this to drive a
|
|
57
|
+
* user's logged-in session (start Chromium with --remote-debugging-port=N).
|
|
58
|
+
* @param {string} [opts.downloadPath] - Directory to save downloaded files.
|
|
59
|
+
* Default: a per-session subdirectory under the OS temp dir. Downloads
|
|
60
|
+
* land here as <guid>; check `page.downloads` for { url, suggestedFilename,
|
|
61
|
+
* savedPath, state, totalBytes, receivedBytes } per file.
|
|
62
|
+
* @param {boolean} [opts.blockAds] - Block ~120 common ad/tracker URL
|
|
63
|
+
* patterns via CDP. Defaults to true for launched browsers, false in
|
|
64
|
+
* attach mode (would affect any tab attached to the user's running
|
|
65
|
+
* session). Setting blockAds:true explicitly in attach mode honors the
|
|
66
|
+
* request — blocking applies to whichever tab the session is currently
|
|
67
|
+
* attached to and follows the session across switchTab() until close.
|
|
68
|
+
* @param {string[]} [opts.blockUrls] - Extra URL glob patterns to block,
|
|
69
|
+
* merged with the default unless blockAds is false.
|
|
70
|
+
* @param {boolean} [opts.allowLocalUrls=false] - Permit navigation to local-
|
|
71
|
+
* resource schemes (file:, view-source:, chrome:, …). Blocked by default
|
|
72
|
+
* because a prompt-injected agent could use them to read local files.
|
|
73
|
+
* @param {boolean} [opts.blockPrivateNetwork=false] - Reject navigation to
|
|
74
|
+
* loopback / RFC-1918 / link-local / cloud-metadata hosts (SSRF guard).
|
|
75
|
+
* Off by default so localhost dev-server browsing keeps working.
|
|
76
|
+
* @param {string} [opts.uploadDir] - When set, upload() rejects any file that
|
|
77
|
+
* does not resolve (symlinks included) inside this directory. Sandboxes the
|
|
78
|
+
* agent's file-upload capability. Default: no restriction.
|
|
79
|
+
* @param {string} [opts.proxy] - Proxy server (e.g. 'http://host:port').
|
|
80
|
+
* @param {string} [opts.binary] - Path to browser binary (auto-detected if omitted).
|
|
81
|
+
* @param {string} [opts.userDataDir] - Browser profile directory.
|
|
82
|
+
* @param {{width: number, height: number}} [opts.viewport] - Viewport dimensions.
|
|
83
|
+
* @param {boolean} [opts.consent=true] - Auto-dismiss cookie consent dialogs.
|
|
84
|
+
* @param {string} [opts.storageState] - Path to a storage-state JSON file
|
|
85
|
+
* (cookies + localStorage) to load before navigation.
|
|
86
|
+
* @param {'act'|'browse'|'navigate'|'full'|'read'} [opts.pruneMode='act'] - Pruning mode.
|
|
87
|
+
* @returns {Promise<object>} Page handle with goto, snapshot, close
|
|
88
|
+
*/
|
|
89
|
+
export function connect(opts?: {
|
|
90
|
+
mode?: "headless" | "headed" | "hybrid" | undefined;
|
|
91
|
+
port?: number | undefined;
|
|
92
|
+
downloadPath?: string | undefined;
|
|
93
|
+
blockAds?: boolean | undefined;
|
|
94
|
+
blockUrls?: string[] | undefined;
|
|
95
|
+
allowLocalUrls?: boolean | undefined;
|
|
96
|
+
blockPrivateNetwork?: boolean | undefined;
|
|
97
|
+
uploadDir?: string | undefined;
|
|
98
|
+
proxy?: string | undefined;
|
|
99
|
+
binary?: string | undefined;
|
|
100
|
+
userDataDir?: string | undefined;
|
|
101
|
+
viewport?: {
|
|
102
|
+
width: number;
|
|
103
|
+
height: number;
|
|
104
|
+
} | undefined;
|
|
105
|
+
consent?: boolean | undefined;
|
|
106
|
+
storageState?: string | undefined;
|
|
107
|
+
pruneMode?: "act" | "browse" | "navigate" | "full" | "read" | undefined;
|
|
108
|
+
}): Promise<object>;
|
|
109
|
+
/**
|
|
110
|
+
* Apply Network.setBlockedURLs for ad/tracker blocking on a session.
|
|
111
|
+
* Default list is on; pass blockAds:false to skip it, or blockUrls:[...] with extra patterns to extend it.
|
|
112
|
+
* On failure (legacy Chrome lacking the method) warns once and continues —
|
|
113
|
+
* blocking is an enhancement, not a hard requirement.
|
|
114
|
+
*
|
|
115
|
+
* Exported for unit testing of the warn-once behavior; not part of the public
|
|
116
|
+
* API surface.
|
|
117
|
+
*/
|
|
118
|
+
export function applyBlocklist(session: any, pageOpts: any): Promise<void>;
|
|
119
|
+
/** Test-only: reset the warn-once flag. Not part of the public API. */
|
|
120
|
+
export function _resetBlocklistWarning(): void;
|
|
121
|
+
/**
|
|
122
|
+
* Detect if a page is a bot-challenge page (Cloudflare, hCaptcha, etc.).
|
|
123
|
+
*
|
|
124
|
+
* Pre-H9 this was over-aggressive: `nodeCount < 50` alone fired on any
|
|
125
|
+
* legitimate small page (404s, simple landings, error pages), and generic
|
|
126
|
+
* phrases like "access denied" / "unknown error" / "permission denied"
|
|
127
|
+
* triggered on real HTTP 4xx/5xx pages, kicking hybrid mode into a costly
|
|
128
|
+
* headed fallback for nothing.
|
|
129
|
+
*
|
|
130
|
+
* H9 split: STRONG_PHRASES are essentially-unambiguous challenge UI and
|
|
131
|
+
* fire regardless of page size; WEAK_PHRASES only fire when the page is
|
|
132
|
+
* ALSO tiny (so a legitimate-looking error page with "access denied" in
|
|
133
|
+
* its body doesn't trip the fallback).
|
|
134
|
+
*
|
|
135
|
+
* @param {object} tree - Nested ARIA tree (from buildTree)
|
|
136
|
+
* @param {number} [nodeCount] - Raw CDP node count (from Accessibility.getFullAXTree)
|
|
137
|
+
*/
|
|
138
|
+
export function isChallengePage(tree: object, nodeCount?: number): boolean;
|