barebrowse 0.7.1 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +249 -0
- package/LICENSE +202 -21
- package/NOTICE +8 -0
- package/README.md +39 -10
- package/barebrowse.context.md +45 -18
- package/cli.js +114 -3
- package/mcp-server.js +276 -70
- package/package.json +2 -2
- package/src/bareagent.js +43 -4
- package/src/chromium.js +115 -5
- package/src/consent.js +3 -8
- package/src/daemon.js +13 -0
- package/src/index.js +440 -135
- package/src/network-idle.js +62 -0
- package/src/prune.js +2 -1
- package/src/stealth.js +87 -6
package/src/index.js
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* const snapshot = await browse('https://example.com');
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
-
import { launch } from './chromium.js';
|
|
11
|
+
import { launch, attach, cleanupBrowser } from './chromium.js';
|
|
12
12
|
import { createCDP } from './cdp.js';
|
|
13
13
|
import { formatTree } from './aria.js';
|
|
14
14
|
import { authenticate } from './auth.js';
|
|
@@ -16,6 +16,8 @@ import { prune as pruneTree } from './prune.js';
|
|
|
16
16
|
import { click as cdpClick, type as cdpType, scroll as cdpScroll, press as cdpPress, hover as cdpHover, select as cdpSelect, drag as cdpDrag, upload as cdpUpload } from './interact.js';
|
|
17
17
|
import { dismissConsent } from './consent.js';
|
|
18
18
|
import { applyStealth } from './stealth.js';
|
|
19
|
+
import { waitForNetworkIdle } from './network-idle.js';
|
|
20
|
+
import { join as pathJoin } from 'node:path';
|
|
19
21
|
|
|
20
22
|
/**
|
|
21
23
|
* Browse a URL and return an ARIA snapshot.
|
|
@@ -35,15 +37,18 @@ export async function browse(url, opts = {}) {
|
|
|
35
37
|
|
|
36
38
|
let browser = null;
|
|
37
39
|
let cdp = null;
|
|
40
|
+
// Forward caller-supplied launch knobs (binary, userDataDir, proxy) into
|
|
41
|
+
// every launch() call below, including hybrid-fallback re-launches.
|
|
42
|
+
const launchOpts = { proxy: opts.proxy, binary: opts.binary, userDataDir: opts.userDataDir };
|
|
38
43
|
|
|
39
44
|
try {
|
|
40
45
|
// Step 1: Get a CDP connection
|
|
41
46
|
if (mode === 'headed') {
|
|
42
|
-
browser = await launch({ headed: true
|
|
47
|
+
browser = await launch({ ...launchOpts, headed: true });
|
|
43
48
|
cdp = await createCDP(browser.wsUrl);
|
|
44
49
|
} else {
|
|
45
50
|
// headless or hybrid (start headless)
|
|
46
|
-
browser = await launch(
|
|
51
|
+
browser = await launch(launchOpts);
|
|
47
52
|
cdp = await createCDP(browser.wsUrl);
|
|
48
53
|
}
|
|
49
54
|
|
|
@@ -77,10 +82,10 @@ export async function browse(url, opts = {}) {
|
|
|
77
82
|
if (mode === 'hybrid' && isChallengePage(tree, nodeCount)) {
|
|
78
83
|
await cdp.send('Target.closeTarget', { targetId: page.targetId });
|
|
79
84
|
cdp.close();
|
|
80
|
-
|
|
85
|
+
await cleanupBrowser(browser); browser = null;
|
|
81
86
|
|
|
82
87
|
try {
|
|
83
|
-
browser = await launch({ headed: true
|
|
88
|
+
browser = await launch({ ...launchOpts, headed: true });
|
|
84
89
|
cdp = await createCDP(browser.wsUrl);
|
|
85
90
|
page = await createPage(cdp, false, { viewport: opts.viewport });
|
|
86
91
|
await suppressPermissions(cdp);
|
|
@@ -105,7 +110,11 @@ export async function browse(url, opts = {}) {
|
|
|
105
110
|
snapshot = raw;
|
|
106
111
|
}
|
|
107
112
|
const stats = `url: ${url}\n${raw.length.toLocaleString()} chars → ${snapshot.length.toLocaleString()} chars (${Math.round((1 - snapshot.length / raw.length) * 100)}% pruned)`;
|
|
108
|
-
|
|
113
|
+
const actMode = !opts.pruneMode || opts.pruneMode === 'act';
|
|
114
|
+
const hint = (actMode && raw.length > 5000 && snapshot.length < 500 && snapshot.length < raw.length * 0.05)
|
|
115
|
+
? `hint: act mode dropped most of the page — retry with pruneMode='read' for paragraphs and long text\n`
|
|
116
|
+
: '';
|
|
117
|
+
snapshot = stats + '\n' + hint + snapshot;
|
|
109
118
|
|
|
110
119
|
// Step 7: Clean up
|
|
111
120
|
await cdp.send('Target.closeTarget', { targetId: page.targetId });
|
|
@@ -113,7 +122,7 @@ export async function browse(url, opts = {}) {
|
|
|
113
122
|
return snapshot;
|
|
114
123
|
} finally {
|
|
115
124
|
if (cdp) cdp.close();
|
|
116
|
-
|
|
125
|
+
await cleanupBrowser(browser);
|
|
117
126
|
}
|
|
118
127
|
}
|
|
119
128
|
|
|
@@ -122,28 +131,54 @@ export async function browse(url, opts = {}) {
|
|
|
122
131
|
*
|
|
123
132
|
* @param {object} [opts]
|
|
124
133
|
* @param {'headless'|'headed'|'hybrid'} [opts.mode='headless'] - Browser mode
|
|
134
|
+
* @param {number} [opts.port] - Attach to an already-running Chromium at this
|
|
135
|
+
* CDP port instead of launching a new one. The browser keeps running on
|
|
136
|
+
* close(); only the tab we created is torn down. Use this to drive a
|
|
137
|
+
* user's logged-in session (start Chromium with --remote-debugging-port=N).
|
|
138
|
+
* @param {string} [opts.downloadPath] - Directory to save downloaded files.
|
|
139
|
+
* Default: a per-session subdirectory under the OS temp dir. Downloads
|
|
140
|
+
* land here as <guid>; check `page.downloads` for { url, suggestedFilename,
|
|
141
|
+
* savedPath, state, totalBytes, receivedBytes } per file.
|
|
125
142
|
* @returns {Promise<object>} Page handle with goto, snapshot, close
|
|
126
143
|
*/
|
|
127
144
|
export async function connect(opts = {}) {
|
|
128
145
|
const mode = opts.mode || 'headless';
|
|
146
|
+
const attachMode = !!opts.port;
|
|
129
147
|
let browser = null;
|
|
130
148
|
let cdp;
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
149
|
+
// Forward caller-supplied launch knobs into every launch() below,
|
|
150
|
+
// including hybrid-fallback re-launches inside goto().
|
|
151
|
+
const launchOpts = { proxy: opts.proxy, binary: opts.binary, userDataDir: opts.userDataDir };
|
|
152
|
+
|
|
153
|
+
if (attachMode) {
|
|
154
|
+
// Reuse the user's running browser — do not launch, do not own the
|
|
155
|
+
// profile. cleanupBrowser() is a no-op on this shape (process: null,
|
|
156
|
+
// ownedProfileDir: null), which is the whole point.
|
|
157
|
+
browser = await attach({ port: opts.port });
|
|
158
|
+
cdp = await createCDP(browser.wsUrl);
|
|
159
|
+
} else if (mode === 'headed') {
|
|
160
|
+
browser = await launch({ ...launchOpts, headed: true });
|
|
134
161
|
cdp = await createCDP(browser.wsUrl);
|
|
135
162
|
} else {
|
|
136
|
-
browser = await launch(
|
|
163
|
+
browser = await launch(launchOpts);
|
|
137
164
|
cdp = await createCDP(browser.wsUrl);
|
|
138
165
|
}
|
|
139
166
|
|
|
140
|
-
|
|
167
|
+
// In attach mode we don't know (and shouldn't assume) the user's headed/
|
|
168
|
+
// headless state — treat it as headed so stealth patches are skipped
|
|
169
|
+
// (they'd persist in the user's session via addScriptToEvaluateOnNewDocument)
|
|
170
|
+
// and the headed→headless rewind in goto() is gated off below.
|
|
171
|
+
let currentlyHeaded = attachMode || (mode === 'headed');
|
|
141
172
|
let page = await createPage(cdp, !currentlyHeaded, { viewport: opts.viewport });
|
|
142
173
|
let refMap = new Map();
|
|
143
174
|
let botBlocked = false;
|
|
144
175
|
|
|
145
|
-
// Suppress permission prompts
|
|
146
|
-
|
|
176
|
+
// Suppress permission prompts. Skipped in attach mode — Browser.setPermission
|
|
177
|
+
// is browser-wide (no origin scope here), so flipping permissions to denied
|
|
178
|
+
// would leak into the user's other tabs.
|
|
179
|
+
if (!attachMode) {
|
|
180
|
+
await suppressPermissions(cdp);
|
|
181
|
+
}
|
|
147
182
|
|
|
148
183
|
// Load storage state (cookies + localStorage) from file
|
|
149
184
|
if (opts.storageState) {
|
|
@@ -156,8 +191,72 @@ export async function connect(opts = {}) {
|
|
|
156
191
|
} catch { /* file not found or invalid — continue without */ }
|
|
157
192
|
}
|
|
158
193
|
|
|
159
|
-
//
|
|
194
|
+
// Download tracking — wire Browser.setDownloadBehavior so files actually
|
|
195
|
+
// land on disk (default Chromium would route them to ~/Downloads or
|
|
196
|
+
// nowhere useful in headless), and listen for downloadWillBegin /
|
|
197
|
+
// downloadProgress so callers can read `page.downloads` to know what
|
|
198
|
+
// arrived. In attach mode we don't change the user's running browser's
|
|
199
|
+
// download dir — they almost certainly have an existing preference.
|
|
200
|
+
const downloads = [];
|
|
201
|
+
let ownedDownloadDir = null;
|
|
202
|
+
if (!attachMode) {
|
|
203
|
+
let downloadPath = opts.downloadPath;
|
|
204
|
+
if (!downloadPath) {
|
|
205
|
+
const { mkdtempSync } = await import('node:fs');
|
|
206
|
+
const { tmpdir } = await import('node:os');
|
|
207
|
+
ownedDownloadDir = mkdtempSync(pathJoin(tmpdir(), 'barebrowse-dl-'));
|
|
208
|
+
downloadPath = ownedDownloadDir;
|
|
209
|
+
}
|
|
210
|
+
// Register listeners BEFORE sending setDownloadBehavior so no
|
|
211
|
+
// downloadWillBegin / downloadProgress event can fire into a session
|
|
212
|
+
// without subscribers — about:blank can't initiate a download so the
|
|
213
|
+
// window is microscopic in practice, but ordering it correctly costs
|
|
214
|
+
// nothing.
|
|
215
|
+
cdp.on('Browser.downloadWillBegin', (params) => {
|
|
216
|
+
downloads.push({
|
|
217
|
+
guid: params.guid,
|
|
218
|
+
url: params.url,
|
|
219
|
+
suggestedFilename: params.suggestedFilename,
|
|
220
|
+
savedPath: pathJoin(downloadPath, params.guid),
|
|
221
|
+
state: 'inProgress',
|
|
222
|
+
totalBytes: 0,
|
|
223
|
+
receivedBytes: 0,
|
|
224
|
+
});
|
|
225
|
+
});
|
|
226
|
+
cdp.on('Browser.downloadProgress', (params) => {
|
|
227
|
+
const d = downloads.find((x) => x.guid === params.guid);
|
|
228
|
+
if (!d) return;
|
|
229
|
+
d.state = params.state; // 'inProgress' | 'completed' | 'canceled'
|
|
230
|
+
d.totalBytes = params.totalBytes;
|
|
231
|
+
d.receivedBytes = params.receivedBytes;
|
|
232
|
+
});
|
|
233
|
+
try {
|
|
234
|
+
// 'allowAndName' names saved files by guid for a stable, predictable
|
|
235
|
+
// path; the suggested filename is still surfaced on the download record.
|
|
236
|
+
await cdp.send('Browser.setDownloadBehavior', {
|
|
237
|
+
behavior: 'allowAndName', downloadPath, eventsEnabled: true,
|
|
238
|
+
});
|
|
239
|
+
} catch {
|
|
240
|
+
// Older Chrome may not accept 'allowAndName' — fall back to 'allow'
|
|
241
|
+
// which uses the suggested filename verbatim (no GUID).
|
|
242
|
+
try {
|
|
243
|
+
await cdp.send('Browser.setDownloadBehavior', {
|
|
244
|
+
behavior: 'allow', downloadPath, eventsEnabled: true,
|
|
245
|
+
});
|
|
246
|
+
} catch {
|
|
247
|
+
// Download capture unavailable on this Chrome — downloads still
|
|
248
|
+
// happen, we just can't observe them. page.downloads stays empty.
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// JS dialog handling (alert, confirm, prompt, beforeunload). Default is
|
|
254
|
+
// auto-accept everything except beforeunload (auto-dismiss). The caller
|
|
255
|
+
// can install a custom decision via page.onDialog(handler) — the handler
|
|
256
|
+
// gets { type, message, defaultPrompt } and may return
|
|
257
|
+
// { accept: bool, promptText: string } to override.
|
|
160
258
|
const dialogLog = [];
|
|
259
|
+
let onDialogHandler = null;
|
|
161
260
|
function setupDialogHandler(session) {
|
|
162
261
|
session.on('Page.javascriptDialogOpening', async (params) => {
|
|
163
262
|
dialogLog.push({
|
|
@@ -165,23 +264,45 @@ export async function connect(opts = {}) {
|
|
|
165
264
|
message: params.message,
|
|
166
265
|
timestamp: new Date().toISOString(),
|
|
167
266
|
});
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
267
|
+
let accept = params.type !== 'beforeunload';
|
|
268
|
+
let promptText = params.defaultPrompt || '';
|
|
269
|
+
if (onDialogHandler) {
|
|
270
|
+
try {
|
|
271
|
+
const decision = await onDialogHandler({
|
|
272
|
+
type: params.type,
|
|
273
|
+
message: params.message,
|
|
274
|
+
defaultPrompt: params.defaultPrompt || '',
|
|
275
|
+
});
|
|
276
|
+
if (decision && typeof decision === 'object') {
|
|
277
|
+
if (typeof decision.accept === 'boolean') accept = decision.accept;
|
|
278
|
+
if (typeof decision.promptText === 'string') promptText = decision.promptText;
|
|
279
|
+
}
|
|
280
|
+
} catch {
|
|
281
|
+
// Handler threw — fall back to defaults so the page doesn't hang
|
|
282
|
+
// waiting for a never-arriving handleJavaScriptDialog reply.
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
await session.send('Page.handleJavaScriptDialog', { accept, promptText });
|
|
172
286
|
});
|
|
173
287
|
}
|
|
174
288
|
setupDialogHandler(page.session);
|
|
175
289
|
|
|
176
290
|
return {
|
|
177
291
|
async goto(url, timeout = 30000) {
|
|
178
|
-
//
|
|
179
|
-
|
|
292
|
+
// Refs from the previous page are about to become invalid — clear
|
|
293
|
+
// before navigating so a stale click(ref) errors clearly instead of
|
|
294
|
+
// silently resolving to whatever backendNodeId happens to still be in
|
|
295
|
+
// the map.
|
|
296
|
+
refMap = new Map();
|
|
297
|
+
// Switch back to headless if we fell back to headed previously.
|
|
298
|
+
// Not in attach mode — we never own the browser there, so there's
|
|
299
|
+
// nothing to rewind.
|
|
300
|
+
if (currentlyHeaded && mode === 'hybrid' && !attachMode) {
|
|
180
301
|
await cdp.send('Target.closeTarget', { targetId: page.targetId });
|
|
181
302
|
cdp.close();
|
|
182
|
-
|
|
303
|
+
await cleanupBrowser(browser); browser = null;
|
|
183
304
|
|
|
184
|
-
browser = await launch(
|
|
305
|
+
browser = await launch(launchOpts);
|
|
185
306
|
cdp = await createCDP(browser.wsUrl);
|
|
186
307
|
page = await createPage(cdp, true, { viewport: opts.viewport });
|
|
187
308
|
setupDialogHandler(page.session);
|
|
@@ -198,14 +319,16 @@ export async function connect(opts = {}) {
|
|
|
198
319
|
const { tree, nodeCount } = await ariaTree(page);
|
|
199
320
|
botBlocked = isChallengePage(tree, nodeCount);
|
|
200
321
|
|
|
201
|
-
// Hybrid fallback: if bot-blocked, retry with headed browser
|
|
202
|
-
|
|
322
|
+
// Hybrid fallback: if bot-blocked, retry with headed browser.
|
|
323
|
+
// Suppressed in attach mode — we can't tear down the user's running
|
|
324
|
+
// browser and we don't know what mode they started it in.
|
|
325
|
+
if (botBlocked && mode === 'hybrid' && !attachMode) {
|
|
203
326
|
await cdp.send('Target.closeTarget', { targetId: page.targetId });
|
|
204
327
|
cdp.close();
|
|
205
|
-
|
|
328
|
+
await cleanupBrowser(browser); browser = null;
|
|
206
329
|
|
|
207
330
|
try {
|
|
208
|
-
browser = await launch({ headed: true
|
|
331
|
+
browser = await launch({ ...launchOpts, headed: true });
|
|
209
332
|
cdp = await createCDP(browser.wsUrl);
|
|
210
333
|
page = await createPage(cdp, false, { viewport: opts.viewport });
|
|
211
334
|
setupDialogHandler(page.session);
|
|
@@ -226,15 +349,29 @@ export async function connect(opts = {}) {
|
|
|
226
349
|
async goBack() {
|
|
227
350
|
const { currentIndex, entries } = await page.session.send('Page.getNavigationHistory');
|
|
228
351
|
if (currentIndex <= 0) throw new Error('No previous page in history');
|
|
352
|
+
const loadPromise = page.session.once('Page.loadEventFired', 30000);
|
|
229
353
|
await page.session.send('Page.navigateToHistoryEntry', { entryId: entries[currentIndex - 1].id });
|
|
230
|
-
await new Promise((r) => setTimeout(r, 500));
|
|
354
|
+
try { await loadPromise; } catch { await new Promise((r) => setTimeout(r, 500)); }
|
|
355
|
+
refMap = new Map(); // refs from the previous page are now invalid
|
|
231
356
|
},
|
|
232
357
|
|
|
233
358
|
async goForward() {
|
|
234
359
|
const { currentIndex, entries } = await page.session.send('Page.getNavigationHistory');
|
|
235
360
|
if (currentIndex >= entries.length - 1) throw new Error('No next page in history');
|
|
361
|
+
const loadPromise = page.session.once('Page.loadEventFired', 30000);
|
|
236
362
|
await page.session.send('Page.navigateToHistoryEntry', { entryId: entries[currentIndex + 1].id });
|
|
237
|
-
await new Promise((r) => setTimeout(r, 500));
|
|
363
|
+
try { await loadPromise; } catch { await new Promise((r) => setTimeout(r, 500)); }
|
|
364
|
+
refMap = new Map();
|
|
365
|
+
},
|
|
366
|
+
|
|
367
|
+
async reload(reloadOpts = {}) {
|
|
368
|
+
const timeout = reloadOpts.timeout || 30000;
|
|
369
|
+
const loadPromise = page.session.once('Page.loadEventFired', timeout);
|
|
370
|
+
await page.session.send('Page.reload', {
|
|
371
|
+
ignoreCache: !!reloadOpts.ignoreCache,
|
|
372
|
+
});
|
|
373
|
+
try { await loadPromise; } catch { await new Promise((r) => setTimeout(r, 500)); }
|
|
374
|
+
refMap = new Map(); // refs from the pre-reload page are invalid
|
|
238
375
|
},
|
|
239
376
|
|
|
240
377
|
async injectCookies(url, cookieOpts) {
|
|
@@ -249,22 +386,26 @@ export async function connect(opts = {}) {
|
|
|
249
386
|
const pageUrl = entries[currentIndex]?.url || '';
|
|
250
387
|
const warn = botBlocked ? '[BOT CHALLENGE DETECTED — page content may be incomplete or blocked]\n' : '';
|
|
251
388
|
if (pruneOpts === false) return `url: ${pageUrl}\n` + warn + raw;
|
|
252
|
-
const
|
|
389
|
+
const mode = pruneOpts?.mode || 'act';
|
|
390
|
+
const pruned = pruneTree(result.tree, { mode });
|
|
253
391
|
const out = formatTree(pruned);
|
|
254
392
|
const stats = `url: ${pageUrl}\n${raw.length.toLocaleString()} chars → ${out.length.toLocaleString()} chars (${Math.round((1 - out.length / raw.length) * 100)}% pruned)`;
|
|
255
|
-
|
|
393
|
+
const hint = (mode === 'act' && raw.length > 5000 && out.length < 500 && out.length < raw.length * 0.05)
|
|
394
|
+
? `hint: act mode dropped most of the page — retry with pruneMode='read' for paragraphs and long text\n`
|
|
395
|
+
: '';
|
|
396
|
+
return stats + '\n' + hint + warn + out;
|
|
256
397
|
},
|
|
257
398
|
|
|
258
399
|
async click(ref) {
|
|
259
|
-
const
|
|
260
|
-
if (!
|
|
261
|
-
await cdpClick(
|
|
400
|
+
const entry = refMap.get(ref);
|
|
401
|
+
if (!entry) throw new Error(`No element found for ref "${ref}"`);
|
|
402
|
+
await cdpClick(entry.session, entry.backendNodeId);
|
|
262
403
|
},
|
|
263
404
|
|
|
264
405
|
async type(ref, text, typeOpts) {
|
|
265
|
-
const
|
|
266
|
-
if (!
|
|
267
|
-
await cdpType(
|
|
406
|
+
const entry = refMap.get(ref);
|
|
407
|
+
if (!entry) throw new Error(`No element found for ref "${ref}"`);
|
|
408
|
+
await cdpType(entry.session, entry.backendNodeId, text, typeOpts);
|
|
268
409
|
},
|
|
269
410
|
|
|
270
411
|
async scroll(deltaY) {
|
|
@@ -276,29 +417,34 @@ export async function connect(opts = {}) {
|
|
|
276
417
|
},
|
|
277
418
|
|
|
278
419
|
async hover(ref) {
|
|
279
|
-
const
|
|
280
|
-
if (!
|
|
281
|
-
await cdpHover(
|
|
420
|
+
const entry = refMap.get(ref);
|
|
421
|
+
if (!entry) throw new Error(`No element found for ref "${ref}"`);
|
|
422
|
+
await cdpHover(entry.session, entry.backendNodeId);
|
|
282
423
|
},
|
|
283
424
|
|
|
284
425
|
async select(ref, value) {
|
|
285
|
-
const
|
|
286
|
-
if (!
|
|
287
|
-
await cdpSelect(
|
|
426
|
+
const entry = refMap.get(ref);
|
|
427
|
+
if (!entry) throw new Error(`No element found for ref "${ref}"`);
|
|
428
|
+
await cdpSelect(entry.session, entry.backendNodeId, value);
|
|
288
429
|
},
|
|
289
430
|
|
|
290
431
|
async drag(fromRef, toRef) {
|
|
291
|
-
const
|
|
292
|
-
const
|
|
293
|
-
if (!
|
|
294
|
-
if (!
|
|
295
|
-
|
|
432
|
+
const from = refMap.get(fromRef);
|
|
433
|
+
const to = refMap.get(toRef);
|
|
434
|
+
if (!from) throw new Error(`No element found for ref "${fromRef}"`);
|
|
435
|
+
if (!to) throw new Error(`No element found for ref "${toRef}"`);
|
|
436
|
+
// Drag across different frames isn't physically meaningful — bail
|
|
437
|
+
// rather than mix sessions and produce nonsense coordinates.
|
|
438
|
+
if (from.session !== to.session) {
|
|
439
|
+
throw new Error('drag() between elements in different frames is not supported');
|
|
440
|
+
}
|
|
441
|
+
await cdpDrag(from.session, from.backendNodeId, to.backendNodeId);
|
|
296
442
|
},
|
|
297
443
|
|
|
298
444
|
async upload(ref, files) {
|
|
299
|
-
const
|
|
300
|
-
if (!
|
|
301
|
-
await cdpUpload(
|
|
445
|
+
const entry = refMap.get(ref);
|
|
446
|
+
if (!entry) throw new Error(`No element found for ref "${ref}"`);
|
|
447
|
+
await cdpUpload(entry.session, entry.backendNodeId, files);
|
|
302
448
|
},
|
|
303
449
|
|
|
304
450
|
async pdf(pdfOpts = {}) {
|
|
@@ -320,7 +466,17 @@ export async function connect(opts = {}) {
|
|
|
320
466
|
const { targetInfos } = await cdp.send('Target.getTargets');
|
|
321
467
|
const pages = targetInfos.filter((t) => t.type === 'page');
|
|
322
468
|
if (index < 0 || index >= pages.length) throw new Error(`Tab index ${index} out of range (0-${pages.length - 1})`);
|
|
323
|
-
|
|
469
|
+
const target = pages[index];
|
|
470
|
+
await cdp.send('Target.activateTarget', { targetId: target.targetId });
|
|
471
|
+
if (target.targetId === page.targetId) return; // already on this tab
|
|
472
|
+
// Detach from old session, attach to new — the page variable is the
|
|
473
|
+
// closure handle used by every method below, so swapping it makes
|
|
474
|
+
// snapshot/click/type/etc. operate on the new tab.
|
|
475
|
+
const oldSessionId = page.sessionId;
|
|
476
|
+
page = await attachToExistingTarget(cdp, target.targetId);
|
|
477
|
+
refMap = new Map(); // refs from the previous tab are no longer valid
|
|
478
|
+
setupDialogHandler(page.session);
|
|
479
|
+
try { await cdp.send('Target.detachFromTarget', { sessionId: oldSessionId }); } catch {}
|
|
324
480
|
},
|
|
325
481
|
|
|
326
482
|
async waitFor(waitOpts = {}) {
|
|
@@ -363,6 +519,18 @@ export async function connect(opts = {}) {
|
|
|
363
519
|
|
|
364
520
|
dialogLog,
|
|
365
521
|
|
|
522
|
+
/**
|
|
523
|
+
* Install a custom JS dialog handler. The handler is called with
|
|
524
|
+
* `{ type, message, defaultPrompt }` and may return (sync or async)
|
|
525
|
+
* `{ accept: bool, promptText: string }` to override the auto-accept
|
|
526
|
+
* default. Pass null to restore the default behavior.
|
|
527
|
+
*/
|
|
528
|
+
onDialog(handler) {
|
|
529
|
+
onDialogHandler = handler;
|
|
530
|
+
},
|
|
531
|
+
|
|
532
|
+
downloads,
|
|
533
|
+
|
|
366
534
|
async screenshot(screenshotOpts = {}) {
|
|
367
535
|
const format = screenshotOpts.format || 'png';
|
|
368
536
|
const params = { format };
|
|
@@ -389,12 +557,13 @@ export async function connect(opts = {}) {
|
|
|
389
557
|
return waitForNetworkIdle(page.session, idleOpts);
|
|
390
558
|
},
|
|
391
559
|
|
|
392
|
-
/** Raw CDP session for escape hatch */
|
|
393
|
-
cdp
|
|
560
|
+
/** Raw CDP session for escape hatch — getter so it survives hybrid fallback / tab swaps */
|
|
561
|
+
get cdp() { return page.session; },
|
|
394
562
|
|
|
395
563
|
async createTab() {
|
|
396
564
|
const tab = await createPage(cdp, !currentlyHeaded, { viewport: opts.viewport });
|
|
397
565
|
await suppressPermissions(cdp);
|
|
566
|
+
setupDialogHandler(tab.session);
|
|
398
567
|
let tabBotBlocked = false;
|
|
399
568
|
return {
|
|
400
569
|
async goto(url, timeout = 30000) {
|
|
@@ -422,7 +591,15 @@ export async function connect(opts = {}) {
|
|
|
422
591
|
async close() {
|
|
423
592
|
await cdp.send('Target.closeTarget', { targetId: page.targetId });
|
|
424
593
|
cdp.close();
|
|
425
|
-
|
|
594
|
+
await cleanupBrowser(browser);
|
|
595
|
+
// If we created the download dir ourselves, clean it up too. Caller-
|
|
596
|
+
// supplied opts.downloadPath stays — the caller owns the lifecycle.
|
|
597
|
+
if (ownedDownloadDir) {
|
|
598
|
+
try {
|
|
599
|
+
const { rmSync } = await import('node:fs');
|
|
600
|
+
rmSync(ownedDownloadDir, { recursive: true, force: true });
|
|
601
|
+
} catch {}
|
|
602
|
+
}
|
|
426
603
|
},
|
|
427
604
|
};
|
|
428
605
|
}
|
|
@@ -486,7 +663,69 @@ async function createPage(cdp, stealth = false, pageOpts = {}) {
|
|
|
486
663
|
}
|
|
487
664
|
}
|
|
488
665
|
|
|
489
|
-
|
|
666
|
+
// Track child frame sessions (OOPIF) so ariaTree() can read across frame
|
|
667
|
+
// boundaries. Same-origin iframes don't get their own session and stay
|
|
668
|
+
// queryable via the main session with a frameId param — see ariaTree().
|
|
669
|
+
const framesByFrameId = await attachFrameTracking(cdp, session);
|
|
670
|
+
|
|
671
|
+
return { session, targetId, sessionId, framesByFrameId };
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
/**
 * Wire Target.setAutoAttach on a page session so every OOPIF child target gets
 * its own CDP session, enabled and registered. Returns a live Map<frameId,
 * { session, sessionId, targetId }> that updates as frames attach/detach.
 *
 * Attach/detach listeners are installed on the main session AND on every
 * child session that gets registered. Each child is itself told to
 * auto-attach, so the notifications for its nested iframes are expected to
 * arrive on the child session rather than on the main one; without a listener
 * there, nested frames would never be recorded.
 * NOTE(review): this assumes `session.on()` only sees events addressed to that
 * session — confirm against cdp.js. If events are broadcast instead, the
 * duplicate-delivery guard below keeps the behavior correct.
 *
 * @param {object} cdp - CDP connection; only `cdp.session(sessionId)` is used.
 * @param {object} mainSession - Page session exposing `on(event, handler)` and
 *   `send(method, params)`.
 * @returns {Promise<Map<string, {session: object, sessionId: string, targetId: string}>>}
 *   Live map keyed by frameId.
 */
async function attachFrameTracking(cdp, mainSession) {
  const framesByFrameId = new Map();
  const autoAttachParams = { autoAttach: true, flatten: true, waitForDebuggerOnStart: false };

  // Drop the (single) entry that belongs to the detached session, if any.
  const forgetSession = (params) => {
    for (const [frameId, entry] of framesByFrameId) {
      if (entry.sessionId === params.sessionId) {
        framesByFrameId.delete(frameId);
        return;
      }
    }
  };

  // Install attach/detach listeners on one session. Called for the main
  // session and, recursively, for every iframe session that shows up.
  const watchSession = (session) => {
    session.on('Target.attachedToTarget', async (params) => {
      if (params.targetInfo?.type !== 'iframe') return;
      const childSessionId = params.sessionId;
      // For OOPIF, targetId === frameId — see CDP Target domain docs.
      const frameId = params.targetInfo.targetId;
      // The same attachment may be reported through more than one listener;
      // register (and enable) each child session only once.
      if (framesByFrameId.get(frameId)?.sessionId === childSessionId) return;

      const childSession = cdp.session(childSessionId);
      framesByFrameId.set(frameId, { session: childSession, sessionId: childSessionId, targetId: frameId });
      // Listen before enabling auto-attach on the child so no nested
      // attachedToTarget event can fire into a session without subscribers.
      watchSession(childSession);

      // Enable domains on the child so we can read its AX tree. Each step is
      // best-effort: a frame that goes away mid-setup must not reject the
      // event handler.
      try { await childSession.send('Page.enable'); } catch {}
      try { await childSession.send('DOM.enable'); } catch {}
      // Recursively auto-attach so nested OOPIF iframes also get sessions.
      try { await childSession.send('Target.setAutoAttach', autoAttachParams); } catch {}
      try { await childSession.send('Runtime.runIfWaitingForDebugger'); } catch {}
    });

    session.on('Target.detachedFromTarget', forgetSession);
  };

  watchSession(mainSession);
  await mainSession.send('Target.setAutoAttach', autoAttachParams);

  return framesByFrameId;
}
|
|
716
|
+
|
|
717
|
+
/**
 * Attach a CDP session to an existing target (e.g. a tab opened by window.open).
 * Enables Page, Network and DOM on the new session and wires frame tracking,
 * returning the same { session, targetId, sessionId, framesByFrameId } shape
 * that the rest of this module reads from a page handle.
 *
 * If any setup step fails after the attach succeeded, the freshly attached
 * session is detached again before the error is rethrown, so a failed tab
 * switch does not leave an orphaned session on the connection.
 *
 * @param {object} cdp - CDP connection exposing `send(method, params)` and
 *   `session(sessionId)`.
 * @param {string} targetId - Id of the target to attach to.
 * @returns {Promise<{session: object, targetId: string, sessionId: string, framesByFrameId: Map}>}
 * @throws Rethrows whatever the attach or any setup command rejected with.
 */
async function attachToExistingTarget(cdp, targetId) {
  const { sessionId } = await cdp.send('Target.attachToTarget', { targetId, flatten: true });
  const session = cdp.session(sessionId);

  try {
    await session.send('Page.enable');
    await session.send('Network.enable');
    await session.send('DOM.enable');
    const framesByFrameId = await attachFrameTracking(cdp, session);
    return { session, targetId, sessionId, framesByFrameId };
  } catch (err) {
    // Best-effort release of the half-initialized session; the original
    // failure is what the caller needs to see, so a detach error is ignored.
    try {
      await cdp.send('Target.detachFromTarget', { sessionId });
    } catch { /* target already gone — nothing left to release */ }
    throw err;
  }
}
|
|
491
730
|
|
|
492
731
|
/**
|
|
@@ -502,37 +741,111 @@ async function navigate(page, url, timeout = 30000) {
|
|
|
502
741
|
|
|
503
742
|
/**
 * Get the ARIA accessibility tree for a page as a nested object.
 *
 * Walks every frame (main + iframes) via Page.getFrameTree, queries each
 * frame's AX tree on the right session, and splices child frame trees under
 * their iframe placeholders in the parent. Refs are assigned by a flat global
 * counter so click/type/etc can resolve the right session without the agent
 * having to think about frames at all.
 *
 * Session selection per frame:
 *   - a frame with its own entry in `page.framesByFrameId` (OOPIF) is queried
 *     on that child session with no frameId param;
 *   - any other frame is queried on its PARENT frame's session, scoped by
 *     frameId. For frames under the main document that is the main session;
 *     for frames nested inside an OOPIF it is the OOPIF's session, since the
 *     main session cannot be expected to resolve a frameId it does not host.
 *     NOTE(review): whether Page.getFrameTree on the main session lists such
 *     nested frames depends on Chrome — confirm; if it never does, this is
 *     simply equivalent to always using the main session.
 *
 * @param {object} page - Page handle; reads `page.session` and the optional
 *   `page.framesByFrameId` map.
 * @returns {Promise<{tree: object|null, refMap: Map, nodeCount: number}>}
 *   `tree` is the main frame's tree (null when it could not be built),
 *   `refMap` is whatever buildTree() recorded for each ref, and `nodeCount`
 *   is the total number of AX nodes fetched across all frames.
 */
async function ariaTree(page) {
  const main = page.session;
  await main.send('Accessibility.enable');

  // 1. Linearize the frame tree depth-first: index 0 is the main frame, and
  //    every parent precedes its children.
  const { frameTree } = await main.send('Page.getFrameTree');
  const frames = [];
  (function walk(node, parentId) {
    frames.push({ frame: node.frame, parentId });
    for (const child of node.childFrames || []) walk(child, node.frame.id);
  })(frameTree, null);

  // 2. For each frame, fetch its AX nodes and build a tree. refCounter is
  //    shared across all frames in one snapshot so refs stay flat integers.
  const refMap = new Map();
  const treesByFrameId = new Map();
  const sessionByFrameId = new Map();
  const refCounter = { value: 1 };
  let totalNodes = 0;

  for (const { frame, parentId } of frames) {
    const childEntry = page.framesByFrameId?.get(frame.id);
    // Parents are visited first, so the parent's session is already recorded.
    const frameSession = childEntry?.session ?? sessionByFrameId.get(parentId) ?? main;
    sessionByFrameId.set(frame.id, frameSession);

    let nodes = [];
    try {
      if (childEntry) {
        // OOPIF — use the child session, no frameId param needed.
        try { await frameSession.send('Accessibility.enable'); } catch {}
        const res = await frameSession.send('Accessibility.getFullAXTree');
        nodes = res.nodes;
      } else {
        // Root frame, or a child sharing its parent's session — scope by
        // frameId for children (without it only the session's top frame is
        // returned, dropping the iframe's content).
        const params = parentId === null ? {} : { frameId: frame.id };
        const res = await frameSession.send('Accessibility.getFullAXTree', params);
        nodes = res.nodes;
      }
    } catch {
      // Frame may have navigated mid-snapshot — skip it rather than fail the
      // whole snapshot. Its iframe placeholder simply keeps no spliced child.
      continue;
    }

    totalNodes += nodes.length;
    const tree = buildTree(nodes, frameSession, refMap, refCounter);
    if (tree) treesByFrameId.set(frame.id, tree);
  }

  // 3. Splice each child frame's tree under its iframe placeholder node in
  //    the parent. DOM.getFrameOwner gives the iframe element's backendNodeId
  //    in the parent's view; we match it against the parent's tree.
  for (const { frame, parentId } of frames) {
    if (parentId === null) continue;
    const parentTree = treesByFrameId.get(parentId);
    const childTree = treesByFrameId.get(frame.id);
    if (!parentTree || !childTree) continue;
    const parentSession = sessionByFrameId.get(parentId);
    try {
      const { backendNodeId } = await parentSession.send('DOM.getFrameOwner', { frameId: frame.id });
      const placeholder = findNodeByBackend(parentTree, backendNodeId);
      if (placeholder) placeholder.children = [childTree];
    } catch {
      // Frame owner lookup failed — leave the iframe placeholder as-is.
    }
  }

  const root = treesByFrameId.get(frames[0].frame.id) || null;
  return { tree: root, refMap, nodeCount: totalNodes };
}
|
|
521
829
|
|
|
522
830
|
/**
|
|
523
|
-
* Transform CDP's flat AXNode array into a nested tree.
|
|
831
|
+
* Transform CDP's flat AXNode array into a nested tree. Every tree node gets
|
|
832
|
+
* a globally unique flat ref string from `refCounter` (shared across all
|
|
833
|
+
* frames in one snapshot), and refMap is populated with ref → { session,
|
|
834
|
+
* backendNodeId } so click/type can route to the right CDP session even when
|
|
835
|
+
* the element lives in an iframe.
|
|
524
836
|
* CDP nodes have parentId — we use that exclusively to avoid double-linking.
|
|
525
837
|
*/
|
|
526
|
-
function buildTree(nodes) {
|
|
838
|
+
function buildTree(nodes, session, refMap, refCounter) {
|
|
527
839
|
if (!nodes || nodes.length === 0) return null;
|
|
528
840
|
|
|
529
841
|
const nodeMap = new Map();
|
|
530
|
-
const linked = new Set();
|
|
842
|
+
const linked = new Set();
|
|
531
843
|
|
|
532
|
-
// First pass: create tree nodes
|
|
844
|
+
// First pass: create tree nodes + populate refMap with flat global refs
|
|
533
845
|
for (const node of nodes) {
|
|
846
|
+
const ref = String(refCounter.value++);
|
|
534
847
|
nodeMap.set(node.nodeId, {
|
|
535
|
-
nodeId:
|
|
848
|
+
nodeId: ref,
|
|
536
849
|
backendDOMNodeId: node.backendDOMNodeId,
|
|
537
850
|
role: node.role?.value || '',
|
|
538
851
|
name: node.name?.value || '',
|
|
@@ -540,6 +853,9 @@ function buildTree(nodes) {
|
|
|
540
853
|
ignored: node.ignored || false,
|
|
541
854
|
children: [],
|
|
542
855
|
});
|
|
856
|
+
if (node.backendDOMNodeId && refMap) {
|
|
857
|
+
refMap.set(ref, { session, backendNodeId: node.backendDOMNodeId });
|
|
858
|
+
}
|
|
543
859
|
}
|
|
544
860
|
|
|
545
861
|
// Second pass: link via parentId only (avoids duplicates from childIds)
|
|
@@ -560,6 +876,16 @@ function buildTree(nodes) {
|
|
|
560
876
|
return root;
|
|
561
877
|
}
|
|
562
878
|
|
|
879
|
+
/**
 * Locate the first node (pre-order, depth-first) in a nested tree whose
 * `backendDOMNodeId` strictly equals `backendNodeId`.
 *
 * @param {object|null} node - Root of the (sub)tree to search; nodes may carry
 *   `backendDOMNodeId` and a `children` array.
 * @param {number} backendNodeId - Backend DOM node id to look for.
 * @returns {object|null} The matching tree node, or null when there is no
 *   match, the tree is empty, or no id was supplied.
 */
function findNodeByBackend(node, backendNodeId) {
  // Without this guard an undefined/null id would "match" the first node that
  // simply has no backendDOMNodeId, and the caller would attach a subtree
  // under an unrelated node.
  if (!node || backendNodeId == null) return null;

  // Explicit stack instead of recursion so deep trees cannot exhaust the call
  // stack. Children are pushed in reverse so they pop in document order,
  // which keeps the pre-order "first match wins" semantics.
  const stack = [node];
  while (stack.length > 0) {
    const current = stack.pop();
    if (!current) continue;
    if (current.backendDOMNodeId === backendNodeId) return current;
    const kids = current.children || [];
    for (let i = kids.length - 1; i >= 0; i--) {
      stack.push(kids[i]);
    }
  }
  return null;
}
|
|
888
|
+
|
|
563
889
|
function extractProps(props) {
|
|
564
890
|
if (!props) return {};
|
|
565
891
|
const result = {};
|
|
@@ -568,79 +894,58 @@ function extractProps(props) {
|
|
|
568
894
|
}
|
|
569
895
|
|
|
570
896
|
/**
|
|
571
|
-
*
|
|
572
|
-
*
|
|
573
|
-
*
|
|
574
|
-
*
|
|
575
|
-
*
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
const unsubs = [];
|
|
585
|
-
|
|
586
|
-
const done = () => {
|
|
587
|
-
clearTimeout(timer);
|
|
588
|
-
clearTimeout(deadlineTimer);
|
|
589
|
-
for (const unsub of unsubs) unsub();
|
|
590
|
-
resolve();
|
|
591
|
-
};
|
|
592
|
-
|
|
593
|
-
const check = () => {
|
|
594
|
-
clearTimeout(timer);
|
|
595
|
-
if (pending <= 0) {
|
|
596
|
-
pending = 0;
|
|
597
|
-
timer = setTimeout(done, idle);
|
|
598
|
-
}
|
|
599
|
-
};
|
|
600
|
-
|
|
601
|
-
unsubs.push(session.on('Network.requestWillBeSent', () => { pending++; clearTimeout(timer); }));
|
|
602
|
-
unsubs.push(session.on('Network.loadingFinished', () => { pending--; check(); }));
|
|
603
|
-
unsubs.push(session.on('Network.loadingFailed', () => { pending--; check(); }));
|
|
604
|
-
|
|
605
|
-
const deadlineTimer = setTimeout(() => {
|
|
606
|
-
for (const unsub of unsubs) unsub();
|
|
607
|
-
reject(new Error(`waitForNetworkIdle timed out after ${timeout}ms`));
|
|
608
|
-
}, timeout);
|
|
609
|
-
|
|
610
|
-
// Start check immediately (might already be idle)
|
|
611
|
-
check();
|
|
612
|
-
});
|
|
613
|
-
}
|
|
614
|
-
|
|
615
|
-
/**
|
|
616
|
-
* Detect if a page is a bot-challenge page (Cloudflare, etc.).
|
|
617
|
-
* Heuristic: low ARIA node count, short text, or known challenge phrases.
|
|
897
|
+
* Detect if a page is a bot-challenge page (Cloudflare, hCaptcha, etc.).
|
|
898
|
+
*
|
|
899
|
+
* Pre-H9 this was over-aggressive: `nodeCount < 50` alone fired on any
|
|
900
|
+
* legitimate small page (404s, simple landings, error pages), and generic
|
|
901
|
+
* phrases like "access denied" / "unknown error" / "permission denied"
|
|
902
|
+
* triggered on real HTTP 4xx/5xx pages, kicking hybrid mode into a costly
|
|
903
|
+
* headed fallback for nothing.
|
|
904
|
+
*
|
|
905
|
+
* H9 split: STRONG_PHRASES are essentially-unambiguous challenge UI and
|
|
906
|
+
* fire regardless of page size; WEAK_PHRASES only fire when the page is
|
|
907
|
+
* ALSO tiny (so a legitimate-looking error page with "access denied" in
|
|
908
|
+
* its body doesn't trip the fallback).
|
|
909
|
+
*
|
|
618
910
|
* @param {object} tree - Nested ARIA tree (from buildTree)
|
|
619
911
|
* @param {number} [nodeCount] - Raw CDP node count (from Accessibility.getFullAXTree)
|
|
620
912
|
*/
|
|
621
|
-
function isChallengePage(tree, nodeCount) {
|
|
622
|
-
if (!tree) return true;
|
|
623
|
-
|
|
624
|
-
if (nodeCount !== undefined && nodeCount < 50) return true;
|
|
913
|
+
export function isChallengePage(tree, nodeCount) {
  // A missing tree is treated as a challenge: nothing usable was captured.
  if (!tree) {
    return true;
  }

  const pageText = flattenTreeText(tree);
  const haystack = pageText.toLowerCase();
  const mentionsAny = (phrases) => phrases.some((phrase) => haystack.includes(phrase));

  // Phrases that count as a challenge on their own, whatever the page size.
  const strongSignals = [
    'just a moment', // Cloudflare interstitial
    'checking if the site connection is secure', // Cloudflare
    'checking your browser', // Various JS challenges
    'verify you are human', // hCaptcha / reCAPTCHA
    'prove your humanity',
    'attention required', // Cloudflare block page
    'enable javascript and cookies to continue', // Cloudflare
    'please complete the security check', // Cloudflare/Akamai
  ];
  if (mentionsAny(strongSignals)) {
    return true;
  }

  // Phrases that also appear on ordinary error pages; they only count when
  // the page is tiny as well (few AX nodes or almost no text).
  const weakSignals = [
    'please wait',
    'request blocked',
    'access denied',
    'permission denied',
    'unknown error',
    'file a ticket',
  ];
  const fewNodes = nodeCount !== undefined && nodeCount < 30;
  const looksTiny = fewNodes || pageText.trim().length < 50;
  if (!looksTiny) {
    return false;
  }
  return mentionsAny(weakSignals);
}
|
|
645
950
|
|
|
646
951
|
function flattenTreeText(node) {
|