barebrowse 0.7.1 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +249 -0
- package/LICENSE +202 -21
- package/NOTICE +8 -0
- package/README.md +39 -10
- package/barebrowse.context.md +45 -18
- package/cli.js +114 -3
- package/mcp-server.js +276 -70
- package/package.json +2 -2
- package/src/bareagent.js +43 -4
- package/src/chromium.js +115 -5
- package/src/consent.js +3 -8
- package/src/daemon.js +13 -0
- package/src/index.js +440 -135
- package/src/network-idle.js +62 -0
- package/src/prune.js +2 -1
- package/src/stealth.js +87 -6
package/src/chromium.js
CHANGED
|
@@ -6,7 +6,47 @@
|
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
8
|
import { execSync, spawn } from 'node:child_process';
|
|
9
|
-
import { existsSync } from 'node:fs';
|
|
9
|
+
import { existsSync, rmSync } from 'node:fs';
|
|
10
|
+
|
|
11
|
+
// Track launched browsers so we can clean them up if the parent crashes.
|
|
12
|
+
// Registered exit handlers (one-time) iterate this set on shutdown.
|
|
13
|
+
const activeBrowsers = new Set();
|
|
14
|
+
let exitHandlersRegistered = false;
|
|
15
|
+
|
|
16
|
+
/**
 * Synchronously kill every tracked browser and delete the temp profile dirs
 * this module created for them.
 *
 * Used as the process 'exit' / signal handler, so it only calls synchronous
 * APIs. The tracking set is emptied up front, which makes a second
 * invocation (signal handler followed by 'exit') a no-op.
 */
function reapAllSync() {
  const toReap = [...activeBrowsers];
  activeBrowsers.clear();

  // Send SIGKILL to everything first so all browsers start dying in parallel
  // instead of one after another.
  for (const b of toReap) {
    // Best effort: a browser that is already gone must not abort the sweep.
    try { if (!b.process.killed) b.process.kill('SIGKILL'); } catch {}
  }

  // Blocking pause that needs no child process: nothing ever notifies this
  // buffer, so Atomics.wait simply parks the thread until the timeout.
  // (Shelling out to `sleep` costs a fork per poll and silently degrades to
  // a zero-delay spin on platforms that have no `sleep` binary.)
  const sleepCell = new Int32Array(new SharedArrayBuffer(4));

  // Then poll each for actual death before removing its profile dir —
  // Chromium can hold file handles briefly even after SIGKILL, which would
  // race rmSync. Cap the wait (20 × 50ms = 1s per browser) so a stuck
  // process can't hang shutdown.
  // NOTE(review): a killed child that has not been waited on yet may still
  // answer signal 0, in which case this loop runs to its cap — confirm.
  for (const b of toReap) {
    for (let i = 0; i < 20; i++) {
      // Signal 0 only probes for existence; a throw means the pid is gone
      // (or was never valid), so stop waiting.
      try { process.kill(b.process.pid, 0); } catch { break; }
      Atomics.wait(sleepCell, 0, 0, 50);
    }
    if (b.ownedProfileDir) {
      // Best effort: a leftover temp dir is preferable to failing shutdown.
      try { rmSync(b.ownedProfileDir, { recursive: true, force: true }); } catch {}
    }
  }
}
|
|
36
|
+
|
|
37
|
+
/**
 * Install the process-level cleanup hooks the first time a browser is
 * launched. Later calls return immediately thanks to the module-level flag.
 */
function registerExitHandlers() {
  if (!exitHandlersRegistered) {
    exitHandlersRegistered = true;

    // The 'exit' event gives listeners no chance to do async work, which is
    // why the reaper sticks to synchronous calls (SIGKILL, rmSync).
    process.once('exit', reapAllSync);

    const reapThenReRaise = (signalName) => {
      reapAllSync();
      // This listener was attached with once(), so it is already detached;
      // sending the signal to ourselves again lets the default action run
      // and the exit status reflect the signal.
      process.kill(process.pid, signalName);
    };

    ['SIGINT', 'SIGTERM', 'SIGHUP'].forEach((signalName) => {
      process.once(signalName, () => reapThenReRaise(signalName));
    });
  }
}
|
|
10
50
|
|
|
11
51
|
// Common Chromium binary paths by platform (Linux focus for POC)
|
|
12
52
|
const CANDIDATES = [
|
|
@@ -75,6 +115,14 @@ export async function launch(opts = {}) {
|
|
|
75
115
|
'--disable-sync',
|
|
76
116
|
'--disable-translate',
|
|
77
117
|
'--mute-audio',
|
|
118
|
+
// Force every iframe (same-origin included) into its own renderer so it
|
|
119
|
+
// gets a dedicated CDP session via Target.setAutoAttach. Without this,
|
|
120
|
+
// same-origin iframes stay in the parent process — getFullAXTree still
|
|
121
|
+
// works via frameId, but Input.dispatchMouseEvent on the parent session
|
|
122
|
+
// uses parent-viewport coords while DOM.getBoxModel for iframe-internal
|
|
123
|
+
// nodes returns frame-local coords, so clicks land off-target. The OOPIF
|
|
124
|
+
// path side-steps that: each frame has its own Input domain.
|
|
125
|
+
'--site-per-process',
|
|
78
126
|
// Headless-only flags
|
|
79
127
|
...(!opts.headed ? ['--headless=new', '--hide-scrollbars'] : []),
|
|
80
128
|
// Suppress permission prompts (location, notifications, camera, mic, etc.)
|
|
@@ -90,12 +138,14 @@ export async function launch(opts = {}) {
|
|
|
90
138
|
args.push(`--proxy-server=${opts.proxy}`);
|
|
91
139
|
}
|
|
92
140
|
|
|
141
|
+
// Track the temp profile dir only when we create one — caller-supplied dirs
|
|
142
|
+
// are the caller's to manage. ownedProfileDir gets rm'd in cleanupBrowser.
|
|
143
|
+
let ownedProfileDir = null;
|
|
93
144
|
if (opts.userDataDir) {
|
|
94
145
|
args.push(`--user-data-dir=${opts.userDataDir}`);
|
|
95
146
|
} else {
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
args.push(`--user-data-dir=/tmp/barebrowse-${process.pid}-${Date.now()}`);
|
|
147
|
+
ownedProfileDir = `/tmp/barebrowse-${process.pid}-${Date.now()}`;
|
|
148
|
+
args.push(`--user-data-dir=${ownedProfileDir}`);
|
|
99
149
|
}
|
|
100
150
|
|
|
101
151
|
// about:blank as initial page
|
|
@@ -138,7 +188,52 @@ export async function launch(opts = {}) {
|
|
|
138
188
|
// Extract port from wsUrl
|
|
139
189
|
const actualPort = parseInt(new URL(wsUrl).port, 10);
|
|
140
190
|
|
|
141
|
-
|
|
191
|
+
const browser = { wsUrl, process: child, port: actualPort, ownedProfileDir };
|
|
192
|
+
|
|
193
|
+
// Register for parent-crash reaping. Auto-untrack on natural exit so
|
|
194
|
+
// a normally-exited browser doesn't leave a stale entry around.
|
|
195
|
+
registerExitHandlers();
|
|
196
|
+
activeBrowsers.add(browser);
|
|
197
|
+
child.once('exit', () => activeBrowsers.delete(browser));
|
|
198
|
+
|
|
199
|
+
return browser;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
/**
 * Kill a launched browser and remove its temp profile dir (if we created one).
 *
 * Sends the default termination signal and waits up to 2s for the process to
 * actually exit before unlinking the dir — Chromium can still hold files
 * briefly after SIGTERM, which races rmSync. If the process is still alive
 * after that grace period it is sent SIGKILL. Safe to call on
 * partially-failed launches, on already-dead processes, and on the object
 * returned for a browser we merely attached to (process: null).
 *
 * @param {{process: ?object, ownedProfileDir: ?string}|null|undefined} browser
 *   The object returned by launch() / attach(); nullish values are ignored.
 * @returns {Promise<void>}
 */
export async function cleanupBrowser(browser) {
  if (!browser) return;
  activeBrowsers.delete(browser);

  const child = browser.process;
  // A process is only "still running" when it has neither an exit code nor a
  // terminating signal. exitCode alone is not enough: it stays null when the
  // process died from a signal, and waiting on such a process would stall for
  // the whole timeout because its 'exit' event has already fired. `killed` is
  // deliberately not consulted — it only records that a signal was sent, not
  // that the process is gone.
  if (child && child.exitCode == null && child.signalCode == null) {
    let exitedInTime = false;
    await new Promise((resolve) => {
      const onExit = () => {
        exitedInTime = true;
        clearTimeout(timer);
        resolve();
      };
      const timer = setTimeout(() => {
        // Don't leave a stale listener behind on a process that outlives us.
        child.removeListener('exit', onExit);
        resolve();
      }, 2000);
      child.once('exit', onExit);
      // Best effort: kill() can throw if the process vanished in between.
      try { child.kill(); } catch {}
    });
    if (!exitedInTime) {
      // Grace period expired: force the issue so the browser is not leaked.
      // The retry loop below absorbs the time it takes to actually die.
      try { child.kill('SIGKILL'); } catch {}
    }
  }

  if (browser.ownedProfileDir) {
    // Chromium can still flush files for ~hundreds of ms after exit; with
    // --site-per-process every iframe is its own renderer process, each with
    // its own pending file handles, so the old 10×100ms window (1s) wasn't
    // always enough under parallel test load. Now 25×100ms (2.5s) plus a
    // polling jitter to avoid every concurrent cleanup hammering at the same
    // tick.
    for (let i = 0; i < 25; i++) {
      try {
        rmSync(browser.ownedProfileDir, { recursive: true, force: true });
        break;
      } catch (err) {
        // Only "directory still in use" style failures are worth retrying;
        // anything else will not fix itself, so give up quietly.
        if (err.code !== 'ENOTEMPTY' && err.code !== 'EBUSY') break;
        await new Promise((r) => setTimeout(r, 100 + Math.floor(Math.random() * 50)));
      }
    }
  }
}
|
|
143
238
|
|
|
144
239
|
/**
|
|
@@ -152,3 +247,18 @@ export async function getDebugUrl(port) {
|
|
|
152
247
|
const data = await res.json();
|
|
153
248
|
return data.webSocketDebuggerUrl;
|
|
154
249
|
}
|
|
250
|
+
|
|
251
|
+
/**
 * Connect to a Chromium instance that somebody else started with
 * --remote-debugging-port=<port>.
 *
 * The result has the same fields as launch()'s return value, but `process`
 * and `ownedProfileDir` are both null, so cleanupBrowser() has nothing to
 * kill and nothing to delete — we never tear down a browser or a profile
 * that is not ours.
 *
 * @param {object} opts
 * @param {number} opts.port - The debug port the running browser is listening on
 * @returns {Promise<{wsUrl: string, process: null, port: number, ownedProfileDir: null}>}
 * @throws {Error} When no port is supplied.
 */
export async function attach({ port }) {
  if (port) {
    const wsUrl = await getDebugUrl(port);
    return { wsUrl, process: null, port, ownedProfileDir: null };
  }
  throw new Error('attach({ port }) requires a port number');
}
|
package/src/consent.js
CHANGED
|
@@ -290,14 +290,9 @@ function findAcceptButton(dialogId, nodes, nodeMap, parentMap) {
|
|
|
290
290
|
* Only matches strong patterns (not single-word fallbacks) to avoid false positives.
|
|
291
291
|
*/
|
|
292
292
|
function tryGlobalConsentButton(nodes, session) {
|
|
293
|
-
//
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
return src.includes('\\s') || src.includes('\\b.*\\b.*\\b');
|
|
297
|
-
});
|
|
298
|
-
|
|
299
|
-
// Actually, let's just use all non-single-word patterns
|
|
300
|
-
const safePatterns = ACCEPT_PATTERNS.slice(0, -3); // exclude ^accept$, ^agree$, ^ok$
|
|
293
|
+
// Multi-word patterns only — exclude the bare ^accept$/^agree$/^ok$ from
|
|
294
|
+
// ACCEPT_PATTERNS so we don't false-match unrelated buttons page-wide.
|
|
295
|
+
const safePatterns = ACCEPT_PATTERNS.slice(0, -3);
|
|
301
296
|
|
|
302
297
|
for (const pattern of safePatterns) {
|
|
303
298
|
for (const node of nodes) {
|
package/src/daemon.js
CHANGED
|
@@ -39,6 +39,7 @@ export async function startDaemon(opts, outputDir, initialUrl) {
|
|
|
39
39
|
if (opts.proxy) args.push('--proxy', opts.proxy);
|
|
40
40
|
if (opts.viewport) args.push('--viewport', opts.viewport);
|
|
41
41
|
if (opts.storageState) args.push('--storage-state', opts.storageState);
|
|
42
|
+
if (opts.downloadPath) args.push('--download-path', opts.downloadPath);
|
|
42
43
|
|
|
43
44
|
const child = spawn(process.execPath, args, {
|
|
44
45
|
detached: true,
|
|
@@ -77,6 +78,7 @@ export async function runDaemon(opts, outputDir, initialUrl) {
|
|
|
77
78
|
proxy: opts.proxy,
|
|
78
79
|
viewport: opts.viewport,
|
|
79
80
|
storageState: opts.storageState,
|
|
81
|
+
downloadPath: opts.downloadPath,
|
|
80
82
|
});
|
|
81
83
|
|
|
82
84
|
// Console log capture
|
|
@@ -208,6 +210,17 @@ export async function runDaemon(opts, outputDir, initialUrl) {
|
|
|
208
210
|
return { ok: true };
|
|
209
211
|
},
|
|
210
212
|
|
|
213
|
+
async reload({ ignoreCache }) {
|
|
214
|
+
await page.reload({ ignoreCache: !!ignoreCache });
|
|
215
|
+
return { ok: true };
|
|
216
|
+
},
|
|
217
|
+
|
|
218
|
+
async downloads() {
|
|
219
|
+
// Snapshot the array — callers want a static view at the moment of
|
|
220
|
+
// the request, not a reference that mutates under them.
|
|
221
|
+
return { ok: true, value: page.downloads.map((d) => ({ ...d })) };
|
|
222
|
+
},
|
|
223
|
+
|
|
211
224
|
async drag({ fromRef, toRef }) {
|
|
212
225
|
await page.drag(String(fromRef), String(toRef));
|
|
213
226
|
return { ok: true };
|