barebrowse 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +227 -0
- package/LICENSE +202 -21
- package/NOTICE +8 -0
- package/README.md +37 -10
- package/barebrowse.context.md +43 -18
- package/cli.js +114 -3
- package/mcp-server.js +302 -82
- package/package.json +2 -2
- package/src/bareagent.js +33 -0
- package/src/chromium.js +115 -5
- package/src/consent.js +3 -8
- package/src/daemon.js +13 -0
- package/src/index.js +429 -132
- package/src/network-idle.js +62 -0
- package/src/stealth.js +87 -6
- package/.aurora/plans/active/harden-assess/design.md +0 -68
- package/.aurora/plans/active/harden-assess/plan.md +0 -71
- package/.aurora/plans/active/harden-assess/prd.md +0 -38
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* network-idle.js — wait until the page's network has been idle for N ms.
|
|
3
|
+
*
|
|
4
|
+
* Tracks in-flight requests by requestId in a Set, so an orphan
|
|
5
|
+
* loadingFinished/Failed (event for a request whose requestWillBeSent
|
|
6
|
+
* arrived before our listener attached) is a harmless no-op instead of
|
|
7
|
+
* driving a counter negative and resolving prematurely.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * Wait until no tracked network request has been in flight for `opts.idle` ms.
 *
 * In-flight requests are tracked by requestId in a Set. A request enters the
 * set on `Network.requestWillBeSent` and leaves it on `Network.loadingFinished`
 * or `Network.loadingFailed`. Whenever the set becomes (or already is) empty an
 * idle countdown starts; any new request cancels that countdown.
 *
 * Settlement happens exactly once: whichever of "idle reached" or "deadline
 * reached" comes first cancels BOTH timers and removes every listener, so the
 * losing timer can never fire afterwards and unsubscribe a second time.
 *
 * @param {object} session - CDP session-scoped handle with .on() returning unsub
 * @param {object} [opts]
 * @param {number} [opts.timeout=30000] - Max wait time before reject (a falsy value such as 0 selects the default)
 * @param {number} [opts.idle=500] - Required idle duration before resolve (a falsy value such as 0 selects the default)
 * @returns {Promise<void>} Resolves once the network has been idle for `idle` ms;
 *   rejects with an Error if `timeout` ms elapse first.
 */
export function waitForNetworkIdle(session, opts = {}) {
  const timeout = opts.timeout || 30000;
  const idle = opts.idle || 500;

  return new Promise((resolve, reject) => {
    const pending = new Set();
    const unsubs = [];
    let idleTimer = null;
    let deadlineTimer = null;
    let settled = false;

    // Single teardown path shared by resolve and reject. The guard makes it
    // idempotent, so listeners are removed once and only once.
    const settle = (finish) => {
      if (settled) return;
      settled = true;
      clearTimeout(idleTimer);
      clearTimeout(deadlineTimer);
      for (const unsub of unsubs) unsub();
      finish();
    };

    // (Re)start the idle countdown, but only while nothing is in flight.
    const check = () => {
      if (settled) return;
      clearTimeout(idleTimer);
      if (pending.size === 0) {
        idleTimer = setTimeout(() => settle(resolve), idle);
      }
    };

    // delete() on a Set is a no-op for unknown keys — orphan events from
    // requests started before we attached the listener can't push us negative.
    const onRequestEnded = (p) => {
      pending.delete(p.requestId);
      check();
    };

    unsubs.push(session.on('Network.requestWillBeSent', (p) => {
      pending.add(p.requestId);
      // New traffic: the network is no longer idle, so cancel the countdown.
      clearTimeout(idleTimer);
    }));
    unsubs.push(session.on('Network.loadingFinished', onRequestEnded));
    unsubs.push(session.on('Network.loadingFailed', onRequestEnded));

    deadlineTimer = setTimeout(() => {
      settle(() => reject(new Error(`waitForNetworkIdle timed out after ${timeout}ms`)));
    }, timeout);

    // Start check immediately (might already be idle)
    check();
  });
}
|
package/src/stealth.js
CHANGED
|
@@ -23,28 +23,109 @@ const STEALTH_SCRIPT = `
|
|
|
23
23
|
get: () => ['en-US', 'en'],
|
|
24
24
|
});
|
|
25
25
|
|
|
26
|
-
//
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
// Realistic CPU + memory. Headless under containers can report 1 or odd
|
|
27
|
+
// values that real desktops rarely have, which is its own fingerprint.
|
|
28
|
+
Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
|
|
29
|
+
Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
|
|
30
|
+
|
|
31
|
+
// chrome / chrome.runtime — headless either omits the object entirely or
|
|
32
|
+
// gives an empty {}; real Chrome has the enum shapes below even before any
|
|
33
|
+
// extension is installed. Fingerprinters check that chrome.runtime exists
|
|
34
|
+
// AND that these enums are present.
|
|
35
|
+
if (!window.chrome) window.chrome = {};
|
|
36
|
+
if (!window.chrome.runtime) {
|
|
37
|
+
window.chrome.runtime = {
|
|
38
|
+
OnInstalledReason: { CHROME_UPDATE: 'chrome_update', INSTALL: 'install', SHARED_MODULE_UPDATE: 'shared_module_update', UPDATE: 'update' },
|
|
39
|
+
OnRestartRequiredReason: { APP_UPDATE: 'app_update', OS_UPDATE: 'os_update', PERIODIC: 'periodic' },
|
|
40
|
+
PlatformArch: { ARM: 'arm', ARM64: 'arm64', MIPS: 'mips', MIPS64: 'mips64', X86_32: 'x86-32', X86_64: 'x86-64' },
|
|
41
|
+
PlatformNaclArch: { ARM: 'arm', MIPS: 'mips', MIPS64: 'mips64', X86_32: 'x86-32', X86_64: 'x86-64' },
|
|
42
|
+
PlatformOs: { ANDROID: 'android', CROS: 'cros', LINUX: 'linux', MAC: 'mac', OPENBSD: 'openbsd', WIN: 'win' },
|
|
43
|
+
RequestUpdateCheckStatus: { NO_UPDATE: 'no_update', THROTTLED: 'throttled', UPDATE_AVAILABLE: 'update_available' },
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Notification — headless Chrome doesn't expose the Notification API at
|
|
48
|
+
// all (even on secure contexts), while real Chrome always does and reports
|
|
49
|
+
// 'default' before any prompt. Fingerprinters check both \`typeof
|
|
50
|
+
// Notification\` and \`Notification.permission\`, so we fake both: the
|
|
51
|
+
// constructor when missing, and only the permission getter when it's
|
|
52
|
+
// present (some Chrome versions ship a non-configurable getter and
|
|
53
|
+
// defineProperty would throw — swallowed so the rest of the script runs).
|
|
54
|
+
if (typeof Notification === 'undefined') {
|
|
55
|
+
window.Notification = function Notification() {};
|
|
56
|
+
window.Notification.permission = 'default';
|
|
57
|
+
window.Notification.requestPermission = () => Promise.resolve('default');
|
|
58
|
+
} else {
|
|
59
|
+
try {
|
|
60
|
+
Object.defineProperty(Notification, 'permission', { get: () => 'default' });
|
|
61
|
+
} catch {}
|
|
29
62
|
}
|
|
30
63
|
|
|
31
|
-
// Permissions.query
|
|
64
|
+
// Permissions.query for notifications: keep it consistent with the
|
|
65
|
+
// Notification.permission override above instead of returning 'prompt'
|
|
66
|
+
// unconditionally (the prior hardcoded value was a tell of its own).
|
|
32
67
|
const origQuery = Permissions.prototype.query;
|
|
33
68
|
Permissions.prototype.query = function(desc) {
|
|
34
|
-
if (desc.name === 'notifications') {
|
|
35
|
-
return Promise.resolve({ state:
|
|
69
|
+
if (desc && desc.name === 'notifications') {
|
|
70
|
+
return Promise.resolve({ state: Notification.permission, onchange: null });
|
|
36
71
|
}
|
|
37
72
|
return origQuery.call(this, desc);
|
|
38
73
|
};
|
|
74
|
+
|
|
75
|
+
// WebGL UNMASKED_VENDOR_WEBGL (37445) and UNMASKED_RENDERER_WEBGL (37446) —
|
|
76
|
+
// headless returns "Google Inc. (Google)" / "Google SwiftShader" which
|
|
77
|
+
// is the single most-used headless fingerprint. Spoof a realistic
|
|
78
|
+
// Intel integrated GPU pair (works on macOS and Linux user agents).
|
|
79
|
+
const SPOOFED_VENDOR = 'Intel Inc.';
|
|
80
|
+
const SPOOFED_RENDERER = 'Intel Iris OpenGL Engine';
|
|
81
|
+
const origGetParam = WebGLRenderingContext.prototype.getParameter;
|
|
82
|
+
WebGLRenderingContext.prototype.getParameter = function(p) {
|
|
83
|
+
if (p === 37445) return SPOOFED_VENDOR;
|
|
84
|
+
if (p === 37446) return SPOOFED_RENDERER;
|
|
85
|
+
return origGetParam.apply(this, arguments);
|
|
86
|
+
};
|
|
87
|
+
if (typeof WebGL2RenderingContext !== 'undefined') {
|
|
88
|
+
const origGetParam2 = WebGL2RenderingContext.prototype.getParameter;
|
|
89
|
+
WebGL2RenderingContext.prototype.getParameter = function(p) {
|
|
90
|
+
if (p === 37445) return SPOOFED_VENDOR;
|
|
91
|
+
if (p === 37446) return SPOOFED_RENDERER;
|
|
92
|
+
return origGetParam2.apply(this, arguments);
|
|
93
|
+
};
|
|
94
|
+
}
|
|
39
95
|
`;
|
|
40
96
|
|
|
41
97
|
/**
|
|
42
98
|
* Apply stealth patches to a CDP session.
|
|
43
99
|
* Must be called before any navigation.
|
|
44
100
|
*
|
|
101
|
+
* Splits into two layers:
|
|
102
|
+
* 1. Network.setUserAgentOverride strips "HeadlessChrome" from the UA
|
|
103
|
+
* that ships in HTTP request headers AND that navigator.userAgent
|
|
104
|
+
* reports — `--headless=new` leaves "HeadlessChrome" in there.
|
|
105
|
+
* 2. Page.addScriptToEvaluateOnNewDocument injects the JS-level patches
|
|
106
|
+
* before any page script runs.
|
|
107
|
+
*
|
|
45
108
|
* @param {object} session - Session-scoped CDP handle
|
|
46
109
|
*/
|
|
47
110
|
export async function applyStealth(session) {
|
|
111
|
+
// 1. UA override — read whatever the running browser actually claims, then
|
|
112
|
+
// rewrite the "Headless" marker out. Doing it this way (vs hardcoding a
|
|
113
|
+
// string) keeps the version + platform fields accurate across Chromium
|
|
114
|
+
// releases. Network.setUserAgentOverride is per-session, so it also
|
|
115
|
+
// cleans up the value navigator.userAgent reports inside the page.
|
|
116
|
+
try {
|
|
117
|
+
const { userAgent } = await session.send('Browser.getVersion');
|
|
118
|
+
if (userAgent && userAgent.includes('HeadlessChrome')) {
|
|
119
|
+
await session.send('Network.setUserAgentOverride', {
|
|
120
|
+
userAgent: userAgent.replace(/HeadlessChrome/g, 'Chrome'),
|
|
121
|
+
});
|
|
122
|
+
}
|
|
123
|
+
} catch {
|
|
124
|
+
// Browser.getVersion not reachable from this session — skip UA override
|
|
125
|
+
// and rely on the JS-level patches alone.
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// 2. JS-level patches
|
|
48
129
|
await session.send('Page.addScriptToEvaluateOnNewDocument', {
|
|
49
130
|
source: STEALTH_SCRIPT,
|
|
50
131
|
});
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
# Design: Harden assess tool
|
|
2
|
-
|
|
3
|
-
## Architecture
|
|
4
|
-
|
|
5
|
-
### Current flow (broken)
|
|
6
|
-
```
|
|
7
|
-
assess(url) → connect({ mode: 'hybrid' }) ← NEW browser, no cookies
|
|
8
|
-
→ assessFn(page, url)
|
|
9
|
-
→ page.close() ← kills browser
|
|
10
|
-
```
|
|
11
|
-
|
|
12
|
-
### New flow
|
|
13
|
-
```
|
|
14
|
-
assess(url) → getPage() ← reuse session browser
|
|
15
|
-
→ page.createTab() ← new tab in same browser
|
|
16
|
-
→ tab.injectCookies(url) ← cookie injection
|
|
17
|
-
→ assessFn(tab, url) ← assess uses tab
|
|
18
|
-
→ tab.close() ← close tab only
|
|
19
|
-
timeout guard wraps entire flow
|
|
20
|
-
retry wraps entire flow
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
## Key design decisions
|
|
24
|
-
|
|
25
|
-
### Why a new tab, not the session page?
|
|
26
|
-
wearehere's `assess()` calls:
|
|
27
|
-
- `session.send('Page.addScriptToEvaluateOnNewDocument', ...)` — injects fingerprint detection scripts
|
|
28
|
-
- `networkSession.on('Network.requestWillBeSent', ...)` — monitors all network traffic
|
|
29
|
-
|
|
30
|
-
These would pollute the session page. A separate tab has its own CDP session with isolated Page/Network domains.
|
|
31
|
-
|
|
32
|
-
### createTab() page-like interface
|
|
33
|
-
wearehere expects a page object with: `goto()`, `cdp` (raw session), `waitForNetworkIdle()`. createTab() returns exactly this interface:
|
|
34
|
-
|
|
35
|
-
```javascript
|
|
36
|
-
{
|
|
37
|
-
goto(url, timeout) // navigate tab
|
|
38
|
-
cdp // raw CDP session for this tab
|
|
39
|
-
waitForNetworkIdle(opts) // reuses existing waitForNetworkIdle()
|
|
40
|
-
injectCookies(url) // cookie injection for this tab
|
|
41
|
-
close() // close tab, NOT the browser
|
|
42
|
-
}
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### Retry strategy
|
|
46
|
-
```
|
|
47
|
-
attempt 1: assess with 45s timeout
|
|
48
|
-
fail → wait 2s
|
|
49
|
-
attempt 2: assess with 45s timeout (if browser crashed, reset _page first)
|
|
50
|
-
fail → return { error: "assessment_failed", ... }
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
### Timeout implementation
|
|
54
|
-
```javascript
|
|
55
|
-
const result = await Promise.race([
|
|
56
|
-
doAssess(page, url, opts),
|
|
57
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 45000))
|
|
58
|
-
]);
|
|
59
|
-
```
|
|
60
|
-
On timeout, the tab is closed in a finally block.
|
|
61
|
-
|
|
62
|
-
## Files changed
|
|
63
|
-
| File | Change |
|
|
64
|
-
|------|--------|
|
|
65
|
-
| `src/index.js` | Add `createTab()` and tab's `close()` to connect() return object |
|
|
66
|
-
| `mcp-server.js` | Rewrite assess case: use getPage().createTab(), add retry + timeout |
|
|
67
|
-
| `README.md` | Update assess description, add self-healing mention |
|
|
68
|
-
| `barebrowse.context.md` | Update assess section, document createTab() |
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
# Plan: Harden assess tool — session reuse + self-healing
|
|
2
|
-
|
|
3
|
-
## Plan ID
|
|
4
|
-
`harden-assess`
|
|
5
|
-
|
|
6
|
-
## Summary
|
|
7
|
-
Make the assess tool reuse the MCP session's browser instance (cookies, headed fallback) instead of spawning throwaway browsers, add retry logic for transient failures, and add a timeout guard so no single assessment can hang forever.
|
|
8
|
-
|
|
9
|
-
## Problem Statement
|
|
10
|
-
The `assess` tool in `mcp-server.js` calls `connect({ mode: 'hybrid' })` directly — creating a fresh headless browser per call with no cookies, no session state, and no headed fallback. Every other MCP tool uses the `getPage()` singleton. This causes:
|
|
11
|
-
|
|
12
|
-
1. **No cookies** — assess browses as a stranger, getting blocked by consent walls and bot detection that cookies would bypass
|
|
13
|
-
2. **No headed fallback reuse** — if the singleton already fell back to headed mode, assess still starts fresh headless and hits the same blocks
|
|
14
|
-
3. **No retry** — any failure (browser crash, navigation timeout, CDP disconnect) kills the assessment with no recovery
|
|
15
|
-
4. **No timeout guard** — if `wearehere`'s `assess()` hangs (e.g. network idle never resolves), the MCP call blocks indefinitely
|
|
16
|
-
|
|
17
|
-
However, assess can't simply use the shared `_page` singleton directly because `wearehere` injects init scripts (`addScriptToEvaluateOnNewDocument`) and network listeners that would pollute the session page for subsequent calls. The solution is to create a **second page tab** within the same browser instance.
|
|
18
|
-
|
|
19
|
-
## Proposed Solution
|
|
20
|
-
1. **Session-aware page creation** — Instead of `connect()`, assess opens a new CDP tab within the existing browser. This shares the browser process (same cookies, same headed/headless state) but isolates assess's script injections to its own tab.
|
|
21
|
-
2. **Retry with backoff** — Wrap the assess call in a retry loop (max 2 attempts, 2s backoff). On browser crash, reconnect the singleton.
|
|
22
|
-
3. **Timeout guard** — Wrap each assess call in `Promise.race` with a 45s hard deadline. If exceeded, return an error result (not hang).
|
|
23
|
-
|
|
24
|
-
## Benefits
|
|
25
|
-
- Assess gets cookies and headed fallback for free — no separate browser instance
|
|
26
|
-
- Failed assessments auto-retry instead of dying
|
|
27
|
-
- Hanging assessments time out gracefully instead of blocking the MCP server forever
|
|
28
|
-
- Eliminates the 10+ second cold-start per assessment (browser launch)
|
|
29
|
-
|
|
30
|
-
## Scope
|
|
31
|
-
### In Scope
|
|
32
|
-
- Modify `mcp-server.js` assess handler to create tabs within existing browser
|
|
33
|
-
- Add `createTab()` / `closeTab()` helper to `src/index.js` connect() page handle
|
|
34
|
-
- Add retry wrapper in mcp-server.js
|
|
35
|
-
- Add timeout guard in mcp-server.js
|
|
36
|
-
- Update docs: README.md, barebrowse.context.md, CLAUDE.md
|
|
37
|
-
|
|
38
|
-
### Out of Scope
|
|
39
|
-
- Changing wearehere's internal logic
|
|
40
|
-
- Adding retry/self-healing to other MCP tools (future work)
|
|
41
|
-
- Batch/queue mode for multiple assessments
|
|
42
|
-
- Changing the assess tool's MCP interface (same inputs/outputs)
|
|
43
|
-
|
|
44
|
-
## Dependencies
|
|
45
|
-
- `wearehere` package (assess function signature unchanged)
|
|
46
|
-
- `src/index.js` connect() API (adding createTab/closeTab methods)
|
|
47
|
-
- `src/cdp.js` (Target.createTarget / closeTarget already available)
|
|
48
|
-
|
|
49
|
-
## Implementation Strategy
|
|
50
|
-
Phase 1: Add tab management to connect() page handle (createTab, closeTab)
|
|
51
|
-
Phase 2: Rewrite assess handler to use session tab + retry + timeout
|
|
52
|
-
Phase 3: Update documentation
|
|
53
|
-
|
|
54
|
-
## Risks and Mitigations
|
|
55
|
-
| Risk | Impact | Mitigation |
|
|
56
|
-
|------|--------|------------|
|
|
57
|
-
| Init script injection leaks across tabs | Pollutes session page | Each tab gets its own Page domain; addScriptToEvaluateOnNewDocument is per-target |
|
|
58
|
-
| Browser crash during assess kills session too | Session page lost | getPage() already handles reconnection lazily (set _page = null, next call recreates) |
|
|
59
|
-
| wearehere expects full page handle, not raw CDP session | API mismatch | createTab() returns a page-like object with goto, cdp, waitForNetworkIdle — same interface |
|
|
60
|
-
|
|
61
|
-
## Success Criteria
|
|
62
|
-
- [ ] `assess` reuses the session browser (no separate `connect()` call)
|
|
63
|
-
- [ ] `assess` inherits cookies from the session
|
|
64
|
-
- [ ] `assess` works when session is in headed mode (hybrid fallback already triggered)
|
|
65
|
-
- [ ] Failed assessment retries once before returning error
|
|
66
|
-
- [ ] Assessment hanging > 45s returns timeout error, doesn't block server
|
|
67
|
-
- [ ] All existing tests pass
|
|
68
|
-
- [ ] Documentation updated
|
|
69
|
-
|
|
70
|
-
## Open Questions
|
|
71
|
-
1. Should createTab() also inject cookies? — **Recommendation**: Yes, call `authenticate()` for the target URL before navigation, same as `goto` does.
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
# PRD: Harden assess tool
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
The `assess` MCP tool must reuse the session browser, retry on failure, and time out gracefully.
|
|
5
|
-
|
|
6
|
-
## Requirements
|
|
7
|
-
|
|
8
|
-
### R1: Session-aware tab creation
|
|
9
|
-
The assess tool MUST create a new browser tab within the existing MCP session browser instead of spawning a separate browser via `connect()`. The tab MUST:
|
|
10
|
-
- Share the same browser process (inheriting headless/headed state)
|
|
11
|
-
- Have access to the browser's cookie jar
|
|
12
|
-
- Isolate its CDP domains (Page, Network, DOM) from the session page
|
|
13
|
-
- Be closed after each assessment completes or fails
|
|
14
|
-
|
|
15
|
-
### R2: Cookie injection
|
|
16
|
-
Before navigating, the assess tab MUST inject cookies from the user's browser for the target URL, using the same `authenticate()` mechanism as `goto`.
|
|
17
|
-
|
|
18
|
-
### R3: Retry on failure
|
|
19
|
-
If an assessment fails (navigation timeout, CDP error, browser crash), the tool MUST:
|
|
20
|
-
- Retry once after a 2-second delay
|
|
21
|
-
- If the browser crashed, reset the session singleton (`_page = null`) so getPage() reconnects
|
|
22
|
-
- If retry also fails, return a structured error result (not throw)
|
|
23
|
-
|
|
24
|
-
### R4: Timeout guard
|
|
25
|
-
Each assessment MUST have a hard timeout of 45 seconds. If exceeded:
|
|
26
|
-
- The tab is force-closed
|
|
27
|
-
- A structured error result is returned: `{ site, url, error: "timeout", scanned_at }`
|
|
28
|
-
- The session page is NOT affected
|
|
29
|
-
|
|
30
|
-
### R5: Backwards compatibility
|
|
31
|
-
- The assess tool's MCP interface (inputs/outputs) MUST NOT change
|
|
32
|
-
- Successful assessments return the same JSON format as before
|
|
33
|
-
- The tool appears/disappears based on wearehere availability (unchanged)
|
|
34
|
-
|
|
35
|
-
## Non-functional
|
|
36
|
-
- No new dependencies
|
|
37
|
-
- No changes to wearehere package
|
|
38
|
-
- createTab()/closeTab() exposed on connect() page handle for library users too
|