barebrowse 0.5.7 → 0.5.8
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/barebrowse.context.md +1 -1
- package/mcp-server.js +15 -20
- package/package.json +1 -1
- package/src/index.js +40 -23
- package/rescan.mjs +0 -107
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,26 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.5.8
|
|
4
|
+
|
|
5
|
+
Bot-challenge detection now applies to all browsing, not just the `assess` handler.
|
|
6
|
+
|
|
7
|
+
### Bot detection (`src/index.js`)
|
|
8
|
+
- `isChallengePage()` now checks the ARIA node count (fewer than 50 nodes is treated as bot-blocked) in addition to text length and phrase matching
|
|
9
|
+
- `botBlocked` property exposed on both `connect()` pages and `createTab()` tabs
|
|
10
|
+
- `goto()` on main page and tabs sets `botBlocked` after every navigation
|
|
11
|
+
- `snapshot()` inserts a `[BOT CHALLENGE DETECTED …]` warning line before the tree output (after the `url:`/stats header) when the page is flagged
|
|
12
|
+
- Hybrid fallback on main page now uses node count for more reliable detection
|
|
13
|
+
|
|
14
|
+
### Assess handler (`mcp-server.js`)
|
|
15
|
+
- Headed fallback now uses the `tab.botBlocked` flag instead of the naive score heuristic (overall score ≤ 5 combined with the all-zero category check)
|
|
16
|
+
- Previously: sites like Reuters, Home Depot, Leboncoin returned fake-clean scores because the bot challenge page looked "clean" to the scanner
|
|
17
|
+
- Now: node count catches every bot-blocked page regardless of score
|
|
18
|
+
|
|
19
|
+
### Tested
|
|
20
|
+
- reuters.com, homedepot.com, leboncoin.fr, idealista.com all correctly flagged `botBlocked: true`
|
|
21
|
+
- svt.se, whatsapp.com, google.com correctly flagged `false`
|
|
22
|
+
- 71/71 tests passing
|
|
23
|
+
|
|
3
24
|
## 0.5.7
|
|
4
25
|
|
|
5
26
|
MCP server crash resilience + process hardening.
|
package/barebrowse.context.md
CHANGED
package/mcp-server.js
CHANGED
|
@@ -317,8 +317,9 @@ async function handleToolCall(name, args) {
|
|
|
317
317
|
}),
|
|
318
318
|
]);
|
|
319
319
|
clearTimeout(timer);
|
|
320
|
+
const wasBotBlocked = tab.botBlocked;
|
|
320
321
|
await tab.close().catch(() => {});
|
|
321
|
-
return
|
|
322
|
+
return { result, botBlocked: wasBotBlocked };
|
|
322
323
|
} catch (err) {
|
|
323
324
|
clearTimeout(timer);
|
|
324
325
|
await tab.close().catch(() => {});
|
|
@@ -328,29 +329,23 @@ async function handleToolCall(name, args) {
|
|
|
328
329
|
|
|
329
330
|
// Try headless first
|
|
330
331
|
try {
|
|
331
|
-
const result = await runAssess(false);
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
if (allZero && (parsed.score || 0) <= 5) {
|
|
340
|
-
// Likely bot-blocked — retry headed
|
|
341
|
-
try {
|
|
342
|
-
return await runAssess(true);
|
|
343
|
-
} catch {
|
|
344
|
-
return result; // headed failed, return headless result
|
|
345
|
-
}
|
|
332
|
+
const { result, botBlocked } = await runAssess(false);
|
|
333
|
+
if (botBlocked) {
|
|
334
|
+
// Bot-blocked in headless — retry headed
|
|
335
|
+
try {
|
|
336
|
+
const headed = await runAssess(true);
|
|
337
|
+
return JSON.stringify(headed.result, null, 2);
|
|
338
|
+
} catch {
|
|
339
|
+
return JSON.stringify(result, null, 2); // headed failed, return headless result
|
|
346
340
|
}
|
|
347
|
-
}
|
|
348
|
-
return result;
|
|
341
|
+
}
|
|
342
|
+
return JSON.stringify(result, null, 2);
|
|
349
343
|
} catch (err) {
|
|
350
344
|
if (isCdpDead(err)) _page = null;
|
|
351
345
|
// Headless crashed — try headed
|
|
352
346
|
try {
|
|
353
|
-
|
|
347
|
+
const headed = await runAssess(true);
|
|
348
|
+
return JSON.stringify(headed.result, null, 2);
|
|
354
349
|
} catch (retryErr) {
|
|
355
350
|
throw retryErr;
|
|
356
351
|
}
|
|
@@ -379,7 +374,7 @@ async function handleMessage(msg) {
|
|
|
379
374
|
return jsonrpcResponse(id, {
|
|
380
375
|
protocolVersion: '2024-11-05',
|
|
381
376
|
capabilities: { tools: {} },
|
|
382
|
-
serverInfo: { name: 'barebrowse', version: '0.5.
|
|
377
|
+
serverInfo: { name: 'barebrowse', version: '0.5.8' },
|
|
383
378
|
});
|
|
384
379
|
}
|
|
385
380
|
|
package/package.json
CHANGED
package/src/index.js
CHANGED
|
@@ -73,10 +73,10 @@ export async function browse(url, opts = {}) {
|
|
|
73
73
|
}
|
|
74
74
|
|
|
75
75
|
// Step 5: Get ARIA tree
|
|
76
|
-
let { tree } = await ariaTree(page);
|
|
76
|
+
let { tree, nodeCount } = await ariaTree(page);
|
|
77
77
|
|
|
78
78
|
// Step 5.5: Hybrid fallback — if headless was bot-blocked, retry headed
|
|
79
|
-
if (mode === 'hybrid' && isChallengePage(tree)) {
|
|
79
|
+
if (mode === 'hybrid' && isChallengePage(tree, nodeCount)) {
|
|
80
80
|
await cdp.send('Target.closeTarget', { targetId: page.targetId });
|
|
81
81
|
cdp.close();
|
|
82
82
|
if (browser) { browser.process.kill(); browser = null; }
|
|
@@ -140,6 +140,7 @@ export async function connect(opts = {}) {
|
|
|
140
140
|
|
|
141
141
|
let page = await createPage(cdp, mode !== 'headed', { viewport: opts.viewport });
|
|
142
142
|
let refMap = new Map();
|
|
143
|
+
let botBlocked = false;
|
|
143
144
|
|
|
144
145
|
// Suppress permission prompts for all modes
|
|
145
146
|
await suppressPermissions(cdp);
|
|
@@ -179,23 +180,28 @@ export async function connect(opts = {}) {
|
|
|
179
180
|
await dismissConsent(page.session);
|
|
180
181
|
}
|
|
181
182
|
|
|
183
|
+
// Check for bot challenge
|
|
184
|
+
const { tree, nodeCount } = await ariaTree(page);
|
|
185
|
+
botBlocked = isChallengePage(tree, nodeCount);
|
|
186
|
+
|
|
182
187
|
// Hybrid fallback: if bot-blocked, retry with headed browser
|
|
183
|
-
if (mode === 'hybrid') {
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
188
|
+
if (botBlocked && mode === 'hybrid') {
|
|
189
|
+
await cdp.send('Target.closeTarget', { targetId: page.targetId });
|
|
190
|
+
cdp.close();
|
|
191
|
+
if (browser) { browser.process.kill(); browser = null; }
|
|
192
|
+
|
|
193
|
+
const port = opts.port || 9222;
|
|
194
|
+
const wsUrl = await getDebugUrl(port);
|
|
195
|
+
cdp = await createCDP(wsUrl);
|
|
196
|
+
page = await createPage(cdp, false, { viewport: opts.viewport });
|
|
197
|
+
setupDialogHandler(page.session);
|
|
198
|
+
await suppressPermissions(cdp);
|
|
199
|
+
await navigate(page, url, timeout);
|
|
200
|
+
if (opts.consent !== false) await dismissConsent(page.session);
|
|
201
|
+
|
|
202
|
+
// Re-check after headed fallback
|
|
203
|
+
const after = await ariaTree(page);
|
|
204
|
+
botBlocked = isChallengePage(after.tree, after.nodeCount);
|
|
199
205
|
}
|
|
200
206
|
},
|
|
201
207
|
|
|
@@ -223,11 +229,12 @@ export async function connect(opts = {}) {
|
|
|
223
229
|
const raw = formatTree(result.tree);
|
|
224
230
|
const { currentIndex, entries } = await page.session.send('Page.getNavigationHistory');
|
|
225
231
|
const pageUrl = entries[currentIndex]?.url || '';
|
|
226
|
-
|
|
232
|
+
const warn = botBlocked ? '[BOT CHALLENGE DETECTED — page content may be incomplete or blocked]\n' : '';
|
|
233
|
+
if (pruneOpts === false) return `url: ${pageUrl}\n` + warn + raw;
|
|
227
234
|
const pruned = pruneTree(result.tree, { mode: pruneOpts?.mode || 'act' });
|
|
228
235
|
const out = formatTree(pruned);
|
|
229
236
|
const stats = `url: ${pageUrl}\n${raw.length.toLocaleString()} chars → ${out.length.toLocaleString()} chars (${Math.round((1 - out.length / raw.length) * 100)}% pruned)`;
|
|
230
|
-
return stats + '\n' + out;
|
|
237
|
+
return stats + '\n' + warn + out;
|
|
231
238
|
},
|
|
232
239
|
|
|
233
240
|
async click(ref) {
|
|
@@ -334,6 +341,8 @@ export async function connect(opts = {}) {
|
|
|
334
341
|
writeFileSync(filePath, JSON.stringify(state, null, 2));
|
|
335
342
|
},
|
|
336
343
|
|
|
344
|
+
get botBlocked() { return botBlocked; },
|
|
345
|
+
|
|
337
346
|
dialogLog,
|
|
338
347
|
|
|
339
348
|
async screenshot(screenshotOpts = {}) {
|
|
@@ -368,13 +377,17 @@ export async function connect(opts = {}) {
|
|
|
368
377
|
async createTab() {
|
|
369
378
|
const tab = await createPage(cdp, mode !== 'headed', { viewport: opts.viewport });
|
|
370
379
|
await suppressPermissions(cdp);
|
|
380
|
+
let tabBotBlocked = false;
|
|
371
381
|
return {
|
|
372
382
|
async goto(url, timeout = 30000) {
|
|
373
383
|
await navigate(tab, url, timeout);
|
|
374
384
|
if (opts.consent !== false) {
|
|
375
385
|
await dismissConsent(tab.session);
|
|
376
386
|
}
|
|
387
|
+
const { tree, nodeCount } = await ariaTree(tab);
|
|
388
|
+
tabBotBlocked = isChallengePage(tree, nodeCount);
|
|
377
389
|
},
|
|
390
|
+
get botBlocked() { return tabBotBlocked; },
|
|
378
391
|
async injectCookies(url, cookieOpts) {
|
|
379
392
|
await authenticate(tab.session, url, { browser: cookieOpts?.browser });
|
|
380
393
|
},
|
|
@@ -485,7 +498,7 @@ async function ariaTree(page) {
|
|
|
485
498
|
}
|
|
486
499
|
}
|
|
487
500
|
|
|
488
|
-
return { tree, refMap };
|
|
501
|
+
return { tree, refMap, nodeCount: nodes.length };
|
|
489
502
|
}
|
|
490
503
|
|
|
491
504
|
/**
|
|
@@ -583,10 +596,14 @@ function waitForNetworkIdle(session, opts = {}) {
|
|
|
583
596
|
|
|
584
597
|
/**
|
|
585
598
|
* Detect if a page is a bot-challenge page (Cloudflare, etc.).
|
|
586
|
-
* Heuristic:
|
|
599
|
+
* Heuristic: low ARIA node count, short text, or known challenge phrases.
|
|
600
|
+
* @param {object} tree - Nested ARIA tree (from buildTree)
|
|
601
|
+
* @param {number} [nodeCount] - Raw CDP node count (from Accessibility.getFullAXTree)
|
|
587
602
|
*/
|
|
588
|
-
function isChallengePage(tree) {
|
|
603
|
+
function isChallengePage(tree, nodeCount) {
|
|
589
604
|
if (!tree) return true;
|
|
605
|
+
// Real pages have 50+ ARIA nodes. Bot challenges have <20.
|
|
606
|
+
if (nodeCount !== undefined && nodeCount < 50) return true;
|
|
590
607
|
const text = flattenTreeText(tree);
|
|
591
608
|
// Near-empty pages are almost certainly blocks
|
|
592
609
|
if (text.trim().length < 50) return true;
|
package/rescan.mjs
DELETED
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
import { connect } from '/home/hamr/PycharmProjects/barebrowse/src/index.js';
|
|
2
|
-
import { assess } from 'wearehere';
|
|
3
|
-
|
|
4
|
-
const SITES = [
|
|
5
|
-
// NL timeouts
|
|
6
|
-
'rtv.nl', 'bijenkorf.nl', 'jumbo.com', 'klm.nl',
|
|
7
|
-
// EU timeouts
|
|
8
|
-
'zalando.de', 'otto.de', 'allegro.pl',
|
|
9
|
-
// Suspected bot-blocked zeros (NL)
|
|
10
|
-
'coolblue.nl', 'rabobank.nl', 'telegraaf.nl', 'wehkamp.nl',
|
|
11
|
-
'ing.nl', 'kvk.nl', 'thuisbezorgd.nl', 'transavia.com', 'schiphol.nl',
|
|
12
|
-
// Suspected bot-blocked zeros (EU)
|
|
13
|
-
'lufthansa.com', 'fnac.com', 'svt.se', 'revolut.com',
|
|
14
|
-
'leboncoin.fr', 'subito.it', 'idealista.com',
|
|
15
|
-
// Suspected bot-blocked zeros (US)
|
|
16
|
-
'washingtonpost.com', 'usatoday.com', 'etsy.com', 'costco.com',
|
|
17
|
-
'homedepot.com', 'chatgpt.com', 'doordash.com', 'yelp.com', 'reuters.com',
|
|
18
|
-
'snapchat.com', 'cnn.com'
|
|
19
|
-
];
|
|
20
|
-
|
|
21
|
-
async function scanSite(url) {
|
|
22
|
-
const full = 'https://www.' + url;
|
|
23
|
-
|
|
24
|
-
// Try headless first
|
|
25
|
-
let page;
|
|
26
|
-
try {
|
|
27
|
-
page = await connect({ mode: 'hybrid' });
|
|
28
|
-
const tab = await page.createTab();
|
|
29
|
-
try {
|
|
30
|
-
await tab.injectCookies(full).catch(() => {});
|
|
31
|
-
const r = await Promise.race([
|
|
32
|
-
assess(tab, full, { timeout: 30000, settle: 3000 }),
|
|
33
|
-
new Promise((_, rej) => setTimeout(() => rej(new Error('timeout')), 35000))
|
|
34
|
-
]);
|
|
35
|
-
await tab.close().catch(() => {});
|
|
36
|
-
|
|
37
|
-
// Check if bot-blocked
|
|
38
|
-
const { network, trackers, profiling } = r.categories;
|
|
39
|
-
const allZero = (network?.score||0) === 0 && (trackers?.score||0) === 0 && (profiling?.score||0) === 0;
|
|
40
|
-
if (allZero && r.score <= 5) {
|
|
41
|
-
await page.close().catch(() => {});
|
|
42
|
-
// Retry headed
|
|
43
|
-
const hp = await connect({ mode: 'headed' });
|
|
44
|
-
try {
|
|
45
|
-
await hp.injectCookies(full).catch(() => {});
|
|
46
|
-
const r2 = await Promise.race([
|
|
47
|
-
assess(hp, full, { timeout: 30000, settle: 3000 }),
|
|
48
|
-
new Promise((_, rej) => setTimeout(() => rej(new Error('timeout')), 35000))
|
|
49
|
-
]);
|
|
50
|
-
console.log(url + '|' + r2.score + '|' + r2.risk + '|HEADED|' + summarize(r2));
|
|
51
|
-
return;
|
|
52
|
-
} finally {
|
|
53
|
-
await hp.close().catch(() => {});
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
console.log(url + '|' + r.score + '|' + r.risk + '|HEADLESS|' + summarize(r));
|
|
57
|
-
} catch (e) {
|
|
58
|
-
await tab.close().catch(() => {});
|
|
59
|
-
if (e.message === 'timeout') {
|
|
60
|
-
// Try headed on timeout too
|
|
61
|
-
await page.close().catch(() => {});
|
|
62
|
-
try {
|
|
63
|
-
const hp = await connect({ mode: 'headed' });
|
|
64
|
-
try {
|
|
65
|
-
await hp.injectCookies(full).catch(() => {});
|
|
66
|
-
const r2 = await Promise.race([
|
|
67
|
-
assess(hp, full, { timeout: 30000, settle: 3000 }),
|
|
68
|
-
new Promise((_, rej) => setTimeout(() => rej(new Error('timeout')), 35000))
|
|
69
|
-
]);
|
|
70
|
-
console.log(url + '|' + r2.score + '|' + r2.risk + '|HEADED|' + summarize(r2));
|
|
71
|
-
return;
|
|
72
|
-
} finally {
|
|
73
|
-
await hp.close().catch(() => {});
|
|
74
|
-
}
|
|
75
|
-
} catch {
|
|
76
|
-
console.log(url + '|—|timeout|BOTH|Failed both modes');
|
|
77
|
-
return;
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
console.log(url + '|—|error|—|' + e.message);
|
|
81
|
-
} finally {
|
|
82
|
-
await page?.close().catch(() => {});
|
|
83
|
-
}
|
|
84
|
-
} catch (e) {
|
|
85
|
-
console.log(url + '|—|error|—|' + e.message);
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
function summarize(r) {
|
|
90
|
-
const parts = [];
|
|
91
|
-
const c = r.categories;
|
|
92
|
-
if (c.cookies?.score > 0) parts.push(c.cookies.summary);
|
|
93
|
-
if (c.network?.score > 0) parts.push(c.network.summary);
|
|
94
|
-
if (c.trackers?.score > 0) parts.push(c.trackers.summary);
|
|
95
|
-
if (c.profiling?.score > 0) parts.push(c.profiling.summary);
|
|
96
|
-
if (c.selling_data?.score > 0) parts.push(c.selling_data.summary);
|
|
97
|
-
if (c.pressure?.score > 0) parts.push('pressure:' + c.pressure.score);
|
|
98
|
-
if (c.watching?.score > 0) parts.push(c.watching.summary);
|
|
99
|
-
if (c.stored_data?.score > 0) parts.push(c.stored_data.summary);
|
|
100
|
-
if (c.terms?.score > 0) parts.push('terms:' + c.terms.summary);
|
|
101
|
-
if (parts.length === 0) parts.push('Clean');
|
|
102
|
-
return parts.join('; ');
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
for (const site of SITES) {
|
|
106
|
-
await scanSite(site);
|
|
107
|
-
}
|