barebrowse 0.5.7 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,26 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.5.8
4
+
5
+ Bot challenge detection for all browsing, not just assess.
6
+
7
+ ### Bot detection (`src/index.js`)
8
+ - `isChallengePage()` now checks ARIA node count (<50 = bot-blocked) in addition to text length and phrase matching
9
+ - `botBlocked` property exposed on both `connect()` pages and `createTab()` tabs
10
+ - `goto()` on main page and tabs sets `botBlocked` after every navigation
11
+ - `snapshot()` prepends `[BOT CHALLENGE DETECTED]` warning line when flagged
12
+ - Hybrid fallback on main page now uses node count for more reliable detection
13
+
14
+ ### Assess handler (`mcp-server.js`)
15
+ - Headed fallback now uses the `tab.botBlocked` flag instead of the naive score threshold (overall score ≤5 with network, trackers, and profiling scores all zero)
16
+ - Previously: sites like Reuters, Home Depot, Leboncoin returned fake-clean scores because the bot challenge page looked "clean" to the scanner
17
+ - Now: node count catches every bot-blocked page regardless of score
18
+
19
+ ### Tested
20
+ - reuters.com, homedepot.com, leboncoin.fr, idealista.com all correctly flagged `botBlocked: true`
21
+ - svt.se, whatsapp.com, google.com correctly flagged `false`
22
+ - 71/71 tests passing
23
+
3
24
  ## 0.5.7
4
25
 
5
26
  MCP server crash resilience + process hardening.
@@ -1,7 +1,7 @@
1
1
  # barebrowse -- Integration Guide
2
2
 
3
3
  > For AI assistants and developers wiring barebrowse into a project.
4
- > v0.5.7 | Node.js >= 22 | 0 required deps | MIT
4
+ > v0.5.8 | Node.js >= 22 | 0 required deps | MIT
5
5
 
6
6
  ## What this is
7
7
 
package/mcp-server.js CHANGED
@@ -317,8 +317,9 @@ async function handleToolCall(name, args) {
317
317
  }),
318
318
  ]);
319
319
  clearTimeout(timer);
320
+ const wasBotBlocked = tab.botBlocked;
320
321
  await tab.close().catch(() => {});
321
- return JSON.stringify(result, null, 2);
322
+ return { result, botBlocked: wasBotBlocked };
322
323
  } catch (err) {
323
324
  clearTimeout(timer);
324
325
  await tab.close().catch(() => {});
@@ -328,29 +329,23 @@ async function handleToolCall(name, args) {
328
329
 
329
330
  // Try headless first
330
331
  try {
331
- const result = await runAssess(false);
332
- // Check if result looks bot-blocked (score 0-5, no trackers, few cookies)
333
- try {
334
- const parsed = JSON.parse(result);
335
- const { network, trackers, profiling } = parsed.categories || {};
336
- const allZero = (network?.score || 0) === 0
337
- && (trackers?.score || 0) === 0
338
- && (profiling?.score || 0) === 0;
339
- if (allZero && (parsed.score || 0) <= 5) {
340
- // Likely bot-blocked — retry headed
341
- try {
342
- return await runAssess(true);
343
- } catch {
344
- return result; // headed failed, return headless result
345
- }
332
+ const { result, botBlocked } = await runAssess(false);
333
+ if (botBlocked) {
334
+ // Bot-blocked in headless — retry headed
335
+ try {
336
+ const headed = await runAssess(true);
337
+ return JSON.stringify(headed.result, null, 2);
338
+ } catch {
339
+ return JSON.stringify(result, null, 2); // headed failed, return headless result
346
340
  }
347
- } catch {}
348
- return result;
341
+ }
342
+ return JSON.stringify(result, null, 2);
349
343
  } catch (err) {
350
344
  if (isCdpDead(err)) _page = null;
351
345
  // Headless crashed — try headed
352
346
  try {
353
- return await runAssess(true);
347
+ const headed = await runAssess(true);
348
+ return JSON.stringify(headed.result, null, 2);
354
349
  } catch (retryErr) {
355
350
  throw retryErr;
356
351
  }
@@ -379,7 +374,7 @@ async function handleMessage(msg) {
379
374
  return jsonrpcResponse(id, {
380
375
  protocolVersion: '2024-11-05',
381
376
  capabilities: { tools: {} },
382
- serverInfo: { name: 'barebrowse', version: '0.5.7' },
377
+ serverInfo: { name: 'barebrowse', version: '0.5.8' },
383
378
  });
384
379
  }
385
380
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "barebrowse",
3
- "version": "0.5.7",
3
+ "version": "0.5.8",
4
4
  "description": "Authenticated web browsing for autonomous agents via CDP. URL in, pruned ARIA snapshot out.",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
package/src/index.js CHANGED
@@ -73,10 +73,10 @@ export async function browse(url, opts = {}) {
73
73
  }
74
74
 
75
75
  // Step 5: Get ARIA tree
76
- let { tree } = await ariaTree(page);
76
+ let { tree, nodeCount } = await ariaTree(page);
77
77
 
78
78
  // Step 5.5: Hybrid fallback — if headless was bot-blocked, retry headed
79
- if (mode === 'hybrid' && isChallengePage(tree)) {
79
+ if (mode === 'hybrid' && isChallengePage(tree, nodeCount)) {
80
80
  await cdp.send('Target.closeTarget', { targetId: page.targetId });
81
81
  cdp.close();
82
82
  if (browser) { browser.process.kill(); browser = null; }
@@ -140,6 +140,7 @@ export async function connect(opts = {}) {
140
140
 
141
141
  let page = await createPage(cdp, mode !== 'headed', { viewport: opts.viewport });
142
142
  let refMap = new Map();
143
+ let botBlocked = false;
143
144
 
144
145
  // Suppress permission prompts for all modes
145
146
  await suppressPermissions(cdp);
@@ -179,23 +180,28 @@ export async function connect(opts = {}) {
179
180
  await dismissConsent(page.session);
180
181
  }
181
182
 
183
+ // Check for bot challenge
184
+ const { tree, nodeCount } = await ariaTree(page);
185
+ botBlocked = isChallengePage(tree, nodeCount);
186
+
182
187
  // Hybrid fallback: if bot-blocked, retry with headed browser
183
- if (mode === 'hybrid') {
184
- const { tree } = await ariaTree(page);
185
- if (isChallengePage(tree)) {
186
- await cdp.send('Target.closeTarget', { targetId: page.targetId });
187
- cdp.close();
188
- if (browser) { browser.process.kill(); browser = null; }
189
-
190
- const port = opts.port || 9222;
191
- const wsUrl = await getDebugUrl(port);
192
- cdp = await createCDP(wsUrl);
193
- page = await createPage(cdp, false, { viewport: opts.viewport });
194
- setupDialogHandler(page.session);
195
- await suppressPermissions(cdp);
196
- await navigate(page, url, timeout);
197
- if (opts.consent !== false) await dismissConsent(page.session);
198
- }
188
+ if (botBlocked && mode === 'hybrid') {
189
+ await cdp.send('Target.closeTarget', { targetId: page.targetId });
190
+ cdp.close();
191
+ if (browser) { browser.process.kill(); browser = null; }
192
+
193
+ const port = opts.port || 9222;
194
+ const wsUrl = await getDebugUrl(port);
195
+ cdp = await createCDP(wsUrl);
196
+ page = await createPage(cdp, false, { viewport: opts.viewport });
197
+ setupDialogHandler(page.session);
198
+ await suppressPermissions(cdp);
199
+ await navigate(page, url, timeout);
200
+ if (opts.consent !== false) await dismissConsent(page.session);
201
+
202
+ // Re-check after headed fallback
203
+ const after = await ariaTree(page);
204
+ botBlocked = isChallengePage(after.tree, after.nodeCount);
199
205
  }
200
206
  },
201
207
 
@@ -223,11 +229,12 @@ export async function connect(opts = {}) {
223
229
  const raw = formatTree(result.tree);
224
230
  const { currentIndex, entries } = await page.session.send('Page.getNavigationHistory');
225
231
  const pageUrl = entries[currentIndex]?.url || '';
226
- if (pruneOpts === false) return `url: ${pageUrl}\n` + raw;
232
+ const warn = botBlocked ? '[BOT CHALLENGE DETECTED — page content may be incomplete or blocked]\n' : '';
233
+ if (pruneOpts === false) return `url: ${pageUrl}\n` + warn + raw;
227
234
  const pruned = pruneTree(result.tree, { mode: pruneOpts?.mode || 'act' });
228
235
  const out = formatTree(pruned);
229
236
  const stats = `url: ${pageUrl}\n${raw.length.toLocaleString()} chars → ${out.length.toLocaleString()} chars (${Math.round((1 - out.length / raw.length) * 100)}% pruned)`;
230
- return stats + '\n' + out;
237
+ return stats + '\n' + warn + out;
231
238
  },
232
239
 
233
240
  async click(ref) {
@@ -334,6 +341,8 @@ export async function connect(opts = {}) {
334
341
  writeFileSync(filePath, JSON.stringify(state, null, 2));
335
342
  },
336
343
 
344
+ get botBlocked() { return botBlocked; },
345
+
337
346
  dialogLog,
338
347
 
339
348
  async screenshot(screenshotOpts = {}) {
@@ -368,13 +377,17 @@ export async function connect(opts = {}) {
368
377
  async createTab() {
369
378
  const tab = await createPage(cdp, mode !== 'headed', { viewport: opts.viewport });
370
379
  await suppressPermissions(cdp);
380
+ let tabBotBlocked = false;
371
381
  return {
372
382
  async goto(url, timeout = 30000) {
373
383
  await navigate(tab, url, timeout);
374
384
  if (opts.consent !== false) {
375
385
  await dismissConsent(tab.session);
376
386
  }
387
+ const { tree, nodeCount } = await ariaTree(tab);
388
+ tabBotBlocked = isChallengePage(tree, nodeCount);
377
389
  },
390
+ get botBlocked() { return tabBotBlocked; },
378
391
  async injectCookies(url, cookieOpts) {
379
392
  await authenticate(tab.session, url, { browser: cookieOpts?.browser });
380
393
  },
@@ -485,7 +498,7 @@ async function ariaTree(page) {
485
498
  }
486
499
  }
487
500
 
488
- return { tree, refMap };
501
+ return { tree, refMap, nodeCount: nodes.length };
489
502
  }
490
503
 
491
504
  /**
@@ -583,10 +596,14 @@ function waitForNetworkIdle(session, opts = {}) {
583
596
 
584
597
  /**
585
598
  * Detect if a page is a bot-challenge page (Cloudflare, etc.).
586
- * Heuristic: very short ARIA tree + known challenge phrases.
599
+ * Heuristic: low ARIA node count, short text, or known challenge phrases.
600
+ * @param {object} tree - Nested ARIA tree (from buildTree)
601
+ * @param {number} [nodeCount] - Raw CDP node count (from Accessibility.getFullAXTree)
587
602
  */
588
- function isChallengePage(tree) {
603
+ function isChallengePage(tree, nodeCount) {
589
604
  if (!tree) return true;
605
+ // Real pages have 50+ ARIA nodes; bot challenges have <20. Anything under 50 is treated as a challenge.
606
+ if (nodeCount !== undefined && nodeCount < 50) return true;
590
607
  const text = flattenTreeText(tree);
591
608
  // Near-empty pages are almost certainly blocks
592
609
  if (text.trim().length < 50) return true;
package/rescan.mjs DELETED
@@ -1,107 +0,0 @@
1
- import { connect } from '/home/hamr/PycharmProjects/barebrowse/src/index.js';
2
- import { assess } from 'wearehere';
3
-
4
- const SITES = [
5
- // NL timeouts
6
- 'rtv.nl', 'bijenkorf.nl', 'jumbo.com', 'klm.nl',
7
- // EU timeouts
8
- 'zalando.de', 'otto.de', 'allegro.pl',
9
- // Suspected bot-blocked zeros (NL)
10
- 'coolblue.nl', 'rabobank.nl', 'telegraaf.nl', 'wehkamp.nl',
11
- 'ing.nl', 'kvk.nl', 'thuisbezorgd.nl', 'transavia.com', 'schiphol.nl',
12
- // Suspected bot-blocked zeros (EU)
13
- 'lufthansa.com', 'fnac.com', 'svt.se', 'revolut.com',
14
- 'leboncoin.fr', 'subito.it', 'idealista.com',
15
- // Suspected bot-blocked zeros (US)
16
- 'washingtonpost.com', 'usatoday.com', 'etsy.com', 'costco.com',
17
- 'homedepot.com', 'chatgpt.com', 'doordash.com', 'yelp.com', 'reuters.com',
18
- 'snapchat.com', 'cnn.com'
19
- ];
20
-
21
- async function scanSite(url) {
22
- const full = 'https://www.' + url;
23
-
24
- // Try headless first
25
- let page;
26
- try {
27
- page = await connect({ mode: 'hybrid' });
28
- const tab = await page.createTab();
29
- try {
30
- await tab.injectCookies(full).catch(() => {});
31
- const r = await Promise.race([
32
- assess(tab, full, { timeout: 30000, settle: 3000 }),
33
- new Promise((_, rej) => setTimeout(() => rej(new Error('timeout')), 35000))
34
- ]);
35
- await tab.close().catch(() => {});
36
-
37
- // Check if bot-blocked
38
- const { network, trackers, profiling } = r.categories;
39
- const allZero = (network?.score||0) === 0 && (trackers?.score||0) === 0 && (profiling?.score||0) === 0;
40
- if (allZero && r.score <= 5) {
41
- await page.close().catch(() => {});
42
- // Retry headed
43
- const hp = await connect({ mode: 'headed' });
44
- try {
45
- await hp.injectCookies(full).catch(() => {});
46
- const r2 = await Promise.race([
47
- assess(hp, full, { timeout: 30000, settle: 3000 }),
48
- new Promise((_, rej) => setTimeout(() => rej(new Error('timeout')), 35000))
49
- ]);
50
- console.log(url + '|' + r2.score + '|' + r2.risk + '|HEADED|' + summarize(r2));
51
- return;
52
- } finally {
53
- await hp.close().catch(() => {});
54
- }
55
- }
56
- console.log(url + '|' + r.score + '|' + r.risk + '|HEADLESS|' + summarize(r));
57
- } catch (e) {
58
- await tab.close().catch(() => {});
59
- if (e.message === 'timeout') {
60
- // Try headed on timeout too
61
- await page.close().catch(() => {});
62
- try {
63
- const hp = await connect({ mode: 'headed' });
64
- try {
65
- await hp.injectCookies(full).catch(() => {});
66
- const r2 = await Promise.race([
67
- assess(hp, full, { timeout: 30000, settle: 3000 }),
68
- new Promise((_, rej) => setTimeout(() => rej(new Error('timeout')), 35000))
69
- ]);
70
- console.log(url + '|' + r2.score + '|' + r2.risk + '|HEADED|' + summarize(r2));
71
- return;
72
- } finally {
73
- await hp.close().catch(() => {});
74
- }
75
- } catch {
76
- console.log(url + '|—|timeout|BOTH|Failed both modes');
77
- return;
78
- }
79
- }
80
- console.log(url + '|—|error|—|' + e.message);
81
- } finally {
82
- await page?.close().catch(() => {});
83
- }
84
- } catch (e) {
85
- console.log(url + '|—|error|—|' + e.message);
86
- }
87
- }
88
-
89
- function summarize(r) {
90
- const parts = [];
91
- const c = r.categories;
92
- if (c.cookies?.score > 0) parts.push(c.cookies.summary);
93
- if (c.network?.score > 0) parts.push(c.network.summary);
94
- if (c.trackers?.score > 0) parts.push(c.trackers.summary);
95
- if (c.profiling?.score > 0) parts.push(c.profiling.summary);
96
- if (c.selling_data?.score > 0) parts.push(c.selling_data.summary);
97
- if (c.pressure?.score > 0) parts.push('pressure:' + c.pressure.score);
98
- if (c.watching?.score > 0) parts.push(c.watching.summary);
99
- if (c.stored_data?.score > 0) parts.push(c.stored_data.summary);
100
- if (c.terms?.score > 0) parts.push('terms:' + c.terms.summary);
101
- if (parts.length === 0) parts.push('Clean');
102
- return parts.join('; ');
103
- }
104
-
105
- for (const site of SITES) {
106
- await scanSite(site);
107
- }