webpeel 0.21.8 → 0.21.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@
5
5
  import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
6
6
  import { detectChallenge } from './challenge-detection.js';
7
7
  import { getRealisticUserAgent } from './user-agents.js';
8
- import { getRandomUserAgent, applyStealthScripts, takePooledPage, ensurePagePool, recyclePooledPage, getBrowser, getStealthBrowser, getProfileBrowser, PAGE_POOL_SIZE, MAX_CONCURRENT_PAGES, getPooledPagesCount, } from './browser-pool.js';
8
+ import { getRandomUserAgent, applyStealthScripts, takePooledPage, ensurePagePool, recyclePooledPage, getBrowser, getStealthBrowser, getStealthPlaywright, getProfileBrowser, PAGE_POOL_SIZE, MAX_CONCURRENT_PAGES, getPooledPagesCount, ANTI_DETECTION_ARGS, getRandomViewport, } from './browser-pool.js';
9
9
  // Proprietary stealth module — gitignored, loaded conditionally
10
10
  let applyStealthPatches;
11
11
  let applyAcceptLanguageHeader;
@@ -90,6 +90,8 @@ export async function browserFetch(url, options = {}) {
90
90
  const usingProfileBrowser = !!profileDir;
91
91
  // Owned context created when storageState injection is requested
92
92
  let ownedContext;
93
+ // Owned browser launched when proxy is specified (dedicated browser with proxy at launch level)
94
+ let ownedBrowser;
93
95
  try {
94
96
  const browser = usingProfileBrowser
95
97
  ? await getProfileBrowser(profileDir, headed, stealth)
@@ -135,11 +137,22 @@ export async function browserFetch(url, options = {}) {
135
137
  log.debug('proxy URL parse failed, using as-is:', e instanceof Error ? e.message : e);
136
138
  playwrightProxy = { server: proxy };
137
139
  }
138
- // Create an isolated context with the proxy and optional storageState
139
- ownedContext = await browser.newContext({
140
- ...pageOptions,
140
+ // Launch a DEDICATED fresh browser with proxy at the launch level.
141
+ // Context-level proxy is unreliable for anti-bot sites — they check the browser's
142
+ // IP at connection time (set at launch), not at context creation.
143
+ const pw = stealth ? await getStealthPlaywright() : (await import('playwright')).chromium;
144
+ const vp = getRandomViewport();
145
+ ownedBrowser = await pw.launch({
146
+ headless: true,
147
+ args: [...ANTI_DETECTION_ARGS, `--window-size=${vp.width},${vp.height}`],
141
148
  proxy: playwrightProxy,
142
- viewport: { width: effectiveViewportWidth, height: effectiveViewportHeight },
149
+ });
150
+ ownedContext = await ownedBrowser.newContext({
151
+ userAgent: validatedUserAgent || getRandomUserAgent(),
152
+ locale: 'en-US',
153
+ timezoneId: 'America/New_York',
154
+ javaScriptEnabled: true,
155
+ viewport: { width: effectiveViewportWidth || vp.width, height: effectiveViewportHeight || vp.height },
143
156
  ...(storageState ? { storageState } : {}),
144
157
  });
145
158
  page = await ownedContext.newPage();
@@ -340,6 +353,29 @@ export async function browserFetch(url, options = {}) {
340
353
  await page.waitForTimeout(extraDelayMs);
341
354
  throwIfAborted();
342
355
  }
356
+ // Human-like delay for proxied requests (helps bypass bot detection on strict sites)
357
+ if (proxy) {
358
+ // Realistic human behavior to bypass behavioral analysis
359
+ const humanDelay = 800 + Math.random() * 1200;
360
+ await page.waitForTimeout(humanDelay);
361
+ throwIfAborted();
362
+ // Realistic mouse movement (simulate human cursor)
363
+ try {
364
+ const vw = await page.evaluate(() => window.innerWidth);
365
+ const vh = await page.evaluate(() => window.innerHeight);
366
+ await page.mouse.move(100 + Math.random() * (vw - 200), 100 + Math.random() * (vh - 200), { steps: 5 + Math.floor(Math.random() * 10) });
367
+ // Small scroll to trigger lazy-loaded content
368
+ await page.evaluate(() => window.scrollBy(0, 200 + Math.random() * 400));
369
+ await page.waitForTimeout(300 + Math.random() * 500);
370
+ throwIfAborted();
371
+ // Second mouse move
372
+ await page.mouse.move(50 + Math.random() * (vw - 100), 50 + Math.random() * (vh - 100), { steps: 3 + Math.floor(Math.random() * 5) });
373
+ }
374
+ catch {
375
+ // Non-fatal: mouse/scroll simulation failed
376
+ }
377
+ throwIfAborted();
378
+ }
343
379
  // Wait for additional time if requested (for dynamic content / screenshots)
344
380
  if (waitMs > 0) {
345
381
  await page.waitForTimeout(waitMs);
@@ -447,7 +483,8 @@ export async function browserFetch(url, options = {}) {
447
483
  contentType: fetchContentType,
448
484
  screenshot: screenshotBuffer,
449
485
  page,
450
- browser,
486
+ // Use ownedBrowser for proxy case, otherwise the shared browser
487
+ browser: ownedBrowser ?? browser,
451
488
  ...(fetchAutoInteract !== undefined ? { autoInteract: fetchAutoInteract } : {}),
452
489
  };
453
490
  }
@@ -492,6 +529,10 @@ export async function browserFetch(url, options = {}) {
492
529
  await page.close().catch(() => { });
493
530
  }
494
531
  }
532
+ // Close the dedicated proxy browser if one was launched (not when keeping page open)
533
+ if (ownedBrowser && !keepPageOpen) {
534
+ await ownedBrowser.close().catch(() => { });
535
+ }
495
536
  activePagesCount--;
496
537
  }
497
538
  }
@@ -3,10 +3,12 @@
3
3
  * Handles Playwright loading, browser instances, and the idle page pool.
4
4
  */
5
5
  import type { Browser, Page } from 'playwright';
6
+ type ChromiumType = typeof import('playwright').chromium;
6
7
  import { closePool } from './http-fetch.js';
7
8
  export { closePool };
8
9
  /** Whether Playwright has been loaded (for diagnostics). */
9
10
  export declare let playwrightLoaded: boolean;
11
+ export declare function getStealthPlaywright(): Promise<ChromiumType>;
10
12
  /**
11
13
  * Returns a realistic Chrome user agent.
12
14
  * Delegates to the curated user-agents module so stealth mode never exposes
@@ -20,7 +20,7 @@ async function getPlaywright() {
20
20
  }
21
21
  return _chromium;
22
22
  }
23
- async function getStealthPlaywright() {
23
+ export async function getStealthPlaywright() {
24
24
  if (!_stealthChromium) {
25
25
  const pwExtra = await import('playwright-extra');
26
26
  const StealthPlugin = (await import('puppeteer-extra-plugin-stealth')).default;
@@ -54,7 +54,7 @@ export const ANTI_DETECTION_ARGS = [
54
54
  '--disable-gpu',
55
55
  '--start-maximized',
56
56
  // Chrome branding / stealth hardening
57
- '--disable-features=ChromeUserAgentDataBranding',
57
+ '--disable-features=ChromeUserAgentDataBranding,IsolateOrigins,site-per-process',
58
58
  '--disable-component-extensions-with-background-pages',
59
59
  '--disable-default-apps',
60
60
  '--disable-extensions',
@@ -143,6 +143,95 @@ export async function applyStealthScripts(page) {
143
143
  });
144
144
  })();
145
145
  `);
146
+ // 3. Hide navigator.webdriver (THE #1 BOT SIGNAL)
147
+ await page.addInitScript(`
148
+ Object.defineProperty(navigator, 'webdriver', {
149
+ get: () => false,
150
+ configurable: true,
151
+ });
152
+ try { delete Object.getPrototypeOf(navigator).webdriver; } catch (e) {}
153
+ `);
154
+ // 4. Fake navigator.plugins (empty = bot signal, real Chrome has plugins)
155
+ await page.addInitScript(`
156
+ Object.defineProperty(navigator, 'plugins', {
157
+ get: () => {
158
+ var arr = [
159
+ { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
160
+ { name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '' },
161
+ { name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
162
+ ];
163
+ arr.item = function(i) { return arr[i] || null; };
164
+ arr.namedItem = function(n) { return arr.find(function(p) { return p.name === n; }) || null; };
165
+ arr.refresh = function() {};
166
+ return arr;
167
+ },
168
+ configurable: true,
169
+ });
170
+ `);
171
+ // 5. Fake navigator.languages
172
+ await page.addInitScript(`
173
+ Object.defineProperty(navigator, 'languages', {
174
+ get: () => ['en-US', 'en'],
175
+ configurable: true,
176
+ });
177
+ `);
178
+ // 6. Fake window.chrome object (missing in headless = detected)
179
+ await page.addInitScript(`
180
+ if (!window.chrome) {
181
+ window.chrome = {
182
+ app: {
183
+ isInstalled: false,
184
+ InstallState: { INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
185
+ RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' }
186
+ },
187
+ runtime: {
188
+ OnInstalledReason: {}, OnRestartRequiredReason: {}, PlatformArch: {},
189
+ PlatformNaclArch: {}, PlatformOs: {}, RequestUpdateCheckStatus: {},
190
+ connect: function() {}, sendMessage: function() {}
191
+ },
192
+ };
193
+ }
194
+ `);
195
+ // 7. Fix permissions query (notifications should be 'prompt' not 'denied')
196
+ await page.addInitScript(`
197
+ try {
198
+ var originalQuery = window.Permissions && window.Permissions.prototype && window.Permissions.prototype.query;
199
+ if (originalQuery) {
200
+ window.Permissions.prototype.query = function(params) {
201
+ if (params && params.name === 'notifications') {
202
+ return Promise.resolve({ state: Notification.permission });
203
+ }
204
+ return originalQuery.call(this, params);
205
+ };
206
+ }
207
+ } catch (e) {}
208
+ `);
209
+ // 8. WebGL vendor/renderer spoofing (headless shows "Google SwiftShader")
210
+ await page.addInitScript(`
211
+ try {
212
+ var getParameter = WebGLRenderingContext.prototype.getParameter;
213
+ WebGLRenderingContext.prototype.getParameter = function(parameter) {
214
+ if (parameter === 37445) return 'Intel Inc.';
215
+ if (parameter === 37446) return 'Intel Iris OpenGL Engine';
216
+ return getParameter.call(this, parameter);
217
+ };
218
+ if (typeof WebGL2RenderingContext !== 'undefined') {
219
+ var getParameter2 = WebGL2RenderingContext.prototype.getParameter;
220
+ WebGL2RenderingContext.prototype.getParameter = function(parameter) {
221
+ if (parameter === 37445) return 'Intel Inc.';
222
+ if (parameter === 37446) return 'Intel Iris OpenGL Engine';
223
+ return getParameter2.call(this, parameter);
224
+ };
225
+ }
226
+ } catch (e) {}
227
+ `);
228
+ // 9. Hide automation-related properties
229
+ await page.addInitScript(`
230
+ try { Object.defineProperty(document, '$cdc_asdjflasutopfhvcZLmcfl_', { get: () => undefined }); } catch (e) {}
231
+ try { delete window.callPhantom; } catch (e) {}
232
+ try { delete window._phantom; } catch (e) {}
233
+ try { delete window.__nightmare; } catch (e) {}
234
+ `);
146
235
  }
147
236
  // ── Page pool constants & state ───────────────────────────────────────────────
148
237
  export const MAX_CONCURRENT_PAGES = 5;
@@ -76,6 +76,7 @@ function shouldForceBrowser(url) {
76
76
  'chewy.com', // Amazon subsidiary
77
77
  'aliexpress.com', // anti-bot
78
78
  'wish.com', // anti-bot
79
+ 'cargurus.com', // aggressive bot detection
79
80
  ];
80
81
  for (const domain of stealthDomains) {
81
82
  if (hostname === domain || hostname.endsWith(`.${domain}`)) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.8",
3
+ "version": "0.21.9",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",