webpeel 0.21.8 → 0.21.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
6
6
|
import { detectChallenge } from './challenge-detection.js';
|
|
7
7
|
import { getRealisticUserAgent } from './user-agents.js';
|
|
8
|
-
import { getRandomUserAgent, applyStealthScripts, takePooledPage, ensurePagePool, recyclePooledPage, getBrowser, getStealthBrowser, getProfileBrowser, PAGE_POOL_SIZE, MAX_CONCURRENT_PAGES, getPooledPagesCount, } from './browser-pool.js';
|
|
8
|
+
import { getRandomUserAgent, applyStealthScripts, takePooledPage, ensurePagePool, recyclePooledPage, getBrowser, getStealthBrowser, getStealthPlaywright, getProfileBrowser, PAGE_POOL_SIZE, MAX_CONCURRENT_PAGES, getPooledPagesCount, ANTI_DETECTION_ARGS, getRandomViewport, } from './browser-pool.js';
|
|
9
9
|
// Proprietary stealth module — gitignored, loaded conditionally
|
|
10
10
|
let applyStealthPatches;
|
|
11
11
|
let applyAcceptLanguageHeader;
|
|
@@ -90,6 +90,8 @@ export async function browserFetch(url, options = {}) {
|
|
|
90
90
|
const usingProfileBrowser = !!profileDir;
|
|
91
91
|
// Owned context created when storageState injection is requested
|
|
92
92
|
let ownedContext;
|
|
93
|
+
// Owned browser launched when proxy is specified (dedicated browser with proxy at launch level)
|
|
94
|
+
let ownedBrowser;
|
|
93
95
|
try {
|
|
94
96
|
const browser = usingProfileBrowser
|
|
95
97
|
? await getProfileBrowser(profileDir, headed, stealth)
|
|
@@ -135,11 +137,22 @@ export async function browserFetch(url, options = {}) {
|
|
|
135
137
|
log.debug('proxy URL parse failed, using as-is:', e instanceof Error ? e.message : e);
|
|
136
138
|
playwrightProxy = { server: proxy };
|
|
137
139
|
}
|
|
138
|
-
//
|
|
139
|
-
|
|
140
|
-
|
|
140
|
+
// Launch a DEDICATED fresh browser with proxy at the launch level.
|
|
141
|
+
// Context-level proxy is unreliable for anti-bot sites — they check the browser's
|
|
142
|
+
// IP at connection time (set at launch), not at context creation.
|
|
143
|
+
const pw = stealth ? await getStealthPlaywright() : (await import('playwright')).chromium;
|
|
144
|
+
const vp = getRandomViewport();
|
|
145
|
+
ownedBrowser = await pw.launch({
|
|
146
|
+
headless: true,
|
|
147
|
+
args: [...ANTI_DETECTION_ARGS, `--window-size=${vp.width},${vp.height}`],
|
|
141
148
|
proxy: playwrightProxy,
|
|
142
|
-
|
|
149
|
+
});
|
|
150
|
+
ownedContext = await ownedBrowser.newContext({
|
|
151
|
+
userAgent: validatedUserAgent || getRandomUserAgent(),
|
|
152
|
+
locale: 'en-US',
|
|
153
|
+
timezoneId: 'America/New_York',
|
|
154
|
+
javaScriptEnabled: true,
|
|
155
|
+
viewport: { width: effectiveViewportWidth || vp.width, height: effectiveViewportHeight || vp.height },
|
|
143
156
|
...(storageState ? { storageState } : {}),
|
|
144
157
|
});
|
|
145
158
|
page = await ownedContext.newPage();
|
|
@@ -340,6 +353,29 @@ export async function browserFetch(url, options = {}) {
|
|
|
340
353
|
await page.waitForTimeout(extraDelayMs);
|
|
341
354
|
throwIfAborted();
|
|
342
355
|
}
|
|
356
|
+
// Human-like delay for proxied requests (helps bypass bot detection on strict sites)
|
|
357
|
+
if (proxy) {
|
|
358
|
+
// Realistic human behavior to bypass behavioral analysis
|
|
359
|
+
const humanDelay = 800 + Math.random() * 1200;
|
|
360
|
+
await page.waitForTimeout(humanDelay);
|
|
361
|
+
throwIfAborted();
|
|
362
|
+
// Realistic mouse movement (simulate human cursor)
|
|
363
|
+
try {
|
|
364
|
+
const vw = await page.evaluate(() => window.innerWidth);
|
|
365
|
+
const vh = await page.evaluate(() => window.innerHeight);
|
|
366
|
+
await page.mouse.move(100 + Math.random() * (vw - 200), 100 + Math.random() * (vh - 200), { steps: 5 + Math.floor(Math.random() * 10) });
|
|
367
|
+
// Small scroll to trigger lazy-loaded content
|
|
368
|
+
await page.evaluate(() => window.scrollBy(0, 200 + Math.random() * 400));
|
|
369
|
+
await page.waitForTimeout(300 + Math.random() * 500);
|
|
370
|
+
throwIfAborted();
|
|
371
|
+
// Second mouse move
|
|
372
|
+
await page.mouse.move(50 + Math.random() * (vw - 100), 50 + Math.random() * (vh - 100), { steps: 3 + Math.floor(Math.random() * 5) });
|
|
373
|
+
}
|
|
374
|
+
catch {
|
|
375
|
+
// Non-fatal: mouse/scroll simulation failed
|
|
376
|
+
}
|
|
377
|
+
throwIfAborted();
|
|
378
|
+
}
|
|
343
379
|
// Wait for additional time if requested (for dynamic content / screenshots)
|
|
344
380
|
if (waitMs > 0) {
|
|
345
381
|
await page.waitForTimeout(waitMs);
|
|
@@ -447,7 +483,8 @@ export async function browserFetch(url, options = {}) {
|
|
|
447
483
|
contentType: fetchContentType,
|
|
448
484
|
screenshot: screenshotBuffer,
|
|
449
485
|
page,
|
|
450
|
-
browser
|
|
486
|
+
// Use ownedBrowser for proxy case, otherwise the shared browser
|
|
487
|
+
browser: ownedBrowser ?? browser,
|
|
451
488
|
...(fetchAutoInteract !== undefined ? { autoInteract: fetchAutoInteract } : {}),
|
|
452
489
|
};
|
|
453
490
|
}
|
|
@@ -492,6 +529,10 @@ export async function browserFetch(url, options = {}) {
|
|
|
492
529
|
await page.close().catch(() => { });
|
|
493
530
|
}
|
|
494
531
|
}
|
|
532
|
+
// Close the dedicated proxy browser if one was launched (not when keeping page open)
|
|
533
|
+
if (ownedBrowser && !keepPageOpen) {
|
|
534
|
+
await ownedBrowser.close().catch(() => { });
|
|
535
|
+
}
|
|
495
536
|
activePagesCount--;
|
|
496
537
|
}
|
|
497
538
|
}
|
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
* Handles Playwright loading, browser instances, and the idle page pool.
|
|
4
4
|
*/
|
|
5
5
|
import type { Browser, Page } from 'playwright';
|
|
6
|
+
type ChromiumType = typeof import('playwright').chromium;
|
|
6
7
|
import { closePool } from './http-fetch.js';
|
|
7
8
|
export { closePool };
|
|
8
9
|
/** Whether Playwright has been loaded (for diagnostics). */
|
|
9
10
|
export declare let playwrightLoaded: boolean;
|
|
11
|
+
export declare function getStealthPlaywright(): Promise<ChromiumType>;
|
|
10
12
|
/**
|
|
11
13
|
* Returns a realistic Chrome user agent.
|
|
12
14
|
* Delegates to the curated user-agents module so stealth mode never exposes
|
|
@@ -20,7 +20,7 @@ async function getPlaywright() {
|
|
|
20
20
|
}
|
|
21
21
|
return _chromium;
|
|
22
22
|
}
|
|
23
|
-
async function getStealthPlaywright() {
|
|
23
|
+
export async function getStealthPlaywright() {
|
|
24
24
|
if (!_stealthChromium) {
|
|
25
25
|
const pwExtra = await import('playwright-extra');
|
|
26
26
|
const StealthPlugin = (await import('puppeteer-extra-plugin-stealth')).default;
|
|
@@ -54,7 +54,7 @@ export const ANTI_DETECTION_ARGS = [
|
|
|
54
54
|
'--disable-gpu',
|
|
55
55
|
'--start-maximized',
|
|
56
56
|
// Chrome branding / stealth hardening
|
|
57
|
-
'--disable-features=ChromeUserAgentDataBranding',
|
|
57
|
+
'--disable-features=ChromeUserAgentDataBranding,IsolateOrigins,site-per-process',
|
|
58
58
|
'--disable-component-extensions-with-background-pages',
|
|
59
59
|
'--disable-default-apps',
|
|
60
60
|
'--disable-extensions',
|
|
@@ -143,6 +143,95 @@ export async function applyStealthScripts(page) {
|
|
|
143
143
|
});
|
|
144
144
|
})();
|
|
145
145
|
`);
|
|
146
|
+
// 3. Hide navigator.webdriver (THE #1 BOT SIGNAL)
|
|
147
|
+
await page.addInitScript(`
|
|
148
|
+
Object.defineProperty(navigator, 'webdriver', {
|
|
149
|
+
get: () => false,
|
|
150
|
+
configurable: true,
|
|
151
|
+
});
|
|
152
|
+
try { delete Object.getPrototypeOf(navigator).webdriver; } catch (e) {}
|
|
153
|
+
`);
|
|
154
|
+
// 4. Fake navigator.plugins (empty = bot signal, real Chrome has plugins)
|
|
155
|
+
await page.addInitScript(`
|
|
156
|
+
Object.defineProperty(navigator, 'plugins', {
|
|
157
|
+
get: () => {
|
|
158
|
+
var arr = [
|
|
159
|
+
{ name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
|
|
160
|
+
{ name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '' },
|
|
161
|
+
{ name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
|
|
162
|
+
];
|
|
163
|
+
arr.item = function(i) { return arr[i] || null; };
|
|
164
|
+
arr.namedItem = function(n) { return arr.find(function(p) { return p.name === n; }) || null; };
|
|
165
|
+
arr.refresh = function() {};
|
|
166
|
+
return arr;
|
|
167
|
+
},
|
|
168
|
+
configurable: true,
|
|
169
|
+
});
|
|
170
|
+
`);
|
|
171
|
+
// 5. Fake navigator.languages
|
|
172
|
+
await page.addInitScript(`
|
|
173
|
+
Object.defineProperty(navigator, 'languages', {
|
|
174
|
+
get: () => ['en-US', 'en'],
|
|
175
|
+
configurable: true,
|
|
176
|
+
});
|
|
177
|
+
`);
|
|
178
|
+
// 6. Fake window.chrome object (missing in headless = detected)
|
|
179
|
+
await page.addInitScript(`
|
|
180
|
+
if (!window.chrome) {
|
|
181
|
+
window.chrome = {
|
|
182
|
+
app: {
|
|
183
|
+
isInstalled: false,
|
|
184
|
+
InstallState: { INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
|
|
185
|
+
RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' }
|
|
186
|
+
},
|
|
187
|
+
runtime: {
|
|
188
|
+
OnInstalledReason: {}, OnRestartRequiredReason: {}, PlatformArch: {},
|
|
189
|
+
PlatformNaclArch: {}, PlatformOs: {}, RequestUpdateCheckStatus: {},
|
|
190
|
+
connect: function() {}, sendMessage: function() {}
|
|
191
|
+
},
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
`);
|
|
195
|
+
// 7. Fix permissions query (notifications should be 'prompt' not 'denied')
|
|
196
|
+
await page.addInitScript(`
|
|
197
|
+
try {
|
|
198
|
+
var originalQuery = window.Permissions && window.Permissions.prototype && window.Permissions.prototype.query;
|
|
199
|
+
if (originalQuery) {
|
|
200
|
+
window.Permissions.prototype.query = function(params) {
|
|
201
|
+
if (params && params.name === 'notifications') {
|
|
202
|
+
return Promise.resolve({ state: Notification.permission });
|
|
203
|
+
}
|
|
204
|
+
return originalQuery.call(this, params);
|
|
205
|
+
};
|
|
206
|
+
}
|
|
207
|
+
} catch (e) {}
|
|
208
|
+
`);
|
|
209
|
+
// 8. WebGL vendor/renderer spoofing (headless shows "Google SwiftShader")
|
|
210
|
+
await page.addInitScript(`
|
|
211
|
+
try {
|
|
212
|
+
var getParameter = WebGLRenderingContext.prototype.getParameter;
|
|
213
|
+
WebGLRenderingContext.prototype.getParameter = function(parameter) {
|
|
214
|
+
if (parameter === 37445) return 'Intel Inc.';
|
|
215
|
+
if (parameter === 37446) return 'Intel Iris OpenGL Engine';
|
|
216
|
+
return getParameter.call(this, parameter);
|
|
217
|
+
};
|
|
218
|
+
if (typeof WebGL2RenderingContext !== 'undefined') {
|
|
219
|
+
var getParameter2 = WebGL2RenderingContext.prototype.getParameter;
|
|
220
|
+
WebGL2RenderingContext.prototype.getParameter = function(parameter) {
|
|
221
|
+
if (parameter === 37445) return 'Intel Inc.';
|
|
222
|
+
if (parameter === 37446) return 'Intel Iris OpenGL Engine';
|
|
223
|
+
return getParameter2.call(this, parameter);
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
} catch (e) {}
|
|
227
|
+
`);
|
|
228
|
+
// 9. Hide automation-related properties
|
|
229
|
+
await page.addInitScript(`
|
|
230
|
+
try { Object.defineProperty(document, '$cdc_asdjflasutopfhvcZLmcfl_', { get: () => undefined }); } catch (e) {}
|
|
231
|
+
try { delete window.callPhantom; } catch (e) {}
|
|
232
|
+
try { delete window._phantom; } catch (e) {}
|
|
233
|
+
try { delete window.__nightmare; } catch (e) {}
|
|
234
|
+
`);
|
|
146
235
|
}
|
|
147
236
|
// ── Page pool constants & state ───────────────────────────────────────────────
|
|
148
237
|
export const MAX_CONCURRENT_PAGES = 5;
|
|
@@ -308,6 +308,7 @@ async function twitterExtractor(html, url) {
|
|
|
308
308
|
if (fxData && fxData.code === 200 && fxData.user) {
|
|
309
309
|
const u = fxData.user;
|
|
310
310
|
const structured = {
|
|
311
|
+
title: `${u.name || ''} (@${u.screen_name || ''}) on X/Twitter`,
|
|
311
312
|
name: u.name || '',
|
|
312
313
|
handle: '@' + (u.screen_name || ''),
|
|
313
314
|
bio: u.description || '',
|
|
@@ -1500,6 +1501,7 @@ async function npmExtractor(_html, url) {
|
|
|
1500
1501
|
}
|
|
1501
1502
|
catch { /* optional */ }
|
|
1502
1503
|
const structured = {
|
|
1504
|
+
title: `${data.name}@${latest || 'unknown'}`,
|
|
1503
1505
|
name: data.name,
|
|
1504
1506
|
description: data.description || '',
|
|
1505
1507
|
version: latest || 'unknown',
|
package/dist/core/strategies.js
CHANGED
|
@@ -76,6 +76,7 @@ function shouldForceBrowser(url) {
|
|
|
76
76
|
'chewy.com', // Amazon subsidiary
|
|
77
77
|
'aliexpress.com', // anti-bot
|
|
78
78
|
'wish.com', // anti-bot
|
|
79
|
+
'cargurus.com', // aggressive bot detection
|
|
79
80
|
];
|
|
80
81
|
for (const domain of stealthDomains) {
|
|
81
82
|
if (hostname === domain || hostname.endsWith(`.${domain}`)) {
|
|
@@ -296,7 +296,13 @@ function heuristicExtractNumber(fieldName, content) {
|
|
|
296
296
|
}
|
|
297
297
|
// Year
|
|
298
298
|
if (/year/.test(lf)) {
|
|
299
|
-
|
|
299
|
+
// Match 4-digit years (1900-2099), prefer explicit "Year: YYYY" pattern first
|
|
300
|
+
const explicit = content.match(/\bYear[:\s]+(\d{4})\b/i);
|
|
301
|
+
if (explicit?.[1]) {
|
|
302
|
+
const n = parseInt(explicit[1]);
|
|
303
|
+
return isNaN(n) ? null : n;
|
|
304
|
+
}
|
|
305
|
+
const m = content.match(/\b((?:19|20)\d{2})\b/);
|
|
300
306
|
if (m?.[1]) {
|
|
301
307
|
const n = parseInt(m[1]);
|
|
302
308
|
return isNaN(n) ? null : n;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.8",
|
|
3
|
+
"version": "0.21.10",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|