webpeel 0.21.7 → 0.21.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/browser-fetch.js +47 -6
- package/dist/core/browser-pool.d.ts +2 -0
- package/dist/core/browser-pool.js +91 -2
- package/dist/core/proxy-config.d.ts +55 -0
- package/dist/core/proxy-config.js +79 -0
- package/dist/core/search-provider.js +7 -0
- package/dist/core/strategies.js +9 -2
- package/dist/core/youtube.js +6 -2
- package/package.json +1 -1
package/dist/core/browser-fetch.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
6
6
|
import { detectChallenge } from './challenge-detection.js';
|
|
7
7
|
import { getRealisticUserAgent } from './user-agents.js';
|
|
8
|
-
import { getRandomUserAgent, applyStealthScripts, takePooledPage, ensurePagePool, recyclePooledPage, getBrowser, getStealthBrowser, getProfileBrowser, PAGE_POOL_SIZE, MAX_CONCURRENT_PAGES, getPooledPagesCount, } from './browser-pool.js';
|
|
8
|
+
import { getRandomUserAgent, applyStealthScripts, takePooledPage, ensurePagePool, recyclePooledPage, getBrowser, getStealthBrowser, getStealthPlaywright, getProfileBrowser, PAGE_POOL_SIZE, MAX_CONCURRENT_PAGES, getPooledPagesCount, ANTI_DETECTION_ARGS, getRandomViewport, } from './browser-pool.js';
|
|
9
9
|
// Proprietary stealth module — gitignored, loaded conditionally
|
|
10
10
|
let applyStealthPatches;
|
|
11
11
|
let applyAcceptLanguageHeader;
|
|
@@ -90,6 +90,8 @@ export async function browserFetch(url, options = {}) {
|
|
|
90
90
|
const usingProfileBrowser = !!profileDir;
|
|
91
91
|
// Owned context created when storageState injection is requested
|
|
92
92
|
let ownedContext;
|
|
93
|
+
// Owned browser launched when proxy is specified (dedicated browser with proxy at launch level)
|
|
94
|
+
let ownedBrowser;
|
|
93
95
|
try {
|
|
94
96
|
const browser = usingProfileBrowser
|
|
95
97
|
? await getProfileBrowser(profileDir, headed, stealth)
|
|
@@ -135,11 +137,22 @@ export async function browserFetch(url, options = {}) {
|
|
|
135
137
|
log.debug('proxy URL parse failed, using as-is:', e instanceof Error ? e.message : e);
|
|
136
138
|
playwrightProxy = { server: proxy };
|
|
137
139
|
}
|
|
138
|
-
//
|
|
139
|
-
|
|
140
|
-
|
|
140
|
+
// Launch a DEDICATED fresh browser with proxy at the launch level.
|
|
141
|
+
// Context-level proxy is unreliable for anti-bot sites — they check the browser's
|
|
142
|
+
// IP at connection time (set at launch), not at context creation.
|
|
143
|
+
const pw = stealth ? await getStealthPlaywright() : (await import('playwright')).chromium;
|
|
144
|
+
const vp = getRandomViewport();
|
|
145
|
+
ownedBrowser = await pw.launch({
|
|
146
|
+
headless: true,
|
|
147
|
+
args: [...ANTI_DETECTION_ARGS, `--window-size=${vp.width},${vp.height}`],
|
|
141
148
|
proxy: playwrightProxy,
|
|
142
|
-
|
|
149
|
+
});
|
|
150
|
+
ownedContext = await ownedBrowser.newContext({
|
|
151
|
+
userAgent: validatedUserAgent || getRandomUserAgent(),
|
|
152
|
+
locale: 'en-US',
|
|
153
|
+
timezoneId: 'America/New_York',
|
|
154
|
+
javaScriptEnabled: true,
|
|
155
|
+
viewport: { width: effectiveViewportWidth || vp.width, height: effectiveViewportHeight || vp.height },
|
|
143
156
|
...(storageState ? { storageState } : {}),
|
|
144
157
|
});
|
|
145
158
|
page = await ownedContext.newPage();
|
|
@@ -340,6 +353,29 @@ export async function browserFetch(url, options = {}) {
|
|
|
340
353
|
await page.waitForTimeout(extraDelayMs);
|
|
341
354
|
throwIfAborted();
|
|
342
355
|
}
|
|
356
|
+
// Human-like delay for proxied requests (helps bypass bot detection on strict sites)
|
|
357
|
+
if (proxy) {
|
|
358
|
+
// Realistic human behavior to bypass behavioral analysis
|
|
359
|
+
const humanDelay = 800 + Math.random() * 1200;
|
|
360
|
+
await page.waitForTimeout(humanDelay);
|
|
361
|
+
throwIfAborted();
|
|
362
|
+
// Realistic mouse movement (simulate human cursor)
|
|
363
|
+
try {
|
|
364
|
+
const vw = await page.evaluate(() => window.innerWidth);
|
|
365
|
+
const vh = await page.evaluate(() => window.innerHeight);
|
|
366
|
+
await page.mouse.move(100 + Math.random() * (vw - 200), 100 + Math.random() * (vh - 200), { steps: 5 + Math.floor(Math.random() * 10) });
|
|
367
|
+
// Small scroll to trigger lazy-loaded content
|
|
368
|
+
await page.evaluate(() => window.scrollBy(0, 200 + Math.random() * 400));
|
|
369
|
+
await page.waitForTimeout(300 + Math.random() * 500);
|
|
370
|
+
throwIfAborted();
|
|
371
|
+
// Second mouse move
|
|
372
|
+
await page.mouse.move(50 + Math.random() * (vw - 100), 50 + Math.random() * (vh - 100), { steps: 3 + Math.floor(Math.random() * 5) });
|
|
373
|
+
}
|
|
374
|
+
catch {
|
|
375
|
+
// Non-fatal: mouse/scroll simulation failed
|
|
376
|
+
}
|
|
377
|
+
throwIfAborted();
|
|
378
|
+
}
|
|
343
379
|
// Wait for additional time if requested (for dynamic content / screenshots)
|
|
344
380
|
if (waitMs > 0) {
|
|
345
381
|
await page.waitForTimeout(waitMs);
|
|
@@ -447,7 +483,8 @@ export async function browserFetch(url, options = {}) {
|
|
|
447
483
|
contentType: fetchContentType,
|
|
448
484
|
screenshot: screenshotBuffer,
|
|
449
485
|
page,
|
|
450
|
-
browser
|
|
486
|
+
// Use ownedBrowser for proxy case, otherwise the shared browser
|
|
487
|
+
browser: ownedBrowser ?? browser,
|
|
451
488
|
...(fetchAutoInteract !== undefined ? { autoInteract: fetchAutoInteract } : {}),
|
|
452
489
|
};
|
|
453
490
|
}
|
|
@@ -492,6 +529,10 @@ export async function browserFetch(url, options = {}) {
|
|
|
492
529
|
await page.close().catch(() => { });
|
|
493
530
|
}
|
|
494
531
|
}
|
|
532
|
+
// Close the dedicated proxy browser if one was launched (not when keeping page open)
|
|
533
|
+
if (ownedBrowser && !keepPageOpen) {
|
|
534
|
+
await ownedBrowser.close().catch(() => { });
|
|
535
|
+
}
|
|
495
536
|
activePagesCount--;
|
|
496
537
|
}
|
|
497
538
|
}
|
package/dist/core/browser-pool.d.ts
CHANGED
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
* Handles Playwright loading, browser instances, and the idle page pool.
|
|
4
4
|
*/
|
|
5
5
|
import type { Browser, Page } from 'playwright';
|
|
6
|
+
type ChromiumType = typeof import('playwright').chromium;
|
|
6
7
|
import { closePool } from './http-fetch.js';
|
|
7
8
|
export { closePool };
|
|
8
9
|
/** Whether Playwright has been loaded (for diagnostics). */
|
|
9
10
|
export declare let playwrightLoaded: boolean;
|
|
11
|
+
export declare function getStealthPlaywright(): Promise<ChromiumType>;
|
|
10
12
|
/**
|
|
11
13
|
* Returns a realistic Chrome user agent.
|
|
12
14
|
* Delegates to the curated user-agents module so stealth mode never exposes
|
package/dist/core/browser-pool.js
CHANGED
|
@@ -20,7 +20,7 @@ async function getPlaywright() {
|
|
|
20
20
|
}
|
|
21
21
|
return _chromium;
|
|
22
22
|
}
|
|
23
|
-
async function getStealthPlaywright() {
|
|
23
|
+
export async function getStealthPlaywright() {
|
|
24
24
|
if (!_stealthChromium) {
|
|
25
25
|
const pwExtra = await import('playwright-extra');
|
|
26
26
|
const StealthPlugin = (await import('puppeteer-extra-plugin-stealth')).default;
|
|
@@ -54,7 +54,7 @@ export const ANTI_DETECTION_ARGS = [
|
|
|
54
54
|
'--disable-gpu',
|
|
55
55
|
'--start-maximized',
|
|
56
56
|
// Chrome branding / stealth hardening
|
|
57
|
-
'--disable-features=ChromeUserAgentDataBranding',
|
|
57
|
+
'--disable-features=ChromeUserAgentDataBranding,IsolateOrigins,site-per-process',
|
|
58
58
|
'--disable-component-extensions-with-background-pages',
|
|
59
59
|
'--disable-default-apps',
|
|
60
60
|
'--disable-extensions',
|
|
@@ -143,6 +143,95 @@ export async function applyStealthScripts(page) {
|
|
|
143
143
|
});
|
|
144
144
|
})();
|
|
145
145
|
`);
|
|
146
|
+
// 3. Hide navigator.webdriver (THE #1 BOT SIGNAL)
|
|
147
|
+
await page.addInitScript(`
|
|
148
|
+
Object.defineProperty(navigator, 'webdriver', {
|
|
149
|
+
get: () => false,
|
|
150
|
+
configurable: true,
|
|
151
|
+
});
|
|
152
|
+
try { delete Object.getPrototypeOf(navigator).webdriver; } catch (e) {}
|
|
153
|
+
`);
|
|
154
|
+
// 4. Fake navigator.plugins (empty = bot signal, real Chrome has plugins)
|
|
155
|
+
await page.addInitScript(`
|
|
156
|
+
Object.defineProperty(navigator, 'plugins', {
|
|
157
|
+
get: () => {
|
|
158
|
+
var arr = [
|
|
159
|
+
{ name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
|
|
160
|
+
{ name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '' },
|
|
161
|
+
{ name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
|
|
162
|
+
];
|
|
163
|
+
arr.item = function(i) { return arr[i] || null; };
|
|
164
|
+
arr.namedItem = function(n) { return arr.find(function(p) { return p.name === n; }) || null; };
|
|
165
|
+
arr.refresh = function() {};
|
|
166
|
+
return arr;
|
|
167
|
+
},
|
|
168
|
+
configurable: true,
|
|
169
|
+
});
|
|
170
|
+
`);
|
|
171
|
+
// 5. Fake navigator.languages
|
|
172
|
+
await page.addInitScript(`
|
|
173
|
+
Object.defineProperty(navigator, 'languages', {
|
|
174
|
+
get: () => ['en-US', 'en'],
|
|
175
|
+
configurable: true,
|
|
176
|
+
});
|
|
177
|
+
`);
|
|
178
|
+
// 6. Fake window.chrome object (missing in headless = detected)
|
|
179
|
+
await page.addInitScript(`
|
|
180
|
+
if (!window.chrome) {
|
|
181
|
+
window.chrome = {
|
|
182
|
+
app: {
|
|
183
|
+
isInstalled: false,
|
|
184
|
+
InstallState: { INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
|
|
185
|
+
RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' }
|
|
186
|
+
},
|
|
187
|
+
runtime: {
|
|
188
|
+
OnInstalledReason: {}, OnRestartRequiredReason: {}, PlatformArch: {},
|
|
189
|
+
PlatformNaclArch: {}, PlatformOs: {}, RequestUpdateCheckStatus: {},
|
|
190
|
+
connect: function() {}, sendMessage: function() {}
|
|
191
|
+
},
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
`);
|
|
195
|
+
// 7. Fix permissions query (notifications should be 'prompt' not 'denied')
|
|
196
|
+
await page.addInitScript(`
|
|
197
|
+
try {
|
|
198
|
+
var originalQuery = window.Permissions && window.Permissions.prototype && window.Permissions.prototype.query;
|
|
199
|
+
if (originalQuery) {
|
|
200
|
+
window.Permissions.prototype.query = function(params) {
|
|
201
|
+
if (params && params.name === 'notifications') {
|
|
202
|
+
return Promise.resolve({ state: Notification.permission });
|
|
203
|
+
}
|
|
204
|
+
return originalQuery.call(this, params);
|
|
205
|
+
};
|
|
206
|
+
}
|
|
207
|
+
} catch (e) {}
|
|
208
|
+
`);
|
|
209
|
+
// 8. WebGL vendor/renderer spoofing (headless shows "Google SwiftShader")
|
|
210
|
+
await page.addInitScript(`
|
|
211
|
+
try {
|
|
212
|
+
var getParameter = WebGLRenderingContext.prototype.getParameter;
|
|
213
|
+
WebGLRenderingContext.prototype.getParameter = function(parameter) {
|
|
214
|
+
if (parameter === 37445) return 'Intel Inc.';
|
|
215
|
+
if (parameter === 37446) return 'Intel Iris OpenGL Engine';
|
|
216
|
+
return getParameter.call(this, parameter);
|
|
217
|
+
};
|
|
218
|
+
if (typeof WebGL2RenderingContext !== 'undefined') {
|
|
219
|
+
var getParameter2 = WebGL2RenderingContext.prototype.getParameter;
|
|
220
|
+
WebGL2RenderingContext.prototype.getParameter = function(parameter) {
|
|
221
|
+
if (parameter === 37445) return 'Intel Inc.';
|
|
222
|
+
if (parameter === 37446) return 'Intel Iris OpenGL Engine';
|
|
223
|
+
return getParameter2.call(this, parameter);
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
} catch (e) {}
|
|
227
|
+
`);
|
|
228
|
+
// 9. Hide automation-related properties
|
|
229
|
+
await page.addInitScript(`
|
|
230
|
+
try { Object.defineProperty(document, '$cdc_asdjflasutopfhvcZLmcfl_', { get: () => undefined }); } catch (e) {}
|
|
231
|
+
try { delete window.callPhantom; } catch (e) {}
|
|
232
|
+
try { delete window._phantom; } catch (e) {}
|
|
233
|
+
try { delete window.__nightmare; } catch (e) {}
|
|
234
|
+
`);
|
|
146
235
|
}
|
|
147
236
|
// ── Page pool constants & state ───────────────────────────────────────────────
|
|
148
237
|
export const MAX_CONCURRENT_PAGES = 5;
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Webshare residential proxy configuration.
|
|
3
|
+
*
|
|
4
|
+
* WebPeel uses Webshare residential proxies (configured via env vars) to route
|
|
5
|
+
* requests through US residential IPs, bypassing datacenter IP blocks from
|
|
6
|
+
* DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
|
|
7
|
+
*
|
|
8
|
+
* Proxy credentials are loaded from environment variables:
|
|
9
|
+
* WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
|
|
10
|
+
* WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
|
|
11
|
+
* WEBSHARE_PROXY_USER — proxy username (without slot suffix)
|
|
12
|
+
* WEBSHARE_PROXY_PASS — proxy password
|
|
13
|
+
* WEBSHARE_PROXY_SLOTS — number of available US residential slots
|
|
14
|
+
*
|
|
15
|
+
* With the Webshare backbone plan each US slot has its own port:
|
|
16
|
+
* slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
|
|
17
|
+
*/
|
|
18
|
+
export interface ProxyConfig {
|
|
19
|
+
/** Proxy server URL in the format "http://host:port" */
|
|
20
|
+
server: string;
|
|
21
|
+
/** Proxy username (includes slot suffix, e.g. "user-US-42") */
|
|
22
|
+
username: string;
|
|
23
|
+
/** Proxy password */
|
|
24
|
+
password: string;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Get a random Webshare residential proxy config.
|
|
28
|
+
* Returns null if the proxy is not configured (env vars missing or slots = 0).
|
|
29
|
+
*
|
|
30
|
+
* Uses random slot selection across all available US slots for even load
|
|
31
|
+
* distribution — same approach as youtube.ts proxyRequestSlotted().
|
|
32
|
+
*/
|
|
33
|
+
export declare function getWebshareProxy(): ProxyConfig | null;
|
|
34
|
+
/**
|
|
35
|
+
* Check if Webshare proxies are configured (env vars are present and non-empty).
|
|
36
|
+
* Does NOT guarantee the proxy is reachable — just that credentials are set.
|
|
37
|
+
*/
|
|
38
|
+
export declare function hasWebshareProxy(): boolean;
|
|
39
|
+
/**
|
|
40
|
+
* Convert a ProxyConfig to a Playwright-compatible proxy object.
|
|
41
|
+
* Useful for passing directly to browser.newContext({ proxy: ... }).
|
|
42
|
+
*/
|
|
43
|
+
export declare function toPlaywrightProxy(config: ProxyConfig): {
|
|
44
|
+
server: string;
|
|
45
|
+
username: string;
|
|
46
|
+
password: string;
|
|
47
|
+
};
|
|
48
|
+
/**
|
|
49
|
+
* Get a random Webshare proxy as a fully-qualified URL string with embedded
|
|
50
|
+
* credentials. The format is: `http://username:password@host:port`
|
|
51
|
+
*
|
|
52
|
+
* Useful for passing to strategies.ts proxy option (which expects a URL string).
|
|
53
|
+
* Returns null if proxies are not configured.
|
|
54
|
+
*/
|
|
55
|
+
export declare function getWebshareProxyUrl(): string | null;
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Webshare residential proxy configuration.
|
|
3
|
+
*
|
|
4
|
+
* WebPeel uses Webshare residential proxies (configured via env vars) to route
|
|
5
|
+
* requests through US residential IPs, bypassing datacenter IP blocks from
|
|
6
|
+
* DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
|
|
7
|
+
*
|
|
8
|
+
* Proxy credentials are loaded from environment variables:
|
|
9
|
+
* WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
|
|
10
|
+
* WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
|
|
11
|
+
* WEBSHARE_PROXY_USER — proxy username (without slot suffix)
|
|
12
|
+
* WEBSHARE_PROXY_PASS — proxy password
|
|
13
|
+
* WEBSHARE_PROXY_SLOTS — number of available US residential slots
|
|
14
|
+
*
|
|
15
|
+
* With the Webshare backbone plan each US slot has its own port:
|
|
16
|
+
* slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
|
|
17
|
+
*/
|
|
18
|
+
/**
|
|
19
|
+
* Get a random Webshare residential proxy config.
|
|
20
|
+
* Returns null if the proxy is not configured (env vars missing or slots = 0).
|
|
21
|
+
*
|
|
22
|
+
* Uses random slot selection across all available US slots for even load
|
|
23
|
+
* distribution — same approach as youtube.ts proxyRequestSlotted().
|
|
24
|
+
*/
|
|
25
|
+
export function getWebshareProxy() {
|
|
26
|
+
const host = process.env.WEBSHARE_PROXY_HOST;
|
|
27
|
+
const user = process.env.WEBSHARE_PROXY_USER;
|
|
28
|
+
const pass = process.env.WEBSHARE_PROXY_PASS;
|
|
29
|
+
const basePort = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
|
|
30
|
+
const slots = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '0', 10);
|
|
31
|
+
if (!host || !user || !pass || slots <= 0)
|
|
32
|
+
return null;
|
|
33
|
+
const slot = Math.floor(Math.random() * slots) + 1;
|
|
34
|
+
const port = basePort + slot - 1;
|
|
35
|
+
return {
|
|
36
|
+
server: `http://${host}:${port}`,
|
|
37
|
+
username: `${user}-US-${slot}`,
|
|
38
|
+
password: pass,
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Check if Webshare proxies are configured (env vars are present and non-empty).
|
|
43
|
+
* Does NOT guarantee the proxy is reachable — just that credentials are set.
|
|
44
|
+
*/
|
|
45
|
+
export function hasWebshareProxy() {
|
|
46
|
+
return !!(process.env.WEBSHARE_PROXY_HOST &&
|
|
47
|
+
process.env.WEBSHARE_PROXY_USER &&
|
|
48
|
+
process.env.WEBSHARE_PROXY_PASS);
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Convert a ProxyConfig to a Playwright-compatible proxy object.
|
|
52
|
+
* Useful for passing directly to browser.newContext({ proxy: ... }).
|
|
53
|
+
*/
|
|
54
|
+
export function toPlaywrightProxy(config) {
|
|
55
|
+
return {
|
|
56
|
+
server: config.server,
|
|
57
|
+
username: config.username,
|
|
58
|
+
password: config.password,
|
|
59
|
+
};
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Get a random Webshare proxy as a fully-qualified URL string with embedded
|
|
63
|
+
* credentials. The format is: `http://username:password@host:port`
|
|
64
|
+
*
|
|
65
|
+
* Useful for passing to strategies.ts proxy option (which expects a URL string).
|
|
66
|
+
* Returns null if proxies are not configured.
|
|
67
|
+
*/
|
|
68
|
+
export function getWebshareProxyUrl() {
|
|
69
|
+
const config = getWebshareProxy();
|
|
70
|
+
if (!config)
|
|
71
|
+
return null;
|
|
72
|
+
try {
|
|
73
|
+
const url = new URL(config.server);
|
|
74
|
+
return `http://${encodeURIComponent(config.username)}:${encodeURIComponent(config.password)}@${url.host}`;
|
|
75
|
+
}
|
|
76
|
+
catch {
|
|
77
|
+
return null;
|
|
78
|
+
}
|
|
79
|
+
}
|
package/dist/core/search-provider.js
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { fetch as undiciFetch } from 'undici';
|
|
16
16
|
import { load } from 'cheerio';
|
|
17
17
|
import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { getWebshareProxy } from './proxy-config.js';
|
|
18
19
|
import { createLogger } from './logger.js';
|
|
19
20
|
const log = createLogger('search');
|
|
20
21
|
function decodeHtmlEntities(input) {
|
|
@@ -236,10 +237,12 @@ export class StealthSearchProvider {
|
|
|
236
237
|
const browser = await getStealthBrowser();
|
|
237
238
|
const params = new URLSearchParams({ q: query });
|
|
238
239
|
const url = `https://html.duckduckgo.com/html/?${params.toString()}`;
|
|
240
|
+
const proxy = getWebshareProxy();
|
|
239
241
|
ctx = await browser.newContext({
|
|
240
242
|
userAgent: getRandomUserAgent(),
|
|
241
243
|
locale: 'en-US',
|
|
242
244
|
timezoneId: 'America/New_York',
|
|
245
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
243
246
|
});
|
|
244
247
|
const page = await ctx.newPage();
|
|
245
248
|
await applyStealthScripts(page);
|
|
@@ -303,10 +306,12 @@ export class StealthSearchProvider {
|
|
|
303
306
|
const browser = await getStealthBrowser();
|
|
304
307
|
const params = new URLSearchParams({ q: query });
|
|
305
308
|
const url = `https://www.bing.com/search?${params.toString()}`;
|
|
309
|
+
const proxy = getWebshareProxy();
|
|
306
310
|
ctx = await browser.newContext({
|
|
307
311
|
userAgent: getRandomUserAgent(),
|
|
308
312
|
locale: 'en-US',
|
|
309
313
|
timezoneId: 'America/New_York',
|
|
314
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
310
315
|
});
|
|
311
316
|
const page = await ctx.newPage();
|
|
312
317
|
await applyStealthScripts(page);
|
|
@@ -380,10 +385,12 @@ export class StealthSearchProvider {
|
|
|
380
385
|
const browser = await getStealthBrowser();
|
|
381
386
|
const params = new URLSearchParams({ q: query });
|
|
382
387
|
const url = `https://www.ecosia.org/search?${params.toString()}`;
|
|
388
|
+
const proxy = getWebshareProxy();
|
|
383
389
|
ctx = await browser.newContext({
|
|
384
390
|
userAgent: getRandomUserAgent(),
|
|
385
391
|
locale: 'en-US',
|
|
386
392
|
timezoneId: 'America/New_York',
|
|
393
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
387
394
|
});
|
|
388
395
|
const page = await ctx.newPage();
|
|
389
396
|
await applyStealthScripts(page);
|
package/dist/core/strategies.js
CHANGED
|
@@ -10,6 +10,7 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
|
|
|
10
10
|
import { getCached, setCached as setBasicCache } from './cache.js';
|
|
11
11
|
import { resolveAndCache } from './dns-cache.js';
|
|
12
12
|
import { BlockedError, NetworkError } from '../types.js';
|
|
13
|
+
import { getWebshareProxyUrl } from './proxy-config.js';
|
|
13
14
|
import { detectChallenge } from './challenge-detection.js';
|
|
14
15
|
import { getStrategyHooks, } from './strategy-hooks.js';
|
|
15
16
|
import { createLogger } from './logger.js';
|
|
@@ -75,6 +76,7 @@ function shouldForceBrowser(url) {
|
|
|
75
76
|
'chewy.com', // Amazon subsidiary
|
|
76
77
|
'aliexpress.com', // anti-bot
|
|
77
78
|
'wish.com', // anti-bot
|
|
79
|
+
'cargurus.com', // aggressive bot detection
|
|
78
80
|
];
|
|
79
81
|
for (const domain of stealthDomains) {
|
|
80
82
|
if (hostname === domain || hostname.endsWith(`.${domain}`)) {
|
|
@@ -310,10 +312,15 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
310
312
|
export async function smartFetch(url, options = {}) {
|
|
311
313
|
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
|
|
312
314
|
const usePeelTLS = tls || cycle;
|
|
313
|
-
// Build effective proxy list: explicit proxies array, or single proxy, or empty
|
|
315
|
+
// Build effective proxy list: explicit proxies array, or single proxy, or empty.
|
|
316
|
+
// When no explicit proxy is configured and Webshare is available, automatically
|
|
317
|
+
// add it as a fallback: try direct connection first (fast), then Webshare on block.
|
|
314
318
|
const effectiveProxies = proxies?.length ? proxies :
|
|
315
319
|
proxy ? [proxy] :
|
|
316
|
-
|
|
320
|
+
(() => {
|
|
321
|
+
const wsUrl = getWebshareProxyUrl();
|
|
322
|
+
return wsUrl ? [undefined, wsUrl] : [undefined];
|
|
323
|
+
})();
|
|
317
324
|
const firstProxy = effectiveProxies[0];
|
|
318
325
|
const hooks = getStrategyHooks();
|
|
319
326
|
const fetchStartMs = Date.now();
|
package/dist/core/youtube.js
CHANGED
|
@@ -15,6 +15,7 @@ import { join } from 'node:path';
|
|
|
15
15
|
import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
|
|
16
16
|
import { simpleFetch } from './fetcher.js';
|
|
17
17
|
import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
|
|
18
19
|
import { createLogger } from './logger.js';
|
|
19
20
|
// ---------------------------------------------------------------------------
|
|
20
21
|
// yt-dlp startup diagnostics
|
|
@@ -239,8 +240,10 @@ export function extractSummary(fullText) {
|
|
|
239
240
|
// ---------------------------------------------------------------------------
|
|
240
241
|
// Proxy-based InnerTube transcript extraction
|
|
241
242
|
// ---------------------------------------------------------------------------
|
|
242
|
-
// Webshare residential proxy config — reads from env vars
|
|
243
|
+
// Webshare residential proxy config — reads from env vars via proxy-config.ts.
|
|
243
244
|
// Locally, falls back to direct fetch (residential IP already works).
|
|
245
|
+
// These constants are kept for use in proxyRequestSlotted() which does
|
|
246
|
+
// low-level HTTP CONNECT tunneling (not Playwright-level proxy).
|
|
244
247
|
const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
|
|
245
248
|
const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
|
|
246
249
|
const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
|
|
@@ -249,7 +252,8 @@ const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
|
|
|
249
252
|
// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
|
|
250
253
|
const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10);
|
|
251
254
|
function isProxyConfigured() {
|
|
252
|
-
|
|
255
|
+
// Delegate to the shared proxy-config helper for consistency
|
|
256
|
+
return _hasWebshareProxy();
|
|
253
257
|
}
|
|
254
258
|
/**
|
|
255
259
|
* Make an HTTP(S) request through the Webshare CONNECT proxy with a specific
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.7",
|
|
3
|
+
"version": "0.21.9",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|