webpeel 0.21.7 → 0.21.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Webshare residential proxy configuration.
|
|
3
|
+
*
|
|
4
|
+
* WebPeel uses Webshare residential proxies (configured via env vars) to route
|
|
5
|
+
* requests through US residential IPs, bypassing datacenter IP blocks from
|
|
6
|
+
* DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
|
|
7
|
+
*
|
|
8
|
+
* Proxy credentials are loaded from environment variables:
|
|
9
|
+
* WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
|
|
10
|
+
* WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
|
|
11
|
+
* WEBSHARE_PROXY_USER — proxy username (without slot suffix)
|
|
12
|
+
* WEBSHARE_PROXY_PASS — proxy password
|
|
13
|
+
* WEBSHARE_PROXY_SLOTS — number of available US residential slots
|
|
14
|
+
*
|
|
15
|
+
* With the Webshare backbone plan each US slot has its own port:
|
|
16
|
+
* slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
|
|
17
|
+
*/
|
|
18
|
+
/** Connection details for one Webshare proxy slot, as returned by getWebshareProxy(). */
export interface ProxyConfig {
|
|
19
|
+
/** Proxy server URL in the format "http://host:port", where port is the slot-specific port (base port + slot - 1) */
|
|
20
|
+
server: string;
|
|
21
|
+
/** Proxy username with the slot suffix already appended (WEBSHARE_PROXY_USER + "-US-" + slot, e.g. "user-US-42") */
|
|
22
|
+
username: string;
|
|
23
|
+
/** Proxy password, taken verbatim from WEBSHARE_PROXY_PASS */
|
|
24
|
+
password: string;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Get a random Webshare residential proxy config.
|
|
28
|
+
* Returns null if the proxy is not configured (env vars missing or slots = 0).
|
|
29
|
+
*
|
|
30
|
+
* Uses random slot selection across all available US slots for even load
|
|
31
|
+
* distribution — same approach as youtube.ts proxyRequestSlotted().
|
|
32
|
+
*/
|
|
33
|
+
export declare function getWebshareProxy(): ProxyConfig | null;
|
|
34
|
+
/**
|
|
35
|
+
* Check if Webshare proxies are configured (env vars are present and non-empty).
|
|
36
|
+
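* Does NOT guarantee the proxy is reachable, nor that WEBSHARE_PROXY_SLOTS is set (getWebshareProxy() may still return null) — just that host, user and password credentials are set.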
|
|
37
|
+
*/
|
|
38
|
+
export declare function hasWebshareProxy(): boolean;
|
|
39
|
+
/**
|
|
40
|
+
* Convert a ProxyConfig to a Playwright-compatible proxy object.
|
|
41
|
+
* Useful for passing directly to browser.newContext({ proxy: ... }).
|
|
42
|
+
*/
|
|
43
|
+
export declare function toPlaywrightProxy(config: ProxyConfig): {
|
|
44
|
+
server: string;
|
|
45
|
+
username: string;
|
|
46
|
+
password: string;
|
|
47
|
+
};
|
|
48
|
+
/**
|
|
49
|
+
* Get a random Webshare proxy as a fully-qualified URL string with embedded
|
|
50
|
+
* credentials. The format is: `http://username:password@host:port`
|
|
51
|
+
*
|
|
52
|
+
* Useful for passing to strategies.ts proxy option (which expects a URL string).
|
|
53
|
+
* Returns null if proxies are not configured.
|
|
54
|
+
*/
|
|
55
|
+
export declare function getWebshareProxyUrl(): string | null;
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Webshare residential proxy configuration.
|
|
3
|
+
*
|
|
4
|
+
* WebPeel uses Webshare residential proxies (configured via env vars) to route
|
|
5
|
+
* requests through US residential IPs, bypassing datacenter IP blocks from
|
|
6
|
+
* DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
|
|
7
|
+
*
|
|
8
|
+
* Proxy credentials are loaded from environment variables:
|
|
9
|
+
* WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
|
|
10
|
+
* WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
|
|
11
|
+
* WEBSHARE_PROXY_USER — proxy username (without slot suffix)
|
|
12
|
+
* WEBSHARE_PROXY_PASS — proxy password
|
|
13
|
+
* WEBSHARE_PROXY_SLOTS — number of available US residential slots
|
|
14
|
+
*
|
|
15
|
+
* With the Webshare backbone plan each US slot has its own port:
|
|
16
|
+
* slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Pick a random Webshare residential proxy slot and build its connection config.
 *
 * Reads WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER, WEBSHARE_PROXY_PASS,
 * WEBSHARE_PROXY_PORT (default 10000) and WEBSHARE_PROXY_SLOTS (default 0)
 * from the environment on every call.
 *
 * Slot N (1-based) maps to port (base port + N - 1) and username `USER-US-N`.
 * A slot is chosen uniformly at random so load is spread across all slots.
 *
 * @returns `{ server, username, password }` for the chosen slot, or null when
 *   host/user/pass are missing or empty, when the slot count is not a positive
 *   integer, or when the base port is not a valid TCP port (1-65535).
 */
export function getWebshareProxy() {
    const MAX_TCP_PORT = 65535;
    const host = process.env.WEBSHARE_PROXY_HOST;
    const user = process.env.WEBSHARE_PROXY_USER;
    const pass = process.env.WEBSHARE_PROXY_PASS;
    const basePort = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
    const slots = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '0', 10);
    if (!host || !user || !pass)
        return null;
    // parseInt() yields NaN for non-numeric input and `NaN <= 0` is false, so
    // NaN must be rejected explicitly or it leaks into the port and username.
    if (!Number.isInteger(slots) || slots <= 0)
        return null;
    if (!Number.isInteger(basePort) || basePort < 1 || basePort > MAX_TCP_PORT)
        return null;
    // Never pick a slot whose derived port would fall outside the TCP range.
    const usableSlots = Math.min(slots, MAX_TCP_PORT - basePort + 1);
    const slot = Math.floor(Math.random() * usableSlots) + 1;
    const port = basePort + slot - 1;
    return {
        server: `http://${host}:${port}`,
        username: `${user}-US-${slot}`,
        password: pass,
    };
}
|
|
41
|
+
/**
 * Report whether Webshare proxy credentials are present in the environment.
 *
 * Only WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER and WEBSHARE_PROXY_PASS are
 * inspected; each must be set and non-empty. The slot count and port are not
 * checked, and no attempt is made to contact the proxy.
 *
 * @returns true when all three variables are non-empty, false otherwise.
 */
export function hasWebshareProxy() {
    const {
        WEBSHARE_PROXY_HOST: proxyHost,
        WEBSHARE_PROXY_USER: proxyUser,
        WEBSHARE_PROXY_PASS: proxyPass,
    } = process.env;
    return [proxyHost, proxyUser, proxyPass].every(Boolean);
}
|
|
50
|
+
/**
 * Build a Playwright proxy settings object from a proxy config.
 *
 * Copies exactly the `server`, `username` and `password` fields into a new
 * object, so the result can be handed to `browser.newContext({ proxy })`.
 *
 * @param config Proxy config to copy from.
 * @returns A fresh object holding only server, username and password.
 */
export function toPlaywrightProxy(config) {
    const { server, username, password } = config;
    return { server, username, password };
}
|
|
61
|
+
/**
 * Build a proxy URL string with inline credentials for a random Webshare slot.
 *
 * The result has the shape `http://username:password@host:port`, with the
 * username and password percent-encoded via encodeURIComponent. Intended for
 * callers that take a proxy as a single URL string.
 *
 * @returns The credentialed proxy URL, or null when getWebshareProxy() returns
 *   nothing or the server URL / credentials cannot be parsed or encoded.
 */
export function getWebshareProxyUrl() {
    const proxy = getWebshareProxy();
    if (!proxy) {
        return null;
    }
    try {
        // Parse first so a malformed server string is rejected before encoding.
        const hostAndPort = new URL(proxy.server).host;
        const credentials = [proxy.username, proxy.password]
            .map((part) => encodeURIComponent(part))
            .join(':');
        return `http://${credentials}@${hostAndPort}`;
    }
    catch {
        return null;
    }
}
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { fetch as undiciFetch } from 'undici';
|
|
16
16
|
import { load } from 'cheerio';
|
|
17
17
|
import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { getWebshareProxy } from './proxy-config.js';
|
|
18
19
|
import { createLogger } from './logger.js';
|
|
19
20
|
const log = createLogger('search');
|
|
20
21
|
function decodeHtmlEntities(input) {
|
|
@@ -236,10 +237,12 @@ export class StealthSearchProvider {
|
|
|
236
237
|
const browser = await getStealthBrowser();
|
|
237
238
|
const params = new URLSearchParams({ q: query });
|
|
238
239
|
const url = `https://html.duckduckgo.com/html/?${params.toString()}`;
|
|
240
|
+
const proxy = getWebshareProxy();
|
|
239
241
|
ctx = await browser.newContext({
|
|
240
242
|
userAgent: getRandomUserAgent(),
|
|
241
243
|
locale: 'en-US',
|
|
242
244
|
timezoneId: 'America/New_York',
|
|
245
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
243
246
|
});
|
|
244
247
|
const page = await ctx.newPage();
|
|
245
248
|
await applyStealthScripts(page);
|
|
@@ -303,10 +306,12 @@ export class StealthSearchProvider {
|
|
|
303
306
|
const browser = await getStealthBrowser();
|
|
304
307
|
const params = new URLSearchParams({ q: query });
|
|
305
308
|
const url = `https://www.bing.com/search?${params.toString()}`;
|
|
309
|
+
const proxy = getWebshareProxy();
|
|
306
310
|
ctx = await browser.newContext({
|
|
307
311
|
userAgent: getRandomUserAgent(),
|
|
308
312
|
locale: 'en-US',
|
|
309
313
|
timezoneId: 'America/New_York',
|
|
314
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
310
315
|
});
|
|
311
316
|
const page = await ctx.newPage();
|
|
312
317
|
await applyStealthScripts(page);
|
|
@@ -380,10 +385,12 @@ export class StealthSearchProvider {
|
|
|
380
385
|
const browser = await getStealthBrowser();
|
|
381
386
|
const params = new URLSearchParams({ q: query });
|
|
382
387
|
const url = `https://www.ecosia.org/search?${params.toString()}`;
|
|
388
|
+
const proxy = getWebshareProxy();
|
|
383
389
|
ctx = await browser.newContext({
|
|
384
390
|
userAgent: getRandomUserAgent(),
|
|
385
391
|
locale: 'en-US',
|
|
386
392
|
timezoneId: 'America/New_York',
|
|
393
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
387
394
|
});
|
|
388
395
|
const page = await ctx.newPage();
|
|
389
396
|
await applyStealthScripts(page);
|
package/dist/core/strategies.js
CHANGED
|
@@ -10,6 +10,7 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
|
|
|
10
10
|
import { getCached, setCached as setBasicCache } from './cache.js';
|
|
11
11
|
import { resolveAndCache } from './dns-cache.js';
|
|
12
12
|
import { BlockedError, NetworkError } from '../types.js';
|
|
13
|
+
import { getWebshareProxyUrl } from './proxy-config.js';
|
|
13
14
|
import { detectChallenge } from './challenge-detection.js';
|
|
14
15
|
import { getStrategyHooks, } from './strategy-hooks.js';
|
|
15
16
|
import { createLogger } from './logger.js';
|
|
@@ -310,10 +311,15 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
310
311
|
export async function smartFetch(url, options = {}) {
|
|
311
312
|
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
|
|
312
313
|
const usePeelTLS = tls || cycle;
|
|
313
|
-
// Build effective proxy list: explicit proxies array, or single proxy, or empty
|
|
314
|
+
// Build effective proxy list: explicit proxies array, or single proxy, or empty.
|
|
315
|
+
// When no explicit proxy is configured and Webshare is available, automatically
|
|
316
|
+
// add it as a fallback: try direct connection first (fast), then Webshare on block.
|
|
314
317
|
const effectiveProxies = proxies?.length ? proxies :
|
|
315
318
|
proxy ? [proxy] :
|
|
316
|
-
|
|
319
|
+
(() => {
|
|
320
|
+
const wsUrl = getWebshareProxyUrl();
|
|
321
|
+
return wsUrl ? [undefined, wsUrl] : [undefined];
|
|
322
|
+
})();
|
|
317
323
|
const firstProxy = effectiveProxies[0];
|
|
318
324
|
const hooks = getStrategyHooks();
|
|
319
325
|
const fetchStartMs = Date.now();
|
package/dist/core/youtube.js
CHANGED
|
@@ -15,6 +15,7 @@ import { join } from 'node:path';
|
|
|
15
15
|
import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
|
|
16
16
|
import { simpleFetch } from './fetcher.js';
|
|
17
17
|
import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
|
|
18
19
|
import { createLogger } from './logger.js';
|
|
19
20
|
// ---------------------------------------------------------------------------
|
|
20
21
|
// yt-dlp startup diagnostics
|
|
@@ -239,8 +240,10 @@ export function extractSummary(fullText) {
|
|
|
239
240
|
// ---------------------------------------------------------------------------
|
|
240
241
|
// Proxy-based InnerTube transcript extraction
|
|
241
242
|
// ---------------------------------------------------------------------------
|
|
242
|
-
// Webshare residential proxy config — reads from env vars
|
|
243
|
+
// Webshare residential proxy config — reads from env vars via proxy-config.ts.
|
|
243
244
|
// Locally, falls back to direct fetch (residential IP already works).
|
|
245
|
+
// These constants are kept for use in proxyRequestSlotted() which does
|
|
246
|
+
// low-level HTTP CONNECT tunneling (not Playwright-level proxy).
|
|
244
247
|
const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
|
|
245
248
|
const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
|
|
246
249
|
const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
|
|
@@ -249,7 +252,8 @@ const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
|
|
|
249
252
|
// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
|
|
250
253
|
const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10);
|
|
251
254
|
function isProxyConfigured() {
|
|
252
|
-
|
|
255
|
+
// Delegate to the shared proxy-config helper for consistency
|
|
256
|
+
return _hasWebshareProxy();
|
|
253
257
|
}
|
|
254
258
|
/**
|
|
255
259
|
* Make an HTTP(S) request through the Webshare CONNECT proxy with a specific
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.7",
|
|
3
|
+
"version": "0.21.8",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|