webpeel 0.21.7 → 0.21.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ /**
2
+ * Shared Webshare residential proxy configuration.
3
+ *
4
+ * WebPeel uses Webshare residential proxies (configured via env vars) to route
5
+ * requests through US residential IPs, bypassing datacenter IP blocks from
6
+ * DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
7
+ *
8
+ * Proxy credentials are loaded from environment variables:
9
+ * WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
10
+ * WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
11
+ * WEBSHARE_PROXY_USER — proxy username (without slot suffix)
12
+ * WEBSHARE_PROXY_PASS — proxy password
13
+ * WEBSHARE_PROXY_SLOTS — number of available US residential slots
14
+ *
15
+ * With the Webshare backbone plan each US slot has its own port:
16
+ * slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
17
+ */
18
+ export interface ProxyConfig {
19
+ /** Proxy server URL in the format "http://host:port"; as produced by getWebshareProxy() the port is the base port offset by the selected slot */
20
+ server: string;
21
+ /** Proxy username (includes slot suffix, e.g. "user-US-42"); getWebshareProxy() builds it as "<WEBSHARE_PROXY_USER>-US-<slot>" */
22
+ username: string;
23
+ /** Proxy password, unencoded; getWebshareProxy() copies it verbatim from WEBSHARE_PROXY_PASS */
24
+ password: string;
25
+ }
26
+ /**
27
+ * Get a random Webshare residential proxy config.
28
+ * Returns null if the proxy is not configured: WEBSHARE_PROXY_HOST,
+ * WEBSHARE_PROXY_USER or WEBSHARE_PROXY_PASS is missing or empty, or
+ * WEBSHARE_PROXY_SLOTS is unset, 0 or negative.
29
+ *
+ * The environment is read on every call. WEBSHARE_PROXY_PORT falls back to
+ * 10000 when unset or empty. A slot N is drawn at random from 1..SLOTS and
+ * mapped to port (WEBSHARE_PROXY_PORT + N - 1) and username "USER-US-N".
+ *
30
+ * Uses random slot selection across all available US slots for even load
31
+ * distribution — same approach as youtube.ts proxyRequestSlotted().
32
+ */
33
+ export declare function getWebshareProxy(): ProxyConfig | null;
34
+ /**
35
+ * Check if Webshare proxies are configured (env vars are present and non-empty).
36
+ * Does NOT guarantee the proxy is reachable — just that credentials are set.
+ *
+ * Only WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER and WEBSHARE_PROXY_PASS are
+ * inspected. WEBSHARE_PROXY_SLOTS is not, so this can return true while
+ * getWebshareProxy() still returns null (e.g. when the slot count is unset or 0).
37
+ */
38
+ export declare function hasWebshareProxy(): boolean;
39
+ /**
40
+ * Convert a ProxyConfig to a Playwright-compatible proxy object.
41
+ * Useful for passing directly to browser.newContext({ proxy: ... }).
+ *
+ * Returns a new object holding only the server, username and password values
+ * copied from `config`; the input object itself is not returned or modified.
42
+ */
43
+ export declare function toPlaywrightProxy(config: ProxyConfig): {
44
+ server: string;
45
+ username: string;
46
+ password: string;
47
+ };
48
+ /**
49
+ * Get a random Webshare proxy as a fully-qualified URL string with embedded
50
+ * credentials. The format is: `http://username:password@host:port`
+ *
+ * Each call obtains a fresh config from getWebshareProxy(), so the slot may
+ * differ between calls. Username and password are percent-encoded with
+ * encodeURIComponent before being embedded.
51
+ *
52
+ * Useful for passing to strategies.ts proxy option (which expects a URL string).
53
+ * Returns null if proxies are not configured, or if building the URL throws
+ * (e.g. the config's server value cannot be parsed as a URL).
54
+ */
55
+ export declare function getWebshareProxyUrl(): string | null;
@@ -0,0 +1,79 @@
1
+ /**
2
+ * Shared Webshare residential proxy configuration.
3
+ *
4
+ * WebPeel uses Webshare residential proxies (configured via env vars) to route
5
+ * requests through US residential IPs, bypassing datacenter IP blocks from
6
+ * DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
7
+ *
8
+ * Proxy credentials are loaded from environment variables:
9
+ * WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
10
+ * WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
11
+ * WEBSHARE_PROXY_USER — proxy username (without slot suffix)
12
+ * WEBSHARE_PROXY_PASS — proxy password
13
+ * WEBSHARE_PROXY_SLOTS — number of available US residential slots
14
+ *
15
+ * With the Webshare backbone plan each US slot has its own port:
16
+ * slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
17
+ */
18
/**
 * Get a random Webshare residential proxy config.
 *
 * The environment is read on every call:
 *   WEBSHARE_PROXY_HOST / WEBSHARE_PROXY_USER / WEBSHARE_PROXY_PASS — required
 *   WEBSHARE_PROXY_PORT  — base port, defaults to 10000 when unset or empty
 *   WEBSHARE_PROXY_SLOTS — slot count, defaults to 0 when unset or empty
 *
 * A slot N is drawn at random from 1..slots (even load distribution — same
 * approach as youtube.ts proxyRequestSlotted()) and mapped to
 * port (basePort + N - 1) with username `${user}-US-${N}`.
 *
 * @returns The config for the chosen slot, or null when the proxy is not
 *   usable: host, user or password missing/empty, or the slot count or base
 *   port is not a positive integer (including non-numeric values).
 */
export function getWebshareProxy() {
    const host = process.env.WEBSHARE_PROXY_HOST;
    const user = process.env.WEBSHARE_PROXY_USER;
    const pass = process.env.WEBSHARE_PROXY_PASS;
    const basePort = Number.parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
    const slots = Number.parseInt(process.env.WEBSHARE_PROXY_SLOTS || '0', 10);
    if (!host || !user || !pass)
        return null;
    // parseInt yields NaN for non-numeric input, and NaN passes a bare
    // `slots <= 0` test. Without this check a typo in the env var would
    // produce a config like "http://host:NaN" / "user-US-NaN".
    if (!Number.isInteger(slots) || slots <= 0)
        return null;
    if (!Number.isInteger(basePort) || basePort <= 0)
        return null;
    // Slots are 1-based; slot 1 lives on the base port itself.
    const slot = Math.floor(Math.random() * slots) + 1;
    const port = basePort + slot - 1;
    return {
        server: `http://${host}:${port}`,
        username: `${user}-US-${slot}`,
        password: pass,
    };
}
41
/**
 * Report whether Webshare proxy credentials are present in the environment.
 *
 * Only WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER and WEBSHARE_PROXY_PASS are
 * inspected; each must be set to a non-empty string. Nothing is contacted, so
 * a true result does NOT mean the proxy is reachable.
 *
 * @returns true when all three variables are non-empty, false otherwise.
 */
export function hasWebshareProxy() {
    const requiredVars = ['WEBSHARE_PROXY_HOST', 'WEBSHARE_PROXY_USER', 'WEBSHARE_PROXY_PASS'];
    return requiredVars.every((name) => Boolean(process.env[name]));
}
50
/**
 * Build the proxy object Playwright expects from a ProxyConfig, e.g. for
 * browser.newContext({ proxy: toPlaywrightProxy(cfg) }).
 *
 * @param config Proxy config to copy from.
 * @returns A new object carrying only `server`, `username` and `password`.
 */
export function toPlaywrightProxy(config) {
    const { server, username, password } = config;
    return { server, username, password };
}
61
/**
 * Get a random Webshare proxy as a single URL string with the credentials
 * embedded: `http://username:password@host:port`.
 *
 * Intended for the strategies.ts proxy option, which expects a URL string.
 * Username and password are percent-encoded so reserved characters cannot
 * break the URL structure.
 *
 * @returns The proxy URL, or null when no proxy is configured or the URL
 *   could not be built.
 */
export function getWebshareProxyUrl() {
    const proxy = getWebshareProxy();
    if (!proxy) {
        return null;
    }
    try {
        // Parse the server URL to take just its host[:port] part.
        const { host } = new URL(proxy.server);
        const encodedUser = encodeURIComponent(proxy.username);
        const encodedPass = encodeURIComponent(proxy.password);
        return `http://${encodedUser}:${encodedPass}@${host}`;
    }
    catch {
        // Best effort: any failure while building the URL means "no proxy URL".
        return null;
    }
}
@@ -15,6 +15,7 @@
15
15
  import { fetch as undiciFetch } from 'undici';
16
16
  import { load } from 'cheerio';
17
17
  import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
18
+ import { getWebshareProxy } from './proxy-config.js';
18
19
  import { createLogger } from './logger.js';
19
20
  const log = createLogger('search');
20
21
  function decodeHtmlEntities(input) {
@@ -236,10 +237,12 @@ export class StealthSearchProvider {
236
237
  const browser = await getStealthBrowser();
237
238
  const params = new URLSearchParams({ q: query });
238
239
  const url = `https://html.duckduckgo.com/html/?${params.toString()}`;
240
+ const proxy = getWebshareProxy();
239
241
  ctx = await browser.newContext({
240
242
  userAgent: getRandomUserAgent(),
241
243
  locale: 'en-US',
242
244
  timezoneId: 'America/New_York',
245
+ ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
243
246
  });
244
247
  const page = await ctx.newPage();
245
248
  await applyStealthScripts(page);
@@ -303,10 +306,12 @@ export class StealthSearchProvider {
303
306
  const browser = await getStealthBrowser();
304
307
  const params = new URLSearchParams({ q: query });
305
308
  const url = `https://www.bing.com/search?${params.toString()}`;
309
+ const proxy = getWebshareProxy();
306
310
  ctx = await browser.newContext({
307
311
  userAgent: getRandomUserAgent(),
308
312
  locale: 'en-US',
309
313
  timezoneId: 'America/New_York',
314
+ ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
310
315
  });
311
316
  const page = await ctx.newPage();
312
317
  await applyStealthScripts(page);
@@ -380,10 +385,12 @@ export class StealthSearchProvider {
380
385
  const browser = await getStealthBrowser();
381
386
  const params = new URLSearchParams({ q: query });
382
387
  const url = `https://www.ecosia.org/search?${params.toString()}`;
388
+ const proxy = getWebshareProxy();
383
389
  ctx = await browser.newContext({
384
390
  userAgent: getRandomUserAgent(),
385
391
  locale: 'en-US',
386
392
  timezoneId: 'America/New_York',
393
+ ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
387
394
  });
388
395
  const page = await ctx.newPage();
389
396
  await applyStealthScripts(page);
@@ -10,6 +10,7 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
10
10
  import { getCached, setCached as setBasicCache } from './cache.js';
11
11
  import { resolveAndCache } from './dns-cache.js';
12
12
  import { BlockedError, NetworkError } from '../types.js';
13
+ import { getWebshareProxyUrl } from './proxy-config.js';
13
14
  import { detectChallenge } from './challenge-detection.js';
14
15
  import { getStrategyHooks, } from './strategy-hooks.js';
15
16
  import { createLogger } from './logger.js';
@@ -310,10 +311,15 @@ async function fetchWithBrowserStrategy(url, options) {
310
311
  export async function smartFetch(url, options = {}) {
311
312
  const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
312
313
  const usePeelTLS = tls || cycle;
313
- // Build effective proxy list: explicit proxies array, or single proxy, or empty
314
+ // Build effective proxy list: explicit proxies array, or single proxy, or empty.
315
+ // When no explicit proxy is configured and Webshare is available, automatically
316
+ // add it as a fallback: try direct connection first (fast), then Webshare on block.
314
317
  const effectiveProxies = proxies?.length ? proxies :
315
318
  proxy ? [proxy] :
316
- [undefined]; // undefined = direct connection (no proxy)
319
+ (() => {
320
+ const wsUrl = getWebshareProxyUrl();
321
+ return wsUrl ? [undefined, wsUrl] : [undefined];
322
+ })();
317
323
  const firstProxy = effectiveProxies[0];
318
324
  const hooks = getStrategyHooks();
319
325
  const fetchStartMs = Date.now();
@@ -15,6 +15,7 @@ import { join } from 'node:path';
15
15
  import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
16
16
  import { simpleFetch } from './fetcher.js';
17
17
  import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
18
+ import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
18
19
  import { createLogger } from './logger.js';
19
20
  // ---------------------------------------------------------------------------
20
21
  // yt-dlp startup diagnostics
@@ -239,8 +240,10 @@ export function extractSummary(fullText) {
239
240
  // ---------------------------------------------------------------------------
240
241
  // Proxy-based InnerTube transcript extraction
241
242
  // ---------------------------------------------------------------------------
242
- // Webshare residential proxy config — reads from env vars on Render.
243
+ // Webshare residential proxy config — reads from env vars via proxy-config.ts.
243
244
  // Locally, falls back to direct fetch (residential IP already works).
245
+ // These constants are kept for use in proxyRequestSlotted() which does
246
+ // low-level HTTP CONNECT tunneling (not Playwright-level proxy).
244
247
  const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
245
248
  const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
246
249
  const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
@@ -249,7 +252,8 @@ const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
249
252
  // slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
250
253
  const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10);
251
254
  function isProxyConfigured() {
252
- return !!(PROXY_USER && PROXY_PASS);
255
+ // Delegate to the shared proxy-config helper for consistency. NOTE(review): the helper reads process.env at call time and also requires WEBSHARE_PROXY_HOST to be non-empty, whereas PROXY_HOST in this module falls back to 'p.webshare.io' and the previous check only needed user and password — confirm that setups relying on the default host should now count as unconfigured.
256
+ return _hasWebshareProxy();
253
257
  }
254
258
  /**
255
259
  * Make an HTTP(S) request through the Webshare CONNECT proxy with a specific
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.7",
3
+ "version": "0.21.8",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",