webpeel 0.21.69 → 0.21.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/pipeline.js +15 -3
- package/dist/core/strategies.js +45 -3
- package/package.json +1 -1
package/dist/core/pipeline.js
CHANGED
|
@@ -415,9 +415,21 @@ export async function fetchContent(ctx) {
|
|
|
415
415
|
}
|
|
416
416
|
// Enhance error messages with actionable advice
|
|
417
417
|
if (fetchError instanceof BlockedError) {
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
418
|
+
// Instead of crashing, return a helpful response with the block info
|
|
419
|
+
ctx.timer.end('fetch');
|
|
420
|
+
const host = new URL(ctx.url).hostname.replace('www.', '');
|
|
421
|
+
ctx.content = `# ⚠️ ${host} — Access Blocked\n\nThis site uses advanced bot protection and blocked our request.\n\n**What you can try:**\n- Use a browser profile with saved login: \`webpeel login ${host}\`\n- Try an alternative site that provides similar data\n\n*Direct link: [Open in browser](${ctx.url})*`;
|
|
422
|
+
ctx.title = `${host} — Blocked`;
|
|
423
|
+
ctx.quality = 0.2;
|
|
424
|
+
ctx.warnings.push('Site blocked automated access. Showing fallback content.');
|
|
425
|
+
ctx.fetchResult = {
|
|
426
|
+
html: ctx.content,
|
|
427
|
+
url: ctx.url,
|
|
428
|
+
status: 403,
|
|
429
|
+
contentType: 'text/markdown',
|
|
430
|
+
method: 'blocked-fallback',
|
|
431
|
+
};
|
|
432
|
+
return;
|
|
421
433
|
}
|
|
422
434
|
const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
|
|
423
435
|
if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
|
package/dist/core/strategies.js
CHANGED
|
@@ -16,6 +16,40 @@ import { getStrategyHooks, } from './strategy-hooks.js';
|
|
|
16
16
|
import { createLogger } from './logger.js';
|
|
17
17
|
const log = createLogger('fetch');
|
|
18
18
|
/* ---------- hardcoded domain rules -------------------------------------- */
|
|
19
|
+
/**
 * Domains that require a residential proxy to bypass datacenter IP blocks.
 * These sites don't just need stealth — they fingerprint the IP itself and
 * block all cloud/datacenter ranges.
 *
 * Entries are bare registrable domains in lowercase; subdomains are matched
 * by suffix in requiresResidentialProxy().
 */
const RESIDENTIAL_PROXY_DOMAINS = [
    'zillow.com',
    'yelp.com',
    'pinterest.com',
    'ticketmaster.com',
    'stubhub.com',
    'cargurus.com',
    'realtor.com',
    'redfin.com',
    'apartments.com',
    'trulia.com',
    'homefinder.com',
];
/**
 * Check whether a URL's host is one of the residential-proxy domains.
 *
 * Only the hostname is inspected: this function does NOT look at proxy
 * options or environment variables, so the caller decides whether a
 * residential proxy is actually available before acting on the result.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} true when the host equals a listed domain or is a
 *   subdomain of one; false otherwise, including when `url` cannot be parsed.
 */
function requiresResidentialProxy(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        // Unparseable input can never match a known domain.
        return false;
    }
    // A fully-qualified name may carry a trailing root dot ("zillow.com."),
    // which the URL parser preserves. Strip it so the FQDN form of a listed
    // domain is not mistaken for an unlisted host.
    const host = hostname.endsWith('.') ? hostname.slice(0, -1) : hostname;
    return RESIDENTIAL_PROXY_DOMAINS.some((domain) => host === domain || host.endsWith(`.${domain}`));
}
|
|
19
53
|
function shouldForceBrowser(url) {
|
|
20
54
|
// Hashbang URLs (#!) are always JS-routed SPAs — browser rendering required
|
|
21
55
|
if (url.includes('#!')) {
|
|
@@ -314,13 +348,21 @@ export async function smartFetch(url, options = {}) {
|
|
|
314
348
|
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
|
|
315
349
|
const usePeelTLS = tls || cycle;
|
|
316
350
|
// Build effective proxy list: explicit proxies array, or single proxy, or empty.
|
|
317
|
-
//
|
|
318
|
-
//
|
|
351
|
+
// For domains that require residential proxies (Zillow, Yelp, Pinterest, etc.),
|
|
352
|
+
// skip the direct datacenter connection entirely and go straight to Webshare.
|
|
353
|
+
// For all other domains, try direct first (fast), then Webshare as fallback.
|
|
319
354
|
const effectiveProxies = proxies?.length ? proxies :
|
|
320
355
|
proxy ? [proxy] :
|
|
321
356
|
(() => {
|
|
322
357
|
const wsUrl = getWebshareProxyUrl();
|
|
323
|
-
|
|
358
|
+
if (!wsUrl)
|
|
359
|
+
return [undefined];
|
|
360
|
+
// Skip datacenter IP for known residential-proxy-required domains
|
|
361
|
+
if (requiresResidentialProxy(url)) {
|
|
362
|
+
log.debug('Residential proxy domain detected — skipping datacenter IP, using Webshare directly');
|
|
363
|
+
return [wsUrl];
|
|
364
|
+
}
|
|
365
|
+
return [undefined, wsUrl];
|
|
324
366
|
})();
|
|
325
367
|
const firstProxy = effectiveProxies[0];
|
|
326
368
|
const hooks = getStrategyHooks();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.69",
|
|
3
|
+
"version": "0.21.70",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|