webpeel 0.21.68 → 0.21.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +225 -0
- package/dist/core/pipeline.js +15 -3
- package/dist/core/strategies.js +46 -3
- package/package.json +1 -1
|
@@ -117,6 +117,9 @@ const REGISTRY = [
|
|
|
117
117
|
{ match: (h) => h === 'etsy.com' || h === 'www.etsy.com', extractor: etsyExtractor },
|
|
118
118
|
{ match: (h) => h === 'cars.com' || h === 'www.cars.com', extractor: carsComExtractor },
|
|
119
119
|
{ match: (h) => h === 'ebay.com' || h === 'www.ebay.com', extractor: ebayExtractor },
|
|
120
|
+
// ── Local / Real Estate ────────────────────────────────────────────────────
|
|
121
|
+
{ match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
|
|
122
|
+
{ match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
|
|
120
123
|
];
|
|
121
124
|
/**
|
|
122
125
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
@@ -3199,6 +3202,62 @@ async function bbcExtractor(html, url) {
|
|
|
3199
3202
|
return extractNewsArticle(html, url, 'bbc.com');
|
|
3200
3203
|
}
|
|
3201
3204
|
/**
 * Extracts CNN content, preferring the text-only CNN Lite mirror and falling
 * back to the generic news-article extractor on the HTML that was passed in.
 *
 * - Homepage requests (and non-article lite.cnn.com URLs) return a markdown
 *   list of headline links scraped from https://lite.cnn.com.
 * - Article URLs (path contains a /20xx/ segment) are re-fetched from
 *   lite.cnn.com and reduced to their title plus paragraphs.
 *
 * @param {string} html - Already-fetched HTML for `url`; only used by the fallback.
 * @param {string} url - The URL being extracted.
 * @returns {Promise<object>} A `{ domain, type, structured, cleanContent }`
 *   result from the Lite mirror, or whatever `extractNewsArticle` returns.
 */
async function cnnExtractor(html, url) {
    // Bound each mirror request so a stalled connection cannot hang extraction;
    // a timeout rejects the fetch and we fall through to the standard extractor.
    const LITE_TIMEOUT_MS = 10000;
    const fetchLite = (target) => fetch(target, {
        headers: { 'User-Agent': 'webpeel/0.21' },
        signal: AbortSignal.timeout(LITE_TIMEOUT_MS),
    });
    try {
        const u = new URL(url);
        // CNN article paths contain a year segment like /2026/.
        const isArticle = /\/20\d\d\//.test(u.pathname);
        // An article URL is never treated as the headline index, even when it
        // is already on lite.cnn.com — otherwise Lite article requests would
        // come back as the homepage headline list.
        const wantsHeadlines = !isArticle &&
            (u.pathname === '/' || u.pathname === '' || u.hostname === 'lite.cnn.com');
        if (wantsHeadlines) {
            const liteResp = await fetchLite('https://lite.cnn.com');
            if (liteResp.ok) {
                const liteHtml = await liteResp.text();
                const headlines = [];
                const matches = liteHtml.matchAll(/<a[^>]+href="([^"]*)"[^>]*>([^<]+)<\/a>/g);
                for (const m of matches) {
                    const href = m[1].trim();
                    const text = m[2].trim();
                    // Keep only article links (year pattern) with a real headline.
                    if (/\/20\d\d\//.test(href) && text.length > 10) {
                        const fullUrl = href.startsWith('http') ? href : `https://www.cnn.com${href}`;
                        headlines.push(`- [${text}](${fullUrl})`);
                    }
                }
                // Require a handful of links so a stub/error page is not
                // mistaken for the headline index.
                if (headlines.length > 5) {
                    return {
                        domain: 'cnn.com',
                        type: 'headlines',
                        structured: { headlines: headlines.length, source: 'cnn-lite' },
                        cleanContent: `# 📰 CNN — Top Headlines\n\n${headlines.slice(0, 20).join('\n')}\n\n---\n*Source: CNN Lite*`,
                    };
                }
            }
        }
        if (isArticle) {
            // Same path on the Lite host serves the text-only article.
            const liteResp = await fetchLite(`https://lite.cnn.com${u.pathname}`);
            if (liteResp.ok) {
                const liteHtml = await liteResp.text();
                const { load } = await import('cheerio');
                const $l = load(liteHtml);
                const title = $l('h1').first().text().trim();
                const paragraphs = [];
                $l('p').each((_, el) => {
                    const text = $l(el).text().trim();
                    // Skip very short paragraphs (bylines, captions, separators).
                    if (text.length > 20) {
                        paragraphs.push(text);
                    }
                });
                if (title && paragraphs.length > 0) {
                    return {
                        domain: 'cnn.com',
                        type: 'article',
                        structured: { title, paragraphs: paragraphs.length, source: 'cnn-lite' },
                        cleanContent: `# ${title}\n\n${paragraphs.join('\n\n')}\n\n---\n*Source: CNN*`,
                    };
                }
            }
        }
    }
    catch (e) {
        // Best effort only: report under DEBUG and fall through to the standard extractor.
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'CNN extractor error:', e instanceof Error ? e.message : e);
    }
    // Fallback to standard news article extractor (works if HTML has content)
    return extractNewsArticle(html, url, 'cnn.com');
}
|
|
3204
3263
|
// ---------------------------------------------------------------------------
|
|
@@ -5184,3 +5243,169 @@ async function ebayExtractor(html, url) {
|
|
|
5184
5243
|
return null;
|
|
5185
5244
|
}
|
|
5186
5245
|
}
|
|
5246
|
+
// ---------------------------------------------------------------------------
|
|
5247
|
+
// Yelp extractor — parse JSON-LD + meta from stealth-rendered HTML
|
|
5248
|
+
// ---------------------------------------------------------------------------
|
|
5249
|
+
/**
 * Extracts a Yelp business page (from JSON-LD) or a Yelp search page (from
 * Open Graph meta tags and heading links) out of already-rendered HTML.
 *
 * @param {string} html - Rendered HTML of the Yelp page.
 * @param {string} url - The Yelp URL the HTML came from (used for the back-link).
 * @returns {Promise<object|null>} `{ domain, type: 'business' | 'search',
 *   structured, cleanContent }`, or null when nothing usable was found or an
 *   error occurred.
 */
async function yelpExtractor(html, url) {
    // JSON-LD @type values that identify a business listing.
    const BUSINESS_TYPES = new Set([
        'Restaurant', 'LocalBusiness', 'FoodEstablishment', 'BarOrPub', 'CafeOrCoffeeShop',
    ]);
    // JSON-LD allows @type to be a single string or an array of strings.
    const isBusiness = (item) => {
        if (item === null || typeof item !== 'object') {
            return false;
        }
        const types = Array.isArray(item['@type']) ? item['@type'] : [item['@type']];
        return types.some((t) => BUSINESS_TYPES.has(t));
    };
    // Renders a scalar or array JSON-LD value for display.
    const joinList = (value) => (Array.isArray(value) ? value.filter(Boolean).join(', ') : String(value));
    // Sections are separated by blank lines; without them the trailing '---'
    // would turn the preceding line into a setext heading in markdown.
    const joinSections = (sections) => sections.filter(Boolean).join('\n\n');
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
        // Try JSON-LD structured data first; the last matching item wins.
        let businessData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            let parsed;
            try {
                parsed = JSON.parse($(el).html() || '');
            }
            catch {
                return; // ignore malformed JSON-LD
            }
            // Items may be top-level, in a top-level array, or nested under @graph.
            const graph = Array.isArray(parsed?.['@graph']) ? parsed['@graph'] : [];
            const candidates = Array.isArray(parsed) ? parsed : [parsed, ...graph];
            for (const item of candidates) {
                if (isBusiness(item)) {
                    businessData = item;
                }
            }
        });
        // --- Business page ---
        if (businessData) {
            const name = businessData.name || '';
            const rating = businessData.aggregateRating?.ratingValue;
            const reviewCount = businessData.aggregateRating?.reviewCount;
            const addr = businessData.address;
            let address = '';
            if (typeof addr === 'string') {
                // address may be given as plain text instead of a PostalAddress object
                address = addr.trim();
            }
            else if (addr) {
                address = [addr.streetAddress, addr.addressLocality, addr.addressRegion, addr.postalCode]
                    .filter(Boolean)
                    .join(', ');
            }
            const phone = businessData.telephone || '';
            const cuisine = businessData.servesCuisine || '';
            const priceRange = businessData.priceRange || '';
            // Only trust a string description; anything else falls back to the og tag.
            const description = (typeof businessData.description === 'string' && businessData.description) || ogDescription;
            const hours = businessData.openingHours || '';
            // Omit the review count instead of printing "undefined reviews".
            const reviewSuffix = reviewCount === undefined || reviewCount === null ? '' : ` (${reviewCount} reviews)`;
            const facts = [
                rating && `**Rating:** ${rating}/5${reviewSuffix}`,
                cuisine && `**Cuisine:** ${joinList(cuisine)}`,
                priceRange && `**Price:** ${priceRange}`,
                address && `**Address:** ${address}`,
                phone && `**Phone:** ${phone}`,
                hours && `**Hours:** ${joinList(hours)}`,
            ].filter(Boolean);
            return {
                domain: 'yelp.com',
                type: 'business',
                structured: { name, rating, reviewCount, address, phone, cuisine, priceRange, description },
                cleanContent: joinSections([
                    `# ⭐ Yelp: ${name}`,
                    facts.join('\n'),
                    description.substring(0, 500),
                    `**More info:** [View on Yelp](${url})`,
                    '---\n*Source: Yelp*',
                ]),
            };
        }
        // --- Search page — parse from meta / og tags ---
        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
        // Try to extract listing names from heading tags
        const listings = [];
        $('h3, h4').each((_, el) => {
            const text = $(el).text().trim();
            if (text && text.length > 2 && text.length < 100) {
                const href = $(el).find('a').first().attr('href') || '';
                let fullHref = href;
                if (href.startsWith('//')) {
                    // protocol-relative link: only the scheme is missing
                    fullHref = `https:${href}`;
                }
                else if (href.startsWith('/')) {
                    fullHref = `https://www.yelp.com${href}`;
                }
                listings.push({ name: text, url: fullHref || undefined });
            }
        });
        if (ogTitle || listings.length > 0) {
            const searchTerm = ogTitle.replace(/\s*-\s*Yelp$/, '').trim();
            const resultLines = listings.length > 0
                ? [
                    `**Found ${listings.length} results:**`,
                    ...listings.slice(0, 15).map((l, i) => `${i + 1}. ${l.url ? `[${l.name}](${l.url})` : l.name}`),
                ]
                : [];
            return {
                domain: 'yelp.com',
                type: 'search',
                structured: { query: searchTerm, count: listings.length, listings },
                cleanContent: joinSections([
                    `# 🔍 Yelp Search: ${searchTerm || 'Results'}`,
                    ogDescription,
                    resultLines.join('\n'),
                    `**Search:** [View on Yelp](${url})`,
                    '---\n*Source: Yelp*',
                ]),
            };
        }
        return null;
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Yelp extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
5350
|
+
// ---------------------------------------------------------------------------
|
|
5351
|
+
// Zillow extractor — smart fallback with helpful alternatives
|
|
5352
|
+
// ---------------------------------------------------------------------------
|
|
5353
|
+
/**
 * Builds a fallback result for Zillow URLs: instead of listings it returns a
 * notice that access is blocked plus links to alternative real-estate sites,
 * deep-linked to the same city when the URL path starts with a
 * "<city>-<state>" slug (e.g. /new-york-ny/).
 *
 * @param {string} _html - Unused; the page HTML is never inspected.
 * @param {string} url - The Zillow URL that was requested.
 * @returns {Promise<object|null>} `{ domain, type: 'real-estate', structured,
 *   cleanContent }`, or null when `url` cannot be parsed.
 */
async function zillowExtractor(_html, url) {
    try {
        const u = new URL(url);
        // Derive location label from the URL path
        const rawPath = u.pathname.replace(/^\//, '').replace(/\/$/, '');
        const location = rawPath
            .replace(/\//g, ' ')
            .replace(/-/g, ' ')
            .trim();
        // Parse city/state for alternative links
        const [cityStatePart = ''] = rawPath.split('/').filter(Boolean); // e.g. "new-york-ny"
        const segments = cityStatePart.split('-');
        const statePart = segments[segments.length - 1] || '';
        const cityPart = segments.slice(0, -1).join('-');
        // Only treat the first path segment as a location slug when it ends in
        // a two-letter state code. Paths such as /homedetails/... or /homes/...
        // are not locations and must not be turned into deep links.
        const hasState = /^[a-z]{2}$/i.test(statePart);
        // Redfin city path
        const cityCapitalized = cityPart
            .split('-')
            .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
            .join('_');
        const stateUpper = statePart.toUpperCase();
        const redfinCityPath = hasState && cityCapitalized
            ? `https://www.redfin.com/city/${cityCapitalized}/${stateUpper}`
            : 'https://www.redfin.com';
        const realtorPath = hasState
            ? `https://www.realtor.com/realestateandhomes-search/${cityStatePart}`
            : 'https://www.realtor.com';
        const homesPath = 'https://www.homes.com';
        const cleanContent = [
            `# 🏠 Zillow — ${location || 'Real Estate Search'}`,
            '',
            '> ⚠️ **Zillow blocks automated access.** WebPeel cannot retrieve live listings directly.',
            '',
            '**Try these alternatives that work with WebPeel:**',
            `- [Redfin](${redfinCityPath}) — similar listings, scrape-friendly`,
            `- [Realtor.com](${realtorPath}) — MLS-powered, often accessible`,
            `- [Homes.com](${homesPath}) — newer platform, better access`,
            '',
            `**Direct Zillow link:** [Open Zillow](${url})`,
            '',
            '---',
            '*Source: Zillow (access blocked — showing alternatives)*',
        ].join('\n');
        return {
            domain: 'zillow.com',
            type: 'real-estate',
            structured: {
                location,
                blocked: true,
                // Mirrors the links listed in cleanContent, in the same order.
                alternatives: [
                    { name: 'Redfin', url: redfinCityPath },
                    { name: 'Realtor.com', url: realtorPath },
                    { name: 'Homes.com', url: homesPath },
                ],
            },
            cleanContent,
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Zillow extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -415,9 +415,21 @@ export async function fetchContent(ctx) {
|
|
|
415
415
|
}
|
|
416
416
|
// Enhance error messages with actionable advice
|
|
417
417
|
if (fetchError instanceof BlockedError) {
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
418
|
+
// Instead of crashing, return a helpful response with the block info
|
|
419
|
+
ctx.timer.end('fetch');
|
|
420
|
+
const host = new URL(ctx.url).hostname.replace('www.', '');
|
|
421
|
+
ctx.content = `# ⚠️ ${host} — Access Blocked\n\nThis site uses advanced bot protection and blocked our request.\n\n**What you can try:**\n- Use a browser profile with saved login: \`webpeel login ${host}\`\n- Try an alternative site that provides similar data\n\n*Direct link: [Open in browser](${ctx.url})*`;
|
|
422
|
+
ctx.title = `${host} — Blocked`;
|
|
423
|
+
ctx.quality = 0.2;
|
|
424
|
+
ctx.warnings.push('Site blocked automated access. Showing fallback content.');
|
|
425
|
+
ctx.fetchResult = {
|
|
426
|
+
html: ctx.content,
|
|
427
|
+
url: ctx.url,
|
|
428
|
+
status: 403,
|
|
429
|
+
contentType: 'text/markdown',
|
|
430
|
+
method: 'blocked-fallback',
|
|
431
|
+
};
|
|
432
|
+
return;
|
|
421
433
|
}
|
|
422
434
|
const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
|
|
423
435
|
if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
|
package/dist/core/strategies.js
CHANGED
|
@@ -16,6 +16,40 @@ import { getStrategyHooks, } from './strategy-hooks.js';
|
|
|
16
16
|
import { createLogger } from './logger.js';
|
|
17
17
|
const log = createLogger('fetch');
|
|
18
18
|
/* ---------- hardcoded domain rules -------------------------------------- */
|
|
19
|
+
/**
|
|
20
|
+
* Domains that require a residential proxy to bypass datacenter IP blocks.
|
|
21
|
+
* These sites don't just need stealth — they fingerprint the IP itself and
|
|
22
|
+
* block all cloud/datacenter ranges. Webshare residential proxy bypasses this.
|
|
23
|
+
*
|
|
24
|
+
* When no explicit proxy is set and Webshare is configured, requests to these
|
|
25
|
+
* domains skip the direct (datacenter) attempt and go straight to residential proxy.
|
|
26
|
+
*/
|
|
27
|
+
const RESIDENTIAL_PROXY_DOMAINS = [
    'zillow.com',
    'yelp.com',
    'pinterest.com',
    'ticketmaster.com',
    'stubhub.com',
    'cargurus.com',
    'realtor.com',
    'redfin.com',
    'apartments.com',
    'trulia.com',
    'homefinder.com',
];
/**
 * Check if a URL's host is one of RESIDENTIAL_PROXY_DOMAINS or a subdomain of one.
 *
 * This is a pure hostname test: it does not look at proxy options or at any
 * Webshare configuration — callers decide what to do with the answer.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} true when the host matches; false otherwise, including
 *   when `url` cannot be parsed.
 */
function requiresResidentialProxy(url) {
    try {
        // A fully-qualified hostname may end with a root dot ("zillow.com.");
        // drop it so it compares equal to the listed domains.
        const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
        return RESIDENTIAL_PROXY_DOMAINS.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
    }
    catch {
        // Unparseable URL: nothing to match against.
        return false;
    }
}
|
|
19
53
|
function shouldForceBrowser(url) {
|
|
20
54
|
// Hashbang URLs (#!) are always JS-routed SPAs — browser rendering required
|
|
21
55
|
if (url.includes('#!')) {
|
|
@@ -51,6 +85,7 @@ function shouldForceBrowser(url) {
|
|
|
51
85
|
'glassdoor.com',
|
|
52
86
|
'bloomberg.com',
|
|
53
87
|
'indeed.com',
|
|
88
|
+
'yelp.com', // aggressive bot detection
|
|
54
89
|
'amazon.com', // captcha wall on simple/browser fetch
|
|
55
90
|
'zillow.com', // aggressive bot detection
|
|
56
91
|
'ticketmaster.com', // Distil Networks / PerimeterX
|
|
@@ -313,13 +348,21 @@ export async function smartFetch(url, options = {}) {
|
|
|
313
348
|
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
|
|
314
349
|
const usePeelTLS = tls || cycle;
|
|
315
350
|
// Build effective proxy list: explicit proxies array, or single proxy, or empty.
|
|
316
|
-
//
|
|
317
|
-
//
|
|
351
|
+
// For domains that require residential proxies (Zillow, Yelp, Pinterest, etc.),
|
|
352
|
+
// skip the direct datacenter connection entirely and go straight to Webshare.
|
|
353
|
+
// For all other domains, try direct first (fast), then Webshare as fallback.
|
|
318
354
|
const effectiveProxies = proxies?.length ? proxies :
|
|
319
355
|
proxy ? [proxy] :
|
|
320
356
|
(() => {
|
|
321
357
|
const wsUrl = getWebshareProxyUrl();
|
|
322
|
-
|
|
358
|
+
if (!wsUrl)
|
|
359
|
+
return [undefined];
|
|
360
|
+
// Skip datacenter IP for known residential-proxy-required domains
|
|
361
|
+
if (requiresResidentialProxy(url)) {
|
|
362
|
+
log.debug('Residential proxy domain detected — skipping datacenter IP, using Webshare directly');
|
|
363
|
+
return [wsUrl];
|
|
364
|
+
}
|
|
365
|
+
return [undefined, wsUrl];
|
|
323
366
|
})();
|
|
324
367
|
const firstProxy = effectiveProxies[0];
|
|
325
368
|
const hooks = getStrategyHooks();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.68",
|
|
3
|
+
"version": "0.21.70",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|