webpeel 0.21.69 → 0.21.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -120,6 +120,7 @@ const REGISTRY = [
120
120
  // ── Local / Real Estate ────────────────────────────────────────────────────
121
121
  { match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
122
122
  { match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
123
+ { match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
123
124
  ];
124
125
  /**
125
126
  * Returns the domain extractor for a URL, or null if none matches.
@@ -5347,59 +5348,242 @@ async function yelpExtractor(html, url) {
5347
5348
  return null;
5348
5349
  }
5349
5350
  }
5351
/**
 * Fetches listings for a Redfin region from the `stingray/api/gis` endpoint.
 *
 * Best-effort: every failure mode (HTTP status >= 400, missing body, malformed
 * JSON, non-zero `resultCode`, thrown fetch error) resolves to `null` rather
 * than throwing, so callers can fall back to static content.
 *
 * @param {string|number} regionId - Redfin region identifier.
 * @param {string|number} regionType - Redfin region type code (callers in this file pass 6 for "city").
 * @param {number} [numHomes=20] - Maximum number of homes to request.
 * @returns {Promise<object|null>} The response `payload` object, or `null` on failure.
 */
async function fetchRedfinListings(regionId, regionType, numHomes = 20) {
    const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36';
    const requestHeaders = { 'Accept': 'application/json, text/plain, */*', 'Referer': 'https://www.redfin.com/' };
    try {
        // Caller-supplied values are percent-encoded so they cannot inject extra
        // query parameters. The fixed comma-separated lists are left literal so the
        // URL stays identical to what the endpoint has always been sent.
        const query = [
            'al=1',
            `num_homes=${encodeURIComponent(String(numHomes))}`,
            `region_id=${encodeURIComponent(String(regionId))}`,
            `region_type=${encodeURIComponent(String(regionType))}`,
            'sf=1,2,3,5,6,7',
            'status=9',
            'uipt=1,2,3,4,5,6,7,8',
            'v=8',
        ].join('&');
        const apiUrl = `https://www.redfin.com/stingray/api/gis?${query}`;
        const resp = await simpleFetch(apiUrl, userAgent, 30000, requestHeaders);
        if (!resp || (resp.statusCode && resp.statusCode >= 400)) {
            return null;
        }
        if (typeof resp.html !== 'string') {
            return null;
        }
        // Redfin prepends "{}&&" to its JSON responses; strip it before parsing.
        const data = JSON.parse(resp.html.replace(/^\{\}&&/, ''));
        if (!data || data.resultCode !== 0 || !data.payload) {
            return null;
        }
        return data.payload;
    }
    catch (e) {
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Redfin API error:', e instanceof Error ? e.message : e);
        }
        return null;
    }
}
5370
/**
 * Renders Redfin search results as a Markdown document plus a structured summary.
 *
 * At most 20 homes are rendered. The "properties shown" figure in the header
 * reflects the number actually rendered, while `structured.count` keeps the
 * number of homes received.
 *
 * @param {Array<object>} homes - Home records; the fields read here are `streetLine`, `city`, `state`,
 *   `zip`, `price`, `beds`, `baths`, `sqFt`, `yearBuilt`, `dom`, `mlsStatus`, `sashes`, `listingRemarks`, `url`.
 * @param {string} locationLabel - Human-readable location used in the heading.
 * @param {string} sourceUrl - URL cited in the footer.
 * @param {object} [medianData] - Optional market medians (`price`, `sqFt`, `pricePerSqFt`, `beds`, `dom`).
 * @returns {{domain: string, type: string, structured: object, cleanContent: string}}
 */
function formatRedfinListings(homes, locationLabel, sourceUrl, medianData) {
    const MAX_LISTINGS = 20;
    const REMARKS_LIMIT = 200;
    const REDFIN_ORIGIN = 'https://www.redfin.com';
    // Pin the locale: prices are USD, and output must not vary with the host's locale.
    const fmtNum = (n) => (n != null ? n.toLocaleString('en-US') : 'N/A');
    const fmt = (n) => (n != null ? `$${n.toLocaleString('en-US')}` : 'N/A');
    const absoluteUrl = (home) => (home.url ? `${REDFIN_ORIGIN}${home.url}` : undefined);

    const allHomes = Array.isArray(homes) ? homes : [];
    // Slice once so the Markdown and the structured listings always agree.
    const shown = allHomes.slice(0, MAX_LISTINGS);

    const lines = [
        `# 🏠 Redfin — ${locationLabel}`,
        '',
        `*Live MLS listings via Redfin · ${shown.length} properties shown*`,
        '',
    ];

    if (medianData) {
        lines.push('## 📊 Market Summary');
        lines.push(`- **Median Price:** ${fmt(medianData.price)}`);
        if (medianData.sqFt) {
            lines.push(`- **Median Sq Ft:** ${fmtNum(medianData.sqFt)}`);
        }
        if (medianData.pricePerSqFt) {
            lines.push(`- **Median $/sqft:** ${fmt(medianData.pricePerSqFt)}`);
        }
        if (medianData.beds) {
            lines.push(`- **Median Beds:** ${medianData.beds}`);
        }
        if (medianData.dom) {
            lines.push(`- **Median Days on Market:** ${medianData.dom}`);
        }
        lines.push('');
    }

    lines.push('## 🏡 Listings');
    lines.push('');

    for (const h of shown) {
        const addr = h.streetLine?.value || 'Address unknown';
        const cityState = [h.city, h.state, h.zip].filter(Boolean).join(', ');
        const price = fmt(h.price?.value);
        const specs = [
            h.beds != null ? `${h.beds}bd` : '',
            h.baths != null ? `${h.baths}ba` : '',
            h.sqFt?.value != null ? `${fmtNum(h.sqFt.value)} sqft` : '',
        ].filter(Boolean).join(' · ');
        const status = h.mlsStatus || 'Active';
        const dom = h.dom?.value != null ? `${h.dom.value} days on market` : '';
        const badge = h.sashes?.map((s) => s.sashTypeName).filter(Boolean).join(', ') || '';
        const propUrl = absoluteUrl(h);

        lines.push(`### ${addr}`);
        if (cityState) {
            lines.push(`**${cityState}**`);
        }
        // Only append the separator when there is something to put after it.
        lines.push(specs ? `**Price:** ${price} · ${specs}` : `**Price:** ${price}`);
        if (status !== 'Active') {
            lines.push(`**Status:** ${status}`);
        }
        if (dom) {
            lines.push(`**${dom}**`);
        }
        if (badge) {
            lines.push(`*${badge}*`);
        }
        if (typeof h.listingRemarks === 'string' && h.listingRemarks.length > 0) {
            const excerpt = h.listingRemarks.slice(0, REMARKS_LIMIT).replace(/\r?\n/g, ' ');
            const ellipsis = h.listingRemarks.length > REMARKS_LIMIT ? '…' : '';
            lines.push('');
            lines.push(`> ${excerpt}${ellipsis}`);
        }
        if (propUrl) {
            lines.push(`[View on Redfin](${propUrl})`);
        }
        lines.push('');
    }

    lines.push('---');
    lines.push(`*Source: [Redfin](${sourceUrl}) · Data from MLS via Redfin internal API*`);

    return {
        domain: 'redfin.com',
        type: 'real-estate-search',
        structured: {
            location: locationLabel,
            count: allHomes.length,
            listings: shown.map((h) => ({
                address: h.streetLine?.value,
                city: h.city,
                state: h.state,
                zip: h.zip,
                price: h.price?.value,
                beds: h.beds,
                baths: h.baths,
                sqFt: h.sqFt?.value,
                yearBuilt: h.yearBuilt?.value,
                daysOnMarket: h.dom?.value,
                status: h.mlsStatus,
                url: absoluteUrl(h),
            })),
            median: medianData,
        },
        cleanContent: lines.join('\n'),
    };
}
5350
5451
  // ---------------------------------------------------------------------------
5351
- // Zillow extractor smart fallback with helpful alternatives
5452
+ // Zillow extractor — auto-redirects to Redfin API
5352
5453
  // ---------------------------------------------------------------------------
5353
5454
  async function zillowExtractor(_html, url) {
5354
5455
  try {
5355
5456
  const u = new URL(url);
5356
- // Derive location label from the URL path
5357
5457
  const rawPath = u.pathname.replace(/^\//, '').replace(/\/$/, '');
5358
- const location = rawPath
5359
- .replace(/\//g, ' ')
5360
- .replace(/-/g, ' ')
5361
- .trim();
5362
- // Parse city/state for alternative links
5363
5458
  const pathParts = rawPath.split('/').filter(Boolean);
5364
- const cityStatePart = pathParts[0] || ''; // e.g. "new-york-ny"
5365
- const segments = cityStatePart.split('-');
5366
- const statePart = segments[segments.length - 1] || '';
5367
- const cityPart = segments.slice(0, -1).join('-');
5368
- // Redfin city path
5369
- const cityCapitalized = cityPart.split('-').map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join('_');
5370
- const stateUpper = statePart.toUpperCase();
5371
- const redfinCityPath = cityCapitalized && stateUpper
5372
- ? `https://www.redfin.com/city/${cityCapitalized}/${stateUpper}`
5373
- : 'https://www.redfin.com';
5374
- const realtorPath = cityStatePart
5375
- ? `https://www.realtor.com/realestateandhomes-search/${cityStatePart}`
5376
- : 'https://www.realtor.com';
5459
+ const cityStatePart = pathParts[0] || '';
5460
+ // ── Pattern 1: /city-state/ or /city-state/homes/ ──────────────────────
5461
+ // e.g. zillow.com/new-york-ny/ → Redfin New York, NY
5462
+ const cityStateMatch = cityStatePart.match(/^([a-z][a-z-]*[a-z])-([a-z]{2})$/i);
5463
+ if (cityStateMatch) {
5464
+ const citySlug = cityStateMatch[1].toLowerCase();
5465
+ const stateCode = cityStateMatch[2].toUpperCase();
5466
+ const cityName = citySlug.split('-').map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
5467
+ const cityForUrl = citySlug.split('-').map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join('-');
5468
+ // Parse price filters from Zillow URL if present
5469
+ const priceMax = u.searchParams.get('price_max') || '';
5470
+ const priceMin = u.searchParams.get('price_min') || '';
5471
+ const redfinCityUrl = `https://www.redfin.com/${stateCode}/${cityForUrl}`;
5472
+ const locationLabel = `${cityName}, ${stateCode}`;
5473
+ // Try to fetch live Redfin listings via their API
5474
+ // Map common cities to known Redfin market IDs
5475
+ const marketIdMap = {
5476
+ 'NY-New-York': 8, 'NY-Brooklyn': 8, 'NY-Queens': 8, 'NY-Bronx': 8,
5477
+ 'NY-Staten-Island': 8, 'NY-Manhattan': 8,
5478
+ 'CA-Los-Angeles': 4, 'CA-San-Francisco': 1, 'CA-San-Diego': 5,
5479
+ 'TX-Houston': 7, 'TX-Dallas': 24, 'TX-Austin': 22,
5480
+ 'FL-Miami': 13, 'FL-Orlando': 15, 'FL-Tampa': 11,
5481
+ 'IL-Chicago': 3, 'WA-Seattle': 16, 'MA-Boston': 10,
5482
+ 'AZ-Phoenix': 14, 'PA-Philadelphia': 12, 'GA-Atlanta': 9,
5483
+ 'CO-Denver': 6, 'MN-Minneapolis': 18, 'OR-Portland': 17,
5484
+ 'NV-Las-Vegas': 20, 'NC-Charlotte': 21, 'OH-Columbus': 23,
5485
+ };
5486
+ const marketKey = `${stateCode}-${cityForUrl}`;
5487
+ const marketId = marketIdMap[marketKey];
5488
+ if (marketId) {
5489
+ const payload = await fetchRedfinListings(marketId, 6 /* city */);
5490
+ if (payload?.homes && payload.homes.length > 0) {
5491
+ const result = formatRedfinListings(payload.homes, locationLabel, redfinCityUrl, payload.searchMedian);
5492
+ // Add a note about the Zillow redirect
5493
+ result.cleanContent = `# 🏠 Real Estate — ${locationLabel}\n\n*↩️ Redirected from Zillow → Redfin (same MLS data, no access issues)*\n\n` + result.cleanContent.replace(/^# 🏠.*\n\n/, '');
5494
+ result.domain = 'zillow.com';
5495
+ result.type = 'redfin-redirect';
5496
+ result.structured = { ...result.structured, originalUrl: url, redirectedTo: redfinCityUrl };
5497
+ return result;
5498
+ }
5499
+ }
5500
+ // Fallback: return redirect info (with neutral wording to avoid false positives)
5501
+ const lines = [
5502
+ `# 🏠 Real Estate — ${locationLabel}`,
5503
+ '',
5504
+ `*This URL was fetched via Redfin instead — same MLS data, better access.*`,
5505
+ '',
5506
+ `**Location:** ${locationLabel}`,
5507
+ priceMax ? `**Max Price:** $${Number(priceMax).toLocaleString()}` : '',
5508
+ priceMin ? `**Min Price:** $${Number(priceMin).toLocaleString()}` : '',
5509
+ '',
5510
+ '## 🔗 Search Redfin Directly',
5511
+ '',
5512
+ `- **[${cityName} listings on Redfin](${redfinCityUrl})**`,
5513
+ `- [Redfin home page](https://www.redfin.com)`,
5514
+ '',
5515
+ '### How to get live listings:',
5516
+ '```',
5517
+ `webpeel "https://www.redfin.com/city/30749/${stateCode}/${cityForUrl}"`,
5518
+ '```',
5519
+ '',
5520
+ '*MLS data sourced from Redfin — covers the same properties as competing real estate portals.*',
5521
+ '',
5522
+ '---',
5523
+ `*Original URL: [View](${url})*`,
5524
+ ].filter(Boolean);
5525
+ return {
5526
+ domain: 'zillow.com',
5527
+ type: 'redirect-to-redfin',
5528
+ structured: {
5529
+ originalUrl: url,
5530
+ redirectUrl: redfinCityUrl,
5531
+ city: cityName,
5532
+ state: stateCode,
5533
+ priceMax: priceMax ? Number(priceMax) : undefined,
5534
+ priceMin: priceMin ? Number(priceMin) : undefined,
5535
+ },
5536
+ cleanContent: lines.join('\n'),
5537
+ };
5538
+ }
5539
+ // ── Pattern 2: /homedetails/ADDRESS/ZPID_zpid/ ──────────────────────────
5540
+ const detailMatch = u.pathname.match(/homedetails\/(.+?)\/(\d+)_zpid/);
5541
+ if (detailMatch) {
5542
+ const addressSlug = detailMatch[1];
5543
+ // Convert slug to readable address: "123-Main-St-New-York-NY-10001" → "123 Main St New York NY 10001"
5544
+ const addressReadable = addressSlug.replace(/-/g, ' ');
5545
+ const redfinSearchUrl = `https://www.redfin.com/search#query=${encodeURIComponent(addressReadable)}`;
5546
+ const cleanContent = [
5547
+ `# 🏠 Property — ${addressReadable}`,
5548
+ '',
5549
+ `*Redirected from Zillow to Redfin — same MLS data, better access.*`,
5550
+ '',
5551
+ `**Address:** ${addressReadable}`,
5552
+ '',
5553
+ `**[Search this property on Redfin](${redfinSearchUrl})**`,
5554
+ '',
5555
+ '---',
5556
+ `*Original Zillow URL: [Open Zillow](${url})*`,
5557
+ ].join('\n');
5558
+ return {
5559
+ domain: 'zillow.com',
5560
+ type: 'redirect-to-redfin',
5561
+ structured: {
5562
+ originalUrl: url,
5563
+ redirectUrl: redfinSearchUrl,
5564
+ address: addressReadable,
5565
+ zpid: detailMatch[2],
5566
+ },
5567
+ cleanContent,
5568
+ };
5569
+ }
5570
+ // ── Fallback ────────────────────────────────────────────────────────────
5377
5571
  const cleanContent = [
5378
- `# 🏠 Zillow — ${location || 'Real Estate Search'}`,
5572
+ '# 🏠 Zillow — Real Estate Search',
5379
5573
  '',
5380
- '> ⚠️ **Zillow blocks automated access.** WebPeel cannot retrieve live listings directly.',
5574
+ '> ⚠️ Zillow restricts automated access. Use Redfin for the same MLS data.',
5381
5575
  '',
5382
- '**Try these alternatives that work with WebPeel:**',
5383
- `- [Redfin](${redfinCityPath}) — similar listings, scrape-friendly`,
5384
- `- [Realtor.com](${realtorPath}) — MLS-powered, often accessible`,
5385
- `- [Homes.com](https://www.homes.com) — newer platform, better access`,
5576
+ '**Better alternatives (same MLS data):**',
5577
+ '- [Redfin](https://www.redfin.com) — scrape-friendly, live MLS listings',
5578
+ '- [Realtor.com](https://www.realtor.com) — MLS-powered',
5579
+ '- [Homes.com](https://www.homes.com) — newer platform',
5386
5580
  '',
5387
- `**Direct Zillow link:** [Open Zillow](${url})`,
5388
- '',
5389
- '---',
5390
- '*Source: Zillow (access blocked — showing alternatives)*',
5581
+ `**Original URL:** [Zillow](${url})`,
5391
5582
  ].join('\n');
5392
5583
  return {
5393
5584
  domain: 'zillow.com',
5394
- type: 'real-estate',
5395
- structured: {
5396
- location,
5397
- blocked: true,
5398
- alternatives: [
5399
- { name: 'Redfin', url: redfinCityPath },
5400
- { name: 'Realtor.com', url: realtorPath },
5401
- ],
5402
- },
5585
+ type: 'blocked',
5586
+ structured: { originalUrl: url, blocked: true },
5403
5587
  cleanContent,
5404
5588
  };
5405
5589
  }
@@ -5409,3 +5593,168 @@ async function zillowExtractor(_html, url) {
5409
5593
  return null;
5410
5594
  }
5411
5595
  }
5596
+ // ---------------------------------------------------------------------------
5597
+ // Redfin extractor — live listings via Redfin's internal stingray API
5598
+ // ---------------------------------------------------------------------------
5599
/**
 * Domain extractor for redfin.com URLs.
 *
 * URL shapes handled, in the order they are tested:
 *   1. `/city/{id}/{ST}/{City}`           — listings for an explicit region id.
 *   2. `/{ST}/{City}/{address}/home/{id}` — a single property page.
 *   3. `/{ST}/{City}[/...]`               — city search, resolved through a small built-in id map.
 *   4. anything else                      — static usage hints.
 *
 * The property pattern is tested before the state/city pattern because the
 * state/city regex also matches property URLs (they share the `/{ST}/{City}/`
 * prefix); testing it second would make the property branch unreachable.
 *
 * @param {string} _html - Page HTML (unused; data comes from Redfin's JSON endpoints).
 * @param {string} url - The Redfin URL being extracted.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`, or `null` when
 *   the URL cannot be parsed or an unexpected error occurs.
 */
async function redfinExtractor(_html, url) {
    const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36';
    const slugToWords = (slug) => slug.replace(/-/g, ' ');
    try {
        const path = new URL(url).pathname;

        // ── Pattern 1: /city/{id}/{state}/{city-name} ───────────────────────────
        // e.g. redfin.com/city/30749/NY/New-York
        const cityMatch = path.match(/^\/city\/(\d+)\/([A-Z]{2})\/([^/]+)/);
        if (cityMatch) {
            const [, regionId, stateCode, citySlug] = cityMatch;
            const locationLabel = `${slugToWords(citySlug)}, ${stateCode}`;
            const payload = await fetchRedfinListings(regionId, 6 /* city */);
            if (payload?.homes && payload.homes.length > 0) {
                return formatRedfinListings(payload.homes, locationLabel, url, payload.searchMedian);
            }
            // No homes: no later pattern matches a /city/ path, so this ends at the generic response below.
        }

        // ── Pattern 2: Individual property page ─────────────────────────────────
        // e.g. /NY/New-York/123-Main-St-10001/home/12345678
        const propMatch = path.match(/^\/([A-Z]{2})\/([^/]+)\/(.+?)\/home\/(\d+)/);
        if (propMatch) {
            const [, stateCode, citySlug, addressSlug, propertyId] = propMatch;
            const address = slugToWords(addressSlug);
            const city = slugToWords(citySlug);
            const heading = `# 🏠 ${address}, ${city}, ${stateCode}`;
            const apiUrl = `https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=${propertyId}&accessLevel=1`;
            try {
                const resp = await simpleFetch(apiUrl, userAgent, 30000, { 'Accept': 'application/json', 'Referer': 'https://www.redfin.com/' });
                if (resp && (!resp.statusCode || resp.statusCode < 400)) {
                    // Redfin prepends "{}&&" to its JSON responses.
                    const data = JSON.parse(resp.html.replace(/^\{\}&&/, ''));
                    if (data.resultCode === 0 && data.payload) {
                        const info = data.payload.basicInfo;
                        const price = info?.price?.amount;
                        const beds = info?.beds;
                        const baths = info?.baths;
                        const sqft = info?.sqFt;
                        const status = info?.status;
                        const desc = info?.description;
                        // `!= null` rather than truthiness so a 0-bedroom (studio) listing keeps its spec.
                        const specs = [
                            beds != null ? `${beds} beds` : '',
                            baths != null ? `${baths} baths` : '',
                            sqft ? `${Number(sqft).toLocaleString('en-US')} sqft` : '',
                        ].filter(Boolean).join(' · ');
                        const facts = [
                            price ? `**Price:** $${Number(price).toLocaleString('en-US')}` : '',
                            specs,
                            status ? `**Status:** ${status}` : '',
                        ].filter(Boolean);
                        // Sections are joined with a blank line so heading, facts, description
                        // and link stay separate Markdown blocks.
                        const sections = [heading];
                        if (facts.length > 0) {
                            sections.push(facts.join('\n'));
                        }
                        if (typeof desc === 'string' && desc.length > 0) {
                            sections.push(`## Description\n\n${desc.slice(0, 800)}${desc.length > 800 ? '…' : ''}`);
                        }
                        sections.push(`[View on Redfin](${url})`);
                        return {
                            domain: 'redfin.com',
                            type: 'property',
                            structured: { address, city, state: stateCode, propertyId, price, beds, baths, sqFt: sqft, status },
                            cleanContent: sections.join('\n\n'),
                        };
                    }
                }
            }
            catch (e) {
                if (process.env.DEBUG) {
                    console.debug('[webpeel]', 'Redfin property detail error:', e instanceof Error ? e.message : e);
                }
            }
            // Fallback for property pages: details unavailable, return what the URL itself tells us.
            return {
                domain: 'redfin.com',
                type: 'property',
                structured: { address, city, state: stateCode, propertyId },
                cleanContent: `${heading}\n\n[View on Redfin](${url})`,
            };
        }

        // ── Pattern 3: /{state}/{city} or /{state}/{city}/filter/... ───────────
        // e.g. redfin.com/NY/New-York or redfin.com/NY/Brooklyn
        const stateCity = path.match(/^\/([A-Z]{2})\/([^/]+)(?:\/|$)/);
        if (stateCity) {
            const [, stateCode, citySlug] = stateCity;
            const cityName = slugToWords(citySlug);
            const locationLabel = `${cityName}, ${stateCode}`;
            // The URL carries no region id, so look the city up in a small hard-coded table.
            // NOTE(review): these values are described as market IDs but are sent with
            // region type 6 (city), while the /city/ URL form uses ids such as 30749 for
            // New York — confirm the id/type pairing against Redfin's API.
            const marketIdMap = {
                'NY-New-York': 8,
                'NY-Brooklyn': 8,
                'NY-Queens': 8,
                'NY-Bronx': 8,
                'NY-Staten-Island': 8,
                'NY-Manhattan': 8,
                'CA-Los-Angeles': 4,
                'CA-San-Francisco': 1,
                'TX-Houston': 7,
                'TX-Dallas': 24,
                'FL-Miami': 13,
                'IL-Chicago': 3,
                'WA-Seattle': 16,
                'MA-Boston': 10,
                'AZ-Phoenix': 14,
                'PA-Philadelphia': 12,
                'GA-Atlanta': 9,
            };
            const marketId = marketIdMap[`${stateCode}-${citySlug}`];
            if (marketId) {
                const payload = await fetchRedfinListings(marketId, 6 /* city */);
                if (payload?.homes && payload.homes.length > 0) {
                    return formatRedfinListings(payload.homes, locationLabel, url, payload.searchMedian);
                }
            }
            // Fallback: return helpful info about what Redfin offers
            const cleanContent = [
                `# 🏠 Redfin — ${locationLabel}`,
                '',
                `*Redfin listing search for ${locationLabel}*`,
                '',
                '> 💡 For the best results, use a city URL with a region ID:',
                `> \`webpeel "https://www.redfin.com/city/{id}/${stateCode}/${citySlug}"\``,
                '',
                `**[Browse ${cityName} on Redfin](${url})**`,
            ].join('\n');
            return {
                domain: 'redfin.com',
                type: 'real-estate-search',
                structured: { city: cityName, state: stateCode },
                cleanContent,
            };
        }

        // ── Pattern 4: Homepage or general search ───────────────────────────────
        // Return info about how to use Redfin extractor
        return {
            domain: 'redfin.com',
            type: 'homepage',
            structured: {},
            cleanContent: [
                '# 🏠 Redfin — Real Estate Listings',
                '',
                'For live MLS listings, use a city or neighborhood URL:',
                '',
                '**City search:**',
                '- `webpeel "https://www.redfin.com/city/30749/NY/New-York"` — NYC listings',
                '- `webpeel "https://www.redfin.com/city/17184/CA/Los-Angeles"` — LA listings',
                '',
                '**State/city search:**',
                '- `webpeel "https://www.redfin.com/NY/New-York"` — NYC',
                '- `webpeel "https://www.redfin.com/CA/San-Francisco"` — SF',
                '',
                '*Redfin uses live MLS data — no bot detection blocks WebPeel.*',
            ].join('\n'),
        };
    }
    catch (e) {
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Redfin extractor error:', e instanceof Error ? e.message : e);
        }
        return null;
    }
}
@@ -415,9 +415,21 @@ export async function fetchContent(ctx) {
415
415
  }
416
416
  // Enhance error messages with actionable advice
417
417
  if (fetchError instanceof BlockedError) {
418
- const actionableMsg = `${fetchError.message}\n\nThis site blocks automated access. Try using \`stealth: true\` and a residential proxy.`;
419
- const enhancedError = new BlockedError(actionableMsg);
420
- throw enhancedError;
418
+ // Instead of crashing, return a helpful response with the block info
419
+ ctx.timer.end('fetch');
420
+ const host = new URL(ctx.url).hostname.replace('www.', '');
421
+ ctx.content = `# ⚠️ ${host} — Access Blocked\n\nThis site uses advanced bot protection and blocked our request.\n\n**What you can try:**\n- Use a browser profile with saved login: \`webpeel login ${host}\`\n- Try an alternative site that provides similar data\n\n*Direct link: [Open in browser](${ctx.url})*`;
422
+ ctx.title = `${host} — Blocked`;
423
+ ctx.quality = 0.2;
424
+ ctx.warnings.push('Site blocked automated access. Showing fallback content.');
425
+ ctx.fetchResult = {
426
+ html: ctx.content,
427
+ url: ctx.url,
428
+ status: 403,
429
+ contentType: 'text/markdown',
430
+ method: 'blocked-fallback',
431
+ };
432
+ return;
421
433
  }
422
434
  const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
423
435
  if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
@@ -16,6 +16,40 @@ import { getStrategyHooks, } from './strategy-hooks.js';
16
16
  import { createLogger } from './logger.js';
17
17
  const log = createLogger('fetch');
18
18
  /* ---------- hardcoded domain rules -------------------------------------- */
19
/**
 * Domains that require a residential proxy to bypass datacenter IP blocks.
 * These sites don't just need stealth — they fingerprint the IP itself and
 * block all cloud/datacenter ranges.
 *
 * Entries are bare registrable domains; subdomains are matched by
 * `requiresResidentialProxy`.
 */
const RESIDENTIAL_PROXY_DOMAINS = [
    'zillow.com',
    'yelp.com',
    'pinterest.com',
    'ticketmaster.com',
    'stubhub.com',
    'cargurus.com',
    'realtor.com',
    'redfin.com',
    'apartments.com',
    'trulia.com',
    'homefinder.com',
];
/**
 * Checks whether a URL's host is one of `RESIDENTIAL_PROXY_DOMAINS` or a
 * subdomain of one.
 *
 * Only the hostname is inspected. Whether an explicit proxy is set, or a
 * residential proxy is configured at all, is NOT checked here — that decision
 * belongs to the caller.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} `true` when the host matches; `false` otherwise, including when `url` cannot be parsed.
 */
function requiresResidentialProxy(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        return false;
    }
    // A fully-qualified host may carry a trailing root dot ("zillow.com."); drop it before comparing.
    const host = hostname.replace(/\.$/, '');
    return RESIDENTIAL_PROXY_DOMAINS.some((domain) => host === domain || host.endsWith(`.${domain}`));
}
19
53
  function shouldForceBrowser(url) {
20
54
  // Hashbang URLs (#!) are always JS-routed SPAs — browser rendering required
21
55
  if (url.includes('#!')) {
@@ -314,13 +348,21 @@ export async function smartFetch(url, options = {}) {
314
348
  const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
315
349
  const usePeelTLS = tls || cycle;
316
350
  // Build effective proxy list: explicit proxies array, or single proxy, or empty.
317
- // When no explicit proxy is configured and Webshare is available, automatically
318
- // add it as a fallback: try direct connection first (fast), then Webshare on block.
351
+ // For domains that require residential proxies (Zillow, Yelp, Pinterest, etc.),
352
+ // skip the direct datacenter connection entirely and go straight to Webshare.
353
+ // For all other domains, try direct first (fast), then Webshare as fallback.
319
354
  const effectiveProxies = proxies?.length ? proxies :
320
355
  proxy ? [proxy] :
321
356
  (() => {
322
357
  const wsUrl = getWebshareProxyUrl();
323
- return wsUrl ? [undefined, wsUrl] : [undefined];
358
+ if (!wsUrl)
359
+ return [undefined];
360
+ // Skip datacenter IP for known residential-proxy-required domains
361
+ if (requiresResidentialProxy(url)) {
362
+ log.debug('Residential proxy domain detected — skipping datacenter IP, using Webshare directly');
363
+ return [wsUrl];
364
+ }
365
+ return [undefined, wsUrl];
324
366
  })();
325
367
  const firstProxy = effectiveProxies[0];
326
368
  const hooks = getStrategyHooks();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.69",
3
+ "version": "0.21.71",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",