webpeel 0.21.70 → 0.21.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -120,6 +120,7 @@ const REGISTRY = [
120
120
  // ── Local / Real Estate ────────────────────────────────────────────────────
121
121
  { match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
122
122
  { match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
123
+ { match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
123
124
  ];
124
125
  /**
125
126
  * Returns the domain extractor for a URL, or null if none matches.
@@ -5246,100 +5247,212 @@ async function ebayExtractor(html, url) {
5246
5247
  // ---------------------------------------------------------------------------
5247
5248
  // Yelp extractor — Yelp Fusion API when YELP_API_KEY is set; link-only stub result otherwise
5248
5249
  // ---------------------------------------------------------------------------
5249
- async function yelpExtractor(html, url) {
5250
+ async function yelpExtractor(_html, url) {
5251
+ const YELP_API_KEY = process.env.YELP_API_KEY;
5252
+ // Helper to call Yelp Fusion API
5253
+ async function yelpFetch(path, params) {
5254
+ const base = 'https://api.yelp.com/v3';
5255
+ const qs = params ? '?' + new URLSearchParams(params).toString() : '';
5256
+ const res = await fetch(`${base}${path}${qs}`, {
5257
+ headers: { 'Authorization': `Bearer ${YELP_API_KEY}` },
5258
+ });
5259
+ if (!res.ok) {
5260
+ throw new Error(`Yelp API ${res.status}: ${res.statusText}`);
5261
+ }
5262
+ return res.json();
5263
+ }
5250
5264
  try {
5251
- const { load } = await import('cheerio');
5252
- const $ = load(html);
5253
- // Try JSON-LD structured data first
5254
- const jsonLdScripts = $('script[type="application/ld+json"]');
5255
- let businessData = null;
5256
- jsonLdScripts.each((_, el) => {
5257
- const raw = $(el).html() || '';
5265
+ const parsed = new URL(url);
5266
+ const pathname = parsed.pathname;
5267
+ const searchParams = parsed.searchParams;
5268
+ // ----------------------------------------------------------------
5269
+ // If no API key, return a minimal link-only result (the page HTML is not parsed)
5270
+ // ----------------------------------------------------------------
5271
+ if (!YELP_API_KEY) {
5272
+ // Legacy fallback: minimal result pointing user to Yelp
5273
+ const term = searchParams.get('find_desc') || searchParams.get('cflt') || 'businesses';
5274
+ const loc = searchParams.get('find_loc') || '';
5275
+ const isBiz = pathname.startsWith('/biz/');
5276
+ const cleanContent = isBiz
5277
+ ? `# Yelp Business\n\n*No YELP_API_KEY configured — visit [Yelp](${url}) for details.*`
5278
+ : `# 🔍 Yelp Search: ${term}${loc ? ` in ${loc}` : ''}\n\n*No YELP_API_KEY configured — [View on Yelp](${url})*`;
5279
+ return {
5280
+ domain: 'yelp.com',
5281
+ type: isBiz ? 'business' : 'search',
5282
+ structured: { url },
5283
+ cleanContent,
5284
+ };
5285
+ }
5286
+ // ----------------------------------------------------------------
5287
+ // Business page: /biz/<alias>
5288
+ // ----------------------------------------------------------------
5289
+ if (pathname.startsWith('/biz/')) {
5290
+ const alias = pathname.replace('/biz/', '').split('?')[0].split('#')[0];
5291
+ let biz;
5258
5292
  try {
5259
- const parsed = JSON.parse(raw);
5260
- const items = Array.isArray(parsed) ? parsed : [parsed];
5261
- for (const item of items) {
5262
- const type = item['@type'];
5263
- if (type === 'Restaurant' || type === 'LocalBusiness' || type === 'FoodEstablishment' ||
5264
- type === 'BarOrPub' || type === 'CafeOrCoffeeShop') {
5265
- businessData = item;
5266
- }
5267
- }
5293
+ biz = await yelpFetch(`/businesses/${alias}`);
5268
5294
  }
5269
- catch { /* ignore malformed JSON-LD */ }
5270
- });
5271
- // --- Business page ---
5272
- if (businessData) {
5273
- const name = businessData.name || '';
5274
- const rating = businessData.aggregateRating?.ratingValue;
5275
- const reviewCount = businessData.aggregateRating?.reviewCount;
5276
- const addr = businessData.address;
5295
+ catch (e) {
5296
+ if (process.env.DEBUG)
5297
+ console.debug('[webpeel] Yelp biz fetch failed:', e instanceof Error ? e.message : e);
5298
+ return null;
5299
+ }
5300
+ // Fetch reviews (best-effort)
5301
+ let reviews = [];
5302
+ try {
5303
+ const revData = await yelpFetch(`/businesses/${alias}/reviews`, { limit: '3' });
5304
+ reviews = revData.reviews || [];
5305
+ }
5306
+ catch { /* reviews are optional */ }
5307
+ const name = biz.name || alias;
5308
+ const rating = biz.rating != null ? biz.rating.toFixed(1) : '?';
5309
+ const reviewCount = biz.review_count ?? 0;
5310
+ const addr = biz.location;
5277
5311
  const address = addr
5278
- ? [addr.streetAddress, addr.addressLocality, addr.addressRegion, addr.postalCode].filter(Boolean).join(', ')
5312
+ ? [addr.address1, addr.city, addr.state, addr.zip_code].filter(Boolean).join(', ')
5279
5313
  : '';
5280
- const phone = businessData.telephone || '';
5281
- const cuisine = businessData.servesCuisine || '';
5282
- const priceRange = businessData.priceRange || '';
5283
- const description = businessData.description || $('meta[property="og:description"]').attr('content') || '';
5284
- const hours = businessData.openingHours || '';
5314
+ const phone = biz.display_phone || biz.phone || '';
5315
+ const price = biz.price || '';
5316
+ const categories = (biz.categories || []).map((c) => c.title).join(' | ');
5317
+ const yelpUrl = biz.url || url;
5318
+ // Hours
5319
+ let hoursStr = '';
5320
+ if (biz.hours && biz.hours.length > 0) {
5321
+ const dayNames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'];
5322
+ const dayMap = {};
5323
+ for (const slot of biz.hours[0].open || []) {
5324
+ const fmt = (t) => {
5325
+ const h = parseInt(t.slice(0, 2), 10);
5326
+ const m = t.slice(2);
5327
+ const period = h >= 12 ? 'PM' : 'AM';
5328
+ const h12 = h % 12 || 12;
5329
+ return `${h12}:${m} ${period}`;
5330
+ };
5331
+ const day = slot.day;
5332
+ if (!dayMap[day])
5333
+ dayMap[day] = [];
5334
+ dayMap[day].push(`${fmt(slot.start)}–${fmt(slot.end)}`);
5335
+ }
5336
+ hoursStr = Object.entries(dayMap)
5337
+ .map(([d, times]) => `${dayNames[parseInt(d, 10)]}: ${times.join(', ')}`)
5338
+ .join(' | ');
5339
+ }
5285
5340
  const lines = [
5286
- `# ⭐ Yelp: ${name}`,
5287
- '',
5288
- rating && `**Rating:** ${rating}/5 (${reviewCount} reviews)`,
5289
- cuisine && `**Cuisine:** ${cuisine}`,
5290
- priceRange && `**Price:** ${priceRange}`,
5291
- address && `**Address:** ${address}`,
5292
- phone && `**Phone:** ${phone}`,
5293
- hours && `**Hours:** ${Array.isArray(hours) ? hours.join(', ') : hours}`,
5294
- description && `\n${description.substring(0, 500)}`,
5341
+ `# ${name} ${rating} (${reviewCount.toLocaleString()} reviews)`,
5295
5342
  '',
5296
- `**More info:** [View on Yelp](${url})`,
5297
- '',
5298
- '---',
5299
- '*Source: Yelp*',
5300
- ].filter(Boolean);
5343
+ ];
5344
+ if (address)
5345
+ lines.push(`📍 ${address}`);
5346
+ if (categories)
5347
+ lines.push(`🏷️ ${categories}${price ? ` | 💰 ${price}` : ''}`);
5348
+ else if (price)
5349
+ lines.push(`💰 ${price}`);
5350
+ if (phone)
5351
+ lines.push(`📞 ${phone}`);
5352
+ if (hoursStr)
5353
+ lines.push(`🕐 ${hoursStr}`);
5354
+ if (biz.is_closed === true)
5355
+ lines.push(`⚠️ *Permanently closed*`);
5356
+ lines.push('');
5357
+ if (reviews.length > 0) {
5358
+ for (const rev of reviews) {
5359
+ const stars = '⭐'.repeat(Math.round(rev.rating || 0));
5360
+ const text = (rev.text || '').replace(/\n+/g, ' ').trim().slice(0, 200);
5361
+ lines.push(`> ${stars} — ${text}${(rev.text || '').length > 200 ? '…' : ''}`);
5362
+ lines.push('');
5363
+ }
5364
+ }
5365
+ lines.push(`[View on Yelp](${yelpUrl})`);
5301
5366
  return {
5302
5367
  domain: 'yelp.com',
5303
5368
  type: 'business',
5304
- structured: { name, rating, reviewCount, address, phone, cuisine, priceRange, description },
5369
+ structured: { name, rating: parseFloat(rating), reviewCount, address, phone, price, categories, url: yelpUrl },
5305
5370
  cleanContent: lines.join('\n'),
5306
5371
  };
5307
5372
  }
5308
- // --- Search page — parse from meta / og tags ---
5309
- const ogTitle = $('meta[property="og:title"]').attr('content') || '';
5310
- const ogDescription = $('meta[property="og:description"]').attr('content') || '';
5311
- // Try to extract listing names from heading tags
5312
- const listings = [];
5313
- $('h3, h4').each((_, el) => {
5314
- const text = $(el).text().trim();
5315
- if (text && text.length > 2 && text.length < 100) {
5316
- const anchor = $(el).find('a').first();
5317
- const href = anchor.attr('href') || '';
5318
- const fullHref = href.startsWith('/') ? `https://www.yelp.com${href}` : href;
5319
- listings.push({ name: text, url: fullHref || undefined });
5320
- }
5321
- });
5322
- if (ogTitle || listings.length > 0) {
5323
- const searchTerm = ogTitle.replace(/\s*-\s*Yelp$/, '').trim();
5324
- const lines = [
5325
- `# 🔍 Yelp Search: ${searchTerm || 'Results'}`,
5326
- ogDescription && `\n${ogDescription}`,
5327
- listings.length > 0 && `\n**Found ${listings.length} results:**`,
5328
- ...listings.slice(0, 15).map((l, i) => `${i + 1}. ${l.url ? `[${l.name}](${l.url})` : l.name}`),
5329
- '',
5330
- `**Search:** [View on Yelp](${url})`,
5331
- '',
5332
- '---',
5333
- '*Source: Yelp*',
5334
- ].filter(Boolean);
5335
- return {
5336
- domain: 'yelp.com',
5337
- type: 'search',
5338
- structured: { query: searchTerm, count: listings.length, listings },
5339
- cleanContent: lines.join('\n'),
5340
- };
5373
+ // ----------------------------------------------------------------
5374
+ // Search / Category URL: /search?find_desc=...&find_loc=...
5375
+ // /search?cflt=restaurants&find_loc=...
5376
+ // ----------------------------------------------------------------
5377
+ const findDesc = searchParams.get('find_desc') || '';
5378
+ const cflt = searchParams.get('cflt') || '';
5379
+ const findLoc = searchParams.get('find_loc') || '';
5380
+ if (!findLoc && !findDesc && !cflt) {
5381
+ // Not a recognized pattern
5382
+ return null;
5341
5383
  }
5342
- return null;
5384
+ const apiParams = { limit: '10' };
5385
+ if (findLoc)
5386
+ apiParams.location = findLoc;
5387
+ if (findDesc)
5388
+ apiParams.term = findDesc;
5389
+ if (cflt && !findDesc)
5390
+ apiParams.categories = cflt;
5391
+ let data;
5392
+ try {
5393
+ data = await yelpFetch('/businesses/search', apiParams);
5394
+ }
5395
+ catch (e) {
5396
+ if (process.env.DEBUG)
5397
+ console.debug('[webpeel] Yelp search failed:', e instanceof Error ? e.message : e);
5398
+ return null;
5399
+ }
5400
+ const businesses = data.businesses || [];
5401
+ const total = data.total ?? businesses.length;
5402
+ // Build header
5403
+ const searchLabel = findDesc || cflt || 'Businesses';
5404
+ const locationLabel = findLoc || '';
5405
+ const emoji = cflt === 'restaurants' || findDesc?.toLowerCase().includes('restaurant') ? '🍽️'
5406
+ : findDesc?.toLowerCase().includes('pizza') ? '🍕'
5407
+ : findDesc?.toLowerCase().includes('coffee') || findDesc?.toLowerCase().includes('cafe') ? '☕'
5408
+ : findDesc?.toLowerCase().includes('bar') ? '🍺'
5409
+ : '🔍';
5410
+ const titleParts = [searchLabel.charAt(0).toUpperCase() + searchLabel.slice(1)];
5411
+ if (locationLabel)
5412
+ titleParts.push(`in ${locationLabel}`);
5413
+ const lines = [
5414
+ `# ${emoji} Yelp — ${titleParts.join(' ')}`,
5415
+ '',
5416
+ `*${businesses.length} of ${total.toLocaleString()} results via Yelp Fusion API*`,
5417
+ '',
5418
+ ];
5419
+ for (let i = 0; i < businesses.length; i++) {
5420
+ const b = businesses[i];
5421
+ const bName = b.name || 'Unknown';
5422
+ const bRating = b.rating != null ? b.rating.toFixed(1) : '?';
5423
+ const bReviews = b.review_count ?? 0;
5424
+ const bAddr = b.location;
5425
+ const bAddress = bAddr
5426
+ ? [bAddr.address1, bAddr.city, bAddr.state, bAddr.zip_code].filter(Boolean).join(', ')
5427
+ : '';
5428
+ const bPhone = b.display_phone || '';
5429
+ const bPrice = b.price || '';
5430
+ const bCategories = (b.categories || []).map((c) => c.title).join(' | ');
5431
+ const bUrl = b.url || '';
5432
+ const bSnippet = b.snippet_text || '';
5433
+ lines.push(`## ${i + 1}. ${bName} ⭐ ${bRating} (${bReviews.toLocaleString()} reviews)`);
5434
+ if (bAddress)
5435
+ lines.push(`📍 ${bAddress}`);
5436
+ const tagLine = [bCategories && `🏷️ ${bCategories}`, bPrice && `💰 ${bPrice}`].filter(Boolean).join(' | ');
5437
+ if (tagLine)
5438
+ lines.push(tagLine);
5439
+ if (bPhone)
5440
+ lines.push(`📞 ${bPhone}`);
5441
+ if (bSnippet)
5442
+ lines.push(`> ${bSnippet.replace(/\n+/g, ' ').trim().slice(0, 150)}`);
5443
+ if (bUrl)
5444
+ lines.push(`[View on Yelp](${bUrl})`);
5445
+ lines.push('');
5446
+ }
5447
+ if (businesses.length === 0) {
5448
+ lines.push(`*No results found for "${searchLabel}"${locationLabel ? ` in ${locationLabel}` : ''}.*`);
5449
+ }
5450
+ return {
5451
+ domain: 'yelp.com',
5452
+ type: 'search',
5453
+ structured: { query: searchLabel, location: locationLabel, total, count: businesses.length, businesses },
5454
+ cleanContent: lines.join('\n'),
5455
+ };
5343
5456
  }
5344
5457
  catch (e) {
5345
5458
  if (process.env.DEBUG)
@@ -5347,59 +5460,243 @@ async function yelpExtractor(html, url) {
5347
5460
  return null;
5348
5461
  }
5349
5462
  }
5463
/**
 * Fetches listings for one region from Redfin's `stingray/api/gis` endpoint.
 *
 * @param {number|string} regionId - Redfin region id, sent as `region_id`.
 * @param {number|string} regionType - Redfin region type, sent as `region_type`
 *   (callers in this file pass 6 for a city).
 * @param {number} [numHomes=20] - Sent as `num_homes`.
 * @returns {Promise<object|null>} The `payload` object of the response, or
 *   `null` when the request fails, the HTTP status is >= 400, the body is not
 *   valid JSON, or `resultCode` is not 0. Never throws.
 */
async function fetchRedfinListings(regionId, regionType, numHomes = 20) {
    try {
        // Encode the caller-supplied values so they cannot inject extra query
        // parameters. For plain numbers the resulting URL is unchanged.
        const enc = encodeURIComponent;
        const apiUrl = `https://www.redfin.com/stingray/api/gis?al=1&num_homes=${enc(numHomes)}&region_id=${enc(regionId)}&region_type=${enc(regionType)}&sf=1,2,3,5,6,7&status=9&uipt=1,2,3,4,5,6,7,8&v=8`;
        const resp = await simpleFetch(apiUrl, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 30000, { 'Accept': 'application/json, text/plain, */*', 'Referer': 'https://www.redfin.com/' });
        if (!resp || (resp.statusCode && resp.statusCode >= 400)) {
            return null;
        }
        if (typeof resp.html !== 'string') {
            return null;
        }
        // Redfin prepends "{}&&" to the JSON body; strip it before parsing.
        const data = JSON.parse(resp.html.replace(/^\{\}&&/, ''));
        if (data.resultCode !== 0 || !data.payload) {
            return null;
        }
        return data.payload;
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Redfin API error:', e instanceof Error ? e.message : e);
        return null;
    }
}
5482
/**
 * Renders Redfin search results as markdown plus a structured summary.
 *
 * At most 20 homes are rendered and returned in `structured.listings`; the
 * header line reports that rendered count. `structured.count` is the length
 * of the full `homes` array that was passed in.
 *
 * The first line is always `# 🏠 Redfin — <locationLabel>` followed by a blank
 * line; `zillowExtractor` replaces that header with a regex, so keep its shape.
 *
 * @param {Array<object>} homes - Home records; the fields read here are
 *   `streetLine.value`, `city`, `state`, `zip`, `price.value`, `beds`, `baths`,
 *   `sqFt.value`, `yearBuilt.value`, `dom.value`, `mlsStatus`, `sashes[].sashTypeName`,
 *   `listingRemarks` and `url` (a site-relative path).
 * @param {string} locationLabel - Human-readable location used in the title.
 * @param {string} sourceUrl - URL linked in the footer.
 * @param {object} [medianData] - Optional medians (`price`, `sqFt`,
 *   `pricePerSqFt`, `beds`, `dom`); when truthy a "Market Summary" section is added.
 * @returns {{domain: string, type: string, structured: object, cleanContent: string}}
 */
function formatRedfinListings(homes, locationLabel, sourceUrl, medianData) {
    const MAX_LISTINGS = 20;
    const fmt = (n) => (n != null ? `$${n.toLocaleString()}` : 'N/A');
    const fmtNum = (n) => (n != null ? n.toLocaleString() : 'N/A');
    // Redfin returns site-relative listing paths; make them absolute.
    const absoluteUrl = (h) => (h.url ? `https://www.redfin.com${h.url}` : undefined);
    const shown = homes.slice(0, MAX_LISTINGS);
    const lines = [
        `# 🏠 Redfin — ${locationLabel}`,
        '',
        // Report what is actually rendered, not how many homes were passed in.
        `*Live MLS listings via Redfin · ${shown.length} properties shown*`,
        '',
    ];
    if (medianData) {
        lines.push('## 📊 Market Summary');
        lines.push(`- **Median Price:** ${fmt(medianData.price)}`);
        if (medianData.sqFt)
            lines.push(`- **Median Sq Ft:** ${fmtNum(medianData.sqFt)}`);
        if (medianData.pricePerSqFt)
            lines.push(`- **Median $/sqft:** ${fmt(medianData.pricePerSqFt)}`);
        if (medianData.beds)
            lines.push(`- **Median Beds:** ${medianData.beds}`);
        if (medianData.dom)
            lines.push(`- **Median Days on Market:** ${medianData.dom}`);
        lines.push('');
    }
    lines.push('## 🏡 Listings', '');
    for (const h of shown) {
        const addr = h.streetLine?.value || 'Address unknown';
        const cityState = [h.city, h.state, h.zip].filter(Boolean).join(', ');
        const price = fmt(h.price?.value);
        const beds = h.beds != null ? `${h.beds}bd` : '';
        const baths = h.baths != null ? `${h.baths}ba` : '';
        const sqft = h.sqFt?.value != null ? `${fmtNum(h.sqFt.value)} sqft` : '';
        const specs = [beds, baths, sqft].filter(Boolean).join(' · ');
        const status = h.mlsStatus || 'Active';
        const dom = h.dom?.value != null ? `${h.dom.value} days on market` : '';
        const badge = h.sashes?.map((s) => s.sashTypeName).filter(Boolean).join(', ') || '';
        const propUrl = absoluteUrl(h);
        lines.push(`### ${addr}`);
        if (cityState)
            lines.push(`**${cityState}**`);
        // Only add the separator when there are specs to show after it.
        lines.push(specs ? `**Price:** ${price} · ${specs}` : `**Price:** ${price}`);
        if (status !== 'Active')
            lines.push(`**Status:** ${status}`);
        if (dom)
            lines.push(`**${dom}**`);
        if (badge)
            lines.push(`*${badge}*`);
        if (h.listingRemarks) {
            lines.push('');
            lines.push(`> ${h.listingRemarks.slice(0, 200).replace(/\n/g, ' ')}${h.listingRemarks.length > 200 ? '…' : ''}`);
        }
        if (propUrl)
            lines.push(`[View on Redfin](${propUrl})`);
        lines.push('');
    }
    lines.push('---');
    lines.push(`*Source: [Redfin](${sourceUrl}) · Data from MLS via Redfin internal API*`);
    return {
        domain: 'redfin.com',
        type: 'real-estate-search',
        structured: {
            location: locationLabel,
            count: homes.length,
            listings: shown.map((h) => ({
                address: h.streetLine?.value,
                city: h.city,
                state: h.state,
                zip: h.zip,
                price: h.price?.value,
                beds: h.beds,
                baths: h.baths,
                sqFt: h.sqFt?.value,
                yearBuilt: h.yearBuilt?.value,
                daysOnMarket: h.dom?.value,
                status: h.mlsStatus,
                url: absoluteUrl(h),
            })),
            median: medianData,
        },
        cleanContent: lines.join('\n'),
    };
}
5350
5563
  // ---------------------------------------------------------------------------
5351
- // Zillow extractor smart fallback with helpful alternatives
5564
+ // Zillow extractor auto-redirects to Redfin API
5352
5565
  // ---------------------------------------------------------------------------
5353
5566
  async function zillowExtractor(_html, url) {
5354
5567
  try {
5355
5568
  const u = new URL(url);
5356
- // Derive location label from the URL path
5357
5569
  const rawPath = u.pathname.replace(/^\//, '').replace(/\/$/, '');
5358
- const location = rawPath
5359
- .replace(/\//g, ' ')
5360
- .replace(/-/g, ' ')
5361
- .trim();
5362
- // Parse city/state for alternative links
5363
5570
  const pathParts = rawPath.split('/').filter(Boolean);
5364
- const cityStatePart = pathParts[0] || ''; // e.g. "new-york-ny"
5365
- const segments = cityStatePart.split('-');
5366
- const statePart = segments[segments.length - 1] || '';
5367
- const cityPart = segments.slice(0, -1).join('-');
5368
- // Redfin city path
5369
- const cityCapitalized = cityPart.split('-').map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join('_');
5370
- const stateUpper = statePart.toUpperCase();
5371
- const redfinCityPath = cityCapitalized && stateUpper
5372
- ? `https://www.redfin.com/city/${cityCapitalized}/${stateUpper}`
5373
- : 'https://www.redfin.com';
5374
- const realtorPath = cityStatePart
5375
- ? `https://www.realtor.com/realestateandhomes-search/${cityStatePart}`
5376
- : 'https://www.realtor.com';
5571
+ const cityStatePart = pathParts[0] || '';
5572
+ // ── Pattern 1: /city-state/ or /city-state/homes/ ──────────────────────
5573
+ // e.g. zillow.com/new-york-ny/ → Redfin New York, NY
5574
+ const cityStateMatch = cityStatePart.match(/^([a-z][a-z-]*[a-z])-([a-z]{2})$/i);
5575
+ if (cityStateMatch) {
5576
+ const citySlug = cityStateMatch[1].toLowerCase();
5577
+ const stateCode = cityStateMatch[2].toUpperCase();
5578
+ const cityName = citySlug.split('-').map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
5579
+ const cityForUrl = citySlug.split('-').map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join('-');
5580
+ // Parse price filters from Zillow URL if present
5581
+ const priceMax = u.searchParams.get('price_max') || '';
5582
+ const priceMin = u.searchParams.get('price_min') || '';
5583
+ const redfinCityUrl = `https://www.redfin.com/${stateCode}/${cityForUrl}`;
5584
+ const locationLabel = `${cityName}, ${stateCode}`;
5585
+ // Try to fetch live Redfin listings via their API
5586
+ // Map common city slugs to Redfin city region IDs (region_type=6)
5587
+ const cityRegionMap = {
5588
+ 'NY-New-York': 30749, 'NY-Brooklyn': 30749, 'NY-Queens': 30749, 'NY-Bronx': 30749,
5589
+ 'NY-Staten-Island': 30749, 'NY-Manhattan': 30749,
5590
+ 'CA-Los-Angeles': 11203, 'CA-San-Francisco': 17151, 'CA-San-Diego': 18142,
5591
+ 'CA-San-Jose': 17420,
5592
+ 'TX-Houston': 30772, 'TX-Dallas': 35799, 'TX-Austin': 30818,
5593
+ 'FL-Miami': 10201, 'FL-Orlando': 13140, 'FL-Tampa': 18280,
5594
+ 'IL-Chicago': 29470, 'WA-Seattle': 16163, 'MA-Boston': 1826,
5595
+ 'AZ-Phoenix': 14240, 'PA-Philadelphia': 13364, 'GA-Atlanta': 30756,
5596
+ 'CO-Denver': 11093, 'MN-Minneapolis': 18959, 'OR-Portland': 14941,
5597
+ 'NV-Las-Vegas': 32820, 'NC-Charlotte': 3105, 'OH-Columbus': 8528,
5598
+ };
5599
+ const marketKey = `${stateCode}-${cityForUrl}`;
5600
+ const marketId = cityRegionMap[marketKey];
5601
+ if (marketId) {
5602
+ const payload = await fetchRedfinListings(marketId, 6 /* city */);
5603
+ if (payload?.homes && payload.homes.length > 0) {
5604
+ const result = formatRedfinListings(payload.homes, locationLabel, redfinCityUrl, payload.searchMedian);
5605
+ // Add a note about the Zillow redirect
5606
+ result.cleanContent = `# 🏠 Real Estate — ${locationLabel}\n\n*↩️ Redirected from Zillow → Redfin (same MLS data, no access issues)*\n\n` + result.cleanContent.replace(/^# 🏠.*\n\n/, '');
5607
+ result.domain = 'zillow.com';
5608
+ result.type = 'redfin-redirect';
5609
+ result.structured = { ...result.structured, originalUrl: url, redirectedTo: redfinCityUrl };
5610
+ return result;
5611
+ }
5612
+ }
5613
+ // Fallback: return redirect info (with neutral wording to avoid false positives)
5614
+ const lines = [
5615
+ `# 🏠 Real Estate — ${locationLabel}`,
5616
+ '',
5617
+ `*This URL was fetched via Redfin instead — same MLS data, better access.*`,
5618
+ '',
5619
+ `**Location:** ${locationLabel}`,
5620
+ priceMax ? `**Max Price:** $${Number(priceMax).toLocaleString()}` : '',
5621
+ priceMin ? `**Min Price:** $${Number(priceMin).toLocaleString()}` : '',
5622
+ '',
5623
+ '## 🔗 Search Redfin Directly',
5624
+ '',
5625
+ `- **[${cityName} listings on Redfin](${redfinCityUrl})**`,
5626
+ `- [Redfin home page](https://www.redfin.com)`,
5627
+ '',
5628
+ '### How to get live listings:',
5629
+ '```',
5630
+ `webpeel "https://www.redfin.com/city/30749/${stateCode}/${cityForUrl}"`,
5631
+ '```',
5632
+ '',
5633
+ '*MLS data sourced from Redfin — covers the same properties as competing real estate portals.*',
5634
+ '',
5635
+ '---',
5636
+ `*Original URL: [View](${url})*`,
5637
+ ].filter(Boolean);
5638
+ return {
5639
+ domain: 'zillow.com',
5640
+ type: 'redirect-to-redfin',
5641
+ structured: {
5642
+ originalUrl: url,
5643
+ redirectUrl: redfinCityUrl,
5644
+ city: cityName,
5645
+ state: stateCode,
5646
+ priceMax: priceMax ? Number(priceMax) : undefined,
5647
+ priceMin: priceMin ? Number(priceMin) : undefined,
5648
+ },
5649
+ cleanContent: lines.join('\n'),
5650
+ };
5651
+ }
5652
+ // ── Pattern 2: /homedetails/ADDRESS/ZPID_zpid/ ──────────────────────────
5653
+ const detailMatch = u.pathname.match(/homedetails\/(.+?)\/(\d+)_zpid/);
5654
+ if (detailMatch) {
5655
+ const addressSlug = detailMatch[1];
5656
+ // Convert slug to readable address: "123-Main-St-New-York-NY-10001" → "123 Main St New York NY 10001"
5657
+ const addressReadable = addressSlug.replace(/-/g, ' ');
5658
+ const redfinSearchUrl = `https://www.redfin.com/search#query=${encodeURIComponent(addressReadable)}`;
5659
+ const cleanContent = [
5660
+ `# 🏠 Property — ${addressReadable}`,
5661
+ '',
5662
+ `*Redirected from Zillow to Redfin — same MLS data, better access.*`,
5663
+ '',
5664
+ `**Address:** ${addressReadable}`,
5665
+ '',
5666
+ `**[Search this property on Redfin](${redfinSearchUrl})**`,
5667
+ '',
5668
+ '---',
5669
+ `*Original Zillow URL: [Open Zillow](${url})*`,
5670
+ ].join('\n');
5671
+ return {
5672
+ domain: 'zillow.com',
5673
+ type: 'redirect-to-redfin',
5674
+ structured: {
5675
+ originalUrl: url,
5676
+ redirectUrl: redfinSearchUrl,
5677
+ address: addressReadable,
5678
+ zpid: detailMatch[2],
5679
+ },
5680
+ cleanContent,
5681
+ };
5682
+ }
5683
+ // ── Fallback ────────────────────────────────────────────────────────────
5377
5684
  const cleanContent = [
5378
- `# 🏠 Zillow — ${location || 'Real Estate Search'}`,
5685
+ '# 🏠 Zillow — Real Estate Search',
5379
5686
  '',
5380
- '> ⚠️ **Zillow blocks automated access.** WebPeel cannot retrieve live listings directly.',
5687
+ '> ⚠️ Zillow restricts automated access. Use Redfin for the same MLS data.',
5381
5688
  '',
5382
- '**Try these alternatives that work with WebPeel:**',
5383
- `- [Redfin](${redfinCityPath}) — similar listings, scrape-friendly`,
5384
- `- [Realtor.com](${realtorPath}) — MLS-powered, often accessible`,
5385
- `- [Homes.com](https://www.homes.com) — newer platform, better access`,
5689
+ '**Better alternatives (same MLS data):**',
5690
+ '- [Redfin](https://www.redfin.com) — scrape-friendly, live MLS listings',
5691
+ '- [Realtor.com](https://www.realtor.com) — MLS-powered',
5692
+ '- [Homes.com](https://www.homes.com) — newer platform',
5386
5693
  '',
5387
- `**Direct Zillow link:** [Open Zillow](${url})`,
5388
- '',
5389
- '---',
5390
- '*Source: Zillow (access blocked — showing alternatives)*',
5694
+ `**Original URL:** [Zillow](${url})`,
5391
5695
  ].join('\n');
5392
5696
  return {
5393
5697
  domain: 'zillow.com',
5394
- type: 'real-estate',
5395
- structured: {
5396
- location,
5397
- blocked: true,
5398
- alternatives: [
5399
- { name: 'Redfin', url: redfinCityPath },
5400
- { name: 'Realtor.com', url: realtorPath },
5401
- ],
5402
- },
5698
+ type: 'blocked',
5699
+ structured: { originalUrl: url, blocked: true },
5403
5700
  cleanContent,
5404
5701
  };
5405
5702
  }
@@ -5409,3 +5706,157 @@ async function zillowExtractor(_html, url) {
5409
5706
  return null;
5410
5707
  }
5411
5708
  }
5709
+ // ---------------------------------------------------------------------------
5710
+ // Redfin extractor — live listings via Redfin's internal stingray API
5711
+ // ---------------------------------------------------------------------------
5712
/**
 * Builds the result for a single Redfin property page.
 *
 * Calls the `stingray/api/home/details/aboveTheFold` endpoint for the property
 * id and renders price / specs / status / description from `payload.basicInfo`.
 * If the request fails, returns an error status, or cannot be parsed, a
 * minimal result containing only the address and a link is returned instead.
 *
 * @param {string} url - Original property page URL (used for the link).
 * @param {string} stateCode - Two-letter state segment from the path.
 * @param {string} citySlug - City path segment (hyphen-separated).
 * @param {string} addressSlug - Address path segment (hyphen-separated).
 * @param {string} propertyId - Numeric id from the `/home/{id}` segment.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}>}
 */
async function redfinPropertyResult(url, stateCode, citySlug, addressSlug, propertyId) {
    const address = addressSlug.replace(/-/g, ' ');
    const city = citySlug.replace(/-/g, ' ');
    const title = `# 🏠 ${address}, ${city}, ${stateCode}`;
    const apiUrl = `https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=${propertyId}&accessLevel=1`;
    try {
        const resp = await simpleFetch(apiUrl, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 30000, { 'Accept': 'application/json', 'Referer': 'https://www.redfin.com/' });
        if (resp && (!resp.statusCode || resp.statusCode < 400)) {
            // Redfin prepends "{}&&" to the JSON body; strip it before parsing.
            const data = JSON.parse(resp.html.replace(/^\{\}&&/, ''));
            if (data.resultCode === 0 && data.payload) {
                const info = data.payload.basicInfo;
                const price = info?.price?.amount;
                const beds = info?.beds;
                const baths = info?.baths;
                const sqft = info?.sqFt;
                const status = info?.status;
                const desc = info?.description;
                // `!= null` rather than truthiness so a 0-bed home keeps its spec.
                const specs = [
                    beds != null && `${beds} beds`,
                    baths != null && `${baths} baths`,
                    sqft && `${Number(sqft).toLocaleString()} sqft`,
                ].filter(Boolean).join(' · ');
                const facts = [
                    price ? `**Price:** $${Number(price).toLocaleString()}` : '',
                    specs,
                    status ? `**Status:** ${status}` : '',
                ].filter(Boolean);
                // Sections are pushed one by one so the blank separator lines
                // survive; filtering the whole array for truthiness drops them.
                const lines = [title, ''];
                if (facts.length > 0)
                    lines.push(...facts, '');
                if (desc)
                    lines.push(`## Description\n\n${desc.slice(0, 800)}${desc.length > 800 ? '…' : ''}`, '');
                lines.push(`[View on Redfin](${url})`);
                return {
                    domain: 'redfin.com',
                    type: 'property',
                    structured: { address, city, state: stateCode, propertyId, price, beds, baths, sqFt: sqft, status },
                    cleanContent: lines.join('\n'),
                };
            }
        }
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Redfin property detail error:', e instanceof Error ? e.message : e);
    }
    // Fallback for property pages: address and link only.
    return {
        domain: 'redfin.com',
        type: 'property',
        structured: { address, city, state: stateCode, propertyId },
        cleanContent: `${title}\n\n[View on Redfin](${url})`,
    };
}

/**
 * Domain extractor for redfin.com URLs.
 *
 * URL shapes, tried in this order:
 *   1. `/city/{regionId}/{ST}/{City-Name}` — listings for that region id.
 *   2. `/{ST}/{City}/{address}/home/{propertyId}` — a single property page.
 *   3. `/{ST}/{City}[/...]` — listings when the city is in the built-in
 *      region-id table, otherwise a hint explaining the city-id URL form.
 *   4. Anything else — static usage help (`type: 'homepage'`).
 *
 * The property pattern must be tested before the state/city pattern: the
 * state/city regex also matches every property path, so the other order makes
 * the property branch unreachable.
 *
 * @param {string} _html - Unused; all data comes from Redfin's JSON endpoints.
 * @param {string} url - The redfin.com URL being extracted.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` if an unexpected error is thrown (e.g. `url` cannot be parsed).
 */
async function redfinExtractor(_html, url) {
    try {
        const path = new URL(url).pathname;
        // ── Pattern 1: /city/{id}/{state}/{city-name} ───────────────────────────
        // e.g. redfin.com/city/30749/NY/New-York
        const cityMatch = path.match(/^\/city\/(\d+)\/([A-Z]{2})\/([^/]+)/);
        if (cityMatch) {
            const [, regionId, stateCode, citySlug] = cityMatch;
            const locationLabel = `${citySlug.replace(/-/g, ' ')}, ${stateCode}`;
            const payload = await fetchRedfinListings(regionId, 6 /* city */);
            if (payload?.homes && payload.homes.length > 0) {
                return formatRedfinListings(payload.homes, locationLabel, url, payload.searchMedian);
            }
            // No listings: falls through to the generic help text at the end,
            // because a /city/... path matches none of the patterns below.
        }
        // ── Pattern 2: Individual property page ─────────────────────────────────
        // e.g. /NY/New-York/123-Main-St-10001/home/12345678
        const propMatch = path.match(/^\/([A-Z]{2})\/([^/]+)\/(.+?)\/home\/(\d+)/);
        if (propMatch) {
            const [, stateCode, citySlug, addressSlug, propertyId] = propMatch;
            return await redfinPropertyResult(url, stateCode, citySlug, addressSlug, propertyId);
        }
        // ── Pattern 3: /{state}/{city} or /{state}/{city}/filter/... ───────────
        // e.g. redfin.com/NY/New-York or redfin.com/NY/Brooklyn
        const stateCity = path.match(/^\/([A-Z]{2})\/([^/]+)(?:\/|$)/);
        if (stateCity) {
            const [, stateCode, citySlug] = stateCity;
            const cityName = citySlug.replace(/-/g, ' ');
            const locationLabel = `${cityName}, ${stateCode}`;
            // No region ID in URL — use known Redfin city region IDs (region_type=6)
            const cityRegionMap = {
                'NY-New-York': 30749, 'NY-Brooklyn': 30749, 'NY-Queens': 30749, 'NY-Bronx': 30749,
                'NY-Staten-Island': 30749, 'NY-Manhattan': 30749,
                'CA-Los-Angeles': 11203, 'CA-San-Francisco': 17151, 'CA-San-Diego': 18142,
                'CA-San-Jose': 17420,
                'TX-Houston': 30772, 'TX-Dallas': 35799, 'TX-Austin': 30818,
                'FL-Miami': 10201, 'FL-Orlando': 13140, 'FL-Tampa': 18280,
                'IL-Chicago': 29470, 'WA-Seattle': 16163, 'MA-Boston': 1826,
                'AZ-Phoenix': 14240, 'PA-Philadelphia': 13364, 'GA-Atlanta': 30756,
                'CO-Denver': 11093, 'MN-Minneapolis': 18959, 'OR-Portland': 14941,
                'NV-Las-Vegas': 32820, 'NC-Charlotte': 3105, 'OH-Columbus': 8528,
            };
            const marketId = cityRegionMap[`${stateCode}-${citySlug}`];
            if (marketId) {
                const payload = await fetchRedfinListings(marketId, 6 /* city */);
                if (payload?.homes && payload.homes.length > 0) {
                    return formatRedfinListings(payload.homes, locationLabel, url, payload.searchMedian);
                }
            }
            // Fallback: explain the URL form that carries a region id.
            const cleanContent = [
                `# 🏠 Redfin — ${locationLabel}`,
                '',
                `*Redfin listing search for ${locationLabel}*`,
                '',
                '> 💡 For the best results, use a city URL with a region ID:',
                `> \`webpeel "https://www.redfin.com/city/{id}/${stateCode}/${citySlug}"\``,
                '',
                `**[Browse ${cityName} on Redfin](${url})**`,
            ].join('\n');
            return {
                domain: 'redfin.com',
                type: 'real-estate-search',
                structured: { city: cityName, state: stateCode },
                cleanContent,
            };
        }
        // ── Pattern 4: Homepage or general search ───────────────────────────────
        // Return info about how to use the Redfin extractor.
        // NOTE(review): the Los-Angeles example id below (17184) differs from the
        // id used for 'CA-Los-Angeles' in the region table (11203) — confirm which
        // one is correct.
        return {
            domain: 'redfin.com',
            type: 'homepage',
            structured: {},
            cleanContent: [
                '# 🏠 Redfin — Real Estate Listings',
                '',
                'For live MLS listings, use a city or neighborhood URL:',
                '',
                '**City search:**',
                '- `webpeel "https://www.redfin.com/city/30749/NY/New-York"` — NYC listings',
                '- `webpeel "https://www.redfin.com/city/17184/CA/Los-Angeles"` — LA listings',
                '',
                '**State/city search:**',
                '- `webpeel "https://www.redfin.com/NY/New-York"` — NYC',
                '- `webpeel "https://www.redfin.com/CA/San-Francisco"` — SF',
                '',
                '*Redfin uses live MLS data — no bot detection blocks WebPeel.*',
            ].join('\n'),
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Redfin extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.70",
3
+ "version": "0.21.72",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",