webpeel 0.21.71 → 0.21.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5247,100 +5247,212 @@ async function ebayExtractor(html, url) {
5247
5247
  // ---------------------------------------------------------------------------
5248
5248
  // Yelp extractor — parse JSON-LD + meta from stealth-rendered HTML
5249
5249
  // ---------------------------------------------------------------------------
5250
- async function yelpExtractor(html, url) {
5250
+ async function yelpExtractor(_html, url) {
5251
+ const YELP_API_KEY = process.env.YELP_API_KEY;
5252
+ // Helper to call Yelp Fusion API
5253
+ async function yelpFetch(path, params) {
5254
+ const base = 'https://api.yelp.com/v3';
5255
+ const qs = params ? '?' + new URLSearchParams(params).toString() : '';
5256
+ const res = await fetch(`${base}${path}${qs}`, {
5257
+ headers: { 'Authorization': `Bearer ${YELP_API_KEY}` },
5258
+ });
5259
+ if (!res.ok) {
5260
+ throw new Error(`Yelp API ${res.status}: ${res.statusText}`);
5261
+ }
5262
+ return res.json();
5263
+ }
5251
5264
  try {
5252
- const { load } = await import('cheerio');
5253
- const $ = load(html);
5254
- // Try JSON-LD structured data first
5255
- const jsonLdScripts = $('script[type="application/ld+json"]');
5256
- let businessData = null;
5257
- jsonLdScripts.each((_, el) => {
5258
- const raw = $(el).html() || '';
5265
+ const parsed = new URL(url);
5266
+ const pathname = parsed.pathname;
5267
+ const searchParams = parsed.searchParams;
5268
+ // ----------------------------------------------------------------
5269
+ // If no API key is configured, skip the API and return a minimal placeholder result (no HTML scraping is done)
5270
+ // ----------------------------------------------------------------
5271
+ if (!YELP_API_KEY) {
5272
+ // No-key placeholder: minimal result that only points the user to the original Yelp URL
5273
+ const term = searchParams.get('find_desc') || searchParams.get('cflt') || 'businesses';
5274
+ const loc = searchParams.get('find_loc') || '';
5275
+ const isBiz = pathname.startsWith('/biz/');
5276
+ const cleanContent = isBiz
5277
+ ? `# Yelp Business\n\n*No YELP_API_KEY configured — visit [Yelp](${url}) for details.*`
5278
+ : `# 🔍 Yelp Search: ${term}${loc ? ` in ${loc}` : ''}\n\n*No YELP_API_KEY configured — [View on Yelp](${url})*`;
5279
+ return {
5280
+ domain: 'yelp.com',
5281
+ type: isBiz ? 'business' : 'search',
5282
+ structured: { url },
5283
+ cleanContent,
5284
+ };
5285
+ }
5286
+ // ----------------------------------------------------------------
5287
+ // Business page: /biz/<alias>
5288
+ // ----------------------------------------------------------------
5289
+ if (pathname.startsWith('/biz/')) {
5290
+ const alias = pathname.replace('/biz/', '').split('?')[0].split('#')[0];
5291
+ let biz;
5259
5292
  try {
5260
- const parsed = JSON.parse(raw);
5261
- const items = Array.isArray(parsed) ? parsed : [parsed];
5262
- for (const item of items) {
5263
- const type = item['@type'];
5264
- if (type === 'Restaurant' || type === 'LocalBusiness' || type === 'FoodEstablishment' ||
5265
- type === 'BarOrPub' || type === 'CafeOrCoffeeShop') {
5266
- businessData = item;
5267
- }
5268
- }
5293
+ biz = await yelpFetch(`/businesses/${alias}`);
5269
5294
  }
5270
- catch { /* ignore malformed JSON-LD */ }
5271
- });
5272
- // --- Business page ---
5273
- if (businessData) {
5274
- const name = businessData.name || '';
5275
- const rating = businessData.aggregateRating?.ratingValue;
5276
- const reviewCount = businessData.aggregateRating?.reviewCount;
5277
- const addr = businessData.address;
5295
+ catch (e) {
5296
+ if (process.env.DEBUG)
5297
+ console.debug('[webpeel] Yelp biz fetch failed:', e instanceof Error ? e.message : e);
5298
+ return null;
5299
+ }
5300
+ // Fetch reviews (best-effort)
5301
+ let reviews = [];
5302
+ try {
5303
+ const revData = await yelpFetch(`/businesses/${alias}/reviews`, { limit: '3' });
5304
+ reviews = revData.reviews || [];
5305
+ }
5306
+ catch { /* reviews are optional */ }
5307
+ const name = biz.name || alias;
5308
+ const rating = biz.rating != null ? biz.rating.toFixed(1) : '?';
5309
+ const reviewCount = biz.review_count ?? 0;
5310
+ const addr = biz.location;
5278
5311
  const address = addr
5279
- ? [addr.streetAddress, addr.addressLocality, addr.addressRegion, addr.postalCode].filter(Boolean).join(', ')
5312
+ ? [addr.address1, addr.city, addr.state, addr.zip_code].filter(Boolean).join(', ')
5280
5313
  : '';
5281
- const phone = businessData.telephone || '';
5282
- const cuisine = businessData.servesCuisine || '';
5283
- const priceRange = businessData.priceRange || '';
5284
- const description = businessData.description || $('meta[property="og:description"]').attr('content') || '';
5285
- const hours = businessData.openingHours || '';
5314
+ const phone = biz.display_phone || biz.phone || '';
5315
+ const price = biz.price || '';
5316
+ const categories = (biz.categories || []).map((c) => c.title).join(' | ');
5317
+ const yelpUrl = biz.url || url;
5318
+ // Hours
5319
+ let hoursStr = '';
5320
+ if (biz.hours && biz.hours.length > 0) {
5321
+ const dayNames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'];
5322
+ const dayMap = {};
5323
+ for (const slot of biz.hours[0].open || []) {
5324
+ const fmt = (t) => {
5325
+ const h = parseInt(t.slice(0, 2), 10);
5326
+ const m = t.slice(2);
5327
+ const period = h >= 12 ? 'PM' : 'AM';
5328
+ const h12 = h % 12 || 12;
5329
+ return `${h12}:${m} ${period}`;
5330
+ };
5331
+ const day = slot.day;
5332
+ if (!dayMap[day])
5333
+ dayMap[day] = [];
5334
+ dayMap[day].push(`${fmt(slot.start)}–${fmt(slot.end)}`);
5335
+ }
5336
+ hoursStr = Object.entries(dayMap)
5337
+ .map(([d, times]) => `${dayNames[parseInt(d, 10)]}: ${times.join(', ')}`)
5338
+ .join(' | ');
5339
+ }
5286
5340
  const lines = [
5287
- `# ⭐ Yelp: ${name}`,
5288
- '',
5289
- rating && `**Rating:** ${rating}/5 (${reviewCount} reviews)`,
5290
- cuisine && `**Cuisine:** ${cuisine}`,
5291
- priceRange && `**Price:** ${priceRange}`,
5292
- address && `**Address:** ${address}`,
5293
- phone && `**Phone:** ${phone}`,
5294
- hours && `**Hours:** ${Array.isArray(hours) ? hours.join(', ') : hours}`,
5295
- description && `\n${description.substring(0, 500)}`,
5341
+ `# ${name} ${rating} (${reviewCount.toLocaleString()} reviews)`,
5296
5342
  '',
5297
- `**More info:** [View on Yelp](${url})`,
5298
- '',
5299
- '---',
5300
- '*Source: Yelp*',
5301
- ].filter(Boolean);
5343
+ ];
5344
+ if (address)
5345
+ lines.push(`📍 ${address}`);
5346
+ if (categories)
5347
+ lines.push(`🏷️ ${categories}${price ? ` | 💰 ${price}` : ''}`);
5348
+ else if (price)
5349
+ lines.push(`💰 ${price}`);
5350
+ if (phone)
5351
+ lines.push(`📞 ${phone}`);
5352
+ if (hoursStr)
5353
+ lines.push(`🕐 ${hoursStr}`);
5354
+ if (biz.is_closed === true)
5355
+ lines.push(`⚠️ *Permanently closed*`);
5356
+ lines.push('');
5357
+ if (reviews.length > 0) {
5358
+ for (const rev of reviews) {
5359
+ const stars = '⭐'.repeat(Math.round(rev.rating || 0));
5360
+ const text = (rev.text || '').replace(/\n+/g, ' ').trim().slice(0, 200);
5361
+ lines.push(`> ${stars} — ${text}${(rev.text || '').length > 200 ? '…' : ''}`);
5362
+ lines.push('');
5363
+ }
5364
+ }
5365
+ lines.push(`[View on Yelp](${yelpUrl})`);
5302
5366
  return {
5303
5367
  domain: 'yelp.com',
5304
5368
  type: 'business',
5305
- structured: { name, rating, reviewCount, address, phone, cuisine, priceRange, description },
5369
+ structured: { name, rating: parseFloat(rating), reviewCount, address, phone, price, categories, url: yelpUrl },
5306
5370
  cleanContent: lines.join('\n'),
5307
5371
  };
5308
5372
  }
5309
- // --- Search page — parse from meta / og tags ---
5310
- const ogTitle = $('meta[property="og:title"]').attr('content') || '';
5311
- const ogDescription = $('meta[property="og:description"]').attr('content') || '';
5312
- // Try to extract listing names from heading tags
5313
- const listings = [];
5314
- $('h3, h4').each((_, el) => {
5315
- const text = $(el).text().trim();
5316
- if (text && text.length > 2 && text.length < 100) {
5317
- const anchor = $(el).find('a').first();
5318
- const href = anchor.attr('href') || '';
5319
- const fullHref = href.startsWith('/') ? `https://www.yelp.com${href}` : href;
5320
- listings.push({ name: text, url: fullHref || undefined });
5321
- }
5322
- });
5323
- if (ogTitle || listings.length > 0) {
5324
- const searchTerm = ogTitle.replace(/\s*-\s*Yelp$/, '').trim();
5325
- const lines = [
5326
- `# 🔍 Yelp Search: ${searchTerm || 'Results'}`,
5327
- ogDescription && `\n${ogDescription}`,
5328
- listings.length > 0 && `\n**Found ${listings.length} results:**`,
5329
- ...listings.slice(0, 15).map((l, i) => `${i + 1}. ${l.url ? `[${l.name}](${l.url})` : l.name}`),
5330
- '',
5331
- `**Search:** [View on Yelp](${url})`,
5332
- '',
5333
- '---',
5334
- '*Source: Yelp*',
5335
- ].filter(Boolean);
5336
- return {
5337
- domain: 'yelp.com',
5338
- type: 'search',
5339
- structured: { query: searchTerm, count: listings.length, listings },
5340
- cleanContent: lines.join('\n'),
5341
- };
5373
+ // ----------------------------------------------------------------
5374
+ // Search / Category URL: /search?find_desc=...&find_loc=...
5375
+ // /search?cflt=restaurants&find_loc=...
5376
+ // ----------------------------------------------------------------
5377
+ const findDesc = searchParams.get('find_desc') || '';
5378
+ const cflt = searchParams.get('cflt') || '';
5379
+ const findLoc = searchParams.get('find_loc') || '';
5380
+ if (!findLoc && !findDesc && !cflt) {
5381
+ // Not a recognized pattern
5382
+ return null;
5342
5383
  }
5343
- return null;
5384
+ const apiParams = { limit: '10' };
5385
+ if (findLoc)
5386
+ apiParams.location = findLoc;
5387
+ if (findDesc)
5388
+ apiParams.term = findDesc;
5389
+ if (cflt && !findDesc)
5390
+ apiParams.categories = cflt;
5391
+ let data;
5392
+ try {
5393
+ data = await yelpFetch('/businesses/search', apiParams);
5394
+ }
5395
+ catch (e) {
5396
+ if (process.env.DEBUG)
5397
+ console.debug('[webpeel] Yelp search failed:', e instanceof Error ? e.message : e);
5398
+ return null;
5399
+ }
5400
+ const businesses = data.businesses || [];
5401
+ const total = data.total ?? businesses.length;
5402
+ // Build header
5403
+ const searchLabel = findDesc || cflt || 'Businesses';
5404
+ const locationLabel = findLoc || '';
5405
+ const emoji = cflt === 'restaurants' || findDesc?.toLowerCase().includes('restaurant') ? '🍽️'
5406
+ : findDesc?.toLowerCase().includes('pizza') ? '🍕'
5407
+ : findDesc?.toLowerCase().includes('coffee') || findDesc?.toLowerCase().includes('cafe') ? '☕'
5408
+ : findDesc?.toLowerCase().includes('bar') ? '🍺'
5409
+ : '🔍';
5410
+ const titleParts = [searchLabel.charAt(0).toUpperCase() + searchLabel.slice(1)];
5411
+ if (locationLabel)
5412
+ titleParts.push(`in ${locationLabel}`);
5413
+ const lines = [
5414
+ `# ${emoji} Yelp — ${titleParts.join(' ')}`,
5415
+ '',
5416
+ `*${businesses.length} of ${total.toLocaleString()} results via Yelp Fusion API*`,
5417
+ '',
5418
+ ];
5419
+ for (let i = 0; i < businesses.length; i++) {
5420
+ const b = businesses[i];
5421
+ const bName = b.name || 'Unknown';
5422
+ const bRating = b.rating != null ? b.rating.toFixed(1) : '?';
5423
+ const bReviews = b.review_count ?? 0;
5424
+ const bAddr = b.location;
5425
+ const bAddress = bAddr
5426
+ ? [bAddr.address1, bAddr.city, bAddr.state, bAddr.zip_code].filter(Boolean).join(', ')
5427
+ : '';
5428
+ const bPhone = b.display_phone || '';
5429
+ const bPrice = b.price || '';
5430
+ const bCategories = (b.categories || []).map((c) => c.title).join(' | ');
5431
+ const bUrl = b.url || '';
5432
+ const bSnippet = b.snippet_text || '';
5433
+ lines.push(`## ${i + 1}. ${bName} ⭐ ${bRating} (${bReviews.toLocaleString()} reviews)`);
5434
+ if (bAddress)
5435
+ lines.push(`📍 ${bAddress}`);
5436
+ const tagLine = [bCategories && `🏷️ ${bCategories}`, bPrice && `💰 ${bPrice}`].filter(Boolean).join(' | ');
5437
+ if (tagLine)
5438
+ lines.push(tagLine);
5439
+ if (bPhone)
5440
+ lines.push(`📞 ${bPhone}`);
5441
+ if (bSnippet)
5442
+ lines.push(`> ${bSnippet.replace(/\n+/g, ' ').trim().slice(0, 150)}`);
5443
+ if (bUrl)
5444
+ lines.push(`[View on Yelp](${bUrl})`);
5445
+ lines.push('');
5446
+ }
5447
+ if (businesses.length === 0) {
5448
+ lines.push(`*No results found for "${searchLabel}"${locationLabel ? ` in ${locationLabel}` : ''}.*`);
5449
+ }
5450
+ return {
5451
+ domain: 'yelp.com',
5452
+ type: 'search',
5453
+ structured: { query: searchLabel, location: locationLabel, total, count: businesses.length, businesses },
5454
+ cleanContent: lines.join('\n'),
5455
+ };
5344
5456
  }
5345
5457
  catch (e) {
5346
5458
  if (process.env.DEBUG)
@@ -5471,20 +5583,21 @@ async function zillowExtractor(_html, url) {
5471
5583
  const redfinCityUrl = `https://www.redfin.com/${stateCode}/${cityForUrl}`;
5472
5584
  const locationLabel = `${cityName}, ${stateCode}`;
5473
5585
  // Try to fetch live Redfin listings via their API
5474
- // Map common cities to known Redfin market IDs
5475
- const marketIdMap = {
5476
- 'NY-New-York': 8, 'NY-Brooklyn': 8, 'NY-Queens': 8, 'NY-Bronx': 8,
5477
- 'NY-Staten-Island': 8, 'NY-Manhattan': 8,
5478
- 'CA-Los-Angeles': 4, 'CA-San-Francisco': 1, 'CA-San-Diego': 5,
5479
- 'TX-Houston': 7, 'TX-Dallas': 24, 'TX-Austin': 22,
5480
- 'FL-Miami': 13, 'FL-Orlando': 15, 'FL-Tampa': 11,
5481
- 'IL-Chicago': 3, 'WA-Seattle': 16, 'MA-Boston': 10,
5482
- 'AZ-Phoenix': 14, 'PA-Philadelphia': 12, 'GA-Atlanta': 9,
5483
- 'CO-Denver': 6, 'MN-Minneapolis': 18, 'OR-Portland': 17,
5484
- 'NV-Las-Vegas': 20, 'NC-Charlotte': 21, 'OH-Columbus': 23,
5586
+ // Map common city slugs to Redfin city region IDs (region_type=6)
5587
+ const cityRegionMap = {
5588
+ 'NY-New-York': 30749, 'NY-Brooklyn': 30749, 'NY-Queens': 30749, 'NY-Bronx': 30749,
5589
+ 'NY-Staten-Island': 30749, 'NY-Manhattan': 30749,
5590
+ 'CA-Los-Angeles': 11203, 'CA-San-Francisco': 17151, 'CA-San-Diego': 18142,
5591
+ 'CA-San-Jose': 17420,
5592
+ 'TX-Houston': 30772, 'TX-Dallas': 35799, 'TX-Austin': 30818,
5593
+ 'FL-Miami': 10201, 'FL-Orlando': 13140, 'FL-Tampa': 18280,
5594
+ 'IL-Chicago': 29470, 'WA-Seattle': 16163, 'MA-Boston': 1826,
5595
+ 'AZ-Phoenix': 14240, 'PA-Philadelphia': 13364, 'GA-Atlanta': 30756,
5596
+ 'CO-Denver': 11093, 'MN-Minneapolis': 18959, 'OR-Portland': 14941,
5597
+ 'NV-Las-Vegas': 32820, 'NC-Charlotte': 3105, 'OH-Columbus': 8528,
5485
5598
  };
5486
5599
  const marketKey = `${stateCode}-${cityForUrl}`;
5487
- const marketId = marketIdMap[marketKey];
5600
+ const marketId = cityRegionMap[marketKey];
5488
5601
  if (marketId) {
5489
5602
  const payload = await fetchRedfinListings(marketId, 6 /* city */);
5490
5603
  if (payload?.homes && payload.homes.length > 0) {
@@ -5622,32 +5735,21 @@ async function redfinExtractor(_html, url) {
5622
5735
  const citySlug = stateCity[2];
5623
5736
  const cityName = citySlug.replace(/-/g, ' ');
5624
5737
  const locationLabel = `${cityName}, ${stateCode}`;
5625
- // No region ID — use a GIS bounding box search via the city name
5626
- // Try a known NYC region as a broader fallback search
5627
- // For now, attempt search with region_type=2 (market area)
5628
- // We'll make a best-effort attempt using a city name search
5629
- // Since Redfin's autocomplete is blocked, try common market IDs
5630
- const marketIdMap = {
5631
- 'NY-New-York': 8,
5632
- 'NY-Brooklyn': 8,
5633
- 'NY-Queens': 8,
5634
- 'NY-Bronx': 8,
5635
- 'NY-Staten-Island': 8,
5636
- 'NY-Manhattan': 8,
5637
- 'CA-Los-Angeles': 4,
5638
- 'CA-San-Francisco': 1,
5639
- 'TX-Houston': 7,
5640
- 'TX-Dallas': 24,
5641
- 'FL-Miami': 13,
5642
- 'IL-Chicago': 3,
5643
- 'WA-Seattle': 16,
5644
- 'MA-Boston': 10,
5645
- 'AZ-Phoenix': 14,
5646
- 'PA-Philadelphia': 12,
5647
- 'GA-Atlanta': 9,
5738
+ // No region ID in URL — use known Redfin city region IDs (region_type=6)
5739
+ const cityRegionMap = {
5740
+ 'NY-New-York': 30749, 'NY-Brooklyn': 30749, 'NY-Queens': 30749, 'NY-Bronx': 30749,
5741
+ 'NY-Staten-Island': 30749, 'NY-Manhattan': 30749,
5742
+ 'CA-Los-Angeles': 11203, 'CA-San-Francisco': 17151, 'CA-San-Diego': 18142,
5743
+ 'CA-San-Jose': 17420,
5744
+ 'TX-Houston': 30772, 'TX-Dallas': 35799, 'TX-Austin': 30818,
5745
+ 'FL-Miami': 10201, 'FL-Orlando': 13140, 'FL-Tampa': 18280,
5746
+ 'IL-Chicago': 29470, 'WA-Seattle': 16163, 'MA-Boston': 1826,
5747
+ 'AZ-Phoenix': 14240, 'PA-Philadelphia': 13364, 'GA-Atlanta': 30756,
5748
+ 'CO-Denver': 11093, 'MN-Minneapolis': 18959, 'OR-Portland': 14941,
5749
+ 'NV-Las-Vegas': 32820, 'NC-Charlotte': 3105, 'OH-Columbus': 8528,
5648
5750
  };
5649
5751
  const marketKey = `${stateCode}-${citySlug}`;
5650
- const marketId = marketIdMap[marketKey];
5752
+ const marketId = cityRegionMap[marketKey];
5651
5753
  if (marketId) {
5652
5754
  const payload = await fetchRedfinListings(marketId, 6 /* city */);
5653
5755
  if (payload?.homes && payload.homes.length > 0) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.71",
3
+ "version": "0.21.72",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",