webpeel 0.21.67 → 0.21.69

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,10 +30,19 @@ export type DomainExtractor = (html: string, url: string) => Promise<DomainExtra
30
30
  export declare function getDomainExtractor(url: string): DomainExtractor | null;
31
31
  /** Clear the extractor response cache (used in tests). */
32
32
  export declare function clearExtractorCache(): void;
33
+ /**
34
+ * Inject a Redis client for shared cross-pod caching.
35
+ * Called from server startup after Redis is initialized.
36
+ * Safe to call with null to disable Redis caching (e.g., CLI mode).
37
+ */
38
+ export declare function setExtractorRedis(redis: any): void;
33
39
  /**
34
40
  * Convenience: run the extractor for the URL (if one exists).
35
- * Wraps _extractDomainDataImpl with a 5-minute LRU cache so that
36
- * rate-limited API responses fall back to cached results instead of
37
- * garbage browser rendering.
41
+ * Wraps _extractDomainDataImpl with a two-tier cache:
42
+ * 1. In-memory LRU (per-pod, fastest)
43
+ * 2. Redis shared cache (cross-pod, shared across all replicas)
44
+ *
45
+ * With multiple API pods, Redis ensures the first pod to fetch a URL
46
+ * populates cache for all others — eliminating redundant API calls.
38
47
  */
39
48
  export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -112,6 +112,14 @@ const REGISTRY = [
112
112
  { match: (h) => h === 'open-meteo.com' || h === 'api.open-meteo.com' || h === 'www.open-meteo.com', extractor: weatherExtractor },
113
113
  { match: (h) => h === 'weather.com' || h === 'www.weather.com', extractor: weatherExtractor },
114
114
  { match: (h) => h === 'accuweather.com' || h === 'www.accuweather.com', extractor: weatherExtractor },
115
+ // ── Marketplaces & Shopping ───────────────────────────────────────────────
116
+ { match: (h) => h === 'facebook.com' || h === 'www.facebook.com', extractor: facebookMarketplaceExtractor },
117
+ { match: (h) => h === 'etsy.com' || h === 'www.etsy.com', extractor: etsyExtractor },
118
+ { match: (h) => h === 'cars.com' || h === 'www.cars.com', extractor: carsComExtractor },
119
+ { match: (h) => h === 'ebay.com' || h === 'www.ebay.com', extractor: ebayExtractor },
120
+ // ── Local / Real Estate ────────────────────────────────────────────────────
121
+ { match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
122
+ { match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
115
123
  ];
116
124
  /**
117
125
  * Returns the domain extractor for a URL, or null if none matches.
@@ -159,6 +167,50 @@ function setCachedExtractorResult(url, result) {
159
167
  EXTRACTOR_CACHE.delete(oldest);
160
168
  }
161
169
  }
170
+ // ── Redis Shared Cache (cross-pod cache for multi-replica deployments) ────────
171
+ // When running multiple API pods, each pod has its own in-memory cache.
172
+ // With 6 pods, 5/6 requests for the same URL miss cache.
173
+ // Redis solves this: all pods share one cache, so the first pod to fetch
174
+ // populates it for all others.
175
+ //
176
+ // Redis is injected from the server startup to keep this core module
177
+ // dependency-free (works in CLI mode without Redis too).
178
+ let _redisClient = null;
179
+ const REDIS_CACHE_PREFIX = 'wp:ext:';
180
+ const REDIS_CACHE_TTL_SECS = 300; // 5 minutes
181
+ /**
182
+ * Inject a Redis client for shared cross-pod caching.
183
+ * Called from server startup after Redis is initialized.
184
+ * Safe to call with null to disable Redis caching (e.g., CLI mode).
185
+ */
186
+ export function setExtractorRedis(redis) {
187
+ _redisClient = redis;
188
+ }
189
+ async function getRedisCache(url) {
190
+ try {
191
+ if (!_redisClient)
192
+ return null;
193
+ const key = REDIS_CACHE_PREFIX + url.replace(/[?#].*$/, '').toLowerCase();
194
+ const cached = await _redisClient.get(key);
195
+ if (!cached)
196
+ return null;
197
+ return JSON.parse(cached);
198
+ }
199
+ catch {
200
+ return null; // Redis unavailable — fall back to in-memory cache
201
+ }
202
+ }
203
+ async function setRedisCache(url, result) {
204
+ try {
205
+ if (!_redisClient)
206
+ return;
207
+ const key = REDIS_CACHE_PREFIX + url.replace(/[?#].*$/, '').toLowerCase();
208
+ await _redisClient.set(key, JSON.stringify(result), 'EX', REDIS_CACHE_TTL_SECS);
209
+ }
210
+ catch {
211
+ // Redis unavailable — in-memory cache still works, this is non-fatal
212
+ }
213
+ }
162
214
  // ─────────────────────────────────────────────────────────────────────────────
163
215
  /**
164
216
  * Internal implementation: run the extractor for the URL (if one exists).
@@ -177,28 +229,39 @@ async function _extractDomainDataImpl(html, url) {
177
229
  }
178
230
  /**
179
231
  * Convenience: run the extractor for the URL (if one exists).
180
- * Wraps _extractDomainDataImpl with a 5-minute LRU cache so that
181
- * rate-limited API responses fall back to cached results instead of
182
- * garbage browser rendering.
232
+ * Wraps _extractDomainDataImpl with a two-tier cache:
233
+ * 1. In-memory LRU (per-pod, fastest)
234
+ * 2. Redis shared cache (cross-pod, shared across all replicas)
235
+ *
236
+ * With multiple API pods, Redis ensures the first pod to fetch a URL
237
+ * populates cache for all others — eliminating redundant API calls.
183
238
  */
184
239
  export async function extractDomainData(html, url) {
185
- // 1. Check fresh cache first
240
+ // 1. Check in-memory cache (fastest — no network)
186
241
  const cached = getCachedExtractorResult(url);
187
242
  if (cached)
188
243
  return cached;
189
- // 2. Try the real extractor
244
+ // 2. Check Redis cache (shared across all pods)
245
+ const redisCached = await getRedisCache(url);
246
+ if (redisCached) {
247
+ // Populate local in-memory cache to avoid Redis round-trips on repeat
248
+ setCachedExtractorResult(url, redisCached);
249
+ return redisCached;
250
+ }
251
+ // 3. Try the real extractor
190
252
  const result = await _extractDomainDataImpl(html, url);
191
253
  if (result && result.cleanContent.length > 20) {
192
- // 3. Cache the successful result
254
+ // 4. Cache the successful result in both layers
193
255
  setCachedExtractorResult(url, result);
256
+ void setRedisCache(url, result); // fire-and-forget, non-blocking
194
257
  return result;
195
258
  }
196
- // 4. Extractor failed/returned garbage — check for any stale cache entry
259
+ // 5. Extractor failed/returned garbage — check for any stale cache entry
197
260
  // (stale structured data beats a browser "Loading…" page)
198
261
  const stale = getCachedExtractorResult(url);
199
262
  if (stale)
200
263
  return stale;
201
- // 5. Genuinely nothing — return null so the pipeline falls back to fetch
264
+ // 6. Genuinely nothing — return null so the pipeline falls back to fetch
202
265
  return result;
203
266
  }
204
267
  // ---------------------------------------------------------------------------
@@ -3139,6 +3202,62 @@ async function bbcExtractor(html, url) {
3139
3202
  return extractNewsArticle(html, url, 'bbc.com');
3140
3203
  }
3141
3204
  async function cnnExtractor(html, url) {
3205
+ try {
3206
+ const u = new URL(url);
3207
+ // For homepage — use CNN Lite which has actual headline links
3208
+ if (u.pathname === '/' || u.pathname === '' || u.hostname === 'lite.cnn.com') {
3209
+ const liteResp = await fetch('https://lite.cnn.com', { headers: { 'User-Agent': 'webpeel/0.21' } });
3210
+ if (liteResp.ok) {
3211
+ const liteHtml = await liteResp.text();
3212
+ const headlines = [];
3213
+ const matches = liteHtml.matchAll(/<a[^>]+href="([^"]*)"[^>]*>([^<]+)<\/a>/g);
3214
+ for (const m of matches) {
3215
+ const href = m[1].trim();
3216
+ const text = m[2].trim();
3217
+ // CNN Lite article links contain year patterns like /2026/
3218
+ if (/\/20\d\d\//.test(href) && text.length > 10) {
3219
+ const fullUrl = href.startsWith('http') ? href : `https://www.cnn.com${href}`;
3220
+ headlines.push(`- [${text}](${fullUrl})`);
3221
+ }
3222
+ }
3223
+ if (headlines.length > 5) {
3224
+ return {
3225
+ domain: 'cnn.com',
3226
+ type: 'headlines',
3227
+ structured: { headlines: headlines.length, source: 'cnn-lite' },
3228
+ cleanContent: `# 📰 CNN — Top Headlines\n\n${headlines.slice(0, 20).join('\n')}\n\n---\n*Source: CNN Lite*`,
3229
+ };
3230
+ }
3231
+ }
3232
+ }
3233
+ // For article pages — try CNN Lite version of the same URL
3234
+ if (/\/20\d\d\//.test(u.pathname)) {
3235
+ const liteUrl = `https://lite.cnn.com${u.pathname}`;
3236
+ const liteResp = await fetch(liteUrl, { headers: { 'User-Agent': 'webpeel/0.21' } });
3237
+ if (liteResp.ok) {
3238
+ const liteHtml = await liteResp.text();
3239
+ const { load } = await import('cheerio');
3240
+ const $l = load(liteHtml);
3241
+ const title = $l('h1').first().text().trim();
3242
+ const paragraphs = [];
3243
+ $l('p').each((_, el) => {
3244
+ const text = $l(el).text().trim();
3245
+ if (text.length > 20)
3246
+ paragraphs.push(text);
3247
+ });
3248
+ if (title && paragraphs.length > 0) {
3249
+ return {
3250
+ domain: 'cnn.com',
3251
+ type: 'article',
3252
+ structured: { title, paragraphs: paragraphs.length, source: 'cnn-lite' },
3253
+ cleanContent: `# ${title}\n\n${paragraphs.join('\n\n')}\n\n---\n*Source: CNN*`,
3254
+ };
3255
+ }
3256
+ }
3257
+ }
3258
+ }
3259
+ catch { /* fall through to standard extractor */ }
3260
+ // Fallback to standard news article extractor (works if HTML has content)
3142
3261
  return extractNewsArticle(html, url, 'cnn.com');
3143
3262
  }
3144
3263
  // ---------------------------------------------------------------------------
@@ -4846,3 +4965,447 @@ async function weatherExtractor(_html, url) {
4846
4965
  return null;
4847
4966
  }
4848
4967
  }
4968
+ // ---------------------------------------------------------------------------
4969
+ // Facebook Marketplace extractor (login-wall fallback)
4970
+ // ---------------------------------------------------------------------------
4971
/**
 * Facebook Marketplace extractor (login-wall fallback).
 *
 * Never reads the page HTML. It parses the search parameters out of the URL and
 * returns a `blocked` result that explains the login wall and, when a search
 * query is present, suggests equivalent Craigslist / Cars.com / eBay searches.
 *
 * @param _html Unused.
 * @param url   Facebook URL; only paths containing `/marketplace` are handled.
 * @returns A `blocked` result, or null for non-marketplace or unparseable URLs.
 */
async function facebookMarketplaceExtractor(_html, url) {
    let u;
    try {
        u = new URL(url);
    }
    catch {
        return null; // unparseable URL — let the pipeline handle it
    }
    if (!u.pathname.includes('/marketplace'))
        return null;
    const query = u.searchParams.get('query') || '';
    const maxPrice = u.searchParams.get('maxPrice') || '';
    const minPrice = u.searchParams.get('minPrice') || '';
    // Extract location segment: /marketplace/nyc/search → "nyc"
    const locationMatch = u.pathname.match(/\/marketplace\/([^/]+)(?:\/|$)/);
    const location = (locationMatch?.[1] && locationMatch[1] !== 'search' && locationMatch[1] !== 'category') ? locationMatch[1] : '';
    const priceRange = [minPrice && `$${minPrice}`, maxPrice && `$${maxPrice}`].filter(Boolean).join(' – ');
    const lines = [
        `# 🛒 Facebook Marketplace`,
        '',
        `**Search:** ${query || 'Browse all'}`,
        ...(location ? [`**Location:** ${location}`] : []),
        ...(priceRange ? [`**Price range:** ${priceRange}`] : []),
        '',
        '> ⚠️ Facebook Marketplace requires authentication. WebPeel cannot access listings directly.',
    ];
    if (query) {
        // Both values come from the request URL, so encode them before embedding
        // them in the suggested URLs.
        const encodedQuery = encodeURIComponent(query);
        const encodedMax = encodeURIComponent(maxPrice);
        const clUrl = `https://newyork.craigslist.org/search/sss?query=${encodedQuery}${maxPrice ? '&max_price=' + encodedMax : ''}`;
        const carsUrl = `https://www.cars.com/shopping/results/?keyword=${encodedQuery}&list_price_max=${encodedMax}&zip=10001&stock_type=used`;
        const ebayUrl = `https://www.ebay.com/sch/i.html?_nkw=${encodedQuery}${maxPrice ? '&_udhi=' + encodedMax : ''}&LH_BIN=1`;
        // The heading is only emitted together with its list items.
        lines.push('', '**Alternative searches that work:**', `- \`webpeel "${clUrl}"\` — Craigslist`, `- \`webpeel "${carsUrl}"\` — Cars.com`, `- \`webpeel "${ebayUrl}"\` — eBay`);
    }
    lines.push('', '*Tip: Craigslist and Cars.com return full structured results with WebPeel.*');
    return {
        domain: 'facebook.com',
        type: 'blocked',
        structured: {
            query,
            location,
            minPrice,
            maxPrice,
            reason: 'authentication required',
            alternatives: ['craigslist', 'cars.com', 'ebay'],
        },
        cleanContent: lines.join('\n'),
    };
}
5014
+ // ---------------------------------------------------------------------------
5015
+ // Etsy extractor (bot-block fallback with Google site-search suggestion)
5016
+ // ---------------------------------------------------------------------------
5017
/**
 * Etsy extractor (bot-block fallback with Google site-search suggestion).
 *
 * Never reads the page HTML. It derives a search query (`?q=` or
 * `/search/<slug>`) or a shop name (`/shop/<name>`) from the URL and returns a
 * `blocked` result listing alternative ways to reach the same content.
 *
 * @param _html Unused.
 * @param url   Etsy page URL.
 * @returns A `blocked` result, or null when the URL carries neither a query nor
 *          a shop name, or cannot be parsed.
 */
async function etsyExtractor(_html, url) {
    let u;
    try {
        u = new URL(url);
    }
    catch {
        return null; // unparseable URL — let the pipeline handle it
    }
    // Extract search query from various URL patterns
    // /search?q=handmade+jewelry OR /search/handmade-jewelry
    let query = u.searchParams.get('q') || '';
    if (!query) {
        const pathMatch = u.pathname.match(/\/search\/([^?#]+)/);
        if (pathMatch) {
            const spaced = pathMatch[1].replace(/-/g, ' ');
            try {
                query = decodeURIComponent(spaced);
            }
            catch {
                // Malformed percent-escape (e.g. "100%-cotton") — use the raw text
                // instead of rejecting the whole extraction.
                query = spaced;
            }
        }
    }
    // Shop page: /shop/ShopName
    const shopMatch = u.pathname.match(/^\/shop\/([^/?#]+)/);
    const shopName = shopMatch?.[1] || '';
    if (!query && !shopName)
        return null;
    const googleUrl = query
        ? `https://www.google.com/search?q=site:etsy.com+${encodeURIComponent(query)}`
        : `https://www.google.com/search?q=site:etsy.com+${encodeURIComponent(shopName)}`;
    const etsySearchUrl = query ? `https://www.etsy.com/search?q=${encodeURIComponent(query)}` : url;
    const displayTitle = query ? `"${query}"` : `Shop: ${shopName}`;
    // The link label must describe the link target: a search URL for queries,
    // the shop page for shop URLs.
    const directLabel = query
        ? `etsy.com/search?q=${encodeURIComponent(query)}`
        : `etsy.com/shop/${shopName}`;
    const cleanContent = [
        `# 🎨 Etsy — ${displayTitle}`,
        '',
        '> ⚠️ Etsy blocks automated access. WebPeel cannot scrape listings directly.',
        '',
        '**Alternatives that work:**',
        `- \`webpeel "${googleUrl}"\` — Google site:etsy.com results`,
        `- Direct link: [${directLabel}](${etsySearchUrl})`,
        '',
        ...(query ? [
            '**Similar items on open marketplaces:**',
            `- \`webpeel "https://www.ebay.com/sch/i.html?_nkw=${encodeURIComponent(query)}&LH_BIN=1"\` — eBay`,
            `- \`webpeel "https://newyork.craigslist.org/search/sss?query=${encodeURIComponent(query)}"\` — Craigslist`,
        ] : []),
        '',
        '*Etsy Open API v3 (free key at etsy.com/developers) can unlock direct access.*',
    ].join('\n');
    return {
        domain: 'etsy.com',
        type: 'blocked',
        structured: {
            query,
            shopName,
            reason: 'bot-block',
            googleFallback: googleUrl,
        },
        cleanContent,
    };
}
5066
+ // ---------------------------------------------------------------------------
5067
+ // Cars.com extractor — structured parsing via data-vehicle-details JSON attrs
5068
+ // ---------------------------------------------------------------------------
5069
/**
 * Cars.com extractor — structured parsing via `data-vehicle-details` JSON attrs.
 *
 * - `/vehicledetail/` pages: returns title, price and mileage read from the DOM.
 * - Search pages: reads the JSON embedded in each
 *   `fuse-card[data-vehicle-details]` element and returns up to 20 rows.
 *
 * @param html Rendered page HTML.
 * @param url  Page URL; search filters are read from its query string.
 * @returns A `listing` or `search` result, or null when nothing could be
 *          extracted (so the pipeline can fall back) or on any error.
 */
async function carsComExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const u = new URL(url);
        const keyword = u.searchParams.get('keyword') || '';
        const maxPrice = u.searchParams.get('list_price_max') || '';
        const minPrice = u.searchParams.get('list_price_min') || '';
        const zip = u.searchParams.get('zip') || '';
        const stockType = u.searchParams.get('stock_type') || '';
        // Individual vehicle detail page
        if (u.pathname.includes('/vehicledetail/')) {
            const title = $('h1').first().text().trim() ||
                $('title').text().trim().split(' | ')[0];
            if (!title)
                return null;
            const price = $('[class*="price"]').first().text().trim();
            const mileage = $('[class*="mileage"]').first().text().trim();
            return {
                domain: 'cars.com',
                type: 'listing',
                structured: { title, price, mileage, url },
                cleanContent: [
                    `# 🚗 ${title}`,
                    price && `**Price:** ${price}`,
                    mileage && `**Mileage:** ${mileage}`,
                    `\n[View listing](${url})`,
                ].filter(Boolean).join('\n'),
            };
        }
        // Search results page — Cars.com embeds JSON in fuse-card data-vehicle-details
        const listings = [];
        $('fuse-card[data-vehicle-details]').each((_, el) => {
            try {
                const raw = $(el).attr('data-vehicle-details');
                if (!raw)
                    return;
                const v = JSON.parse(raw);
                // Skip cards that identify no vehicle at all; otherwise the title
                // would be just the stock type.
                if (!v.year && !v.make && !v.model)
                    return;
                const listingId = v.listingId || $(el).attr('data-listing-id') || '';
                const cardLink = $(el).find('card-gallery').attr('card-link') || (listingId ? `/vehicledetail/${listingId}/` : '');
                // Join only the fields that are present so a missing one never
                // shows up as the literal text "undefined".
                const title = [v.stockType || 'Used', v.year, v.make, v.model, v.trim].filter(Boolean).join(' ');
                const price = v.price ? `$${Number(v.price).toLocaleString()}` : '';
                const mileage = v.mileage ? `${Number(v.mileage).toLocaleString()} mi` : '';
                const bodyStyle = v.bodyStyle || '';
                const fuelType = v.fuelType || '';
                listings.push({ title, price, mileage, bodyStyle, fuelType, url: cardLink });
            }
            catch { /* skip malformed */ }
        });
        if (listings.length === 0)
            return null; // Let pipeline handle it
        // Describe whichever price bounds were actually supplied.
        const priceLabel = minPrice && maxPrice
            ? `$${minPrice} – $${maxPrice}`
            : maxPrice
                ? `up to $${maxPrice}`
                : minPrice
                    ? `from $${minPrice}`
                    : '';
        // Build the header by pushing lines so the blank separator lines survive;
        // filtering the whole array would glue the header onto the first row.
        const headerLines = [`# 🚗 Cars.com — ${keyword || 'Vehicle Search'}`, ''];
        if (keyword)
            headerLines.push(`**Search:** ${keyword}`);
        if (zip)
            headerLines.push(`**Location:** ZIP ${zip}`);
        if (priceLabel)
            headerLines.push(`**Price:** ${priceLabel}`);
        if (stockType)
            headerLines.push(`**Stock:** ${stockType}`);
        headerLines.push(`**Results:** ${listings.length} listings`, '');
        const rows = listings.slice(0, 20).map((l, i) => {
            // card-link is normally site-relative; leave absolute links untouched.
            const link = l.url && (l.url.startsWith('http') ? l.url : `https://www.cars.com${l.url}`);
            const parts = [
                `${i + 1}. **${l.title}**`,
                l.price,
                l.mileage,
                l.bodyStyle,
                link && `[→](${link})`,
            ].filter(Boolean);
            return parts.join(' · ');
        });
        return {
            domain: 'cars.com',
            type: 'search',
            structured: { keyword, zip, minPrice, maxPrice, stockType, count: listings.length, listings },
            cleanContent: [...headerLines, ...rows].join('\n'),
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Cars.com extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
5156
+ // ---------------------------------------------------------------------------
5157
+ // eBay extractor — clean up noisy search results
5158
+ // ---------------------------------------------------------------------------
5159
/**
 * eBay extractor — clean up noisy search results.
 *
 * - `/itm/` pages: returns title, price and condition read from the DOM.
 * - Search pages: reads each `li[data-listingid]` card and returns up to 20
 *   rows with title, price, condition keyword and a tracking-free item URL.
 *
 * @param html Rendered page HTML.
 * @param url  Page URL; `_nkw`, `_udlo` and `_udhi` are read from its query.
 * @returns A `listing` or `search` result, or null when nothing could be
 *          extracted (so the pipeline can fall back) or on any error.
 */
async function ebayExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const u = new URL(url);
        // Individual item page
        if (u.pathname.startsWith('/itm/')) {
            const title = $('h1').first().text().trim();
            if (!title)
                return null;
            const price = $('[class*="price"]').not('[class*="shipping"]').first().text().trim();
            const condition = $('[class*="condition"]').first().text().trim();
            return {
                domain: 'ebay.com',
                type: 'listing',
                structured: { title, price, condition, url },
                cleanContent: [
                    `# 🛍 ${title}`,
                    price && `**Price:** ${price}`,
                    condition && `**Condition:** ${condition}`,
                    `\n[View on eBay](${url})`,
                ].filter(Boolean).join('\n'),
            };
        }
        // Search results page
        const keyword = u.searchParams.get('_nkw') || '';
        const maxPrice = u.searchParams.get('_udhi') || '';
        const minPrice = u.searchParams.get('_udlo') || '';
        const conditionKeywords = ['Pre-Owned', 'Brand New', 'Open Box', 'Refurbished', 'For Parts'];
        const listings = [];
        // eBay search results use li[data-listingid] + .s-card__title / .s-card__price
        $('li[data-listingid]').each((_, el) => {
            const title = $(el).find('.s-card__title').text().trim()
                .replace(/Opens in a new window or tab/g, '')
                .replace(/^New Listing\s*/i, '')
                .trim();
            if (!title || title === 'Shop on eBay')
                return;
            const price = $(el).find('.s-card__price').first().text().trim();
            // .s-card__subtitle contains "DealerNameCondition" as merged text — extract condition keyword
            const subtitleText = $(el).find('.s-card__subtitle').text().trim();
            const condition = conditionKeywords.find((k) => subtitleText.includes(k)) || '';
            // Get clean URL — extract /itm/<id> and strip tracking params
            let href = '';
            const itemLink = $(el).find('a[href*="/itm/"]').first().attr('href') || '';
            const itmMatch = itemLink.match(/(https?:\/\/[^/]*\/itm\/\d+)/);
            if (itmMatch)
                href = itmMatch[1];
            const listingId = $(el).attr('data-listingid') || '';
            if (!href && listingId)
                href = `https://www.ebay.com/itm/${listingId}`;
            listings.push({ title, price, condition, url: href });
        });
        if (listings.length === 0)
            return null; // Let pipeline handle it
        // Describe whichever price bounds were actually supplied.
        const priceLabel = minPrice && maxPrice
            ? `$${minPrice} – $${maxPrice}`
            : maxPrice
                ? `up to $${maxPrice}`
                : minPrice
                    ? `from $${minPrice}`
                    : '';
        // Build the header by pushing lines so the blank separator lines survive;
        // filtering the whole array would glue the header onto the first row.
        const headerLines = [`# 🛍 eBay — ${keyword || 'Search Results'}`, ''];
        if (keyword)
            headerLines.push(`**Search:** ${keyword}`);
        if (priceLabel)
            headerLines.push(`**Price:** ${priceLabel}`);
        headerLines.push(`**Results:** ${listings.length} listings`, '');
        const rows = listings.slice(0, 20).map((l, i) => {
            const parts = [
                `${i + 1}. **${l.title}**`,
                l.price,
                l.condition && `[${l.condition}]`,
                l.url && `[→](${l.url})`,
            ].filter(Boolean);
            return parts.join(' · ');
        });
        return {
            domain: 'ebay.com',
            type: 'search',
            structured: { keyword, minPrice, maxPrice, count: listings.length, listings },
            cleanContent: [...headerLines, ...rows].join('\n'),
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'eBay extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
5246
+ // ---------------------------------------------------------------------------
5247
+ // Yelp extractor — parse JSON-LD + meta from stealth-rendered HTML
5248
+ // ---------------------------------------------------------------------------
5249
/**
 * Yelp extractor — parse JSON-LD + meta tags from the rendered HTML.
 *
 * - Business pages: uses the last JSON-LD item whose `@type` is one of the
 *   recognised business types and formats its rating, address, phone, etc.
 * - Search pages: falls back to `og:title` / `og:description` and the text of
 *   `h3` / `h4` headings as result names.
 *
 * @param html Rendered page HTML.
 * @param url  Page URL, used for the "View on Yelp" links.
 * @returns A `business` or `search` result, or null when the page yields
 *          neither, or on any error.
 */
async function yelpExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const businessTypes = ['Restaurant', 'LocalBusiness', 'FoodEstablishment', 'BarOrPub', 'CafeOrCoffeeShop'];
        // Try JSON-LD structured data first
        let businessData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            const raw = $(el).html() || '';
            try {
                const parsed = JSON.parse(raw);
                const items = Array.isArray(parsed) ? parsed : [parsed];
                for (const item of items) {
                    // JSON-LD allows @type to be a single string or an array of strings.
                    const declared = item?.['@type'];
                    const types = Array.isArray(declared) ? declared : [declared];
                    if (types.some((t) => businessTypes.includes(t))) {
                        businessData = item;
                    }
                }
            }
            catch { /* ignore malformed JSON-LD */ }
        });
        // --- Business page ---
        if (businessData) {
            const name = businessData.name || '';
            const rating = businessData.aggregateRating?.ratingValue;
            const reviewCount = businessData.aggregateRating?.reviewCount;
            const addr = businessData.address;
            const address = addr
                ? [addr.streetAddress, addr.addressLocality, addr.addressRegion, addr.postalCode].filter(Boolean).join(', ')
                : '';
            const phone = businessData.telephone || '';
            const cuisine = businessData.servesCuisine || '';
            const priceRange = businessData.priceRange || '';
            const description = businessData.description || $('meta[property="og:description"]').attr('content') || '';
            const hours = businessData.openingHours || '';
            // A non-string description would make substring() throw and discard the
            // whole result, so only strings are rendered.
            const descriptionText = typeof description === 'string' ? description : '';
            // Only the optional detail lines are filtered; the blank separator lines
            // below are kept so "---" is not read as a setext heading underline.
            const details = [
                rating && `**Rating:** ${rating}/5${reviewCount != null ? ` (${reviewCount} reviews)` : ''}`,
                cuisine && `**Cuisine:** ${Array.isArray(cuisine) ? cuisine.join(', ') : cuisine}`,
                priceRange && `**Price:** ${priceRange}`,
                address && `**Address:** ${address}`,
                phone && `**Phone:** ${phone}`,
                hours && `**Hours:** ${Array.isArray(hours) ? hours.join(', ') : hours}`,
                descriptionText && `\n${descriptionText.substring(0, 500)}`,
            ].filter(Boolean);
            const lines = [
                `# ⭐ Yelp: ${name}`,
                '',
                ...details,
                '',
                `**More info:** [View on Yelp](${url})`,
                '',
                '---',
                '*Source: Yelp*',
            ];
            return {
                domain: 'yelp.com',
                type: 'business',
                structured: { name, rating, reviewCount, address, phone, cuisine, priceRange, description },
                cleanContent: lines.join('\n'),
            };
        }
        // --- Search page — parse from meta / og tags ---
        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
        // Try to extract listing names from heading tags
        const listings = [];
        $('h3, h4').each((_, el) => {
            const text = $(el).text().trim();
            if (text && text.length > 2 && text.length < 100) {
                const href = $(el).find('a').first().attr('href') || '';
                const fullHref = href.startsWith('/') ? `https://www.yelp.com${href}` : href;
                listings.push({ name: text, url: fullHref || undefined });
            }
        });
        if (ogTitle || listings.length > 0) {
            const searchTerm = ogTitle.replace(/\s*-\s*Yelp$/, '').trim();
            const body = [
                ogDescription && `\n${ogDescription}`,
                listings.length > 0 && `\n**Found ${listings.length} results:**`,
            ].filter(Boolean);
            const lines = [
                `# 🔍 Yelp Search: ${searchTerm || 'Results'}`,
                ...body,
                ...listings.slice(0, 15).map((l, i) => `${i + 1}. ${l.url ? `[${l.name}](${l.url})` : l.name}`),
                '',
                `**Search:** [View on Yelp](${url})`,
                '',
                '---',
                '*Source: Yelp*',
            ];
            return {
                domain: 'yelp.com',
                type: 'search',
                structured: { query: searchTerm, count: listings.length, listings },
                cleanContent: lines.join('\n'),
            };
        }
        return null;
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Yelp extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
5350
+ // ---------------------------------------------------------------------------
5351
+ // Zillow extractor — smart fallback with helpful alternatives
5352
+ // ---------------------------------------------------------------------------
5353
/**
 * Zillow extractor — smart fallback with helpful alternatives.
 *
 * Never reads the page HTML. It turns the URL path into a readable location
 * label and, when the first path segment looks like `<city>-<st>` (ending in a
 * two-letter state code, e.g. `new-york-ny`), builds city-specific Redfin and
 * Realtor.com links. For any other path (`/homedetails/…`, `/homes/…`, `/`) the
 * alternatives point at the sites' home pages instead of a made-up search URL.
 *
 * @param _html Unused.
 * @param url   Zillow page URL.
 * @returns A `real-estate` result flagged `blocked: true`, or null if the URL
 *          cannot be parsed.
 */
async function zillowExtractor(_html, url) {
    try {
        const u = new URL(url);
        // Derive location label from the URL path
        const rawPath = u.pathname.replace(/^\//, '').replace(/\/$/, '');
        const location = rawPath
            .replace(/\//g, ' ')
            .replace(/-/g, ' ')
            .trim();
        // Parse city/state for alternative links
        const pathParts = rawPath.split('/').filter(Boolean);
        const firstSegment = pathParts[0] || ''; // e.g. "new-york-ny"
        const segments = firstSegment.split('-');
        const statePart = segments[segments.length - 1] || '';
        const cityPart = segments.slice(0, -1).join('-');
        // Only trust the segment as "<city>-<st>" when it ends in a two-letter
        // code and has a non-empty city part.
        const looksLikeCityState = /^[a-z]{2}$/i.test(statePart) && cityPart.length > 0;
        let redfinCityPath = 'https://www.redfin.com';
        let realtorPath = 'https://www.realtor.com';
        if (looksLikeCityState) {
            // Redfin city path
            const cityCapitalized = cityPart.split('-').map((w) => w.charAt(0).toUpperCase() + w.slice(1)).join('_');
            redfinCityPath = `https://www.redfin.com/city/${cityCapitalized}/${statePart.toUpperCase()}`;
            realtorPath = `https://www.realtor.com/realestateandhomes-search/${firstSegment}`;
        }
        const cleanContent = [
            `# 🏠 Zillow — ${location || 'Real Estate Search'}`,
            '',
            '> ⚠️ **Zillow blocks automated access.** WebPeel cannot retrieve live listings directly.',
            '',
            '**Try these alternatives that work with WebPeel:**',
            `- [Redfin](${redfinCityPath}) — similar listings, scrape-friendly`,
            `- [Realtor.com](${realtorPath}) — MLS-powered, often accessible`,
            `- [Homes.com](https://www.homes.com) — newer platform, better access`,
            '',
            `**Direct Zillow link:** [Open Zillow](${url})`,
            '',
            '---',
            '*Source: Zillow (access blocked — showing alternatives)*',
        ].join('\n');
        return {
            domain: 'zillow.com',
            type: 'real-estate',
            structured: {
                location,
                blocked: true,
                alternatives: [
                    { name: 'Redfin', url: redfinCityPath },
                    { name: 'Realtor.com', url: realtorPath },
                ],
            },
            cleanContent,
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Zillow extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
@@ -51,6 +51,7 @@ function shouldForceBrowser(url) {
51
51
  'glassdoor.com',
52
52
  'bloomberg.com',
53
53
  'indeed.com',
54
+ 'yelp.com', // aggressive bot detection
54
55
  'amazon.com', // captcha wall on simple/browser fetch
55
56
  'zillow.com', // aggressive bot detection
56
57
  'ticketmaster.com', // Distil Networks / PerimeterX
@@ -54,6 +54,7 @@ import { createSentryHooks } from './sentry.js';
54
54
  import { requireScope } from './middleware/scope-guard.js';
55
55
  import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
56
56
  import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
57
+ import { setExtractorRedis } from '../core/domain-extractors.js';
57
58
  import { registerPremiumHooks } from './premium/index.js';
58
59
  import { readFileSync } from 'fs';
59
60
  import { join, dirname } from 'path';
@@ -421,6 +422,29 @@ export function startServer(config = {}) {
421
422
  const port = config.port || parseInt(process.env.PORT || '3000', 10);
422
423
  // Activate premium strategy hooks (SWR cache, domain intelligence, race).
423
424
  registerPremiumHooks();
425
+ // Inject Redis into the domain extractor cache for cross-pod cache sharing.
426
+ // When REDIS_URL is set (multi-pod k8s deployments), all pods share one cache
427
+ // so the first pod to fetch a URL populates it for all others.
428
if (process.env.REDIS_URL) {
    // @ts-ignore — ioredis CJS/ESM interop
    import('ioredis').then((IoRedisModule) => {
        const IoRedis = IoRedisModule.default ?? IoRedisModule;
        // Hand the full connection URL to ioredis instead of copying host/port/db
        // by hand: the manual copy dropped any username/password in the URL and
        // ignored a rediss:// (TLS) scheme.
        const redis = new IoRedis(process.env.REDIS_URL, {
            lazyConnect: true,
            maxRetriesPerRequest: 3,
            enableOfflineQueue: false,
        });
        // Route connection errors to the logger rather than leaving the client's
        // 'error' event without a listener.
        redis.on('error', (err) => {
            log.warn('Redis extractor cache error (falling back to in-memory)', { error: err instanceof Error ? err.message : String(err) });
        });
        // NOTE(review): with lazyConnect and the offline queue disabled, the first
        // command is expected to be rejected while the socket is still opening —
        // confirm against ioredis docs. Connecting eagerly here avoids that.
        void redis.connect().catch((err) => {
            log.warn('Redis extractor cache initial connect failed', { error: err instanceof Error ? err.message : String(err) });
        });
        setExtractorRedis(redis);
        log.info('Redis extractor cache initialized (shared cross-pod cache active)');
    }).catch((err) => {
        log.warn('Failed to init Redis extractor cache (in-memory only)', { error: err.message });
    });
}
424
448
  // Pre-warm browser resources in the background to reduce first-request latency.
425
449
  void warmup().catch((error) => {
426
450
  log.warn('Browser warmup failed', { error: error instanceof Error ? error.message : String(error) });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.67",
3
+ "version": "0.21.69",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",