webpeel 0.21.66 → 0.21.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.d.ts +14 -1
- package/dist/core/domain-extractors.js +402 -2
- package/dist/server/app.js +24 -0
- package/package.json +1 -1
|
@@ -28,8 +28,21 @@ export type DomainExtractor = (html: string, url: string) => Promise<DomainExtra
|
|
|
28
28
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
29
29
|
*/
|
|
30
30
|
export declare function getDomainExtractor(url: string): DomainExtractor | null;
|
|
31
|
+
/** Clear the extractor response cache (used in tests). */
|
|
32
|
+
export declare function clearExtractorCache(): void;
|
|
33
|
+
/**
|
|
34
|
+
* Inject a Redis client for shared cross-pod caching.
|
|
35
|
+
* Called from server startup after Redis is initialized.
|
|
36
|
+
* Safe to call with null to disable Redis caching (e.g., CLI mode).
|
|
37
|
+
*/
|
|
38
|
+
export declare function setExtractorRedis(redis: any): void;
|
|
31
39
|
/**
|
|
32
40
|
* Convenience: run the extractor for the URL (if one exists).
|
|
33
|
-
*
|
|
41
|
+
* Wraps _extractDomainDataImpl with a two-tier cache:
|
|
42
|
+
* 1. In-memory LRU (per-pod, fastest)
|
|
43
|
+
* 2. Redis shared cache (cross-pod, shared across all replicas)
|
|
44
|
+
*
|
|
45
|
+
* With multiple API pods, Redis ensures the first pod to fetch a URL
|
|
46
|
+
* populates cache for all others — eliminating redundant API calls.
|
|
34
47
|
*/
|
|
35
48
|
export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
|
|
@@ -112,6 +112,11 @@ const REGISTRY = [
|
|
|
112
112
|
{ match: (h) => h === 'open-meteo.com' || h === 'api.open-meteo.com' || h === 'www.open-meteo.com', extractor: weatherExtractor },
|
|
113
113
|
{ match: (h) => h === 'weather.com' || h === 'www.weather.com', extractor: weatherExtractor },
|
|
114
114
|
{ match: (h) => h === 'accuweather.com' || h === 'www.accuweather.com', extractor: weatherExtractor },
|
|
115
|
+
// ── Marketplaces & Shopping ───────────────────────────────────────────────
|
|
116
|
+
{ match: (h) => h === 'facebook.com' || h === 'www.facebook.com', extractor: facebookMarketplaceExtractor },
|
|
117
|
+
{ match: (h) => h === 'etsy.com' || h === 'www.etsy.com', extractor: etsyExtractor },
|
|
118
|
+
{ match: (h) => h === 'cars.com' || h === 'www.cars.com', extractor: carsComExtractor },
|
|
119
|
+
{ match: (h) => h === 'ebay.com' || h === 'www.ebay.com', extractor: ebayExtractor },
|
|
115
120
|
];
|
|
116
121
|
/**
|
|
117
122
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
@@ -131,11 +136,84 @@ export function getDomainExtractor(url) {
|
|
|
131
136
|
}
|
|
132
137
|
return null;
|
|
133
138
|
}
|
|
139
|
+
// ── Extractor Response Cache ──────────────────────────────────────────────
// Caches successful extractor results for 5 minutes to survive rate limits.
// If the upstream API rate-limits on the next request, we serve from cache
// instead of falling back to garbage browser rendering (cookie walls, "Loading…").
//
// Key: normalized URL — scheme and host are lower-cased by the URL parser, the
// path and query string are kept verbatim, and the fragment is dropped.
// The query string MUST be part of the key: search-style extractors derive
// their whole result from it (`?_nkw=`, `?keyword=`, `?q=`, …), so stripping it
// would make every search on a site collide on a single cache entry.
// Value: { result, ts }
const EXTRACTOR_CACHE = new Map();
/** Clear the extractor response cache (used in tests). */
export function clearExtractorCache() { EXTRACTOR_CACHE.clear(); }
const CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
const CACHE_MAX_ENTRIES = 500;
/**
 * Build the in-memory cache key for a URL.
 *
 * @param url Absolute URL string. Unparseable input falls back to the raw
 *            string with only the fragment removed.
 * @returns A key that is identical for URLs differing only in scheme/host case
 *          or fragment, and distinct for different paths or query strings.
 */
function extractorCacheKey(url) {
    try {
        const parsed = new URL(url);
        return `${parsed.protocol}//${parsed.host}${parsed.pathname}${parsed.search}`;
    }
    catch {
        return String(url).replace(/#.*$/, '');
    }
}
/**
 * Look up a fresh (younger than CACHE_TTL_MS) cached result.
 *
 * @param url URL the result was stored under.
 * @returns The cached result, or null when absent or expired. Expired entries
 *          are evicted as a side effect.
 */
function getCachedExtractorResult(url) {
    const key = extractorCacheKey(url);
    const entry = EXTRACTOR_CACHE.get(key);
    if (!entry)
        return null;
    if (Date.now() - entry.ts >= CACHE_TTL_MS) {
        EXTRACTOR_CACHE.delete(key); // expired — evict
        return null;
    }
    // Re-insert on hit so Map iteration order tracks recency: the first key is
    // always the least recently used one, which is what eviction removes.
    EXTRACTOR_CACHE.delete(key);
    EXTRACTOR_CACHE.set(key, entry);
    return entry.result;
}
/**
 * Store a result under the URL's cache key and keep the cache bounded.
 *
 * @param url    URL the result belongs to.
 * @param result Extractor result to cache.
 */
function setCachedExtractorResult(url, result) {
    const key = extractorCacheKey(url);
    // Delete first so an overwritten key moves to the "newest" end of the Map.
    EXTRACTOR_CACHE.delete(key);
    EXTRACTOR_CACHE.set(key, { result, ts: Date.now() });
    // Keep cache size bounded (evict least recently used first).
    while (EXTRACTOR_CACHE.size > CACHE_MAX_ENTRIES) {
        const oldest = EXTRACTOR_CACHE.keys().next();
        if (oldest.done)
            break;
        EXTRACTOR_CACHE.delete(oldest.value);
    }
}
|
|
167
|
+
// ── Redis Shared Cache (cross-pod cache for multi-replica deployments) ────────
|
|
168
|
+
// When running multiple API pods, each pod has its own in-memory cache.
|
|
169
|
+
// With 6 pods, 5/6 requests for the same URL miss cache.
|
|
170
|
+
// Redis solves this: all pods share one cache, so the first pod to fetch
|
|
171
|
+
// populates it for all others.
|
|
172
|
+
//
|
|
173
|
+
// Redis is injected from the server startup to keep this core module
|
|
174
|
+
// dependency-free (works in CLI mode without Redis too).
|
|
175
|
+
let _redisClient = null;
|
|
176
|
+
const REDIS_CACHE_PREFIX = 'wp:ext:';
|
|
177
|
+
const REDIS_CACHE_TTL_SECS = 300; // 5 minutes
|
|
134
178
|
/**
|
|
135
|
-
*
|
|
179
|
+
* Inject a Redis client for shared cross-pod caching.
|
|
180
|
+
* Called from server startup after Redis is initialized.
|
|
181
|
+
* Safe to call with null to disable Redis caching (e.g., CLI mode).
|
|
182
|
+
*/
|
|
183
|
+
export function setExtractorRedis(redis) {
|
|
184
|
+
_redisClient = redis;
|
|
185
|
+
}
|
|
186
|
+
async function getRedisCache(url) {
|
|
187
|
+
try {
|
|
188
|
+
if (!_redisClient)
|
|
189
|
+
return null;
|
|
190
|
+
const key = REDIS_CACHE_PREFIX + url.replace(/[?#].*$/, '').toLowerCase();
|
|
191
|
+
const cached = await _redisClient.get(key);
|
|
192
|
+
if (!cached)
|
|
193
|
+
return null;
|
|
194
|
+
return JSON.parse(cached);
|
|
195
|
+
}
|
|
196
|
+
catch {
|
|
197
|
+
return null; // Redis unavailable — fall back to in-memory cache
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
async function setRedisCache(url, result) {
|
|
201
|
+
try {
|
|
202
|
+
if (!_redisClient)
|
|
203
|
+
return;
|
|
204
|
+
const key = REDIS_CACHE_PREFIX + url.replace(/[?#].*$/, '').toLowerCase();
|
|
205
|
+
await _redisClient.set(key, JSON.stringify(result), 'EX', REDIS_CACHE_TTL_SECS);
|
|
206
|
+
}
|
|
207
|
+
catch {
|
|
208
|
+
// Redis unavailable — in-memory cache still works, this is non-fatal
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
212
|
+
/**
|
|
213
|
+
* Internal implementation: run the extractor for the URL (if one exists).
|
|
136
214
|
* Returns null when no extractor matches or extraction fails.
|
|
137
215
|
*/
|
|
138
|
-
|
|
216
|
+
async function _extractDomainDataImpl(html, url) {
|
|
139
217
|
const extractor = getDomainExtractor(url);
|
|
140
218
|
if (!extractor)
|
|
141
219
|
return null;
|
|
@@ -146,6 +224,43 @@ export async function extractDomainData(html, url) {
|
|
|
146
224
|
return null;
|
|
147
225
|
}
|
|
148
226
|
}
|
|
227
|
+
/**
 * Convenience: run the extractor for the URL (if one exists).
 * Wraps _extractDomainDataImpl with a two-tier cache:
 *   1. In-memory cache (per-pod, fastest)
 *   2. Redis shared cache (cross-pod, shared across all replicas)
 *
 * With multiple API pods, Redis ensures the first pod to fetch a URL
 * populates cache for all others — eliminating redundant API calls.
 *
 * @param html Page HTML handed to the extractor.
 * @param url  URL used both to pick the extractor and as the cache key.
 * @returns A cached or freshly extracted result. When extraction yields
 *          nothing cacheable and no cache entry exists, whatever
 *          _extractDomainDataImpl returned is passed through unchanged.
 */
export async function extractDomainData(html, url) {
    // 1. Check in-memory cache (fastest — no network)
    const memoryHit = getCachedExtractorResult(url);
    if (memoryHit)
        return memoryHit;
    // 2. Check Redis cache (shared across all pods)
    const sharedHit = await getRedisCache(url);
    if (sharedHit) {
        // Populate local in-memory cache to avoid Redis round-trips on repeat
        setCachedExtractorResult(url, sharedHit);
        return sharedHit;
    }
    // 3. Try the real extractor
    const result = await _extractDomainDataImpl(html, url);
    // Only results with meaningful text content are worth caching. The type
    // check keeps a result without a string cleanContent from throwing here.
    const cacheable = Boolean(result)
        && typeof result.cleanContent === 'string'
        && result.cleanContent.length > 20;
    if (cacheable) {
        // 4. Cache the successful result in both layers
        setCachedExtractorResult(url, result);
        void setRedisCache(url, result); // fire-and-forget, non-blocking
        return result;
    }
    // 5. Extractor failed or returned too little — re-check the in-memory cache.
    //    Step 1 already missed, so this only hits when another call for the
    //    same URL stored a result while this one was awaiting above.
    const lateHit = getCachedExtractorResult(url);
    if (lateHit)
        return lateHit;
    // 6. Nothing cached — hand back the extractor's own answer (null or a short
    //    result) and let the caller decide how to fall back.
    return result;
}
|
|
149
264
|
// ---------------------------------------------------------------------------
|
|
150
265
|
// Helpers
|
|
151
266
|
// ---------------------------------------------------------------------------
|
|
@@ -184,6 +299,13 @@ async function fetchJson(url, customHeaders) {
|
|
|
184
299
|
redirect: 'follow',
|
|
185
300
|
});
|
|
186
301
|
clearTimeout(timer);
|
|
302
|
+
// Surface 429 as a thrown error so callers can detect rate-limiting
|
|
303
|
+
// and the cache wrapper can serve stale results instead of garbage.
|
|
304
|
+
if (resp.status === 429) {
|
|
305
|
+
const err = new Error(`429 Too Many Requests: ${url}`);
|
|
306
|
+
err.statusCode = 429;
|
|
307
|
+
throw err;
|
|
308
|
+
}
|
|
187
309
|
const text = await resp.text();
|
|
188
310
|
const parsed = tryParseJson(text);
|
|
189
311
|
if (parsed === null && text.length > 0) {
|
|
@@ -4784,3 +4906,281 @@ async function weatherExtractor(_html, url) {
|
|
|
4784
4906
|
return null;
|
|
4785
4907
|
}
|
|
4786
4908
|
}
|
|
4909
|
+
// ---------------------------------------------------------------------------
|
|
4910
|
+
// Facebook Marketplace extractor (login-wall fallback)
|
|
4911
|
+
// ---------------------------------------------------------------------------
|
|
4912
|
+
/**
 * Facebook Marketplace fallback: listings sit behind a login wall, so instead
 * of scraping this returns a "blocked" result that echoes the search
 * parameters and suggests equivalent searches on sites that can be fetched.
 *
 * @param _html Unused — the page HTML is a login wall.
 * @param url   Marketplace URL; `query`, `minPrice` and `maxPrice` are read
 *              from its query string and the location from its path.
 * @returns A result of type 'blocked', or null when the path is not a
 *          /marketplace URL.
 * @throws TypeError when `url` is not a parseable absolute URL.
 */
async function facebookMarketplaceExtractor(_html, url) {
    const u = new URL(url);
    if (!u.pathname.includes('/marketplace'))
        return null;
    const query = u.searchParams.get('query') || '';
    const maxPrice = u.searchParams.get('maxPrice') || '';
    const minPrice = u.searchParams.get('minPrice') || '';
    // Extract location segment: /marketplace/nyc/search → "nyc"
    const locationMatch = u.pathname.match(/\/marketplace\/([^/]+)(?:\/|$)/);
    const location = (locationMatch?.[1] && locationMatch[1] !== 'search' && locationMatch[1] !== 'category') ? locationMatch[1] : '';
    const priceRange = [minPrice && `$${minPrice}`, maxPrice && `$${maxPrice}`].filter(Boolean).join(' – ');
    const lines = [
        `# 🛒 Facebook Marketplace`,
        '',
        `**Search:** ${query || 'Browse all'}`,
        ...(location ? [`**Location:** ${location}`] : []),
        ...(priceRange ? [`**Price range:** ${priceRange}`] : []),
        '',
        '> ⚠️ Facebook Marketplace requires authentication. WebPeel cannot access listings directly.',
        '',
        '**Alternative searches that work:**',
    ];
    if (query) {
        // Both values come straight from the request URL. They end up inside
        // quoted `webpeel "…"` command suggestions, so they must be
        // percent-encoded: a raw `"`, `&` or backtick would otherwise break out
        // of the quoted URL or inject extra parameters.
        const q = encodeURIComponent(query);
        const max = encodeURIComponent(maxPrice);
        const clUrl = `https://newyork.craigslist.org/search/sss?query=${q}${maxPrice ? '&max_price=' + max : ''}`;
        const carsUrl = `https://www.cars.com/shopping/results/?keyword=${q}&list_price_max=${max}&zip=10001&stock_type=used`;
        const ebayUrl = `https://www.ebay.com/sch/i.html?_nkw=${q}${maxPrice ? '&_udhi=' + max : ''}&LH_BIN=1`;
        lines.push(`- \`webpeel "${clUrl}"\` — Craigslist`, `- \`webpeel "${carsUrl}"\` — Cars.com`, `- \`webpeel "${ebayUrl}"\` — eBay`);
    }
    lines.push('', '*Tip: Craigslist and Cars.com return full structured results with WebPeel.*');
    return {
        domain: 'facebook.com',
        type: 'blocked',
        structured: {
            query,
            location,
            minPrice,
            maxPrice,
            reason: 'authentication required',
            alternatives: ['craigslist', 'cars.com', 'ebay'],
        },
        cleanContent: lines.join('\n'),
    };
}
|
|
4955
|
+
// ---------------------------------------------------------------------------
|
|
4956
|
+
// Etsy extractor (bot-block fallback with Google site-search suggestion)
|
|
4957
|
+
// ---------------------------------------------------------------------------
|
|
4958
|
+
/**
 * Percent-decode a path segment without throwing.
 * decodeURIComponent raises URIError on malformed escapes (e.g. a literal
 * "%" followed by a non-hex character); in that case the text is returned
 * undecoded.
 */
function decodeEtsyPathSegment(text) {
    try {
        return decodeURIComponent(text);
    }
    catch {
        return text;
    }
}
/**
 * Etsy fallback: Etsy blocks automated access, so this returns a "blocked"
 * result with alternative ways to reach the same search or shop.
 *
 * @param _html Unused — the page HTML is a bot-block page.
 * @param url   Etsy URL. The search query is read from `?q=` or from a
 *              `/search/<slug>` path; a shop name from `/shop/<name>`.
 * @returns A result of type 'blocked', or null when the URL carries neither
 *          a search query nor a shop name.
 * @throws TypeError when `url` is not a parseable absolute URL.
 */
async function etsyExtractor(_html, url) {
    const u = new URL(url);
    // Extract search query from various URL patterns
    // /search?q=handmade+jewelry  OR  /search/handmade-jewelry
    let query = u.searchParams.get('q') || '';
    if (!query) {
        const pathMatch = u.pathname.match(/\/search\/([^?#]+)/);
        if (pathMatch) {
            // Hyphens are word separators in the slug. Turning "%-" into "% "
            // can leave an invalid escape, hence the non-throwing decode.
            query = decodeEtsyPathSegment(pathMatch[1].replace(/-/g, ' '));
        }
    }
    // Shop page: /shop/ShopName
    const shopMatch = u.pathname.match(/^\/shop\/([^/?#]+)/);
    const shopName = shopMatch?.[1] || '';
    if (!query && !shopName)
        return null;
    const googleUrl = query
        ? `https://www.google.com/search?q=site:etsy.com+${encodeURIComponent(query)}`
        : `https://www.google.com/search?q=site:etsy.com+${encodeURIComponent(shopName)}`;
    const etsySearchUrl = query ? `https://www.etsy.com/search?q=${encodeURIComponent(query)}` : url;
    const displayTitle = query ? `"${query}"` : `Shop: ${shopName}`;
    const cleanContent = [
        `# 🎨 Etsy — ${displayTitle}`,
        '',
        '> ⚠️ Etsy blocks automated access. WebPeel cannot scrape listings directly.',
        '',
        '**Alternatives that work:**',
        `- \`webpeel "${googleUrl}"\` — Google site:etsy.com results`,
        `- Direct link: [etsy.com/search?q=${encodeURIComponent(query || shopName)}](${etsySearchUrl})`,
        '',
        ...(query ? [
            '**Similar items on open marketplaces:**',
            `- \`webpeel "https://www.ebay.com/sch/i.html?_nkw=${encodeURIComponent(query)}&LH_BIN=1"\` — eBay`,
            `- \`webpeel "https://newyork.craigslist.org/search/sss?query=${encodeURIComponent(query)}"\` — Craigslist`,
        ] : []),
        '',
        '*Etsy Open API v3 (free key at etsy.com/developers) can unlock direct access.*',
    ].join('\n');
    return {
        domain: 'etsy.com',
        type: 'blocked',
        structured: {
            query,
            shopName,
            reason: 'bot-block',
            googleFallback: googleUrl,
        },
        cleanContent,
    };
}
|
|
5007
|
+
// ---------------------------------------------------------------------------
|
|
5008
|
+
// Cars.com extractor — structured parsing via data-vehicle-details JSON attrs
|
|
5009
|
+
// ---------------------------------------------------------------------------
|
|
5010
|
+
/**
 * Describe the price filter of a Cars.com search.
 * Both bounds → "$min – $max"; only max → "up to $max"; only min →
 * "from $min"; neither → "".
 */
function formatCarsPriceFilter(minPrice, maxPrice) {
    if (minPrice && maxPrice)
        return `$${minPrice} – $${maxPrice}`;
    if (maxPrice)
        return `up to $${maxPrice}`;
    if (minPrice)
        return `from $${minPrice}`;
    return '';
}
/**
 * Build a listing title from a parsed data-vehicle-details object.
 * Absent fields are skipped instead of being rendered as "undefined".
 * Returns "" when none of year/make/model/trim is present, so the caller can
 * drop the card.
 */
function buildCarsListingTitle(v) {
    const parts = [v.year, v.make, v.model, v.trim]
        .filter((part) => part !== undefined && part !== null && part !== '');
    if (parts.length === 0)
        return '';
    return [v.stockType || 'Used', ...parts].join(' ');
}
/** Resolve a card link (site-relative or already absolute) against cars.com. */
function absoluteCarsUrl(link) {
    try {
        return new URL(link, 'https://www.cars.com').href;
    }
    catch {
        return `https://www.cars.com${link}`;
    }
}
/**
 * Render the markdown for a search-results page: heading, blank line, filter
 * summary, blank line, then up to 20 numbered rows.
 * The blank separators are added explicitly; only the optional summary lines
 * are filtered, so the summary and the first row never run together.
 */
function renderCarsSearchMarkdown(filters, listings) {
    const { keyword, zip, minPrice, maxPrice, stockType } = filters;
    const priceLabel = formatCarsPriceFilter(minPrice, maxPrice);
    const summary = [
        keyword && `**Search:** ${keyword}`,
        zip && `**Location:** ZIP ${zip}`,
        priceLabel && `**Price:** ${priceLabel}`,
        stockType && `**Stock:** ${stockType}`,
        `**Results:** ${listings.length} listings`,
    ].filter(Boolean);
    const rows = listings.slice(0, 20).map((l, i) => [
        `${i + 1}. **${l.title}**`,
        l.price,
        l.mileage,
        l.bodyStyle,
        l.url && `[→](${absoluteCarsUrl(l.url)})`,
    ].filter(Boolean).join(' · '));
    return [`# 🚗 Cars.com — ${keyword || 'Vehicle Search'}`, '', ...summary, '', ...rows].join('\n');
}
/**
 * Cars.com extractor.
 *
 * Vehicle detail pages (/vehicledetail/…) yield a 'listing' result built from
 * the first h1 (or the <title>), and the first elements whose class contains
 * "price" / "mileage". Any other page is treated as search results and parsed
 * from the JSON in `fuse-card[data-vehicle-details]` attributes.
 *
 * @param html Page HTML.
 * @param url  Page URL; search filters are read from its query string.
 * @returns A 'listing' or 'search' result, or null when nothing could be
 *          extracted or any error occurred (logged only when DEBUG is set).
 */
async function carsComExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const u = new URL(url);
        const keyword = u.searchParams.get('keyword') || '';
        const maxPrice = u.searchParams.get('list_price_max') || '';
        const minPrice = u.searchParams.get('list_price_min') || '';
        const zip = u.searchParams.get('zip') || '';
        const stockType = u.searchParams.get('stock_type') || '';
        // Individual vehicle detail page
        if (u.pathname.includes('/vehicledetail/')) {
            const title = $('h1').first().text().trim() ||
                $('title').text().trim().split(' | ')[0];
            if (!title)
                return null;
            const price = $('[class*="price"]').first().text().trim();
            const mileage = $('[class*="mileage"]').first().text().trim();
            return {
                domain: 'cars.com',
                type: 'listing',
                structured: { title, price, mileage, url },
                cleanContent: [
                    `# 🚗 ${title}`,
                    price && `**Price:** ${price}`,
                    mileage && `**Mileage:** ${mileage}`,
                    `\n[View listing](${url})`,
                ].filter(Boolean).join('\n'),
            };
        }
        // Search results page — Cars.com embeds JSON in fuse-card data-vehicle-details
        const listings = [];
        $('fuse-card[data-vehicle-details]').each((_, el) => {
            try {
                const raw = $(el).attr('data-vehicle-details');
                if (!raw)
                    return;
                const v = JSON.parse(raw);
                const listingId = v.listingId || $(el).attr('data-listing-id') || '';
                const cardLink = $(el).find('card-gallery').attr('card-link') || (listingId ? `/vehicledetail/${listingId}/` : '');
                const title = buildCarsListingTitle(v);
                const price = v.price ? `$${Number(v.price).toLocaleString()}` : '';
                const mileage = v.mileage ? `${Number(v.mileage).toLocaleString()} mi` : '';
                const bodyStyle = v.bodyStyle || '';
                const fuelType = v.fuelType || '';
                if (title) {
                    listings.push({ title, price, mileage, bodyStyle, fuelType, url: cardLink });
                }
            }
            catch { /* skip malformed */ }
        });
        if (listings.length === 0)
            return null; // Let pipeline handle it
        return {
            domain: 'cars.com',
            type: 'search',
            structured: { keyword, zip, minPrice, maxPrice, stockType, count: listings.length, listings },
            cleanContent: renderCarsSearchMarkdown({ keyword, zip, minPrice, maxPrice, stockType }, listings),
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Cars.com extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
5097
|
+
// ---------------------------------------------------------------------------
|
|
5098
|
+
// eBay extractor — clean up noisy search results
|
|
5099
|
+
// ---------------------------------------------------------------------------
|
|
5100
|
+
/**
 * Describe the price filter of an eBay search.
 * Both bounds → "$min – $max"; only max → "up to $max"; only min →
 * "from $min"; neither → "".
 */
function formatEbayPriceFilter(minPrice, maxPrice) {
    if (minPrice && maxPrice)
        return `$${minPrice} – $${maxPrice}`;
    if (maxPrice)
        return `up to $${maxPrice}`;
    if (minPrice)
        return `from $${minPrice}`;
    return '';
}
/**
 * Render the markdown for a search-results page: heading, blank line, filter
 * summary, blank line, then up to 20 numbered rows.
 * The blank separators are added explicitly; only the optional summary lines
 * are filtered, so the summary and the first row never run together.
 */
function renderEbaySearchMarkdown(filters, listings) {
    const { keyword, minPrice, maxPrice } = filters;
    const priceLabel = formatEbayPriceFilter(minPrice, maxPrice);
    const summary = [
        keyword && `**Search:** ${keyword}`,
        priceLabel && `**Price:** ${priceLabel}`,
        `**Results:** ${listings.length} listings`,
    ].filter(Boolean);
    const rows = listings.slice(0, 20).map((l, i) => [
        `${i + 1}. **${l.title}**`,
        l.price,
        l.condition && `[${l.condition}]`,
        l.url && `[→](${l.url})`,
    ].filter(Boolean).join(' · '));
    return [`# 🛍 eBay — ${keyword || 'Search Results'}`, '', ...summary, '', ...rows].join('\n');
}
/**
 * eBay extractor — turns noisy item and search pages into compact results.
 *
 * Item pages (/itm/…) yield a 'listing' result from the first h1 and the
 * first elements whose class contains "price" (excluding shipping) and
 * "condition". Any other page is treated as search results and parsed from
 * `li[data-listingid]` cards.
 *
 * @param html Page HTML.
 * @param url  Page URL; `_nkw`, `_udlo` and `_udhi` are read from its query.
 * @returns A 'listing' or 'search' result, or null when nothing could be
 *          extracted or any error occurred (logged only when DEBUG is set).
 */
async function ebayExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const u = new URL(url);
        // Individual item page
        if (u.pathname.startsWith('/itm/')) {
            const title = $('h1').first().text().trim();
            if (!title)
                return null;
            const price = $('[class*="price"]').not('[class*="shipping"]').first().text().trim();
            const condition = $('[class*="condition"]').first().text().trim();
            return {
                domain: 'ebay.com',
                type: 'listing',
                structured: { title, price, condition, url },
                cleanContent: [
                    `# 🛍 ${title}`,
                    price && `**Price:** ${price}`,
                    condition && `**Condition:** ${condition}`,
                    `\n[View on eBay](${url})`,
                ].filter(Boolean).join('\n'),
            };
        }
        // Search results page
        const keyword = u.searchParams.get('_nkw') || '';
        const maxPrice = u.searchParams.get('_udhi') || '';
        const minPrice = u.searchParams.get('_udlo') || '';
        const conditionKeywords = ['Pre-Owned', 'Brand New', 'Open Box', 'Refurbished', 'For Parts'];
        const listings = [];
        // eBay search results use li[data-listingid] + .s-card__title / .s-card__price
        $('li[data-listingid]').each((_, el) => {
            const title = $(el).find('.s-card__title').text().trim()
                .replace(/Opens in a new window or tab/g, '')
                .replace(/^New Listing\s*/i, '')
                .trim();
            // "Shop on eBay" is a placeholder card, not a real listing.
            if (!title || title === 'Shop on eBay')
                return;
            const price = $(el).find('.s-card__price').first().text().trim();
            // .s-card__subtitle contains "DealerNameCondition" as merged text — extract condition keyword
            const subtitleText = $(el).find('.s-card__subtitle').text().trim();
            const condition = conditionKeywords.find((k) => subtitleText.includes(k)) || '';
            // Get clean URL — extract /itm/<id> and strip tracking params
            let href = '';
            const itemLink = $(el).find('a[href*="/itm/"]').first().attr('href') || '';
            const itmMatch = itemLink.match(/(https?:\/\/[^/]*\/itm\/\d+)/);
            if (itmMatch)
                href = itmMatch[1];
            const listingId = $(el).attr('data-listingid') || '';
            if (!href && listingId)
                href = `https://www.ebay.com/itm/${listingId}`;
            listings.push({ title, price, condition, url: href });
        });
        if (listings.length === 0)
            return null; // Let pipeline handle it
        return {
            domain: 'ebay.com',
            type: 'search',
            structured: { keyword, minPrice, maxPrice, count: listings.length, listings },
            cleanContent: renderEbaySearchMarkdown({ keyword, minPrice, maxPrice }, listings),
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'eBay extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
package/dist/server/app.js
CHANGED
|
@@ -54,6 +54,7 @@ import { createSentryHooks } from './sentry.js';
|
|
|
54
54
|
import { requireScope } from './middleware/scope-guard.js';
|
|
55
55
|
import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
|
|
56
56
|
import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
|
|
57
|
+
import { setExtractorRedis } from '../core/domain-extractors.js';
|
|
57
58
|
import { registerPremiumHooks } from './premium/index.js';
|
|
58
59
|
import { readFileSync } from 'fs';
|
|
59
60
|
import { join, dirname } from 'path';
|
|
@@ -421,6 +422,29 @@ export function startServer(config = {}) {
|
|
|
421
422
|
const port = config.port || parseInt(process.env.PORT || '3000', 10);
|
|
422
423
|
// Activate premium strategy hooks (SWR cache, domain intelligence, race).
|
|
423
424
|
registerPremiumHooks();
|
|
425
|
+
// Inject Redis into the domain extractor cache for cross-pod cache sharing.
// When REDIS_URL is set (multi-pod k8s deployments), all pods share one cache
// so the first pod to fetch a URL populates it for all others.
if (process.env.REDIS_URL) {
    const redisUrl = process.env.REDIS_URL;
    // @ts-ignore — ioredis CJS/ESM interop
    import('ioredis').then((IoRedisModule) => {
        const IoRedis = IoRedisModule.default ?? IoRedisModule;
        // Pass the connection URL straight to ioredis instead of copying
        // host/port/db by hand: ioredis parses the URL itself, which also
        // carries over the username/password and enables TLS for rediss://.
        // Copying only host/port/db silently dropped those, and every cache
        // call then failed (and was swallowed) on authenticated instances.
        const redis = new IoRedis(redisUrl, {
            lazyConnect: true,
            maxRetriesPerRequest: 3,
            enableOfflineQueue: false,
        });
        setExtractorRedis(redis);
        log.info('Redis extractor cache initialized (shared cross-pod cache active)');
    }).catch((err) => {
        // Covers a missing ioredis module as well as an unparseable REDIS_URL;
        // either way the extractor cache stays in-memory only.
        log.warn('Failed to init Redis extractor cache (in-memory only)', { error: err instanceof Error ? err.message : String(err) });
    });
}
|
|
424
448
|
// Pre-warm browser resources in the background to reduce first-request latency.
|
|
425
449
|
void warmup().catch((error) => {
|
|
426
450
|
log.warn('Browser warmup failed', { error: error instanceof Error ? error.message : String(error) });
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.66",
|
|
3
|
+
"version": "0.21.68",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|