webpeel 0.21.71 → 0.21.73
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/fetch.js +14 -6
- package/dist/cli/commands/search.js +96 -0
- package/dist/cli/utils.js +31 -1
- package/dist/cli.js +14 -0
- package/dist/core/domain-extractors.js +218 -116
- package/dist/core/search-fallback.d.ts +1 -0
- package/dist/core/search-fallback.js +43 -18
- package/package.json +2 -2
|
@@ -682,13 +682,21 @@ export async function runFetch(url, options) {
|
|
|
682
682
|
result = await fetchViaApi(url, peelOptions, fetchApiKey, fetchApiUrl);
|
|
683
683
|
}
|
|
684
684
|
else {
|
|
685
|
-
// No API key —
|
|
685
|
+
// No API key — fall back to local peel() mode (runs locally, no API needed)
|
|
686
686
|
if (spinner)
|
|
687
|
-
spinner.
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
await
|
|
691
|
-
|
|
687
|
+
spinner.text = 'Fetching locally (no API key)…';
|
|
688
|
+
const startLocal = Date.now();
|
|
689
|
+
const { peel } = await import('../../index.js');
|
|
690
|
+
const localResult = await peel(url, peelOptions);
|
|
691
|
+
const elapsed = Date.now() - startLocal;
|
|
692
|
+
// Normalize to the shape fetchViaApi returns
|
|
693
|
+
result = {
|
|
694
|
+
...localResult,
|
|
695
|
+
elapsed: localResult.elapsed ?? elapsed,
|
|
696
|
+
method: localResult.method ?? 'local',
|
|
697
|
+
tokens: localResult.tokens ?? Math.ceil((localResult.content?.length ?? 0) / 4),
|
|
698
|
+
cached: false,
|
|
699
|
+
};
|
|
692
700
|
}
|
|
693
701
|
// Update lastUsed timestamp for named profiles
|
|
694
702
|
if (resolvedProfileName) {
|
|
@@ -588,4 +588,100 @@ export function registerSearchCommands(program) {
|
|
|
588
588
|
process.exit(1);
|
|
589
589
|
}
|
|
590
590
|
});
|
|
591
|
+
// ── extractors command ────────────────────────────────────────────────────
|
|
592
|
+
program
|
|
593
|
+
.command('extractors')
|
|
594
|
+
.alias('list-extractors')
|
|
595
|
+
.description('List all supported domain extractors')
|
|
596
|
+
.option('--json', 'Output as JSON')
|
|
597
|
+
.action((options) => {
|
|
598
|
+
const extractors = [
|
|
599
|
+
// Social
|
|
600
|
+
{ domain: 'twitter.com / x.com', category: 'Social', description: 'Tweets, threads, profiles' },
|
|
601
|
+
{ domain: 'reddit.com', category: 'Social', description: 'Subreddits, posts, comments' },
|
|
602
|
+
{ domain: 'instagram.com', category: 'Social', description: 'Photos, reels, profiles' },
|
|
603
|
+
{ domain: 'tiktok.com', category: 'Social', description: 'Video metadata, captions' },
|
|
604
|
+
{ domain: 'pinterest.com', category: 'Social', description: 'Pins, boards' },
|
|
605
|
+
{ domain: 'linkedin.com', category: 'Social', description: 'Profiles, job listings' },
|
|
606
|
+
{ domain: 'facebook.com', category: 'Social', description: 'Marketplace listings' },
|
|
607
|
+
// Video / Audio
|
|
608
|
+
{ domain: 'youtube.com', category: 'Video', description: 'Transcripts, metadata, comments' },
|
|
609
|
+
{ domain: 'twitch.tv', category: 'Video', description: 'Streams, clips, channel info' },
|
|
610
|
+
{ domain: 'soundcloud.com', category: 'Audio', description: 'Tracks, playlists' },
|
|
611
|
+
{ domain: 'open.spotify.com', category: 'Audio', description: 'Tracks, albums, playlists' },
|
|
612
|
+
// Tech / Dev
|
|
613
|
+
{ domain: 'github.com', category: 'Dev', description: 'Repos, issues, PRs, code' },
|
|
614
|
+
{ domain: 'stackoverflow.com', category: 'Dev', description: 'Questions, answers' },
|
|
615
|
+
{ domain: 'npmjs.com', category: 'Dev', description: 'Package metadata, readme' },
|
|
616
|
+
{ domain: 'pypi.org', category: 'Dev', description: 'Package metadata, readme' },
|
|
617
|
+
{ domain: 'dev.to', category: 'Dev', description: 'Articles, comments' },
|
|
618
|
+
// News / Articles
|
|
619
|
+
{ domain: 'news.ycombinator.com', category: 'News', description: 'HN posts, comments, Ask/Show HN' },
|
|
620
|
+
{ domain: 'medium.com', category: 'Articles', description: 'Articles, publications' },
|
|
621
|
+
{ domain: 'substack.com / *.substack.com', category: 'Articles', description: 'Newsletters, posts' },
|
|
622
|
+
{ domain: 'nytimes.com', category: 'News', description: 'Articles, headlines' },
|
|
623
|
+
{ domain: 'bbc.com', category: 'News', description: 'Articles, headlines' },
|
|
624
|
+
{ domain: 'cnn.com', category: 'News', description: 'Articles, headlines' },
|
|
625
|
+
// Shopping / E-commerce
|
|
626
|
+
{ domain: 'amazon.com', category: 'Shopping', description: 'Products, prices, reviews' },
|
|
627
|
+
{ domain: 'bestbuy.com', category: 'Shopping', description: 'Products, prices, specs' },
|
|
628
|
+
{ domain: 'walmart.com', category: 'Shopping', description: 'Products, prices' },
|
|
629
|
+
{ domain: 'ebay.com', category: 'Shopping', description: 'Listings, prices' },
|
|
630
|
+
{ domain: 'etsy.com', category: 'Shopping', description: 'Handmade listings' },
|
|
631
|
+
// Local / Real Estate
|
|
632
|
+
{ domain: 'yelp.com', category: 'Local', description: 'Business info, reviews (needs YELP_API_KEY)' },
|
|
633
|
+
{ domain: 'craigslist.org', category: 'Local', description: 'Listings, classifieds' },
|
|
634
|
+
{ domain: 'zillow.com', category: 'Real Estate', description: 'Property listings, estimates' },
|
|
635
|
+
{ domain: 'redfin.com', category: 'Real Estate', description: 'Property listings, prices' },
|
|
636
|
+
{ domain: 'cars.com', category: 'Automotive', description: 'Car listings, prices' },
|
|
637
|
+
// Knowledge / Academic
|
|
638
|
+
{ domain: 'en.wikipedia.org', category: 'Knowledge', description: 'Articles, structured data' },
|
|
639
|
+
{ domain: 'arxiv.org', category: 'Academic', description: 'Papers, abstracts, metadata' },
|
|
640
|
+
{ domain: 'semanticscholar.org', category: 'Academic', description: 'Papers, citations' },
|
|
641
|
+
{ domain: 'pubmed.ncbi.nlm.nih.gov', category: 'Academic', description: 'Medical papers, abstracts' },
|
|
642
|
+
{ domain: 'imdb.com', category: 'Knowledge', description: 'Movies, TV shows, cast' },
|
|
643
|
+
{ domain: 'allrecipes.com', category: 'Knowledge', description: 'Recipes, ingredients, steps' },
|
|
644
|
+
// Finance / Markets
|
|
645
|
+
{ domain: 'polymarket.com', category: 'Finance', description: 'Prediction markets' },
|
|
646
|
+
{ domain: 'kalshi.com', category: 'Finance', description: 'Prediction markets' },
|
|
647
|
+
{ domain: 'tradingview.com', category: 'Finance', description: 'Charts, indicators, ideas' },
|
|
648
|
+
{ domain: 'coingecko.com', category: 'Finance', description: 'Crypto prices, market data' },
|
|
649
|
+
{ domain: 'coinmarketcap.com', category: 'Finance', description: 'Crypto prices, market data' },
|
|
650
|
+
// Sports / Betting
|
|
651
|
+
{ domain: 'espn.com', category: 'Sports', description: 'Scores, stats, news' },
|
|
652
|
+
{ domain: 'draftkings.com', category: 'Betting', description: 'Odds, lines' },
|
|
653
|
+
{ domain: 'fanduel.com', category: 'Betting', description: 'Odds, lines' },
|
|
654
|
+
{ domain: 'betmgm.com', category: 'Betting', description: 'Odds, lines' },
|
|
655
|
+
// Tech / product launches
|
|
656
|
+
{ domain: 'producthunt.com', category: 'Tech', description: 'Product launches, upvotes' },
|
|
657
|
+
// Documents
|
|
658
|
+
{ domain: '*.pdf URLs', category: 'Documents', description: 'PDF text extraction' },
|
|
659
|
+
// Weather
|
|
660
|
+
{ domain: 'weather.com', category: 'Weather', description: 'Forecasts, conditions' },
|
|
661
|
+
{ domain: 'accuweather.com', category: 'Weather', description: 'Forecasts, conditions' },
|
|
662
|
+
{ domain: 'api.open-meteo.com', category: 'Weather', description: 'Free weather API' },
|
|
663
|
+
];
|
|
664
|
+
if (options.json) {
|
|
665
|
+
console.log(JSON.stringify(extractors, null, 2));
|
|
666
|
+
return;
|
|
667
|
+
}
|
|
668
|
+
// Group by category
|
|
669
|
+
const byCategory = new Map();
|
|
670
|
+
for (const e of extractors) {
|
|
671
|
+
if (!byCategory.has(e.category))
|
|
672
|
+
byCategory.set(e.category, []);
|
|
673
|
+
byCategory.get(e.category).push(e);
|
|
674
|
+
}
|
|
675
|
+
console.log(`\n🔌 WebPeel Domain Extractors (${extractors.length} total)\n`);
|
|
676
|
+
for (const [cat, items] of byCategory) {
|
|
677
|
+
console.log(` ${cat}`);
|
|
678
|
+
for (const item of items) {
|
|
679
|
+
const pad = 35;
|
|
680
|
+
const domainPad = item.domain.padEnd(pad);
|
|
681
|
+
console.log(` ${domainPad} ${item.description}`);
|
|
682
|
+
}
|
|
683
|
+
console.log('');
|
|
684
|
+
}
|
|
685
|
+
console.log(' Run `webpeel <url>` to use these automatically based on the URL.');
|
|
686
|
+
});
|
|
591
687
|
}
|
package/dist/cli/utils.js
CHANGED
|
@@ -255,7 +255,37 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
|
|
|
255
255
|
err.statusCode = res.status;
|
|
256
256
|
throw err;
|
|
257
257
|
}
|
|
258
|
-
|
|
258
|
+
let data = await res.json();
|
|
259
|
+
// Handle async job queue mode — API returns { jobId, pollUrl } and we need to poll
|
|
260
|
+
if (data.jobId && data.pollUrl && !data.content) {
|
|
261
|
+
const pollEndpoint = `${apiUrl}${data.pollUrl}`;
|
|
262
|
+
const maxPollMs = 90_000; // 90s max
|
|
263
|
+
const pollInterval = 1_000; // 1s intervals
|
|
264
|
+
const start = Date.now();
|
|
265
|
+
while (Date.now() - start < maxPollMs) {
|
|
266
|
+
await new Promise(r => setTimeout(r, pollInterval));
|
|
267
|
+
const pollRes = await fetch(pollEndpoint, {
|
|
268
|
+
headers: { Authorization: `Bearer ${apiKey}` },
|
|
269
|
+
signal: AbortSignal.timeout(10_000),
|
|
270
|
+
});
|
|
271
|
+
if (!pollRes.ok) {
|
|
272
|
+
throw new Error(`Job poll failed: HTTP ${pollRes.status}`);
|
|
273
|
+
}
|
|
274
|
+
const pollData = await pollRes.json();
|
|
275
|
+
if (pollData.status === 'completed' || pollData.content) {
|
|
276
|
+
data = pollData.result || pollData;
|
|
277
|
+
break;
|
|
278
|
+
}
|
|
279
|
+
if (pollData.status === 'failed' || pollData.status === 'error') {
|
|
280
|
+
throw new Error(pollData.error?.message || pollData.error || 'Job failed on server');
|
|
281
|
+
}
|
|
282
|
+
// Still processing — keep polling
|
|
283
|
+
}
|
|
284
|
+
// If the polling loop ended without a completed result, fail with a timeout error
|
|
285
|
+
if (!data.content && data.jobId) {
|
|
286
|
+
throw new Error('Job timed out waiting for server response. Try again or use local mode (unset WEBPEEL_API_KEY).');
|
|
287
|
+
}
|
|
288
|
+
}
|
|
259
289
|
// Map API response to PeelResult shape that the CLI already handles
|
|
260
290
|
return {
|
|
261
291
|
url: data.url || url,
|
package/dist/cli.js
CHANGED
|
@@ -14,6 +14,20 @@
|
|
|
14
14
|
* npx webpeel --help - Condensed help
|
|
15
15
|
* npx webpeel --help-all - Full option reference
|
|
16
16
|
*/
|
|
17
|
+
// ── Auto-load .env from cwd (lightweight, no dotenv dependency) ──────────────
|
|
18
|
+
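// NOTE: static `import` declarations are hoisted, so the modules imported below are evaluated before this block runs; only code that reads env vars (e.g., WEBPEEL_API_KEY) at call time will see values loaded here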
|
|
19
|
+
import { readFileSync, existsSync } from 'fs';
|
|
20
|
+
import { resolve } from 'path';
|
|
21
|
+
/**
 * Parse the text of a `.env` file into `[name, value]` pairs.
 *
 * Only lines of the form `NAME=value` are recognised, where NAME consists of
 * upper-case letters, digits and underscores and does not start with a digit.
 * Every other line (comments, blanks, lower-case names) is ignored.
 *
 * @param {string} text - Raw file contents.
 * @returns {Array<[string, string]>} Entries in file order.
 */
function parseDotEnvText(text) {
    const entries = [];
    // Split on LF *or* CRLF: with a plain '\n' split, a trailing '\r' stays on
    // each line and the `(.*)$` pattern below can never match it, so every
    // variable in a Windows-saved file would be silently dropped.
    for (const line of text.split(/\r?\n/)) {
        const m = line.match(/^([A-Z_][A-Z0-9_]*)=(.*)$/);
        if (!m)
            continue;
        let value = m[2];
        // Strip one pair of *matching* surrounding quotes only, so values such
        // as `'tis` or `say "hi` keep their unbalanced quote characters.
        const quote = value.length >= 2 ? value[0] : '';
        if ((quote === '"' || quote === "'") && value.endsWith(quote))
            value = value.slice(1, -1);
        entries.push([m[1], value]);
    }
    return entries;
}
{
    // Load `.env` from the current working directory, if present.
    const envPath = resolve(process.cwd(), '.env');
    if (existsSync(envPath)) {
        for (const [name, value] of parseDotEnvText(readFileSync(envPath, 'utf-8'))) {
            // Variables already set (to a non-empty value) in the real
            // environment take precedence over the file.
            if (!process.env[name])
                process.env[name] = value;
        }
    }
}
|
|
17
31
|
import { Command } from 'commander';
|
|
18
32
|
import { VERB_ALIASES, cliVersion, checkForUpdates, buildCommanderHelp, buildCondensedHelp, } from './cli/utils.js';
|
|
19
33
|
import { registerFetchCommands } from './cli/commands/fetch.js';
|
|
@@ -5247,100 +5247,212 @@ async function ebayExtractor(html, url) {
|
|
|
5247
5247
|
// ---------------------------------------------------------------------------
|
|
5248
5248
|
// Yelp extractor — Yelp Fusion API when YELP_API_KEY is set; minimal link-only result otherwise
|
|
5249
5249
|
// ---------------------------------------------------------------------------
|
|
5250
|
-
async function yelpExtractor(
|
|
5250
|
+
async function yelpExtractor(_html, url) {
|
|
5251
|
+
const YELP_API_KEY = process.env.YELP_API_KEY;
|
|
5252
|
+
/**
 * Call a Yelp Fusion API endpoint and return the decoded JSON body.
 *
 * Reads `YELP_API_KEY` from the enclosing scope for the bearer token.
 *
 * @param {string} path - Endpoint path appended to the v3 base URL, e.g. `/businesses/search`.
 * @param {Record<string, string>} [params] - Optional query-string parameters.
 * @returns {Promise<any>} The parsed JSON response.
 * @throws {Error} `Yelp API <status>: <statusText>` on a non-2xx response;
 *   the promise also rejects if the request exceeds the timeout.
 */
async function yelpFetch(path, params) {
    const base = 'https://api.yelp.com/v3';
    const query = params ? new URLSearchParams(params).toString() : '';
    // Only append '?' when there is something to send (avoids a dangling '?'
    // for an empty params object).
    const qs = query ? `?${query}` : '';
    const res = await fetch(`${base}${path}${qs}`, {
        headers: { 'Authorization': `Bearer ${YELP_API_KEY}` },
        // Bound the request so a stalled connection cannot hang the extractor.
        signal: AbortSignal.timeout(10_000),
    });
    if (!res.ok) {
        throw new Error(`Yelp API ${res.status}: ${res.statusText}`);
    }
    return res.json();
}
|
|
5251
5264
|
try {
|
|
5252
|
-
const
|
|
5253
|
-
const
|
|
5254
|
-
|
|
5255
|
-
|
|
5256
|
-
|
|
5257
|
-
|
|
5258
|
-
|
|
5265
|
+
const parsed = new URL(url);
|
|
5266
|
+
const pathname = parsed.pathname;
|
|
5267
|
+
const searchParams = parsed.searchParams;
|
|
5268
|
+
// ----------------------------------------------------------------
|
|
5269
|
+
// If no API key, return a minimal placeholder result that links to Yelp (no HTML scraping)
|
|
5270
|
+
// ----------------------------------------------------------------
|
|
5271
|
+
if (!YELP_API_KEY) {
|
|
5272
|
+
// Legacy fallback: minimal result pointing user to Yelp
|
|
5273
|
+
const term = searchParams.get('find_desc') || searchParams.get('cflt') || 'businesses';
|
|
5274
|
+
const loc = searchParams.get('find_loc') || '';
|
|
5275
|
+
const isBiz = pathname.startsWith('/biz/');
|
|
5276
|
+
const cleanContent = isBiz
|
|
5277
|
+
? `# Yelp Business\n\n*No YELP_API_KEY configured — visit [Yelp](${url}) for details.*`
|
|
5278
|
+
: `# 🔍 Yelp Search: ${term}${loc ? ` in ${loc}` : ''}\n\n*No YELP_API_KEY configured — [View on Yelp](${url})*`;
|
|
5279
|
+
return {
|
|
5280
|
+
domain: 'yelp.com',
|
|
5281
|
+
type: isBiz ? 'business' : 'search',
|
|
5282
|
+
structured: { url },
|
|
5283
|
+
cleanContent,
|
|
5284
|
+
};
|
|
5285
|
+
}
|
|
5286
|
+
// ----------------------------------------------------------------
|
|
5287
|
+
// Business page: /biz/<alias>
|
|
5288
|
+
// ----------------------------------------------------------------
|
|
5289
|
+
if (pathname.startsWith('/biz/')) {
|
|
5290
|
+
const alias = pathname.replace('/biz/', '').split('?')[0].split('#')[0];
|
|
5291
|
+
let biz;
|
|
5259
5292
|
try {
|
|
5260
|
-
|
|
5261
|
-
const items = Array.isArray(parsed) ? parsed : [parsed];
|
|
5262
|
-
for (const item of items) {
|
|
5263
|
-
const type = item['@type'];
|
|
5264
|
-
if (type === 'Restaurant' || type === 'LocalBusiness' || type === 'FoodEstablishment' ||
|
|
5265
|
-
type === 'BarOrPub' || type === 'CafeOrCoffeeShop') {
|
|
5266
|
-
businessData = item;
|
|
5267
|
-
}
|
|
5268
|
-
}
|
|
5293
|
+
biz = await yelpFetch(`/businesses/${alias}`);
|
|
5269
5294
|
}
|
|
5270
|
-
catch {
|
|
5271
|
-
|
|
5272
|
-
|
|
5273
|
-
|
|
5274
|
-
|
|
5275
|
-
|
|
5276
|
-
|
|
5277
|
-
|
|
5295
|
+
catch (e) {
|
|
5296
|
+
if (process.env.DEBUG)
|
|
5297
|
+
console.debug('[webpeel] Yelp biz fetch failed:', e instanceof Error ? e.message : e);
|
|
5298
|
+
return null;
|
|
5299
|
+
}
|
|
5300
|
+
// Fetch reviews (best-effort)
|
|
5301
|
+
let reviews = [];
|
|
5302
|
+
try {
|
|
5303
|
+
const revData = await yelpFetch(`/businesses/${alias}/reviews`, { limit: '3' });
|
|
5304
|
+
reviews = revData.reviews || [];
|
|
5305
|
+
}
|
|
5306
|
+
catch { /* reviews are optional */ }
|
|
5307
|
+
const name = biz.name || alias;
|
|
5308
|
+
const rating = biz.rating != null ? biz.rating.toFixed(1) : '?';
|
|
5309
|
+
const reviewCount = biz.review_count ?? 0;
|
|
5310
|
+
const addr = biz.location;
|
|
5278
5311
|
const address = addr
|
|
5279
|
-
? [addr.
|
|
5312
|
+
? [addr.address1, addr.city, addr.state, addr.zip_code].filter(Boolean).join(', ')
|
|
5280
5313
|
: '';
|
|
5281
|
-
const phone =
|
|
5282
|
-
const
|
|
5283
|
-
const
|
|
5284
|
-
const
|
|
5285
|
-
|
|
5314
|
+
const phone = biz.display_phone || biz.phone || '';
|
|
5315
|
+
const price = biz.price || '';
|
|
5316
|
+
const categories = (biz.categories || []).map((c) => c.title).join(' | ');
|
|
5317
|
+
const yelpUrl = biz.url || url;
|
|
5318
|
+
// Hours
|
|
5319
|
+
let hoursStr = '';
|
|
5320
|
+
if (biz.hours && biz.hours.length > 0) {
|
|
5321
|
+
const dayNames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'];
|
|
5322
|
+
const dayMap = {};
|
|
5323
|
+
for (const slot of biz.hours[0].open || []) {
|
|
5324
|
+
const fmt = (t) => {
|
|
5325
|
+
const h = parseInt(t.slice(0, 2), 10);
|
|
5326
|
+
const m = t.slice(2);
|
|
5327
|
+
const period = h >= 12 ? 'PM' : 'AM';
|
|
5328
|
+
const h12 = h % 12 || 12;
|
|
5329
|
+
return `${h12}:${m} ${period}`;
|
|
5330
|
+
};
|
|
5331
|
+
const day = slot.day;
|
|
5332
|
+
if (!dayMap[day])
|
|
5333
|
+
dayMap[day] = [];
|
|
5334
|
+
dayMap[day].push(`${fmt(slot.start)}–${fmt(slot.end)}`);
|
|
5335
|
+
}
|
|
5336
|
+
hoursStr = Object.entries(dayMap)
|
|
5337
|
+
.map(([d, times]) => `${dayNames[parseInt(d, 10)]}: ${times.join(', ')}`)
|
|
5338
|
+
.join(' | ');
|
|
5339
|
+
}
|
|
5286
5340
|
const lines = [
|
|
5287
|
-
`# ⭐
|
|
5288
|
-
'',
|
|
5289
|
-
rating && `**Rating:** ${rating}/5 (${reviewCount} reviews)`,
|
|
5290
|
-
cuisine && `**Cuisine:** ${cuisine}`,
|
|
5291
|
-
priceRange && `**Price:** ${priceRange}`,
|
|
5292
|
-
address && `**Address:** ${address}`,
|
|
5293
|
-
phone && `**Phone:** ${phone}`,
|
|
5294
|
-
hours && `**Hours:** ${Array.isArray(hours) ? hours.join(', ') : hours}`,
|
|
5295
|
-
description && `\n${description.substring(0, 500)}`,
|
|
5341
|
+
`# ${name} ⭐ ${rating} (${reviewCount.toLocaleString()} reviews)`,
|
|
5296
5342
|
'',
|
|
5297
|
-
|
|
5298
|
-
|
|
5299
|
-
|
|
5300
|
-
|
|
5301
|
-
|
|
5343
|
+
];
|
|
5344
|
+
if (address)
|
|
5345
|
+
lines.push(`📍 ${address}`);
|
|
5346
|
+
if (categories)
|
|
5347
|
+
lines.push(`🏷️ ${categories}${price ? ` | 💰 ${price}` : ''}`);
|
|
5348
|
+
else if (price)
|
|
5349
|
+
lines.push(`💰 ${price}`);
|
|
5350
|
+
if (phone)
|
|
5351
|
+
lines.push(`📞 ${phone}`);
|
|
5352
|
+
if (hoursStr)
|
|
5353
|
+
lines.push(`🕐 ${hoursStr}`);
|
|
5354
|
+
if (biz.is_closed === true)
|
|
5355
|
+
lines.push(`⚠️ *Permanently closed*`);
|
|
5356
|
+
lines.push('');
|
|
5357
|
+
if (reviews.length > 0) {
|
|
5358
|
+
for (const rev of reviews) {
|
|
5359
|
+
const stars = '⭐'.repeat(Math.round(rev.rating || 0));
|
|
5360
|
+
const text = (rev.text || '').replace(/\n+/g, ' ').trim().slice(0, 200);
|
|
5361
|
+
lines.push(`> ${stars} — ${text}${(rev.text || '').length > 200 ? '…' : ''}`);
|
|
5362
|
+
lines.push('');
|
|
5363
|
+
}
|
|
5364
|
+
}
|
|
5365
|
+
lines.push(`[View on Yelp](${yelpUrl})`);
|
|
5302
5366
|
return {
|
|
5303
5367
|
domain: 'yelp.com',
|
|
5304
5368
|
type: 'business',
|
|
5305
|
-
structured: { name, rating, reviewCount, address, phone,
|
|
5369
|
+
structured: { name, rating: parseFloat(rating), reviewCount, address, phone, price, categories, url: yelpUrl },
|
|
5306
5370
|
cleanContent: lines.join('\n'),
|
|
5307
5371
|
};
|
|
5308
5372
|
}
|
|
5309
|
-
//
|
|
5310
|
-
|
|
5311
|
-
|
|
5312
|
-
//
|
|
5313
|
-
const
|
|
5314
|
-
|
|
5315
|
-
|
|
5316
|
-
|
|
5317
|
-
|
|
5318
|
-
|
|
5319
|
-
const fullHref = href.startsWith('/') ? `https://www.yelp.com${href}` : href;
|
|
5320
|
-
listings.push({ name: text, url: fullHref || undefined });
|
|
5321
|
-
}
|
|
5322
|
-
});
|
|
5323
|
-
if (ogTitle || listings.length > 0) {
|
|
5324
|
-
const searchTerm = ogTitle.replace(/\s*-\s*Yelp$/, '').trim();
|
|
5325
|
-
const lines = [
|
|
5326
|
-
`# 🔍 Yelp Search: ${searchTerm || 'Results'}`,
|
|
5327
|
-
ogDescription && `\n${ogDescription}`,
|
|
5328
|
-
listings.length > 0 && `\n**Found ${listings.length} results:**`,
|
|
5329
|
-
...listings.slice(0, 15).map((l, i) => `${i + 1}. ${l.url ? `[${l.name}](${l.url})` : l.name}`),
|
|
5330
|
-
'',
|
|
5331
|
-
`**Search:** [View on Yelp](${url})`,
|
|
5332
|
-
'',
|
|
5333
|
-
'---',
|
|
5334
|
-
'*Source: Yelp*',
|
|
5335
|
-
].filter(Boolean);
|
|
5336
|
-
return {
|
|
5337
|
-
domain: 'yelp.com',
|
|
5338
|
-
type: 'search',
|
|
5339
|
-
structured: { query: searchTerm, count: listings.length, listings },
|
|
5340
|
-
cleanContent: lines.join('\n'),
|
|
5341
|
-
};
|
|
5373
|
+
// ----------------------------------------------------------------
|
|
5374
|
+
// Search / Category URL: /search?find_desc=...&find_loc=...
|
|
5375
|
+
// /search?cflt=restaurants&find_loc=...
|
|
5376
|
+
// ----------------------------------------------------------------
|
|
5377
|
+
const findDesc = searchParams.get('find_desc') || '';
|
|
5378
|
+
const cflt = searchParams.get('cflt') || '';
|
|
5379
|
+
const findLoc = searchParams.get('find_loc') || '';
|
|
5380
|
+
if (!findLoc && !findDesc && !cflt) {
|
|
5381
|
+
// Not a recognized pattern
|
|
5382
|
+
return null;
|
|
5342
5383
|
}
|
|
5343
|
-
|
|
5384
|
+
const apiParams = { limit: '10' };
|
|
5385
|
+
if (findLoc)
|
|
5386
|
+
apiParams.location = findLoc;
|
|
5387
|
+
if (findDesc)
|
|
5388
|
+
apiParams.term = findDesc;
|
|
5389
|
+
if (cflt && !findDesc)
|
|
5390
|
+
apiParams.categories = cflt;
|
|
5391
|
+
let data;
|
|
5392
|
+
try {
|
|
5393
|
+
data = await yelpFetch('/businesses/search', apiParams);
|
|
5394
|
+
}
|
|
5395
|
+
catch (e) {
|
|
5396
|
+
if (process.env.DEBUG)
|
|
5397
|
+
console.debug('[webpeel] Yelp search failed:', e instanceof Error ? e.message : e);
|
|
5398
|
+
return null;
|
|
5399
|
+
}
|
|
5400
|
+
const businesses = data.businesses || [];
|
|
5401
|
+
const total = data.total ?? businesses.length;
|
|
5402
|
+
// Build header
|
|
5403
|
+
const searchLabel = findDesc || cflt || 'Businesses';
|
|
5404
|
+
const locationLabel = findLoc || '';
|
|
5405
|
+
const emoji = cflt === 'restaurants' || findDesc?.toLowerCase().includes('restaurant') ? '🍽️'
|
|
5406
|
+
: findDesc?.toLowerCase().includes('pizza') ? '🍕'
|
|
5407
|
+
: findDesc?.toLowerCase().includes('coffee') || findDesc?.toLowerCase().includes('cafe') ? '☕'
|
|
5408
|
+
: findDesc?.toLowerCase().includes('bar') ? '🍺'
|
|
5409
|
+
: '🔍';
|
|
5410
|
+
const titleParts = [searchLabel.charAt(0).toUpperCase() + searchLabel.slice(1)];
|
|
5411
|
+
if (locationLabel)
|
|
5412
|
+
titleParts.push(`in ${locationLabel}`);
|
|
5413
|
+
const lines = [
|
|
5414
|
+
`# ${emoji} Yelp — ${titleParts.join(' ')}`,
|
|
5415
|
+
'',
|
|
5416
|
+
`*${businesses.length} of ${total.toLocaleString()} results via Yelp Fusion API*`,
|
|
5417
|
+
'',
|
|
5418
|
+
];
|
|
5419
|
+
for (let i = 0; i < businesses.length; i++) {
|
|
5420
|
+
const b = businesses[i];
|
|
5421
|
+
const bName = b.name || 'Unknown';
|
|
5422
|
+
const bRating = b.rating != null ? b.rating.toFixed(1) : '?';
|
|
5423
|
+
const bReviews = b.review_count ?? 0;
|
|
5424
|
+
const bAddr = b.location;
|
|
5425
|
+
const bAddress = bAddr
|
|
5426
|
+
? [bAddr.address1, bAddr.city, bAddr.state, bAddr.zip_code].filter(Boolean).join(', ')
|
|
5427
|
+
: '';
|
|
5428
|
+
const bPhone = b.display_phone || '';
|
|
5429
|
+
const bPrice = b.price || '';
|
|
5430
|
+
const bCategories = (b.categories || []).map((c) => c.title).join(' | ');
|
|
5431
|
+
const bUrl = b.url || '';
|
|
5432
|
+
const bSnippet = b.snippet_text || '';
|
|
5433
|
+
lines.push(`## ${i + 1}. ${bName} ⭐ ${bRating} (${bReviews.toLocaleString()} reviews)`);
|
|
5434
|
+
if (bAddress)
|
|
5435
|
+
lines.push(`📍 ${bAddress}`);
|
|
5436
|
+
const tagLine = [bCategories && `🏷️ ${bCategories}`, bPrice && `💰 ${bPrice}`].filter(Boolean).join(' | ');
|
|
5437
|
+
if (tagLine)
|
|
5438
|
+
lines.push(tagLine);
|
|
5439
|
+
if (bPhone)
|
|
5440
|
+
lines.push(`📞 ${bPhone}`);
|
|
5441
|
+
if (bSnippet)
|
|
5442
|
+
lines.push(`> ${bSnippet.replace(/\n+/g, ' ').trim().slice(0, 150)}`);
|
|
5443
|
+
if (bUrl)
|
|
5444
|
+
lines.push(`[View on Yelp](${bUrl})`);
|
|
5445
|
+
lines.push('');
|
|
5446
|
+
}
|
|
5447
|
+
if (businesses.length === 0) {
|
|
5448
|
+
lines.push(`*No results found for "${searchLabel}"${locationLabel ? ` in ${locationLabel}` : ''}.*`);
|
|
5449
|
+
}
|
|
5450
|
+
return {
|
|
5451
|
+
domain: 'yelp.com',
|
|
5452
|
+
type: 'search',
|
|
5453
|
+
structured: { query: searchLabel, location: locationLabel, total, count: businesses.length, businesses },
|
|
5454
|
+
cleanContent: lines.join('\n'),
|
|
5455
|
+
};
|
|
5344
5456
|
}
|
|
5345
5457
|
catch (e) {
|
|
5346
5458
|
if (process.env.DEBUG)
|
|
@@ -5471,20 +5583,21 @@ async function zillowExtractor(_html, url) {
|
|
|
5471
5583
|
const redfinCityUrl = `https://www.redfin.com/${stateCode}/${cityForUrl}`;
|
|
5472
5584
|
const locationLabel = `${cityName}, ${stateCode}`;
|
|
5473
5585
|
// Try to fetch live Redfin listings via their API
|
|
5474
|
-
// Map common
|
|
5475
|
-
const
|
|
5476
|
-
'NY-New-York':
|
|
5477
|
-
'NY-Staten-Island':
|
|
5478
|
-
'CA-Los-Angeles':
|
|
5479
|
-
'
|
|
5480
|
-
'
|
|
5481
|
-
'
|
|
5482
|
-
'
|
|
5483
|
-
'
|
|
5484
|
-
'
|
|
5586
|
+
// Map common city slugs to Redfin city region IDs (region_type=6)
|
|
5587
|
+
const cityRegionMap = {
|
|
5588
|
+
'NY-New-York': 30749, 'NY-Brooklyn': 30749, 'NY-Queens': 30749, 'NY-Bronx': 30749,
|
|
5589
|
+
'NY-Staten-Island': 30749, 'NY-Manhattan': 30749,
|
|
5590
|
+
'CA-Los-Angeles': 11203, 'CA-San-Francisco': 17151, 'CA-San-Diego': 18142,
|
|
5591
|
+
'CA-San-Jose': 17420,
|
|
5592
|
+
'TX-Houston': 30772, 'TX-Dallas': 35799, 'TX-Austin': 30818,
|
|
5593
|
+
'FL-Miami': 10201, 'FL-Orlando': 13140, 'FL-Tampa': 18280,
|
|
5594
|
+
'IL-Chicago': 29470, 'WA-Seattle': 16163, 'MA-Boston': 1826,
|
|
5595
|
+
'AZ-Phoenix': 14240, 'PA-Philadelphia': 13364, 'GA-Atlanta': 30756,
|
|
5596
|
+
'CO-Denver': 11093, 'MN-Minneapolis': 18959, 'OR-Portland': 14941,
|
|
5597
|
+
'NV-Las-Vegas': 32820, 'NC-Charlotte': 3105, 'OH-Columbus': 8528,
|
|
5485
5598
|
};
|
|
5486
5599
|
const marketKey = `${stateCode}-${cityForUrl}`;
|
|
5487
|
-
const marketId =
|
|
5600
|
+
const marketId = cityRegionMap[marketKey];
|
|
5488
5601
|
if (marketId) {
|
|
5489
5602
|
const payload = await fetchRedfinListings(marketId, 6 /* city */);
|
|
5490
5603
|
if (payload?.homes && payload.homes.length > 0) {
|
|
@@ -5622,32 +5735,21 @@ async function redfinExtractor(_html, url) {
|
|
|
5622
5735
|
const citySlug = stateCity[2];
|
|
5623
5736
|
const cityName = citySlug.replace(/-/g, ' ');
|
|
5624
5737
|
const locationLabel = `${cityName}, ${stateCode}`;
|
|
5625
|
-
// No region ID — use
|
|
5626
|
-
|
|
5627
|
-
|
|
5628
|
-
|
|
5629
|
-
|
|
5630
|
-
|
|
5631
|
-
'
|
|
5632
|
-
'
|
|
5633
|
-
'
|
|
5634
|
-
'
|
|
5635
|
-
'
|
|
5636
|
-
'
|
|
5637
|
-
'CA-Los-Angeles': 4,
|
|
5638
|
-
'CA-San-Francisco': 1,
|
|
5639
|
-
'TX-Houston': 7,
|
|
5640
|
-
'TX-Dallas': 24,
|
|
5641
|
-
'FL-Miami': 13,
|
|
5642
|
-
'IL-Chicago': 3,
|
|
5643
|
-
'WA-Seattle': 16,
|
|
5644
|
-
'MA-Boston': 10,
|
|
5645
|
-
'AZ-Phoenix': 14,
|
|
5646
|
-
'PA-Philadelphia': 12,
|
|
5647
|
-
'GA-Atlanta': 9,
|
|
5738
|
+
// No region ID in URL — use known Redfin city region IDs (region_type=6)
|
|
5739
|
+
const cityRegionMap = {
|
|
5740
|
+
'NY-New-York': 30749, 'NY-Brooklyn': 30749, 'NY-Queens': 30749, 'NY-Bronx': 30749,
|
|
5741
|
+
'NY-Staten-Island': 30749, 'NY-Manhattan': 30749,
|
|
5742
|
+
'CA-Los-Angeles': 11203, 'CA-San-Francisco': 17151, 'CA-San-Diego': 18142,
|
|
5743
|
+
'CA-San-Jose': 17420,
|
|
5744
|
+
'TX-Houston': 30772, 'TX-Dallas': 35799, 'TX-Austin': 30818,
|
|
5745
|
+
'FL-Miami': 10201, 'FL-Orlando': 13140, 'FL-Tampa': 18280,
|
|
5746
|
+
'IL-Chicago': 29470, 'WA-Seattle': 16163, 'MA-Boston': 1826,
|
|
5747
|
+
'AZ-Phoenix': 14240, 'PA-Philadelphia': 13364, 'GA-Atlanta': 30756,
|
|
5748
|
+
'CO-Denver': 11093, 'MN-Minneapolis': 18959, 'OR-Portland': 14941,
|
|
5749
|
+
'NV-Las-Vegas': 32820, 'NC-Charlotte': 3105, 'OH-Columbus': 8528,
|
|
5648
5750
|
};
|
|
5649
5751
|
const marketKey = `${stateCode}-${citySlug}`;
|
|
5650
|
-
const marketId =
|
|
5752
|
+
const marketId = cityRegionMap[marketKey];
|
|
5651
5753
|
if (marketId) {
|
|
5652
5754
|
const payload = await fetchRedfinListings(marketId, 6 /* city */);
|
|
5653
5755
|
if (payload?.homes && payload.homes.length > 0) {
|
|
@@ -22,6 +22,7 @@ export interface SearchFallbackResult {
|
|
|
22
22
|
}
|
|
23
23
|
/**
|
|
24
24
|
* Search for a URL using the best available search provider and extract the snippet.
|
|
25
|
+
* Richer fallback: tries multiple engines if the first returns < 100 tokens.
|
|
25
26
|
* Returns the title, snippet, and any extracted product data.
|
|
26
27
|
*/
|
|
27
28
|
export declare function searchFallback(url: string): Promise<SearchFallbackResult>;
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* Google CSE API → Brave API → Google stealth → DDG
|
|
8
8
|
* This avoids direct HTML scraping which is blocked by CAPTCHAs on datacenter IPs.
|
|
9
9
|
*/
|
|
10
|
-
import { getBestSearchProvider } from './search-provider.js';
|
|
10
|
+
import { getBestSearchProvider, DuckDuckGoProvider } from './search-provider.js';
|
|
11
11
|
/**
|
|
12
12
|
* Detect if a URL is likely a product page.
|
|
13
13
|
*/
|
|
@@ -121,8 +121,15 @@ function buildCachedContent(url, title, snippet, productData) {
|
|
|
121
121
|
lines.push(`*⚠️ Limited content — original page blocked direct access. For full data, configure GOOGLE_SEARCH_KEY or BRAVE_SEARCH_KEY.*`);
|
|
122
122
|
return lines.join('\n');
|
|
123
123
|
}
|
|
124
|
+
/**
 * Count approximate tokens in a string (1 token ≈ 4 chars).
 *
 * @param {string} text - Text to measure.
 * @returns {number} `ceil(text.length / 4)`; 0 for empty, missing, or
 *   non-string input (instead of throwing on `null`/`undefined`).
 */
function countTokens(text) {
    if (typeof text !== 'string' || text.length === 0) {
        return 0;
    }
    return Math.ceil(text.length / 4);
}
|
|
124
130
|
/**
|
|
125
131
|
* Search for a URL using the best available search provider and extract the snippet.
|
|
132
|
+
* Richer fallback: tries multiple engines if the first returns < 100 tokens.
|
|
126
133
|
* Returns the title, snippet, and any extracted product data.
|
|
127
134
|
*/
|
|
128
135
|
export async function searchFallback(url) {
|
|
@@ -142,16 +149,42 @@ export async function searchFallback(url) {
|
|
|
142
149
|
}
|
|
143
150
|
const searchQuery = buildSearchQuery(url);
|
|
144
151
|
const { provider, apiKey } = getBestSearchProvider();
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
152
|
+
// Map provider ID to our source type
|
|
153
|
+
const sourceMap = {
|
|
154
|
+
duckduckgo: 'duckduckgo',
|
|
155
|
+
brave: 'google',
|
|
156
|
+
stealth: 'duckduckgo',
|
|
157
|
+
google: 'google',
|
|
158
|
+
};
|
|
159
|
+
// Try the primary (best) provider first
|
|
160
|
+
let results = await provider.searchWeb(searchQuery, { count: 5, apiKey }).catch(() => []);
|
|
161
|
+
// If primary returns sparse results (< 100 tokens), try DDG as secondary
|
|
162
|
+
const combinedSnippets = [];
|
|
163
|
+
let title = '';
|
|
164
|
+
let source = sourceMap[provider.id] ?? 'google';
|
|
165
|
+
if (results.length > 0) {
|
|
166
|
+
title = results[0].title?.trim() || '';
|
|
167
|
+
combinedSnippets.push(...results.map(r => r.snippet?.trim()).filter(Boolean));
|
|
168
|
+
}
|
|
169
|
+
const primaryTokens = countTokens(combinedSnippets.join(' '));
|
|
170
|
+
if (primaryTokens < 100) {
|
|
171
|
+
// Try DDG as a secondary engine to supplement
|
|
172
|
+
try {
|
|
173
|
+
const ddgProvider = new DuckDuckGoProvider();
|
|
174
|
+
const ddgResults = await ddgProvider.searchWeb(searchQuery, { count: 5 });
|
|
175
|
+
if (ddgResults.length > 0) {
|
|
176
|
+
if (!title)
|
|
177
|
+
title = ddgResults[0].title?.trim() || '';
|
|
178
|
+
if (source !== 'duckduckgo')
|
|
179
|
+
source = 'duckduckgo';
|
|
180
|
+
combinedSnippets.push(...ddgResults.map(r => r.snippet?.trim()).filter(Boolean));
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
catch { /* ignore secondary failure */ }
|
|
151
184
|
}
|
|
152
|
-
|
|
153
|
-
const
|
|
154
|
-
const snippet =
|
|
185
|
+
// Deduplicate the collected snippets and keep the first three (NOTE(review): no Google Cache lookup is performed here)
|
|
186
|
+
const allSnippets = [...new Set(combinedSnippets)]; // deduplicate
|
|
187
|
+
const snippet = allSnippets.slice(0, 3).join('\n\n');
|
|
155
188
|
if (!title && !snippet) {
|
|
156
189
|
return emptyResult;
|
|
157
190
|
}
|
|
@@ -159,14 +192,6 @@ export async function searchFallback(url) {
|
|
|
159
192
|
? extractProductData(title, snippet)
|
|
160
193
|
: undefined;
|
|
161
194
|
const cachedContent = buildCachedContent(url, title, snippet, productData);
|
|
162
|
-
// Map provider ID to our source type
|
|
163
|
-
const sourceMap = {
|
|
164
|
-
duckduckgo: 'duckduckgo',
|
|
165
|
-
brave: 'google',
|
|
166
|
-
stealth: 'duckduckgo',
|
|
167
|
-
google: 'google',
|
|
168
|
-
};
|
|
169
|
-
const source = sourceMap[provider.id] ?? 'google';
|
|
170
195
|
return {
|
|
171
196
|
title,
|
|
172
197
|
snippet,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.73",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
"main": "./dist/index.js",
|
|
9
9
|
"types": "./dist/index.d.ts",
|
|
10
10
|
"bin": {
|
|
11
|
-
"webpeel": "dist/cli.
|
|
11
|
+
"webpeel": "dist/cli.js"
|
|
12
12
|
},
|
|
13
13
|
"exports": {
|
|
14
14
|
".": {
|