npm - webpeel - Versions diffs - 0.21.72 → 0.21.74 - Mend

webpeel 0.21.72 → 0.21.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9):

package/dist/cli/commands/fetch.js +14 -6
package/dist/cli/commands/search.js +96 -0
package/dist/cli/utils.js +31 -1
package/dist/cli.js +14 -0
package/dist/core/domain-extractors.js +154 -0
package/dist/core/pipeline.js +6 -1
package/dist/core/search-fallback.d.ts +1 -0
package/dist/core/search-fallback.js +43 -18
package/package.json +2 -2

package/dist/cli/commands/fetch.js CHANGED Viewed

@@ -682,13 +682,21 @@ export async function runFetch(url, options) {
             result = await fetchViaApi(url, peelOptions, fetchApiKey, fetchApiUrl);
         }
         else {
-            // No API key — show helpful message instead of trying local mode
+            // No API key — fall back to local peel() mode (runs locally, no API needed)
             if (spinner)
-                spinner.fail('Authentication required');
-            console.error('No API key configured. Run: webpeel auth <your-key>');
-            console.error('Get a free key at: https://app.webpeel.dev/keys');
-            await cleanup();
-            process.exit(2);
+                spinner.text = 'Fetching locally (no API key)…';
+            const startLocal = Date.now();
+            const { peel } = await import('../../index.js');
+            const localResult = await peel(url, peelOptions);
+            const elapsed = Date.now() - startLocal;
+            // Normalize to the shape fetchViaApi returns
+            result = {
+                ...localResult,
+                elapsed: localResult.elapsed ?? elapsed,
+                method: localResult.method ?? 'local',
+                tokens: localResult.tokens ?? Math.ceil((localResult.content?.length ?? 0) / 4),
+                cached: false,
+            };
         }
         // Update lastUsed timestamp for named profiles
         if (resolvedProfileName) {

package/dist/cli/commands/search.js CHANGED Viewed

@@ -588,4 +588,100 @@ export function registerSearchCommands(program) {
             process.exit(1);
         }
     });
+    // ── extractors command ────────────────────────────────────────────────────
+    program
+        .command('extractors')
+        .alias('list-extractors')
+        .description('List all supported domain extractors')
+        .option('--json', 'Output as JSON')
+        .action((options) => {
+        const extractors = [
+            // Social
+            { domain: 'twitter.com / x.com', category: 'Social', description: 'Tweets, threads, profiles' },
+            { domain: 'reddit.com', category: 'Social', description: 'Subreddits, posts, comments' },
+            { domain: 'instagram.com', category: 'Social', description: 'Photos, reels, profiles' },
+            { domain: 'tiktok.com', category: 'Social', description: 'Video metadata, captions' },
+            { domain: 'pinterest.com', category: 'Social', description: 'Pins, boards' },
+            { domain: 'linkedin.com', category: 'Social', description: 'Profiles, job listings' },
+            { domain: 'facebook.com', category: 'Social', description: 'Marketplace listings' },
+            // Video / Audio
+            { domain: 'youtube.com', category: 'Video', description: 'Transcripts, metadata, comments' },
+            { domain: 'twitch.tv', category: 'Video', description: 'Streams, clips, channel info' },
+            { domain: 'soundcloud.com', category: 'Audio', description: 'Tracks, playlists' },
+            { domain: 'open.spotify.com', category: 'Audio', description: 'Tracks, albums, playlists' },
+            // Tech / Dev
+            { domain: 'github.com', category: 'Dev', description: 'Repos, issues, PRs, code' },
+            { domain: 'stackoverflow.com', category: 'Dev', description: 'Questions, answers' },
+            { domain: 'npmjs.com', category: 'Dev', description: 'Package metadata, readme' },
+            { domain: 'pypi.org', category: 'Dev', description: 'Package metadata, readme' },
+            { domain: 'dev.to', category: 'Dev', description: 'Articles, comments' },
+            // News / Articles
+            { domain: 'news.ycombinator.com', category: 'News', description: 'HN posts, comments, Ask/Show HN' },
+            { domain: 'medium.com', category: 'Articles', description: 'Articles, publications' },
+            { domain: 'substack.com / *.substack.com', category: 'Articles', description: 'Newsletters, posts' },
+            { domain: 'nytimes.com', category: 'News', description: 'Articles, headlines' },
+            { domain: 'bbc.com', category: 'News', description: 'Articles, headlines' },
+            { domain: 'cnn.com', category: 'News', description: 'Articles, headlines' },
+            // Shopping / E-commerce
+            { domain: 'amazon.com', category: 'Shopping', description: 'Products, prices, reviews' },
+            { domain: 'bestbuy.com', category: 'Shopping', description: 'Products, prices, specs' },
+            { domain: 'walmart.com', category: 'Shopping', description: 'Products, prices' },
+            { domain: 'ebay.com', category: 'Shopping', description: 'Listings, prices' },
+            { domain: 'etsy.com', category: 'Shopping', description: 'Handmade listings' },
+            // Local / Real Estate
+            { domain: 'yelp.com', category: 'Local', description: 'Business info, reviews (needs YELP_API_KEY)' },
+            { domain: 'craigslist.org', category: 'Local', description: 'Listings, classifieds' },
+            { domain: 'zillow.com', category: 'Real Estate', description: 'Property listings, estimates' },
+            { domain: 'redfin.com', category: 'Real Estate', description: 'Property listings, prices' },
+            { domain: 'cars.com', category: 'Automotive', description: 'Car listings, prices' },
+            // Knowledge / Academic
+            { domain: 'en.wikipedia.org', category: 'Knowledge', description: 'Articles, structured data' },
+            { domain: 'arxiv.org', category: 'Academic', description: 'Papers, abstracts, metadata' },
+            { domain: 'semanticscholar.org', category: 'Academic', description: 'Papers, citations' },
+            { domain: 'pubmed.ncbi.nlm.nih.gov', category: 'Academic', description: 'Medical papers, abstracts' },
+            { domain: 'imdb.com', category: 'Knowledge', description: 'Movies, TV shows, cast' },
+            { domain: 'allrecipes.com', category: 'Knowledge', description: 'Recipes, ingredients, steps' },
+            // Finance / Markets
+            { domain: 'polymarket.com', category: 'Finance', description: 'Prediction markets' },
+            { domain: 'kalshi.com', category: 'Finance', description: 'Prediction markets' },
+            { domain: 'tradingview.com', category: 'Finance', description: 'Charts, indicators, ideas' },
+            { domain: 'coingecko.com', category: 'Finance', description: 'Crypto prices, market data' },
+            { domain: 'coinmarketcap.com', category: 'Finance', description: 'Crypto prices, market data' },
+            // Sports / Betting
+            { domain: 'espn.com', category: 'Sports', description: 'Scores, stats, news' },
+            { domain: 'draftkings.com', category: 'Betting', description: 'Odds, lines' },
+            { domain: 'fanduel.com', category: 'Betting', description: 'Odds, lines' },
+            { domain: 'betmgm.com', category: 'Betting', description: 'Odds, lines' },
+            // Entertainment
+            { domain: 'producthunt.com', category: 'Tech', description: 'Product launches, upvotes' },
+            // Documents
+            { domain: '*.pdf URLs', category: 'Documents', description: 'PDF text extraction' },
+            // Weather
+            { domain: 'weather.com', category: 'Weather', description: 'Forecasts, conditions' },
+            { domain: 'accuweather.com', category: 'Weather', description: 'Forecasts, conditions' },
+            { domain: 'api.open-meteo.com', category: 'Weather', description: 'Free weather API' },
+        ];
+        if (options.json) {
+            console.log(JSON.stringify(extractors, null, 2));
+            return;
+        }
+        // Group by category
+        const byCategory = new Map();
+        for (const e of extractors) {
+            if (!byCategory.has(e.category))
+                byCategory.set(e.category, []);
+            byCategory.get(e.category).push(e);
+        }
+        console.log(`\n🔌 WebPeel Domain Extractors (${extractors.length} total)\n`);
+        for (const [cat, items] of byCategory) {
+            console.log(`  ${cat}`);
+            for (const item of items) {
+                const pad = 35;
+                const domainPad = item.domain.padEnd(pad);
+                console.log(`    ${domainPad} ${item.description}`);
+            }
+            console.log('');
+        }
+        console.log('  Run `webpeel <url>` to use these automatically based on the URL.');
+    });
 }

package/dist/cli/utils.js CHANGED Viewed

@@ -255,7 +255,37 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
         err.statusCode = res.status;
         throw err;
     }
-    const data = await res.json();
+    let data = await res.json();
+    // Handle async job queue mode — API returns { jobId, pollUrl } and we need to poll
+    if (data.jobId && data.pollUrl && !data.content) {
+        const pollEndpoint = `${apiUrl}${data.pollUrl}`;
+        const maxPollMs = 90_000; // 90s max
+        const pollInterval = 1_000; // 1s intervals
+        const start = Date.now();
+        while (Date.now() - start < maxPollMs) {
+            await new Promise(r => setTimeout(r, pollInterval));
+            const pollRes = await fetch(pollEndpoint, {
+                headers: { Authorization: `Bearer ${apiKey}` },
+                signal: AbortSignal.timeout(10_000),
+            });
+            if (!pollRes.ok) {
+                throw new Error(`Job poll failed: HTTP ${pollRes.status}`);
+            }
+            const pollData = await pollRes.json();
+            if (pollData.status === 'completed' || pollData.content) {
+                data = pollData.result || pollData;
+                break;
+            }
+            if (pollData.status === 'failed' || pollData.status === 'error') {
+                throw new Error(pollData.error?.message || pollData.error || 'Job failed on server');
+            }
+            // Still processing — keep polling
+        }
+        // If we exited the loop without data, warn
+        if (!data.content && data.jobId) {
+            throw new Error('Job timed out waiting for server response. Try again or use local mode (unset WEBPEEL_API_KEY).');
+        }
+    }
     // Map API response to PeelResult shape that the CLI already handles
     return {
         url: data.url || url,

package/dist/cli.js CHANGED Viewed

@@ -14,6 +14,20 @@
  *   npx webpeel --help                 - Condensed help
  *   npx webpeel --help-all             - Full option reference
  */
+// ── Auto-load .env from cwd (lightweight, no dotenv dependency) ──────────────
+// Must happen BEFORE any imports that read env vars (e.g., WEBPEEL_API_KEY)
+import { readFileSync, existsSync } from 'fs';
+import { resolve } from 'path';
+{
+    const envPath = resolve(process.cwd(), '.env');
+    if (existsSync(envPath)) {
+        for (const line of readFileSync(envPath, 'utf-8').split('\n')) {
+            const m = line.match(/^([A-Z_][A-Z0-9_]*)=(.*)$/);
+            if (m && !process.env[m[1]])
+                process.env[m[1]] = m[2].replace(/^["']|["']$/g, '');
+        }
+    }
+}
 import { Command } from 'commander';
 import { VERB_ALIASES, cliVersion, checkForUpdates, buildCommanderHelp, buildCondensedHelp, } from './cli/utils.js';
 import { registerFetchCommands } from './cli/commands/fetch.js';

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -121,6 +121,8 @@ const REGISTRY = [
     { match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
     { match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
     { match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
+    // ── Travel ──────────────────────────────────────────────────────────────
+    { match: (h, url = '') => (h === 'www.google.com' || h === 'google.com') && url.includes('/travel/flights'), extractor: googleFlightsExtractor },
 ];
 /**
  * Returns the domain extractor for a URL, or null if none matches.
@@ -5860,3 +5862,155 @@ async function redfinExtractor(_html, url) {
         return null;
     }
 }
+// ---------------------------------------------------------------------------
+// Google Flights extractor
+// ---------------------------------------------------------------------------
+async function googleFlightsExtractor(_html, url) {
+    if (!url.includes('/travel/flights'))
+        return null;
+    // Google Flights is a SPA. The _html parameter is usually readability-processed markdown
+    // (from the pipeline's post-fetch processing), which looks like:
+    //   -   7:15 PM
+    //       7:15 PM on Sat, Apr 4
+    //        – 10:29 PM
+    //       United
+    //       3 hr 14 min
+    //       EWR
+    //       ...
+    //       $188
+    //
+    // This markdown is much easier to parse than raw HTML.
+    let text = _html;
+    // If this is raw HTML (contains <!DOCTYPE or <html), strip HTML tags
+    if (text.includes('<!DOCTYPE') || text.includes('<html')) {
+        text = text
+            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
+            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
+            .replace(/<[^>]+>/g, '\n')
+            .replace(/&amp;/g, '&')
+            .replace(/&lt;/g, '<')
+            .replace(/&gt;/g, '>')
+            .replace(/&#\d+;/g, '')
+            .replace(/\n{2,}/g, '\n');
+    }
+    const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
+    const AIRLINES = ['United', 'Delta', 'American', 'JetBlue', 'Spirit', 'Frontier', 'Southwest', 'Breeze', 'Alaska', 'Hawaiian', 'Sun Country', 'Avelo'];
+    const flights = [];
+    for (let i = 0; i < lines.length; i++) {
+        const line = lines[i];
+        // Detect departure time
+        const departMatch = line.match(/^(?:-\s+)?(\d{1,2}:\d{2}\s*[AP]M)$/);
+        if (!departMatch)
+            continue;
+        const departTime = departMatch[1];
+        let departDate = '', arriveTime = '', airline = '', duration = '';
+        let fromAirport = '', toAirport = '', stops = '', bags = '';
+        let price = 0;
+        for (let j = i + 1; j < Math.min(i + 45, lines.length); j++) {
+            const l = lines[j];
+            // Date
+            const dateM = l.match(/on\s+(\w+,\s+\w+\s+\d+)/);
+            if (dateM && !departDate) {
+                departDate = dateM[1];
+                continue;
+            }
+            // Arrival time
+            const arrM = l.match(/^[–\-–—]\s*(\d{1,2}:\d{2}\s*[AP]M)$/) || l.match(/^(\d{1,2}:\d{2}\s*[AP]M)\s+on\s/);
+            if (arrM && !arriveTime && departTime) {
+                arriveTime = arrM[1];
+                continue;
+            }
+            // Arrival time: also check for "10:29 PM on Sat, Apr 4" pattern (second occurrence)
+            if (!arriveTime && l.match(/^\d{1,2}:\d{2}\s*[AP]M\s+on\s/)) {
+                const m = l.match(/^(\d{1,2}:\d{2}\s*[AP]M)/);
+                if (m) {
+                    arriveTime = m[1];
+                    continue;
+                }
+            }
+            // Airline
+            if (!airline) {
+                for (const a of AIRLINES) {
+                    if (l === a || l.startsWith(a + 'Operated') || l.startsWith(a + ' ')) {
+                        airline = a;
+                        break;
+                    }
+                }
+                if (airline)
+                    continue;
+            }
+            // Duration
+            if (!duration && l.match(/^\d+\s+hr\s+\d+\s+min$/)) {
+                duration = l;
+                continue;
+            }
+            // Airport codes
+            if (l.match(/^[A-Z]{3}$/) && !fromAirport) {
+                fromAirport = l;
+                continue;
+            }
+            if (l.match(/^[A-Z]{3}$/) && fromAirport && !toAirport && l !== fromAirport) {
+                toAirport = l;
+                continue;
+            }
+            // Stops
+            if (!stops && (l === 'Nonstop' || l.match(/^\d+\s+stop/))) {
+                stops = l;
+                continue;
+            }
+            // Bags
+            if (l.includes('carry-on bag') && !bags) {
+                bags = l.includes('not included') ? 'Carry-on NOT included (extra fee)' : 'Carry-on included';
+                continue;
+            }
+            // Price — first occurrence only
+            const priceM = l.match(/^\$(\d[\d,]*)$/);
+            if (priceM && !price) {
+                price = parseInt(priceM[1].replace(',', ''));
+                break;
+            }
+        }
+        if (departTime && arriveTime && airline && price) {
+            flights.push({ departTime, arriveTime, departDate, airline, duration, fromAirport, toAirport, stops: stops || 'Unknown', price, priceStr: `$${price}`, bags });
+        }
+    }
+    // Deduplicate
+    const seen = new Set();
+    const unique = flights.filter(f => {
+        const key = `${f.departTime}-${f.airline}-${f.price}`;
+        if (seen.has(key))
+            return false;
+        seen.add(key);
+        return true;
+    });
+    if (unique.length === 0)
+        return null;
+    unique.sort((a, b) => a.price - b.price);
+    // Parse route from URL
+    const u = new URL(url);
+    const query = (u.searchParams.get('q') || '').replace(/Flights?\s+(from\s+)?/i, '').replace(/\s+one\s+way/i, '').trim();
+    const md = [
+        `# ✈️ Flights — ${query || 'Search Results'}`,
+        '',
+        `*${unique.length} flights found · Source: [Google Flights](${url})*`,
+        `*Prices include taxes + fees for 1 adult. Book directly via airline.*`,
+        '',
+    ];
+    for (let idx = 0; idx < unique.length; idx++) {
+        const f = unique[idx];
+        md.push(`## ${idx + 1}. ${f.airline} — ${f.priceStr}`);
+        md.push(`🕐 Depart **${f.departTime}** → Arrive **${f.arriveTime}**${f.departDate ? ` · ${f.departDate}` : ''}`);
+        md.push(`🛫 ${f.fromAirport} → ${f.toAirport} · ${f.duration} · ${f.stops}`);
+        if (f.bags)
+            md.push(`🧳 ${f.bags}`);
+        md.push('');
+    }
+    md.push('---');
+    md.push(`📌 *Prices change frequently. [View live prices on Google Flights](${url})*`);
+    return {
+        domain: 'google.com/travel/flights',
+        type: 'flights',
+        structured: { flights: unique, route: query, source: 'Google Flights', sourceUrl: url },
+        cleanContent: md.join('\n'),
+    };
+}

package/dist/core/pipeline.js CHANGED Viewed

@@ -991,7 +991,12 @@ export async function postProcess(ctx) {
     if (getDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
         try {
             ctx.timer.mark('domainExtract');
-            const ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
+            // Try raw HTML first, then fall back to readability-processed content
+            // (some SPAs like Google Flights have data only after readability processing)
+            let ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
+            if (!ddResult && ctx.content) {
+                ddResult = await extractDomainData(ctx.content, fetchResult.url);
+            }
             ctx.timer.end('domainExtract');
             if (ddResult) {
                 ctx.domainData = ddResult;

package/dist/core/search-fallback.d.ts CHANGED Viewed

@@ -22,6 +22,7 @@ export interface SearchFallbackResult {
 }
 /**
  * Search for a URL using the best available search provider and extract the snippet.
+ * Richer fallback: tries multiple engines if the first returns < 100 tokens.
  * Returns the title, snippet, and any extracted product data.
  */
 export declare function searchFallback(url: string): Promise<SearchFallbackResult>;

package/dist/core/search-fallback.js CHANGED Viewed

@@ -7,7 +7,7 @@
  *   Google CSE API → Brave API → Google stealth → DDG
  * This avoids direct HTML scraping which is blocked by CAPTCHAs on datacenter IPs.
  */
-import { getBestSearchProvider } from './search-provider.js';
+import { getBestSearchProvider, DuckDuckGoProvider } from './search-provider.js';
 /**
  * Detect if a URL is likely a product page.
  */
@@ -121,8 +121,15 @@ function buildCachedContent(url, title, snippet, productData) {
     lines.push(`*⚠️ Limited content — original page blocked direct access. For full data, configure GOOGLE_SEARCH_KEY or BRAVE_SEARCH_KEY.*`);
     return lines.join('\n');
 }
+/**
+ * Count approximate tokens in a string (1 token ≈ 4 chars).
+ */
+function countTokens(text) {
+    return Math.ceil(text.length / 4);
+}
 /**
  * Search for a URL using the best available search provider and extract the snippet.
+ * Richer fallback: tries multiple engines if the first returns < 100 tokens.
  * Returns the title, snippet, and any extracted product data.
  */
 export async function searchFallback(url) {
@@ -142,16 +149,42 @@ export async function searchFallback(url) {
         }
         const searchQuery = buildSearchQuery(url);
         const { provider, apiKey } = getBestSearchProvider();
-        const results = await provider.searchWeb(searchQuery, {
-            count: 3,
-            apiKey,
-        });
-        if (!results || results.length === 0) {
-            return emptyResult;
+        // Map provider ID to our source type
+        const sourceMap = {
+            duckduckgo: 'duckduckgo',
+            brave: 'google',
+            stealth: 'duckduckgo',
+            google: 'google',
+        };
+        // Try the primary (best) provider first
+        let results = await provider.searchWeb(searchQuery, { count: 5, apiKey }).catch(() => []);
+        // If primary returns sparse results (< 100 tokens), try DDG as secondary
+        const combinedSnippets = [];
+        let title = '';
+        let source = sourceMap[provider.id] ?? 'google';
+        if (results.length > 0) {
+            title = results[0].title?.trim() || '';
+            combinedSnippets.push(...results.map(r => r.snippet?.trim()).filter(Boolean));
+        }
+        const primaryTokens = countTokens(combinedSnippets.join(' '));
+        if (primaryTokens < 100) {
+            // Try DDG as a secondary engine to supplement
+            try {
+                const ddgProvider = new DuckDuckGoProvider();
+                const ddgResults = await ddgProvider.searchWeb(searchQuery, { count: 5 });
+                if (ddgResults.length > 0) {
+                    if (!title)
+                        title = ddgResults[0].title?.trim() || '';
+                    if (source !== 'duckduckgo')
+                        source = 'duckduckgo';
+                    combinedSnippets.push(...ddgResults.map(r => r.snippet?.trim()).filter(Boolean));
+                }
+            }
+            catch { /* ignore secondary failure */ }
         }
-        const topResult = results[0];
-        const title = topResult.title?.trim() || '';
-        const snippet = topResult.snippet?.trim() || '';
+        // Also try Google Cache URL as a last-resort content source
+        const allSnippets = [...new Set(combinedSnippets)]; // deduplicate
+        const snippet = allSnippets.slice(0, 3).join('\n\n');
         if (!title && !snippet) {
             return emptyResult;
         }
@@ -159,14 +192,6 @@ export async function searchFallback(url) {
             ? extractProductData(title, snippet)
             : undefined;
         const cachedContent = buildCachedContent(url, title, snippet, productData);
-        // Map provider ID to our source type
-        const sourceMap = {
-            duckduckgo: 'duckduckgo',
-            brave: 'google',
-            stealth: 'duckduckgo',
-            google: 'google',
-        };
-        const source = sourceMap[provider.id] ?? 'google';
         return {
             title,
             snippet,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.72",
+  "version": "0.21.74",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",
@@ -8,7 +8,7 @@
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
   "bin": {
-    "webpeel": "dist/cli.bundle.cjs"
+    "webpeel": "dist/cli.js"
   },
   "exports": {
     ".": {