npm - webpeel - Versions diffs - 0.21.86 → 0.21.87 - Mend

webpeel 0.21.86 → 0.21.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155)

package/dist/cli/commands/fetch.js +13 -0
package/dist/cli/utils.js +10 -1
package/dist/core/http-fetch.js +19 -2
package/dist/core/pipeline.js +3 -2
package/dist/core/schema-templates.js +37 -24
package/dist/core/search-provider.d.ts +2 -0
package/dist/core/search-provider.js +9 -2
package/dist/core/searxng-provider.d.ts +1 -0
package/dist/core/searxng-provider.js +1 -0
package/dist/ee/domain-extractors.d.ts +4 -44
package/dist/ee/domain-extractors.js +4 -6338
package/dist/ee/extractors/allrecipes.d.ts +2 -0
package/dist/ee/extractors/allrecipes.js +120 -0
package/dist/ee/extractors/amazon.d.ts +2 -0
package/dist/ee/extractors/amazon.js +78 -0
package/dist/ee/extractors/arxiv.d.ts +2 -0
package/dist/ee/extractors/arxiv.js +137 -0
package/dist/ee/extractors/bestbuy.d.ts +2 -0
package/dist/ee/extractors/bestbuy.js +78 -0
package/dist/ee/extractors/carscom.d.ts +2 -0
package/dist/ee/extractors/carscom.js +121 -0
package/dist/ee/extractors/coingecko.d.ts +2 -0
package/dist/ee/extractors/coingecko.js +134 -0
package/dist/ee/extractors/craigslist.d.ts +2 -0
package/dist/ee/extractors/craigslist.js +92 -0
package/dist/ee/extractors/devto.d.ts +2 -0
package/dist/ee/extractors/devto.js +135 -0
package/dist/ee/extractors/ebay.d.ts +2 -0
package/dist/ee/extractors/ebay.js +90 -0
package/dist/ee/extractors/espn.d.ts +2 -0
package/dist/ee/extractors/espn.js +255 -0
package/dist/ee/extractors/etsy.d.ts +2 -0
package/dist/ee/extractors/etsy.js +52 -0
package/dist/ee/extractors/facebook.d.ts +2 -0
package/dist/ee/extractors/facebook.js +46 -0
package/dist/ee/extractors/github.d.ts +2 -0
package/dist/ee/extractors/github.js +196 -0
package/dist/ee/extractors/google-flights.d.ts +2 -0
package/dist/ee/extractors/google-flights.js +176 -0
package/dist/ee/extractors/hackernews.d.ts +2 -0
package/dist/ee/extractors/hackernews.js +147 -0
package/dist/ee/extractors/imdb.d.ts +2 -0
package/dist/ee/extractors/imdb.js +172 -0
package/dist/ee/extractors/index.d.ts +26 -0
package/dist/ee/extractors/index.js +247 -0
package/dist/ee/extractors/instagram.d.ts +2 -0
package/dist/ee/extractors/instagram.js +102 -0
package/dist/ee/extractors/kalshi.d.ts +2 -0
package/dist/ee/extractors/kalshi.js +115 -0
package/dist/ee/extractors/kayak-cars.d.ts +2 -0
package/dist/ee/extractors/kayak-cars.js +270 -0
package/dist/ee/extractors/linkedin.d.ts +2 -0
package/dist/ee/extractors/linkedin.js +113 -0
package/dist/ee/extractors/medium.d.ts +2 -0
package/dist/ee/extractors/medium.js +130 -0
package/dist/ee/extractors/news.d.ts +4 -0
package/dist/ee/extractors/news.js +173 -0
package/dist/ee/extractors/npm.d.ts +2 -0
package/dist/ee/extractors/npm.js +86 -0
package/dist/ee/extractors/pdf.d.ts +2 -0
package/dist/ee/extractors/pdf.js +108 -0
package/dist/ee/extractors/pinterest.d.ts +2 -0
package/dist/ee/extractors/pinterest.js +34 -0
package/dist/ee/extractors/polymarket.d.ts +2 -0
package/dist/ee/extractors/polymarket.js +162 -0
package/dist/ee/extractors/producthunt.d.ts +2 -0
package/dist/ee/extractors/producthunt.js +88 -0
package/dist/ee/extractors/pubmed.d.ts +2 -0
package/dist/ee/extractors/pubmed.js +162 -0
package/dist/ee/extractors/pypi.d.ts +2 -0
package/dist/ee/extractors/pypi.js +80 -0
package/dist/ee/extractors/reddit.d.ts +2 -0
package/dist/ee/extractors/reddit.js +308 -0
package/dist/ee/extractors/redfin.d.ts +2 -0
package/dist/ee/extractors/redfin.js +156 -0
package/dist/ee/extractors/semanticscholar.d.ts +2 -0
package/dist/ee/extractors/semanticscholar.js +131 -0
package/dist/ee/extractors/shared.d.ts +12 -0
package/dist/ee/extractors/shared.js +76 -0
package/dist/ee/extractors/soundcloud.d.ts +2 -0
package/dist/ee/extractors/soundcloud.js +34 -0
package/dist/ee/extractors/sportsbetting.d.ts +2 -0
package/dist/ee/extractors/sportsbetting.js +37 -0
package/dist/ee/extractors/spotify.d.ts +2 -0
package/dist/ee/extractors/spotify.js +34 -0
package/dist/ee/extractors/stackoverflow.d.ts +2 -0
package/dist/ee/extractors/stackoverflow.js +61 -0
package/dist/ee/extractors/substack.d.ts +2 -0
package/dist/ee/extractors/substack.js +115 -0
package/dist/ee/extractors/substackroot.d.ts +2 -0
package/dist/ee/extractors/substackroot.js +46 -0
package/dist/ee/extractors/tiktok.d.ts +2 -0
package/dist/ee/extractors/tiktok.js +29 -0
package/dist/ee/extractors/tradingview.d.ts +2 -0
package/dist/ee/extractors/tradingview.js +176 -0
package/dist/ee/extractors/twitch.d.ts +2 -0
package/dist/ee/extractors/twitch.js +36 -0
package/dist/ee/extractors/twitter.d.ts +2 -0
package/dist/ee/extractors/twitter.js +327 -0
package/dist/ee/extractors/types.d.ts +14 -0
package/dist/ee/extractors/types.js +1 -0
package/dist/ee/extractors/walmart.d.ts +2 -0
package/dist/ee/extractors/walmart.js +50 -0
package/dist/ee/extractors/weather.d.ts +2 -0
package/dist/ee/extractors/weather.js +133 -0
package/dist/ee/extractors/wikipedia.d.ts +4 -0
package/dist/ee/extractors/wikipedia.js +103 -0
package/dist/ee/extractors/yelp.d.ts +2 -0
package/dist/ee/extractors/yelp.js +216 -0
package/dist/ee/extractors/youtube.d.ts +2 -0
package/dist/ee/extractors/youtube.js +189 -0
package/dist/ee/extractors/zillow.d.ts +54 -0
package/dist/ee/extractors/zillow.js +247 -0
package/dist/server/app.js +8 -0
package/dist/server/bull-queues.d.ts +1 -0
package/dist/server/routes/feed.d.ts +15 -0
package/dist/server/routes/feed.js +311 -0
package/dist/server/routes/fetch-queue.js +1 -0
package/dist/server/routes/fetch.js +120 -2
package/dist/server/routes/go.d.ts +14 -0
package/dist/server/routes/go.js +81 -0
package/dist/server/routes/smart-search.d.ts +5 -3
package/dist/server/routes/smart-search.js +1842 -141
package/dist/types.d.ts +4 -0
package/package.json +12 -2
package/dist/core/challenge-solver.d.ts +0 -72
package/dist/core/challenge-solver.js +0 -720
package/dist/core/cloak-fetch.d.ts +0 -42
package/dist/core/cloak-fetch.js +0 -148
package/dist/core/cycle-fetch.d.ts +0 -26
package/dist/core/cycle-fetch.js +0 -98
package/dist/core/domain-extractors-basic.d.ts +0 -36
package/dist/core/domain-extractors-basic.js +0 -28
package/dist/core/domain-extractors-public.d.ts +0 -20
package/dist/core/domain-extractors-public.js +0 -35
package/dist/core/domain-extractors.d.ts +0 -48
package/dist/core/domain-extractors.js +0 -6342
package/dist/core/search-fallback.d.ts +0 -28
package/dist/core/search-fallback.js +0 -209
package/dist/core/stealth-patches.d.ts +0 -14
package/dist/core/stealth-patches.js +0 -20
package/dist/server/premium/challenge.d.ts +0 -1
package/dist/server/premium/challenge.js +0 -1
package/dist/server/premium/domain-intel.d.ts +0 -16
package/dist/server/premium/domain-intel.js +0 -133
package/dist/server/premium/extractors.d.ts +0 -1
package/dist/server/premium/extractors.js +0 -1
package/dist/server/premium/index.d.ts +0 -20
package/dist/server/premium/index.js +0 -50
package/dist/server/premium/spa-detection.d.ts +0 -2
package/dist/server/premium/spa-detection.js +0 -2
package/dist/server/premium/stability.d.ts +0 -4
package/dist/server/premium/stability.js +0 -29
package/dist/server/premium/swr-cache.d.ts +0 -14
package/dist/server/premium/swr-cache.js +0 -34

package/dist/cli/commands/fetch.js CHANGED Viewed

@@ -289,6 +289,7 @@ export async function runFetch(url, options) {
             format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
             budget: null, // Budget excluded from cache key — cache stores full content
             readable: options.readable || false,
+            noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
         };
         const cachedResult = getCache(url, cacheOptions);
         if (cachedResult) {
@@ -603,6 +604,7 @@ export async function runFetch(url, options) {
             headers,
             cookies: options.cookie,
             raw: options.raw || false,
+            noDomainApi: options.skipDomainApi || false,
             lite: options.lite || false,
             actions,
             maxTokens: options.maxTokens,
@@ -724,6 +726,16 @@ export async function runFetch(url, options) {
                 ? ` [${result.domainData.domain}:${result.domainData.type}]`
                 : '';
             spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
+            // Smart hints — suggest features the user might not know about
+            if (!options.silent && !options.json && !options.skipDomainApi) {
+                if (result.method === 'domain-api') {
+                    const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
+                    console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
+                }
+            }
+            if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
+                console.error(`\x1b[33m💡 Tip: Page returned very little content. Try --render for JavaScript-heavy sites or --stealth if blocked.\x1b[0m`);
+            }
         }
         // Show metadata header
         const pageTitle = result.metadata?.title || result.title;
@@ -1176,6 +1188,7 @@ export function registerFetchCommands(program) {
         .option('--images', 'Output image URLs from the page')
         .option('--meta', 'Output only the page metadata (title, description, author, etc.)')
         .option('--raw', 'Return full page without smart content extraction')
+        .option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
         .option('--full', 'Alias for --raw — full page content, no budget')
         .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
         .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')

package/dist/cli/utils.js CHANGED Viewed

@@ -35,7 +35,14 @@ export async function checkForUpdates() {
         if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
             // Skip update notice in silent mode
             if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
-                console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
+                const msg = `Update available: ${cliVersion} → ${latest}`;
+                const cmd = 'npm i -g webpeel@latest';
+                const width = Math.max(msg.length, cmd.length) + 4;
+                const line = '─'.repeat(width);
+                console.error(`\n\x1b[33m╭${line}╮\x1b[0m`);
+                console.error(`\x1b[33m│\x1b[0m  ${msg.padEnd(width - 2)}  \x1b[33m│\x1b[0m`);
+                console.error(`\x1b[33m│\x1b[0m  Run: \x1b[36m${cmd}\x1b[0m${' '.repeat(width - 6 - cmd.length)}  \x1b[33m│\x1b[0m`);
+                console.error(`\x1b[33m╰${line}╯\x1b[0m\n`);
             }
         }
     }
@@ -208,6 +215,8 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
         params.set('budget', String(options.budget));
     if (options.question)
         params.set('question', options.question);
+    if (options.noDomainApi)
+        params.set('noDomainApi', 'true');
     const res = await fetch(`${apiUrl}/v1/fetch?${params}`, {
         headers: { Authorization: `Bearer ${apiKey}` },
         signal: AbortSignal.timeout(60000),

package/dist/core/http-fetch.js CHANGED Viewed

@@ -154,10 +154,12 @@ export function createAbortError() {
  * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
  */
 export const PROXY_PREFERRED_DOMAINS = [
+    // Social / content
     'reddit.com',
     'old.reddit.com',
     'forbes.com',
     'fortune.com',
+    // Auto / cars
     'cargurus.com',
     'edmunds.com',
     'cars.com',
@@ -165,14 +167,29 @@ export const PROXY_PREFERRED_DOMAINS = [
     'autotrader.com',
     'carfax.com',
     'tesla.com',
+    'motortrend.com',
+    'jdpower.com',
+    // Finance / home
     'nerdwallet.com',
     'bankrate.com',
     'homeadvisor.com',
     'angi.com',
+    // EV / auto news
     'insideevs.com',
     'electrek.co',
-    'motortrend.com',
-    'jdpower.com',
+    // Restaurants / food
+    'yelp.com',
+    // Travel
+    'kayak.com',
+    'booking.com',
+    'expedia.com',
+    'tripadvisor.com',
+    'hotels.com',
+    // Shopping / products
+    'amazon.com',
+    'bestbuy.com',
+    'walmart.com',
+    'target.com',
 ];
 /**
  * Returns true if the URL's domain is on the proxy-preferred blocklist.

package/dist/core/pipeline.js CHANGED Viewed

@@ -341,7 +341,8 @@ export async function fetchContent(ctx) {
     const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
     // Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
     // This avoids expensive browser fetches that often get blocked
-    if (hasDomainExtractor(ctx.url)) {
+    // Skip if noDomainApi is set — user wants raw page content, not API shortcut
+    if (hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi) {
         try {
             ctx.timer.mark('domainApiFirst');
             const ddResult = await runDomainExtract('', ctx.url);
@@ -1078,7 +1079,7 @@ export async function postProcess(ctx) {
     }
     // Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
     // Fires when URL matches a known domain. Replaces content with clean markdown.
-    if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
+    if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled && !ctx.options.noDomainApi) {
         try {
             ctx.timer.mark('domainExtract');
             // Try raw HTML first, then fall back to readability-processed content

package/dist/core/schema-templates.js CHANGED Viewed

@@ -57,42 +57,55 @@ export const SCHEMA_TEMPLATES = {
         name: 'Event',
         description: 'Extract event information',
         fields: {
-            name: 'event name or title',
-            date: 'event date and time',
-            location: 'venue or location',
-            description: 'event description',
-            price: 'ticket price or cost',
-            organizer: 'event organizer',
-            url: 'registration or ticket URL',
+            name: 'What is the name of this event?',
+            date: 'When does this event take place?',
+            time: 'What time does this event start?',
+            location: 'Where is this event held?',
+            price: 'How much does this event cost?',
+            description: 'What is this event about?',
+            organizer: 'Who is organizing this event?',
         },
     },
     recipe: {
         name: 'Recipe',
         description: 'Extract recipe information from cooking sites',
         fields: {
-            title: 'recipe name',
-            ingredients: 'list of ingredients with quantities',
-            instructions: 'cooking steps or directions',
-            prepTime: 'preparation time',
-            cookTime: 'cooking time',
-            servings: 'number of servings',
-            calories: 'calories per serving',
-            author: 'recipe author or source',
+            name: 'What is the name of this recipe?',
+            ingredients: 'What ingredients are needed? List all.',
+            steps: 'What are the cooking steps or instructions?',
+            prepTime: 'How long does preparation take?',
+            cookTime: 'How long does cooking take?',
+            servings: 'How many servings does this recipe make?',
+            calories: 'How many calories per serving?',
+            rating: 'What is the recipe rating?',
         },
     },
     job: {
         name: 'Job',
         description: 'Extract job posting information',
         fields: {
-            title: 'job title',
-            company: 'company name',
-            location: 'job location',
-            salary: 'salary range or compensation',
-            description: 'job description',
-            requirements: 'required qualifications or skills',
-            type: 'job type (full-time, part-time, remote)',
-            posted: 'date posted',
-            applyUrl: 'application URL or link',
+            title: 'What is the job title?',
+            company: 'What company is hiring?',
+            location: 'Where is the job located?',
+            salary: 'What is the salary or compensation range?',
+            type: 'Is this full-time, part-time, contract, or remote?',
+            requirements: 'What are the key requirements or qualifications?',
+            description: 'What is the job description?',
+            applyUrl: 'What is the URL or method to apply?',
+        },
+    },
+    business: {
+        name: 'Business',
+        description: 'Extract business/company information',
+        fields: {
+            name: 'What is the business name?',
+            address: 'What is the full address?',
+            phone: 'What is the phone number?',
+            hours: 'What are the business hours?',
+            rating: 'What is the business rating?',
+            reviewCount: 'How many reviews does this business have?',
+            website: 'What is the business website URL?',
+            categories: 'What type of business is this?',
         },
     },
     review: {

package/dist/core/search-provider.d.ts CHANGED Viewed

@@ -19,6 +19,8 @@ export interface WebSearchResult {
     snippet: string;
     /** Relevance score (0–1) based on keyword overlap with query. Added by filterRelevantResults. */
     relevanceScore?: number;
+    /** Thumbnail/image URL from SearXNG results (img_src or thumbnail field). */
+    imageUrl?: string;
 }
 export interface WebSearchOptions {
     /** Number of results (1-10) */

package/dist/core/search-provider.js CHANGED Viewed

@@ -1066,8 +1066,15 @@ export class DuckDuckGoProvider {
                 if (searxResults.length > 0) {
                     providerStats.record('searxng', true);
                     log.debug(`source=searxng returned ${searxResults.length} results`);
-                    const filtered = filterRelevantResults(searxResults, query);
-                    return filtered.length > 0 ? filtered : searxResults;
+                    // Map SearXNG results to WebSearchResult (description → snippet, imageUrl passthrough)
+                    const mapped = searxResults.map(r => ({
+                        title: r.title,
+                        url: r.url,
+                        snippet: r.description ?? '',
+                        imageUrl: r.imageUrl,
+                    }));
+                    const filtered = filterRelevantResults(mapped, query);
+                    return filtered.length > 0 ? filtered : mapped;
                 }
                 providerStats.record('searxng', false);
                 log.debug('SearXNG returned 0 results, falling through to DDG');

package/dist/core/searxng-provider.d.ts CHANGED Viewed

@@ -16,6 +16,7 @@ export interface SearXNGSearchResult {
     description?: string;
     publishedDate?: string;
     score?: number;
+    imageUrl?: string;
 }
 /**
  * Fetches search results from a SearXNG instance.

package/dist/core/searxng-provider.js CHANGED Viewed

@@ -69,6 +69,7 @@ export async function searchViaSearXNG(query, options = {}) {
                 description: r.content ?? undefined,
                 publishedDate: r.publishedDate ?? undefined,
                 score: r.score ?? undefined,
+                imageUrl: r.img_src ?? r.thumbnail ?? undefined,
             });
             if (output.length >= count)
                 break;

package/dist/ee/domain-extractors.d.ts CHANGED Viewed

@@ -1,48 +1,8 @@
 /**
  * Domain-aware structured extractors for WebPeel.
  *
- * When peel() fetches a URL that matches a known domain, the relevant
- * extractor fires and returns clean structured data + a markdown summary.
- *
- * Supported domains:
- *  - twitter.com / x.com  — tweets, threads, profiles
- *  - reddit.com            — posts with comments (via JSON API)
- *  - github.com            — repos, issues, PRs, users (via GitHub API)
- *  - news.ycombinator.com  — stories with comments (via HN Firebase API)
- */
-export interface DomainExtractResult {
-    /** Canonical domain name (e.g. 'twitter.com') */
-    domain: string;
-    /** Page type within the domain (e.g. 'tweet', 'thread', 'repo', 'issue') */
-    type: string;
-    /** Domain-specific structured data */
-    structured: Record<string, any>;
-    /** Clean markdown representation of the content */
-    cleanContent: string;
-    /** Raw HTML size in characters (from the actual HTML page fetched by the extractor) */
-    rawHtmlSize?: number;
-}
-/** An extractor receives the raw HTML and original URL, may make API calls. */
-export type DomainExtractor = (html: string, url: string) => Promise<DomainExtractResult | null>;
-/**
- * Returns the domain extractor for a URL, or null if none matches.
- */
-export declare function getDomainExtractor(url: string): DomainExtractor | null;
-/** Clear the extractor response cache (used in tests). */
-export declare function clearExtractorCache(): void;
-/**
- * Inject a Redis client for shared cross-pod caching.
- * Called from server startup after Redis is initialized.
- * Safe to call with null to disable Redis caching (e.g., CLI mode).
- */
-export declare function setExtractorRedis(redis: any): void;
-/**
- * Convenience: run the extractor for the URL (if one exists).
- * Wraps _extractDomainDataImpl with a two-tier cache:
- *   1. In-memory LRU (per-pod, fastest)
- *   2. Redis shared cache (cross-pod, shared across all replicas)
- *
- * With multiple API pods, Redis ensures the first pod to fetch a URL
- * populates cache for all others — eliminating redundant API calls.
+ * This file re-exports from individual extractor files for backward compatibility.
+ * Each extractor now lives in its own file under src/ee/extractors/.
  */
-export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
+export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';
+export type { DomainExtractResult, DomainExtractor } from './extractors/index.js';