npm - webpeel - Versions diffs - 0.21.85 → 0.21.87 - Mend

webpeel 0.21.85 → 0.21.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (159) hide show

package/dist/cli/commands/fetch.js +13 -0
package/dist/cli/utils.js +10 -1
package/dist/core/http-fetch.js +19 -2
package/dist/core/pipeline.js +3 -2
package/dist/core/schema-templates.js +37 -24
package/dist/core/search-provider.d.ts +2 -0
package/dist/core/search-provider.js +9 -2
package/dist/core/searxng-provider.d.ts +1 -0
package/dist/core/searxng-provider.js +1 -0
package/dist/ee/challenge-re-export.d.ts +1 -0
package/dist/ee/challenge-re-export.js +1 -0
package/dist/{core → ee}/challenge-solver.d.ts +1 -1
package/dist/{core → ee}/challenge-solver.js +5 -5
package/dist/ee/domain-extractors.d.ts +8 -0
package/dist/ee/domain-extractors.js +8 -0
package/dist/{server/premium → ee}/domain-intel.d.ts +1 -1
package/dist/ee/extractors/allrecipes.d.ts +2 -0
package/dist/ee/extractors/allrecipes.js +120 -0
package/dist/ee/extractors/amazon.d.ts +2 -0
package/dist/ee/extractors/amazon.js +78 -0
package/dist/ee/extractors/arxiv.d.ts +2 -0
package/dist/ee/extractors/arxiv.js +137 -0
package/dist/ee/extractors/bestbuy.d.ts +2 -0
package/dist/ee/extractors/bestbuy.js +78 -0
package/dist/ee/extractors/carscom.d.ts +2 -0
package/dist/ee/extractors/carscom.js +121 -0
package/dist/ee/extractors/coingecko.d.ts +2 -0
package/dist/ee/extractors/coingecko.js +134 -0
package/dist/ee/extractors/craigslist.d.ts +2 -0
package/dist/ee/extractors/craigslist.js +92 -0
package/dist/ee/extractors/devto.d.ts +2 -0
package/dist/ee/extractors/devto.js +135 -0
package/dist/ee/extractors/ebay.d.ts +2 -0
package/dist/ee/extractors/ebay.js +90 -0
package/dist/ee/extractors/espn.d.ts +2 -0
package/dist/ee/extractors/espn.js +255 -0
package/dist/ee/extractors/etsy.d.ts +2 -0
package/dist/ee/extractors/etsy.js +52 -0
package/dist/ee/extractors/facebook.d.ts +2 -0
package/dist/ee/extractors/facebook.js +46 -0
package/dist/ee/extractors/github.d.ts +2 -0
package/dist/ee/extractors/github.js +196 -0
package/dist/ee/extractors/google-flights.d.ts +2 -0
package/dist/ee/extractors/google-flights.js +176 -0
package/dist/ee/extractors/hackernews.d.ts +2 -0
package/dist/ee/extractors/hackernews.js +147 -0
package/dist/ee/extractors/imdb.d.ts +2 -0
package/dist/ee/extractors/imdb.js +172 -0
package/dist/ee/extractors/index.d.ts +26 -0
package/dist/ee/extractors/index.js +247 -0
package/dist/ee/extractors/instagram.d.ts +2 -0
package/dist/ee/extractors/instagram.js +102 -0
package/dist/ee/extractors/kalshi.d.ts +2 -0
package/dist/ee/extractors/kalshi.js +115 -0
package/dist/ee/extractors/kayak-cars.d.ts +2 -0
package/dist/ee/extractors/kayak-cars.js +270 -0
package/dist/ee/extractors/linkedin.d.ts +2 -0
package/dist/ee/extractors/linkedin.js +113 -0
package/dist/ee/extractors/medium.d.ts +2 -0
package/dist/ee/extractors/medium.js +130 -0
package/dist/ee/extractors/news.d.ts +4 -0
package/dist/ee/extractors/news.js +173 -0
package/dist/ee/extractors/npm.d.ts +2 -0
package/dist/ee/extractors/npm.js +86 -0
package/dist/ee/extractors/pdf.d.ts +2 -0
package/dist/ee/extractors/pdf.js +108 -0
package/dist/ee/extractors/pinterest.d.ts +2 -0
package/dist/ee/extractors/pinterest.js +34 -0
package/dist/ee/extractors/polymarket.d.ts +2 -0
package/dist/ee/extractors/polymarket.js +162 -0
package/dist/ee/extractors/producthunt.d.ts +2 -0
package/dist/ee/extractors/producthunt.js +88 -0
package/dist/ee/extractors/pubmed.d.ts +2 -0
package/dist/ee/extractors/pubmed.js +162 -0
package/dist/ee/extractors/pypi.d.ts +2 -0
package/dist/ee/extractors/pypi.js +80 -0
package/dist/ee/extractors/reddit.d.ts +2 -0
package/dist/ee/extractors/reddit.js +308 -0
package/dist/ee/extractors/redfin.d.ts +2 -0
package/dist/ee/extractors/redfin.js +156 -0
package/dist/ee/extractors/semanticscholar.d.ts +2 -0
package/dist/ee/extractors/semanticscholar.js +131 -0
package/dist/ee/extractors/shared.d.ts +12 -0
package/dist/ee/extractors/shared.js +76 -0
package/dist/ee/extractors/soundcloud.d.ts +2 -0
package/dist/ee/extractors/soundcloud.js +34 -0
package/dist/ee/extractors/sportsbetting.d.ts +2 -0
package/dist/ee/extractors/sportsbetting.js +37 -0
package/dist/ee/extractors/spotify.d.ts +2 -0
package/dist/ee/extractors/spotify.js +34 -0
package/dist/ee/extractors/stackoverflow.d.ts +2 -0
package/dist/ee/extractors/stackoverflow.js +61 -0
package/dist/ee/extractors/substack.d.ts +2 -0
package/dist/ee/extractors/substack.js +115 -0
package/dist/ee/extractors/substackroot.d.ts +2 -0
package/dist/ee/extractors/substackroot.js +46 -0
package/dist/ee/extractors/tiktok.d.ts +2 -0
package/dist/ee/extractors/tiktok.js +29 -0
package/dist/ee/extractors/tradingview.d.ts +2 -0
package/dist/ee/extractors/tradingview.js +176 -0
package/dist/ee/extractors/twitch.d.ts +2 -0
package/dist/ee/extractors/twitch.js +36 -0
package/dist/ee/extractors/twitter.d.ts +2 -0
package/dist/ee/extractors/twitter.js +327 -0
package/dist/ee/extractors/types.d.ts +14 -0
package/dist/ee/extractors/types.js +1 -0
package/dist/ee/extractors/walmart.d.ts +2 -0
package/dist/ee/extractors/walmart.js +50 -0
package/dist/ee/extractors/weather.d.ts +2 -0
package/dist/ee/extractors/weather.js +133 -0
package/dist/ee/extractors/wikipedia.d.ts +4 -0
package/dist/ee/extractors/wikipedia.js +103 -0
package/dist/ee/extractors/yelp.d.ts +2 -0
package/dist/ee/extractors/yelp.js +216 -0
package/dist/ee/extractors/youtube.d.ts +2 -0
package/dist/ee/extractors/youtube.js +189 -0
package/dist/ee/extractors/zillow.d.ts +54 -0
package/dist/ee/extractors/zillow.js +247 -0
package/dist/ee/extractors-re-export.d.ts +1 -0
package/dist/ee/extractors-re-export.js +1 -0
package/dist/{server/premium/index.js → ee/premium-hooks.js} +2 -2
package/dist/{server/premium → ee}/swr-cache.d.ts +1 -1
package/dist/{server/premium → ee}/swr-cache.js +1 -1
package/dist/server/app.js +8 -0
package/dist/server/bull-queues.d.ts +1 -0
package/dist/server/routes/feed.d.ts +15 -0
package/dist/server/routes/feed.js +311 -0
package/dist/server/routes/fetch-queue.js +1 -0
package/dist/server/routes/fetch.js +120 -2
package/dist/server/routes/go.d.ts +14 -0
package/dist/server/routes/go.js +81 -0
package/dist/server/routes/smart-search.d.ts +16 -3
package/dist/server/routes/smart-search.js +1875 -117
package/dist/types.d.ts +4 -0
package/package.json +13 -2
package/dist/core/cloak-fetch.d.ts +0 -42
package/dist/core/cloak-fetch.js +0 -148
package/dist/core/cycle-fetch.d.ts +0 -26
package/dist/core/cycle-fetch.js +0 -98
package/dist/core/domain-extractors-basic.d.ts +0 -36
package/dist/core/domain-extractors-basic.js +0 -28
package/dist/core/domain-extractors-public.d.ts +0 -20
package/dist/core/domain-extractors-public.js +0 -35
package/dist/core/domain-extractors.d.ts +0 -48
package/dist/core/domain-extractors.js +0 -6342
package/dist/core/search-fallback.d.ts +0 -28
package/dist/core/search-fallback.js +0 -209
package/dist/core/stealth-patches.d.ts +0 -14
package/dist/core/stealth-patches.js +0 -20
package/dist/server/premium/challenge.d.ts +0 -1
package/dist/server/premium/challenge.js +0 -1
package/dist/server/premium/extractors.d.ts +0 -1
package/dist/server/premium/extractors.js +0 -1
/package/dist/{server/premium → ee}/domain-intel.js +0 -0
/package/dist/{server/premium/index.d.ts → ee/premium-hooks.d.ts} +0 -0
/package/dist/{server/premium → ee}/spa-detection.d.ts +0 -0
/package/dist/{server/premium → ee}/spa-detection.js +0 -0
/package/dist/{server/premium → ee}/stability.d.ts +0 -0
/package/dist/{server/premium → ee}/stability.js +0 -0

package/dist/cli/commands/fetch.js CHANGED Viewed

@@ -289,6 +289,7 @@ export async function runFetch(url, options) {
             format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
             budget: null, // Budget excluded from cache key — cache stores full content
             readable: options.readable || false,
+            noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
         };
         const cachedResult = getCache(url, cacheOptions);
         if (cachedResult) {
@@ -603,6 +604,7 @@ export async function runFetch(url, options) {
             headers,
             cookies: options.cookie,
             raw: options.raw || false,
+            noDomainApi: options.skipDomainApi || false,
             lite: options.lite || false,
             actions,
             maxTokens: options.maxTokens,
@@ -724,6 +726,16 @@ export async function runFetch(url, options) {
                 ? ` [${result.domainData.domain}:${result.domainData.type}]`
                 : '';
             spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
+            // Smart hints — suggest features the user might not know about
+            if (!options.silent && !options.json && !options.skipDomainApi) {
+                if (result.method === 'domain-api') {
+                    const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
+                    console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
+                }
+            }
+            if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
+                console.error(`\x1b[33m💡 Tip: Page returned very little content. Try --render for JavaScript-heavy sites or --stealth if blocked.\x1b[0m`);
+            }
         }
         // Show metadata header
         const pageTitle = result.metadata?.title || result.title;
@@ -1176,6 +1188,7 @@ export function registerFetchCommands(program) {
         .option('--images', 'Output image URLs from the page')
         .option('--meta', 'Output only the page metadata (title, description, author, etc.)')
         .option('--raw', 'Return full page without smart content extraction')
+        .option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
         .option('--full', 'Alias for --raw — full page content, no budget')
         .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
         .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')

package/dist/cli/utils.js CHANGED Viewed

@@ -35,7 +35,14 @@ export async function checkForUpdates() {
         if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
             // Skip update notice in silent mode
             if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
-                console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
+                const msg = `Update available: ${cliVersion} → ${latest}`;
+                const cmd = 'npm i -g webpeel@latest';
+                const width = Math.max(msg.length, cmd.length) + 4;
+                const line = '─'.repeat(width);
+                console.error(`\n\x1b[33m╭${line}╮\x1b[0m`);
+                console.error(`\x1b[33m│\x1b[0m  ${msg.padEnd(width - 2)}  \x1b[33m│\x1b[0m`);
+                console.error(`\x1b[33m│\x1b[0m  Run: \x1b[36m${cmd}\x1b[0m${' '.repeat(width - 6 - cmd.length)}  \x1b[33m│\x1b[0m`);
+                console.error(`\x1b[33m╰${line}╯\x1b[0m\n`);
             }
         }
     }
@@ -208,6 +215,8 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
         params.set('budget', String(options.budget));
     if (options.question)
         params.set('question', options.question);
+    if (options.noDomainApi)
+        params.set('noDomainApi', 'true');
     const res = await fetch(`${apiUrl}/v1/fetch?${params}`, {
         headers: { Authorization: `Bearer ${apiKey}` },
         signal: AbortSignal.timeout(60000),

package/dist/core/http-fetch.js CHANGED Viewed

@@ -154,10 +154,12 @@ export function createAbortError() {
  * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
  */
 export const PROXY_PREFERRED_DOMAINS = [
+    // Social / content
     'reddit.com',
     'old.reddit.com',
     'forbes.com',
     'fortune.com',
+    // Auto / cars
     'cargurus.com',
     'edmunds.com',
     'cars.com',
@@ -165,14 +167,29 @@ export const PROXY_PREFERRED_DOMAINS = [
     'autotrader.com',
     'carfax.com',
     'tesla.com',
+    'motortrend.com',
+    'jdpower.com',
+    // Finance / home
     'nerdwallet.com',
     'bankrate.com',
     'homeadvisor.com',
     'angi.com',
+    // EV / auto news
     'insideevs.com',
     'electrek.co',
-    'motortrend.com',
-    'jdpower.com',
+    // Restaurants / food
+    'yelp.com',
+    // Travel
+    'kayak.com',
+    'booking.com',
+    'expedia.com',
+    'tripadvisor.com',
+    'hotels.com',
+    // Shopping / products
+    'amazon.com',
+    'bestbuy.com',
+    'walmart.com',
+    'target.com',
 ];
 /**
  * Returns true if the URL's domain is on the proxy-preferred blocklist.

package/dist/core/pipeline.js CHANGED Viewed

@@ -341,7 +341,8 @@ export async function fetchContent(ctx) {
     const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
     // Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
     // This avoids expensive browser fetches that often get blocked
-    if (hasDomainExtractor(ctx.url)) {
+    // Skip if noDomainApi is set — user wants raw page content, not API shortcut
+    if (hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi) {
         try {
             ctx.timer.mark('domainApiFirst');
             const ddResult = await runDomainExtract('', ctx.url);
@@ -1078,7 +1079,7 @@ export async function postProcess(ctx) {
     }
     // Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
     // Fires when URL matches a known domain. Replaces content with clean markdown.
-    if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
+    if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled && !ctx.options.noDomainApi) {
         try {
             ctx.timer.mark('domainExtract');
             // Try raw HTML first, then fall back to readability-processed content

package/dist/core/schema-templates.js CHANGED Viewed

@@ -57,42 +57,55 @@ export const SCHEMA_TEMPLATES = {
         name: 'Event',
         description: 'Extract event information',
         fields: {
-            name: 'event name or title',
-            date: 'event date and time',
-            location: 'venue or location',
-            description: 'event description',
-            price: 'ticket price or cost',
-            organizer: 'event organizer',
-            url: 'registration or ticket URL',
+            name: 'What is the name of this event?',
+            date: 'When does this event take place?',
+            time: 'What time does this event start?',
+            location: 'Where is this event held?',
+            price: 'How much does this event cost?',
+            description: 'What is this event about?',
+            organizer: 'Who is organizing this event?',
         },
     },
     recipe: {
         name: 'Recipe',
         description: 'Extract recipe information from cooking sites',
         fields: {
-            title: 'recipe name',
-            ingredients: 'list of ingredients with quantities',
-            instructions: 'cooking steps or directions',
-            prepTime: 'preparation time',
-            cookTime: 'cooking time',
-            servings: 'number of servings',
-            calories: 'calories per serving',
-            author: 'recipe author or source',
+            name: 'What is the name of this recipe?',
+            ingredients: 'What ingredients are needed? List all.',
+            steps: 'What are the cooking steps or instructions?',
+            prepTime: 'How long does preparation take?',
+            cookTime: 'How long does cooking take?',
+            servings: 'How many servings does this recipe make?',
+            calories: 'How many calories per serving?',
+            rating: 'What is the recipe rating?',
         },
     },
     job: {
         name: 'Job',
         description: 'Extract job posting information',
         fields: {
-            title: 'job title',
-            company: 'company name',
-            location: 'job location',
-            salary: 'salary range or compensation',
-            description: 'job description',
-            requirements: 'required qualifications or skills',
-            type: 'job type (full-time, part-time, remote)',
-            posted: 'date posted',
-            applyUrl: 'application URL or link',
+            title: 'What is the job title?',
+            company: 'What company is hiring?',
+            location: 'Where is the job located?',
+            salary: 'What is the salary or compensation range?',
+            type: 'Is this full-time, part-time, contract, or remote?',
+            requirements: 'What are the key requirements or qualifications?',
+            description: 'What is the job description?',
+            applyUrl: 'What is the URL or method to apply?',
+        },
+    },
+    business: {
+        name: 'Business',
+        description: 'Extract business/company information',
+        fields: {
+            name: 'What is the business name?',
+            address: 'What is the full address?',
+            phone: 'What is the phone number?',
+            hours: 'What are the business hours?',
+            rating: 'What is the business rating?',
+            reviewCount: 'How many reviews does this business have?',
+            website: 'What is the business website URL?',
+            categories: 'What type of business is this?',
         },
     },
     review: {

package/dist/core/search-provider.d.ts CHANGED Viewed

@@ -19,6 +19,8 @@ export interface WebSearchResult {
     snippet: string;
     /** Relevance score (0–1) based on keyword overlap with query. Added by filterRelevantResults. */
     relevanceScore?: number;
+    /** Thumbnail/image URL from SearXNG results (img_src or thumbnail field). */
+    imageUrl?: string;
 }
 export interface WebSearchOptions {
     /** Number of results (1-10) */

package/dist/core/search-provider.js CHANGED Viewed

@@ -1066,8 +1066,15 @@ export class DuckDuckGoProvider {
                 if (searxResults.length > 0) {
                     providerStats.record('searxng', true);
                     log.debug(`source=searxng returned ${searxResults.length} results`);
-                    const filtered = filterRelevantResults(searxResults, query);
-                    return filtered.length > 0 ? filtered : searxResults;
+                    // Map SearXNG results to WebSearchResult (description → snippet, imageUrl passthrough)
+                    const mapped = searxResults.map(r => ({
+                        title: r.title,
+                        url: r.url,
+                        snippet: r.description ?? '',
+                        imageUrl: r.imageUrl,
+                    }));
+                    const filtered = filterRelevantResults(mapped, query);
+                    return filtered.length > 0 ? filtered : mapped;
                 }
                 providerStats.record('searxng', false);
                 log.debug('SearXNG returned 0 results, falling through to DDG');

package/dist/core/searxng-provider.d.ts CHANGED Viewed

@@ -16,6 +16,7 @@ export interface SearXNGSearchResult {
     description?: string;
     publishedDate?: string;
     score?: number;
+    imageUrl?: string;
 }
 /**
  * Fetches search results from a SearXNG instance.

package/dist/core/searxng-provider.js CHANGED Viewed

@@ -69,6 +69,7 @@ export async function searchViaSearXNG(query, options = {}) {
                 description: r.content ?? undefined,
                 publishedDate: r.publishedDate ?? undefined,
                 score: r.score ?? undefined,
+                imageUrl: r.img_src ?? r.thumbnail ?? undefined,
             });
             if (output.length >= count)
                 break;

package/dist/ee/challenge-re-export.d.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ export { solveChallenge } from './challenge-solver.js';

package/dist/ee/challenge-re-export.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export { solveChallenge } from './challenge-solver.js';

package/dist/{core → ee}/challenge-solver.d.ts RENAMED Viewed

@@ -17,7 +17,7 @@
  *    // result.cookies = ["cf_clearance=...", ...]
  *  }
  */
-import type { ChallengeType } from './challenge-detection.js';
+import type { ChallengeType } from '../core/challenge-detection.js';
 export interface ImageCaptchaResult {
     solved: boolean;
     rounds: number;

package/dist/{core → ee}/challenge-solver.js RENAMED Viewed

@@ -17,8 +17,8 @@
  *    // result.cookies = ["cf_clearance=...", ...]
  *  }
  */
-import { cacheCookiesForUrl } from './cookie-cache.js';
-import { createLogger } from './logger.js';
+import { cacheCookiesForUrl } from '../core/cookie-cache.js';
+import { createLogger } from '../core/logger.js';
 const log = createLogger('challenge-solver');
 // ── Image CAPTCHA solver constants ────────────────────────────────────────────
 const OLLAMA_VISION_URL = 'http://178.156.229.86:11435/api/generate';
@@ -372,7 +372,7 @@ export async function solveChallenge(url, challengeType, html, options = {}) {
 async function solveCaptchaWithVision(url, _html, timeoutMs, proxy) {
     let page = null;
     try {
-        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
+        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
         const browser = await getStealthBrowser();
         const vp = getRandomViewport();
         const ctx = await browser.newContext({
@@ -446,7 +446,7 @@ async function solveCloudflare(url, _html, timeoutMs, proxy) {
     let browser = null;
     let page = null;
     try {
-        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
+        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
         browser = await getStealthBrowser();
         const vp = getRandomViewport();
         const ctx = await browser.newContext({
@@ -528,7 +528,7 @@ async function solveCloudflare(url, _html, timeoutMs, proxy) {
 async function solveWithStealthBrowser(url, _html, timeoutMs, proxy, challengeType) {
     let page = null;
     try {
-        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
+        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
         const browser = await getStealthBrowser();
         const vp = getRandomViewport();
         const ctx = await browser.newContext({

package/dist/ee/domain-extractors.d.ts ADDED Viewed

@@ -0,0 +1,8 @@
+/**
+ * Domain-aware structured extractors for WebPeel.
+ *
+ * This file re-exports from individual extractor files for backward compatibility.
+ * Each extractor now lives in its own file under src/ee/extractors/.
+ */
+export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';
+export type { DomainExtractResult, DomainExtractor } from './extractors/index.js';

package/dist/ee/domain-extractors.js ADDED Viewed

@@ -0,0 +1,8 @@
+/**
+ * Domain-aware structured extractors for WebPeel.
+ *
+ * This file re-exports from individual extractor files for backward compatibility.
+ * Each extractor now lives in its own file under src/ee/extractors/.
+ */
+// Re-exported from individual extractor files for backward compatibility
+export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';

package/dist/{server/premium → ee}/domain-intel.d.ts RENAMED Viewed

@@ -11,6 +11,6 @@
  *
  * This module is NOT shipped in the npm package.
  */
-import type { StrategyHooks } from '../../core/strategy-hooks.js';
+import type { StrategyHooks } from '../core/strategy-hooks.js';
 export declare function clearDomainIntel(): void;
 export declare function createDomainIntelHooks(): Pick<StrategyHooks, 'getDomainRecommendation' | 'recordDomainResult'>;

package/dist/ee/extractors/allrecipes.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import type { DomainExtractResult } from './types.js';
2	+ export declare function allrecipesExtractor(html: string, url: string): Promise<DomainExtractResult | null>;

package/dist/ee/extractors/allrecipes.js ADDED Viewed

@@ -0,0 +1,120 @@
+import { tryParseJson } from './shared.js';
+// ---------------------------------------------------------------------------
+// 15. Allrecipes (Recipe Sites) extractor
+// ---------------------------------------------------------------------------
+export async function allrecipesExtractor(html, url) {
+    try {
+        const { load } = await import('cheerio');
+        const $ = load(html);
+        // Try Schema.org Recipe JSON-LD first
+        let recipe = null;
+        $('script[type="application/ld+json"]').each((_, el) => {
+            if (recipe)
+                return;
+            const raw = $(el).html() || '';
+            const parsed = tryParseJson(raw);
+            // Can be an array or direct object
+            const candidates = Array.isArray(parsed) ? parsed : [parsed];
+            for (const item of candidates) {
+                if (item?.['@type'] === 'Recipe' || (Array.isArray(item?.['@type']) && item['@type'].includes('Recipe'))) {
+                    recipe = item;
+                    break;
+                }
+                // Sometimes it's nested in @graph
+                if (item?.['@graph']) {
+                    const graphRecipe = item['@graph'].find((g) => g?.['@type'] === 'Recipe');
+                    if (graphRecipe) {
+                        recipe = graphRecipe;
+                        break;
+                    }
+                }
+            }
+        });
+        let title;
+        let ingredients = [];
+        let instructions = [];
+        let prepTime = '';
+        let cookTime = '';
+        let totalTime = '';
+        let servings = '';
+        let rating = '';
+        let reviewCount = '';
+        let description = '';
+        if (recipe) {
+            title = recipe.name || '';
+            description = recipe.description || '';
+            ingredients = (recipe.recipeIngredient || []).map((i) => i.trim());
+            // Instructions can be strings or HowToStep objects
+            const rawInstructions = recipe.recipeInstructions || [];
+            for (const step of rawInstructions) {
+                if (typeof step === 'string')
+                    instructions.push(step.trim());
+                else if (step.text)
+                    instructions.push(step.text.trim());
+                else if (step['@type'] === 'HowToSection' && step.itemListElement) {
+                    for (const s of step.itemListElement) {
+                        if (s.text)
+                            instructions.push(s.text.trim());
+                    }
+                }
+            }
+            // Parse ISO 8601 duration (PT30M, PT1H30M)
+            const parseDuration = (d) => {
+                if (!d)
+                    return '';
+                const h = d.match(/(\d+)H/)?.[1];
+                const m = d.match(/(\d+)M/)?.[1];
+                return [h ? `${h}h` : '', m ? `${m}m` : ''].filter(Boolean).join(' ');
+            };
+            prepTime = parseDuration(recipe.prepTime || '');
+            cookTime = parseDuration(recipe.cookTime || '');
+            totalTime = parseDuration(recipe.totalTime || '');
+            servings = String(recipe.recipeYield || '');
+            rating = recipe.aggregateRating?.ratingValue ? String(recipe.aggregateRating.ratingValue) : '';
+            reviewCount = recipe.aggregateRating?.reviewCount ? String(recipe.aggregateRating.reviewCount) : '';
+        }
+        else {
+            // HTML fallback
+            title = $('h1').first().text().trim() ||
+                $('meta[property="og:title"]').attr('content') || '';
+            description = $('meta[property="og:description"]').attr('content') || '';
+            $('[class*="ingredient"]').each((_, el) => {
+                const text = $(el).text().trim();
+                if (text && text.length < 200)
+                    ingredients.push(text);
+            });
+            $('[class*="instruction"] li, [class*="step"] li').each((_, el) => {
+                const text = $(el).text().trim();
+                if (text)
+                    instructions.push(text);
+            });
+        }
+        if (!title)
+            return null;
+        const structured = {
+            title, description, ingredients, instructions,
+            prepTime, cookTime, totalTime, servings, rating, reviewCount, url,
+        };
+        const timeParts = [
+            prepTime ? `Prep: ${prepTime}` : '',
+            cookTime ? `Cook: ${cookTime}` : '',
+            totalTime ? `Total: ${totalTime}` : '',
+        ].filter(Boolean).join(' | ');
+        const metaLine = [
+            timeParts,
+            servings ? `Servings: ${servings}` : '',
+            rating ? `Rating: ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '',
+        ].filter(Boolean).join(' | ');
+        const ingredientsMd = ingredients.length
+            ? `## Ingredients\n\n${ingredients.map(i => `- ${i}`).join('\n')}`
+            : '';
+        const instructionsMd = instructions.length
+            ? `## Instructions\n\n${instructions.map((s, i) => `${i + 1}. ${s}`).join('\n')}`
+            : '';
+        const cleanContent = `# 🍽️ ${title}\n\n${metaLine ? `*${metaLine}*\n\n` : ''}${description ? description + '\n\n' : ''}${ingredientsMd}\n\n${instructionsMd}`.trim();
+        return { domain: 'allrecipes.com', type: 'recipe', structured, cleanContent };
+    }
+    catch {
+        return null;
+    }
+}

package/dist/ee/extractors/amazon.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import type { DomainExtractResult } from './types.js';
2	+ export declare function amazonExtractor(html: string, url: string): Promise<DomainExtractResult | null>;

package/dist/ee/extractors/amazon.js ADDED Viewed

@@ -0,0 +1,78 @@
+import { tryParseJson } from './shared.js';
+// ---------------------------------------------------------------------------
+// 12. Amazon Products extractor
+// ---------------------------------------------------------------------------
+export async function amazonExtractor(html, url) {
+    try {
+        const { load } = await import('cheerio');
+        const $ = load(html);
+        // Extract from JSON-LD first
+        let jsonLdData = null;
+        $('script[type="application/ld+json"]').each((_, el) => {
+            if (jsonLdData)
+                return;
+            const raw = $(el).html() || '';
+            const parsed = tryParseJson(raw);
+            if (parsed?.['@type'] === 'Product')
+                jsonLdData = parsed;
+        });
+        // Meta tag fallbacks
+        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
+        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
+        const ogImage = $('meta[property="og:image"]').attr('content') || '';
+        // HTML selectors
+        const title = jsonLdData?.name ||
+            $('#productTitle').text().trim() ||
+            $('#title').text().trim() ||
+            ogTitle;
+        if (!title)
+            return null;
+        const priceWhole = $('#priceblock_ourprice').text().trim() ||
+            $('.a-price .a-offscreen').first().text().trim() ||
+            $('[data-asin-price]').first().attr('data-asin-price') || '';
+        const rating = jsonLdData?.aggregateRating?.ratingValue ||
+            $('#acrPopover .a-size-base.a-color-base').first().text().trim() ||
+            $('span[data-hook="rating-out-of-text"]').text().trim() || '';
+        const reviewCount = jsonLdData?.aggregateRating?.reviewCount ||
+            $('#acrCustomerReviewText').text().replace(/[^0-9,]/g, '').trim() || '';
+        const availability = jsonLdData?.offers?.availability?.replace('https://schema.org/', '') ||
+            $('#availability span').first().text().trim() || '';
+        const description = jsonLdData?.description ||
+            $('#feature-bullets .a-list-item').map((_, el) => $(el).text().trim()).get().join('\n') ||
+            $('#productDescription p').text().trim() ||
+            ogDescription;
+        const features = [];
+        $('#feature-bullets li').each((_, el) => {
+            const text = $(el).text().trim();
+            if (text && !text.includes('Make sure this fits'))
+                features.push(text);
+        });
+        // ASIN from URL
+        const asinMatch = url.match(/\/dp\/([A-Z0-9]{10})/i);
+        const asin = asinMatch?.[1] || '';
+        const structured = {
+            title,
+            price: priceWhole,
+            rating,
+            reviewCount,
+            availability,
+            description,
+            features,
+            asin,
+            image: ogImage,
+            url,
+        };
+        const ratingLine = rating ? `\n**Rating:** ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '';
+        const priceLine = priceWhole ? `\n**Price:** ${priceWhole}` : '';
+        const availLine = availability ? `\n**Availability:** ${availability}` : '';
+        const featuresSection = features.length
+            ? `\n\n## Features\n\n${features.map(f => `- ${f}`).join('\n')}`
+            : '';
+        const descSection = description ? `\n\n## Description\n\n${description.substring(0, 1000)}` : '';
+        const cleanContent = `# 🛒 ${title}${priceLine}${ratingLine}${availLine}${descSection}${featuresSection}`;
+        return { domain: 'amazon.com', type: 'product', structured, cleanContent };
+    }
+    catch {
+        return null;
+    }
+}

package/dist/ee/extractors/arxiv.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import type { DomainExtractResult } from './types.js';
2	+ export declare function arxivExtractor(_html: string, url: string): Promise<DomainExtractResult | null>;