npm - webpeel - Versions diffs - 0.16.0 → 0.17.0 - Mend

webpeel 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113)

package/LICENSE +11 -657
package/README.md +246 -325
package/dist/cli.js +330 -73
package/dist/cli.js.map +1 -1
package/dist/core/browser-fetch.d.ts +12 -0
package/dist/core/browser-fetch.d.ts.map +1 -1
package/dist/core/browser-fetch.js +70 -17
package/dist/core/browser-fetch.js.map +1 -1
package/dist/core/cf-worker-proxy.d.ts +33 -0
package/dist/core/cf-worker-proxy.d.ts.map +1 -0
package/dist/core/cf-worker-proxy.js +88 -0
package/dist/core/cf-worker-proxy.js.map +1 -0
package/dist/core/chunker.d.ts +47 -0
package/dist/core/chunker.d.ts.map +1 -0
package/dist/core/chunker.js +250 -0
package/dist/core/chunker.js.map +1 -0
package/dist/core/cloak-fetch.d.ts +43 -0
package/dist/core/cloak-fetch.d.ts.map +1 -0
package/dist/core/cloak-fetch.js +141 -0
package/dist/core/cloak-fetch.js.map +1 -0
package/dist/core/crawl-checkpoint.d.ts +55 -0
package/dist/core/crawl-checkpoint.d.ts.map +1 -0
package/dist/core/crawl-checkpoint.js +105 -0
package/dist/core/crawl-checkpoint.js.map +1 -0
package/dist/core/crawler.d.ts +5 -1
package/dist/core/crawler.d.ts.map +1 -1
package/dist/core/crawler.js +60 -5
package/dist/core/crawler.js.map +1 -1
package/dist/core/cycle-fetch.d.ts +27 -0
package/dist/core/cycle-fetch.d.ts.map +1 -0
package/dist/core/cycle-fetch.js +99 -0
package/dist/core/cycle-fetch.js.map +1 -0
package/dist/core/domain-extractors.d.ts.map +1 -1
package/dist/core/domain-extractors.js +754 -14
package/dist/core/domain-extractors.js.map +1 -1
package/dist/core/google-cache.d.ts +30 -0
package/dist/core/google-cache.d.ts.map +1 -0
package/dist/core/google-cache.js +181 -0
package/dist/core/google-cache.js.map +1 -0
package/dist/core/markdown.d.ts +11 -0
package/dist/core/markdown.d.ts.map +1 -1
package/dist/core/markdown.js +43 -0
package/dist/core/markdown.js.map +1 -1
package/dist/core/peel-tls.d.ts +26 -0
package/dist/core/peel-tls.d.ts.map +1 -0
package/dist/core/peel-tls.js +221 -0
package/dist/core/peel-tls.js.map +1 -0
package/dist/core/pipeline.d.ts +5 -1
package/dist/core/pipeline.d.ts.map +1 -1
package/dist/core/pipeline.js +269 -21
package/dist/core/pipeline.js.map +1 -1
package/dist/core/schema-postprocess.d.ts +33 -0
package/dist/core/schema-postprocess.d.ts.map +1 -0
package/dist/core/schema-postprocess.js +470 -0
package/dist/core/schema-postprocess.js.map +1 -0
package/dist/core/schema-templates.d.ts +20 -0
package/dist/core/schema-templates.d.ts.map +1 -0
package/dist/core/schema-templates.js +131 -0
package/dist/core/schema-templates.js.map +1 -0
package/dist/core/search-fallback.d.ts +28 -0
package/dist/core/search-fallback.d.ts.map +1 -0
package/dist/core/search-fallback.js +185 -0
package/dist/core/search-fallback.js.map +1 -0
package/dist/core/search-provider.d.ts +47 -4
package/dist/core/search-provider.d.ts.map +1 -1
package/dist/core/search-provider.js +278 -7
package/dist/core/search-provider.js.map +1 -1
package/dist/core/stealth-patches.d.ts +58 -0
package/dist/core/stealth-patches.d.ts.map +1 -0
package/dist/core/stealth-patches.js +340 -0
package/dist/core/stealth-patches.js.map +1 -0
package/dist/core/strategies.d.ts +20 -0
package/dist/core/strategies.d.ts.map +1 -1
package/dist/core/strategies.js +284 -48
package/dist/core/strategies.js.map +1 -1
package/dist/core/strategy-hooks.d.ts +1 -1
package/dist/core/strategy-hooks.d.ts.map +1 -1
package/dist/index.d.ts +11 -0
package/dist/index.d.ts.map +1 -1
package/dist/index.js +37 -15
package/dist/index.js.map +1 -1
package/dist/mcp/server.js +109 -4
package/dist/mcp/server.js.map +1 -1
package/dist/server/app.d.ts.map +1 -1
package/dist/server/app.js +29 -0
package/dist/server/app.js.map +1 -1
package/dist/server/middleware/rate-limit.d.ts +2 -1
package/dist/server/middleware/rate-limit.d.ts.map +1 -1
package/dist/server/middleware/rate-limit.js +24 -8
package/dist/server/middleware/rate-limit.js.map +1 -1
package/dist/server/routes/agent.d.ts +4 -0
package/dist/server/routes/agent.d.ts.map +1 -1
package/dist/server/routes/agent.js +196 -9
package/dist/server/routes/agent.js.map +1 -1
package/dist/server/routes/batch.js +5 -5
package/dist/server/routes/batch.js.map +1 -1
package/dist/server/routes/compat.d.ts.map +1 -1
package/dist/server/routes/compat.js +1 -0
package/dist/server/routes/compat.js.map +1 -1
package/dist/server/routes/fetch.d.ts.map +1 -1
package/dist/server/routes/fetch.js +60 -6
package/dist/server/routes/fetch.js.map +1 -1
package/dist/server/routes/mcp.d.ts.map +1 -1
package/dist/server/routes/mcp.js +103 -2
package/dist/server/routes/mcp.js.map +1 -1
package/dist/server/routes/search.js +1 -1
package/dist/server/routes/search.js.map +1 -1
package/dist/types.d.ts +55 -4
package/dist/types.d.ts.map +1 -1
package/dist/types.js +4 -1
package/dist/types.js.map +1 -1
package/llms.txt +55 -125
package/package.json +15 -1

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -12,6 +12,50 @@
  */
 import { simpleFetch } from './fetcher.js';
 // ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/**
+ * Resolve a Reddit share URL (/s/CODE) to the URL it redirects to.
+ *
+ * Share links are short redirect URLs. This issues a single GET and reads the
+ * Location header of a 3xx reply; it follows one hop only.
+ *
+ * @param {string} url - Candidate URL. Returned untouched unless its path contains "/s/".
+ * @returns {Promise<string>} The redirect target, or `url` itself when there is no
+ *   redirect or anything goes wrong (unparseable URL, unsupported protocol, network
+ *   error, timeout, malformed Location header). Never rejects.
+ */
+async function resolveRedditShareUrl(url) {
+    try {
+        // Parsed inside the try so a malformed URL falls back to the input instead of throwing.
+        const urlObj = new URL(url);
+        // Match /r/subreddit/s/CODE or /s/CODE patterns
+        if (!urlObj.pathname.includes('/s/'))
+            return url;
+        const { default: https } = await import('https');
+        const { default: http } = await import('http');
+        // `await` is required here: without it a synchronous throw from client.get()
+        // (e.g. unsupported protocol) would reject past the catch block below.
+        return await new Promise((resolve) => {
+            // Use the parsed protocol so an upper-case scheme still selects the right client.
+            const client = urlObj.protocol === 'https:' ? https : http;
+            const req = client.get(url, {
+                headers: { 'User-Agent': 'WebPeel/0.17.0 (web data platform; https://webpeel.dev) Node.js' },
+                timeout: 10000,
+            }, (res) => {
+                const { statusCode, headers } = res;
+                const location = headers.location;
+                let target = url; // No redirect: keep the original
+                // Follow redirect (one hop)
+                if (statusCode && statusCode >= 300 && statusCode < 400 && location) {
+                    try {
+                        // Absolute http(s) targets are passed through verbatim; anything else is
+                        // resolved against the request URL.
+                        target = /^https?:\/\//.test(location)
+                            ? location
+                            : new URL(location, url).href;
+                    }
+                    catch {
+                        // Malformed Location header: an exception here would escape the
+                        // response callback uncaught, so fall back to the original URL.
+                        target = url;
+                    }
+                }
+                res.resume(); // Consume response
+                resolve(target);
+            });
+            req.on('error', () => resolve(url));
+            req.on('timeout', () => {
+                // The socket timeout only emits an event; the request must be torn down manually.
+                req.destroy();
+                resolve(url);
+            });
+        });
+    }
+    catch {
+        return url; // On any error, return original URL
+    }
+}
+// ---------------------------------------------------------------------------
 // Registry
 // ---------------------------------------------------------------------------
 const REGISTRY = [
@@ -19,6 +63,13 @@ const REGISTRY = [
     { match: (h) => h === 'reddit.com' || h === 'www.reddit.com' || h === 'old.reddit.com', extractor: redditExtractor },
     { match: (h) => h === 'github.com' || h === 'www.github.com', extractor: githubExtractor },
     { match: (h) => h === 'news.ycombinator.com', extractor: hackerNewsExtractor },
+    { match: (h) => h === 'en.wikipedia.org' || h === 'www.wikipedia.org' || /\w+\.wikipedia\.org/.test(h), extractor: wikipediaExtractor },
+    { match: (h) => h === 'youtube.com' || h === 'www.youtube.com' || h === 'youtu.be', extractor: youtubeExtractor },
+    { match: (h) => h === 'arxiv.org' || h === 'export.arxiv.org', extractor: arxivExtractor },
+    { match: (h) => h === 'stackoverflow.com' || h === 'www.stackoverflow.com', extractor: stackOverflowExtractor },
+    { match: (h) => h === 'www.npmjs.com' || h === 'npmjs.com', extractor: npmExtractor },
+    { match: (h) => h === 'www.bestbuy.com' || h === 'bestbuy.com', extractor: bestBuyExtractor },
+    { match: (h) => h === 'www.walmart.com' || h === 'walmart.com', extractor: walmartExtractor },
 ];
 /**
  * Returns the domain extractor for a URL, or null if none matches.
@@ -81,6 +132,23 @@ async function fetchJson(url, customHeaders) {
     });
     return tryParseJson(result.html);
 }
+/**
+ * Call fetchJson and retry with exponential backoff when the failure looks like rate limiting.
+ *
+ * A failure is retried only while attempts remain and its message contains
+ * "429", "rate" or "Too Many"; any other error is rethrown immediately.
+ * The wait before retry k (0-based) is baseDelayMs * 2^k milliseconds.
+ *
+ * @param {string} url - Passed through to fetchJson.
+ * @param {object} headers - Passed through to fetchJson.
+ * @param {number} [retries=2] - Number of retries after the first attempt.
+ * @param {number} [baseDelayMs=1000] - Delay before the first retry.
+ * @returns {Promise<*>} Whatever fetchJson resolves to.
+ */
+async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {
+    const retryMarkers = ['429', 'rate', 'Too Many'];
+    const looksRateLimited = (err) => retryMarkers.some((marker) => err.message?.includes(marker));
+    const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));
+    let attempt = 0;
+    while (attempt <= retries) {
+        try {
+            return await fetchJson(url, headers);
+        }
+        catch (err) {
+            // Out of attempts, or not a rate-limit style failure: surface the error as-is.
+            const attemptsExhausted = attempt >= retries;
+            if (attemptsExhausted || !looksRateLimited(err))
+                throw err;
+        }
+        await pause(baseDelayMs * 2 ** attempt);
+        attempt += 1;
+    }
+}
 // ---------------------------------------------------------------------------
 // 1. Twitter / X extractor
 // ---------------------------------------------------------------------------
@@ -161,6 +229,86 @@ async function twitterExtractor(html, url) {
     const isTweet = pathParts.includes('status');
     const type = isTweet ? 'tweet' : 'profile';
     const domain = 'twitter.com';
+    // --- Try FxTwitter API first (works from datacenter IPs, no auth needed) ---
+    const username = pathParts[0] || '';
+    if (isTweet) {
+        const statusId = pathParts[pathParts.indexOf('status') + 1];
+        if (statusId && username) {
+            try {
+                const fxUrl = `https://api.fxtwitter.com/${username}/status/${statusId}`;
+                const fxData = await fetchJson(fxUrl);
+                if (fxData && fxData.code === 200 && fxData.tweet) {
+                    const t = fxData.tweet;
+                    const structured = {
+                        author: {
+                            name: t.author?.name || '',
+                            handle: '@' + (t.author?.screen_name || ''),
+                            verified: t.author?.verified || false,
+                        },
+                        text: t.text || '',
+                        timestamp: t.created_at ? new Date(t.created_at).toISOString() : undefined,
+                        metrics: {
+                            likes: t.likes ?? 0,
+                            retweets: t.retweets ?? 0,
+                            replies: t.replies ?? 0,
+                            views: t.views ?? 0,
+                        },
+                        media: (t.media?.all || []).map((m) => m.url).filter(Boolean),
+                        quotedTweet: t.quote ? {
+                            text: t.quote.text || '',
+                            author: { name: t.quote.author?.name || '', handle: '@' + (t.quote.author?.screen_name || '') },
+                        } : null,
+                        source: 'fxtwitter',
+                    };
+                    const authorLine = `**${structured.author.name}** (${structured.author.handle})`;
+                    const timeLine = structured.timestamp ? `\n*${structured.timestamp}*` : '';
+                    const metricsLine = `\n\n💬 ${structured.metrics.replies}  🔁 ${structured.metrics.retweets}  ❤️ ${structured.metrics.likes}${structured.metrics.views ? `  👁 ${structured.metrics.views}` : ''}`;
+                    const mediaLine = structured.media.length ? `\n\n📷 Media: ${structured.media.join(', ')}` : '';
+                    const quotedLine = structured.quotedTweet
+                        ? `\n\n> **Quoted tweet by ${structured.quotedTweet.author?.name || 'unknown'}:** ${structured.quotedTweet.text}`
+                        : '';
+                    const cleanContent = `## 🐦 Tweet by ${authorLine}${timeLine}\n\n${structured.text}${quotedLine}${metricsLine}${mediaLine}`;
+                    return { domain, type, structured, cleanContent };
+                }
+            }
+            catch (e) {
+                if (process.env.DEBUG)
+                    console.debug('[webpeel]', 'FxTwitter API failed:', e instanceof Error ? e.message : e);
+            }
+        }
+    }
+    // --- Try FxTwitter for profiles ---
+    if (!isTweet && username) {
+        try {
+            const fxUrl = `https://api.fxtwitter.com/${username}`;
+            const fxData = await fetchJson(fxUrl);
+            if (fxData && fxData.code === 200 && fxData.user) {
+                const u = fxData.user;
+                const structured = {
+                    name: u.name || '',
+                    handle: '@' + (u.screen_name || ''),
+                    bio: u.description || '',
+                    followers: u.followers ?? 0,
+                    following: u.following ?? 0,
+                    tweets: u.tweets ?? 0,
+                    likes: u.likes ?? 0,
+                    verified: u.verification?.verified || false,
+                    location: u.location || '',
+                    created: u.joined || undefined,
+                    avatarUrl: u.avatar_url || null,
+                    bannerUrl: u.banner_url || null,
+                    website: u.website || null,
+                    source: 'fxtwitter',
+                };
+                const cleanContent = `## 🐦 @${(structured.handle || '').replace('@', '')} on X/Twitter\n\n**${structured.name}**${structured.verified ? ' ✓' : ''}\n${structured.bio || ''}\n\n📍 ${structured.location || 'N/A'}  |  👥 ${structured.followers?.toLocaleString() || 0} followers  |  Following: ${structured.following?.toLocaleString() || 0}  |  Tweets: ${structured.tweets?.toLocaleString() || 0}`;
+                return { domain, type: 'profile', structured, cleanContent };
+            }
+        }
+        catch (e) {
+            if (process.env.DEBUG)
+                console.debug('[webpeel]', 'FxTwitter profile API failed:', e instanceof Error ? e.message : e);
+        }
+    }
     // --- Try __NEXT_DATA__ JSON (SSR data) ---
     const nextDataMatch = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
     let structured = null;
@@ -262,23 +410,142 @@ function parseRedditComment(data, depth) {
     };
 }
 async function redditExtractor(_html, url) {
-    const urlObj = new URL(url);
+    // Resolve Reddit share URLs (/s/CODE) to actual post URLs before any processing
+    let workingUrl = url;
+    if (url.includes('/s/')) {
+        const resolved = await resolveRedditShareUrl(url);
+        if (resolved !== url) {
+            if (process.env.DEBUG)
+                console.debug('[webpeel]', `Reddit share URL resolved: ${url} → ${resolved}`);
+            workingUrl = resolved;
+        }
+    }
+    const urlObj = new URL(workingUrl);
     const path = urlObj.pathname;
     const domain = 'reddit.com';
+    // Normalize old.reddit.com → www.reddit.com for JSON API
+    const normalizedUrl = workingUrl.replace(/old\.reddit\.com/, 'www.reddit.com');
+    const REDDIT_UA = { 'User-Agent': 'WebPeel/0.17.0 (web data platform; https://webpeel.dev) Node.js' };
     // Detect page type
-    const isPost = /\/r\/[^/]+\/comments\//.test(path);
-    const isSubreddit = /^\/r\/[^/]+\/?$/.test(path);
+    const isPost = /\/r\/[^/]+\/comments\//.test(path) || /^\/comments\//.test(path);
+    const isGallery = /\/gallery\//.test(path);
+    // Subreddit with any sort/filter: /r/sub, /r/sub/, /r/sub/hot, /r/sub/top, /r/sub/new, /r/sub/rising
+    const isSubreddit = /^\/r\/[^/]+\/?$/.test(path) || /^\/r\/[^/]+\/(hot|new|top|rising|controversial|best)\/?$/.test(path);
     const isUser = /^\/(u|user)\/[^/]+/.test(path);
-    const type = isPost ? 'post' : isSubreddit ? 'subreddit' : isUser ? 'user' : 'listing';
+    // Home/popular/all pages
+    const isHomeListing = /^\/(hot|new|top|rising|controversial|best|popular|all)\/?$/.test(path) || path === '/' || path === '';
+    const type = isPost || isGallery ? 'post' : isSubreddit ? 'subreddit' : isUser ? 'user' : isHomeListing ? 'listing' : 'listing';
+    if (isGallery) {
+        // Gallery posts: fetch the gallery JSON and extract the post data
+        const galleryJsonUrl = normalizedUrl.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
+        const requestedGallerySub = path.match(/\/r\/([^/]+)/)?.[1] || 'unknown';
+        let galleryData;
+        try {
+            galleryData = await fetchJsonWithRetry(galleryJsonUrl, REDDIT_UA);
+        }
+        catch (e) {
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found or has been deleted', subreddit: `r/${requestedGallerySub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
+            };
+        }
+        if (!Array.isArray(galleryData) || galleryData.length < 1) {
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found', subreddit: `r/${requestedGallerySub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
+            };
+        }
+        const postData = galleryData[0]?.data?.children?.[0]?.data;
+        if (!postData) {
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found', subreddit: `r/${requestedGallerySub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
+            };
+        }
+        // Validate subreddit matches the request
+        const actualGallerySub = postData.subreddit?.toLowerCase();
+        if (requestedGallerySub !== 'unknown' && actualGallerySub && requestedGallerySub.toLowerCase() !== actualGallerySub) {
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found in requested subreddit', requestedSubreddit: `r/${requestedGallerySub}`, actualSubreddit: `r/${actualGallerySub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post was not found in r/${requestedGallerySub}. It may have been deleted or moved.`,
+            };
+        }
+        const structured = {
+            subreddit: `r/${postData.subreddit || ''}`,
+            title: postData.title || '',
+            author: `u/${postData.author || '[deleted]'}`,
+            score: postData.score ?? 0,
+            upvoteRatio: postData.upvote_ratio ?? 1,
+            url: postData.url || url,
+            selftext: postData.selftext || '',
+            commentCount: postData.num_comments ?? 0,
+            created: unixToIso(postData.created_utc),
+            flair: postData.link_flair_text || null,
+            comments: [],
+            isGallery: true,
+        };
+        const cleanContent = `## 📋 ${structured.subreddit}: ${structured.title}
+**Posted by** ${structured.author} | Score: ${structured.score} | ${structured.commentCount} comments
+*${structured.created}*
+*(Gallery post)*`;
+        return { domain, type: 'post', structured, cleanContent };
+    }
     if (isPost) {
         // Fetch post data via Reddit JSON API
-        const jsonUrl = url.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
-        const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
-        if (!Array.isArray(data) || data.length < 2)
-            return null;
+        const jsonUrl = normalizedUrl.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
+        const requestedPostSub = path.match(/\/r\/([^/]+)/)?.[1] || 'unknown';
+        let data;
+        try {
+            data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
+        }
+        catch (e) {
+            // Post not found or API error — return a "not found" result
+            // instead of null (which would trigger browser fallback with wrong content)
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found or has been deleted', subreddit: `r/${requestedPostSub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
+            };
+        }
+        if (!Array.isArray(data) || data.length < 2) {
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found', subreddit: `r/${requestedPostSub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
+            };
+        }
         const postData = data[0]?.data?.children?.[0]?.data;
-        if (!postData)
-            return null;
+        if (!postData) {
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found', subreddit: `r/${requestedPostSub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
+            };
+        }
+        // CRITICAL: Validate subreddit matches the request (prevents cross-subreddit ID reuse exploits)
+        const actualPostSub = postData.subreddit?.toLowerCase();
+        if (requestedPostSub !== 'unknown' && actualPostSub && requestedPostSub.toLowerCase() !== actualPostSub) {
+            // Reddit reused the post ID in a different subreddit — return error instead of wrong content
+            return {
+                domain,
+                type: 'post',
+                structured: { error: 'Post not found in requested subreddit', requestedSubreddit: `r/${requestedPostSub}`, actualSubreddit: `r/${actualPostSub}` },
+                cleanContent: `## ❌ Reddit Post Not Found\n\nThe post was not found in r/${requestedPostSub}. It may have been deleted or moved.`,
+            };
+        }
         // Parse top comments (max 20)
         const commentChildren = data[1]?.data?.children || [];
         const comments = [];
@@ -326,8 +593,13 @@ ${commentsMd || '*No comments found.*'}`;
     }
     if (isSubreddit) {
         // Fetch subreddit listing
-        const jsonUrl = url.split('?')[0].replace(/\/?$/, '') + '.json?limit=15';
-        const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
+        // Preserve query params (especially t=day, t=week etc. for sorted views)
+        const queryString = urlObj.search || '';
+        const sortMatch = path.match(/\/r\/[^/]+\/(hot|new|top|rising|controversial|best)/);
+        const sortPath = sortMatch ? `/${sortMatch[1]}` : '';
+        const baseSubUrl = normalizedUrl.match(/\/r\/[^/]+/)?.[0] || normalizedUrl.split('?')[0];
+        const jsonUrl = `https://www.reddit.com${baseSubUrl}${sortPath}.json?limit=15${queryString ? '&' + queryString.slice(1) : ''}`;
+        const data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
         if (!data?.data?.children)
             return null;
         const posts = data.data.children
@@ -350,6 +622,36 @@ ${commentsMd || '*No comments found.*'}`;
 ${posts.map((p, i) => `${i + 1}. **${p.title}**\n   ${p.author} | ↑ ${p.score} | 💬 ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n   ${p.url}`).join('\n\n')}`;
         return { domain, type, structured, cleanContent };
     }
+    if (isHomeListing) {
+        const sortMatch = path.match(/\/(hot|new|top|rising|controversial|best|popular|all)/);
+        const sortType = sortMatch ? sortMatch[1] : 'hot';
+        const queryString = urlObj.search || '';
+        const jsonUrl = `https://www.reddit.com/${sortType}.json?limit=15${queryString ? '&' + queryString.slice(1) : ''}`;
+        const data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
+        if (!data?.data?.children)
+            return null;
+        const posts = data.data.children
+            .filter((c) => c.kind === 't3')
+            .map((c) => {
+            const d = c.data;
+            return {
+                title: d.title || '',
+                author: `u/${d.author || '[deleted]'}`,
+                score: d.score ?? 0,
+                commentCount: d.num_comments ?? 0,
+                url: `https://reddit.com${d.permalink}`,
+                subreddit: `r/${d.subreddit}`,
+                flair: d.link_flair_text || null,
+            };
+        });
+        const structured = { sortType, posts, postCount: posts.length };
+        const listMd = posts.map((p, i) => {
+            const flairTag = p.flair ? ` | ${p.flair}` : '';
+            return `${i + 1}. **${p.title}**\n   ${p.author} in ${p.subreddit} | ↑ ${p.score} | 💬 ${p.commentCount}${flairTag}\n   ${p.url}`;
+        }).join('\n\n');
+        const cleanContent = `## 📋 Reddit — ${sortType.charAt(0).toUpperCase() + sortType.slice(1)} Posts\n\n${listMd}`;
+        return { domain: 'reddit.com', type: 'listing', structured, cleanContent };
+    }
     // User or other — fall back to null (let normal HTML extraction handle it)
     return null;
 }
@@ -499,7 +801,7 @@ ${commentsMd || '*No comments.*'}`;
         let readmeText = '';
         if (readmeData?.content) {
             try {
-                readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 500);
+                readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 5000);
             }
             catch { /* ignore */ }
         }
@@ -529,7 +831,7 @@ ${structured.description || '*No description.*'}
 🏷️ Topics: ${topicsStr}
 🔗 ${structured.homepage || 'No homepage'}  |  Last push: ${structured.lastPush}${structured.archived ? '\n⚠️ **ARCHIVED**' : ''}
-${structured.readme ? `### README (excerpt)\n\n${structured.readme}` : ''}`;
+${structured.readme ? `### README\n\n${structured.readme}` : ''}`;
         return { domain, type: 'repository', structured, cleanContent };
     }
     return null;
@@ -652,4 +954,442 @@ ${structured.about ? '\n' + structured.about : ''}`;
     }
     return null;
 }
+// ---------------------------------------------------------------------------
+// 5. Wikipedia extractor
+// ---------------------------------------------------------------------------
+/**
+ * Strip Wikipedia editorial markers from extracted article text.
+ *
+ * Deletes "[edit]" links, numeric citation markers such as "[1]", inline
+ * maintenance tags such as "[citation needed]", and the
+ * "[Learn how and when to remove this message]" link, then collapses runs of
+ * three or more newlines into a single blank line and trims the result.
+ *
+ * @param {string} content - Text to clean.
+ * @returns {string} The cleaned text.
+ */
+function cleanWikipediaContent(content) {
+    // Each pattern is deleted outright, in this order.
+    const noisePatterns = [
+        // [edit] links
+        /\[edit\]/gi,
+        // Citation brackets [1], [2], etc.
+        /\[\d+\]/g,
+        // [citation needed], [verification], etc.
+        /\[(citation needed|verification|improve this article|adding citations[^\]]*|when\?|where\?|who\?|clarification needed|dubious[^\]]*|failed verification[^\]]*|unreliable source[^\]]*)\]/gi,
+        // [Learn how and when to remove this message]
+        /\[Learn how and when to remove this message\]/gi,
+    ];
+    let cleaned = content;
+    for (const pattern of noisePatterns) {
+        cleaned = cleaned.replace(pattern, '');
+    }
+    // Clean up excess whitespace
+    return cleaned.replace(/\n{3,}/g, '\n\n').trim();
+}
+/**
+ * Wikipedia article extractor backed by the Wikimedia REST API.
+ *
+ * Only `/wiki/<title>` URLs are handled, and titles containing a colon are
+ * skipped. The page summary is fetched first; the mobile-html rendering is then
+ * fetched on a best-effort basis and the `<p>` text of every `<section>` is
+ * assembled under its heading. When no section text is recovered, the summary
+ * extract is used as the body instead.
+ *
+ * @param {string} _html - Unused; all content comes from the API.
+ * @param {string} url - Absolute URL of the article page.
+ * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
+ *   or null when the URL is not an article page or the summary lookup fails.
+ */
+async function wikipediaExtractor(_html, url) {
+    const urlObj = new URL(url);
+    const pathParts = urlObj.pathname.split('/').filter(Boolean);
+    // Only handle article pages: /wiki/Article_Title
+    if (pathParts[0] !== 'wiki' || pathParts.length < 2)
+        return null;
+    // Titles may themselves contain slashes (e.g. /wiki/AC/DC), so keep every segment after /wiki/.
+    let articleTitle;
+    try {
+        articleTitle = decodeURIComponent(pathParts.slice(1).join('/'));
+    }
+    catch {
+        // Malformed percent-escape: there is no title to look up.
+        return null;
+    }
+    // Skip special pages (contain a colon, e.g. Special:Random, Talk:Article)
+    if (articleTitle.includes(':'))
+        return null;
+    // A bare or "www" host carries no language code; default to English.
+    const hostLabel = urlObj.hostname.split('.')[0];
+    const lang = !hostLabel || hostLabel === 'www' || hostLabel === 'wikipedia' ? 'en' : hostLabel;
+    const apiUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(articleTitle)}`;
+    // Wikipedia REST API requires a descriptive User-Agent (https://meta.wikimedia.org/wiki/User-Agent_policy)
+    const wikiHeaders = { 'User-Agent': 'WebPeel/0.17.0 (https://webpeel.dev; jake@jakeliu.me) Node.js', 'Api-User-Agent': 'WebPeel/0.17.0 (https://webpeel.dev; jake@jakeliu.me)' };
+    try {
+        const data = await fetchJson(apiUrl, wikiHeaders);
+        if (!data || data.type === 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found')
+            return null;
+        // For full article content, use the mobile-html endpoint (mobile-sections is deprecated)
+        let fullContent = '';
+        try {
+            const fullUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodeURIComponent(articleTitle)}`;
+            const fullResult = await simpleFetch(fullUrl, undefined, 15000, {
+                ...wikiHeaders,
+                'Accept': 'text/html',
+            });
+            if (fullResult?.html) {
+                // Parse sections from the mobile HTML
+                const sectionMatches = fullResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
+                for (const section of sectionMatches) {
+                    // Extract section heading
+                    const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
+                    const heading = headingMatch ? stripHtml(headingMatch[2]).trim() : '';
+                    // Extract paragraphs
+                    const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
+                    const sectionText = paragraphs.map((p) => stripHtml(p).trim()).filter((t) => t.length > 0).join('\n\n');
+                    if (sectionText) {
+                        const prefix = heading ? `## ${heading}\n\n` : '';
+                        fullContent += `\n\n${prefix}${sectionText}`;
+                    }
+                }
+            }
+        }
+        catch (e) {
+            // mobile-html failed — use summary extract as fallback
+            if (process.env.DEBUG)
+                console.debug('[webpeel]', 'Wikipedia mobile-html failed, using summary:', e instanceof Error ? e.message : e);
+        }
+        // Clean Wikipedia-specific noise
+        fullContent = cleanWikipediaContent(fullContent);
+        const structured = {
+            title: data.title || articleTitle.replace(/_/g, ' '),
+            description: data.description || '',
+            extract: data.extract || '',
+            thumbnail: data.thumbnail?.source || null,
+            url: data.content_urls?.desktop?.page || url,
+            lastModified: data.timestamp || null,
+        };
+        const cleanContent = `# ${structured.title}\n\n${structured.description ? `*${structured.description}*\n\n` : ''}${fullContent || structured.extract}`;
+        return { domain: 'wikipedia.org', type: 'article', structured, cleanContent };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'Wikipedia API failed:', e instanceof Error ? e.message : e);
+        return null;
+    }
+}
+// ---------------------------------------------------------------------------
+// 6. YouTube extractor (oEmbed API-first)
+// ---------------------------------------------------------------------------
+/**
+ * YouTube extractor built on the public oEmbed endpoint.
+ *
+ * Looks the URL up through YouTube's oEmbed API; when that returns a title, a
+ * second best-effort lookup against noembed.com supplies a description if one
+ * is available. No HTML parsing is attempted.
+ *
+ * @param {string} _html - Unused.
+ * @param {string} url - Video URL to describe.
+ * @returns {Promise<object|null>} `{ domain, type: 'video', structured, cleanContent }`,
+ *   or null when the oEmbed lookup fails or returns no title.
+ */
+async function youtubeExtractor(_html, url) {
+    try {
+        const encodedUrl = encodeURIComponent(url);
+        // oEmbed needs no auth and works without a browser.
+        const video = await fetchJson(`https://www.youtube.com/oembed?url=${encodedUrl}&format=json`);
+        if (!video || !video.title)
+            return null;
+        // noembed may carry richer data; any failure here is ignored.
+        let extra = null;
+        try {
+            extra = await fetchJson(`https://noembed.com/embed?url=${encodedUrl}`);
+        }
+        catch { /* optional */ }
+        const structured = {
+            title: video.title,
+            author: video.author_name || '',
+            authorUrl: video.author_url || '',
+            thumbnailUrl: video.thumbnail_url || '',
+            type: video.type || 'video',
+            source: 'oembed',
+        };
+        const summaryText = extra?.description || 'YouTube video';
+        const cleanContent = `## 🎬 ${structured.title}\n\n**Channel:** [${structured.author}](${structured.authorUrl})\n\n${summaryText}`;
+        return { domain: 'youtube.com', type: 'video', structured, cleanContent };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'YouTube oEmbed failed:', e instanceof Error ? e.message : e);
+    }
+    // Fallback: return null (no HTML parsing implemented)
+    return null;
+}
+// ---------------------------------------------------------------------------
+// 7. ArXiv extractor (ArXiv API)
+// ---------------------------------------------------------------------------
+/**
+ * ArXiv paper extractor backed by the export.arxiv.org query API.
+ *
+ * Recognises /abs/, /pdf/ and /html/ URLs with new-style identifiers
+ * (e.g. 2501.12948 or 2501.12948v2), fetches the Atom feed for that ID and
+ * reads the paper metadata out of its first `<entry>` element.
+ *
+ * @param {string} _html - Unused; metadata comes from the API.
+ * @param {string} url - Absolute arXiv URL.
+ * @returns {Promise<object|null>} `{ domain, type: 'paper', structured, cleanContent }`,
+ *   or null when the URL carries no paper ID, the feed has no entry, or the
+ *   request fails.
+ */
+async function arxivExtractor(_html, url) {
+    const urlObj = new URL(url);
+    const path = urlObj.pathname;
+    // Extract paper ID from URL patterns:
+    // /abs/2501.12948, /pdf/2501.12948, /abs/2501.12948v2
+    const idMatch = path.match(/\/(abs|pdf|html)\/(\d{4}\.\d{4,5}(?:v\d+)?)/);
+    if (!idMatch)
+        return null;
+    const paperId = idMatch[2];
+    try {
+        // Use ArXiv API
+        const apiUrl = `https://export.arxiv.org/api/query?id_list=${paperId}`;
+        const result = await simpleFetch(apiUrl, 'WebPeel/0.17.0', 15000, { Accept: 'application/xml' });
+        if (!result?.html)
+            return null;
+        // The feed has its own <title>/<updated> ahead of the entries, so restrict
+        // parsing to the first <entry>; otherwise the query title is reported as the paper title.
+        const entryMatch = result.html.match(/<entry(?:\s[^>]*)?>[\s\S]*?<\/entry>/);
+        if (!entryMatch)
+            return null;
+        const xml = entryMatch[0];
+        // Parse XML (simple regex-based for these known fields)
+        const getTag = (tag) => {
+            const match = xml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
+            return match ? stripHtml(match[1]).trim() : '';
+        };
+        const getAllTags = (tag) => {
+            const matches = [...xml.matchAll(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'g'))];
+            return matches.map(m => stripHtml(m[1]).trim()).filter(Boolean);
+        };
+        // Long titles are wrapped across lines in the feed; flatten them so the heading stays on one line.
+        const title = getTag('title').replace(/\s+/g, ' ');
+        const summary = getTag('summary');
+        const published = getTag('published');
+        const updated = getTag('updated');
+        const authors = getAllTags('name');
+        // Extract categories; the pattern also hits arxiv:primary_category, so de-duplicate.
+        const categories = [...new Set([...xml.matchAll(/category[^>]*term="([^"]+)"/g)].map(m => m[1]))];
+        // Extract DOI and journal ref if available
+        const doi = getTag('arxiv:doi');
+        const journalRef = getTag('arxiv:journal_ref');
+        if (!title)
+            return null;
+        const structured = {
+            title,
+            authors,
+            abstract: summary,
+            published: published || undefined,
+            updated: updated || undefined,
+            categories,
+            doi: doi || undefined,
+            journalRef: journalRef || undefined,
+            paperId,
+            pdfUrl: `https://arxiv.org/pdf/${paperId}`,
+            absUrl: `https://arxiv.org/abs/${paperId}`,
+        };
+        // Show at most five names, then summarise the rest.
+        const authorLine = authors.length <= 5
+            ? authors.join(', ')
+            : `${authors.slice(0, 5).join(', ')} et al. (${authors.length} authors)`;
+        const cleanContent = `# ${title}\n\n**Authors:** ${authorLine}\n**Published:** ${published?.split('T')[0] || 'N/A'}${categories.length ? `\n**Categories:** ${categories.join(', ')}` : ''}${doi ? `\n**DOI:** ${doi}` : ''}${journalRef ? `\n**Journal:** ${journalRef}` : ''}\n\n## Abstract\n\n${summary}\n\n📄 [PDF](${structured.pdfUrl}) | [Abstract](${structured.absUrl})`;
+        return { domain: 'arxiv.org', type: 'paper', structured, cleanContent };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'ArXiv API failed:', e instanceof Error ? e.message : e);
+        return null;
+    }
+}
+// ---------------------------------------------------------------------------
+// 8. Stack Overflow extractor (StackExchange API)
+// ---------------------------------------------------------------------------
+/**
+ * Stack Overflow question extractor backed by the Stack Exchange API.
+ *
+ * Reads the numeric question ID from a /questions/<id>/... URL, fetches the
+ * question with its body, then fetches up to five answers sorted by votes on a
+ * best-effort basis. The markdown output shows the question (capped at 3000
+ * characters) followed by the first three answers (each capped at 2000).
+ *
+ * @param {string} _html - Unused; content comes from the API.
+ * @param {string} url - Absolute question URL.
+ * @returns {Promise<object|null>} `{ domain, type: 'question', structured, cleanContent }`,
+ *   or null when the URL has no question ID, the API returns no item, or the
+ *   request fails.
+ */
+async function stackOverflowExtractor(_html, url) {
+    // Match /questions/12345/optional-slug
+    const idMatch = new URL(url).pathname.match(/\/questions\/(\d+)/);
+    if (!idMatch)
+        return null;
+    const [, questionId] = idMatch;
+    const apiBase = `https://api.stackexchange.com/2.3/questions/${questionId}`;
+    const displayName = (owner) => owner?.display_name || 'anonymous';
+    try {
+        const questionData = await fetchJson(`${apiBase}?order=desc&sort=votes&site=stackoverflow&filter=withbody`);
+        const question = questionData?.items?.[0];
+        if (!question)
+            return null;
+        // Answers are optional: a failed lookup leaves the list empty.
+        let rawAnswers = [];
+        try {
+            const answersData = await fetchJson(`${apiBase}/answers?order=desc&sort=votes&site=stackoverflow&filter=withbody&pagesize=5`);
+            rawAnswers = answersData?.items || [];
+        }
+        catch { /* answers optional */ }
+        const structured = {
+            title: stripHtml(question.title || ''),
+            questionId: question.question_id,
+            score: question.score || 0,
+            views: question.view_count || 0,
+            answerCount: question.answer_count || 0,
+            isAnswered: question.is_answered || false,
+            tags: question.tags || [],
+            askedBy: displayName(question.owner),
+            // creation_date is multiplied by 1000 to get the milliseconds Date expects.
+            askedDate: question.creation_date ? new Date(question.creation_date * 1000).toISOString() : undefined,
+            acceptedAnswerId: question.accepted_answer_id || null,
+            answers: rawAnswers.map((answer) => ({
+                id: answer.answer_id,
+                score: answer.score,
+                isAccepted: answer.is_accepted || false,
+                body: stripHtml(answer.body || '').substring(0, 2000),
+                author: displayName(answer.owner),
+            })),
+        };
+        const questionBody = stripHtml(question.body || '').substring(0, 3000);
+        const tagLine = structured.tags.length ? `**Tags:** ${structured.tags.join(', ')}` : '';
+        // Only the first three answers are rendered into the markdown.
+        const answersContent = structured.answers
+            .slice(0, 3)
+            .map((answer) => `\n\n---\n\n### Answer by ${answer.author} (Score: ${answer.score}${answer.isAccepted ? ' ✅ Accepted' : ''})\n\n${answer.body}`)
+            .join('');
+        const cleanContent = `# ${structured.title}\n\n**Score:** ${structured.score} | **Views:** ${structured.views?.toLocaleString()} | **Answers:** ${structured.answerCount}\n${tagLine}\n**Asked by:** ${structured.askedBy}\n\n## Question\n\n${questionBody}${answersContent}`;
+        return { domain: 'stackoverflow.com', type: 'question', structured, cleanContent };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'StackOverflow API failed:', e instanceof Error ? e.message : e);
+        return null;
+    }
+}
+// ---------------------------------------------------------------------------
+// 9. NPM extractor (npm registry API)
+// ---------------------------------------------------------------------------
+/**
+ * npm package extractor backed by the public registry API.
+ *
+ * Reads the package name from a /package/<name> or /package/@scope/<name>
+ * URL, fetches the registry document, and makes a best-effort request for
+ * last-week download counts. Up to 5000 characters of the README are included
+ * when the registry document carries one.
+ *
+ * @param {string} _html - Unused; metadata comes from the API.
+ * @param {string} url - Absolute npmjs.com package URL.
+ * @returns {Promise<object|null>} `{ domain, type: 'package', structured, cleanContent }`,
+ *   or null when the URL names no package, the registry returns no `name`, or
+ *   the request fails.
+ */
+async function npmExtractor(_html, url) {
+    const urlObj = new URL(url);
+    const path = urlObj.pathname;
+    // Match /package/name or /package/@scope/name
+    const packageMatch = path.match(/\/package\/((?:@[^/]+\/)?[^/]+)/);
+    if (!packageMatch)
+        return null;
+    const packageName = packageMatch[1];
+    try {
+        const apiUrl = `https://registry.npmjs.org/${encodeURIComponent(packageName)}`;
+        const data = await fetchJson(apiUrl);
+        if (!data?.name)
+            return null;
+        const latest = data['dist-tags']?.latest;
+        const latestVersion = latest ? data.versions?.[latest] : null;
+        // Get download counts
+        let downloads = null;
+        try {
+            downloads = await fetchJson(`https://api.npmjs.org/downloads/point/last-week/${encodeURIComponent(packageName)}`);
+        }
+        catch { /* optional */ }
+        // Distinguish "no stats available" from a genuine zero so the markdown can say N/A.
+        const knownDownloads = typeof downloads?.downloads === 'number' ? downloads.downloads : null;
+        // Include README if available (some packages have it, some don't)
+        const readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
+        const structured = {
+            name: data.name,
+            description: data.description || '',
+            version: latest || 'unknown',
+            license: latestVersion?.license || data.license || 'N/A',
+            homepage: data.homepage || latestVersion?.homepage || null,
+            repository: typeof data.repository === 'string' ? data.repository : data.repository?.url || null,
+            author: typeof data.author === 'string' ? data.author : data.author?.name || '',
+            keywords: data.keywords || [],
+            // Kept numeric (0 when unknown) so existing consumers see the same shape.
+            weeklyDownloads: knownDownloads ?? 0,
+            dependencies: Object.keys(latestVersion?.dependencies || {}),
+            devDependencies: Object.keys(latestVersion?.devDependencies || {}),
+            maintainers: (data.maintainers || []).map((m) => m.name || m).slice(0, 10),
+            created: data.time?.created || undefined,
+            modified: data.time?.modified || undefined,
+            readme: readmeText,
+        };
+        const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
+        const depsLine = structured.dependencies.length
+            ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.slice(0, 15).join(', ')}${structured.dependencies.length > 15 ? '...' : ''}`
+            : '';
+        // Strip only a leading "git+" and a trailing ".git"; an unanchored replace would
+        // mangle URLs that contain ".git" elsewhere (e.g. "name.github.io.git").
+        const repoLine = structured.repository
+            ? `\n**Repository:** ${structured.repository.replace(/^git\+/, '').replace(/\.git$/, '')}`
+            : '';
+        const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
+        const datesLine = structured.created ? `\n**Created:** ${structured.created?.split('T')[0] || 'N/A'} | **Last modified:** ${structured.modified?.split('T')[0] || 'N/A'}` : '';
+        const readmeSection = readmeText
+            ? `\n\n### README\n\n${readmeText}`
+            : '';
+        const downloadsLabel = knownDownloads === null ? 'N/A' : knownDownloads.toLocaleString();
+        const cleanContent = [
+            `# 📦 ${structured.name}@${structured.version}`,
+            `${structured.description}`,
+            `**License:** ${structured.license} | **Weekly Downloads:** ${downloadsLabel}`,
+            `**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`,
+        ].join('\n');
+        return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'NPM API failed:', e instanceof Error ? e.message : e);
+        return null;
+    }
+}
+// ---------------------------------------------------------------------------
+// 10. Best Buy extractor (Best Buy Products API)
+// ---------------------------------------------------------------------------
+/**
+ * Best Buy product extractor backed by the Best Buy Products API.
+ *
+ * Requires the BESTBUY_API_KEY environment variable; without it the extractor
+ * is skipped. The SKU is read from a ".../<sku>.p" URL and the product is
+ * rendered as markdown with price, availability, descriptions and features.
+ *
+ * @param {string} _html - Unused; product data comes from the API.
+ * @param {string} url - Best Buy product URL.
+ * @returns {Promise<object|null>} `{ domain, type: 'product', structured, cleanContent }`,
+ *   or null when no API key is set, the URL has no SKU, the API reports an
+ *   error, or the request fails.
+ */
+async function bestBuyExtractor(_html, url) {
+    const apiKey = process.env.BESTBUY_API_KEY;
+    if (!apiKey)
+        return null; // No API key, skip
+    // Extract SKU from URL: /site/.../6587822.p → 6587822
+    const skuMatch = url.match(/\/(\d{7,})\.p/);
+    if (!skuMatch)
+        return null;
+    const sku = skuMatch[1];
+    // The key is encoded so that reserved characters cannot alter the query string.
+    const apiUrl = `https://api.bestbuy.com/v1/products/${sku}.json?apiKey=${encodeURIComponent(apiKey)}&show=sku,name,salePrice,regularPrice,onSale,shortDescription,longDescription,image,largeFrontImage,url,customerReviewAverage,customerReviewCount,categoryPath,manufacturer,modelNumber,upc,freeShipping,inStoreAvailability,onlineAvailability,condition,features.feature`;
+    try {
+        const data = await fetchJson(apiUrl);
+        if (!data || data.error)
+            return null;
+        // Accept both response shapes: `features: [{ feature }]` and `features: { feature: [...] }`.
+        // A lone string is treated as a single feature rather than iterated character by character.
+        const rawFeatures = Array.isArray(data.features) ? data.features : data.features?.feature;
+        const featureList = (Array.isArray(rawFeatures) ? rawFeatures : rawFeatures ? [rawFeatures] : [])
+            .map((f) => (f && typeof f === 'object' ? f.feature : f))
+            .filter(Boolean);
+        // Build clean markdown
+        const lines = [];
+        lines.push(`# ${data.name}`);
+        lines.push('');
+        if (data.onSale) {
+            lines.push(`**Sale Price:** $${data.salePrice} (was $${data.regularPrice})`);
+        }
+        else {
+            lines.push(`**Price:** $${data.regularPrice}`);
+        }
+        lines.push(`**SKU:** ${data.sku}`);
+        if (data.manufacturer)
+            lines.push(`**Brand:** ${data.manufacturer}`);
+        if (data.modelNumber)
+            lines.push(`**Model:** ${data.modelNumber}`);
+        if (data.customerReviewAverage) {
+            lines.push(`**Rating:** ${data.customerReviewAverage}/5 (${data.customerReviewCount} reviews)`);
+        }
+        lines.push(`**Availability:** ${data.onlineAvailability ? 'In Stock Online' : 'Out of Stock Online'} | ${data.inStoreAvailability ? 'Available In Store' : 'Not Available In Store'}`);
+        if (data.freeShipping)
+            lines.push('**Free Shipping:** Yes');
+        lines.push('');
+        if (data.shortDescription)
+            lines.push(data.shortDescription);
+        lines.push('');
+        if (data.longDescription)
+            lines.push(data.longDescription);
+        if (featureList.length) {
+            lines.push('');
+            lines.push('## Features');
+            for (const f of featureList) {
+                lines.push(`- ${f}`);
+            }
+        }
+        const structured = {
+            sku: data.sku,
+            name: data.name,
+            price: data.salePrice || data.regularPrice,
+            regularPrice: data.regularPrice,
+            onSale: data.onSale,
+            brand: data.manufacturer,
+            model: data.modelNumber,
+            upc: data.upc,
+            rating: data.customerReviewAverage,
+            reviewCount: data.customerReviewCount,
+            image: data.largeFrontImage || data.image,
+            url: data.url,
+            inStock: data.onlineAvailability,
+            freeShipping: data.freeShipping,
+            condition: data.condition,
+            category: data.categoryPath?.map((c) => c.name).join(' > '),
+        };
+        return { domain: 'bestbuy.com', type: 'product', structured, cleanContent: lines.join('\n') };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'Best Buy API failed:', e instanceof Error ? e.message : e);
+        return null;
+    }
+}
+// ---------------------------------------------------------------------------
+// 11. Walmart extractor (Walmart frontend search API)
+// ---------------------------------------------------------------------------
+/**
+ * Walmart product extractor that queries the search endpoint used by
+ * Walmart's own frontend.
+ *
+ * Reads the numeric item ID from an /ip/<slug>/<id> or /ip/<id> URL, searches
+ * for that ID and renders the first returned item as markdown.
+ *
+ * @param {string} _html - Unused; product data comes from the API.
+ * @param {string} url - Walmart product URL.
+ * @returns {Promise<object|null>} `{ domain, type: 'product', structured, cleanContent }`,
+ *   or null when the URL has no item ID, the search returns no item, or the
+ *   request fails.
+ */
+async function walmartExtractor(_html, url) {
+    // Match against the path only: a query string or fragment such as "?from=/search/42"
+    // would otherwise let the greedy pattern pick up the wrong number.
+    let pathname = url;
+    try {
+        pathname = new URL(url).pathname;
+    }
+    catch { /* not an absolute URL — match the raw string instead */ }
+    // Extract item ID from URL patterns:
+    // /ip/Product-Name/1234567 or /ip/1234567
+    const itemMatch = pathname.match(/\/ip\/(?:.*\/)?(\d+)/);
+    if (!itemMatch)
+        return null;
+    const itemId = itemMatch[1];
+    // Try Walmart's BE API (used by their frontend, sometimes accessible)
+    const apiUrl = `https://www.walmart.com/orchestra/snb/graphql/Search?query=${itemId}&page=1&affinityOverride=default&limit=1`;
+    try {
+        const response = await fetchJson(apiUrl, {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
+            'Accept': 'application/json',
+            'Referer': 'https://www.walmart.com/',
+        });
+        const item = response?.data?.search?.searchResult?.itemStacks?.[0]?.items?.[0];
+        if (!item)
+            return null;
+        const lines = [];
+        lines.push(`# ${item.name}`);
+        if (item.priceInfo?.currentPrice?.price) {
+            lines.push(`**Price:** $${item.priceInfo.currentPrice.price}`);
+        }
+        if (item.averageRating) {
+            lines.push(`**Rating:** ${item.averageRating}/5 (${item.numberOfReviews || 0} reviews)`);
+        }
+        if (item.shortDescription)
+            lines.push(item.shortDescription);
+        const structured = {
+            name: item.name,
+            price: item.priceInfo?.currentPrice?.price,
+            rating: item.averageRating,
+            reviewCount: item.numberOfReviews,
+            image: item.imageInfo?.thumbnailUrl,
+            itemId: itemId,
+            inStock: item.availabilityStatusV2?.value === 'IN_STOCK',
+        };
+        return { domain: 'walmart.com', type: 'product', structured, cleanContent: lines.join('\n') };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'Walmart API failed:', e instanceof Error ? e.message : e);
+        return null; // API not accessible, fall through to other methods
+    }
+}
 //# sourceMappingURL=domain-extractors.js.map