npm - webpeel - Versions diffs - 0.21.11 → 0.21.13 - Mend

webpeel 0.21.11 → 0.21.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/core/domain-extractors.js +264 -21
package/dist/core/pipeline.js +21 -0
package/dist/core/structured-extract.js +36 -6
package/package.json +1 -1

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -1518,12 +1518,27 @@ async function npmExtractor(_html, url) {
             modified: data.time?.modified || undefined,
         };
         // Include README if available (some packages have it, some don't)
-        const readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
+        let readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
+        // If no README in registry, try fetching from unpkg.com
+        if (!readmeText) {
+            try {
+                const unpkgUrl = `https://unpkg.com/${encodeURIComponent(packageName)}/README.md`;
+                const readmeResult = await simpleFetch(unpkgUrl, undefined, 10000);
+                if (readmeResult?.html && readmeResult.html.length > 10 && !readmeResult.html.trim().startsWith('<')) {
+                    readmeText = readmeResult.html.slice(0, 5000);
+                }
+            }
+            catch { /* README from unpkg optional */ }
+        }
         // Add to structured data
         structured.readme = readmeText;
         const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
+        // Show ALL dependencies (not capped at 15)
         const depsLine = structured.dependencies.length
-            ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.slice(0, 15).join(', ')}${structured.dependencies.length > 15 ? '...' : ''}`
+            ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.join(', ')}`
+            : '';
+        const devDepsLine = structured.devDependencies.length
+            ? `\n**Dev Dependencies (${structured.devDependencies.length}):** ${structured.devDependencies.slice(0, 10).join(', ')}${structured.devDependencies.length > 10 ? '...' : ''}`
             : '';
         const repoLine = structured.repository ? `\n**Repository:** ${structured.repository.replace('git+', '').replace('.git', '')}` : '';
         const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
@@ -1536,7 +1551,7 @@ async function npmExtractor(_html, url) {
 ${structured.description}
 **License:** ${structured.license} | **Weekly Downloads:** ${structured.weeklyDownloads?.toLocaleString() || 'N/A'}
-**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
+**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${devDepsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
         return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
     }
     catch (e) {
@@ -1780,15 +1795,57 @@ async function mediumExtractor(html, url) {
             $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
         const description = jsonLdData?.description ||
             $('meta[property="og:description"]').attr('content') || '';
+        // Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
+        let publication = '';
+        try {
+            const urlObj2 = new URL(url);
+            const hostname = urlObj2.hostname;
+            if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
+                publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
+            }
+        }
+        catch { /* ignore */ }
+        if (!publication) {
+            publication = $('[data-testid="publicationName"]').text().trim() ||
+                $('a[data-testid="publicationName"]').text().trim() ||
+                $('meta[property="article:section"]').attr('content') ||
+                $('a[href*="/tag/"]').first().text().trim() || '';
+        }
+        // Author bio — usually shown in an author card or bio section
+        const authorBio = $('[data-testid="authorBio"]').text().trim() ||
+            $('p[class*="bio"]').first().text().trim() ||
+            $('[aria-label="authorBio"]').text().trim() || '';
+        // Clap count — Medium shows clap button with count
+        let clapCount = '';
+        $('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
+            const txt = $(el).text().trim();
+            if (txt && /\d/.test(txt)) {
+                clapCount = txt;
+                return false;
+            }
+        });
+        if (!clapCount) {
+            // Fallback: find spans that look like clap counts (e.g., "2.4K")
+            $('span').filter((_, el) => {
+                const label = $(el).closest('[aria-label]').attr('aria-label') || '';
+                return label.toLowerCase().includes('clap');
+            }).each((_, el) => {
+                const txt = $(el).text().trim();
+                if (txt && /\d/.test(txt)) {
+                    clapCount = txt;
+                    return false;
+                }
+            });
+        }
         // Extract article body — Medium puts content in <article> or section
         let articleBody = '';
         const articleEl = $('article').first();
         if (articleEl.length) {
-            // Remove nav, aside, buttons
-            articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
+            // Remove nav, aside, buttons, author-card, footer sections
+            articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
             // Get paragraphs and headings
             const parts = [];
-            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
+            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
                 const tag = el.name;
                 const text = $(el).text().trim();
                 if (!text || text.length < 5)
@@ -1801,6 +1858,8 @@ async function mediumExtractor(html, url) {
                     parts.push(`> ${text}`);
                 else if (tag === 'pre')
                     parts.push('```\n' + text + '\n```');
+                else if (tag === 'figcaption')
+                    parts.push(`*${text}*`);
                 else
                     parts.push(text);
             });
@@ -1811,15 +1870,22 @@ async function mediumExtractor(html, url) {
         const structured = {
             title,
             author,
+            authorBio,
             publishDate,
             readingTime,
             description,
+            publication,
+            clapCount,
             url,
         };
         const authorLine = author ? `\n**Author:** ${author}` : '';
+        const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
         const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
         const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
-        const cleanContent = `# ${title}${authorLine}${dateLine}${timeLine}\n\n${contentBody.substring(0, 8000)}`;
+        const pubLine = publication ? `\n**Publication:** ${publication}` : '';
+        const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
+        // No hard character cap — let the pipeline's budget/maxTokens handle truncation
+        const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
         return { domain: 'medium.com', type: 'article', structured, cleanContent };
     }
     catch {
@@ -1832,7 +1898,27 @@ async function mediumExtractor(html, url) {
 async function substackExtractor(html, url) {
     try {
         const { load } = await import('cheerio');
-        const $ = load(html);
+        // Handle open.substack.com/pub/{publication}/p/{slug} redirect URLs
+        // These are share links that redirect to the actual post. Redirect to the real URL.
+        const urlObj = new URL(url);
+        let workingHtml = html;
+        let workingUrl = url;
+        if (urlObj.hostname === 'open.substack.com') {
+            const openMatch = urlObj.pathname.match(/\/pub\/([^/]+)\/p\/([^/]+)/);
+            if (openMatch) {
+                const [, publication, slug] = openMatch;
+                const actualUrl = `https://${publication}.substack.com/p/${slug}`;
+                try {
+                    const fetchResult = await simpleFetch(actualUrl, undefined, 15000);
+                    if (fetchResult?.html && fetchResult.html.length > 500) {
+                        workingHtml = fetchResult.html;
+                        workingUrl = actualUrl;
+                    }
+                }
+                catch { /* fall through with original HTML */ }
+            }
+        }
+        const $ = load(workingHtml);
         // JSON-LD
         let jsonLdData = null;
         $('script[type="application/ld+json"]').each((_, el) => {
@@ -1857,14 +1943,14 @@ async function substackExtractor(html, url) {
             $('meta[property="article:published_time"]').attr('content') ||
             $('time').first().attr('datetime') || '';
         const publication = $('meta[property="og:site_name"]').attr('content') ||
-            $('a.navbar-title-link').text().trim() || new URL(url).hostname.replace('.substack.com', '');
+            $('a.navbar-title-link').text().trim() || new URL(workingUrl).hostname.replace('.substack.com', '');
         const description = jsonLdData?.description ||
             $('meta[property="og:description"]').attr('content') || '';
-        // Article content
+        // Article content — try multiple Substack CSS patterns
         let articleBody = '';
-        const postContent = $('.body.markup, .post-content, article').first();
+        const postContent = $('.body.markup, .post-content, article, [class*="post-content"], .available-content').first();
         if (postContent.length) {
-            postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
+            postContent.find('script, style, nav, .paywall, .subscribe-widget, .subscription-widget').remove();
             const parts = [];
             postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                 const tag = el.name;
@@ -1884,19 +1970,35 @@ async function substackExtractor(html, url) {
             });
             articleBody = parts.join('\n\n');
         }
+        // If no article body found, try broader search
+        if (!articleBody) {
+            const parts = [];
+            $('main p, article p, [class*="content"] p').each((_, el) => {
+                const text = $(el).text().trim();
+                if (text && text.length > 20)
+                    parts.push(text);
+            });
+            articleBody = parts.slice(0, 20).join('\n\n');
+        }
         const contentBody = articleBody || description;
+        // Detect if the post appears paywalled (short content with no article body)
+        const isPaywalled = !articleBody && description.length > 0;
+        const paywallNote = isPaywalled
+            ? '\n\n---\n*⚠️ This post appears to be behind a paywall. Only the preview/description is available. Full content requires a subscription.*'
+            : '';
         const structured = {
             title,
             author,
             publication,
             publishDate,
             description,
-            url,
+            paywalled: isPaywalled,
+            url: workingUrl,
         };
         const authorLine = author ? `\n**Author:** ${author}` : '';
         const pubLine = publication ? `\n**Publication:** ${publication}` : '';
         const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
-        const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}`;
+        const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}${paywallNote}`;
         return { domain: 'substack.com', type: 'post', structured, cleanContent };
     }
     catch {
@@ -2071,11 +2173,37 @@ async function imdbExtractor(html, url) {
                 ? jsonLd.director.map((d) => d.name || d).join(', ')
                 : jsonLd.director?.name || String(jsonLd.director))
             : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
-        // Cast (top few from JSON-LD actor field)
-        const cast = jsonLd?.actor
+        // Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
+        const castPairs = [];
+        // IMDB new UI: each title-cast-item contains actor link + character link
+        $('[data-testid="title-cast-item"]').each((_, el) => {
+            const actorEl = $(el).find('a[href*="/name/nm"]').first();
+            const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
+            const actor = actorEl.text().trim();
+            // Character name may span multiple elements; clean whitespace
+            const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
+            if (actor && actor.length > 1) {
+                castPairs.push({ actor, character: character || '' });
+            }
+        });
+        // Fall back to classic cast list (older IMDB page versions)
+        const castFromHtml = [];
+        if (!castPairs.length) {
+            $('.cast_list td.itemprop a').each((_, el) => {
+                const name = $(el).text().trim();
+                if (name && name.length > 1 && !castFromHtml.includes(name))
+                    castFromHtml.push(name);
+            });
+        }
+        // JSON-LD actors as final fallback
+        const castFromLd = jsonLd?.actor
             ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
-                .map((a) => a.name || a).slice(0, 6)
+                .map((a) => a.name || a)
             : [];
+        // Build final cast list: with characters if available (top 10), otherwise names only
+        const cast = castPairs.length > 0
+            ? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
+            : [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
         // Runtime
         const runtime = jsonLd?.duration
             ? (() => {
@@ -2085,17 +2213,82 @@ async function imdbExtractor(html, url) {
                 return String(jsonLd.duration);
             })()
             : '';
+        // Full plot/storyline — try to get the longer version from HTML
+        const fullPlot = $('[data-testid="storyline-plot-summary"] span, [data-testid="plot-xl"] span, span[data-testid="plot-l"], #titleStoryLine p, .plot_summary .summary_text').first().text().trim() || description;
+        // Additional details: Writers, Keywords, Awards
+        const writers = [];
+        $('[data-testid="title-pc-wide-screen"] li[data-testid="title-pc-principal-credit"]:nth-child(2) a, .credit_summary_item:contains("Writer") a').each((_, el) => {
+            const name = $(el).text().trim();
+            if (name && !writers.includes(name))
+                writers.push(name);
+        });
+        // Keywords — try HTML first, fall back to JSON-LD keywords
+        let keywords = [];
+        $('[data-testid="storyline-plot-keywords"] a, .see-more.inline.canwrap span a, a[href*="keyword"]').each((_, el) => {
+            const kw = $(el).text().trim();
+            if (kw && kw.length < 30 && !keywords.includes(kw))
+                keywords.push(kw);
+        });
+        // Fall back to JSON-LD keywords if HTML didn't yield any
+        if (!keywords.length && jsonLd?.keywords) {
+            keywords = (typeof jsonLd.keywords === 'string'
+                ? jsonLd.keywords.split(',')
+                : Array.isArray(jsonLd.keywords) ? jsonLd.keywords : []).map((k) => k.trim()).filter(Boolean);
+        }
+        // Writers — also try JSON-LD creator field
+        if (!writers.length && jsonLd?.creator) {
+            const creators = Array.isArray(jsonLd.creator) ? jsonLd.creator : [jsonLd.creator];
+            for (const c of creators) {
+                const name = c?.name || (typeof c === 'string' ? c : '');
+                if (name && !writers.includes(name))
+                    writers.push(name);
+            }
+        }
+        // Awards / accolades — try hero accolades chip, then any awards-related link text
+        let awardsSummary = '';
+        // IMDB new UI: awards accolades chip in the hero section
+        const accoladesEl = $('[data-testid="awards-accolades"]');
+        if (accoladesEl.length) {
+            awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
+        }
+        // Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
+        if (!awardsSummary) {
+            const titleMatch = url.match(/\/(tt\d+)/);
+            const titleId = titleMatch ? titleMatch[1] : '';
+            if (titleId) {
+                $(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
+                    const text = $(el).text().trim().replace(/\s+/g, ' ');
+                    if (text && text.length > 3 && text.length < 200) {
+                        awardsSummary = text;
+                        return false; // break
+                    }
+                });
+            }
+        }
+        // Fallback: JSON-LD award field
+        if (!awardsSummary && jsonLd?.award) {
+            awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
+        }
+        // Content rating & release date from JSON-LD
+        const contentRating = jsonLd?.contentRating || '';
+        const datePublished = jsonLd?.datePublished || '';
         const structured = {
-            title, year, contentType, description, ratingValue, ratingCount,
-            genres, director, cast, runtime, url,
+            title, year, contentType, description: fullPlot, ratingValue, ratingCount,
+            genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
         };
         const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
         const genreLine = genres.length ? genres.join(', ') : '';
         const directorLine = director ? `**Director:** ${director}` : '';
+        const writersLine = writers.length ? `**Writers:** ${writers.slice(0, 5).join(', ')}` : '';
         const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
         const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
+        const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
+        const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
+        const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
+        const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
         const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
-        const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${directorLine ? directorLine + '\n' : ''}${castLine ? castLine + '\n' : ''}\n## Plot\n\n${description}`;
+        const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
+        const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
         return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
     }
     catch {
@@ -2232,6 +2425,7 @@ async function pypiExtractor(_html, url) {
             return null;
         const info = data.info;
         const structured = {
+            title: `${info.name} ${info.version}`,
             name: info.name,
             version: info.version,
             description: info.summary || '',
@@ -2245,17 +2439,34 @@ async function pypiExtractor(_html, url) {
             requiresDist: (info.requires_dist || []).slice(0, 20),
             classifiers: (info.classifiers || []).slice(0, 10),
         };
+        // Full description/README from PyPI (info.description is the full README in markdown)
+        const fullDescription = info.description && info.description.length > 100 &&
+            info.description !== 'UNKNOWN' && info.description !== info.summary
+            ? info.description.slice(0, 8000)
+            : null;
+        // Store full description in structured
+        structured.fullDescription = fullDescription;
         const installCmd = `pip install ${info.name}`;
         const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
         const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
+        // Show all dependencies
         const depsLine = structured.requiresDist.length
             ? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
             : '';
+        // Classifiers — extract useful ones (license, status, Python versions)
+        const usefulClassifiers = structured.classifiers.filter((c) => c.startsWith('Programming Language') || c.startsWith('License') || c.startsWith('Development Status'));
+        const classifiersSection = usefulClassifiers.length
+            ? `\n\n## Classifiers\n\n${usefulClassifiers.map((c) => `- ${c}`).join('\n')}`
+            : '';
         // Find project URLs
         const projectUrlLines = [];
         for (const [label, u] of Object.entries(structured.projectUrls)) {
             projectUrlLines.push(`- **${label}:** ${u}`);
         }
+        // Full description section (package README from PyPI)
+        const descSection = fullDescription
+            ? `\n\n## Description\n\n${fullDescription}`
+            : '';
         const cleanContent = `# 📦 ${info.name} ${info.version}
 ${info.summary || ''}
@@ -2266,7 +2477,7 @@ ${installCmd}
 **Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}
-${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}`;
+${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}${classifiersSection}${descSection}`;
         return { domain: 'pypi.org', type: 'package', structured, cleanContent };
     }
     catch (e) {
@@ -2289,6 +2500,38 @@ async function devtoExtractor(html, url) {
         const slug = pathParts.length >= 2
             ? pathParts.slice(0, 2).join('/').replace(/^@/, '')
             : null;
+        // Homepage: no slug → fetch recent top articles from Dev.to API
+        if (!slug) {
+            try {
+                const topArticles = await fetchJson('https://dev.to/api/articles?page=1&per_page=20&top=1');
+                if (Array.isArray(topArticles) && topArticles.length > 0) {
+                    const articles = topArticles.map((a) => ({
+                        title: a.title || '',
+                        author: a.user?.name || '',
+                        authorUsername: a.user?.username || '',
+                        tags: a.tag_list || [],
+                        reactions: a.public_reactions_count || 0,
+                        comments: a.comments_count || 0,
+                        readingTime: a.reading_time_minutes ? `${a.reading_time_minutes} min` : '',
+                        url: a.url || '',
+                        publishDate: a.published_at ? a.published_at.split('T')[0] : '',
+                    }));
+                    const listMd = articles.map((a, i) => {
+                        const tags = a.tags.length ? ` · #${a.tags.slice(0, 3).join(' #')}` : '';
+                        const stats = `❤️ ${a.reactions} | 💬 ${a.comments}${a.readingTime ? ` | ${a.readingTime}` : ''}`;
+                        return `${i + 1}. **[${a.title}](${a.url})**\n   by @${a.authorUsername}${tags}\n   ${stats} · ${a.publishDate}`;
+                    }).join('\n\n');
+                    const structured = {
+                        title: 'DEV Community — Top Articles',
+                        articles,
+                        fetchedAt: new Date().toISOString(),
+                    };
+                    const cleanContent = `# 🧑‍💻 DEV Community — Top Articles\n\n*${articles.length} articles from the community*\n\n${listMd}`;
+                    return { domain: 'dev.to', type: 'listing', structured, cleanContent };
+                }
+            }
+            catch { /* fall through to HTML */ }
+        }
         if (slug) {
             try {
                 const apiUrl = `https://dev.to/api/articles/${slug}`;

package/dist/core/pipeline.js CHANGED Viewed

@@ -344,6 +344,7 @@ export async function fetchContent(ctx) {
             blockResources: ctx.options.blockResources,
             cloaked: ctx.options.cloaked,
             cycle: ctx.options.cycle,
+            tls: ctx.options.tls,
             noEscalate: ctx.options.noEscalate,
         });
     }
@@ -410,6 +411,18 @@ export async function fetchContent(ctx) {
             }
             catch { /* Search fallback also failed — rethrow original BlockedError */ }
         }
+        // Enhance error messages with actionable advice
+        if (fetchError instanceof BlockedError) {
+            const actionableMsg = `${fetchError.message}\n\nThis site blocks automated access. Try using \`stealth: true\` and a residential proxy.`;
+            const enhancedError = new BlockedError(actionableMsg);
+            throw enhancedError;
+        }
+        const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
+        if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
+            const ms = ctx.timeout ?? 30000;
+            const enhancedMsg = `Request timed out after ${Math.round(ms / 1000)}s. This site may require browser rendering — try \`render: true\`.`;
+            throw new Error(enhancedMsg);
+        }
         throw fetchError;
     }
     const fetchDuration = ctx.timer.end('fetch');
@@ -1183,6 +1196,14 @@ export function buildResult(ctx) {
     let warning;
     const contentLen = ctx.content.length;
     const htmlLen = ctx.fetchResult?.html?.length || 0;
+    // Add contentQuality metadata for thin content (< 100 words)
+    const wordCount = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
+    if (wordCount < 100 && wordCount > 0) {
+        ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
+        if (ctx.metadata) {
+            ctx.metadata.contentQuality = 'thin';
+        }
+    }
     if (contentLen < 100 && htmlLen > 1000) {
         warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
     }

package/dist/core/structured-extract.js CHANGED Viewed

@@ -143,6 +143,8 @@ function heuristicExtractString(fieldName, content, pageUrl) {
                 .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
                 .replace(/[*_`[\]]/g, '')
                 .replace(/&[a-z]+;/g, '') // HTML entities
+                // Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
+                .replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
                 .replace(/\s+/g, ' ')
                 .trim().slice(0, 150);
         }
@@ -156,6 +158,12 @@ function heuristicExtractString(fieldName, content, pageUrl) {
         if (pageUrl)
             return pageUrl;
     }
+    // Director (for movies/films)
+    if (/director/.test(lf)) {
+        const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
+        if (m?.[1])
+            return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
+    }
     // Author/writer/by
     if (/author|writer|by/.test(lf)) {
         const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
@@ -339,9 +347,21 @@ async function heuristicExtract(content, schema) {
             fieldsFound++;
         data[field] = value;
     }
-    // Confidence: 0.3 base, up to 0.5 based on fill rate
+    // Confidence based on fill rate:
+    // - ALL fields null → 0.1 (extraction found nothing useful)
+    // - Some fields null → 0.3-0.5 based on fill ratio
+    // - ALL fields populated → 0.6-0.7 (heuristic max — values may still be imprecise)
     const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
-    const confidence = 0.3 + fillRate * 0.2;
+    let confidence;
+    if (fieldsFound === 0) {
+        confidence = 0.1; // All null — heuristic found nothing
+    }
+    else if (fieldsFound === totalFields) {
+        confidence = 0.65 + fillRate * 0.05; // 0.7 for fully populated heuristic
+    }
+    else {
+        confidence = 0.3 + fillRate * 0.2; // 0.3–0.5 based on fill ratio
+    }
     return {
         data,
         confidence: parseFloat(confidence.toFixed(2)),
@@ -395,12 +415,22 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
                 return heuristic;
             }
             const { data, missingRequired } = validateAndCoerce(parsed, schema);
-            // Confidence: 0.9 base, penalised for missing required fields
-            const penalty = missingRequired.length * 0.05;
+            // Confidence for LLM extraction:
+            // - ALL fields null → 0.1 (LLM couldn't extract anything)
+            // - Partial fill → 0.85+ (LLM is generally reliable when it finds data)
+            // - All populated → 0.90-0.98 based on fill rate
             const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
             const totalCount = Object.keys(schema.properties).length;
-            const fillBonus = totalCount > 0 ? (filledCount / totalCount) * 0.05 : 0;
-            const confidence = Math.max(0.5, Math.min(0.98, 0.9 + fillBonus - penalty));
+            const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
+            const penalty = missingRequired.length * 0.05;
+            let confidence;
+            if (filledCount === 0) {
+                confidence = 0.1; // LLM returned all nulls — extraction failed
+            }
+            else {
+                const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
+                confidence = Math.min(0.98, 0.85 + fillBonus - penalty); // 0.85–0.93+ for LLM
+            }
             return {
                 data,
                 confidence: parseFloat(confidence.toFixed(2)),

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.11",
+  "version": "0.21.13",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",