npm - webpeel - Versions diffs - 0.21.11 → 0.21.12 - Mend

webpeel 0.21.11 → 0.21.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/core/domain-extractors.js +165 -17
package/dist/core/pipeline.js +20 -0
package/dist/core/structured-extract.js +36 -6
package/package.json +1 -1

package/dist/core/domain-extractors.js CHANGED

@@ -1518,12 +1518,27 @@ async function npmExtractor(_html, url) {
             modified: data.time?.modified || undefined,
         };
         // Include README if available (some packages have it, some don't)
-        const readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
+        let readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
+        // If no README in registry, try fetching from unpkg.com
+        if (!readmeText) {
+            try {
+                const unpkgUrl = `https://unpkg.com/${encodeURIComponent(packageName)}/README.md`;
+                const readmeResult = await simpleFetch(unpkgUrl, undefined, 10000);
+                if (readmeResult?.html && readmeResult.html.length > 10 && !readmeResult.html.trim().startsWith('<')) {
+                    readmeText = readmeResult.html.slice(0, 5000);
+                }
+            }
+            catch { /* README from unpkg optional */ }
+        }
         // Add to structured data
         structured.readme = readmeText;
         const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
+        // Show ALL dependencies (not capped at 15)
         const depsLine = structured.dependencies.length
-            ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.slice(0, 15).join(', ')}${structured.dependencies.length > 15 ? '...' : ''}`
+            ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.join(', ')}`
+            : '';
+        const devDepsLine = structured.devDependencies.length
+            ? `\n**Dev Dependencies (${structured.devDependencies.length}):** ${structured.devDependencies.slice(0, 10).join(', ')}${structured.devDependencies.length > 10 ? '...' : ''}`
             : '';
         const repoLine = structured.repository ? `\n**Repository:** ${structured.repository.replace('git+', '').replace('.git', '')}` : '';
         const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
@@ -1536,7 +1551,7 @@ async function npmExtractor(_html, url) {
 ${structured.description}
 **License:** ${structured.license} | **Weekly Downloads:** ${structured.weeklyDownloads?.toLocaleString() || 'N/A'}
-**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
+**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${devDepsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
         return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
     }
     catch (e) {
@@ -1832,7 +1847,27 @@ async function mediumExtractor(html, url) {
 async function substackExtractor(html, url) {
     try {
         const { load } = await import('cheerio');
-        const $ = load(html);
+        // Handle open.substack.com/pub/{publication}/p/{slug} redirect URLs
+        // These are share links that redirect to the actual post. Redirect to the real URL.
+        const urlObj = new URL(url);
+        let workingHtml = html;
+        let workingUrl = url;
+        if (urlObj.hostname === 'open.substack.com') {
+            const openMatch = urlObj.pathname.match(/\/pub\/([^/]+)\/p\/([^/]+)/);
+            if (openMatch) {
+                const [, publication, slug] = openMatch;
+                const actualUrl = `https://${publication}.substack.com/p/${slug}`;
+                try {
+                    const fetchResult = await simpleFetch(actualUrl, undefined, 15000);
+                    if (fetchResult?.html && fetchResult.html.length > 500) {
+                        workingHtml = fetchResult.html;
+                        workingUrl = actualUrl;
+                    }
+                }
+                catch { /* fall through with original HTML */ }
+            }
+        }
+        const $ = load(workingHtml);
         // JSON-LD
         let jsonLdData = null;
         $('script[type="application/ld+json"]').each((_, el) => {
@@ -1857,14 +1892,14 @@ async function substackExtractor(html, url) {
             $('meta[property="article:published_time"]').attr('content') ||
             $('time').first().attr('datetime') || '';
         const publication = $('meta[property="og:site_name"]').attr('content') ||
-            $('a.navbar-title-link').text().trim() || new URL(url).hostname.replace('.substack.com', '');
+            $('a.navbar-title-link').text().trim() || new URL(workingUrl).hostname.replace('.substack.com', '');
         const description = jsonLdData?.description ||
             $('meta[property="og:description"]').attr('content') || '';
-        // Article content
+        // Article content — try multiple Substack CSS patterns
         let articleBody = '';
-        const postContent = $('.body.markup, .post-content, article').first();
+        const postContent = $('.body.markup, .post-content, article, [class*="post-content"], .available-content').first();
         if (postContent.length) {
-            postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
+            postContent.find('script, style, nav, .paywall, .subscribe-widget, .subscription-widget').remove();
             const parts = [];
             postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                 const tag = el.name;
@@ -1884,19 +1919,35 @@ async function substackExtractor(html, url) {
             });
             articleBody = parts.join('\n\n');
         }
+        // If no article body found, try broader search
+        if (!articleBody) {
+            const parts = [];
+            $('main p, article p, [class*="content"] p').each((_, el) => {
+                const text = $(el).text().trim();
+                if (text && text.length > 20)
+                    parts.push(text);
+            });
+            articleBody = parts.slice(0, 20).join('\n\n');
+        }
         const contentBody = articleBody || description;
+        // Detect if the post appears paywalled (short content with no article body)
+        const isPaywalled = !articleBody && description.length > 0;
+        const paywallNote = isPaywalled
+            ? '\n\n---\n*⚠️ This post appears to be behind a paywall. Only the preview/description is available. Full content requires a subscription.*'
+            : '';
         const structured = {
             title,
             author,
             publication,
             publishDate,
             description,
-            url,
+            paywalled: isPaywalled,
+            url: workingUrl,
         };
         const authorLine = author ? `\n**Author:** ${author}` : '';
         const pubLine = publication ? `\n**Publication:** ${publication}` : '';
         const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
-        const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}`;
+        const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}${paywallNote}`;
         return { domain: 'substack.com', type: 'post', structured, cleanContent };
     }
     catch {
@@ -2071,11 +2122,20 @@ async function imdbExtractor(html, url) {
                 ? jsonLd.director.map((d) => d.name || d).join(', ')
                 : jsonLd.director?.name || String(jsonLd.director))
             : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
-        // Cast (top few from JSON-LD actor field)
-        const cast = jsonLd?.actor
+        // Cast — JSON-LD has top actors, also parse HTML for broader cast list
+        const castFromLd = jsonLd?.actor
             ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
-                .map((a) => a.name || a).slice(0, 6)
+                .map((a) => a.name || a)
             : [];
+        // Parse additional cast from HTML (IMDB cast section)
+        const castFromHtml = [];
+        // Try multiple IMDB cast selectors across page versions
+        $('[data-testid="title-cast-item"] a[href*="/name/nm"], a[data-testid*="cast"] span[class*="title"], .cast_list td.itemprop a').each((_, el) => {
+            const name = $(el).text().trim();
+            if (name && name.length > 1 && !castFromHtml.includes(name))
+                castFromHtml.push(name);
+        });
+        const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
         // Runtime
         const runtime = jsonLd?.duration
             ? (() => {
@@ -2085,17 +2145,56 @@ async function imdbExtractor(html, url) {
                 return String(jsonLd.duration);
             })()
             : '';
+        // Full plot/storyline — try to get the longer version from HTML
+        const fullPlot = $('[data-testid="storyline-plot-summary"] span, [data-testid="plot-xl"] span, span[data-testid="plot-l"], #titleStoryLine p, .plot_summary .summary_text').first().text().trim() || description;
+        // Additional details: Writers, Keywords, Awards
+        const writers = [];
+        $('[data-testid="title-pc-wide-screen"] li[data-testid="title-pc-principal-credit"]:nth-child(2) a, .credit_summary_item:contains("Writer") a').each((_, el) => {
+            const name = $(el).text().trim();
+            if (name && !writers.includes(name))
+                writers.push(name);
+        });
+        // Keywords — try HTML first, fall back to JSON-LD keywords
+        let keywords = [];
+        $('[data-testid="storyline-plot-keywords"] a, .see-more.inline.canwrap span a, a[href*="keyword"]').each((_, el) => {
+            const kw = $(el).text().trim();
+            if (kw && kw.length < 30 && !keywords.includes(kw))
+                keywords.push(kw);
+        });
+        // Fall back to JSON-LD keywords if HTML didn't yield any
+        if (!keywords.length && jsonLd?.keywords) {
+            keywords = (typeof jsonLd.keywords === 'string'
+                ? jsonLd.keywords.split(',')
+                : Array.isArray(jsonLd.keywords) ? jsonLd.keywords : []).map((k) => k.trim()).filter(Boolean);
+        }
+        // Writers — also try JSON-LD creator field
+        if (!writers.length && jsonLd?.creator) {
+            const creators = Array.isArray(jsonLd.creator) ? jsonLd.creator : [jsonLd.creator];
+            for (const c of creators) {
+                const name = c?.name || (typeof c === 'string' ? c : '');
+                if (name && !writers.includes(name))
+                    writers.push(name);
+            }
+        }
+        // Content rating & release date from JSON-LD
+        const contentRating = jsonLd?.contentRating || '';
+        const datePublished = jsonLd?.datePublished || '';
         const structured = {
-            title, year, contentType, description, ratingValue, ratingCount,
-            genres, director, cast, runtime, url,
+            title, year, contentType, description: fullPlot, ratingValue, ratingCount,
+            genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
         };
         const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
         const genreLine = genres.length ? genres.join(', ') : '';
         const directorLine = director ? `**Director:** ${director}` : '';
+        const writersLine = writers.length ? `**Writers:** ${writers.slice(0, 5).join(', ')}` : '';
         const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
         const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
+        const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
+        const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
+        const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
         const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
-        const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${directorLine ? directorLine + '\n' : ''}${castLine ? castLine + '\n' : ''}\n## Plot\n\n${description}`;
+        const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
+        const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
         return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
     }
     catch {
@@ -2245,17 +2344,34 @@ async function pypiExtractor(_html, url) {
             requiresDist: (info.requires_dist || []).slice(0, 20),
             classifiers: (info.classifiers || []).slice(0, 10),
         };
+        // Full description/README from PyPI (info.description is the full README in markdown)
+        const fullDescription = info.description && info.description.length > 100 &&
+            info.description !== 'UNKNOWN' && info.description !== info.summary
+            ? info.description.slice(0, 8000)
+            : null;
+        // Store full description in structured
+        structured.fullDescription = fullDescription;
         const installCmd = `pip install ${info.name}`;
         const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
         const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
+        // Show all dependencies
         const depsLine = structured.requiresDist.length
             ? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
             : '';
+        // Classifiers — extract useful ones (license, status, Python versions)
+        const usefulClassifiers = structured.classifiers.filter((c) => c.startsWith('Programming Language') || c.startsWith('License') || c.startsWith('Development Status'));
+        const classifiersSection = usefulClassifiers.length
+            ? `\n\n## Classifiers\n\n${usefulClassifiers.map((c) => `- ${c}`).join('\n')}`
+            : '';
         // Find project URLs
         const projectUrlLines = [];
         for (const [label, u] of Object.entries(structured.projectUrls)) {
             projectUrlLines.push(`- **${label}:** ${u}`);
         }
+        // Full description section (package README from PyPI)
+        const descSection = fullDescription
+            ? `\n\n## Description\n\n${fullDescription}`
+            : '';
         const cleanContent = `# 📦 ${info.name} ${info.version}
 ${info.summary || ''}
@@ -2266,7 +2382,7 @@ ${installCmd}
 **Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}
-${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}`;
+${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}${classifiersSection}${descSection}`;
         return { domain: 'pypi.org', type: 'package', structured, cleanContent };
     }
     catch (e) {
@@ -2289,6 +2405,38 @@ async function devtoExtractor(html, url) {
         const slug = pathParts.length >= 2
             ? pathParts.slice(0, 2).join('/').replace(/^@/, '')
             : null;
+        // Homepage: no slug → fetch recent top articles from Dev.to API
+        if (!slug) {
+            try {
+                const topArticles = await fetchJson('https://dev.to/api/articles?page=1&per_page=20&top=1');
+                if (Array.isArray(topArticles) && topArticles.length > 0) {
+                    const articles = topArticles.map((a) => ({
+                        title: a.title || '',
+                        author: a.user?.name || '',
+                        authorUsername: a.user?.username || '',
+                        tags: a.tag_list || [],
+                        reactions: a.public_reactions_count || 0,
+                        comments: a.comments_count || 0,
+                        readingTime: a.reading_time_minutes ? `${a.reading_time_minutes} min` : '',
+                        url: a.url || '',
+                        publishDate: a.published_at ? a.published_at.split('T')[0] : '',
+                    }));
+                    const listMd = articles.map((a, i) => {
+                        const tags = a.tags.length ? ` · #${a.tags.slice(0, 3).join(' #')}` : '';
+                        const stats = `❤️ ${a.reactions} | 💬 ${a.comments}${a.readingTime ? ` | ${a.readingTime}` : ''}`;
+                        return `${i + 1}. **[${a.title}](${a.url})**\n   by @${a.authorUsername}${tags}\n   ${stats} · ${a.publishDate}`;
+                    }).join('\n\n');
+                    const structured = {
+                        title: 'DEV Community — Top Articles',
+                        articles,
+                        fetchedAt: new Date().toISOString(),
+                    };
+                    const cleanContent = `# 🧑‍💻 DEV Community — Top Articles\n\n*${articles.length} articles from the community*\n\n${listMd}`;
+                    return { domain: 'dev.to', type: 'listing', structured, cleanContent };
+                }
+            }
+            catch { /* fall through to HTML */ }
+        }
         if (slug) {
             try {
                 const apiUrl = `https://dev.to/api/articles/${slug}`;

package/dist/core/pipeline.js CHANGED

@@ -410,6 +410,18 @@ export async function fetchContent(ctx) {
             }
             catch { /* Search fallback also failed — rethrow original BlockedError */ }
         }
+        // Enhance error messages with actionable advice
+        if (fetchError instanceof BlockedError) {
+            const actionableMsg = `${fetchError.message}\n\nThis site blocks automated access. Try using \`stealth: true\` and a residential proxy.`;
+            const enhancedError = new BlockedError(actionableMsg);
+            throw enhancedError;
+        }
+        const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
+        if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
+            const ms = ctx.timeout ?? 30000;
+            const enhancedMsg = `Request timed out after ${Math.round(ms / 1000)}s. This site may require browser rendering — try \`render: true\`.`;
+            throw new Error(enhancedMsg);
+        }
         throw fetchError;
     }
     const fetchDuration = ctx.timer.end('fetch');
@@ -1183,6 +1195,14 @@ export function buildResult(ctx) {
     let warning;
     const contentLen = ctx.content.length;
     const htmlLen = ctx.fetchResult?.html?.length || 0;
+    // Add contentQuality metadata for thin content (< 100 words)
+    const wordCount = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
+    if (wordCount < 100 && wordCount > 0) {
+        ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
+        if (ctx.metadata) {
+            ctx.metadata.contentQuality = 'thin';
+        }
+    }
     if (contentLen < 100 && htmlLen > 1000) {
         warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
     }

package/dist/core/structured-extract.js CHANGED

@@ -143,6 +143,8 @@ function heuristicExtractString(fieldName, content, pageUrl) {
                 .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
                 .replace(/[*_`[\]]/g, '')
                 .replace(/&[a-z]+;/g, '') // HTML entities
+                // Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
+                .replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
                 .replace(/\s+/g, ' ')
                 .trim().slice(0, 150);
         }
@@ -156,6 +158,12 @@ function heuristicExtractString(fieldName, content, pageUrl) {
         if (pageUrl)
             return pageUrl;
     }
+    // Director (for movies/films)
+    if (/director/.test(lf)) {
+        const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
+        if (m?.[1])
+            return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
+    }
     // Author/writer/by
     if (/author|writer|by/.test(lf)) {
         const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
@@ -339,9 +347,21 @@ async function heuristicExtract(content, schema) {
             fieldsFound++;
         data[field] = value;
     }
-    // Confidence: 0.3 base, up to 0.5 based on fill rate
+    // Confidence based on fill rate:
+    // - ALL fields null → 0.1 (extraction found nothing useful)
+    // - Some fields null → 0.3-0.5 based on fill ratio
+    // - ALL fields populated → 0.6-0.7 (heuristic max — values may still be imprecise)
     const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
-    const confidence = 0.3 + fillRate * 0.2;
+    let confidence;
+    if (fieldsFound === 0) {
+        confidence = 0.1; // All null — heuristic found nothing
+    }
+    else if (fieldsFound === totalFields) {
+        confidence = 0.65 + fillRate * 0.05; // 0.7 for fully populated heuristic
+    }
+    else {
+        confidence = 0.3 + fillRate * 0.2; // 0.3–0.5 based on fill ratio
+    }
     return {
         data,
         confidence: parseFloat(confidence.toFixed(2)),
@@ -395,12 +415,22 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
                 return heuristic;
             }
             const { data, missingRequired } = validateAndCoerce(parsed, schema);
-            // Confidence: 0.9 base, penalised for missing required fields
-            const penalty = missingRequired.length * 0.05;
+            // Confidence for LLM extraction:
+            // - ALL fields null → 0.1 (LLM couldn't extract anything)
+            // - Partial fill → 0.85+ (LLM is generally reliable when it finds data)
+            // - All populated → 0.90-0.98 based on fill rate
             const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
             const totalCount = Object.keys(schema.properties).length;
-            const fillBonus = totalCount > 0 ? (filledCount / totalCount) * 0.05 : 0;
-            const confidence = Math.max(0.5, Math.min(0.98, 0.9 + fillBonus - penalty));
+            const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
+            const penalty = missingRequired.length * 0.05;
+            let confidence;
+            if (filledCount === 0) {
+                confidence = 0.1; // LLM returned all nulls — extraction failed
+            }
+            else {
+                const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
+                confidence = Math.min(0.98, 0.85 + fillBonus - penalty); // 0.85–0.93+ for LLM
+            }
             return {
                 data,
                 confidence: parseFloat(confidence.toFixed(2)),

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.11",
+  "version": "0.21.12",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",