npm - webpeel - Versions diffs - 0.21.12 → 0.21.14 - Mend

webpeel 0.21.12 → 0.21.14

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (3)

package/dist/core/domain-extractors.js +115 -16
package/dist/core/pipeline.js +1 -0
package/package.json +1 -1

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -778,6 +778,10 @@ async function githubExtractor(_html, url) {
     if (pathParts.length === 0)
         return null;
     const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
+    // Use GITHUB_TOKEN if available for higher rate limits (5000/hr vs 60/hr)
+    const ghToken = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
+    if (ghToken)
+        ghHeaders.Authorization = `token ${ghToken}`;
     // User profile: /username (single segment)
     if (pathParts.length === 1) {
         const username = pathParts[0];
@@ -1795,15 +1799,57 @@ async function mediumExtractor(html, url) {
             $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
         const description = jsonLdData?.description ||
             $('meta[property="og:description"]').attr('content') || '';
+        // Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
+        let publication = '';
+        try {
+            const urlObj2 = new URL(url);
+            const hostname = urlObj2.hostname;
+            if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
+                publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
+            }
+        }
+        catch { /* ignore */ }
+        if (!publication) {
+            publication = $('[data-testid="publicationName"]').text().trim() ||
+                $('a[data-testid="publicationName"]').text().trim() ||
+                $('meta[property="article:section"]').attr('content') ||
+                $('a[href*="/tag/"]').first().text().trim() || '';
+        }
+        // Author bio — usually shown in an author card or bio section
+        const authorBio = $('[data-testid="authorBio"]').text().trim() ||
+            $('p[class*="bio"]').first().text().trim() ||
+            $('[aria-label="authorBio"]').text().trim() || '';
+        // Clap count — Medium shows clap button with count
+        let clapCount = '';
+        $('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
+            const txt = $(el).text().trim();
+            if (txt && /\d/.test(txt)) {
+                clapCount = txt;
+                return false;
+            }
+        });
+        if (!clapCount) {
+            // Fallback: find spans that look like clap counts (e.g., "2.4K")
+            $('span').filter((_, el) => {
+                const label = $(el).closest('[aria-label]').attr('aria-label') || '';
+                return label.toLowerCase().includes('clap');
+            }).each((_, el) => {
+                const txt = $(el).text().trim();
+                if (txt && /\d/.test(txt)) {
+                    clapCount = txt;
+                    return false;
+                }
+            });
+        }
         // Extract article body — Medium puts content in <article> or section
         let articleBody = '';
         const articleEl = $('article').first();
         if (articleEl.length) {
-            // Remove nav, aside, buttons
-            articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
+            // Remove nav, aside, buttons, author-card, footer sections
+            articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
             // Get paragraphs and headings
             const parts = [];
-            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
+            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
                 const tag = el.name;
                 const text = $(el).text().trim();
                 if (!text || text.length < 5)
@@ -1816,6 +1862,8 @@ async function mediumExtractor(html, url) {
                     parts.push(`> ${text}`);
                 else if (tag === 'pre')
                     parts.push('```\n' + text + '\n```');
+                else if (tag === 'figcaption')
+                    parts.push(`*${text}*`);
                 else
                     parts.push(text);
             });
@@ -1826,15 +1874,22 @@ async function mediumExtractor(html, url) {
         const structured = {
             title,
             author,
+            authorBio,
             publishDate,
             readingTime,
             description,
+            publication,
+            clapCount,
             url,
         };
         const authorLine = author ? `\n**Author:** ${author}` : '';
+        const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
         const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
         const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
-        const cleanContent = `# ${title}${authorLine}${dateLine}${timeLine}\n\n${contentBody.substring(0, 8000)}`;
+        const pubLine = publication ? `\n**Publication:** ${publication}` : '';
+        const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
+        // No hard character cap — let the pipeline's budget/maxTokens handle truncation
+        const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
         return { domain: 'medium.com', type: 'article', structured, cleanContent };
     }
     catch {
@@ -2122,20 +2177,37 @@ async function imdbExtractor(html, url) {
                 ? jsonLd.director.map((d) => d.name || d).join(', ')
                 : jsonLd.director?.name || String(jsonLd.director))
             : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
-        // Cast — JSON-LD has top actors, also parse HTML for broader cast list
+        // Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
+        const castPairs = [];
+        // IMDB new UI: each title-cast-item contains actor link + character link
+        $('[data-testid="title-cast-item"]').each((_, el) => {
+            const actorEl = $(el).find('a[href*="/name/nm"]').first();
+            const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
+            const actor = actorEl.text().trim();
+            // Character name may span multiple elements; clean whitespace
+            const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
+            if (actor && actor.length > 1) {
+                castPairs.push({ actor, character: character || '' });
+            }
+        });
+        // Fall back to classic cast list (older IMDB page versions)
+        const castFromHtml = [];
+        if (!castPairs.length) {
+            $('.cast_list td.itemprop a').each((_, el) => {
+                const name = $(el).text().trim();
+                if (name && name.length > 1 && !castFromHtml.includes(name))
+                    castFromHtml.push(name);
+            });
+        }
+        // JSON-LD actors as final fallback
         const castFromLd = jsonLd?.actor
             ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
                 .map((a) => a.name || a)
             : [];
-        // Parse additional cast from HTML (IMDB cast section)
-        const castFromHtml = [];
-        // Try multiple IMDB cast selectors across page versions
-        $('[data-testid="title-cast-item"] a[href*="/name/nm"], a[data-testid*="cast"] span[class*="title"], .cast_list td.itemprop a').each((_, el) => {
-            const name = $(el).text().trim();
-            if (name && name.length > 1 && !castFromHtml.includes(name))
-                castFromHtml.push(name);
-        });
-        const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
+        // Build final cast list: with characters if available (top 10), otherwise names only
+        const cast = castPairs.length > 0
+            ? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
+            : [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
         // Runtime
         const runtime = jsonLd?.duration
             ? (() => {
@@ -2176,12 +2248,37 @@ async function imdbExtractor(html, url) {
                     writers.push(name);
             }
         }
+        // Awards / accolades — try hero accolades chip, then any awards-related link text
+        let awardsSummary = '';
+        // IMDB new UI: awards accolades chip in the hero section
+        const accoladesEl = $('[data-testid="awards-accolades"]');
+        if (accoladesEl.length) {
+            awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
+        }
+        // Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
+        if (!awardsSummary) {
+            const titleMatch = url.match(/\/(tt\d+)/);
+            const titleId = titleMatch ? titleMatch[1] : '';
+            if (titleId) {
+                $(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
+                    const text = $(el).text().trim().replace(/\s+/g, ' ');
+                    if (text && text.length > 3 && text.length < 200) {
+                        awardsSummary = text;
+                        return false; // break
+                    }
+                });
+            }
+        }
+        // Fallback: JSON-LD award field
+        if (!awardsSummary && jsonLd?.award) {
+            awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
+        }
         // Content rating & release date from JSON-LD
         const contentRating = jsonLd?.contentRating || '';
         const datePublished = jsonLd?.datePublished || '';
         const structured = {
             title, year, contentType, description: fullPlot, ratingValue, ratingCount,
-            genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
+            genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
         };
         const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
         const genreLine = genres.length ? genres.join(', ') : '';
@@ -2192,8 +2289,9 @@ async function imdbExtractor(html, url) {
         const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
         const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
         const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
+        const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
         const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
-        const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
+        const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
         const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
         return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
     }
@@ -2331,6 +2429,7 @@ async function pypiExtractor(_html, url) {
             return null;
         const info = data.info;
         const structured = {
+            title: `${info.name} ${info.version}`,
             name: info.name,
             version: info.version,
             description: info.summary || '',

package/dist/core/pipeline.js CHANGED Viewed

@@ -344,6 +344,7 @@ export async function fetchContent(ctx) {
             blockResources: ctx.options.blockResources,
             cloaked: ctx.options.cloaked,
             cycle: ctx.options.cycle,
+            tls: ctx.options.tls,
             noEscalate: ctx.options.noEscalate,
         });
     }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.12",
+  "version": "0.21.14",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",