npm - webpeel - Versions diffs - 0.20.17 → 0.20.18 - Mend

webpeel 0.20.17 → 0.20.18

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (5) hide show

package/dist/core/domain-extractors.js +44 -2
package/dist/core/pipeline.js +50 -15
package/dist/core/youtube.d.ts +4 -0
package/dist/core/youtube.js +12 -0
package/package.json +1 -1

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -1169,14 +1169,37 @@ async function youtubeExtractor(_html, url) {
     const transcriptPromise = withTimeout(getYouTubeTranscript(url), 30000);
     const oembedPromise = fetchJson(`https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`);
     const noembedPromise = fetchJson(`https://noembed.com/embed?url=${encodeURIComponent(url)}`).catch(() => null);
-    const [transcriptResult, oembedResult, noembedResult] = await Promise.allSettled([
+    // Fetch subscriber count from channel page (lightweight, parallel)
+    const subscriberPromise = (async () => {
+        try {
+            // Wait for oEmbed to get channel URL, then fetch subscriber count from channel page
+            const oembed = await oembedPromise;
+            const channelUrl = oembed?.author_url;
+            if (!channelUrl)
+                return '';
+            const resp = await fetch(channelUrl, {
+                headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' },
+                signal: AbortSignal.timeout(5000),
+            });
+            const html = await resp.text();
+            // Look for subscriber count in page metadata (e.g. "4.12M subscribers")
+            const subMatch = html.match(/(\d+(?:\.\d+)?[KMBkmb]?)\s*subscribers/i);
+            return subMatch ? subMatch[1] + ' subscribers' : '';
+        }
+        catch {
+            return '';
+        }
+    })();
+    const [transcriptResult, oembedResult, noembedResult, subscriberResult] = await Promise.allSettled([
         transcriptPromise,
         oembedPromise,
         noembedPromise,
+        subscriberPromise,
     ]);
     const transcript = transcriptResult.status === 'fulfilled' ? transcriptResult.value : null;
     const oembedData = oembedResult.status === 'fulfilled' ? oembedResult.value : null;
     const noembedData = noembedResult.status === 'fulfilled' ? noembedResult.value : null;
+    const subscriberCount = subscriberResult.status === 'fulfilled' ? subscriberResult.value : '';
     if (process.env.DEBUG) {
         if (transcriptResult.status === 'rejected') {
             console.debug('[webpeel]', 'YouTube transcript failed:', transcriptResult.reason instanceof Error ? transcriptResult.reason.message : transcriptResult.reason);
@@ -1198,12 +1221,15 @@ async function youtubeExtractor(_html, url) {
             title,
             channel,
             channelUrl,
+            subscriberCount: subscriberCount || undefined,
             duration: transcript.duration,
             publishDate,
             language: transcript.language,
             availableLanguages: transcript.availableLanguages,
             transcriptSegments: transcript.segments.length,
             wordCount: transcript.wordCount ?? 0,
+            viewCount: transcript.viewCount ?? '',
+            likeCount: transcript.likeCount ?? '',
             description,
             thumbnailUrl,
             chapters: transcript.chapters ?? [],
@@ -1221,10 +1247,26 @@ async function youtubeExtractor(_html, url) {
                 publishStr = publishDate;
             }
         }
+        // Format view count (e.g. "1,234,567" → "1.2M views")
+        let viewStr = '';
+        if (transcript.viewCount) {
+            const v = parseInt(transcript.viewCount, 10);
+            if (!isNaN(v)) {
+                if (v >= 1_000_000)
+                    viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
+                else if (v >= 1_000)
+                    viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
+                else
+                    viewStr = `${v.toLocaleString()} views`;
+            }
+        }
         // Build header line
-        const headerParts = [`**Channel:** ${channel}`];
+        const channelPart = subscriberCount ? `${channel} (${subscriberCount})` : channel;
+        const headerParts = [`**Channel:** ${channelPart}`];
         if (transcript.duration && transcript.duration !== '0:00')
             headerParts.push(`**Duration:** ${transcript.duration}`);
+        if (viewStr)
+            headerParts.push(`**${viewStr}**`);
         if (publishStr)
             headerParts.push(`**Published:** ${publishStr}`);
         const headerLine = headerParts.join(' | ');

package/dist/core/pipeline.js CHANGED Viewed

@@ -161,22 +161,57 @@ export async function handleYouTube(ctx) {
         const transcript = await getYouTubeTranscript(ctx.url, {
             language: ctx.options.language ?? 'en',
         });
+        // Format view count
+        let viewStr = '';
+        if (transcript.viewCount) {
+            const v = parseInt(transcript.viewCount, 10);
+            if (!isNaN(v)) {
+                if (v >= 1_000_000)
+                    viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
+                else if (v >= 1_000)
+                    viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
+                else
+                    viewStr = `${v.toLocaleString()} views`;
+            }
+        }
+        // Format publish date
+        let publishStr = '';
+        if (transcript.publishDate) {
+            try {
+                const d = new Date(transcript.publishDate);
+                publishStr = d.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
+            }
+            catch {
+                publishStr = transcript.publishDate;
+            }
+        }
+        // Build header metadata line
+        const headerParts = [`**Channel:** ${transcript.channel}`];
+        if (transcript.duration && transcript.duration !== '0:00')
+            headerParts.push(`**Duration:** ${transcript.duration}`);
+        if (viewStr)
+            headerParts.push(`**${viewStr}**`);
+        if (publishStr)
+            headerParts.push(`**Published:** ${publishStr}`);
+        // Add paragraph breaks to transcript for readability
+        let readableText = transcript.fullText;
+        readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
+        readableText = readableText.replace(/\n{3,}/g, '\n\n');
         // Build a clean markdown representation of the video + transcript
-        const videoInfoLines = [
-            `# ${transcript.title}`,
-            '',
-            `**Channel:** ${transcript.channel}`,
-            `**Duration:** ${transcript.duration}`,
-            `**Language:** ${transcript.language}`,
-            transcript.availableLanguages.length > 1
-                ? `**Available Languages:** ${transcript.availableLanguages.join(', ')}`
-                : '',
-            '',
-            '## Transcript',
-            '',
-            transcript.fullText,
-        ].filter(l => l !== undefined);
-        const videoInfoContent = videoInfoLines.join('\n');
+        const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
+        if (transcript.summary) {
+            let summaryText = transcript.summary;
+            summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
+            parts.push(`## Summary\n\n${summaryText}`);
+        }
+        if (transcript.keyPoints && transcript.keyPoints.length > 0) {
+            parts.push(`## Key Points\n\n${transcript.keyPoints.map(kp => `- ${kp}`).join('\n')}`);
+        }
+        if (transcript.chapters && transcript.chapters.length > 0) {
+            parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
+        }
+        parts.push(`## Full Transcript\n\n${readableText}`);
+        const videoInfoContent = parts.join('\n\n');
         const elapsed = Date.now() - ytStartTime;
         const tokens = estimateTokens(videoInfoContent);
         const fingerprint = createHash('sha256').update(videoInfoContent).digest('hex').slice(0, 16);

package/dist/core/youtube.d.ts CHANGED Viewed

@@ -42,6 +42,10 @@ export interface YouTubeTranscript {
     summary?: string;
     /** Total word count of transcript */
     wordCount?: number;
+    /** View count (numeric string) */
+    viewCount?: string;
+    /** Like count (numeric string, may be empty) */
+    likeCount?: string;
 }
 export interface YouTubeVideoInfo {
     videoId: string;

package/dist/core/youtube.js CHANGED Viewed

@@ -417,6 +417,8 @@ async function getTranscriptViaProxy(videoId, preferredLang) {
             const chapters = parseChaptersFromDescription(description);
             const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
             const summary = extractSummary(fullText);
+            const viewCount = vd.viewCount ?? mf.viewCount ?? '';
+            const likeCount = vd.likeCount ?? '';
             console.log(`[webpeel] [youtube] Proxy slot ${slot} success: ${segments.length} segments, ${wordCount} words`);
             return {
                 videoId,
@@ -433,6 +435,8 @@ async function getTranscriptViaProxy(videoId, preferredLang) {
                 keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                 summary,
                 wordCount,
+                viewCount: viewCount || undefined,
+                likeCount: likeCount || undefined,
             };
         }
         catch (err) {
@@ -541,6 +545,8 @@ export async function getYouTubeTranscript(url, options = {}) {
                     keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                     summary,
                     wordCount,
+                    viewCount: undefined, // not available in this path without extra fetch
+                    likeCount: undefined,
                 };
             }
             console.log('[webpeel] [youtube] Path 0 returned empty segments');
@@ -649,6 +655,8 @@ export async function getYouTubeTranscript(url, options = {}) {
             keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
             summary,
             wordCount,
+            viewCount: (videoDetails.viewCount ?? microformat.viewCount ?? '') || undefined,
+            likeCount: (videoDetails.likeCount ?? '') || undefined,
         };
     }
     catch (err) {
@@ -761,6 +769,8 @@ async function getTranscriptViaYtDlp(videoId, preferredLang) {
                     keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                     summary,
                     wordCount,
+                    viewCount: (infoData.view_count?.toString() ?? '') || undefined,
+                    likeCount: (infoData.like_count?.toString() ?? '') || undefined,
                 });
             }
             catch {
@@ -887,6 +897,8 @@ async function getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang
             keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
             summary,
             wordCount,
+            viewCount: undefined, // browser path doesn't reliably get this
+            likeCount: undefined,
         };
     }
     finally {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.20.17",
+  "version": "0.20.18",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",