npm - webpeel - Versions diffs - 0.20.14 → 0.20.17 - Mend

webpeel 0.20.14 → 0.20.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/core/domain-extractors.js +12 -4
package/dist/core/youtube.js +232 -0
package/package.json +1 -1

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -1165,8 +1165,8 @@ async function youtubeExtractor(_html, url) {
         ]);
     }
     // Run transcript fetch and oEmbed fetch in parallel
-    // Browser-rendered fetch takes ~10s — use 15s timeout so browser has time to render
-    const transcriptPromise = withTimeout(getYouTubeTranscript(url), 15000);
+    // Proxy-based extraction takes 2-5s, but retry logic may need more time
+    const transcriptPromise = withTimeout(getYouTubeTranscript(url), 30000);
     const oembedPromise = fetchJson(`https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`);
     const noembedPromise = fetchJson(`https://noembed.com/embed?url=${encodeURIComponent(url)}`).catch(() => null);
     const [transcriptResult, oembedResult, noembedResult] = await Promise.allSettled([
@@ -1233,7 +1233,9 @@ async function youtubeExtractor(_html, url) {
         parts.push(headerLine);
         // Summary section
         if (transcript.summary && hasTranscript) {
-            parts.push(`## Summary\n\n${transcript.summary}`);
+            let summaryText = transcript.summary;
+            summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
+            parts.push(`## Summary\n\n${summaryText}`);
         }
         else if (!hasTranscript && transcript.fullText) {
             parts.push(`## Description\n\n${transcript.fullText}`);
@@ -1249,8 +1251,14 @@ async function youtubeExtractor(_html, url) {
             parts.push(`## Chapters\n\n${chLines}`);
         }
         // Full Transcript section (only if we have real transcript segments)
+        // Add intelligent paragraph breaks for readability
         if (hasTranscript) {
-            parts.push(`## Full Transcript\n\n${transcript.fullText}`);
+            let readableText = transcript.fullText;
+            // Break into paragraphs: after sentence-ending punctuation followed by a capital letter
+            readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
+            // Collapse any triple+ newlines
+            readableText = readableText.replace(/\n{3,}/g, '\n\n');
+            parts.push(`## Full Transcript\n\n${readableText}`);
         }
         const cleanContent = parts.join('\n\n');
         return { domain: 'youtube.com', type: 'video', structured, cleanContent };

package/dist/core/youtube.js CHANGED Viewed

@@ -6,6 +6,9 @@
  * track URLs, fetch the timedtext XML, and return structured transcript data.
  */
 import { execFile } from 'node:child_process';
+import * as http from 'node:http';
+import * as https from 'node:https';
+import * as tls from 'node:tls';
 import { readFile, unlink } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
 import { join } from 'node:path';
@@ -231,6 +234,217 @@ export function extractSummary(fullText) {
     return words.slice(0, 200).join(' ') + '...';
 }
 // ---------------------------------------------------------------------------
+// Proxy-based InnerTube transcript extraction
+// ---------------------------------------------------------------------------
+// Webshare residential proxy settings, read once from WEBSHARE_PROXY_* env vars at module load (set on Render per the original note).
+// Without credentials (typical for local runs) the proxy path is skipped and the other transcript paths are used instead.
+const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io'; // default host applies when the variable is unset or empty
+const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10); // port used for slot 1; NOTE(review): a non-numeric value becomes NaN and is not validated
+const PROXY_USER = process.env.WEBSHARE_PROXY_USER || ''; // base username; the slot suffix is appended per request
+const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || ''; // shared password sent in the Proxy-Authorization header
+// Slot addressing as described for the paid Webshare backbone plan: every US slot has a dedicated port.
+// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
+const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10); // upper bound for random slot selection; NOTE(review): also unvalidated (NaN or 0 would break slot picking)
+function isProxyConfigured() { // reports whether proxy credentials are available
+    return !!(PROXY_USER && PROXY_PASS); // true only when BOTH user and password are non-empty strings
+}
+/**
+ * Perform one HTTPS request tunnelled through the proxy at PROXY_HOST using an
+ * HTTP CONNECT request, authenticating with a specific slotted username
+ * (the caller in this file builds it as USER-US-N, e.g. USER-US-5).
+ * Reusing the same username and port for consecutive calls is how the caller
+ * keeps the /player call and the caption XML fetch on the same proxy slot.
+ *
+ * Flow: CONNECT to the proxy, then a TLS handshake over the returned socket,
+ * then a regular https request that reuses that TLS socket.
+ *
+ * @param {string} slottedUser - Proxy username including the slot suffix.
+ * @param {number} proxyPort - Proxy port for that slot.
+ * @param {string} targetUrl - Absolute URL to request; parsed with new URL().
+ * @param {object} [opts] - Optional settings read by this function:
+ *   timeoutMs (default 20000), method (default GET), headers (merged over the
+ *   built-in User-Agent / Accept-Language / Cookie defaults), body (written
+ *   to the request when truthy).
+ * @returns {Promise<{status: number, body: string}>} Resolves when the response
+ *   ends, with the status code (0 when missing) and the concatenated body.
+ *   Rejects on timeout, on a non-200 CONNECT reply, or on any error event
+ *   from the proxy request, the TLS socket, or the inner request.
+ *
+ * NOTE(review): the CONNECT path always targets port 443, so a port in
+ * targetUrl is ignored and only https targets are meaningful.
+ * NOTE(review): the timeout handler destroys only the CONNECT request; once
+ * the tunnel is up the TLS socket is not explicitly destroyed on timeout.
+ * Confirm whether that can leave a socket open.
+ * NOTE(review): chunks are appended with += and no setEncoding call, so a
+ * multi-byte character split across chunks may decode incorrectly. Verify.
+ */
+function proxyRequestSlotted(slottedUser, proxyPort, targetUrl, opts = {}) {
+    const url = new URL(targetUrl);
+    const timeout = opts.timeoutMs ?? 20000; // overall deadline covering CONNECT, TLS handshake and response
+    return new Promise((resolve, reject) => {
+        const proxyAuth = Buffer.from(`${slottedUser}:${PROXY_PASS}`).toString('base64'); // HTTP Basic credentials for the proxy
+        const proxyReq = http.request({
+            host: PROXY_HOST,
+            port: proxyPort,
+            method: 'CONNECT',
+            path: `${url.hostname}:443`,
+            headers: { 'Proxy-Authorization': `Basic ${proxyAuth}` },
+        });
+        const timer = setTimeout(() => {
+            proxyReq.destroy();
+            reject(new Error('Proxy request timed out'));
+        }, timeout); // every settle path below clears this timer first
+        proxyReq.on('connect', (res, socket) => {
+            if (res.statusCode !== 200) { // proxy refused the tunnel (for example bad credentials or slot)
+                clearTimeout(timer);
+                socket.destroy();
+                reject(new Error(`Proxy CONNECT failed: ${res.statusCode}`));
+                return;
+            }
+            const tlsSocket = tls.connect({ host: url.hostname, socket, servername: url.hostname }, () => { // TLS runs over the tunnel socket; servername sets SNI
+                const reqHeaders = {
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Accept-Language': 'en-US,en;q=0.9',
+                    'Cookie': 'CONSENT=YES+; SOCS=CAI',
+                    ...(opts.headers ?? {}), // caller headers override the defaults above
+                };
+                const req = https.request({
+                    hostname: url.hostname,
+                    path: url.pathname + url.search,
+                    method: opts.method ?? 'GET',
+                    createConnection: () => tlsSocket, // reuse the already established tunnel instead of dialing directly
+                    headers: reqHeaders,
+                }, (response) => {
+                    let data = ''; // response body accumulated as a string
+                    response.on('data', (chunk) => {
+                        data += chunk;
+                    });
+                    response.on('end', () => {
+                        clearTimeout(timer);
+                        resolve({ status: response.statusCode ?? 0, body: data }); // non-2xx statuses still resolve; callers inspect status
+                    });
+                });
+                req.on('error', (e) => {
+                    clearTimeout(timer);
+                    reject(e);
+                });
+                if (opts.body)
+                    req.write(opts.body);
+                req.end();
+            });
+            tlsSocket.on('error', (e) => {
+                clearTimeout(timer);
+                reject(e);
+            });
+        });
+        proxyReq.on('error', (e) => {
+            clearTimeout(timer);
+            reject(e);
+        });
+        proxyReq.end(); // sends the CONNECT request
+    });
+}
+/**
+ * Fetch a YouTube transcript via the InnerTube /player API through the
+ * Webshare proxy, trying up to MAX_RETRIES randomly chosen proxy slots.
+ *
+ * Per the original author, this mirrors the Python youtube-transcript-api
+ * approach. For each attempt:
+ * 1. POST to /youtubei/v1/player with an ANDROID client context.
+ * 2. Pick a caption track (preferred language, then en, then the first one)
+ *    and skip the attempt if its URL contains `exp=xpe`.
+ * 3. Fetch the caption XML through the same slot and parse it into segments.
+ *
+ * Every failed step logs a message and moves on to another slot; thrown
+ * errors (network failures, JSON.parse errors, and so on) are caught and
+ * treated the same way.
+ *
+ * @param {string} videoId - YouTube video id sent in the /player request.
+ * @param {string} preferredLang - Language code matched against each track languageCode.
+ * @returns {Promise<object|null>} Transcript object (videoId, title, channel,
+ *   duration, language, segments, fullText, availableLanguages, description,
+ *   publishDate, chapters, keyPoints, summary, wordCount), or null when no
+ *   attempt produced any segments.
+ *
+ * NOTE(review): segment timestamps are treated as seconds only when the
+ * attribute value contains a dot, otherwise as milliseconds. Whole-second
+ * values without a dot would be divided by 1000. Confirm against real
+ * caption XML in the start/dur format.
+ * NOTE(review): track.baseUrl is used without a presence check; a missing
+ * value throws inside the try block and simply consumes the attempt.
+ */
+async function getTranscriptViaProxy(videoId, preferredLang) {
+    // Each attempt uses a different proxy slot from the configured US pool.
+    // Slots are drawn at random (spreads load, sidesteps rate-limited IPs per
+    // the original note). At most MAX_RETRIES slots are tried per call.
+    const MAX_RETRIES = 5;
+    const usedSlots = new Set(); // slots already tried during this call
+    const INNERTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8';
+    for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
+        // Draw a random slot in 1..PROXY_MAX_US_SLOTS, redrawing while it was already used
+        let slot;
+        do {
+            slot = Math.floor(Math.random() * PROXY_MAX_US_SLOTS) + 1;
+        } while (usedSlots.has(slot) && usedSlots.size < PROXY_MAX_US_SLOTS); // size guard prevents an endless loop once every slot was used
+        usedSlots.add(slot);
+        const proxyUser = `${PROXY_USER}-US-${slot}`;
+        const proxyPort = PROXY_BASE_PORT + slot - 1; // slot 1 maps to the base port
+        const doProxyRequest = (url, opts = {}) => proxyRequestSlotted(proxyUser, proxyPort, url, opts); // binds this attempt to one slot
+        try {
+            // Step 1: Call InnerTube /player with ANDROID client
+            // Per the original note, ANDROID responses carry caption URLs without exp=xpe (which would yield 0-byte bodies).
+            const playerResp = await doProxyRequest(`https://www.youtube.com/youtubei/v1/player?key=${INNERTUBE_API_KEY}`, {
+                method: 'POST',
+                body: JSON.stringify({
+                    context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
+                    videoId,
+                }),
+                headers: { 'Content-Type': 'application/json' },
+            });
+            if (playerResp.status !== 200) {
+                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): /player returned ${playerResp.status}`);
+                continue;
+            }
+            const playerData = JSON.parse(playerResp.body); // a malformed body throws and is handled by the catch below
+            const captionTracks = playerData?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
+            if (!captionTracks || captionTracks.length === 0) {
+                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no caption tracks`);
+                continue;
+            }
+            // Track choice order: preferredLang, then en, then the first track listed
+            let track = captionTracks.find((t) => t.languageCode === preferredLang);
+            if (!track) {
+                track = captionTracks.find((t) => t.languageCode === 'en') ?? captionTracks[0];
+            }
+            const captionUrl = track.baseUrl;
+            if (captionUrl.includes('exp=xpe')) {
+                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption URL has exp=xpe, skipping`);
+                continue;
+            }
+            // Step 2: Fetch caption XML through the SAME proxy slot (same username and port as step 1)
+            const capResp = await doProxyRequest(captionUrl);
+            if (!capResp.body ||
+                capResp.body.length === 0 ||
+                capResp.status === 429 ||
+                capResp.body.includes('<title>Sorry...</title>')) {
+                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption XML failed (status=${capResp.status}, bytes=${capResp.body?.length ?? 0})`);
+                continue; // Try next slot
+            }
+            // Parse XML segments — handles both <text start="" dur=""> and <p t="" d=""> formats
+            const xmlSegments = [
+                ...capResp.body.matchAll(/<(?:text|p)\s[^>]*?(?:start|t)="([^"]*)"[^>]*?(?:dur|d)="([^"]*)"[^>]*>([\s\S]*?)<\/(?:text|p)>/g),
+            ];
+            if (xmlSegments.length === 0) {
+                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no segments parsed from XML`);
+                continue;
+            }
+            const segments = xmlSegments
+                .map((m) => ({
+                text: decodeHtmlEntities(m[3].replace(/<[^>]+>/g, '').replace(/\n/g, ' ').trim()),
+                start: parseFloat(m[1]) / (m[1].includes('.') ? 1 : 1000),
+                duration: parseFloat(m[2]) / (m[2].includes('.') ? 1 : 1000),
+            }))
+                .filter((s) => s.text.length > 0); // drop segments that are empty after tag stripping
+            if (segments.length === 0)
+                continue;
+            // Metadata comes from videoDetails, with the microformat renderer as fallback
+            const vd = playerData.videoDetails ?? {};
+            const mf = playerData.microformat?.playerMicroformatRenderer ?? {};
+            const title = vd.title ?? '';
+            const channel = vd.author ?? '';
+            const lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
+            const description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
+            const publishDate = mf.publishDate ?? mf.uploadDate ?? '';
+            const availableLanguages = captionTracks.map((t) => t.languageCode);
+            const fullText = segments.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim(); // single-spaced transcript text
+            const wordCount = fullText.split(/\s+/).filter(Boolean).length;
+            const chapters = parseChaptersFromDescription(description);
+            const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
+            const summary = extractSummary(fullText);
+            console.log(`[webpeel] [youtube] Proxy slot ${slot} success: ${segments.length} segments, ${wordCount} words`);
+            return {
+                videoId,
+                title,
+                channel,
+                duration: formatDuration(lengthSeconds),
+                language: track.languageCode ?? preferredLang,
+                segments,
+                fullText,
+                availableLanguages,
+                description,
+                publishDate,
+                chapters: chapters.length > 0 ? chapters : undefined, // empty lists are reported as undefined
+                keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
+                summary,
+                wordCount,
+            };
+        }
+        catch (err) {
+            console.log(`[webpeel] [youtube] Proxy slot ${slot} error:`, err?.message);
+            continue;
+        }
+    }
+    // Reached only when every attempt failed or was skipped
+    console.log('[webpeel] [youtube] All proxy slots exhausted');
+    return null;
+}
+// ---------------------------------------------------------------------------
 // Transcript extraction
 // ---------------------------------------------------------------------------
 /**
@@ -246,6 +460,24 @@ export async function getYouTubeTranscript(url, options = {}) {
     }
     const preferredLang = options.language ?? 'en';
     const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
+    // --- Path P: Proxy-based InnerTube (primary for cloud servers) ---
+    // Uses Webshare residential proxy + ANDROID InnerTube /player API.
+    // This is the approach used by every major YouTube transcript service
+    // (youtubetotranscript.com, youtube-transcript.io, etc.)
+    if (!process.env.VITEST && isProxyConfigured()) {
+        console.log('[webpeel] [youtube] Trying path P: proxy-based InnerTube (residential proxy)');
+        try {
+            const proxyResult = await getTranscriptViaProxy(videoId, preferredLang);
+            if (proxyResult && proxyResult.segments.length > 0) {
+                console.log(`[webpeel] [youtube] Path P success: ${proxyResult.segments.length} segments, ${proxyResult.wordCount} words`);
+                return proxyResult;
+            }
+            console.log('[webpeel] [youtube] Path P returned empty/null, falling through');
+        }
+        catch (err) {
+            console.log('[webpeel] [youtube] Path P failed:', err?.message);
+        }
+    }
     // --- Path 0: youtube-transcript-plus (fastest — uses InnerTube API, ~1s) ---
     // This library calls YouTube's internal InnerTube API directly via POST request,
     // bypassing the IP-locked timedtext XML URLs. Works reliably from cloud servers.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.20.14",
+  "version": "0.20.17",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",