npm - webpeel - Versions diffs - 0.21.5 → 0.21.6 - Mend

webpeel 0.21.5 → 0.21.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/cli/utils.js +13 -1
package/dist/core/domain-extractors.js +20 -2
package/dist/core/pipeline.js +20 -3
package/package.json +1 -1

package/dist/cli/utils.js CHANGED Viewed

@@ -212,7 +212,19 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
     }
     if (!res.ok) {
         const body = await res.text().catch(() => '');
-        throw new Error(`API error ${res.status}: ${body.slice(0, 200)}`);
+        // Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
+        const isHtml = body.trimStart().startsWith('<');
+        let errorMsg;
+        if (res.status === 502 || res.status === 503 || res.status === 504) {
+            errorMsg = `Could not reach this website (gateway error)`;
+        }
+        else if (isHtml) {
+            errorMsg = `Server returned an error page`;
+        }
+        else {
+            errorMsg = body.slice(0, 200) || 'Unknown error';
+        }
+        throw new Error(`API error ${res.status}: ${errorMsg}`);
     }
     const data = await res.json();
     // Map API response to PeelResult shape that the CLI already handles

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -1274,9 +1274,27 @@ async function youtubeExtractor(_html, url) {
         const parts = [];
         parts.push(`# ${title}`);
         parts.push(headerLine);
+        /**
+         * Strip music note symbols from transcript/caption text.
+         * YouTube auto-captions include ♪ and 🎵 as music cues.
+         * Patterns cleaned:
+         *   [♪♪♪]  →  (removed)
+         *   ♪ text ♪  →  text
+         *   standalone ♪ / 🎵  →  (removed)
+         */
+        const cleanMusicNotes = (text) => text
+            // Remove bracketed music cues: [♪], [♪♪♪], [🎵🎵🎵], etc.
+            .replace(/\[[♪🎵]+\]/g, '')
+            // Unwrap ♪ text ♪ → text (keep the words between notes)
+            .replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
+            // Remove any remaining standalone ♪ or 🎵
+            .replace(/[♪🎵]+/g, '')
+            // Collapse extra whitespace introduced by removals
+            .replace(/\s{2,}/g, ' ')
+            .trim();
         // Summary section
         if (transcript.summary && hasTranscript) {
-            let summaryText = transcript.summary;
+            let summaryText = cleanMusicNotes(transcript.summary);
             summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
             parts.push(`## Summary\n\n${summaryText}`);
         }
@@ -1296,7 +1314,7 @@ async function youtubeExtractor(_html, url) {
         // Full Transcript section (only if we have real transcript segments)
         // Add intelligent paragraph breaks for readability
         if (hasTranscript) {
-            let readableText = transcript.fullText;
+            let readableText = cleanMusicNotes(transcript.fullText);
             // Break into paragraphs: after sentence-ending punctuation followed by a capital letter
             readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
             // Collapse any triple+ newlines

package/dist/core/pipeline.js CHANGED Viewed

@@ -193,19 +193,32 @@ export async function handleYouTube(ctx) {
             headerParts.push(`**${viewStr}**`);
         if (publishStr)
             headerParts.push(`**Published:** ${publishStr}`);
+        /**
+         * Strip music note symbols from YouTube auto-caption text.
+         * Cleans: [♪♪♪], [🎵🎵🎵], ♪ text ♪ (keeps inner text), standalone ♪ / 🎵
+         */
+        const cleanMusicNotes = (text) => text
+            .replace(/\[[♪🎵]+\]/g, '')
+            .replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
+            .replace(/[♪🎵]+/g, '')
+            .replace(/\s{2,}/g, ' ')
+            .trim();
         // Add paragraph breaks to transcript for readability
-        let readableText = transcript.fullText;
+        let readableText = cleanMusicNotes(transcript.fullText);
         readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
         readableText = readableText.replace(/\n{3,}/g, '\n\n');
         // Build a clean markdown representation of the video + transcript
         const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
         if (transcript.summary) {
-            let summaryText = transcript.summary;
+            let summaryText = cleanMusicNotes(transcript.summary);
             summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
             parts.push(`## Summary\n\n${summaryText}`);
         }
         if (transcript.keyPoints && transcript.keyPoints.length > 0) {
-            parts.push(`## Key Points\n\n${transcript.keyPoints.map(kp => `- ${kp}`).join('\n')}`);
+            const cleanedKps = transcript.keyPoints.map((kp) => cleanMusicNotes(kp)).filter((kp) => kp.length > 0);
+            if (cleanedKps.length > 0) {
+                parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
+            }
         }
         if (transcript.chapters && transcript.chapters.length > 0) {
             parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
@@ -927,6 +940,10 @@ export async function postProcess(ctx) {
             if (ddResult) {
                 ctx.domainData = ddResult;
                 ctx.content = ddResult.cleanContent;
+                // Update title from domain extractor (takes precedence over HTML page title)
+                if (ddResult.structured?.title) {
+                    ctx.title = ddResult.structured.title;
+                }
             }
         }
         catch (e) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.5",
+  "version": "0.21.6",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",