npm - webpeel - Versions diffs - 0.21.5 → 0.21.7 - Mend

webpeel 0.21.5 → 0.21.7

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (8)

package/dist/cli/utils.js +71 -10
package/dist/core/domain-extractors.js +20 -2
package/dist/core/pipeline.js +20 -3
package/dist/core/structured-extract.js +190 -23
package/dist/server/app.js +2 -2
package/dist/server/routes/fetch.js +76 -34
package/dist/types.d.ts +12 -0
package/package.json +1 -1

package/dist/cli/utils.js CHANGED Viewed

@@ -131,22 +131,30 @@ export function parseActions(actionStrings) {
  */
 export function formatError(error, _url, options) {
     const msg = error.message || String(error);
+    const errorType = error.errorType || '';
     const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
-    if (msg.includes('net::ERR_') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
-        lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
-    }
-    else if (msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
+    // Check structured errorType from API first (takes precedence over message heuristics)
+    if (errorType === 'timeout' || msg.includes('took too long') || msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
         lines.push('\x1b[33m💡 Try increasing timeout: --timeout 60000\x1b[0m');
         if (!options.render) {
             lines.push('\x1b[33m💡 Site may need browser rendering: --render\x1b[0m');
         }
     }
-    else if (msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
+    else if (errorType === 'blocked' || msg.includes('blocking automated') || msg.includes('bot protection') || msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
         if (!options.stealth) {
             lines.push('\x1b[33m💡 Try stealth mode to bypass bot detection: --stealth\x1b[0m');
         }
         lines.push('\x1b[33m💡 Try a different user agent: --ua "Mozilla/5.0..."\x1b[0m');
     }
+    else if (errorType === 'not_found' || msg.includes('domain may not exist') || msg.includes('not found') || msg.includes('ENOTFOUND') || msg.includes('net::ERR_') || msg.includes('ECONNREFUSED')) {
+        lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
+    }
+    else if (errorType === 'network' || msg.includes('Could not reach') || msg.includes('could not connect') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
+        lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
+    }
+    else if (errorType === 'server_error' || msg.includes('server error')) {
+        lines.push('\x1b[33m💡 The target site returned a server error. Try again in a moment.\x1b[0m');
+    }
     else if (msg.includes('empty') || msg.includes('no content') || msg.includes('0 tokens')) {
         if (!options.render) {
             lines.push('\x1b[33m💡 Page may be JavaScript-rendered. Try: --render\x1b[0m');
@@ -212,7 +220,40 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
     }
     if (!res.ok) {
         const body = await res.text().catch(() => '');
-        throw new Error(`API error ${res.status}: ${body.slice(0, 200)}`);
+        // Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
+        const isHtml = body.trimStart().startsWith('<') || body.includes('<!DOCTYPE') || body.includes('<html');
+        let errorMsg;
+        let errorType;
+        if (res.status === 502 || res.status === 503 || res.status === 504) {
+            errorMsg = `Could not reach this website. The site may be blocking our server or timing out.`;
+            errorType = res.status === 504 ? 'timeout' : 'network';
+        }
+        else if (isHtml) {
+            errorMsg = `Server returned an error page (${res.status})`;
+        }
+        else {
+            // Try to parse a structured JSON error response
+            try {
+                const json = JSON.parse(body);
+                const errObj = json?.error;
+                if (errObj && typeof errObj === 'object') {
+                    errorMsg = typeof errObj.message === 'string' ? errObj.message : (body.slice(0, 200) || 'Unknown error');
+                    if (typeof errObj.type === 'string')
+                        errorType = errObj.type;
+                }
+                else {
+                    errorMsg = body.slice(0, 200) || 'Unknown error';
+                }
+            }
+            catch {
+                errorMsg = body.slice(0, 200) || 'Unknown error';
+            }
+        }
+        const err = new Error(`${errorMsg}`);
+        if (errorType)
+            err.errorType = errorType;
+        err.statusCode = res.status;
+        throw err;
     }
     const data = await res.json();
     // Map API response to PeelResult shape that the CLI already handles
@@ -393,20 +434,40 @@ export function classifyErrorCode(error) {
     // Check for our custom _code first (set in pre-fetch validation)
     if (error._code)
         return error._code;
+    // Check for structured errorType from API responses (set by fetchViaApi)
+    const errorType = error.errorType;
+    if (errorType) {
+        const typeMap = {
+            timeout: 'TIMEOUT',
+            blocked: 'BLOCKED',
+            not_found: 'NOT_FOUND',
+            server_error: 'SERVER_ERROR',
+            network: 'NETWORK',
+            unknown: 'FETCH_FAILED',
+        };
+        if (typeMap[errorType])
+            return typeMap[errorType];
+    }
     const msg = error.message.toLowerCase();
     const name = error.name || '';
-    if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out')) {
+    if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out') || msg.includes('took too long')) {
         return 'TIMEOUT';
     }
-    if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
+    if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare') || msg.includes('bot protection')) {
         return 'BLOCKED';
     }
-    if (msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed') || msg.includes('not found')) {
-        return 'DNS_FAILED';
+    if (msg.includes('domain may not exist') || msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed')) {
+        return 'NOT_FOUND';
+    }
+    if (msg.includes('http 404') || msg.includes('page was not found')) {
+        return 'NOT_FOUND';
     }
     if (msg.includes('invalid url') || msg.includes('invalid hostname') || msg.includes('only http')) {
         return 'INVALID_URL';
     }
+    if (msg.includes('could not reach') || msg.includes('could not connect') || msg.includes('econnrefused')) {
+        return 'NETWORK';
+    }
     return 'FETCH_FAILED';
 }
 /**

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -1274,9 +1274,27 @@ async function youtubeExtractor(_html, url) {
         const parts = [];
         parts.push(`# ${title}`);
         parts.push(headerLine);
+        /**
+         * Strip music note symbols from transcript/caption text.
+         * YouTube auto-captions include ♪ and 🎵 as music cues.
+         * Patterns cleaned:
+         *   [♪♪♪]  →  (removed)
+         *   ♪ text ♪  →  text
+         *   standalone ♪ / 🎵  →  (removed)
+         */
+        const cleanMusicNotes = (text) => text
+            // Remove bracketed music cues: [♪], [♪♪♪], [🎵🎵🎵], etc.
+            .replace(/\[[♪🎵]+\]/g, '')
+            // Unwrap ♪ text ♪ → text (keep the words between notes)
+            .replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
+            // Remove any remaining standalone ♪ or 🎵
+            .replace(/[♪🎵]+/g, '')
+            // Collapse extra whitespace introduced by removals
+            .replace(/\s{2,}/g, ' ')
+            .trim();
         // Summary section
         if (transcript.summary && hasTranscript) {
-            let summaryText = transcript.summary;
+            let summaryText = cleanMusicNotes(transcript.summary);
             summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
             parts.push(`## Summary\n\n${summaryText}`);
         }
@@ -1296,7 +1314,7 @@ async function youtubeExtractor(_html, url) {
         // Full Transcript section (only if we have real transcript segments)
         // Add intelligent paragraph breaks for readability
         if (hasTranscript) {
-            let readableText = transcript.fullText;
+            let readableText = cleanMusicNotes(transcript.fullText);
             // Break into paragraphs: after sentence-ending punctuation followed by a capital letter
             readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
             // Collapse any triple+ newlines

package/dist/core/pipeline.js CHANGED Viewed

@@ -193,19 +193,32 @@ export async function handleYouTube(ctx) {
             headerParts.push(`**${viewStr}**`);
         if (publishStr)
             headerParts.push(`**Published:** ${publishStr}`);
+        /**
+         * Strip music note symbols from YouTube auto-caption text.
+         * Cleans: [♪♪♪], [🎵🎵🎵], ♪ text ♪ (keeps inner text), standalone ♪ / 🎵
+         */
+        const cleanMusicNotes = (text) => text
+            .replace(/\[[♪🎵]+\]/g, '')
+            .replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
+            .replace(/[♪🎵]+/g, '')
+            .replace(/\s{2,}/g, ' ')
+            .trim();
         // Add paragraph breaks to transcript for readability
-        let readableText = transcript.fullText;
+        let readableText = cleanMusicNotes(transcript.fullText);
         readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
         readableText = readableText.replace(/\n{3,}/g, '\n\n');
         // Build a clean markdown representation of the video + transcript
         const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
         if (transcript.summary) {
-            let summaryText = transcript.summary;
+            let summaryText = cleanMusicNotes(transcript.summary);
             summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
             parts.push(`## Summary\n\n${summaryText}`);
         }
         if (transcript.keyPoints && transcript.keyPoints.length > 0) {
-            parts.push(`## Key Points\n\n${transcript.keyPoints.map(kp => `- ${kp}`).join('\n')}`);
+            const cleanedKps = transcript.keyPoints.map((kp) => cleanMusicNotes(kp)).filter((kp) => kp.length > 0);
+            if (cleanedKps.length > 0) {
+                parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
+            }
         }
         if (transcript.chapters && transcript.chapters.length > 0) {
             parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
@@ -927,6 +940,10 @@ export async function postProcess(ctx) {
             if (ddResult) {
                 ctx.domainData = ddResult;
                 ctx.content = ddResult.cleanContent;
+                // Update title from domain extractor (takes precedence over HTML page title)
+                if (ddResult.structured?.title) {
+                    ctx.title = ddResult.structured.title;
+                }
             }
         }
         catch (e) {

package/dist/core/structured-extract.js CHANGED Viewed

@@ -86,56 +86,223 @@ function parseLLMJson(text) {
 /**
  * For string fields: search for field name in content, extract surrounding text.
  */
-function heuristicExtractString(fieldName, content) {
+/** Extract first H1 or page title from markdown content */
+function extractPageTitle(content) {
+    const h1 = content.match(/^#\s+(.+)$/m);
+    if (h1?.[1])
+        return h1[1].replace(/[*_`]/g, '').trim();
+    return null;
+}
+/** Extract meta description (after *X min read* pattern common in WebPeel output) */
+function extractDescription(content) {
+    // First paragraph after the title
+    const lines = content.split('\n').filter(l => l.trim());
+    let seenH1 = false;
+    for (const line of lines) {
+        if (line.startsWith('#')) {
+            seenH1 = true;
+            continue;
+        }
+        if (line.startsWith('*') && line.endsWith('*'))
+            continue; // byline
+        if (seenH1 && line.length > 30)
+            return line.replace(/[*_`]/g, '').trim().slice(0, 300);
+    }
+    return null;
+}
+/** Extract company/brand name from title (before " — ", " - ", " | ", " · ") */
+function extractCompanyFromTitle(title) {
+    const sep = title.match(/^([^|·\-—]+)[|·\-—]/);
+    if (sep?.[1])
+        return sep[1].trim();
+    return title.trim().slice(0, 60);
+}
+/** Smart field-name-aware string extractor */
+function heuristicExtractString(fieldName, content, pageUrl) {
+    const lf = fieldName.toLowerCase();
     const humanName = fieldName.replace(/_/g, ' ');
+    const title = extractPageTitle(content);
+    // --- Concept-aware extraction ---
+    // Company/brand/organization name
+    if (/company|brand|organization|org_name/.test(lf)) {
+        if (title)
+            return extractCompanyFromTitle(title);
+        // Fallback: extract from first heading of any level
+        const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
+        if (anyHeading?.[1])
+            return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
+    }
+    // Title/name/product → first H1 or any heading, stripped of markdown
+    if (/^(title|name|product_name|product|heading)$/.test(lf)) {
+        const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
+        if (rawTitle) {
+            // Strip markdown links [text](url) → text, badges ![...](url) → '', etc.
+            return rawTitle
+                .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images
+                .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
+                .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
+                .replace(/[*_`[\]]/g, '')
+                .replace(/&[a-z]+;/g, '') // HTML entities
+                .replace(/\s+/g, ' ')
+                .trim().slice(0, 150);
+        }
+    }
+    // Description/summary/about → first paragraph
+    if (/description|summary|about|overview/.test(lf)) {
+        return extractDescription(content) ?? null;
+    }
+    // URL/website/link → use the URL if we have it
+    if (/^(url|website|link|homepage|site)$/.test(lf)) {
+        if (pageUrl)
+            return pageUrl;
+    }
+    // Author/writer/by
+    if (/author|writer|by/.test(lf)) {
+        const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
+        if (m?.[1])
+            return m[1].trim().slice(0, 100);
+    }
+    // Date/published/updated
+    if (/date|published|updated|modified/.test(lf)) {
+        const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
+            ?? content.match(/(\d{4}-\d{2}-\d{2})/);
+        if (m?.[1])
+            return m[1];
+    }
+    // Email
+    if (/email|contact/.test(lf)) {
+        const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
+        if (m?.[0])
+            return m[0];
+    }
+    // Price/cost/pricing → extract value near $
+    if (/price|cost|pricing|fee/.test(lf)) {
+        const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
+            ?? content.match(/(free|no cost|no charge)/i);
+        if (m?.[0])
+            return m[0].trim();
+    }
+    // Language (for GitHub repos)
+    if (/language|lang|tech/.test(lf)) {
+        const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
+        if (m?.[1])
+            return m[1];
+    }
+    // Stars (for GitHub)
+    if (/stars?/.test(lf)) {
+        const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
+        if (m?.[1])
+            return m[1].replace(/,/g, '');
+    }
+    // License
+    if (/license/.test(lf)) {
+        const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
+        if (m?.[1])
+            return m[1];
+    }
+    // --- Generic patterns (exact-ish match) ---
     const patterns = [
-        // "field_name: value" or "Field Name: value" patterns
         new RegExp(`(?:^|\\n)[ \\t]*${humanName}[:\\s]+([^\\n]{5,200})`, 'i'),
-        // JSON-like "field": "value"
         new RegExp(`"${fieldName}"\\s*:\\s*"([^"]{1,300})"`, 'i'),
-        // Markdown bold **Field Name**: value
         new RegExp(`\\*{1,2}${humanName}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'),
-        // Heading followed by content
         new RegExp(`#+\\s*${humanName}\\s*\\n+([^\\n]{5,300})`, 'i'),
     ];
     for (const pattern of patterns) {
         const match = content.match(pattern);
-        if (match?.[1]) {
+        if (match?.[1])
             return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
-        }
     }
     return null;
 }
 /**
- * For boolean fields: search for positive/negative indicators near the field name.
+ * For boolean fields: search the ENTIRE content for positive/negative indicators.
  */
 function heuristicExtractBoolean(fieldName, content) {
-    const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
+    const lf = fieldName.toLowerCase();
     const ctx = content.toLowerCase();
-    // Search both underscore and spaced variants
-    let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
-    if (fieldIdx === -1)
-        fieldIdx = ctx.indexOf(humanName);
-    if (fieldIdx === -1)
-        return null;
-    // Look at a window of ±150 chars around the field name
-    const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
-    const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
-    const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
-    for (const pos of positive) {
-        if (window.includes(pos))
+    // Concept-aware boolean extraction — search entire content, not just near field name
+    // Free tier / free plan
+    if (/free_tier|has_free|is_free/.test(lf)) {
+        if (/free tier|free plan|\$0|no cost|no charge|free forever/.test(ctx))
             return true;
+        if (/no free|paid only|subscription required/.test(ctx))
+            return false;
     }
-    for (const neg of negative) {
-        if (window.includes(neg))
+    // Open source
+    if (/open_source|is_open|oss/.test(lf)) {
+        if (/open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
+            return true;
+        if (/closed[- ]source|proprietary|commercial license/.test(ctx))
             return false;
     }
+    // API availability
+    if (/has_api|api_available|has_rest/.test(lf)) {
+        if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
+            return true;
+    }
+    // Authentication
+    if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
+        if (/login|sign in|authentication|api key|bearer token/.test(ctx))
+            return true;
+    }
+    // General approach: search near field name concept
+    const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
+    let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
+    if (fieldIdx === -1)
+        fieldIdx = ctx.indexOf(humanName);
+    if (fieldIdx !== -1) {
+        const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
+        const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
+        const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
+        for (const pos of positive) {
+            if (window.includes(pos))
+                return true;
+        }
+        for (const neg of negative) {
+            if (window.includes(neg))
+                return false;
+        }
+    }
     return null;
 }
 /**
  * For number fields: find digits near the field name.
  */
 function heuristicExtractNumber(fieldName, content) {
+    const lf = fieldName.toLowerCase();
+    // Stars (GitHub)
+    if (/stars?/.test(lf)) {
+        const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
+        if (m?.[1]) {
+            const n = parseFloat(m[1].replace(/,/g, ''));
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Forks
+    if (/forks?/.test(lf)) {
+        const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
+        if (m?.[1]) {
+            const n = parseFloat(m[1].replace(/,/g, ''));
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Rating/score
+    if (/rating|score/.test(lf)) {
+        const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
+        if (m?.[1]) {
+            const n = parseFloat(m[1]);
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Year
+    if (/year/.test(lf)) {
+        const m = content.match(/\b(20\d{2})\b/);
+        if (m?.[1]) {
+            const n = parseInt(m[1]);
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Generic: find number near field name
     const humanName = fieldName.replace(/_/g, '[\\s_-]*');
     const pattern = new RegExp(`${humanName}[:\\s$]*([\\d,]+\\.?\\d*)`, 'i');
     const match = content.match(pattern);

package/dist/server/app.js CHANGED Viewed

@@ -106,8 +106,8 @@ export function createApp(config = {}) {
             timeoutMs = 120000; // 2min for batch
         else if (path.includes('/screenshot'))
             timeoutMs = 60000; // 1min for screenshots
-        else if (req.query?.render === 'true')
-            timeoutMs = 60000; // 1min for rendered fetches
+        else if (req.query?.render === 'true' || req.query?.stealth === 'true')
+            timeoutMs = 60000; // 1min for browser/stealth fetches
         else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
             timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
         req.setTimeout(timeoutMs);

package/dist/server/routes/fetch.js CHANGED Viewed

@@ -13,6 +13,52 @@ import { getSchemaTemplate } from '../../core/schema-templates.js';
 import { quickAnswer } from '../../core/quick-answer.js';
 import { sendUsageAlertEmail } from '../email-service.js';
 import { extractLinks } from '../../core/links.js';
+// ── Helper: classify an error thrown by peel() into a FetchErrorType ─────────
+function classifyFetchError(err) {
+    const code = err.code || err.name || '';
+    const msg = (err.message || '').toLowerCase();
+    if (code === 'TIMEOUT' || msg.includes('timeout') || msg.includes('timed out')) {
+        return 'timeout';
+    }
+    if (code === 'BLOCKED' || msg.includes('blocked') || msg.includes('cloudflare challenge') || msg.includes('captcha') || msg.includes('bot detection')) {
+        return 'blocked';
+    }
+    if (msg.includes('http 404') || msg.includes('not found') || msg.includes('dns resolution failed') || msg.includes('enotfound') || msg.includes('getaddrinfo')) {
+        return 'not_found';
+    }
+    if (msg.match(/http\s+5\d{2}/) || msg.includes('server error') || msg.includes('internal server')) {
+        return 'server_error';
+    }
+    if (code === 'NETWORK' || msg.includes('network') || msg.includes('econnrefused') || msg.includes('connection refused') || msg.includes('connection reset')) {
+        return 'network';
+    }
+    return 'unknown';
+}
+// ── Helper: build a clean, user-facing error message from a peel() error ─────
+function buildFetchErrorMessage(err) {
+    const type = classifyFetchError(err);
+    const hints = {
+        timeout: 'Try increasing timeout with ?timeout=20000, or use render=true for JS-heavy sites.',
+        blocked: 'This site blocks automated requests. Try render=true or stealth=true.',
+        not_found: 'Verify the URL is correct and the site is accessible.',
+        server_error: 'The target site returned a server error. Try again later.',
+        network: 'Could not connect to the target URL. Verify the URL is correct and the site is online.',
+        unknown: undefined,
+    };
+    // Sanitize message: strip HTML chars, truncate
+    const safeMsg = (err.message || 'An unexpected error occurred while fetching the URL')
+        .replace(/[<>"']/g, '')
+        .trim();
+    const messages = {
+        timeout: `The website took too long to respond. Try with render=true or stealth=true for JavaScript-heavy sites.`,
+        blocked: `This website is blocking automated access (bot protection detected).`,
+        not_found: `The URL could not be reached — the domain may not exist or the page was not found.`,
+        server_error: `The target website returned a server error while processing the request.`,
+        network: `Could not reach this website. The server may be down or the URL may be incorrect.`,
+        unknown: safeMsg,
+    };
+    return { type, message: messages[type] || safeMsg, hint: hints[type] };
+}
 // ── Helper: extractive summarizer (TF-IDF-like sentence scoring) ─────────────
 function extractSummary(content, maxWords = 150) {
     if (!content)
@@ -527,26 +573,24 @@ export function createFetchRouter(authStore) {
                 });
             }
             // SECURITY: Sanitize error messages to prevent information disclosure
-            if (err.code) {
+            if (res.headersSent)
+                return; // Timeout middleware already responded
+            const requestUrl = req.query.url;
+            if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
                 // WebPeelError from core library - safe to expose with helpful context
-                if (res.headersSent)
-                    return; // Timeout middleware already responded
-                const safeMessage = err.message.replace(/[<>"']/g, ''); // Remove HTML chars
-                const statusCode = err.code === 'TIMEOUT' ? 504
-                    : err.code === 'BLOCKED' ? 403
-                        : err.code === 'NETWORK' ? 502
-                            : 500;
-                const hints = {
-                    TIMEOUT: 'Try increasing timeout with ?wait=10000, or use render=true for JS-heavy sites.',
-                    BLOCKED: 'This site blocks automated requests. Try adding render=true or use stealth mode (costs 5 credits).',
-                    NETWORK: 'Could not reach the target URL. Verify the URL is correct and the site is online.',
-                };
+                const { type, message, hint } = buildFetchErrorMessage(err);
+                const statusCode = type === 'timeout' ? 504
+                    : type === 'blocked' ? 403
+                        : type === 'not_found' ? 404
+                            : type === 'network' || type === 'server_error' ? 502
+                                : 500;
                 res.status(statusCode).json({
                     success: false,
                     error: {
-                        type: err.code,
-                        message: safeMessage,
-                        hint: hints[err.code] || undefined,
+                        type,
+                        message,
+                        url: requestUrl,
+                        ...(hint ? { hint } : {}),
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,
@@ -555,13 +599,12 @@ export function createFetchRouter(authStore) {
             else {
                 // Unexpected error - generic message only
                 console.error('Fetch error:', err); // Log full error server-side
-                if (res.headersSent)
-                    return; // Timeout middleware already responded
                 res.status(500).json({
                     success: false,
                     error: {
-                        type: 'internal_error',
+                        type: 'unknown',
                         message: 'An unexpected error occurred while fetching the URL. If this persists, check https://webpeel.dev/status',
+                        url: requestUrl,
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,
@@ -1028,23 +1071,21 @@ export function createFetchRouter(authStore) {
             console.error('POST fetch/scrape error:', err);
             if (res.headersSent)
                 return; // Timeout middleware already responded
-            if (err.code) {
-                const safeMessage = err.message.replace(/[<>"']/g, '');
-                const statusCode = err.code === 'TIMEOUT' ? 504
-                    : err.code === 'BLOCKED' ? 403
-                        : err.code === 'NETWORK' ? 502
-                            : 500;
-                const hints = {
-                    TIMEOUT: 'Try increasing timeout, or set render:true for JS-heavy sites.',
-                    BLOCKED: 'Site blocks automated requests. Try render:true or stealth mode.',
-                    NETWORK: 'Could not reach the target URL. Verify it is correct and online.',
-                };
+            const postUrl = req.body?.url;
+            if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
+                const { type, message, hint } = buildFetchErrorMessage(err);
+                const statusCode = type === 'timeout' ? 504
+                    : type === 'blocked' ? 403
+                        : type === 'not_found' ? 404
+                            : type === 'network' || type === 'server_error' ? 502
+                                : 500;
                 res.status(statusCode).json({
                     success: false,
                     error: {
-                        type: err.code,
-                        message: safeMessage,
-                        hint: hints[err.code] || undefined,
+                        type,
+                        message,
+                        url: postUrl,
+                        ...(hint ? { hint } : {}),
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,
@@ -1054,8 +1095,9 @@ export function createFetchRouter(authStore) {
                 res.status(500).json({
                     success: false,
                     error: {
-                        type: 'internal_error',
+                        type: 'unknown',
                         message: 'An unexpected error occurred. If this persists, check https://webpeel.dev/status',
+                        url: postUrl,
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,

package/dist/types.d.ts CHANGED Viewed

@@ -419,6 +419,18 @@ export interface PeelEnvelope {
      */
     totalAvailable?: number;
 }
+/**
+ * Programmatic error classification for fetch failures.
+ * Returned in the `error.type` field of API error responses.
+ *
+ * - `timeout`      — Site took too long to respond
+ * - `blocked`      — Site actively blocked the request (403, CAPTCHA, bot detection)
+ * - `not_found`    — 404 or the domain/URL does not exist
+ * - `server_error` — Target site returned a 5xx error
+ * - `network`      — DNS failure, connection refused, or other network-level issue
+ * - `unknown`      — Unclassified error
+ */
+export type FetchErrorType = 'timeout' | 'blocked' | 'not_found' | 'server_error' | 'network' | 'unknown';
 export declare class WebPeelError extends Error {
     code?: string | undefined;
     constructor(message: string, code?: string | undefined);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.5",
+  "version": "0.21.7",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",