npm - webpeel - Versions diffs - 0.21.6 → 0.21.8 - Mend

webpeel 0.21.6 → 0.21.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/dist/cli/utils.js +63 -14
package/dist/core/proxy-config.d.ts +55 -0
package/dist/core/proxy-config.js +79 -0
package/dist/core/search-provider.js +7 -0
package/dist/core/strategies.js +8 -2
package/dist/core/structured-extract.js +190 -23
package/dist/core/youtube.js +6 -2
package/dist/server/app.js +2 -2
package/dist/server/routes/fetch.js +76 -34
package/dist/types.d.ts +12 -0
package/package.json +1 -1

package/dist/cli/utils.js CHANGED Viewed

@@ -131,22 +131,30 @@ export function parseActions(actionStrings) {
  */
 export function formatError(error, _url, options) {
     const msg = error.message || String(error);
+    const errorType = error.errorType || '';
     const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
-    if (msg.includes('net::ERR_') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
-        lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
-    }
-    else if (msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
+    // Check structured errorType from API first (takes precedence over message heuristics)
+    if (errorType === 'timeout' || msg.includes('took too long') || msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
         lines.push('\x1b[33m💡 Try increasing timeout: --timeout 60000\x1b[0m');
         if (!options.render) {
             lines.push('\x1b[33m💡 Site may need browser rendering: --render\x1b[0m');
         }
     }
-    else if (msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
+    else if (errorType === 'blocked' || msg.includes('blocking automated') || msg.includes('bot protection') || msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
         if (!options.stealth) {
             lines.push('\x1b[33m💡 Try stealth mode to bypass bot detection: --stealth\x1b[0m');
         }
         lines.push('\x1b[33m💡 Try a different user agent: --ua "Mozilla/5.0..."\x1b[0m');
     }
+    else if (errorType === 'not_found' || msg.includes('domain may not exist') || msg.includes('not found') || msg.includes('ENOTFOUND') || msg.includes('net::ERR_') || msg.includes('ECONNREFUSED')) {
+        lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
+    }
+    else if (errorType === 'network' || msg.includes('Could not reach') || msg.includes('could not connect') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
+        lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
+    }
+    else if (errorType === 'server_error' || msg.includes('server error')) {
+        lines.push('\x1b[33m💡 The target site returned a server error. Try again in a moment.\x1b[0m');
+    }
     else if (msg.includes('empty') || msg.includes('no content') || msg.includes('0 tokens')) {
         if (!options.render) {
             lines.push('\x1b[33m💡 Page may be JavaScript-rendered. Try: --render\x1b[0m');
@@ -213,18 +221,39 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
     if (!res.ok) {
         const body = await res.text().catch(() => '');
         // Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
-        const isHtml = body.trimStart().startsWith('<');
+        const isHtml = body.trimStart().startsWith('<') || body.includes('<!DOCTYPE') || body.includes('<html');
         let errorMsg;
+        let errorType;
         if (res.status === 502 || res.status === 503 || res.status === 504) {
-            errorMsg = `Could not reach this website (gateway error)`;
+            errorMsg = `Could not reach this website. The site may be blocking our server or timing out.`;
+            errorType = res.status === 504 ? 'timeout' : 'network';
         }
         else if (isHtml) {
-            errorMsg = `Server returned an error page`;
+            errorMsg = `Server returned an error page (${res.status})`;
         }
         else {
-            errorMsg = body.slice(0, 200) || 'Unknown error';
+            // Try to parse a structured JSON error response
+            try {
+                const json = JSON.parse(body);
+                const errObj = json?.error;
+                if (errObj && typeof errObj === 'object') {
+                    errorMsg = typeof errObj.message === 'string' ? errObj.message : (body.slice(0, 200) || 'Unknown error');
+                    if (typeof errObj.type === 'string')
+                        errorType = errObj.type;
+                }
+                else {
+                    errorMsg = body.slice(0, 200) || 'Unknown error';
+                }
+            }
+            catch {
+                errorMsg = body.slice(0, 200) || 'Unknown error';
+            }
         }
-        throw new Error(`API error ${res.status}: ${errorMsg}`);
+        const err = new Error(`${errorMsg}`);
+        if (errorType)
+            err.errorType = errorType;
+        err.statusCode = res.status;
+        throw err;
     }
     const data = await res.json();
     // Map API response to PeelResult shape that the CLI already handles
@@ -405,20 +434,40 @@ export function classifyErrorCode(error) {
     // Check for our custom _code first (set in pre-fetch validation)
     if (error._code)
         return error._code;
+    // Check for structured errorType from API responses (set by fetchViaApi)
+    const errorType = error.errorType;
+    if (errorType) {
+        const typeMap = {
+            timeout: 'TIMEOUT',
+            blocked: 'BLOCKED',
+            not_found: 'NOT_FOUND',
+            server_error: 'SERVER_ERROR',
+            network: 'NETWORK',
+            unknown: 'FETCH_FAILED',
+        };
+        if (typeMap[errorType])
+            return typeMap[errorType];
+    }
     const msg = error.message.toLowerCase();
     const name = error.name || '';
-    if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out')) {
+    if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out') || msg.includes('took too long')) {
         return 'TIMEOUT';
     }
-    if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
+    if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare') || msg.includes('bot protection')) {
         return 'BLOCKED';
     }
-    if (msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed') || msg.includes('not found')) {
-        return 'DNS_FAILED';
+    if (msg.includes('domain may not exist') || msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed')) {
+        return 'NOT_FOUND';
+    }
+    if (msg.includes('http 404') || msg.includes('page was not found')) {
+        return 'NOT_FOUND';
     }
     if (msg.includes('invalid url') || msg.includes('invalid hostname') || msg.includes('only http')) {
         return 'INVALID_URL';
     }
+    if (msg.includes('could not reach') || msg.includes('could not connect') || msg.includes('econnrefused')) {
+        return 'NETWORK';
+    }
     return 'FETCH_FAILED';
 }
 /**

package/dist/core/proxy-config.d.ts ADDED Viewed

@@ -0,0 +1,55 @@
+/**
+ * Shared Webshare residential proxy configuration.
+ *
+ * WebPeel uses Webshare residential proxies (configured via env vars) to route
+ * requests through US residential IPs, bypassing datacenter IP blocks from
+ * DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
+ *
+ * Proxy credentials are loaded from environment variables:
+ *   WEBSHARE_PROXY_HOST  — proxy hostname (e.g. p.webshare.io)
+ *   WEBSHARE_PROXY_PORT  — base port number (e.g. 10000)
+ *   WEBSHARE_PROXY_USER  — proxy username (without slot suffix)
+ *   WEBSHARE_PROXY_PASS  — proxy password
+ *   WEBSHARE_PROXY_SLOTS — number of available US residential slots
+ *
+ * With the Webshare backbone plan each US slot has its own port:
+ *   slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
+ */
+export interface ProxyConfig { // One proxy endpoint plus the credentials used to authenticate against it.
+    /**
+     * Proxy server URL in the format "http://host:port".
+     * getWebshareProxy() always builds this with the `http` scheme and a port
+     * of WEBSHARE_PROXY_PORT + slot - 1.
+     */
+    server: string;
+    /**
+     * Proxy username (includes slot suffix, e.g. "user-US-42").
+     * getWebshareProxy() builds it as WEBSHARE_PROXY_USER + "-US-" + slot.
+     */
+    username: string;
+    /** Proxy password; getWebshareProxy() passes WEBSHARE_PROXY_PASS through unchanged. */
+    password: string;
+}
+/**
+ * Get a random Webshare residential proxy config.
+ * Returns null if the proxy is not configured (env vars missing or slots = 0).
+ *
+ * Uses random slot selection across all available US slots for even load
+ * distribution — same approach as youtube.ts proxyRequestSlotted().
+ *
+ * The implementation defaults WEBSHARE_PROXY_SLOTS to 0 when it is unset, so
+ * the slot count must be set to a positive integer for a config to be
+ * returned; host, user and pass alone are not enough. WEBSHARE_PROXY_PORT
+ * defaults to 10000.
+ *
+ * @returns A config for one slot in 1..WEBSHARE_PROXY_SLOTS, or null.
+ */
+export declare function getWebshareProxy(): ProxyConfig | null;
+/**
+ * Check if Webshare proxies are configured (env vars are present and non-empty).
+ * Does NOT guarantee the proxy is reachable — just that credentials are set.
+ *
+ * Only WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER and WEBSHARE_PROXY_PASS are
+ * checked. WEBSHARE_PROXY_SLOTS is not, so this can return true while
+ * getWebshareProxy() still returns null (slot count unset or 0).
+ *
+ * @returns true when all three variables are set to non-empty strings.
+ */
+export declare function hasWebshareProxy(): boolean;
+/**
+ * Convert a ProxyConfig to a Playwright-compatible proxy object.
+ * Useful for passing directly to browser.newContext({ proxy: ... }).
+ *
+ * The result is a new object holding the same three fields; the input is not
+ * modified.
+ *
+ * @param config Proxy settings, e.g. the value returned by getWebshareProxy().
+ * @returns A fresh object with `server`, `username` and `password`.
+ */
+export declare function toPlaywrightProxy(config: ProxyConfig): {
+    server: string;
+    username: string;
+    password: string;
+};
+/**
+ * Get a random Webshare proxy as a fully-qualified URL string with embedded
+ * credentials. The format is: `http://username:password@host:port`
+ *
+ * Useful for passing to strategies.ts proxy option (which expects a URL string).
+ * Returns null if proxies are not configured.
+ *
+ * Username and password are percent-encoded with encodeURIComponent before
+ * being embedded. Each call selects a slot independently, so two calls may
+ * return URLs for different slots.
+ *
+ * @returns The proxy URL, or null when getWebshareProxy() returns null or the
+ *          server string cannot be parsed as a URL.
+ */
+export declare function getWebshareProxyUrl(): string | null;

package/dist/core/proxy-config.js ADDED Viewed

@@ -0,0 +1,79 @@
+/**
+ * Shared Webshare residential proxy configuration.
+ *
+ * WebPeel uses Webshare residential proxies (configured via env vars) to route
+ * requests through US residential IPs, bypassing datacenter IP blocks from
+ * DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
+ *
+ * Proxy credentials are loaded from environment variables:
+ *   WEBSHARE_PROXY_HOST  — proxy hostname (e.g. p.webshare.io)
+ *   WEBSHARE_PROXY_PORT  — base port number (e.g. 10000)
+ *   WEBSHARE_PROXY_USER  — proxy username (without slot suffix)
+ *   WEBSHARE_PROXY_PASS  — proxy password
+ *   WEBSHARE_PROXY_SLOTS — number of available US residential slots
+ *
+ * With the Webshare backbone plan each US slot has its own port:
+ *   slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
+ */
+/**
+ * Get a random Webshare residential proxy config.
+ * Returns null if the proxy is not configured (env vars missing or slots = 0).
+ *
+ * Uses random slot selection across all available US slots for even load
+ * distribution — same approach as youtube.ts proxyRequestSlotted().
+ *
+ * The environment is read on every call, and the slot is drawn with
+ * Math.random(), so consecutive calls may return different slots.
+ *
+ * NOTE(review): a non-numeric WEBSHARE_PROXY_SLOTS or WEBSHARE_PROXY_PORT makes
+ * parseInt return NaN. `NaN <= 0` is false, so the guard below does not catch
+ * it and the returned `server` and/or `username` will contain "NaN". Consider
+ * validating both values with Number.isInteger before use.
+ *
+ * NOTE(review): youtube.js defaults WEBSHARE_PROXY_SLOTS to 44744 when unset,
+ * whereas this function defaults it to 0 — the two modules disagree on the
+ * slot count when the variable is missing. Confirm which default is intended.
+ *
+ * @returns An object with `server`, `username` and `password` for one slot,
+ *          or null when host, user, pass or a positive slot count is missing.
+ */
+export function getWebshareProxy() {
+    const host = process.env.WEBSHARE_PROXY_HOST;
+    const user = process.env.WEBSHARE_PROXY_USER;
+    const pass = process.env.WEBSHARE_PROXY_PASS;
+    const basePort = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10); // port used by slot 1
+    const slots = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '0', 10); // unset or empty → 0 → "not configured"
+    if (!host || !user || !pass || slots <= 0)
+        return null;
+    const slot = Math.floor(Math.random() * slots) + 1; // integer in 1..slots
+    const port = basePort + slot - 1; // slot 1 → basePort, slot N → basePort + N - 1
+    return {
+        server: `http://${host}:${port}`,
+        username: `${user}-US-${slot}`,
+        password: pass,
+    };
+}
+/**
+ * Check if Webshare proxies are configured (env vars are present and non-empty).
+ * Does NOT guarantee the proxy is reachable — just that credentials are set.
+ *
+ * Only HOST, USER and PASS are tested. WEBSHARE_PROXY_SLOTS is ignored, so
+ * this can return true while getWebshareProxy() returns null because the slot
+ * count is unset or 0.
+ *
+ * NOTE(review): youtube.js isProxyConfigured() now delegates to this function.
+ * Its previous check was `PROXY_USER && PROXY_PASS`, with the host defaulting
+ * to 'p.webshare.io'. A deployment that sets only the user and password would
+ * now be reported as "not configured" — confirm this tightening is intended.
+ *
+ * @returns true when all three variables are non-empty strings.
+ */
+export function hasWebshareProxy() {
+    return !!(process.env.WEBSHARE_PROXY_HOST &&
+        process.env.WEBSHARE_PROXY_USER &&
+        process.env.WEBSHARE_PROXY_PASS);
+}
+/**
+ * Convert a ProxyConfig to a Playwright-compatible proxy object.
+ * Useful for passing directly to browser.newContext({ proxy: ... }).
+ *
+ * Copies `server`, `username` and `password` into a new object; any other
+ * properties on `config` are dropped and `config` itself is not modified.
+ *
+ * NOTE(review): the three newContext() call sites in search-provider.js build
+ * this same object inline instead of calling this helper.
+ *
+ * @param config Object with `server`, `username` and `password`.
+ * @returns A new object with exactly those three properties.
+ */
+export function toPlaywrightProxy(config) {
+    return {
+        server: config.server,
+        username: config.username,
+        password: config.password,
+    };
+}
+/**
+ * Get a random Webshare proxy as a fully-qualified URL string with embedded
+ * credentials. The format is: `http://username:password@host:port`
+ *
+ * Useful for passing to strategies.ts proxy option (which expects a URL string).
+ * Returns null if proxies are not configured.
+ *
+ * Steps: pick a slot via getWebshareProxy(), parse its `server` with the URL
+ * constructor to obtain the host part, then rebuild the URL with the
+ * percent-encoded username and password in front of it.
+ *
+ * smartFetch() in strategies.js appends the returned URL after the direct
+ * connection when the caller supplied neither `proxy` nor `proxies`.
+ *
+ * @returns The proxy URL string, or null when no proxy is configured or the
+ *          URL constructor rejects `config.server`.
+ */
+export function getWebshareProxyUrl() {
+    const config = getWebshareProxy();
+    if (!config)
+        return null;
+    try {
+        const url = new URL(config.server); // parsed only to read url.host
+        return `http://${encodeURIComponent(config.username)}:${encodeURIComponent(config.password)}@${url.host}`;
+    }
+    catch {
+        return null; // config.server was not a parseable URL
+    }
+}

package/dist/core/search-provider.js CHANGED Viewed

@@ -15,6 +15,7 @@
 import { fetch as undiciFetch } from 'undici';
 import { load } from 'cheerio';
 import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
+import { getWebshareProxy } from './proxy-config.js';
 import { createLogger } from './logger.js';
 const log = createLogger('search');
 function decodeHtmlEntities(input) {
@@ -236,10 +237,12 @@ export class StealthSearchProvider {
             const browser = await getStealthBrowser();
             const params = new URLSearchParams({ q: query });
             const url = `https://html.duckduckgo.com/html/?${params.toString()}`;
+            const proxy = getWebshareProxy();
             ctx = await browser.newContext({
                 userAgent: getRandomUserAgent(),
                 locale: 'en-US',
                 timezoneId: 'America/New_York',
+                ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
             });
             const page = await ctx.newPage();
             await applyStealthScripts(page);
@@ -303,10 +306,12 @@ export class StealthSearchProvider {
             const browser = await getStealthBrowser();
             const params = new URLSearchParams({ q: query });
             const url = `https://www.bing.com/search?${params.toString()}`;
+            const proxy = getWebshareProxy();
             ctx = await browser.newContext({
                 userAgent: getRandomUserAgent(),
                 locale: 'en-US',
                 timezoneId: 'America/New_York',
+                ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
             });
             const page = await ctx.newPage();
             await applyStealthScripts(page);
@@ -380,10 +385,12 @@ export class StealthSearchProvider {
             const browser = await getStealthBrowser();
             const params = new URLSearchParams({ q: query });
             const url = `https://www.ecosia.org/search?${params.toString()}`;
+            const proxy = getWebshareProxy();
             ctx = await browser.newContext({
                 userAgent: getRandomUserAgent(),
                 locale: 'en-US',
                 timezoneId: 'America/New_York',
+                ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
             });
             const page = await ctx.newPage();
             await applyStealthScripts(page);

package/dist/core/strategies.js CHANGED Viewed

@@ -10,6 +10,7 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
 import { getCached, setCached as setBasicCache } from './cache.js';
 import { resolveAndCache } from './dns-cache.js';
 import { BlockedError, NetworkError } from '../types.js';
+import { getWebshareProxyUrl } from './proxy-config.js';
 import { detectChallenge } from './challenge-detection.js';
 import { getStrategyHooks, } from './strategy-hooks.js';
 import { createLogger } from './logger.js';
@@ -310,10 +311,15 @@ async function fetchWithBrowserStrategy(url, options) {
 export async function smartFetch(url, options = {}) {
     const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
     const usePeelTLS = tls || cycle;
-    // Build effective proxy list: explicit proxies array, or single proxy, or empty
+    // Build effective proxy list: explicit proxies array, or single proxy, or empty.
+    // When no explicit proxy is configured and Webshare is available, automatically
+    // add it as a fallback: try direct connection first (fast), then Webshare on block.
     const effectiveProxies = proxies?.length ? proxies :
         proxy ? [proxy] :
-            [undefined]; // undefined = direct connection (no proxy)
+            (() => {
+                const wsUrl = getWebshareProxyUrl();
+                return wsUrl ? [undefined, wsUrl] : [undefined];
+            })();
     const firstProxy = effectiveProxies[0];
     const hooks = getStrategyHooks();
     const fetchStartMs = Date.now();

package/dist/core/structured-extract.js CHANGED Viewed

@@ -86,56 +86,223 @@ function parseLLMJson(text) {
 /**
  * For string fields: search for field name in content, extract surrounding text.
  */
-function heuristicExtractString(fieldName, content) {
+/**
+ * Extract first H1 or page title from markdown content.
+ *
+ * Finds the first line that starts with a single `#` followed by whitespace
+ * and returns its text with `*`, `_` and backtick characters removed and the
+ * ends trimmed. Deeper headings (`##`, `###`) are not matched here.
+ *
+ * NOTE(review): `\s+` also matches newlines, so a bare `#` line followed by
+ * blank lines would capture the next non-blank line as the title.
+ *
+ * @param content Markdown text to scan.
+ * @returns The cleaned heading text, or null when no H1 line is found.
+ */
+function extractPageTitle(content) {
+    const h1 = content.match(/^#\s+(.+)$/m);
+    if (h1?.[1])
+        return h1[1].replace(/[*_`]/g, '').trim();
+    return null;
+}
+/**
+ * Extract meta description (after *X min read* pattern common in WebPeel output).
+ *
+ * Returns the first non-blank line longer than 30 characters that appears
+ * after a heading line. The characters `*`, `_` and backtick are removed, the
+ * result is trimmed and then capped at 300 characters.
+ *
+ * Any line starting with `#` arms the search, not only an H1, despite the
+ * `seenH1` name. Lines are tested untrimmed, so an indented heading does not
+ * count.
+ *
+ * NOTE(review): the byline skip matches every line that both starts and ends
+ * with an asterisk, which also covers fully bold or italic paragraphs.
+ *
+ * @param content Markdown text to scan.
+ * @returns The cleaned description, or null when no qualifying line exists.
+ */
+function extractDescription(content) {
+    // First paragraph after the title; blank lines are dropped up front
+    const lines = content.split('\n').filter(l => l.trim());
+    let seenH1 = false;
+    for (const line of lines) {
+        if (line.startsWith('#')) {
+            seenH1 = true;
+            continue;
+        }
+        if (line.startsWith('*') && line.endsWith('*'))
+            continue; // byline such as "*5 min read*"; skipped even before a heading is seen
+        if (seenH1 && line.length > 30)
+            return line.replace(/[*_`]/g, '').trim().slice(0, 300);
+    }
+    return null;
+}
+/**
+ * Extract company/brand name from title (before " — ", " - ", " | ", " · ").
+ *
+ * Returns the trimmed text before the first `|`, `·`, `-` or `—` character.
+ * When the title has no such character, or starts with one, the trimmed title
+ * capped at 60 characters is returned instead.
+ *
+ * NOTE(review): the pattern matches the separator characters themselves, not
+ * the space-padded forms listed above, so a hyphenated name such as
+ * "Coca-Cola" is cut to "Coca". The 60-character cap applies only to the
+ * fallback branch, and a title like " - Foo" yields an empty string.
+ *
+ * @param title Page title text.
+ * @returns The leading segment of the title, or the truncated title.
+ */
+function extractCompanyFromTitle(title) {
+    const sep = title.match(/^([^|·\-—]+)[|·\-—]/);
+    if (sep?.[1])
+        return sep[1].trim();
+    return title.trim().slice(0, 60);
+}
+/** Smart field-name-aware string extractor */
+function heuristicExtractString(fieldName, content, pageUrl) {
+    const lf = fieldName.toLowerCase();
     const humanName = fieldName.replace(/_/g, ' ');
+    const title = extractPageTitle(content);
+    // --- Concept-aware extraction ---
+    // Company/brand/organization name
+    if (/company|brand|organization|org_name/.test(lf)) {
+        if (title)
+            return extractCompanyFromTitle(title);
+        // Fallback: extract from first heading of any level
+        const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
+        if (anyHeading?.[1])
+            return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
+    }
+    // Title/name/product → first H1 or any heading, stripped of markdown
+    if (/^(title|name|product_name|product|heading)$/.test(lf)) {
+        const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
+        if (rawTitle) {
+            // Strip markdown links [text](url) → text, badges ![...](url) → '', etc.
+            return rawTitle
+                .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images
+                .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
+                .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
+                .replace(/[*_`[\]]/g, '')
+                .replace(/&[a-z]+;/g, '') // HTML entities
+                .replace(/\s+/g, ' ')
+                .trim().slice(0, 150);
+        }
+    }
+    // Description/summary/about → first paragraph
+    if (/description|summary|about|overview/.test(lf)) {
+        return extractDescription(content) ?? null;
+    }
+    // URL/website/link → use the URL if we have it
+    if (/^(url|website|link|homepage|site)$/.test(lf)) {
+        if (pageUrl)
+            return pageUrl;
+    }
+    // Author/writer/by
+    if (/author|writer|by/.test(lf)) {
+        const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
+        if (m?.[1])
+            return m[1].trim().slice(0, 100);
+    }
+    // Date/published/updated
+    if (/date|published|updated|modified/.test(lf)) {
+        const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
+            ?? content.match(/(\d{4}-\d{2}-\d{2})/);
+        if (m?.[1])
+            return m[1];
+    }
+    // Email
+    if (/email|contact/.test(lf)) {
+        const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
+        if (m?.[0])
+            return m[0];
+    }
+    // Price/cost/pricing → extract value near $
+    if (/price|cost|pricing|fee/.test(lf)) {
+        const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
+            ?? content.match(/(free|no cost|no charge)/i);
+        if (m?.[0])
+            return m[0].trim();
+    }
+    // Language (for GitHub repos)
+    if (/language|lang|tech/.test(lf)) {
+        const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
+        if (m?.[1])
+            return m[1];
+    }
+    // Stars (for GitHub)
+    if (/stars?/.test(lf)) {
+        const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
+        if (m?.[1])
+            return m[1].replace(/,/g, '');
+    }
+    // License
+    if (/license/.test(lf)) {
+        const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
+        if (m?.[1])
+            return m[1];
+    }
+    // --- Generic patterns (exact-ish match) ---
     const patterns = [
-        // "field_name: value" or "Field Name: value" patterns
         new RegExp(`(?:^|\\n)[ \\t]*${humanName}[:\\s]+([^\\n]{5,200})`, 'i'),
-        // JSON-like "field": "value"
         new RegExp(`"${fieldName}"\\s*:\\s*"([^"]{1,300})"`, 'i'),
-        // Markdown bold **Field Name**: value
         new RegExp(`\\*{1,2}${humanName}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'),
-        // Heading followed by content
         new RegExp(`#+\\s*${humanName}\\s*\\n+([^\\n]{5,300})`, 'i'),
     ];
     for (const pattern of patterns) {
         const match = content.match(pattern);
-        if (match?.[1]) {
+        if (match?.[1])
             return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
-        }
     }
     return null;
 }
 /**
- * For boolean fields: search for positive/negative indicators near the field name.
+ * For boolean fields: search the ENTIRE content for positive/negative indicators.
  */
 function heuristicExtractBoolean(fieldName, content) {
-    const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
+    const lf = fieldName.toLowerCase();
     const ctx = content.toLowerCase();
-    // Search both underscore and spaced variants
-    let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
-    if (fieldIdx === -1)
-        fieldIdx = ctx.indexOf(humanName);
-    if (fieldIdx === -1)
-        return null;
-    // Look at a window of ±150 chars around the field name
-    const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
-    const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
-    const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
-    for (const pos of positive) {
-        if (window.includes(pos))
+    // Concept-aware boolean extraction — search entire content, not just near field name
+    // Free tier / free plan
+    if (/free_tier|has_free|is_free/.test(lf)) {
+        if (/free tier|free plan|\$0|no cost|no charge|free forever/.test(ctx))
             return true;
+        if (/no free|paid only|subscription required/.test(ctx))
+            return false;
     }
-    for (const neg of negative) {
-        if (window.includes(neg))
+    // Open source
+    if (/open_source|is_open|oss/.test(lf)) {
+        if (/open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
+            return true;
+        if (/closed[- ]source|proprietary|commercial license/.test(ctx))
             return false;
     }
+    // API availability
+    if (/has_api|api_available|has_rest/.test(lf)) {
+        if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
+            return true;
+    }
+    // Authentication
+    if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
+        if (/login|sign in|authentication|api key|bearer token/.test(ctx))
+            return true;
+    }
+    // General approach: search near field name concept
+    const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
+    let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
+    if (fieldIdx === -1)
+        fieldIdx = ctx.indexOf(humanName);
+    if (fieldIdx !== -1) {
+        const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
+        const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
+        const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
+        for (const pos of positive) {
+            if (window.includes(pos))
+                return true;
+        }
+        for (const neg of negative) {
+            if (window.includes(neg))
+                return false;
+        }
+    }
     return null;
 }
 /**
  * For number fields: find digits near the field name.
  */
 function heuristicExtractNumber(fieldName, content) {
+    const lf = fieldName.toLowerCase();
+    // Stars (GitHub)
+    if (/stars?/.test(lf)) {
+        const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
+        if (m?.[1]) {
+            const n = parseFloat(m[1].replace(/,/g, ''));
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Forks
+    if (/forks?/.test(lf)) {
+        const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
+        if (m?.[1]) {
+            const n = parseFloat(m[1].replace(/,/g, ''));
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Rating/score
+    if (/rating|score/.test(lf)) {
+        const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
+        if (m?.[1]) {
+            const n = parseFloat(m[1]);
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Year
+    if (/year/.test(lf)) {
+        const m = content.match(/\b(20\d{2})\b/);
+        if (m?.[1]) {
+            const n = parseInt(m[1]);
+            return isNaN(n) ? null : n;
+        }
+    }
+    // Generic: find number near field name
     const humanName = fieldName.replace(/_/g, '[\\s_-]*');
     const pattern = new RegExp(`${humanName}[:\\s$]*([\\d,]+\\.?\\d*)`, 'i');
     const match = content.match(pattern);

package/dist/core/youtube.js CHANGED Viewed

@@ -15,6 +15,7 @@ import { join } from 'node:path';
 import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
 import { simpleFetch } from './fetcher.js';
 import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
+import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
 import { createLogger } from './logger.js';
 // ---------------------------------------------------------------------------
 // yt-dlp startup diagnostics
@@ -239,8 +240,10 @@ export function extractSummary(fullText) {
 // ---------------------------------------------------------------------------
 // Proxy-based InnerTube transcript extraction
 // ---------------------------------------------------------------------------
-// Webshare residential proxy config — reads from env vars on Render.
+// Webshare residential proxy config — reads from env vars via proxy-config.ts.
 // Locally, falls back to direct fetch (residential IP already works).
+// These constants are kept for use in proxyRequestSlotted() which does
+// low-level HTTP CONNECT tunneling (not Playwright-level proxy).
 const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
 const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
 const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
@@ -249,7 +252,8 @@ const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
 // slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
 const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10);
 function isProxyConfigured() {
-    return !!(PROXY_USER && PROXY_PASS);
+    // Delegate to the shared proxy-config helper for consistency
+    return _hasWebshareProxy();
 }
 /**
  * Make an HTTP(S) request through the Webshare CONNECT proxy with a specific

package/dist/server/app.js CHANGED Viewed

@@ -106,8 +106,8 @@ export function createApp(config = {}) {
             timeoutMs = 120000; // 2min for batch
         else if (path.includes('/screenshot'))
             timeoutMs = 60000; // 1min for screenshots
-        else if (req.query?.render === 'true')
-            timeoutMs = 60000; // 1min for rendered fetches
+        else if (req.query?.render === 'true' || req.query?.stealth === 'true')
+            timeoutMs = 60000; // 1min for browser/stealth fetches
         else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
             timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
         req.setTimeout(timeoutMs);

package/dist/server/routes/fetch.js CHANGED Viewed

@@ -13,6 +13,52 @@ import { getSchemaTemplate } from '../../core/schema-templates.js';
 import { quickAnswer } from '../../core/quick-answer.js';
 import { sendUsageAlertEmail } from '../email-service.js';
 import { extractLinks } from '../../core/links.js';
+// ── Helper: classify an error thrown by peel() into a FetchErrorType ─────────
+function classifyFetchError(err) {
+    const code = err.code || err.name || '';
+    const msg = (err.message || '').toLowerCase();
+    if (code === 'TIMEOUT' || msg.includes('timeout') || msg.includes('timed out')) {
+        return 'timeout';
+    }
+    if (code === 'BLOCKED' || msg.includes('blocked') || msg.includes('cloudflare challenge') || msg.includes('captcha') || msg.includes('bot detection')) {
+        return 'blocked';
+    }
+    if (msg.includes('http 404') || msg.includes('not found') || msg.includes('dns resolution failed') || msg.includes('enotfound') || msg.includes('getaddrinfo')) {
+        return 'not_found';
+    }
+    if (msg.match(/http\s+5\d{2}/) || msg.includes('server error') || msg.includes('internal server')) {
+        return 'server_error';
+    }
+    if (code === 'NETWORK' || msg.includes('network') || msg.includes('econnrefused') || msg.includes('connection refused') || msg.includes('connection reset')) {
+        return 'network';
+    }
+    return 'unknown';
+}
+// ── Helper: build a clean, user-facing error message from a peel() error ─────
+function buildFetchErrorMessage(err) {
+    const type = classifyFetchError(err);
+    const hints = {
+        timeout: 'Try increasing timeout with ?timeout=20000, or use render=true for JS-heavy sites.',
+        blocked: 'This site blocks automated requests. Try render=true or stealth=true.',
+        not_found: 'Verify the URL is correct and the site is accessible.',
+        server_error: 'The target site returned a server error. Try again later.',
+        network: 'Could not connect to the target URL. Verify the URL is correct and the site is online.',
+        unknown: undefined,
+    };
+    // Sanitize message: strip HTML chars, truncate
+    const safeMsg = (err.message || 'An unexpected error occurred while fetching the URL')
+        .replace(/[<>"']/g, '')
+        .trim();
+    const messages = {
+        timeout: `The website took too long to respond. Try with render=true or stealth=true for JavaScript-heavy sites.`,
+        blocked: `This website is blocking automated access (bot protection detected).`,
+        not_found: `The URL could not be reached — the domain may not exist or the page was not found.`,
+        server_error: `The target website returned a server error while processing the request.`,
+        network: `Could not reach this website. The server may be down or the URL may be incorrect.`,
+        unknown: safeMsg,
+    };
+    return { type, message: messages[type] || safeMsg, hint: hints[type] };
+}
 // ── Helper: extractive summarizer (TF-IDF-like sentence scoring) ─────────────
 function extractSummary(content, maxWords = 150) {
     if (!content)
@@ -527,26 +573,24 @@ export function createFetchRouter(authStore) {
                 });
             }
             // SECURITY: Sanitize error messages to prevent information disclosure
-            if (err.code) {
+            if (res.headersSent)
+                return; // Timeout middleware already responded
+            const requestUrl = req.query.url;
+            if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
                 // WebPeelError from core library - safe to expose with helpful context
-                if (res.headersSent)
-                    return; // Timeout middleware already responded
-                const safeMessage = err.message.replace(/[<>"']/g, ''); // Remove HTML chars
-                const statusCode = err.code === 'TIMEOUT' ? 504
-                    : err.code === 'BLOCKED' ? 403
-                        : err.code === 'NETWORK' ? 502
-                            : 500;
-                const hints = {
-                    TIMEOUT: 'Try increasing timeout with ?wait=10000, or use render=true for JS-heavy sites.',
-                    BLOCKED: 'This site blocks automated requests. Try adding render=true or use stealth mode (costs 5 credits).',
-                    NETWORK: 'Could not reach the target URL. Verify the URL is correct and the site is online.',
-                };
+                const { type, message, hint } = buildFetchErrorMessage(err);
+                const statusCode = type === 'timeout' ? 504
+                    : type === 'blocked' ? 403
+                        : type === 'not_found' ? 404
+                            : type === 'network' || type === 'server_error' ? 502
+                                : 500;
                 res.status(statusCode).json({
                     success: false,
                     error: {
-                        type: err.code,
-                        message: safeMessage,
-                        hint: hints[err.code] || undefined,
+                        type,
+                        message,
+                        url: requestUrl,
+                        ...(hint ? { hint } : {}),
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,
@@ -555,13 +599,12 @@ export function createFetchRouter(authStore) {
             else {
                 // Unexpected error - generic message only
                 console.error('Fetch error:', err); // Log full error server-side
-                if (res.headersSent)
-                    return; // Timeout middleware already responded
                 res.status(500).json({
                     success: false,
                     error: {
-                        type: 'internal_error',
+                        type: 'unknown',
                         message: 'An unexpected error occurred while fetching the URL. If this persists, check https://webpeel.dev/status',
+                        url: requestUrl,
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,
@@ -1028,23 +1071,21 @@ export function createFetchRouter(authStore) {
             console.error('POST fetch/scrape error:', err);
             if (res.headersSent)
                 return; // Timeout middleware already responded
-            if (err.code) {
-                const safeMessage = err.message.replace(/[<>"']/g, '');
-                const statusCode = err.code === 'TIMEOUT' ? 504
-                    : err.code === 'BLOCKED' ? 403
-                        : err.code === 'NETWORK' ? 502
-                            : 500;
-                const hints = {
-                    TIMEOUT: 'Try increasing timeout, or set render:true for JS-heavy sites.',
-                    BLOCKED: 'Site blocks automated requests. Try render:true or stealth mode.',
-                    NETWORK: 'Could not reach the target URL. Verify it is correct and online.',
-                };
+            const postUrl = req.body?.url;
+            if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
+                const { type, message, hint } = buildFetchErrorMessage(err);
+                const statusCode = type === 'timeout' ? 504
+                    : type === 'blocked' ? 403
+                        : type === 'not_found' ? 404
+                            : type === 'network' || type === 'server_error' ? 502
+                                : 500;
                 res.status(statusCode).json({
                     success: false,
                     error: {
-                        type: err.code,
-                        message: safeMessage,
-                        hint: hints[err.code] || undefined,
+                        type,
+                        message,
+                        url: postUrl,
+                        ...(hint ? { hint } : {}),
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,
@@ -1054,8 +1095,9 @@ export function createFetchRouter(authStore) {
                 res.status(500).json({
                     success: false,
                     error: {
-                        type: 'internal_error',
+                        type: 'unknown',
                         message: 'An unexpected error occurred. If this persists, check https://webpeel.dev/status',
+                        url: postUrl,
                         docs: 'https://webpeel.dev/docs/api-reference#errors',
                     },
                     requestId: req.requestId,

package/dist/types.d.ts CHANGED Viewed

@@ -419,6 +419,18 @@ export interface PeelEnvelope {
      */
     totalAvailable?: number;
 }
+/**
+ * Programmatic error classification for fetch failures.
+ * Returned in the `error.type` field of API error responses.
+ *
+ * - `timeout`      — Site took too long to respond
+ * - `blocked`      — Site actively blocked the request (403, CAPTCHA, bot detection)
+ * - `not_found`    — 404 or the domain/URL does not exist
+ * - `server_error` — Target site returned a 5xx error
+ * - `network`      — DNS failure, connection refused, or other network-level issue
+ * - `unknown`      — Unclassified error
+ */
+export type FetchErrorType = 'timeout' | 'blocked' | 'not_found' | 'server_error' | 'network' | 'unknown';
 export declare class WebPeelError extends Error {
     code?: string | undefined;
     constructor(message: string, code?: string | undefined);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.6",
+  "version": "0.21.8",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",