npm - webpeel - Versions diffs - 0.21.45 → 0.21.46 - Mend

webpeel 0.21.45 → 0.21.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/core/challenge-solver.d.ts +46 -0
package/dist/core/challenge-solver.js +367 -0
package/dist/core/cookie-cache.d.ts +60 -0
package/dist/core/cookie-cache.js +163 -0
package/dist/core/http-fetch.js +9 -1
package/dist/core/pipeline.js +81 -15
package/package.json +1 -1

package/dist/core/challenge-solver.d.ts ADDED Viewed

@@ -0,0 +1,46 @@
+/**
+ * Challenge / bot-protection solver.
+ *
+ * Attempts to bypass bot-protection challenges using free, in-process methods:
+ *  1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
+ *  2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
+ *
+ * Architecture note:
+ *  Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
+ *  is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
+ *  instead of running locally. This keeps the main Render container (512 MB) lean.
+ *
+ * Usage:
+ *  const result = await solveChallenge(url, 'cloudflare', html);
+ *  if (result.solved) {
+ *    // result.html = real page content
+ *    // result.cookies = ["cf_clearance=...", ...]
+ *  }
+ */
+import type { ChallengeType } from './challenge-detection.js';
+/** Optional settings accepted by solveChallenge. */
+export interface SolveOptions {
+    /** Hard timeout in ms (default: 15 000) */
+    timeout?: number;
+    /** Optional proxy URL (http://user:pass@host:port) */
+    proxy?: string;
+}
+/** Outcome of a solveChallenge call. */
+export interface SolveResult {
+    /** True when the challenge was bypassed and `html` holds the real page content */
+    solved: boolean;
+    /** Page HTML: the real content when solved, otherwise the challenge page HTML */
+    html: string;
+    /** Raw Set-Cookie header values extracted after solve */
+    cookies?: string[];
+    /** How the solve was performed */
+    method?: 'local-browser' | 'remote-worker' | 'accessibility';
+    /** Error details if solve failed */
+    error?: string;
+}
+/**
+ * Attempt to solve a bot-protection challenge.
+ *
+ * When the env var BROWSER_WORKER_URL is set, the request is proxied to that
+ * external worker instead of being solved in-process.
+ *
+ * @param url            The page URL (used for proxy routing and cookie caching)
+ * @param challengeType  The type of challenge as detected by challenge-detection
+ * @param html           The raw challenge HTML (used for context / fallback)
+ * @param options        Optional timeout and proxy settings
+ * @returns              Solve result with real HTML content and cookies if successful;
+ *                       an unsuccessful attempt is reported as `solved: false` with `error` set
+ */
+export declare function solveChallenge(url: string, challengeType: ChallengeType, html: string, options?: SolveOptions): Promise<SolveResult>;

package/dist/core/challenge-solver.js ADDED Viewed

@@ -0,0 +1,367 @@
+/**
+ * Challenge / bot-protection solver.
+ *
+ * Attempts to bypass bot-protection challenges using free, in-process methods:
+ *  1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
+ *  2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
+ *
+ * Architecture note:
+ *  Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
+ *  is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
+ *  instead of running locally. This keeps the main Render container (512 MB) lean.
+ *
+ * Usage:
+ *  const result = await solveChallenge(url, 'cloudflare', html);
+ *  if (result.solved) {
+ *    // result.html = real page content
+ *    // result.cookies = ["cf_clearance=...", ...]
+ *  }
+ */
+import { cacheCookiesForUrl } from './cookie-cache.js';
+import { createLogger } from './logger.js';
+const log = createLogger('challenge-solver');
+// ── Constants ─────────────────────────────────────────────────────────────────
+/** Hard timeout (ms) applied to a solve when the caller supplies none. */
+const DEFAULT_TIMEOUT_MS = 15_000;
+/**
+ * Cloudflare challenge title before it's solved.
+ * Entries are lowercase: they are matched with `includes` against the lowercased page title.
+ */
+const CF_CHALLENGE_TITLES = ['just a moment', 'please wait', 'one moment, please', 'checking your browser'];
+/** Cloudflare challenge page markers (CSS selectors probed while waiting for the challenge to clear) */
+const CF_CHALLENGE_SELECTORS = [
+    '#challenge-running',
+    '#challenge-form',
+    '#cf-challenge-running',
+    '.cf-browser-verification',
+];
+// ── Main entry point ──────────────────────────────────────────────────────────
+/**
+ * Attempt to solve a bot-protection challenge.
+ *
+ * @param url            The page URL (used for proxy routing and cookie caching)
+ * @param challengeType  The type of challenge as detected by challenge-detection
+ * @param html           The raw challenge HTML (used for context / fallback)
+ * @param options        Optional timeout and proxy settings
+ * @returns              Solve result with real HTML content and cookies if successful
+ */
+export async function solveChallenge(url, challengeType, html, options = {}) {
+    const domain = getDomain(url);
+    const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
+    console.log(`[challenge-solver] Attempting ${challengeType} solve for ${domain}`);
+    // ── Remote worker proxy (Hetzner) ──────────────────────────────────────────
+    const workerUrl = process.env.BROWSER_WORKER_URL;
+    if (workerUrl) {
+        return solveViaRemoteWorker(url, challengeType, html, { timeout, proxy: options.proxy, workerUrl });
+    }
+    // ── Local solve ────────────────────────────────────────────────────────────
+    switch (challengeType) {
+        case 'cloudflare':
+            return solveCloudflare(url, html, timeout, options.proxy);
+        case 'captcha':
+            // TODO: hCaptcha accessibility bypass — see comment below
+            return { solved: false, html, error: 'No free captcha solver available for generic captcha' };
+        case 'datadome':
+            // DataDome can sometimes be bypassed with a stealth browser
+            return solveWithStealthBrowser(url, html, timeout, options.proxy, 'datadome');
+        case 'akamai':
+        case 'perimeterx':
+        case 'incapsula':
+        case 'generic-block':
+            // For other challenges, try stealth browser as a general approach
+            return solveWithStealthBrowser(url, html, timeout, options.proxy, challengeType);
+        case 'empty-shell':
+            // Not really a challenge — just an SPA shell, shouldn't reach here
+            return { solved: false, html, error: 'empty-shell is not a challenge to solve' };
+        default:
+            return { solved: false, html, error: `Unknown challenge type: ${challengeType}` };
+    }
+}
+// ── Cloudflare solver ─────────────────────────────────────────────────────────
+/**
+ * Solve Cloudflare JS challenge by rendering the page in a stealth browser.
+ *
+ * Cloudflare's "Just a moment..." challenge:
+ *  - Runs JavaScript fingerprinting in the browser
+ *  - If the fingerprint passes (looks like a real browser), auto-redirects to the real page
+ *  - No human interaction needed if the browser stealth is good enough
+ *
+ * Strategy:
+ *  1. Open a fresh stealth browser page
+ *  2. Navigate to the URL
+ *  3. Wait for the challenge to complete (title changes OR challenge element disappears)
+ *  4. Extract HTML and cookies
+ *  5. Cache cf_clearance cookie for future requests
+ */
+async function solveCloudflare(url, _html, timeoutMs, proxy) {
+    let browser = null;
+    let page = null;
+    try {
+        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
+        browser = await getStealthBrowser();
+        const vp = getRandomViewport();
+        const ctx = await browser.newContext({
+            userAgent: getRandomUserAgent(),
+            viewport: { width: vp.width, height: vp.height },
+            ...(proxy ? { proxy: { server: proxy } } : {}),
+            // Accept all languages to look more like a real browser
+            locale: 'en-US',
+            timezoneId: 'America/New_York',
+        });
+        page = await ctx.newPage();
+        await applyStealthScripts(page);
+        // Navigate to the challenge URL
+        await page.goto(url, {
+            waitUntil: 'domcontentloaded',
+            timeout: timeoutMs,
+        });
+        // Wait for Cloudflare challenge to resolve
+        const solved = await waitForChallengeResolution(page, timeoutMs);
+        if (!solved) {
+            log.debug('Cloudflare challenge did not resolve within timeout');
+            await ctx.close().catch(() => { });
+            return { solved: false, html: await page.content().catch(() => _html), error: 'Cloudflare challenge timed out' };
+        }
+        // Extract real page content
+        const realHtml = await page.content();
+        // Extract cookies (especially cf_clearance)
+        const cookies = await ctx.cookies();
+        const cookieStrings = cookies.map(c => {
+            let s = `${c.name}=${c.value}`;
+            if (c.path)
+                s += `; Path=${c.path}`;
+            if (c.domain)
+                s += `; Domain=${c.domain}`;
+            if (c.secure)
+                s += '; Secure';
+            if (c.httpOnly)
+                s += '; HttpOnly';
+            if (c.expires && c.expires > 0) {
+                s += `; Expires=${new Date(c.expires * 1000).toUTCString()}`;
+            }
+            return s;
+        });
+        // Determine TTL based on cf_clearance expiry (default 30 min)
+        const cfClearance = cookies.find(c => c.name === 'cf_clearance');
+        const ttlMs = cfClearance?.expires && cfClearance.expires > 0
+            ? Math.min((cfClearance.expires * 1000) - Date.now(), 30 * 60 * 1000)
+            : 30 * 60 * 1000;
+        // Cache cookies for future requests
+        if (cookieStrings.length > 0) {
+            cacheCookiesForUrl(url, cookieStrings, ttlMs);
+            log.debug(`Cached ${cookieStrings.length} cookies for ${getDomain(url)} (TTL: ${Math.round(ttlMs / 60000)}m)`);
+        }
+        await ctx.close().catch(() => { });
+        console.log(`[challenge-solver] Cloudflare challenge solved for ${getDomain(url)}, extracted ${cookieStrings.length} cookies`);
+        return {
+            solved: true,
+            html: realHtml,
+            cookies: cookieStrings,
+            method: 'local-browser',
+        };
+    }
+    catch (err) {
+        const error = err instanceof Error ? err.message : String(err);
+        log.debug('Cloudflare solve failed:', error);
+        return { solved: false, html: _html, error };
+    }
+    finally {
+        // Don't close shared browser — it's managed by browser-pool
+        page = null;
+        browser = null;
+    }
+}
+// ── Generic stealth browser solver ───────────────────────────────────────────
+/**
+ * General-purpose stealth browser solve for challenges that may auto-resolve
+ * when rendered in a legitimate-looking browser (DataDome, Akamai, etc.).
+ */
+async function solveWithStealthBrowser(url, _html, timeoutMs, proxy, challengeType) {
+    let page = null;
+    try {
+        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
+        const browser = await getStealthBrowser();
+        const vp = getRandomViewport();
+        const ctx = await browser.newContext({
+            userAgent: getRandomUserAgent(),
+            viewport: { width: vp.width, height: vp.height },
+            ...(proxy ? { proxy: { server: proxy } } : {}),
+            locale: 'en-US',
+        });
+        page = await ctx.newPage();
+        await applyStealthScripts(page);
+        await page.goto(url, {
+            waitUntil: 'networkidle',
+            timeout: timeoutMs,
+        });
+        // Wait a bit for any JS-based challenges to execute
+        await page.waitForTimeout(2000);
+        const html = await page.content();
+        const cookies = await ctx.cookies();
+        const cookieStrings = cookies.map(c => `${c.name}=${c.value}; Path=${c.path || '/'}${c.domain ? `; Domain=${c.domain}` : ''}`);
+        // Check if we got real content (not a challenge page)
+        const titleEl = await page.title().catch(() => '');
+        const isStillChallenge = CF_CHALLENGE_TITLES.some(t => titleEl.toLowerCase().includes(t))
+            || html.includes('cf-browser-verification')
+            || html.includes('challenge-form');
+        if (isStillChallenge) {
+            await ctx.close().catch(() => { });
+            return { solved: false, html, error: `${challengeType} challenge did not resolve` };
+        }
+        if (cookieStrings.length > 0) {
+            cacheCookiesForUrl(url, cookieStrings);
+        }
+        await ctx.close().catch(() => { });
+        console.log(`[challenge-solver] ${challengeType} challenge solved for ${getDomain(url)}`);
+        return { solved: true, html, cookies: cookieStrings, method: 'local-browser' };
+    }
+    catch (err) {
+        const error = err instanceof Error ? err.message : String(err);
+        return { solved: false, html: _html, error };
+    }
+    finally {
+        page = null;
+    }
+}
+// ── Remote worker proxy ───────────────────────────────────────────────────────
+/**
+ * Proxy a solve request to a remote browser worker (e.g. Hetzner VPS).
+ *
+ * The worker endpoint is expected to accept:
+ *   POST /solve
+ *   { url, challengeType, timeout, proxy? }
+ *
+ * And return:
+ *   { solved: boolean, html: string, cookies?: string[], error?: string }
+ *
+ * Set BROWSER_WORKER_URL to the worker base URL (e.g. http://hetzner:3001)
+ * to route all browser-based challenge solving to the worker.
+ */
+async function solveViaRemoteWorker(url, challengeType, html, options) {
+    const { workerUrl, timeout, proxy } = options;
+    try {
+        const controller = new AbortController();
+        const timer = setTimeout(() => controller.abort(), timeout + 5000); // Add buffer
+        const response = await fetch(`${workerUrl}/solve`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ url, challengeType, timeout, ...(proxy ? { proxy } : {}) }),
+            signal: controller.signal,
+        });
+        clearTimeout(timer);
+        if (!response.ok) {
+            throw new Error(`Worker returned HTTP ${response.status}`);
+        }
+        const result = await response.json();
+        // Cache cookies from remote solve
+        if (result.solved && result.cookies?.length) {
+            cacheCookiesForUrl(url, result.cookies);
+            console.log(`[challenge-solver] Remote ${challengeType} solve for ${getDomain(url)}, cached ${result.cookies.length} cookies`);
+        }
+        return { ...result, method: 'remote-worker' };
+    }
+    catch (err) {
+        const error = err instanceof Error ? err.message : String(err);
+        log.debug('Remote worker solve failed:', error);
+        // Fall through to local solve on worker failure
+        console.log(`[challenge-solver] Remote worker failed, attempting local ${challengeType} solve for ${getDomain(url)}`);
+        switch (challengeType) {
+            case 'cloudflare':
+                return solveCloudflare(url, html, options.timeout, options.proxy);
+            default:
+                return solveWithStealthBrowser(url, html, options.timeout, options.proxy, challengeType);
+        }
+    }
+}
+// ── Challenge resolution detection ───────────────────────────────────────────
+/**
+ * Wait for a Cloudflare challenge page to resolve.
+ *
+ * Cloudflare's challenge works like this:
+ *  1. Initial page: title is "Just a moment..." with challenge elements
+ *  2. Browser runs JS fingerprinting
+ *  3. On pass: redirects to real page (title and content change)
+ *  4. On fail: stays on challenge page
+ *
+ * We detect resolution by watching for:
+ *  - Title change (away from challenge titles)
+ *  - Challenge element disappearance
+ *  - URL change (often redirects after solve)
+ */
+async function waitForChallengeResolution(page, timeoutMs) {
+    const start = Date.now();
+    const pollInterval = 500;
+    // Quick check: is it even a challenge page?
+    const initialTitle = await page.title().catch(() => '');
+    const isInitiallyChallenge = CF_CHALLENGE_TITLES.some(t => initialTitle.toLowerCase().includes(t));
+    if (!isInitiallyChallenge) {
+        // Not a challenge page to begin with — treat as solved
+        return true;
+    }
+    // Poll until timeout
+    while (Date.now() - start < timeoutMs) {
+        await page.waitForTimeout(pollInterval);
+        const title = await page.title().catch(() => '');
+        const lowerTitle = title.toLowerCase();
+        // Title changed away from challenge
+        const isChallengeTitle = CF_CHALLENGE_TITLES.some(t => lowerTitle.includes(t));
+        if (!isChallengeTitle && title.length > 0) {
+            // Give the page a moment to fully render
+            await page.waitForTimeout(1000);
+            return true;
+        }
+        // Check if challenge elements are gone
+        let challengeElementGone = true;
+        for (const selector of CF_CHALLENGE_SELECTORS) {
+            try {
+                const el = await page.$(selector);
+                if (el) {
+                    challengeElementGone = false;
+                    break;
+                }
+            }
+            catch {
+                // Selector check failed — continue
+            }
+        }
+        if (challengeElementGone && !isChallengeTitle) {
+            await page.waitForTimeout(500);
+            return true;
+        }
+        // Try waiting for network to settle (challenge often triggers fetches)
+        try {
+            await page.waitForLoadState('networkidle', { timeout: Math.min(3000, timeoutMs - (Date.now() - start)) });
+            const finalTitle = await page.title().catch(() => '');
+            if (!CF_CHALLENGE_TITLES.some(t => finalTitle.toLowerCase().includes(t))) {
+                return true;
+            }
+        }
+        catch {
+            // Timeout or error — continue polling
+        }
+    }
+    return false;
+}
+// ── hCaptcha Accessibility Bypass ────────────────────────────────────────────
+// TODO: hCaptcha Accessibility Bypass
+// hCaptcha has an accessibility service at https://www.hcaptcha.com/accessibility
+// that provides a cookie allowing users with accessibility needs to bypass hCaptcha.
+//
+// Implementation notes:
+// - The service used to allow programmatic registration without email verification
+// - As of 2025, it requires manual verification (email link) to activate
+// - Since this requires human interaction, it cannot be fully automated
+//
+// When/if implemented:
+// 1. Check https://www.hcaptcha.com/accessibility for current API status
+// 2. Register with a request to their accessibility API
+// 3. If they return a cookie directly (no email verification), cache it
+// 4. Attach the cookie to requests to sites using hCaptcha
+//
+// const HCAPTCHA_ACCESSIBILITY_URL = 'https://accounts.hcaptcha.com/demo?sitekey=bf5558a0-...';
+// export async function getHCaptchaAccessibilityCookie(): Promise<string | null> { ... }
+// ── Utility ───────────────────────────────────────────────────────────────────
+function getDomain(url) {
+    try {
+        return new URL(url).hostname;
+    }
+    catch {
+        return url;
+    }
+}

package/dist/core/cookie-cache.d.ts ADDED Viewed

@@ -0,0 +1,60 @@
+/**
+ * In-memory cookie cache with TTL.
+ *
+ * Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
+ * Cookies from challenge solves are cached here so future requests to the same
+ * domain skip the challenge entirely.
+ *
+ * Design goals:
+ *  - Zero dependencies (plain Map + setTimeout)
+ *  - In-memory only — no disk/DB persistence
+ *  - TTL per entry (default 30 min, matching cf_clearance lifetime)
+ *  - Thread-safe for single-process Node.js (event loop is single-threaded)
+ */
+/** One cache entry: the cookies remembered for a single domain. */
+export interface CachedCookies {
+    /** Raw "Cookie: ..." header value (semicolon-separated) */
+    cookieHeader: string;
+    /** Individual cookie strings (e.g. ["cf_clearance=abc; Path=/", ...]) */
+    cookies: string[];
+    /** Unix timestamp (ms) when this cache entry expires */
+    expiresAt: number;
+    /** The domain these cookies are for */
+    domain: string;
+}
+/**
+ * Store cookies for a domain.
+ * Replaces any entry already cached for that domain; an empty array is ignored.
+ *
+ * @param domain   Hostname (e.g. "example.com" or "sub.example.com")
+ * @param cookies  Array of Set-Cookie header values or cookie strings
+ * @param ttlMs    Time-to-live in ms (default: 30 min)
+ */
+export declare function cacheCookies(domain: string, cookies: string[], ttlMs?: number): void;
+/**
+ * Retrieve cached cookies for a domain (or its parent domain).
+ * Returns null if no valid (non-expired) entry exists.
+ * Only the immediate parent (first label stripped) is tried; expired entries
+ * encountered during the lookup are removed.
+ *
+ * @param domain  Hostname to look up
+ */
+export declare function getCachedCookies(domain: string): CachedCookies | null;
+/**
+ * Build a Cookie request header value from a URL.
+ * Returns undefined if no cached cookies exist or the URL cannot be parsed.
+ */
+export declare function getCookieHeader(url: string): string | undefined;
+/**
+ * Cache cookies from a URL's perspective.
+ * Extracts domain from URL automatically; an unparsable URL is silently ignored.
+ */
+export declare function cacheCookiesForUrl(url: string, cookies: string[], ttlMs?: number): void;
+/**
+ * Invalidate (remove) cached cookies for a domain.
+ * Only the entry for this exact (normalized) domain is removed, not a parent-domain entry.
+ */
+export declare function invalidateCookies(domain: string): void;
+/**
+ * Return the number of cached domains (for diagnostics).
+ * The count may include expired entries that have not been swept yet.
+ */
+export declare function getCacheSize(): number;
+/**
+ * Clear ALL cached cookies. Mainly for tests.
+ * Also stops the periodic cleanup timer if one is running.
+ */
+export declare function clearCookieCache(): void;

package/dist/core/cookie-cache.js ADDED Viewed

@@ -0,0 +1,163 @@
+/**
+ * In-memory cookie cache with TTL.
+ *
+ * Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
+ * Cookies from challenge solves are cached here so future requests to the same
+ * domain skip the challenge entirely.
+ *
+ * Design goals:
+ *  - Zero dependencies (plain Map + setTimeout)
+ *  - In-memory only — no disk/DB persistence
+ *  - TTL per entry (default 30 min, matching cf_clearance lifetime)
+ *  - Thread-safe for single-process Node.js (event loop is single-threaded)
+ */
+// ── Internal store ────────────────────────────────────────────────────────────
+/** Cache entries keyed by normalized domain (lowercase, leading "www." removed). */
+const store = new Map();
+/** Handle of the periodic cleanup interval; null while no cleanup is scheduled. */
+let cleanupTimer = null;
+/** Default TTL: 30 minutes (cf_clearance lasts 30 min) */
+const DEFAULT_TTL_MS = 30 * 60 * 1000;
+// ── Public API ────────────────────────────────────────────────────────────────
+/**
+ * Store cookies for a domain.
+ *
+ * @param domain   Hostname (e.g. "example.com" or "sub.example.com")
+ * @param cookies  Array of Set-Cookie header values or cookie strings
+ * @param ttlMs    Time-to-live in ms (default: 30 min)
+ */
+export function cacheCookies(domain, cookies, ttlMs = DEFAULT_TTL_MS) {
+    if (!cookies.length)
+        return;
+    const normalizedDomain = normalizeDomain(domain);
+    const cookieHeader = buildCookieHeader(cookies);
+    const expiresAt = Date.now() + ttlMs;
+    store.set(normalizedDomain, {
+        cookieHeader,
+        cookies,
+        expiresAt,
+        domain: normalizedDomain,
+    });
+    // Start periodic cleanup if not already running
+    startCleanup();
+}
+/**
+ * Retrieve cached cookies for a domain (or its parent domain).
+ * Returns null if no valid (non-expired) entry exists.
+ *
+ * @param domain  Hostname to look up
+ */
+export function getCachedCookies(domain) {
+    const normalizedDomain = normalizeDomain(domain);
+    // Try exact match first, then parent domain
+    const candidates = [normalizedDomain, getParentDomain(normalizedDomain)].filter(Boolean);
+    for (const candidate of candidates) {
+        const entry = store.get(candidate);
+        if (entry && entry.expiresAt > Date.now()) {
+            return entry;
+        }
+        // Remove expired entry
+        if (entry) {
+            store.delete(candidate);
+        }
+    }
+    return null;
+}
+/**
+ * Build a Cookie request header value from a URL.
+ * Returns undefined if no cached cookies exist.
+ */
+export function getCookieHeader(url) {
+    try {
+        const domain = new URL(url).hostname;
+        const cached = getCachedCookies(domain);
+        return cached?.cookieHeader;
+    }
+    catch {
+        return undefined;
+    }
+}
+/**
+ * Cache cookies from a URL's perspective.
+ * Extracts domain from URL automatically.
+ */
+export function cacheCookiesForUrl(url, cookies, ttlMs = DEFAULT_TTL_MS) {
+    try {
+        const domain = new URL(url).hostname;
+        cacheCookies(domain, cookies, ttlMs);
+    }
+    catch {
+        // Invalid URL — ignore
+    }
+}
+/**
+ * Invalidate (remove) cached cookies for a domain.
+ */
+export function invalidateCookies(domain) {
+    const normalizedDomain = normalizeDomain(domain);
+    store.delete(normalizedDomain);
+}
+/**
+ * Return the number of cached domains (for diagnostics).
+ */
+export function getCacheSize() {
+    return store.size;
+}
+/**
+ * Clear ALL cached cookies. Mainly for tests.
+ */
+export function clearCookieCache() {
+    store.clear();
+    if (cleanupTimer) {
+        clearInterval(cleanupTimer);
+        cleanupTimer = null;
+    }
+}
+// ── Helpers ───────────────────────────────────────────────────────────────────
+/** Normalize domain: lowercase, strip www. prefix */
+function normalizeDomain(domain) {
+    return domain.toLowerCase().replace(/^www\./, '');
+}
+/** Get parent domain (strip first subdomain label) */
+function getParentDomain(domain) {
+    const parts = domain.split('.');
+    if (parts.length <= 2)
+        return null; // Already a root domain
+    return parts.slice(1).join('.');
+}
+/**
+ * Convert an array of Set-Cookie values or raw cookie strings into a single
+ * "Cookie: name=value; name2=value2" header value.
+ */
+function buildCookieHeader(cookies) {
+    const pairs = [];
+    for (const cookie of cookies) {
+        // Set-Cookie format: "name=value; Path=/; Secure; HttpOnly; ..."
+        // We only want the first "name=value" pair
+        const firstPart = cookie.split(';')[0]?.trim();
+        if (firstPart) {
+            pairs.push(firstPart);
+        }
+    }
+    return pairs.join('; ');
+}
+/** Periodically remove expired entries to prevent memory leaks. */
+function startCleanup() {
+    if (cleanupTimer)
+        return;
+    cleanupTimer = setInterval(() => {
+        const now = Date.now();
+        for (const [domain, entry] of store) {
+            if (entry.expiresAt <= now) {
+                store.delete(domain);
+            }
+        }
+        // Stop the timer if the cache is empty
+        if (store.size === 0 && cleanupTimer) {
+            clearInterval(cleanupTimer);
+            cleanupTimer = null;
+        }
+    }, 5 * 60 * 1000); // Run every 5 minutes
+    // Don't block Node.js process exit
+    if (cleanupTimer && typeof cleanupTimer.unref === 'function') {
+        cleanupTimer.unref();
+    }
+}

package/dist/core/http-fetch.js CHANGED Viewed

@@ -15,6 +15,7 @@ import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types
 import { getCached } from './cache.js';
 import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
 import { detectChallenge } from './challenge-detection.js';
+import { getCookieHeader } from './cookie-cache.js';
 import { createLogger } from './logger.js';
 const log = createLogger('http');
 // ── HTTP status text fallbacks (HTTP/2 omits reason phrases) ──────────────────
@@ -515,8 +516,15 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
     let activeUserAgent = isSecGov
         ? 'WebPeel/1.0 (support@webpeel.dev)'
         : (userAgent ? validateUserAgent(userAgent) : getHttpUA());
+    // Inject cached challenge-solve cookies (e.g. cf_clearance) if available.
+    // These are merged into customHeaders so they ride along on every request
+    // to this domain, skipping repeated challenge pages.
+    const cachedCookieHeader = getCookieHeader(url);
+    const effectiveCustomHeaders = cachedCookieHeader
+        ? { Cookie: cachedCookieHeader, ...(customHeaders || {}) }
+        : customHeaders;
     // Build stealth headers merged with any caller-supplied custom headers
-    let mergedHeaders = buildMergedHeaders(url, activeUserAgent, customHeaders);
+    let mergedHeaders = buildMergedHeaders(url, activeUserAgent, effectiveCustomHeaders);
     // Auto-route through residential proxy for sites known to block datacenter IPs.
     // The explicit `proxy` param always wins; auto-proxy only kicks in when unset.
     const effectiveProxy = proxy ?? (shouldUseProxy(url) ? (getWebshareProxyUrl() ?? undefined) : undefined);

package/dist/core/pipeline.js CHANGED Viewed

@@ -467,9 +467,38 @@ export async function fetchContent(ctx) {
     // Capture raw HTML size BEFORE any processing (accurate measurement of original content)
     ctx.rawHtmlSize = fetchResult.html?.length || 0;
     ctx.fetchResult = fetchResult;
-    // Warn when a challenge/CAPTCHA page was detected
+    // Attempt to solve challenge/CAPTCHA page when detected
     if (fetchResult.challengeDetected) {
-        ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
+        const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
+        // Only attempt solve if we have a browser worker URL or are not on a resource-constrained env
+        const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
+        if (canSolve) {
+            try {
+                const { solveChallenge } = await import('./challenge-solver.js');
+                const { detectChallenge } = await import('./challenge-detection.js');
+                const rawHtml = fetchResult.html || '';
+                const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
+                const challengeType = detectionResult.type || 'generic-block';
+                const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
+                    timeout: 15000,
+                });
+                if (solveResult.solved && solveResult.html) {
+                    fetchResult.html = solveResult.html;
+                    fetchResult.challengeDetected = false;
+                    log.debug(`Challenge solved (${challengeType}) for ${ctx.url}`);
+                }
+                else {
+                    ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
+                }
+            }
+            catch (e) {
+                ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
+                log.debug('Challenge solve failed:', e instanceof Error ? e.message : e);
+            }
+        }
+        else {
+            ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
+        }
     }
 }
 // ---------------------------------------------------------------------------
@@ -1004,22 +1033,59 @@ export async function postProcess(ctx) {
                 ctx.metadata.blocked = true;
                 ctx.metadata.challengeDetected = true;
             }
-            // Try search fallback for the real content
-            try {
-                // @ts-ignore — proprietary module, gitignored
-                const { searchFallback } = await import('./search-fallback.js');
-                const searchResult = await searchFallback(ctx.url);
-                if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
-                    ctx.content = searchResult.cachedContent;
-                    ctx.title = searchResult.title || ctx.title;
-                    ctx.quality = 0.4;
-                    ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
-                    if (ctx.metadata) {
-                        ctx.metadata.fallbackSource = searchResult.source;
+            // Try challenge solver first (if browser worker available or local solve enabled)
+            let solvedViaChallengeSolver = false;
+            const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
+            const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
+            if (canSolve && ctx.fetchResult?.html) {
+                try {
+                    const { solveChallenge } = await import('./challenge-solver.js');
+                    const { detectChallenge } = await import('./challenge-detection.js');
+                    const rawHtml = ctx.fetchResult.html;
+                    const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);
+                    const challengeType = detectionResult.type || 'cloudflare';
+                    const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
+                        timeout: 15000,
+                    });
+                    if (solveResult.solved && solveResult.html) {
+                        // Re-parse the solved HTML
+                        const { htmlToMarkdown, htmlToText, cleanForAI } = await import('./markdown.js');
+                        const fmt = ctx.format || 'markdown';
+                        ctx.content = fmt === 'text' ? htmlToText(solveResult.html)
+                            : fmt === 'clean' ? cleanForAI(solveResult.html)
+                                : htmlToMarkdown(solveResult.html);
+                        ctx.fetchResult.html = solveResult.html;
+                        if (ctx.metadata) {
+                            ctx.metadata.blocked = false;
+                            ctx.metadata.challengeDetected = false;
+                            ctx.metadata.challengeSolved = true;
+                        }
+                        solvedViaChallengeSolver = true;
+                        log.debug(`Content-level challenge solved for ${ctx.url}`);
+                    }
+                }
+                catch (e) {
+                    log.debug('Content-level challenge solve failed:', e instanceof Error ? e.message : e);
+                }
+            }
+            // Use the search fallback if the challenge solver did not succeed
+            if (!solvedViaChallengeSolver) {
+                try {
+                    // @ts-ignore — proprietary module, gitignored
+                    const { searchFallback } = await import('./search-fallback.js');
+                    const searchResult = await searchFallback(ctx.url);
+                    if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
+                        ctx.content = searchResult.cachedContent;
+                        ctx.title = searchResult.title || ctx.title;
+                        ctx.quality = 0.4;
+                        ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
+                        if (ctx.metadata) {
+                            ctx.metadata.fallbackSource = searchResult.source;
+                        }
                     }
                 }
+                catch { /* Search fallback failed — continue with challenge page content */ }
             }
-            catch { /* Search fallback failed — continue with challenge page content */ }
         }
     }
     // === Zero-token safety net ===

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.45",
+  "version": "0.21.46",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",