webpeel 0.21.45 → 0.21.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ /**
2
+ * Challenge / bot-protection solver.
3
+ *
4
+ * Attempts to bypass bot-protection challenges using free, in-process methods:
5
+ * 1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
6
+ * 2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
7
+ *
8
+ * Architecture note:
9
+ * Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
10
+ * is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
11
+ * instead of running locally. This keeps the main Render container (512 MB) lean.
12
+ *
13
+ * Usage:
14
+ * const result = await solveChallenge(url, 'cloudflare', html);
15
+ * if (result.solved) {
16
+ * // result.html = real page content
17
+ * // result.cookies = ["cf_clearance=...", ...]
18
+ * }
19
+ */
20
+ import type { ChallengeType } from './challenge-detection.js';
21
+ export interface SolveOptions {
22
+ /** Timeout in ms (default: 15 000). Applied per browser step (e.g. navigation, then the challenge wait), so a solve can take longer than this in total. */
23
+ timeout?: number;
24
+ /** Optional proxy server URL (http://user:pass@host:port), passed to the browser context or forwarded to the remote worker */
25
+ proxy?: string;
26
+ }
27
+ export interface SolveResult {
28
+ solved: boolean;
29
+ html: string;
30
+ /** Cookie strings in Set-Cookie-like form ("name=value; Path=/; ..."), built from the browser context's cookies after a solve (or returned by the remote worker) */
31
+ cookies?: string[];
32
+ /** How the solve was performed (NOTE(review): no visible code path sets 'accessibility' yet — confirm) */
33
+ method?: 'local-browser' | 'remote-worker' | 'accessibility';
34
+ /** Error details if solve failed */
35
+ error?: string;
36
+ }
37
+ /**
38
+ * Attempt to solve a bot-protection challenge.
39
+ *
40
+ * @param url The page URL to load; its hostname is also the key under which solved cookies are cached
41
+ * @param challengeType The type of challenge as detected by challenge-detection
42
+ * @param html The raw challenge HTML (handed back in the result when no better HTML is available)
43
+ * @param options Optional timeout and proxy settings
44
+ * @returns Solve result with real HTML content and cookies if successful; on failure `solved` is false and `error` describes why
45
+ */
46
+ export declare function solveChallenge(url: string, challengeType: ChallengeType, html: string, options?: SolveOptions): Promise<SolveResult>;
@@ -0,0 +1,367 @@
1
+ /**
2
+ * Challenge / bot-protection solver.
3
+ *
4
+ * Attempts to bypass bot-protection challenges using free, in-process methods:
5
+ * 1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
6
+ * 2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
7
+ *
8
+ * Architecture note:
9
+ * Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
10
+ * is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
11
+ * instead of running locally. This keeps the main Render container (512 MB) lean.
12
+ *
13
+ * Usage:
14
+ * const result = await solveChallenge(url, 'cloudflare', html);
15
+ * if (result.solved) {
16
+ * // result.html = real page content
17
+ * // result.cookies = ["cf_clearance=...", ...]
18
+ * }
19
+ */
20
+ import { cacheCookiesForUrl } from './cookie-cache.js';
21
+ import { createLogger } from './logger.js';
22
+ const log = createLogger('challenge-solver');
23
+ // ── Constants ─────────────────────────────────────────────────────────────────
24
+ const DEFAULT_TIMEOUT_MS = 15_000;
25
+ /** Lowercase phrases; a page title containing any of them (case-insensitive) is treated as an unsolved challenge */
26
+ const CF_CHALLENGE_TITLES = ['just a moment', 'please wait', 'one moment, please', 'checking your browser'];
27
+ /** CSS selectors; if any of them matches an element, the Cloudflare challenge is considered still present */
28
+ const CF_CHALLENGE_SELECTORS = [
29
+ '#challenge-running',
30
+ '#challenge-form',
31
+ '#cf-challenge-running',
32
+ '.cf-browser-verification',
33
+ ];
34
+ // ── Main entry point ──────────────────────────────────────────────────────────
35
/**
 * Entry point: try to get past a bot-protection challenge for `url`.
 *
 * When the BROWSER_WORKER_URL environment variable is set, the request is
 * delegated to the remote worker regardless of challenge type. Otherwise the
 * challenge type selects a local strategy.
 *
 * @param url           The page URL to solve for
 * @param challengeType Challenge type as reported by challenge-detection
 * @param html          The challenge HTML; returned as-is for types with no solver
 * @param options       Optional `timeout` (ms, default DEFAULT_TIMEOUT_MS) and `proxy`
 * @returns A solve result; `solved` is false with an `error` when no solver applies
 */
export async function solveChallenge(url, challengeType, html, options = {}) {
    const hostname = getDomain(url);
    const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
    console.log(`[challenge-solver] Attempting ${challengeType} solve for ${hostname}`);

    // Remote worker takes precedence over every local strategy.
    const workerUrl = process.env.BROWSER_WORKER_URL;
    if (workerUrl) {
        return solveViaRemoteWorker(url, challengeType, html, { timeout, proxy: options.proxy, workerUrl });
    }

    if (challengeType === 'cloudflare') {
        return solveCloudflare(url, html, timeout, options.proxy);
    }
    if (challengeType === 'captcha') {
        // No free solver exists for generic captchas.
        return { solved: false, html, error: 'No free captcha solver available for generic captcha' };
    }
    if (challengeType === 'empty-shell') {
        // An SPA shell is not a challenge at all.
        return { solved: false, html, error: 'empty-shell is not a challenge to solve' };
    }

    // These types may clear on their own when rendered in a stealth browser.
    const stealthSolvable = ['datadome', 'akamai', 'perimeterx', 'incapsula', 'generic-block'];
    if (stealthSolvable.includes(challengeType)) {
        return solveWithStealthBrowser(url, html, timeout, options.proxy, challengeType);
    }

    return { solved: false, html, error: `Unknown challenge type: ${challengeType}` };
}
76
+ // ── Cloudflare solver ─────────────────────────────────────────────────────────
77
/**
 * Solve a Cloudflare JS challenge by rendering the page in a stealth browser.
 *
 * Steps:
 *   1. Open a fresh context/page on the shared stealth browser.
 *   2. Navigate to the URL.
 *   3. Wait for the challenge to resolve (see waitForChallengeResolution).
 *   4. Read the page HTML and the context's cookies.
 *   5. Cache the cookies for the URL's host, for at most 30 minutes.
 *
 * The browser context is always closed in `finally`, including when navigation
 * or any later step throws. The shared browser itself is never closed here.
 *
 * @param url       Page URL to load
 * @param _html     Original challenge HTML, used as the fallback `html` on failure
 * @param timeoutMs Timeout (ms) for navigation, and again for the challenge wait
 * @param proxy     Optional proxy server URL for the browser context
 * @returns `{ solved: true, html, cookies, method: 'local-browser' }` on success,
 *          otherwise `{ solved: false, html, error }`. Errors are caught, not thrown.
 */
async function solveCloudflare(url, _html, timeoutMs, proxy) {
    const MAX_TTL_MS = 30 * 60 * 1000;
    // Declared outside `try` so `finally` can release it on every exit path.
    let ctx = null;
    try {
        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
        const browser = await getStealthBrowser();
        const vp = getRandomViewport();
        ctx = await browser.newContext({
            userAgent: getRandomUserAgent(),
            viewport: { width: vp.width, height: vp.height },
            ...(proxy ? { proxy: { server: proxy } } : {}),
            // Fixed locale/timezone pair for the browser context
            locale: 'en-US',
            timezoneId: 'America/New_York',
        });
        const page = await ctx.newPage();
        await applyStealthScripts(page);
        // Navigate to the challenge URL
        await page.goto(url, {
            waitUntil: 'domcontentloaded',
            timeout: timeoutMs,
        });
        // Wait for Cloudflare challenge to resolve
        const solved = await waitForChallengeResolution(page, timeoutMs);
        if (!solved) {
            log.debug('Cloudflare challenge did not resolve within timeout');
            // Snapshot the rendered page while the context is still open;
            // the context is closed afterwards by `finally`.
            const challengeHtml = await page.content().catch(() => _html);
            return { solved: false, html: challengeHtml, error: 'Cloudflare challenge timed out' };
        }
        // Extract real page content
        const realHtml = await page.content();
        // Extract cookies (especially cf_clearance)
        const cookies = await ctx.cookies();
        const cookieStrings = cookies.map(c => {
            let s = `${c.name}=${c.value}`;
            if (c.path)
                s += `; Path=${c.path}`;
            if (c.domain)
                s += `; Domain=${c.domain}`;
            if (c.secure)
                s += '; Secure';
            if (c.httpOnly)
                s += '; HttpOnly';
            if (c.expires && c.expires > 0) {
                // `expires` is multiplied by 1000 to get a millisecond Date
                s += `; Expires=${new Date(c.expires * 1000).toUTCString()}`;
            }
            return s;
        });
        // TTL follows cf_clearance expiry when present, capped at 30 minutes
        const cfClearance = cookies.find(c => c.name === 'cf_clearance');
        const ttlMs = cfClearance?.expires && cfClearance.expires > 0
            ? Math.min((cfClearance.expires * 1000) - Date.now(), MAX_TTL_MS)
            : MAX_TTL_MS;
        // Cache cookies for future requests. A non-positive TTL means
        // cf_clearance has already expired, so there is nothing worth caching.
        if (cookieStrings.length > 0 && ttlMs > 0) {
            cacheCookiesForUrl(url, cookieStrings, ttlMs);
            log.debug(`Cached ${cookieStrings.length} cookies for ${getDomain(url)} (TTL: ${Math.round(ttlMs / 60000)}m)`);
        }
        console.log(`[challenge-solver] Cloudflare challenge solved for ${getDomain(url)}, extracted ${cookieStrings.length} cookies`);
        return {
            solved: true,
            html: realHtml,
            cookies: cookieStrings,
            method: 'local-browser',
        };
    }
    catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        log.debug('Cloudflare solve failed:', error);
        return { solved: false, html: _html, error };
    }
    finally {
        // Close only our context; the shared browser is managed by browser-pool.
        if (ctx) {
            await ctx.close().catch(() => { });
        }
    }
}
170
+ // ── Generic stealth browser solver ───────────────────────────────────────────
171
/**
 * General-purpose stealth browser solve for challenges that may clear on their
 * own when the page is rendered in a stealth browser (DataDome, Akamai, etc.).
 *
 * The page is loaded until network idle, given 2 s for challenge scripts to run,
 * and then checked against the Cloudflare title/markup markers. If none are
 * found the solve counts as successful and the context's cookies are cached
 * for the URL's host.
 *
 * The browser context is always closed in `finally`, including when navigation
 * throws (e.g. network idle is never reached within `timeoutMs`).
 *
 * @param url           Page URL to load
 * @param _html         Original challenge HTML, returned when an error is caught
 * @param timeoutMs     Navigation timeout in ms
 * @param proxy         Optional proxy server URL for the browser context
 * @param challengeType Challenge type, used only in messages
 * @returns `{ solved: true, html, cookies, method: 'local-browser' }` on success,
 *          otherwise `{ solved: false, html, error }`. Errors are caught, not thrown.
 */
async function solveWithStealthBrowser(url, _html, timeoutMs, proxy, challengeType) {
    // Declared outside `try` so `finally` can release it on every exit path.
    let ctx = null;
    try {
        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
        const browser = await getStealthBrowser();
        const vp = getRandomViewport();
        ctx = await browser.newContext({
            userAgent: getRandomUserAgent(),
            viewport: { width: vp.width, height: vp.height },
            ...(proxy ? { proxy: { server: proxy } } : {}),
            locale: 'en-US',
        });
        const page = await ctx.newPage();
        await applyStealthScripts(page);
        await page.goto(url, {
            waitUntil: 'networkidle',
            timeout: timeoutMs,
        });
        // Wait a bit for any JS-based challenges to execute
        await page.waitForTimeout(2000);
        const html = await page.content();
        const cookies = await ctx.cookies();
        const cookieStrings = cookies.map(c => `${c.name}=${c.value}; Path=${c.path || '/'}${c.domain ? `; Domain=${c.domain}` : ''}`);
        // Check if we got real content (not a challenge page)
        const title = await page.title().catch(() => '');
        const lowerTitle = title.toLowerCase();
        const isStillChallenge = CF_CHALLENGE_TITLES.some(t => lowerTitle.includes(t))
            || html.includes('cf-browser-verification')
            || html.includes('challenge-form');
        if (isStillChallenge) {
            return { solved: false, html, error: `${challengeType} challenge did not resolve` };
        }
        if (cookieStrings.length > 0) {
            cacheCookiesForUrl(url, cookieStrings);
        }
        console.log(`[challenge-solver] ${challengeType} challenge solved for ${getDomain(url)}`);
        return { solved: true, html, cookies: cookieStrings, method: 'local-browser' };
    }
    catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        return { solved: false, html: _html, error };
    }
    finally {
        // Close only our context; the shared browser is managed by browser-pool.
        if (ctx) {
            await ctx.close().catch(() => { });
        }
    }
}
222
+ // ── Remote worker proxy ───────────────────────────────────────────────────────
223
/**
 * Proxy a solve request to a remote browser worker.
 *
 * The worker endpoint is expected to accept:
 *   POST {workerUrl}/solve
 *   { url, challengeType, timeout, proxy? }
 *
 * And return JSON shaped like:
 *   { solved: boolean, html: string, cookies?: string[], error?: string }
 *
 * The whole exchange (request and body read) is aborted after `timeout + 5000` ms.
 * If the worker call fails for any reason, a local solve is attempted instead,
 * except for 'captcha' and 'empty-shell', which have no local solver.
 *
 * @param url           Page URL to solve for
 * @param challengeType Challenge type forwarded to the worker
 * @param html          Original challenge HTML, used for the local fallback
 * @param options       `{ workerUrl, timeout, proxy }`; `timeout` is in ms
 * @returns The worker's result tagged with `method: 'remote-worker'`, or the
 *          result of the local fallback
 */
async function solveViaRemoteWorker(url, challengeType, html, options) {
    const { workerUrl, timeout, proxy } = options;
    const controller = new AbortController();
    // Extra 5 s on top of the solve timeout to allow for request overhead.
    const timer = setTimeout(() => controller.abort(), timeout + 5000);
    try {
        // Tolerate a trailing slash on the configured base URL.
        const endpoint = `${String(workerUrl).replace(/\/+$/, '')}/solve`;
        const response = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ url, challengeType, timeout, ...(proxy ? { proxy } : {}) }),
            signal: controller.signal,
        });
        if (!response.ok) {
            throw new Error(`Worker returned HTTP ${response.status}`);
        }
        // The abort timer is still armed here, so a stalled body read cannot hang.
        const result = await response.json();
        if (result === null || typeof result !== 'object') {
            throw new Error('Worker returned a malformed response');
        }
        // Cache cookies from remote solve
        if (result.solved && result.cookies?.length) {
            cacheCookiesForUrl(url, result.cookies);
            console.log(`[challenge-solver] Remote ${challengeType} solve for ${getDomain(url)}, cached ${result.cookies.length} cookies`);
        }
        return { ...result, method: 'remote-worker' };
    }
    catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        log.debug('Remote worker solve failed:', error);
        // Fall through to local solve on worker failure
        console.log(`[challenge-solver] Remote worker failed, attempting local ${challengeType} solve for ${getDomain(url)}`);
        switch (challengeType) {
            case 'cloudflare':
                return solveCloudflare(url, html, timeout, proxy);
            // Match the local dispatcher: these types have no local solver,
            // so do not launch a browser for them.
            case 'captcha':
                return { solved: false, html, error: 'No free captcha solver available for generic captcha' };
            case 'empty-shell':
                return { solved: false, html, error: 'empty-shell is not a challenge to solve' };
            default:
                return solveWithStealthBrowser(url, html, timeout, proxy, challengeType);
        }
    }
    finally {
        // Always disarm the abort timer, including when fetch itself threw.
        clearTimeout(timer);
    }
}
272
+ // ── Challenge resolution detection ───────────────────────────────────────────
273
/**
 * Poll a page until a Cloudflare challenge appears to have resolved.
 *
 * A page whose initial title contains no challenge phrase is reported as
 * solved immediately. Otherwise, every poll (about every 500 ms) checks:
 *   - title is non-empty and has no challenge phrase        -> solved
 *   - title is empty and no CF_CHALLENGE_SELECTORS element  -> solved
 *   - after network idle, title has no challenge phrase     -> solved
 *
 * @param page      Page-like object exposing title(), $(), waitForTimeout()
 *                  and waitForLoadState()
 * @param timeoutMs Polling budget in ms; the final iteration may overrun it by
 *                  roughly one poll interval plus the settle delay
 * @returns true when a resolution signal was seen, false when the budget ran out
 */
async function waitForChallengeResolution(page, timeoutMs) {
    const start = Date.now();
    const pollInterval = 500;
    const hasChallengeTitle = (title) => {
        const lower = title.toLowerCase();
        return CF_CHALLENGE_TITLES.some(t => lower.includes(t));
    };
    // Quick check: is it even a challenge page?
    const initialTitle = await page.title().catch(() => '');
    if (!hasChallengeTitle(initialTitle)) {
        // Not a challenge page to begin with — treat as solved
        return true;
    }
    // Poll until timeout
    while (Date.now() - start < timeoutMs) {
        await page.waitForTimeout(pollInterval);
        const title = await page.title().catch(() => '');
        // Title changed away from challenge
        const isChallengeTitle = hasChallengeTitle(title);
        if (!isChallengeTitle && title.length > 0) {
            // Give the page a moment to fully render
            await page.waitForTimeout(1000);
            return true;
        }
        // Check if challenge elements are gone
        let challengeElementGone = true;
        for (const selector of CF_CHALLENGE_SELECTORS) {
            try {
                const el = await page.$(selector);
                if (el) {
                    challengeElementGone = false;
                    break;
                }
            }
            catch {
                // Selector check failed — try the next selector
            }
        }
        if (challengeElementGone && !isChallengeTitle) {
            await page.waitForTimeout(500);
            return true;
        }
        // Playwright treats `timeout: 0` as "no timeout", so never pass a
        // non-positive value: stop polling once the budget is used up.
        const remainingMs = timeoutMs - (Date.now() - start);
        if (remainingMs <= 0) {
            break;
        }
        // Try waiting for network to settle (challenge often triggers fetches)
        try {
            await page.waitForLoadState('networkidle', { timeout: Math.min(3000, remainingMs) });
            const finalTitle = await page.title().catch(() => '');
            if (!hasChallengeTitle(finalTitle)) {
                return true;
            }
        }
        catch {
            // Timeout or error — continue polling
        }
    }
    return false;
}
341
+ // ── hCaptcha Accessibility Bypass ────────────────────────────────────────────
342
+ // TODO: hCaptcha Accessibility Bypass
343
+ // hCaptcha has an accessibility service at https://www.hcaptcha.com/accessibility
344
+ // that provides a cookie allowing users with accessibility needs to bypass hCaptcha.
345
+ //
346
+ // Implementation notes:
347
+ // - The service used to allow programmatic registration without email verification
348
+ // - As of 2025, it requires manual verification (email link) to activate
349
+ // - Since this requires human interaction, it cannot be fully automated
350
+ //
351
+ // When/if implemented:
352
+ // 1. Check https://www.hcaptcha.com/accessibility for current API status
353
+ // 2. Register with a request to their accessibility API
354
+ // 3. If they return a cookie directly (no email verification), cache it
355
+ // 4. Attach the cookie to requests to sites using hCaptcha
356
+ //
357
+ // const HCAPTCHA_ACCESSIBILITY_URL = 'https://accounts.hcaptcha.com/demo?sitekey=bf5558a0-...';
358
+ // export async function getHCaptchaAccessibilityCookie(): Promise<string | null> { ... }
359
+ // ── Utility ───────────────────────────────────────────────────────────────────
360
/**
 * Best-effort hostname extraction for log messages.
 *
 * @param url Absolute URL string
 * @returns The URL's hostname, or the input unchanged when it cannot be parsed
 */
function getDomain(url) {
    let hostname;
    try {
        ({ hostname } = new URL(url));
    }
    catch {
        // Unparseable input: echo it back so messages stay informative
        hostname = url;
    }
    return hostname;
}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * In-memory cookie cache with TTL.
3
+ *
4
+ * Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
5
+ * Cookies from challenge solves are cached here so future requests to the same
6
+ * domain skip the challenge entirely.
7
+ *
8
+ * Design goals:
9
+ * - Zero dependencies (plain Map + setTimeout)
10
+ * - In-memory only — no disk/DB persistence
11
+ * - TTL per entry (default 30 min, matching cf_clearance lifetime)
12
+ * - Thread-safe for single-process Node.js (event loop is single-threaded)
13
+ */
14
+ export interface CachedCookies {
15
+ /** Value for a Cookie request header: "name=value" pairs joined with "; " (no "Cookie:" prefix) */
16
+ cookieHeader: string;
17
+ /** Individual cookie strings as supplied when cached (e.g. ["cf_clearance=abc; Path=/", ...]) */
18
+ cookies: string[];
19
+ /** Unix timestamp (ms) when this cache entry expires */
20
+ expiresAt: number;
21
+ /** Normalized domain these cookies are stored under (lowercased, leading "www." removed) */
22
+ domain: string;
23
+ }
24
+ /**
25
+ * Store cookies for a domain, replacing any existing entry for it. An empty array is ignored.
26
+ *
27
+ * @param domain Hostname (e.g. "example.com" or "sub.example.com"); lowercased and stripped of a leading "www."
28
+ * @param cookies Array of Set-Cookie header values or cookie strings; only the leading "name=value" of each is used for the header
29
+ * @param ttlMs Time-to-live in ms (default: 30 min)
30
+ */
31
+ export declare function cacheCookies(domain: string, cookies: string[], ttlMs?: number): void;
32
+ /**
33
+ * Retrieve cached cookies for a domain, falling back to its immediate parent domain (one label up).
34
+ * Returns null if no valid (non-expired) entry exists; expired entries found on the way are removed.
35
+ *
36
+ * @param domain Hostname to look up
37
+ */
38
+ export declare function getCachedCookies(domain: string): CachedCookies | null;
39
+ /**
40
+ * Build a Cookie request header value from a URL.
41
+ * Returns undefined if no cached cookies exist or the URL cannot be parsed.
42
+ */
43
+ export declare function getCookieHeader(url: string): string | undefined;
44
+ /**
45
+ * Cache cookies from a URL's perspective.
46
+ * Extracts the hostname from the URL automatically; an invalid URL is silently ignored.
47
+ */
48
+ export declare function cacheCookiesForUrl(url: string, cookies: string[], ttlMs?: number): void;
49
+ /**
50
+ * Invalidate (remove) cached cookies stored under exactly this (normalized) domain.
51
+ */
52
+ export declare function invalidateCookies(domain: string): void;
53
+ /**
54
+ * Return the number of cached domains (for diagnostics); may include expired entries not yet removed.
55
+ */
56
+ export declare function getCacheSize(): number;
57
+ /**
58
+ * Clear ALL cached cookies and stop the periodic cleanup timer. Mainly for tests.
59
+ */
60
+ export declare function clearCookieCache(): void;
@@ -0,0 +1,163 @@
1
+ /**
2
+ * In-memory cookie cache with TTL.
3
+ *
4
+ * Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
5
+ * Cookies from challenge solves are cached here so future requests to the same
6
+ * domain skip the challenge entirely.
7
+ *
8
+ * Design goals:
9
+ * - Zero dependencies (plain Map + setTimeout)
10
+ * - In-memory only — no disk/DB persistence
11
+ * - TTL per entry (default 30 min, matching cf_clearance lifetime)
12
+ * - Thread-safe for single-process Node.js (event loop is single-threaded)
13
+ */
14
+ // ── Internal store ────────────────────────────────────────────────────────────
15
+ const store = new Map();
16
+ let cleanupTimer = null;
17
+ /** Default TTL: 30 minutes (presumably chosen to match the cf_clearance lifetime — confirm) */
18
+ const DEFAULT_TTL_MS = 30 * 60 * 1000;
19
+ // ── Public API ────────────────────────────────────────────────────────────────
20
/**
 * Remember a set of cookies for a domain, replacing any previous entry.
 * Does nothing when `cookies` is empty.
 *
 * @param domain  Hostname; normalized before being used as the cache key
 * @param cookies Set-Cookie header values or plain cookie strings
 * @param ttlMs   Lifetime of the entry in ms (defaults to DEFAULT_TTL_MS)
 */
export function cacheCookies(domain, cookies, ttlMs = DEFAULT_TTL_MS) {
    if (!cookies.length) {
        return;
    }
    const key = normalizeDomain(domain);
    const entry = {
        cookieHeader: buildCookieHeader(cookies),
        cookies,
        expiresAt: Date.now() + ttlMs,
        domain: key,
    };
    store.set(key, entry);
    // Ensure the periodic sweep is running now that the cache is non-empty
    startCleanup();
}
42
/**
 * Look up a live cache entry for a domain, trying the exact (normalized)
 * domain first and then its parent domain. Expired entries encountered during
 * the lookup are evicted.
 *
 * @param domain Hostname to look up
 * @returns The cached entry, or null when nothing valid is cached
 */
export function getCachedCookies(domain) {
    const host = normalizeDomain(domain);
    for (const key of [host, getParentDomain(host)]) {
        if (!key) {
            continue; // no parent domain (or empty host)
        }
        const hit = store.get(key);
        if (!hit) {
            continue;
        }
        if (hit.expiresAt > Date.now()) {
            return hit;
        }
        // Stale: drop it and keep looking
        store.delete(key);
    }
    return null;
}
64
/**
 * Produce the Cookie request header value cached for a URL's host.
 *
 * @param url Absolute URL string
 * @returns The header value, or undefined when nothing is cached or the URL is invalid
 */
export function getCookieHeader(url) {
    try {
        const { hostname } = new URL(url);
        const entry = getCachedCookies(hostname);
        return entry ? entry.cookieHeader : undefined;
    }
    catch {
        return undefined;
    }
}
78
/**
 * Convenience wrapper around cacheCookies that derives the domain from a URL.
 * Any error (such as an unparseable URL) is swallowed and nothing is cached.
 *
 * @param url     Absolute URL whose hostname keys the cache entry
 * @param cookies Set-Cookie header values or plain cookie strings
 * @param ttlMs   Lifetime of the entry in ms (defaults to DEFAULT_TTL_MS)
 */
export function cacheCookiesForUrl(url, cookies, ttlMs = DEFAULT_TTL_MS) {
    try {
        cacheCookies(new URL(url).hostname, cookies, ttlMs);
    }
    catch {
        // Invalid URL — ignore
    }
}
91
/**
 * Drop the cache entry stored under a domain (after normalization).
 *
 * @param domain Hostname whose entry should be removed
 */
export function invalidateCookies(domain) {
    store.delete(normalizeDomain(domain));
}
98
/**
 * Report how many domains currently have an entry in the cache (diagnostics).
 *
 * @returns Number of entries in the store
 */
export function getCacheSize() {
    const { size } = store;
    return size;
}
104
/**
 * Empty the cache completely and stop the periodic cleanup timer if it is
 * running. Mainly for tests.
 */
export function clearCookieCache() {
    store.clear();
    if (!cleanupTimer) {
        return;
    }
    clearInterval(cleanupTimer);
    cleanupTimer = null;
}
114
+ // ── Helpers ───────────────────────────────────────────────────────────────────
115
/**
 * Canonical cache key for a hostname: lowercase, with one leading "www." removed.
 *
 * @param domain Hostname
 * @returns Normalized hostname
 */
function normalizeDomain(domain) {
    const lower = domain.toLowerCase();
    return lower.startsWith('www.') ? lower.slice('www.'.length) : lower;
}
119
/**
 * Parent of a domain, obtained by dropping its first label.
 *
 * @param domain Hostname
 * @returns The parent domain, or null when the input has two labels or fewer
 */
function getParentDomain(domain) {
    const labels = domain.split('.');
    // Two labels or fewer: treated as already being a root domain
    return labels.length > 2 ? labels.slice(1).join('.') : null;
}
126
/**
 * Collapse Set-Cookie values (or raw cookie strings) into a single Cookie
 * header value of the form "name=value; name2=value2".
 *
 * Only the text before the first ";" of each input is kept (attributes such as
 * Path/Secure/HttpOnly are dropped); inputs whose leading part is blank are skipped.
 *
 * @param cookies Cookie strings
 * @returns The joined header value ("" when nothing usable was supplied)
 */
function buildCookieHeader(cookies) {
    return cookies
        .map(raw => raw.split(';')[0]?.trim())
        .filter(Boolean)
        .join('; ');
}
142
/**
 * Start the periodic sweep that evicts expired entries, unless it is already
 * running. The sweep runs every 5 minutes and stops itself once the cache is
 * empty. The timer is unref'd (when supported) so it does not keep the
 * process alive.
 */
function startCleanup() {
    if (cleanupTimer) {
        return;
    }
    const sweep = () => {
        const now = Date.now();
        store.forEach((entry, domain) => {
            if (entry.expiresAt <= now) {
                store.delete(domain);
            }
        });
        // Stop the timer if the cache is empty
        if (store.size === 0 && cleanupTimer) {
            clearInterval(cleanupTimer);
            cleanupTimer = null;
        }
    };
    cleanupTimer = setInterval(sweep, 5 * 60 * 1000); // Run every 5 minutes
    // Don't block Node.js process exit
    if (cleanupTimer && typeof cleanupTimer.unref === 'function') {
        cleanupTimer.unref();
    }
}
@@ -15,6 +15,7 @@ import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types
15
15
  import { getCached } from './cache.js';
16
16
  import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
17
17
  import { detectChallenge } from './challenge-detection.js';
18
+ import { getCookieHeader } from './cookie-cache.js';
18
19
  import { createLogger } from './logger.js';
19
20
  const log = createLogger('http');
20
21
  // ── HTTP status text fallbacks (HTTP/2 omits reason phrases) ──────────────────
@@ -515,8 +516,15 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
515
516
  let activeUserAgent = isSecGov
516
517
  ? 'WebPeel/1.0 (support@webpeel.dev)'
517
518
  : (userAgent ? validateUserAgent(userAgent) : getHttpUA());
519
+ // Inject cached challenge-solve cookies (e.g. cf_clearance) if available.
520
+ // These are merged into customHeaders so they ride along on every request
521
+ // to this domain, skipping repeated challenge pages.
522
+ const cachedCookieHeader = getCookieHeader(url);
523
+ const effectiveCustomHeaders = cachedCookieHeader
524
+ ? { Cookie: cachedCookieHeader, ...(customHeaders || {}) }
525
+ : customHeaders;
518
526
  // Build stealth headers merged with any caller-supplied custom headers
519
- let mergedHeaders = buildMergedHeaders(url, activeUserAgent, customHeaders);
527
+ let mergedHeaders = buildMergedHeaders(url, activeUserAgent, effectiveCustomHeaders);
520
528
  // Auto-route through residential proxy for sites known to block datacenter IPs.
521
529
  // The explicit `proxy` param always wins; auto-proxy only kicks in when unset.
522
530
  const effectiveProxy = proxy ?? (shouldUseProxy(url) ? (getWebshareProxyUrl() ?? undefined) : undefined);
@@ -467,9 +467,38 @@ export async function fetchContent(ctx) {
467
467
  // Capture raw HTML size BEFORE any processing (accurate measurement of original content)
468
468
  ctx.rawHtmlSize = fetchResult.html?.length || 0;
469
469
  ctx.fetchResult = fetchResult;
470
- // Warn when a challenge/CAPTCHA page was detected
470
+ // Attempt to solve challenge/CAPTCHA page when detected
471
471
  if (fetchResult.challengeDetected) {
472
- ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
472
+ const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
473
+ // Only attempt solve if we have a browser worker URL or are not on a resource-constrained env
474
+ const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
475
+ if (canSolve) {
476
+ try {
477
+ const { solveChallenge } = await import('./challenge-solver.js');
478
+ const { detectChallenge } = await import('./challenge-detection.js');
479
+ const rawHtml = fetchResult.html || '';
480
+ const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
481
+ const challengeType = detectionResult.type || 'generic-block';
482
+ const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
483
+ timeout: 15000,
484
+ });
485
+ if (solveResult.solved && solveResult.html) {
486
+ fetchResult.html = solveResult.html;
487
+ fetchResult.challengeDetected = false;
488
+ log.debug(`Challenge solved (${challengeType}) for ${ctx.url}`);
489
+ }
490
+ else {
491
+ ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
492
+ }
493
+ }
494
+ catch (e) {
495
+ ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
496
+ log.debug('Challenge solve failed:', e instanceof Error ? e.message : e);
497
+ }
498
+ }
499
+ else {
500
+ ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
501
+ }
473
502
  }
474
503
  }
475
504
  // ---------------------------------------------------------------------------
@@ -1004,22 +1033,59 @@ export async function postProcess(ctx) {
1004
1033
  ctx.metadata.blocked = true;
1005
1034
  ctx.metadata.challengeDetected = true;
1006
1035
  }
1007
- // Try search fallback for the real content
1008
- try {
1009
- // @ts-ignore proprietary module, gitignored
1010
- const { searchFallback } = await import('./search-fallback.js');
1011
- const searchResult = await searchFallback(ctx.url);
1012
- if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
1013
- ctx.content = searchResult.cachedContent;
1014
- ctx.title = searchResult.title || ctx.title;
1015
- ctx.quality = 0.4;
1016
- ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
1017
- if (ctx.metadata) {
1018
- ctx.metadata.fallbackSource = searchResult.source;
1036
+ // Try challenge solver first (if browser worker available or local solve enabled)
1037
+ let solvedViaChallengeSolver = false;
1038
+ const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
1039
+ const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
1040
+ if (canSolve && ctx.fetchResult?.html) {
1041
+ try {
1042
+ const { solveChallenge } = await import('./challenge-solver.js');
1043
+ const { detectChallenge } = await import('./challenge-detection.js');
1044
+ const rawHtml = ctx.fetchResult.html;
1045
+ const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);
1046
+ const challengeType = detectionResult.type || 'cloudflare';
1047
+ const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
1048
+ timeout: 15000,
1049
+ });
1050
+ if (solveResult.solved && solveResult.html) {
1051
+ // Re-parse the solved HTML
1052
+ const { htmlToMarkdown, htmlToText, cleanForAI } = await import('./markdown.js');
1053
+ const fmt = ctx.format || 'markdown';
1054
+ ctx.content = fmt === 'text' ? htmlToText(solveResult.html)
1055
+ : fmt === 'clean' ? cleanForAI(solveResult.html)
1056
+ : htmlToMarkdown(solveResult.html);
1057
+ ctx.fetchResult.html = solveResult.html;
1058
+ if (ctx.metadata) {
1059
+ ctx.metadata.blocked = false;
1060
+ ctx.metadata.challengeDetected = false;
1061
+ ctx.metadata.challengeSolved = true;
1062
+ }
1063
+ solvedViaChallengeSolver = true;
1064
+ log.debug(`Content-level challenge solved for ${ctx.url}`);
1065
+ }
1066
+ }
1067
+ catch (e) {
1068
+ log.debug('Content-level challenge solve failed:', e instanceof Error ? e.message : e);
1069
+ }
1070
+ }
1071
+ // Fall back to search fallback if challenge solve didn't work
1072
+ if (!solvedViaChallengeSolver) {
1073
+ try {
1074
+ // @ts-ignore — proprietary module, gitignored
1075
+ const { searchFallback } = await import('./search-fallback.js');
1076
+ const searchResult = await searchFallback(ctx.url);
1077
+ if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
1078
+ ctx.content = searchResult.cachedContent;
1079
+ ctx.title = searchResult.title || ctx.title;
1080
+ ctx.quality = 0.4;
1081
+ ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
1082
+ if (ctx.metadata) {
1083
+ ctx.metadata.fallbackSource = searchResult.source;
1084
+ }
1019
1085
  }
1020
1086
  }
1087
+ catch { /* Search fallback failed — continue with challenge page content */ }
1021
1088
  }
1022
- catch { /* Search fallback failed — continue with challenge page content */ }
1023
1089
  }
1024
1090
  }
1025
1091
  // === Zero-token safety net ===
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.45",
3
+ "version": "0.21.46",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",