webpeel 0.21.45 → 0.21.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/challenge-solver.d.ts +46 -0
- package/dist/core/challenge-solver.js +367 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/http-fetch.js +9 -1
- package/dist/core/pipeline.js +81 -15
- package/package.json +1 -1
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Challenge / bot-protection solver.
|
|
3
|
+
*
|
|
4
|
+
* Attempts to bypass bot-protection challenges using free, in-process methods:
|
|
5
|
+
* 1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
|
|
6
|
+
* 2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
|
|
7
|
+
*
|
|
8
|
+
* Architecture note:
|
|
9
|
+
* Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
|
|
10
|
+
* is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
|
|
11
|
+
* instead of running locally. This keeps the main Render container (512 MB) lean.
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* const result = await solveChallenge(url, 'cloudflare', html);
|
|
15
|
+
* if (result.solved) {
|
|
16
|
+
* // result.html = real page content
|
|
17
|
+
* // result.cookies = ["cf_clearance=...", ...]
|
|
18
|
+
* }
|
|
19
|
+
*/
|
|
20
|
+
import type { ChallengeType } from './challenge-detection.js';
|
|
21
|
+
/**
 * Optional knobs accepted by `solveChallenge`.
 */
export interface SolveOptions {
    /**
     * Time budget for the solve attempt, in milliseconds.
     * Defaults to 15 000 when omitted.
     */
    timeout?: number;
    /**
     * Proxy to route the solve through, written as a URL
     * (for example `http://user:pass@host:port`).
     */
    proxy?: string;
}
|
|
27
|
+
/**
 * Outcome of a `solveChallenge` call.
 */
export interface SolveResult {
    /** `true` when the challenge was cleared and `html` holds the real page. */
    solved: boolean;
    /** Page HTML: the real content on success, otherwise whatever HTML was available. */
    html: string;
    /**
     * Cookies collected after the solve, formatted like raw Set-Cookie
     * header values (e.g. `"cf_clearance=...; Path=/"`).
     */
    cookies?: string[];
    /** Which mechanism produced this result. */
    method?: 'local-browser' | 'remote-worker' | 'accessibility';
    /** Human-readable failure reason; present when the solve did not succeed. */
    error?: string;
}
|
|
37
|
+
/**
|
|
38
|
+
* Attempt to solve a bot-protection challenge.
|
|
39
|
+
*
|
|
40
|
+
* @param url The page URL (used for proxy routing and cookie caching)
|
|
41
|
+
* @param challengeType The type of challenge as detected by challenge-detection
|
|
42
|
+
* @param html The raw challenge HTML (used for context / fallback)
|
|
43
|
+
* @param options Optional timeout and proxy settings
|
|
44
|
+
* @returns Solve result with real HTML content and cookies if successful
|
|
45
|
+
*/
|
|
46
|
+
export declare function solveChallenge(url: string, challengeType: ChallengeType, html: string, options?: SolveOptions): Promise<SolveResult>;
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Challenge / bot-protection solver.
|
|
3
|
+
*
|
|
4
|
+
* Attempts to bypass bot-protection challenges using free, in-process methods:
|
|
5
|
+
* 1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
|
|
6
|
+
* 2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
|
|
7
|
+
*
|
|
8
|
+
* Architecture note:
|
|
9
|
+
* Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
|
|
10
|
+
* is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
|
|
11
|
+
* instead of running locally. This keeps the main Render container (512 MB) lean.
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* const result = await solveChallenge(url, 'cloudflare', html);
|
|
15
|
+
* if (result.solved) {
|
|
16
|
+
* // result.html = real page content
|
|
17
|
+
* // result.cookies = ["cf_clearance=...", ...]
|
|
18
|
+
* }
|
|
19
|
+
*/
|
|
20
|
+
import { cacheCookiesForUrl } from './cookie-cache.js';
|
|
21
|
+
import { createLogger } from './logger.js';
|
|
22
|
+
const log = createLogger('challenge-solver');
|
|
23
|
+
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
24
|
+
const DEFAULT_TIMEOUT_MS = 15_000;
|
|
25
|
+
/** Cloudflare challenge title before it's solved */
|
|
26
|
+
const CF_CHALLENGE_TITLES = ['just a moment', 'please wait', 'one moment, please', 'checking your browser'];
|
|
27
|
+
/** Cloudflare challenge page markers */
|
|
28
|
+
const CF_CHALLENGE_SELECTORS = [
|
|
29
|
+
'#challenge-running',
|
|
30
|
+
'#challenge-form',
|
|
31
|
+
'#cf-challenge-running',
|
|
32
|
+
'.cf-browser-verification',
|
|
33
|
+
];
|
|
34
|
+
// ── Main entry point ──────────────────────────────────────────────────────────
|
|
35
|
+
/**
 * Entry point: try to get past a bot-protection challenge.
 *
 * Routing:
 *   - If BROWSER_WORKER_URL is set, every challenge type is forwarded to the
 *     remote worker (which itself falls back to a local solve on failure).
 *   - Otherwise the challenge type selects a local strategy, or an immediate
 *     "not solved" result for types that have no local solver.
 *
 * @param url           The page URL
 * @param challengeType Challenge type as reported by challenge-detection
 * @param html          The raw challenge HTML; echoed back in unsolved results
 * @param options       Optional `timeout` (ms, default DEFAULT_TIMEOUT_MS) and `proxy`
 * @returns A promise for the solve result
 */
export async function solveChallenge(url, challengeType, html, options = {}) {
    const host = getDomain(url);
    const budgetMs = options.timeout ?? DEFAULT_TIMEOUT_MS;
    console.log(`[challenge-solver] Attempting ${challengeType} solve for ${host}`);

    // Remote worker takes precedence over any local strategy.
    const remoteBase = process.env.BROWSER_WORKER_URL;
    if (remoteBase) {
        return solveViaRemoteWorker(url, challengeType, html, { timeout: budgetMs, proxy: options.proxy, workerUrl: remoteBase });
    }

    // Local strategies.
    if (challengeType === 'cloudflare') {
        return solveCloudflare(url, html, budgetMs, options.proxy);
    }
    if (challengeType === 'captcha') {
        // No free solver exists for generic captchas (hCaptcha bypass is still a TODO).
        return { solved: false, html, error: 'No free captcha solver available for generic captcha' };
    }
    // Challenges that may clear on their own when rendered in a stealth browser.
    const stealthTypes = ['datadome', 'akamai', 'perimeterx', 'incapsula', 'generic-block'];
    if (stealthTypes.includes(challengeType)) {
        return solveWithStealthBrowser(url, html, budgetMs, options.proxy, challengeType);
    }
    if (challengeType === 'empty-shell') {
        // An SPA shell is not a challenge; callers should not route it here.
        return { solved: false, html, error: 'empty-shell is not a challenge to solve' };
    }
    return { solved: false, html, error: `Unknown challenge type: ${challengeType}` };
}
|
|
76
|
+
// ── Cloudflare solver ─────────────────────────────────────────────────────────
|
|
77
|
+
/**
 * Solve a Cloudflare JS challenge by loading the page in a stealth browser context.
 *
 * Cloudflare's "Just a moment..." page runs JavaScript fingerprinting and, when the
 * browser passes, redirects to the real page without human interaction.
 *
 * Steps:
 *   1. Create a fresh context (random UA / viewport, optional proxy) on the shared stealth browser
 *   2. Navigate to the URL
 *   3. Wait for the challenge to clear (see waitForChallengeResolution)
 *   4. Read the resulting HTML and the context's cookies
 *   5. Cache the cookies for the URL, bounded by cf_clearance expiry (max 30 min)
 *
 * Errors are caught and reported as `{ solved: false, html, error }`.
 *
 * @param url       Page URL to load
 * @param _html     Original challenge HTML, returned when no better HTML can be read
 * @param timeoutMs Time budget in ms, shared between navigation and challenge polling
 * @param proxy     Optional proxy server URL for the browser context
 * @returns Solve result; on success it carries the page HTML, cookie strings and method 'local-browser'
 */
async function solveCloudflare(url, _html, timeoutMs, proxy) {
    const MAX_TTL_MS = 30 * 60 * 1000;
    // Declared outside the try so the finally block can release it on every path.
    let ctx = null;
    try {
        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
        const browser = await getStealthBrowser();
        const vp = getRandomViewport();
        ctx = await browser.newContext({
            userAgent: getRandomUserAgent(),
            viewport: { width: vp.width, height: vp.height },
            ...(proxy ? { proxy: { server: proxy } } : {}),
            // Fixed locale/timezone pair so the fingerprint is internally consistent
            locale: 'en-US',
            timezoneId: 'America/New_York',
        });
        const page = await ctx.newPage();
        await applyStealthScripts(page);
        // Navigate to the challenge URL
        const navStartedAt = Date.now();
        await page.goto(url, {
            waitUntil: 'domcontentloaded',
            timeout: timeoutMs,
        });
        // Polling only gets what is left of the budget after navigation, so the
        // whole attempt stays close to timeoutMs instead of up to twice that.
        const remainingMs = Math.max(0, timeoutMs - (Date.now() - navStartedAt));
        const solved = await waitForChallengeResolution(page, remainingMs);
        if (!solved) {
            log.debug('Cloudflare challenge did not resolve within timeout');
            // Snapshot the page while the context is still open; fall back to the
            // caller-supplied HTML only if the page cannot be read.
            const challengeHtml = await page.content().catch(() => _html);
            return { solved: false, html: challengeHtml, error: 'Cloudflare challenge timed out' };
        }
        // Extract real page content
        const realHtml = await page.content();
        // Extract cookies (especially cf_clearance) and format them like Set-Cookie values
        const cookies = await ctx.cookies();
        const cookieStrings = cookies.map(c => {
            let s = `${c.name}=${c.value}`;
            if (c.path)
                s += `; Path=${c.path}`;
            if (c.domain)
                s += `; Domain=${c.domain}`;
            if (c.secure)
                s += '; Secure';
            if (c.httpOnly)
                s += '; HttpOnly';
            // expires is in seconds; non-positive values denote a session cookie
            if (c.expires && c.expires > 0) {
                s += `; Expires=${new Date(c.expires * 1000).toUTCString()}`;
            }
            return s;
        });
        // TTL follows cf_clearance expiry, capped at 30 min and never negative
        const cfClearance = cookies.find(c => c.name === 'cf_clearance');
        const ttlMs = cfClearance?.expires && cfClearance.expires > 0
            ? Math.max(0, Math.min((cfClearance.expires * 1000) - Date.now(), MAX_TTL_MS))
            : MAX_TTL_MS;
        // Cache cookies for future requests
        if (cookieStrings.length > 0) {
            cacheCookiesForUrl(url, cookieStrings, ttlMs);
            log.debug(`Cached ${cookieStrings.length} cookies for ${getDomain(url)} (TTL: ${Math.round(ttlMs / 60000)}m)`);
        }
        console.log(`[challenge-solver] Cloudflare challenge solved for ${getDomain(url)}, extracted ${cookieStrings.length} cookies`);
        return {
            solved: true,
            html: realHtml,
            cookies: cookieStrings,
            method: 'local-browser',
        };
    }
    catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        log.debug('Cloudflare solve failed:', error);
        return { solved: false, html: _html, error };
    }
    finally {
        // Always release the per-solve context (success, timeout, or exception).
        // The shared browser itself is managed by browser-pool and stays open.
        if (ctx) {
            await ctx.close().catch(() => { });
        }
    }
}
|
|
170
|
+
// ── Generic stealth browser solver ───────────────────────────────────────────
|
|
171
|
+
/**
 * General-purpose stealth browser solve for challenges that may auto-resolve
 * when rendered in a legitimate-looking browser (DataDome, Akamai, etc.).
 *
 * The page is loaded until network idle, given two more seconds for scripts to
 * run, and then inspected: if the title or HTML still carries Cloudflare-style
 * challenge markers the attempt is reported as unsolved.
 *
 * Errors are caught and reported as `{ solved: false, html, error }`.
 *
 * @param url           Page URL to load
 * @param _html         Original challenge HTML, returned when the browser attempt throws
 * @param timeoutMs     Navigation timeout in ms
 * @param proxy         Optional proxy server URL for the browser context
 * @param challengeType Challenge type, used only in log and error messages
 * @returns Solve result; on success it carries the page HTML, cookie strings and method 'local-browser'
 */
async function solveWithStealthBrowser(url, _html, timeoutMs, proxy, challengeType) {
    // Declared outside the try so the finally block can release it on every path.
    let ctx = null;
    try {
        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
        const browser = await getStealthBrowser();
        const vp = getRandomViewport();
        ctx = await browser.newContext({
            userAgent: getRandomUserAgent(),
            viewport: { width: vp.width, height: vp.height },
            ...(proxy ? { proxy: { server: proxy } } : {}),
            locale: 'en-US',
        });
        const page = await ctx.newPage();
        await applyStealthScripts(page);
        await page.goto(url, {
            waitUntil: 'networkidle',
            timeout: timeoutMs,
        });
        // Wait a bit for any JS-based challenges to execute
        await page.waitForTimeout(2000);
        const html = await page.content();
        const cookies = await ctx.cookies();
        const cookieStrings = cookies.map(c => `${c.name}=${c.value}; Path=${c.path || '/'}${c.domain ? `; Domain=${c.domain}` : ''}`);
        // Check if we got real content (not a challenge page)
        const titleEl = await page.title().catch(() => '');
        const isStillChallenge = CF_CHALLENGE_TITLES.some(t => titleEl.toLowerCase().includes(t))
            || html.includes('cf-browser-verification')
            || html.includes('challenge-form');
        if (isStillChallenge) {
            return { solved: false, html, error: `${challengeType} challenge did not resolve` };
        }
        if (cookieStrings.length > 0) {
            cacheCookiesForUrl(url, cookieStrings);
        }
        console.log(`[challenge-solver] ${challengeType} challenge solved for ${getDomain(url)}`);
        return { solved: true, html, cookies: cookieStrings, method: 'local-browser' };
    }
    catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        log.debug(`${challengeType} solve failed:`, error);
        return { solved: false, html: _html, error };
    }
    finally {
        // Always release the per-solve context, including when navigation throws.
        // The shared browser itself is managed by browser-pool and stays open.
        if (ctx) {
            await ctx.close().catch(() => { });
        }
    }
}
|
|
222
|
+
// ── Remote worker proxy ───────────────────────────────────────────────────────
|
|
223
|
+
/**
 * Proxy a solve request to a remote browser worker (e.g. Hetzner VPS).
 *
 * The worker endpoint is expected to accept:
 *   POST /solve
 *   { url, challengeType, timeout, proxy? }
 *
 * And return:
 *   { solved: boolean, html: string, cookies?: string[], error?: string }
 *
 * Set BROWSER_WORKER_URL to the worker base URL (e.g. http://hetzner:3001)
 * to route all browser-based challenge solving to the worker.
 *
 * If the request fails (network error, abort, non-2xx status, unparseable body)
 * the solve is retried locally: 'cloudflare' via solveCloudflare, every other
 * type via solveWithStealthBrowser.
 * NOTE(review): that fallback also launches a browser for 'captcha' and
 * 'empty-shell', which the local path in solveChallenge rejects outright —
 * confirm this is intended.
 *
 * @param url           Page URL to solve
 * @param challengeType Challenge type forwarded to the worker
 * @param html          Original challenge HTML, passed to the local fallback
 * @param options       `{ workerUrl, timeout, proxy }`; the request is aborted after timeout + 5 s
 * @returns The worker's result tagged with method 'remote-worker', or the local fallback result
 */
async function solveViaRemoteWorker(url, challengeType, html, options) {
    const { workerUrl, timeout, proxy } = options;
    try {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), timeout + 5000); // Add buffer
        let result;
        try {
            // Tolerate a trailing slash on the configured base URL.
            const endpoint = `${workerUrl.replace(/\/+$/, '')}/solve`;
            const response = await fetch(endpoint, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ url, challengeType, timeout, ...(proxy ? { proxy } : {}) }),
                signal: controller.signal,
            });
            if (!response.ok) {
                throw new Error(`Worker returned HTTP ${response.status}`);
            }
            // Read the body while the abort timer is still armed so a stalled
            // response stream is bounded too.
            result = await response.json();
        }
        finally {
            // Clear on every path; otherwise a failed fetch leaves the timer pending.
            clearTimeout(timer);
        }
        // Cache cookies from remote solve
        if (result.solved && result.cookies?.length) {
            cacheCookiesForUrl(url, result.cookies);
            console.log(`[challenge-solver] Remote ${challengeType} solve for ${getDomain(url)}, cached ${result.cookies.length} cookies`);
        }
        return { ...result, method: 'remote-worker' };
    }
    catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        log.debug('Remote worker solve failed:', error);
        // Fall through to local solve on worker failure
        console.log(`[challenge-solver] Remote worker failed, attempting local ${challengeType} solve for ${getDomain(url)}`);
        switch (challengeType) {
            case 'cloudflare':
                return solveCloudflare(url, html, options.timeout, options.proxy);
            default:
                return solveWithStealthBrowser(url, html, options.timeout, options.proxy, challengeType);
        }
    }
}
|
|
272
|
+
// ── Challenge resolution detection ───────────────────────────────────────────
|
|
273
|
+
/**
 * Wait for a Cloudflare challenge page to resolve.
 *
 * Cloudflare's challenge works like this:
 *   1. Initial page: title is "Just a moment..." with challenge elements
 *   2. Browser runs JS fingerprinting
 *   3. On pass: redirects to real page (title and content change)
 *   4. On fail: stays on challenge page
 *
 * Each polling round checks, in order:
 *   - a non-empty title that is no longer a challenge title
 *   - no challenge title and no challenge element left in the DOM
 *   - after the network goes idle, a non-empty non-challenge title
 *
 * @param page      Playwright page that has already navigated to the URL
 * @param timeoutMs Maximum time to keep polling, in ms
 * @returns `true` if the page is not (or is no longer) a challenge page,
 *          `false` if it still looks like one when the time runs out
 */
async function waitForChallengeResolution(page, timeoutMs) {
    const start = Date.now();
    const pollInterval = 500;
    const looksLikeChallenge = (title) => {
        const lower = title.toLowerCase();
        return CF_CHALLENGE_TITLES.some(t => lower.includes(t));
    };
    // Quick check: is it even a challenge page?
    const initialTitle = await page.title().catch(() => '');
    if (!looksLikeChallenge(initialTitle)) {
        // Not a challenge page to begin with — treat as solved
        return true;
    }
    // Poll until timeout
    while (Date.now() - start < timeoutMs) {
        await page.waitForTimeout(pollInterval);
        // An unreadable title (e.g. mid-navigation) is reported as ''
        const title = await page.title().catch(() => '');
        const isChallengeTitle = looksLikeChallenge(title);
        // Title changed away from challenge
        if (!isChallengeTitle && title.length > 0) {
            // Give the page a moment to fully render
            await page.waitForTimeout(1000);
            return true;
        }
        // Check if challenge elements are gone
        let challengeElementGone = true;
        for (const selector of CF_CHALLENGE_SELECTORS) {
            try {
                const el = await page.$(selector);
                if (el) {
                    challengeElementGone = false;
                    break;
                }
            }
            catch {
                // Selector check failed — continue
            }
        }
        if (challengeElementGone && !isChallengeTitle) {
            await page.waitForTimeout(500);
            return true;
        }
        // Stop before asking Playwright to wait with a non-positive timeout:
        // a timeout of 0 disables the limit and could block well past the budget.
        const remainingMs = timeoutMs - (Date.now() - start);
        if (remainingMs <= 0) {
            break;
        }
        // Try waiting for network to settle (challenge often triggers fetches)
        try {
            await page.waitForLoadState('networkidle', { timeout: Math.min(3000, remainingMs) });
            const finalTitle = await page.title().catch(() => '');
            // Require a real title here as well; an empty one means the title could
            // not be read (or the page has none) and is left to the element check
            // on the next round.
            if (finalTitle.length > 0 && !looksLikeChallenge(finalTitle)) {
                return true;
            }
        }
        catch {
            // Timeout or error — continue polling
        }
    }
    return false;
}
|
|
341
|
+
// ── hCaptcha Accessibility Bypass ────────────────────────────────────────────
|
|
342
|
+
// TODO: hCaptcha Accessibility Bypass
|
|
343
|
+
// hCaptcha has an accessibility service at https://www.hcaptcha.com/accessibility
|
|
344
|
+
// that provides a cookie allowing users with accessibility needs to bypass hCaptcha.
|
|
345
|
+
//
|
|
346
|
+
// Implementation notes:
|
|
347
|
+
// - The service used to allow programmatic registration without email verification
|
|
348
|
+
// - As of 2025, it requires manual verification (email link) to activate
|
|
349
|
+
// - Since this requires human interaction, it cannot be fully automated
|
|
350
|
+
//
|
|
351
|
+
// When/if implemented:
|
|
352
|
+
// 1. Check https://www.hcaptcha.com/accessibility for current API status
|
|
353
|
+
// 2. Register with a request to their accessibility API
|
|
354
|
+
// 3. If they return a cookie directly (no email verification), cache it
|
|
355
|
+
// 4. Attach the cookie to requests to sites using hCaptcha
|
|
356
|
+
//
|
|
357
|
+
// const HCAPTCHA_ACCESSIBILITY_URL = 'https://accounts.hcaptcha.com/demo?sitekey=bf5558a0-...';
|
|
358
|
+
// export async function getHCaptchaAccessibilityCookie(): Promise<string | null> { ... }
|
|
359
|
+
// ── Utility ───────────────────────────────────────────────────────────────────
|
|
360
|
+
/**
 * Best-effort hostname extraction for log and error messages.
 *
 * @param url Absolute URL string
 * @returns The URL's hostname, or the input unchanged when it cannot be parsed
 */
function getDomain(url) {
    let host;
    try {
        ({ hostname: host } = new URL(url));
    }
    catch {
        // Not a parseable URL: echo the input back
        host = url;
    }
    return host;
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory cookie cache with TTL.
|
|
3
|
+
*
|
|
4
|
+
* Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
|
|
5
|
+
* Cookies from challenge solves are cached here so future requests to the same
|
|
6
|
+
* domain skip the challenge entirely.
|
|
7
|
+
*
|
|
8
|
+
* Design goals:
|
|
9
|
+
* - Zero dependencies (plain Map + setTimeout)
|
|
10
|
+
* - In-memory only — no disk/DB persistence
|
|
11
|
+
* - TTL per entry (default 30 min, matching cf_clearance lifetime)
|
|
12
|
+
* - Thread-safe for single-process Node.js (event loop is single-threaded)
|
|
13
|
+
*/
|
|
14
|
+
/**
 * One cache entry: the cookies remembered for a single domain.
 */
export interface CachedCookies {
    /**
     * Ready-to-send value for a `Cookie` request header
     * (`name=value` pairs joined with "; ").
     */
    cookieHeader: string;
    /** The cookie strings as they were stored (e.g. `"cf_clearance=abc; Path=/"`). */
    cookies: string[];
    /** Expiry of this entry as a Unix timestamp in milliseconds. */
    expiresAt: number;
    /** Domain this entry is stored under. */
    domain: string;
}
|
|
24
|
+
/**
|
|
25
|
+
* Store cookies for a domain.
|
|
26
|
+
*
|
|
27
|
+
* @param domain Hostname (e.g. "example.com" or "sub.example.com")
|
|
28
|
+
* @param cookies Array of Set-Cookie header values or cookie strings
|
|
29
|
+
* @param ttlMs Time-to-live in ms (default: 30 min)
|
|
30
|
+
*/
|
|
31
|
+
export declare function cacheCookies(domain: string, cookies: string[], ttlMs?: number): void;
|
|
32
|
+
/**
|
|
33
|
+
* Retrieve cached cookies for a domain (or its parent domain).
|
|
34
|
+
* Returns null if no valid (non-expired) entry exists.
|
|
35
|
+
*
|
|
36
|
+
* @param domain Hostname to look up
|
|
37
|
+
*/
|
|
38
|
+
export declare function getCachedCookies(domain: string): CachedCookies | null;
|
|
39
|
+
/**
|
|
40
|
+
* Build a Cookie request header value from a URL.
|
|
41
|
+
* Returns undefined if no cached cookies exist.
|
|
42
|
+
*/
|
|
43
|
+
export declare function getCookieHeader(url: string): string | undefined;
|
|
44
|
+
/**
|
|
45
|
+
* Cache cookies from a URL's perspective.
|
|
46
|
+
* Extracts domain from URL automatically.
|
|
47
|
+
*/
|
|
48
|
+
export declare function cacheCookiesForUrl(url: string, cookies: string[], ttlMs?: number): void;
|
|
49
|
+
/**
|
|
50
|
+
* Invalidate (remove) cached cookies for a domain.
|
|
51
|
+
*/
|
|
52
|
+
export declare function invalidateCookies(domain: string): void;
|
|
53
|
+
/**
|
|
54
|
+
* Return the number of cached domains (for diagnostics).
|
|
55
|
+
*/
|
|
56
|
+
export declare function getCacheSize(): number;
|
|
57
|
+
/**
|
|
58
|
+
* Clear ALL cached cookies. Mainly for tests.
|
|
59
|
+
*/
|
|
60
|
+
export declare function clearCookieCache(): void;
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory cookie cache with TTL.
|
|
3
|
+
*
|
|
4
|
+
* Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
|
|
5
|
+
* Cookies from challenge solves are cached here so future requests to the same
|
|
6
|
+
* domain skip the challenge entirely.
|
|
7
|
+
*
|
|
8
|
+
* Design goals:
|
|
9
|
+
* - Zero dependencies (plain Map + setTimeout)
|
|
10
|
+
* - In-memory only — no disk/DB persistence
|
|
11
|
+
* - TTL per entry (default 30 min, matching cf_clearance lifetime)
|
|
12
|
+
* - Thread-safe for single-process Node.js (event loop is single-threaded)
|
|
13
|
+
*/
|
|
14
|
+
// ── Internal store ────────────────────────────────────────────────────────────
|
|
15
|
+
const store = new Map();
|
|
16
|
+
let cleanupTimer = null;
|
|
17
|
+
/** Default TTL: 30 minutes (cf_clearance lasts 30 min) */
|
|
18
|
+
const DEFAULT_TTL_MS = 30 * 60 * 1000;
|
|
19
|
+
// ── Public API ────────────────────────────────────────────────────────────────
|
|
20
|
+
/**
 * Remember a set of cookies for a domain, replacing any existing entry.
 * An empty cookie list is a no-op.
 *
 * @param domain  Hostname (e.g. "example.com" or "sub.example.com"); stored in normalized form
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @param ttlMs   Lifetime of the entry in ms (default: 30 min)
 */
export function cacheCookies(domain, cookies, ttlMs = DEFAULT_TTL_MS) {
    if (!cookies.length) {
        return;
    }
    const key = normalizeDomain(domain);
    const entry = {
        cookieHeader: buildCookieHeader(cookies),
        cookies,
        expiresAt: Date.now() + ttlMs,
        domain: key,
    };
    store.set(key, entry);
    // Make sure the periodic sweep is running now that the cache is non-empty
    startCleanup();
}
|
|
42
|
+
/**
 * Look up the cache entry for a domain, falling back to its immediate parent
 * domain. Expired entries encountered along the way are evicted.
 *
 * @param domain Hostname to look up
 * @returns The live entry, or null when none exists
 */
export function getCachedCookies(domain) {
    const host = normalizeDomain(domain);
    const parent = getParentDomain(host);
    // Exact host first, then the parent; empty/missing candidates are skipped
    const lookupOrder = [];
    if (host) {
        lookupOrder.push(host);
    }
    if (parent) {
        lookupOrder.push(parent);
    }
    for (const key of lookupOrder) {
        const hit = store.get(key);
        if (!hit) {
            continue;
        }
        if (hit.expiresAt > Date.now()) {
            return hit;
        }
        // Stale entry: drop it and keep looking
        store.delete(key);
    }
    return null;
}
|
|
64
|
+
/**
 * Resolve the cached `Cookie` header value that applies to a URL.
 *
 * @param url Absolute URL whose hostname is used for the lookup
 * @returns The header value, or undefined when nothing is cached or the URL is invalid
 */
export function getCookieHeader(url) {
    try {
        const { hostname } = new URL(url);
        const hit = getCachedCookies(hostname);
        return hit ? hit.cookieHeader : undefined;
    }
    catch {
        return undefined;
    }
}
|
|
78
|
+
/**
 * Convenience wrapper around cacheCookies that derives the domain from a URL.
 *
 * @param url     Absolute URL; its hostname becomes the cache key
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @param ttlMs   Lifetime of the entry in ms (default: 30 min)
 */
export function cacheCookiesForUrl(url, cookies, ttlMs = DEFAULT_TTL_MS) {
    try {
        const { hostname } = new URL(url);
        cacheCookies(hostname, cookies, ttlMs);
    }
    catch {
        // Invalid URL (or a failure while caching) — nothing is stored
    }
}
|
|
91
|
+
/**
 * Drop the cache entry stored for a domain, if there is one.
 *
 * @param domain Hostname; normalized before the lookup
 */
export function invalidateCookies(domain) {
    store.delete(normalizeDomain(domain));
}
|
|
98
|
+
/**
 * Diagnostic helper.
 *
 * @returns How many domains currently have an entry in the cache
 */
export function getCacheSize() {
    const { size } = store;
    return size;
}
|
|
104
|
+
/**
 * Empty the whole cache and stop the periodic sweep. Mainly for tests.
 */
export function clearCookieCache() {
    store.clear();
    if (!cleanupTimer) {
        return;
    }
    clearInterval(cleanupTimer);
    cleanupTimer = null;
}
|
|
114
|
+
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
115
|
+
/**
 * Canonical form of a domain used as cache key: lowercased, with a single
 * leading "www." label removed.
 */
function normalizeDomain(domain) {
    const lowered = domain.toLowerCase();
    return lowered.startsWith('www.') ? lowered.slice('www.'.length) : lowered;
}
|
|
119
|
+
/**
 * Parent of a domain, obtained by dropping its left-most label.
 * Domains with two labels or fewer are treated as roots and yield null.
 */
function getParentDomain(domain) {
    const labels = domain.split('.');
    if (labels.length > 2) {
        const [, ...rest] = labels;
        return rest.join('.');
    }
    // Already a root domain
    return null;
}
|
|
126
|
+
/**
 * Collapse Set-Cookie values (or plain cookie strings) into the value of a
 * single `Cookie` request header, i.e. "name=value; name2=value2".
 *
 * Only the leading "name=value" segment of each input is kept; attributes such
 * as Path, Secure or HttpOnly are discarded, and blank segments are skipped.
 */
function buildCookieHeader(cookies) {
    return Array.from(cookies, (raw) => raw.split(';')[0]?.trim())
        .filter(Boolean)
        .join('; ');
}
|
|
142
|
+
/**
 * Start the background sweep that evicts expired entries, unless one is
 * already running. The sweep runs every 5 minutes and stops itself once the
 * cache is empty.
 */
function startCleanup() {
    if (cleanupTimer) {
        return;
    }
    const sweep = () => {
        const now = Date.now();
        store.forEach((entry, domain) => {
            if (entry.expiresAt <= now) {
                store.delete(domain);
            }
        });
        // Nothing left to watch: stop the timer
        if (store.size === 0 && cleanupTimer) {
            clearInterval(cleanupTimer);
            cleanupTimer = null;
        }
    };
    cleanupTimer = setInterval(sweep, 5 * 60 * 1000);
    // Don't let the sweep timer block Node.js process exit
    if (cleanupTimer && typeof cleanupTimer.unref === 'function') {
        cleanupTimer.unref();
    }
}
|
package/dist/core/http-fetch.js
CHANGED
|
@@ -15,6 +15,7 @@ import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types
|
|
|
15
15
|
import { getCached } from './cache.js';
|
|
16
16
|
import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
|
|
17
17
|
import { detectChallenge } from './challenge-detection.js';
|
|
18
|
+
import { getCookieHeader } from './cookie-cache.js';
|
|
18
19
|
import { createLogger } from './logger.js';
|
|
19
20
|
const log = createLogger('http');
|
|
20
21
|
// ── HTTP status text fallbacks (HTTP/2 omits reason phrases) ──────────────────
|
|
@@ -515,8 +516,15 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
515
516
|
let activeUserAgent = isSecGov
|
|
516
517
|
? 'WebPeel/1.0 (support@webpeel.dev)'
|
|
517
518
|
: (userAgent ? validateUserAgent(userAgent) : getHttpUA());
|
|
519
|
+
// Inject cached challenge-solve cookies (e.g. cf_clearance) if available.
|
|
520
|
+
// These are merged into customHeaders so they ride along on every request
|
|
521
|
+
// to this domain, skipping repeated challenge pages.
|
|
522
|
+
const cachedCookieHeader = getCookieHeader(url);
|
|
523
|
+
const effectiveCustomHeaders = cachedCookieHeader
|
|
524
|
+
? { Cookie: cachedCookieHeader, ...(customHeaders || {}) }
|
|
525
|
+
: customHeaders;
|
|
518
526
|
// Build stealth headers merged with any caller-supplied custom headers
|
|
519
|
-
let mergedHeaders = buildMergedHeaders(url, activeUserAgent,
|
|
527
|
+
let mergedHeaders = buildMergedHeaders(url, activeUserAgent, effectiveCustomHeaders);
|
|
520
528
|
// Auto-route through residential proxy for sites known to block datacenter IPs.
|
|
521
529
|
// The explicit `proxy` param always wins; auto-proxy only kicks in when unset.
|
|
522
530
|
const effectiveProxy = proxy ?? (shouldUseProxy(url) ? (getWebshareProxyUrl() ?? undefined) : undefined);
|
package/dist/core/pipeline.js
CHANGED
|
@@ -467,9 +467,38 @@ export async function fetchContent(ctx) {
|
|
|
467
467
|
// Capture raw HTML size BEFORE any processing (accurate measurement of original content)
|
|
468
468
|
ctx.rawHtmlSize = fetchResult.html?.length || 0;
|
|
469
469
|
ctx.fetchResult = fetchResult;
|
|
470
|
-
//
|
|
470
|
+
// Attempt to solve challenge/CAPTCHA page when detected
|
|
471
471
|
if (fetchResult.challengeDetected) {
|
|
472
|
-
|
|
472
|
+
const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
|
|
473
|
+
// Only attempt solve if we have a browser worker URL or are not on a resource-constrained env
|
|
474
|
+
const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
|
|
475
|
+
if (canSolve) {
|
|
476
|
+
try {
|
|
477
|
+
const { solveChallenge } = await import('./challenge-solver.js');
|
|
478
|
+
const { detectChallenge } = await import('./challenge-detection.js');
|
|
479
|
+
const rawHtml = fetchResult.html || '';
|
|
480
|
+
const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
|
|
481
|
+
const challengeType = detectionResult.type || 'generic-block';
|
|
482
|
+
const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
|
|
483
|
+
timeout: 15000,
|
|
484
|
+
});
|
|
485
|
+
if (solveResult.solved && solveResult.html) {
|
|
486
|
+
fetchResult.html = solveResult.html;
|
|
487
|
+
fetchResult.challengeDetected = false;
|
|
488
|
+
log.debug(`Challenge solved (${challengeType}) for ${ctx.url}`);
|
|
489
|
+
}
|
|
490
|
+
else {
|
|
491
|
+
ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
catch (e) {
|
|
495
|
+
ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
|
|
496
|
+
log.debug('Challenge solve failed:', e instanceof Error ? e.message : e);
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
else {
|
|
500
|
+
ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
|
|
501
|
+
}
|
|
473
502
|
}
|
|
474
503
|
}
|
|
475
504
|
// ---------------------------------------------------------------------------
|
|
@@ -1004,22 +1033,59 @@ export async function postProcess(ctx) {
|
|
|
1004
1033
|
ctx.metadata.blocked = true;
|
|
1005
1034
|
ctx.metadata.challengeDetected = true;
|
|
1006
1035
|
}
|
|
1007
|
-
// Try
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1036
|
+
// Try challenge solver first (if browser worker available or local solve enabled)
|
|
1037
|
+
let solvedViaChallengeSolver = false;
|
|
1038
|
+
const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
|
|
1039
|
+
const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
|
|
1040
|
+
if (canSolve && ctx.fetchResult?.html) {
|
|
1041
|
+
try {
|
|
1042
|
+
const { solveChallenge } = await import('./challenge-solver.js');
|
|
1043
|
+
const { detectChallenge } = await import('./challenge-detection.js');
|
|
1044
|
+
const rawHtml = ctx.fetchResult.html;
|
|
1045
|
+
const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);
|
|
1046
|
+
const challengeType = detectionResult.type || 'cloudflare';
|
|
1047
|
+
const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
|
|
1048
|
+
timeout: 15000,
|
|
1049
|
+
});
|
|
1050
|
+
if (solveResult.solved && solveResult.html) {
|
|
1051
|
+
// Re-parse the solved HTML
|
|
1052
|
+
const { htmlToMarkdown, htmlToText, cleanForAI } = await import('./markdown.js');
|
|
1053
|
+
const fmt = ctx.format || 'markdown';
|
|
1054
|
+
ctx.content = fmt === 'text' ? htmlToText(solveResult.html)
|
|
1055
|
+
: fmt === 'clean' ? cleanForAI(solveResult.html)
|
|
1056
|
+
: htmlToMarkdown(solveResult.html);
|
|
1057
|
+
ctx.fetchResult.html = solveResult.html;
|
|
1058
|
+
if (ctx.metadata) {
|
|
1059
|
+
ctx.metadata.blocked = false;
|
|
1060
|
+
ctx.metadata.challengeDetected = false;
|
|
1061
|
+
ctx.metadata.challengeSolved = true;
|
|
1062
|
+
}
|
|
1063
|
+
solvedViaChallengeSolver = true;
|
|
1064
|
+
log.debug(`Content-level challenge solved for ${ctx.url}`);
|
|
1065
|
+
}
|
|
1066
|
+
}
|
|
1067
|
+
catch (e) {
|
|
1068
|
+
log.debug('Content-level challenge solve failed:', e instanceof Error ? e.message : e);
|
|
1069
|
+
}
|
|
1070
|
+
}
|
|
1071
|
+
// Fall back to search fallback if challenge solve didn't work
|
|
1072
|
+
if (!solvedViaChallengeSolver) {
|
|
1073
|
+
try {
|
|
1074
|
+
// @ts-ignore — proprietary module, gitignored
|
|
1075
|
+
const { searchFallback } = await import('./search-fallback.js');
|
|
1076
|
+
const searchResult = await searchFallback(ctx.url);
|
|
1077
|
+
if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
|
|
1078
|
+
ctx.content = searchResult.cachedContent;
|
|
1079
|
+
ctx.title = searchResult.title || ctx.title;
|
|
1080
|
+
ctx.quality = 0.4;
|
|
1081
|
+
ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
|
|
1082
|
+
if (ctx.metadata) {
|
|
1083
|
+
ctx.metadata.fallbackSource = searchResult.source;
|
|
1084
|
+
}
|
|
1019
1085
|
}
|
|
1020
1086
|
}
|
|
1087
|
+
catch { /* Search fallback failed — continue with challenge page content */ }
|
|
1021
1088
|
}
|
|
1022
|
-
catch { /* Search fallback failed — continue with challenge page content */ }
|
|
1023
1089
|
}
|
|
1024
1090
|
}
|
|
1025
1091
|
// === Zero-token safety net ===
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.45",
|
|
3
|
+
"version": "0.21.46",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|