webpeel 0.21.48 → 0.21.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/apply.js +1 -0
- package/dist/core/challenge-solver.d.ts +26 -0
- package/dist/core/challenge-solver.js +355 -2
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/pipeline.js +46 -0
- package/dist/server/routes/health.js +13 -0
- package/dist/server/routes/search.js +30 -5
- package/package.json +1 -1
package/dist/core/apply.js
CHANGED
|
@@ -590,6 +590,7 @@ export async function applyToJob(options) {
|
|
|
590
590
|
try {
|
|
591
591
|
// ── 3. Launch persistent browser ──────────────────────────────
|
|
592
592
|
progress('navigating', 'Launching browser with persistent session...');
|
|
593
|
+
// @ts-expect-error rebrowser-playwright types diverge from upstream playwright-core
|
|
593
594
|
context = await stealthChromium.launchPersistentContext(sessionDir, {
|
|
594
595
|
headless: false, // visible so user can monitor (or log in on first run)
|
|
595
596
|
viewport: { width: 1440, height: 900 },
|
package/dist/core/challenge-solver.d.ts
CHANGED
|
@@ -18,6 +18,32 @@
|
|
|
18
18
|
* }
|
|
19
19
|
*/
|
|
20
20
|
import type { ChallengeType } from './challenge-detection.js';
|
|
21
|
+
/** Outcome reported by an image-grid CAPTCHA solving attempt. */
export interface ImageCaptchaResult {
    /** Whether the solver judged the challenge to be solved. */
    solved: boolean;
    /** Number of solve rounds that were started (0 when solving is disabled). */
    rounds: number;
    /** Reason for the failure, when one is reported. */
    error?: string;
}
/**
 * Query the moondream vision model for the grid cells that show the target object.
 *
 * @param base64Image  Base64-encoded image of the CAPTCHA grid.
 * @param targetObject Name of the object to look for (e.g. "traffic lights").
 * @returns 1-indexed grid positions (1–9), or null if the call fails.
 */
export declare function askVisionModel(base64Image: string, targetObject: string): Promise<number[] | null>;
/**
 * Look for image-grid CAPTCHA instructions on the page and pull out the object
 * the user is asked to select.
 *
 * @param page Page to inspect.
 * @returns The object name (e.g. "traffic lights"), or null if none is detected.
 */
export declare function detectImageCaptchaTarget(page: import('playwright').Page): Promise<string | null>;
/**
 * Attempt to solve an image-grid CAPTCHA with the moondream vision model.
 *
 * Each round:
 *   1. screenshots the CAPTCHA grid element,
 *   2. sends it to moondream to obtain grid positions,
 *   3. clicks the identified cells,
 *   4. clicks the Verify button,
 *   5. checks whether the challenge is solved; when a new round appears the
 *      process repeats (at most 3 rounds).
 *
 * @param page         Page currently showing the CAPTCHA.
 * @param targetObject Object name extracted from the CAPTCHA instructions.
 */
export declare function solveImageCaptcha(page: import('playwright').Page, targetObject: string): Promise<ImageCaptchaResult>;
|
|
21
47
|
export interface SolveOptions {
|
|
22
48
|
/** Hard timeout in ms (default: 15 000) */
|
|
23
49
|
timeout?: number;
|
package/dist/core/challenge-solver.js
CHANGED
|
@@ -20,6 +20,298 @@
|
|
|
20
20
|
import { cacheCookiesForUrl } from './cookie-cache.js';
|
|
21
21
|
import { createLogger } from './logger.js';
|
|
22
22
|
const log = createLogger('challenge-solver');
|
|
23
|
+
// ── Image CAPTCHA solver constants ────────────────────────────────────────────
|
|
24
|
+
const OLLAMA_VISION_URL = 'http://178.156.229.86:11435/api/generate';
|
|
25
|
+
const OLLAMA_AUTH_TOKEN = 'c996233de4addb47e4cdec8bc5ff8776397f813ca7bd444e7258e0e2ed251963';
|
|
26
|
+
const OLLAMA_VISION_MODEL = 'moondream';
|
|
27
|
+
/** moondream on the 4GB Hetzner VPS takes ~30s per image */
|
|
28
|
+
const VISION_TIMEOUT_MS = 45_000;
|
|
29
|
+
const IMAGE_CAPTCHA_MAX_ROUNDS = 3;
|
|
30
|
+
/** Grid element selectors to try (reCAPTCHA, hCaptcha, generic) */
|
|
31
|
+
const CAPTCHA_GRID_SELECTORS = [
|
|
32
|
+
'.rc-imageselect-table',
|
|
33
|
+
'.task-grid',
|
|
34
|
+
'.task-image',
|
|
35
|
+
'table.rc-imageselect-table',
|
|
36
|
+
'[class*="grid"]:not(body):not(html)',
|
|
37
|
+
'.captcha-grid',
|
|
38
|
+
];
|
|
39
|
+
/** Verify/Submit button selectors */
|
|
40
|
+
const CAPTCHA_VERIFY_SELECTORS = [
|
|
41
|
+
'#recaptcha-verify-button',
|
|
42
|
+
'button[data-action="verify"]',
|
|
43
|
+
'button[class*="verify"]',
|
|
44
|
+
'button[class*="submit"]',
|
|
45
|
+
'.rc-button-default',
|
|
46
|
+
'[id*="verify"]',
|
|
47
|
+
'[class*="verify"]',
|
|
48
|
+
];
|
|
49
|
+
/** Instruction text containers to extract the target object from */
|
|
50
|
+
const CAPTCHA_INSTRUCTION_SELECTORS = [
|
|
51
|
+
'.rc-imageselect-desc-wrapper',
|
|
52
|
+
'.rc-imageselect-desc',
|
|
53
|
+
'.prompt-text',
|
|
54
|
+
'[class*="prompt"]',
|
|
55
|
+
'[class*="instruction"]',
|
|
56
|
+
'[class*="task-desc"]',
|
|
57
|
+
'[aria-label*="select"]',
|
|
58
|
+
'[aria-label*="click"]',
|
|
59
|
+
];
|
|
60
|
+
/** Patterns to extract the object name from instruction text */
|
|
61
|
+
const CAPTCHA_OBJECT_PATTERNS = [
|
|
62
|
+
/select all (?:images|squares|tiles) (?:with|containing|that (?:have|contain)) (?:a |an )?(.+?)(?:\.|$)/i,
|
|
63
|
+
/click (?:all )?(?:images|squares|tiles) (?:containing|with|that (?:have|contain)) (?:a |an )?(.+?)(?:\.|$)/i,
|
|
64
|
+
/please click each image containing (?:a |an )?(.+?)(?:\.|$)/i,
|
|
65
|
+
/select all (?:the )?(?:image|picture)s? of (?:a |an )?(.+?)(?:\.|$)/i,
|
|
66
|
+
/identify all (?:images|pictures|squares) (?:with|showing|of) (?:a |an )?(.+?)(?:\.|$)/i,
|
|
67
|
+
];
|
|
68
|
+
// ── Vision API call ───────────────────────────────────────────────────────────
|
|
69
|
+
/**
|
|
70
|
+
* Ask the moondream vision model which grid cells contain the target object.
|
|
71
|
+
* Returns an array of 1-indexed grid positions (1–9), or null if the call fails.
|
|
72
|
+
*/
|
|
73
|
+
export async function askVisionModel(base64Image, targetObject) {
|
|
74
|
+
const prompt = `This is a 3x3 image grid CAPTCHA. Select all squares containing "${targetObject}". Reply with ONLY the grid positions as numbers 1-9 (left to right, top to bottom), separated by commas. Example: 1,3,7`;
|
|
75
|
+
const controller = new AbortController();
|
|
76
|
+
const timer = setTimeout(() => controller.abort(), VISION_TIMEOUT_MS);
|
|
77
|
+
try {
|
|
78
|
+
const response = await fetch(OLLAMA_VISION_URL, {
|
|
79
|
+
method: 'POST',
|
|
80
|
+
headers: {
|
|
81
|
+
'Content-Type': 'application/json',
|
|
82
|
+
'Authorization': `Bearer ${OLLAMA_AUTH_TOKEN}`,
|
|
83
|
+
},
|
|
84
|
+
body: JSON.stringify({
|
|
85
|
+
model: OLLAMA_VISION_MODEL,
|
|
86
|
+
prompt,
|
|
87
|
+
images: [base64Image],
|
|
88
|
+
stream: false,
|
|
89
|
+
options: { num_predict: 50, temperature: 0.1 },
|
|
90
|
+
}),
|
|
91
|
+
signal: controller.signal,
|
|
92
|
+
});
|
|
93
|
+
clearTimeout(timer);
|
|
94
|
+
if (!response.ok) {
|
|
95
|
+
log.debug(`Vision API returned HTTP ${response.status}`);
|
|
96
|
+
return null;
|
|
97
|
+
}
|
|
98
|
+
const data = await response.json();
|
|
99
|
+
const text = data.response ?? '';
|
|
100
|
+
log.debug(`Vision model response: "${text}"`);
|
|
101
|
+
// Match whole numbers only (not individual digits from multi-digit numbers like 10, 11)
|
|
102
|
+
const positions = text.match(/\b[1-9]\b/g)?.map(Number) ?? [];
|
|
103
|
+
if (positions.length === 0) {
|
|
104
|
+
log.debug('Vision model returned no valid grid positions');
|
|
105
|
+
return null;
|
|
106
|
+
}
|
|
107
|
+
return positions;
|
|
108
|
+
}
|
|
109
|
+
catch (err) {
|
|
110
|
+
clearTimeout(timer);
|
|
111
|
+
log.debug('Vision model call failed:', err instanceof Error ? err.message : String(err));
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
// ── Target object extraction ──────────────────────────────────────────────────
|
|
116
|
+
/**
|
|
117
|
+
* Detect if the page has an image grid CAPTCHA and extract the target object.
|
|
118
|
+
* Returns the object name (e.g. "traffic lights") or null if not detected.
|
|
119
|
+
*/
|
|
120
|
+
export async function detectImageCaptchaTarget(page) {
|
|
121
|
+
for (const selector of CAPTCHA_INSTRUCTION_SELECTORS) {
|
|
122
|
+
try {
|
|
123
|
+
const el = await page.$(selector);
|
|
124
|
+
if (!el)
|
|
125
|
+
continue;
|
|
126
|
+
const text = await el.innerText().catch(() => '');
|
|
127
|
+
if (!text)
|
|
128
|
+
continue;
|
|
129
|
+
const normalized = text.trim().replace(/\s+/g, ' ');
|
|
130
|
+
for (const pattern of CAPTCHA_OBJECT_PATTERNS) {
|
|
131
|
+
const match = normalized.match(pattern);
|
|
132
|
+
if (match?.[1]) {
|
|
133
|
+
const target = match[1].trim().toLowerCase();
|
|
134
|
+
log.debug(`Detected image CAPTCHA target: "${target}" from selector ${selector}`);
|
|
135
|
+
return target;
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
catch {
|
|
140
|
+
// Continue to next selector
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
return null;
|
|
144
|
+
}
|
|
145
|
+
// ── Image CAPTCHA solver ──────────────────────────────────────────────────────
|
|
146
|
+
/**
|
|
147
|
+
* Solve an image grid CAPTCHA using the moondream vision model.
|
|
148
|
+
*
|
|
149
|
+
* Flow per round:
|
|
150
|
+
* 1. Screenshot the CAPTCHA grid element
|
|
151
|
+
* 2. Send to moondream → get grid positions
|
|
152
|
+
* 3. Click identified cells
|
|
153
|
+
* 4. Click Verify button
|
|
154
|
+
* 5. Check if solved; if a new round appears, repeat (max 3 rounds)
|
|
155
|
+
*/
|
|
156
|
+
export async function solveImageCaptcha(page, targetObject) {
|
|
157
|
+
// Guard: only run when explicitly enabled or remote worker configured
|
|
158
|
+
const enabled = process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true' || !!process.env.BROWSER_WORKER_URL;
|
|
159
|
+
if (!enabled) {
|
|
160
|
+
return { solved: false, rounds: 0, error: 'Image CAPTCHA solving not enabled (set ENABLE_LOCAL_CHALLENGE_SOLVE=true)' };
|
|
161
|
+
}
|
|
162
|
+
let rounds = 0;
|
|
163
|
+
for (let attempt = 0; attempt < IMAGE_CAPTCHA_MAX_ROUNDS; attempt++) {
|
|
164
|
+
rounds++;
|
|
165
|
+
// ── 1. Screenshot the grid element ─────────────────────────────────────
|
|
166
|
+
let base64Screenshot = null;
|
|
167
|
+
for (const selector of CAPTCHA_GRID_SELECTORS) {
|
|
168
|
+
try {
|
|
169
|
+
const gridEl = await page.$(selector);
|
|
170
|
+
if (!gridEl)
|
|
171
|
+
continue;
|
|
172
|
+
const screenshot = await gridEl.screenshot({ type: 'png' });
|
|
173
|
+
base64Screenshot = screenshot.toString('base64');
|
|
174
|
+
log.debug(`Captured CAPTCHA grid with selector: ${selector}`);
|
|
175
|
+
break;
|
|
176
|
+
}
|
|
177
|
+
catch {
|
|
178
|
+
// Try next selector
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
if (!base64Screenshot) {
|
|
182
|
+
// Fall back to a viewport screenshot if no grid element found
|
|
183
|
+
try {
|
|
184
|
+
const fullshot = await page.screenshot({ type: 'png' });
|
|
185
|
+
base64Screenshot = fullshot.toString('base64');
|
|
186
|
+
log.debug('Fell back to full-page screenshot for CAPTCHA');
|
|
187
|
+
}
|
|
188
|
+
catch (err) {
|
|
189
|
+
return {
|
|
190
|
+
solved: false,
|
|
191
|
+
rounds,
|
|
192
|
+
error: `Screenshot failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
193
|
+
};
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
// ── 2. Ask vision model ────────────────────────────────────────────────
|
|
197
|
+
log.debug(`Round ${rounds}: asking moondream to find "${targetObject}"…`);
|
|
198
|
+
const positions = await askVisionModel(base64Screenshot, targetObject);
|
|
199
|
+
if (!positions || positions.length === 0) {
|
|
200
|
+
log.debug(`Round ${rounds}: vision model returned no positions — stopping`);
|
|
201
|
+
return { solved: false, rounds, error: 'Vision model returned no valid positions' };
|
|
202
|
+
}
|
|
203
|
+
log.debug(`Round ${rounds}: vision model selected positions: ${positions.join(',')}`);
|
|
204
|
+
// ── 3. Click grid cells ────────────────────────────────────────────────
|
|
205
|
+
let clickedCount = 0;
|
|
206
|
+
for (const pos of positions) {
|
|
207
|
+
for (const gridSelector of CAPTCHA_GRID_SELECTORS) {
|
|
208
|
+
try {
|
|
209
|
+
const gridEl = await page.$(gridSelector);
|
|
210
|
+
if (!gridEl)
|
|
211
|
+
continue;
|
|
212
|
+
// Each grid cell: nth-child or direct child
|
|
213
|
+
const cells = await gridEl.$$('td, div[class*="cell"], div[class*="tile"], div[class*="image"]');
|
|
214
|
+
if (cells.length === 0) {
|
|
215
|
+
// Try direct children
|
|
216
|
+
const children = await gridEl.$$(':scope > *');
|
|
217
|
+
const target = children[pos - 1];
|
|
218
|
+
if (target) {
|
|
219
|
+
await target.click({ timeout: 5000 });
|
|
220
|
+
clickedCount++;
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
else {
|
|
224
|
+
const target = cells[pos - 1];
|
|
225
|
+
if (target) {
|
|
226
|
+
await target.click({ timeout: 5000 });
|
|
227
|
+
clickedCount++;
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
break;
|
|
231
|
+
}
|
|
232
|
+
catch {
|
|
233
|
+
// Try next selector
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
log.debug(`Round ${rounds}: clicked ${clickedCount}/${positions.length} cells`);
|
|
238
|
+
// Short delay before verify (let animation/state settle)
|
|
239
|
+
await page.waitForTimeout(500);
|
|
240
|
+
// ── 4. Click Verify button ─────────────────────────────────────────────
|
|
241
|
+
let clicked = false;
|
|
242
|
+
for (const btnSelector of CAPTCHA_VERIFY_SELECTORS) {
|
|
243
|
+
try {
|
|
244
|
+
const btn = await page.$(btnSelector);
|
|
245
|
+
if (btn) {
|
|
246
|
+
await btn.click({ timeout: 3000 });
|
|
247
|
+
clicked = true;
|
|
248
|
+
log.debug(`Round ${rounds}: clicked verify button (${btnSelector})`);
|
|
249
|
+
break;
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
catch {
|
|
253
|
+
// Try next
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
if (!clicked) {
|
|
257
|
+
log.debug(`Round ${rounds}: could not find verify button`);
|
|
258
|
+
}
|
|
259
|
+
// ── 5. Check if solved ─────────────────────────────────────────────────
|
|
260
|
+
await page.waitForTimeout(2000);
|
|
261
|
+
// Check for success indicators
|
|
262
|
+
const solved = await checkCaptchaSolved(page);
|
|
263
|
+
if (solved) {
|
|
264
|
+
log.debug(`Round ${rounds}: CAPTCHA solved!`);
|
|
265
|
+
return { solved: true, rounds };
|
|
266
|
+
}
|
|
267
|
+
// Check if a new round appeared (grid refreshed)
|
|
268
|
+
const newTarget = await detectImageCaptchaTarget(page);
|
|
269
|
+
if (!newTarget) {
|
|
270
|
+
// No more instructions — likely solved or error
|
|
271
|
+
log.debug(`Round ${rounds}: no more instruction text — assuming solved`);
|
|
272
|
+
return { solved: true, rounds };
|
|
273
|
+
}
|
|
274
|
+
// Update target object for next round (may change between rounds)
|
|
275
|
+
// eslint-disable-next-line no-param-reassign
|
|
276
|
+
targetObject = newTarget;
|
|
277
|
+
log.debug(`Round ${rounds}: new target for next round: "${targetObject}"`);
|
|
278
|
+
}
|
|
279
|
+
return { solved: false, rounds, error: `Reached max rounds (${IMAGE_CAPTCHA_MAX_ROUNDS}) without solving` };
|
|
280
|
+
}
|
|
281
|
+
/**
|
|
282
|
+
* Check if the CAPTCHA appears to have been solved (challenge gone, success message, etc.)
|
|
283
|
+
*/
|
|
284
|
+
async function checkCaptchaSolved(page) {
|
|
285
|
+
// Check for reCAPTCHA success state
|
|
286
|
+
try {
|
|
287
|
+
const successEl = await page.$('.recaptcha-checkbox-checked, .rc-anchor-normal-footer, [aria-checked="true"]');
|
|
288
|
+
if (successEl)
|
|
289
|
+
return true;
|
|
290
|
+
}
|
|
291
|
+
catch { /* ignore */ }
|
|
292
|
+
// Check if CAPTCHA challenge overlay disappeared (grid gone)
|
|
293
|
+
try {
|
|
294
|
+
const gridEl = await page.$('.rc-imageselect-table, .task-grid');
|
|
295
|
+
// If we were on a CAPTCHA page and the grid is now gone, it was likely solved
|
|
296
|
+
if (!gridEl) {
|
|
297
|
+
// Only count as solved if we're no longer on a CAPTCHA title page
|
|
298
|
+
const title = await page.title().catch(() => '');
|
|
299
|
+
const isCaptchaTitle = title.toLowerCase().includes('captcha') || title.toLowerCase().includes('robot');
|
|
300
|
+
if (!isCaptchaTitle)
|
|
301
|
+
return true;
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
catch { /* ignore */ }
|
|
305
|
+
// Check page URL changed (successful solve often triggers redirect)
|
|
306
|
+
try {
|
|
307
|
+
const title = await page.title();
|
|
308
|
+
const isCaptchaPage = title.toLowerCase().includes('captcha') || title.toLowerCase().includes('robot');
|
|
309
|
+
if (!isCaptchaPage)
|
|
310
|
+
return true;
|
|
311
|
+
}
|
|
312
|
+
catch { /* ignore */ }
|
|
313
|
+
return false;
|
|
314
|
+
}
|
|
23
315
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
24
316
|
const DEFAULT_TIMEOUT_MS = 15_000;
|
|
25
317
|
/** Cloudflare challenge title before it's solved */
|
|
@@ -55,8 +347,7 @@ export async function solveChallenge(url, challengeType, html, options = {}) {
|
|
|
55
347
|
case 'cloudflare':
|
|
56
348
|
return solveCloudflare(url, html, timeout, options.proxy);
|
|
57
349
|
case 'captcha':
|
|
58
|
-
|
|
59
|
-
return { solved: false, html, error: 'No free captcha solver available for generic captcha' };
|
|
350
|
+
return solveCaptchaWithVision(url, html, timeout, options.proxy);
|
|
60
351
|
case 'datadome':
|
|
61
352
|
// DataDome can sometimes be bypassed with a stealth browser
|
|
62
353
|
return solveWithStealthBrowser(url, html, timeout, options.proxy, 'datadome');
|
|
@@ -73,6 +364,68 @@ export async function solveChallenge(url, challengeType, html, options = {}) {
|
|
|
73
364
|
return { solved: false, html, error: `Unknown challenge type: ${challengeType}` };
|
|
74
365
|
}
|
|
75
366
|
}
|
|
367
|
+
// ── Image CAPTCHA orchestrator ────────────────────────────────────────────────
|
|
368
|
+
/**
 * Solve an image CAPTCHA by opening a stealth browser, detecting the target
 * object from the page instructions, and calling solveImageCaptcha().
 *
 * The browser context created here is always closed before the function
 * returns, including when navigation or solving throws.
 *
 * @param url       Page that presented the CAPTCHA.
 * @param _html     HTML already fetched for the page; returned unchanged when
 *                  the rendered content cannot be read.
 * @param timeoutMs Navigation timeout in milliseconds. The solve itself may take
 *                  up to VISION_TIMEOUT_MS * IMAGE_CAPTCHA_MAX_ROUNDS on top of that.
 * @param proxy     Optional proxy server passed to the browser context.
 * @returns `{ solved: true, html, cookies, method }` on success, otherwise
 *          `{ solved: false, html, error }`.
 */
async function solveCaptchaWithVision(url, _html, timeoutMs, proxy) {
    let ctx = null;
    try {
        const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
        const browser = await getStealthBrowser();
        const vp = getRandomViewport();
        ctx = await browser.newContext({
            userAgent: getRandomUserAgent(),
            viewport: { width: vp.width, height: vp.height },
            ...(proxy ? { proxy: { server: proxy } } : {}),
            locale: 'en-US',
            timezoneId: 'America/New_York',
        });
        const page = await ctx.newPage();
        await applyStealthScripts(page);
        await page.goto(url, {
            waitUntil: 'domcontentloaded',
            timeout: timeoutMs,
        });
        // Wait for CAPTCHA to render
        await page.waitForTimeout(2000);
        // Detect the target object from the CAPTCHA instructions
        const targetObject = await detectImageCaptchaTarget(page);
        if (!targetObject) {
            const html = await page.content().catch(() => _html);
            return { solved: false, html, error: 'Could not detect image CAPTCHA target object from page' };
        }
        log.debug(`Image CAPTCHA target: "${targetObject}"`);
        // Solve the CAPTCHA — may take up to VISION_TIMEOUT_MS * IMAGE_CAPTCHA_MAX_ROUNDS
        const captchaResult = await solveImageCaptcha(page, targetObject);
        const html = await page.content().catch(() => _html);
        // Cookies are cached whether or not the solve succeeded.
        const cookies = await ctx.cookies();
        const cookieStrings = cookies.map(c => `${c.name}=${c.value}; Path=${c.path || '/'}${c.domain ? `; Domain=${c.domain}` : ''}`);
        if (cookieStrings.length > 0) {
            cacheCookiesForUrl(url, cookieStrings);
        }
        if (captchaResult.solved) {
            console.log(`[challenge-solver] Image CAPTCHA solved for ${getDomain(url)} in ${captchaResult.rounds} round(s)`);
            return { solved: true, html, cookies: cookieStrings, method: 'local-browser' };
        }
        return {
            solved: false,
            html,
            error: captchaResult.error ?? `Image CAPTCHA not solved after ${captchaResult.rounds} round(s)`,
        };
    }
    catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        log.debug('Image CAPTCHA solve failed:', error);
        return { solved: false, html: _html, error };
    }
    finally {
        // Runs on every exit path so the context never outlives the call.
        if (ctx) {
            await ctx.close().catch(() => { });
        }
    }
}
|
|
76
429
|
// ── Cloudflare solver ─────────────────────────────────────────────────────────
|
|
77
430
|
/**
|
|
78
431
|
* Solve Cloudflare JS challenge by rendering the page in a stealth browser.
|
package/dist/core/fetch-cache.d.ts
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory LRU fetch cache for WebPeel
|
|
3
|
+
*
|
|
4
|
+
* Caches pipeline results to avoid redundant fetches for identical requests.
|
|
5
|
+
* Supports TTL-based expiry and LRU eviction when maxEntries is exceeded.
|
|
6
|
+
* Exported as a singleton: import { fetchCache } from './fetch-cache.js'
|
|
7
|
+
*/
|
|
8
|
+
/** One cached result together with the time it was produced. */
export interface FetchCacheEntry {
    content: string;
    title: string;
    metadata: any;
    method: string;
    tokens: number;
    links?: any[];
    /** Creation time in epoch milliseconds; expiry is measured from this value. */
    timestamp: number;
}
/** Snapshot of cache occupancy and lookup counters. */
export interface FetchCacheStats {
    /** Number of entries currently held. */
    size: number;
    /** Lookups that returned an entry. */
    hits: number;
    /** Lookups that found nothing or found an expired entry. */
    misses: number;
    /** hits / (hits + misses), rounded to two decimals; 0 before any lookup. */
    hitRate: number;
}
export declare class FetchCache {
    private cache;
    private maxEntries;
    private defaultTTL;
    private hits;
    private misses;
    /**
     * @param maxEntries        Maximum number of entries kept (default 500).
     * @param defaultTTLSeconds Lifetime of an entry in seconds (default 300).
     */
    constructor(maxEntries?: number, defaultTTLSeconds?: number);
    /**
     * Build a cache key from the url and the fetch options that affect the result.
     * Different option combinations yield different keys.
     */
    getKey(url: string, options?: {
        render?: boolean;
        stealth?: boolean;
        budget?: number;
    }): string;
    /**
     * Look up an entry. Returns null when it is missing or expired.
     * A hit moves the entry to the end of the Map (LRU refresh).
     */
    get(key: string): FetchCacheEntry | null;
    /**
     * Store an entry.
     * When capacity is exceeded the least recently used entry is evicted.
     */
    set(key: string, entry: FetchCacheEntry): void;
    /** Remove every entry and reset the hit/miss counters. */
    clear(): void;
    /** Report cache stats. hitRate is in [0, 1]. */
    stats(): FetchCacheStats;
}
/** Singleton fetch cache — shared across all requests (5 min TTL, 500 entries). */
export declare const fetchCache: FetchCache;
/** Singleton search cache — shorter TTL since results change faster (60 s). */
export declare const searchCache: FetchCache;
|
package/dist/core/fetch-cache.js
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory LRU fetch cache for WebPeel
|
|
3
|
+
*
|
|
4
|
+
* Caches pipeline results to avoid redundant fetches for identical requests.
|
|
5
|
+
* Supports TTL-based expiry and LRU eviction when maxEntries is exceeded.
|
|
6
|
+
* Exported as a singleton: import { fetchCache } from './fetch-cache.js'
|
|
7
|
+
*/
|
|
8
|
+
export class FetchCache {
|
|
9
|
+
cache;
|
|
10
|
+
maxEntries;
|
|
11
|
+
defaultTTL; // ms
|
|
12
|
+
hits;
|
|
13
|
+
misses;
|
|
14
|
+
constructor(maxEntries = 500, defaultTTLSeconds = 300) {
|
|
15
|
+
this.cache = new Map();
|
|
16
|
+
this.maxEntries = maxEntries;
|
|
17
|
+
this.defaultTTL = defaultTTLSeconds * 1000;
|
|
18
|
+
this.hits = 0;
|
|
19
|
+
this.misses = 0;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Generate a stable cache key from url + relevant fetch options.
|
|
23
|
+
* Different option combinations produce different cache entries.
|
|
24
|
+
*/
|
|
25
|
+
getKey(url, options = {}) {
|
|
26
|
+
const render = options.render ? '1' : '0';
|
|
27
|
+
const stealth = options.stealth ? '1' : '0';
|
|
28
|
+
const budget = options.budget !== undefined ? String(options.budget) : '';
|
|
29
|
+
return `${url}|r:${render}|s:${stealth}|b:${budget}`;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Retrieve a cached entry. Returns null if missing or expired.
|
|
33
|
+
* On hit: entry is moved to the end of the Map (LRU refresh).
|
|
34
|
+
*/
|
|
35
|
+
get(key) {
|
|
36
|
+
const entry = this.cache.get(key);
|
|
37
|
+
if (!entry) {
|
|
38
|
+
this.misses++;
|
|
39
|
+
return null;
|
|
40
|
+
}
|
|
41
|
+
const ageMs = Date.now() - entry.timestamp;
|
|
42
|
+
if (ageMs > this.defaultTTL) {
|
|
43
|
+
// Expired — evict and return null
|
|
44
|
+
this.cache.delete(key);
|
|
45
|
+
this.misses++;
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
// LRU touch: move to end
|
|
49
|
+
this.cache.delete(key);
|
|
50
|
+
this.cache.set(key, entry);
|
|
51
|
+
this.hits++;
|
|
52
|
+
return entry;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Store an entry in the cache.
|
|
56
|
+
* If the cache is at capacity, the least recently used entry is evicted.
|
|
57
|
+
*/
|
|
58
|
+
set(key, entry) {
|
|
59
|
+
// Remove existing to refresh position
|
|
60
|
+
if (this.cache.has(key)) {
|
|
61
|
+
this.cache.delete(key);
|
|
62
|
+
}
|
|
63
|
+
this.cache.set(key, entry);
|
|
64
|
+
// LRU eviction: remove oldest entry (first in Map iteration order)
|
|
65
|
+
while (this.cache.size > this.maxEntries) {
|
|
66
|
+
const oldestKey = this.cache.keys().next().value;
|
|
67
|
+
if (oldestKey !== undefined) {
|
|
68
|
+
this.cache.delete(oldestKey);
|
|
69
|
+
}
|
|
70
|
+
else {
|
|
71
|
+
break;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
/** Clear all entries and reset stats. */
|
|
76
|
+
clear() {
|
|
77
|
+
this.cache.clear();
|
|
78
|
+
this.hits = 0;
|
|
79
|
+
this.misses = 0;
|
|
80
|
+
}
|
|
81
|
+
/** Return cache stats. hitRate is in [0, 1]. */
|
|
82
|
+
stats() {
|
|
83
|
+
const total = this.hits + this.misses;
|
|
84
|
+
return {
|
|
85
|
+
size: this.cache.size,
|
|
86
|
+
hits: this.hits,
|
|
87
|
+
misses: this.misses,
|
|
88
|
+
hitRate: total === 0 ? 0 : Math.round((this.hits / total) * 100) / 100,
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
/** Singleton fetch cache — shared across all requests (5 min TTL, 500 entries). */
|
|
93
|
+
export const fetchCache = new FetchCache(500, 300);
|
|
94
|
+
/** Singleton search cache — shorter TTL since results change faster (60 s). */
|
|
95
|
+
export const searchCache = new FetchCache(500, 60);
|
package/dist/core/pipeline.js
CHANGED
|
@@ -1217,6 +1217,52 @@ export async function finalize(ctx) {
|
|
|
1217
1217
|
log.error('Change tracking failed:', error);
|
|
1218
1218
|
}
|
|
1219
1219
|
}
|
|
1220
|
+
// ── Auto-escalation: retry thin content with browser rendering ──────────────
|
|
1221
|
+
// If simple fetch returned very little content and user didn't explicitly disable render,
|
|
1222
|
+
// automatically retry with browser rendering to handle JS-heavy/paywalled sites.
|
|
1223
|
+
const preEscalationWords = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1224
|
+
const escalationFetchMethod = fetchResult?.method || 'unknown';
|
|
1225
|
+
const alreadyTriedBrowser = escalationFetchMethod === 'browser' || escalationFetchMethod === 'stealth'
|
|
1226
|
+
|| options.render || options.stealth;
|
|
1227
|
+
const userDisabledRender = options.render === false;
|
|
1228
|
+
const escalationCandidate = preEscalationWords < 200 && preEscalationWords > 0
|
|
1229
|
+
&& escalationFetchMethod === 'simple' && !alreadyTriedBrowser && !userDisabledRender
|
|
1230
|
+
&& !ctx._escalated;
|
|
1231
|
+
if (escalationCandidate) {
|
|
1232
|
+
log.info(`thin content (${preEscalationWords}w) from simple fetch, auto-escalating to browser render for ${ctx.url}`);
|
|
1233
|
+
ctx._escalated = true;
|
|
1234
|
+
try {
|
|
1235
|
+
const { smartFetch } = await import('./strategies.js');
|
|
1236
|
+
const browserResult = await smartFetch(ctx.url, {
|
|
1237
|
+
forceBrowser: true,
|
|
1238
|
+
stealth: false,
|
|
1239
|
+
timeoutMs: options.timeout || 15000,
|
|
1240
|
+
proxy: options.proxy,
|
|
1241
|
+
});
|
|
1242
|
+
if (browserResult.html && browserResult.html.length > (fetchResult?.html?.length || 0)) {
|
|
1243
|
+
const { htmlToMarkdown } = await import('./markdown.js');
|
|
1244
|
+
const browserContent = htmlToMarkdown(browserResult.html);
|
|
1245
|
+
const browserWords = browserContent.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1246
|
+
if (browserWords > preEscalationWords) {
|
|
1247
|
+
log.info(`browser escalation improved content: ${preEscalationWords}w → ${browserWords}w`);
|
|
1248
|
+
ctx.content = browserContent;
|
|
1249
|
+
ctx.fetchResult = browserResult;
|
|
1250
|
+
ctx.fetchResult.method = 'browser-escalation';
|
|
1251
|
+
}
|
|
1252
|
+
else {
|
|
1253
|
+
log.debug(`browser escalation did not improve (${browserWords}w vs ${preEscalationWords}w)`);
|
|
1254
|
+
}
|
|
1255
|
+
// Always clean up browser resources
|
|
1256
|
+
if (browserResult.page)
|
|
1257
|
+
await browserResult.page.close().catch(() => { });
|
|
1258
|
+
if (browserResult.browser)
|
|
1259
|
+
await browserResult.browser.close().catch(() => { });
|
|
1260
|
+
}
|
|
1261
|
+
}
|
|
1262
|
+
catch (e) {
|
|
1263
|
+
log.debug('browser escalation failed:', e instanceof Error ? e.message : e);
|
|
1264
|
+
}
|
|
1265
|
+
}
|
|
1220
1266
|
// Generate AI summary if requested
|
|
1221
1267
|
if (options.summary && options.llm) {
|
|
1222
1268
|
try {
|
package/dist/server/routes/health.js
CHANGED
|
@@ -7,6 +7,7 @@ import { Router } from 'express';
|
|
|
7
7
|
import { readFileSync } from 'fs';
|
|
8
8
|
import { join, dirname } from 'path';
|
|
9
9
|
import { fileURLToPath } from 'url';
|
|
10
|
+
import { fetchCache, searchCache } from '../../core/fetch-cache.js';
|
|
10
11
|
const startTime = Date.now();
|
|
11
12
|
// Read version once at startup
|
|
12
13
|
let version = 'unknown';
|
|
@@ -26,11 +27,23 @@ export function createHealthRouter() {
|
|
|
26
27
|
const router = Router();
|
|
27
28
|
router.get('/health', (_req, res) => {
|
|
28
29
|
const uptime = Math.floor((Date.now() - startTime) / 1000);
|
|
30
|
+
const fetchStats = fetchCache.stats();
|
|
31
|
+
const searchStats = searchCache.stats();
|
|
29
32
|
res.json({
|
|
30
33
|
status: 'healthy',
|
|
31
34
|
version,
|
|
32
35
|
uptime,
|
|
33
36
|
timestamp: new Date().toISOString(),
|
|
37
|
+
cache: {
|
|
38
|
+
fetch: {
|
|
39
|
+
size: fetchStats.size,
|
|
40
|
+
hitRate: fetchStats.hitRate,
|
|
41
|
+
},
|
|
42
|
+
search: {
|
|
43
|
+
size: searchStats.size,
|
|
44
|
+
hitRate: searchStats.hitRate,
|
|
45
|
+
},
|
|
46
|
+
},
|
|
34
47
|
});
|
|
35
48
|
});
|
|
36
49
|
return router;
|
package/dist/server/routes/search.js
CHANGED
|
@@ -7,6 +7,7 @@ import { load } from 'cheerio';
|
|
|
7
7
|
import { LRUCache } from 'lru-cache';
|
|
8
8
|
import { peel } from '../../index.js';
|
|
9
9
|
import { simpleFetch } from '../../core/fetcher.js';
|
|
10
|
+
import { searchCache } from '../../core/fetch-cache.js';
|
|
10
11
|
import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
|
|
11
12
|
export function createSearchRouter(authStore) {
|
|
12
13
|
const router = Router();
|
|
@@ -47,9 +48,9 @@ export function createSearchRouter(authStore) {
|
|
|
47
48
|
return;
|
|
48
49
|
}
|
|
49
50
|
// Parse and validate count
|
|
50
|
-
const resultCount = count ? parseInt(count, 10) :
|
|
51
|
-
if (isNaN(resultCount) || resultCount < 1 || resultCount >
|
|
52
|
-
res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and
|
|
51
|
+
const resultCount = count ? parseInt(count, 10) : 10;
|
|
52
|
+
if (isNaN(resultCount) || resultCount < 1 || resultCount > 20) {
|
|
53
|
+
res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and 20', hint: 'Use a count value between 1 and 20', docs: 'https://webpeel.dev/docs/errors#invalid_request' }, requestId: req.requestId });
|
|
53
54
|
return;
|
|
54
55
|
}
|
|
55
56
|
// Parse sources parameter (comma-separated: web,news,images)
|
|
@@ -64,10 +65,12 @@ export function createSearchRouter(authStore) {
|
|
|
64
65
|
// Build cache key (include all parameters)
|
|
65
66
|
const enrichCount = enrich ? Math.min(Math.max(parseInt(enrich, 10) || 0, 0), 5) : 0;
|
|
66
67
|
const cacheKey = `search:${providerId}:${q}:${resultCount}:${sourcesStr}:${shouldScrape}:${enrichCount}:${categoriesStr}:${tbsStr}:${countryStr}:${locationStr}`;
|
|
67
|
-
|
|
68
|
+
const sharedCacheKey = searchCache.getKey(cacheKey, {});
|
|
69
|
+
// Check cache (local LRU first, then shared singleton)
|
|
68
70
|
const cached = cache.get(cacheKey);
|
|
69
71
|
if (cached) {
|
|
70
72
|
res.setHeader('X-Cache', 'HIT');
|
|
73
|
+
res.setHeader('X-Cache-Status', 'HIT');
|
|
71
74
|
res.setHeader('X-Cache-Age', Math.floor((Date.now() - cached.timestamp) / 1000).toString());
|
|
72
75
|
res.json({
|
|
73
76
|
success: true,
|
|
@@ -75,6 +78,19 @@ export function createSearchRouter(authStore) {
|
|
|
75
78
|
});
|
|
76
79
|
return;
|
|
77
80
|
}
|
|
81
|
+
// Also check shared searchCache singleton (used for /health stats)
|
|
82
|
+
const sharedCached = searchCache.get(sharedCacheKey);
|
|
83
|
+
if (sharedCached) {
|
|
84
|
+
const age = Math.floor((Date.now() - sharedCached.timestamp) / 1000);
|
|
85
|
+
res.setHeader('X-Cache', 'HIT');
|
|
86
|
+
res.setHeader('X-Cache-Status', 'HIT');
|
|
87
|
+
res.setHeader('X-Cache-Age', age.toString());
|
|
88
|
+
res.json({
|
|
89
|
+
success: true,
|
|
90
|
+
data: sharedCached.content ? JSON.parse(sharedCached.content) : {},
|
|
91
|
+
});
|
|
92
|
+
return;
|
|
93
|
+
}
|
|
78
94
|
const startTime = Date.now();
|
|
79
95
|
const data = {};
|
|
80
96
|
// Fetch web results via the search-provider abstraction
|
|
@@ -317,13 +333,22 @@ export function createSearchRouter(authStore) {
|
|
|
317
333
|
await pgStore.trackUsage(req.auth.keyInfo.key, 'search');
|
|
318
334
|
}
|
|
319
335
|
}
|
|
320
|
-
// Cache results
|
|
336
|
+
// Cache results (local LRU + shared singleton for /health stats)
|
|
321
337
|
cache.set(cacheKey, {
|
|
322
338
|
data,
|
|
323
339
|
timestamp: Date.now(),
|
|
324
340
|
});
|
|
341
|
+
searchCache.set(sharedCacheKey, {
|
|
342
|
+
content: JSON.stringify(data),
|
|
343
|
+
title: q,
|
|
344
|
+
metadata: {},
|
|
345
|
+
method: 'search',
|
|
346
|
+
tokens: 0,
|
|
347
|
+
timestamp: Date.now(),
|
|
348
|
+
});
|
|
325
349
|
// Add headers
|
|
326
350
|
res.setHeader('X-Cache', 'MISS');
|
|
351
|
+
res.setHeader('X-Cache-Status', 'MISS');
|
|
327
352
|
res.setHeader('X-Credits-Used', '1');
|
|
328
353
|
res.setHeader('X-Processing-Time', elapsed.toString());
|
|
329
354
|
res.setHeader('X-Fetch-Type', 'search');
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.48",
|
|
3
|
+
"version": "0.21.53",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|