webpeel 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -500
- package/dist/cli-auth.d.ts +2 -0
- package/dist/cli-auth.d.ts.map +1 -1
- package/dist/cli-auth.js +16 -3
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +475 -77
- package/dist/cli.js.map +1 -1
- package/dist/core/actions.d.ts +19 -10
- package/dist/core/actions.d.ts.map +1 -1
- package/dist/core/actions.js +214 -43
- package/dist/core/actions.js.map +1 -1
- package/dist/core/agent.d.ts +60 -3
- package/dist/core/agent.d.ts.map +1 -1
- package/dist/core/agent.js +375 -86
- package/dist/core/agent.js.map +1 -1
- package/dist/core/answer.d.ts +43 -0
- package/dist/core/answer.d.ts.map +1 -0
- package/dist/core/answer.js +378 -0
- package/dist/core/answer.js.map +1 -0
- package/dist/core/cache.d.ts +14 -0
- package/dist/core/cache.d.ts.map +1 -0
- package/dist/core/cache.js +122 -0
- package/dist/core/cache.js.map +1 -0
- package/dist/core/dns-cache.d.ts +21 -0
- package/dist/core/dns-cache.d.ts.map +1 -0
- package/dist/core/dns-cache.js +184 -0
- package/dist/core/dns-cache.js.map +1 -0
- package/dist/core/documents.d.ts +24 -0
- package/dist/core/documents.d.ts.map +1 -0
- package/dist/core/documents.js +124 -0
- package/dist/core/documents.js.map +1 -0
- package/dist/core/extract-inline.d.ts +39 -0
- package/dist/core/extract-inline.d.ts.map +1 -0
- package/dist/core/extract-inline.js +214 -0
- package/dist/core/extract-inline.js.map +1 -0
- package/dist/core/fetcher.d.ts +33 -7
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +608 -41
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/jobs.d.ts +66 -0
- package/dist/core/jobs.d.ts.map +1 -0
- package/dist/core/jobs.js +513 -0
- package/dist/core/jobs.js.map +1 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +141 -31
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/pdf.d.ts.map +1 -1
- package/dist/core/pdf.js +3 -1
- package/dist/core/pdf.js.map +1 -1
- package/dist/core/screenshot.d.ts +33 -0
- package/dist/core/screenshot.d.ts.map +1 -0
- package/dist/core/screenshot.js +30 -0
- package/dist/core/screenshot.js.map +1 -0
- package/dist/core/search-provider.d.ts +46 -0
- package/dist/core/search-provider.d.ts.map +1 -0
- package/dist/core/search-provider.js +281 -0
- package/dist/core/search-provider.js.map +1 -0
- package/dist/core/strategies.d.ts +7 -10
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +370 -63
- package/dist/core/strategies.js.map +1 -1
- package/dist/index.d.ts +9 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +61 -32
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +335 -70
- package/dist/mcp/server.js.map +1 -1
- package/dist/types.d.ts +43 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +85 -47
- package/package.json +11 -5
package/dist/core/fetcher.js
CHANGED
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Core fetching logic: simple HTTP and browser-based fetching
|
|
3
3
|
*/
|
|
4
|
+
// Force IPv4-first DNS resolution globally.
|
|
5
|
+
// Prevents IPv6 connection failures (TLS errors, timeouts) on hosts that
|
|
6
|
+
// advertise AAAA records but can't actually route IPv6 (e.g. Render containers).
|
|
7
|
+
// Must run before any network library is used.
|
|
8
|
+
import dns from 'dns';
|
|
9
|
+
dns.setDefaultResultOrder('ipv4first');
|
|
4
10
|
import { chromium } from 'playwright';
|
|
5
11
|
import { chromium as stealthChromium } from 'playwright-extra';
|
|
6
12
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
7
|
-
import { fetch as undiciFetch } from 'undici';
|
|
13
|
+
import { fetch as undiciFetch, Agent } from 'undici';
|
|
8
14
|
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
15
|
+
import { getCached } from './cache.js';
|
|
16
|
+
import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
|
|
9
17
|
// Add stealth plugin to playwright-extra
|
|
10
18
|
stealthChromium.use(StealthPlugin());
|
|
11
19
|
const USER_AGENTS = [
|
|
@@ -18,6 +26,102 @@ const USER_AGENTS = [
|
|
|
18
26
|
function getRandomUserAgent() {
|
|
19
27
|
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
|
|
20
28
|
}
|
|
29
|
+
/**
 * Build a new undici `Agent` for the plain-HTTP fetch path.
 *
 * Hostname lookups for new connections are delegated to `cachedLookup`
 * (imported from the DNS cache module).
 *
 * @returns {Agent} A freshly configured dispatcher.
 */
function createHttpPool() {
    const keepAliveMs = 60000;
    const poolOptions = {
        connections: 20,
        pipelining: 6,
        keepAliveTimeout: keepAliveMs,
        keepAliveMaxTimeout: keepAliveMs,
        allowH2: true,
        connect: { lookup: cachedLookup },
    };
    return new Agent(poolOptions);
}
|
|
41
|
+
let httpPool = createHttpPool();
|
|
42
|
+
startDnsWarmup();
|
|
43
|
+
const CONDITIONAL_CACHE_MAX_ENTRIES = 2000;
|
|
44
|
+
const conditionalValidatorsByUrl = new Map();
|
|
45
|
+
/**
 * Produce a canonical key for a URL so equivalent spellings share one entry
 * in the conditional-request validator cache.
 *
 * Normalization steps: drop the fragment, lowercase the hostname, remove an
 * explicit default port for http/https, ensure a non-empty path, and order
 * query parameters by key. Parameters that share a key keep their relative
 * order, because Array.prototype.sort is stable.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} The normalized URL, or the trimmed text of the input when
 *   it cannot be parsed as a URL. This function never throws.
 */
function normalizeUrlForConditionalCache(url) {
    try {
        const normalized = new URL(url);
        normalized.hash = '';
        normalized.hostname = normalized.hostname.toLowerCase();
        const isDefaultPort = (normalized.protocol === 'http:' && normalized.port === '80') ||
            (normalized.protocol === 'https:' && normalized.port === '443');
        if (isDefaultPort) {
            normalized.port = '';
        }
        if (!normalized.pathname) {
            normalized.pathname = '/';
        }
        // Compare keys by code unit rather than localeCompare(): the cache key
        // must not depend on the host's default locale/collation, and distinct
        // keys must never compare as equal.
        const sortedParams = [...normalized.searchParams.entries()]
            .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
        normalized.search = '';
        for (const [key, value] of sortedParams) {
            normalized.searchParams.append(key, value);
        }
        return normalized.toString();
    }
    catch {
        // Fallback for unparseable input. String() keeps the fallback total:
        // a non-string argument (undefined, null, number) must not throw here.
        return String(url).trim();
    }
}
|
|
69
|
+
/**
 * Look up the stored conditional-request validators for a URL.
 *
 * A hit is re-inserted into the Map so it becomes the newest entry; the Map's
 * insertion order is what the eviction loop in setConditionalValidators uses.
 *
 * @param {string} url - URL whose validators are wanted.
 * @returns {object|null} The stored validators object, or null when none exist.
 */
function getConditionalValidators(url) {
    const cacheKey = normalizeUrlForConditionalCache(url);
    const validators = conditionalValidatorsByUrl.get(cacheKey);
    if (validators) {
        // LRU touch: delete + set moves the entry to the end of iteration order.
        conditionalValidatorsByUrl.delete(cacheKey);
        conditionalValidatorsByUrl.set(cacheKey, validators);
        return validators;
    }
    return null;
}
|
|
80
|
+
/**
 * Store validators for a URL and evict the oldest entries once the cache
 * holds more than CONDITIONAL_CACHE_MAX_ENTRIES.
 *
 * @param {string} url - URL the validators belong to (normalized internally).
 * @param {object} validators - Validators to remember for that URL.
 * @returns {void}
 */
function setConditionalValidators(url, validators) {
    const key = normalizeUrlForConditionalCache(url);
    // Delete before set so an existing key moves to the newest position.
    // Map.delete is a no-op for a missing key, so no has() check is needed.
    conditionalValidatorsByUrl.delete(key);
    conditionalValidatorsByUrl.set(key, validators);
    while (conditionalValidatorsByUrl.size > CONDITIONAL_CACHE_MAX_ENTRIES) {
        const oldest = conditionalValidatorsByUrl.keys().next();
        // Test the iterator's `done` flag, not the key's truthiness: an
        // empty-string key is a valid entry and must still be evictable.
        if (oldest.done) {
            break;
        }
        conditionalValidatorsByUrl.delete(oldest.value);
    }
}
|
|
94
|
+
/**
 * Record the ETag / Last-Modified headers of a response for a URL.
 *
 * Nothing is stored when the response carries neither header.
 *
 * @param {string} url - URL the response was fetched from.
 * @param {object} response - Response whose `headers.get(name)` is read.
 * @returns {void}
 */
function rememberConditionalValidators(url, response) {
    const validators = {
        // Missing or empty header values are normalized to undefined.
        etag: response.headers.get('etag') || undefined,
        lastModified: response.headers.get('last-modified') || undefined,
    };
    if (validators.etag || validators.lastModified) {
        setConditionalValidators(url, validators);
    }
}
|
|
102
|
+
/**
 * Case-insensitively test whether a plain headers object has a given header.
 *
 * @param {object} headers - Plain object keyed by header name.
 * @param {string} name - Header name to look for, in any letter case.
 * @returns {boolean} True when some own key equals `name` ignoring case.
 */
function hasHeader(headers, name) {
    const wanted = name.toLowerCase();
    for (const headerName of Object.keys(headers)) {
        if (headerName.toLowerCase() === wanted) {
            return true;
        }
    }
    return false;
}
|
|
106
|
+
/**
 * Build a fetch result from the local cache for an HTTP 304 response.
 *
 * The cache is consulted for `url` first and then, if given, `fallbackUrl`.
 *
 * @param {string} url - URL that returned 304.
 * @param {string} [fallbackUrl] - Alternative cache key to try on a miss.
 * @returns {object|null} The cached fields with `statusCode` set to 304, or
 *   null when neither URL has a cached entry.
 */
function getCachedResultFor304(url, fallbackUrl) {
    let entry = getCached(url);
    if (!entry && fallbackUrl) {
        entry = getCached(fallbackUrl);
    }
    if (!entry) {
        return null;
    }
    return {
        html: entry.html,
        buffer: entry.buffer,
        // Prefer the URL stored with the entry; fall back to the requested one.
        url: entry.url || url,
        statusCode: 304,
        contentType: entry.contentType,
        screenshot: entry.screenshot,
    };
}
|
|
120
|
+
/**
 * Create the error thrown when a caller-supplied abort signal has fired.
 *
 * @returns {Error} An Error with message 'Operation aborted' and name 'AbortError'.
 */
function createAbortError() {
    return Object.assign(new Error('Operation aborted'), { name: 'AbortError' });
}
|
|
21
125
|
/**
|
|
22
126
|
* SECURITY: Validate URL to prevent SSRF attacks
|
|
23
127
|
* Blocks localhost, private IPs, link-local, and various bypass techniques
|
|
@@ -229,20 +333,37 @@ function validateUserAgent(userAgent) {
|
|
|
229
333
|
* Fast and lightweight, but can be blocked by Cloudflare/bot detection
|
|
230
334
|
* SECURITY: Manual redirect handling with SSRF re-validation
|
|
231
335
|
*/
|
|
232
|
-
export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders) {
|
|
336
|
+
export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders, abortSignal) {
|
|
233
337
|
// SECURITY: Validate URL to prevent SSRF
|
|
234
338
|
validateUrl(url);
|
|
339
|
+
if (abortSignal?.aborted) {
|
|
340
|
+
throw createAbortError();
|
|
341
|
+
}
|
|
235
342
|
// Validate user agent if provided
|
|
236
|
-
|
|
343
|
+
// SEC.gov requires a User-Agent with contact info (their documented automated access policy)
|
|
344
|
+
const hostname = new URL(url).hostname.toLowerCase();
|
|
345
|
+
const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
|
|
346
|
+
const validatedUserAgent = isSecGov
|
|
347
|
+
? 'WebPeel/1.0 (support@webpeel.dev)'
|
|
348
|
+
: (userAgent ? validateUserAgent(userAgent) : getRandomUserAgent());
|
|
237
349
|
// SECURITY: Merge custom headers with defaults, block Host header override
|
|
238
350
|
const defaultHeaders = {
|
|
239
351
|
'User-Agent': validatedUserAgent,
|
|
240
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
352
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
|
241
353
|
'Accept-Language': 'en-US,en;q=0.9',
|
|
242
|
-
'Accept-Encoding': 'gzip, deflate
|
|
354
|
+
'Accept-Encoding': 'br, gzip, deflate',
|
|
243
355
|
'DNT': '1',
|
|
244
356
|
'Connection': 'keep-alive',
|
|
245
357
|
'Upgrade-Insecure-Requests': '1',
|
|
358
|
+
'Sec-CH-UA': '"Chromium";v="131", "Not_A Brand";v="24"',
|
|
359
|
+
'Sec-CH-UA-Mobile': '?0',
|
|
360
|
+
'Sec-CH-UA-Platform': '"macOS"',
|
|
361
|
+
'Sec-Fetch-Dest': 'document',
|
|
362
|
+
'Sec-Fetch-Mode': 'navigate',
|
|
363
|
+
'Sec-Fetch-Site': 'none',
|
|
364
|
+
'Sec-Fetch-User': '?1',
|
|
365
|
+
'Cache-Control': 'max-age=0',
|
|
366
|
+
'Priority': 'u=0, i',
|
|
246
367
|
};
|
|
247
368
|
const mergedHeaders = { ...defaultHeaders };
|
|
248
369
|
if (customHeaders) {
|
|
@@ -258,6 +379,15 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
258
379
|
let redirectCount = 0;
|
|
259
380
|
let currentUrl = url;
|
|
260
381
|
const seenUrls = new Set();
|
|
382
|
+
try {
|
|
383
|
+
const hostname = new URL(url).hostname;
|
|
384
|
+
void resolveAndCache(hostname).catch(() => {
|
|
385
|
+
// Best-effort optimization only.
|
|
386
|
+
});
|
|
387
|
+
}
|
|
388
|
+
catch {
|
|
389
|
+
// Ignore URL parsing errors here; validation handles invalid input below.
|
|
390
|
+
}
|
|
261
391
|
while (redirectCount <= MAX_REDIRECTS) {
|
|
262
392
|
// Detect redirect loops
|
|
263
393
|
if (seenUrls.has(currentUrl)) {
|
|
@@ -266,15 +396,34 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
266
396
|
seenUrls.add(currentUrl);
|
|
267
397
|
// Re-validate on each redirect
|
|
268
398
|
validateUrl(currentUrl);
|
|
269
|
-
const
|
|
270
|
-
const timer = setTimeout(() =>
|
|
399
|
+
const timeoutController = new AbortController();
|
|
400
|
+
const timer = setTimeout(() => timeoutController.abort(), timeoutMs);
|
|
401
|
+
const signal = abortSignal
|
|
402
|
+
? AbortSignal.any([timeoutController.signal, abortSignal])
|
|
403
|
+
: timeoutController.signal;
|
|
271
404
|
try {
|
|
405
|
+
const requestHeaders = { ...mergedHeaders };
|
|
406
|
+
const validators = getConditionalValidators(currentUrl);
|
|
407
|
+
if (validators?.etag && !hasHeader(requestHeaders, 'if-none-match')) {
|
|
408
|
+
requestHeaders['If-None-Match'] = validators.etag;
|
|
409
|
+
}
|
|
410
|
+
if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
|
|
411
|
+
requestHeaders['If-Modified-Since'] = validators.lastModified;
|
|
412
|
+
}
|
|
272
413
|
const response = await undiciFetch(currentUrl, {
|
|
273
|
-
headers:
|
|
274
|
-
signal
|
|
414
|
+
headers: requestHeaders,
|
|
415
|
+
signal,
|
|
416
|
+
dispatcher: httpPool,
|
|
275
417
|
redirect: 'manual', // SECURITY: Manual redirect handling
|
|
276
418
|
});
|
|
277
419
|
clearTimeout(timer);
|
|
420
|
+
if (response.status === 304) {
|
|
421
|
+
const cachedResult = getCachedResultFor304(currentUrl, url);
|
|
422
|
+
if (cachedResult) {
|
|
423
|
+
return cachedResult;
|
|
424
|
+
}
|
|
425
|
+
throw new NetworkError('HTTP 304 received but no cached response is available');
|
|
426
|
+
}
|
|
278
427
|
// Handle redirects manually
|
|
279
428
|
if (response.status >= 300 && response.status < 400) {
|
|
280
429
|
const location = response.headers.get('location');
|
|
@@ -283,6 +432,15 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
283
432
|
}
|
|
284
433
|
// Resolve relative URLs
|
|
285
434
|
currentUrl = new URL(location, currentUrl).href;
|
|
435
|
+
try {
|
|
436
|
+
const hostname = new URL(currentUrl).hostname;
|
|
437
|
+
void resolveAndCache(hostname).catch(() => {
|
|
438
|
+
// Best-effort optimization only.
|
|
439
|
+
});
|
|
440
|
+
}
|
|
441
|
+
catch {
|
|
442
|
+
// Ignore URL parsing errors here; validation handles invalid input below.
|
|
443
|
+
}
|
|
286
444
|
redirectCount++;
|
|
287
445
|
continue;
|
|
288
446
|
}
|
|
@@ -292,20 +450,37 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
292
450
|
}
|
|
293
451
|
throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`);
|
|
294
452
|
}
|
|
295
|
-
|
|
453
|
+
rememberConditionalValidators(currentUrl, response);
|
|
454
|
+
// Content-Type detection
|
|
296
455
|
const contentType = response.headers.get('content-type') || '';
|
|
456
|
+
const contentTypeLower = contentType.toLowerCase();
|
|
457
|
+
const urlLower = currentUrl.toLowerCase();
|
|
458
|
+
// Support binary documents (PDF/DOCX) in the simple HTTP path.
|
|
459
|
+
const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
|
|
460
|
+
const isDocx = contentTypeLower.includes('application/vnd.openxmlformats-officedocument.wordprocessingml.document') || urlLower.endsWith('.docx');
|
|
461
|
+
const isBinaryDoc = isPdf || isDocx;
|
|
462
|
+
// Accept a wide range of text-based content, plus supported binary documents.
|
|
297
463
|
const ALLOWED_TYPES = [
|
|
298
|
-
'text/html', 'application/xhtml+xml',
|
|
464
|
+
'text/html', 'application/xhtml+xml',
|
|
299
465
|
'text/plain', 'text/markdown', 'text/csv',
|
|
300
466
|
'application/json', 'text/json',
|
|
301
467
|
'text/xml', 'application/xml', 'application/rss+xml', 'application/atom+xml',
|
|
302
468
|
'application/javascript', 'text/javascript', 'text/css',
|
|
469
|
+
// Documents
|
|
470
|
+
'application/pdf',
|
|
471
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
303
472
|
];
|
|
304
|
-
const isAllowed =
|
|
473
|
+
const isAllowed = !contentTypeLower ||
|
|
474
|
+
ALLOWED_TYPES.some(t => contentTypeLower.includes(t)) ||
|
|
475
|
+
// Many servers mislabel docs as octet-stream; allow when URL implies a supported document.
|
|
476
|
+
(contentTypeLower.includes('application/octet-stream') && isBinaryDoc);
|
|
305
477
|
if (!isAllowed) {
|
|
306
478
|
// Check if it's at least text-based
|
|
307
|
-
|
|
308
|
-
|
|
479
|
+
const isTexty = contentTypeLower.startsWith('text/') ||
|
|
480
|
+
contentTypeLower.includes('json') ||
|
|
481
|
+
contentTypeLower.includes('xml');
|
|
482
|
+
if (!isTexty) {
|
|
483
|
+
throw new WebPeelError(`Binary content type: ${contentType}. WebPeel handles text-based content and PDF/DOCX documents only.`);
|
|
309
484
|
}
|
|
310
485
|
}
|
|
311
486
|
// SECURITY: Stream response with size limit (prevent memory exhaustion)
|
|
@@ -339,14 +514,18 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
339
514
|
combined.set(chunk, offset);
|
|
340
515
|
offset += chunk.length;
|
|
341
516
|
}
|
|
342
|
-
const
|
|
517
|
+
const buffer = Buffer.from(combined);
|
|
518
|
+
const html = isBinaryDoc ? '' : new TextDecoder().decode(combined);
|
|
343
519
|
// For HTML content, check for suspiciously small responses (bot blocks)
|
|
344
520
|
// Non-HTML content (JSON, text, XML) can legitimately be short
|
|
345
|
-
const isHtmlContent =
|
|
521
|
+
const isHtmlContent = !isBinaryDoc && (contentTypeLower.includes('html') || contentTypeLower.includes('xhtml'));
|
|
346
522
|
if (isHtmlContent && (!html || html.length < 100)) {
|
|
347
523
|
throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
|
|
348
524
|
}
|
|
349
|
-
if (!html) {
|
|
525
|
+
if (!isBinaryDoc && !html) {
|
|
526
|
+
throw new NetworkError('Empty response body');
|
|
527
|
+
}
|
|
528
|
+
if (isBinaryDoc && buffer.length === 0) {
|
|
350
529
|
throw new NetworkError('Empty response body');
|
|
351
530
|
}
|
|
352
531
|
// Check for Cloudflare challenge (only relevant for HTML)
|
|
@@ -355,6 +534,7 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
355
534
|
}
|
|
356
535
|
return {
|
|
357
536
|
html,
|
|
537
|
+
buffer: isBinaryDoc ? buffer : undefined,
|
|
358
538
|
url: currentUrl,
|
|
359
539
|
statusCode: response.status,
|
|
360
540
|
contentType,
|
|
@@ -366,6 +546,9 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
366
546
|
throw error;
|
|
367
547
|
}
|
|
368
548
|
if (error instanceof Error && error.name === 'AbortError') {
|
|
549
|
+
if (abortSignal?.aborted && !timeoutController.signal.aborted) {
|
|
550
|
+
throw createAbortError();
|
|
551
|
+
}
|
|
369
552
|
throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
|
|
370
553
|
}
|
|
371
554
|
// Provide specific error messages based on the actual cause
|
|
@@ -393,15 +576,101 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
393
576
|
}
|
|
394
577
|
throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
|
|
395
578
|
}
|
|
579
|
+
/**
 * Replace the shared HTTP pool with a fresh one and close the previous pool.
 *
 * The new pool is installed before the old one is closed, and any error from
 * closing the old pool is ignored.
 *
 * @returns {Promise<void>} Resolves once the previous pool has finished closing.
 */
export async function closePool() {
    const retiringPool = httpPool;
    httpPool = createHttpPool();
    const closing = retiringPool.close();
    await closing.catch(() => { });
}
|
|
396
584
|
let sharedBrowser = null;
|
|
397
585
|
let sharedStealthBrowser = null;
|
|
398
586
|
let activePagesCount = 0;
|
|
399
587
|
const MAX_CONCURRENT_PAGES = 5;
|
|
588
|
+
const PAGE_POOL_SIZE = 3;
|
|
589
|
+
const pooledPages = new Set();
|
|
590
|
+
const idlePagePool = [];
|
|
591
|
+
let pagePoolFillPromise = null;
|
|
592
|
+
/**
 * Forget a page: remove it from the pool membership set and, if present,
 * from the idle queue.
 *
 * @param {object} page - Page to remove from the pool bookkeeping.
 * @returns {void}
 */
function removePooledPage(page) {
    pooledPages.delete(page);
    const position = idlePagePool.indexOf(page);
    if (position < 0) {
        return;
    }
    idlePagePool.splice(position, 1);
}
|
|
599
|
+
/**
 * Take the first usable page from the idle queue.
 *
 * Pages found to be closed are dropped from the pool bookkeeping and skipped.
 *
 * @returns {object|null} An open pooled page, or null when none is idle.
 */
function takePooledPage() {
    let livePage = null;
    while (livePage === null && idlePagePool.length > 0) {
        const candidate = idlePagePool.shift();
        if (candidate.isClosed()) {
            removePooledPage(candidate);
        }
        else {
            livePage = candidate;
        }
    }
    return livePage;
}
|
|
610
|
+
/**
 * Top the page pool up to PAGE_POOL_SIZE pages.
 *
 * Does nothing when there is no connected browser. Concurrent callers share
 * one in-flight fill: while `pagePoolFillPromise` is set, callers await it
 * instead of starting another fill.
 *
 * @param {object} [browser] - Browser to create pages in; defaults to `sharedBrowser`.
 * @returns {Promise<void>} Settles when the current fill finishes; rejects if
 *   creating a page fails.
 */
async function ensurePagePool(browser) {
    const activeBrowser = browser ?? sharedBrowser;
    if (!activeBrowser || !activeBrowser.isConnected()) {
        return;
    }
    if (!pagePoolFillPromise) {
        const fill = async () => {
            while (pooledPages.size < PAGE_POOL_SIZE) {
                const freshPage = await activeBrowser.newPage({
                    userAgent: getRandomUserAgent(),
                });
                pooledPages.add(freshPage);
                idlePagePool.push(freshPage);
            }
        };
        // Clear the marker on success or failure so a later call can refill.
        pagePoolFillPromise = fill().finally(() => {
            pagePoolFillPromise = null;
        });
    }
    await pagePoolFillPromise;
}
|
|
632
|
+
/**
 * Return a page after use: reset pooled pages and put them back on the idle
 * queue; close pages that are not in the pool.
 *
 * The reset removes the catch-all route, clears the context's cookies, clears
 * extra HTTP headers and navigates to about:blank. If the reset throws, the
 * page is removed from the pool and closed. A background refill is started
 * when the pool has dropped below PAGE_POOL_SIZE.
 *
 * @param {object} page - Page to recycle or close.
 * @returns {Promise<void>}
 */
async function recyclePooledPage(page) {
    const ignore = () => { };
    const refillInBackground = () => {
        void ensurePagePool(sharedBrowser).catch(ignore);
    };
    if (!pooledPages.has(page)) {
        // Not a pooled page: simply dispose of it.
        await page.close().catch(ignore);
        return;
    }
    if (page.isClosed()) {
        removePooledPage(page);
        if (sharedBrowser?.isConnected()) {
            refillInBackground();
        }
        return;
    }
    try {
        await page.unroute('**/*').catch(ignore);
        await page.context().clearCookies().catch(ignore);
        // Unlike the neighbouring steps, a failure here is not swallowed: it
        // drops into the catch below and the page is discarded.
        await page.setExtraHTTPHeaders({});
        await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 }).catch(ignore);
        if (!idlePagePool.includes(page)) {
            idlePagePool.push(page);
        }
    }
    catch {
        removePooledPage(page);
        await page.close().catch(ignore);
    }
    if (sharedBrowser?.isConnected() && pooledPages.size < PAGE_POOL_SIZE) {
        refillInBackground();
    }
}
|
|
661
|
+
/**
 * Warm up the fetcher: start DNS warmup, obtain the shared browser and fill
 * the page pool.
 *
 * @returns {Promise<void>} Resolves once the page pool fill has completed.
 */
export async function warmup() {
    startDnsWarmup();
    await ensurePagePool(await getBrowser());
}
|
|
400
666
|
async function getBrowser() {
|
|
401
667
|
// SECURITY: Check if browser is still connected and healthy
|
|
402
668
|
if (sharedBrowser) {
|
|
403
669
|
try {
|
|
404
670
|
if (sharedBrowser.isConnected()) {
|
|
671
|
+
if (pooledPages.size < PAGE_POOL_SIZE) {
|
|
672
|
+
void ensurePagePool(sharedBrowser).catch(() => { });
|
|
673
|
+
}
|
|
405
674
|
return sharedBrowser;
|
|
406
675
|
}
|
|
407
676
|
}
|
|
@@ -410,7 +679,11 @@ async function getBrowser() {
|
|
|
410
679
|
sharedBrowser = null;
|
|
411
680
|
}
|
|
412
681
|
}
|
|
682
|
+
pooledPages.clear();
|
|
683
|
+
idlePagePool.length = 0;
|
|
684
|
+
pagePoolFillPromise = null;
|
|
413
685
|
sharedBrowser = await chromium.launch({ headless: true });
|
|
686
|
+
void ensurePagePool(sharedBrowser).catch(() => { });
|
|
414
687
|
return sharedBrowser;
|
|
415
688
|
}
|
|
416
689
|
async function getStealthBrowser() {
|
|
@@ -436,13 +709,16 @@ async function getStealthBrowser() {
|
|
|
436
709
|
export async function browserFetch(url, options = {}) {
|
|
437
710
|
// SECURITY: Validate URL to prevent SSRF
|
|
438
711
|
validateUrl(url);
|
|
439
|
-
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, } = options;
|
|
712
|
+
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, } = options;
|
|
440
713
|
// Validate user agent if provided
|
|
441
714
|
const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
|
|
442
715
|
// Validate wait time
|
|
443
716
|
if (waitMs < 0 || waitMs > 60000) {
|
|
444
717
|
throw new WebPeelError('Wait time must be between 0 and 60000ms');
|
|
445
718
|
}
|
|
719
|
+
if (signal?.aborted) {
|
|
720
|
+
throw createAbortError();
|
|
721
|
+
}
|
|
446
722
|
// SECURITY: Validate custom headers if provided
|
|
447
723
|
if (headers) {
|
|
448
724
|
for (const [key, value] of Object.entries(headers)) {
|
|
@@ -466,14 +742,51 @@ export async function browserFetch(url, options = {}) {
|
|
|
466
742
|
}
|
|
467
743
|
activePagesCount++;
|
|
468
744
|
let page = null;
|
|
745
|
+
let usingPooledPage = false;
|
|
746
|
+
let abortHandler;
|
|
469
747
|
try {
|
|
470
748
|
const browser = stealth ? await getStealthBrowser() : await getBrowser();
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
749
|
+
const shouldUsePagePool = !stealth && !userAgent && !keepPageOpen;
|
|
750
|
+
if (shouldUsePagePool) {
|
|
751
|
+
page = takePooledPage();
|
|
752
|
+
usingPooledPage = !!page;
|
|
753
|
+
if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
|
|
754
|
+
void ensurePagePool(browser).catch(() => { });
|
|
755
|
+
}
|
|
756
|
+
}
|
|
757
|
+
if (!page) {
|
|
758
|
+
const pageOptions = {
|
|
759
|
+
userAgent: validatedUserAgent,
|
|
760
|
+
...(stealth
|
|
761
|
+
? {
|
|
762
|
+
viewport: { width: 1920, height: 1080 },
|
|
763
|
+
locale: 'en-US',
|
|
764
|
+
timezoneId: 'America/New_York',
|
|
765
|
+
javaScriptEnabled: true,
|
|
766
|
+
}
|
|
767
|
+
: {}),
|
|
768
|
+
};
|
|
769
|
+
page = await browser.newPage(pageOptions);
|
|
770
|
+
usingPooledPage = false;
|
|
771
|
+
}
|
|
772
|
+
else {
|
|
773
|
+
await page.setViewportSize({ width: 1280, height: 720 }).catch(() => { });
|
|
774
|
+
}
|
|
775
|
+
if (signal) {
|
|
776
|
+
abortHandler = () => {
|
|
777
|
+
if (page && !page.isClosed()) {
|
|
778
|
+
void page.close().catch(() => { });
|
|
779
|
+
}
|
|
780
|
+
};
|
|
781
|
+
signal.addEventListener('abort', abortHandler, { once: true });
|
|
782
|
+
}
|
|
783
|
+
await page.unroute('**/*').catch(() => { });
|
|
784
|
+
const mergedHeaders = { ...(headers || {}) };
|
|
785
|
+
if (usingPooledPage) {
|
|
786
|
+
mergedHeaders['User-Agent'] = validatedUserAgent;
|
|
787
|
+
}
|
|
788
|
+
if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
|
|
789
|
+
await page.setExtraHTTPHeaders(mergedHeaders);
|
|
477
790
|
}
|
|
478
791
|
// Set cookies if provided
|
|
479
792
|
if (cookies && cookies.length > 0) {
|
|
@@ -491,8 +804,12 @@ export async function browserFetch(url, options = {}) {
|
|
|
491
804
|
});
|
|
492
805
|
await page.context().addCookies(parsedCookies);
|
|
493
806
|
}
|
|
494
|
-
|
|
495
|
-
|
|
807
|
+
if (signal?.aborted) {
|
|
808
|
+
throw createAbortError();
|
|
809
|
+
}
|
|
810
|
+
// Block images/fonts/etc for speed in non-stealth mode.
|
|
811
|
+
// In stealth mode, blocking common resources can be a bot-detection signal.
|
|
812
|
+
if (!screenshot && !stealth) {
|
|
496
813
|
await page.route('**/*', (route) => {
|
|
497
814
|
const resourceType = route.request().resourceType();
|
|
498
815
|
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
|
@@ -504,19 +821,48 @@ export async function browserFetch(url, options = {}) {
|
|
|
504
821
|
});
|
|
505
822
|
}
|
|
506
823
|
else {
|
|
507
|
-
// For screenshots, allow all resources
|
|
824
|
+
// For screenshots and stealth mode, allow all resources
|
|
508
825
|
await page.route('**/*', (route) => route.continue());
|
|
509
826
|
}
|
|
510
827
|
// SECURITY: Wrap entire operation in timeout
|
|
511
828
|
let screenshotBuffer;
|
|
829
|
+
const throwIfAborted = () => {
|
|
830
|
+
if (signal?.aborted) {
|
|
831
|
+
throw createAbortError();
|
|
832
|
+
}
|
|
833
|
+
};
|
|
512
834
|
const fetchPromise = (async () => {
|
|
513
|
-
await page.goto(url, {
|
|
835
|
+
const response = await page.goto(url, {
|
|
514
836
|
waitUntil: 'domcontentloaded',
|
|
515
837
|
timeout: timeoutMs,
|
|
516
838
|
});
|
|
517
|
-
|
|
839
|
+
throwIfAborted();
|
|
840
|
+
// Quick check: if body text is very thin, wait for JS to render more content.
|
|
841
|
+
// Only adds latency when the page clearly hasn't loaded yet.
|
|
842
|
+
// eslint-disable-next-line @typescript-eslint/no-implied-eval
|
|
843
|
+
const bodyTextLength = await page.evaluate('document.body?.innerText?.trim().length || 0').catch(() => 0);
|
|
844
|
+
if (bodyTextLength < 500) {
|
|
845
|
+
await page.waitForLoadState('networkidle', { timeout: 1500 }).catch(() => { });
|
|
846
|
+
throwIfAborted();
|
|
847
|
+
}
|
|
848
|
+
const finalUrl = page.url();
|
|
849
|
+
const contentType = response?.headers()?.['content-type'] || '';
|
|
850
|
+
const contentTypeLower = contentType.toLowerCase();
|
|
851
|
+
const urlLower = finalUrl.toLowerCase();
|
|
852
|
+
const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
|
|
853
|
+
const isDocx = contentTypeLower.includes('wordprocessingml.document') || urlLower.endsWith('.docx');
|
|
854
|
+
const isBinaryDoc = !!response && (isPdf || isDocx);
|
|
855
|
+
// Small randomized delay in stealth mode (simulate human behavior)
|
|
856
|
+
// Keep it short — enough to look human, not enough to kill latency
|
|
857
|
+
if (stealth) {
|
|
858
|
+
const extraDelayMs = 200 + Math.floor(Math.random() * 601);
|
|
859
|
+
await page.waitForTimeout(extraDelayMs);
|
|
860
|
+
throwIfAborted();
|
|
861
|
+
}
|
|
862
|
+
// Wait for additional time if requested (for dynamic content / screenshots)
|
|
518
863
|
if (waitMs > 0) {
|
|
519
864
|
await page.waitForTimeout(waitMs);
|
|
865
|
+
throwIfAborted();
|
|
520
866
|
}
|
|
521
867
|
// Execute page actions if provided
|
|
522
868
|
if (actions && actions.length > 0) {
|
|
@@ -525,23 +871,59 @@ export async function browserFetch(url, options = {}) {
|
|
|
525
871
|
if (actionScreenshot) {
|
|
526
872
|
screenshotBuffer = actionScreenshot;
|
|
527
873
|
}
|
|
874
|
+
throwIfAborted();
|
|
875
|
+
}
|
|
876
|
+
// If the navigation returned a binary document (PDF/DOCX), grab the raw body.
|
|
877
|
+
if (isBinaryDoc) {
|
|
878
|
+
const buffer = await response.body();
|
|
879
|
+
throwIfAborted();
|
|
880
|
+
// Capture screenshot if requested (and not already captured by actions)
|
|
881
|
+
if (screenshot && !screenshotBuffer) {
|
|
882
|
+
screenshotBuffer = await page.screenshot({
|
|
883
|
+
fullPage: screenshotFullPage,
|
|
884
|
+
type: 'png',
|
|
885
|
+
});
|
|
886
|
+
}
|
|
887
|
+
return {
|
|
888
|
+
html: '',
|
|
889
|
+
finalUrl,
|
|
890
|
+
buffer,
|
|
891
|
+
contentType,
|
|
892
|
+
statusCode: response.status(),
|
|
893
|
+
};
|
|
528
894
|
}
|
|
529
895
|
const html = await page.content();
|
|
530
|
-
|
|
531
|
-
return {
|
|
896
|
+
throwIfAborted();
|
|
897
|
+
return {
|
|
898
|
+
html,
|
|
899
|
+
finalUrl,
|
|
900
|
+
contentType,
|
|
901
|
+
statusCode: response?.status(),
|
|
902
|
+
};
|
|
532
903
|
})();
|
|
904
|
+
let operationTimeout;
|
|
533
905
|
const timeoutPromise = new Promise((_, reject) => {
|
|
534
|
-
setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
|
|
906
|
+
operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
|
|
535
907
|
});
|
|
536
|
-
const
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
throw new WebPeelError('Response too large (max 10MB)');
|
|
908
|
+
const fetchData = await Promise.race([fetchPromise, timeoutPromise]);
|
|
909
|
+
if (operationTimeout) {
|
|
910
|
+
clearTimeout(operationTimeout);
|
|
540
911
|
}
|
|
541
|
-
|
|
542
|
-
|
|
912
|
+
const { html, finalUrl } = fetchData;
|
|
913
|
+
const fetchBuffer = 'buffer' in fetchData ? fetchData.buffer : undefined;
|
|
914
|
+
const fetchContentType = 'contentType' in fetchData ? fetchData.contentType : undefined;
|
|
915
|
+
const fetchStatusCode = 'statusCode' in fetchData ? fetchData.statusCode : undefined;
|
|
916
|
+
const isBinaryDoc = !!fetchBuffer;
|
|
917
|
+
// SECURITY: Limit HTML size (skip for binary documents where html is empty)
|
|
918
|
+
if (!isBinaryDoc) {
|
|
919
|
+
if (html.length > 10 * 1024 * 1024) { // 10MB limit
|
|
920
|
+
throw new WebPeelError('Response too large (max 10MB)');
|
|
921
|
+
}
|
|
922
|
+
if (!html || html.length < 100) {
|
|
923
|
+
throw new BlockedError('Empty or suspiciously small response from browser.');
|
|
924
|
+
}
|
|
543
925
|
}
|
|
544
|
-
// Capture screenshot if requested (and not already captured by actions)
|
|
926
|
+
// Capture screenshot if requested (and not already captured by actions or document handler)
|
|
545
927
|
if (screenshot && !screenshotBuffer) {
|
|
546
928
|
screenshotBuffer = await page.screenshot({
|
|
547
929
|
fullPage: screenshotFullPage,
|
|
@@ -552,7 +934,10 @@ export async function browserFetch(url, options = {}) {
|
|
|
552
934
|
if (keepPageOpen && page) {
|
|
553
935
|
return {
|
|
554
936
|
html,
|
|
937
|
+
buffer: fetchBuffer,
|
|
555
938
|
url: finalUrl,
|
|
939
|
+
statusCode: fetchStatusCode,
|
|
940
|
+
contentType: fetchContentType,
|
|
556
941
|
screenshot: screenshotBuffer,
|
|
557
942
|
page,
|
|
558
943
|
browser,
|
|
@@ -560,7 +945,10 @@ export async function browserFetch(url, options = {}) {
|
|
|
560
945
|
}
|
|
561
946
|
return {
|
|
562
947
|
html,
|
|
948
|
+
buffer: fetchBuffer,
|
|
563
949
|
url: finalUrl,
|
|
950
|
+
statusCode: fetchStatusCode,
|
|
951
|
+
contentType: fetchContentType,
|
|
564
952
|
screenshot: screenshotBuffer,
|
|
565
953
|
};
|
|
566
954
|
}
|
|
@@ -568,15 +956,26 @@ export async function browserFetch(url, options = {}) {
|
|
|
568
956
|
if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
|
|
569
957
|
throw error;
|
|
570
958
|
}
|
|
959
|
+
if (error instanceof Error && error.name === 'AbortError') {
|
|
960
|
+
throw error;
|
|
961
|
+
}
|
|
571
962
|
if (error instanceof Error && error.message.includes('Timeout')) {
|
|
572
963
|
throw new TimeoutError(`Browser navigation timed out`);
|
|
573
964
|
}
|
|
574
965
|
throw new NetworkError(`Browser fetch failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
575
966
|
}
|
|
576
967
|
finally {
|
|
577
|
-
|
|
968
|
+
if (signal && abortHandler) {
|
|
969
|
+
signal.removeEventListener('abort', abortHandler);
|
|
970
|
+
}
|
|
971
|
+
// CRITICAL: Always release/close page and decrement counter (unless keepPageOpen and no error)
|
|
578
972
|
if (page && !keepPageOpen) {
|
|
579
|
-
|
|
973
|
+
if (usingPooledPage) {
|
|
974
|
+
await recyclePooledPage(page);
|
|
975
|
+
}
|
|
976
|
+
else {
|
|
977
|
+
await page.close().catch(() => { });
|
|
978
|
+
}
|
|
580
979
|
}
|
|
581
980
|
activePagesCount--;
|
|
582
981
|
}
|
|
@@ -584,6 +983,168 @@ export async function browserFetch(url, options = {}) {
|
|
|
584
983
|
/**
|
|
585
984
|
* Retry a fetch operation with exponential backoff
|
|
586
985
|
*/
|
|
986
|
+
/**
 * Navigate a browser page to `url` and capture a screenshot of it.
 *
 * @param {string} url - Page to capture. Checked with validateUrl() before any browser work.
 * @param {object} [options]
 * @param {boolean} [options.fullPage=false] - Forwarded to the screenshot call.
 * @param {number} [options.width] - Viewport width, 100-5000. Falls back to 1280 when a viewport is applied.
 * @param {number} [options.height] - Viewport height, 100-5000. Falls back to 720 when a viewport is applied.
 * @param {'png'|'jpeg'} [options.format='png'] - Image format.
 * @param {number} [options.quality] - JPEG quality, 1-100. Only validated and forwarded when format is 'jpeg'.
 * @param {number} [options.waitMs=0] - Extra wait after navigation, 0-60000 ms.
 * @param {number} [options.timeoutMs=30000] - Navigation timeout and overall operation timeout, 1000-120000 ms.
 * @param {string} [options.userAgent] - Custom user agent; when omitted a random one is used.
 * @param {Record<string, string>} [options.headers] - Extra request headers. `Host` is rejected; values must be strings of at most 500 characters.
 * @param {string[]} [options.cookies] - Cookie strings of the form `name=value[; attributes]`. Only the leading name/value pair is used, scoped to `url`.
 * @param {boolean} [options.stealth=false] - Use the stealth browser (never uses the page pool).
 * @param {Array} [options.actions] - Actions handed to executeActions() from ./actions.js before capturing.
 * @returns {Promise<{ buffer: *, finalUrl: string }>} The screenshot data (from executeActions() or page.screenshot()) and the page URL after navigation.
 * @throws {WebPeelError} On invalid options, headers or cookies.
 * @throws {TimeoutError} When the page queue, navigation or the overall operation times out.
 * @throws {NetworkError} On any other browser failure.
 */
export async function browserScreenshot(url, options = {}) {
    // SECURITY: Validate URL to prevent SSRF
    validateUrl(url);
    const { fullPage = false, width, height, format = 'png', quality, waitMs = 0, timeoutMs = 30000, userAgent, headers, cookies, stealth = false, actions, } = options;
    const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
    // Basic validation
    if (waitMs < 0 || waitMs > 60000) {
        throw new WebPeelError('Wait time must be between 0 and 60000ms');
    }
    if (timeoutMs < 1000 || timeoutMs > 120000) {
        throw new WebPeelError('Timeout must be between 1000 and 120000ms');
    }
    if (width !== undefined && (!Number.isFinite(width) || width < 100 || width > 5000)) {
        throw new WebPeelError('Width must be between 100 and 5000');
    }
    if (height !== undefined && (!Number.isFinite(height) || height < 100 || height > 5000)) {
        throw new WebPeelError('Height must be between 100 and 5000');
    }
    if (format !== 'png' && format !== 'jpeg') {
        throw new WebPeelError('Format must be png or jpeg');
    }
    if (format === 'jpeg' && quality !== undefined) {
        if (!Number.isFinite(quality) || quality < 1 || quality > 100) {
            throw new WebPeelError('JPEG quality must be between 1 and 100');
        }
    }
    // SECURITY: Validate custom headers if provided
    if (headers) {
        for (const [key, value] of Object.entries(headers)) {
            if (key.toLowerCase() === 'host') {
                throw new WebPeelError('Custom Host header is not allowed');
            }
            if (typeof value !== 'string' || value.length > 500) {
                throw new WebPeelError('Invalid header value');
            }
        }
    }
    // One options object for both capture paths so `quality` is only ever
    // sent together with the JPEG format.
    const screenshotOptions = {
        fullPage,
        type: format,
        ...(format === 'jpeg' && typeof quality === 'number' ? { quality } : {}),
    };
    // SECURITY: Limit concurrent browser pages with timeout
    const queueStartTime = Date.now();
    const QUEUE_TIMEOUT_MS = 30000;
    while (activePagesCount >= MAX_CONCURRENT_PAGES) {
        if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
            throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
        }
        await new Promise(resolve => setTimeout(resolve, 100));
    }
    activePagesCount++;
    let page = null;
    let usingPooledPage = false;
    // Declared outside the try block so the finally clause can always clear it.
    let operationTimeout;
    try {
        const browser = stealth ? await getStealthBrowser() : await getBrowser();
        const shouldUsePagePool = !stealth && !userAgent;
        if (shouldUsePagePool) {
            page = takePooledPage();
            usingPooledPage = !!page;
            if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
                // Fire-and-forget pool refill; failures are deliberately ignored.
                void ensurePagePool(browser).catch(() => { });
            }
        }
        if (!page) {
            page = await browser.newPage({
                userAgent: validatedUserAgent,
                viewport: width || height ? {
                    width: width || 1280,
                    height: height || 720,
                } : undefined,
            });
            usingPooledPage = false;
        }
        else {
            await page.setViewportSize({
                width: width || 1280,
                height: height || 720,
            }).catch(() => { });
        }
        await page.unroute('**/*').catch(() => { });
        const mergedHeaders = { ...(headers || {}) };
        if (usingPooledPage) {
            // A pooled page already exists, so the user agent is supplied as a header instead.
            mergedHeaders['User-Agent'] = validatedUserAgent;
        }
        if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
            await page.setExtraHTTPHeaders(mergedHeaders);
        }
        if (cookies && cookies.length > 0) {
            const parsedCookies = cookies.map((cookie) => {
                const [nameValue] = cookie.split(';');
                const pair = nameValue.trim();
                // Split on the FIRST '=' only: cookie values may themselves contain '='.
                const separatorIndex = pair.indexOf('=');
                const name = separatorIndex === -1 ? '' : pair.slice(0, separatorIndex).trim();
                if (!name) {
                    throw new WebPeelError(`Invalid cookie format: ${cookie}`);
                }
                return {
                    name,
                    value: pair.slice(separatorIndex + 1).trim(),
                    url,
                };
            });
            await page.context().addCookies(parsedCookies);
        }
        // For screenshots, allow all resources
        await page.route('**/*', (route) => route.continue());
        const doWork = (async () => {
            await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: timeoutMs,
            });
            if (waitMs > 0) {
                await page.waitForTimeout(waitMs);
            }
            let screenshotBuffer;
            if (actions && actions.length > 0) {
                const { executeActions } = await import('./actions.js');
                const actionScreenshot = await executeActions(page, actions, screenshotOptions);
                if (actionScreenshot) {
                    screenshotBuffer = actionScreenshot;
                }
            }
            const finalUrl = page.url();
            // Capture screenshot if not captured via actions
            if (!screenshotBuffer) {
                screenshotBuffer = await page.screenshot(screenshotOptions);
            }
            return { finalUrl, screenshotBuffer };
        })();
        const timeoutPromise = new Promise((_, reject) => {
            operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
        });
        const { finalUrl, screenshotBuffer } = await Promise.race([doWork, timeoutPromise]);
        return { buffer: screenshotBuffer, finalUrl };
    }
    catch (error) {
        if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
            throw error;
        }
        if (error instanceof Error && error.message.includes('Timeout')) {
            throw new TimeoutError('Browser screenshot timed out');
        }
        throw new NetworkError(`Browser screenshot failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    finally {
        // Clear the timer on every exit path; otherwise a failed run leaves it
        // pending for up to timeoutMs and keeps the event loop alive.
        clearTimeout(operationTimeout);
        // Always release/close the page and decrement the concurrency counter.
        if (page) {
            if (usingPooledPage) {
                await recyclePooledPage(page);
            }
            else {
                await page.close().catch(() => { });
            }
        }
        activePagesCount--;
    }
}
|
|
587
1148
|
export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
|
|
588
1149
|
let lastError = null;
|
|
589
1150
|
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
@@ -608,6 +1169,11 @@ export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
|
|
|
608
1169
|
* Clean up browser resources
|
|
609
1170
|
*/
|
|
610
1171
|
export async function cleanup() {
|
|
1172
|
+
const pagesToClose = Array.from(pooledPages);
|
|
1173
|
+
pooledPages.clear();
|
|
1174
|
+
idlePagePool.length = 0;
|
|
1175
|
+
pagePoolFillPromise = null;
|
|
1176
|
+
await Promise.all(pagesToClose.map((page) => page.close().catch(() => { })));
|
|
611
1177
|
if (sharedBrowser) {
|
|
612
1178
|
await sharedBrowser.close();
|
|
613
1179
|
sharedBrowser = null;
|
|
@@ -616,5 +1182,6 @@ export async function cleanup() {
|
|
|
616
1182
|
await sharedStealthBrowser.close();
|
|
617
1183
|
sharedStealthBrowser = null;
|
|
618
1184
|
}
|
|
1185
|
+
await closePool().catch(() => { });
|
|
619
1186
|
}
|
|
620
1187
|
//# sourceMappingURL=fetcher.js.map
|