webpeel 0.14.3 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/dist/cache.d.ts.map +1 -1
  2. package/dist/cache.js +11 -4
  3. package/dist/cache.js.map +1 -1
  4. package/dist/cli.bundle.cjs +159248 -0
  5. package/dist/core/agent.js +12 -8
  6. package/dist/core/agent.js.map +1 -1
  7. package/dist/core/application-tracker.js +3 -2
  8. package/dist/core/application-tracker.js.map +1 -1
  9. package/dist/core/auto-extract.js +6 -4
  10. package/dist/core/auto-extract.js.map +1 -1
  11. package/dist/core/browser-fetch.d.ts +90 -0
  12. package/dist/core/browser-fetch.d.ts.map +1 -0
  13. package/dist/core/browser-fetch.js +599 -0
  14. package/dist/core/browser-fetch.js.map +1 -0
  15. package/dist/core/browser-pool.d.ts +70 -0
  16. package/dist/core/browser-pool.d.ts.map +1 -0
  17. package/dist/core/browser-pool.js +378 -0
  18. package/dist/core/browser-pool.js.map +1 -0
  19. package/dist/core/change-tracking.js +3 -2
  20. package/dist/core/change-tracking.js.map +1 -1
  21. package/dist/core/diff.js +3 -2
  22. package/dist/core/diff.js.map +1 -1
  23. package/dist/core/domain-extractors.js +3 -2
  24. package/dist/core/domain-extractors.js.map +1 -1
  25. package/dist/core/extract-inline.js +6 -4
  26. package/dist/core/extract-inline.js.map +1 -1
  27. package/dist/core/fetcher.d.ts +9 -118
  28. package/dist/core/fetcher.d.ts.map +1 -1
  29. package/dist/core/fetcher.js +10 -1525
  30. package/dist/core/fetcher.js.map +1 -1
  31. package/dist/core/http-fetch.d.ts +37 -0
  32. package/dist/core/http-fetch.d.ts.map +1 -0
  33. package/dist/core/http-fetch.js +618 -0
  34. package/dist/core/http-fetch.js.map +1 -0
  35. package/dist/core/metadata.js +18 -12
  36. package/dist/core/metadata.js.map +1 -1
  37. package/dist/core/pipeline.d.ts +104 -0
  38. package/dist/core/pipeline.d.ts.map +1 -0
  39. package/dist/core/pipeline.js +623 -0
  40. package/dist/core/pipeline.js.map +1 -0
  41. package/dist/core/profiles.js +15 -10
  42. package/dist/core/profiles.js.map +1 -1
  43. package/dist/core/quick-answer.d.ts.map +1 -1
  44. package/dist/core/quick-answer.js +120 -9
  45. package/dist/core/quick-answer.js.map +1 -1
  46. package/dist/core/rate-governor.js +3 -2
  47. package/dist/core/rate-governor.js.map +1 -1
  48. package/dist/core/research.js +9 -6
  49. package/dist/core/research.js.map +1 -1
  50. package/dist/core/search-provider.js +12 -8
  51. package/dist/core/search-provider.js.map +1 -1
  52. package/dist/core/youtube.js +3 -2
  53. package/dist/core/youtube.js.map +1 -1
  54. package/dist/index.d.ts.map +1 -1
  55. package/dist/index.js +12 -487
  56. package/dist/index.js.map +1 -1
  57. package/dist/mcp/server.js +1 -1
  58. package/dist/mcp/server.js.map +1 -1
  59. package/dist/server/middleware/auth.js +3 -2
  60. package/dist/server/middleware/auth.js.map +1 -1
  61. package/dist/server/routes/compat.js +3 -2
  62. package/dist/server/routes/compat.js.map +1 -1
  63. package/dist/server/routes/fetch.d.ts.map +1 -1
  64. package/dist/server/routes/fetch.js +44 -4
  65. package/dist/server/routes/fetch.js.map +1 -1
  66. package/dist/server/routes/health.js +3 -2
  67. package/dist/server/routes/health.js.map +1 -1
  68. package/dist/server/routes/mcp.js +1 -1
  69. package/dist/server/routes/mcp.js.map +1 -1
  70. package/dist/server/routes/search.js +6 -4
  71. package/dist/server/routes/search.js.map +1 -1
  72. package/dist/server/routes/users.js +3 -2
  73. package/dist/server/routes/users.js.map +1 -1
  74. package/package.json +1 -1
@@ -1,1528 +1,13 @@
1
1
  /**
2
- * Core fetching logic: simple HTTP and browser-based fetching
3
- */
4
- // Force IPv4-first DNS resolution globally.
5
- // Prevents IPv6 connection failures (TLS errors, timeouts) on hosts that
6
- // advertise AAAA records but can't actually route IPv6 (e.g. Render containers).
7
- // Must run before any network library is used.
8
- import dns from 'dns';
9
- dns.setDefaultResultOrder('ipv4first');
10
- let _chromium = null;
11
- let _stealthChromium = null;
12
- /** Whether Playwright has been loaded (for diagnostics). */
13
- export let playwrightLoaded = false;
14
- async function getPlaywright() {
15
- if (!_chromium) {
16
- const pw = await import('playwright');
17
- _chromium = pw.chromium;
18
- playwrightLoaded = true;
19
- }
20
- return _chromium;
21
- }
22
- async function getStealthPlaywright() {
23
- if (!_stealthChromium) {
24
- const pwExtra = await import('playwright-extra');
25
- const StealthPlugin = (await import('puppeteer-extra-plugin-stealth')).default;
26
- _stealthChromium = pwExtra.chromium;
27
- _stealthChromium.use(StealthPlugin());
28
- playwrightLoaded = true;
29
- }
30
- return _stealthChromium;
31
- }
32
- import { getRealisticUserAgent, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
33
- import { fetch as undiciFetch, Agent, ProxyAgent } from 'undici';
34
- import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
35
- import { getCached } from './cache.js';
36
- import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
37
- import { detectChallenge } from './challenge-detection.js';
38
- /**
39
- * Returns a realistic Chrome user agent.
40
- * Delegates to the curated user-agents module so stealth mode never exposes
41
- * the default "Chrome for Testing" UA which is a reliable bot-detection signal.
42
- */
43
- function getRandomUserAgent() {
44
- return getRealisticUserAgent();
45
- }
46
- /**
47
- * Common Chromium launch arguments for anti-bot-detection.
48
- * Applied to BOTH regular and stealth browser instances.
49
- * NOTE: --window-size is intentionally omitted here; it is added dynamically
50
- * per browser launch using a random realistic viewport (see getRandomViewport()).
51
- */
52
- const ANTI_DETECTION_ARGS = [
53
- '--disable-blink-features=AutomationControlled',
54
- '--disable-infobars',
55
- '--disable-dev-shm-usage',
56
- '--no-sandbox',
57
- '--disable-setuid-sandbox',
58
- '--disable-gpu',
59
- '--start-maximized',
60
- // Chrome branding / stealth hardening
61
- '--disable-features=ChromeUserAgentDataBranding',
62
- '--disable-component-extensions-with-background-pages',
63
- '--disable-default-apps',
64
- '--disable-extensions',
65
- '--disable-hang-monitor',
66
- '--disable-popup-blocking',
67
- '--disable-prompt-on-repost',
68
- '--disable-sync',
69
- '--metrics-recording-only',
70
- '--no-first-run',
71
- ];
72
- /**
73
- * Returns a random realistic viewport weighted by real-world market share.
74
- * Used to avoid the telltale Playwright default of 1280×720.
75
- */
76
- function getRandomViewport() {
77
- // Common real-world resolutions weighted by market share
78
- const viewports = [
79
- { width: 1920, height: 1080, weight: 35 }, // Full HD
80
- { width: 1366, height: 768, weight: 20 }, // Laptop
81
- { width: 1536, height: 864, weight: 15 }, // Scaled laptop
82
- { width: 1440, height: 900, weight: 10 }, // MacBook
83
- { width: 1680, height: 1050, weight: 8 }, // Large laptop
84
- { width: 2560, height: 1440, weight: 7 }, // QHD
85
- { width: 1280, height: 800, weight: 5 }, // Older laptop
86
- ];
87
- const total = viewports.reduce((s, v) => s + v.weight, 0);
88
- let r = Math.random() * total;
89
- for (const v of viewports) {
90
- r -= v.weight;
91
- if (r <= 0)
92
- return { width: v.width, height: v.height };
93
- }
94
- return { width: 1920, height: 1080 };
95
- }
96
- /**
97
- * Apply stealth init scripts to a page to reduce bot-detection signals:
98
- * 1. Hides the `window.__pwInitScripts` Playwright leak.
99
- * 2. Patches `navigator.userAgentData.brands` to include "Google Chrome"
100
- * (Chrome for Testing only ships "Chromium" which is a known detection signal).
101
- */
102
- async function applyStealthScripts(page) {
103
- // 1. Hide Playwright's __pwInitScripts marker
104
- // Uses string form to avoid TypeScript DOM-lib requirements (tsconfig has no DOM lib).
105
- await page.addInitScript(`
106
- Object.defineProperty(window, '__pwInitScripts', {
107
- get: () => undefined,
108
- set: () => {},
109
- configurable: true,
110
- });
111
- `);
112
- // 2. Patch userAgentData brands to include "Google Chrome"
113
- // Chrome for Testing only ships "Chromium" — a well-known bot-detection signal.
114
- await page.addInitScript(`
115
- (function () {
116
- var uad = navigator.userAgentData;
117
- if (!uad) return;
118
- var originalBrands = uad.brands || [];
119
- var hasChromeEntry = originalBrands.some(function(b) { return b.brand === 'Google Chrome'; });
120
- if (hasChromeEntry) return;
121
-
122
- var chromiumEntry = originalBrands.find(function(b) { return b.brand === 'Chromium'; });
123
- var version = (chromiumEntry && chromiumEntry.version) || '136';
124
- var patchedBrands = [
125
- { brand: 'Chromium', version: version },
126
- { brand: 'Google Chrome', version: version },
127
- { brand: 'Not=A?Brand', version: '99' },
128
- ];
129
-
130
- Object.defineProperty(navigator, 'userAgentData', {
131
- get: function() {
132
- return {
133
- brands: patchedBrands,
134
- mobile: false,
135
- platform: uad.platform || 'Windows',
136
- getHighEntropyValues: uad.getHighEntropyValues ? uad.getHighEntropyValues.bind(uad) : undefined,
137
- toJSON: function() {
138
- return {
139
- brands: patchedBrands,
140
- mobile: false,
141
- platform: uad.platform || 'Windows',
142
- };
143
- },
144
- };
145
- },
146
- configurable: true,
147
- });
148
- })();
149
- `);
150
- }
151
- function createHttpPool() {
152
- return new Agent({
153
- connections: 20,
154
- pipelining: 6,
155
- keepAliveTimeout: 60000,
156
- keepAliveMaxTimeout: 60000,
157
- allowH2: true,
158
- connect: {
159
- lookup: cachedLookup,
160
- },
161
- });
162
- }
163
- let httpPool = createHttpPool();
164
- startDnsWarmup();
165
- const CONDITIONAL_CACHE_MAX_ENTRIES = 2000;
166
- const conditionalValidatorsByUrl = new Map();
167
- function normalizeUrlForConditionalCache(url) {
168
- try {
169
- const normalized = new URL(url);
170
- normalized.hash = '';
171
- normalized.hostname = normalized.hostname.toLowerCase();
172
- if ((normalized.protocol === 'http:' && normalized.port === '80') ||
173
- (normalized.protocol === 'https:' && normalized.port === '443')) {
174
- normalized.port = '';
175
- }
176
- if (!normalized.pathname) {
177
- normalized.pathname = '/';
178
- }
179
- const sortedParams = [...normalized.searchParams.entries()]
180
- .sort(([a], [b]) => a.localeCompare(b));
181
- normalized.search = '';
182
- for (const [key, value] of sortedParams) {
183
- normalized.searchParams.append(key, value);
184
- }
185
- return normalized.toString();
186
- }
187
- catch (e) {
188
- // Non-fatal: URL normalization failed, returning raw trimmed URL
189
- if (process.env.DEBUG)
190
- console.debug('[webpeel]', 'URL normalization:', e instanceof Error ? e.message : e);
191
- return url.trim();
192
- }
193
- }
194
- function getConditionalValidators(url) {
195
- const key = normalizeUrlForConditionalCache(url);
196
- const existing = conditionalValidatorsByUrl.get(key);
197
- if (!existing) {
198
- return null;
199
- }
200
- // LRU touch
201
- conditionalValidatorsByUrl.delete(key);
202
- conditionalValidatorsByUrl.set(key, existing);
203
- return existing;
204
- }
205
- function setConditionalValidators(url, validators) {
206
- const key = normalizeUrlForConditionalCache(url);
207
- if (conditionalValidatorsByUrl.has(key)) {
208
- conditionalValidatorsByUrl.delete(key);
209
- }
210
- conditionalValidatorsByUrl.set(key, validators);
211
- while (conditionalValidatorsByUrl.size > CONDITIONAL_CACHE_MAX_ENTRIES) {
212
- const oldestKey = conditionalValidatorsByUrl.keys().next().value;
213
- if (!oldestKey) {
214
- break;
215
- }
216
- conditionalValidatorsByUrl.delete(oldestKey);
217
- }
218
- }
219
- function rememberConditionalValidators(url, response) {
220
- const etag = response.headers.get('etag') || undefined;
221
- const lastModified = response.headers.get('last-modified') || undefined;
222
- if (!etag && !lastModified) {
223
- return;
224
- }
225
- setConditionalValidators(url, { etag, lastModified });
226
- }
227
- function hasHeader(headers, name) {
228
- const lowered = name.toLowerCase();
229
- return Object.keys(headers).some((header) => header.toLowerCase() === lowered);
230
- }
231
- function getCachedResultFor304(url, fallbackUrl) {
232
- const cached = getCached(url) || (fallbackUrl ? getCached(fallbackUrl) : null);
233
- if (!cached) {
234
- return null;
235
- }
236
- return {
237
- html: cached.html,
238
- buffer: cached.buffer,
239
- url: cached.url || url,
240
- statusCode: 304,
241
- contentType: cached.contentType,
242
- screenshot: cached.screenshot,
243
- };
244
- }
245
- function createAbortError() {
246
- const error = new Error('Operation aborted');
247
- error.name = 'AbortError';
248
- return error;
249
- }
250
- /**
251
- * SECURITY: Validate URL to prevent SSRF attacks
252
- * Blocks localhost, private IPs, link-local, and various bypass techniques
253
- */
254
- function validateUrl(urlString) {
255
- // Length check
256
- if (urlString.length > 2048) {
257
- throw new WebPeelError('URL too long (max 2048 characters)');
258
- }
259
- // Check for control characters and suspicious encoding
260
- if (/[\x00-\x1F\x7F]/.test(urlString)) {
261
- throw new WebPeelError('URL contains invalid control characters');
262
- }
263
- let url;
264
- try {
265
- url = new URL(urlString);
266
- }
267
- catch {
268
- throw new WebPeelError('Invalid URL format');
269
- }
270
- // Only allow HTTP(S)
271
- if (!['http:', 'https:'].includes(url.protocol)) {
272
- throw new WebPeelError('Only HTTP and HTTPS protocols are allowed');
273
- }
274
- // Validate hostname is not empty
275
- if (!url.hostname) {
276
- throw new WebPeelError('Invalid hostname');
277
- }
278
- const hostname = url.hostname.toLowerCase();
279
- // Block localhost patterns
280
- const localhostPatterns = ['localhost', '0.0.0.0'];
281
- if (localhostPatterns.some(pattern => hostname === pattern || hostname.endsWith('.' + pattern))) {
282
- throw new WebPeelError('Access to localhost is not allowed');
283
- }
284
- // ENHANCED: Parse and validate IP addresses (handles hex, octal, decimal, mixed)
285
- const ipv4Info = parseAndValidateIPv4(hostname);
286
- if (ipv4Info) {
287
- validateIPv4Address(ipv4Info);
288
- }
289
- // ENHANCED: Comprehensive IPv6 validation
290
- if (hostname.includes(':')) {
291
- validateIPv6Address(hostname);
292
- }
293
- }
294
- /**
295
- * Parse IPv4 address in any format (dotted, hex, octal, decimal, mixed)
296
- * Returns null if not an IPv4 address
297
- */
298
- function parseAndValidateIPv4(hostname) {
299
- // Remove brackets if present
300
- const cleaned = hostname.replace(/^\[|\]$/g, '');
301
- // Standard dotted notation: 192.168.1.1
302
- const dottedRegex = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
303
- const dottedMatch = cleaned.match(dottedRegex);
304
- if (dottedMatch) {
305
- const octets = dottedMatch.slice(1).map(Number);
306
- if (octets.every(o => o >= 0 && o <= 255)) {
307
- return octets;
308
- }
309
- throw new WebPeelError('Invalid IPv4 address');
310
- }
311
- // Hex notation: 0x7f000001
312
- if (/^0x[0-9a-fA-F]+$/.test(cleaned)) {
313
- const num = parseInt(cleaned, 16);
314
- return [
315
- (num >>> 24) & 0xff,
316
- (num >>> 16) & 0xff,
317
- (num >>> 8) & 0xff,
318
- num & 0xff,
319
- ];
320
- }
321
- // Octal notation: 0177.0.0.1 or full octal 017700000001
322
- if (/^0[0-7]/.test(cleaned)) {
323
- // Full octal (all digits)
324
- if (/^0[0-7]+$/.test(cleaned)) {
325
- const num = parseInt(cleaned, 8);
326
- if (num <= 0xffffffff) {
327
- return [
328
- (num >>> 24) & 0xff,
329
- (num >>> 16) & 0xff,
330
- (num >>> 8) & 0xff,
331
- num & 0xff,
332
- ];
333
- }
334
- }
335
- // Mixed octal-decimal: 0177.0.0.1
336
- const parts = cleaned.split('.');
337
- if (parts.length === 4) {
338
- const octets = parts.map(p => parseInt(p, /^0[0-7]/.test(p) ? 8 : 10));
339
- if (octets.every(o => o >= 0 && o <= 255)) {
340
- return octets;
341
- }
342
- }
343
- }
344
- // Decimal notation: 2130706433
345
- if (/^\d+$/.test(cleaned)) {
346
- const num = parseInt(cleaned, 10);
347
- if (num <= 0xffffffff) {
348
- return [
349
- (num >>> 24) & 0xff,
350
- (num >>> 16) & 0xff,
351
- (num >>> 8) & 0xff,
352
- num & 0xff,
353
- ];
354
- }
355
- }
356
- return null;
357
- }
358
- /**
359
- * Validate IPv4 address against private/reserved ranges
360
- */
361
- function validateIPv4Address(octets) {
362
- const [a, b, c, d] = octets;
363
- // Loopback: 127.0.0.0/8
364
- if (a === 127) {
365
- throw new WebPeelError('Access to loopback addresses is not allowed');
366
- }
367
- // Private: 10.0.0.0/8
368
- if (a === 10) {
369
- throw new WebPeelError('Access to private IP addresses is not allowed');
370
- }
371
- // Private: 172.16.0.0/12
372
- if (a === 172 && b >= 16 && b <= 31) {
373
- throw new WebPeelError('Access to private IP addresses is not allowed');
374
- }
375
- // Private: 192.168.0.0/16
376
- if (a === 192 && b === 168) {
377
- throw new WebPeelError('Access to private IP addresses is not allowed');
378
- }
379
- // Link-local: 169.254.0.0/16
380
- if (a === 169 && b === 254) {
381
- throw new WebPeelError('Access to link-local addresses is not allowed');
382
- }
383
- // Broadcast: 255.255.255.255
384
- if (a === 255 && b === 255 && c === 255 && d === 255) {
385
- throw new WebPeelError('Access to broadcast address is not allowed');
386
- }
387
- // This network: 0.0.0.0/8
388
- if (a === 0) {
389
- throw new WebPeelError('Access to "this network" addresses is not allowed');
390
- }
391
- }
392
- /**
393
- * Validate IPv6 address against private/reserved ranges
394
- */
395
- function validateIPv6Address(hostname) {
396
- // Remove brackets
397
- const addr = hostname.replace(/^\[|\]$/g, '').toLowerCase();
398
- // Loopback: ::1
399
- if (addr === '::1' || addr === '0:0:0:0:0:0:0:1') {
400
- throw new WebPeelError('Access to loopback addresses is not allowed');
401
- }
402
- // IPv6 mapped IPv4: ::ffff:192.168.1.1 or ::ffff:c0a8:0101
403
- if (addr.startsWith('::ffff:')) {
404
- // Extract the IPv4 part
405
- const ipv4Part = addr.substring(7);
406
- // Could be dotted (::ffff:192.168.1.1) or hex (::ffff:c0a8:0101)
407
- if (ipv4Part.includes('.')) {
408
- // Parse dotted IPv4
409
- const parts = ipv4Part.split('.');
410
- if (parts.length === 4) {
411
- const octets = parts.map(p => parseInt(p, 10));
412
- if (octets.every(o => !isNaN(o) && o >= 0 && o <= 255)) {
413
- validateIPv4Address(octets);
414
- }
415
- }
416
- }
417
- else {
418
- // Parse hex IPv4 (e.g., c0a80101 = 192.168.1.1)
419
- const hexStr = ipv4Part.replace(/:/g, '');
420
- if (/^[0-9a-f]{1,8}$/.test(hexStr)) {
421
- const num = parseInt(hexStr, 16);
422
- const octets = [
423
- (num >>> 24) & 0xff,
424
- (num >>> 16) & 0xff,
425
- (num >>> 8) & 0xff,
426
- num & 0xff,
427
- ];
428
- validateIPv4Address(octets);
429
- }
430
- }
431
- throw new WebPeelError('Access to IPv6-mapped IPv4 addresses is not allowed');
432
- }
433
- // Unique local addresses: fc00::/7 (fc00:: to fdff::)
434
- if (addr.startsWith('fc') || addr.startsWith('fd')) {
435
- throw new WebPeelError('Access to unique local IPv6 addresses is not allowed');
436
- }
437
- // Link-local: fe80::/10
438
- if (addr.startsWith('fe8') || addr.startsWith('fe9') ||
439
- addr.startsWith('fea') || addr.startsWith('feb')) {
440
- throw new WebPeelError('Access to link-local IPv6 addresses is not allowed');
441
- }
442
- }
443
- /**
444
- * Validate and sanitize user agent string
445
- */
446
- function validateUserAgent(userAgent) {
447
- if (userAgent.length > 500) {
448
- throw new WebPeelError('User agent too long (max 500 characters)');
449
- }
450
- // Allow only printable ASCII characters
451
- if (!/^[\x20-\x7E]*$/.test(userAgent)) {
452
- throw new WebPeelError('User agent contains invalid characters');
453
- }
454
- return userAgent;
455
- }
456
- /**
457
- * Simple HTTP fetch using native fetch + Cheerio
458
- * Fast and lightweight, but can be blocked by Cloudflare/bot detection
459
- * SECURITY: Manual redirect handling with SSRF re-validation
460
- */
461
- export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders, abortSignal, proxy) {
462
- // SECURITY: Validate URL to prevent SSRF
463
- validateUrl(url);
464
- if (abortSignal?.aborted) {
465
- throw createAbortError();
466
- }
467
- // Validate user agent if provided
468
- // SEC.gov requires a User-Agent with contact info (their documented automated access policy)
469
- const hostname = new URL(url).hostname.toLowerCase();
470
- const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
471
- const validatedUserAgent = isSecGov
472
- ? 'WebPeel/1.0 (support@webpeel.dev)'
473
- : (userAgent ? validateUserAgent(userAgent) : getRandomUserAgent());
474
- // SECURITY: Merge custom headers with defaults, block Host header override
475
- const defaultHeaders = {
476
- 'User-Agent': validatedUserAgent,
477
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
478
- 'Accept-Language': 'en-US,en;q=0.9',
479
- 'Accept-Encoding': 'br, gzip, deflate',
480
- 'DNT': '1',
481
- 'Connection': 'keep-alive',
482
- 'Upgrade-Insecure-Requests': '1',
483
- 'Sec-CH-UA': getSecCHUA(validatedUserAgent),
484
- 'Sec-CH-UA-Mobile': '?0',
485
- 'Sec-CH-UA-Platform': getSecCHUAPlatform(validatedUserAgent),
486
- 'Sec-Fetch-Dest': 'document',
487
- 'Sec-Fetch-Mode': 'navigate',
488
- 'Sec-Fetch-Site': 'none',
489
- 'Sec-Fetch-User': '?1',
490
- 'Cache-Control': 'max-age=0',
491
- 'Priority': 'u=0, i',
492
- };
493
- const mergedHeaders = { ...defaultHeaders };
494
- if (customHeaders) {
495
- for (const [key, value] of Object.entries(customHeaders)) {
496
- // SECURITY: Block Host header override
497
- if (key.toLowerCase() === 'host') {
498
- throw new WebPeelError('Custom Host header is not allowed');
499
- }
500
- mergedHeaders[key] = value;
501
- }
502
- }
503
- const MAX_REDIRECTS = 10;
504
- let redirectCount = 0;
505
- let currentUrl = url;
506
- const seenUrls = new Set();
507
- try {
508
- const hostname = new URL(url).hostname;
509
- void resolveAndCache(hostname).catch(() => {
510
- // Best-effort optimization only.
511
- });
512
- }
513
- catch (e) {
514
- // Ignore URL parsing errors here; validation handles invalid input below.
515
- if (process.env.DEBUG)
516
- console.debug('[webpeel]', 'DNS prefetch (initial URL):', e instanceof Error ? e.message : e);
517
- }
518
- while (redirectCount <= MAX_REDIRECTS) {
519
- // Detect redirect loops
520
- if (seenUrls.has(currentUrl)) {
521
- throw new WebPeelError('Redirect loop detected');
522
- }
523
- seenUrls.add(currentUrl);
524
- // Re-validate on each redirect
525
- validateUrl(currentUrl);
526
- const timeoutController = new AbortController();
527
- const timer = setTimeout(() => timeoutController.abort(), timeoutMs);
528
- const signal = abortSignal
529
- ? AbortSignal.any([timeoutController.signal, abortSignal])
530
- : timeoutController.signal;
531
- try {
532
- const requestHeaders = { ...mergedHeaders };
533
- const validators = getConditionalValidators(currentUrl);
534
- if (validators?.etag && !hasHeader(requestHeaders, 'if-none-match')) {
535
- requestHeaders['If-None-Match'] = validators.etag;
536
- }
537
- if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
538
- requestHeaders['If-Modified-Since'] = validators.lastModified;
539
- }
540
- // Use proxy if provided, otherwise use shared connection pool
541
- const dispatcher = proxy ? new ProxyAgent(proxy) : httpPool;
542
- const response = await undiciFetch(currentUrl, {
543
- headers: requestHeaders,
544
- signal,
545
- dispatcher,
546
- redirect: 'manual', // SECURITY: Manual redirect handling
547
- });
548
- clearTimeout(timer);
549
- if (response.status === 304) {
550
- const cachedResult = getCachedResultFor304(currentUrl, url);
551
- if (cachedResult) {
552
- return cachedResult;
553
- }
554
- throw new NetworkError('HTTP 304 received but no cached response is available');
555
- }
556
- // Handle redirects manually
557
- if (response.status >= 300 && response.status < 400) {
558
- const location = response.headers.get('location');
559
- if (!location) {
560
- throw new NetworkError('Redirect response missing Location header');
561
- }
562
- // Resolve relative URLs
563
- currentUrl = new URL(location, currentUrl).href;
564
- try {
565
- const hostname = new URL(currentUrl).hostname;
566
- void resolveAndCache(hostname).catch(() => {
567
- // Best-effort optimization only.
568
- });
569
- }
570
- catch (e) {
571
- // Ignore URL parsing errors here; validation handles invalid input below.
572
- if (process.env.DEBUG)
573
- console.debug('[webpeel]', 'DNS prefetch (redirect URL):', e instanceof Error ? e.message : e);
574
- }
575
- redirectCount++;
576
- continue;
577
- }
578
- if (!response.ok) {
579
- if (response.status === 403 || response.status === 503) {
580
- throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
581
- }
582
- throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`);
583
- }
584
- rememberConditionalValidators(currentUrl, response);
585
- // Content-Type detection
586
- const contentType = response.headers.get('content-type') || '';
587
- const contentTypeLower = contentType.toLowerCase();
588
- const urlLower = currentUrl.toLowerCase();
589
- // Support binary documents (PDF/DOCX) in the simple HTTP path.
590
- const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
591
- const isDocx = contentTypeLower.includes('application/vnd.openxmlformats-officedocument.wordprocessingml.document') || urlLower.endsWith('.docx');
592
- const isBinaryDoc = isPdf || isDocx;
593
- // Accept a wide range of text-based content, plus supported binary documents.
594
- const ALLOWED_TYPES = [
595
- 'text/html', 'application/xhtml+xml',
596
- 'text/plain', 'text/markdown', 'text/csv',
597
- 'application/json', 'text/json',
598
- 'text/xml', 'application/xml', 'application/rss+xml', 'application/atom+xml',
599
- 'application/javascript', 'text/javascript', 'text/css',
600
- // Documents
601
- 'application/pdf',
602
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
603
- ];
604
- const isAllowed = !contentTypeLower ||
605
- ALLOWED_TYPES.some(t => contentTypeLower.includes(t)) ||
606
- // Many servers mislabel docs as octet-stream; allow when URL implies a supported document.
607
- (contentTypeLower.includes('application/octet-stream') && isBinaryDoc);
608
- if (!isAllowed) {
609
- // Check if it's at least text-based
610
- const isTexty = contentTypeLower.startsWith('text/') ||
611
- contentTypeLower.includes('json') ||
612
- contentTypeLower.includes('xml');
613
- if (!isTexty) {
614
- throw new WebPeelError(`Binary content type: ${contentType}. WebPeel handles text-based content and PDF/DOCX documents only.`);
615
- }
616
- }
617
- // SECURITY: Stream response with size limit (prevent memory exhaustion)
618
- const chunks = [];
619
- let totalSize = 0;
620
- const MAX_SIZE = 10 * 1024 * 1024; // 10MB
621
- const reader = response.body?.getReader();
622
- if (!reader) {
623
- throw new NetworkError('Response body is not readable');
624
- }
625
- try {
626
- while (true) {
627
- const { done, value } = await reader.read();
628
- if (done)
629
- break;
630
- totalSize += value.length;
631
- if (totalSize > MAX_SIZE) {
632
- reader.cancel();
633
- throw new WebPeelError('Response too large (max 10MB)');
634
- }
635
- chunks.push(value);
636
- }
637
- }
638
- finally {
639
- reader.releaseLock();
640
- }
641
- // Combine chunks
642
- const combined = new Uint8Array(totalSize);
643
- let offset = 0;
644
- for (const chunk of chunks) {
645
- combined.set(chunk, offset);
646
- offset += chunk.length;
647
- }
648
- const buffer = Buffer.from(combined);
649
- const html = isBinaryDoc ? '' : new TextDecoder().decode(combined);
650
- // For HTML content, check for suspiciously small responses (bot blocks)
651
- // Non-HTML content (JSON, text, XML) can legitimately be short
652
- const isHtmlContent = !isBinaryDoc && (contentTypeLower.includes('html') || contentTypeLower.includes('xhtml'));
653
- if (isHtmlContent && (!html || html.length < 100)) {
654
- throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
655
- }
656
- if (!isBinaryDoc && !html) {
657
- throw new NetworkError('Empty response body');
658
- }
659
- if (isBinaryDoc && buffer.length === 0) {
660
- throw new NetworkError('Empty response body');
661
- }
662
- // Check for Cloudflare challenge (only relevant for HTML)
663
- if (isHtmlContent && (html.includes('cf-browser-verification') || html.includes('Just a moment...'))) {
664
- throw new BlockedError('Cloudflare challenge detected. Try --render for browser mode.');
665
- }
666
- // Run full challenge detection for HTML content
667
- // Note: skip empty-shell type — in simple HTTP mode, SPA shells are expected and
668
- // the caller's escalation logic upgrades to browser/stealth rendering.
669
- if (isHtmlContent) {
670
- const challengeResult = detectChallenge(html, response.status);
671
- if (challengeResult.isChallenge && challengeResult.type !== 'empty-shell') {
672
- throw new BlockedError(`Challenge page detected (${challengeResult.type || 'unknown'}, confidence: ${challengeResult.confidence.toFixed(2)}). ` +
673
- `Site requires human verification. Try a different approach or use a CAPTCHA solving service.`);
674
- }
675
- }
676
- return {
677
- html,
678
- buffer: isBinaryDoc ? buffer : undefined,
679
- url: currentUrl,
680
- statusCode: response.status,
681
- contentType,
682
- };
683
- }
684
- catch (error) {
685
- clearTimeout(timer);
686
- if (error instanceof BlockedError || error instanceof NetworkError || error instanceof WebPeelError) {
687
- throw error;
688
- }
689
- if (error instanceof Error && error.name === 'AbortError') {
690
- if (abortSignal?.aborted && !timeoutController.signal.aborted) {
691
- throw createAbortError();
692
- }
693
- throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
694
- }
695
- // Provide specific error messages based on the actual cause
696
- const cause = error instanceof Error && error.cause;
697
- const causeMsg = cause?.message || cause?.code || '';
698
- if (causeMsg.includes('certificate') || causeMsg.includes('CERT') || causeMsg.includes('SSL') || causeMsg.includes('TLS')) {
699
- throw new NetworkError(`TLS/SSL certificate error for ${new URL(currentUrl).hostname}. The site's certificate may be expired, self-signed, or untrusted.`);
700
- }
701
- if (causeMsg.includes('ENOTFOUND') || causeMsg.includes('getaddrinfo')) {
702
- throw new NetworkError(`DNS resolution failed: ${new URL(currentUrl).hostname} not found. Check the URL or your network connection.`);
703
- }
704
- if (causeMsg.includes('ECONNREFUSED')) {
705
- throw new NetworkError(`Connection refused by ${new URL(currentUrl).hostname}. The server may be down.`);
706
- }
707
- if (causeMsg.includes('ECONNRESET') || causeMsg.includes('EPIPE')) {
708
- throw new NetworkError(`Connection reset by ${new URL(currentUrl).hostname}. Try again or use --render.`);
709
- }
710
- if (causeMsg.includes('ETIMEDOUT') || causeMsg.includes('ENETUNREACH')) {
711
- throw new TimeoutError(`Network unreachable or connection timed out for ${new URL(currentUrl).hostname}.`);
712
- }
713
- const msg = error instanceof Error ? error.message : 'Unknown error';
714
- const causeDetail = causeMsg ? ` (${causeMsg})` : '';
715
- throw new NetworkError(`Failed to fetch: ${msg}${causeDetail}`);
716
- }
717
- }
718
- throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
719
- }
720
- export async function closePool() {
721
- const oldPool = httpPool;
722
- httpPool = createHttpPool();
723
- await oldPool.close().catch(() => { });
724
- }
725
- let sharedBrowser = null;
726
- let sharedStealthBrowser = null;
727
- let activePagesCount = 0;
728
- const MAX_CONCURRENT_PAGES = 5;
729
- const PAGE_POOL_SIZE = 3;
730
- const pooledPages = new Set();
731
- const idlePagePool = [];
732
- let pagePoolFillPromise = null;
733
- function removePooledPage(page) {
734
- pooledPages.delete(page);
735
- const idleIndex = idlePagePool.indexOf(page);
736
- if (idleIndex >= 0) {
737
- idlePagePool.splice(idleIndex, 1);
738
- }
739
- }
740
- function takePooledPage() {
741
- while (idlePagePool.length > 0) {
742
- const page = idlePagePool.shift();
743
- if (page.isClosed()) {
744
- removePooledPage(page);
745
- continue;
746
- }
747
- return page;
748
- }
749
- return null;
750
- }
751
- async function ensurePagePool(browser) {
752
- const activeBrowser = browser ?? sharedBrowser;
753
- if (!activeBrowser || !activeBrowser.isConnected()) {
754
- return;
755
- }
756
- if (pagePoolFillPromise) {
757
- await pagePoolFillPromise;
758
- return;
759
- }
760
- pagePoolFillPromise = (async () => {
761
- while (pooledPages.size < PAGE_POOL_SIZE) {
762
- const pooledPage = await activeBrowser.newPage({
763
- userAgent: getRandomUserAgent(),
764
- viewport: null, // Use browser window size (set via --window-size at launch)
765
- });
766
- await applyStealthScripts(pooledPage);
767
- pooledPages.add(pooledPage);
768
- idlePagePool.push(pooledPage);
769
- }
770
- })().finally(() => {
771
- pagePoolFillPromise = null;
772
- });
773
- await pagePoolFillPromise;
774
- }
775
- async function recyclePooledPage(page) {
776
- if (!pooledPages.has(page)) {
777
- await page.close().catch(() => { });
778
- return;
779
- }
780
- if (page.isClosed()) {
781
- removePooledPage(page);
782
- if (sharedBrowser?.isConnected()) {
783
- void ensurePagePool(sharedBrowser).catch(() => { });
784
- }
785
- return;
786
- }
787
- try {
788
- await page.unroute('**/*').catch(() => { });
789
- await page.context().clearCookies().catch(() => { });
790
- await page.setExtraHTTPHeaders({});
791
- await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 }).catch(() => { });
792
- if (!idlePagePool.includes(page)) {
793
- idlePagePool.push(page);
794
- }
795
- }
796
- catch (e) {
797
- // Non-fatal: page reset failed, removing from pool and closing
798
- if (process.env.DEBUG)
799
- console.debug('[webpeel]', 'page reset failed:', e instanceof Error ? e.message : e);
800
- removePooledPage(page);
801
- await page.close().catch(() => { });
802
- }
803
- if (sharedBrowser?.isConnected() && pooledPages.size < PAGE_POOL_SIZE) {
804
- void ensurePagePool(sharedBrowser).catch(() => { });
805
- }
806
- }
807
- export async function warmup() {
808
- startDnsWarmup();
809
- const browser = await getBrowser();
810
- await ensurePagePool(browser);
811
- }
812
- async function getBrowser() {
813
- // SECURITY: Check if browser is still connected and healthy
814
- if (sharedBrowser) {
815
- try {
816
- if (sharedBrowser.isConnected()) {
817
- if (pooledPages.size < PAGE_POOL_SIZE) {
818
- void ensurePagePool(sharedBrowser).catch(() => { });
819
- }
820
- return sharedBrowser;
821
- }
822
- }
823
- catch (e) {
824
- // Browser is dead, recreate
825
- if (process.env.DEBUG)
826
- console.debug('[webpeel]', 'shared browser health check failed, recreating:', e instanceof Error ? e.message : e);
827
- sharedBrowser = null;
828
- }
829
- }
830
- pooledPages.clear();
831
- idlePagePool.length = 0;
832
- pagePoolFillPromise = null;
833
- const vp = getRandomViewport();
834
- const pw = await getPlaywright();
835
- sharedBrowser = await pw.launch({
836
- headless: true,
837
- args: [...ANTI_DETECTION_ARGS, `--window-size=${vp.width},${vp.height}`],
838
- });
839
- void ensurePagePool(sharedBrowser).catch(() => { });
840
- return sharedBrowser;
841
- }
842
- async function getStealthBrowser() {
843
- // SECURITY: Check if stealth browser is still connected and healthy
844
- if (sharedStealthBrowser) {
845
- try {
846
- if (sharedStealthBrowser.isConnected()) {
847
- return sharedStealthBrowser;
848
- }
849
- }
850
- catch (e) {
851
- // Browser is dead, recreate
852
- if (process.env.DEBUG)
853
- console.debug('[webpeel]', 'stealth browser health check failed, recreating:', e instanceof Error ? e.message : e);
854
- sharedStealthBrowser = null;
855
- }
856
- }
857
- const stealthVp = getRandomViewport();
858
- const stealthPw = await getStealthPlaywright();
859
- const stealthBrowser = await stealthPw.launch({
860
- headless: true,
861
- args: [...ANTI_DETECTION_ARGS, `--window-size=${stealthVp.width},${stealthVp.height}`],
862
- });
863
- if (!stealthBrowser)
864
- throw new Error('Failed to launch stealth browser');
865
- sharedStealthBrowser = stealthBrowser;
866
- return stealthBrowser;
867
- }
868
- // ── Persistent profile browser instances ─────────────────────────────────────
869
- // Profile browsers are NOT shared — each profileDir gets its own instance.
870
- // These are keyed by profile path and kept alive between fetches in the same process.
871
- const profileBrowsers = new Map();
872
- /**
873
- * Get or create a browser instance with a persistent user data directory.
874
- * Profile browsers bypass the shared browser pool so cookies/sessions survive
875
- * between fetch calls.
2
+ * Core fetching thin re-export layer for backward compatibility.
876
3
  *
877
- * @param profileDir Absolute path to the Chrome user-data-dir directory
878
- * @param headed Whether to launch in headed (visible) mode
879
- * @param stealth Whether to use playwright-extra stealth instead of plain chromium
880
- */
881
- async function getProfileBrowser(profileDir, headed = false, stealth = false) {
882
- const existing = profileBrowsers.get(profileDir);
883
- if (existing) {
884
- try {
885
- if (existing.isConnected())
886
- return existing;
887
- }
888
- catch (e) {
889
- // Profile browser is dead, recreate
890
- if (process.env.DEBUG)
891
- console.debug('[webpeel]', 'profile browser health check failed, recreating:', e instanceof Error ? e.message : e);
892
- }
893
- profileBrowsers.delete(profileDir);
894
- }
895
- const profileVp = getRandomViewport();
896
- const launchOptions = {
897
- headless: !headed,
898
- args: [
899
- ...ANTI_DETECTION_ARGS,
900
- `--window-size=${profileVp.width},${profileVp.height}`,
901
- `--user-data-dir=${profileDir}`,
902
- ],
903
- };
904
- const launched = stealth
905
- ? await (await getStealthPlaywright()).launch(launchOptions)
906
- : await (await getPlaywright()).launch(launchOptions);
907
- if (!launched)
908
- throw new Error('Failed to launch profile browser');
909
- profileBrowsers.set(profileDir, launched);
910
- return launched;
911
- }
912
- /**
913
- * Fetch using headless Chromium via Playwright
914
- * Slower but can handle JavaScript-heavy sites and bypass some bot detection
915
- */
916
- export async function browserFetch(url, options = {}) {
917
- // SECURITY: Validate URL to prevent SSRF
918
- validateUrl(url);
919
- const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, } = options;
920
- // Validate user agent if provided
921
- // In stealth mode with no custom UA, always use a realistic Chrome UA
922
- const validatedUserAgent = userAgent
923
- ? validateUserAgent(userAgent)
924
- : (stealth ? getRealisticUserAgent() : getRandomUserAgent());
925
- // Validate wait time
926
- if (waitMs < 0 || waitMs > 60000) {
927
- throw new WebPeelError('Wait time must be between 0 and 60000ms');
928
- }
929
- if (signal?.aborted) {
930
- throw createAbortError();
931
- }
932
- // SECURITY: Validate custom headers if provided
933
- if (headers) {
934
- for (const [key, value] of Object.entries(headers)) {
935
- // Block Host header override
936
- if (key.toLowerCase() === 'host') {
937
- throw new WebPeelError('Custom Host header is not allowed');
938
- }
939
- if (typeof value !== 'string' || value.length > 500) {
940
- throw new WebPeelError('Invalid header value');
941
- }
942
- }
943
- }
944
- // SECURITY: Limit concurrent browser pages with timeout
945
- const queueStartTime = Date.now();
946
- const QUEUE_TIMEOUT_MS = 30000; // 30 second max wait
947
- while (activePagesCount >= MAX_CONCURRENT_PAGES) {
948
- if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
949
- throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
950
- }
951
- await new Promise(resolve => setTimeout(resolve, 100));
952
- }
953
- activePagesCount++;
954
- let page = null;
955
- let usingPooledPage = false;
956
- let abortHandler;
957
- // Declared here (outside try) so the finally block can reference it
958
- const usingProfileBrowser = !!profileDir;
959
- // Owned context created when storageState injection is requested
960
- let ownedContext;
961
- try {
962
- const browser = usingProfileBrowser
963
- ? await getProfileBrowser(profileDir, headed, stealth)
964
- : stealth
965
- ? await getStealthBrowser()
966
- : await getBrowser();
967
- // Only use the shared page pool for non-stealth, non-profile, non-keepOpen, non-storageState, non-proxy fetches
968
- const shouldUsePagePool = !stealth && !userAgent && !keepPageOpen && !usingProfileBrowser && !storageState && !proxy;
969
- if (shouldUsePagePool) {
970
- page = takePooledPage();
971
- usingPooledPage = !!page;
972
- if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
973
- void ensurePagePool(browser).catch(() => { });
974
- }
975
- }
976
- if (!page) {
977
- const fetchVp = getRandomViewport();
978
- const pageOptions = {
979
- userAgent: validatedUserAgent,
980
- // viewport: null lets the browser use its natural window size (set via --window-size),
981
- // avoiding the telltale Playwright default of 1280×720.
982
- viewport: null,
983
- ...(stealth
984
- ? {
985
- locale: 'en-US',
986
- timezoneId: 'America/New_York',
987
- javaScriptEnabled: true,
988
- }
989
- : {}),
990
- };
991
- if (proxy) {
992
- // Parse proxy URL to extract auth credentials for Playwright
993
- let playwrightProxy;
994
- try {
995
- const proxyUrl = new URL(proxy);
996
- playwrightProxy = {
997
- server: `${proxyUrl.protocol}//${proxyUrl.host}`,
998
- username: proxyUrl.username || undefined,
999
- password: proxyUrl.password || undefined,
1000
- };
1001
- }
1002
- catch (e) {
1003
- // Fallback: use proxy string as-is
1004
- if (process.env.DEBUG)
1005
- console.debug('[webpeel]', 'proxy URL parse failed, using as-is:', e instanceof Error ? e.message : e);
1006
- playwrightProxy = { server: proxy };
1007
- }
1008
- // Create an isolated context with the proxy and optional storageState
1009
- ownedContext = await browser.newContext({
1010
- ...pageOptions,
1011
- proxy: playwrightProxy,
1012
- viewport: { width: fetchVp.width, height: fetchVp.height },
1013
- ...(storageState ? { storageState } : {}),
1014
- });
1015
- page = await ownedContext.newPage();
1016
- }
1017
- else if (storageState) {
1018
- // Create an isolated context with the injected storage state (cookies + localStorage)
1019
- ownedContext = await browser.newContext({
1020
- ...pageOptions,
1021
- storageState,
1022
- viewport: { width: fetchVp.width, height: fetchVp.height },
1023
- });
1024
- page = await ownedContext.newPage();
1025
- }
1026
- else {
1027
- page = await browser.newPage(pageOptions);
1028
- }
1029
- await applyStealthScripts(page);
1030
- usingPooledPage = false;
1031
- }
1032
- else {
1033
- await page.setViewportSize({ width: 1280, height: 720 }).catch(() => { });
1034
- }
1035
- if (signal) {
1036
- abortHandler = () => {
1037
- if (page && !page.isClosed()) {
1038
- void page.close().catch(() => { });
1039
- }
1040
- };
1041
- signal.addEventListener('abort', abortHandler, { once: true });
1042
- }
1043
- await page.unroute('**/*').catch(() => { });
1044
- const mergedHeaders = { ...(headers || {}) };
1045
- if (usingPooledPage) {
1046
- mergedHeaders['User-Agent'] = validatedUserAgent;
1047
- }
1048
- if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
1049
- await page.setExtraHTTPHeaders(mergedHeaders);
1050
- }
1051
- // Set cookies if provided
1052
- if (cookies && cookies.length > 0) {
1053
- const parsedCookies = cookies.map(cookie => {
1054
- const [nameValue] = cookie.split(';').map(s => s.trim());
1055
- const [name, value] = nameValue.split('=');
1056
- if (!name || value === undefined) {
1057
- throw new WebPeelError(`Invalid cookie format: ${cookie}`);
1058
- }
1059
- return {
1060
- name: name.trim(),
1061
- value: value.trim(),
1062
- url,
1063
- };
1064
- });
1065
- await page.context().addCookies(parsedCookies);
1066
- }
1067
- if (signal?.aborted) {
1068
- throw createAbortError();
1069
- }
1070
- // Block images/fonts/etc for speed in non-stealth mode.
1071
- // In stealth mode, blocking common resources can be a bot-detection signal.
1072
- if (!screenshot && !stealth) {
1073
- await page.route('**/*', (route) => {
1074
- const resourceType = route.request().resourceType();
1075
- if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
1076
- route.abort();
1077
- }
1078
- else {
1079
- route.continue();
1080
- }
1081
- });
1082
- }
1083
- else {
1084
- // For screenshots and stealth mode, allow all resources
1085
- await page.route('**/*', (route) => route.continue());
1086
- }
1087
- // SECURITY: Wrap entire operation in timeout
1088
- let screenshotBuffer;
1089
- const throwIfAborted = () => {
1090
- if (signal?.aborted) {
1091
- throw createAbortError();
1092
- }
1093
- };
1094
- const fetchPromise = (async () => {
1095
- const response = await page.goto(url, {
1096
- waitUntil: 'domcontentloaded',
1097
- timeout: timeoutMs,
1098
- });
1099
- throwIfAborted();
1100
- // Quick check: if body text is very thin, wait for JS to render more content.
1101
- // Only adds latency when the page clearly hasn't loaded yet.
1102
- // eslint-disable-next-line @typescript-eslint/no-implied-eval
1103
- const bodyTextLength = await page.evaluate('document.body?.innerText?.trim().length || 0').catch(() => 0);
1104
- if (bodyTextLength < 500) {
1105
- await page.waitForLoadState('networkidle', { timeout: 1500 }).catch(() => { });
1106
- throwIfAborted();
1107
- }
1108
- // DOM stability check: wait for SPA hydration to settle.
1109
- // Polls innerText length every 500ms — if still growing, keep waiting (max 3s extra).
1110
- {
1111
- const stabilityStart = Date.now();
1112
- const MAX_STABILITY_WAIT_MS = 3000;
1113
- const POLL_INTERVAL_MS = 500;
1114
- let prevLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
1115
- let stableCount = 0;
1116
- while (Date.now() - stabilityStart < MAX_STABILITY_WAIT_MS) {
1117
- throwIfAborted();
1118
- await page.waitForTimeout(POLL_INTERVAL_MS);
1119
- const curLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
1120
- if (curLength === prevLength) {
1121
- stableCount++;
1122
- if (stableCount >= 2)
1123
- break; // stable for 2 consecutive checks (~1s)
1124
- }
1125
- else {
1126
- stableCount = 0;
1127
- }
1128
- prevLength = curLength;
1129
- }
1130
- }
1131
- const finalUrl = page.url();
1132
- const contentType = response?.headers()?.['content-type'] || '';
1133
- const contentTypeLower = contentType.toLowerCase();
1134
- const urlLower = finalUrl.toLowerCase();
1135
- const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
1136
- const isDocx = contentTypeLower.includes('wordprocessingml.document') || urlLower.endsWith('.docx');
1137
- const isBinaryDoc = !!response && (isPdf || isDocx);
1138
- // Small randomized delay in stealth mode (simulate human behavior)
1139
- // Keep it short — enough to look human, not enough to kill latency
1140
- if (stealth) {
1141
- const extraDelayMs = 200 + Math.floor(Math.random() * 601);
1142
- await page.waitForTimeout(extraDelayMs);
1143
- throwIfAborted();
1144
- }
1145
- // Wait for additional time if requested (for dynamic content / screenshots)
1146
- if (waitMs > 0) {
1147
- await page.waitForTimeout(waitMs);
1148
- throwIfAborted();
1149
- }
1150
- // Execute page actions if provided
1151
- if (actions && actions.length > 0) {
1152
- const { executeActions } = await import('./actions.js');
1153
- const actionScreenshot = await executeActions(page, actions);
1154
- if (actionScreenshot) {
1155
- screenshotBuffer = actionScreenshot;
1156
- }
1157
- throwIfAborted();
1158
- }
1159
- // If the navigation returned a binary document (PDF/DOCX), grab the raw body.
1160
- if (isBinaryDoc) {
1161
- const buffer = await response.body();
1162
- throwIfAborted();
1163
- // Capture screenshot if requested (and not already captured by actions)
1164
- if (screenshot && !screenshotBuffer) {
1165
- screenshotBuffer = await page.screenshot({
1166
- fullPage: screenshotFullPage,
1167
- type: 'png',
1168
- });
1169
- }
1170
- return {
1171
- html: '',
1172
- finalUrl,
1173
- buffer,
1174
- contentType,
1175
- statusCode: response.status(),
1176
- };
1177
- }
1178
- const html = await page.content();
1179
- throwIfAborted();
1180
- return {
1181
- html,
1182
- finalUrl,
1183
- contentType,
1184
- statusCode: response?.status(),
1185
- };
1186
- })();
1187
- let operationTimeout;
1188
- const timeoutPromise = new Promise((_, reject) => {
1189
- operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
1190
- });
1191
- const fetchData = await Promise.race([fetchPromise, timeoutPromise]);
1192
- if (operationTimeout) {
1193
- clearTimeout(operationTimeout);
1194
- }
1195
- const { html, finalUrl } = fetchData;
1196
- const fetchBuffer = 'buffer' in fetchData ? fetchData.buffer : undefined;
1197
- const fetchContentType = 'contentType' in fetchData ? fetchData.contentType : undefined;
1198
- const fetchStatusCode = 'statusCode' in fetchData ? fetchData.statusCode : undefined;
1199
- const isBinaryDoc = !!fetchBuffer;
1200
- // SECURITY: Limit HTML size (skip for binary documents where html is empty)
1201
- if (!isBinaryDoc) {
1202
- if (html.length > 10 * 1024 * 1024) { // 10MB limit
1203
- throw new WebPeelError('Response too large (max 10MB)');
1204
- }
1205
- if (!html || html.length < 100) {
1206
- throw new BlockedError('Empty or suspiciously small response from browser.');
1207
- }
1208
- // Run challenge detection on browser-fetched HTML (covers both regular and stealth modes)
1209
- // Note: skip empty-shell type — that's a rendering quality issue (SPA needs more JS time),
1210
- // not a bot challenge. The caller's escalation logic handles empty-shell separately.
1211
- const browserChallengeResult = detectChallenge(html, fetchStatusCode);
1212
- if (browserChallengeResult.isChallenge && browserChallengeResult.type !== 'empty-shell') {
1213
- throw new BlockedError(`Challenge page detected (${browserChallengeResult.type || 'unknown'}, confidence: ${browserChallengeResult.confidence.toFixed(2)}). ` +
1214
- `Site requires human verification. Try a different approach or use a CAPTCHA solving service.`);
1215
- }
1216
- }
1217
- // Capture screenshot if requested (and not already captured by actions or document handler)
1218
- if (screenshot && !screenshotBuffer) {
1219
- screenshotBuffer = await page.screenshot({
1220
- fullPage: screenshotFullPage,
1221
- type: 'png'
1222
- });
1223
- }
1224
- // If keepPageOpen, return page/browser for caller to use (e.g., branding extraction)
1225
- if (keepPageOpen && page) {
1226
- return {
1227
- html,
1228
- buffer: fetchBuffer,
1229
- url: finalUrl,
1230
- statusCode: fetchStatusCode,
1231
- contentType: fetchContentType,
1232
- screenshot: screenshotBuffer,
1233
- page,
1234
- browser,
1235
- };
1236
- }
1237
- return {
1238
- html,
1239
- buffer: fetchBuffer,
1240
- url: finalUrl,
1241
- statusCode: fetchStatusCode,
1242
- contentType: fetchContentType,
1243
- screenshot: screenshotBuffer,
1244
- };
1245
- }
1246
- catch (error) {
1247
- if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
1248
- throw error;
1249
- }
1250
- if (error instanceof Error && error.name === 'AbortError') {
1251
- throw error;
1252
- }
1253
- if (error instanceof Error && error.message.includes('Timeout')) {
1254
- throw new TimeoutError(`Browser navigation timed out`);
1255
- }
1256
- throw new NetworkError(`Browser fetch failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
1257
- }
1258
- finally {
1259
- if (signal && abortHandler) {
1260
- signal.removeEventListener('abort', abortHandler);
1261
- }
1262
- // CRITICAL: Always release/close page and decrement counter (unless keepPageOpen and no error)
1263
- if (page && !keepPageOpen) {
1264
- if (usingPooledPage) {
1265
- await recyclePooledPage(page);
1266
- }
1267
- else if (ownedContext) {
1268
- // Close the owned context (also closes the page)
1269
- await ownedContext.close().catch(() => { });
1270
- }
1271
- else if (!usingProfileBrowser) {
1272
- // Profile browser pages are NOT closed — the profile browser stays alive
1273
- // so that the next fetch in the same process reuses the session.
1274
- await page.close().catch(() => { });
1275
- }
1276
- }
1277
- activePagesCount--;
1278
- }
1279
- }
1280
- /**
1281
- * Retry a fetch operation with exponential backoff
1282
- */
1283
- export async function browserScreenshot(url, options = {}) {
1284
- // SECURITY: Validate URL to prevent SSRF
1285
- validateUrl(url);
1286
- const { fullPage = false, width, height, format = 'png', quality, waitMs = 0, timeoutMs = 30000, userAgent, headers, cookies, stealth = false, actions, } = options;
1287
- const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
1288
- // Basic validation
1289
- if (waitMs < 0 || waitMs > 60000) {
1290
- throw new WebPeelError('Wait time must be between 0 and 60000ms');
1291
- }
1292
- if (timeoutMs < 1000 || timeoutMs > 120000) {
1293
- throw new WebPeelError('Timeout must be between 1000 and 120000ms');
1294
- }
1295
- if (width !== undefined && (!Number.isFinite(width) || width < 100 || width > 5000)) {
1296
- throw new WebPeelError('Width must be between 100 and 5000');
1297
- }
1298
- if (height !== undefined && (!Number.isFinite(height) || height < 100 || height > 5000)) {
1299
- throw new WebPeelError('Height must be between 100 and 5000');
1300
- }
1301
- if (format !== 'png' && format !== 'jpeg') {
1302
- throw new WebPeelError('Format must be png or jpeg');
1303
- }
1304
- if (format === 'jpeg' && quality !== undefined) {
1305
- if (!Number.isFinite(quality) || quality < 1 || quality > 100) {
1306
- throw new WebPeelError('JPEG quality must be between 1 and 100');
1307
- }
1308
- }
1309
- // SECURITY: Validate custom headers if provided
1310
- if (headers) {
1311
- for (const [key, value] of Object.entries(headers)) {
1312
- if (key.toLowerCase() === 'host') {
1313
- throw new WebPeelError('Custom Host header is not allowed');
1314
- }
1315
- if (typeof value !== 'string' || value.length > 500) {
1316
- throw new WebPeelError('Invalid header value');
1317
- }
1318
- }
1319
- }
1320
- // SECURITY: Limit concurrent browser pages with timeout
1321
- const queueStartTime = Date.now();
1322
- const QUEUE_TIMEOUT_MS = 30000;
1323
- while (activePagesCount >= MAX_CONCURRENT_PAGES) {
1324
- if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
1325
- throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
1326
- }
1327
- await new Promise(resolve => setTimeout(resolve, 100));
1328
- }
1329
- activePagesCount++;
1330
- let page = null;
1331
- let usingPooledPage = false;
1332
- try {
1333
- const browser = stealth ? await getStealthBrowser() : await getBrowser();
1334
- const shouldUsePagePool = !stealth && !userAgent;
1335
- if (shouldUsePagePool) {
1336
- page = takePooledPage();
1337
- usingPooledPage = !!page;
1338
- if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
1339
- void ensurePagePool(browser).catch(() => { });
1340
- }
1341
- }
1342
- if (!page) {
1343
- page = await browser.newPage({
1344
- userAgent: validatedUserAgent,
1345
- viewport: width || height ? {
1346
- width: width || 1280,
1347
- height: height || 720,
1348
- } : null, // Use browser window size when no explicit dimensions requested
1349
- });
1350
- await applyStealthScripts(page);
1351
- usingPooledPage = false;
1352
- }
1353
- else {
1354
- await page.setViewportSize({
1355
- width: width || 1280,
1356
- height: height || 720,
1357
- }).catch(() => { });
1358
- }
1359
- await page.unroute('**/*').catch(() => { });
1360
- const mergedHeaders = { ...(headers || {}) };
1361
- if (usingPooledPage) {
1362
- mergedHeaders['User-Agent'] = validatedUserAgent;
1363
- }
1364
- if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
1365
- await page.setExtraHTTPHeaders(mergedHeaders);
1366
- }
1367
- if (cookies && cookies.length > 0) {
1368
- const parsedCookies = cookies.map(cookie => {
1369
- const [nameValue] = cookie.split(';').map(s => s.trim());
1370
- const [name, value] = nameValue.split('=');
1371
- if (!name || value === undefined) {
1372
- throw new WebPeelError(`Invalid cookie format: ${cookie}`);
1373
- }
1374
- return {
1375
- name: name.trim(),
1376
- value: value.trim(),
1377
- url,
1378
- };
1379
- });
1380
- await page.context().addCookies(parsedCookies);
1381
- }
1382
- // For screenshots, allow all resources
1383
- await page.route('**/*', (route) => route.continue());
1384
- let screenshotBuffer;
1385
- const doWork = (async () => {
1386
- await page.goto(url, {
1387
- waitUntil: 'domcontentloaded',
1388
- timeout: timeoutMs,
1389
- });
1390
- if (waitMs > 0) {
1391
- await page.waitForTimeout(waitMs);
1392
- }
1393
- if (actions && actions.length > 0) {
1394
- const { executeActions } = await import('./actions.js');
1395
- const actionScreenshot = await executeActions(page, actions, {
1396
- fullPage,
1397
- type: format,
1398
- quality,
1399
- });
1400
- if (actionScreenshot) {
1401
- screenshotBuffer = actionScreenshot;
1402
- }
1403
- }
1404
- const finalUrl = page.url();
1405
- // Capture screenshot if not captured via actions
1406
- if (!screenshotBuffer) {
1407
- screenshotBuffer = await page.screenshot({
1408
- fullPage,
1409
- type: format,
1410
- ...(format === 'jpeg' && typeof quality === 'number' ? { quality } : {}),
1411
- });
1412
- }
1413
- return { finalUrl, screenshotBuffer: screenshotBuffer };
1414
- })();
1415
- let operationTimeout;
1416
- const timeoutPromise = new Promise((_, reject) => {
1417
- operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
1418
- });
1419
- const { finalUrl, screenshotBuffer: buf } = await Promise.race([doWork, timeoutPromise]);
1420
- if (operationTimeout) {
1421
- clearTimeout(operationTimeout);
1422
- }
1423
- return { buffer: buf, finalUrl };
1424
- }
1425
- catch (error) {
1426
- if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
1427
- throw error;
1428
- }
1429
- if (error instanceof Error && error.message.includes('Timeout')) {
1430
- throw new TimeoutError('Browser screenshot timed out');
1431
- }
1432
- throw new NetworkError(`Browser screenshot failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
1433
- }
1434
- finally {
1435
- if (page) {
1436
- if (usingPooledPage) {
1437
- await recyclePooledPage(page);
1438
- }
1439
- else {
1440
- await page.close().catch(() => { });
1441
- }
1442
- }
1443
- activePagesCount--;
1444
- }
1445
- }
1446
- export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
1447
- let lastError = null;
1448
- for (let attempt = 1; attempt <= maxAttempts; attempt++) {
1449
- try {
1450
- return await fn();
1451
- }
1452
- catch (error) {
1453
- lastError = error instanceof Error ? error : new Error('Unknown error');
1454
- // Don't retry on blocked errors or timeouts
1455
- if (error instanceof BlockedError || error instanceof TimeoutError) {
1456
- throw error;
1457
- }
1458
- if (attempt < maxAttempts) {
1459
- const delay = baseDelayMs * Math.pow(2, attempt - 1);
1460
- await new Promise((resolve) => setTimeout(resolve, delay));
1461
- }
1462
- }
1463
- }
1464
- throw lastError || new NetworkError('Retry failed');
1465
- }
1466
- /**
1467
- * Scroll to the bottom of the page N times, waiting for the network to
1468
- * settle between each scroll. Useful for triggering lazy-loaded content
1469
- * (infinite scroll, deferred images, etc.).
1470
- *
1471
- * @param page - Playwright Page instance.
1472
- * @param times - Number of scroll-and-wait cycles (default: 3).
1473
- * @returns The final page HTML after all scrolls complete.
1474
- */
1475
- export async function scrollAndWait(page, times = 3) {
1476
- for (let i = 0; i < times; i++) {
1477
- // eslint-disable-next-line @typescript-eslint/no-implied-eval
1478
- await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
1479
- // Wait for network to settle (500 ms of no new requests) or 2 s max.
1480
- try {
1481
- await page.waitForLoadState('networkidle', { timeout: 2000 });
1482
- }
1483
- catch (e) {
1484
- // networkidle may never fire — fall back to a flat delay.
1485
- if (process.env.DEBUG)
1486
- console.debug('[webpeel]', 'networkidle timeout, falling back to flat delay:', e instanceof Error ? e.message : e);
1487
- await page.waitForTimeout(1000);
1488
- }
1489
- }
1490
- return page.content();
1491
- }
1492
- /**
1493
- * Clean up browser resources (shared pool, stealth browser, and all profile browsers).
1494
- */
1495
- export async function cleanup() {
1496
- const pagesToClose = Array.from(pooledPages);
1497
- pooledPages.clear();
1498
- idlePagePool.length = 0;
1499
- pagePoolFillPromise = null;
1500
- await Promise.all(pagesToClose.map((page) => page.close().catch(() => { })));
1501
- if (sharedBrowser) {
1502
- await sharedBrowser.close();
1503
- sharedBrowser = null;
1504
- }
1505
- if (sharedStealthBrowser) {
1506
- await sharedStealthBrowser.close();
1507
- sharedStealthBrowser = null;
1508
- }
1509
- // Close all persistent profile browsers
1510
- const profileBrowserList = Array.from(profileBrowsers.values());
1511
- profileBrowsers.clear();
1512
- await Promise.all(profileBrowserList.map(b => b.close().catch(() => { })));
1513
- await closePool().catch(() => { });
1514
- }
1515
- /**
1516
- * Close a specific persistent profile browser (e.g. when done with a session).
1517
- * Safe to call even if the browser has already been closed.
1518
- *
1519
- * @param profileDir Path to the profile directory used when launching
1520
- */
1521
- export async function closeProfileBrowser(profileDir) {
1522
- const browser = profileBrowsers.get(profileDir);
1523
- if (browser) {
1524
- profileBrowsers.delete(profileDir);
1525
- await browser.close().catch(() => { });
1526
- }
1527
- }
4
+ * The implementation has been split into focused modules:
5
+ * - http-fetch.ts — Pure HTTP fetching (simpleFetch, SSRF validation, HTTP pool)
6
+ * - browser-pool.ts Browser lifecycle & page pool (getBrowser, cleanup, warmup)
7
+ * - browser-fetch.ts — Browser-based fetching (browserFetch, browserScreenshot)
8
+ */
9
+ // Re-export everything for backward compatibility
10
+ export { simpleFetch } from './http-fetch.js';
11
+ export { cleanup, warmup, closePool, closeProfileBrowser, playwrightLoaded } from './browser-pool.js';
12
+ export { browserFetch, browserScreenshot, retryFetch, scrollAndWait } from './browser-fetch.js';
1528
13
  //# sourceMappingURL=fetcher.js.map