webpeel 0.14.2 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/dist/cache.d.ts.map +1 -1
  2. package/dist/cache.js +11 -4
  3. package/dist/cache.js.map +1 -1
  4. package/dist/cli.bundle.cjs +159248 -0
  5. package/dist/cli.js +1 -1
  6. package/dist/cli.js.map +1 -1
  7. package/dist/core/agent.js +12 -8
  8. package/dist/core/agent.js.map +1 -1
  9. package/dist/core/application-tracker.js +3 -2
  10. package/dist/core/application-tracker.js.map +1 -1
  11. package/dist/core/auto-extract.js +6 -4
  12. package/dist/core/auto-extract.js.map +1 -1
  13. package/dist/core/browser-fetch.d.ts +90 -0
  14. package/dist/core/browser-fetch.d.ts.map +1 -0
  15. package/dist/core/browser-fetch.js +599 -0
  16. package/dist/core/browser-fetch.js.map +1 -0
  17. package/dist/core/browser-pool.d.ts +70 -0
  18. package/dist/core/browser-pool.d.ts.map +1 -0
  19. package/dist/core/browser-pool.js +378 -0
  20. package/dist/core/browser-pool.js.map +1 -0
  21. package/dist/core/change-tracking.js +3 -2
  22. package/dist/core/change-tracking.js.map +1 -1
  23. package/dist/core/diff.js +3 -2
  24. package/dist/core/diff.js.map +1 -1
  25. package/dist/core/domain-extractors.js +3 -2
  26. package/dist/core/domain-extractors.js.map +1 -1
  27. package/dist/core/extract-inline.js +6 -4
  28. package/dist/core/extract-inline.js.map +1 -1
  29. package/dist/core/fetcher.d.ts +9 -116
  30. package/dist/core/fetcher.d.ts.map +1 -1
  31. package/dist/core/fetcher.js +10 -1484
  32. package/dist/core/fetcher.js.map +1 -1
  33. package/dist/core/http-fetch.d.ts +37 -0
  34. package/dist/core/http-fetch.d.ts.map +1 -0
  35. package/dist/core/http-fetch.js +618 -0
  36. package/dist/core/http-fetch.js.map +1 -0
  37. package/dist/core/metadata.js +18 -12
  38. package/dist/core/metadata.js.map +1 -1
  39. package/dist/core/pipeline.d.ts +104 -0
  40. package/dist/core/pipeline.d.ts.map +1 -0
  41. package/dist/core/pipeline.js +623 -0
  42. package/dist/core/pipeline.js.map +1 -0
  43. package/dist/core/profiles.js +15 -10
  44. package/dist/core/profiles.js.map +1 -1
  45. package/dist/core/quick-answer.d.ts.map +1 -1
  46. package/dist/core/quick-answer.js +120 -9
  47. package/dist/core/quick-answer.js.map +1 -1
  48. package/dist/core/rate-governor.js +3 -2
  49. package/dist/core/rate-governor.js.map +1 -1
  50. package/dist/core/readability.d.ts.map +1 -1
  51. package/dist/core/readability.js +19 -6
  52. package/dist/core/readability.js.map +1 -1
  53. package/dist/core/research.js +9 -6
  54. package/dist/core/research.js.map +1 -1
  55. package/dist/core/search-provider.js +12 -8
  56. package/dist/core/search-provider.js.map +1 -1
  57. package/dist/core/strategies.d.ts.map +1 -1
  58. package/dist/core/strategies.js +14 -5
  59. package/dist/core/strategies.js.map +1 -1
  60. package/dist/core/timing.d.ts +22 -0
  61. package/dist/core/timing.d.ts.map +1 -0
  62. package/dist/core/timing.js +34 -0
  63. package/dist/core/timing.js.map +1 -0
  64. package/dist/core/youtube.d.ts.map +1 -1
  65. package/dist/core/youtube.js +19 -6
  66. package/dist/core/youtube.js.map +1 -1
  67. package/dist/index.d.ts +1 -0
  68. package/dist/index.d.ts.map +1 -1
  69. package/dist/index.js +13 -444
  70. package/dist/index.js.map +1 -1
  71. package/dist/mcp/server.js +1 -1
  72. package/dist/mcp/server.js.map +1 -1
  73. package/dist/server/middleware/auth.js +3 -2
  74. package/dist/server/middleware/auth.js.map +1 -1
  75. package/dist/server/routes/answer.d.ts.map +1 -1
  76. package/dist/server/routes/answer.js +5 -0
  77. package/dist/server/routes/answer.js.map +1 -1
  78. package/dist/server/routes/compat.js +3 -2
  79. package/dist/server/routes/compat.js.map +1 -1
  80. package/dist/server/routes/deep-fetch.d.ts.map +1 -1
  81. package/dist/server/routes/deep-fetch.js +5 -0
  82. package/dist/server/routes/deep-fetch.js.map +1 -1
  83. package/dist/server/routes/fetch.d.ts.map +1 -1
  84. package/dist/server/routes/fetch.js +44 -4
  85. package/dist/server/routes/fetch.js.map +1 -1
  86. package/dist/server/routes/health.js +3 -2
  87. package/dist/server/routes/health.js.map +1 -1
  88. package/dist/server/routes/mcp.js +1 -1
  89. package/dist/server/routes/mcp.js.map +1 -1
  90. package/dist/server/routes/quick-answer.d.ts.map +1 -1
  91. package/dist/server/routes/quick-answer.js +5 -0
  92. package/dist/server/routes/quick-answer.js.map +1 -1
  93. package/dist/server/routes/search.js +6 -4
  94. package/dist/server/routes/search.js.map +1 -1
  95. package/dist/server/routes/users.js +3 -2
  96. package/dist/server/routes/users.js.map +1 -1
  97. package/dist/server/routes/webhooks.d.ts +1 -0
  98. package/dist/server/routes/webhooks.d.ts.map +1 -1
  99. package/dist/server/routes/webhooks.js +1 -0
  100. package/dist/server/routes/webhooks.js.map +1 -1
  101. package/dist/server/routes/youtube.d.ts.map +1 -1
  102. package/dist/server/routes/youtube.js +5 -0
  103. package/dist/server/routes/youtube.js.map +1 -1
  104. package/dist/types.d.ts +2 -0
  105. package/dist/types.d.ts.map +1 -1
  106. package/dist/types.js.map +1 -1
  107. package/package.json +5 -2
@@ -1,1487 +1,13 @@
1
1
  /**
2
- * Core fetching logic: simple HTTP and browser-based fetching
3
- */
4
- // Force IPv4-first DNS resolution globally.
5
- // Prevents IPv6 connection failures (TLS errors, timeouts) on hosts that
6
- // advertise AAAA records but can't actually route IPv6 (e.g. Render containers).
7
- // Must run before any network library is used.
8
- import dns from 'dns';
9
- dns.setDefaultResultOrder('ipv4first');
10
- import { chromium } from 'playwright';
11
- import { chromium as stealthChromium } from 'playwright-extra';
12
- import StealthPlugin from 'puppeteer-extra-plugin-stealth';
13
- import { getRealisticUserAgent, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
14
- import { fetch as undiciFetch, Agent, ProxyAgent } from 'undici';
15
- import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
16
- import { getCached } from './cache.js';
17
- import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
18
- import { detectChallenge } from './challenge-detection.js';
19
- // Add stealth plugin to playwright-extra
20
- stealthChromium.use(StealthPlugin());
21
- /**
22
- * Returns a realistic Chrome user agent.
23
- * Delegates to the curated user-agents module so stealth mode never exposes
24
- * the default "Chrome for Testing" UA which is a reliable bot-detection signal.
25
- */
26
- function getRandomUserAgent() {
27
- return getRealisticUserAgent();
28
- }
29
- /**
30
- * Common Chromium launch arguments for anti-bot-detection.
31
- * Applied to BOTH regular and stealth browser instances.
32
- * NOTE: --window-size is intentionally omitted here; it is added dynamically
33
- * per browser launch using a random realistic viewport (see getRandomViewport()).
34
- */
35
- const ANTI_DETECTION_ARGS = [
36
- '--disable-blink-features=AutomationControlled',
37
- '--disable-infobars',
38
- '--disable-dev-shm-usage',
39
- '--no-sandbox',
40
- '--disable-setuid-sandbox',
41
- '--disable-gpu',
42
- '--start-maximized',
43
- // Chrome branding / stealth hardening
44
- '--disable-features=ChromeUserAgentDataBranding',
45
- '--disable-component-extensions-with-background-pages',
46
- '--disable-default-apps',
47
- '--disable-extensions',
48
- '--disable-hang-monitor',
49
- '--disable-popup-blocking',
50
- '--disable-prompt-on-repost',
51
- '--disable-sync',
52
- '--metrics-recording-only',
53
- '--no-first-run',
54
- ];
55
- /**
56
- * Returns a random realistic viewport weighted by real-world market share.
57
- * Used to avoid the telltale Playwright default of 1280×720.
58
- */
59
- function getRandomViewport() {
60
- // Common real-world resolutions weighted by market share
61
- const viewports = [
62
- { width: 1920, height: 1080, weight: 35 }, // Full HD
63
- { width: 1366, height: 768, weight: 20 }, // Laptop
64
- { width: 1536, height: 864, weight: 15 }, // Scaled laptop
65
- { width: 1440, height: 900, weight: 10 }, // MacBook
66
- { width: 1680, height: 1050, weight: 8 }, // Large laptop
67
- { width: 2560, height: 1440, weight: 7 }, // QHD
68
- { width: 1280, height: 800, weight: 5 }, // Older laptop
69
- ];
70
- const total = viewports.reduce((s, v) => s + v.weight, 0);
71
- let r = Math.random() * total;
72
- for (const v of viewports) {
73
- r -= v.weight;
74
- if (r <= 0)
75
- return { width: v.width, height: v.height };
76
- }
77
- return { width: 1920, height: 1080 };
78
- }
79
- /**
80
- * Apply stealth init scripts to a page to reduce bot-detection signals:
81
- * 1. Hides the `window.__pwInitScripts` Playwright leak.
82
- * 2. Patches `navigator.userAgentData.brands` to include "Google Chrome"
83
- * (Chrome for Testing only ships "Chromium" which is a known detection signal).
84
- */
85
- async function applyStealthScripts(page) {
86
- // 1. Hide Playwright's __pwInitScripts marker
87
- // Uses string form to avoid TypeScript DOM-lib requirements (tsconfig has no DOM lib).
88
- await page.addInitScript(`
89
- Object.defineProperty(window, '__pwInitScripts', {
90
- get: () => undefined,
91
- set: () => {},
92
- configurable: true,
93
- });
94
- `);
95
- // 2. Patch userAgentData brands to include "Google Chrome"
96
- // Chrome for Testing only ships "Chromium" — a well-known bot-detection signal.
97
- await page.addInitScript(`
98
- (function () {
99
- var uad = navigator.userAgentData;
100
- if (!uad) return;
101
- var originalBrands = uad.brands || [];
102
- var hasChromeEntry = originalBrands.some(function(b) { return b.brand === 'Google Chrome'; });
103
- if (hasChromeEntry) return;
104
-
105
- var chromiumEntry = originalBrands.find(function(b) { return b.brand === 'Chromium'; });
106
- var version = (chromiumEntry && chromiumEntry.version) || '136';
107
- var patchedBrands = [
108
- { brand: 'Chromium', version: version },
109
- { brand: 'Google Chrome', version: version },
110
- { brand: 'Not=A?Brand', version: '99' },
111
- ];
112
-
113
- Object.defineProperty(navigator, 'userAgentData', {
114
- get: function() {
115
- return {
116
- brands: patchedBrands,
117
- mobile: false,
118
- platform: uad.platform || 'Windows',
119
- getHighEntropyValues: uad.getHighEntropyValues ? uad.getHighEntropyValues.bind(uad) : undefined,
120
- toJSON: function() {
121
- return {
122
- brands: patchedBrands,
123
- mobile: false,
124
- platform: uad.platform || 'Windows',
125
- };
126
- },
127
- };
128
- },
129
- configurable: true,
130
- });
131
- })();
132
- `);
133
- }
134
- function createHttpPool() {
135
- return new Agent({
136
- connections: 20,
137
- pipelining: 6,
138
- keepAliveTimeout: 60000,
139
- keepAliveMaxTimeout: 60000,
140
- allowH2: true,
141
- connect: {
142
- lookup: cachedLookup,
143
- },
144
- });
145
- }
146
- let httpPool = createHttpPool();
147
- startDnsWarmup();
148
- const CONDITIONAL_CACHE_MAX_ENTRIES = 2000;
149
- const conditionalValidatorsByUrl = new Map();
150
- function normalizeUrlForConditionalCache(url) {
151
- try {
152
- const normalized = new URL(url);
153
- normalized.hash = '';
154
- normalized.hostname = normalized.hostname.toLowerCase();
155
- if ((normalized.protocol === 'http:' && normalized.port === '80') ||
156
- (normalized.protocol === 'https:' && normalized.port === '443')) {
157
- normalized.port = '';
158
- }
159
- if (!normalized.pathname) {
160
- normalized.pathname = '/';
161
- }
162
- const sortedParams = [...normalized.searchParams.entries()]
163
- .sort(([a], [b]) => a.localeCompare(b));
164
- normalized.search = '';
165
- for (const [key, value] of sortedParams) {
166
- normalized.searchParams.append(key, value);
167
- }
168
- return normalized.toString();
169
- }
170
- catch {
171
- return url.trim();
172
- }
173
- }
174
- function getConditionalValidators(url) {
175
- const key = normalizeUrlForConditionalCache(url);
176
- const existing = conditionalValidatorsByUrl.get(key);
177
- if (!existing) {
178
- return null;
179
- }
180
- // LRU touch
181
- conditionalValidatorsByUrl.delete(key);
182
- conditionalValidatorsByUrl.set(key, existing);
183
- return existing;
184
- }
185
- function setConditionalValidators(url, validators) {
186
- const key = normalizeUrlForConditionalCache(url);
187
- if (conditionalValidatorsByUrl.has(key)) {
188
- conditionalValidatorsByUrl.delete(key);
189
- }
190
- conditionalValidatorsByUrl.set(key, validators);
191
- while (conditionalValidatorsByUrl.size > CONDITIONAL_CACHE_MAX_ENTRIES) {
192
- const oldestKey = conditionalValidatorsByUrl.keys().next().value;
193
- if (!oldestKey) {
194
- break;
195
- }
196
- conditionalValidatorsByUrl.delete(oldestKey);
197
- }
198
- }
199
- function rememberConditionalValidators(url, response) {
200
- const etag = response.headers.get('etag') || undefined;
201
- const lastModified = response.headers.get('last-modified') || undefined;
202
- if (!etag && !lastModified) {
203
- return;
204
- }
205
- setConditionalValidators(url, { etag, lastModified });
206
- }
207
- function hasHeader(headers, name) {
208
- const lowered = name.toLowerCase();
209
- return Object.keys(headers).some((header) => header.toLowerCase() === lowered);
210
- }
211
- function getCachedResultFor304(url, fallbackUrl) {
212
- const cached = getCached(url) || (fallbackUrl ? getCached(fallbackUrl) : null);
213
- if (!cached) {
214
- return null;
215
- }
216
- return {
217
- html: cached.html,
218
- buffer: cached.buffer,
219
- url: cached.url || url,
220
- statusCode: 304,
221
- contentType: cached.contentType,
222
- screenshot: cached.screenshot,
223
- };
224
- }
225
- function createAbortError() {
226
- const error = new Error('Operation aborted');
227
- error.name = 'AbortError';
228
- return error;
229
- }
230
- /**
231
- * SECURITY: Validate URL to prevent SSRF attacks
232
- * Blocks localhost, private IPs, link-local, and various bypass techniques
233
- */
234
- function validateUrl(urlString) {
235
- // Length check
236
- if (urlString.length > 2048) {
237
- throw new WebPeelError('URL too long (max 2048 characters)');
238
- }
239
- // Check for control characters and suspicious encoding
240
- if (/[\x00-\x1F\x7F]/.test(urlString)) {
241
- throw new WebPeelError('URL contains invalid control characters');
242
- }
243
- let url;
244
- try {
245
- url = new URL(urlString);
246
- }
247
- catch {
248
- throw new WebPeelError('Invalid URL format');
249
- }
250
- // Only allow HTTP(S)
251
- if (!['http:', 'https:'].includes(url.protocol)) {
252
- throw new WebPeelError('Only HTTP and HTTPS protocols are allowed');
253
- }
254
- // Validate hostname is not empty
255
- if (!url.hostname) {
256
- throw new WebPeelError('Invalid hostname');
257
- }
258
- const hostname = url.hostname.toLowerCase();
259
- // Block localhost patterns
260
- const localhostPatterns = ['localhost', '0.0.0.0'];
261
- if (localhostPatterns.some(pattern => hostname === pattern || hostname.endsWith('.' + pattern))) {
262
- throw new WebPeelError('Access to localhost is not allowed');
263
- }
264
- // ENHANCED: Parse and validate IP addresses (handles hex, octal, decimal, mixed)
265
- const ipv4Info = parseAndValidateIPv4(hostname);
266
- if (ipv4Info) {
267
- validateIPv4Address(ipv4Info);
268
- }
269
- // ENHANCED: Comprehensive IPv6 validation
270
- if (hostname.includes(':')) {
271
- validateIPv6Address(hostname);
272
- }
273
- }
274
- /**
275
- * Parse IPv4 address in any format (dotted, hex, octal, decimal, mixed)
276
- * Returns null if not an IPv4 address
277
- */
278
- function parseAndValidateIPv4(hostname) {
279
- // Remove brackets if present
280
- const cleaned = hostname.replace(/^\[|\]$/g, '');
281
- // Standard dotted notation: 192.168.1.1
282
- const dottedRegex = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
283
- const dottedMatch = cleaned.match(dottedRegex);
284
- if (dottedMatch) {
285
- const octets = dottedMatch.slice(1).map(Number);
286
- if (octets.every(o => o >= 0 && o <= 255)) {
287
- return octets;
288
- }
289
- throw new WebPeelError('Invalid IPv4 address');
290
- }
291
- // Hex notation: 0x7f000001
292
- if (/^0x[0-9a-fA-F]+$/.test(cleaned)) {
293
- const num = parseInt(cleaned, 16);
294
- return [
295
- (num >>> 24) & 0xff,
296
- (num >>> 16) & 0xff,
297
- (num >>> 8) & 0xff,
298
- num & 0xff,
299
- ];
300
- }
301
- // Octal notation: 0177.0.0.1 or full octal 017700000001
302
- if (/^0[0-7]/.test(cleaned)) {
303
- // Full octal (all digits)
304
- if (/^0[0-7]+$/.test(cleaned)) {
305
- const num = parseInt(cleaned, 8);
306
- if (num <= 0xffffffff) {
307
- return [
308
- (num >>> 24) & 0xff,
309
- (num >>> 16) & 0xff,
310
- (num >>> 8) & 0xff,
311
- num & 0xff,
312
- ];
313
- }
314
- }
315
- // Mixed octal-decimal: 0177.0.0.1
316
- const parts = cleaned.split('.');
317
- if (parts.length === 4) {
318
- const octets = parts.map(p => parseInt(p, /^0[0-7]/.test(p) ? 8 : 10));
319
- if (octets.every(o => o >= 0 && o <= 255)) {
320
- return octets;
321
- }
322
- }
323
- }
324
- // Decimal notation: 2130706433
325
- if (/^\d+$/.test(cleaned)) {
326
- const num = parseInt(cleaned, 10);
327
- if (num <= 0xffffffff) {
328
- return [
329
- (num >>> 24) & 0xff,
330
- (num >>> 16) & 0xff,
331
- (num >>> 8) & 0xff,
332
- num & 0xff,
333
- ];
334
- }
335
- }
336
- return null;
337
- }
338
- /**
339
- * Validate IPv4 address against private/reserved ranges
340
- */
341
- function validateIPv4Address(octets) {
342
- const [a, b, c, d] = octets;
343
- // Loopback: 127.0.0.0/8
344
- if (a === 127) {
345
- throw new WebPeelError('Access to loopback addresses is not allowed');
346
- }
347
- // Private: 10.0.0.0/8
348
- if (a === 10) {
349
- throw new WebPeelError('Access to private IP addresses is not allowed');
350
- }
351
- // Private: 172.16.0.0/12
352
- if (a === 172 && b >= 16 && b <= 31) {
353
- throw new WebPeelError('Access to private IP addresses is not allowed');
354
- }
355
- // Private: 192.168.0.0/16
356
- if (a === 192 && b === 168) {
357
- throw new WebPeelError('Access to private IP addresses is not allowed');
358
- }
359
- // Link-local: 169.254.0.0/16
360
- if (a === 169 && b === 254) {
361
- throw new WebPeelError('Access to link-local addresses is not allowed');
362
- }
363
- // Broadcast: 255.255.255.255
364
- if (a === 255 && b === 255 && c === 255 && d === 255) {
365
- throw new WebPeelError('Access to broadcast address is not allowed');
366
- }
367
- // This network: 0.0.0.0/8
368
- if (a === 0) {
369
- throw new WebPeelError('Access to "this network" addresses is not allowed');
370
- }
371
- }
372
- /**
373
- * Validate IPv6 address against private/reserved ranges
374
- */
375
- function validateIPv6Address(hostname) {
376
- // Remove brackets
377
- const addr = hostname.replace(/^\[|\]$/g, '').toLowerCase();
378
- // Loopback: ::1
379
- if (addr === '::1' || addr === '0:0:0:0:0:0:0:1') {
380
- throw new WebPeelError('Access to loopback addresses is not allowed');
381
- }
382
- // IPv6 mapped IPv4: ::ffff:192.168.1.1 or ::ffff:c0a8:0101
383
- if (addr.startsWith('::ffff:')) {
384
- // Extract the IPv4 part
385
- const ipv4Part = addr.substring(7);
386
- // Could be dotted (::ffff:192.168.1.1) or hex (::ffff:c0a8:0101)
387
- if (ipv4Part.includes('.')) {
388
- // Parse dotted IPv4
389
- const parts = ipv4Part.split('.');
390
- if (parts.length === 4) {
391
- const octets = parts.map(p => parseInt(p, 10));
392
- if (octets.every(o => !isNaN(o) && o >= 0 && o <= 255)) {
393
- validateIPv4Address(octets);
394
- }
395
- }
396
- }
397
- else {
398
- // Parse hex IPv4 (e.g., c0a80101 = 192.168.1.1)
399
- const hexStr = ipv4Part.replace(/:/g, '');
400
- if (/^[0-9a-f]{1,8}$/.test(hexStr)) {
401
- const num = parseInt(hexStr, 16);
402
- const octets = [
403
- (num >>> 24) & 0xff,
404
- (num >>> 16) & 0xff,
405
- (num >>> 8) & 0xff,
406
- num & 0xff,
407
- ];
408
- validateIPv4Address(octets);
409
- }
410
- }
411
- throw new WebPeelError('Access to IPv6-mapped IPv4 addresses is not allowed');
412
- }
413
- // Unique local addresses: fc00::/7 (fc00:: to fdff::)
414
- if (addr.startsWith('fc') || addr.startsWith('fd')) {
415
- throw new WebPeelError('Access to unique local IPv6 addresses is not allowed');
416
- }
417
- // Link-local: fe80::/10
418
- if (addr.startsWith('fe8') || addr.startsWith('fe9') ||
419
- addr.startsWith('fea') || addr.startsWith('feb')) {
420
- throw new WebPeelError('Access to link-local IPv6 addresses is not allowed');
421
- }
422
- }
423
- /**
424
- * Validate and sanitize user agent string
425
- */
426
- function validateUserAgent(userAgent) {
427
- if (userAgent.length > 500) {
428
- throw new WebPeelError('User agent too long (max 500 characters)');
429
- }
430
- // Allow only printable ASCII characters
431
- if (!/^[\x20-\x7E]*$/.test(userAgent)) {
432
- throw new WebPeelError('User agent contains invalid characters');
433
- }
434
- return userAgent;
435
- }
436
- /**
437
- * Simple HTTP fetch using native fetch + Cheerio
438
- * Fast and lightweight, but can be blocked by Cloudflare/bot detection
439
- * SECURITY: Manual redirect handling with SSRF re-validation
440
- */
441
- export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders, abortSignal, proxy) {
442
- // SECURITY: Validate URL to prevent SSRF
443
- validateUrl(url);
444
- if (abortSignal?.aborted) {
445
- throw createAbortError();
446
- }
447
- // Validate user agent if provided
448
- // SEC.gov requires a User-Agent with contact info (their documented automated access policy)
449
- const hostname = new URL(url).hostname.toLowerCase();
450
- const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
451
- const validatedUserAgent = isSecGov
452
- ? 'WebPeel/1.0 (support@webpeel.dev)'
453
- : (userAgent ? validateUserAgent(userAgent) : getRandomUserAgent());
454
- // SECURITY: Merge custom headers with defaults, block Host header override
455
- const defaultHeaders = {
456
- 'User-Agent': validatedUserAgent,
457
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
458
- 'Accept-Language': 'en-US,en;q=0.9',
459
- 'Accept-Encoding': 'br, gzip, deflate',
460
- 'DNT': '1',
461
- 'Connection': 'keep-alive',
462
- 'Upgrade-Insecure-Requests': '1',
463
- 'Sec-CH-UA': getSecCHUA(validatedUserAgent),
464
- 'Sec-CH-UA-Mobile': '?0',
465
- 'Sec-CH-UA-Platform': getSecCHUAPlatform(validatedUserAgent),
466
- 'Sec-Fetch-Dest': 'document',
467
- 'Sec-Fetch-Mode': 'navigate',
468
- 'Sec-Fetch-Site': 'none',
469
- 'Sec-Fetch-User': '?1',
470
- 'Cache-Control': 'max-age=0',
471
- 'Priority': 'u=0, i',
472
- };
473
- const mergedHeaders = { ...defaultHeaders };
474
- if (customHeaders) {
475
- for (const [key, value] of Object.entries(customHeaders)) {
476
- // SECURITY: Block Host header override
477
- if (key.toLowerCase() === 'host') {
478
- throw new WebPeelError('Custom Host header is not allowed');
479
- }
480
- mergedHeaders[key] = value;
481
- }
482
- }
483
- const MAX_REDIRECTS = 10;
484
- let redirectCount = 0;
485
- let currentUrl = url;
486
- const seenUrls = new Set();
487
- try {
488
- const hostname = new URL(url).hostname;
489
- void resolveAndCache(hostname).catch(() => {
490
- // Best-effort optimization only.
491
- });
492
- }
493
- catch {
494
- // Ignore URL parsing errors here; validation handles invalid input below.
495
- }
496
- while (redirectCount <= MAX_REDIRECTS) {
497
- // Detect redirect loops
498
- if (seenUrls.has(currentUrl)) {
499
- throw new WebPeelError('Redirect loop detected');
500
- }
501
- seenUrls.add(currentUrl);
502
- // Re-validate on each redirect
503
- validateUrl(currentUrl);
504
- const timeoutController = new AbortController();
505
- const timer = setTimeout(() => timeoutController.abort(), timeoutMs);
506
- const signal = abortSignal
507
- ? AbortSignal.any([timeoutController.signal, abortSignal])
508
- : timeoutController.signal;
509
- try {
510
- const requestHeaders = { ...mergedHeaders };
511
- const validators = getConditionalValidators(currentUrl);
512
- if (validators?.etag && !hasHeader(requestHeaders, 'if-none-match')) {
513
- requestHeaders['If-None-Match'] = validators.etag;
514
- }
515
- if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
516
- requestHeaders['If-Modified-Since'] = validators.lastModified;
517
- }
518
- // Use proxy if provided, otherwise use shared connection pool
519
- const dispatcher = proxy ? new ProxyAgent(proxy) : httpPool;
520
- const response = await undiciFetch(currentUrl, {
521
- headers: requestHeaders,
522
- signal,
523
- dispatcher,
524
- redirect: 'manual', // SECURITY: Manual redirect handling
525
- });
526
- clearTimeout(timer);
527
- if (response.status === 304) {
528
- const cachedResult = getCachedResultFor304(currentUrl, url);
529
- if (cachedResult) {
530
- return cachedResult;
531
- }
532
- throw new NetworkError('HTTP 304 received but no cached response is available');
533
- }
534
- // Handle redirects manually
535
- if (response.status >= 300 && response.status < 400) {
536
- const location = response.headers.get('location');
537
- if (!location) {
538
- throw new NetworkError('Redirect response missing Location header');
539
- }
540
- // Resolve relative URLs
541
- currentUrl = new URL(location, currentUrl).href;
542
- try {
543
- const hostname = new URL(currentUrl).hostname;
544
- void resolveAndCache(hostname).catch(() => {
545
- // Best-effort optimization only.
546
- });
547
- }
548
- catch {
549
- // Ignore URL parsing errors here; validation handles invalid input below.
550
- }
551
- redirectCount++;
552
- continue;
553
- }
554
- if (!response.ok) {
555
- if (response.status === 403 || response.status === 503) {
556
- throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
557
- }
558
- throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`);
559
- }
560
- rememberConditionalValidators(currentUrl, response);
561
- // Content-Type detection
562
- const contentType = response.headers.get('content-type') || '';
563
- const contentTypeLower = contentType.toLowerCase();
564
- const urlLower = currentUrl.toLowerCase();
565
- // Support binary documents (PDF/DOCX) in the simple HTTP path.
566
- const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
567
- const isDocx = contentTypeLower.includes('application/vnd.openxmlformats-officedocument.wordprocessingml.document') || urlLower.endsWith('.docx');
568
- const isBinaryDoc = isPdf || isDocx;
569
- // Accept a wide range of text-based content, plus supported binary documents.
570
- const ALLOWED_TYPES = [
571
- 'text/html', 'application/xhtml+xml',
572
- 'text/plain', 'text/markdown', 'text/csv',
573
- 'application/json', 'text/json',
574
- 'text/xml', 'application/xml', 'application/rss+xml', 'application/atom+xml',
575
- 'application/javascript', 'text/javascript', 'text/css',
576
- // Documents
577
- 'application/pdf',
578
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
579
- ];
580
- const isAllowed = !contentTypeLower ||
581
- ALLOWED_TYPES.some(t => contentTypeLower.includes(t)) ||
582
- // Many servers mislabel docs as octet-stream; allow when URL implies a supported document.
583
- (contentTypeLower.includes('application/octet-stream') && isBinaryDoc);
584
- if (!isAllowed) {
585
- // Check if it's at least text-based
586
- const isTexty = contentTypeLower.startsWith('text/') ||
587
- contentTypeLower.includes('json') ||
588
- contentTypeLower.includes('xml');
589
- if (!isTexty) {
590
- throw new WebPeelError(`Binary content type: ${contentType}. WebPeel handles text-based content and PDF/DOCX documents only.`);
591
- }
592
- }
593
- // SECURITY: Stream response with size limit (prevent memory exhaustion)
594
- const chunks = [];
595
- let totalSize = 0;
596
- const MAX_SIZE = 10 * 1024 * 1024; // 10MB
597
- const reader = response.body?.getReader();
598
- if (!reader) {
599
- throw new NetworkError('Response body is not readable');
600
- }
601
- try {
602
- while (true) {
603
- const { done, value } = await reader.read();
604
- if (done)
605
- break;
606
- totalSize += value.length;
607
- if (totalSize > MAX_SIZE) {
608
- reader.cancel();
609
- throw new WebPeelError('Response too large (max 10MB)');
610
- }
611
- chunks.push(value);
612
- }
613
- }
614
- finally {
615
- reader.releaseLock();
616
- }
617
- // Combine chunks
618
- const combined = new Uint8Array(totalSize);
619
- let offset = 0;
620
- for (const chunk of chunks) {
621
- combined.set(chunk, offset);
622
- offset += chunk.length;
623
- }
624
- const buffer = Buffer.from(combined);
625
- const html = isBinaryDoc ? '' : new TextDecoder().decode(combined);
626
- // For HTML content, check for suspiciously small responses (bot blocks)
627
- // Non-HTML content (JSON, text, XML) can legitimately be short
628
- const isHtmlContent = !isBinaryDoc && (contentTypeLower.includes('html') || contentTypeLower.includes('xhtml'));
629
- if (isHtmlContent && (!html || html.length < 100)) {
630
- throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
631
- }
632
- if (!isBinaryDoc && !html) {
633
- throw new NetworkError('Empty response body');
634
- }
635
- if (isBinaryDoc && buffer.length === 0) {
636
- throw new NetworkError('Empty response body');
637
- }
638
- // Check for Cloudflare challenge (only relevant for HTML)
639
- if (isHtmlContent && (html.includes('cf-browser-verification') || html.includes('Just a moment...'))) {
640
- throw new BlockedError('Cloudflare challenge detected. Try --render for browser mode.');
641
- }
642
- // Run full challenge detection for HTML content
643
- // Note: skip empty-shell type — in simple HTTP mode, SPA shells are expected and
644
- // the caller's escalation logic upgrades to browser/stealth rendering.
645
- if (isHtmlContent) {
646
- const challengeResult = detectChallenge(html, response.status);
647
- if (challengeResult.isChallenge && challengeResult.type !== 'empty-shell') {
648
- throw new BlockedError(`Challenge page detected (${challengeResult.type || 'unknown'}, confidence: ${challengeResult.confidence.toFixed(2)}). ` +
649
- `Site requires human verification. Try a different approach or use a CAPTCHA solving service.`);
650
- }
651
- }
652
- return {
653
- html,
654
- buffer: isBinaryDoc ? buffer : undefined,
655
- url: currentUrl,
656
- statusCode: response.status,
657
- contentType,
658
- };
659
- }
660
- catch (error) {
661
- clearTimeout(timer);
662
- if (error instanceof BlockedError || error instanceof NetworkError || error instanceof WebPeelError) {
663
- throw error;
664
- }
665
- if (error instanceof Error && error.name === 'AbortError') {
666
- if (abortSignal?.aborted && !timeoutController.signal.aborted) {
667
- throw createAbortError();
668
- }
669
- throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
670
- }
671
- // Provide specific error messages based on the actual cause
672
- const cause = error instanceof Error && error.cause;
673
- const causeMsg = cause?.message || cause?.code || '';
674
- if (causeMsg.includes('certificate') || causeMsg.includes('CERT') || causeMsg.includes('SSL') || causeMsg.includes('TLS')) {
675
- throw new NetworkError(`TLS/SSL certificate error for ${new URL(currentUrl).hostname}. The site's certificate may be expired, self-signed, or untrusted.`);
676
- }
677
- if (causeMsg.includes('ENOTFOUND') || causeMsg.includes('getaddrinfo')) {
678
- throw new NetworkError(`DNS resolution failed: ${new URL(currentUrl).hostname} not found. Check the URL or your network connection.`);
679
- }
680
- if (causeMsg.includes('ECONNREFUSED')) {
681
- throw new NetworkError(`Connection refused by ${new URL(currentUrl).hostname}. The server may be down.`);
682
- }
683
- if (causeMsg.includes('ECONNRESET') || causeMsg.includes('EPIPE')) {
684
- throw new NetworkError(`Connection reset by ${new URL(currentUrl).hostname}. Try again or use --render.`);
685
- }
686
- if (causeMsg.includes('ETIMEDOUT') || causeMsg.includes('ENETUNREACH')) {
687
- throw new TimeoutError(`Network unreachable or connection timed out for ${new URL(currentUrl).hostname}.`);
688
- }
689
- const msg = error instanceof Error ? error.message : 'Unknown error';
690
- const causeDetail = causeMsg ? ` (${causeMsg})` : '';
691
- throw new NetworkError(`Failed to fetch: ${msg}${causeDetail}`);
692
- }
693
- }
694
- throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
695
- }
696
- export async function closePool() {
697
- const oldPool = httpPool;
698
- httpPool = createHttpPool();
699
- await oldPool.close().catch(() => { });
700
- }
701
- let sharedBrowser = null;
702
- let sharedStealthBrowser = null;
703
- let activePagesCount = 0;
704
- const MAX_CONCURRENT_PAGES = 5;
705
- const PAGE_POOL_SIZE = 3;
706
- const pooledPages = new Set();
707
- const idlePagePool = [];
708
- let pagePoolFillPromise = null;
709
- function removePooledPage(page) {
710
- pooledPages.delete(page);
711
- const idleIndex = idlePagePool.indexOf(page);
712
- if (idleIndex >= 0) {
713
- idlePagePool.splice(idleIndex, 1);
714
- }
715
- }
716
- function takePooledPage() {
717
- while (idlePagePool.length > 0) {
718
- const page = idlePagePool.shift();
719
- if (page.isClosed()) {
720
- removePooledPage(page);
721
- continue;
722
- }
723
- return page;
724
- }
725
- return null;
726
- }
727
- async function ensurePagePool(browser) {
728
- const activeBrowser = browser ?? sharedBrowser;
729
- if (!activeBrowser || !activeBrowser.isConnected()) {
730
- return;
731
- }
732
- if (pagePoolFillPromise) {
733
- await pagePoolFillPromise;
734
- return;
735
- }
736
- pagePoolFillPromise = (async () => {
737
- while (pooledPages.size < PAGE_POOL_SIZE) {
738
- const pooledPage = await activeBrowser.newPage({
739
- userAgent: getRandomUserAgent(),
740
- viewport: null, // Use browser window size (set via --window-size at launch)
741
- });
742
- await applyStealthScripts(pooledPage);
743
- pooledPages.add(pooledPage);
744
- idlePagePool.push(pooledPage);
745
- }
746
- })().finally(() => {
747
- pagePoolFillPromise = null;
748
- });
749
- await pagePoolFillPromise;
750
- }
751
- async function recyclePooledPage(page) {
752
- if (!pooledPages.has(page)) {
753
- await page.close().catch(() => { });
754
- return;
755
- }
756
- if (page.isClosed()) {
757
- removePooledPage(page);
758
- if (sharedBrowser?.isConnected()) {
759
- void ensurePagePool(sharedBrowser).catch(() => { });
760
- }
761
- return;
762
- }
763
- try {
764
- await page.unroute('**/*').catch(() => { });
765
- await page.context().clearCookies().catch(() => { });
766
- await page.setExtraHTTPHeaders({});
767
- await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 }).catch(() => { });
768
- if (!idlePagePool.includes(page)) {
769
- idlePagePool.push(page);
770
- }
771
- }
772
- catch {
773
- removePooledPage(page);
774
- await page.close().catch(() => { });
775
- }
776
- if (sharedBrowser?.isConnected() && pooledPages.size < PAGE_POOL_SIZE) {
777
- void ensurePagePool(sharedBrowser).catch(() => { });
778
- }
779
- }
780
- export async function warmup() {
781
- startDnsWarmup();
782
- const browser = await getBrowser();
783
- await ensurePagePool(browser);
784
- }
785
- async function getBrowser() {
786
- // SECURITY: Check if browser is still connected and healthy
787
- if (sharedBrowser) {
788
- try {
789
- if (sharedBrowser.isConnected()) {
790
- if (pooledPages.size < PAGE_POOL_SIZE) {
791
- void ensurePagePool(sharedBrowser).catch(() => { });
792
- }
793
- return sharedBrowser;
794
- }
795
- }
796
- catch {
797
- // Browser is dead, recreate
798
- sharedBrowser = null;
799
- }
800
- }
801
- pooledPages.clear();
802
- idlePagePool.length = 0;
803
- pagePoolFillPromise = null;
804
- const vp = getRandomViewport();
805
- sharedBrowser = await chromium.launch({
806
- headless: true,
807
- args: [...ANTI_DETECTION_ARGS, `--window-size=${vp.width},${vp.height}`],
808
- });
809
- void ensurePagePool(sharedBrowser).catch(() => { });
810
- return sharedBrowser;
811
- }
812
- async function getStealthBrowser() {
813
- // SECURITY: Check if stealth browser is still connected and healthy
814
- if (sharedStealthBrowser) {
815
- try {
816
- if (sharedStealthBrowser.isConnected()) {
817
- return sharedStealthBrowser;
818
- }
819
- }
820
- catch {
821
- // Browser is dead, recreate
822
- sharedStealthBrowser = null;
823
- }
824
- }
825
- const stealthVp = getRandomViewport();
826
- const stealthBrowser = await stealthChromium.launch({
827
- headless: true,
828
- args: [...ANTI_DETECTION_ARGS, `--window-size=${stealthVp.width},${stealthVp.height}`],
829
- });
830
- if (!stealthBrowser)
831
- throw new Error('Failed to launch stealth browser');
832
- sharedStealthBrowser = stealthBrowser;
833
- return stealthBrowser;
834
- }
835
- // ── Persistent profile browser instances ─────────────────────────────────────
836
- // Profile browsers are NOT shared — each profileDir gets its own instance.
837
- // These are keyed by profile path and kept alive between fetches in the same process.
838
- const profileBrowsers = new Map();
839
- /**
840
- * Get or create a browser instance with a persistent user data directory.
841
- * Profile browsers bypass the shared browser pool so cookies/sessions survive
842
- * between fetch calls.
2
+ * Core fetching — thin re-export layer for backward compatibility.
843
3
  *
844
- * @param profileDir Absolute path to the Chrome user-data-dir directory
845
- * @param headed Whether to launch in headed (visible) mode
846
- * @param stealth Whether to use playwright-extra stealth instead of plain chromium
847
- */
848
- async function getProfileBrowser(profileDir, headed = false, stealth = false) {
849
- const existing = profileBrowsers.get(profileDir);
850
- if (existing) {
851
- try {
852
- if (existing.isConnected())
853
- return existing;
854
- }
855
- catch { /* dead, recreate */ }
856
- profileBrowsers.delete(profileDir);
857
- }
858
- const profileVp = getRandomViewport();
859
- const launchOptions = {
860
- headless: !headed,
861
- args: [
862
- ...ANTI_DETECTION_ARGS,
863
- `--window-size=${profileVp.width},${profileVp.height}`,
864
- `--user-data-dir=${profileDir}`,
865
- ],
866
- };
867
- const launched = stealth
868
- ? await stealthChromium.launch(launchOptions)
869
- : await chromium.launch(launchOptions);
870
- if (!launched)
871
- throw new Error('Failed to launch profile browser');
872
- profileBrowsers.set(profileDir, launched);
873
- return launched;
874
- }
875
- /**
876
- * Fetch using headless Chromium via Playwright
877
- * Slower but can handle JavaScript-heavy sites and bypass some bot detection
878
- */
879
- export async function browserFetch(url, options = {}) {
880
- // SECURITY: Validate URL to prevent SSRF
881
- validateUrl(url);
882
- const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, } = options;
883
- // Validate user agent if provided
884
- // In stealth mode with no custom UA, always use a realistic Chrome UA
885
- const validatedUserAgent = userAgent
886
- ? validateUserAgent(userAgent)
887
- : (stealth ? getRealisticUserAgent() : getRandomUserAgent());
888
- // Validate wait time
889
- if (waitMs < 0 || waitMs > 60000) {
890
- throw new WebPeelError('Wait time must be between 0 and 60000ms');
891
- }
892
- if (signal?.aborted) {
893
- throw createAbortError();
894
- }
895
- // SECURITY: Validate custom headers if provided
896
- if (headers) {
897
- for (const [key, value] of Object.entries(headers)) {
898
- // Block Host header override
899
- if (key.toLowerCase() === 'host') {
900
- throw new WebPeelError('Custom Host header is not allowed');
901
- }
902
- if (typeof value !== 'string' || value.length > 500) {
903
- throw new WebPeelError('Invalid header value');
904
- }
905
- }
906
- }
907
- // SECURITY: Limit concurrent browser pages with timeout
908
- const queueStartTime = Date.now();
909
- const QUEUE_TIMEOUT_MS = 30000; // 30 second max wait
910
- while (activePagesCount >= MAX_CONCURRENT_PAGES) {
911
- if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
912
- throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
913
- }
914
- await new Promise(resolve => setTimeout(resolve, 100));
915
- }
916
- activePagesCount++;
917
- let page = null;
918
- let usingPooledPage = false;
919
- let abortHandler;
920
- // Declared here (outside try) so the finally block can reference it
921
- const usingProfileBrowser = !!profileDir;
922
- // Owned context created when storageState injection is requested
923
- let ownedContext;
924
- try {
925
- const browser = usingProfileBrowser
926
- ? await getProfileBrowser(profileDir, headed, stealth)
927
- : stealth
928
- ? await getStealthBrowser()
929
- : await getBrowser();
930
- // Only use the shared page pool for non-stealth, non-profile, non-keepOpen, non-storageState, non-proxy fetches
931
- const shouldUsePagePool = !stealth && !userAgent && !keepPageOpen && !usingProfileBrowser && !storageState && !proxy;
932
- if (shouldUsePagePool) {
933
- page = takePooledPage();
934
- usingPooledPage = !!page;
935
- if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
936
- void ensurePagePool(browser).catch(() => { });
937
- }
938
- }
939
- if (!page) {
940
- const fetchVp = getRandomViewport();
941
- const pageOptions = {
942
- userAgent: validatedUserAgent,
943
- // viewport: null lets the browser use its natural window size (set via --window-size),
944
- // avoiding the telltale Playwright default of 1280×720.
945
- viewport: null,
946
- ...(stealth
947
- ? {
948
- locale: 'en-US',
949
- timezoneId: 'America/New_York',
950
- javaScriptEnabled: true,
951
- }
952
- : {}),
953
- };
954
- if (proxy) {
955
- // Parse proxy URL to extract auth credentials for Playwright
956
- let playwrightProxy;
957
- try {
958
- const proxyUrl = new URL(proxy);
959
- playwrightProxy = {
960
- server: `${proxyUrl.protocol}//${proxyUrl.host}`,
961
- username: proxyUrl.username || undefined,
962
- password: proxyUrl.password || undefined,
963
- };
964
- }
965
- catch {
966
- // Fallback: use proxy string as-is
967
- playwrightProxy = { server: proxy };
968
- }
969
- // Create an isolated context with the proxy and optional storageState
970
- ownedContext = await browser.newContext({
971
- ...pageOptions,
972
- proxy: playwrightProxy,
973
- viewport: { width: fetchVp.width, height: fetchVp.height },
974
- ...(storageState ? { storageState } : {}),
975
- });
976
- page = await ownedContext.newPage();
977
- }
978
- else if (storageState) {
979
- // Create an isolated context with the injected storage state (cookies + localStorage)
980
- ownedContext = await browser.newContext({
981
- ...pageOptions,
982
- storageState,
983
- viewport: { width: fetchVp.width, height: fetchVp.height },
984
- });
985
- page = await ownedContext.newPage();
986
- }
987
- else {
988
- page = await browser.newPage(pageOptions);
989
- }
990
- await applyStealthScripts(page);
991
- usingPooledPage = false;
992
- }
993
- else {
994
- await page.setViewportSize({ width: 1280, height: 720 }).catch(() => { });
995
- }
996
- if (signal) {
997
- abortHandler = () => {
998
- if (page && !page.isClosed()) {
999
- void page.close().catch(() => { });
1000
- }
1001
- };
1002
- signal.addEventListener('abort', abortHandler, { once: true });
1003
- }
1004
- await page.unroute('**/*').catch(() => { });
1005
- const mergedHeaders = { ...(headers || {}) };
1006
- if (usingPooledPage) {
1007
- mergedHeaders['User-Agent'] = validatedUserAgent;
1008
- }
1009
- if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
1010
- await page.setExtraHTTPHeaders(mergedHeaders);
1011
- }
1012
- // Set cookies if provided
1013
- if (cookies && cookies.length > 0) {
1014
- const parsedCookies = cookies.map(cookie => {
1015
- const [nameValue] = cookie.split(';').map(s => s.trim());
1016
- const [name, value] = nameValue.split('=');
1017
- if (!name || value === undefined) {
1018
- throw new WebPeelError(`Invalid cookie format: ${cookie}`);
1019
- }
1020
- return {
1021
- name: name.trim(),
1022
- value: value.trim(),
1023
- url,
1024
- };
1025
- });
1026
- await page.context().addCookies(parsedCookies);
1027
- }
1028
- if (signal?.aborted) {
1029
- throw createAbortError();
1030
- }
1031
- // Block images/fonts/etc for speed in non-stealth mode.
1032
- // In stealth mode, blocking common resources can be a bot-detection signal.
1033
- if (!screenshot && !stealth) {
1034
- await page.route('**/*', (route) => {
1035
- const resourceType = route.request().resourceType();
1036
- if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
1037
- route.abort();
1038
- }
1039
- else {
1040
- route.continue();
1041
- }
1042
- });
1043
- }
1044
- else {
1045
- // For screenshots and stealth mode, allow all resources
1046
- await page.route('**/*', (route) => route.continue());
1047
- }
1048
- // SECURITY: Wrap entire operation in timeout
1049
- let screenshotBuffer;
1050
- const throwIfAborted = () => {
1051
- if (signal?.aborted) {
1052
- throw createAbortError();
1053
- }
1054
- };
1055
- const fetchPromise = (async () => {
1056
- const response = await page.goto(url, {
1057
- waitUntil: 'domcontentloaded',
1058
- timeout: timeoutMs,
1059
- });
1060
- throwIfAborted();
1061
- // Quick check: if body text is very thin, wait for JS to render more content.
1062
- // Only adds latency when the page clearly hasn't loaded yet.
1063
- // eslint-disable-next-line @typescript-eslint/no-implied-eval
1064
- const bodyTextLength = await page.evaluate('document.body?.innerText?.trim().length || 0').catch(() => 0);
1065
- if (bodyTextLength < 500) {
1066
- await page.waitForLoadState('networkidle', { timeout: 1500 }).catch(() => { });
1067
- throwIfAborted();
1068
- }
1069
- // DOM stability check: wait for SPA hydration to settle.
1070
- // Polls innerText length every 500ms — if still growing, keep waiting (max 3s extra).
1071
- {
1072
- const stabilityStart = Date.now();
1073
- const MAX_STABILITY_WAIT_MS = 3000;
1074
- const POLL_INTERVAL_MS = 500;
1075
- let prevLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
1076
- let stableCount = 0;
1077
- while (Date.now() - stabilityStart < MAX_STABILITY_WAIT_MS) {
1078
- throwIfAborted();
1079
- await page.waitForTimeout(POLL_INTERVAL_MS);
1080
- const curLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
1081
- if (curLength === prevLength) {
1082
- stableCount++;
1083
- if (stableCount >= 2)
1084
- break; // stable for 2 consecutive checks (~1s)
1085
- }
1086
- else {
1087
- stableCount = 0;
1088
- }
1089
- prevLength = curLength;
1090
- }
1091
- }
1092
- const finalUrl = page.url();
1093
- const contentType = response?.headers()?.['content-type'] || '';
1094
- const contentTypeLower = contentType.toLowerCase();
1095
- const urlLower = finalUrl.toLowerCase();
1096
- const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
1097
- const isDocx = contentTypeLower.includes('wordprocessingml.document') || urlLower.endsWith('.docx');
1098
- const isBinaryDoc = !!response && (isPdf || isDocx);
1099
- // Small randomized delay in stealth mode (simulate human behavior)
1100
- // Keep it short — enough to look human, not enough to kill latency
1101
- if (stealth) {
1102
- const extraDelayMs = 200 + Math.floor(Math.random() * 601);
1103
- await page.waitForTimeout(extraDelayMs);
1104
- throwIfAborted();
1105
- }
1106
- // Wait for additional time if requested (for dynamic content / screenshots)
1107
- if (waitMs > 0) {
1108
- await page.waitForTimeout(waitMs);
1109
- throwIfAborted();
1110
- }
1111
- // Execute page actions if provided
1112
- if (actions && actions.length > 0) {
1113
- const { executeActions } = await import('./actions.js');
1114
- const actionScreenshot = await executeActions(page, actions);
1115
- if (actionScreenshot) {
1116
- screenshotBuffer = actionScreenshot;
1117
- }
1118
- throwIfAborted();
1119
- }
1120
- // If the navigation returned a binary document (PDF/DOCX), grab the raw body.
1121
- if (isBinaryDoc) {
1122
- const buffer = await response.body();
1123
- throwIfAborted();
1124
- // Capture screenshot if requested (and not already captured by actions)
1125
- if (screenshot && !screenshotBuffer) {
1126
- screenshotBuffer = await page.screenshot({
1127
- fullPage: screenshotFullPage,
1128
- type: 'png',
1129
- });
1130
- }
1131
- return {
1132
- html: '',
1133
- finalUrl,
1134
- buffer,
1135
- contentType,
1136
- statusCode: response.status(),
1137
- };
1138
- }
1139
- const html = await page.content();
1140
- throwIfAborted();
1141
- return {
1142
- html,
1143
- finalUrl,
1144
- contentType,
1145
- statusCode: response?.status(),
1146
- };
1147
- })();
1148
- let operationTimeout;
1149
- const timeoutPromise = new Promise((_, reject) => {
1150
- operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
1151
- });
1152
- const fetchData = await Promise.race([fetchPromise, timeoutPromise]);
1153
- if (operationTimeout) {
1154
- clearTimeout(operationTimeout);
1155
- }
1156
- const { html, finalUrl } = fetchData;
1157
- const fetchBuffer = 'buffer' in fetchData ? fetchData.buffer : undefined;
1158
- const fetchContentType = 'contentType' in fetchData ? fetchData.contentType : undefined;
1159
- const fetchStatusCode = 'statusCode' in fetchData ? fetchData.statusCode : undefined;
1160
- const isBinaryDoc = !!fetchBuffer;
1161
- // SECURITY: Limit HTML size (skip for binary documents where html is empty)
1162
- if (!isBinaryDoc) {
1163
- if (html.length > 10 * 1024 * 1024) { // 10MB limit
1164
- throw new WebPeelError('Response too large (max 10MB)');
1165
- }
1166
- if (!html || html.length < 100) {
1167
- throw new BlockedError('Empty or suspiciously small response from browser.');
1168
- }
1169
- // Run challenge detection on browser-fetched HTML (covers both regular and stealth modes)
1170
- // Note: skip empty-shell type — that's a rendering quality issue (SPA needs more JS time),
1171
- // not a bot challenge. The caller's escalation logic handles empty-shell separately.
1172
- const browserChallengeResult = detectChallenge(html, fetchStatusCode);
1173
- if (browserChallengeResult.isChallenge && browserChallengeResult.type !== 'empty-shell') {
1174
- throw new BlockedError(`Challenge page detected (${browserChallengeResult.type || 'unknown'}, confidence: ${browserChallengeResult.confidence.toFixed(2)}). ` +
1175
- `Site requires human verification. Try a different approach or use a CAPTCHA solving service.`);
1176
- }
1177
- }
1178
- // Capture screenshot if requested (and not already captured by actions or document handler)
1179
- if (screenshot && !screenshotBuffer) {
1180
- screenshotBuffer = await page.screenshot({
1181
- fullPage: screenshotFullPage,
1182
- type: 'png'
1183
- });
1184
- }
1185
- // If keepPageOpen, return page/browser for caller to use (e.g., branding extraction)
1186
- if (keepPageOpen && page) {
1187
- return {
1188
- html,
1189
- buffer: fetchBuffer,
1190
- url: finalUrl,
1191
- statusCode: fetchStatusCode,
1192
- contentType: fetchContentType,
1193
- screenshot: screenshotBuffer,
1194
- page,
1195
- browser,
1196
- };
1197
- }
1198
- return {
1199
- html,
1200
- buffer: fetchBuffer,
1201
- url: finalUrl,
1202
- statusCode: fetchStatusCode,
1203
- contentType: fetchContentType,
1204
- screenshot: screenshotBuffer,
1205
- };
1206
- }
1207
- catch (error) {
1208
- if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
1209
- throw error;
1210
- }
1211
- if (error instanceof Error && error.name === 'AbortError') {
1212
- throw error;
1213
- }
1214
- if (error instanceof Error && error.message.includes('Timeout')) {
1215
- throw new TimeoutError(`Browser navigation timed out`);
1216
- }
1217
- throw new NetworkError(`Browser fetch failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
1218
- }
1219
- finally {
1220
- if (signal && abortHandler) {
1221
- signal.removeEventListener('abort', abortHandler);
1222
- }
1223
- // CRITICAL: Always release/close page and decrement counter (unless keepPageOpen and no error)
1224
- if (page && !keepPageOpen) {
1225
- if (usingPooledPage) {
1226
- await recyclePooledPage(page);
1227
- }
1228
- else if (ownedContext) {
1229
- // Close the owned context (also closes the page)
1230
- await ownedContext.close().catch(() => { });
1231
- }
1232
- else if (!usingProfileBrowser) {
1233
- // Profile browser pages are NOT closed — the profile browser stays alive
1234
- // so that the next fetch in the same process reuses the session.
1235
- await page.close().catch(() => { });
1236
- }
1237
- }
1238
- activePagesCount--;
1239
- }
1240
- }
1241
- /**
1242
- * Retry a fetch operation with exponential backoff
1243
- */
1244
- export async function browserScreenshot(url, options = {}) {
1245
- // SECURITY: Validate URL to prevent SSRF
1246
- validateUrl(url);
1247
- const { fullPage = false, width, height, format = 'png', quality, waitMs = 0, timeoutMs = 30000, userAgent, headers, cookies, stealth = false, actions, } = options;
1248
- const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
1249
- // Basic validation
1250
- if (waitMs < 0 || waitMs > 60000) {
1251
- throw new WebPeelError('Wait time must be between 0 and 60000ms');
1252
- }
1253
- if (timeoutMs < 1000 || timeoutMs > 120000) {
1254
- throw new WebPeelError('Timeout must be between 1000 and 120000ms');
1255
- }
1256
- if (width !== undefined && (!Number.isFinite(width) || width < 100 || width > 5000)) {
1257
- throw new WebPeelError('Width must be between 100 and 5000');
1258
- }
1259
- if (height !== undefined && (!Number.isFinite(height) || height < 100 || height > 5000)) {
1260
- throw new WebPeelError('Height must be between 100 and 5000');
1261
- }
1262
- if (format !== 'png' && format !== 'jpeg') {
1263
- throw new WebPeelError('Format must be png or jpeg');
1264
- }
1265
- if (format === 'jpeg' && quality !== undefined) {
1266
- if (!Number.isFinite(quality) || quality < 1 || quality > 100) {
1267
- throw new WebPeelError('JPEG quality must be between 1 and 100');
1268
- }
1269
- }
1270
- // SECURITY: Validate custom headers if provided
1271
- if (headers) {
1272
- for (const [key, value] of Object.entries(headers)) {
1273
- if (key.toLowerCase() === 'host') {
1274
- throw new WebPeelError('Custom Host header is not allowed');
1275
- }
1276
- if (typeof value !== 'string' || value.length > 500) {
1277
- throw new WebPeelError('Invalid header value');
1278
- }
1279
- }
1280
- }
1281
- // SECURITY: Limit concurrent browser pages with timeout
1282
- const queueStartTime = Date.now();
1283
- const QUEUE_TIMEOUT_MS = 30000;
1284
- while (activePagesCount >= MAX_CONCURRENT_PAGES) {
1285
- if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
1286
- throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
1287
- }
1288
- await new Promise(resolve => setTimeout(resolve, 100));
1289
- }
1290
- activePagesCount++;
1291
- let page = null;
1292
- let usingPooledPage = false;
1293
- try {
1294
- const browser = stealth ? await getStealthBrowser() : await getBrowser();
1295
- const shouldUsePagePool = !stealth && !userAgent;
1296
- if (shouldUsePagePool) {
1297
- page = takePooledPage();
1298
- usingPooledPage = !!page;
1299
- if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
1300
- void ensurePagePool(browser).catch(() => { });
1301
- }
1302
- }
1303
- if (!page) {
1304
- page = await browser.newPage({
1305
- userAgent: validatedUserAgent,
1306
- viewport: width || height ? {
1307
- width: width || 1280,
1308
- height: height || 720,
1309
- } : null, // Use browser window size when no explicit dimensions requested
1310
- });
1311
- await applyStealthScripts(page);
1312
- usingPooledPage = false;
1313
- }
1314
- else {
1315
- await page.setViewportSize({
1316
- width: width || 1280,
1317
- height: height || 720,
1318
- }).catch(() => { });
1319
- }
1320
- await page.unroute('**/*').catch(() => { });
1321
- const mergedHeaders = { ...(headers || {}) };
1322
- if (usingPooledPage) {
1323
- mergedHeaders['User-Agent'] = validatedUserAgent;
1324
- }
1325
- if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
1326
- await page.setExtraHTTPHeaders(mergedHeaders);
1327
- }
1328
- if (cookies && cookies.length > 0) {
1329
- const parsedCookies = cookies.map(cookie => {
1330
- const [nameValue] = cookie.split(';').map(s => s.trim());
1331
- const [name, value] = nameValue.split('=');
1332
- if (!name || value === undefined) {
1333
- throw new WebPeelError(`Invalid cookie format: ${cookie}`);
1334
- }
1335
- return {
1336
- name: name.trim(),
1337
- value: value.trim(),
1338
- url,
1339
- };
1340
- });
1341
- await page.context().addCookies(parsedCookies);
1342
- }
1343
- // For screenshots, allow all resources
1344
- await page.route('**/*', (route) => route.continue());
1345
- let screenshotBuffer;
1346
- const doWork = (async () => {
1347
- await page.goto(url, {
1348
- waitUntil: 'domcontentloaded',
1349
- timeout: timeoutMs,
1350
- });
1351
- if (waitMs > 0) {
1352
- await page.waitForTimeout(waitMs);
1353
- }
1354
- if (actions && actions.length > 0) {
1355
- const { executeActions } = await import('./actions.js');
1356
- const actionScreenshot = await executeActions(page, actions, {
1357
- fullPage,
1358
- type: format,
1359
- quality,
1360
- });
1361
- if (actionScreenshot) {
1362
- screenshotBuffer = actionScreenshot;
1363
- }
1364
- }
1365
- const finalUrl = page.url();
1366
- // Capture screenshot if not captured via actions
1367
- if (!screenshotBuffer) {
1368
- screenshotBuffer = await page.screenshot({
1369
- fullPage,
1370
- type: format,
1371
- ...(format === 'jpeg' && typeof quality === 'number' ? { quality } : {}),
1372
- });
1373
- }
1374
- return { finalUrl, screenshotBuffer: screenshotBuffer };
1375
- })();
1376
- let operationTimeout;
1377
- const timeoutPromise = new Promise((_, reject) => {
1378
- operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
1379
- });
1380
- const { finalUrl, screenshotBuffer: buf } = await Promise.race([doWork, timeoutPromise]);
1381
- if (operationTimeout) {
1382
- clearTimeout(operationTimeout);
1383
- }
1384
- return { buffer: buf, finalUrl };
1385
- }
1386
- catch (error) {
1387
- if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
1388
- throw error;
1389
- }
1390
- if (error instanceof Error && error.message.includes('Timeout')) {
1391
- throw new TimeoutError('Browser screenshot timed out');
1392
- }
1393
- throw new NetworkError(`Browser screenshot failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
1394
- }
1395
- finally {
1396
- if (page) {
1397
- if (usingPooledPage) {
1398
- await recyclePooledPage(page);
1399
- }
1400
- else {
1401
- await page.close().catch(() => { });
1402
- }
1403
- }
1404
- activePagesCount--;
1405
- }
1406
- }
1407
- export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
1408
- let lastError = null;
1409
- for (let attempt = 1; attempt <= maxAttempts; attempt++) {
1410
- try {
1411
- return await fn();
1412
- }
1413
- catch (error) {
1414
- lastError = error instanceof Error ? error : new Error('Unknown error');
1415
- // Don't retry on blocked errors or timeouts
1416
- if (error instanceof BlockedError || error instanceof TimeoutError) {
1417
- throw error;
1418
- }
1419
- if (attempt < maxAttempts) {
1420
- const delay = baseDelayMs * Math.pow(2, attempt - 1);
1421
- await new Promise((resolve) => setTimeout(resolve, delay));
1422
- }
1423
- }
1424
- }
1425
- throw lastError || new NetworkError('Retry failed');
1426
- }
1427
- /**
1428
- * Scroll to the bottom of the page N times, waiting for the network to
1429
- * settle between each scroll. Useful for triggering lazy-loaded content
1430
- * (infinite scroll, deferred images, etc.).
1431
- *
1432
- * @param page - Playwright Page instance.
1433
- * @param times - Number of scroll-and-wait cycles (default: 3).
1434
- * @returns The final page HTML after all scrolls complete.
1435
- */
1436
- export async function scrollAndWait(page, times = 3) {
1437
- for (let i = 0; i < times; i++) {
1438
- // eslint-disable-next-line @typescript-eslint/no-implied-eval
1439
- await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
1440
- // Wait for network to settle (500 ms of no new requests) or 2 s max.
1441
- try {
1442
- await page.waitForLoadState('networkidle', { timeout: 2000 });
1443
- }
1444
- catch {
1445
- // networkidle may never fire — fall back to a flat delay.
1446
- await page.waitForTimeout(1000);
1447
- }
1448
- }
1449
- return page.content();
1450
- }
1451
- /**
1452
- * Clean up browser resources (shared pool, stealth browser, and all profile browsers).
1453
- */
1454
- export async function cleanup() {
1455
- const pagesToClose = Array.from(pooledPages);
1456
- pooledPages.clear();
1457
- idlePagePool.length = 0;
1458
- pagePoolFillPromise = null;
1459
- await Promise.all(pagesToClose.map((page) => page.close().catch(() => { })));
1460
- if (sharedBrowser) {
1461
- await sharedBrowser.close();
1462
- sharedBrowser = null;
1463
- }
1464
- if (sharedStealthBrowser) {
1465
- await sharedStealthBrowser.close();
1466
- sharedStealthBrowser = null;
1467
- }
1468
- // Close all persistent profile browsers
1469
- const profileBrowserList = Array.from(profileBrowsers.values());
1470
- profileBrowsers.clear();
1471
- await Promise.all(profileBrowserList.map(b => b.close().catch(() => { })));
1472
- await closePool().catch(() => { });
1473
- }
1474
- /**
1475
- * Close a specific persistent profile browser (e.g. when done with a session).
1476
- * Safe to call even if the browser has already been closed.
1477
- *
1478
- * @param profileDir Path to the profile directory used when launching
1479
- */
1480
- export async function closeProfileBrowser(profileDir) {
1481
- const browser = profileBrowsers.get(profileDir);
1482
- if (browser) {
1483
- profileBrowsers.delete(profileDir);
1484
- await browser.close().catch(() => { });
1485
- }
1486
- }
4
+ * The implementation has been split into focused modules:
5
+ * - http-fetch.ts — Pure HTTP fetching (simpleFetch, SSRF validation, HTTP pool)
6
+ * - browser-pool.ts — Browser lifecycle & page pool (getBrowser, cleanup, warmup)
7
+ * - browser-fetch.ts — Browser-based fetching (browserFetch, browserScreenshot)
8
+ */
9
+ // Re-export everything for backward compatibility
10
+ export { simpleFetch } from './http-fetch.js';
11
+ export { cleanup, warmup, closePool, closeProfileBrowser, playwrightLoaded } from './browser-pool.js';
12
+ export { browserFetch, browserScreenshot, retryFetch, scrollAndWait } from './browser-fetch.js';
1487
13
  //# sourceMappingURL=fetcher.js.map