webpeel 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/README.md +140 -500
  2. package/dist/cli-auth.d.ts +2 -0
  3. package/dist/cli-auth.d.ts.map +1 -1
  4. package/dist/cli-auth.js +16 -3
  5. package/dist/cli-auth.js.map +1 -1
  6. package/dist/cli.js +475 -77
  7. package/dist/cli.js.map +1 -1
  8. package/dist/core/actions.d.ts +19 -10
  9. package/dist/core/actions.d.ts.map +1 -1
  10. package/dist/core/actions.js +214 -43
  11. package/dist/core/actions.js.map +1 -1
  12. package/dist/core/agent.d.ts +60 -3
  13. package/dist/core/agent.d.ts.map +1 -1
  14. package/dist/core/agent.js +375 -86
  15. package/dist/core/agent.js.map +1 -1
  16. package/dist/core/answer.d.ts +43 -0
  17. package/dist/core/answer.d.ts.map +1 -0
  18. package/dist/core/answer.js +378 -0
  19. package/dist/core/answer.js.map +1 -0
  20. package/dist/core/cache.d.ts +14 -0
  21. package/dist/core/cache.d.ts.map +1 -0
  22. package/dist/core/cache.js +122 -0
  23. package/dist/core/cache.js.map +1 -0
  24. package/dist/core/dns-cache.d.ts +21 -0
  25. package/dist/core/dns-cache.d.ts.map +1 -0
  26. package/dist/core/dns-cache.js +184 -0
  27. package/dist/core/dns-cache.js.map +1 -0
  28. package/dist/core/documents.d.ts +24 -0
  29. package/dist/core/documents.d.ts.map +1 -0
  30. package/dist/core/documents.js +124 -0
  31. package/dist/core/documents.js.map +1 -0
  32. package/dist/core/extract-inline.d.ts +39 -0
  33. package/dist/core/extract-inline.d.ts.map +1 -0
  34. package/dist/core/extract-inline.js +214 -0
  35. package/dist/core/extract-inline.js.map +1 -0
  36. package/dist/core/fetcher.d.ts +33 -7
  37. package/dist/core/fetcher.d.ts.map +1 -1
  38. package/dist/core/fetcher.js +608 -41
  39. package/dist/core/fetcher.js.map +1 -1
  40. package/dist/core/jobs.d.ts +66 -0
  41. package/dist/core/jobs.d.ts.map +1 -0
  42. package/dist/core/jobs.js +513 -0
  43. package/dist/core/jobs.js.map +1 -0
  44. package/dist/core/markdown.d.ts.map +1 -1
  45. package/dist/core/markdown.js +141 -31
  46. package/dist/core/markdown.js.map +1 -1
  47. package/dist/core/pdf.d.ts.map +1 -1
  48. package/dist/core/pdf.js +3 -1
  49. package/dist/core/pdf.js.map +1 -1
  50. package/dist/core/screenshot.d.ts +33 -0
  51. package/dist/core/screenshot.d.ts.map +1 -0
  52. package/dist/core/screenshot.js +30 -0
  53. package/dist/core/screenshot.js.map +1 -0
  54. package/dist/core/search-provider.d.ts +46 -0
  55. package/dist/core/search-provider.d.ts.map +1 -0
  56. package/dist/core/search-provider.js +281 -0
  57. package/dist/core/search-provider.js.map +1 -0
  58. package/dist/core/strategies.d.ts +7 -10
  59. package/dist/core/strategies.d.ts.map +1 -1
  60. package/dist/core/strategies.js +370 -63
  61. package/dist/core/strategies.js.map +1 -1
  62. package/dist/index.d.ts +9 -3
  63. package/dist/index.d.ts.map +1 -1
  64. package/dist/index.js +61 -32
  65. package/dist/index.js.map +1 -1
  66. package/dist/mcp/server.js +335 -70
  67. package/dist/mcp/server.js.map +1 -1
  68. package/dist/types.d.ts +43 -1
  69. package/dist/types.d.ts.map +1 -1
  70. package/dist/types.js.map +1 -1
  71. package/llms.txt +85 -47
  72. package/package.json +11 -5
@@ -1,11 +1,19 @@
1
1
  /**
2
2
  * Core fetching logic: simple HTTP and browser-based fetching
3
3
  */
4
+ // Force IPv4-first DNS resolution globally.
5
+ // Prevents IPv6 connection failures (TLS errors, timeouts) on hosts that
6
+ // advertise AAAA records but can't actually route IPv6 (e.g. Render containers).
7
+ // Must run before any network library is used.
8
+ import dns from 'dns';
9
+ dns.setDefaultResultOrder('ipv4first');
4
10
  import { chromium } from 'playwright';
5
11
  import { chromium as stealthChromium } from 'playwright-extra';
6
12
  import StealthPlugin from 'puppeteer-extra-plugin-stealth';
7
- import { fetch as undiciFetch } from 'undici';
13
+ import { fetch as undiciFetch, Agent } from 'undici';
8
14
  import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
15
+ import { getCached } from './cache.js';
16
+ import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
9
17
  // Add stealth plugin to playwright-extra
10
18
  stealthChromium.use(StealthPlugin());
11
19
  const USER_AGENTS = [
@@ -18,6 +26,102 @@ const USER_AGENTS = [
18
26
  function getRandomUserAgent() {
19
27
  return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
20
28
  }
29
/**
 * Build a new undici Agent for the plain-HTTP fetch path.
 *
 * The agent is configured with 20 connections, a pipelining factor of 6,
 * 60-second keep-alive limits, HTTP/2 allowed, and hostname resolution
 * delegated to `cachedLookup`.
 *
 * @returns {Agent} a freshly constructed agent (never a shared instance)
 */
function createHttpPool() {
    const keepAliveMs = 60000;
    const poolOptions = {
        connections: 20,
        pipelining: 6,
        keepAliveTimeout: keepAliveMs,
        keepAliveMaxTimeout: keepAliveMs,
        allowH2: true,
        connect: { lookup: cachedLookup },
    };
    return new Agent(poolOptions);
}
41
+ let httpPool = createHttpPool();
42
+ startDnsWarmup();
43
+ const CONDITIONAL_CACHE_MAX_ENTRIES = 2000;
44
+ const conditionalValidatorsByUrl = new Map();
45
/**
 * Produce a canonical string for `url`, used as the key of the
 * conditional-request validator cache.
 *
 * Canonicalisation: the fragment is dropped, the hostname is lower-cased,
 * explicit default ports (80 for http, 443 for https) are removed, an empty
 * path becomes "/", and query parameters are re-emitted sorted by key.
 *
 * Keys are ordered by UTF-16 code unit rather than with `localeCompare`, so
 * the resulting key does not depend on the runtime's locale or collation
 * data. The sort is stable, so repeated keys keep their relative order.
 *
 * @param {string} url - URL to canonicalise
 * @returns {string} the canonical URL, or `url.trim()` when `url` cannot be parsed
 */
function normalizeUrlForConditionalCache(url) {
    try {
        const normalized = new URL(url);
        normalized.hash = '';
        normalized.hostname = normalized.hostname.toLowerCase();
        if ((normalized.protocol === 'http:' && normalized.port === '80') ||
            (normalized.protocol === 'https:' && normalized.port === '443')) {
            normalized.port = '';
        }
        if (!normalized.pathname) {
            normalized.pathname = '/';
        }
        // Locale-independent ordering: plain code-unit comparison of the keys.
        const byKey = ([a], [b]) => (a < b ? -1 : a > b ? 1 : 0);
        const sortedParams = [...normalized.searchParams.entries()].sort(byKey);
        normalized.search = '';
        for (const [key, value] of sortedParams) {
            normalized.searchParams.append(key, value);
        }
        return normalized.toString();
    }
    catch {
        // Unparseable input: fall back to the trimmed raw string as the key.
        return url.trim();
    }
}
69
/**
 * Look up the validators stored for `url` in the conditional cache.
 *
 * A hit is re-inserted at the end of the map so that it becomes the most
 * recently used entry.
 *
 * @param {string} url - URL whose validators are wanted
 * @returns {object|null} the stored validators, or null when none exist
 */
function getConditionalValidators(url) {
    const cacheKey = normalizeUrlForConditionalCache(url);
    const validators = conditionalValidatorsByUrl.get(cacheKey);
    if (validators) {
        // Move the entry to the back of the insertion order (LRU touch).
        conditionalValidatorsByUrl.delete(cacheKey);
        conditionalValidatorsByUrl.set(cacheKey, validators);
        return validators;
    }
    return null;
}
80
/**
 * Store `validators` for `url` as the most recently used cache entry, then
 * evict the oldest entries until the cache holds at most
 * CONDITIONAL_CACHE_MAX_ENTRIES items.
 *
 * Eviction stops on the iterator's `done` flag rather than on the key's
 * truthiness: an empty-string key (produced when `url` is empty or
 * whitespace-only) is a legitimate entry and must be evictable, otherwise it
 * would block eviction and let the map grow without bound.
 *
 * @param {string} url - URL the validators belong to
 * @param {object} validators - validators to remember for that URL
 * @returns {void}
 */
function setConditionalValidators(url, validators) {
    const key = normalizeUrlForConditionalCache(url);
    // Delete first so that re-setting an existing key refreshes its recency.
    conditionalValidatorsByUrl.delete(key);
    conditionalValidatorsByUrl.set(key, validators);
    while (conditionalValidatorsByUrl.size > CONDITIONAL_CACHE_MAX_ENTRIES) {
        const oldest = conditionalValidatorsByUrl.keys().next();
        if (oldest.done) {
            break;
        }
        conditionalValidatorsByUrl.delete(oldest.value);
    }
}
94
/**
 * Record the `ETag` / `Last-Modified` response headers for `url` so they can
 * be replayed as conditional request headers later.
 *
 * Nothing is stored when the response carries neither header.
 *
 * @param {string} url - URL the response was fetched from
 * @param {{headers: {get(name: string): (string|null|undefined)}}} response - response whose headers are inspected
 * @returns {void}
 */
function rememberConditionalValidators(url, response) {
    const { headers } = response;
    const etag = headers.get('etag') || undefined;
    const lastModified = headers.get('last-modified') || undefined;
    if (etag || lastModified) {
        setConditionalValidators(url, { etag, lastModified });
    }
}
102
/**
 * Report whether a plain headers object already contains `name`, comparing
 * header names case-insensitively.
 *
 * @param {Object<string, string>} headers - header name → value map
 * @param {string} name - header name to look for
 * @returns {boolean} true when some own enumerable key matches `name`
 */
function hasHeader(headers, name) {
    const wanted = name.toLowerCase();
    for (const headerName of Object.keys(headers)) {
        if (headerName.toLowerCase() === wanted) {
            return true;
        }
    }
    return false;
}
106
/**
 * Build a fetch result for an HTTP 304 response from the local cache.
 *
 * The cache is consulted for `url` first and, if that misses and a
 * `fallbackUrl` was given, for `fallbackUrl`.
 *
 * @param {string} url - URL that answered with 304
 * @param {string} [fallbackUrl] - alternative cache key to try on a miss
 * @returns {object|null} the cached fields with `statusCode: 304`, or null
 *   when neither key has a cached entry
 */
function getCachedResultFor304(url, fallbackUrl) {
    let cached = getCached(url);
    if (!cached && fallbackUrl) {
        cached = getCached(fallbackUrl);
    }
    if (!cached) {
        return null;
    }
    const { html, buffer, url: cachedUrl, contentType, screenshot } = cached;
    return {
        html,
        buffer,
        // Prefer the URL recorded with the cached entry; fall back to the requested one.
        url: cachedUrl || url,
        statusCode: 304,
        contentType,
        screenshot,
    };
}
120
/**
 * Create the error thrown when a caller-supplied abort signal fires.
 *
 * @returns {Error} an Error with message "Operation aborted" and name "AbortError"
 */
function createAbortError() {
    return Object.assign(new Error('Operation aborted'), { name: 'AbortError' });
}
21
125
  /**
22
126
  * SECURITY: Validate URL to prevent SSRF attacks
23
127
  * Blocks localhost, private IPs, link-local, and various bypass techniques
@@ -229,20 +333,37 @@ function validateUserAgent(userAgent) {
229
333
  * Fast and lightweight, but can be blocked by Cloudflare/bot detection
230
334
  * SECURITY: Manual redirect handling with SSRF re-validation
231
335
  */
232
- export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders) {
336
+ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders, abortSignal) {
233
337
  // SECURITY: Validate URL to prevent SSRF
234
338
  validateUrl(url);
339
+ if (abortSignal?.aborted) {
340
+ throw createAbortError();
341
+ }
235
342
  // Validate user agent if provided
236
- const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
343
+ // SEC.gov requires a User-Agent with contact info (their documented automated access policy)
344
+ const hostname = new URL(url).hostname.toLowerCase();
345
+ const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
346
+ const validatedUserAgent = isSecGov
347
+ ? 'WebPeel/1.0 (support@webpeel.dev)'
348
+ : (userAgent ? validateUserAgent(userAgent) : getRandomUserAgent());
237
349
  // SECURITY: Merge custom headers with defaults, block Host header override
238
350
  const defaultHeaders = {
239
351
  'User-Agent': validatedUserAgent,
240
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
352
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
241
353
  'Accept-Language': 'en-US,en;q=0.9',
242
- 'Accept-Encoding': 'gzip, deflate, br',
354
+ 'Accept-Encoding': 'br, gzip, deflate',
243
355
  'DNT': '1',
244
356
  'Connection': 'keep-alive',
245
357
  'Upgrade-Insecure-Requests': '1',
358
+ 'Sec-CH-UA': '"Chromium";v="131", "Not_A Brand";v="24"',
359
+ 'Sec-CH-UA-Mobile': '?0',
360
+ 'Sec-CH-UA-Platform': '"macOS"',
361
+ 'Sec-Fetch-Dest': 'document',
362
+ 'Sec-Fetch-Mode': 'navigate',
363
+ 'Sec-Fetch-Site': 'none',
364
+ 'Sec-Fetch-User': '?1',
365
+ 'Cache-Control': 'max-age=0',
366
+ 'Priority': 'u=0, i',
246
367
  };
247
368
  const mergedHeaders = { ...defaultHeaders };
248
369
  if (customHeaders) {
@@ -258,6 +379,15 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
258
379
  let redirectCount = 0;
259
380
  let currentUrl = url;
260
381
  const seenUrls = new Set();
382
+ try {
383
+ const hostname = new URL(url).hostname;
384
+ void resolveAndCache(hostname).catch(() => {
385
+ // Best-effort optimization only.
386
+ });
387
+ }
388
+ catch {
389
+ // Ignore URL parsing errors here; validation handles invalid input below.
390
+ }
261
391
  while (redirectCount <= MAX_REDIRECTS) {
262
392
  // Detect redirect loops
263
393
  if (seenUrls.has(currentUrl)) {
@@ -266,15 +396,34 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
266
396
  seenUrls.add(currentUrl);
267
397
  // Re-validate on each redirect
268
398
  validateUrl(currentUrl);
269
- const controller = new AbortController();
270
- const timer = setTimeout(() => controller.abort(), timeoutMs);
399
+ const timeoutController = new AbortController();
400
+ const timer = setTimeout(() => timeoutController.abort(), timeoutMs);
401
+ const signal = abortSignal
402
+ ? AbortSignal.any([timeoutController.signal, abortSignal])
403
+ : timeoutController.signal;
271
404
  try {
405
+ const requestHeaders = { ...mergedHeaders };
406
+ const validators = getConditionalValidators(currentUrl);
407
+ if (validators?.etag && !hasHeader(requestHeaders, 'if-none-match')) {
408
+ requestHeaders['If-None-Match'] = validators.etag;
409
+ }
410
+ if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
411
+ requestHeaders['If-Modified-Since'] = validators.lastModified;
412
+ }
272
413
  const response = await undiciFetch(currentUrl, {
273
- headers: mergedHeaders,
274
- signal: controller.signal,
414
+ headers: requestHeaders,
415
+ signal,
416
+ dispatcher: httpPool,
275
417
  redirect: 'manual', // SECURITY: Manual redirect handling
276
418
  });
277
419
  clearTimeout(timer);
420
+ if (response.status === 304) {
421
+ const cachedResult = getCachedResultFor304(currentUrl, url);
422
+ if (cachedResult) {
423
+ return cachedResult;
424
+ }
425
+ throw new NetworkError('HTTP 304 received but no cached response is available');
426
+ }
278
427
  // Handle redirects manually
279
428
  if (response.status >= 300 && response.status < 400) {
280
429
  const location = response.headers.get('location');
@@ -283,6 +432,15 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
283
432
  }
284
433
  // Resolve relative URLs
285
434
  currentUrl = new URL(location, currentUrl).href;
435
+ try {
436
+ const hostname = new URL(currentUrl).hostname;
437
+ void resolveAndCache(hostname).catch(() => {
438
+ // Best-effort optimization only.
439
+ });
440
+ }
441
+ catch {
442
+ // Ignore URL parsing errors here; validation handles invalid input below.
443
+ }
286
444
  redirectCount++;
287
445
  continue;
288
446
  }
@@ -292,20 +450,37 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
292
450
  }
293
451
  throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`);
294
452
  }
295
- // Content-Type detection — accept a wide range of text-based content
453
+ rememberConditionalValidators(currentUrl, response);
454
+ // Content-Type detection
296
455
  const contentType = response.headers.get('content-type') || '';
456
+ const contentTypeLower = contentType.toLowerCase();
457
+ const urlLower = currentUrl.toLowerCase();
458
+ // Support binary documents (PDF/DOCX) in the simple HTTP path.
459
+ const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
460
+ const isDocx = contentTypeLower.includes('application/vnd.openxmlformats-officedocument.wordprocessingml.document') || urlLower.endsWith('.docx');
461
+ const isBinaryDoc = isPdf || isDocx;
462
+ // Accept a wide range of text-based content, plus supported binary documents.
297
463
  const ALLOWED_TYPES = [
298
- 'text/html', 'application/xhtml+xml', 'application/pdf',
464
+ 'text/html', 'application/xhtml+xml',
299
465
  'text/plain', 'text/markdown', 'text/csv',
300
466
  'application/json', 'text/json',
301
467
  'text/xml', 'application/xml', 'application/rss+xml', 'application/atom+xml',
302
468
  'application/javascript', 'text/javascript', 'text/css',
469
+ // Documents
470
+ 'application/pdf',
471
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
303
472
  ];
304
- const isAllowed = ALLOWED_TYPES.some(t => contentType.includes(t)) || !contentType;
473
+ const isAllowed = !contentTypeLower ||
474
+ ALLOWED_TYPES.some(t => contentTypeLower.includes(t)) ||
475
+ // Many servers mislabel docs as octet-stream; allow when URL implies a supported document.
476
+ (contentTypeLower.includes('application/octet-stream') && isBinaryDoc);
305
477
  if (!isAllowed) {
306
478
  // Check if it's at least text-based
307
- if (!contentType.startsWith('text/') && !contentType.includes('json') && !contentType.includes('xml')) {
308
- throw new WebPeelError(`Binary content type: ${contentType}. WebPeel handles text-based content only.`);
479
+ const isTexty = contentTypeLower.startsWith('text/') ||
480
+ contentTypeLower.includes('json') ||
481
+ contentTypeLower.includes('xml');
482
+ if (!isTexty) {
483
+ throw new WebPeelError(`Binary content type: ${contentType}. WebPeel handles text-based content and PDF/DOCX documents only.`);
309
484
  }
310
485
  }
311
486
  // SECURITY: Stream response with size limit (prevent memory exhaustion)
@@ -339,14 +514,18 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
339
514
  combined.set(chunk, offset);
340
515
  offset += chunk.length;
341
516
  }
342
- const html = new TextDecoder().decode(combined);
517
+ const buffer = Buffer.from(combined);
518
+ const html = isBinaryDoc ? '' : new TextDecoder().decode(combined);
343
519
  // For HTML content, check for suspiciously small responses (bot blocks)
344
520
  // Non-HTML content (JSON, text, XML) can legitimately be short
345
- const isHtmlContent = contentType.includes('html') || contentType.includes('xhtml');
521
+ const isHtmlContent = !isBinaryDoc && (contentTypeLower.includes('html') || contentTypeLower.includes('xhtml'));
346
522
  if (isHtmlContent && (!html || html.length < 100)) {
347
523
  throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
348
524
  }
349
- if (!html) {
525
+ if (!isBinaryDoc && !html) {
526
+ throw new NetworkError('Empty response body');
527
+ }
528
+ if (isBinaryDoc && buffer.length === 0) {
350
529
  throw new NetworkError('Empty response body');
351
530
  }
352
531
  // Check for Cloudflare challenge (only relevant for HTML)
@@ -355,6 +534,7 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
355
534
  }
356
535
  return {
357
536
  html,
537
+ buffer: isBinaryDoc ? buffer : undefined,
358
538
  url: currentUrl,
359
539
  statusCode: response.status,
360
540
  contentType,
@@ -366,6 +546,9 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
366
546
  throw error;
367
547
  }
368
548
  if (error instanceof Error && error.name === 'AbortError') {
549
+ if (abortSignal?.aborted && !timeoutController.signal.aborted) {
550
+ throw createAbortError();
551
+ }
369
552
  throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
370
553
  }
371
554
  // Provide specific error messages based on the actual cause
@@ -393,15 +576,101 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
393
576
  }
394
577
  throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
395
578
  }
579
/**
 * Replace the shared HTTP pool with a fresh one and close the previous pool.
 *
 * The replacement is installed before the old pool is closed, and a rejected
 * close is ignored.
 *
 * @returns {Promise<void>} settles once the old pool's close has settled
 */
export async function closePool() {
    const retiringPool = httpPool;
    httpPool = createHttpPool();
    const ignoreCloseFailure = () => { };
    await retiringPool.close().then(undefined, ignoreCloseFailure);
}
396
584
  let sharedBrowser = null;
397
585
  let sharedStealthBrowser = null;
398
586
  let activePagesCount = 0;
399
587
  const MAX_CONCURRENT_PAGES = 5;
588
+ const PAGE_POOL_SIZE = 3;
589
+ const pooledPages = new Set();
590
+ const idlePagePool = [];
591
+ let pagePoolFillPromise = null;
592
/**
 * Forget a page: drop it from the set of pooled pages and, if it is present
 * in the idle list, remove its first occurrence there.
 *
 * @param {object} page - page to remove from pool bookkeeping
 * @returns {void}
 */
function removePooledPage(page) {
    pooledPages.delete(page);
    const position = idlePagePool.indexOf(page);
    if (position !== -1) {
        idlePagePool.splice(position, 1);
    }
}
599
/**
 * Take the first usable page off the idle list.
 *
 * Pages that report themselves closed are discarded from the pool
 * bookkeeping along the way.
 *
 * @returns {object|null} an open idle page, or null when none is available
 */
function takePooledPage() {
    while (idlePagePool.length !== 0) {
        const candidate = idlePagePool.shift();
        if (!candidate.isClosed()) {
            return candidate;
        }
        removePooledPage(candidate);
    }
    return null;
}
610
/**
 * Top the page pool up to PAGE_POOL_SIZE pages.
 *
 * Uses `browser` when given, otherwise the shared browser; does nothing when
 * that browser is missing or disconnected. If a fill is already in flight,
 * this call just waits for it instead of starting a second one.
 *
 * The in-flight marker is cleared only if it still refers to the fill started
 * by this call. Other code may reset `pagePoolFillPromise` (and start a newer
 * fill) while this one is still pending; an unconditional reset here would
 * erase the newer fill's marker and allow overlapping fills.
 *
 * @param {object} [browser] - browser to create pages in; defaults to the shared browser
 * @returns {Promise<void>} settles when the relevant fill has finished;
 *   rejects if creating a page fails
 */
async function ensurePagePool(browser) {
    const activeBrowser = browser ?? sharedBrowser;
    if (!activeBrowser || !activeBrowser.isConnected()) {
        return;
    }
    if (pagePoolFillPromise) {
        await pagePoolFillPromise;
        return;
    }
    const fillPromise = (async () => {
        while (pooledPages.size < PAGE_POOL_SIZE) {
            const pooledPage = await activeBrowser.newPage({
                userAgent: getRandomUserAgent(),
            });
            pooledPages.add(pooledPage);
            idlePagePool.push(pooledPage);
        }
    })().finally(() => {
        // Only release the slot if nobody replaced it in the meantime.
        if (pagePoolFillPromise === fillPromise) {
            pagePoolFillPromise = null;
        }
    });
    pagePoolFillPromise = fillPromise;
    await fillPromise;
}
632
/**
 * Return a page to the pool after use.
 *
 * - A page the pool does not own is simply closed.
 * - A pooled page that is already closed is dropped from the bookkeeping and
 *   a refill is kicked off when the shared browser is connected.
 * - Otherwise the page is reset (routes removed, cookies cleared, extra
 *   headers emptied, navigated to about:blank) and put back on the idle list.
 *   If the reset throws, the page is dropped and closed instead.
 *
 * After a reset attempt, a background refill is started when the shared
 * browser is connected and the pool is below PAGE_POOL_SIZE.
 *
 * @param {object} page - page being released
 * @returns {Promise<void>}
 */
async function recyclePooledPage(page) {
    const ignore = () => { };
    // Fire-and-forget top-up of the pool; failures are ignored.
    const refillInBackground = () => {
        void ensurePagePool(sharedBrowser).catch(ignore);
    };
    const ownedByPool = pooledPages.has(page);
    if (!ownedByPool) {
        await page.close().catch(ignore);
        return;
    }
    if (page.isClosed()) {
        removePooledPage(page);
        if (sharedBrowser?.isConnected()) {
            refillInBackground();
        }
        return;
    }
    try {
        await page.unroute('**/*').catch(ignore);
        await page.context().clearCookies().catch(ignore);
        // Not guarded individually: a failure here sends the page to the catch branch.
        await page.setExtraHTTPHeaders({});
        await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 }).catch(ignore);
        const alreadyIdle = idlePagePool.includes(page);
        if (!alreadyIdle) {
            idlePagePool.push(page);
        }
    }
    catch {
        removePooledPage(page);
        await page.close().catch(ignore);
    }
    const poolIsShort = pooledPages.size < PAGE_POOL_SIZE;
    if (sharedBrowser?.isConnected() && poolIsShort) {
        refillInBackground();
    }
}
661
/**
 * Pre-warm the fetcher: start DNS warmup, obtain the shared browser, and wait
 * for the page pool to be filled for that browser.
 *
 * @returns {Promise<void>}
 */
export async function warmup() {
    startDnsWarmup();
    const warmBrowser = await getBrowser();
    return ensurePagePool(warmBrowser).then(() => undefined);
}
400
666
  async function getBrowser() {
401
667
  // SECURITY: Check if browser is still connected and healthy
402
668
  if (sharedBrowser) {
403
669
  try {
404
670
  if (sharedBrowser.isConnected()) {
671
+ if (pooledPages.size < PAGE_POOL_SIZE) {
672
+ void ensurePagePool(sharedBrowser).catch(() => { });
673
+ }
405
674
  return sharedBrowser;
406
675
  }
407
676
  }
@@ -410,7 +679,11 @@ async function getBrowser() {
410
679
  sharedBrowser = null;
411
680
  }
412
681
  }
682
+ pooledPages.clear();
683
+ idlePagePool.length = 0;
684
+ pagePoolFillPromise = null;
413
685
  sharedBrowser = await chromium.launch({ headless: true });
686
+ void ensurePagePool(sharedBrowser).catch(() => { });
414
687
  return sharedBrowser;
415
688
  }
416
689
  async function getStealthBrowser() {
@@ -436,13 +709,16 @@ async function getStealthBrowser() {
436
709
  export async function browserFetch(url, options = {}) {
437
710
  // SECURITY: Validate URL to prevent SSRF
438
711
  validateUrl(url);
439
- const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, } = options;
712
+ const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, } = options;
440
713
  // Validate user agent if provided
441
714
  const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
442
715
  // Validate wait time
443
716
  if (waitMs < 0 || waitMs > 60000) {
444
717
  throw new WebPeelError('Wait time must be between 0 and 60000ms');
445
718
  }
719
+ if (signal?.aborted) {
720
+ throw createAbortError();
721
+ }
446
722
  // SECURITY: Validate custom headers if provided
447
723
  if (headers) {
448
724
  for (const [key, value] of Object.entries(headers)) {
@@ -466,14 +742,51 @@ export async function browserFetch(url, options = {}) {
466
742
  }
467
743
  activePagesCount++;
468
744
  let page = null;
745
+ let usingPooledPage = false;
746
+ let abortHandler;
469
747
  try {
470
748
  const browser = stealth ? await getStealthBrowser() : await getBrowser();
471
- page = await browser.newPage({
472
- userAgent: validatedUserAgent,
473
- });
474
- // Set custom headers if provided
475
- if (headers && Object.keys(headers).length > 0) {
476
- await page.setExtraHTTPHeaders(headers);
749
+ const shouldUsePagePool = !stealth && !userAgent && !keepPageOpen;
750
+ if (shouldUsePagePool) {
751
+ page = takePooledPage();
752
+ usingPooledPage = !!page;
753
+ if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
754
+ void ensurePagePool(browser).catch(() => { });
755
+ }
756
+ }
757
+ if (!page) {
758
+ const pageOptions = {
759
+ userAgent: validatedUserAgent,
760
+ ...(stealth
761
+ ? {
762
+ viewport: { width: 1920, height: 1080 },
763
+ locale: 'en-US',
764
+ timezoneId: 'America/New_York',
765
+ javaScriptEnabled: true,
766
+ }
767
+ : {}),
768
+ };
769
+ page = await browser.newPage(pageOptions);
770
+ usingPooledPage = false;
771
+ }
772
+ else {
773
+ await page.setViewportSize({ width: 1280, height: 720 }).catch(() => { });
774
+ }
775
+ if (signal) {
776
+ abortHandler = () => {
777
+ if (page && !page.isClosed()) {
778
+ void page.close().catch(() => { });
779
+ }
780
+ };
781
+ signal.addEventListener('abort', abortHandler, { once: true });
782
+ }
783
+ await page.unroute('**/*').catch(() => { });
784
+ const mergedHeaders = { ...(headers || {}) };
785
+ if (usingPooledPage) {
786
+ mergedHeaders['User-Agent'] = validatedUserAgent;
787
+ }
788
+ if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
789
+ await page.setExtraHTTPHeaders(mergedHeaders);
477
790
  }
478
791
  // Set cookies if provided
479
792
  if (cookies && cookies.length > 0) {
@@ -491,8 +804,12 @@ export async function browserFetch(url, options = {}) {
491
804
  });
492
805
  await page.context().addCookies(parsedCookies);
493
806
  }
494
- // Block images, fonts, and other heavy resources for speed (unless screenshot is requested)
495
- if (!screenshot) {
807
+ if (signal?.aborted) {
808
+ throw createAbortError();
809
+ }
810
+ // Block images/fonts/etc for speed in non-stealth mode.
811
+ // In stealth mode, blocking common resources can be a bot-detection signal.
812
+ if (!screenshot && !stealth) {
496
813
  await page.route('**/*', (route) => {
497
814
  const resourceType = route.request().resourceType();
498
815
  if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
@@ -504,19 +821,48 @@ export async function browserFetch(url, options = {}) {
504
821
  });
505
822
  }
506
823
  else {
507
- // For screenshots, allow all resources
824
+ // For screenshots and stealth mode, allow all resources
508
825
  await page.route('**/*', (route) => route.continue());
509
826
  }
510
827
  // SECURITY: Wrap entire operation in timeout
511
828
  let screenshotBuffer;
829
+ const throwIfAborted = () => {
830
+ if (signal?.aborted) {
831
+ throw createAbortError();
832
+ }
833
+ };
512
834
  const fetchPromise = (async () => {
513
- await page.goto(url, {
835
+ const response = await page.goto(url, {
514
836
  waitUntil: 'domcontentloaded',
515
837
  timeout: timeoutMs,
516
838
  });
517
- // Wait for additional time if requested (for dynamic content)
839
+ throwIfAborted();
840
+ // Quick check: if body text is very thin, wait for JS to render more content.
841
+ // Only adds latency when the page clearly hasn't loaded yet.
842
+ // eslint-disable-next-line @typescript-eslint/no-implied-eval
843
+ const bodyTextLength = await page.evaluate('document.body?.innerText?.trim().length || 0').catch(() => 0);
844
+ if (bodyTextLength < 500) {
845
+ await page.waitForLoadState('networkidle', { timeout: 1500 }).catch(() => { });
846
+ throwIfAborted();
847
+ }
848
+ const finalUrl = page.url();
849
+ const contentType = response?.headers()?.['content-type'] || '';
850
+ const contentTypeLower = contentType.toLowerCase();
851
+ const urlLower = finalUrl.toLowerCase();
852
+ const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
853
+ const isDocx = contentTypeLower.includes('wordprocessingml.document') || urlLower.endsWith('.docx');
854
+ const isBinaryDoc = !!response && (isPdf || isDocx);
855
+ // Small randomized delay in stealth mode (simulate human behavior)
856
+ // Keep it short — enough to look human, not enough to kill latency
857
+ if (stealth) {
858
+ const extraDelayMs = 200 + Math.floor(Math.random() * 601);
859
+ await page.waitForTimeout(extraDelayMs);
860
+ throwIfAborted();
861
+ }
862
+ // Wait for additional time if requested (for dynamic content / screenshots)
518
863
  if (waitMs > 0) {
519
864
  await page.waitForTimeout(waitMs);
865
+ throwIfAborted();
520
866
  }
521
867
  // Execute page actions if provided
522
868
  if (actions && actions.length > 0) {
@@ -525,23 +871,59 @@ export async function browserFetch(url, options = {}) {
525
871
  if (actionScreenshot) {
526
872
  screenshotBuffer = actionScreenshot;
527
873
  }
874
+ throwIfAborted();
875
+ }
876
+ // If the navigation returned a binary document (PDF/DOCX), grab the raw body.
877
+ if (isBinaryDoc) {
878
+ const buffer = await response.body();
879
+ throwIfAborted();
880
+ // Capture screenshot if requested (and not already captured by actions)
881
+ if (screenshot && !screenshotBuffer) {
882
+ screenshotBuffer = await page.screenshot({
883
+ fullPage: screenshotFullPage,
884
+ type: 'png',
885
+ });
886
+ }
887
+ return {
888
+ html: '',
889
+ finalUrl,
890
+ buffer,
891
+ contentType,
892
+ statusCode: response.status(),
893
+ };
528
894
  }
529
895
  const html = await page.content();
530
- const finalUrl = page.url();
531
- return { html, finalUrl };
896
+ throwIfAborted();
897
+ return {
898
+ html,
899
+ finalUrl,
900
+ contentType,
901
+ statusCode: response?.status(),
902
+ };
532
903
  })();
904
+ let operationTimeout;
533
905
  const timeoutPromise = new Promise((_, reject) => {
534
- setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
906
+ operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
535
907
  });
536
- const { html, finalUrl } = await Promise.race([fetchPromise, timeoutPromise]);
537
- // SECURITY: Limit HTML size
538
- if (html.length > 10 * 1024 * 1024) { // 10MB limit
539
- throw new WebPeelError('Response too large (max 10MB)');
908
+ const fetchData = await Promise.race([fetchPromise, timeoutPromise]);
909
+ if (operationTimeout) {
910
+ clearTimeout(operationTimeout);
540
911
  }
541
- if (!html || html.length < 100) {
542
- throw new BlockedError('Empty or suspiciously small response from browser.');
912
+ const { html, finalUrl } = fetchData;
913
+ const fetchBuffer = 'buffer' in fetchData ? fetchData.buffer : undefined;
914
+ const fetchContentType = 'contentType' in fetchData ? fetchData.contentType : undefined;
915
+ const fetchStatusCode = 'statusCode' in fetchData ? fetchData.statusCode : undefined;
916
+ const isBinaryDoc = !!fetchBuffer;
917
+ // SECURITY: Limit HTML size (skip for binary documents where html is empty)
918
+ if (!isBinaryDoc) {
919
+ if (html.length > 10 * 1024 * 1024) { // 10MB limit
920
+ throw new WebPeelError('Response too large (max 10MB)');
921
+ }
922
+ if (!html || html.length < 100) {
923
+ throw new BlockedError('Empty or suspiciously small response from browser.');
924
+ }
543
925
  }
544
- // Capture screenshot if requested (and not already captured by actions)
926
+ // Capture screenshot if requested (and not already captured by actions or document handler)
545
927
  if (screenshot && !screenshotBuffer) {
546
928
  screenshotBuffer = await page.screenshot({
547
929
  fullPage: screenshotFullPage,
@@ -552,7 +934,10 @@ export async function browserFetch(url, options = {}) {
552
934
  if (keepPageOpen && page) {
553
935
  return {
554
936
  html,
937
+ buffer: fetchBuffer,
555
938
  url: finalUrl,
939
+ statusCode: fetchStatusCode,
940
+ contentType: fetchContentType,
556
941
  screenshot: screenshotBuffer,
557
942
  page,
558
943
  browser,
@@ -560,7 +945,10 @@ export async function browserFetch(url, options = {}) {
560
945
  }
561
946
  return {
562
947
  html,
948
+ buffer: fetchBuffer,
563
949
  url: finalUrl,
950
+ statusCode: fetchStatusCode,
951
+ contentType: fetchContentType,
564
952
  screenshot: screenshotBuffer,
565
953
  };
566
954
  }
@@ -568,15 +956,26 @@ export async function browserFetch(url, options = {}) {
568
956
  if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
569
957
  throw error;
570
958
  }
959
+ if (error instanceof Error && error.name === 'AbortError') {
960
+ throw error;
961
+ }
571
962
  if (error instanceof Error && error.message.includes('Timeout')) {
572
963
  throw new TimeoutError(`Browser navigation timed out`);
573
964
  }
574
965
  throw new NetworkError(`Browser fetch failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
575
966
  }
576
967
  finally {
577
- // CRITICAL: Always close page and decrement counter (unless keepPageOpen and no error)
968
+ if (signal && abortHandler) {
969
+ signal.removeEventListener('abort', abortHandler);
970
+ }
971
+ // CRITICAL: Always release/close page and decrement counter (unless keepPageOpen and no error)
578
972
  if (page && !keepPageOpen) {
579
- await page.close().catch(() => { });
973
+ if (usingPooledPage) {
974
+ await recyclePooledPage(page);
975
+ }
976
+ else {
977
+ await page.close().catch(() => { });
978
+ }
580
979
  }
581
980
  activePagesCount--;
582
981
  }
@@ -584,6 +983,168 @@ export async function browserFetch(url, options = {}) {
584
983
  /**
585
984
  * Retry a fetch operation with exponential backoff
586
985
  */
/**
 * Navigate to a URL in a browser page and capture a screenshot.
 *
 * A pooled page is reused only when neither `stealth` nor a custom
 * `userAgent` is requested; otherwise a fresh page is opened and closed
 * again when the call finishes.
 *
 * @param {string} url - Page to capture; checked with `validateUrl` first.
 * @param {object} [options]
 * @param {boolean} [options.fullPage=false] - Capture the full scrollable page.
 * @param {number} [options.width] - Viewport width, 100-5000 (1280 when only height is given).
 * @param {number} [options.height] - Viewport height, 100-5000 (720 when only width is given).
 * @param {'png'|'jpeg'} [options.format='png'] - Image format.
 * @param {number} [options.quality] - JPEG quality, 1-100; only validated/used for `jpeg`.
 * @param {number} [options.waitMs=0] - Extra wait after navigation, 0-60000 ms.
 * @param {number} [options.timeoutMs=30000] - Navigation and overall budget, 1000-120000 ms.
 * @param {string} [options.userAgent] - Custom user agent (disables page pooling).
 * @param {Object<string, string>} [options.headers] - Extra HTTP headers; `Host` is rejected.
 * @param {string[]} [options.cookies] - Cookies as `name=value[; attrs]` strings; attributes are ignored.
 * @param {boolean} [options.stealth=false] - Use the stealth browser instance.
 * @param {Array} [options.actions] - Page actions run before the capture.
 * @returns {Promise<{buffer: *, finalUrl: string}>} Screenshot data and the URL after navigation.
 * @throws {WebPeelError} On invalid options, headers or cookies.
 * @throws {TimeoutError} When the page queue, navigation or whole operation times out.
 * @throws {NetworkError} On any other browser failure.
 */
export async function browserScreenshot(url, options = {}) {
    // SECURITY: Validate URL to prevent SSRF
    validateUrl(url);
    const { fullPage = false, width, height, format = 'png', quality, waitMs = 0, timeoutMs = 30000, userAgent, headers, cookies, stealth = false, actions, } = options;
    const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
    // Basic validation. NaN is rejected explicitly because it slips through
    // both range comparisons and would make the timers fire almost at once.
    if (Number.isNaN(waitMs) || waitMs < 0 || waitMs > 60000) {
        throw new WebPeelError('Wait time must be between 0 and 60000ms');
    }
    if (Number.isNaN(timeoutMs) || timeoutMs < 1000 || timeoutMs > 120000) {
        throw new WebPeelError('Timeout must be between 1000 and 120000ms');
    }
    if (width !== undefined && (!Number.isFinite(width) || width < 100 || width > 5000)) {
        throw new WebPeelError('Width must be between 100 and 5000');
    }
    if (height !== undefined && (!Number.isFinite(height) || height < 100 || height > 5000)) {
        throw new WebPeelError('Height must be between 100 and 5000');
    }
    if (format !== 'png' && format !== 'jpeg') {
        throw new WebPeelError('Format must be png or jpeg');
    }
    if (format === 'jpeg' && quality !== undefined) {
        if (!Number.isFinite(quality) || quality < 1 || quality > 100) {
            throw new WebPeelError('JPEG quality must be between 1 and 100');
        }
    }
    // SECURITY: Validate custom headers if provided
    if (headers) {
        for (const [key, value] of Object.entries(headers)) {
            if (key.toLowerCase() === 'host') {
                throw new WebPeelError('Custom Host header is not allowed');
            }
            if (typeof value !== 'string' || value.length > 500) {
                throw new WebPeelError('Invalid header value');
            }
        }
    }
    // SECURITY: Limit concurrent browser pages with timeout
    const queueStartTime = Date.now();
    const QUEUE_TIMEOUT_MS = 30000;
    while (activePagesCount >= MAX_CONCURRENT_PAGES) {
        if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
            throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
        }
        await new Promise(resolve => setTimeout(resolve, 100));
    }
    activePagesCount++;
    let page = null;
    let usingPooledPage = false;
    // Declared outside the try block so the finally clause can always cancel it.
    let operationTimeout;
    try {
        const browser = stealth ? await getStealthBrowser() : await getBrowser();
        const shouldUsePagePool = !stealth && !userAgent;
        if (shouldUsePagePool) {
            page = takePooledPage();
            usingPooledPage = !!page;
            if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
                // Refill the pool in the background; failures are non-fatal here.
                void ensurePagePool(browser).catch(() => { });
            }
        }
        if (!page) {
            page = await browser.newPage({
                userAgent: validatedUserAgent,
                viewport: width || height ? {
                    width: width || 1280,
                    height: height || 720,
                } : undefined,
            });
            usingPooledPage = false;
        }
        else {
            await page.setViewportSize({
                width: width || 1280,
                height: height || 720,
            }).catch(() => { });
        }
        await page.unroute('**/*').catch(() => { });
        const mergedHeaders = { ...(headers || {}) };
        if (usingPooledPage) {
            // A pooled page already exists, so the user agent is supplied as a header.
            mergedHeaders['User-Agent'] = validatedUserAgent;
        }
        if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
            await page.setExtraHTTPHeaders(mergedHeaders);
        }
        if (cookies && cookies.length > 0) {
            const parsedCookies = cookies.map((cookie) => {
                // Only the leading "name=value" pair is used; attributes after ';' are ignored.
                const [nameValue] = cookie.split(';').map((part) => part.trim());
                // Split on the FIRST '=' only so values that contain '='
                // (base64 padding, nested key=value data) are kept intact.
                const separatorIndex = nameValue.indexOf('=');
                const name = separatorIndex === -1 ? '' : nameValue.slice(0, separatorIndex).trim();
                if (!name) {
                    throw new WebPeelError(`Invalid cookie format: ${cookie}`);
                }
                return {
                    name,
                    value: nameValue.slice(separatorIndex + 1).trim(),
                    url,
                };
            });
            await page.context().addCookies(parsedCookies);
        }
        // For screenshots, allow all resources
        await page.route('**/*', (route) => route.continue());
        const doWork = (async () => {
            await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: timeoutMs,
            });
            if (waitMs > 0) {
                await page.waitForTimeout(waitMs);
            }
            let screenshotBuffer;
            if (actions && actions.length > 0) {
                const { executeActions } = await import('./actions.js');
                const actionScreenshot = await executeActions(page, actions, {
                    fullPage,
                    type: format,
                    quality,
                });
                if (actionScreenshot) {
                    screenshotBuffer = actionScreenshot;
                }
            }
            const finalUrl = page.url();
            // Capture screenshot if not captured via actions
            if (!screenshotBuffer) {
                screenshotBuffer = await page.screenshot({
                    fullPage,
                    type: format,
                    ...(format === 'jpeg' && typeof quality === 'number' ? { quality } : {}),
                });
            }
            return { finalUrl, screenshotBuffer };
        })();
        const timeoutPromise = new Promise((_, reject) => {
            operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
        });
        const { finalUrl, screenshotBuffer: buf } = await Promise.race([doWork, timeoutPromise]);
        return { buffer: buf, finalUrl };
    }
    catch (error) {
        if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
            throw error;
        }
        if (error instanceof Error && error.message.includes('Timeout')) {
            throw new TimeoutError('Browser screenshot timed out');
        }
        throw new NetworkError(`Browser screenshot failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    finally {
        // Cancel the watchdog on every exit path; otherwise a failed navigation
        // leaves a live timer that keeps the event loop alive for up to timeoutMs.
        clearTimeout(operationTimeout);
        if (page) {
            if (usingPooledPage) {
                await recyclePooledPage(page);
            }
            else {
                await page.close().catch(() => { });
            }
        }
        activePagesCount--;
    }
}
587
1148
  export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
588
1149
  let lastError = null;
589
1150
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
@@ -608,6 +1169,11 @@ export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
608
1169
  * Clean up browser resources
609
1170
  */
610
1171
  export async function cleanup() {
1172
+ const pagesToClose = Array.from(pooledPages);
1173
+ pooledPages.clear();
1174
+ idlePagePool.length = 0;
1175
+ pagePoolFillPromise = null;
1176
+ await Promise.all(pagesToClose.map((page) => page.close().catch(() => { })));
611
1177
  if (sharedBrowser) {
612
1178
  await sharedBrowser.close();
613
1179
  sharedBrowser = null;
@@ -616,5 +1182,6 @@ export async function cleanup() {
616
1182
  await sharedStealthBrowser.close();
617
1183
  sharedStealthBrowser = null;
618
1184
  }
1185
+ await closePool().catch(() => { });
619
1186
  }
620
1187
  //# sourceMappingURL=fetcher.js.map