webpeel 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +11 -4
- package/dist/cache.js.map +1 -1
- package/dist/cli.bundle.cjs +159248 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/core/agent.js +12 -8
- package/dist/core/agent.js.map +1 -1
- package/dist/core/application-tracker.js +3 -2
- package/dist/core/application-tracker.js.map +1 -1
- package/dist/core/auto-extract.js +6 -4
- package/dist/core/auto-extract.js.map +1 -1
- package/dist/core/browser-fetch.d.ts +90 -0
- package/dist/core/browser-fetch.d.ts.map +1 -0
- package/dist/core/browser-fetch.js +599 -0
- package/dist/core/browser-fetch.js.map +1 -0
- package/dist/core/browser-pool.d.ts +70 -0
- package/dist/core/browser-pool.d.ts.map +1 -0
- package/dist/core/browser-pool.js +378 -0
- package/dist/core/browser-pool.js.map +1 -0
- package/dist/core/change-tracking.js +3 -2
- package/dist/core/change-tracking.js.map +1 -1
- package/dist/core/diff.js +3 -2
- package/dist/core/diff.js.map +1 -1
- package/dist/core/domain-extractors.js +3 -2
- package/dist/core/domain-extractors.js.map +1 -1
- package/dist/core/extract-inline.js +6 -4
- package/dist/core/extract-inline.js.map +1 -1
- package/dist/core/fetcher.d.ts +9 -116
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +10 -1484
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/http-fetch.d.ts +37 -0
- package/dist/core/http-fetch.d.ts.map +1 -0
- package/dist/core/http-fetch.js +618 -0
- package/dist/core/http-fetch.js.map +1 -0
- package/dist/core/metadata.js +18 -12
- package/dist/core/metadata.js.map +1 -1
- package/dist/core/pipeline.d.ts +104 -0
- package/dist/core/pipeline.d.ts.map +1 -0
- package/dist/core/pipeline.js +623 -0
- package/dist/core/pipeline.js.map +1 -0
- package/dist/core/profiles.js +15 -10
- package/dist/core/profiles.js.map +1 -1
- package/dist/core/quick-answer.d.ts.map +1 -1
- package/dist/core/quick-answer.js +120 -9
- package/dist/core/quick-answer.js.map +1 -1
- package/dist/core/rate-governor.js +3 -2
- package/dist/core/rate-governor.js.map +1 -1
- package/dist/core/readability.d.ts.map +1 -1
- package/dist/core/readability.js +19 -6
- package/dist/core/readability.js.map +1 -1
- package/dist/core/research.js +9 -6
- package/dist/core/research.js.map +1 -1
- package/dist/core/search-provider.js +12 -8
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +14 -5
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/timing.d.ts +22 -0
- package/dist/core/timing.d.ts.map +1 -0
- package/dist/core/timing.js +34 -0
- package/dist/core/timing.js.map +1 -0
- package/dist/core/youtube.d.ts.map +1 -1
- package/dist/core/youtube.js +19 -6
- package/dist/core/youtube.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +13 -444
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +1 -1
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/middleware/auth.js +3 -2
- package/dist/server/middleware/auth.js.map +1 -1
- package/dist/server/routes/answer.d.ts.map +1 -1
- package/dist/server/routes/answer.js +5 -0
- package/dist/server/routes/answer.js.map +1 -1
- package/dist/server/routes/compat.js +3 -2
- package/dist/server/routes/compat.js.map +1 -1
- package/dist/server/routes/deep-fetch.d.ts.map +1 -1
- package/dist/server/routes/deep-fetch.js +5 -0
- package/dist/server/routes/deep-fetch.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +44 -4
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/health.js +3 -2
- package/dist/server/routes/health.js.map +1 -1
- package/dist/server/routes/mcp.js +1 -1
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/quick-answer.d.ts.map +1 -1
- package/dist/server/routes/quick-answer.js +5 -0
- package/dist/server/routes/quick-answer.js.map +1 -1
- package/dist/server/routes/search.js +6 -4
- package/dist/server/routes/search.js.map +1 -1
- package/dist/server/routes/users.js +3 -2
- package/dist/server/routes/users.js.map +1 -1
- package/dist/server/routes/webhooks.d.ts +1 -0
- package/dist/server/routes/webhooks.d.ts.map +1 -1
- package/dist/server/routes/webhooks.js +1 -0
- package/dist/server/routes/webhooks.js.map +1 -1
- package/dist/server/routes/youtube.d.ts.map +1 -1
- package/dist/server/routes/youtube.js +5 -0
- package/dist/server/routes/youtube.js.map +1 -1
- package/dist/types.d.ts +2 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +5 -2
package/dist/core/fetcher.js
CHANGED
|
@@ -1,1487 +1,13 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Core fetching
|
|
3
|
-
*/
|
|
4
|
-
// Force IPv4-first DNS resolution globally.
|
|
5
|
-
// Prevents IPv6 connection failures (TLS errors, timeouts) on hosts that
|
|
6
|
-
// advertise AAAA records but can't actually route IPv6 (e.g. Render containers).
|
|
7
|
-
// Must run before any network library is used.
|
|
8
|
-
import dns from 'dns';
|
|
9
|
-
dns.setDefaultResultOrder('ipv4first');
|
|
10
|
-
import { chromium } from 'playwright';
|
|
11
|
-
import { chromium as stealthChromium } from 'playwright-extra';
|
|
12
|
-
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
13
|
-
import { getRealisticUserAgent, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
|
|
14
|
-
import { fetch as undiciFetch, Agent, ProxyAgent } from 'undici';
|
|
15
|
-
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
16
|
-
import { getCached } from './cache.js';
|
|
17
|
-
import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
|
|
18
|
-
import { detectChallenge } from './challenge-detection.js';
|
|
19
|
-
// Add stealth plugin to playwright-extra
|
|
20
|
-
stealthChromium.use(StealthPlugin());
|
|
21
|
-
/**
 * Picks the user agent string to present for a request.
 *
 * Thin wrapper around getRealisticUserAgent() from the curated user-agents
 * module, so stealth mode never exposes the default "Chrome for Testing" UA,
 * which is a reliable bot-detection signal.
 *
 * @returns {string} Whatever getRealisticUserAgent() returns.
 */
function getRandomUserAgent() {
    const userAgent = getRealisticUserAgent();
    return userAgent;
}
|
|
29
|
-
/**
 * Chromium command-line switches used to reduce bot-detection signals.
 * Applied to BOTH regular and stealth browser instances.
 *
 * The switches are listed without their leading "--" and prefixed in one
 * place; the resulting array has the same entries in the same order.
 *
 * NOTE: --window-size is intentionally absent; it is added dynamically per
 * browser launch using a random realistic viewport (see getRandomViewport()).
 */
const ANTI_DETECTION_ARGS = [
    'disable-blink-features=AutomationControlled',
    'disable-infobars',
    'disable-dev-shm-usage',
    'no-sandbox',
    'disable-setuid-sandbox',
    'disable-gpu',
    'start-maximized',
    // Chrome branding / stealth hardening
    'disable-features=ChromeUserAgentDataBranding',
    'disable-component-extensions-with-background-pages',
    'disable-default-apps',
    'disable-extensions',
    'disable-hang-monitor',
    'disable-popup-blocking',
    'disable-prompt-on-repost',
    'disable-sync',
    'metrics-recording-only',
    'no-first-run',
].map((switchName) => `--${switchName}`);
|
|
55
|
-
/**
 * Draws a viewport size at random, weighted by real-world market share, so
 * the telltale Playwright default of 1280×720 is never used.
 *
 * @returns {{width: number, height: number}} The chosen viewport dimensions.
 */
function getRandomViewport() {
    // Each row is [width, height, weight]; weights are relative shares.
    const weightedSizes = [
        [1920, 1080, 35], // Full HD
        [1366, 768, 20], // Laptop
        [1536, 864, 15], // Scaled laptop
        [1440, 900, 10], // MacBook
        [1680, 1050, 8], // Large laptop
        [2560, 1440, 7], // QHD
        [1280, 800, 5], // Older laptop
    ];
    let totalWeight = 0;
    for (const [, , weight] of weightedSizes) {
        totalWeight += weight;
    }
    // Walk the table, spending the random budget until it runs out.
    let remaining = Math.random() * totalWeight;
    const chosen = weightedSizes.find(([, , weight]) => {
        remaining -= weight;
        return remaining <= 0;
    });
    if (chosen === undefined) {
        // Fallback for the case where no row exhausted the budget.
        return { width: 1920, height: 1080 };
    }
    const [width, height] = chosen;
    return { width, height };
}
|
|
79
|
-
/**
 * Registers two init scripts on a page to reduce bot-detection signals:
 *   1. Hides the `window.__pwInitScripts` Playwright leak.
 *   2. Patches `navigator.userAgentData.brands` to include "Google Chrome"
 *      (Chrome for Testing only ships "Chromium", a known detection signal).
 *
 * The scripts are passed as strings to avoid TypeScript DOM-lib requirements
 * (the tsconfig has no DOM lib).
 *
 * @param page Object exposing an async `addInitScript(source)` method.
 * @returns {Promise<void>} Resolves once both scripts have been registered.
 */
async function applyStealthScripts(page) {
    // 1. Make Playwright's __pwInitScripts marker read as undefined.
    const hidePlaywrightMarker = `
    Object.defineProperty(window, '__pwInitScripts', {
      get: () => undefined,
      set: () => {},
      configurable: true,
    });
  `;
    // 2. Replace navigator.userAgentData with a copy whose brands list
    //    contains "Google Chrome"; does nothing if it is already present.
    const patchUserAgentBrands = `
    (function () {
      var uad = navigator.userAgentData;
      if (!uad) return;
      var originalBrands = uad.brands || [];
      var hasChromeEntry = originalBrands.some(function(b) { return b.brand === 'Google Chrome'; });
      if (hasChromeEntry) return;

      var chromiumEntry = originalBrands.find(function(b) { return b.brand === 'Chromium'; });
      var version = (chromiumEntry && chromiumEntry.version) || '136';
      var patchedBrands = [
        { brand: 'Chromium', version: version },
        { brand: 'Google Chrome', version: version },
        { brand: 'Not=A?Brand', version: '99' },
      ];

      Object.defineProperty(navigator, 'userAgentData', {
        get: function() {
          return {
            brands: patchedBrands,
            mobile: false,
            platform: uad.platform || 'Windows',
            getHighEntropyValues: uad.getHighEntropyValues ? uad.getHighEntropyValues.bind(uad) : undefined,
            toJSON: function() {
              return {
                brands: patchedBrands,
                mobile: false,
                platform: uad.platform || 'Windows',
              };
            },
          };
        },
        configurable: true,
      });
    })();
  `;
    // Registered one after the other, marker script first.
    await page.addInitScript(hidePlaywrightMarker);
    await page.addInitScript(patchUserAgentBrands);
}
|
|
134
|
-
/**
 * Builds the undici Agent used as the HTTP connection pool.
 *
 * Settings: up to 20 connections, pipelining depth 6, both keep-alive
 * timeouts at 60000 ms, HTTP/2 allowed, and hostname lookups routed through
 * cachedLookup.
 *
 * @returns {Agent} A newly constructed Agent.
 */
function createHttpPool() {
    const keepAliveMs = 60000;
    const poolOptions = {
        connections: 20,
        pipelining: 6,
        keepAliveTimeout: keepAliveMs,
        keepAliveMaxTimeout: keepAliveMs,
        allowH2: true,
        connect: { lookup: cachedLookup },
    };
    return new Agent(poolOptions);
}
|
|
146
|
-
// Shared connection pool, created once at module load. simpleFetch() uses it
// as the dispatcher whenever no proxy is supplied.
// NOTE(review): declared with `let`, so it is presumably reassigned somewhere
// outside this view — confirm before changing it to `const`.
let httpPool = createHttpPool();
// Kick off DNS warmup (imported from ./dns-cache.js) at module load.
startDnsWarmup();
// Upper bound on the number of entries kept in conditionalValidatorsByUrl;
// setConditionalValidators() evicts the oldest entries beyond this.
const CONDITIONAL_CACHE_MAX_ENTRIES = 2000;
// Maps a normalized URL to its { etag, lastModified } validators. Map
// insertion order doubles as recency order (oldest entry first).
const conditionalValidatorsByUrl = new Map();
|
|
150
|
-
/**
 * Produces the cache key under which a URL's conditional validators are
 * stored.
 *
 * Normalization drops the fragment, lower-cases the hostname, removes an
 * explicit default port (80 for http, 443 for https), ensures a non-empty
 * path, and re-emits the query parameters sorted by key using localeCompare.
 *
 * @param {string} url URL to normalize.
 * @returns {string} The normalized URL, or `url.trim()` when the input
 *   cannot be parsed as a URL.
 */
function normalizeUrlForConditionalCache(url) {
    let parsed;
    try {
        parsed = new URL(url);
        parsed.hash = '';
        parsed.hostname = parsed.hostname.toLowerCase();
        const defaultPorts = { 'http:': '80', 'https:': '443' };
        if (parsed.port === defaultPorts[parsed.protocol]) {
            parsed.port = '';
        }
        if (!parsed.pathname) {
            parsed.pathname = '/';
        }
        // Snapshot the pairs before clearing the query, then re-add them in
        // key order.
        const pairs = Array.from(parsed.searchParams.entries());
        pairs.sort((left, right) => left[0].localeCompare(right[0]));
        parsed.search = '';
        pairs.forEach(([name, value]) => {
            parsed.searchParams.append(name, value);
        });
        return parsed.toString();
    }
    catch {
        return url.trim();
    }
}
|
|
174
|
-
/**
 * Looks up the stored conditional-request validators for a URL.
 *
 * A hit also moves the entry to the most-recent end of the map, so the
 * cache evicts in least-recently-used order.
 *
 * @param {string} url URL to look up (normalized before the lookup).
 * @returns {object|null} The stored validators, or null when none exist.
 */
function getConditionalValidators(url) {
    const cacheKey = normalizeUrlForConditionalCache(url);
    const entry = conditionalValidatorsByUrl.get(cacheKey);
    if (entry) {
        // LRU touch: re-insert so this key becomes the newest.
        conditionalValidatorsByUrl.delete(cacheKey);
        conditionalValidatorsByUrl.set(cacheKey, entry);
        return entry;
    }
    return null;
}
|
|
185
|
-
/**
 * Stores the conditional-request validators for a URL and enforces the
 * cache size limit.
 *
 * The entry is (re)inserted at the most-recent end of the map; entries are
 * then evicted oldest-first until the map holds at most
 * CONDITIONAL_CACHE_MAX_ENTRIES items.
 *
 * @param {string} url URL the validators belong to (normalized before use).
 * @param {object} validators Validators to store (e.g. { etag, lastModified }).
 * @returns {void}
 */
function setConditionalValidators(url, validators) {
    const key = normalizeUrlForConditionalCache(url);
    // Delete first so re-setting an existing key moves it to the newest
    // position; Map#delete is a no-op when the key is absent.
    conditionalValidatorsByUrl.delete(key);
    conditionalValidatorsByUrl.set(key, validators);
    while (conditionalValidatorsByUrl.size > CONDITIONAL_CACHE_MAX_ENTRIES) {
        const oldest = conditionalValidatorsByUrl.keys().next();
        // Stop on iterator exhaustion, not on a falsy key: the empty string
        // is a legitimate key (normalization of an unparseable, blank URL)
        // and must remain evictable, otherwise the cache grows without bound.
        if (oldest.done) {
            break;
        }
        conditionalValidatorsByUrl.delete(oldest.value);
    }
}
|
|
199
|
-
/**
 * Records the ETag / Last-Modified validators of a response.
 *
 * Nothing is stored when the response carries neither header.
 *
 * @param {string} url URL the response was fetched from.
 * @param response Object whose `headers.get(name)` returns a header value.
 * @returns {void}
 */
function rememberConditionalValidators(url, response) {
    const { headers } = response;
    // Empty or missing header values are stored as undefined.
    const validators = {
        etag: headers.get('etag') || undefined,
        lastModified: headers.get('last-modified') || undefined,
    };
    if (validators.etag || validators.lastModified) {
        setConditionalValidators(url, validators);
    }
}
|
|
207
|
-
/**
 * Reports whether a plain headers object contains a given header, comparing
 * names case-insensitively.
 *
 * @param {object} headers Object whose own enumerable keys are header names.
 * @param {string} name Header name to look for.
 * @returns {boolean} True when some key equals `name` ignoring case.
 */
function hasHeader(headers, name) {
    const wanted = name.toLowerCase();
    for (const headerName of Object.keys(headers)) {
        if (headerName.toLowerCase() === wanted) {
            return true;
        }
    }
    return false;
}
|
|
211
|
-
/**
 * Builds the result returned for an HTTP 304 from a previously cached
 * response.
 *
 * The cache is consulted for `url` first and, on a miss, for `fallbackUrl`
 * when one is given.
 *
 * @param {string} url URL that answered 304.
 * @param {string} [fallbackUrl] Alternative cache key to try on a miss.
 * @returns {object|null} A result with statusCode 304 built from the cached
 *   entry, or null when nothing is cached.
 */
function getCachedResultFor304(url, fallbackUrl) {
    let entry = getCached(url);
    if (!entry && fallbackUrl) {
        entry = getCached(fallbackUrl);
    }
    if (!entry) {
        return null;
    }
    const { html, buffer, contentType, screenshot } = entry;
    return {
        html,
        buffer,
        // Prefer the URL recorded with the cached entry.
        url: entry.url || url,
        statusCode: 304,
        contentType,
        screenshot,
    };
}
|
|
225
|
-
/**
 * Creates the error thrown when an operation is cancelled.
 *
 * @returns {Error} An Error with message 'Operation aborted' and name
 *   'AbortError'.
 */
function createAbortError() {
    return Object.assign(new Error('Operation aborted'), { name: 'AbortError' });
}
|
|
230
|
-
/**
 * SECURITY: Validate a URL to prevent SSRF attacks.
 *
 * Rejects over-long URLs, control characters, unparseable input, non-HTTP(S)
 * schemes, localhost names, and IP-literal hosts that parseAndValidateIPv4 /
 * validateIPv4Address / validateIPv6Address refuse.
 *
 * The hostname is checked with any trailing dots removed: the URL parser
 * keeps the root dot of a fully-qualified name ("localhost."), and that form
 * would otherwise slip past the localhost comparison.
 *
 * @param {string} urlString URL to validate.
 * @returns {void}
 * @throws {WebPeelError} When the URL is malformed or targets a blocked host.
 */
function validateUrl(urlString) {
    // Length check
    if (urlString.length > 2048) {
        throw new WebPeelError('URL too long (max 2048 characters)');
    }
    // Check for control characters and suspicious encoding
    if (/[\x00-\x1F\x7F]/.test(urlString)) {
        throw new WebPeelError('URL contains invalid control characters');
    }
    let url;
    try {
        url = new URL(urlString);
    }
    catch {
        throw new WebPeelError('Invalid URL format');
    }
    // Only allow HTTP(S)
    if (!['http:', 'https:'].includes(url.protocol)) {
        throw new WebPeelError('Only HTTP and HTTPS protocols are allowed');
    }
    // Canonicalize before any comparison: lower-case and drop the trailing
    // root dot(s) so "localhost." is treated exactly like "localhost".
    const hostname = url.hostname.toLowerCase().replace(/\.+$/, '');
    // Validate hostname is not empty (also catches a host made only of dots)
    if (!hostname) {
        throw new WebPeelError('Invalid hostname');
    }
    // Block localhost patterns
    const localhostPatterns = ['localhost', '0.0.0.0'];
    if (localhostPatterns.some(pattern => hostname === pattern || hostname.endsWith('.' + pattern))) {
        throw new WebPeelError('Access to localhost is not allowed');
    }
    // Parse and validate IPv4 literals (handles hex, octal, decimal, mixed)
    const ipv4Info = parseAndValidateIPv4(hostname);
    if (ipv4Info) {
        validateIPv4Address(ipv4Info);
    }
    // A colon can only appear in an IPv6 literal host
    if (hostname.includes(':')) {
        validateIPv6Address(hostname);
    }
}
|
|
274
|
-
/**
 * Splits a 32-bit unsigned integer into its four IPv4 octets.
 * Callers must ensure `num` is within 0..0xffffffff.
 */
function ipv4NumberToOctets(num) {
    return [
        (num >>> 24) & 0xff,
        (num >>> 16) & 0xff,
        (num >>> 8) & 0xff,
        num & 0xff,
    ];
}
/**
 * Parse an IPv4 address written in any common notation: dotted decimal,
 * single hex number, single octal number, mixed octal/decimal dotted, or a
 * single decimal number. Surrounding square brackets are ignored.
 *
 * @param {string} hostname Host string to inspect.
 * @returns {number[]|null} The four octets, or null when the input is not
 *   recognized as an IPv4 address.
 * @throws {WebPeelError} When a dotted-decimal address has an octet above
 *   255, or a hex literal does not fit in 32 bits.
 */
function parseAndValidateIPv4(hostname) {
    // Remove brackets if present
    const cleaned = hostname.replace(/^\[|\]$/g, '');
    // Standard dotted notation: 192.168.1.1
    const dottedMatch = cleaned.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
    if (dottedMatch) {
        const octets = dottedMatch.slice(1).map(Number);
        if (octets.every(o => o >= 0 && o <= 255)) {
            return octets;
        }
        throw new WebPeelError('Invalid IPv4 address');
    }
    // Hex notation: 0x7f000001
    if (/^0x[0-9a-fA-F]+$/.test(cleaned)) {
        const num = Number.parseInt(cleaned, 16);
        // The unsigned shifts used to split the number wrap modulo 2^32, so
        // an oversized literal would silently decode to an unrelated address.
        if (num > 0xffffffff) {
            throw new WebPeelError('Invalid IPv4 address');
        }
        return ipv4NumberToOctets(num);
    }
    // Octal notation: 0177.0.0.1 or full octal 017700000001
    if (/^0[0-7]/.test(cleaned)) {
        // Full octal (all digits)
        if (/^0[0-7]+$/.test(cleaned)) {
            const num = Number.parseInt(cleaned, 8);
            if (num <= 0xffffffff) {
                return ipv4NumberToOctets(num);
            }
        }
        // Mixed octal-decimal: 0177.0.0.1
        const parts = cleaned.split('.');
        if (parts.length === 4) {
            const octets = parts.map(p => Number.parseInt(p, /^0[0-7]/.test(p) ? 8 : 10));
            // NaN fails both comparisons, so unparseable parts are rejected here.
            if (octets.every(o => o >= 0 && o <= 255)) {
                return octets;
            }
        }
    }
    // Decimal notation: 2130706433
    if (/^\d+$/.test(cleaned)) {
        const num = Number.parseInt(cleaned, 10);
        if (num <= 0xffffffff) {
            return ipv4NumberToOctets(num);
        }
    }
    return null;
}
|
|
338
|
-
/**
 * Validate an IPv4 address against private and special-purpose ranges.
 *
 * Blocked ranges: loopback (127/8), private (10/8, 172.16/12, 192.168/16),
 * shared address space / CGNAT (100.64/10), link-local (169.254/16),
 * multicast (224/4), limited broadcast (255.255.255.255), the rest of the
 * reserved 240/4 block, and "this network" (0/8).
 *
 * @param {number[]} octets The four octets of the address, each 0..255.
 * @returns {void}
 * @throws {WebPeelError} When the address falls in a blocked range.
 */
function validateIPv4Address(octets) {
    const [a, b, c, d] = octets;
    // Loopback: 127.0.0.0/8
    if (a === 127) {
        throw new WebPeelError('Access to loopback addresses is not allowed');
    }
    // Private: 10.0.0.0/8
    if (a === 10) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Private: 172.16.0.0/12
    if (a === 172 && b >= 16 && b <= 31) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Private: 192.168.0.0/16
    if (a === 192 && b === 168) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Shared address space (carrier-grade NAT, RFC 6598): 100.64.0.0/10.
    // Not publicly routable; it fronts internal services on some networks.
    if (a === 100 && b >= 64 && b <= 127) {
        throw new WebPeelError('Access to shared address space (CGNAT) is not allowed');
    }
    // Link-local: 169.254.0.0/16
    if (a === 169 && b === 254) {
        throw new WebPeelError('Access to link-local addresses is not allowed');
    }
    // Multicast: 224.0.0.0/4
    if (a >= 224 && a <= 239) {
        throw new WebPeelError('Access to multicast addresses is not allowed');
    }
    // Broadcast: 255.255.255.255 (checked before the reserved block so the
    // more specific message is kept)
    if (a === 255 && b === 255 && c === 255 && d === 255) {
        throw new WebPeelError('Access to broadcast address is not allowed');
    }
    // Reserved for future use: 240.0.0.0/4
    if (a >= 240) {
        throw new WebPeelError('Access to reserved addresses is not allowed');
    }
    // This network: 0.0.0.0/8
    if (a === 0) {
        throw new WebPeelError('Access to "this network" addresses is not allowed');
    }
}
|
|
372
|
-
/**
 * Validate an IPv6 address against private and special-purpose ranges.
 *
 * Blocked: the unspecified address (::), loopback (::1), every IPv4-mapped
 * address (::ffff:...), unique local addresses (fc00::/7) and link-local
 * addresses (fe80::/10). For IPv4-mapped addresses the embedded IPv4 address
 * is decoded and passed to validateIPv4Address first, so the more specific
 * error (loopback, private, ...) is reported when one applies.
 *
 * @param {string} hostname IPv6 literal, with or without square brackets.
 * @returns {void}
 * @throws {WebPeelError} When the address falls in a blocked range.
 */
function validateIPv6Address(hostname) {
    // Remove brackets
    const addr = hostname.replace(/^\[|\]$/g, '').toLowerCase();
    // Unspecified: ::
    if (addr === '::' || addr === '0:0:0:0:0:0:0:0') {
        throw new WebPeelError('Access to unspecified addresses is not allowed');
    }
    // Loopback: ::1
    if (addr === '::1' || addr === '0:0:0:0:0:0:0:1') {
        throw new WebPeelError('Access to loopback addresses is not allowed');
    }
    // IPv6 mapped IPv4: ::ffff:192.168.1.1 or ::ffff:c0a8:101
    if (addr.startsWith('::ffff:')) {
        const ipv4Part = addr.substring(7);
        let octets = null;
        if (ipv4Part.includes('.')) {
            // Dotted form: ::ffff:192.168.1.1
            const parts = ipv4Part.split('.');
            if (parts.length === 4) {
                const parsed = parts.map(p => Number.parseInt(p, 10));
                if (parsed.every(o => !Number.isNaN(o) && o >= 0 && o <= 255)) {
                    octets = parsed;
                }
            }
        }
        else {
            // Hex form: two 16-bit groups. Each group is decoded on its own
            // because leading zeros may be omitted ("c0a8:101" is
            // c0a8:0101); joining the raw digits would shift the value.
            const groups = ipv4Part.split(':');
            if (groups.length === 2 && groups.every(g => /^[0-9a-f]{1,4}$/.test(g))) {
                const [high, low] = groups.map(g => Number.parseInt(g, 16));
                octets = [high >>> 8, high & 0xff, low >>> 8, low & 0xff];
            }
        }
        if (octets) {
            validateIPv4Address(octets);
        }
        // Mapped addresses are refused even when the embedded IPv4 is public.
        throw new WebPeelError('Access to IPv6-mapped IPv4 addresses is not allowed');
    }
    // Unique local addresses: fc00::/7 (fc00:: to fdff::)
    if (addr.startsWith('fc') || addr.startsWith('fd')) {
        throw new WebPeelError('Access to unique local IPv6 addresses is not allowed');
    }
    // Link-local: fe80::/10
    if (addr.startsWith('fe8') || addr.startsWith('fe9') ||
        addr.startsWith('fea') || addr.startsWith('feb')) {
        throw new WebPeelError('Access to link-local IPv6 addresses is not allowed');
    }
}
|
|
423
|
-
/**
 * Checks a caller-supplied user agent string and hands it back unchanged.
 *
 * @param {string} userAgent User agent to check.
 * @returns {string} The same string when it is acceptable.
 * @throws {WebPeelError} When the string is longer than 500 characters or
 *   contains anything other than printable ASCII (0x20-0x7E).
 */
function validateUserAgent(userAgent) {
    const tooLong = userAgent.length > 500;
    if (tooLong) {
        throw new WebPeelError('User agent too long (max 500 characters)');
    }
    // Any single character outside printable ASCII disqualifies the string.
    const hasForbiddenCharacter = /[^\x20-\x7E]/.test(userAgent);
    if (hasForbiddenCharacter) {
        throw new WebPeelError('User agent contains invalid characters');
    }
    return userAgent;
}
|
|
436
|
-
/**
|
|
437
|
-
* Simple HTTP fetch using native fetch + Cheerio
|
|
438
|
-
* Fast and lightweight, but can be blocked by Cloudflare/bot detection
|
|
439
|
-
* SECURITY: Manual redirect handling with SSRF re-validation
|
|
440
|
-
*/
|
|
441
|
-
export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders, abortSignal, proxy) {
|
|
442
|
-
// SECURITY: Validate URL to prevent SSRF
|
|
443
|
-
validateUrl(url);
|
|
444
|
-
if (abortSignal?.aborted) {
|
|
445
|
-
throw createAbortError();
|
|
446
|
-
}
|
|
447
|
-
// Validate user agent if provided
|
|
448
|
-
// SEC.gov requires a User-Agent with contact info (their documented automated access policy)
|
|
449
|
-
const hostname = new URL(url).hostname.toLowerCase();
|
|
450
|
-
const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
|
|
451
|
-
const validatedUserAgent = isSecGov
|
|
452
|
-
? 'WebPeel/1.0 (support@webpeel.dev)'
|
|
453
|
-
: (userAgent ? validateUserAgent(userAgent) : getRandomUserAgent());
|
|
454
|
-
// SECURITY: Merge custom headers with defaults, block Host header override
|
|
455
|
-
const defaultHeaders = {
|
|
456
|
-
'User-Agent': validatedUserAgent,
|
|
457
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
|
458
|
-
'Accept-Language': 'en-US,en;q=0.9',
|
|
459
|
-
'Accept-Encoding': 'br, gzip, deflate',
|
|
460
|
-
'DNT': '1',
|
|
461
|
-
'Connection': 'keep-alive',
|
|
462
|
-
'Upgrade-Insecure-Requests': '1',
|
|
463
|
-
'Sec-CH-UA': getSecCHUA(validatedUserAgent),
|
|
464
|
-
'Sec-CH-UA-Mobile': '?0',
|
|
465
|
-
'Sec-CH-UA-Platform': getSecCHUAPlatform(validatedUserAgent),
|
|
466
|
-
'Sec-Fetch-Dest': 'document',
|
|
467
|
-
'Sec-Fetch-Mode': 'navigate',
|
|
468
|
-
'Sec-Fetch-Site': 'none',
|
|
469
|
-
'Sec-Fetch-User': '?1',
|
|
470
|
-
'Cache-Control': 'max-age=0',
|
|
471
|
-
'Priority': 'u=0, i',
|
|
472
|
-
};
|
|
473
|
-
const mergedHeaders = { ...defaultHeaders };
|
|
474
|
-
if (customHeaders) {
|
|
475
|
-
for (const [key, value] of Object.entries(customHeaders)) {
|
|
476
|
-
// SECURITY: Block Host header override
|
|
477
|
-
if (key.toLowerCase() === 'host') {
|
|
478
|
-
throw new WebPeelError('Custom Host header is not allowed');
|
|
479
|
-
}
|
|
480
|
-
mergedHeaders[key] = value;
|
|
481
|
-
}
|
|
482
|
-
}
|
|
483
|
-
const MAX_REDIRECTS = 10;
|
|
484
|
-
let redirectCount = 0;
|
|
485
|
-
let currentUrl = url;
|
|
486
|
-
const seenUrls = new Set();
|
|
487
|
-
try {
|
|
488
|
-
const hostname = new URL(url).hostname;
|
|
489
|
-
void resolveAndCache(hostname).catch(() => {
|
|
490
|
-
// Best-effort optimization only.
|
|
491
|
-
});
|
|
492
|
-
}
|
|
493
|
-
catch {
|
|
494
|
-
// Ignore URL parsing errors here; validation handles invalid input below.
|
|
495
|
-
}
|
|
496
|
-
while (redirectCount <= MAX_REDIRECTS) {
|
|
497
|
-
// Detect redirect loops
|
|
498
|
-
if (seenUrls.has(currentUrl)) {
|
|
499
|
-
throw new WebPeelError('Redirect loop detected');
|
|
500
|
-
}
|
|
501
|
-
seenUrls.add(currentUrl);
|
|
502
|
-
// Re-validate on each redirect
|
|
503
|
-
validateUrl(currentUrl);
|
|
504
|
-
const timeoutController = new AbortController();
|
|
505
|
-
const timer = setTimeout(() => timeoutController.abort(), timeoutMs);
|
|
506
|
-
const signal = abortSignal
|
|
507
|
-
? AbortSignal.any([timeoutController.signal, abortSignal])
|
|
508
|
-
: timeoutController.signal;
|
|
509
|
-
try {
|
|
510
|
-
const requestHeaders = { ...mergedHeaders };
|
|
511
|
-
const validators = getConditionalValidators(currentUrl);
|
|
512
|
-
if (validators?.etag && !hasHeader(requestHeaders, 'if-none-match')) {
|
|
513
|
-
requestHeaders['If-None-Match'] = validators.etag;
|
|
514
|
-
}
|
|
515
|
-
if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
|
|
516
|
-
requestHeaders['If-Modified-Since'] = validators.lastModified;
|
|
517
|
-
}
|
|
518
|
-
// Use proxy if provided, otherwise use shared connection pool
|
|
519
|
-
const dispatcher = proxy ? new ProxyAgent(proxy) : httpPool;
|
|
520
|
-
const response = await undiciFetch(currentUrl, {
|
|
521
|
-
headers: requestHeaders,
|
|
522
|
-
signal,
|
|
523
|
-
dispatcher,
|
|
524
|
-
redirect: 'manual', // SECURITY: Manual redirect handling
|
|
525
|
-
});
|
|
526
|
-
clearTimeout(timer);
|
|
527
|
-
if (response.status === 304) {
|
|
528
|
-
const cachedResult = getCachedResultFor304(currentUrl, url);
|
|
529
|
-
if (cachedResult) {
|
|
530
|
-
return cachedResult;
|
|
531
|
-
}
|
|
532
|
-
throw new NetworkError('HTTP 304 received but no cached response is available');
|
|
533
|
-
}
|
|
534
|
-
// Handle redirects manually
|
|
535
|
-
if (response.status >= 300 && response.status < 400) {
|
|
536
|
-
const location = response.headers.get('location');
|
|
537
|
-
if (!location) {
|
|
538
|
-
throw new NetworkError('Redirect response missing Location header');
|
|
539
|
-
}
|
|
540
|
-
// Resolve relative URLs
|
|
541
|
-
currentUrl = new URL(location, currentUrl).href;
|
|
542
|
-
try {
|
|
543
|
-
const hostname = new URL(currentUrl).hostname;
|
|
544
|
-
void resolveAndCache(hostname).catch(() => {
|
|
545
|
-
// Best-effort optimization only.
|
|
546
|
-
});
|
|
547
|
-
}
|
|
548
|
-
catch {
|
|
549
|
-
// Ignore URL parsing errors here; validation handles invalid input below.
|
|
550
|
-
}
|
|
551
|
-
redirectCount++;
|
|
552
|
-
continue;
|
|
553
|
-
}
|
|
554
|
-
if (!response.ok) {
|
|
555
|
-
if (response.status === 403 || response.status === 503) {
|
|
556
|
-
throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
|
|
557
|
-
}
|
|
558
|
-
throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`);
|
|
559
|
-
}
|
|
560
|
-
rememberConditionalValidators(currentUrl, response);
|
|
561
|
-
// Content-Type detection
|
|
562
|
-
const contentType = response.headers.get('content-type') || '';
|
|
563
|
-
const contentTypeLower = contentType.toLowerCase();
|
|
564
|
-
const urlLower = currentUrl.toLowerCase();
|
|
565
|
-
// Support binary documents (PDF/DOCX) in the simple HTTP path.
|
|
566
|
-
const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
|
|
567
|
-
const isDocx = contentTypeLower.includes('application/vnd.openxmlformats-officedocument.wordprocessingml.document') || urlLower.endsWith('.docx');
|
|
568
|
-
const isBinaryDoc = isPdf || isDocx;
|
|
569
|
-
// Accept a wide range of text-based content, plus supported binary documents.
|
|
570
|
-
const ALLOWED_TYPES = [
|
|
571
|
-
'text/html', 'application/xhtml+xml',
|
|
572
|
-
'text/plain', 'text/markdown', 'text/csv',
|
|
573
|
-
'application/json', 'text/json',
|
|
574
|
-
'text/xml', 'application/xml', 'application/rss+xml', 'application/atom+xml',
|
|
575
|
-
'application/javascript', 'text/javascript', 'text/css',
|
|
576
|
-
// Documents
|
|
577
|
-
'application/pdf',
|
|
578
|
-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
579
|
-
];
|
|
580
|
-
const isAllowed = !contentTypeLower ||
|
|
581
|
-
ALLOWED_TYPES.some(t => contentTypeLower.includes(t)) ||
|
|
582
|
-
// Many servers mislabel docs as octet-stream; allow when URL implies a supported document.
|
|
583
|
-
(contentTypeLower.includes('application/octet-stream') && isBinaryDoc);
|
|
584
|
-
if (!isAllowed) {
|
|
585
|
-
// Check if it's at least text-based
|
|
586
|
-
const isTexty = contentTypeLower.startsWith('text/') ||
|
|
587
|
-
contentTypeLower.includes('json') ||
|
|
588
|
-
contentTypeLower.includes('xml');
|
|
589
|
-
if (!isTexty) {
|
|
590
|
-
throw new WebPeelError(`Binary content type: ${contentType}. WebPeel handles text-based content and PDF/DOCX documents only.`);
|
|
591
|
-
}
|
|
592
|
-
}
|
|
593
|
-
// SECURITY: Stream response with size limit (prevent memory exhaustion)
|
|
594
|
-
const chunks = [];
|
|
595
|
-
let totalSize = 0;
|
|
596
|
-
const MAX_SIZE = 10 * 1024 * 1024; // 10MB
|
|
597
|
-
const reader = response.body?.getReader();
|
|
598
|
-
if (!reader) {
|
|
599
|
-
throw new NetworkError('Response body is not readable');
|
|
600
|
-
}
|
|
601
|
-
try {
|
|
602
|
-
while (true) {
|
|
603
|
-
const { done, value } = await reader.read();
|
|
604
|
-
if (done)
|
|
605
|
-
break;
|
|
606
|
-
totalSize += value.length;
|
|
607
|
-
if (totalSize > MAX_SIZE) {
|
|
608
|
-
reader.cancel();
|
|
609
|
-
throw new WebPeelError('Response too large (max 10MB)');
|
|
610
|
-
}
|
|
611
|
-
chunks.push(value);
|
|
612
|
-
}
|
|
613
|
-
}
|
|
614
|
-
finally {
|
|
615
|
-
reader.releaseLock();
|
|
616
|
-
}
|
|
617
|
-
// Combine chunks
|
|
618
|
-
const combined = new Uint8Array(totalSize);
|
|
619
|
-
let offset = 0;
|
|
620
|
-
for (const chunk of chunks) {
|
|
621
|
-
combined.set(chunk, offset);
|
|
622
|
-
offset += chunk.length;
|
|
623
|
-
}
|
|
624
|
-
const buffer = Buffer.from(combined);
|
|
625
|
-
const html = isBinaryDoc ? '' : new TextDecoder().decode(combined);
|
|
626
|
-
// For HTML content, check for suspiciously small responses (bot blocks)
|
|
627
|
-
// Non-HTML content (JSON, text, XML) can legitimately be short
|
|
628
|
-
const isHtmlContent = !isBinaryDoc && (contentTypeLower.includes('html') || contentTypeLower.includes('xhtml'));
|
|
629
|
-
if (isHtmlContent && (!html || html.length < 100)) {
|
|
630
|
-
throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
|
|
631
|
-
}
|
|
632
|
-
if (!isBinaryDoc && !html) {
|
|
633
|
-
throw new NetworkError('Empty response body');
|
|
634
|
-
}
|
|
635
|
-
if (isBinaryDoc && buffer.length === 0) {
|
|
636
|
-
throw new NetworkError('Empty response body');
|
|
637
|
-
}
|
|
638
|
-
// Check for Cloudflare challenge (only relevant for HTML)
|
|
639
|
-
if (isHtmlContent && (html.includes('cf-browser-verification') || html.includes('Just a moment...'))) {
|
|
640
|
-
throw new BlockedError('Cloudflare challenge detected. Try --render for browser mode.');
|
|
641
|
-
}
|
|
642
|
-
// Run full challenge detection for HTML content
|
|
643
|
-
// Note: skip empty-shell type — in simple HTTP mode, SPA shells are expected and
|
|
644
|
-
// the caller's escalation logic upgrades to browser/stealth rendering.
|
|
645
|
-
if (isHtmlContent) {
|
|
646
|
-
const challengeResult = detectChallenge(html, response.status);
|
|
647
|
-
if (challengeResult.isChallenge && challengeResult.type !== 'empty-shell') {
|
|
648
|
-
throw new BlockedError(`Challenge page detected (${challengeResult.type || 'unknown'}, confidence: ${challengeResult.confidence.toFixed(2)}). ` +
|
|
649
|
-
`Site requires human verification. Try a different approach or use a CAPTCHA solving service.`);
|
|
650
|
-
}
|
|
651
|
-
}
|
|
652
|
-
return {
|
|
653
|
-
html,
|
|
654
|
-
buffer: isBinaryDoc ? buffer : undefined,
|
|
655
|
-
url: currentUrl,
|
|
656
|
-
statusCode: response.status,
|
|
657
|
-
contentType,
|
|
658
|
-
};
|
|
659
|
-
}
|
|
660
|
-
catch (error) {
|
|
661
|
-
clearTimeout(timer);
|
|
662
|
-
if (error instanceof BlockedError || error instanceof NetworkError || error instanceof WebPeelError) {
|
|
663
|
-
throw error;
|
|
664
|
-
}
|
|
665
|
-
if (error instanceof Error && error.name === 'AbortError') {
|
|
666
|
-
if (abortSignal?.aborted && !timeoutController.signal.aborted) {
|
|
667
|
-
throw createAbortError();
|
|
668
|
-
}
|
|
669
|
-
throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
|
|
670
|
-
}
|
|
671
|
-
// Provide specific error messages based on the actual cause
|
|
672
|
-
const cause = error instanceof Error && error.cause;
|
|
673
|
-
const causeMsg = cause?.message || cause?.code || '';
|
|
674
|
-
if (causeMsg.includes('certificate') || causeMsg.includes('CERT') || causeMsg.includes('SSL') || causeMsg.includes('TLS')) {
|
|
675
|
-
throw new NetworkError(`TLS/SSL certificate error for ${new URL(currentUrl).hostname}. The site's certificate may be expired, self-signed, or untrusted.`);
|
|
676
|
-
}
|
|
677
|
-
if (causeMsg.includes('ENOTFOUND') || causeMsg.includes('getaddrinfo')) {
|
|
678
|
-
throw new NetworkError(`DNS resolution failed: ${new URL(currentUrl).hostname} not found. Check the URL or your network connection.`);
|
|
679
|
-
}
|
|
680
|
-
if (causeMsg.includes('ECONNREFUSED')) {
|
|
681
|
-
throw new NetworkError(`Connection refused by ${new URL(currentUrl).hostname}. The server may be down.`);
|
|
682
|
-
}
|
|
683
|
-
if (causeMsg.includes('ECONNRESET') || causeMsg.includes('EPIPE')) {
|
|
684
|
-
throw new NetworkError(`Connection reset by ${new URL(currentUrl).hostname}. Try again or use --render.`);
|
|
685
|
-
}
|
|
686
|
-
if (causeMsg.includes('ETIMEDOUT') || causeMsg.includes('ENETUNREACH')) {
|
|
687
|
-
throw new TimeoutError(`Network unreachable or connection timed out for ${new URL(currentUrl).hostname}.`);
|
|
688
|
-
}
|
|
689
|
-
const msg = error instanceof Error ? error.message : 'Unknown error';
|
|
690
|
-
const causeDetail = causeMsg ? ` (${causeMsg})` : '';
|
|
691
|
-
throw new NetworkError(`Failed to fetch: ${msg}${causeDetail}`);
|
|
692
|
-
}
|
|
693
|
-
}
|
|
694
|
-
throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
|
|
695
|
-
}
|
|
696
|
-
export async function closePool() {
|
|
697
|
-
const oldPool = httpPool;
|
|
698
|
-
httpPool = createHttpPool();
|
|
699
|
-
await oldPool.close().catch(() => { });
|
|
700
|
-
}
|
|
701
|
-
let sharedBrowser = null;
|
|
702
|
-
let sharedStealthBrowser = null;
|
|
703
|
-
let activePagesCount = 0;
|
|
704
|
-
const MAX_CONCURRENT_PAGES = 5;
|
|
705
|
-
const PAGE_POOL_SIZE = 3;
|
|
706
|
-
const pooledPages = new Set();
|
|
707
|
-
const idlePagePool = [];
|
|
708
|
-
let pagePoolFillPromise = null;
|
|
709
|
-
function removePooledPage(page) {
|
|
710
|
-
pooledPages.delete(page);
|
|
711
|
-
const idleIndex = idlePagePool.indexOf(page);
|
|
712
|
-
if (idleIndex >= 0) {
|
|
713
|
-
idlePagePool.splice(idleIndex, 1);
|
|
714
|
-
}
|
|
715
|
-
}
|
|
716
|
-
function takePooledPage() {
|
|
717
|
-
while (idlePagePool.length > 0) {
|
|
718
|
-
const page = idlePagePool.shift();
|
|
719
|
-
if (page.isClosed()) {
|
|
720
|
-
removePooledPage(page);
|
|
721
|
-
continue;
|
|
722
|
-
}
|
|
723
|
-
return page;
|
|
724
|
-
}
|
|
725
|
-
return null;
|
|
726
|
-
}
|
|
727
|
-
async function ensurePagePool(browser) {
|
|
728
|
-
const activeBrowser = browser ?? sharedBrowser;
|
|
729
|
-
if (!activeBrowser || !activeBrowser.isConnected()) {
|
|
730
|
-
return;
|
|
731
|
-
}
|
|
732
|
-
if (pagePoolFillPromise) {
|
|
733
|
-
await pagePoolFillPromise;
|
|
734
|
-
return;
|
|
735
|
-
}
|
|
736
|
-
pagePoolFillPromise = (async () => {
|
|
737
|
-
while (pooledPages.size < PAGE_POOL_SIZE) {
|
|
738
|
-
const pooledPage = await activeBrowser.newPage({
|
|
739
|
-
userAgent: getRandomUserAgent(),
|
|
740
|
-
viewport: null, // Use browser window size (set via --window-size at launch)
|
|
741
|
-
});
|
|
742
|
-
await applyStealthScripts(pooledPage);
|
|
743
|
-
pooledPages.add(pooledPage);
|
|
744
|
-
idlePagePool.push(pooledPage);
|
|
745
|
-
}
|
|
746
|
-
})().finally(() => {
|
|
747
|
-
pagePoolFillPromise = null;
|
|
748
|
-
});
|
|
749
|
-
await pagePoolFillPromise;
|
|
750
|
-
}
|
|
751
|
-
async function recyclePooledPage(page) {
|
|
752
|
-
if (!pooledPages.has(page)) {
|
|
753
|
-
await page.close().catch(() => { });
|
|
754
|
-
return;
|
|
755
|
-
}
|
|
756
|
-
if (page.isClosed()) {
|
|
757
|
-
removePooledPage(page);
|
|
758
|
-
if (sharedBrowser?.isConnected()) {
|
|
759
|
-
void ensurePagePool(sharedBrowser).catch(() => { });
|
|
760
|
-
}
|
|
761
|
-
return;
|
|
762
|
-
}
|
|
763
|
-
try {
|
|
764
|
-
await page.unroute('**/*').catch(() => { });
|
|
765
|
-
await page.context().clearCookies().catch(() => { });
|
|
766
|
-
await page.setExtraHTTPHeaders({});
|
|
767
|
-
await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 }).catch(() => { });
|
|
768
|
-
if (!idlePagePool.includes(page)) {
|
|
769
|
-
idlePagePool.push(page);
|
|
770
|
-
}
|
|
771
|
-
}
|
|
772
|
-
catch {
|
|
773
|
-
removePooledPage(page);
|
|
774
|
-
await page.close().catch(() => { });
|
|
775
|
-
}
|
|
776
|
-
if (sharedBrowser?.isConnected() && pooledPages.size < PAGE_POOL_SIZE) {
|
|
777
|
-
void ensurePagePool(sharedBrowser).catch(() => { });
|
|
778
|
-
}
|
|
779
|
-
}
|
|
780
|
-
export async function warmup() {
|
|
781
|
-
startDnsWarmup();
|
|
782
|
-
const browser = await getBrowser();
|
|
783
|
-
await ensurePagePool(browser);
|
|
784
|
-
}
|
|
785
|
-
async function getBrowser() {
|
|
786
|
-
// SECURITY: Check if browser is still connected and healthy
|
|
787
|
-
if (sharedBrowser) {
|
|
788
|
-
try {
|
|
789
|
-
if (sharedBrowser.isConnected()) {
|
|
790
|
-
if (pooledPages.size < PAGE_POOL_SIZE) {
|
|
791
|
-
void ensurePagePool(sharedBrowser).catch(() => { });
|
|
792
|
-
}
|
|
793
|
-
return sharedBrowser;
|
|
794
|
-
}
|
|
795
|
-
}
|
|
796
|
-
catch {
|
|
797
|
-
// Browser is dead, recreate
|
|
798
|
-
sharedBrowser = null;
|
|
799
|
-
}
|
|
800
|
-
}
|
|
801
|
-
pooledPages.clear();
|
|
802
|
-
idlePagePool.length = 0;
|
|
803
|
-
pagePoolFillPromise = null;
|
|
804
|
-
const vp = getRandomViewport();
|
|
805
|
-
sharedBrowser = await chromium.launch({
|
|
806
|
-
headless: true,
|
|
807
|
-
args: [...ANTI_DETECTION_ARGS, `--window-size=${vp.width},${vp.height}`],
|
|
808
|
-
});
|
|
809
|
-
void ensurePagePool(sharedBrowser).catch(() => { });
|
|
810
|
-
return sharedBrowser;
|
|
811
|
-
}
|
|
812
|
-
async function getStealthBrowser() {
|
|
813
|
-
// SECURITY: Check if stealth browser is still connected and healthy
|
|
814
|
-
if (sharedStealthBrowser) {
|
|
815
|
-
try {
|
|
816
|
-
if (sharedStealthBrowser.isConnected()) {
|
|
817
|
-
return sharedStealthBrowser;
|
|
818
|
-
}
|
|
819
|
-
}
|
|
820
|
-
catch {
|
|
821
|
-
// Browser is dead, recreate
|
|
822
|
-
sharedStealthBrowser = null;
|
|
823
|
-
}
|
|
824
|
-
}
|
|
825
|
-
const stealthVp = getRandomViewport();
|
|
826
|
-
const stealthBrowser = await stealthChromium.launch({
|
|
827
|
-
headless: true,
|
|
828
|
-
args: [...ANTI_DETECTION_ARGS, `--window-size=${stealthVp.width},${stealthVp.height}`],
|
|
829
|
-
});
|
|
830
|
-
if (!stealthBrowser)
|
|
831
|
-
throw new Error('Failed to launch stealth browser');
|
|
832
|
-
sharedStealthBrowser = stealthBrowser;
|
|
833
|
-
return stealthBrowser;
|
|
834
|
-
}
|
|
835
|
-
// ── Persistent profile browser instances ─────────────────────────────────────
|
|
836
|
-
// Profile browsers are NOT shared — each profileDir gets its own instance.
|
|
837
|
-
// These are keyed by profile path and kept alive between fetches in the same process.
|
|
838
|
-
const profileBrowsers = new Map();
|
|
839
|
-
/**
|
|
840
|
-
* Get or create a browser instance with a persistent user data directory.
|
|
841
|
-
* Profile browsers bypass the shared browser pool so cookies/sessions survive
|
|
842
|
-
* between fetch calls.
|
|
2
|
+
* Core fetching — thin re-export layer for backward compatibility.
|
|
843
3
|
*
|
|
844
|
-
*
|
|
845
|
-
*
|
|
846
|
-
*
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
return existing;
|
|
854
|
-
}
|
|
855
|
-
catch { /* dead, recreate */ }
|
|
856
|
-
profileBrowsers.delete(profileDir);
|
|
857
|
-
}
|
|
858
|
-
const profileVp = getRandomViewport();
|
|
859
|
-
const launchOptions = {
|
|
860
|
-
headless: !headed,
|
|
861
|
-
args: [
|
|
862
|
-
...ANTI_DETECTION_ARGS,
|
|
863
|
-
`--window-size=${profileVp.width},${profileVp.height}`,
|
|
864
|
-
`--user-data-dir=${profileDir}`,
|
|
865
|
-
],
|
|
866
|
-
};
|
|
867
|
-
const launched = stealth
|
|
868
|
-
? await stealthChromium.launch(launchOptions)
|
|
869
|
-
: await chromium.launch(launchOptions);
|
|
870
|
-
if (!launched)
|
|
871
|
-
throw new Error('Failed to launch profile browser');
|
|
872
|
-
profileBrowsers.set(profileDir, launched);
|
|
873
|
-
return launched;
|
|
874
|
-
}
|
|
875
|
-
/**
|
|
876
|
-
* Fetch using headless Chromium via Playwright
|
|
877
|
-
* Slower but can handle JavaScript-heavy sites and bypass some bot detection
|
|
878
|
-
*/
|
|
879
|
-
export async function browserFetch(url, options = {}) {
|
|
880
|
-
// SECURITY: Validate URL to prevent SSRF
|
|
881
|
-
validateUrl(url);
|
|
882
|
-
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, } = options;
|
|
883
|
-
// Validate user agent if provided
|
|
884
|
-
// In stealth mode with no custom UA, always use a realistic Chrome UA
|
|
885
|
-
const validatedUserAgent = userAgent
|
|
886
|
-
? validateUserAgent(userAgent)
|
|
887
|
-
: (stealth ? getRealisticUserAgent() : getRandomUserAgent());
|
|
888
|
-
// Validate wait time
|
|
889
|
-
if (waitMs < 0 || waitMs > 60000) {
|
|
890
|
-
throw new WebPeelError('Wait time must be between 0 and 60000ms');
|
|
891
|
-
}
|
|
892
|
-
if (signal?.aborted) {
|
|
893
|
-
throw createAbortError();
|
|
894
|
-
}
|
|
895
|
-
// SECURITY: Validate custom headers if provided
|
|
896
|
-
if (headers) {
|
|
897
|
-
for (const [key, value] of Object.entries(headers)) {
|
|
898
|
-
// Block Host header override
|
|
899
|
-
if (key.toLowerCase() === 'host') {
|
|
900
|
-
throw new WebPeelError('Custom Host header is not allowed');
|
|
901
|
-
}
|
|
902
|
-
if (typeof value !== 'string' || value.length > 500) {
|
|
903
|
-
throw new WebPeelError('Invalid header value');
|
|
904
|
-
}
|
|
905
|
-
}
|
|
906
|
-
}
|
|
907
|
-
// SECURITY: Limit concurrent browser pages with timeout
|
|
908
|
-
const queueStartTime = Date.now();
|
|
909
|
-
const QUEUE_TIMEOUT_MS = 30000; // 30 second max wait
|
|
910
|
-
while (activePagesCount >= MAX_CONCURRENT_PAGES) {
|
|
911
|
-
if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
|
|
912
|
-
throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
|
|
913
|
-
}
|
|
914
|
-
await new Promise(resolve => setTimeout(resolve, 100));
|
|
915
|
-
}
|
|
916
|
-
activePagesCount++;
|
|
917
|
-
let page = null;
|
|
918
|
-
let usingPooledPage = false;
|
|
919
|
-
let abortHandler;
|
|
920
|
-
// Declared here (outside try) so the finally block can reference it
|
|
921
|
-
const usingProfileBrowser = !!profileDir;
|
|
922
|
-
// Owned context created when storageState injection is requested
|
|
923
|
-
let ownedContext;
|
|
924
|
-
try {
|
|
925
|
-
const browser = usingProfileBrowser
|
|
926
|
-
? await getProfileBrowser(profileDir, headed, stealth)
|
|
927
|
-
: stealth
|
|
928
|
-
? await getStealthBrowser()
|
|
929
|
-
: await getBrowser();
|
|
930
|
-
// Only use the shared page pool for non-stealth, non-profile, non-keepOpen, non-storageState, non-proxy fetches
|
|
931
|
-
const shouldUsePagePool = !stealth && !userAgent && !keepPageOpen && !usingProfileBrowser && !storageState && !proxy;
|
|
932
|
-
if (shouldUsePagePool) {
|
|
933
|
-
page = takePooledPage();
|
|
934
|
-
usingPooledPage = !!page;
|
|
935
|
-
if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
|
|
936
|
-
void ensurePagePool(browser).catch(() => { });
|
|
937
|
-
}
|
|
938
|
-
}
|
|
939
|
-
if (!page) {
|
|
940
|
-
const fetchVp = getRandomViewport();
|
|
941
|
-
const pageOptions = {
|
|
942
|
-
userAgent: validatedUserAgent,
|
|
943
|
-
// viewport: null lets the browser use its natural window size (set via --window-size),
|
|
944
|
-
// avoiding the telltale Playwright default of 1280×720.
|
|
945
|
-
viewport: null,
|
|
946
|
-
...(stealth
|
|
947
|
-
? {
|
|
948
|
-
locale: 'en-US',
|
|
949
|
-
timezoneId: 'America/New_York',
|
|
950
|
-
javaScriptEnabled: true,
|
|
951
|
-
}
|
|
952
|
-
: {}),
|
|
953
|
-
};
|
|
954
|
-
if (proxy) {
|
|
955
|
-
// Parse proxy URL to extract auth credentials for Playwright
|
|
956
|
-
let playwrightProxy;
|
|
957
|
-
try {
|
|
958
|
-
const proxyUrl = new URL(proxy);
|
|
959
|
-
playwrightProxy = {
|
|
960
|
-
server: `${proxyUrl.protocol}//${proxyUrl.host}`,
|
|
961
|
-
username: proxyUrl.username || undefined,
|
|
962
|
-
password: proxyUrl.password || undefined,
|
|
963
|
-
};
|
|
964
|
-
}
|
|
965
|
-
catch {
|
|
966
|
-
// Fallback: use proxy string as-is
|
|
967
|
-
playwrightProxy = { server: proxy };
|
|
968
|
-
}
|
|
969
|
-
// Create an isolated context with the proxy and optional storageState
|
|
970
|
-
ownedContext = await browser.newContext({
|
|
971
|
-
...pageOptions,
|
|
972
|
-
proxy: playwrightProxy,
|
|
973
|
-
viewport: { width: fetchVp.width, height: fetchVp.height },
|
|
974
|
-
...(storageState ? { storageState } : {}),
|
|
975
|
-
});
|
|
976
|
-
page = await ownedContext.newPage();
|
|
977
|
-
}
|
|
978
|
-
else if (storageState) {
|
|
979
|
-
// Create an isolated context with the injected storage state (cookies + localStorage)
|
|
980
|
-
ownedContext = await browser.newContext({
|
|
981
|
-
...pageOptions,
|
|
982
|
-
storageState,
|
|
983
|
-
viewport: { width: fetchVp.width, height: fetchVp.height },
|
|
984
|
-
});
|
|
985
|
-
page = await ownedContext.newPage();
|
|
986
|
-
}
|
|
987
|
-
else {
|
|
988
|
-
page = await browser.newPage(pageOptions);
|
|
989
|
-
}
|
|
990
|
-
await applyStealthScripts(page);
|
|
991
|
-
usingPooledPage = false;
|
|
992
|
-
}
|
|
993
|
-
else {
|
|
994
|
-
await page.setViewportSize({ width: 1280, height: 720 }).catch(() => { });
|
|
995
|
-
}
|
|
996
|
-
if (signal) {
|
|
997
|
-
abortHandler = () => {
|
|
998
|
-
if (page && !page.isClosed()) {
|
|
999
|
-
void page.close().catch(() => { });
|
|
1000
|
-
}
|
|
1001
|
-
};
|
|
1002
|
-
signal.addEventListener('abort', abortHandler, { once: true });
|
|
1003
|
-
}
|
|
1004
|
-
await page.unroute('**/*').catch(() => { });
|
|
1005
|
-
const mergedHeaders = { ...(headers || {}) };
|
|
1006
|
-
if (usingPooledPage) {
|
|
1007
|
-
mergedHeaders['User-Agent'] = validatedUserAgent;
|
|
1008
|
-
}
|
|
1009
|
-
if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
|
|
1010
|
-
await page.setExtraHTTPHeaders(mergedHeaders);
|
|
1011
|
-
}
|
|
1012
|
-
// Set cookies if provided
|
|
1013
|
-
if (cookies && cookies.length > 0) {
|
|
1014
|
-
const parsedCookies = cookies.map(cookie => {
|
|
1015
|
-
const [nameValue] = cookie.split(';').map(s => s.trim());
|
|
1016
|
-
const [name, value] = nameValue.split('=');
|
|
1017
|
-
if (!name || value === undefined) {
|
|
1018
|
-
throw new WebPeelError(`Invalid cookie format: ${cookie}`);
|
|
1019
|
-
}
|
|
1020
|
-
return {
|
|
1021
|
-
name: name.trim(),
|
|
1022
|
-
value: value.trim(),
|
|
1023
|
-
url,
|
|
1024
|
-
};
|
|
1025
|
-
});
|
|
1026
|
-
await page.context().addCookies(parsedCookies);
|
|
1027
|
-
}
|
|
1028
|
-
if (signal?.aborted) {
|
|
1029
|
-
throw createAbortError();
|
|
1030
|
-
}
|
|
1031
|
-
// Block images/fonts/etc for speed in non-stealth mode.
|
|
1032
|
-
// In stealth mode, blocking common resources can be a bot-detection signal.
|
|
1033
|
-
if (!screenshot && !stealth) {
|
|
1034
|
-
await page.route('**/*', (route) => {
|
|
1035
|
-
const resourceType = route.request().resourceType();
|
|
1036
|
-
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
|
1037
|
-
route.abort();
|
|
1038
|
-
}
|
|
1039
|
-
else {
|
|
1040
|
-
route.continue();
|
|
1041
|
-
}
|
|
1042
|
-
});
|
|
1043
|
-
}
|
|
1044
|
-
else {
|
|
1045
|
-
// For screenshots and stealth mode, allow all resources
|
|
1046
|
-
await page.route('**/*', (route) => route.continue());
|
|
1047
|
-
}
|
|
1048
|
-
// SECURITY: Wrap entire operation in timeout
|
|
1049
|
-
let screenshotBuffer;
|
|
1050
|
-
const throwIfAborted = () => {
|
|
1051
|
-
if (signal?.aborted) {
|
|
1052
|
-
throw createAbortError();
|
|
1053
|
-
}
|
|
1054
|
-
};
|
|
1055
|
-
const fetchPromise = (async () => {
|
|
1056
|
-
const response = await page.goto(url, {
|
|
1057
|
-
waitUntil: 'domcontentloaded',
|
|
1058
|
-
timeout: timeoutMs,
|
|
1059
|
-
});
|
|
1060
|
-
throwIfAborted();
|
|
1061
|
-
// Quick check: if body text is very thin, wait for JS to render more content.
|
|
1062
|
-
// Only adds latency when the page clearly hasn't loaded yet.
|
|
1063
|
-
// eslint-disable-next-line @typescript-eslint/no-implied-eval
|
|
1064
|
-
const bodyTextLength = await page.evaluate('document.body?.innerText?.trim().length || 0').catch(() => 0);
|
|
1065
|
-
if (bodyTextLength < 500) {
|
|
1066
|
-
await page.waitForLoadState('networkidle', { timeout: 1500 }).catch(() => { });
|
|
1067
|
-
throwIfAborted();
|
|
1068
|
-
}
|
|
1069
|
-
// DOM stability check: wait for SPA hydration to settle.
|
|
1070
|
-
// Polls innerText length every 500ms — if still growing, keep waiting (max 3s extra).
|
|
1071
|
-
{
|
|
1072
|
-
const stabilityStart = Date.now();
|
|
1073
|
-
const MAX_STABILITY_WAIT_MS = 3000;
|
|
1074
|
-
const POLL_INTERVAL_MS = 500;
|
|
1075
|
-
let prevLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
|
|
1076
|
-
let stableCount = 0;
|
|
1077
|
-
while (Date.now() - stabilityStart < MAX_STABILITY_WAIT_MS) {
|
|
1078
|
-
throwIfAborted();
|
|
1079
|
-
await page.waitForTimeout(POLL_INTERVAL_MS);
|
|
1080
|
-
const curLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
|
|
1081
|
-
if (curLength === prevLength) {
|
|
1082
|
-
stableCount++;
|
|
1083
|
-
if (stableCount >= 2)
|
|
1084
|
-
break; // stable for 2 consecutive checks (~1s)
|
|
1085
|
-
}
|
|
1086
|
-
else {
|
|
1087
|
-
stableCount = 0;
|
|
1088
|
-
}
|
|
1089
|
-
prevLength = curLength;
|
|
1090
|
-
}
|
|
1091
|
-
}
|
|
1092
|
-
const finalUrl = page.url();
|
|
1093
|
-
const contentType = response?.headers()?.['content-type'] || '';
|
|
1094
|
-
const contentTypeLower = contentType.toLowerCase();
|
|
1095
|
-
const urlLower = finalUrl.toLowerCase();
|
|
1096
|
-
const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
|
|
1097
|
-
const isDocx = contentTypeLower.includes('wordprocessingml.document') || urlLower.endsWith('.docx');
|
|
1098
|
-
const isBinaryDoc = !!response && (isPdf || isDocx);
|
|
1099
|
-
// Small randomized delay in stealth mode (simulate human behavior)
|
|
1100
|
-
// Keep it short — enough to look human, not enough to kill latency
|
|
1101
|
-
if (stealth) {
|
|
1102
|
-
const extraDelayMs = 200 + Math.floor(Math.random() * 601);
|
|
1103
|
-
await page.waitForTimeout(extraDelayMs);
|
|
1104
|
-
throwIfAborted();
|
|
1105
|
-
}
|
|
1106
|
-
// Wait for additional time if requested (for dynamic content / screenshots)
|
|
1107
|
-
if (waitMs > 0) {
|
|
1108
|
-
await page.waitForTimeout(waitMs);
|
|
1109
|
-
throwIfAborted();
|
|
1110
|
-
}
|
|
1111
|
-
// Execute page actions if provided
|
|
1112
|
-
if (actions && actions.length > 0) {
|
|
1113
|
-
const { executeActions } = await import('./actions.js');
|
|
1114
|
-
const actionScreenshot = await executeActions(page, actions);
|
|
1115
|
-
if (actionScreenshot) {
|
|
1116
|
-
screenshotBuffer = actionScreenshot;
|
|
1117
|
-
}
|
|
1118
|
-
throwIfAborted();
|
|
1119
|
-
}
|
|
1120
|
-
// If the navigation returned a binary document (PDF/DOCX), grab the raw body.
|
|
1121
|
-
if (isBinaryDoc) {
|
|
1122
|
-
const buffer = await response.body();
|
|
1123
|
-
throwIfAborted();
|
|
1124
|
-
// Capture screenshot if requested (and not already captured by actions)
|
|
1125
|
-
if (screenshot && !screenshotBuffer) {
|
|
1126
|
-
screenshotBuffer = await page.screenshot({
|
|
1127
|
-
fullPage: screenshotFullPage,
|
|
1128
|
-
type: 'png',
|
|
1129
|
-
});
|
|
1130
|
-
}
|
|
1131
|
-
return {
|
|
1132
|
-
html: '',
|
|
1133
|
-
finalUrl,
|
|
1134
|
-
buffer,
|
|
1135
|
-
contentType,
|
|
1136
|
-
statusCode: response.status(),
|
|
1137
|
-
};
|
|
1138
|
-
}
|
|
1139
|
-
const html = await page.content();
|
|
1140
|
-
throwIfAborted();
|
|
1141
|
-
return {
|
|
1142
|
-
html,
|
|
1143
|
-
finalUrl,
|
|
1144
|
-
contentType,
|
|
1145
|
-
statusCode: response?.status(),
|
|
1146
|
-
};
|
|
1147
|
-
})();
|
|
1148
|
-
let operationTimeout;
|
|
1149
|
-
const timeoutPromise = new Promise((_, reject) => {
|
|
1150
|
-
operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
|
|
1151
|
-
});
|
|
1152
|
-
const fetchData = await Promise.race([fetchPromise, timeoutPromise]);
|
|
1153
|
-
if (operationTimeout) {
|
|
1154
|
-
clearTimeout(operationTimeout);
|
|
1155
|
-
}
|
|
1156
|
-
const { html, finalUrl } = fetchData;
|
|
1157
|
-
const fetchBuffer = 'buffer' in fetchData ? fetchData.buffer : undefined;
|
|
1158
|
-
const fetchContentType = 'contentType' in fetchData ? fetchData.contentType : undefined;
|
|
1159
|
-
const fetchStatusCode = 'statusCode' in fetchData ? fetchData.statusCode : undefined;
|
|
1160
|
-
const isBinaryDoc = !!fetchBuffer;
|
|
1161
|
-
// SECURITY: Limit HTML size (skip for binary documents where html is empty)
|
|
1162
|
-
if (!isBinaryDoc) {
|
|
1163
|
-
if (html.length > 10 * 1024 * 1024) { // 10MB limit
|
|
1164
|
-
throw new WebPeelError('Response too large (max 10MB)');
|
|
1165
|
-
}
|
|
1166
|
-
if (!html || html.length < 100) {
|
|
1167
|
-
throw new BlockedError('Empty or suspiciously small response from browser.');
|
|
1168
|
-
}
|
|
1169
|
-
// Run challenge detection on browser-fetched HTML (covers both regular and stealth modes)
|
|
1170
|
-
// Note: skip empty-shell type — that's a rendering quality issue (SPA needs more JS time),
|
|
1171
|
-
// not a bot challenge. The caller's escalation logic handles empty-shell separately.
|
|
1172
|
-
const browserChallengeResult = detectChallenge(html, fetchStatusCode);
|
|
1173
|
-
if (browserChallengeResult.isChallenge && browserChallengeResult.type !== 'empty-shell') {
|
|
1174
|
-
throw new BlockedError(`Challenge page detected (${browserChallengeResult.type || 'unknown'}, confidence: ${browserChallengeResult.confidence.toFixed(2)}). ` +
|
|
1175
|
-
`Site requires human verification. Try a different approach or use a CAPTCHA solving service.`);
|
|
1176
|
-
}
|
|
1177
|
-
}
|
|
1178
|
-
// Capture screenshot if requested (and not already captured by actions or document handler)
|
|
1179
|
-
if (screenshot && !screenshotBuffer) {
|
|
1180
|
-
screenshotBuffer = await page.screenshot({
|
|
1181
|
-
fullPage: screenshotFullPage,
|
|
1182
|
-
type: 'png'
|
|
1183
|
-
});
|
|
1184
|
-
}
|
|
1185
|
-
// If keepPageOpen, return page/browser for caller to use (e.g., branding extraction)
|
|
1186
|
-
if (keepPageOpen && page) {
|
|
1187
|
-
return {
|
|
1188
|
-
html,
|
|
1189
|
-
buffer: fetchBuffer,
|
|
1190
|
-
url: finalUrl,
|
|
1191
|
-
statusCode: fetchStatusCode,
|
|
1192
|
-
contentType: fetchContentType,
|
|
1193
|
-
screenshot: screenshotBuffer,
|
|
1194
|
-
page,
|
|
1195
|
-
browser,
|
|
1196
|
-
};
|
|
1197
|
-
}
|
|
1198
|
-
return {
|
|
1199
|
-
html,
|
|
1200
|
-
buffer: fetchBuffer,
|
|
1201
|
-
url: finalUrl,
|
|
1202
|
-
statusCode: fetchStatusCode,
|
|
1203
|
-
contentType: fetchContentType,
|
|
1204
|
-
screenshot: screenshotBuffer,
|
|
1205
|
-
};
|
|
1206
|
-
}
|
|
1207
|
-
catch (error) {
|
|
1208
|
-
if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
|
|
1209
|
-
throw error;
|
|
1210
|
-
}
|
|
1211
|
-
if (error instanceof Error && error.name === 'AbortError') {
|
|
1212
|
-
throw error;
|
|
1213
|
-
}
|
|
1214
|
-
if (error instanceof Error && error.message.includes('Timeout')) {
|
|
1215
|
-
throw new TimeoutError(`Browser navigation timed out`);
|
|
1216
|
-
}
|
|
1217
|
-
throw new NetworkError(`Browser fetch failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
1218
|
-
}
|
|
1219
|
-
finally {
|
|
1220
|
-
if (signal && abortHandler) {
|
|
1221
|
-
signal.removeEventListener('abort', abortHandler);
|
|
1222
|
-
}
|
|
1223
|
-
// CRITICAL: Always release/close page and decrement counter (unless keepPageOpen and no error)
|
|
1224
|
-
if (page && !keepPageOpen) {
|
|
1225
|
-
if (usingPooledPage) {
|
|
1226
|
-
await recyclePooledPage(page);
|
|
1227
|
-
}
|
|
1228
|
-
else if (ownedContext) {
|
|
1229
|
-
// Close the owned context (also closes the page)
|
|
1230
|
-
await ownedContext.close().catch(() => { });
|
|
1231
|
-
}
|
|
1232
|
-
else if (!usingProfileBrowser) {
|
|
1233
|
-
// Profile browser pages are NOT closed — the profile browser stays alive
|
|
1234
|
-
// so that the next fetch in the same process reuses the session.
|
|
1235
|
-
await page.close().catch(() => { });
|
|
1236
|
-
}
|
|
1237
|
-
}
|
|
1238
|
-
activePagesCount--;
|
|
1239
|
-
}
|
|
1240
|
-
}
|
|
1241
|
-
/**
|
|
1242
|
-
* Retry a fetch operation with exponential backoff
|
|
1243
|
-
*/
|
|
1244
|
-
export async function browserScreenshot(url, options = {}) {
|
|
1245
|
-
// SECURITY: Validate URL to prevent SSRF
|
|
1246
|
-
validateUrl(url);
|
|
1247
|
-
const { fullPage = false, width, height, format = 'png', quality, waitMs = 0, timeoutMs = 30000, userAgent, headers, cookies, stealth = false, actions, } = options;
|
|
1248
|
-
const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
|
|
1249
|
-
// Basic validation
|
|
1250
|
-
if (waitMs < 0 || waitMs > 60000) {
|
|
1251
|
-
throw new WebPeelError('Wait time must be between 0 and 60000ms');
|
|
1252
|
-
}
|
|
1253
|
-
if (timeoutMs < 1000 || timeoutMs > 120000) {
|
|
1254
|
-
throw new WebPeelError('Timeout must be between 1000 and 120000ms');
|
|
1255
|
-
}
|
|
1256
|
-
if (width !== undefined && (!Number.isFinite(width) || width < 100 || width > 5000)) {
|
|
1257
|
-
throw new WebPeelError('Width must be between 100 and 5000');
|
|
1258
|
-
}
|
|
1259
|
-
if (height !== undefined && (!Number.isFinite(height) || height < 100 || height > 5000)) {
|
|
1260
|
-
throw new WebPeelError('Height must be between 100 and 5000');
|
|
1261
|
-
}
|
|
1262
|
-
if (format !== 'png' && format !== 'jpeg') {
|
|
1263
|
-
throw new WebPeelError('Format must be png or jpeg');
|
|
1264
|
-
}
|
|
1265
|
-
if (format === 'jpeg' && quality !== undefined) {
|
|
1266
|
-
if (!Number.isFinite(quality) || quality < 1 || quality > 100) {
|
|
1267
|
-
throw new WebPeelError('JPEG quality must be between 1 and 100');
|
|
1268
|
-
}
|
|
1269
|
-
}
|
|
1270
|
-
// SECURITY: Validate custom headers if provided
|
|
1271
|
-
if (headers) {
|
|
1272
|
-
for (const [key, value] of Object.entries(headers)) {
|
|
1273
|
-
if (key.toLowerCase() === 'host') {
|
|
1274
|
-
throw new WebPeelError('Custom Host header is not allowed');
|
|
1275
|
-
}
|
|
1276
|
-
if (typeof value !== 'string' || value.length > 500) {
|
|
1277
|
-
throw new WebPeelError('Invalid header value');
|
|
1278
|
-
}
|
|
1279
|
-
}
|
|
1280
|
-
}
|
|
1281
|
-
// SECURITY: Limit concurrent browser pages with timeout
|
|
1282
|
-
const queueStartTime = Date.now();
|
|
1283
|
-
const QUEUE_TIMEOUT_MS = 30000;
|
|
1284
|
-
while (activePagesCount >= MAX_CONCURRENT_PAGES) {
|
|
1285
|
-
if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
|
|
1286
|
-
throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
|
|
1287
|
-
}
|
|
1288
|
-
await new Promise(resolve => setTimeout(resolve, 100));
|
|
1289
|
-
}
|
|
1290
|
-
activePagesCount++;
|
|
1291
|
-
let page = null;
|
|
1292
|
-
let usingPooledPage = false;
|
|
1293
|
-
try {
|
|
1294
|
-
const browser = stealth ? await getStealthBrowser() : await getBrowser();
|
|
1295
|
-
const shouldUsePagePool = !stealth && !userAgent;
|
|
1296
|
-
if (shouldUsePagePool) {
|
|
1297
|
-
page = takePooledPage();
|
|
1298
|
-
usingPooledPage = !!page;
|
|
1299
|
-
if (usingPooledPage && pooledPages.size < PAGE_POOL_SIZE) {
|
|
1300
|
-
void ensurePagePool(browser).catch(() => { });
|
|
1301
|
-
}
|
|
1302
|
-
}
|
|
1303
|
-
if (!page) {
|
|
1304
|
-
page = await browser.newPage({
|
|
1305
|
-
userAgent: validatedUserAgent,
|
|
1306
|
-
viewport: width || height ? {
|
|
1307
|
-
width: width || 1280,
|
|
1308
|
-
height: height || 720,
|
|
1309
|
-
} : null, // Use browser window size when no explicit dimensions requested
|
|
1310
|
-
});
|
|
1311
|
-
await applyStealthScripts(page);
|
|
1312
|
-
usingPooledPage = false;
|
|
1313
|
-
}
|
|
1314
|
-
else {
|
|
1315
|
-
await page.setViewportSize({
|
|
1316
|
-
width: width || 1280,
|
|
1317
|
-
height: height || 720,
|
|
1318
|
-
}).catch(() => { });
|
|
1319
|
-
}
|
|
1320
|
-
await page.unroute('**/*').catch(() => { });
|
|
1321
|
-
const mergedHeaders = { ...(headers || {}) };
|
|
1322
|
-
if (usingPooledPage) {
|
|
1323
|
-
mergedHeaders['User-Agent'] = validatedUserAgent;
|
|
1324
|
-
}
|
|
1325
|
-
if (usingPooledPage || Object.keys(mergedHeaders).length > 0) {
|
|
1326
|
-
await page.setExtraHTTPHeaders(mergedHeaders);
|
|
1327
|
-
}
|
|
1328
|
-
if (cookies && cookies.length > 0) {
|
|
1329
|
-
const parsedCookies = cookies.map(cookie => {
|
|
1330
|
-
const [nameValue] = cookie.split(';').map(s => s.trim());
|
|
1331
|
-
const [name, value] = nameValue.split('=');
|
|
1332
|
-
if (!name || value === undefined) {
|
|
1333
|
-
throw new WebPeelError(`Invalid cookie format: ${cookie}`);
|
|
1334
|
-
}
|
|
1335
|
-
return {
|
|
1336
|
-
name: name.trim(),
|
|
1337
|
-
value: value.trim(),
|
|
1338
|
-
url,
|
|
1339
|
-
};
|
|
1340
|
-
});
|
|
1341
|
-
await page.context().addCookies(parsedCookies);
|
|
1342
|
-
}
|
|
1343
|
-
// For screenshots, allow all resources
|
|
1344
|
-
await page.route('**/*', (route) => route.continue());
|
|
1345
|
-
let screenshotBuffer;
|
|
1346
|
-
const doWork = (async () => {
|
|
1347
|
-
await page.goto(url, {
|
|
1348
|
-
waitUntil: 'domcontentloaded',
|
|
1349
|
-
timeout: timeoutMs,
|
|
1350
|
-
});
|
|
1351
|
-
if (waitMs > 0) {
|
|
1352
|
-
await page.waitForTimeout(waitMs);
|
|
1353
|
-
}
|
|
1354
|
-
if (actions && actions.length > 0) {
|
|
1355
|
-
const { executeActions } = await import('./actions.js');
|
|
1356
|
-
const actionScreenshot = await executeActions(page, actions, {
|
|
1357
|
-
fullPage,
|
|
1358
|
-
type: format,
|
|
1359
|
-
quality,
|
|
1360
|
-
});
|
|
1361
|
-
if (actionScreenshot) {
|
|
1362
|
-
screenshotBuffer = actionScreenshot;
|
|
1363
|
-
}
|
|
1364
|
-
}
|
|
1365
|
-
const finalUrl = page.url();
|
|
1366
|
-
// Capture screenshot if not captured via actions
|
|
1367
|
-
if (!screenshotBuffer) {
|
|
1368
|
-
screenshotBuffer = await page.screenshot({
|
|
1369
|
-
fullPage,
|
|
1370
|
-
type: format,
|
|
1371
|
-
...(format === 'jpeg' && typeof quality === 'number' ? { quality } : {}),
|
|
1372
|
-
});
|
|
1373
|
-
}
|
|
1374
|
-
return { finalUrl, screenshotBuffer: screenshotBuffer };
|
|
1375
|
-
})();
|
|
1376
|
-
let operationTimeout;
|
|
1377
|
-
const timeoutPromise = new Promise((_, reject) => {
|
|
1378
|
-
operationTimeout = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
|
|
1379
|
-
});
|
|
1380
|
-
const { finalUrl, screenshotBuffer: buf } = await Promise.race([doWork, timeoutPromise]);
|
|
1381
|
-
if (operationTimeout) {
|
|
1382
|
-
clearTimeout(operationTimeout);
|
|
1383
|
-
}
|
|
1384
|
-
return { buffer: buf, finalUrl };
|
|
1385
|
-
}
|
|
1386
|
-
catch (error) {
|
|
1387
|
-
if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
|
|
1388
|
-
throw error;
|
|
1389
|
-
}
|
|
1390
|
-
if (error instanceof Error && error.message.includes('Timeout')) {
|
|
1391
|
-
throw new TimeoutError('Browser screenshot timed out');
|
|
1392
|
-
}
|
|
1393
|
-
throw new NetworkError(`Browser screenshot failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
1394
|
-
}
|
|
1395
|
-
finally {
|
|
1396
|
-
if (page) {
|
|
1397
|
-
if (usingPooledPage) {
|
|
1398
|
-
await recyclePooledPage(page);
|
|
1399
|
-
}
|
|
1400
|
-
else {
|
|
1401
|
-
await page.close().catch(() => { });
|
|
1402
|
-
}
|
|
1403
|
-
}
|
|
1404
|
-
activePagesCount--;
|
|
1405
|
-
}
|
|
1406
|
-
}
|
|
1407
|
-
/**
 * Invoke `fn` until it succeeds, backing off exponentially between failures.
 *
 * BlockedError and TimeoutError are rethrown immediately without retrying.
 * Any other failure is remembered and, once the attempts are exhausted,
 * rethrown (non-Error throwables are replaced by a generic Error).
 *
 * @param fn - Async operation to run.
 * @param maxAttempts - Maximum number of invocations (default: 3).
 * @param baseDelayMs - Delay before the second attempt; doubles after each further failure (default: 1000).
 * @returns Whatever `fn` resolves to on its first successful invocation.
 */
export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    let failure = null;
    for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
        try {
            return await fn();
        }
        catch (error) {
            // Blocked responses and timeouts will not improve on retry.
            const isFatal = error instanceof BlockedError || error instanceof TimeoutError;
            if (isFatal) {
                throw error;
            }
            failure = error instanceof Error ? error : new Error('Unknown error');
            // No point sleeping after the final attempt.
            const hasAttemptsLeft = attempt < maxAttempts;
            if (hasAttemptsLeft) {
                await pause(baseDelayMs * 2 ** (attempt - 1));
            }
        }
    }
    throw failure || new NetworkError('Retry failed');
}
|
|
1427
|
-
/**
 * Repeatedly jump to the bottom of the page, letting network activity calm
 * down after every jump, so that lazily loaded content (infinite scroll,
 * deferred images, ...) gets a chance to appear.
 *
 * @param page - Playwright Page instance.
 * @param times - How many scroll-then-wait rounds to perform (default: 3).
 * @returns The page HTML as reported by `page.content()` after the last round.
 */
export async function scrollAndWait(page, times = 3) {
    // Prefer the 'networkidle' load state, capped at 2 s; if that wait fails
    // (e.g. the network never goes idle), use a fixed 1 s pause instead.
    const waitForQuietNetwork = async () => {
        try {
            await page.waitForLoadState('networkidle', { timeout: 2000 });
        }
        catch {
            await page.waitForTimeout(1000);
        }
    };
    for (let round = 0; round < times; round += 1) {
        // eslint-disable-next-line @typescript-eslint/no-implied-eval
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        await waitForQuietNetwork();
    }
    return page.content();
}
|
|
1451
|
-
/**
 * Clean up browser resources (shared pool, stealth browser, and all profile browsers).
 *
 * Every teardown step is attempted even when an earlier one fails: pooled
 * pages are closed, the shared and stealth browsers are closed and their
 * module-level references reset to null, all profile browsers are closed,
 * and finally closePool() is invoked.
 *
 * Errors from closing pooled pages, profile browsers and closePool() are
 * ignored. An error from closing the shared or stealth browser is rethrown
 * once all other steps have run, so callers still observe the failure.
 *
 * @throws The first error raised while closing the shared or stealth browser.
 */
export async function cleanup() {
    // Snapshot and reset the pool bookkeeping first so nothing hands out a
    // page that is about to be closed.
    const pagesToClose = Array.from(pooledPages);
    pooledPages.clear();
    idlePagePool.length = 0;
    pagePoolFillPromise = null;
    await Promise.all(pagesToClose.map((page) => page.close().catch(() => { })));
    // Collect shared-browser close failures instead of aborting, otherwise one
    // failed close would leave the remaining browsers running and the stale
    // reference in place.
    const closeErrors = [];
    if (sharedBrowser) {
        try {
            await sharedBrowser.close();
        }
        catch (error) {
            closeErrors.push(error);
        }
        finally {
            sharedBrowser = null;
        }
    }
    if (sharedStealthBrowser) {
        try {
            await sharedStealthBrowser.close();
        }
        catch (error) {
            closeErrors.push(error);
        }
        finally {
            sharedStealthBrowser = null;
        }
    }
    // Close all persistent profile browsers
    const profileBrowserList = Array.from(profileBrowsers.values());
    profileBrowsers.clear();
    await Promise.all(profileBrowserList.map(b => b.close().catch(() => { })));
    await closePool().catch(() => { });
    // Surface the first shared-browser failure now that everything else is torn down.
    if (closeErrors.length > 0) {
        throw closeErrors[0];
    }
}
|
|
1474
|
-
/**
 * Shut down the persistent profile browser registered under `profileDir`
 * (e.g. once a session is finished) and drop it from the registry.
 *
 * Does nothing when no browser is registered for that directory, and ignores
 * any error raised while closing, so it is safe to call repeatedly.
 *
 * @param profileDir Profile directory path under which the browser is
 *                   registered in `profileBrowsers`.
 */
export async function closeProfileBrowser(profileDir) {
    const openBrowser = profileBrowsers.get(profileDir);
    if (!openBrowser) {
        return;
    }
    // Unregister before closing so the entry is gone even if close() fails.
    profileBrowsers.delete(profileDir);
    await openBrowser.close().catch(() => { });
}
|
|
4
|
+
* The implementation has been split into focused modules:
|
|
5
|
+
* - http-fetch.ts — Pure HTTP fetching (simpleFetch, SSRF validation, HTTP pool)
|
|
6
|
+
* - browser-pool.ts — Browser lifecycle & page pool (getBrowser, cleanup, warmup)
|
|
7
|
+
* - browser-fetch.ts — Browser-based fetching (browserFetch, browserScreenshot)
|
|
8
|
+
*/
|
|
9
|
+
// Re-export everything for backward compatibility
|
|
10
|
+
export { simpleFetch } from './http-fetch.js';
|
|
11
|
+
export { cleanup, warmup, closePool, closeProfileBrowser, playwrightLoaded } from './browser-pool.js';
|
|
12
|
+
export { browserFetch, browserScreenshot, retryFetch, scrollAndWait } from './browser-fetch.js';
|
|
1487
13
|
//# sourceMappingURL=fetcher.js.map
|