@fanboynz/network-scanner 2.0.64 → 2.0.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/nettools.js CHANGED
@@ -4,11 +4,9 @@
4
4
  */
5
5
 
6
6
  const { exec, execSync } = require('child_process');
7
- const util = require('util');
8
7
  const fs = require('fs');
9
8
  const path = require('path');
10
9
  const { formatLogMessage, messageColors } = require('./colorize');
11
- const execPromise = util.promisify(exec);
12
10
  const ANSI_REGEX = /\x1b\[[0-9;]*m/g;
13
11
 
14
12
  // Cycling index for whois server rotation
@@ -80,7 +78,11 @@ function saveDiskCache(filePath, cache, ttl, maxSize) {
80
78
  }
81
79
  }
82
80
 
83
- // If over max, keep only the newest entries
81
+ // If over max, keep only the newest entries. Drop the pretty-print —
82
+ // saveDiskCache runs on the synchronous 'exit' handler when --dns-cache
83
+ // is set, so any work here directly delays scan exit. Compact JSON is
84
+ // several times faster on multi-megabyte caches and the file is not
85
+ // intended for human reading.
84
86
  if (count > maxSize) {
85
87
  const sorted = Object.entries(entries)
86
88
  .sort((a, b) => b[1].timestamp - a[1].timestamp)
@@ -89,9 +91,9 @@ function saveDiskCache(filePath, cache, ttl, maxSize) {
89
91
  for (const [key, entry] of sorted) {
90
92
  trimmed[key] = entry;
91
93
  }
92
- fs.writeFileSync(filePath, JSON.stringify(trimmed, null, 2));
94
+ fs.writeFileSync(filePath, JSON.stringify(trimmed));
93
95
  } else {
94
- fs.writeFileSync(filePath, JSON.stringify(entries, null, 2));
96
+ fs.writeFileSync(filePath, JSON.stringify(entries));
95
97
  }
96
98
  } catch {
97
99
  // Disk write failed — non-fatal, in-memory cache still works
@@ -125,14 +127,18 @@ function enableDiskCache() {
125
127
  loadDiskCache(DIG_CACHE_FILE, globalDigResultCache, GLOBAL_DIG_CACHE_TTL, GLOBAL_DIG_CACHE_MAX);
126
128
  loadDiskCache(WHOIS_CACHE_FILE, globalWhoisResultCache, GLOBAL_WHOIS_CACHE_TTL, GLOBAL_WHOIS_CACHE_MAX);
127
129
 
128
- // Save caches to disk once on process exit instead of per-lookup
130
+ // Save caches to disk once on process exit instead of per-lookup. The
131
+ // 'exit' handler fires synchronously on normal completion, process.exit(),
132
+ // and uncaught exceptions, but NOT when a signal's default action kills the
133
+ // process. We deliberately do NOT install SIGINT/SIGTERM handlers here —
134
+ // nwss.js installs its own async ones that perform browser/VPN cleanup
135
+ // (flushing on a signal relies on those ending in process.exit()), and a
136
+ // sync handler here would call process.exit(0) first and skip that cleanup.
129
137
  const flushCaches = () => {
130
138
  saveDiskCache(DIG_CACHE_FILE, globalDigResultCache, GLOBAL_DIG_CACHE_TTL, GLOBAL_DIG_CACHE_MAX);
131
139
  saveDiskCache(WHOIS_CACHE_FILE, globalWhoisResultCache, GLOBAL_WHOIS_CACHE_TTL, GLOBAL_WHOIS_CACHE_MAX);
132
140
  };
133
141
  process.on('exit', flushCaches);
134
- process.on('SIGINT', () => { flushCaches(); process.exit(0); });
135
- process.on('SIGTERM', () => { flushCaches(); process.exit(0); });
136
142
  }
137
143
 
138
144
  /**
@@ -217,14 +223,18 @@ function execWithTimeout(command, timeout = 10000) {
217
223
  // Set up timeout
218
224
  const timer = setTimeout(() => {
219
225
  child.kill('SIGTERM');
220
-
221
- // Force kill after 2 seconds if SIGTERM doesn't work
222
- setTimeout(() => {
226
+
227
+ // Force kill after 2 seconds if SIGTERM doesn't work. unref() so this
228
+ // tail timer doesn't keep the event loop alive past scan completion —
229
+ // a dig that times out near the end of a scan would otherwise delay
230
+ // exit by ~2 seconds.
231
+ const killTimer = setTimeout(() => {
223
232
  if (!child.killed) {
224
233
  child.kill('SIGKILL');
225
234
  }
226
235
  }, 2000);
227
-
236
+ killTimer.unref();
237
+
228
238
  reject(new Error(`Command timeout after ${timeout}ms: ${command}`));
229
239
  }, timeout);
230
240
 
@@ -925,6 +935,31 @@ function createNetToolsHandler(config) {
925
935
  const hasWhoisOr = whoisOrTerms && Array.isArray(whoisOrTerms) && whoisOrTerms.length > 0;
926
936
  const hasDig = digTerms && Array.isArray(digTerms) && digTerms.length > 0;
927
937
  const hasDigOr = digOrTerms && Array.isArray(digOrTerms) && digOrTerms.length > 0;
938
+
939
+ // Pre-lowercase search terms once per handler so the per-domain check loop
940
+ // doesn't re-lowercase the same constants for every output it scans.
941
+ const whoisTermsLower = hasWhois ? whoisTerms.map(t => t.toLowerCase()) : null;
942
+ const whoisOrTermsLower = hasWhoisOr ? whoisOrTerms.map(t => t.toLowerCase()) : null;
943
+ const digTermsLower = hasDig ? digTerms.map(t => t.toLowerCase()) : null;
944
+ const digOrTermsLower = hasDigOr ? digOrTerms.map(t => t.toLowerCase()) : null;
945
+
946
+ // Hoisted out of handleNetToolsCheck so the closure is constructed once per
947
+ // handler rather than once per invocation. References forceDebug, debugLogFile,
948
+ // and fs from the destructured config above.
949
+ function logToConsoleAndFile(message) {
950
+ if (forceDebug) {
951
+ console.log(formatLogMessage('debug', message));
952
+ }
953
+ if (debugLogFile && fs) {
954
+ try {
955
+ const timestamp = new Date().toISOString();
956
+ const cleanMessage = stripAnsiColors(message);
957
+ fs.appendFileSync(debugLogFile, `${timestamp} [debug nettools] ${cleanMessage}\n`);
958
+ } catch (_) {
959
+ // Silently fail file logging to avoid disrupting whois operations
960
+ }
961
+ }
962
+ }
928
963
 
929
964
  // Create config-aware cache keys for deduplication
930
965
  // Whois: Only include search terms + server (domain registry data is consistent across subdomains)
@@ -948,10 +983,7 @@ function createNetToolsHandler(config) {
948
983
  // DNS results are the same regardless of search terms
949
984
 
950
985
  return async function handleNetToolsCheck(domain, fullSubdomain) {
951
- // Use fullSubdomain parameter instead of originalDomain to maintain consistency
952
- // with the domain cache fix approach
953
986
  const originalDomain = fullSubdomain;
954
- // Helper function to log to BOTH console and debug file
955
987
 
956
988
  // Check if domain was already detected (skip expensive operations)
957
989
  if (typeof isDomainAlreadyDetected === 'function' && isDomainAlreadyDetected(fullSubdomain)) {
@@ -960,36 +992,7 @@ function createNetToolsHandler(config) {
960
992
  }
961
993
  return;
962
994
  }
963
-
964
- // NOTE: The logToConsoleAndFile function needs to be declared INSIDE this function
965
- // so it has access to the closure variables (forceDebug, debugLogFile, fs) from the
966
- // createNetToolsHandler config. This function was being called but not declared
967
- // within the scope where whoisLookup and whoisLookupWithRetry try to use it.
968
- // This is why we were getting "logToConsoleAndFile is not defined" errors.
969
995
 
970
- // Move the logToConsoleAndFile function declaration from later in the file to here:
971
- function logToConsoleAndFile(message) {
972
- // Note: This function needs access to forceDebug, debugLogFile, and fs from the parent scope
973
- // These are passed in via the config object to createNetToolsHandler
974
- // forceDebug, debugLogFile, and fs are available in this closure
975
-
976
- // Always log to console when in debug mode
977
- if (forceDebug) {
978
- console.log(formatLogMessage('debug', message));
979
- }
980
-
981
- // Also log to file if debug file logging is enabled
982
- if (debugLogFile && fs) {
983
- try {
984
- const timestamp = new Date().toISOString();
985
- const cleanMessage = stripAnsiColors(message);
986
- fs.appendFileSync(debugLogFile, `${timestamp} [debug nettools] ${cleanMessage}\n`);
987
- } catch (logErr) {
988
- // Silently fail file logging to avoid disrupting whois operations
989
- }
990
- }
991
- }
992
-
993
996
  // Determine which domain will be used for dig lookup
994
997
  const digDomain = digSubdomain && originalDomain ? originalDomain : domain;
995
998
 
@@ -1152,8 +1155,13 @@ function createNetToolsHandler(config) {
1152
1155
  try {
1153
1156
  const lookupPromise = whoisLookupWithRetry(whoisRootDomain, 8000, whoisServer, forceDebug, retryOptions, whoisDelay, logToConsoleAndFile);
1154
1157
  pendingWhoisLookups.set(whoisCacheKey, lookupPromise);
1155
- whoisResult = await lookupPromise;
1156
- pendingWhoisLookups.delete(whoisCacheKey);
1158
+ // try/finally so a rejected lookup still clears the pending
1159
+ // entry — see matching comment on pendingDigLookups below.
1160
+ try {
1161
+ whoisResult = await lookupPromise;
1162
+ } finally {
1163
+ pendingWhoisLookups.delete(whoisCacheKey);
1164
+ }
1157
1165
 
1158
1166
  // Cache successful results (and certain types of failures)
1159
1167
  if (whoisResult.success ||
@@ -1196,11 +1204,18 @@ function createNetToolsHandler(config) {
1196
1204
 
1197
1205
  // Process whois result (whether from cache or fresh lookup)
1198
1206
  if (whoisResult) {
1199
-
1207
+
1200
1208
  if (whoisResult.success) {
1209
+ // Lowercase the output ONCE — checkWhoisTerms / checkWhoisTermsOr
1210
+ // each call .toLowerCase() on their input independently, which
1211
+ // re-allocates a multi-KB lowercased string per call. Pre-lowering
1212
+ // here lets the AND check, OR check, and matched-term find share
1213
+ // a single allocation.
1214
+ const whoisOutputLower = whoisResult.output.toLowerCase();
1215
+
1201
1216
  // Check AND terms if configured
1202
1217
  if (hasWhois) {
1203
- whoisMatched = checkWhoisTerms(whoisResult.output, whoisTerms);
1218
+ whoisMatched = whoisTermsLower.every(t => whoisOutputLower.includes(t));
1204
1219
  if (whoisMatched && dryRunCallback) {
1205
1220
  dryRunCallback(domain, 'whois', 'AND logic', whoisTerms.join(', '), 'All terms found in whois data', {
1206
1221
  server: whoisResult.whoisServer || 'default',
@@ -1214,12 +1229,13 @@ function createNetToolsHandler(config) {
1214
1229
  }
1215
1230
 
1216
1231
  }
1217
-
1232
+
1218
1233
  // Check OR terms if configured
1219
1234
  if (hasWhoisOr) {
1220
- whoisOrMatched = checkWhoisTermsOr(whoisResult.output, whoisOrTerms);
1235
+ whoisOrMatched = whoisOrTermsLower.some(t => whoisOutputLower.includes(t));
1221
1236
  if (whoisOrMatched && dryRunCallback) {
1222
- const matchedTerm = whoisOrTerms.find(term => whoisResult.output.toLowerCase().includes(term.toLowerCase()));
1237
+ const matchedIdx = whoisOrTermsLower.findIndex(t => whoisOutputLower.includes(t));
1238
+ const matchedTerm = whoisOrTerms[matchedIdx];
1223
1239
  dryRunCallback(domain, 'whois', 'OR logic', matchedTerm, 'Term found in whois data', {
1224
1240
  server: whoisResult.whoisServer || 'default',
1225
1241
  duration: whoisResult.duration,
@@ -1371,8 +1387,15 @@ function createNetToolsHandler(config) {
1371
1387
  } else {
1372
1388
  const lookupPromise = digLookup(digDomain, digRecordType, 5000);
1373
1389
  pendingDigLookups.set(digCacheKey, lookupPromise);
1374
- digResult = await lookupPromise;
1375
- pendingDigLookups.delete(digCacheKey);
1390
+ // try/finally so a rejected lookup still clears the pending
1391
+ // entry — otherwise the Map would retain a rejected-Promise
1392
+ // entry forever and any subsequent caller for the same key
1393
+ // would await that rejection.
1394
+ try {
1395
+ digResult = await lookupPromise;
1396
+ } finally {
1397
+ pendingDigLookups.delete(digCacheKey);
1398
+ }
1376
1399
 
1377
1400
  // Cache the result for future use
1378
1401
  globalDigResultCache.set(digCacheKey, {
@@ -1389,9 +1412,13 @@ function createNetToolsHandler(config) {
1389
1412
  }
1390
1413
 
1391
1414
  if (digResult.success) {
1415
+ // Lowercase the output ONCE — see matching comment in the whois
1416
+ // branch above for rationale.
1417
+ const digOutputLower = digResult.output.toLowerCase();
1418
+
1392
1419
  // Check AND terms if configured
1393
1420
  if (hasDig) {
1394
- digMatched = checkDigTerms(digResult.output, digTerms);
1421
+ digMatched = digTermsLower.every(t => digOutputLower.includes(t));
1395
1422
  if (digMatched && dryRunCallback) {
1396
1423
  dryRunCallback(domain, 'dig', 'AND logic', digTerms.join(', '), `All terms found in ${digRecordType} records`, {
1397
1424
  queriedDomain: digDomain,
@@ -1400,12 +1427,13 @@ function createNetToolsHandler(config) {
1400
1427
  });
1401
1428
  }
1402
1429
  }
1403
-
1430
+
1404
1431
  // Check OR terms if configured
1405
1432
  if (hasDigOr) {
1406
- digOrMatched = checkDigTermsOr(digResult.output, digOrTerms);
1433
+ digOrMatched = digOrTermsLower.some(t => digOutputLower.includes(t));
1407
1434
  if (digOrMatched && dryRunCallback) {
1408
- const matchedTerm = digOrTerms.find(term => digResult.output.toLowerCase().includes(term.toLowerCase()));
1435
+ const matchedIdx = digOrTermsLower.findIndex(t => digOutputLower.includes(t));
1436
+ const matchedTerm = digOrTerms[matchedIdx];
1409
1437
  dryRunCallback(domain, 'dig', 'OR logic', matchedTerm, `Term found in ${digRecordType} records`, {
1410
1438
  queriedDomain: digDomain,
1411
1439
  recordType: digRecordType,
package/lib/proxy.js CHANGED
@@ -18,6 +18,15 @@
18
18
  *
19
19
  * SOCKS5 with auth:
20
20
  * "proxy": "socks5://user:pass@127.0.0.1:1080"
21
+ * Chromium itself cannot authenticate SOCKS5 (crbug.com/256785), so
22
+ * this module auto-starts an in-process no-auth SOCKS5 relay
23
+ * (lib/socks-relay.js) that does the upstream RFC 1929 auth. Chromium
24
+ * connects to the local relay (no auth — which it CAN do) and the
25
+ * relay tunnels to the authenticated upstream. Transparent: keep the
26
+ * socks5://user:pass@host form in config. Requires prepareSocksRelays()
27
+ * to be awaited once before the scan loop (nwss.js does this).
28
+ * NOTE: socks4 with auth is still unsupported (userId-only,
29
+ * near-extinct) — use socks5 or an authenticated HTTP proxy.
21
30
  *
22
31
  * HTTP proxy (corporate):
23
32
  * "proxy": "http://proxy.corp.com:3128"
@@ -56,8 +65,9 @@
56
65
  */
57
66
 
58
67
  const { formatLogMessage } = require('./colorize');
68
+ const { ensureRelay, getRelayPort } = require('./socks-relay');
59
69
 
60
- const PROXY_MODULE_VERSION = '1.1.0';
70
+ const PROXY_MODULE_VERSION = '1.2.0';
61
71
  const SUPPORTED_PROTOCOLS = ['socks5', 'socks4', 'http', 'https'];
62
72
 
63
73
  const DEFAULT_PORTS = {
@@ -105,8 +115,12 @@ function parseProxyUrl(proxyUrl) {
105
115
  if (!host) return null;
106
116
 
107
117
  const port = parseInt(url.port, 10) || DEFAULT_PORTS[protocol] || 1080;
108
- const username = url.username ? decodeURIComponent(url.username) : null;
109
- const password = url.password ? decodeURIComponent(url.password) : null;
118
+ // decodeURIComponent throws URIError on a literal '%' that isn't a valid
119
+ // escape (e.g. a password containing '%'). Fall back to the raw value so
120
+ // an otherwise-valid proxy isn't rejected as "Invalid proxy URL".
121
+ const safeDecode = (v) => { try { return decodeURIComponent(v); } catch (_) { return v; } };
122
+ const username = url.username ? safeDecode(url.username) : null;
123
+ const password = url.password ? safeDecode(url.password) : null;
110
124
 
111
125
  return { protocol, host, port, username, password };
112
126
  } catch (_) {
@@ -124,6 +138,41 @@ function needsProxy(siteConfig) {
124
138
  return !!getConfiguredProxy(siteConfig);
125
139
  }
126
140
 
141
+ /**
142
+ * Pre-start local no-auth SOCKS5 relays for every distinct authenticated
143
+ * SOCKS5 upstream across the given site configs. Must be awaited ONCE
144
+ * before the scan loop — getProxyArgs() then does a pure sync lookup of
145
+ * the relay port, so the fragile per-batch browser-launch path stays
146
+ * synchronous.
147
+ *
148
+ * @param {object[]} siteConfigs
149
+ * @param {boolean} forceDebug
150
+ * @returns {Promise<number>} count of relays started
151
+ */
152
+ async function prepareSocksRelays(siteConfigs, forceDebug = false) {
153
+ let started = 0;
154
+ const seen = new Set();
155
+ for (const cfg of (siteConfigs || [])) {
156
+ const url = getConfiguredProxy(cfg);
157
+ if (!url) continue;
158
+ const parsed = parseProxyUrl(url);
159
+ // Only socks5 with credentials needs a relay. socks4-auth stays
160
+ // unsupported (near-extinct, userId-only); http/https auth works
161
+ // natively via page.authenticate().
162
+ if (!parsed || parsed.protocol !== 'socks5' || !parsed.username) continue;
163
+ const key = `${parsed.host}:${parsed.port}:${parsed.username}`;
164
+ if (seen.has(key)) continue;
165
+ seen.add(key);
166
+ try {
167
+ await ensureRelay(parsed, forceDebug);
168
+ started++;
169
+ } catch (e) {
170
+ console.warn(formatLogMessage('proxy', `Failed to start SOCKS5 auth relay for ${parsed.host}:${parsed.port}: ${e.message}`));
171
+ }
172
+ }
173
+ return started;
174
+ }
175
+
127
176
  /**
128
177
  * Returns Chromium launch arguments for the configured proxy.
129
178
  *
@@ -141,15 +190,45 @@ function getProxyArgs(siteConfig, forceDebug = false) {
141
190
  return [];
142
191
  }
143
192
 
193
+ // Authenticated SOCKS5: Chromium can't auth SOCKS, so point it at the
194
+ // local no-auth relay (started upfront by prepareSocksRelays) which does
195
+ // the upstream auth. Credentials never reach Chromium. The relay speaks
196
+ // SOCKS5 and forwards domain addresses, so the remote-DNS rule below
197
+ // still applies correctly to the localhost hop.
198
+ let effectiveHost = parsed.host;
199
+ let effectivePort = parsed.port;
200
+ let effectiveProto = parsed.protocol;
201
+ if (parsed.protocol === 'socks5' && parsed.username) {
202
+ const relayPort = getRelayPort(parsed);
203
+ if (relayPort) {
204
+ effectiveHost = '127.0.0.1';
205
+ effectivePort = relayPort;
206
+ const debug = forceDebug || siteConfig.proxy_debug || siteConfig.socks5_debug;
207
+ if (debug) {
208
+ console.log(formatLogMessage('proxy', `SOCKS5 auth via local relay 127.0.0.1:${relayPort} -> ${parsed.host}:${parsed.port}`));
209
+ }
210
+ } else {
211
+ // prepareSocksRelays should have started this; defensive only.
212
+ console.warn(formatLogMessage('proxy', `No SOCKS5 auth relay for ${parsed.host}:${parsed.port} — call prepareSocksRelays() before the scan. Connection will fail (Chromium can't auth SOCKS).`));
213
+ }
214
+ }
215
+
144
216
  const args = [
145
- `--proxy-server=${parsed.protocol}://${parsed.host}:${parsed.port}`
217
+ `--proxy-server=${effectiveProto}://${effectiveHost}:${effectivePort}`
146
218
  ];
147
219
 
148
- // Remote DNS: resolve hostnames through the proxy (prevents DNS leaks)
149
- // Only meaningful for SOCKS proxies; HTTP proxies resolve remotely by default
220
+ // Remote DNS: force proxy-side hostname resolution (prevents DNS leaks).
221
+ // SOCKS5 only — it can carry a hostname to the proxy for remote
222
+ // resolution. SOCKS4 cannot (the protocol only accepts an IPv4 address;
223
+ // resolution must happen client-side), so applying MAP * ~NOTFOUND there
224
+ // makes Chromium's local resolver fail with nothing able to resolve the
225
+ // hostname — every request breaks. HTTP/HTTPS proxies resolve remotely
226
+ // by default and need no rule.
150
227
  const remoteDns = siteConfig.proxy_remote_dns ?? siteConfig.socks5_remote_dns;
151
- if ((parsed.protocol === 'socks5' || parsed.protocol === 'socks4') && remoteDns !== false) {
228
+ if (parsed.protocol === 'socks5' && remoteDns !== false) {
152
229
  args.push('--host-resolver-rules=MAP * ~NOTFOUND , EXCLUDE 127.0.0.1');
230
+ } else if (parsed.protocol === 'socks4' && remoteDns === true) {
231
+ console.warn(formatLogMessage('proxy', `proxy_remote_dns ignored: SOCKS4 cannot do proxy-side DNS resolution (use SOCKS5)`));
153
232
  }
154
233
 
155
234
  // Bypass list: domains that skip the proxy
@@ -182,6 +261,20 @@ async function applyProxyAuth(page, siteConfig, forceDebug = false) {
182
261
  const parsed = parseProxyUrl(proxyUrl);
183
262
  if (!parsed || !parsed.username) return false;
184
263
 
264
+ // Chromium can't authenticate SOCKS proxies, and page.authenticate() is
265
+ // HTTP-407-only. SOCKS5+creds is handled out-of-band by the local
266
+ // no-auth relay (prepareSocksRelays + getProxyArgs rewrite) — Chromium
267
+ // talks no-auth to 127.0.0.1, so there's nothing for page.authenticate
268
+ // to do here; return quietly. SOCKS4 auth (userId-only, near-extinct)
269
+ // stays genuinely unsupported.
270
+ if (parsed.protocol === 'socks5') {
271
+ return false; // relay handles upstream auth
272
+ }
273
+ if (parsed.protocol === 'socks4') {
274
+ console.warn(formatLogMessage('proxy', `SOCKS4 proxy auth is unsupported (use SOCKS5, which is auto-relayed, or an authenticated HTTP proxy).`));
275
+ return false;
276
+ }
277
+
185
278
  try {
186
279
  await page.authenticate({
187
280
  username: parsed.username,
@@ -265,9 +358,14 @@ function getModuleInfo() {
265
358
  return { version: PROXY_MODULE_VERSION, name: 'Proxy Handler' };
266
359
  }
267
360
 
361
+ // Re-export relay teardown so nwss.js cleanup paths can close listeners.
362
+ const { closeAllRelays: closeAllSocksRelays } = require('./socks-relay');
363
+
268
364
  module.exports = {
269
365
  parseProxyUrl,
270
366
  needsProxy,
367
+ prepareSocksRelays,
368
+ closeAllSocksRelays,
271
369
  getProxyArgs,
272
370
  applyProxyAuth,
273
371
  testProxy,
package/lib/redirect.js CHANGED
@@ -15,6 +15,9 @@ async function navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOp
15
15
  const redirectChain = [currentUrl];
16
16
  let finalUrl = currentUrl;
17
17
  let redirected = false;
18
+ // Hoisted so they're in scope at the return outside the try block below.
19
+ let httpStatus = null;
20
+ let cfRay = null;
18
21
  const jsRedirectTimeout = siteConfig.js_redirect_timeout || 5000; // Wait 5s for JS redirects
19
22
  const maxRedirects = siteConfig.max_redirects || 10;
20
23
  const detectJSPatterns = siteConfig.detect_js_patterns !== false; // Default to true
@@ -23,7 +26,12 @@ async function navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOp
23
26
  const navigationHandler = (frame) => {
24
27
  if (frame === page.mainFrame()) {
25
28
  const frameUrl = frame.url();
26
- if (frameUrl && frameUrl !== 'about:blank' && !redirectChain.includes(frameUrl)) {
29
+ // Skip about:blank and chrome-error:// — the latter is what Puppeteer
30
+ // navigates to on DNS/connection failures, and pushing it into the
31
+ // redirect chain produces bogus entries like
32
+ // "chrome-error://chromewebdata/" that downstream consumers
33
+ // (redirectDomains, logs) treat as a real intermediate hop.
34
+ if (frameUrl && frameUrl !== 'about:blank' && !frameUrl.startsWith('chrome-error://') && !redirectChain.includes(frameUrl)) {
27
35
  // Check redirect limit before adding
28
36
  if (redirectChain.length >= maxRedirects) {
29
37
  if (forceDebug) {
@@ -161,9 +169,21 @@ async function navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOp
161
169
  console.log(formatLogMessage('debug', `Using goto options: ${JSON.stringify(gotoOptions)}`));
162
170
  }
163
171
 
164
- // Initial navigation
172
+ // Initial navigation. Puppeteer's page.goto returns the response for the
173
+ // last HTTP request in the chain (it follows HTTP redirects internally),
174
+ // so response.status() reflects the page that actually rendered, not the
175
+ // 301/302 hop. JS redirects via window.location detected later in this
176
+ // function will land on a different page, in which case httpStatus/cfRay
177
+ // captured here are pre-JS-redirect — a known limitation.
165
178
  const response = await page.goto(currentUrl, gotoOptions);
166
-
179
+ if (response) {
180
+ try {
181
+ httpStatus = response.status();
182
+ const headers = response.headers();
183
+ if (headers && headers['cf-ray']) cfRay = headers['cf-ray'];
184
+ } catch (_) { /* response disposed or detached — fine, stays null */ }
185
+ }
186
+
167
187
  if (response && response.url() !== currentUrl) {
168
188
  // Check redirect limit before adding
169
189
  if (redirectChain.length >= maxRedirects) {
@@ -295,7 +315,7 @@ async function navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOp
295
315
  redirectDomains = intermediateDomains;
296
316
  }
297
317
 
298
- return { finalUrl, redirected, redirectChain, originalUrl: currentUrl, redirectDomains };
318
+ return { finalUrl, redirected, redirectChain, originalUrl: currentUrl, redirectDomains, httpStatus, cfRay };
299
319
  }
300
320
 
301
321
  /**
@@ -306,13 +326,23 @@ async function navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOp
306
326
  * @returns {Promise<Array>} Array of detected patterns
307
327
  */
308
328
  async function detectCommonJSRedirects(page, forceDebug = false, formatLogMessage) {
329
+ // This function's only externally-visible behavior is the per-pattern
330
+ // debug log below. The return value isn't read by any caller. Bail
331
+ // before the expensive page.evaluate + outerHTML serialization when
332
+ // there's no debug consumer for the result.
333
+ if (!forceDebug) return [];
334
+
309
335
  try {
310
336
  const redirectPatterns = await page.evaluate(() => {
311
337
  const patterns = [];
312
-
313
- // Check for common redirect patterns in page source
314
- const pageSource = document.documentElement.outerHTML;
315
-
338
+
339
+ // Cap the source read to 100KB. document.documentElement.outerHTML
340
+ // materializes the full page (potentially many MB on content-heavy
341
+ // sites) AND serializes it over CDP back to Node. JS redirects all
342
+ // appear early — in head meta tags or top-of-body inline scripts —
343
+ // so a head-anchored cap is enough for real-world coverage.
344
+ const pageSource = document.documentElement.outerHTML.substring(0, 100000);
345
+
316
346
  // Pattern 1: window.location = "url"
317
347
  const locationAssign = pageSource.match(/window\.location\s*=\s*["']([^"']+)["']/g);
318
348
  if (locationAssign) {