@fanboynz/network-scanner 2.0.65 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js CHANGED
@@ -9,6 +9,7 @@ const fs = require('fs');
9
9
  const os = require('os');
10
10
  const psl = require('psl');
11
11
  const path = require('path');
12
+ const dnsPromises = require('node:dns/promises');
12
13
  const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
13
14
  const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
14
15
  const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
@@ -27,13 +28,13 @@ const {
27
28
  cleanup: cleanupCloudflareCache
28
29
  } = require('./lib/cloudflare');
29
30
  // FP Bypass
30
- const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy');
31
+ const { handleFlowProxyProtection, getFlowProxyTimeouts, attachFlowProxyHeaderListener } = require('./lib/flowproxy');
31
32
  // ignore_similar rules
32
33
  const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar');
33
34
  // Graceful exit
34
- const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit');
35
+ const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
35
36
  // Whois & Dig
36
- const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats } = require('./lib/nettools');
37
+ const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
37
38
  // File compare
38
39
  const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
39
40
  // CDP functionality
@@ -41,7 +42,29 @@ const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeo
41
42
  // Post-processing cleanup
42
43
  const { processResults } = require('./lib/post-processing');
43
44
  // Colorize various text when used
44
- const { colorize, colors, messageColors, tags, formatLogMessage } = require('./lib/colorize');
45
+ const { messageColors, formatLogMessage } = require('./lib/colorize');
46
// Colored subsystem prefixes, computed once at module load and reused by the
// log call sites below. formatLogMessage only colors the [severity] tag, so
// the bracketed subsystem prefix is colored separately here. Every tag goes
// through messageColors.processing, so they all share one color.
//
// The table maps each constant's name to the literal label it wraps:
//   - SMART_CACHE_TAG is paired with the same constant in lib/smart-cache.js
//     so debug lines from both files produce consistently colored output.
//   - CONCURRENCY_TAG prefixes the batch-throughput (start/completed) lines.
const {
  TIMEOUT_TAG,
  INTERACTION_TAG,
  GHOST_CURSOR_TAG,
  PROXY_TAG,
  GREP_RESPONSE_TAG,
  IGNORE_DOMAINS_BY_URL_TAG,
  BLOCK_DOMAINS_BY_URL_TAG,
  IGNORE_SIMILAR_IGNORED_DOMAINS_TAG,
  IGNORE_SIMILAR_TAG,
  CLEAR_SITEDATA_TAG,
  CSS_BLOCKED_TAG,
  EVAL_ON_DOC_TAG,
  REALTIME_CLEANUP_TAG,
  VPN_TAG,
  SMART_CACHE_TAG,
  CONCURRENCY_TAG,
} = Object.fromEntries(
  Object.entries({
    TIMEOUT_TAG: '[TIMEOUT]',
    INTERACTION_TAG: '[interaction]',
    GHOST_CURSOR_TAG: '[ghost-cursor]',
    PROXY_TAG: '[proxy]',
    GREP_RESPONSE_TAG: '[grep-response]',
    IGNORE_DOMAINS_BY_URL_TAG: '[ignoreDomainsByUrl]',
    BLOCK_DOMAINS_BY_URL_TAG: '[blockDomainsByUrl]',
    IGNORE_SIMILAR_IGNORED_DOMAINS_TAG: '[ignore_similar_ignored_domains]',
    IGNORE_SIMILAR_TAG: '[ignore_similar]',
    CLEAR_SITEDATA_TAG: '[clear_sitedata]',
    CSS_BLOCKED_TAG: '[css_blocked]',
    EVAL_ON_DOC_TAG: '[evalOnDoc]',
    REALTIME_CLEANUP_TAG: '[realtime_cleanup]',
    VPN_TAG: '[vpn]',
    SMART_CACHE_TAG: '[SmartCache]',
    CONCURRENCY_TAG: '[CONCURRENCY]',
  }).map(([constantName, label]) => [constantName, messageColors.processing(label)]),
);
45
68
  // Enhanced mouse interaction and page simulation
46
69
  const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
47
70
  // Optional ghost-cursor support for advanced Bezier-based mouse movements
@@ -50,7 +73,7 @@ const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostR
50
73
  const { createGlobalHelpers, getTotalDomainsSkipped, getDetectedDomainsCount } = require('./lib/domain-cache');
51
74
  const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
52
75
  const { clearPersistentCache } = require('./lib/smart-cache');
53
- const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy } = require('./lib/proxy');
76
+ const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
54
77
  // Dry run functionality
55
78
  const { initializeDryRunCollections, addDryRunMatch, addDryRunNetTools, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
56
79
  // Enhanced site data clearing functionality
@@ -157,7 +180,10 @@ function detectPuppeteerVersion() {
157
180
  // Enhanced redirect handling
158
181
  const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
159
182
  // Ensure web browser is working correctly
160
- const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload, purgeStaleTrackers } = require('./lib/browserhealth');
183
+ // purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
184
+ // and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
185
+ // automatically — manual purging is no longer needed.
186
+ const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
161
187
 
162
188
  // --- Script Configuration & Constants ---
163
189
  const VERSION = '2.0.33'; // Script version
@@ -266,6 +292,13 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
266
292
  }
267
293
 
268
294
  const headfulMode = args.includes('--headful');
295
+ // Sites (esp. video/streaming) call element.requestFullscreen() on load or
296
+ // click. In --headful that hijacks the real Chrome window into true
297
+ // fullscreen, forcing a manual ESC. Neutralize the Fullscreen API by
298
+ // default so it can't. Harmless in headless (no screen — the API is
299
+ // already inert there), so default-on keeps headful consistent with the
300
+ // primary headless path. --allow-fullscreen restores native behavior.
301
+ const allowFullscreen = args.includes('--allow-fullscreen');
269
302
  const SOURCES_FOLDER = 'sources';
270
303
 
271
304
  let outputFile = null;
@@ -326,6 +359,36 @@ const cacheRequests = args.includes('--cache-requests');
326
359
  const dnsCacheMode = args.includes('--dns-cache');
327
360
  if (dnsCacheMode) enableDiskCache();
328
361
 
362
// DNS pre-check ahead of page.goto(). On by default; pass --no-dns-precheck
// to turn it off. The point is to reject NXDOMAIN / unresolvable hostnames in
// under ~100ms rather than spending the ~5-15s Puppeteer + Cloudflare
// detection round-trip on each of them.
const dnsPrecheckEnabled = args.includes('--no-dns-precheck') === false;
const dnsPrecheckTimeoutMs = 2000;

// Negative-lookup cache, scoped to a single scan. OS resolvers do not always
// cache NXDOMAIN answers, and one scan can revisit the same dead hostname many
// times (different URL paths on the same site). Only failures are cached here;
// successful lookups are left to the OS cache. The cache is capped at
// DNS_NEGATIVE_CACHE_MAX with FIFO eviction so a pathological scan (thousands
// of unique dead hosts) cannot grow it without bound -- the same pattern the
// rest of the codebase's in-memory caches use.
const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
const DNS_NEGATIVE_CACHE_TTL_MS = 300000; // 5 minutes (5 * 60 * 1000)
const DNS_NEGATIVE_CACHE_MAX = 1000;

// Skip counters.
let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path

// c-ares transient error codes. Read-only and hoisted to module scope so the
// per-task DNS pre-check does not allocate a fresh Set for every URL.
const DNS_TRANSIENT_ERRORS = new Set([
  'ETIMEOUT',
  'ESERVFAIL',
  'EREFUSED',
  'ECONNREFUSED',
]);
384
+
385
/**
 * Record a failed DNS lookup for `hostname` in the per-scan negative cache.
 *
 * The cache is bounded at DNS_NEGATIVE_CACHE_MAX entries with FIFO eviction:
 * a Map iterates in insertion order, so its first key is the oldest entry.
 *
 * @param {string} hostname - hostname whose lookup failed (cache key)
 * @param {*} error - failure detail stored next to the timestamp
 * @returns {void}
 */
function dnsNegativeCacheSet(hostname, error) {
  // Remove any existing entry before the size check. Overwriting in place
  // keeps the key's original insertion slot, so a freshly re-recorded failure
  // would stay first in line for eviction, and a full cache would evict an
  // unrelated host even though this write does not add a new key.
  dnsNegativeCache.delete(hostname);

  if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
    // Oldest key first; on an empty map this is delete(undefined), a no-op.
    dnsNegativeCache.delete(dnsNegativeCache.keys().next().value);
  }

  dnsNegativeCache.set(hostname, { error, timestamp: Date.now() });
}
391
+
329
392
  let validateRulesFile = null;
330
393
  const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules');
331
394
  if (validateRulesIndex !== -1 && args[validateRulesIndex + 1] && !args[validateRulesIndex + 1].startsWith('--')) {
@@ -643,6 +706,8 @@ General Options:
643
706
  --custom-json <file> Use a custom config JSON file instead of config.json
644
707
  --headful Launch browser with GUI (not headless)
645
708
  --keep-open Keep browser open after scan completes (use with --headful)
709
+ --allow-fullscreen Allow sites to use the Fullscreen API. By default it is
710
+ neutralized so sites can't hijack the window in --headful
646
711
  --use-puppeteer-core Use puppeteer-core with system Chrome instead of bundled Chromium
647
712
  --use-obscura Connect to running Obscura CDP server (ws://127.0.0.1:9222 or OBSCURA_WS env)
648
713
  Skips fingerprint injection — Obscura provides built-in stealth
@@ -658,7 +723,10 @@ General Options:
658
723
 
659
724
  Validation Options:
660
725
  --cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
661
- --dns-cache Persist dig/whois results to disk between runs (3hr/4hr TTL)
726
+ --dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each)
727
+ --no-dns-precheck Disable per-URL DNS resolution check before page navigation.
728
+ By default, URLs whose hostname doesn't resolve are skipped
729
+ immediately (saves ~5-15s of Puppeteer time per dead host).
662
730
  --validate-config Validate config.json file and exit
663
731
  --validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
664
732
  --clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
@@ -669,6 +737,7 @@ Validation Options:
669
737
  Global config.json options:
670
738
  ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
671
739
  ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
740
+ blockDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, ALL subsequent requests on that root domain (and subdomains) are aborted via Puppeteer for the rest of the scan
672
741
  blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
673
742
  whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
674
743
  ignore_similar: true/false Ignore domains similar to already found domains (default: true)
@@ -838,6 +907,7 @@ const {
838
907
  sites = [],
839
908
  ignoreDomains = [],
840
909
  ignoreDomainsByUrl = [],
910
+ blockDomainsByUrl = [],
841
911
  blocked: globalBlocked = [],
842
912
  whois_delay = 3000,
843
913
  whois_server_mode = 'random',
@@ -927,10 +997,11 @@ if (validateConfig) {
927
997
  }
928
998
  }
929
999
 
930
- // Pre-compile global blocked regexes ONCE (used in every processUrl call)
931
- const globalBlockedRegexes = Array.isArray(globalBlocked)
932
- ? globalBlocked.map(pattern => new RegExp(pattern))
933
- : [];
1000
+ // Pre-compile global blocked regexes ONCE (used in every processUrl call).
1001
+ // Was: bare `.map(pattern => new RegExp(pattern))` which hard-threw at
1002
+ // module load on a single bad pattern, killing scan startup. Helper now
1003
+ // warns + skips so the rest of the config can still run.
1004
+ const globalBlockedRegexes = compilePatternList('blocked (global)', globalBlocked);
934
1005
 
935
1006
  // Cache compiled regexes by pattern string — avoids recompiling same patterns across URLs
936
1007
  const _compiledRegexCache = new Map();
@@ -949,6 +1020,44 @@ function getCompiledRegexes(patterns) {
949
1020
  return arr.map(p => getCompiledRegex(p));
950
1021
  }
951
1022
 
1023
/**
 * Compile a list of regex pattern strings, WARNING loudly on any entry that
 * cannot be used instead of:
 *   (a) silently dropping it (old ignoreDomainsByUrl/blockDomainsByUrl
 *       behavior) -- made debugging "why isn't my pattern matching?"
 *       miserable, and
 *   (b) hard-throwing at module load (old `blocked` behavior) -- one bad
 *       pattern would kill the whole scan startup.
 *
 * Entries that are neither a string nor a RegExp are also warned about and
 * skipped. Left alone, `new RegExp` coerces them into patterns nobody wrote
 * (null -> /null/, 42 -> /42/, ["a","b"] -> /a,b/) without any error.
 *
 * Returns the array of successfully compiled regexes, in input order. Each
 * skipped entry produces one warn line naming the config key, the offending
 * value and the reason.
 *
 * @param {string} configKey - name of the config key, for warn context
 * @param {string[]} patterns - raw regex source strings; a non-array yields []
 * @param {(p:string)=>RegExp} [compile] - compile fn (defaults to new RegExp)
 * @returns {RegExp[]}
 */
function compilePatternList(configKey, patterns, compile = (p) => new RegExp(p)) {
  if (!Array.isArray(patterns)) return [];

  const compiled = [];
  for (const pattern of patterns) {
    // RegExp instances were accepted by the original code path, so keep
    // accepting them; everything else that is not a string is a config error.
    if (typeof pattern !== 'string' && !(pattern instanceof RegExp)) {
      console.warn(formatLogMessage('warn', `[config] ${configKey} pattern dropped (not a string): ${JSON.stringify(pattern)}`));
      continue;
    }
    try {
      compiled.push(compile(pattern));
    } catch (err) {
      // A custom compile fn may throw something without a .message.
      const reason = err && err.message ? err.message : String(err);
      console.warn(formatLogMessage('warn', `[config] ${configKey} pattern dropped (compile error): ${JSON.stringify(pattern)} -- ${reason}`));
    }
  }
  return compiled;
}
1054
+
1055
+ // Per-pattern match counters for the `blocked` regex (site + global,
1056
+ // combined). Keyed by RegExp.source so the same pattern appearing in both
1057
+ // site and global lists rolls up into one row. Reported at scan end so
1058
+ // stale patterns that match zero requests are easy to spot and prune.
1059
+ const _blockedPatternHits = new Map();
1060
+
952
1061
  // Pre-split ignoreDomains into exact Set (O(1) lookup) and wildcard array
953
1062
  const _ignoreDomainsExact = new Set();
954
1063
  const _ignoreDomainsWildcard = [];
@@ -960,15 +1069,23 @@ for (const pattern of ignoreDomains) {
960
1069
  }
961
1070
  }
962
1071
 
963
- // Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains
964
- const _ignoreDomainsByUrlRegexes = Array.isArray(ignoreDomainsByUrl)
965
- ? ignoreDomainsByUrl.map(p => {
966
- try { return getCompiledRegex(p); } catch { return null; }
967
- }).filter(r => r)
968
- : [];
1072
+ // Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains.
1073
+ // Bad patterns warn (via compilePatternList) instead of silently dropping.
1074
+ const _ignoreDomainsByUrlRegexes = compilePatternList('ignoreDomainsByUrl', ignoreDomainsByUrl, getCompiledRegex);
969
1075
  // Runtime Set of domains marked ignored by URL pattern matches — shared across all sites in this scan
970
1076
  const _dynamicallyIgnoredDomains = new Set();
971
1077
 
1078
+ // blockDomainsByUrl: symmetric to ignoreDomainsByUrl but for active
1079
+ // blocking via Puppeteer's request.abort(). When a request URL matches
1080
+ // one of these regex patterns, the request's root domain is added to
1081
+ // _dynamicallyBlockedDomains; subsequent requests on that domain (and
1082
+ // its subdomains, via parent-walk in matchesDynamicBlock) get aborted
1083
+ // before reaching the network. The triggering request itself is also
1084
+ // aborted -- same "gate fires immediately after trigger" semantic the
1085
+ // ignoreDomainsByUrl path uses for the dynamic Set short-circuit.
1086
+ const _blockDomainsByUrlRegexes = compilePatternList('blockDomainsByUrl', blockDomainsByUrl, getCompiledRegex);
1087
+ const _dynamicallyBlockedDomains = new Set();
1088
+
972
1089
  // Apply global configuration overrides with validation
973
1090
  // Priority: Command line args > config.json > defaults
974
1091
  const MAX_CONCURRENT_SITES = (() => {
@@ -1065,7 +1182,7 @@ function safeMarkDomainProcessed(domain, context, metadata) {
1065
1182
  }
1066
1183
  } catch (cacheErr) {
1067
1184
  if (forceDebug) {
1068
- console.log(formatLogMessage('debug', `[SmartCache] Error marking domain: ${cacheErr.message}`));
1185
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
1069
1186
  }
1070
1187
  }
1071
1188
  }
@@ -1379,16 +1496,58 @@ function shouldBypassCacheForUrl(url, siteConfig) {
1379
1496
  // ability to use wildcards in ignoreDomains
1380
1497
  // Cache compiled wildcard regexes to avoid recompilation on every request
1381
1498
  const _wildcardRegexCache = new Map();
1499
+
1500
// Parent-walk membership test against a single Set: true when `domain`
// itself, or any suffix obtained by dropping leading labels one at a time
// (down to and including the bare TLD), is a member of `set`.
//
// matchesIgnoreDomain keeps its own inline walk because it probes two Sets per
// step; this helper serves single-Set consumers such as matchesDynamicBlock.
function _domainOrParentInSet(set, domain) {
  // Nothing can match an empty set -- skip the string work entirely.
  if (set.size === 0) return false;
  if (set.has(domain)) return true;

  // Each '.' marks the start of the next parent: for "sub.ads.example.com"
  // the probes are "ads.example.com", "example.com", then "com".
  let dotAt = domain.indexOf('.');
  while (dotAt !== -1) {
    if (set.has(domain.slice(dotAt + 1))) return true;
    dotAt = domain.indexOf('.', dotAt + 1);
  }
  return false;
}
1516
+
1517
/**
 * Block-side counterpart to the ignore gate: reports whether `domain`, or
 * any of its parent domains, is currently in _dynamicallyBlockedDomains
 * (the runtime Set that blockDomainsByUrl pattern matches add to).
 *
 * Meant to be consulted per request, ahead of the static blocked-regex
 * check, to decide whether the request should be aborted.
 *
 * @param {string} domain - hostname of the request being inspected
 * @returns {boolean} true when the domain or one of its parents is blocked
 */
function matchesDynamicBlock(domain) {
  const blockedRoots = _dynamicallyBlockedDomains;
  return _domainOrParentInSet(blockedRoots, domain);
}
1527
+
1382
1528
  function matchesIgnoreDomain(domain, ignorePatterns) {
1383
- // Dynamically ignored domains (from URL pattern matches via ignoreDomainsByUrl)
1384
- if (_dynamicallyIgnoredDomains.has(domain)) return true;
1385
- // Fast path: exact match or suffix match against Set (O(n) for parts, but no regex)
1386
- if (_ignoreDomainsExact.size > 0) {
1387
- if (_ignoreDomainsExact.has(domain)) return true;
1388
- // Check parent domains: sub.ads.example.com ads.example.com example.com
1529
+ // Both dynamic and static ignore lists are walked parent-by-parent so a
1530
+ // subdomain of an ignored root inherits the ignore. Previously the
1531
+ // dynamic check was exact-only, creating an asymmetry: a static-config
1532
+ // `example.com` ignored cdn.example.com transitively, but a runtime
1533
+ // ignoreDomainsByUrl match for the same root (stored as root via
1534
+ // checkedRootDomain at line ~2993) did NOT cascade -- subdomains slipped
1535
+ // through to dig/whois/regex despite the root being ignored. Now
1536
+ // unified: parts split once, shared between both Set probes.
1537
+ const hasDynamic = _dynamicallyIgnoredDomains.size > 0;
1538
+ const hasExact = _ignoreDomainsExact.size > 0;
1539
+
1540
+ if (hasDynamic || hasExact) {
1541
+ // Exact-domain hit on either set wins early.
1542
+ if (hasDynamic && _dynamicallyIgnoredDomains.has(domain)) return true;
1543
+ if (hasExact && _ignoreDomainsExact.has(domain)) return true;
1544
+
1545
+ // Parent-walk: sub.ads.example.com → ads.example.com → example.com
1389
1546
  const parts = domain.split('.');
1390
1547
  for (let i = 1; i < parts.length; i++) {
1391
- if (_ignoreDomainsExact.has(parts.slice(i).join('.'))) return true;
1548
+ const parent = parts.slice(i).join('.');
1549
+ if (hasDynamic && _dynamicallyIgnoredDomains.has(parent)) return true;
1550
+ if (hasExact && _ignoreDomainsExact.has(parent)) return true;
1392
1551
  }
1393
1552
  }
1394
1553
 
@@ -1830,7 +1989,7 @@ function setupFrameHandling(page, forceDebug) {
1830
1989
  wgDisconnectAll(forceDebug);
1831
1990
  ovpnDisconnectAll(forceDebug);
1832
1991
  cleanupCloudflareCache();
1833
- purgeStaleTrackers();
1992
+ try { await closeAllSocksRelays(forceDebug); } catch (_) {}
1834
1993
  }
1835
1994
 
1836
1995
  let siteCounter = 0;
@@ -1981,28 +2140,46 @@ function setupFrameHandling(page, forceDebug) {
1981
2140
  'Browser disconnected'
1982
2141
  ]);
1983
2142
 
2143
+ // Popup-capture cleanup registry — declared outside the try so the
2144
+ // finally block (which is a separate lexical scope from try) can see
2145
+ // it. Populated by the capture_popups setup block if siteConfig
2146
+ // .capture_popups is true; iterated in finally to deregister the
2147
+ // browser 'targetcreated' listener and close any tracked popup pages.
2148
+ const popupCleanups = [];
2149
+ // Race-window guard: 'targetcreated' fires synchronously, but
2150
+ // onTargetCreated does an `await target.page()`. If a popup target
2151
+ // is created right as the per-URL try block winds down, the await
2152
+ // can resolve AFTER finally has already iterated popupCleanups —
2153
+ // leaving the popup unregistered for manual cleanup (it still gets
2154
+ // closed by its own 3s auto-close timer, but in the meantime its
2155
+ // request listener could capture matches into matchedDomains for a
2156
+ // URL that already "finished"). The flag is set in finally and
2157
+ // checked at the start of onTargetCreated to short-circuit late
2158
+ // events cleanly.
2159
+ let urlFinished = false;
2160
+
1984
2161
  try {
1985
2162
 
1986
2163
  // --- Connect VPN if configured for this site ---
1987
2164
  if (siteConfig.vpn) {
1988
2165
  const vpnResult = await wgConnect(siteConfig, forceDebug);
1989
2166
  if (!vpnResult.success) {
1990
- console.warn(formatLogMessage('warn', `[vpn] WireGuard failed for ${currentUrl}: ${vpnResult.error}`));
2167
+ console.warn(formatLogMessage('warn', `${VPN_TAG} WireGuard failed for ${currentUrl}: ${vpnResult.error}`));
1991
2168
  return { url: currentUrl, rules: [], success: false, vpnFailed: true };
1992
2169
  }
1993
2170
  if (!silentMode) {
1994
2171
  const ipInfo = vpnResult.externalIP ? ` (${vpnResult.externalIP})` : '';
1995
- console.log(formatLogMessage('info', `[vpn] WireGuard connected via ${vpnResult.interface}${ipInfo} for ${currentUrl}`));
2172
+ console.log(formatLogMessage('info', `${VPN_TAG} WireGuard connected via ${vpnResult.interface}${ipInfo} for ${currentUrl}`));
1996
2173
  }
1997
2174
  } else if (siteConfig.openvpn) {
1998
2175
  const ovpnResult = await ovpnConnect(siteConfig, forceDebug);
1999
2176
  if (!ovpnResult.success) {
2000
- console.warn(formatLogMessage('warn', `[vpn] OpenVPN failed for ${currentUrl}: ${ovpnResult.error}`));
2177
+ console.warn(formatLogMessage('warn', `${VPN_TAG} OpenVPN failed for ${currentUrl}: ${ovpnResult.error}`));
2001
2178
  return { url: currentUrl, rules: [], success: false, vpnFailed: true };
2002
2179
  }
2003
2180
  if (!silentMode) {
2004
2181
  const ipInfo = ovpnResult.externalIP ? ` (${ovpnResult.externalIP})` : '';
2005
- console.log(formatLogMessage('info', `[vpn] OpenVPN connected via ${ovpnResult.connection}${ipInfo} for ${currentUrl}`));
2182
+ console.log(formatLogMessage('info', `${VPN_TAG} OpenVPN connected via ${ovpnResult.connection}${ipInfo} for ${currentUrl}`));
2006
2183
  }
2007
2184
  }
2008
2185
 
@@ -2036,12 +2213,12 @@ function setupFrameHandling(page, forceDebug) {
2036
2213
  const totalDelay = siteDelay + bufferTime;
2037
2214
 
2038
2215
  if (forceDebug && hasCloudflareConfig) {
2039
- console.log(formatLogMessage('debug', `[realtime_cleanup] Using extended delay for Cloudflare site: ${totalDelay}ms (${siteDelay}ms + ${bufferTime}ms CF buffer)`));
2216
+ console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Using extended delay for Cloudflare site: ${totalDelay}ms (${siteDelay}ms + ${bufferTime}ms CF buffer)`));
2040
2217
  }
2041
2218
 
2042
2219
  const realtimeResult = await performRealtimeWindowCleanup(browserInstance, threshold, forceDebug, totalDelay);
2043
2220
  if (realtimeResult.success && realtimeResult.closedCount > 0 && forceDebug) {
2044
- console.log(formatLogMessage('debug', `[realtime_cleanup] Cleaned ${realtimeResult.closedCount} old pages, ${realtimeResult.remainingPages} remaining`));
2221
+ console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Cleaned ${realtimeResult.closedCount} old pages, ${realtimeResult.remainingPages} remaining`));
2045
2222
  }
2046
2223
  }
2047
2224
 
@@ -2052,7 +2229,7 @@ function setupFrameHandling(page, forceDebug) {
2052
2229
  // Aggressive timeouts prevent hanging in Puppeteer 23.x while maintaining speed
2053
2230
 
2054
2231
  page.on('console', (msg) => {
2055
- if (forceDebug && msg.type() === 'error') console.log(`[debug] Console error: ${msg.text()}`);
2232
+ if (forceDebug && msg.type() === 'error') console.log(formatLogMessage('debug', `Console error: ${msg.text()}`));
2056
2233
  });
2057
2234
 
2058
2235
  // Add page crash handler
@@ -2113,6 +2290,11 @@ function setupFrameHandling(page, forceDebug) {
2113
2290
  const flowproxyTimeouts = getFlowProxyTimeouts(siteConfig);
2114
2291
  page.setDefaultTimeout(Math.min(flowproxyTimeouts.pageTimeout, TIMEOUTS.DEFAULT_NAVIGATION));
2115
2292
  page.setDefaultNavigationTimeout(Math.min(flowproxyTimeouts.navigationTimeout, TIMEOUTS.DEFAULT_PAGE));
2293
+ // Attach the response/header listener BEFORE navigation so the
2294
+ // document response's own headers (Server, Set-Cookie, X-FlowProxy-*,
2295
+ // etc.) are observed. The listener accumulates state in a WeakMap
2296
+ // keyed by page; analyzeFlowProxyProtection reads from it later.
2297
+ attachFlowProxyHeaderListener(page);
2116
2298
  if (forceDebug) {
2117
2299
  console.log(formatLogMessage('debug', `Applied flowProxy timeouts - page: ${flowproxyTimeouts.pageTimeout}ms, nav: ${flowproxyTimeouts.navigationTimeout}ms`));
2118
2300
  }
@@ -2131,9 +2313,9 @@ function setupFrameHandling(page, forceDebug) {
2131
2313
  if (shouldInjectEvalForPage) {
2132
2314
  if (forceDebug) {
2133
2315
  if (globalEvalOnDoc) {
2134
- console.log(formatLogMessage('debug', `[evalOnDoc] Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
2316
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
2135
2317
  } else { // siteConfig.evaluateOnNewDocument must be true
2136
- console.log(formatLogMessage('debug', `[evalOnDoc] Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
2318
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
2137
2319
  }
2138
2320
  }
2139
2321
 
@@ -2154,7 +2336,7 @@ function setupFrameHandling(page, forceDebug) {
2154
2336
  browserResponsive = true;
2155
2337
  } catch (healthErr) {
2156
2338
  if (forceDebug) {
2157
- console.log(formatLogMessage('debug', `[evalOnDoc] Browser health check failed: ${healthErr.message}`));
2339
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser health check failed: ${healthErr.message}`));
2158
2340
  }
2159
2341
  browserResponsive = false;
2160
2342
  }
@@ -2253,7 +2435,7 @@ function setupFrameHandling(page, forceDebug) {
2253
2435
  ]);
2254
2436
  evalOnDocSuccess = true;
2255
2437
  if (forceDebug) {
2256
- console.log(formatLogMessage('debug', `[evalOnDoc] Full injection successful for ${currentUrl}`));
2438
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection successful for ${currentUrl}`));
2257
2439
  }
2258
2440
  } catch (fullInjectionErr) {
2259
2441
  // Enhanced error detection for CDP issues
@@ -2264,12 +2446,12 @@ function setupFrameHandling(page, forceDebug) {
2264
2446
 
2265
2447
  if (forceDebug) {
2266
2448
  const errorType = isCDPError ? 'CDP/Protocol error' : 'timeout/other';
2267
- console.log(formatLogMessage('debug', `[evalOnDoc] Full injection failed (${errorType}): ${fullInjectionErr.message}`));
2449
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection failed (${errorType}): ${fullInjectionErr.message}`));
2268
2450
  }
2269
2451
 
2270
2452
  // Skip fallback for CDP errors - they indicate browser communication issues
2271
2453
  if (isCDPError) {
2272
- console.warn(formatLogMessage('warn', `[evalOnDoc] CDP communication failure - skipping injection for ${currentUrl}`));
2454
+ console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} CDP communication failure - skipping injection for ${currentUrl}`));
2273
2455
  evalOnDocSuccess = false;
2274
2456
  } else {
2275
2457
 
@@ -2316,11 +2498,11 @@ function setupFrameHandling(page, forceDebug) {
2316
2498
  ]);
2317
2499
  evalOnDocSuccess = true;
2318
2500
  if (forceDebug) {
2319
- console.log(formatLogMessage('debug', `[evalOnDoc] Minimal injection successful for ${currentUrl}`));
2501
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection successful for ${currentUrl}`));
2320
2502
  }
2321
2503
  } catch (minimalInjectionErr) {
2322
2504
  if (forceDebug) {
2323
- console.log(formatLogMessage('debug', `[evalOnDoc] Minimal injection also failed: ${minimalInjectionErr.message}`));
2505
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection also failed: ${minimalInjectionErr.message}`));
2324
2506
  }
2325
2507
  evalOnDocSuccess = false;
2326
2508
  }
@@ -2328,14 +2510,14 @@ function setupFrameHandling(page, forceDebug) {
2328
2510
  }
2329
2511
  } else {
2330
2512
  if (forceDebug) {
2331
- console.log(formatLogMessage('debug', `[evalOnDoc] Browser unresponsive, skipping injection for ${currentUrl}`));
2513
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser unresponsive, skipping injection for ${currentUrl}`));
2332
2514
  }
2333
2515
  evalOnDocSuccess = false;
2334
2516
  }
2335
2517
 
2336
2518
  // Final status logging
2337
2519
  if (!evalOnDocSuccess) {
2338
- console.warn(formatLogMessage('warn', `[evalOnDoc] All injection strategies failed for ${currentUrl} - continuing with standard request monitoring only`));
2520
+ console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} All injection strategies failed for ${currentUrl} - continuing with standard request monitoring only`));
2339
2521
  }
2340
2522
  // Allow realtime cleanup to proceed after injection completes
2341
2523
  if (shouldInjectEvalForPage && siteConfig.window_cleanup === "realtime") {
@@ -2364,7 +2546,7 @@ function setupFrameHandling(page, forceDebug) {
2364
2546
  }
2365
2547
  }, { selectors: cssBlockedSelectors });
2366
2548
  } catch (cssErr) {
2367
- console.warn(formatLogMessage('warn', `[css_blocked] Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
2549
+ console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
2368
2550
  }
2369
2551
  }
2370
2552
  // --- END: CSS Element Blocking Setup ---
@@ -2421,7 +2603,7 @@ function setupFrameHandling(page, forceDebug) {
2421
2603
  const clearResult = await clearSiteData(page, currentUrl, forceDebug);
2422
2604
  if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data for ${currentUrl}`));
2423
2605
  } catch (clearErr) {
2424
- if (forceDebug) console.log(formatLogMessage('debug', `[clear_sitedata] Failed for ${currentUrl}: ${clearErr.message}`));
2606
+ if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Failed for ${currentUrl}: ${clearErr.message}`));
2425
2607
  }
2426
2608
  }
2427
2609
 
@@ -2438,6 +2620,29 @@ function setupFrameHandling(page, forceDebug) {
2438
2620
  } else if (forceDebug) {
2439
2621
  console.log(formatLogMessage('debug', `Skipping fingerprint injection — Obscura provides built-in stealth`));
2440
2622
  }
2623
+
2624
+ // Neutralize the Fullscreen API before any page script runs so a
2625
+ // site can't force the real browser window fullscreen in --headful
2626
+ // (or trip an anti-bot check that reads document.fullscreenElement).
2627
+ // requestFullscreen is stubbed to a resolved no-op. (Real browsers
2628
+ // reject the promise when it's called without a user gesture, so
2629
+ // this only approximates native behaviour.) fullscreenElement
2630
+ // stays null naturally since we never enter fullscreen.
2631
+ if (!allowFullscreen) {
2632
+ try {
2633
+ await page.evaluateOnNewDocument(() => {
2634
+ const noop = function () { return Promise.resolve(); };
2635
+ const legacyNoop = function () {};
2636
+ try { Element.prototype.requestFullscreen = noop; } catch (_) {}
2637
+ try { Element.prototype.webkitRequestFullscreen = legacyNoop; } catch (_) {}
2638
+ try { Element.prototype.webkitRequestFullScreen = legacyNoop; } catch (_) {}
2639
+ try { Element.prototype.mozRequestFullScreen = legacyNoop; } catch (_) {}
2640
+ try { Element.prototype.msRequestFullscreen = legacyNoop; } catch (_) {}
2641
+ });
2642
+ } catch (fsErr) {
2643
+ if (forceDebug) console.log(formatLogMessage('debug', `Fullscreen neutralization injection failed: ${fsErr.message}`));
2644
+ }
2645
+ }
2441
2646
 
2442
2647
  // Client Hints protection for Chrome user agents (skipped under Obscura — it sets its own)
2443
2648
  if (!useObscura && siteConfig.userAgent && siteConfig.userAgent.toLowerCase().includes('chrome')) {
@@ -2624,19 +2829,41 @@ function setupFrameHandling(page, forceDebug) {
2624
2829
  });
2625
2830
  }
2626
2831
 
2627
- const blockedRegexes = Array.isArray(siteConfig.blocked)
2628
- ? siteConfig.blocked.map(pattern => getCompiledRegex(pattern))
2629
- : [];
2832
+ // Per-site blocked compile -- helper warns on bad patterns instead of
2833
+ // throwing out of processUrl and breaking that site's scan.
2834
+ const blockedRegexes = compilePatternList(`blocked (site: ${siteConfig.url || 'unknown'})`, siteConfig.blocked, getCompiledRegex);
2835
+
2836
+ // Per-site escape hatch: disable_adblock turns off the two layers of
2837
+ // "global" ad-blocking for this URL — the adblock-rs filter-list engine
2838
+ // and the globalBlockedRegexes pattern list. Per-site siteConfig.blocked
2839
+ // is preserved (it's an explicit per-site choice, not "global" blocking).
2840
+ //
2841
+ // The use case: capture_popups + popunder/redirect chains. The global
2842
+ // adblock often aborts the exact requests that fire the popup or chain
2843
+ // to the tracker, defeating capture. Setting disable_adblock: true for
2844
+ // those specific URLs lets the chain play out naturally so the popup
2845
+ // request listener can observe the full hop sequence.
2846
+ const disableAdblock = siteConfig.disable_adblock === true;
2630
2847
 
2631
2848
  // Pre-build Set for O(1) resourceType lookups (fired per request)
2632
2849
  const allowedResourceTypesSet = Array.isArray(siteConfig.resourceTypes)
2633
2850
  ? new Set(siteConfig.resourceTypes)
2634
2851
  : null;
2635
-
2636
- // Combine site-specific with pre-compiled global blocked patterns
2637
- const allBlockedRegexes = blockedRegexes.length > 0
2638
- ? [...blockedRegexes, ...globalBlockedRegexes]
2639
- : globalBlockedRegexes; // Avoid spread when no site-specific patterns
2852
+
2853
+ // Combine site-specific with pre-compiled global blocked patterns.
2854
+ // When disable_adblock is true, globalBlockedRegexes is omitted so
2855
+ // only the per-site list applies.
2856
+ const allBlockedRegexes = disableAdblock
2857
+ ? blockedRegexes
2858
+ : (blockedRegexes.length > 0
2859
+ ? [...blockedRegexes, ...globalBlockedRegexes]
2860
+ : globalBlockedRegexes); // Avoid spread when no site-specific patterns
2861
+
2862
+ if (disableAdblock && forceDebug) {
2863
+ const dropped = globalBlockedRegexes.length;
2864
+ const adblockNote = adblockEnabled && adblockMatcher ? ' + adblock-rs engine' : '';
2865
+ console.log(formatLogMessage('debug', `[adblock] disable_adblock=true for ${currentUrl} — skipping ${dropped} global blocked patterns${adblockNote} (site-level ${blockedRegexes.length} pattern(s) still apply)`));
2866
+ }
2640
2867
 
2641
2868
  /**
2642
2869
  * Helper function to add domain to matched collection
@@ -2663,7 +2890,7 @@ function setupFrameHandling(page, forceDebug) {
2663
2890
  const cachedSimilarity = smartCache.getCachedSimilarity(domain, existingDomain);
2664
2891
  if (cachedSimilarity !== null && cachedSimilarity >= similarityThreshold) {
2665
2892
  if (forceDebug) {
2666
- console.log(formatLogMessage('debug', `[SmartCache] Used cached similarity: ${domain} ~= ${existingDomain} (${cachedSimilarity}%)`));
2893
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Used cached similarity: ${domain} ~= ${existingDomain} (${cachedSimilarity}%)`));
2667
2894
  }
2668
2895
  return; // Skip adding this domain
2669
2896
  }
@@ -2687,7 +2914,7 @@ function setupFrameHandling(page, forceDebug) {
2687
2914
 
2688
2915
  if (smartCache && smartCache.shouldSkipDomain(domain, context)) {
2689
2916
  if (forceDebug) {
2690
- console.log(formatLogMessage('debug', `[SmartCache] Skipping cached domain: ${domain}`));
2917
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Skipping cached domain: ${domain}`));
2691
2918
  }
2692
2919
  return; // Skip adding this domain
2693
2920
  }
@@ -2705,7 +2932,7 @@ function setupFrameHandling(page, forceDebug) {
2705
2932
 
2706
2933
  if (similarCheck.shouldIgnore) {
2707
2934
  if (forceDebug) {
2708
- console.log(formatLogMessage('debug', `[ignore_similar] Skipping ${domain}: ${similarCheck.reason}`));
2935
+ console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_TAG} Skipping ${domain}: ${similarCheck.reason}`));
2709
2936
  }
2710
2937
  return; // Skip adding this domain
2711
2938
  }
@@ -2721,7 +2948,7 @@ function setupFrameHandling(page, forceDebug) {
2721
2948
 
2722
2949
  if (ignoredSimilarCheck.shouldIgnore) {
2723
2950
  if (forceDebug) {
2724
- console.log(formatLogMessage('debug', `[ignore_similar_ignored_domains] Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
2951
+ console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_IGNORED_DOMAINS_TAG} Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
2725
2952
  }
2726
2953
  return; // Skip adding this domain
2727
2954
  }
@@ -2742,7 +2969,7 @@ function setupFrameHandling(page, forceDebug) {
2742
2969
  }
2743
2970
  } catch (cacheErr) {
2744
2971
  if (forceDebug) {
2745
- console.log(formatLogMessage('debug', `[SmartCache] Error marking domain: ${cacheErr.message}`));
2972
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
2746
2973
  }
2747
2974
  }
2748
2975
  }
@@ -2760,6 +2987,247 @@ function setupFrameHandling(page, forceDebug) {
2760
2987
  }
2761
2988
  }
2762
2989
 
2990
+ // === POPUP CAPTURE (opt-in via siteConfig.capture_popups: true) ===
2991
+ // Many ad networks fire popunders / new-tab opens (window.open, target=
2992
+ // "_blank") that navigate to trackers and disappear from view. Those
2993
+ // pages are SEPARATE Puppeteer targets — page.on('request', ...) on the
2994
+ // main page never sees their network traffic.
2995
+ //
2996
+ // IMPORTANT: modern Chromium blocks programmatic window.open() unless
2997
+ // it's triggered by a real user gesture. In practice that means
2998
+ // capture_popups only catches anything when the scanner is actually
2999
+ // clicking on the page — i.e., the site config also has
3000
+ // `interact: true` AND `interact_clicks: true`. Setting capture_popups
3001
+ // alone will register the listener but no popups will fire.
3002
+ //
3003
+ // When capture_popups is true, we attach a browser-level 'targetcreated'
3004
+ // listener for THIS URL only. New page targets whose opener-chain leads
3005
+ // back to our main page (within maxDepth levels) get a stripped-down
3006
+ // request listener — same regex/first-party/ignoreDomains filter as
3007
+ // the main handler, same addMatchedDomain() sink, same domain
3008
+ // detection cache, same nettools/similarity logic (all inherited via
3009
+ // addMatchedDomain). Cloudflare bypass, adblock-rs matching, curl/grep
3010
+ // content download, and request.abort() are intentionally skipped on
3011
+ // popups — they're observation-only.
3012
+ //
3013
+ // Each popup's request listener stays attached across in-window
3014
+ // navigations, so a single popup that redirects A -> B -> C captures
3015
+ // every hop. The capture window (default 5s, configurable per-site
3016
+ // via capture_popups_window_ms) is the wall-clock budget for that
3017
+ // chain — bump it for long redirect chains, lower it for high-popup-
3018
+ // rate sites where memory pressure matters more than chain coverage.
3019
+ const capturePopups = siteConfig.capture_popups === true;
3020
+ // Per-site overrides (with sane defaults). Parsed as numbers so config
3021
+ // values from JSON come through correctly; falsy / non-positive values
3022
+ // fall back to the default rather than silently disabling capture.
3023
+ const POPUP_MAX_DEPTH = (() => {
3024
+ const v = parseInt(siteConfig.capture_popups_max_depth, 10);
3025
+ return Number.isFinite(v) && v > 0 ? v : 2;
3026
+ })();
3027
+ const POPUP_CAPTURE_WINDOW_MS = (() => {
3028
+ const v = parseInt(siteConfig.capture_popups_window_ms, 10);
3029
+ return Number.isFinite(v) && v > 0 ? v : 5000;
3030
+ })();
3031
+
3032
+ if (capturePopups && forceDebug) {
3033
+ // One-time setup-time warning if the click prerequisite isn't met.
3034
+ // Without clicks, capture_popups is a no-op in practice.
3035
+ const hasClicks = siteConfig.interact === true && siteConfig.interact_clicks === true;
3036
+ if (!hasClicks) {
3037
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but interact_clicks is not — popups need user-gesture clicks to fire; expect no captures unless the page opens popups via in-page redirects`));
3038
+ }
3039
+ console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
3040
+ }
3041
+
3042
+ if (capturePopups) {
3043
+ const mainTarget = page.target();
3044
+
3045
+ // Walk target.opener() chain to find depth relative to mainTarget.
3046
+ // Returns 0 if mainTarget is not reached within POPUP_MAX_DEPTH + 2
3047
+ // opener hops (unrelated or too deep), 1 for a direct popup of the main page, 2 for popup-of-popup, etc.
3048
+ const getPopupDepth = (target) => {
3049
+ let depth = 0;
3050
+ let cur = target.opener();
3051
+ while (cur && depth <= POPUP_MAX_DEPTH + 1) {
3052
+ depth++;
3053
+ if (cur === mainTarget) return depth;
3054
+ cur = cur.opener();
3055
+ }
3056
+ return 0;
3057
+ };
3058
+
3059
+ // Attach observation-only request listener to a popup page. No
3060
+ // setRequestInterception(true) — page.on('request') fires for every
3061
+ // request regardless of interception state, and we don't need to
3062
+ // block anything on popups.
3063
+ const attachPopupRequestCapture = (popupPage, depth) => {
3064
+ popupPage.on('request', (request) => {
3065
+ try {
3066
+ const checkedUrl = request.url();
3067
+ let fullSubdomain = '';
3068
+ let checkedRootDomain = '';
3069
+ try {
3070
+ const parsedUrl = new URL(checkedUrl);
3071
+ fullSubdomain = parsedUrl.hostname;
3072
+ const pslResult = psl.parse(fullSubdomain);
3073
+ checkedRootDomain = pslResult.domain || fullSubdomain;
3074
+ } catch (_) { return; }
3075
+ if (!checkedRootDomain) return;
3076
+
3077
+ // ignoreDomainsByUrl — if any pattern matches this popup URL,
3078
+ // mark the root domain as ignored for the rest of the scan
3079
+ // (main page + all popups). Mirrors the main handler so a
3080
+ // tracker URL surfaced via popup chain has the same dampening
3081
+ // effect as one surfaced on the main page.
3082
+ if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
3083
+ for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
3084
+ if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
3085
+ _dynamicallyIgnoredDomains.add(checkedRootDomain);
3086
+ if (forceDebug) {
3087
+ console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3088
+ }
3089
+ break;
3090
+ }
3091
+ }
3092
+ }
3093
+
3094
+ // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
3095
+ // above; populating the dynamic block Set from popup URLs lets
3096
+ // tracker URLs surfaced via popup chains poison their root
3097
+ // domain for the rest of the scan just like main-page hits do.
3098
+ if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3099
+ for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3100
+ if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
3101
+ _dynamicallyBlockedDomains.add(checkedRootDomain);
3102
+ if (forceDebug) {
3103
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3104
+ }
3105
+ break;
3106
+ }
3107
+ }
3108
+ }
3109
+
3110
+ // ignoreDomains gate (global; matchesIgnoreDomain also short-
3111
+ // circuits on _dynamicallyIgnoredDomains, so a domain we just
3112
+ // added above will be caught here on the same request).
3113
+ if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
3114
+
3115
+ // Dynamic-block gate for popup requests — early return on
3116
+ // matched root or any parent (parent-walk in
3117
+ // matchesDynamicBlock). Popups are observation-only (no request
3118
+ // interception), so we just return rather than abort; the
3119
+ // popup-request observer treats this as "don't process".
3120
+ if (matchesDynamicBlock(checkedRootDomain)) return;
3121
+
3122
+ // First-party / third-party gate (popup belongs to the main URL's
3123
+ // domain group — its OWN URL doesn't redefine first-party).
3124
+ const isFirstParty = firstPartyDomains.has(checkedRootDomain);
3125
+ if (siteConfig.firstParty === false && isFirstParty) return;
3126
+ if (siteConfig.thirdParty === false && !isFirstParty) return;
3127
+
3128
+ // Regex match against the site's filterRegex list
3129
+ const resourceType = request.resourceType();
3130
+ let regexMatched = false;
3131
+ for (const re of regexes) {
3132
+ if (re.test(checkedUrl)) {
3133
+ regexMatched = true;
3134
+ if (forceDebug) {
3135
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
3136
+ }
3137
+ break;
3138
+ }
3139
+ }
3140
+
3141
+ if (!regexMatched) return;
3142
+
3143
+ // hasNetTools is the same flag the main handler uses.
3144
+ // When the site config carries whois/dig terms, regex match is
3145
+ // not sufficient by itself — the URL must ALSO pass the whois/
3146
+ // dig validation before it counts. Mirrors the main handler's
3147
+ // behavior so 'capture popup domains that match regex/dig/whois'
3148
+ // means the same thing for popups as for the main page.
3149
+ if (hasNetTools) {
3150
+ const popupNetToolsHandler = createNetToolsHandler({
3151
+ whoisTerms, whoisOrTerms,
3152
+ processedWhoisDomains: globalProcessedWhoisDomains,
3153
+ processedDigDomains: globalProcessedDigDomains,
3154
+ whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
3155
+ whoisServer,
3156
+ whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
3157
+ debugLogFile,
3158
+ digTerms, digOrTerms, digRecordType,
3159
+ digSubdomain: siteConfig.dig_subdomain === true,
3160
+ dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
3161
+ matchedDomains, addMatchedDomain,
3162
+ isDomainAlreadyDetected: isLocallyDetected,
3163
+ onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
3164
+ onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
3165
+ cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
3166
+ cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
3167
+ currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
3168
+ ignoreDomains, matchesIgnoreDomain
3169
+ });
3170
+ setImmediate(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3171
+ } else {
3172
+ // No nettools required — regex match alone counts.
3173
+ addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3174
+ }
3175
+ } catch (_) { /* observation-only — never let a popup error escape */ }
3176
+ });
3177
+ };
3178
+
3179
+ const onTargetCreated = async (target) => {
3180
+ // Short-circuit guard: if finally has already started, don't attach
3181
+ // a request listener whose closure would outlive its meaningful
3182
+ // scope. The race is narrow (a targetcreated firing while we're
3183
+ // mid-await on target.page() across the finally boundary), but
3184
+ // without this guard a late popup could push matches into
3185
+ // matchedDomains for a URL whose processing has already returned.
3186
+ if (urlFinished) return;
3187
+ if (target.type() !== 'page') return;
3188
+ const depth = getPopupDepth(target);
3189
+ if (depth < 1) return; // Not one of ours
3190
+ if (depth > POPUP_MAX_DEPTH) {
3191
+ if (forceDebug) {
3192
+ console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
3193
+ }
3194
+ return;
3195
+ }
3196
+
3197
+ let popupPage;
3198
+ try { popupPage = await target.page(); } catch (_) { return; }
3199
+ if (!popupPage) return;
3200
+ // Re-check after the await — the per-URL finally may have flipped
3201
+ // the flag while target.page() was resolving.
3202
+ if (urlFinished) {
3203
+ try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
3204
+ return;
3205
+ }
3206
+
3207
+ if (forceDebug) {
3208
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
3209
+ }
3210
+
3211
+ attachPopupRequestCapture(popupPage, depth);
3212
+
3213
+ // Auto-close after the capture window so popups don't pile up.
3214
+ const closeTimer = setTimeout(() => {
3215
+ try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
3216
+ }, POPUP_CAPTURE_WINDOW_MS);
3217
+ if (typeof closeTimer.unref === 'function') closeTimer.unref();
3218
+
3219
+ popupCleanups.push(() => {
3220
+ clearTimeout(closeTimer);
3221
+ try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
3222
+ });
3223
+ };
3224
+
3225
+ browser.on('targetcreated', onTargetCreated);
3226
+ popupCleanups.push(() => {
3227
+ try { browser.off('targetcreated', onTargetCreated); } catch (_) {}
3228
+ });
3229
+ }
3230
+
2763
3231
  // --- page.on('request', ...) Handler: Core Network Request Logic ---
2764
3232
  // This handler is triggered for every network request made by the page.
2765
3233
  // It decides whether to allow, block, or process the request based on:
@@ -2820,15 +3288,17 @@ function setupFrameHandling(page, forceDebug) {
2820
3288
  console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[frame: ${isMainFrame ? 'main' : 'iframe'}] ${debugFrameUrl} → ${checkedUrl}`));
2821
3289
  }
2822
3290
 
2823
- // Apply adblock rules BEFORE expensive regex checks for better performance
2824
- if (adblockEnabled && adblockMatcher) {
3291
+ // Apply adblock-rs filter-list rules BEFORE expensive regex checks
3292
+ // for better performance. Gated on !disableAdblock so per-URL configs
3293
+ // (e.g. for popup/redirect chain capture) can bypass it.
3294
+ if (!disableAdblock && adblockEnabled && adblockMatcher) {
2825
3295
  try {
2826
3296
  const result = adblockMatcher.shouldBlock(
2827
3297
  checkedUrl,
2828
3298
  currentUrl,
2829
3299
  request.resourceType()
2830
3300
  );
2831
-
3301
+
2832
3302
  if (result.blocked) {
2833
3303
  adblockStats.blocked++;
2834
3304
  if (forceDebug) {
@@ -2862,13 +3332,42 @@ function setupFrameHandling(page, forceDebug) {
2862
3332
  if (_ignoreDomainsByUrlRegexes[i].test(reqUrl)) {
2863
3333
  _dynamicallyIgnoredDomains.add(checkedRootDomain);
2864
3334
  if (forceDebug) {
2865
- console.log(formatLogMessage('debug', `[ignoreDomainsByUrl] ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
3335
+ console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
2866
3336
  }
2867
3337
  break;
2868
3338
  }
2869
3339
  }
2870
3340
  }
2871
3341
 
3342
+ // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl above.
3343
+ // If any pattern matches this URL, mark the root domain as blocked
3344
+ // for the rest of the scan. The gate immediately below catches the
3345
+ // triggering request itself + any future request on this domain or
3346
+ // its subdomains (parent-walk via matchesDynamicBlock).
3347
+ if (_blockDomainsByUrlRegexes.length > 0 && checkedRootDomain && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3348
+ for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3349
+ if (_blockDomainsByUrlRegexes[i].test(reqUrl)) {
3350
+ _dynamicallyBlockedDomains.add(checkedRootDomain);
3351
+ if (forceDebug) {
3352
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source}`));
3353
+ }
3354
+ break;
3355
+ }
3356
+ }
3357
+ }
3358
+ // blockDomainsByUrl gate — abort if reqDomain (or a parent) is in
3359
+ // the dynamic block Set. Fires BEFORE the static blocked-regex
3360
+ // check so domain-based blocks short-circuit without paying the
3361
+ // per-URL regex scan. Same abort reason as the static path so
3362
+ // request.failure() observers see consistent metadata.
3363
+ if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
3364
+ if (forceDebug) {
3365
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
3366
+ }
3367
+ request.abort('blockedbyclient');
3368
+ return;
3369
+ }
3370
+
2872
3371
  let blockedMatchIndex = -1;
2873
3372
  for (let i = 0; i < allBlockedRegexes.length; i++) {
2874
3373
  if (allBlockedRegexes[i].test(reqUrl)) {
@@ -2877,8 +3376,16 @@ function setupFrameHandling(page, forceDebug) {
2877
3376
  }
2878
3377
  }
2879
3378
  if (blockedMatchIndex !== -1) {
3379
+ // Always track the hit, even with debug off (one Map get/set), so the
3380
+ // scan-end summary can show which patterns are doing work vs.
3381
+ // which are stale and ready to prune. Keyed by pattern.source --
3382
+ // identical patterns from site + global lists roll up together,
3383
+ // which matches how users think about them.
3384
+ const matchedPatternSrc = allBlockedRegexes[blockedMatchIndex].source;
3385
+ _blockedPatternHits.set(matchedPatternSrc, (_blockedPatternHits.get(matchedPatternSrc) || 0) + 1);
3386
+
2880
3387
  if (forceDebug) {
2881
- const matchedPattern = allBlockedRegexes[blockedMatchIndex].source;
3388
+ const matchedPattern = matchedPatternSrc;
2882
3389
  const patternSource = blockedMatchIndex < blockedRegexes.length ? 'site' : 'global';
2883
3390
  console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked]')}[${simplifiedCurrentUrl}] ${reqUrl} blocked by ${patternSource} pattern: ${matchedPattern}`));
2884
3391
 
@@ -2950,6 +3457,19 @@ function setupFrameHandling(page, forceDebug) {
2950
3457
  return;
2951
3458
  }
2952
3459
 
3460
+ // Early ignoreDomains gate — skip regex + dig/whois entirely for domains
3461
+ // in the ignoreDomains list (or dynamically-ignored ones populated by
3462
+ // ignoreDomainsByUrl above). Mirrors the popup handler's early gate so
3463
+ // the main path doesn't waste a dig/whois lookup on domains that
3464
+ // post-processing/output filters will strip anyway.
3465
+ if (matchesIgnoreDomain(reqDomain, ignoreDomains)) {
3466
+ if (forceDebug) {
3467
+ console.log(formatLogMessage('debug', `Skipping ignoreDomains match: ${reqDomain}`));
3468
+ }
3469
+ request.continue();
3470
+ return;
3471
+ }
3472
+
2953
3473
  // === ENHANCED REGEX MATCHING WITH AND/OR LOGIC ===
2954
3474
  let regexMatched = false;
2955
3475
  let matchedRegexPattern = null;
@@ -3046,9 +3566,11 @@ function setupFrameHandling(page, forceDebug) {
3046
3566
  dumpUrls,
3047
3567
  matchedUrlsLogFile,
3048
3568
  forceDebug,
3049
- fs
3569
+ fs,
3570
+ ignoreDomains,
3571
+ matchesIgnoreDomain
3050
3572
  });
3051
-
3573
+
3052
3574
  // Execute nettools check asynchronously
3053
3575
  const originalDomain = fullSubdomain;
3054
3576
  setImmediate(() => netToolsHandler(reqDomain, originalDomain));
@@ -3122,7 +3644,7 @@ function setupFrameHandling(page, forceDebug) {
3122
3644
  const cachedDig = smartCache ? smartCache.getCachedNetTools(reqDomain, 'dig', digRecordType) : null;
3123
3645
 
3124
3646
  if ((cachedWhois || cachedDig) && forceDebug) {
3125
- console.log(formatLogMessage('debug', `[SmartCache] Using cached nettools results for ${reqDomain}`));
3647
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached nettools results for ${reqDomain}`));
3126
3648
  }
3127
3649
 
3128
3650
  // Create nettools handler with cache callbacks (if cache is enabled)
@@ -3159,9 +3681,11 @@ function setupFrameHandling(page, forceDebug) {
3159
3681
  dumpUrls,
3160
3682
  matchedUrlsLogFile,
3161
3683
  forceDebug,
3162
- fs
3684
+ fs,
3685
+ ignoreDomains,
3686
+ matchesIgnoreDomain
3163
3687
  });
3164
-
3688
+
3165
3689
  // Execute nettools check asynchronously
3166
3690
  const originalDomain = fullSubdomain; // Use full subdomain for nettools
3167
3691
  setImmediate(() => netToolsHandler(reqDomain, originalDomain));
@@ -3218,7 +3742,7 @@ function setupFrameHandling(page, forceDebug) {
3218
3742
  }
3219
3743
 
3220
3744
  if (cachedContent && forceDebug) {
3221
- console.log(formatLogMessage('debug', `[SmartCache] Using cached response content for ${reqUrl.substring(0, 50)}...`));
3745
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached response content for ${reqUrl.substring(0, 50)}...`));
3222
3746
  // Process cached content instead of fetching
3223
3747
  } else {
3224
3748
  try {
@@ -3248,7 +3772,12 @@ function setupFrameHandling(page, forceDebug) {
3248
3772
  forceDebug,
3249
3773
  userAgent: curlUserAgent,
3250
3774
  resourceType,
3251
- hasSearchString: hasSearchString || hasSearchStringAnd,
3775
+ // Pass both flags separately — createGrepHandler now
3776
+ // applies AND logic when hasSearchStringAnd is set.
3777
+ // Previously OR'd into hasSearchString and the AND
3778
+ // patterns were silently dropped.
3779
+ hasSearchString,
3780
+ hasSearchStringAnd,
3252
3781
  grepOptions: {
3253
3782
  ignoreCase: true,
3254
3783
  wholeWord: false,
@@ -3298,7 +3827,7 @@ function setupFrameHandling(page, forceDebug) {
3298
3827
  } else if (useGrep && (hasSearchString || hasSearchStringAnd)) {
3299
3828
  // Use grep with response handler (no curl)
3300
3829
  if (forceDebug) {
3301
- console.log(formatLogMessage('debug', `[grep-response] Queuing ${reqUrl} for grep analysis via response handler`));
3830
+ console.log(formatLogMessage('debug', `${GREP_RESPONSE_TAG} Queuing ${reqUrl} for grep analysis via response handler`));
3302
3831
  }
3303
3832
 
3304
3833
  // Queue for grep processing via response handler
@@ -3386,7 +3915,7 @@ function setupFrameHandling(page, forceDebug) {
3386
3915
  }
3387
3916
  }, cssBlockedSelectors);
3388
3917
  } catch (cssRuntimeErr) {
3389
- console.warn(formatLogMessage('warn', `[css_blocked] Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
3918
+ console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
3390
3919
  }
3391
3920
  }
3392
3921
  }
@@ -3698,8 +4227,8 @@ function setupFrameHandling(page, forceDebug) {
3698
4227
  const proxyErr = proxyErrors.find(e => err.message.includes(e));
3699
4228
  if (proxyErr) {
3700
4229
  const info = getProxyInfo(siteConfig);
3701
- console.error(formatLogMessage('error', `[proxy] ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
3702
- console.error(formatLogMessage('error', `[proxy] Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
4230
+ console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
4231
+ console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
3703
4232
  }
3704
4233
  }
3705
4234
  console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
@@ -3751,7 +4280,7 @@ function setupFrameHandling(page, forceDebug) {
3751
4280
  try {
3752
4281
  if (ghostConfig) {
3753
4282
  // Ghost-cursor mode: Bezier-based mouse movements
3754
- if (forceDebug) console.log(formatLogMessage('debug', `[ghost-cursor] Using ghost-cursor for ${currentUrl}`));
4283
+ if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
3755
4284
  const cursor = createGhostCursor(page, { forceDebug });
3756
4285
  if (cursor) {
3757
4286
  await Promise.race([
@@ -3789,8 +4318,7 @@ function setupFrameHandling(page, forceDebug) {
3789
4318
  await performPageInteraction(page, currentUrl, {
3790
4319
  ...interactionConfig,
3791
4320
  mouseMovements: 0,
3792
- includeElementClicks: false,
3793
- includeTyping: false
4321
+ includeElementClicks: false
3794
4322
  }, forceDebug);
3795
4323
  }
3796
4324
  })(),
@@ -3811,7 +4339,7 @@ function setupFrameHandling(page, forceDebug) {
3811
4339
  ]);
3812
4340
  }
3813
4341
  } catch (interactTimeoutErr) {
3814
- if (forceDebug) console.log(formatLogMessage('debug', `[interaction] Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
4342
+ if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
3815
4343
  }
3816
4344
  })();
3817
4345
 
@@ -3946,7 +4474,7 @@ function setupFrameHandling(page, forceDebug) {
3946
4474
  const clearResult = await clearSiteData(page, currentUrl, forceDebug, true); // Quick mode for reloads
3947
4475
  if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i} for ${currentUrl}`));
3948
4476
  } catch (reloadClearErr) {
3949
- if (forceDebug) console.log(formatLogMessage('debug', `[clear_sitedata] Before reload failed for ${currentUrl}`));
4477
+ if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
3950
4478
  }
3951
4479
  }
3952
4480
 
@@ -4140,8 +4668,8 @@ function setupFrameHandling(page, forceDebug) {
4140
4668
  const proxyErr = proxyErrors.find(e => err.message.includes(e));
4141
4669
  if (proxyErr) {
4142
4670
  const info = getProxyInfo(siteConfig);
4143
- console.error(formatLogMessage('error', `[proxy] ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
4144
- console.error(formatLogMessage('error', `[proxy] Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
4671
+ console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
4672
+ console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
4145
4673
  }
4146
4674
  }
4147
4675
 
@@ -4208,17 +4736,33 @@ function setupFrameHandling(page, forceDebug) {
4208
4736
  };
4209
4737
  } finally {
4210
4738
  // Guaranteed resource cleanup - this runs regardless of success or failure
4211
-
4739
+
4740
+ // Flip the popup-capture race-window guard first so any in-flight
4741
+ // 'targetcreated' handler that resolves after this point sees the
4742
+ // flag and bails (closing its own popup if it managed to fetch one).
4743
+ urlFinished = true;
4744
+
4745
+ // Popup capture teardown (opt-in via siteConfig.capture_popups). Each
4746
+ // entry is either the browser.off('targetcreated', ...) deregistration
4747
+ // or a per-popup (clearTimeout + popupPage.close) cleanup. Iterate even
4748
+ // if one fails so the rest still run.
4749
+ if (popupCleanups.length) {
4750
+ for (const cleanup of popupCleanups) {
4751
+ try { cleanup(); } catch (_) {}
4752
+ }
4753
+ popupCleanups.length = 0;
4754
+ }
4755
+
4212
4756
  // Disconnect VPN for this site
4213
4757
  if (siteConfig.vpn) {
4214
4758
  const vpnDown = wgDisconnect(siteConfig, forceDebug);
4215
4759
  if (vpnDown.tornDown && forceDebug) {
4216
- console.log(formatLogMessage('debug', `[vpn] WireGuard interface torn down for ${currentUrl}`));
4760
+ console.log(formatLogMessage('debug', `${VPN_TAG} WireGuard interface torn down for ${currentUrl}`));
4217
4761
  }
4218
4762
  } else if (siteConfig.openvpn) {
4219
4763
  const ovpnDown = ovpnDisconnect(siteConfig, forceDebug);
4220
4764
  if (ovpnDown.tornDown && forceDebug) {
4221
- console.log(formatLogMessage('debug', `[vpn] OpenVPN connection torn down for ${currentUrl}`));
4765
+ console.log(formatLogMessage('debug', `${VPN_TAG} OpenVPN connection torn down for ${currentUrl}`));
4222
4766
  }
4223
4767
  }
4224
4768
 
@@ -4300,6 +4844,19 @@ function setupFrameHandling(page, forceDebug) {
4300
4844
  // Sort tasks so proxy groups are contiguous — direct connections first, then each proxy
4301
4845
  allTasks.sort((a, b) => proxyKeyFor(a.config).localeCompare(proxyKeyFor(b.config)));
4302
4846
 
4847
+ // Pre-start local no-auth SOCKS5 relays for any authenticated socks5://
4848
+ // upstreams. Done once here (the only async step) so getProxyArgs stays a
4849
+ // sync lookup in the per-batch browser-launch path. Chromium can't auth
4850
+ // SOCKS5; the relay does the upstream auth transparently.
4851
+ try {
4852
+ const relayCount = await prepareSocksRelays(sites, forceDebug);
4853
+ if (relayCount > 0 && !silentMode) {
4854
+ console.log(messageColors.processing(`Started ${relayCount} SOCKS5 auth relay(s)`));
4855
+ }
4856
+ } catch (relayErr) {
4857
+ console.warn(formatLogMessage('proxy', `SOCKS5 relay setup failed: ${relayErr.message}`));
4858
+ }
4859
+
4303
4860
  let results = [];
4304
4861
  let processedUrlCount = 0;
4305
4862
  let urlsSinceLastCleanup = 0;
@@ -4320,7 +4877,13 @@ function setupFrameHandling(page, forceDebug) {
4320
4877
  let lastProcessedCount = 0;
4321
4878
  let hangCheckCount = 0;
4322
4879
  let forceRestartFlag = false; // Flag to trigger restart on next iteration
4323
-
4880
+
4881
+ // Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
4882
+ // only colors the [severity] tag; the '[HANG CHECK]' substring was
4883
+ // sitting plain inside the message string. Colored once at function
4884
+ // entry so the interval callback doesn't re-colorize per tick.
4885
+ const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
4886
+
4324
4887
  const hangDetectionInterval = setInterval(() => {
4325
4888
  // Progress check, counter, and forceRestartFlag MUST run regardless of
4326
4889
  // debug mode — previously the entire body was gated on forceDebug, which
@@ -4331,10 +4894,10 @@ function setupFrameHandling(page, forceDebug) {
4331
4894
  if (processedUrlCount === lastProcessedCount) {
4332
4895
  hangCheckCount++;
4333
4896
  if (forceDebug) {
4334
- console.log(formatLogMessage('warn', `[HANG CHECK] No progress for ${hangCheckCount * 30}s`));
4897
+ console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
4335
4898
  }
4336
4899
  if (hangCheckCount >= 5) {
4337
- console.log(formatLogMessage('error', `[HANG CHECK] Hung for 2.5 minutes. Triggering emergency browser restart.`));
4900
+ console.log(formatLogMessage('error', `${HANG_CHECK_TAG} Hung for 2.5 minutes. Triggering emergency browser restart.`));
4338
4901
  forceRestartFlag = true; // Set flag instead of exiting
4339
4902
  hangCheckCount = 0; // Reset counter for next cycle
4340
4903
  }
@@ -4347,8 +4910,8 @@ function setupFrameHandling(page, forceDebug) {
4347
4910
  if (forceDebug) {
4348
4911
  const currentBatch = Math.floor(currentBatchInfo.batchStart / RESOURCE_CLEANUP_INTERVAL) + 1;
4349
4912
  const totalBatches = Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL);
4350
- console.log(formatLogMessage('debug', `[HANG CHECK] Processed: ${processedUrlCount}/${totalUrls} URLs, Batch: ${currentBatch}/${totalBatches}, Current batch size: ${currentBatchInfo.batchSize}`));
4351
- console.log(formatLogMessage('debug', `[HANG CHECK] URLs since cleanup: ${urlsSinceLastCleanup}, Recent failures: ${results.slice(-3).filter(r => !r.success).length}/3`));
4913
+ console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} Processed: ${processedUrlCount}/${totalUrls} URLs, Batch: ${currentBatch}/${totalBatches}, Current batch size: ${currentBatchInfo.batchSize}`));
4914
+ console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} URLs since cleanup: ${urlsSinceLastCleanup}, Recent failures: ${results.slice(-3).filter(r => !r.success).length}/3`));
4352
4915
  }
4353
4916
  }, 30000);
4354
4917
  // Don't keep the event loop alive solely for the hang-check interval — the
@@ -4359,29 +4922,46 @@ function setupFrameHandling(page, forceDebug) {
4359
4922
  // Process URLs in batches with exception handling
4360
4923
  let siteGroupIndex = 0;
4361
4924
  let currentProxyKey = ''; // Track active proxy config — '' means direct connection
4925
+ // Map of site-config object -> index in sites[], built once. Per-batch
4926
+ // grouping below uses this for O(1) lookup instead of sites.indexOf which
4927
+ // walked the array per task (batch=80 * sites=20 was ~1600 cmps per batch).
4928
+ const configToIndex = new Map();
4929
+ for (let i = 0; i < sites.length; i++) configToIndex.set(sites[i], i);
4362
4930
  try {
4363
4931
  for (let batchStart = 0; batchStart < totalUrls; batchStart += RESOURCE_CLEANUP_INTERVAL) {
4364
4932
  const batchEnd = Math.min(batchStart + RESOURCE_CLEANUP_INTERVAL, totalUrls);
4365
4933
  const currentBatch = allTasks.slice(batchStart, batchEnd);
4366
4934
 
4367
-
4368
- // Group tasks by their source site configuration for window cleanup
4935
+
4936
+ // Group tasks by their source site configuration for window cleanup.
4937
+ // Single get-or-set replaces has + get + set (one Map lookup not two).
4938
+ // The `?? -1` preserves the old `sites.indexOf` semantics for a task
4939
+ // whose config isn't in sites[] — that case shouldn't happen, but if
4940
+ // it ever does the routing stays identical to the prior code's
4941
+ // 'site_-1' bucket rather than silently shifting to 'site_undefined'.
4369
4942
  const tasksBySite = new Map();
4370
- currentBatch.forEach(task => {
4371
- const siteKey = `site_${sites.indexOf(task.config)}`;
4372
- if (!tasksBySite.has(siteKey)) {
4373
- tasksBySite.set(siteKey, []);
4374
- }
4375
- tasksBySite.get(siteKey).push(task);
4376
- });
4943
+ for (let i = 0; i < currentBatch.length; i++) {
4944
+ const task = currentBatch[i];
4945
+ const siteKey = `site_${configToIndex.get(task.config) ?? -1}`;
4946
+ let arr = tasksBySite.get(siteKey);
4947
+ if (!arr) tasksBySite.set(siteKey, arr = []);
4948
+ arr.push(task);
4949
+ }
4377
4950
 
4378
4951
  // IMPROVED: Only check health if we have indicators of problems
4379
4952
  let healthCheck = { shouldRestart: false, reason: null };
4380
4953
  const recentResults = results.slice(-8); // Check more results for better pattern detection
4381
- const recentFailureRate = recentResults.length > 0 ?
4382
- recentResults.filter(r => !r.success).length / recentResults.length : 0;
4954
+ // Single-pass count for both failure rate and critical-error tally —
4955
+ // was two .filter(...).length calls allocating two intermediate arrays.
4956
+ let recentFailures = 0, recentCritical = 0;
4957
+ for (let i = 0; i < recentResults.length; i++) {
4958
+ const r = recentResults[i];
4959
+ if (!r.success) recentFailures++;
4960
+ if (r.needsImmediateRestart) recentCritical++;
4961
+ }
4962
+ const recentFailureRate = recentResults.length > 0 ? recentFailures / recentResults.length : 0;
4383
4963
  const hasHighFailureRate = recentFailureRate > 0.75; // 75% failure threshold (more conservative)
4384
- const hasCriticalErrors = recentResults.filter(r => r.needsImmediateRestart).length > 2;
4964
+ const hasCriticalErrors = recentCritical > 2;
4385
4965
 
4386
4966
  // Only run health checks when we have STRONG indicators of problems
4387
4967
  if (urlsSinceLastCleanup > 15 && (
@@ -4390,15 +4970,21 @@ function setupFrameHandling(page, forceDebug) {
4390
4970
  urlsSinceLastCleanup > RESOURCE_CLEANUP_INTERVAL * 0.9 // Very close to cleanup limit
4391
4971
  )) {
4392
4972
  try {
4973
+ // Race the health check against a 30s timeout. Attach .catch on the
4974
+ // health promise itself so that if the timeout wins, the still-running
4975
+ // monitorBrowserHealth's eventual rejection doesn't surface as an
4976
+ // unhandledRejection warning.
4977
+ const healthPromise = monitorBrowserHealth(browser, {}, {
4978
+ siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
4979
+ totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
4980
+ urlsSinceCleanup: urlsSinceLastCleanup,
4981
+ cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
4982
+ forceDebug,
4983
+ silentMode
4984
+ });
4985
+ healthPromise.catch(() => {});
4393
4986
  healthCheck = await Promise.race([
4394
- monitorBrowserHealth(browser, {}, {
4395
- siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
4396
- totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
4397
- urlsSinceCleanup: urlsSinceLastCleanup,
4398
- cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
4399
- forceDebug,
4400
- silentMode
4401
- }),
4987
+ healthPromise,
4402
4988
  new Promise((_, reject) => setTimeout(() => reject(new Error('Health check timeout')), 30000))
4403
4989
  ]);
4404
4990
  } catch (healthError) {
@@ -4427,8 +5013,17 @@ function setupFrameHandling(page, forceDebug) {
4427
5013
  // timeout) bypasses the urlsSinceLastCleanup > 8 gate — a confirmed hang
4428
5014
  // needs immediate restart even if we just cleaned up. Proactive triggers
4429
5015
  // keep the gate to prevent thrashing.
5016
+ //
5017
+ // hasHighFailureRate is computed (and still used for the health-check
5018
+ // gate above) but intentionally NOT folded into proactiveRestart:
5019
+ // wouldExceedLimit is always true at every batch boundary with the
5020
+ // default RESOURCE_CLEANUP_INTERVAL == batch size, so the high-failure-
5021
+ // rate branch was dead code reached only at the same boundary that
5022
+ // wouldExceedLimit already triggers. If failure-rate ever needs to
5023
+ // interrupt mid-cleanup-interval, that requires interrupting the
5024
+ // running Promise.all — a real behavior change, not an OR addition.
4430
5025
  const hangRecoveryRestart = forceRestartFlag;
4431
- const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth || (hasHighFailureRate && recentResults.length >= 6)) && urlsSinceLastCleanup > 8;
5026
+ const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth) && urlsSinceLastCleanup > 8;
4432
5027
  if ((hangRecoveryRestart || proactiveRestart) && isNotLastBatch) {
4433
5028
  let restartReason = 'Unknown';
4434
5029
  if (forceRestartFlag) {
@@ -4436,8 +5031,6 @@ function setupFrameHandling(page, forceDebug) {
4436
5031
  forceRestartFlag = false; // Reset the flag
4437
5032
  } else if (shouldRestartFromHealth) {
4438
5033
  restartReason = healthCheck.reason;
4439
- } else if (hasHighFailureRate) {
4440
- restartReason = `High failure rate: ${Math.round(recentFailureRate * 100)}% in recent batch`;
4441
5034
  } else if (wouldExceedLimit) {
4442
5035
  restartReason = `Processed ${urlsSinceLastCleanup} URLs (scheduled maintenance)`;
4443
5036
  }
@@ -4452,7 +5045,7 @@ function setupFrameHandling(page, forceDebug) {
4452
5045
  if (requestCacheStats.enabled && requestCacheStats.size > 0) {
4453
5046
  const clearedCount = smartCache.clearRequestCache();
4454
5047
  if (forceDebug) {
4455
- console.log(formatLogMessage('debug', `[SmartCache] Cleared ${clearedCount} request cache entries during browser restart`));
5048
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during browser restart`));
4456
5049
  }
4457
5050
  }
4458
5051
  }
@@ -4467,24 +5060,21 @@ function setupFrameHandling(page, forceDebug) {
4467
5060
  });
4468
5061
 
4469
5062
  // Clean up the specific user data directory
4470
- if (userDataDir && fs.existsSync(userDataDir)) {
4471
- fs.rmSync(userDataDir, { recursive: true, force: true });
4472
- if (forceDebug) console.log(formatLogMessage('debug', `Cleaned user data dir: ${userDataDir}`));
4473
- }
5063
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4474
5064
 
4475
5065
  // Additional cleanup for any remaining Chrome processes
4476
5066
  if (removeTempFiles) {
4477
- await cleanupChromeTempFiles({
4478
- includeSnapTemp: true,
5067
+ await cleanupChromeTempFiles({
5068
+ includeSnapTemp: true,
4479
5069
  forceDebug,
4480
- comprehensive: true
5070
+ comprehensive: true
4481
5071
  });
4482
5072
  }
4483
5073
 
4484
5074
  } catch (browserCloseErr) {
4485
5075
  if (forceDebug) console.log(formatLogMessage('debug', `Browser cleanup warning: ${browserCloseErr.message}`));
4486
5076
  }
4487
-
5077
+
4488
5078
  // Create new browser for next batch (preserve current proxy config)
4489
5079
  const restartProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
4490
5080
  browser = await createBrowser(restartProxyArgs);
@@ -4492,7 +5082,6 @@ function setupFrameHandling(page, forceDebug) {
4492
5082
 
4493
5083
  // Reset cleanup counter and add delay
4494
5084
  urlsSinceLastCleanup = 0;
4495
- purgeStaleTrackers();
4496
5085
  await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
4497
5086
  }
4498
5087
 
@@ -4512,9 +5101,7 @@ function setupFrameHandling(page, forceDebug) {
4512
5101
  forceDebug, timeout: 10000, exitOnFailure: false,
4513
5102
  cleanTempFiles: true, comprehensiveCleanup: removeTempFiles
4514
5103
  });
4515
- if (userDataDir && fs.existsSync(userDataDir)) {
4516
- fs.rmSync(userDataDir, { recursive: true, force: true });
4517
- }
5104
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4518
5105
  } catch (proxyRestartErr) {
4519
5106
  if (forceDebug) console.log(formatLogMessage('debug', `Proxy switch browser cleanup: ${proxyRestartErr.message}`));
4520
5107
  }
@@ -4526,8 +5113,8 @@ function setupFrameHandling(page, forceDebug) {
4526
5113
  const health = await testProxy(currentBatch[0].config, 5000);
4527
5114
  if (!health.reachable) {
4528
5115
  const info = getProxyInfo(currentBatch[0].config);
4529
- console.error(formatLogMessage('error', `[proxy] Unreachable: ${info} — ${health.error}`));
4530
- console.error(formatLogMessage('error', `[proxy] Skipping ${currentBatch.length} URL(s) in this batch`));
5116
+ console.error(formatLogMessage('error', `${PROXY_TAG} Unreachable: ${info} — ${health.error}`));
5117
+ console.error(formatLogMessage('error', `${PROXY_TAG} Skipping ${currentBatch.length} URL(s) in this batch`));
4531
5118
  const skipResults = currentBatch.map(task => ({
4532
5119
  success: false, url: task.url, rules: [],
4533
5120
  error: `Proxy unreachable: ${health.error}`
@@ -4545,7 +5132,6 @@ function setupFrameHandling(page, forceDebug) {
4545
5132
  browser = await createBrowser(proxyArgs);
4546
5133
  currentProxyKey = batchProxyKey;
4547
5134
  urlsSinceLastCleanup = 0;
4548
- purgeStaleTrackers();
4549
5135
  await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
4550
5136
  }
4551
5137
 
@@ -4555,7 +5141,7 @@ function setupFrameHandling(page, forceDebug) {
4555
5141
 
4556
5142
  // Log start of concurrent processing for hang detection
4557
5143
  if (forceDebug) {
4558
- console.log(formatLogMessage('debug', `[CONCURRENCY] Starting ${batchSize} concurrent tasks with limit ${MAX_CONCURRENT_SITES}`));
5144
+ console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Starting ${batchSize} concurrent tasks with limit ${MAX_CONCURRENT_SITES}`));
4559
5145
  }
4560
5146
 
4561
5147
  // Create tasks with timeout protection — skip domains that repeatedly timed out.
@@ -4567,7 +5153,7 @@ function setupFrameHandling(page, forceDebug) {
4567
5153
  try {
4568
5154
  // Short-circuit queued URLs once any URL in this batch has triggered a
4569
5155
  // restart. Without this, the 80-URL batch in the user's hang trace
4570
- // would have to fail one-by-one at 120s each (~28 min total) before
5156
+ // would have to fail one-by-one at 75s each (~25 min total) before
4571
5157
  // the boundary restart could fire. Now: first hang fires the flag,
4572
5158
  // remaining queued URLs return immediately, batch completes, restart.
4573
5159
  if (forceRestartFlag) {
@@ -4580,25 +5166,111 @@ function setupFrameHandling(page, forceDebug) {
4580
5166
  if (!silentMode) console.log(formatLogMessage('info', `Skipping ${task.url} — ${taskDomain} timed out ${DOMAIN_TIMEOUT_THRESHOLD} times`));
4581
5167
  return { url: task.url, rules: [], success: false, error: 'Domain repeatedly timed out', skipped: true };
4582
5168
  }
5169
+
5170
+ // DNS pre-check — fails fast on NXDOMAIN/unresolvable hosts before
5171
+ // we pay ~5-15s for Puppeteer navigation + Cloudflare detection.
5172
+ // Skips IP literals. Respects an in-memory negative cache so a dead
5173
+ // host hit by many URL paths only costs one DNS round-trip per TTL.
5174
+ //
5175
+ // Uses dns.resolve* (c-ares, async network I/O) NOT dns.lookup
5176
+ // (getaddrinfo, libuv threadpool). Under scan concurrency Puppeteer
5177
+ // saturates the default 4-slot threadpool with filesystem I/O, so
5178
+ // dns.lookup calls sit queued and blow the timeout while never
5179
+ // actually starting — wrongly skipping live domains. c-ares isn't
5180
+ // threadpool-bound so it's immune to that contention.
5181
+ if (dnsPrecheckEnabled && taskDomain && !/^[\d.:]+$|^\[/.test(taskDomain)) {
5182
+ const cached = dnsNegativeCache.get(taskDomain);
5183
+ if (cached && Date.now() - cached.timestamp < DNS_NEGATIVE_CACHE_TTL_MS) {
5184
+ dnsPrecheckSkips++;
5185
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check (cached): ${taskDomain} — ${cached.error}`));
5186
+ return { url: task.url, rules: [], success: false, error: `DNS: ${cached.error}`, skipped: true };
5187
+ }
5188
+ // Positive-resolution shortcut: dig or whois has already proven this
5189
+ // hostname live within their 20h cache TTL (populated either by an
5190
+ // earlier URL this run or by --dns-cache disk-load from a prior run).
5191
+ // Order matters -- negative cache (5min TTL, fresher data) wins
5192
+ // first, then this 20h-TTL positive index, then the actual resolve.
5193
+ if (domainKnownToResolve(taskDomain)) {
5194
+ dnsPositiveSkips++;
5195
+ dnsPositiveSkippedHosts.add(taskDomain);
5196
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
5197
+ // Fall through to navigation -- pre-check "passed" by proxy.
5198
+ } else {
5199
+ const dnsResolve = async () => {
5200
+ // resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
5201
+ // resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
5202
+ // error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
5203
+ // unchanged so the outer transient-retry path sees the real
5204
+ // resolver code and the negative cache records the right reason.
5205
+ // Previously a bare .catch swallowed everything and tried
5206
+ // resolve6, which masked transient v4-side errors behind
5207
+ // whatever resolve6 ended up reporting.
5208
+ // 2s timeout kept as a real safety net — with c-ares off the
5209
+ // threadpool it should now rarely fire.
5210
+ let timer;
5211
+ try {
5212
+ const timeoutP = new Promise((_, reject) => {
5213
+ timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
5214
+ });
5215
+ const resolveChain = dnsPromises.resolve4(taskDomain)
5216
+ .catch(err => {
5217
+ if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
5218
+ return dnsPromises.resolve6(taskDomain);
5219
+ }
5220
+ throw err;
5221
+ });
5222
+ await Promise.race([resolveChain, timeoutP]);
5223
+ } finally {
5224
+ if (timer) clearTimeout(timer);
5225
+ }
5226
+ };
5227
+ // c-ares transient codes — retry once so a momentary resolver
5228
+ // hiccup doesn't poison the negative cache for 5 minutes.
5229
+ // DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
5230
+ try {
5231
+ try {
5232
+ await dnsResolve();
5233
+ } catch (firstErr) {
5234
+ const code = firstErr && firstErr.code;
5235
+ if (DNS_TRANSIENT_ERRORS.has(code) || (firstErr && firstErr.message === 'DNS timeout')) {
5236
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
5237
+ await dnsResolve();
5238
+ } else {
5239
+ throw firstErr;
5240
+ }
5241
+ }
5242
+ } catch (dnsErr) {
5243
+ const errCode = dnsErr.code || dnsErr.message || 'DNS resolve failed';
5244
+ dnsNegativeCacheSet(taskDomain, errCode);
5245
+ dnsPrecheckSkips++;
5246
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
5247
+ return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
5248
+ }
5249
+ } // close `else` from domainKnownToResolve shortcut above
5250
+ }
4583
5251
  } catch {}
4584
5252
 
4585
5253
  // Per-URL timeout so a single hung processUrl can't block the batch
4586
- // forever. 120s is well past any legitimate slow page: Cloudflare
4587
- // adaptive max ~25s, nettools overall ~65s, navigation 15s.
5254
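+ // forever. 75s targets the realistic legit-page ceiling (nav 35s +
5255
+ // Cloudflare adaptive ~25s + interaction ~10s + network-idle wait ~10s).
5256
+ // NOTE(review): those components sum to ~80s, not ~70s, so a page hitting every ceiling at once would exceed 75s — confirm the budget. Still well short of the old 120s safety net. Cuts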
5257
+ // hang-recovery time roughly in half when an entire batch's URLs all
5258
+ // hang and we're waiting on this timeout to advance processedUrlCount.
5259
+ const PER_URL_TIMEOUT_MS = 75000;
4588
5260
  const processUrlPromise = processUrl(task.url, task.config, browser);
4589
5261
  let perUrlTimer;
4590
5262
  try {
4591
5263
  return await Promise.race([
4592
5264
  processUrlPromise,
4593
5265
  new Promise((_, reject) => {
4594
- perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (120s)')), 120000);
5266
+ perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (75s)')), PER_URL_TIMEOUT_MS);
4595
5267
  })
4596
5268
  ]);
4597
5269
  } catch (err) {
4598
- if (err && err.message === 'Per-URL timeout (120s)') {
5270
+ if (err && err.message === 'Per-URL timeout (75s)') {
4599
5271
  processUrlPromise.catch(() => {});
4600
5272
  forceRestartFlag = true;
4601
- return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (120s)', needsImmediateRestart: true };
5273
+ return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (75s)', needsImmediateRestart: true };
4602
5274
  }
4603
5275
  throw err;
4604
5276
  } finally {
@@ -4614,21 +5286,29 @@ function setupFrameHandling(page, forceDebug) {
4614
5286
 
4615
5287
  let batchResults;
4616
5288
  try {
5289
+ // Same orphan-promise pattern as the health-check race above: if the
5290
+ // 10-min batch timeout wins, the still-running Promise.all keeps going
5291
+ // until every batchTask settles. Each individual task is already wrapped
5292
+ // in p-limit's error handling so unhandled rejections should not surface,
5293
+ // but the .catch is free belt-and-braces against future refactors that
5294
+ // change task internals.
5295
+ const batchPromise = Promise.all(batchTasks);
5296
+ batchPromise.catch(() => {});
4617
5297
  batchResults = await Promise.race([
4618
- Promise.all(batchTasks),
4619
- new Promise((_, reject) =>
5298
+ batchPromise,
5299
+ new Promise((_, reject) =>
4620
5300
  setTimeout(() => reject(new Error('Batch timeout')), 600000) // 10 min timeout
4621
5301
  )
4622
5302
  ]);
4623
5303
  } catch (timeoutError) {
4624
5304
  if (timeoutError.message.includes('timeout')) {
4625
- console.log(formatLogMessage('error', `[TIMEOUT] Batch hung. Restarting browser.`));
5305
+ console.log(formatLogMessage('error', `${TIMEOUT_TAG} Batch hung. Restarting browser.`));
4626
5306
  try {
4627
5307
  await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false });
5308
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4628
5309
  const timeoutProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
4629
5310
  browser = await createBrowser(timeoutProxyArgs);
4630
5311
  urlsSinceLastCleanup = 0;
4631
- purgeStaleTrackers();
4632
5312
  } catch (restartErr) {
4633
5313
  throw restartErr;
4634
5314
  }
@@ -4665,7 +5345,7 @@ function setupFrameHandling(page, forceDebug) {
4665
5345
 
4666
5346
  // Log completion of concurrent processing
4667
5347
  if (forceDebug) {
4668
- console.log(formatLogMessage('debug', `[CONCURRENCY] Completed ${batchSize} concurrent tasks, ${batchResults.filter(r => r.success).length} successful`));
5348
+ console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Completed ${batchSize} concurrent tasks, ${batchResults.filter(r => r.success).length} successful`));
4669
5349
  }
4670
5350
 
4671
5351
  // Enhanced error reporting for Puppeteer 23.x
@@ -4727,7 +5407,7 @@ function setupFrameHandling(page, forceDebug) {
4727
5407
  if (requestCacheStats.enabled && requestCacheStats.size > 0) {
4728
5408
  const clearedCount = smartCache.clearRequestCache();
4729
5409
  if (forceDebug) {
4730
- console.log(formatLogMessage('debug', `[SmartCache] Cleared ${clearedCount} request cache entries during emergency restart`));
5410
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during emergency restart`));
4731
5411
  }
4732
5412
  }
4733
5413
  }
@@ -4748,17 +5428,23 @@ function setupFrameHandling(page, forceDebug) {
4748
5428
  }
4749
5429
 
4750
5430
  await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true, comprehensiveCleanup: removeTempFiles });
5431
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4751
5432
  // Additional cleanup after emergency restart
4752
5433
  if (removeTempFiles) {
4753
- await cleanupChromeTempFiles({
4754
- includeSnapTemp: true,
5434
+ await cleanupChromeTempFiles({
5435
+ includeSnapTemp: true,
4755
5436
  forceDebug,
4756
- comprehensive: true
5437
+ comprehensive: true
4757
5438
  });
4758
5439
  }
4759
5440
  browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
4760
5441
  urlsSinceLastCleanup = 0; // Reset counter
4761
- purgeStaleTrackers();
5442
+ // Reset the hang-detection flag too: this restart path is triggered
5443
+ // by needsImmediateRestart errors, which the per-URL 75s timeout
5444
+ // sets in lockstep with forceRestartFlag. Without this reset, the
5445
+ // hang-fallback restart below would fire a SECOND back-to-back
5446
+ // browser restart on the same batch boundary.
5447
+ forceRestartFlag = false;
4762
5448
  await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY); // Give browser time to stabilize
4763
5449
  } catch (emergencyRestartErr) {
4764
5450
  if (forceDebug) console.log(formatLogMessage('debug', `Emergency restart failed: ${emergencyRestartErr.message}`));
@@ -4769,9 +5455,9 @@ function setupFrameHandling(page, forceDebug) {
4769
5455
  console.log(`\n${messageColors.fileOp('🔄 Emergency hang detection restart:')} Browser appears hung, forcing restart`);
4770
5456
  try {
4771
5457
  await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true });
5458
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4772
5459
  browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
4773
5460
  urlsSinceLastCleanup = 0;
4774
- purgeStaleTrackers();
4775
5461
  forceRestartFlag = false; // Reset flag
4776
5462
  await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY);
4777
5463
  if (forceDebug) console.log(formatLogMessage('debug', `Emergency hang detection restart completed`));
@@ -4820,11 +5506,11 @@ function setupFrameHandling(page, forceDebug) {
4820
5506
  if (requestCacheStats.enabled && requestCacheStats.size > 0) {
4821
5507
  const clearedCount = smartCache.clearRequestCache();
4822
5508
  if (!silentMode && clearedCount > 0) {
4823
- console.log(`\n🗑️ Cleared request cache: ${clearedCount} entries after JSON processing`);
5509
+ console.log(`\n${messageColors.cleanup(`🗑️ Cleared request cache: ${clearedCount} entries after JSON processing`)}`);
4824
5510
  }
4825
5511
  if (forceDebug) {
4826
5512
  console.log(formatLogMessage('debug',
4827
- `[SmartCache] Request cache cleared after JSON scan completion (hit rate: ${requestCacheStats.hitRate})`
5513
+ `${SMART_CACHE_TAG} Request cache cleared after JSON scan completion (hit rate: ${requestCacheStats.hitRate})`
4828
5514
  ));
4829
5515
  }
4830
5516
  }
@@ -4896,6 +5582,43 @@ function setupFrameHandling(page, forceDebug) {
4896
5582
  if (cloudflareScanStats.errorPages > 0) {
4897
5583
  console.log(formatLogMessage('debug', `Cloudflare 5xx origin-error pages: ${cloudflareScanStats.errorPages} (no bypass possible — origin unreachable)`));
4898
5584
  }
5585
+ if (dnsPrecheckEnabled && (dnsPrecheckSkips > 0 || dnsPositiveSkips > 0)) {
5586
+ // Two skip mechanisms, each with its own counter + unique-host count:
5587
+ // - dnsPrecheckSkips: URLs short-circuited via the NXDOMAIN-cache
5588
+ // (dnsNegativeCache). Unique-host count = dnsNegativeCache.size.
5589
+ // - dnsPositiveSkips: URLs short-circuited via dig/whois cache
5590
+ // proof of resolution (knownResolvedHostnames index in nettools).
5591
+ // Unique-host count = dnsPositiveSkippedHosts.size (this Set is
5592
+ // populated only on actual skip events, not on every Set add in
5593
+ // nettools, so it's a true per-scan visibility metric).
5594
+ const parts = [];
5595
+ if (dnsPrecheckSkips > 0) {
5596
+ parts.push(`${dnsPrecheckSkips} URL(s) via ${dnsNegativeCache.size} unresolvable host(s)`);
5597
+ }
5598
+ if (dnsPositiveSkips > 0) {
5599
+ parts.push(`${dnsPositiveSkips} URL(s) via ${dnsPositiveSkippedHosts.size} resolved host(s)`);
5600
+ }
5601
+ console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
5602
+ }
5603
+ // Blocked-pattern hit stats. Surfaces which patterns are actually
5604
+ // doing work this scan and (by absence) which are stale enough to
5605
+ // prune from config. Top 10 by hit count to keep the log scannable
5606
+ // on configs with dozens of patterns; full counts available via
5607
+ // _blockedPatternHits if needed for tooling. Fires only when at
5608
+ // least one pattern matched -- silent on scans with no blocks.
5609
+ if (_blockedPatternHits.size > 0) {
5610
+ let totalBlocks = 0;
5611
+ for (const n of _blockedPatternHits.values()) totalBlocks += n;
5612
+ console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${_blockedPatternHits.size} pattern(s) hit ${totalBlocks} time(s) total`));
5613
+ const sorted = [..._blockedPatternHits.entries()].sort((a, b) => b[1] - a[1]);
5614
+ const top = sorted.slice(0, 10);
5615
+ for (const [pattern, hits] of top) {
5616
+ console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${hits.toString().padStart(6)} × ${pattern}`));
5617
+ }
5618
+ if (sorted.length > top.length) {
5619
+ console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ... and ${sorted.length - top.length} more pattern(s)`));
5620
+ }
5621
+ }
4899
5622
  // Log smart cache statistics (if cache is enabled)
4900
5623
  // Adblock statistics
4901
5624
  if (adblockEnabled) {
@@ -5112,7 +5835,7 @@ function setupFrameHandling(page, forceDebug) {
5112
5835
  try { cleanupCloudflareCache(); } catch (_) {}
5113
5836
  try { wgDisconnectAll(forceDebug); } catch (_) {}
5114
5837
  try { ovpnDisconnectAll(forceDebug); } catch (_) {}
5115
- try { purgeStaleTrackers(); } catch (_) {}
5838
+ try { await closeAllSocksRelays(forceDebug); } catch (_) {}
5116
5839
 
5117
5840
  // Clean process termination
5118
5841
  if (forceDebug) console.log(formatLogMessage('debug', `About to exit process...`));