@fanboynz/network-scanner 2.0.66 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js CHANGED
@@ -28,13 +28,13 @@ const {
28
28
  cleanup: cleanupCloudflareCache
29
29
  } = require('./lib/cloudflare');
30
30
  // FP Bypass
31
- const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy');
31
+ const { handleFlowProxyProtection, getFlowProxyTimeouts, attachFlowProxyHeaderListener } = require('./lib/flowproxy');
32
32
  // ignore_similar rules
33
33
  const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar');
34
34
  // Graceful exit
35
- const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit');
35
+ const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
36
36
  // Whois & Dig
37
- const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats } = require('./lib/nettools');
37
+ const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
38
38
  // File compare
39
39
  const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
40
40
  // CDP functionality
@@ -42,7 +42,29 @@ const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeo
42
42
  // Post-processing cleanup
43
43
  const { processResults } = require('./lib/post-processing');
44
44
  // Colorize various text when used
45
- const { colorize, colors, messageColors, tags, formatLogMessage } = require('./lib/colorize');
45
+ const { messageColors, formatLogMessage } = require('./lib/colorize');
46
+ const TIMEOUT_TAG = messageColors.processing('[TIMEOUT]');
47
+ const INTERACTION_TAG = messageColors.processing('[interaction]');
48
+ const GHOST_CURSOR_TAG = messageColors.processing('[ghost-cursor]');
49
+ const PROXY_TAG = messageColors.processing('[proxy]');
50
+ const GREP_RESPONSE_TAG = messageColors.processing('[grep-response]');
51
+ const IGNORE_DOMAINS_BY_URL_TAG = messageColors.processing('[ignoreDomainsByUrl]');
52
+ const BLOCK_DOMAINS_BY_URL_TAG = messageColors.processing('[blockDomainsByUrl]');
53
+ const IGNORE_SIMILAR_IGNORED_DOMAINS_TAG = messageColors.processing('[ignore_similar_ignored_domains]');
54
+ const IGNORE_SIMILAR_TAG = messageColors.processing('[ignore_similar]');
55
+ const CLEAR_SITEDATA_TAG = messageColors.processing('[clear_sitedata]');
56
+ const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
57
+ const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
58
+ const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
59
+ const VPN_TAG = messageColors.processing('[vpn]');
60
+ // Precomputed colored '[SmartCache]' subsystem prefix — paired with the
61
+ // same constant in lib/smart-cache.js so debug lines from both files
62
+ // produce consistently colored output. formatLogMessage only colors the
63
+ // [severity] tag; this constant colors the subsystem prefix.
64
+ const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
65
+ // Precomputed colored '[CONCURRENCY]' subsystem prefix for batch-throughput
66
+ // log lines (start/completed). Same cyan as the other monitoring tags.
67
+ const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
46
68
  // Enhanced mouse interaction and page simulation
47
69
  const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
48
70
  // Optional ghost-cursor support for advanced Bezier-based mouse movements
@@ -158,7 +180,10 @@ function detectPuppeteerVersion() {
158
180
  // Enhanced redirect handling
159
181
  const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
160
182
  // Ensure web browser is working correctly
161
- const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload, purgeStaleTrackers } = require('./lib/browserhealth');
183
+ // purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
184
+ // and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
185
+ // automatically — manual purging is no longer needed.
186
+ const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
162
187
 
163
188
  // --- Script Configuration & Constants ---
164
189
  const VERSION = '2.0.33'; // Script version
@@ -350,7 +375,12 @@ const dnsPrecheckTimeoutMs = 2000;
350
375
  const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
351
376
  const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
352
377
  const DNS_NEGATIVE_CACHE_MAX = 1000;
353
- let dnsPrecheckSkips = 0;
378
+ let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
379
+ let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
380
+ const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
381
+ // c-ares transient codes — read-only, hoisted out of the per-task DNS
382
+ // pre-check so we don't allocate a fresh Set per URL.
383
+ const DNS_TRANSIENT_ERRORS = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);
354
384
 
355
385
  function dnsNegativeCacheSet(hostname, error) {
356
386
  if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
@@ -693,7 +723,7 @@ General Options:
693
723
 
694
724
  Validation Options:
695
725
  --cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
696
- --dns-cache Persist dig/whois results to disk between runs (3hr/4hr TTL)
726
+ --dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each)
697
727
  --no-dns-precheck Disable per-URL DNS resolution check before page navigation.
698
728
  By default, URLs whose hostname doesn't resolve are skipped
699
729
  immediately (saves ~5-15s of Puppeteer time per dead host).
@@ -707,6 +737,7 @@ Validation Options:
707
737
  Global config.json options:
708
738
  ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
709
739
  ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
740
+ blockDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, ALL subsequent requests on that root domain (and subdomains) are aborted via Puppeteer for the rest of the scan
710
741
  blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
711
742
  whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
712
743
  ignore_similar: true/false Ignore domains similar to already found domains (default: true)
@@ -876,6 +907,7 @@ const {
876
907
  sites = [],
877
908
  ignoreDomains = [],
878
909
  ignoreDomainsByUrl = [],
910
+ blockDomainsByUrl = [],
879
911
  blocked: globalBlocked = [],
880
912
  whois_delay = 3000,
881
913
  whois_server_mode = 'random',
@@ -965,10 +997,11 @@ if (validateConfig) {
965
997
  }
966
998
  }
967
999
 
968
- // Pre-compile global blocked regexes ONCE (used in every processUrl call)
969
- const globalBlockedRegexes = Array.isArray(globalBlocked)
970
- ? globalBlocked.map(pattern => new RegExp(pattern))
971
- : [];
1000
+ // Pre-compile global blocked regexes ONCE (used in every processUrl call).
1001
+ // Was: bare `.map(pattern => new RegExp(pattern))` which hard-threw at
1002
+ // module load on a single bad pattern, killing scan startup. Helper now
1003
+ // warns + skips so the rest of the config can still run.
1004
+ const globalBlockedRegexes = compilePatternList('blocked (global)', globalBlocked);
972
1005
 
973
1006
  // Cache compiled regexes by pattern string — avoids recompiling same patterns across URLs
974
1007
  const _compiledRegexCache = new Map();
@@ -987,6 +1020,44 @@ function getCompiledRegexes(patterns) {
987
1020
  return arr.map(p => getCompiledRegex(p));
988
1021
  }
989
1022
 
/**
 * Compile a list of regex pattern sources, warning on (and skipping) every
 * entry that cannot be used, instead of:
 *   (a) silently dropping it, which makes "why isn't my pattern matching?"
 *       hard to debug, or
 *   (b) throwing, where a single bad pattern would abort the caller.
 *
 * An entry is skipped, with exactly one warn line naming the config key, the
 * offending value and the reason, when:
 *   - it is neither a string nor a RegExp. The RegExp constructor would
 *     otherwise coerce it without complaint: `undefined` becomes a regex
 *     that matches everything, `null` becomes /null/, 42 becomes /42/;
 *   - it is the empty string, which compiles to a regex matching every
 *     input;
 *   - `compile` throws for it (invalid regex syntax).
 *
 * @param {string} configKey - name of the config key, used only as context
 *   in the warn line
 * @param {Array<string|RegExp>} patterns - raw regex sources; any non-array
 *   value yields an empty result
 * @param {(p: string|RegExp) => RegExp} [compile] - compile function
 *   (defaults to `new RegExp(p)`)
 * @returns {RegExp[]} the successfully compiled regexes, in input order
 */
function compilePatternList(configKey, patterns, compile = (p) => new RegExp(p)) {
  if (!Array.isArray(patterns)) return [];

  // Single warn format for every drop reason, so the lines are easy to grep.
  const warnDropped = (reason, pattern, detail) => {
    // JSON.stringify keeps string patterns quoted and escaped;
    // non-strings are shown via String() because JSON.stringify(undefined)
    // is not a string.
    const shown = typeof pattern === 'string' ? JSON.stringify(pattern) : String(pattern);
    console.warn(formatLogMessage('warn', `[config] ${configKey} pattern dropped (${reason}): ${shown} -- ${detail}`));
  };

  const out = [];
  for (const p of patterns) {
    if (typeof p !== 'string' && !(p instanceof RegExp)) {
      warnDropped('not a string', p, `got ${p === null ? 'null' : typeof p}`);
      continue;
    }
    if (p === '') {
      warnDropped('empty pattern', p, 'an empty regex matches every input');
      continue;
    }
    try {
      out.push(compile(p));
    } catch (err) {
      // compile may throw a non-Error value; never print "undefined" here.
      const detail = err && err.message ? err.message : String(err);
      warnDropped('compile error', p, detail);
    }
  }
  return out;
}
1054
+
1055
+ // Per-pattern match counters for the `blocked` regex (site + global,
1056
+ // combined). Keyed by RegExp.source so the same pattern appearing in both
1057
+ // site and global lists rolls up into one row. Reported at scan end so
1058
+ // stale patterns that match zero requests are easy to spot and prune.
1059
+ const _blockedPatternHits = new Map();
1060
+
990
1061
  // Pre-split ignoreDomains into exact Set (O(1) lookup) and wildcard array
991
1062
  const _ignoreDomainsExact = new Set();
992
1063
  const _ignoreDomainsWildcard = [];
@@ -998,15 +1069,23 @@ for (const pattern of ignoreDomains) {
998
1069
  }
999
1070
  }
1000
1071
 
1001
- // Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains
1002
- const _ignoreDomainsByUrlRegexes = Array.isArray(ignoreDomainsByUrl)
1003
- ? ignoreDomainsByUrl.map(p => {
1004
- try { return getCompiledRegex(p); } catch { return null; }
1005
- }).filter(r => r)
1006
- : [];
1072
+ // Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains.
1073
+ // Bad patterns warn (via compilePatternList) instead of silently dropping.
1074
+ const _ignoreDomainsByUrlRegexes = compilePatternList('ignoreDomainsByUrl', ignoreDomainsByUrl, getCompiledRegex);
1007
1075
  // Runtime Set of domains marked ignored by URL pattern matches — shared across all sites in this scan
1008
1076
  const _dynamicallyIgnoredDomains = new Set();
1009
1077
 
1078
+ // blockDomainsByUrl: symmetric to ignoreDomainsByUrl but for active
1079
+ // blocking via Puppeteer's request.abort(). When a request URL matches
1080
+ // one of these regex patterns, the request's root domain is added to
1081
+ // _dynamicallyBlockedDomains; subsequent requests on that domain (and
1082
+ // its subdomains, via parent-walk in matchesDynamicBlock) get aborted
1083
+ // before reaching the network. The triggering request itself is also
1084
+ // aborted -- same "gate fires immediately after trigger" semantic the
1085
+ // ignoreDomainsByUrl path uses for the dynamic Set short-circuit.
1086
+ const _blockDomainsByUrlRegexes = compilePatternList('blockDomainsByUrl', blockDomainsByUrl, getCompiledRegex);
1087
+ const _dynamicallyBlockedDomains = new Set();
1088
+
1010
1089
  // Apply global configuration overrides with validation
1011
1090
  // Priority: Command line args > config.json > defaults
1012
1091
  const MAX_CONCURRENT_SITES = (() => {
@@ -1103,7 +1182,7 @@ function safeMarkDomainProcessed(domain, context, metadata) {
1103
1182
  }
1104
1183
  } catch (cacheErr) {
1105
1184
  if (forceDebug) {
1106
- console.log(formatLogMessage('debug', `[SmartCache] Error marking domain: ${cacheErr.message}`));
1185
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
1107
1186
  }
1108
1187
  }
1109
1188
  }
@@ -1417,16 +1496,58 @@ function shouldBypassCacheForUrl(url, siteConfig) {
1417
1496
  // ability to use wildcards in ignoreDomains
1418
1497
  // Cache compiled wildcard regexes to avoid recompilation on every request
1419
1498
  const _wildcardRegexCache = new Map();
1499
+
/**
 * Generic parent-walk helper: reports whether `domain` itself, or any of its
 * parents (dropping one leading label at a time, down to the last label), is
 * present in `set`.
 *
 * Example: for "sub.ads.example.com" the probes are, in order,
 * "sub.ads.example.com", "ads.example.com", "example.com", "com".
 *
 * This is the single-Set form of the parent walk that matchesIgnoreDomain
 * performs inline against two Sets; matchesDynamicBlock consumes it.
 *
 * A non-string `domain` (e.g. undefined from a failed hostname parse) yields
 * false instead of throwing, so a per-request caller never crashes on a
 * malformed request.
 *
 * @param {Set<string>} set - domains to test membership against
 * @param {string} domain - hostname to look up
 * @returns {boolean} true if the domain or one of its parents is in `set`
 */
function _domainOrParentInSet(set, domain) {
  if (set.size === 0) return false;
  if (typeof domain !== 'string') return false;
  if (set.has(domain)) return true;

  // Walk the dots left to right; everything after each dot is one parent.
  // This probes the same suffixes as split('.') + slice(i).join('.') without
  // building an array and re-joining it for every parent.
  let dot = domain.indexOf('.');
  while (dot !== -1) {
    if (set.has(domain.slice(dot + 1))) return true;
    dot = domain.indexOf('.', dot + 1);
  }
  return false;
}
1516
+
/**
 * Block-side counterpart to the ignore gate.
 *
 * Looks `domain` up in _dynamicallyBlockedDomains, also accepting a hit on
 * any of its parent domains, by delegating to _domainOrParentInSet. Entries
 * land in that Set when an earlier request URL matched a blockDomainsByUrl
 * pattern. Intended to be consulted per request, ahead of the static
 * blocked-regex check, to decide whether the request should be aborted.
 *
 * @param {string} domain - hostname of the request being evaluated
 * @returns {boolean} true if the domain or one of its parents is dynamically blocked
 */
function matchesDynamicBlock(domain) {
  const blockedByEarlierMatch = _domainOrParentInSet(_dynamicallyBlockedDomains, domain);
  return blockedByEarlierMatch;
}
1527
+
1420
1528
  function matchesIgnoreDomain(domain, ignorePatterns) {
1421
- // Dynamically ignored domains (from URL pattern matches via ignoreDomainsByUrl)
1422
- if (_dynamicallyIgnoredDomains.has(domain)) return true;
1423
- // Fast path: exact match or suffix match against Set (O(n) for parts, but no regex)
1424
- if (_ignoreDomainsExact.size > 0) {
1425
- if (_ignoreDomainsExact.has(domain)) return true;
1426
- // Check parent domains: sub.ads.example.com ads.example.com example.com
1529
+ // Both dynamic and static ignore lists are walked parent-by-parent so a
1530
+ // subdomain of an ignored root inherits the ignore. Previously the
1531
+ // dynamic check was exact-only, creating an asymmetry: a static-config
1532
+ // `example.com` ignored cdn.example.com transitively, but a runtime
1533
+ // ignoreDomainsByUrl match for the same root (stored as root via
1534
+ // checkedRootDomain at line ~2993) did NOT cascade -- subdomains slipped
1535
+ // through to dig/whois/regex despite the root being ignored. Now
1536
+ // unified: parts split once, shared between both Set probes.
1537
+ const hasDynamic = _dynamicallyIgnoredDomains.size > 0;
1538
+ const hasExact = _ignoreDomainsExact.size > 0;
1539
+
1540
+ if (hasDynamic || hasExact) {
1541
+ // Exact-domain hit on either set wins early.
1542
+ if (hasDynamic && _dynamicallyIgnoredDomains.has(domain)) return true;
1543
+ if (hasExact && _ignoreDomainsExact.has(domain)) return true;
1544
+
1545
+ // Parent-walk: sub.ads.example.com → ads.example.com → example.com
1427
1546
  const parts = domain.split('.');
1428
1547
  for (let i = 1; i < parts.length; i++) {
1429
- if (_ignoreDomainsExact.has(parts.slice(i).join('.'))) return true;
1548
+ const parent = parts.slice(i).join('.');
1549
+ if (hasDynamic && _dynamicallyIgnoredDomains.has(parent)) return true;
1550
+ if (hasExact && _ignoreDomainsExact.has(parent)) return true;
1430
1551
  }
1431
1552
  }
1432
1553
 
@@ -1868,7 +1989,6 @@ function setupFrameHandling(page, forceDebug) {
1868
1989
  wgDisconnectAll(forceDebug);
1869
1990
  ovpnDisconnectAll(forceDebug);
1870
1991
  cleanupCloudflareCache();
1871
- purgeStaleTrackers();
1872
1992
  try { await closeAllSocksRelays(forceDebug); } catch (_) {}
1873
1993
  }
1874
1994
 
@@ -2020,28 +2140,46 @@ function setupFrameHandling(page, forceDebug) {
2020
2140
  'Browser disconnected'
2021
2141
  ]);
2022
2142
 
2143
+ // Popup-capture cleanup registry — declared outside the try so the
2144
+ // finally block (which is a separate lexical scope from try) can see
2145
+ // it. Populated by the capture_popups setup block if siteConfig
2146
+ // .capture_popups is true; iterated in finally to deregister the
2147
+ // browser 'targetcreated' listener and close any tracked popup pages.
2148
+ const popupCleanups = [];
2149
+ // Race-window guard: 'targetcreated' fires synchronously, but
2150
+ // onTargetCreated does an `await target.page()`. If a popup target
2151
+ // is created right as the per-URL try block winds down, the await
2152
+ // can resolve AFTER finally has already iterated popupCleanups —
2153
+ // leaving the popup unregistered for manual cleanup (it still gets
2154
+ // closed by its own 3s auto-close timer, but in the meantime its
2155
+ // request listener could capture matches into matchedDomains for a
2156
+ // URL that already "finished"). The flag is set in finally and
2157
+ // checked at the start of onTargetCreated to short-circuit late
2158
+ // events cleanly.
2159
+ let urlFinished = false;
2160
+
2023
2161
  try {
2024
2162
 
2025
2163
  // --- Connect VPN if configured for this site ---
2026
2164
  if (siteConfig.vpn) {
2027
2165
  const vpnResult = await wgConnect(siteConfig, forceDebug);
2028
2166
  if (!vpnResult.success) {
2029
- console.warn(formatLogMessage('warn', `[vpn] WireGuard failed for ${currentUrl}: ${vpnResult.error}`));
2167
+ console.warn(formatLogMessage('warn', `${VPN_TAG} WireGuard failed for ${currentUrl}: ${vpnResult.error}`));
2030
2168
  return { url: currentUrl, rules: [], success: false, vpnFailed: true };
2031
2169
  }
2032
2170
  if (!silentMode) {
2033
2171
  const ipInfo = vpnResult.externalIP ? ` (${vpnResult.externalIP})` : '';
2034
- console.log(formatLogMessage('info', `[vpn] WireGuard connected via ${vpnResult.interface}${ipInfo} for ${currentUrl}`));
2172
+ console.log(formatLogMessage('info', `${VPN_TAG} WireGuard connected via ${vpnResult.interface}${ipInfo} for ${currentUrl}`));
2035
2173
  }
2036
2174
  } else if (siteConfig.openvpn) {
2037
2175
  const ovpnResult = await ovpnConnect(siteConfig, forceDebug);
2038
2176
  if (!ovpnResult.success) {
2039
- console.warn(formatLogMessage('warn', `[vpn] OpenVPN failed for ${currentUrl}: ${ovpnResult.error}`));
2177
+ console.warn(formatLogMessage('warn', `${VPN_TAG} OpenVPN failed for ${currentUrl}: ${ovpnResult.error}`));
2040
2178
  return { url: currentUrl, rules: [], success: false, vpnFailed: true };
2041
2179
  }
2042
2180
  if (!silentMode) {
2043
2181
  const ipInfo = ovpnResult.externalIP ? ` (${ovpnResult.externalIP})` : '';
2044
- console.log(formatLogMessage('info', `[vpn] OpenVPN connected via ${ovpnResult.connection}${ipInfo} for ${currentUrl}`));
2182
+ console.log(formatLogMessage('info', `${VPN_TAG} OpenVPN connected via ${ovpnResult.connection}${ipInfo} for ${currentUrl}`));
2045
2183
  }
2046
2184
  }
2047
2185
 
@@ -2075,12 +2213,12 @@ function setupFrameHandling(page, forceDebug) {
2075
2213
  const totalDelay = siteDelay + bufferTime;
2076
2214
 
2077
2215
  if (forceDebug && hasCloudflareConfig) {
2078
- console.log(formatLogMessage('debug', `[realtime_cleanup] Using extended delay for Cloudflare site: ${totalDelay}ms (${siteDelay}ms + ${bufferTime}ms CF buffer)`));
2216
+ console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Using extended delay for Cloudflare site: ${totalDelay}ms (${siteDelay}ms + ${bufferTime}ms CF buffer)`));
2079
2217
  }
2080
2218
 
2081
2219
  const realtimeResult = await performRealtimeWindowCleanup(browserInstance, threshold, forceDebug, totalDelay);
2082
2220
  if (realtimeResult.success && realtimeResult.closedCount > 0 && forceDebug) {
2083
- console.log(formatLogMessage('debug', `[realtime_cleanup] Cleaned ${realtimeResult.closedCount} old pages, ${realtimeResult.remainingPages} remaining`));
2221
+ console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Cleaned ${realtimeResult.closedCount} old pages, ${realtimeResult.remainingPages} remaining`));
2084
2222
  }
2085
2223
  }
2086
2224
 
@@ -2091,7 +2229,7 @@ function setupFrameHandling(page, forceDebug) {
2091
2229
  // Aggressive timeouts prevent hanging in Puppeteer 23.x while maintaining speed
2092
2230
 
2093
2231
  page.on('console', (msg) => {
2094
- if (forceDebug && msg.type() === 'error') console.log(`[debug] Console error: ${msg.text()}`);
2232
+ if (forceDebug && msg.type() === 'error') console.log(formatLogMessage('debug', `Console error: ${msg.text()}`));
2095
2233
  });
2096
2234
 
2097
2235
  // Add page crash handler
@@ -2152,6 +2290,11 @@ function setupFrameHandling(page, forceDebug) {
2152
2290
  const flowproxyTimeouts = getFlowProxyTimeouts(siteConfig);
2153
2291
  page.setDefaultTimeout(Math.min(flowproxyTimeouts.pageTimeout, TIMEOUTS.DEFAULT_NAVIGATION));
2154
2292
  page.setDefaultNavigationTimeout(Math.min(flowproxyTimeouts.navigationTimeout, TIMEOUTS.DEFAULT_PAGE));
2293
+ // Attach the response/header listener BEFORE navigation so the
2294
+ // document response's own headers (Server, Set-Cookie, X-FlowProxy-*,
2295
+ // etc.) are observed. The listener accumulates state in a WeakMap
2296
+ // keyed by page; analyzeFlowProxyProtection reads from it later.
2297
+ attachFlowProxyHeaderListener(page);
2155
2298
  if (forceDebug) {
2156
2299
  console.log(formatLogMessage('debug', `Applied flowProxy timeouts - page: ${flowproxyTimeouts.pageTimeout}ms, nav: ${flowproxyTimeouts.navigationTimeout}ms`));
2157
2300
  }
@@ -2170,9 +2313,9 @@ function setupFrameHandling(page, forceDebug) {
2170
2313
  if (shouldInjectEvalForPage) {
2171
2314
  if (forceDebug) {
2172
2315
  if (globalEvalOnDoc) {
2173
- console.log(formatLogMessage('debug', `[evalOnDoc] Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
2316
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
2174
2317
  } else { // siteConfig.evaluateOnNewDocument must be true
2175
- console.log(formatLogMessage('debug', `[evalOnDoc] Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
2318
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
2176
2319
  }
2177
2320
  }
2178
2321
 
@@ -2193,7 +2336,7 @@ function setupFrameHandling(page, forceDebug) {
2193
2336
  browserResponsive = true;
2194
2337
  } catch (healthErr) {
2195
2338
  if (forceDebug) {
2196
- console.log(formatLogMessage('debug', `[evalOnDoc] Browser health check failed: ${healthErr.message}`));
2339
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser health check failed: ${healthErr.message}`));
2197
2340
  }
2198
2341
  browserResponsive = false;
2199
2342
  }
@@ -2292,7 +2435,7 @@ function setupFrameHandling(page, forceDebug) {
2292
2435
  ]);
2293
2436
  evalOnDocSuccess = true;
2294
2437
  if (forceDebug) {
2295
- console.log(formatLogMessage('debug', `[evalOnDoc] Full injection successful for ${currentUrl}`));
2438
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection successful for ${currentUrl}`));
2296
2439
  }
2297
2440
  } catch (fullInjectionErr) {
2298
2441
  // Enhanced error detection for CDP issues
@@ -2303,12 +2446,12 @@ function setupFrameHandling(page, forceDebug) {
2303
2446
 
2304
2447
  if (forceDebug) {
2305
2448
  const errorType = isCDPError ? 'CDP/Protocol error' : 'timeout/other';
2306
- console.log(formatLogMessage('debug', `[evalOnDoc] Full injection failed (${errorType}): ${fullInjectionErr.message}`));
2449
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection failed (${errorType}): ${fullInjectionErr.message}`));
2307
2450
  }
2308
2451
 
2309
2452
  // Skip fallback for CDP errors - they indicate browser communication issues
2310
2453
  if (isCDPError) {
2311
- console.warn(formatLogMessage('warn', `[evalOnDoc] CDP communication failure - skipping injection for ${currentUrl}`));
2454
+ console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} CDP communication failure - skipping injection for ${currentUrl}`));
2312
2455
  evalOnDocSuccess = false;
2313
2456
  } else {
2314
2457
 
@@ -2355,11 +2498,11 @@ function setupFrameHandling(page, forceDebug) {
2355
2498
  ]);
2356
2499
  evalOnDocSuccess = true;
2357
2500
  if (forceDebug) {
2358
- console.log(formatLogMessage('debug', `[evalOnDoc] Minimal injection successful for ${currentUrl}`));
2501
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection successful for ${currentUrl}`));
2359
2502
  }
2360
2503
  } catch (minimalInjectionErr) {
2361
2504
  if (forceDebug) {
2362
- console.log(formatLogMessage('debug', `[evalOnDoc] Minimal injection also failed: ${minimalInjectionErr.message}`));
2505
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection also failed: ${minimalInjectionErr.message}`));
2363
2506
  }
2364
2507
  evalOnDocSuccess = false;
2365
2508
  }
@@ -2367,14 +2510,14 @@ function setupFrameHandling(page, forceDebug) {
2367
2510
  }
2368
2511
  } else {
2369
2512
  if (forceDebug) {
2370
- console.log(formatLogMessage('debug', `[evalOnDoc] Browser unresponsive, skipping injection for ${currentUrl}`));
2513
+ console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser unresponsive, skipping injection for ${currentUrl}`));
2371
2514
  }
2372
2515
  evalOnDocSuccess = false;
2373
2516
  }
2374
2517
 
2375
2518
  // Final status logging
2376
2519
  if (!evalOnDocSuccess) {
2377
- console.warn(formatLogMessage('warn', `[evalOnDoc] All injection strategies failed for ${currentUrl} - continuing with standard request monitoring only`));
2520
+ console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} All injection strategies failed for ${currentUrl} - continuing with standard request monitoring only`));
2378
2521
  }
2379
2522
  // Allow realtime cleanup to proceed after injection completes
2380
2523
  if (shouldInjectEvalForPage && siteConfig.window_cleanup === "realtime") {
@@ -2403,7 +2546,7 @@ function setupFrameHandling(page, forceDebug) {
2403
2546
  }
2404
2547
  }, { selectors: cssBlockedSelectors });
2405
2548
  } catch (cssErr) {
2406
- console.warn(formatLogMessage('warn', `[css_blocked] Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
2549
+ console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
2407
2550
  }
2408
2551
  }
2409
2552
  // --- END: CSS Element Blocking Setup ---
@@ -2460,7 +2603,7 @@ function setupFrameHandling(page, forceDebug) {
2460
2603
  const clearResult = await clearSiteData(page, currentUrl, forceDebug);
2461
2604
  if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data for ${currentUrl}`));
2462
2605
  } catch (clearErr) {
2463
- if (forceDebug) console.log(formatLogMessage('debug', `[clear_sitedata] Failed for ${currentUrl}: ${clearErr.message}`));
2606
+ if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Failed for ${currentUrl}: ${clearErr.message}`));
2464
2607
  }
2465
2608
  }
2466
2609
 
@@ -2686,19 +2829,41 @@ function setupFrameHandling(page, forceDebug) {
2686
2829
  });
2687
2830
  }
2688
2831
 
2689
- const blockedRegexes = Array.isArray(siteConfig.blocked)
2690
- ? siteConfig.blocked.map(pattern => getCompiledRegex(pattern))
2691
- : [];
2832
+ // Per-site blocked compile -- helper warns on bad patterns instead of
2833
+ // throwing out of processUrl and breaking that site's scan.
2834
+ const blockedRegexes = compilePatternList(`blocked (site: ${siteConfig.url || 'unknown'})`, siteConfig.blocked, getCompiledRegex);
2835
+
2836
+ // Per-site escape hatch: disable_adblock turns off the two layers of
2837
+ // "global" ad-blocking for this URL — the adblock-rs filter-list engine
2838
+ // and the globalBlockedRegexes pattern list. Per-site siteConfig.blocked
2839
+ // is preserved (it's an explicit per-site choice, not "global" blocking).
2840
+ //
2841
+ // The use case: capture_popups + popunder/redirect chains. The global
2842
+ // adblock often aborts the exact requests that fire the popup or chain
2843
+ // to the tracker, defeating capture. Setting disable_adblock: true for
2844
+ // those specific URLs lets the chain play out naturally so the popup
2845
+ // request listener can observe the full hop sequence.
2846
+ const disableAdblock = siteConfig.disable_adblock === true;
2692
2847
 
2693
2848
  // Pre-build Set for O(1) resourceType lookups (fired per request)
2694
2849
  const allowedResourceTypesSet = Array.isArray(siteConfig.resourceTypes)
2695
2850
  ? new Set(siteConfig.resourceTypes)
2696
2851
  : null;
2697
-
2698
- // Combine site-specific with pre-compiled global blocked patterns
2699
- const allBlockedRegexes = blockedRegexes.length > 0
2700
- ? [...blockedRegexes, ...globalBlockedRegexes]
2701
- : globalBlockedRegexes; // Avoid spread when no site-specific patterns
2852
+
2853
+ // Combine site-specific with pre-compiled global blocked patterns.
2854
+ // When disable_adblock is true, globalBlockedRegexes is omitted so
2855
+ // only the per-site list applies.
2856
+ const allBlockedRegexes = disableAdblock
2857
+ ? blockedRegexes
2858
+ : (blockedRegexes.length > 0
2859
+ ? [...blockedRegexes, ...globalBlockedRegexes]
2860
+ : globalBlockedRegexes); // Avoid spread when no site-specific patterns
2861
+
2862
+ if (disableAdblock && forceDebug) {
2863
+ const dropped = globalBlockedRegexes.length;
2864
+ const adblockNote = adblockEnabled && adblockMatcher ? ' + adblock-rs engine' : '';
2865
+ console.log(formatLogMessage('debug', `[adblock] disable_adblock=true for ${currentUrl} — skipping ${dropped} global blocked patterns${adblockNote} (site-level ${blockedRegexes.length} pattern(s) still apply)`));
2866
+ }
2702
2867
 
2703
2868
  /**
2704
2869
  * Helper function to add domain to matched collection
@@ -2725,7 +2890,7 @@ function setupFrameHandling(page, forceDebug) {
2725
2890
  const cachedSimilarity = smartCache.getCachedSimilarity(domain, existingDomain);
2726
2891
  if (cachedSimilarity !== null && cachedSimilarity >= similarityThreshold) {
2727
2892
  if (forceDebug) {
2728
- console.log(formatLogMessage('debug', `[SmartCache] Used cached similarity: ${domain} ~= ${existingDomain} (${cachedSimilarity}%)`));
2893
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Used cached similarity: ${domain} ~= ${existingDomain} (${cachedSimilarity}%)`));
2729
2894
  }
2730
2895
  return; // Skip adding this domain
2731
2896
  }
@@ -2749,7 +2914,7 @@ function setupFrameHandling(page, forceDebug) {
2749
2914
 
2750
2915
  if (smartCache && smartCache.shouldSkipDomain(domain, context)) {
2751
2916
  if (forceDebug) {
2752
- console.log(formatLogMessage('debug', `[SmartCache] Skipping cached domain: ${domain}`));
2917
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Skipping cached domain: ${domain}`));
2753
2918
  }
2754
2919
  return; // Skip adding this domain
2755
2920
  }
@@ -2767,7 +2932,7 @@ function setupFrameHandling(page, forceDebug) {
2767
2932
 
2768
2933
  if (similarCheck.shouldIgnore) {
2769
2934
  if (forceDebug) {
2770
- console.log(formatLogMessage('debug', `[ignore_similar] Skipping ${domain}: ${similarCheck.reason}`));
2935
+ console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_TAG} Skipping ${domain}: ${similarCheck.reason}`));
2771
2936
  }
2772
2937
  return; // Skip adding this domain
2773
2938
  }
@@ -2783,7 +2948,7 @@ function setupFrameHandling(page, forceDebug) {
2783
2948
 
2784
2949
  if (ignoredSimilarCheck.shouldIgnore) {
2785
2950
  if (forceDebug) {
2786
- console.log(formatLogMessage('debug', `[ignore_similar_ignored_domains] Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
2951
+ console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_IGNORED_DOMAINS_TAG} Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
2787
2952
  }
2788
2953
  return; // Skip adding this domain
2789
2954
  }
@@ -2804,7 +2969,7 @@ function setupFrameHandling(page, forceDebug) {
2804
2969
  }
2805
2970
  } catch (cacheErr) {
2806
2971
  if (forceDebug) {
2807
- console.log(formatLogMessage('debug', `[SmartCache] Error marking domain: ${cacheErr.message}`));
2972
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
2808
2973
  }
2809
2974
  }
2810
2975
  }
@@ -2822,6 +2987,247 @@ function setupFrameHandling(page, forceDebug) {
2822
2987
  }
2823
2988
  }
2824
2989
 
2990
+ // === POPUP CAPTURE (opt-in via siteConfig.capture_popups: true) ===
2991
+ // Many ad networks fire popunders / new-tab opens (window.open, target=
2992
+ // "_blank") that navigate to trackers and disappear from view. Those
2993
+ // pages are SEPARATE Puppeteer targets — page.on('request', ...) on the
2994
+ // main page never sees their network traffic.
2995
+ //
2996
+ // IMPORTANT: modern Chromium blocks programmatic window.open() unless
2997
+ // it's triggered by a real user gesture. In practice that means
2998
+ // capture_popups only catches anything when the scanner is actually
2999
+ // clicking on the page — i.e., the site config also has
3000
+ // `interact: true` AND `interact_clicks: true`. Setting capture_popups
3001
+ // alone will register the listener but no popups will fire.
3002
+ //
3003
+ // When capture_popups is true, we attach a browser-level 'targetcreated'
3004
+ // listener for THIS URL only. New page targets whose opener-chain leads
3005
+ // back to our main page (within maxDepth levels) get a stripped-down
3006
+ // request listener — same regex/first-party/ignoreDomains filter as
3007
+ // the main handler, same addMatchedDomain() sink, same domain
3008
+ // detection cache, same nettools/similarity logic (all inherited via
3009
+ // addMatchedDomain). Cloudflare bypass, adblock-rs matching, curl/grep
3010
+ // content download, and request.abort() are intentionally skipped on
3011
+ // popups — they're observation-only.
3012
+ //
3013
+ // Each popup's request listener stays attached across in-window
3014
+ // navigations, so a single popup that redirects A -> B -> C captures
3015
+ // every hop. The capture window (default 5s, configurable per-site
3016
+ // via capture_popups_window_ms) is the wall-clock budget for that
3017
+ // chain — bump it for long redirect chains, lower it for high-popup-
3018
+ // rate sites where memory pressure matters more than chain coverage.
3019
+ const capturePopups = siteConfig.capture_popups === true;
3020
+ // Per-site overrides (with sane defaults). Parsed as numbers so config
3021
+ // values from JSON come through correctly; falsy / non-positive values
3022
+ // fall back to the default rather than silently disabling capture.
3023
+ const POPUP_MAX_DEPTH = (() => {
3024
+ const v = parseInt(siteConfig.capture_popups_max_depth, 10);
3025
+ return Number.isFinite(v) && v > 0 ? v : 2;
3026
+ })();
3027
+ const POPUP_CAPTURE_WINDOW_MS = (() => {
3028
+ const v = parseInt(siteConfig.capture_popups_window_ms, 10);
3029
+ return Number.isFinite(v) && v > 0 ? v : 5000;
3030
+ })();
3031
+
3032
+ if (capturePopups && forceDebug) {
3033
+ // One-time setup-time warning if the click prerequisite isn't met.
3034
+ // Without clicks, capture_popups is a no-op in practice.
3035
+ const hasClicks = siteConfig.interact === true && siteConfig.interact_clicks === true;
3036
+ if (!hasClicks) {
3037
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but interact_clicks is not — popups need user-gesture clicks to fire; expect no captures unless the page opens popups via in-page redirects`));
3038
+ }
3039
+ console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
3040
+ }
3041
+
3042
+ if (capturePopups) {
3043
+ const mainTarget = page.target();
3044
+
3045
+ // Walk target.opener() chain to find depth relative to mainTarget.
3046
+ // Returns 0 if no opener within POPUP_MAX_DEPTH + 2 hops is mainTarget,
3047
+ // 1 for a direct popup of the main page, 2 for popup-of-popup, etc.
3048
+ const getPopupDepth = (target) => {
3049
+ let depth = 0;
3050
+ let cur = target.opener();
3051
+ while (cur && depth <= POPUP_MAX_DEPTH + 1) {
3052
+ depth++;
3053
+ if (cur === mainTarget) return depth;
3054
+ cur = cur.opener();
3055
+ }
3056
+ return 0;
3057
+ };
3058
+
3059
+ // Attach observation-only request listener to a popup page. No
3060
+ // setRequestInterception(true) — page.on('request') fires for every
3061
+ // request regardless of interception state, and we don't need to
3062
+ // block anything on popups.
3063
+ const attachPopupRequestCapture = (popupPage, depth) => {
3064
+ popupPage.on('request', (request) => {
3065
+ try {
3066
+ const checkedUrl = request.url();
3067
+ let fullSubdomain = '';
3068
+ let checkedRootDomain = '';
3069
+ try {
3070
+ const parsedUrl = new URL(checkedUrl);
3071
+ fullSubdomain = parsedUrl.hostname;
3072
+ const pslResult = psl.parse(fullSubdomain);
3073
+ checkedRootDomain = pslResult.domain || fullSubdomain;
3074
+ } catch (_) { return; }
3075
+ if (!checkedRootDomain) return;
3076
+
3077
+ // ignoreDomainsByUrl — if any pattern matches this popup URL,
3078
+ // mark the root domain as ignored for the rest of the scan
3079
+ // (main page + all popups). Mirrors the main handler so a
3080
+ // tracker URL surfaced via popup chain has the same dampening
3081
+ // effect as one surfaced on the main page.
3082
+ if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
3083
+ for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
3084
+ if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
3085
+ _dynamicallyIgnoredDomains.add(checkedRootDomain);
3086
+ if (forceDebug) {
3087
+ console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3088
+ }
3089
+ break;
3090
+ }
3091
+ }
3092
+ }
3093
+
3094
+ // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
3095
+ // above; populating the dynamic block Set from popup URLs lets
3096
+ // tracker URLs surfaced via popup chains poison their root
3097
+ // domain for the rest of the scan just like main-page hits do.
3098
+ if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3099
+ for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3100
+ if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
3101
+ _dynamicallyBlockedDomains.add(checkedRootDomain);
3102
+ if (forceDebug) {
3103
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3104
+ }
3105
+ break;
3106
+ }
3107
+ }
3108
+ }
3109
+
3110
+ // ignoreDomains gate (global; matchesIgnoreDomain also short-
3111
+ // circuits on _dynamicallyIgnoredDomains, so a domain we just
3112
+ // added above will be caught here on the same request).
3113
+ if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
3114
+
3115
+ // Dynamic-block gate for popup requests — early return on
3116
+ // matched root or any parent (parent-walk in
3117
+ // matchesDynamicBlock). Interception is not enabled on popups,
3118
+ // so request.abort() is not an option; we just return and the
3119
+ // popup-request observer treats this as "don't process".
3120
+ if (matchesDynamicBlock(checkedRootDomain)) return;
3121
+
3122
+ // First-party / third-party gate (popup belongs to the main URL's
3123
+ // domain group — its OWN URL doesn't redefine first-party).
3124
+ const isFirstParty = firstPartyDomains.has(checkedRootDomain);
3125
+ if (siteConfig.firstParty === false && isFirstParty) return;
3126
+ if (siteConfig.thirdParty === false && !isFirstParty) return;
3127
+
3128
+ // Regex match against the site's filterRegex list
3129
+ const resourceType = request.resourceType();
3130
+ let regexMatched = false;
3131
+ for (const re of regexes) {
3132
+ if (re.test(checkedUrl)) {
3133
+ regexMatched = true;
3134
+ if (forceDebug) {
3135
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
3136
+ }
3137
+ break;
3138
+ }
3139
+ }
3140
+
3141
+ if (!regexMatched) return;
3142
+
3143
+ // hasNetTools is the same flag the main handler uses (line ~2639).
3144
+ // When the site config carries whois/dig terms, regex match is
3145
+ // not sufficient by itself — the URL must ALSO pass the whois/
3146
+ // dig validation before it counts. Mirrors the main handler's
3147
+ // behavior so 'capture popup domains that match regex/dig/whois'
3148
+ // means the same thing for popups as for the main page.
3149
+ if (hasNetTools) {
3150
+ const popupNetToolsHandler = createNetToolsHandler({
3151
+ whoisTerms, whoisOrTerms,
3152
+ processedWhoisDomains: globalProcessedWhoisDomains,
3153
+ processedDigDomains: globalProcessedDigDomains,
3154
+ whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
3155
+ whoisServer,
3156
+ whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
3157
+ debugLogFile,
3158
+ digTerms, digOrTerms, digRecordType,
3159
+ digSubdomain: siteConfig.dig_subdomain === true,
3160
+ dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
3161
+ matchedDomains, addMatchedDomain,
3162
+ isDomainAlreadyDetected: isLocallyDetected,
3163
+ onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
3164
+ onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
3165
+ cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
3166
+ cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
3167
+ currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
3168
+ ignoreDomains, matchesIgnoreDomain
3169
+ });
3170
+ setImmediate(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3171
+ } else {
3172
+ // No nettools required — regex match alone counts.
3173
+ addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3174
+ }
3175
+ } catch (_) { /* observation-only — never let a popup error escape */ }
3176
+ });
3177
+ };
3178
+
3179
+ const onTargetCreated = async (target) => {
3180
+ // Short-circuit guard: if finally has already started, don't attach
3181
+ // a request listener whose closure would outlive its meaningful
3182
+ // scope. The race is narrow (a targetcreated firing while we're
3183
+ // mid-await on target.page() across the finally boundary), but
3184
+ // without this guard a late popup could push matches into
3185
+ // matchedDomains for a URL whose processing has already returned.
3186
+ if (urlFinished) return;
3187
+ if (target.type() !== 'page') return;
3188
+ const depth = getPopupDepth(target);
3189
+ if (depth < 1) return; // Not one of ours
3190
+ if (depth > POPUP_MAX_DEPTH) {
3191
+ if (forceDebug) {
3192
+ console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
3193
+ }
3194
+ return;
3195
+ }
3196
+
3197
+ let popupPage;
3198
+ try { popupPage = await target.page(); } catch (_) { return; }
3199
+ if (!popupPage) return;
3200
+ // Re-check after the await — the per-URL finally may have flipped
3201
+ // the flag while target.page() was resolving.
3202
+ if (urlFinished) {
3203
+ try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
3204
+ return;
3205
+ }
3206
+
3207
+ if (forceDebug) {
3208
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
3209
+ }
3210
+
3211
+ attachPopupRequestCapture(popupPage, depth);
3212
+
3213
+ // Auto-close after the capture window so popups don't pile up.
3214
+ const closeTimer = setTimeout(() => {
3215
+ try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
3216
+ }, POPUP_CAPTURE_WINDOW_MS);
3217
+ if (typeof closeTimer.unref === 'function') closeTimer.unref();
3218
+
3219
+ popupCleanups.push(() => {
3220
+ clearTimeout(closeTimer);
3221
+ try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
3222
+ });
3223
+ };
3224
+
3225
+ browser.on('targetcreated', onTargetCreated);
3226
+ popupCleanups.push(() => {
3227
+ try { browser.off('targetcreated', onTargetCreated); } catch (_) {}
3228
+ });
3229
+ }
3230
+
2825
3231
  // --- page.on('request', ...) Handler: Core Network Request Logic ---
2826
3232
  // This handler is triggered for every network request made by the page.
2827
3233
  // It decides whether to allow, block, or process the request based on:
@@ -2882,15 +3288,17 @@ function setupFrameHandling(page, forceDebug) {
2882
3288
  console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[frame: ${isMainFrame ? 'main' : 'iframe'}] ${debugFrameUrl} → ${checkedUrl}`));
2883
3289
  }
2884
3290
 
2885
- // Apply adblock rules BEFORE expensive regex checks for better performance
2886
- if (adblockEnabled && adblockMatcher) {
3291
+ // Apply adblock-rs filter-list rules BEFORE expensive regex checks
3292
+ // for better performance. Gated on !disableAdblock so per-URL configs
3293
+ // (e.g. for popup/redirect chain capture) can bypass it.
3294
+ if (!disableAdblock && adblockEnabled && adblockMatcher) {
2887
3295
  try {
2888
3296
  const result = adblockMatcher.shouldBlock(
2889
3297
  checkedUrl,
2890
3298
  currentUrl,
2891
3299
  request.resourceType()
2892
3300
  );
2893
-
3301
+
2894
3302
  if (result.blocked) {
2895
3303
  adblockStats.blocked++;
2896
3304
  if (forceDebug) {
@@ -2924,12 +3332,41 @@ function setupFrameHandling(page, forceDebug) {
2924
3332
  if (_ignoreDomainsByUrlRegexes[i].test(reqUrl)) {
2925
3333
  _dynamicallyIgnoredDomains.add(checkedRootDomain);
2926
3334
  if (forceDebug) {
2927
- console.log(formatLogMessage('debug', `[ignoreDomainsByUrl] ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
3335
+ console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
3336
+ }
3337
+ break;
3338
+ }
3339
+ }
3340
+ }
3341
+
3342
+ // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl above.
3343
+ // If any pattern matches this URL, mark the root domain as blocked
3344
+ // for the rest of the scan. The gate immediately below catches the
3345
+ // triggering request itself + any future request on this domain or
3346
+ // its subdomains (parent-walk via matchesDynamicBlock).
3347
+ if (_blockDomainsByUrlRegexes.length > 0 && checkedRootDomain && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3348
+ for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3349
+ if (_blockDomainsByUrlRegexes[i].test(reqUrl)) {
3350
+ _dynamicallyBlockedDomains.add(checkedRootDomain);
3351
+ if (forceDebug) {
3352
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source}`));
2928
3353
  }
2929
3354
  break;
2930
3355
  }
2931
3356
  }
2932
3357
  }
3358
+ // blockDomainsByUrl gate — abort if reqDomain (or a parent) is in
3359
+ // the dynamic block Set. Fires BEFORE the static blocked-regex
3360
+ // check so domain-based blocks short-circuit without paying the
3361
+ // per-URL regex scan. Same abort reason as the static path so
3362
+ // request.failure() observers see consistent metadata.
3363
+ if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
3364
+ if (forceDebug) {
3365
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
3366
+ }
3367
+ request.abort('blockedbyclient');
3368
+ return;
3369
+ }
2933
3370
 
2934
3371
  let blockedMatchIndex = -1;
2935
3372
  for (let i = 0; i < allBlockedRegexes.length; i++) {
@@ -2939,8 +3376,16 @@ function setupFrameHandling(page, forceDebug) {
2939
3376
  }
2940
3377
  }
2941
3378
  if (blockedMatchIndex !== -1) {
3379
+ // Always track the hit (one Map get/set, even when debug is off) so the
3380
+ // scan-end summary can show which patterns are doing work vs.
3381
+ // which are stale and ready to prune. Keyed by pattern.source --
3382
+ // identical patterns from site + global lists roll up together,
3383
+ // which matches how users think about them.
3384
+ const matchedPatternSrc = allBlockedRegexes[blockedMatchIndex].source;
3385
+ _blockedPatternHits.set(matchedPatternSrc, (_blockedPatternHits.get(matchedPatternSrc) || 0) + 1);
3386
+
2942
3387
  if (forceDebug) {
2943
- const matchedPattern = allBlockedRegexes[blockedMatchIndex].source;
3388
+ const matchedPattern = matchedPatternSrc;
2944
3389
  const patternSource = blockedMatchIndex < blockedRegexes.length ? 'site' : 'global';
2945
3390
  console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked]')}[${simplifiedCurrentUrl}] ${reqUrl} blocked by ${patternSource} pattern: ${matchedPattern}`));
2946
3391
 
@@ -3012,6 +3457,19 @@ function setupFrameHandling(page, forceDebug) {
3012
3457
  return;
3013
3458
  }
3014
3459
 
3460
+ // Early ignoreDomains gate — skip regex + dig/whois entirely for domains
3461
+ // in the ignoreDomains list (or dynamically-ignored ones populated by
3462
+ // ignoreDomainsByUrl above). Mirrors the popup handler's early gate so
3463
+ // the main path doesn't waste a dig/whois lookup on domains that
3464
+ // post-processing/output filters will strip anyway.
3465
+ if (matchesIgnoreDomain(reqDomain, ignoreDomains)) {
3466
+ if (forceDebug) {
3467
+ console.log(formatLogMessage('debug', `Skipping ignoreDomains match: ${reqDomain}`));
3468
+ }
3469
+ request.continue();
3470
+ return;
3471
+ }
3472
+
3015
3473
  // === ENHANCED REGEX MATCHING WITH AND/OR LOGIC ===
3016
3474
  let regexMatched = false;
3017
3475
  let matchedRegexPattern = null;
@@ -3108,9 +3566,11 @@ function setupFrameHandling(page, forceDebug) {
3108
3566
  dumpUrls,
3109
3567
  matchedUrlsLogFile,
3110
3568
  forceDebug,
3111
- fs
3569
+ fs,
3570
+ ignoreDomains,
3571
+ matchesIgnoreDomain
3112
3572
  });
3113
-
3573
+
3114
3574
  // Execute nettools check asynchronously
3115
3575
  const originalDomain = fullSubdomain;
3116
3576
  setImmediate(() => netToolsHandler(reqDomain, originalDomain));
@@ -3184,7 +3644,7 @@ function setupFrameHandling(page, forceDebug) {
3184
3644
  const cachedDig = smartCache ? smartCache.getCachedNetTools(reqDomain, 'dig', digRecordType) : null;
3185
3645
 
3186
3646
  if ((cachedWhois || cachedDig) && forceDebug) {
3187
- console.log(formatLogMessage('debug', `[SmartCache] Using cached nettools results for ${reqDomain}`));
3647
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached nettools results for ${reqDomain}`));
3188
3648
  }
3189
3649
 
3190
3650
  // Create nettools handler with cache callbacks (if cache is enabled)
@@ -3221,9 +3681,11 @@ function setupFrameHandling(page, forceDebug) {
3221
3681
  dumpUrls,
3222
3682
  matchedUrlsLogFile,
3223
3683
  forceDebug,
3224
- fs
3684
+ fs,
3685
+ ignoreDomains,
3686
+ matchesIgnoreDomain
3225
3687
  });
3226
-
3688
+
3227
3689
  // Execute nettools check asynchronously
3228
3690
  const originalDomain = fullSubdomain; // Use full subdomain for nettools
3229
3691
  setImmediate(() => netToolsHandler(reqDomain, originalDomain));
@@ -3280,7 +3742,7 @@ function setupFrameHandling(page, forceDebug) {
3280
3742
  }
3281
3743
 
3282
3744
  if (cachedContent && forceDebug) {
3283
- console.log(formatLogMessage('debug', `[SmartCache] Using cached response content for ${reqUrl.substring(0, 50)}...`));
3745
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached response content for ${reqUrl.substring(0, 50)}...`));
3284
3746
  // Process cached content instead of fetching
3285
3747
  } else {
3286
3748
  try {
@@ -3310,7 +3772,12 @@ function setupFrameHandling(page, forceDebug) {
3310
3772
  forceDebug,
3311
3773
  userAgent: curlUserAgent,
3312
3774
  resourceType,
3313
- hasSearchString: hasSearchString || hasSearchStringAnd,
3775
+ // Pass both flags separately — createGrepHandler now
3776
+ // applies AND logic when hasSearchStringAnd is set.
3777
+ // Previously OR'd into hasSearchString and the AND
3778
+ // patterns were silently dropped.
3779
+ hasSearchString,
3780
+ hasSearchStringAnd,
3314
3781
  grepOptions: {
3315
3782
  ignoreCase: true,
3316
3783
  wholeWord: false,
@@ -3360,7 +3827,7 @@ function setupFrameHandling(page, forceDebug) {
3360
3827
  } else if (useGrep && (hasSearchString || hasSearchStringAnd)) {
3361
3828
  // Use grep with response handler (no curl)
3362
3829
  if (forceDebug) {
3363
- console.log(formatLogMessage('debug', `[grep-response] Queuing ${reqUrl} for grep analysis via response handler`));
3830
+ console.log(formatLogMessage('debug', `${GREP_RESPONSE_TAG} Queuing ${reqUrl} for grep analysis via response handler`));
3364
3831
  }
3365
3832
 
3366
3833
  // Queue for grep processing via response handler
@@ -3448,7 +3915,7 @@ function setupFrameHandling(page, forceDebug) {
3448
3915
  }
3449
3916
  }, cssBlockedSelectors);
3450
3917
  } catch (cssRuntimeErr) {
3451
- console.warn(formatLogMessage('warn', `[css_blocked] Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
3918
+ console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
3452
3919
  }
3453
3920
  }
3454
3921
  }
@@ -3760,8 +4227,8 @@ function setupFrameHandling(page, forceDebug) {
3760
4227
  const proxyErr = proxyErrors.find(e => err.message.includes(e));
3761
4228
  if (proxyErr) {
3762
4229
  const info = getProxyInfo(siteConfig);
3763
- console.error(formatLogMessage('error', `[proxy] ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
3764
- console.error(formatLogMessage('error', `[proxy] Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
4230
+ console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
4231
+ console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
3765
4232
  }
3766
4233
  }
3767
4234
  console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
@@ -3813,7 +4280,7 @@ function setupFrameHandling(page, forceDebug) {
3813
4280
  try {
3814
4281
  if (ghostConfig) {
3815
4282
  // Ghost-cursor mode: Bezier-based mouse movements
3816
- if (forceDebug) console.log(formatLogMessage('debug', `[ghost-cursor] Using ghost-cursor for ${currentUrl}`));
4283
+ if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
3817
4284
  const cursor = createGhostCursor(page, { forceDebug });
3818
4285
  if (cursor) {
3819
4286
  await Promise.race([
@@ -3851,8 +4318,7 @@ function setupFrameHandling(page, forceDebug) {
3851
4318
  await performPageInteraction(page, currentUrl, {
3852
4319
  ...interactionConfig,
3853
4320
  mouseMovements: 0,
3854
- includeElementClicks: false,
3855
- includeTyping: false
4321
+ includeElementClicks: false
3856
4322
  }, forceDebug);
3857
4323
  }
3858
4324
  })(),
@@ -3873,7 +4339,7 @@ function setupFrameHandling(page, forceDebug) {
3873
4339
  ]);
3874
4340
  }
3875
4341
  } catch (interactTimeoutErr) {
3876
- if (forceDebug) console.log(formatLogMessage('debug', `[interaction] Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
4342
+ if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
3877
4343
  }
3878
4344
  })();
3879
4345
 
@@ -4008,7 +4474,7 @@ function setupFrameHandling(page, forceDebug) {
4008
4474
  const clearResult = await clearSiteData(page, currentUrl, forceDebug, true); // Quick mode for reloads
4009
4475
  if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i} for ${currentUrl}`));
4010
4476
  } catch (reloadClearErr) {
4011
- if (forceDebug) console.log(formatLogMessage('debug', `[clear_sitedata] Before reload failed for ${currentUrl}`));
4477
+ if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
4012
4478
  }
4013
4479
  }
4014
4480
 
@@ -4202,8 +4668,8 @@ function setupFrameHandling(page, forceDebug) {
4202
4668
  const proxyErr = proxyErrors.find(e => err.message.includes(e));
4203
4669
  if (proxyErr) {
4204
4670
  const info = getProxyInfo(siteConfig);
4205
- console.error(formatLogMessage('error', `[proxy] ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
4206
- console.error(formatLogMessage('error', `[proxy] Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
4671
+ console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
4672
+ console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
4207
4673
  }
4208
4674
  }
4209
4675
 
@@ -4270,17 +4736,33 @@ function setupFrameHandling(page, forceDebug) {
4270
4736
  };
4271
4737
  } finally {
4272
4738
  // Guaranteed resource cleanup - this runs regardless of success or failure
4273
-
4739
+
4740
+ // Flip the popup-capture race-window guard first so any in-flight
4741
+ // 'targetcreated' handler that resolves after this point sees the
4742
+ // flag and bails (closing its own popup if it managed to fetch one).
4743
+ urlFinished = true;
4744
+
4745
+ // Popup capture teardown (opt-in via siteConfig.capture_popups). Each
4746
+ // entry is either the browser.off('targetcreated', ...) deregistration
4747
+ // or a per-popup (clearTimeout + popupPage.close) cleanup. Iterate even
4748
+ // if one fails so the rest still run.
4749
+ if (popupCleanups.length) {
4750
+ for (const cleanup of popupCleanups) {
4751
+ try { cleanup(); } catch (_) {}
4752
+ }
4753
+ popupCleanups.length = 0;
4754
+ }
4755
+
4274
4756
  // Disconnect VPN for this site
4275
4757
  if (siteConfig.vpn) {
4276
4758
  const vpnDown = wgDisconnect(siteConfig, forceDebug);
4277
4759
  if (vpnDown.tornDown && forceDebug) {
4278
- console.log(formatLogMessage('debug', `[vpn] WireGuard interface torn down for ${currentUrl}`));
4760
+ console.log(formatLogMessage('debug', `${VPN_TAG} WireGuard interface torn down for ${currentUrl}`));
4279
4761
  }
4280
4762
  } else if (siteConfig.openvpn) {
4281
4763
  const ovpnDown = ovpnDisconnect(siteConfig, forceDebug);
4282
4764
  if (ovpnDown.tornDown && forceDebug) {
4283
- console.log(formatLogMessage('debug', `[vpn] OpenVPN connection torn down for ${currentUrl}`));
4765
+ console.log(formatLogMessage('debug', `${VPN_TAG} OpenVPN connection torn down for ${currentUrl}`));
4284
4766
  }
4285
4767
  }
4286
4768
 
@@ -4395,7 +4877,13 @@ function setupFrameHandling(page, forceDebug) {
4395
4877
  let lastProcessedCount = 0;
4396
4878
  let hangCheckCount = 0;
4397
4879
  let forceRestartFlag = false; // Flag to trigger restart on next iteration
4398
-
4880
+
4881
+ // Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
4882
+ // only colors the [severity] tag; the '[HANG CHECK]' substring was
4883
+ // sitting plain inside the message string. Colored once at function
4884
+ // entry so the interval callback doesn't re-colorize per tick.
4885
+ const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
4886
+
4399
4887
  const hangDetectionInterval = setInterval(() => {
4400
4888
  // Progress check, counter, and forceRestartFlag MUST run regardless of
4401
4889
  // debug mode — previously the entire body was gated on forceDebug, which
@@ -4406,10 +4894,10 @@ function setupFrameHandling(page, forceDebug) {
4406
4894
  if (processedUrlCount === lastProcessedCount) {
4407
4895
  hangCheckCount++;
4408
4896
  if (forceDebug) {
4409
- console.log(formatLogMessage('warn', `[HANG CHECK] No progress for ${hangCheckCount * 30}s`));
4897
+ console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
4410
4898
  }
4411
4899
  if (hangCheckCount >= 5) {
4412
- console.log(formatLogMessage('error', `[HANG CHECK] Hung for 2.5 minutes. Triggering emergency browser restart.`));
4900
+ console.log(formatLogMessage('error', `${HANG_CHECK_TAG} Hung for 2.5 minutes. Triggering emergency browser restart.`));
4413
4901
  forceRestartFlag = true; // Set flag instead of exiting
4414
4902
  hangCheckCount = 0; // Reset counter for next cycle
4415
4903
  }
@@ -4422,8 +4910,8 @@ function setupFrameHandling(page, forceDebug) {
4422
4910
  if (forceDebug) {
4423
4911
  const currentBatch = Math.floor(currentBatchInfo.batchStart / RESOURCE_CLEANUP_INTERVAL) + 1;
4424
4912
  const totalBatches = Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL);
4425
- console.log(formatLogMessage('debug', `[HANG CHECK] Processed: ${processedUrlCount}/${totalUrls} URLs, Batch: ${currentBatch}/${totalBatches}, Current batch size: ${currentBatchInfo.batchSize}`));
4426
- console.log(formatLogMessage('debug', `[HANG CHECK] URLs since cleanup: ${urlsSinceLastCleanup}, Recent failures: ${results.slice(-3).filter(r => !r.success).length}/3`));
4913
+ console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} Processed: ${processedUrlCount}/${totalUrls} URLs, Batch: ${currentBatch}/${totalBatches}, Current batch size: ${currentBatchInfo.batchSize}`));
4914
+ console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} URLs since cleanup: ${urlsSinceLastCleanup}, Recent failures: ${results.slice(-3).filter(r => !r.success).length}/3`));
4427
4915
  }
4428
4916
  }, 30000);
4429
4917
  // Don't keep the event loop alive solely for the hang-check interval — the
@@ -4434,29 +4922,46 @@ function setupFrameHandling(page, forceDebug) {
4434
4922
  // Process URLs in batches with exception handling
4435
4923
  let siteGroupIndex = 0;
4436
4924
  let currentProxyKey = ''; // Track active proxy config — '' means direct connection
4925
+ // Map of site-config object -> index in sites[], built once. Per-batch
4926
+ // grouping below uses this for O(1) lookup instead of sites.indexOf which
4927
+ // walked the array per task (batch=80 * sites=20 was ~1600 cmps per batch).
4928
+ const configToIndex = new Map();
4929
+ for (let i = 0; i < sites.length; i++) configToIndex.set(sites[i], i);
4437
4930
  try {
4438
4931
  for (let batchStart = 0; batchStart < totalUrls; batchStart += RESOURCE_CLEANUP_INTERVAL) {
4439
4932
  const batchEnd = Math.min(batchStart + RESOURCE_CLEANUP_INTERVAL, totalUrls);
4440
4933
  const currentBatch = allTasks.slice(batchStart, batchEnd);
4441
4934
 
4442
-
4443
- // Group tasks by their source site configuration for window cleanup
4935
+
4936
+ // Group tasks by their source site configuration for window cleanup.
4937
+ // Single get-or-set replaces has + get + set (one Map lookup not two).
4938
+ // The `?? -1` preserves the old `sites.indexOf` semantics for a task
4939
+ // whose config isn't in sites[] — that case shouldn't happen, but if
4940
+ // it ever does the routing stays identical to the prior code's
4941
+ // 'site_-1' bucket rather than silently shifting to 'site_undefined'.
4444
4942
  const tasksBySite = new Map();
4445
- currentBatch.forEach(task => {
4446
- const siteKey = `site_${sites.indexOf(task.config)}`;
4447
- if (!tasksBySite.has(siteKey)) {
4448
- tasksBySite.set(siteKey, []);
4449
- }
4450
- tasksBySite.get(siteKey).push(task);
4451
- });
4943
+ for (let i = 0; i < currentBatch.length; i++) {
4944
+ const task = currentBatch[i];
4945
+ const siteKey = `site_${configToIndex.get(task.config) ?? -1}`;
4946
+ let arr = tasksBySite.get(siteKey);
4947
+ if (!arr) tasksBySite.set(siteKey, arr = []);
4948
+ arr.push(task);
4949
+ }
4452
4950
 
4453
4951
  // IMPROVED: Only check health if we have indicators of problems
4454
4952
  let healthCheck = { shouldRestart: false, reason: null };
4455
4953
  const recentResults = results.slice(-8); // Check more results for better pattern detection
4456
- const recentFailureRate = recentResults.length > 0 ?
4457
- recentResults.filter(r => !r.success).length / recentResults.length : 0;
4954
+ // Single-pass count for both failure rate and critical-error tally —
4955
+ // was two .filter(...).length calls allocating two intermediate arrays.
4956
+ let recentFailures = 0, recentCritical = 0;
4957
+ for (let i = 0; i < recentResults.length; i++) {
4958
+ const r = recentResults[i];
4959
+ if (!r.success) recentFailures++;
4960
+ if (r.needsImmediateRestart) recentCritical++;
4961
+ }
4962
+ const recentFailureRate = recentResults.length > 0 ? recentFailures / recentResults.length : 0;
4458
4963
  const hasHighFailureRate = recentFailureRate > 0.75; // 75% failure threshold (more conservative)
4459
- const hasCriticalErrors = recentResults.filter(r => r.needsImmediateRestart).length > 2;
4964
+ const hasCriticalErrors = recentCritical > 2;
4460
4965
 
4461
4966
  // Only run health checks when we have STRONG indicators of problems
4462
4967
  if (urlsSinceLastCleanup > 15 && (
@@ -4465,15 +4970,21 @@ function setupFrameHandling(page, forceDebug) {
4465
4970
  urlsSinceLastCleanup > RESOURCE_CLEANUP_INTERVAL * 0.9 // Very close to cleanup limit
4466
4971
  )) {
4467
4972
  try {
4973
+ // Race the health check against a 30s timeout. Attach .catch on the
4974
+ // health promise itself so that if the timeout wins, the still-running
4975
+ // monitorBrowserHealth's eventual rejection doesn't surface as an
4976
+ // unhandledRejection warning.
4977
+ const healthPromise = monitorBrowserHealth(browser, {}, {
4978
+ siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
4979
+ totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
4980
+ urlsSinceCleanup: urlsSinceLastCleanup,
4981
+ cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
4982
+ forceDebug,
4983
+ silentMode
4984
+ });
4985
+ healthPromise.catch(() => {});
4468
4986
  healthCheck = await Promise.race([
4469
- monitorBrowserHealth(browser, {}, {
4470
- siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
4471
- totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
4472
- urlsSinceCleanup: urlsSinceLastCleanup,
4473
- cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
4474
- forceDebug,
4475
- silentMode
4476
- }),
4987
+ healthPromise,
4477
4988
  new Promise((_, reject) => setTimeout(() => reject(new Error('Health check timeout')), 30000))
4478
4989
  ]);
4479
4990
  } catch (healthError) {
@@ -4502,8 +5013,17 @@ function setupFrameHandling(page, forceDebug) {
4502
5013
  // timeout) bypasses the urlsSinceLastCleanup > 8 gate — a confirmed hang
4503
5014
  // needs immediate restart even if we just cleaned up. Proactive triggers
4504
5015
  // keep the gate to prevent thrashing.
5016
+ //
5017
+ // hasHighFailureRate is computed (and still used for the health-check
5018
+ // gate above) but intentionally NOT folded into proactiveRestart:
5019
+ // wouldExceedLimit is always true at every batch boundary with the
5020
+ // default RESOURCE_CLEANUP_INTERVAL == batch size, so the high-failure-
5021
+ // rate branch was dead code reached only at the same boundary that
5022
+ // wouldExceedLimit already triggers. If failure-rate ever needs to
5023
+ // interrupt mid-cleanup-interval, that requires interrupting the
5024
+ // running Promise.all — a real behavior change, not an OR addition.
4505
5025
  const hangRecoveryRestart = forceRestartFlag;
4506
- const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth || (hasHighFailureRate && recentResults.length >= 6)) && urlsSinceLastCleanup > 8;
5026
+ const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth) && urlsSinceLastCleanup > 8;
4507
5027
  if ((hangRecoveryRestart || proactiveRestart) && isNotLastBatch) {
4508
5028
  let restartReason = 'Unknown';
4509
5029
  if (forceRestartFlag) {
@@ -4511,8 +5031,6 @@ function setupFrameHandling(page, forceDebug) {
4511
5031
  forceRestartFlag = false; // Reset the flag
4512
5032
  } else if (shouldRestartFromHealth) {
4513
5033
  restartReason = healthCheck.reason;
4514
- } else if (hasHighFailureRate) {
4515
- restartReason = `High failure rate: ${Math.round(recentFailureRate * 100)}% in recent batch`;
4516
5034
  } else if (wouldExceedLimit) {
4517
5035
  restartReason = `Processed ${urlsSinceLastCleanup} URLs (scheduled maintenance)`;
4518
5036
  }
@@ -4527,7 +5045,7 @@ function setupFrameHandling(page, forceDebug) {
4527
5045
  if (requestCacheStats.enabled && requestCacheStats.size > 0) {
4528
5046
  const clearedCount = smartCache.clearRequestCache();
4529
5047
  if (forceDebug) {
4530
- console.log(formatLogMessage('debug', `[SmartCache] Cleared ${clearedCount} request cache entries during browser restart`));
5048
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during browser restart`));
4531
5049
  }
4532
5050
  }
4533
5051
  }
@@ -4542,24 +5060,21 @@ function setupFrameHandling(page, forceDebug) {
4542
5060
  });
4543
5061
 
4544
5062
  // Clean up the specific user data directory
4545
- if (userDataDir && fs.existsSync(userDataDir)) {
4546
- fs.rmSync(userDataDir, { recursive: true, force: true });
4547
- if (forceDebug) console.log(formatLogMessage('debug', `Cleaned user data dir: ${userDataDir}`));
4548
- }
5063
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4549
5064
 
4550
5065
  // Additional cleanup for any remaining Chrome processes
4551
5066
  if (removeTempFiles) {
4552
- await cleanupChromeTempFiles({
4553
- includeSnapTemp: true,
5067
+ await cleanupChromeTempFiles({
5068
+ includeSnapTemp: true,
4554
5069
  forceDebug,
4555
- comprehensive: true
5070
+ comprehensive: true
4556
5071
  });
4557
5072
  }
4558
5073
 
4559
5074
  } catch (browserCloseErr) {
4560
5075
  if (forceDebug) console.log(formatLogMessage('debug', `Browser cleanup warning: ${browserCloseErr.message}`));
4561
5076
  }
4562
-
5077
+
4563
5078
  // Create new browser for next batch (preserve current proxy config)
4564
5079
  const restartProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
4565
5080
  browser = await createBrowser(restartProxyArgs);
@@ -4567,7 +5082,6 @@ function setupFrameHandling(page, forceDebug) {
4567
5082
 
4568
5083
  // Reset cleanup counter and add delay
4569
5084
  urlsSinceLastCleanup = 0;
4570
- purgeStaleTrackers();
4571
5085
  await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
4572
5086
  }
4573
5087
 
@@ -4587,9 +5101,7 @@ function setupFrameHandling(page, forceDebug) {
4587
5101
  forceDebug, timeout: 10000, exitOnFailure: false,
4588
5102
  cleanTempFiles: true, comprehensiveCleanup: removeTempFiles
4589
5103
  });
4590
- if (userDataDir && fs.existsSync(userDataDir)) {
4591
- fs.rmSync(userDataDir, { recursive: true, force: true });
4592
- }
5104
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4593
5105
  } catch (proxyRestartErr) {
4594
5106
  if (forceDebug) console.log(formatLogMessage('debug', `Proxy switch browser cleanup: ${proxyRestartErr.message}`));
4595
5107
  }
@@ -4601,8 +5113,8 @@ function setupFrameHandling(page, forceDebug) {
4601
5113
  const health = await testProxy(currentBatch[0].config, 5000);
4602
5114
  if (!health.reachable) {
4603
5115
  const info = getProxyInfo(currentBatch[0].config);
4604
- console.error(formatLogMessage('error', `[proxy] Unreachable: ${info} — ${health.error}`));
4605
- console.error(formatLogMessage('error', `[proxy] Skipping ${currentBatch.length} URL(s) in this batch`));
5116
+ console.error(formatLogMessage('error', `${PROXY_TAG} Unreachable: ${info} — ${health.error}`));
5117
+ console.error(formatLogMessage('error', `${PROXY_TAG} Skipping ${currentBatch.length} URL(s) in this batch`));
4606
5118
  const skipResults = currentBatch.map(task => ({
4607
5119
  success: false, url: task.url, rules: [],
4608
5120
  error: `Proxy unreachable: ${health.error}`
@@ -4620,7 +5132,6 @@ function setupFrameHandling(page, forceDebug) {
4620
5132
  browser = await createBrowser(proxyArgs);
4621
5133
  currentProxyKey = batchProxyKey;
4622
5134
  urlsSinceLastCleanup = 0;
4623
- purgeStaleTrackers();
4624
5135
  await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
4625
5136
  }
4626
5137
 
@@ -4630,7 +5141,7 @@ function setupFrameHandling(page, forceDebug) {
4630
5141
 
4631
5142
  // Log start of concurrent processing for hang detection
4632
5143
  if (forceDebug) {
4633
- console.log(formatLogMessage('debug', `[CONCURRENCY] Starting ${batchSize} concurrent tasks with limit ${MAX_CONCURRENT_SITES}`));
5144
+ console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Starting ${batchSize} concurrent tasks with limit ${MAX_CONCURRENT_SITES}`));
4634
5145
  }
4635
5146
 
4636
5147
  // Create tasks with timeout protection — skip domains that repeatedly timed out.
@@ -4642,7 +5153,7 @@ function setupFrameHandling(page, forceDebug) {
4642
5153
  try {
4643
5154
  // Short-circuit queued URLs once any URL in this batch has triggered a
4644
5155
  // restart. Without this, the 80-URL batch in the user's hang trace
4645
- // would have to fail one-by-one at 120s each (~28 min total) before
5156
+ // would have to fail one-by-one at 75s each (~25 min total) before
4646
5157
  // the boundary restart could fire. Now: first hang fires the flag,
4647
5158
  // remaining queued URLs return immediately, batch completes, restart.
4648
5159
  if (forceRestartFlag) {
@@ -4674,10 +5185,26 @@ function setupFrameHandling(page, forceDebug) {
4674
5185
  if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check (cached): ${taskDomain} — ${cached.error}`));
4675
5186
  return { url: task.url, rules: [], success: false, error: `DNS: ${cached.error}`, skipped: true };
4676
5187
  }
5188
+ // Positive-resolution shortcut: dig or whois has already proven this
5189
+ // hostname live within their 20h cache TTL (populated either by an
5190
+ // earlier URL this run or by --dns-cache disk-load from a prior run).
5191
+ // Order matters -- negative cache (5min TTL, fresher data) wins
5192
+ // first, then this 20h-TTL positive index, then the actual resolve.
5193
+ if (domainKnownToResolve(taskDomain)) {
5194
+ dnsPositiveSkips++;
5195
+ dnsPositiveSkippedHosts.add(taskDomain);
5196
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
5197
+ // Fall through to navigation -- pre-check "passed" by proxy.
5198
+ } else {
4677
5199
  const dnsResolve = async () => {
4678
5200
  // resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
4679
- // resolve6 so IPv6-only hosts aren't wrongly skipped. Only a
4680
- // failure of BOTH means the host is genuinely unresolvable.
5201
+ // resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
5202
+ // error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
5203
+ // unchanged so the outer transient-retry path sees the real
5204
+ // resolver code and the negative cache records the right reason.
5205
+ // Previously a bare .catch swallowed everything and tried
5206
+ // resolve6, which masked transient v4-side errors behind
5207
+ // whatever resolve6 ended up reporting.
4681
5208
  // 2s timeout kept as a real safety net — with c-ares off the
4682
5209
  // threadpool it should now rarely fire.
4683
5210
  let timer;
@@ -4686,7 +5213,12 @@ function setupFrameHandling(page, forceDebug) {
4686
5213
  timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
4687
5214
  });
4688
5215
  const resolveChain = dnsPromises.resolve4(taskDomain)
4689
- .catch(() => dnsPromises.resolve6(taskDomain));
5216
+ .catch(err => {
5217
+ if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
5218
+ return dnsPromises.resolve6(taskDomain);
5219
+ }
5220
+ throw err;
5221
+ });
4690
5222
  await Promise.race([resolveChain, timeoutP]);
4691
5223
  } finally {
4692
5224
  if (timer) clearTimeout(timer);
@@ -4694,13 +5226,13 @@ function setupFrameHandling(page, forceDebug) {
4694
5226
  };
4695
5227
  // c-ares transient codes — retry once so a momentary resolver
4696
5228
  // hiccup doesn't poison the negative cache for 5 minutes.
4697
- const TRANSIENT = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);
5229
+ // DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
4698
5230
  try {
4699
5231
  try {
4700
5232
  await dnsResolve();
4701
5233
  } catch (firstErr) {
4702
5234
  const code = firstErr && firstErr.code;
4703
- if (TRANSIENT.has(code) || (firstErr && firstErr.message === 'DNS timeout')) {
5235
+ if (DNS_TRANSIENT_ERRORS.has(code) || (firstErr && firstErr.message === 'DNS timeout')) {
4704
5236
  if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
4705
5237
  await dnsResolve();
4706
5238
  } else {
@@ -4714,26 +5246,31 @@ function setupFrameHandling(page, forceDebug) {
4714
5246
  if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
4715
5247
  return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
4716
5248
  }
5249
+ } // close `else` from domainKnownToResolve shortcut above
4717
5250
  }
4718
5251
  } catch {}
4719
5252
 
4720
5253
  // Per-URL timeout so a single hung processUrl can't block the batch
4721
- // forever. 120s is well past any legitimate slow page: Cloudflare
4722
- // adaptive max ~25s, nettools overall ~65s, navigation 15s.
5254
+ // forever. 75s sits comfortably above the realistic legit-page ceiling
5255
+ // (nav 35s + Cloudflare adaptive ~25s + interaction ~10s + network-idle
5256
+ // wait ~10s ≈ ~70s), well short of the old 120s safety net. Cuts
5257
+ // hang-recovery time roughly in half when an entire batch's URLs all
5258
+ // hang and we're waiting on this timeout to advance processedUrlCount.
5259
+ const PER_URL_TIMEOUT_MS = 75000;
4723
5260
  const processUrlPromise = processUrl(task.url, task.config, browser);
4724
5261
  let perUrlTimer;
4725
5262
  try {
4726
5263
  return await Promise.race([
4727
5264
  processUrlPromise,
4728
5265
  new Promise((_, reject) => {
4729
- perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (120s)')), 120000);
5266
+ perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (75s)')), PER_URL_TIMEOUT_MS);
4730
5267
  })
4731
5268
  ]);
4732
5269
  } catch (err) {
4733
- if (err && err.message === 'Per-URL timeout (120s)') {
5270
+ if (err && err.message === 'Per-URL timeout (75s)') {
4734
5271
  processUrlPromise.catch(() => {});
4735
5272
  forceRestartFlag = true;
4736
- return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (120s)', needsImmediateRestart: true };
5273
+ return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (75s)', needsImmediateRestart: true };
4737
5274
  }
4738
5275
  throw err;
4739
5276
  } finally {
@@ -4749,21 +5286,29 @@ function setupFrameHandling(page, forceDebug) {
4749
5286
 
4750
5287
  let batchResults;
4751
5288
  try {
5289
+ // Same orphan-promise pattern as the health-check race above: if the
5290
+ // 10-min batch timeout wins, the still-running Promise.all keeps going
5291
+ // until every batchTask settles. Each individual task is already wrapped
5292
+ // in p-limit's error handling so unhandled rejections should not surface,
5293
+ // but the .catch is free belt-and-braces against future refactors that
5294
+ // change task internals.
5295
+ const batchPromise = Promise.all(batchTasks);
5296
+ batchPromise.catch(() => {});
4752
5297
  batchResults = await Promise.race([
4753
- Promise.all(batchTasks),
4754
- new Promise((_, reject) =>
5298
+ batchPromise,
5299
+ new Promise((_, reject) =>
4755
5300
  setTimeout(() => reject(new Error('Batch timeout')), 600000) // 10 min timeout
4756
5301
  )
4757
5302
  ]);
4758
5303
  } catch (timeoutError) {
4759
5304
  if (timeoutError.message.includes('timeout')) {
4760
- console.log(formatLogMessage('error', `[TIMEOUT] Batch hung. Restarting browser.`));
5305
+ console.log(formatLogMessage('error', `${TIMEOUT_TAG} Batch hung. Restarting browser.`));
4761
5306
  try {
4762
5307
  await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false });
5308
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4763
5309
  const timeoutProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
4764
5310
  browser = await createBrowser(timeoutProxyArgs);
4765
5311
  urlsSinceLastCleanup = 0;
4766
- purgeStaleTrackers();
4767
5312
  } catch (restartErr) {
4768
5313
  throw restartErr;
4769
5314
  }
@@ -4800,7 +5345,7 @@ function setupFrameHandling(page, forceDebug) {
4800
5345
 
4801
5346
  // Log completion of concurrent processing
4802
5347
  if (forceDebug) {
4803
- console.log(formatLogMessage('debug', `[CONCURRENCY] Completed ${batchSize} concurrent tasks, ${batchResults.filter(r => r.success).length} successful`));
5348
+ console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Completed ${batchSize} concurrent tasks, ${batchResults.filter(r => r.success).length} successful`));
4804
5349
  }
4805
5350
 
4806
5351
  // Enhanced error reporting for Puppeteer 23.x
@@ -4862,7 +5407,7 @@ function setupFrameHandling(page, forceDebug) {
4862
5407
  if (requestCacheStats.enabled && requestCacheStats.size > 0) {
4863
5408
  const clearedCount = smartCache.clearRequestCache();
4864
5409
  if (forceDebug) {
4865
- console.log(formatLogMessage('debug', `[SmartCache] Cleared ${clearedCount} request cache entries during emergency restart`));
5410
+ console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during emergency restart`));
4866
5411
  }
4867
5412
  }
4868
5413
  }
@@ -4883,17 +5428,23 @@ function setupFrameHandling(page, forceDebug) {
4883
5428
  }
4884
5429
 
4885
5430
  await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true, comprehensiveCleanup: removeTempFiles });
5431
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4886
5432
  // Additional cleanup after emergency restart
4887
5433
  if (removeTempFiles) {
4888
- await cleanupChromeTempFiles({
4889
- includeSnapTemp: true,
5434
+ await cleanupChromeTempFiles({
5435
+ includeSnapTemp: true,
4890
5436
  forceDebug,
4891
- comprehensive: true
5437
+ comprehensive: true
4892
5438
  });
4893
5439
  }
4894
5440
  browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
4895
5441
  urlsSinceLastCleanup = 0; // Reset counter
4896
- purgeStaleTrackers();
5442
+ // Reset the hang-detection flag too: this restart path is triggered
5443
+ // by needsImmediateRestart errors, which the per-URL 75s timeout
5444
+ // sets in lockstep with forceRestartFlag. Without this reset, the
5445
+ // hang-fallback restart below would fire a SECOND back-to-back
5446
+ // browser restart on the same batch boundary.
5447
+ forceRestartFlag = false;
4897
5448
  await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY); // Give browser time to stabilize
4898
5449
  } catch (emergencyRestartErr) {
4899
5450
  if (forceDebug) console.log(formatLogMessage('debug', `Emergency restart failed: ${emergencyRestartErr.message}`));
@@ -4904,9 +5455,9 @@ function setupFrameHandling(page, forceDebug) {
4904
5455
  console.log(`\n${messageColors.fileOp('🔄 Emergency hang detection restart:')} Browser appears hung, forcing restart`);
4905
5456
  try {
4906
5457
  await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true });
5458
+ if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
4907
5459
  browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
4908
5460
  urlsSinceLastCleanup = 0;
4909
- purgeStaleTrackers();
4910
5461
  forceRestartFlag = false; // Reset flag
4911
5462
  await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY);
4912
5463
  if (forceDebug) console.log(formatLogMessage('debug', `Emergency hang detection restart completed`));
@@ -4955,11 +5506,11 @@ function setupFrameHandling(page, forceDebug) {
4955
5506
  if (requestCacheStats.enabled && requestCacheStats.size > 0) {
4956
5507
  const clearedCount = smartCache.clearRequestCache();
4957
5508
  if (!silentMode && clearedCount > 0) {
4958
- console.log(`\n🗑️ Cleared request cache: ${clearedCount} entries after JSON processing`);
5509
+ console.log(`\n${messageColors.cleanup(`🗑️ Cleared request cache: ${clearedCount} entries after JSON processing`)}`);
4959
5510
  }
4960
5511
  if (forceDebug) {
4961
5512
  console.log(formatLogMessage('debug',
4962
- `[SmartCache] Request cache cleared after JSON scan completion (hit rate: ${requestCacheStats.hitRate})`
5513
+ `${SMART_CACHE_TAG} Request cache cleared after JSON scan completion (hit rate: ${requestCacheStats.hitRate})`
4963
5514
  ));
4964
5515
  }
4965
5516
  }
@@ -5031,8 +5582,42 @@ function setupFrameHandling(page, forceDebug) {
5031
5582
  if (cloudflareScanStats.errorPages > 0) {
5032
5583
  console.log(formatLogMessage('debug', `Cloudflare 5xx origin-error pages: ${cloudflareScanStats.errorPages} (no bypass possible — origin unreachable)`));
5033
5584
  }
5034
- if (dnsPrecheckEnabled && dnsPrecheckSkips > 0) {
5035
- console.log(formatLogMessage('debug', `DNS pre-check skipped: ${dnsPrecheckSkips} URL(s) via ${dnsNegativeCache.size} unresolvable host(s)`));
5585
+ if (dnsPrecheckEnabled && (dnsPrecheckSkips > 0 || dnsPositiveSkips > 0)) {
5586
+ // Two skip mechanisms, each with its own counter + unique-host count:
5587
+ // - dnsPrecheckSkips: URLs short-circuited via the NXDOMAIN-cache
5588
+ // (dnsNegativeCache). Unique-host count = dnsNegativeCache.size.
5589
+ // - dnsPositiveSkips: URLs short-circuited via dig/whois cache
5590
+ // proof of resolution (knownResolvedHostnames index in nettools).
5591
+ // Unique-host count = dnsPositiveSkippedHosts.size (this Set is
5592
+ // populated only on actual skip events, not on every Set add in
5593
+ // nettools, so it's a true per-scan visibility metric).
5594
+ const parts = [];
5595
+ if (dnsPrecheckSkips > 0) {
5596
+ parts.push(`${dnsPrecheckSkips} URL(s) via ${dnsNegativeCache.size} unresolvable host(s)`);
5597
+ }
5598
+ if (dnsPositiveSkips > 0) {
5599
+ parts.push(`${dnsPositiveSkips} URL(s) via ${dnsPositiveSkippedHosts.size} resolved host(s)`);
5600
+ }
5601
+ console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
5602
+ }
5603
+ // Blocked-pattern hit stats. Surfaces which patterns are actually
5604
+ // doing work this scan and (by absence) which are stale enough to
5605
+ // prune from config. Top 10 by hit count to keep the log scannable
5606
+ // on configs with dozens of patterns; full counts available via
5607
+ // _blockedPatternHits if needed for tooling. Fires only when at
5608
+ // least one pattern matched -- silent on scans with no blocks.
5609
+ if (_blockedPatternHits.size > 0) {
5610
+ let totalBlocks = 0;
5611
+ for (const n of _blockedPatternHits.values()) totalBlocks += n;
5612
+ console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${_blockedPatternHits.size} pattern(s) hit ${totalBlocks} time(s) total`));
5613
+ const sorted = [..._blockedPatternHits.entries()].sort((a, b) => b[1] - a[1]);
5614
+ const top = sorted.slice(0, 10);
5615
+ for (const [pattern, hits] of top) {
5616
+ console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${hits.toString().padStart(6)} × ${pattern}`));
5617
+ }
5618
+ if (sorted.length > top.length) {
5619
+ console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ... and ${sorted.length - top.length} more pattern(s)`));
5620
+ }
5036
5621
  }
5037
5622
  // Log smart cache statistics (if cache is enabled)
5038
5623
  // Adblock statistics
@@ -5250,7 +5835,6 @@ function setupFrameHandling(page, forceDebug) {
5250
5835
  try { cleanupCloudflareCache(); } catch (_) {}
5251
5836
  try { wgDisconnectAll(forceDebug); } catch (_) {}
5252
5837
  try { ovpnDisconnectAll(forceDebug); } catch (_) {}
5253
- try { purgeStaleTrackers(); } catch (_) {}
5254
5838
  try { await closeAllSocksRelays(forceDebug); } catch (_) {}
5255
5839
 
5256
5840
  // Clean process termination