@fanboynz/network-scanner 3.0.2 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js CHANGED
@@ -12,13 +12,13 @@ const path = require('path');
12
12
  const dnsPromises = require('node:dns/promises');
13
13
  const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
14
14
  const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
15
- const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
16
- const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
15
+ const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
16
+ const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS } = require('./lib/fingerprint');
17
17
  const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
18
18
  // Curl functionality (replace searchstring curl handler)
19
19
  const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
20
20
  // Rule validation
21
- const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
21
+ const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile, normalizeSiteConfig } = require('./lib/validate_rules');
22
22
  // CF Bypass
23
23
  const {
24
24
  handleCloudflareProtection,
@@ -66,7 +66,7 @@ const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
66
66
  // log lines (start/completed). Same cyan as the other monitoring tags.
67
67
  const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
68
68
  // Enhanced mouse interaction and page simulation
69
- const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
69
+ const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
70
70
  // Optional ghost-cursor support for advanced Bezier-based mouse movements
71
71
  const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
72
72
  // Domain detection cache for performance optimization
@@ -129,15 +129,12 @@ const CONCURRENCY_LIMITS = Object.freeze({
129
129
  });
130
130
 
131
131
  // V8 Optimization: Use Map for user agent lookups instead of object
132
- const USER_AGENTS = Object.freeze(new Map([
133
- ['chrome', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
134
- ['chrome_mac', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
135
- ['chrome_linux', "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
136
- ['firefox', "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0"],
137
- ['firefox_mac', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0"],
138
- ['firefox_linux', "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0"],
139
- ['safari', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"]
140
- ]));
132
+ // User-Agent strings come from the single source of truth in lib/fingerprint
133
+ // (USER_AGENT_COLLECTIONS, imported above) — the same map page.setUserAgent
134
+ // applies to the browser. The previous local duplicate had silently drifted
135
+ // (Chrome 146 vs the browser's 148, Firefox 148 vs 151, Safari 18.6 vs 19.5),
136
+ // so curl content-fetches advertised a different browser than the page did.
137
+ // Keep using the imported map directly so the two can never diverge again.
141
138
 
142
139
  const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
143
140
 
@@ -776,13 +773,14 @@ Redirect Handling Options:
776
773
  resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
777
774
  interact: true/false Simulate mouse movements/clicks
778
775
  isBrave: true/false Spoof Brave browser detection
779
- userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Custom desktop User-Agent
776
+ userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Desktop User-Agent (defaults to "chrome" if unset; set false to scan with the raw headless UA)
780
777
  interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium)
781
778
  delay: <milliseconds> Delay after load (default: 6000, capped at 2000ms unless delay_uncapped: true)
782
779
  delay_uncapped: true/false Honor 'delay' up to half the per-URL timeout instead of the 2s default cap. Use for sites with setTimeout-deferred lazy ad/tracker loaders that fire well past the standard post-networkidle window
783
780
  reload: <number> Reload page n times after load (default: 1)
784
781
  forcereload: true/false or ["domain1.com", "domain2.com"] Force cache-clearing reload for all URLs or specific domains
785
782
  clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
783
+ clear_sitedata_full_on_reload: true/false With clear_sitedata: true, also clear heavy storage (IndexedDB, WebSQL, service workers) between reloads — quick mode (cookies+cache+local/session storage) is the default for reloads; this flag promotes them to full clears at ~100-500ms latency cost per reload. Use for sites with IndexedDB/service-worker-backed session caps. Off by default.
786
784
  subDomains: 1/0 Output full subdomains (default: 0)
787
785
  localhost: true/false Force localhost output (127.0.0.1)
788
786
  localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
@@ -1864,15 +1862,65 @@ function setupFrameHandling(page, forceDebug) {
1864
1862
  '--log-level=3', // Fatal errors only (suppresses verbose disk logging)
1865
1863
  '--no-service-autorun', // No background service disk activity
1866
1864
  '--disable-domain-reliability', // No reliability monitor disk writes
1865
+ // Suppress Chrome's auto-update subsystem entirely in headful runs.
1866
+ // --disable-component-update + --disable-background-networking above
1867
+ // stop the network-level check, but Chrome's UI can still show the
1868
+ // "update available" toolbar dot / banner / "relaunch to update"
1869
+ // modal if Chrome has cached state from a prior check by the same
1870
+ // installed chrome binary. These two flags neutralize that:
1871
+ // simulate-outdated-no-au=DATE — the no-auto-update simulation
1872
+ // date is treated as DATE. Far-future date = never shows the
1873
+ // 'outdated' UI. Quotes around the date required by Chrome.
1874
+ // check-for-update-interval=N — seconds between update checks.
1875
+ // 31536000 = 1 year. Even if the above somehow gets bypassed,
1876
+ // the check itself won't fire within any reasonable scan.
1877
+ // Both are no-ops in pure headless modes but matter in --headful
1878
+ // and headless='new' (which can render UI in some cases).
1879
+ '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
1880
+ '--check-for-update-interval=31536000',
1867
1881
  // PERFORMANCE: Disable non-essential Chrome features in a single flag
1868
1882
  // IMPORTANT: Chrome only reads the LAST --disable-features flag, so combine all into one
1869
- // AccountConsistencyMirror + AccountConsistencyDice prevent the
1870
- // Chrome sign-in subsystem from initialising at startup. Combined
1871
- // with --disable-sync + --allow-browser-signin=false below, this
1872
- // suppresses the "Something went wrong when opening your profile"
1873
- // popup that fires in headful + --keep-open mode (temp userDataDir
1874
- // has no real profile, so the sync init errors out and pops up).
1875
- `--disable-features=AudioServiceOutOfProcess,VizDisplayCompositor,TranslateUI,BlinkGenPropertyTrees,Translate,BackForwardCache,AcceptCHFrame,SafeBrowsing,HttpsFirstBalancedModeAutoEnable,site-per-process,PaintHolding,AccountConsistencyMirror,AccountConsistencyDice${disable_ad_tagging ? ',AdTagging' : ''}`,
1883
+ //
1884
+ // Sign-in / profile suppression family (prevents the "Something went
1885
+ // wrong when opening your profile. Please sign out then sign in
1886
+ // again" popup that fires in headful when Chrome's sign-in/sync
1887
+ // subsystem can't make sense of our fresh-each-launch temp
1888
+ // userDataDir):
1889
+ // AccountConsistencyMirror, AccountConsistencyDice
1890
+ // Older Chrome's identity consistency layer. Disabling stops
1891
+ // the sync subsystem from initialising at startup.
1892
+ // ProfilePicker, EnableProfilePicker
1893
+ // Two names for the same Chrome feature (renamed in Chrome
1894
+ // ~120s). Disabling stops the profile-picker dialog that some
1895
+ // Chrome versions display when launching with no recognised
1896
+ // profile. Was the new offender in Chrome 148 for this case.
1897
+ // IdentityConsistency
1898
+ // Chrome's identity-consistency-with-google.com checks. Tries
1899
+ // to read profile credentials at startup; trips the popup if
1900
+ // profile is fresh/empty.
1901
+ // SyncDisabledWithProfilePicker
1902
+ // Sync subsystem variant that activates when profile picker
1903
+ // would otherwise show. Disabling is harmless when picker is
1904
+ // also disabled but covers the gap if a Chrome version honors
1905
+ // only one of the two.
1906
+ // SigninInterceptBubble
1907
+ // Sign-in interception bubble that pops when Chrome detects
1908
+ // 'enterprise' sign-in patterns. Defensive.
1909
+ // Combined with --disable-sync + --allow-browser-signin=false
1910
+ // below + --profile-directory=Default flag (explicit profile name
1911
+ // instead of letting Chrome auto-detect/pick), this should fully
1912
+ // suppress sign-in popups in headful from Chrome 118 through 148+.
1913
+ //
1914
+ // ChromeWhatsNewUI: suppresses the post-update "What's New" page
1915
+ // that auto-opens in a new tab after Chrome installs an update —
1916
+ // not popunder-relevant but visually noisy in headful sessions.
1917
+ `--disable-features=AudioServiceOutOfProcess,VizDisplayCompositor,TranslateUI,BlinkGenPropertyTrees,Translate,BackForwardCache,AcceptCHFrame,SafeBrowsing,HttpsFirstBalancedModeAutoEnable,site-per-process,PaintHolding,AccountConsistencyMirror,AccountConsistencyDice,ProfilePicker,EnableProfilePicker,IdentityConsistency,SyncDisabledWithProfilePicker,SigninInterceptBubble,ChromeWhatsNewUI${disable_ad_tagging ? ',AdTagging' : ''}`,
1918
+ // Explicit profile directory — without this, Chrome may probe for
1919
+ // available profiles at launch and trigger the picker dialog (or
1920
+ // the "something went wrong" popup if no profile is found). With
1921
+ // a fresh temp userDataDir each launch, Chrome will create
1922
+ // 'Default' on its own; explicitly naming it skips the probe.
1923
+ '--profile-directory=Default',
1876
1924
  '--disable-ipc-flooding-protection',
1877
1925
  '--aggressive-cache-discard',
1878
1926
  '--memory-pressure-off',
@@ -1931,7 +1979,20 @@ function setupFrameHandling(page, forceDebug) {
1931
1979
 
1932
1980
 
1933
1981
  const pLimit = (await import('p-limit')).default;
1934
- const limit = pLimit(MAX_CONCURRENT_SITES);
1982
+ // VPN connect/disconnect is per-URL (wgConnect/ovpnConnect at scan start,
1983
+ // wgDisconnect/ovpnDisconnect in the finally) and manipulates the SHARED
1984
+ // system routing table. Interface names are derived from a hash of the VPN
1985
+ // config and connect/disconnect is not refcounted, so two concurrent URLs
1986
+ // that share a VPN config resolve to the same interface and one task's
1987
+ // teardown rips the interface out from under the other mid-scan. Force
1988
+ // serial execution whenever any site uses vpn/openvpn — correctness over
1989
+ // throughput, and VPN scans are network-bound rather than CPU-bound anyway.
1990
+ const vpnInUse = sites.some(site => site.vpn || site.openvpn);
1991
+ const effectiveConcurrency = vpnInUse ? 1 : MAX_CONCURRENT_SITES;
1992
+ if (vpnInUse && MAX_CONCURRENT_SITES > 1 && (forceDebug || !silentMode)) {
1993
+ console.log(formatLogMessage('info', `${VPN_TAG} VPN configured — forcing concurrency 1 (was ${MAX_CONCURRENT_SITES}) to avoid routing-table races`));
1994
+ }
1995
+ const limit = pLimit(effectiveConcurrency);
1935
1996
 
1936
1997
  const perSiteHeadful = sites.some(site => site.headful === true);
1937
1998
  const launchHeadless = !(headfulMode || perSiteHeadful);
@@ -2694,23 +2755,34 @@ function setupFrameHandling(page, forceDebug) {
2694
2755
 
2695
2756
  if (userAgentKey === 'chrome_mac') {
2696
2757
  platform = 'macOS';
2697
- platformVersion = '13.5.0';
2758
+ platformVersion = '13.5.0';
2698
2759
  arch = 'arm';
2699
2760
  } else if (userAgentKey === 'chrome_linux') {
2700
2761
  platform = 'Linux';
2701
2762
  platformVersion = '6.5.0';
2702
2763
  arch = 'x86';
2703
2764
  }
2704
-
2765
+
2766
+ // Derive the Chrome major version from the SAME UA string the
2767
+ // browser actually sends (USER_AGENT_COLLECTIONS, via
2768
+ // page.setUserAgent in applyUserAgentSpoofing) so Sec-CH-UA can
2769
+ // never drift out of sync with navigator.userAgent. The version
2770
+ // used to be hardcoded ('146') while the UA list moved to 148 —
2771
+ // a detector cross-checking UA vs Sec-CH-UA saw the mismatch.
2772
+ // Chrome's UA-reduction means the full version is "<major>.0.0.0".
2773
+ const browserUa = USER_AGENT_COLLECTIONS.get(userAgentKey) || '';
2774
+ const chromeMajor = (browserUa.match(/Chrome\/(\d+)/) || [])[1] || '148';
2775
+ const fullVer = `${chromeMajor}.0.0.0`;
2776
+
2705
2777
  await page.setExtraHTTPHeaders({
2706
- 'Sec-CH-UA': '"Not:A-Brand";v="99", "Google Chrome";v="146", "Chromium";v="146"',
2778
+ 'Sec-CH-UA': `"Not:A-Brand";v="99", "Google Chrome";v="${chromeMajor}", "Chromium";v="${chromeMajor}"`,
2707
2779
  'Sec-CH-UA-Platform': `"${platform}"`,
2708
2780
  'Sec-CH-UA-Platform-Version': `"${platformVersion}"`,
2709
2781
  'Sec-CH-UA-Mobile': '?0',
2710
2782
  'Sec-CH-UA-Arch': `"${arch}"`,
2711
2783
  'Sec-CH-UA-Bitness': '"64"',
2712
- 'Sec-CH-UA-Full-Version': '"146.0.0.0"',
2713
- 'Sec-CH-UA-Full-Version-List': '"Not:A-Brand";v="99.0.0.0", "Google Chrome";v="146.0.0.0", "Chromium";v="146.0.0.0"'
2784
+ 'Sec-CH-UA-Full-Version': `"${fullVer}"`,
2785
+ 'Sec-CH-UA-Full-Version-List': `"Not:A-Brand";v="99.0.0.0", "Google Chrome";v="${fullVer}", "Chromium";v="${fullVer}"`
2714
2786
  });
2715
2787
  }
2716
2788
  } catch (fingerprintErr) {
@@ -2736,7 +2808,7 @@ function setupFrameHandling(page, forceDebug) {
2736
2808
  // Get user agent for curl if needed
2737
2809
  let curlUserAgent = '';
2738
2810
  if (useCurl && siteConfig.userAgent) {
2739
- curlUserAgent = USER_AGENTS.get(siteConfig.userAgent.toLowerCase()) || '';
2811
+ curlUserAgent = USER_AGENT_COLLECTIONS.get(siteConfig.userAgent.toLowerCase()) || '';
2740
2812
  }
2741
2813
 
2742
2814
  if (useCurl && forceDebug) {
@@ -3072,10 +3144,22 @@ function setupFrameHandling(page, forceDebug) {
3072
3144
 
3073
3145
  if (capturePopups && forceDebug) {
3074
3146
  // One-time setup-time warning if the click prerequisite isn't met.
3075
- // Without clicks, capture_popups is a no-op in practice.
3076
- const hasClicks = siteConfig.interact === true && siteConfig.interact_clicks === true;
3077
- if (!hasClicks) {
3078
- console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but interact_clicks is not — popups need user-gesture clicks to fire; expect no captures unless the page opens popups via in-page redirects`));
3147
+ // Without clicks, capture_popups is a no-op in practice. Previous
3148
+ // version blamed `interact_clicks` for both missing-piece cases but
3149
+ // when the actual culprit is `interact: 1` (number, silently disabled
3150
+ // by strict `=== true`), the message misled users into debugging
3151
+ // interact_clicks while the real problem was interact itself.
3152
+ // (normalizeSiteConfig now coerces interact: 1 → true with a warning,
3153
+ // so by the time we get here both should be booleans — but keep the
3154
+ // diagnostic accurate for the truly-missing case.)
3155
+ const interactOn = siteConfig.interact === true;
3156
+ const clicksOn = siteConfig.interact_clicks === true;
3157
+ if (!interactOn && !clicksOn) {
3158
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but neither 'interact' nor 'interact_clicks' is — set BOTH to true to fire user-gesture clicks; without them, only popups opened via in-page redirects will capture`));
3159
+ } else if (!interactOn) {
3160
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no fake clicks fire`));
3161
+ } else if (!clicksOn) {
3162
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact_clicks' is not — set interact_clicks: true to enable element-targeted clicks; without it, only random content-zone clicks fire and may miss overlay-based popunders`));
3079
3163
  }
3080
3164
  console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
3081
3165
  }
@@ -3101,133 +3185,200 @@ function setupFrameHandling(page, forceDebug) {
3101
3185
  // setRequestInterception(true) — page.on('request') fires for every
3102
3186
  // request regardless of interception state, and we don't need to
3103
3187
  // block anything on popups.
3104
- const attachPopupRequestCapture = (popupPage, depth) => {
3105
- popupPage.on('request', (request) => {
3188
+ // Evaluate ANY URL surfaced from a popup (the popup's own navigation URL
3189
+ // OR an in-popup request) against the same filter pipeline the main-page
3190
+ // request handler uses. Factored out so:
3191
+ // 1. attachPopupRequestCapture's `popupPage.on('request', ...)` calls
3192
+ // this once per in-popup request (with the request's resourceType).
3193
+ // 2. onTargetCreated calls this once with `target.url()` and resourceType
3194
+ // 'document' BEFORE attaching the request listener — catches the
3195
+ // popup's navigation URL itself, which fires before our listener can
3196
+ // attach (targetcreated → page resolve → attach is async, and the
3197
+ // browser dispatches the navigation immediately on window.open).
3198
+ // Without #2, popunder destinations whose own URL contains the
3199
+ // filterRegex pattern (e.g. AdsCore campaign URLs with &campaign=)
3200
+ // were seen-but-not-evaluated.
3201
+ const evaluatePopupUrl = (checkedUrl, depth, resourceType) => {
3202
+ try {
3203
+ if (!checkedUrl || checkedUrl === 'about:blank') return;
3204
+ let fullSubdomain = '';
3205
+ let checkedRootDomain = '';
3106
3206
  try {
3107
- const checkedUrl = request.url();
3108
- let fullSubdomain = '';
3109
- let checkedRootDomain = '';
3110
- try {
3111
- const parsedUrl = new URL(checkedUrl);
3112
- fullSubdomain = parsedUrl.hostname;
3113
- const pslResult = psl.parse(fullSubdomain);
3114
- checkedRootDomain = pslResult.domain || fullSubdomain;
3115
- } catch (_) { return; }
3116
- if (!checkedRootDomain) return;
3117
-
3118
- // ignoreDomainsByUrl if any pattern matches this popup URL,
3119
- // mark the root domain as ignored for the rest of the scan
3120
- // (main page + all popups). Mirrors the main handler so a
3121
- // tracker URL surfaced via popup chain has the same dampening
3122
- // effect as one surfaced on the main page.
3123
- if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
3124
- for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
3125
- if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
3126
- _dynamicallyIgnoredDomains.add(checkedRootDomain);
3127
- if (forceDebug) {
3128
- console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3129
- }
3130
- break;
3131
- }
3132
- }
3133
- }
3134
-
3135
- // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
3136
- // above; populating the dynamic block Set from popup URLs lets
3137
- // tracker URLs surfaced via popup chains poison their root
3138
- // domain for the rest of the scan just like main-page hits do.
3139
- if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3140
- for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3141
- if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
3142
- _dynamicallyBlockedDomains.add(checkedRootDomain);
3143
- if (forceDebug) {
3144
- console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3145
- }
3146
- break;
3207
+ const parsedUrl = new URL(checkedUrl);
3208
+ fullSubdomain = parsedUrl.hostname;
3209
+ const pslResult = psl.parse(fullSubdomain);
3210
+ checkedRootDomain = pslResult.domain || fullSubdomain;
3211
+ } catch (_) { return; }
3212
+ if (!checkedRootDomain) return;
3213
+
3214
+ // ignoreDomainsByUrl — if any pattern matches this popup URL,
3215
+ // mark the root domain as ignored for the rest of the scan
3216
+ // (main page + all popups). Mirrors the main handler so a
3217
+ // tracker URL surfaced via popup chain has the same dampening
3218
+ // effect as one surfaced on the main page.
3219
+ if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
3220
+ for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
3221
+ if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
3222
+ _dynamicallyIgnoredDomains.add(checkedRootDomain);
3223
+ if (forceDebug) {
3224
+ console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3147
3225
  }
3226
+ break;
3148
3227
  }
3149
3228
  }
3229
+ }
3150
3230
 
3151
- // ignoreDomains gate (global; matchesIgnoreDomain also short-
3152
- // circuits on _dynamicallyIgnoredDomains, so a domain we just
3153
- // added above will be caught here on the same request).
3154
- if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
3155
-
3156
- // Dynamic-block gate for popup requests early return on
3157
- // matched root or any parent (parent-walk in
3158
- // matchesDynamicBlock). Popups don't have a request object
3159
- // available here, so we just return rather than abort; the
3160
- // popup-request observer treats this as "don't process".
3161
- if (matchesDynamicBlock(checkedRootDomain)) return;
3162
-
3163
- // First-party / third-party gate (popup belongs to the main URL's
3164
- // domain group — its OWN URL doesn't redefine first-party).
3165
- const isFirstParty = firstPartyDomains.has(checkedRootDomain);
3166
- if (siteConfig.firstParty === false && isFirstParty) return;
3167
- if (siteConfig.thirdParty === false && !isFirstParty) return;
3168
-
3169
- // Regex match against the site's filterRegex list
3170
- const resourceType = request.resourceType();
3171
- let regexMatched = false;
3172
- for (const re of regexes) {
3173
- if (re.test(checkedUrl)) {
3174
- regexMatched = true;
3231
+ // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
3232
+ // above; populating the dynamic block Set from popup URLs lets
3233
+ // tracker URLs surfaced via popup chains poison their root
3234
+ // domain for the rest of the scan just like main-page hits do.
3235
+ if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3236
+ for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3237
+ if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
3238
+ _dynamicallyBlockedDomains.add(checkedRootDomain);
3175
3239
  if (forceDebug) {
3176
- console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
3240
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3177
3241
  }
3178
3242
  break;
3179
3243
  }
3180
3244
  }
3245
+ }
3181
3246
 
3182
- if (!regexMatched) return;
3183
-
3184
- // hasNetTools is the same flag the main handler uses (line ~2639).
3185
- // When the site config carries whois/dig terms, regex match is
3186
- // not sufficient by itself — the URL must ALSO pass the whois/
3187
- // dig validation before it counts. Mirrors the main handler's
3188
- // behavior so 'capture popup domains that match regex/dig/whois'
3189
- // means the same thing for popups as for the main page.
3190
- if (hasNetTools) {
3191
- const popupNetToolsHandler = createNetToolsHandler({
3192
- whoisTerms, whoisOrTerms,
3193
- processedWhoisDomains: globalProcessedWhoisDomains,
3194
- processedDigDomains: globalProcessedDigDomains,
3195
- whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
3196
- whoisServer,
3197
- whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
3198
- debugLogFile,
3199
- digTerms, digOrTerms, digRecordType,
3200
- digSubdomain: siteConfig.dig_subdomain === true,
3201
- dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
3202
- matchedDomains, addMatchedDomain,
3203
- isDomainAlreadyDetected: isLocallyDetected,
3204
- onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
3205
- onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
3206
- cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
3207
- cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
3208
- currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
3209
- ignoreDomains, matchesIgnoreDomain
3210
- });
3211
- trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3212
- } else {
3213
- // No nettools required — regex match alone counts.
3214
- addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3247
+ // ignoreDomains gate (global; matchesIgnoreDomain also short-
3248
+ // circuits on _dynamicallyIgnoredDomains, so a domain we just
3249
+ // added above will be caught here on the same request).
3250
+ if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
3251
+
3252
+ // Dynamic-block gate for popup requests — early return on
3253
+ // matched root or any parent (parent-walk in
3254
+ // matchesDynamicBlock). Popups don't have a request object
3255
+ // available here, so we just return rather than abort; the
3256
+ // popup-request observer treats this as "don't process".
3257
+ if (matchesDynamicBlock(checkedRootDomain)) return;
3258
+
3259
+ // First-party / third-party gate (popup belongs to the main URL's
3260
+ // domain group — its OWN URL doesn't redefine first-party).
3261
+ const isFirstParty = firstPartyDomains.has(checkedRootDomain);
3262
+ if (siteConfig.firstParty === false && isFirstParty) return;
3263
+ if (siteConfig.thirdParty === false && !isFirstParty) return;
3264
+
3265
+ // Regex match against the site's filterRegex list
3266
+ let regexMatched = false;
3267
+ for (const re of regexes) {
3268
+ if (re.test(checkedUrl)) {
3269
+ regexMatched = true;
3270
+ if (forceDebug) {
3271
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
3272
+ }
3273
+ break;
3215
3274
  }
3216
- } catch (_) { /* observation-only — never let a popup error escape */ }
3275
+ }
3276
+
3277
+ if (!regexMatched) return;
3278
+
3279
+ // hasNetTools is the same flag the main handler uses (line ~2639).
3280
+ // When the site config carries whois/dig terms, regex match is
3281
+ // not sufficient by itself — the URL must ALSO pass the whois/
3282
+ // dig validation before it counts. Mirrors the main handler's
3283
+ // behavior so 'capture popup domains that match regex/dig/whois'
3284
+ // means the same thing for popups as for the main page.
3285
+ if (hasNetTools) {
3286
+ const popupNetToolsHandler = createNetToolsHandler({
3287
+ whoisTerms, whoisOrTerms,
3288
+ processedWhoisDomains: globalProcessedWhoisDomains,
3289
+ processedDigDomains: globalProcessedDigDomains,
3290
+ whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
3291
+ whoisServer,
3292
+ whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
3293
+ debugLogFile,
3294
+ digTerms, digOrTerms, digRecordType,
3295
+ digSubdomain: siteConfig.dig_subdomain === true,
3296
+ dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
3297
+ matchedDomains, addMatchedDomain,
3298
+ isDomainAlreadyDetected: isLocallyDetected,
3299
+ onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
3300
+ onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
3301
+ cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
3302
+ cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
3303
+ currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
3304
+ ignoreDomains, matchesIgnoreDomain
3305
+ });
3306
+ trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3307
+ } else {
3308
+ // No nettools required — regex match alone counts.
3309
+ addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3310
+ }
3311
+ } catch (_) { /* observation-only — never let a popup error escape */ }
3312
+ };
3313
+
3314
+ // Thin wrapper around evaluatePopupUrl for the per-request listener.
3315
+ // Under forceDebug also attach framenavigated + close listeners so
3316
+ // the popup's full lifecycle (initial nav URL, mid-popup navigations,
3317
+ // close) is visible in logs. Useful when investigating "I saw a
3318
+ // Chrome window flash on screen" — the framenavigated transitions
3319
+ // tell you what URL the window was showing and for how long.
3320
+ const attachPopupRequestCapture = (popupPage, depth) => {
3321
+ popupPage.on('request', (request) => {
3322
+ evaluatePopupUrl(request.url(), depth, request.resourceType());
3217
3323
  });
3324
+ if (forceDebug) {
3325
+ try {
3326
+ popupPage.on('framenavigated', (frame) => {
3327
+ try {
3328
+ if (frame !== popupPage.mainFrame()) return; // main frame only
3329
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] framenavigated → ${frame.url() || 'about:blank'}`));
3330
+ } catch (_) {}
3331
+ });
3332
+ popupPage.on('close', () => {
3333
+ try {
3334
+ const lastUrl = popupPage.url ? popupPage.url() : '(unknown)';
3335
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] close (last URL: ${lastUrl})`));
3336
+ } catch (_) {}
3337
+ });
3338
+ popupPage.on('pageerror', (err) => {
3339
+ try { console.log(formatLogMessage('debug', `[popup depth=${depth}] pageerror: ${err.message}`)); } catch (_) {}
3340
+ });
3341
+ } catch (_) { /* listener attach errors aren't fatal */ }
3342
+ }
3218
3343
  };
3219
3344
 
3220
3345
  const onTargetCreated = async (target) => {
3346
+ // Log EVERY targetcreated event under forceDebug so callers can see
3347
+ // the full set of targets Chromium creates during the scan — not
3348
+ // just the ones we capture. Useful when investigating "is that
3349
+ // Chrome window I saw from a popup or from somewhere else?" — if
3350
+ // a window opens but no targetcreated fires, it's not ours. If a
3351
+ // targetcreated fires for type=page but we skip-and-explain below,
3352
+ // the user knows why we ignored it. Captures the FULL diagnostic
3353
+ // surface, no behavior change.
3354
+ let _tType, _tUrl;
3355
+ if (forceDebug) {
3356
+ try {
3357
+ _tType = target.type();
3358
+ _tUrl = target.url() || 'about:blank';
3359
+ console.log(formatLogMessage('debug', `[popup] targetcreated: type=${_tType} url=${_tUrl}`));
3360
+ } catch (_) {}
3361
+ }
3362
+
3221
3363
  // Short-circuit guard: if finally has already started, don't attach
3222
3364
  // a request listener whose closure would outlive its meaningful
3223
3365
  // scope. The race is narrow (a targetcreated firing while we're
3224
3366
  // mid-await on target.page() across the finally boundary), but
3225
3367
  // without this guard a late popup could push matches into
3226
3368
  // matchedDomains for a URL whose processing has already returned.
3227
- if (urlFinished) return;
3228
- if (target.type() !== 'page') return;
3369
+ if (urlFinished) {
3370
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: urlFinished=true (scan teardown in progress)`));
3371
+ return;
3372
+ }
3373
+ if (target.type() !== 'page') {
3374
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: non-page target type=${target.type()} (workers/service-workers/etc are not popunder candidates)`));
3375
+ return;
3376
+ }
3229
3377
  const depth = getPopupDepth(target);
3230
- if (depth < 1) return; // Not one of ours
3378
+ if (depth < 1) {
3379
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: depth=0 — target not in opener chain of main page (likely a new browser tab opened independently, not a popunder from our scan)`));
3380
+ return; // Not one of ours
3381
+ }
3231
3382
  if (depth > POPUP_MAX_DEPTH) {
3232
3383
  if (forceDebug) {
3233
3384
  console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
@@ -3237,7 +3388,10 @@ function setupFrameHandling(page, forceDebug) {
3237
3388
 
3238
3389
  let popupPage;
3239
3390
  try { popupPage = await target.page(); } catch (_) { return; }
3240
- if (!popupPage) return;
3391
+ if (!popupPage) {
3392
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] target.page() returned null — popup not accessible as a Page object`));
3393
+ return;
3394
+ }
3241
3395
  // Re-check after the await — the per-URL finally may have flipped
3242
3396
  // the flag while target.page() was resolving.
3243
3397
  if (urlFinished) {
@@ -3247,8 +3401,31 @@ function setupFrameHandling(page, forceDebug) {
3247
3401
 
3248
3402
  if (forceDebug) {
3249
3403
  console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
3404
+ // Window dimensions are useful for the "is the popup visible on
3405
+ // my screen?" question — a popup with non-zero viewport in a
3406
+ // headless=new launch shouldn't be visible but on some display
3407
+ // servers (WSLg, X11) it can briefly flash on screen. Log the
3408
+ // viewport so callers can correlate with what they saw.
3409
+ try {
3410
+ const vp = popupPage.viewport();
3411
+ if (vp) console.log(formatLogMessage('debug', `[popup depth=${depth}] viewport: ${vp.width}x${vp.height}`));
3412
+ } catch (_) {}
3250
3413
  }
3251
3414
 
3415
+ // Evaluate the popup's own navigation URL against the same filter
3416
+ // pipeline used for in-popup requests. Required because targetcreated
3417
+ // → target.page() → on('request', ...) is async, and the browser
3418
+ // dispatches the popup's navigation request immediately on window.open
3419
+ // — by the time the listener attaches below, the navigation request
3420
+ // has already fired and won't be re-emitted. resourceType 'document'
3421
+ // mirrors what Chrome would emit for a top-level navigation request.
3422
+ // Without this call, AdsCore-style popunder destinations (URL contains
3423
+ // &campaign=, &v=, etc) were seen-but-not-evaluated: the popup was
3424
+ // logged but its domain never matched the filter regex, so it never
3425
+ // became a rule. Only secondary in-popup requests (tracking pixels,
3426
+ // sub-resources) ever got tested against the regex.
3427
+ evaluatePopupUrl(target.url(), depth, 'document');
3428
+
3252
3429
  attachPopupRequestCapture(popupPage, depth);
3253
3430
 
3254
3431
  // Auto-close after the capture window so popups don't pile up.
@@ -4322,7 +4499,26 @@ function setupFrameHandling(page, forceDebug) {
4322
4499
 
4323
4500
  // Mark page as processing during interactions
4324
4501
  updatePageUsage(page, true);
4325
- const INTERACTION_HARD_TIMEOUT = 15000;
4502
+ // Work-aware ceiling (scales with click count / realistic_click /
4503
+ // intensity) instead of a flat 15s, which truncated high-click
4504
+ // popunder configs mid-pass. Single source of truth shared with
4505
+ // interaction.js's own internal hard cap so the two can't disagree.
4506
+ const INTERACTION_HARD_TIMEOUT = computeInteractionCeilingMs(interactionConfig);
4507
+
4508
+ // Capture-and-clear timer wrapper — same fix as cdp.js (0772ccd) and
4509
+ // the per-URL grace (577ad66). The 3 inline Promise.race patterns
4510
+ // below previously used `new Promise((_, reject) => setTimeout(...))`
4511
+ // without capturing the timer ID, leaking the 15s timer + closure on
4512
+ // reject every time interaction completed inside the cap (the common
4513
+ // case). Centralizing avoids the same mistake recurring across the
4514
+ // ghost-cursor / fallback / standard branches.
4515
+ const raceWithTimer = (promise, msg) => {
4516
+ let t;
4517
+ return Promise.race([
4518
+ promise,
4519
+ new Promise((_, reject) => { t = setTimeout(() => reject(new Error(msg)), INTERACTION_HARD_TIMEOUT); })
4520
+ ]).finally(() => clearTimeout(t));
4521
+ };
4326
4522
 
4327
4523
  // Check if ghost-cursor mode is enabled for this site
4328
4524
  const ghostConfig = resolveGhostCursorConfig(siteConfig, globalGhostCursor, forceDebug);
@@ -4333,60 +4529,51 @@ function setupFrameHandling(page, forceDebug) {
4333
4529
  if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
4334
4530
  const cursor = createGhostCursor(page, { forceDebug });
4335
4531
  if (cursor) {
4336
- await Promise.race([
4337
- (async () => {
4338
- const viewport = page.viewport() || { width: 1200, height: 800 };
4339
- const ghostDuration = ghostConfig.duration || 2000;
4340
- const ghostStart = Date.now();
4341
- const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
4342
-
4343
- // Time-based Bezier mouse movements — runs for ghostDuration ms
4344
- while (ghostTimeLeft() > 200) {
4345
- const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
4346
- const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
4347
- await ghostMove(cursor, toX, toY, {
4348
- moveSpeed: ghostConfig.moveSpeed,
4349
- overshootThreshold: ghostConfig.overshootThreshold,
4350
- forceDebug
4351
- });
4352
- if (ghostTimeLeft() > 100) {
4353
- await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
4354
- }
4355
- }
4356
- if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
4357
- await ghostRandomMove(cursor, { forceDebug });
4358
- }
4359
- if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
4360
- const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
4361
- const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
4362
- await ghostClick(cursor, { x: clickX, y: clickY }, {
4363
- hesitate: ghostConfig.hesitate,
4364
- forceDebug
4365
- });
4532
+ await raceWithTimer((async () => {
4533
+ const viewport = page.viewport() || { width: 1200, height: 800 };
4534
+ const ghostDuration = ghostConfig.duration || 2000;
4535
+ const ghostStart = Date.now();
4536
+ const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
4537
+
4538
+ // Time-based Bezier mouse movements — runs for ghostDuration ms
4539
+ while (ghostTimeLeft() > 200) {
4540
+ const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
4541
+ const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
4542
+ await ghostMove(cursor, toX, toY, {
4543
+ moveSpeed: ghostConfig.moveSpeed,
4544
+ overshootThreshold: ghostConfig.overshootThreshold,
4545
+ forceDebug
4546
+ });
4547
+ if (ghostTimeLeft() > 100) {
4548
+ await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
4366
4549
  }
4367
- if (interactionConfig.includeScrolling) {
4368
- await performPageInteraction(page, currentUrl, {
4369
- ...interactionConfig,
4370
- mouseMovements: 0,
4371
- includeElementClicks: false
4372
- }, forceDebug);
4373
- }
4374
- })(),
4375
- new Promise((_, reject) => setTimeout(() => reject(new Error('ghost-cursor interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
4376
- ]);
4550
+ }
4551
+ if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
4552
+ await ghostRandomMove(cursor, { forceDebug });
4553
+ }
4554
+ if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
4555
+ const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
4556
+ const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
4557
+ await ghostClick(cursor, { x: clickX, y: clickY }, {
4558
+ hesitate: ghostConfig.hesitate,
4559
+ forceDebug
4560
+ });
4561
+ }
4562
+ if (interactionConfig.includeScrolling) {
4563
+ await performPageInteraction(page, currentUrl, {
4564
+ ...interactionConfig,
4565
+ mouseMovements: 0,
4566
+ includeElementClicks: false
4567
+ }, forceDebug);
4568
+ }
4569
+ })(), 'ghost-cursor interaction hard timeout');
4377
4570
  } else {
4378
4571
  if (forceDebug) console.log(formatLogMessage('debug', '[ghost-cursor] Falling back to built-in mouse'));
4379
- await Promise.race([
4380
- performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
4381
- new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
4382
- ]);
4572
+ await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
4383
4573
  }
4384
4574
  } else {
4385
4575
  // Standard built-in mouse interaction
4386
- await Promise.race([
4387
- performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
4388
- new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
4389
- ]);
4576
+ await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
4390
4577
  }
4391
4578
  } catch (interactTimeoutErr) {
4392
4579
  if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
@@ -4521,8 +4708,16 @@ function setupFrameHandling(page, forceDebug) {
4521
4708
 
4522
4709
  if (siteConfig.clear_sitedata === true) {
4523
4710
  try {
4524
- const clearResult = await clearSiteData(page, currentUrl, forceDebug, true); // Quick mode for reloads
4525
- if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i} for ${currentUrl}`));
4711
+ // Default reload clear is quick mode (cookies + cache +
4712
+ // localStorage + sessionStorage — the storage layers where
4713
+ // session-cap tracking typically lives). Sites that put their
4714
+ // session cap in IndexedDB / WebSQL / service workers can opt
4715
+ // into a full clear-per-reload via clear_sitedata_full_on_reload.
4716
+ // Costs ~100-500ms extra per reload and may unregister a
4717
+ // service worker the page depends on; off by default.
4718
+ const fullOnReload = siteConfig.clear_sitedata_full_on_reload === true;
4719
+ const clearResult = await clearSiteData(page, currentUrl, forceDebug, !fullOnReload);
4720
+ if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data (${fullOnReload ? 'full' : 'quick'}) before reload #${i} for ${currentUrl}`));
4526
4721
  } catch (reloadClearErr) {
4527
4722
  if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
4528
4723
  }
@@ -4536,20 +4731,26 @@ function setupFrameHandling(page, forceDebug) {
4536
4731
  if (useForceReload && !reloadSuccess && !skipForceReload) {
4537
4732
  // Attempt force reload: disable cache, reload, re-enable cache
4538
4733
  try {
4734
+ // Local race-with-timer helper — capture-and-clear pattern from
4735
+ // cdp.js / interact (6ad36e7). Without this, every successful
4736
+ // setCacheEnabled() left an 8s setTimeout running with closure
4737
+ // on `reject` (2 leaks per reload cycle × N reload cycles).
4738
+ const raceWithTimer = (promise, msg, ms) => {
4739
+ let t;
4740
+ return Promise.race([
4741
+ promise,
4742
+ new Promise((_, reject) => { t = setTimeout(() => reject(new Error(msg)), ms); })
4743
+ ]).finally(() => clearTimeout(t));
4744
+ };
4745
+
4539
4746
  // Timeout-protected cache disable
4540
- await Promise.race([
4541
- page.setCacheEnabled(false),
4542
- new Promise((_, reject) => setTimeout(() => reject(new Error('Cache disable timeout')), 8000))
4543
- ]);
4544
-
4747
+ await raceWithTimer(page.setCacheEnabled(false), 'Cache disable timeout', 8000);
4748
+
4545
4749
  // Use networkidle2 for force reload to better detect when page is actually loaded
4546
4750
  await page.reload({ waitUntil: 'networkidle2', timeout: Math.min(timeout, 15000) });
4547
-
4751
+
4548
4752
  // Timeout-protected cache enable
4549
- await Promise.race([
4550
- page.setCacheEnabled(true),
4551
- new Promise((_, reject) => setTimeout(() => reject(new Error('Cache enable timeout')), 8000))
4552
- ]);
4753
+ await raceWithTimer(page.setCacheEnabled(true), 'Cache enable timeout', 8000);
4553
4754
 
4554
4755
  reloadSuccess = true;
4555
4756
  if (forceDebug) console.log(formatLogMessage('debug', `Force reload #${i} completed for ${currentUrl}`));
@@ -4644,8 +4845,21 @@ function setupFrameHandling(page, forceDebug) {
4644
4845
  const endY = 200 + Math.floor(Math.random() * (vp.height - 400));
4645
4846
  await humanLikeMouseMove(page, startX, startY, endX, endY, { steps: 3, curve: 0.04, jitter: 1 });
4646
4847
  }
4647
- // Content clicks to trigger document-level onclick handlers
4648
- await performContentClicks(page, { clicks: 2, preDelay: 200, forceDebug });
4848
+ // Content clicks to trigger document-level onclick handlers.
4849
+ // Honor siteConfig.interact_click_count so popunder-discovery configs
4850
+ // get the same click volume on every reload, not just the initial load.
4851
+ // Omit `clicks` when no override is set so performContentClicks uses
4852
+ // its CONTENT_CLICK.CLICK_COUNT default (single source of truth).
4853
+ // realistic forwards siteConfig.realistic_click; always passed
4854
+ // (defaults to false) so realistic mode applies to every reload's
4855
+ // clicks, not just the initial pass.
4856
+ const postReloadClickOpts = {
4857
+ preDelay: 200,
4858
+ forceDebug,
4859
+ realistic: !!interactionConfig.realistic
4860
+ };
4861
+ if (interactionConfig.clickCount) postReloadClickOpts.clicks = interactionConfig.clickCount;
4862
+ await performContentClicks(page, postReloadClickOpts);
4649
4863
  if (forceDebug) console.log(formatLogMessage('debug', `Post-reload interaction completed for reload #${i}`));
4650
4864
  } catch (postReloadInteractErr) {
4651
4865
  // Non-critical — continue with remaining reloads
@@ -4870,9 +5084,21 @@ function setupFrameHandling(page, forceDebug) {
4870
5084
  }
4871
5085
  }
4872
5086
 
4873
- // Temporarily store the pLimit function
5087
+ // Temporarily store the pLimit function
4874
5088
  const originalLimit = limit;
4875
5089
 
5090
+ // Per-site config normalization (always runs, not gated on --validate-config).
5091
+ // Catches typo'd keys (whois_terms vs whois) with "did you mean" suggestions
5092
+ // and coerces boolean-like values (interact: 1 → interact: true) before any
5093
+ // downstream strict-equality check silently treats them as disabled. Mutates
5094
+ // each site in place so the rest of the scan sees normalized values.
5095
+ // Reports via console.warn so messages surface even when --silent is set.
5096
+ for (let i = 0; i < sites.length; i++) {
5097
+ const { warnings, errors } = normalizeSiteConfig(sites[i], i);
5098
+ for (const e of errors) console.warn(messageColors.error('⚠ ' + e));
5099
+ for (const w of warnings) console.warn(messageColors.warn('⚠ [config] ' + w));
5100
+ }
5101
+
4876
5102
  // V8 Optimization: Calculate total URLs first to pre-allocate array
4877
5103
  let totalUrls = 0;
4878
5104
  for (const site of sites) {
@@ -4890,7 +5116,17 @@ function setupFrameHandling(page, forceDebug) {
4890
5116
  for (const url of urlsToProcess) {
4891
5117
  allTasks[taskIndex++] = {
4892
5118
  url,
4893
- config: { ...site, _originalUrl: url }, // Preserve original URL for CDP domain checking
5119
+ // Default userAgent to 'chrome' when a site doesn't set one. Without
5120
+ // it the browser sends its bundled default UA, which literally
5121
+ // contains "HeadlessChrome" (verified, both headless modes) — an
5122
+ // instant automation tell. Defaulting here (rather than at launch)
5123
+ // activates the whole coherent path, since UA-string spoofing, the
5124
+ // navigator/webdriver/plugins/userAgentData JS masking, the Sec-CH-UA
5125
+ // request headers, and the curl content-fetch UA all gate on
5126
+ // config.userAgent. Placing 'chrome' BEFORE the spread means an
5127
+ // explicit site value wins — including userAgent:false / null to opt
5128
+ // out and scan with the raw headless UA.
5129
+ config: { userAgent: 'chrome', ...site, _originalUrl: url },
4894
5130
  taskId: taskIndex - 1 // For tracking
4895
5131
  };
4896
5132
  }
@@ -4923,7 +5159,7 @@ function setupFrameHandling(page, forceDebug) {
4923
5159
  let urlsSinceLastCleanup = 0;
4924
5160
 
4925
5161
  if (!silentMode && totalUrls > 0) {
4926
- console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${MAX_CONCURRENT_SITES}...`);
5162
+ console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${effectiveConcurrency}...`);
4927
5163
  if (totalUrls > RESOURCE_CLEANUP_INTERVAL) {
4928
5164
  console.log(messageColors.processing('Browser will restart every') + ` ~${RESOURCE_CLEANUP_INTERVAL} URLs to free resources`);
4929
5165
  }
@@ -5044,10 +5280,18 @@ function setupFrameHandling(page, forceDebug) {
5044
5280
  silentMode
5045
5281
  });
5046
5282
  healthPromise.catch(() => {});
5047
- healthCheck = await Promise.race([
5048
- healthPromise,
5049
- new Promise((_, reject) => setTimeout(() => reject(new Error('Health check timeout')), 30000))
5050
- ]);
5283
+ // Capture-and-clear timer pattern (cdp.js 0772ccd, interact 6ad36e7) —
5284
+ // when healthPromise wins the race, the inline setTimeout would
5285
+ // otherwise hold reject's closure for the full 30s timeout window.
5286
+ let healthTimer;
5287
+ try {
5288
+ healthCheck = await Promise.race([
5289
+ healthPromise,
5290
+ new Promise((_, reject) => { healthTimer = setTimeout(() => reject(new Error('Health check timeout')), 30000); })
5291
+ ]);
5292
+ } finally {
5293
+ if (healthTimer) clearTimeout(healthTimer);
5294
+ }
5051
5295
  } catch (healthError) {
5052
5296
  console.log(formatLogMessage('warn', `[HEALTH CHECK] Timeout, assuming restart needed`));
5053
5297
  healthCheck = { shouldRestart: true, reason: 'Health check timeout' };
@@ -5312,26 +5556,94 @@ function setupFrameHandling(page, forceDebug) {
5312
5556
  } catch {}
5313
5557
 
5314
5558
  // Per-URL timeout so a single hung processUrl can't block the batch
5315
- // forever. 75s sits comfortably above the realistic legit-page ceiling
5316
- // (nav 35s + Cloudflare adaptive ~25s + interaction ~10s + network-idle
5317
- // wait ~10s ≈ ~70s), well short of the old 120s safety net. Cuts
5318
- // hang-recovery time roughly in half when an entire batch's URLs all
5319
- // hang and we're waiting on this timeout to advance processedUrlCount.
5320
- const PER_URL_TIMEOUT_MS = 75000;
5559
+ // forever. Scaled from siteConfig.timeout + (delay + interaction) ×
5560
+ // (1 + reload) + 30s headroom, with a 75s floor.
5561
+ //
5562
+ // The (1 + reload) multiplier was missing from the previous formula
5563
+ // (13dd4fa) — `reload: 4` configs perform 5 total cycles (initial +
5564
+ // 4 reloads), each with its own delay + interaction overhead, so the
5565
+ // 80s ceiling for the user's lean config (timeout:35000, delay:15000,
5566
+ // reload:4) fired DURING the 3rd reload while the orphan still had
5567
+ // 2 more cycles + drain to go — far longer than the 8s grace could
5568
+ // bridge. Multiplying by cycle count brings the ceiling above the
5569
+ // legitimate work envelope.
5570
+ const reloadCount = task.config.reload || 0;
5571
+ // Interaction overhead per cycle must match interaction.js's actual
5572
+ // ceiling, which is now work-aware (high interact_click_count /
5573
+ // realistic_click configs legitimately run far longer than the old flat
5574
+ // 15s). Compute the same value here so the per-URL ceiling stays above
5575
+ // the real interaction envelope and can't fire mid-pass. Zero when
5576
+ // interaction is disabled for this task (no interaction cost to budget).
5577
+ const interactionOnForTask = task.config.interact === true && !disableInteract;
5578
+ const INTERACTION_OVERHEAD_MS = interactionOnForTask
5579
+ ? computeInteractionCeilingMs(createInteractionConfig(task.url, task.config))
5580
+ : 0;
5581
+ const PER_URL_TIMEOUT_MS = Math.max(
5582
+ 75000,
5583
+ (task.config.timeout || 35000)
5584
+ + ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
5585
+ + 30000
5586
+ );
5587
+ // Grace period after primary timeout — gives the orphan a chance to
5588
+ // finish drainPendingNetTools() and emit "Saving N rules despite page
5589
+ // load failure" before we abandon its result. Drain typically completes
5590
+ // in <1s with cached nettools; 8s is the safety ceiling.
5591
+ const PER_URL_GRACE_MS = 8000;
5592
+ const PER_URL_TIMEOUT_MARKER = 'PER_URL_TIMEOUT_FIRED';
5593
+
5321
5594
  const processUrlPromise = processUrl(task.url, task.config, browser);
5322
5595
  let perUrlTimer;
5323
5596
  try {
5324
5597
  return await Promise.race([
5325
5598
  processUrlPromise,
5326
5599
  new Promise((_, reject) => {
5327
- perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (75s)')), PER_URL_TIMEOUT_MS);
5600
+ perUrlTimer = setTimeout(() => {
5601
+ const e = new Error(`Per-URL timeout (${Math.round(PER_URL_TIMEOUT_MS / 1000)}s)`);
5602
+ e.code = PER_URL_TIMEOUT_MARKER;
5603
+ reject(e);
5604
+ }, PER_URL_TIMEOUT_MS);
5328
5605
  })
5329
5606
  ]);
5330
5607
  } catch (err) {
5331
- if (err && err.message === 'Per-URL timeout (75s)') {
5332
- processUrlPromise.catch(() => {});
5608
+ if (err && err.code === PER_URL_TIMEOUT_MARKER) {
5333
5609
  forceRestartFlag = true;
5334
- return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (75s)', needsImmediateRestart: true };
5610
+ // Log the timeout fire — it was invisible before; it only ended up in the
5611
+ // returned result.error field which is never printed. Makes
5612
+ // ceiling-tuning regressions visible without source-reading.
5613
+ if (forceDebug) {
5614
+ console.log(formatLogMessage('warn', `${err.message} for ${task.url} — orphan in ${PER_URL_GRACE_MS / 1000}s grace`));
5615
+ }
5616
+ // Grace period — wait briefly for the orphan to drain + recover
5617
+ // partial matches. Browser is still in a bad state (we hit the
5618
+ // primary ceiling) so the restart still fires either way; only the
5619
+ // rules payload differs.
5620
+ let graceTimer;
5621
+ try {
5622
+ const graceResult = await Promise.race([
5623
+ processUrlPromise,
5624
+ new Promise((_, reject) => {
5625
+ // Capture the timer ID so the finally can clear it when the
5626
+ // orphan wins the race — otherwise the setTimeout keeps the
5627
+ // event loop ref + closure on `reject` alive for the full
5628
+ // grace window, even though the race already settled.
5629
+ // Same leak pattern fixed in cdp.js (0772ccd) and
5630
+ // clear_sitedata (780b443).
5631
+ graceTimer = setTimeout(() => reject(new Error('Grace timeout')), PER_URL_GRACE_MS);
5632
+ })
5633
+ ]);
5634
+ if (forceDebug) {
5635
+ console.log(formatLogMessage('debug', `Grace recovered ${(graceResult && graceResult.rules ? graceResult.rules.length : 0)} rules for ${task.url}`));
5636
+ }
5637
+ return { ...graceResult, needsImmediateRestart: true };
5638
+ } catch (_) {
5639
+ if (forceDebug) {
5640
+ console.log(formatLogMessage('warn', `Grace timed out for ${task.url} — discarding orphan`));
5641
+ }
5642
+ processUrlPromise.catch(() => {});
5643
+ return { url: task.url, rules: [], success: false, error: err.message, needsImmediateRestart: true };
5644
+ } finally {
5645
+ if (graceTimer) clearTimeout(graceTimer);
5646
+ }
5335
5647
  }
5336
5648
  throw err;
5337
5649
  } finally {