@fanboynz/network-scanner 3.0.3 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js CHANGED
@@ -12,13 +12,13 @@ const path = require('path');
12
12
  const dnsPromises = require('node:dns/promises');
13
13
  const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
14
14
  const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
15
- const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
16
- const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
15
+ const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
16
+ const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROME_GREASE_BRAND } = require('./lib/fingerprint');
17
17
  const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
18
18
  // Curl functionality (replace searchstring curl handler)
19
19
  const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
20
20
  // Rule validation
21
- const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
21
+ const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile, normalizeSiteConfig } = require('./lib/validate_rules');
22
22
  // CF Bypass
23
23
  const {
24
24
  handleCloudflareProtection,
@@ -66,7 +66,7 @@ const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
66
66
  // log lines (start/completed). Same cyan as the other monitoring tags.
67
67
  const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
68
68
  // Enhanced mouse interaction and page simulation
69
- const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
69
+ const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
70
70
  // Optional ghost-cursor support for advanced Bezier-based mouse movements
71
71
  const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
72
72
  // Domain detection cache for performance optimization
@@ -129,15 +129,12 @@ const CONCURRENCY_LIMITS = Object.freeze({
129
129
  });
130
130
 
131
131
  // V8 Optimization: Use Map for user agent lookups instead of object
132
- const USER_AGENTS = Object.freeze(new Map([
133
- ['chrome', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
134
- ['chrome_mac', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
135
- ['chrome_linux', "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
136
- ['firefox', "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0"],
137
- ['firefox_mac', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0"],
138
- ['firefox_linux', "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0"],
139
- ['safari', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"]
140
- ]));
132
+ // User-Agent strings come from the single source of truth in lib/fingerprint
133
+ // (USER_AGENT_COLLECTIONS, imported above) — the same map page.setUserAgent
134
+ // applies to the browser. The previous local duplicate had silently drifted
135
+ // (Chrome 146 vs the browser's 148, Firefox 148 vs 151, Safari 18.6 vs 19.5),
136
+ // so curl content-fetches advertised a different browser than the page did.
137
+ // Keep using the imported map directly so the two can never diverge again.
141
138
 
142
139
  const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
143
140
 
@@ -776,13 +773,14 @@ Redirect Handling Options:
776
773
  resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
777
774
  interact: true/false Simulate mouse movements/clicks
778
775
  isBrave: true/false Spoof Brave browser detection
779
- userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Custom desktop User-Agent
776
+ userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Desktop User-Agent (defaults to "chrome" if unset; set false to scan with the raw headless UA)
780
777
  interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium)
781
778
  delay: <milliseconds> Delay after load (default: 6000, capped at 2000ms unless delay_uncapped: true)
782
779
  delay_uncapped: true/false Honor 'delay' up to half the per-URL timeout instead of the 2s default cap. Use for sites with setTimeout-deferred lazy ad/tracker loaders that fire well past the standard post-networkidle window
783
780
  reload: <number> Reload page n times after load (default: 1)
784
781
  forcereload: true/false or ["domain1.com", "domain2.com"] Force cache-clearing reload for all URLs or specific domains
785
782
  clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
783
+ clear_sitedata_full_on_reload: true/false With clear_sitedata: true, also clear heavy storage (IndexedDB, WebSQL, service workers) between reloads — quick mode (cookies+cache+local/session storage) is the default for reloads; this flag promotes them to full clears at ~100-500ms latency cost per reload. Use for sites with IndexedDB/service-worker-backed session caps. Off by default.
786
784
  subDomains: 1/0 Output full subdomains (default: 0)
787
785
  localhost: true/false Force localhost output (127.0.0.1)
788
786
  localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
@@ -1864,15 +1862,65 @@ function setupFrameHandling(page, forceDebug) {
1864
1862
  '--log-level=3', // Fatal errors only (suppresses verbose disk logging)
1865
1863
  '--no-service-autorun', // No background service disk activity
1866
1864
  '--disable-domain-reliability', // No reliability monitor disk writes
1865
+ // Suppress Chrome's auto-update subsystem entirely in headful runs.
1866
+ // --disable-component-update + --disable-background-networking above
1867
+ // stop the network-level check, but Chrome's UI can still show the
1868
+ // "update available" toolbar dot / banner / "relaunch to update"
1869
+ // modal if Chrome has cached state from a prior check by the same
1870
+ // installed chrome binary. These two flags neutralize that:
1871
+ // simulate-outdated-no-au=DATE — the no-auto-update simulation
1872
+ // date is treated as DATE. Far-future date = never shows the
1873
+ // 'outdated' UI. Quotes around the date required by Chrome.
1874
+ // check-for-update-interval=N — seconds between update checks.
1875
+ // 31536000 = 1 year. Even if the above somehow gets bypassed,
1876
+ // the check itself won't fire within any reasonable scan.
1877
+ // Both are no-ops in pure headless modes but matter in --headful
1878
+ // and headless='new' (which can render UI in some cases).
1879
+ '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
1880
+ '--check-for-update-interval=31536000',
1867
1881
  // PERFORMANCE: Disable non-essential Chrome features in a single flag
1868
1882
  // IMPORTANT: Chrome only reads the LAST --disable-features flag, so combine all into one
1869
- // AccountConsistencyMirror + AccountConsistencyDice prevent the
1870
- // Chrome sign-in subsystem from initialising at startup. Combined
1871
- // with --disable-sync + --allow-browser-signin=false below, this
1872
- // suppresses the "Something went wrong when opening your profile"
1873
- // popup that fires in headful + --keep-open mode (temp userDataDir
1874
- // has no real profile, so the sync init errors out and pops up).
1875
- `--disable-features=AudioServiceOutOfProcess,VizDisplayCompositor,TranslateUI,BlinkGenPropertyTrees,Translate,BackForwardCache,AcceptCHFrame,SafeBrowsing,HttpsFirstBalancedModeAutoEnable,site-per-process,PaintHolding,AccountConsistencyMirror,AccountConsistencyDice${disable_ad_tagging ? ',AdTagging' : ''}`,
1883
+ //
1884
+ // Sign-in / profile suppression family (prevents the "Something went
1885
+ // wrong when opening your profile. Please sign out then sign in
1886
+ // again" popup that fires in headful when Chrome's sign-in/sync
1887
+ // subsystem can't make sense of our fresh-each-launch temp
1888
+ // userDataDir):
1889
+ // AccountConsistencyMirror, AccountConsistencyDice
1890
+ // Older Chrome's identity consistency layer. Disabling stops
1891
+ // the sync subsystem from initialising at startup.
1892
+ // ProfilePicker, EnableProfilePicker
1893
+ // Two names for the same Chrome feature (renamed in Chrome
1894
+ // ~120s). Disabling stops the profile-picker dialog that some
1895
+ // Chrome versions display when launching with no recognised
1896
+ // profile. Was the new offender in Chrome 148 for this case.
1897
+ // IdentityConsistency
1898
+ // Chrome's identity-consistency-with-google.com checks. Tries
1899
+ // to read profile credentials at startup; trips the popup if
1900
+ // profile is fresh/empty.
1901
+ // SyncDisabledWithProfilePicker
1902
+ // Sync subsystem variant that activates when profile picker
1903
+ // would otherwise show. Disabling is harmless when picker is
1904
+ // also disabled but covers the gap if a Chrome version honors
1905
+ // only one of the two.
1906
+ // SigninInterceptBubble
1907
+ // Sign-in interception bubble that pops when Chrome detects
1908
+ // 'enterprise' sign-in patterns. Defensive.
1909
+ // Combined with --disable-sync + --allow-browser-signin=false
1910
+ // below + --profile-directory=Default flag (explicit profile name
1911
+ // instead of letting Chrome auto-detect/pick), this should fully
1912
+ // suppress sign-in popups in headful from Chrome 118 through 148+.
1913
+ //
1914
+ // ChromeWhatsNewUI: suppresses the post-update "What's New" page
1915
+ // that auto-opens in a new tab after Chrome installs an update —
1916
+ // not popunder-relevant but visually noisy in headful sessions.
1917
+ `--disable-features=AudioServiceOutOfProcess,VizDisplayCompositor,TranslateUI,BlinkGenPropertyTrees,Translate,BackForwardCache,AcceptCHFrame,SafeBrowsing,HttpsFirstBalancedModeAutoEnable,site-per-process,PaintHolding,AccountConsistencyMirror,AccountConsistencyDice,ProfilePicker,EnableProfilePicker,IdentityConsistency,SyncDisabledWithProfilePicker,SigninInterceptBubble,ChromeWhatsNewUI${disable_ad_tagging ? ',AdTagging' : ''}`,
1918
+ // Explicit profile directory — without this, Chrome may probe for
1919
+ // available profiles at launch and trigger the picker dialog (or
1920
+ // the "something went wrong" popup if no profile is found). With
1921
+ // a fresh temp userDataDir each launch, Chrome will create
1922
+ // 'Default' on its own; explicitly naming it skips the probe.
1923
+ '--profile-directory=Default',
1876
1924
  '--disable-ipc-flooding-protection',
1877
1925
  '--aggressive-cache-discard',
1878
1926
  '--memory-pressure-off',
@@ -1931,7 +1979,20 @@ function setupFrameHandling(page, forceDebug) {
1931
1979
 
1932
1980
 
1933
1981
  const pLimit = (await import('p-limit')).default;
1934
- const limit = pLimit(MAX_CONCURRENT_SITES);
1982
+ // VPN connect/disconnect is per-URL (wgConnect/ovpnConnect at scan start,
1983
+ // wgDisconnect/ovpnDisconnect in the finally) and manipulates the SHARED
1984
+ // system routing table. Interface names are derived from a hash of the VPN
1985
+ // config and connect/disconnect is not refcounted, so two concurrent URLs
1986
+ // that share a VPN config resolve to the same interface and one task's
1987
+ // teardown rips the interface out from under the other mid-scan. Force
1988
+ // serial execution whenever any site uses vpn/openvpn — correctness over
1989
+ // throughput, and VPN scans are network-bound rather than CPU-bound anyway.
1990
+ const vpnInUse = sites.some(site => site.vpn || site.openvpn);
1991
+ const effectiveConcurrency = vpnInUse ? 1 : MAX_CONCURRENT_SITES;
1992
+ if (vpnInUse && MAX_CONCURRENT_SITES > 1 && (forceDebug || !silentMode)) {
1993
+ console.log(formatLogMessage('info', `${VPN_TAG} VPN configured — forcing concurrency 1 (was ${MAX_CONCURRENT_SITES}) to avoid routing-table races`));
1994
+ }
1995
+ const limit = pLimit(effectiveConcurrency);
1935
1996
 
1936
1997
  const perSiteHeadful = sites.some(site => site.headful === true);
1937
1998
  const launchHeadless = !(headfulMode || perSiteHeadful);
@@ -2689,29 +2750,65 @@ function setupFrameHandling(page, forceDebug) {
2689
2750
  if (!useObscura && siteConfig.userAgent && siteConfig.userAgent.toLowerCase().includes('chrome')) {
2690
2751
  const userAgentKey = siteConfig.userAgent.toLowerCase();
2691
2752
  let platform = 'Windows';
2692
- let platformVersion = '15.0.0';
2753
+ let platformVersion = '19.0.0'; // Win11 — MUST match fingerprint.js's userAgentData platformVersion
2693
2754
  let arch = 'x86';
2694
2755
 
2695
2756
  if (userAgentKey === 'chrome_mac') {
2696
2757
  platform = 'macOS';
2697
- platformVersion = '13.5.0';
2758
+ platformVersion = '13.5.0';
2698
2759
  arch = 'arm';
2699
2760
  } else if (userAgentKey === 'chrome_linux') {
2700
2761
  platform = 'Linux';
2701
2762
  platformVersion = '6.5.0';
2702
2763
  arch = 'x86';
2703
2764
  }
2704
-
2705
- await page.setExtraHTTPHeaders({
2706
- 'Sec-CH-UA': '"Not:A-Brand";v="99", "Google Chrome";v="146", "Chromium";v="146"',
2765
+
2766
+ // Derive the Chrome major version from the SAME UA string the
2767
+ // browser actually sends (USER_AGENT_COLLECTIONS, via
2768
+ // page.setUserAgent in applyUserAgentSpoofing) so Sec-CH-UA can
2769
+ // never drift out of sync with navigator.userAgent. The version
2770
+ // used to be hardcoded ('146') while the UA list moved to 148 —
2771
+ // a detector cross-checking UA vs Sec-CH-UA saw the mismatch.
2772
+ // The full-version hints carry the REAL build (major.0.BUILD) — the
2773
+ // reduced UA hides it, these reveal it. Build comes from
2774
+ // lib/fingerprint's CHROME_BUILD, the same source the JS
2775
+ // getHighEntropyValues spoof uses, so HTTP and JS can't disagree.
2776
+ const browserUa = USER_AGENT_COLLECTIONS.get(userAgentKey) || '';
2777
+ const chromeMajor = (browserUa.match(/Chrome\/(\d+)/) || [])[1] || '148';
2778
+ const fullVer = `${chromeMajor}.0.${CHROME_BUILD}`;
2779
+
2780
+ const chHeaders = {
2781
+ // Brand list order + grease string match real Chrome of this major
2782
+ // exactly (deterministic GREASE): Chromium, Google Chrome, <grease>.
2783
+ // Same order/grease the JS brands spoof uses, so HTTP and JS agree.
2784
+ 'Sec-CH-UA': `"Chromium";v="${chromeMajor}", "Google Chrome";v="${chromeMajor}", "${CHROME_GREASE_BRAND}";v="99"`,
2707
2785
  'Sec-CH-UA-Platform': `"${platform}"`,
2708
2786
  'Sec-CH-UA-Platform-Version': `"${platformVersion}"`,
2709
2787
  'Sec-CH-UA-Mobile': '?0',
2710
2788
  'Sec-CH-UA-Arch': `"${arch}"`,
2711
2789
  'Sec-CH-UA-Bitness': '"64"',
2712
- 'Sec-CH-UA-Full-Version': '"146.0.0.0"',
2713
- 'Sec-CH-UA-Full-Version-List': '"Not:A-Brand";v="99.0.0.0", "Google Chrome";v="146.0.0.0", "Chromium";v="146.0.0.0"'
2714
- });
2790
+ 'Sec-CH-UA-WoW64': '?0',
2791
+ 'Sec-CH-UA-Model': '""',
2792
+ 'Sec-CH-UA-Full-Version': `"${fullVer}"`,
2793
+ 'Sec-CH-UA-Full-Version-List': `"Chromium";v="${fullVer}", "Google Chrome";v="${fullVer}", "${CHROME_GREASE_BRAND}";v="99.0.0.0"`,
2794
+ // Real Chrome (128+) sends this for desktop; pairs with the
2795
+ // formFactors value in fingerprint.js's getHighEntropyValues spoof.
2796
+ 'Sec-CH-UA-Form-Factors': '"Desktop"'
2797
+ };
2798
+ // Sec-CH-Device-Memory must mirror the JS navigator.deviceMemory
2799
+ // override (8) so a server reading BOTH can't cross-check a mismatch.
2800
+ // That JS override lives in applyFingerprintProtection, so it only
2801
+ // runs when fingerprint_protection is set — gate the header the same
2802
+ // way. Without this gate, a userAgent-only site (no fp_protection)
2803
+ // would get JS deviceMemory = the real host RAM (e.g. 32) but HTTP
2804
+ // = 8, a fresh mismatch. With fp off we send neither and both sides
2805
+ // report the native value, which is also consistent. (RAM isn't
2806
+ // server-observable, so spoofing it down hides datacenter specs with
2807
+ // nothing external to contradict — unlike rtt, which we leave native.)
2808
+ if (siteConfig.fingerprint_protection) {
2809
+ chHeaders['Sec-CH-Device-Memory'] = '8';
2810
+ }
2811
+ await page.setExtraHTTPHeaders(chHeaders);
2715
2812
  }
2716
2813
  } catch (fingerprintErr) {
2717
2814
  if (fingerprintErr.message.includes('Session closed') ||
@@ -2736,7 +2833,7 @@ function setupFrameHandling(page, forceDebug) {
2736
2833
  // Get user agent for curl if needed
2737
2834
  let curlUserAgent = '';
2738
2835
  if (useCurl && siteConfig.userAgent) {
2739
- curlUserAgent = USER_AGENTS.get(siteConfig.userAgent.toLowerCase()) || '';
2836
+ curlUserAgent = USER_AGENT_COLLECTIONS.get(siteConfig.userAgent.toLowerCase()) || '';
2740
2837
  }
2741
2838
 
2742
2839
  if (useCurl && forceDebug) {
@@ -3072,10 +3169,22 @@ function setupFrameHandling(page, forceDebug) {
3072
3169
 
3073
3170
  if (capturePopups && forceDebug) {
3074
3171
  // One-time setup-time warning if the click prerequisite isn't met.
3075
- // Without clicks, capture_popups is a no-op in practice.
3076
- const hasClicks = siteConfig.interact === true && siteConfig.interact_clicks === true;
3077
- if (!hasClicks) {
3078
- console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but interact_clicks is not — popups need user-gesture clicks to fire; expect no captures unless the page opens popups via in-page redirects`));
3172
+ // Without clicks, capture_popups is a no-op in practice. Previous
3173
+ // version blamed `interact_clicks` for both missing-piece cases but
3174
+ // when the actual culprit is `interact: 1` (number, silently disabled
3175
+ // by strict `=== true`), the message misled users into debugging
3176
+ // interact_clicks while the real problem was interact itself.
3177
+ // (normalizeSiteConfig now coerces interact: 1 → true with a warning,
3178
+ // so by the time we get here both should be booleans — but keep the
3179
+ // diagnostic accurate for the truly-missing case.)
3180
+ const interactOn = siteConfig.interact === true;
3181
+ const clicksOn = siteConfig.interact_clicks === true;
3182
+ if (!interactOn && !clicksOn) {
3183
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but neither 'interact' nor 'interact_clicks' is — set BOTH to true to fire user-gesture clicks; without them, only popups opened via in-page redirects will capture`));
3184
+ } else if (!interactOn) {
3185
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no fake clicks fire`));
3186
+ } else if (!clicksOn) {
3187
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact_clicks' is not — set interact_clicks: true to enable element-targeted clicks; without it, only random content-zone clicks fire and may miss overlay-based popunders`));
3079
3188
  }
3080
3189
  console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
3081
3190
  }
@@ -3101,133 +3210,200 @@ function setupFrameHandling(page, forceDebug) {
3101
3210
  // setRequestInterception(true) — page.on('request') fires for every
3102
3211
  // request regardless of interception state, and we don't need to
3103
3212
  // block anything on popups.
3104
- const attachPopupRequestCapture = (popupPage, depth) => {
3105
- popupPage.on('request', (request) => {
3213
+ // Evaluate ANY URL surfaced from a popup (the popup's own navigation URL
3214
+ // OR an in-popup request) against the same filter pipeline the main-page
3215
+ // request handler uses. Factored out so:
3216
+ // 1. attachPopupRequestCapture's `popupPage.on('request', ...)` calls
3217
+ // this once per in-popup request (with the request's resourceType).
3218
+ // 2. onTargetCreated calls this once with `target.url()` and resourceType
3219
+ // 'document' BEFORE attaching the request listener — catches the
3220
+ // popup's navigation URL itself, which fires before our listener can
3221
+ // attach (targetcreated → page resolve → attach is async, and the
3222
+ // browser dispatches the navigation immediately on window.open).
3223
+ // Without #2, popunder destinations whose own URL contains the
3224
+ // filterRegex pattern (e.g. AdsCore campaign URLs with &campaign=)
3225
+ // were seen-but-not-evaluated.
3226
+ const evaluatePopupUrl = (checkedUrl, depth, resourceType) => {
3227
+ try {
3228
+ if (!checkedUrl || checkedUrl === 'about:blank') return;
3229
+ let fullSubdomain = '';
3230
+ let checkedRootDomain = '';
3106
3231
  try {
3107
- const checkedUrl = request.url();
3108
- let fullSubdomain = '';
3109
- let checkedRootDomain = '';
3110
- try {
3111
- const parsedUrl = new URL(checkedUrl);
3112
- fullSubdomain = parsedUrl.hostname;
3113
- const pslResult = psl.parse(fullSubdomain);
3114
- checkedRootDomain = pslResult.domain || fullSubdomain;
3115
- } catch (_) { return; }
3116
- if (!checkedRootDomain) return;
3117
-
3118
- // ignoreDomainsByUrl if any pattern matches this popup URL,
3119
- // mark the root domain as ignored for the rest of the scan
3120
- // (main page + all popups). Mirrors the main handler so a
3121
- // tracker URL surfaced via popup chain has the same dampening
3122
- // effect as one surfaced on the main page.
3123
- if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
3124
- for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
3125
- if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
3126
- _dynamicallyIgnoredDomains.add(checkedRootDomain);
3127
- if (forceDebug) {
3128
- console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3129
- }
3130
- break;
3131
- }
3132
- }
3133
- }
3134
-
3135
- // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
3136
- // above; populating the dynamic block Set from popup URLs lets
3137
- // tracker URLs surfaced via popup chains poison their root
3138
- // domain for the rest of the scan just like main-page hits do.
3139
- if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3140
- for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3141
- if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
3142
- _dynamicallyBlockedDomains.add(checkedRootDomain);
3143
- if (forceDebug) {
3144
- console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3145
- }
3146
- break;
3232
+ const parsedUrl = new URL(checkedUrl);
3233
+ fullSubdomain = parsedUrl.hostname;
3234
+ const pslResult = psl.parse(fullSubdomain);
3235
+ checkedRootDomain = pslResult.domain || fullSubdomain;
3236
+ } catch (_) { return; }
3237
+ if (!checkedRootDomain) return;
3238
+
3239
+ // ignoreDomainsByUrl — if any pattern matches this popup URL,
3240
+ // mark the root domain as ignored for the rest of the scan
3241
+ // (main page + all popups). Mirrors the main handler so a
3242
+ // tracker URL surfaced via popup chain has the same dampening
3243
+ // effect as one surfaced on the main page.
3244
+ if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
3245
+ for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
3246
+ if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
3247
+ _dynamicallyIgnoredDomains.add(checkedRootDomain);
3248
+ if (forceDebug) {
3249
+ console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3147
3250
  }
3251
+ break;
3148
3252
  }
3149
3253
  }
3254
+ }
3150
3255
 
3151
- // ignoreDomains gate (global; matchesIgnoreDomain also short-
3152
- // circuits on _dynamicallyIgnoredDomains, so a domain we just
3153
- // added above will be caught here on the same request).
3154
- if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
3155
-
3156
- // Dynamic-block gate for popup requests early return on
3157
- // matched root or any parent (parent-walk in
3158
- // matchesDynamicBlock). Popups don't have a request object
3159
- // available here, so we just return rather than abort; the
3160
- // popup-request observer treats this as "don't process".
3161
- if (matchesDynamicBlock(checkedRootDomain)) return;
3162
-
3163
- // First-party / third-party gate (popup belongs to the main URL's
3164
- // domain group — its OWN URL doesn't redefine first-party).
3165
- const isFirstParty = firstPartyDomains.has(checkedRootDomain);
3166
- if (siteConfig.firstParty === false && isFirstParty) return;
3167
- if (siteConfig.thirdParty === false && !isFirstParty) return;
3168
-
3169
- // Regex match against the site's filterRegex list
3170
- const resourceType = request.resourceType();
3171
- let regexMatched = false;
3172
- for (const re of regexes) {
3173
- if (re.test(checkedUrl)) {
3174
- regexMatched = true;
3256
+ // blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
3257
+ // above; populating the dynamic block Set from popup URLs lets
3258
+ // tracker URLs surfaced via popup chains poison their root
3259
+ // domain for the rest of the scan just like main-page hits do.
3260
+ if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
3261
+ for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
3262
+ if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
3263
+ _dynamicallyBlockedDomains.add(checkedRootDomain);
3175
3264
  if (forceDebug) {
3176
- console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
3265
+ console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
3177
3266
  }
3178
3267
  break;
3179
3268
  }
3180
3269
  }
3270
+ }
3181
3271
 
3182
- if (!regexMatched) return;
3183
-
3184
- // hasNetTools is the same flag the main handler uses (line ~2639).
3185
- // When the site config carries whois/dig terms, regex match is
3186
- // not sufficient by itself — the URL must ALSO pass the whois/
3187
- // dig validation before it counts. Mirrors the main handler's
3188
- // behavior so 'capture popup domains that match regex/dig/whois'
3189
- // means the same thing for popups as for the main page.
3190
- if (hasNetTools) {
3191
- const popupNetToolsHandler = createNetToolsHandler({
3192
- whoisTerms, whoisOrTerms,
3193
- processedWhoisDomains: globalProcessedWhoisDomains,
3194
- processedDigDomains: globalProcessedDigDomains,
3195
- whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
3196
- whoisServer,
3197
- whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
3198
- debugLogFile,
3199
- digTerms, digOrTerms, digRecordType,
3200
- digSubdomain: siteConfig.dig_subdomain === true,
3201
- dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
3202
- matchedDomains, addMatchedDomain,
3203
- isDomainAlreadyDetected: isLocallyDetected,
3204
- onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
3205
- onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
3206
- cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
3207
- cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
3208
- currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
3209
- ignoreDomains, matchesIgnoreDomain
3210
- });
3211
- trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3212
- } else {
3213
- // No nettools required — regex match alone counts.
3214
- addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3272
+ // ignoreDomains gate (global; matchesIgnoreDomain also short-
3273
+ // circuits on _dynamicallyIgnoredDomains, so a domain we just
3274
+ // added above will be caught here on the same request).
3275
+ if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
3276
+
3277
+ // Dynamic-block gate for popup requests — early return on
3278
+ // matched root or any parent (parent-walk in
3279
+ // matchesDynamicBlock). Popups don't have a request object
3280
+ // available here, so we just return rather than abort; the
3281
+ // popup-request observer treats this as "don't process".
3282
+ if (matchesDynamicBlock(checkedRootDomain)) return;
3283
+
3284
+ // First-party / third-party gate (popup belongs to the main URL's
3285
+ // domain group — its OWN URL doesn't redefine first-party).
3286
+ const isFirstParty = firstPartyDomains.has(checkedRootDomain);
3287
+ if (siteConfig.firstParty === false && isFirstParty) return;
3288
+ if (siteConfig.thirdParty === false && !isFirstParty) return;
3289
+
3290
+ // Regex match against the site's filterRegex list
3291
+ let regexMatched = false;
3292
+ for (const re of regexes) {
3293
+ if (re.test(checkedUrl)) {
3294
+ regexMatched = true;
3295
+ if (forceDebug) {
3296
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
3297
+ }
3298
+ break;
3215
3299
  }
3216
- } catch (_) { /* observation-only — never let a popup error escape */ }
3300
+ }
3301
+
3302
+ if (!regexMatched) return;
3303
+
3304
+ // hasNetTools is the same flag the main handler uses (line ~2639).
3305
+ // When the site config carries whois/dig terms, regex match is
3306
+ // not sufficient by itself — the URL must ALSO pass the whois/
3307
+ // dig validation before it counts. Mirrors the main handler's
3308
+ // behavior so 'capture popup domains that match regex/dig/whois'
3309
+ // means the same thing for popups as for the main page.
3310
+ if (hasNetTools) {
3311
+ const popupNetToolsHandler = createNetToolsHandler({
3312
+ whoisTerms, whoisOrTerms,
3313
+ processedWhoisDomains: globalProcessedWhoisDomains,
3314
+ processedDigDomains: globalProcessedDigDomains,
3315
+ whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
3316
+ whoisServer,
3317
+ whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
3318
+ debugLogFile,
3319
+ digTerms, digOrTerms, digRecordType,
3320
+ digSubdomain: siteConfig.dig_subdomain === true,
3321
+ dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
3322
+ matchedDomains, addMatchedDomain,
3323
+ isDomainAlreadyDetected: isLocallyDetected,
3324
+ onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
3325
+ onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
3326
+ cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
3327
+ cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
3328
+ currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
3329
+ ignoreDomains, matchesIgnoreDomain
3330
+ });
3331
+ trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3332
+ } else {
3333
+ // No nettools required — regex match alone counts.
3334
+ addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3335
+ }
3336
+ } catch (_) { /* observation-only — never let a popup error escape */ }
3337
+ };
3338
+
3339
+ // Thin wrapper around evaluatePopupUrl for the per-request listener.
3340
+ // Under forceDebug also attach framenavigated + close listeners so
3341
+ // the popup's full lifecycle (initial nav URL, mid-popup navigations,
3342
+ // close) is visible in logs. Useful when investigating "I saw a
3343
+ // Chrome window flash on screen" — the framenavigated transitions
3344
+ // tell you what URL the window was showing and for how long.
3345
+ const attachPopupRequestCapture = (popupPage, depth) => { // Forwards every 'request' event of popupPage to evaluatePopupUrl; under forceDebug also logs framenavigated / close / pageerror events.
3346
+ popupPage.on('request', (request) => {
3347
+ evaluatePopupUrl(request.url(), depth, request.resourceType()); // URL and resource type come from the request; depth is the value passed to this wrapper
3217
3348
  });
3349
+ if (forceDebug) {
3350
+ try {
3351
+ popupPage.on('framenavigated', (frame) => {
3352
+ try {
3353
+ if (frame !== popupPage.mainFrame()) return; // main frame only
3354
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] framenavigated → ${frame.url() || 'about:blank'}`));
3355
+ } catch (_) {}
3356
+ });
3357
+ popupPage.on('close', () => {
3358
+ try {
3359
+ const lastUrl = popupPage.url ? popupPage.url() : '(unknown)'; // only call url() when the method is present on popupPage
3360
+ console.log(formatLogMessage('debug', `[popup depth=${depth}] close (last URL: ${lastUrl})`));
3361
+ } catch (_) {}
3362
+ });
3363
+ popupPage.on('pageerror', (err) => {
3364
+ try { console.log(formatLogMessage('debug', `[popup depth=${depth}] pageerror: ${err.message}`)); } catch (_) {} // logging failures are swallowed so a debug line can never throw out of the listener
3365
+ });
3366
+ } catch (_) { /* listener attach errors aren't fatal */ }
3367
+ }
3218
3368
  };
3219
3369
 
3220
3370
  const onTargetCreated = async (target) => {
3371
+ // Log EVERY targetcreated event under forceDebug so callers can see
3372
+ // the full set of targets Chromium creates during the scan — not
3373
+ // just the ones we capture. Useful when investigating "is that
3374
+ // Chrome window I saw from a popup or from somewhere else?" — if
3375
+ // a window opens but no targetcreated fires, it's not ours. If a
3376
+ // targetcreated fires for type=page but we skip-and-explain below,
3377
+ // the user knows why we ignored it. Captures the FULL diagnostic
3378
+ // surface, no behavior change.
3379
+ let _tType, _tUrl;
3380
+ if (forceDebug) {
3381
+ try {
3382
+ _tType = target.type();
3383
+ _tUrl = target.url() || 'about:blank';
3384
+ console.log(formatLogMessage('debug', `[popup] targetcreated: type=${_tType} url=${_tUrl}`));
3385
+ } catch (_) {}
3386
+ }
3387
+
3221
3388
  // Short-circuit guard: if finally has already started, don't attach
3222
3389
  // a request listener whose closure would outlive its meaningful
3223
3390
  // scope. The race is narrow (a targetcreated firing while we're
3224
3391
  // mid-await on target.page() across the finally boundary), but
3225
3392
  // without this guard a late popup could push matches into
3226
3393
  // matchedDomains for a URL whose processing has already returned.
3227
- if (urlFinished) return;
3228
- if (target.type() !== 'page') return;
3394
+ if (urlFinished) {
3395
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: urlFinished=true (scan teardown in progress)`));
3396
+ return;
3397
+ }
3398
+ if (target.type() !== 'page') {
3399
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: non-page target type=${target.type()} (workers/service-workers/etc are not popunder candidates)`));
3400
+ return;
3401
+ }
3229
3402
  const depth = getPopupDepth(target);
3230
- if (depth < 1) return; // Not one of ours
3403
+ if (depth < 1) {
3404
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: depth=0 — target not in opener chain of main page (likely a new browser tab opened independently, not a popunder from our scan)`));
3405
+ return; // Not one of ours
3406
+ }
3231
3407
  if (depth > POPUP_MAX_DEPTH) {
3232
3408
  if (forceDebug) {
3233
3409
  console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
@@ -3237,7 +3413,10 @@ function setupFrameHandling(page, forceDebug) {
3237
3413
 
3238
3414
  let popupPage;
3239
3415
  try { popupPage = await target.page(); } catch (_) { return; }
3240
- if (!popupPage) return;
3416
+ if (!popupPage) {
3417
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] target.page() returned null — popup not accessible as a Page object`));
3418
+ return;
3419
+ }
3241
3420
  // Re-check after the await — the per-URL finally may have flipped
3242
3421
  // the flag while target.page() was resolving.
3243
3422
  if (urlFinished) {
@@ -3247,8 +3426,31 @@ function setupFrameHandling(page, forceDebug) {
3247
3426
 
3248
3427
  if (forceDebug) {
3249
3428
  console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
3429
+ // Window dimensions are useful for the "is the popup visible on
3430
+ // my screen?" question — a popup with non-zero viewport in a
3431
+ // headless=new launch shouldn't be visible but on some display
3432
+ // servers (WSLg, X11) it can briefly flash on screen. Log the
3433
+ // viewport so callers can correlate with what they saw.
3434
+ try {
3435
+ const vp = popupPage.viewport();
3436
+ if (vp) console.log(formatLogMessage('debug', `[popup depth=${depth}] viewport: ${vp.width}x${vp.height}`));
3437
+ } catch (_) {}
3250
3438
  }
3251
3439
 
3440
+ // Evaluate the popup's own navigation URL against the same filter
3441
+ // pipeline used for in-popup requests. Required because targetcreated
3442
+ // → target.page() → on('request', ...) is async, and the browser
3443
+ // dispatches the popup's navigation request immediately on window.open
3444
+ // — by the time the listener attaches below, the navigation request
3445
+ // has already fired and won't be re-emitted. resourceType 'document'
3446
+ // mirrors what Chrome would emit for a top-level navigation request.
3447
+ // Without this call, AdsCore-style popunder destinations (URL contains
3448
+ // &campaign=, &v=, etc) were seen-but-not-evaluated: the popup was
3449
+ // logged but its domain never matched the filter regex, so it never
3450
+ // became a rule. Only secondary in-popup requests (tracking pixels,
3451
+ // sub-resources) ever got tested against the regex.
3452
+ evaluatePopupUrl(target.url(), depth, 'document');
3453
+
3252
3454
  attachPopupRequestCapture(popupPage, depth);
3253
3455
 
3254
3456
  // Auto-close after the capture window so popups don't pile up.
@@ -4322,7 +4524,26 @@ function setupFrameHandling(page, forceDebug) {
4322
4524
 
4323
4525
  // Mark page as processing during interactions
4324
4526
  updatePageUsage(page, true);
4325
- const INTERACTION_HARD_TIMEOUT = 15000;
4527
+ // Work-aware ceiling (scales with click count / realistic_click /
4528
+ // intensity) instead of a flat 15s, which truncated high-click
4529
+ // popunder configs mid-pass. Single source of truth shared with
4530
+ // interaction.js's own internal hard cap so the two can't disagree.
4531
+ const INTERACTION_HARD_TIMEOUT = computeInteractionCeilingMs(interactionConfig);
4532
+
4533
+ // Capture-and-clear timer wrapper — same fix as cdp.js (0772ccd) and
4534
+ // the per-URL grace (577ad66). The 3 inline Promise.race patterns
4535
+ // below previously used `new Promise((_, reject) => setTimeout(...))`
4536
+ // without capturing the timer ID, leaking the 15s timer + closure on
4537
+ // reject every time interaction completed inside the cap (the common
4538
+ // case). Centralizing avoids the same mistake recurring across the
4539
+ // ghost-cursor / fallback / standard branches.
4540
+ const raceWithTimer = (promise, msg) => { // Races `promise` against a timer that rejects with new Error(msg) after INTERACTION_HARD_TIMEOUT ms; returns the race promise.
4541
+ let t; // timer id; assigned inside the executor below so the finally() handler can cancel it
4542
+ return Promise.race([
4543
+ promise,
4544
+ new Promise((_, reject) => { t = setTimeout(() => reject(new Error(msg)), INTERACTION_HARD_TIMEOUT); })
4545
+ ]).finally(() => clearTimeout(t)); // runs once the race settles either way, so the timer is cleared even when `promise` wins
4546
+ };
4326
4547
 
4327
4548
  // Check if ghost-cursor mode is enabled for this site
4328
4549
  const ghostConfig = resolveGhostCursorConfig(siteConfig, globalGhostCursor, forceDebug);
@@ -4333,60 +4554,51 @@ function setupFrameHandling(page, forceDebug) {
4333
4554
  if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
4334
4555
  const cursor = createGhostCursor(page, { forceDebug });
4335
4556
  if (cursor) {
4336
- await Promise.race([
4337
- (async () => {
4338
- const viewport = page.viewport() || { width: 1200, height: 800 };
4339
- const ghostDuration = ghostConfig.duration || 2000;
4340
- const ghostStart = Date.now();
4341
- const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
4342
-
4343
- // Time-based Bezier mouse movements — runs for ghostDuration ms
4344
- while (ghostTimeLeft() > 200) {
4345
- const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
4346
- const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
4347
- await ghostMove(cursor, toX, toY, {
4348
- moveSpeed: ghostConfig.moveSpeed,
4349
- overshootThreshold: ghostConfig.overshootThreshold,
4350
- forceDebug
4351
- });
4352
- if (ghostTimeLeft() > 100) {
4353
- await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
4354
- }
4355
- }
4356
- if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
4357
- await ghostRandomMove(cursor, { forceDebug });
4557
+ await raceWithTimer((async () => {
4558
+ const viewport = page.viewport() || { width: 1200, height: 800 };
4559
+ const ghostDuration = ghostConfig.duration || 2000;
4560
+ const ghostStart = Date.now();
4561
+ const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
4562
+
4563
+ // Time-based Bezier mouse movements — runs for ghostDuration ms
4564
+ while (ghostTimeLeft() > 200) {
4565
+ const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
4566
+ const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
4567
+ await ghostMove(cursor, toX, toY, {
4568
+ moveSpeed: ghostConfig.moveSpeed,
4569
+ overshootThreshold: ghostConfig.overshootThreshold,
4570
+ forceDebug
4571
+ });
4572
+ if (ghostTimeLeft() > 100) {
4573
+ await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
4358
4574
  }
4359
- if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
4360
- const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
4361
- const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
4362
- await ghostClick(cursor, { x: clickX, y: clickY }, {
4363
- hesitate: ghostConfig.hesitate,
4364
- forceDebug
4365
- });
4366
- }
4367
- if (interactionConfig.includeScrolling) {
4368
- await performPageInteraction(page, currentUrl, {
4369
- ...interactionConfig,
4370
- mouseMovements: 0,
4371
- includeElementClicks: false
4372
- }, forceDebug);
4373
- }
4374
- })(),
4375
- new Promise((_, reject) => setTimeout(() => reject(new Error('ghost-cursor interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
4376
- ]);
4575
+ }
4576
+ if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
4577
+ await ghostRandomMove(cursor, { forceDebug });
4578
+ }
4579
+ if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
4580
+ const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
4581
+ const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
4582
+ await ghostClick(cursor, { x: clickX, y: clickY }, {
4583
+ hesitate: ghostConfig.hesitate,
4584
+ forceDebug
4585
+ });
4586
+ }
4587
+ if (interactionConfig.includeScrolling) {
4588
+ await performPageInteraction(page, currentUrl, {
4589
+ ...interactionConfig,
4590
+ mouseMovements: 0,
4591
+ includeElementClicks: false
4592
+ }, forceDebug);
4593
+ }
4594
+ })(), 'ghost-cursor interaction hard timeout');
4377
4595
  } else {
4378
4596
  if (forceDebug) console.log(formatLogMessage('debug', '[ghost-cursor] Falling back to built-in mouse'));
4379
- await Promise.race([
4380
- performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
4381
- new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
4382
- ]);
4597
+ await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
4383
4598
  }
4384
4599
  } else {
4385
4600
  // Standard built-in mouse interaction
4386
- await Promise.race([
4387
- performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
4388
- new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
4389
- ]);
4601
+ await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
4390
4602
  }
4391
4603
  } catch (interactTimeoutErr) {
4392
4604
  if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
@@ -4521,8 +4733,16 @@ function setupFrameHandling(page, forceDebug) {
4521
4733
 
4522
4734
  if (siteConfig.clear_sitedata === true) {
4523
4735
  try {
4524
- const clearResult = await clearSiteData(page, currentUrl, forceDebug, true); // Quick mode for reloads
4525
- if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i} for ${currentUrl}`));
4736
+ // Default reload clear is quick mode (cookies + cache +
4737
+ // localStorage + sessionStorage — the storage layers where
4738
+ // session-cap tracking typically lives). Sites that put their
4739
+ // session cap in IndexedDB / WebSQL / service workers can opt
4740
+ // into a full clear-per-reload via clear_sitedata_full_on_reload.
4741
+ // Costs ~100-500ms extra per reload and may unregister a
4742
+ // service worker the page depends on; off by default.
4743
+ const fullOnReload = siteConfig.clear_sitedata_full_on_reload === true;
4744
+ const clearResult = await clearSiteData(page, currentUrl, forceDebug, !fullOnReload);
4745
+ if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data (${fullOnReload ? 'full' : 'quick'}) before reload #${i} for ${currentUrl}`));
4526
4746
  } catch (reloadClearErr) {
4527
4747
  if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
4528
4748
  }
@@ -4536,20 +4756,26 @@ function setupFrameHandling(page, forceDebug) {
4536
4756
  if (useForceReload && !reloadSuccess && !skipForceReload) {
4537
4757
  // Attempt force reload: disable cache, reload, re-enable cache
4538
4758
  try {
4759
+ // Local race-with-timer helper — capture-and-clear pattern from
4760
+ // cdp.js / interact (6ad36e7). Without this, every successful
4761
+ // setCacheEnabled() left an 8s setTimeout running with closure
4762
+ // on `reject` (2 leaks per reload cycle × N reload cycles).
4763
+ const raceWithTimer = (promise, msg, ms) => { // Races `promise` against a timer that rejects with new Error(msg) after `ms` milliseconds; returns the race promise.
4764
+ let t; // timer id; assigned inside the executor below so the finally() handler can cancel it
4765
+ return Promise.race([
4766
+ promise,
4767
+ new Promise((_, reject) => { t = setTimeout(() => reject(new Error(msg)), ms); })
4768
+ ]).finally(() => clearTimeout(t)); // runs once the race settles either way, so the timer is cleared even when `promise` wins
4769
+ };
4770
+
4539
4771
  // Timeout-protected cache disable
4540
- await Promise.race([
4541
- page.setCacheEnabled(false),
4542
- new Promise((_, reject) => setTimeout(() => reject(new Error('Cache disable timeout')), 8000))
4543
- ]);
4544
-
4772
+ await raceWithTimer(page.setCacheEnabled(false), 'Cache disable timeout', 8000);
4773
+
4545
4774
  // Use networkidle2 for force reload to better detect when page is actually loaded
4546
4775
  await page.reload({ waitUntil: 'networkidle2', timeout: Math.min(timeout, 15000) });
4547
-
4776
+
4548
4777
  // Timeout-protected cache enable
4549
- await Promise.race([
4550
- page.setCacheEnabled(true),
4551
- new Promise((_, reject) => setTimeout(() => reject(new Error('Cache enable timeout')), 8000))
4552
- ]);
4778
+ await raceWithTimer(page.setCacheEnabled(true), 'Cache enable timeout', 8000);
4553
4779
 
4554
4780
  reloadSuccess = true;
4555
4781
  if (forceDebug) console.log(formatLogMessage('debug', `Force reload #${i} completed for ${currentUrl}`));
@@ -4644,8 +4870,21 @@ function setupFrameHandling(page, forceDebug) {
4644
4870
  const endY = 200 + Math.floor(Math.random() * (vp.height - 400));
4645
4871
  await humanLikeMouseMove(page, startX, startY, endX, endY, { steps: 3, curve: 0.04, jitter: 1 });
4646
4872
  }
4647
- // Content clicks to trigger document-level onclick handlers
4648
- await performContentClicks(page, { clicks: 2, preDelay: 200, forceDebug });
4873
+ // Content clicks to trigger document-level onclick handlers.
4874
+ // Honor siteConfig.interact_click_count so popunder-discovery configs
4875
+ // get the same click volume on every reload, not just the initial load.
4876
+ // Omit `clicks` when no override is set so performContentClicks uses
4877
+ // its CONTENT_CLICK.CLICK_COUNT default (single source of truth).
4878
+ // realistic forwards siteConfig.realistic_click; always passed
4879
+ // (defaults to false) so realistic mode applies to every reload's
4880
+ // clicks, not just the initial pass.
4881
+ const postReloadClickOpts = {
4882
+ preDelay: 200,
4883
+ forceDebug,
4884
+ realistic: !!interactionConfig.realistic
4885
+ };
4886
+ if (interactionConfig.clickCount) postReloadClickOpts.clicks = interactionConfig.clickCount;
4887
+ await performContentClicks(page, postReloadClickOpts);
4649
4888
  if (forceDebug) console.log(formatLogMessage('debug', `Post-reload interaction completed for reload #${i}`));
4650
4889
  } catch (postReloadInteractErr) {
4651
4890
  // Non-critical — continue with remaining reloads
@@ -4870,9 +5109,21 @@ function setupFrameHandling(page, forceDebug) {
4870
5109
  }
4871
5110
  }
4872
5111
 
4873
- // Temporarily store the pLimit function
5112
+ // Temporarily store the pLimit function
4874
5113
  const originalLimit = limit;
4875
5114
 
5115
+ // Per-site config normalization (always runs, not gated on --validate-config).
5116
+ // Catches typo'd keys (whois_terms vs whois) with "did you mean" suggestions
5117
+ // and coerces boolean-like values (interact: 1 → interact: true) before any
5118
+ // downstream strict-equality check silently treats them as disabled. Mutates
5119
+ // each site in place so the rest of the scan sees normalized values.
5120
+ // Reports via console.warn so messages surface even when --silent is set.
5121
+ for (let i = 0; i < sites.length; i++) {
5122
+ const { warnings, errors } = normalizeSiteConfig(sites[i], i);
5123
+ for (const e of errors) console.warn(messageColors.error('⚠ ' + e));
5124
+ for (const w of warnings) console.warn(messageColors.warn('⚠ [config] ' + w));
5125
+ }
5126
+
4876
5127
  // V8 Optimization: Calculate total URLs first to pre-allocate array
4877
5128
  let totalUrls = 0;
4878
5129
  for (const site of sites) {
@@ -4890,7 +5141,17 @@ function setupFrameHandling(page, forceDebug) {
4890
5141
  for (const url of urlsToProcess) {
4891
5142
  allTasks[taskIndex++] = {
4892
5143
  url,
4893
- config: { ...site, _originalUrl: url }, // Preserve original URL for CDP domain checking
5144
+ // Default userAgent to 'chrome' when a site doesn't set one. Without
5145
+ // it the browser sends its bundled default UA, which literally
5146
+ // contains "HeadlessChrome" (verified, both headless modes) — an
5147
+ // instant automation tell. Defaulting here (rather than at launch)
5148
+ // activates the whole coherent path, since UA-string spoofing, the
5149
+ // navigator/webdriver/plugins/userAgentData JS masking, the Sec-CH-UA
5150
+ // request headers, and the curl content-fetch UA all gate on
5151
+ // config.userAgent. Placing 'chrome' BEFORE the spread means an
5152
+ // explicit site value wins — including userAgent:false / null to opt
5153
+ // out and scan with the raw headless UA.
5154
+ config: { userAgent: 'chrome', ...site, _originalUrl: url },
4894
5155
  taskId: taskIndex - 1 // For tracking
4895
5156
  };
4896
5157
  }
@@ -4923,7 +5184,7 @@ function setupFrameHandling(page, forceDebug) {
4923
5184
  let urlsSinceLastCleanup = 0;
4924
5185
 
4925
5186
  if (!silentMode && totalUrls > 0) {
4926
- console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${MAX_CONCURRENT_SITES}...`);
5187
+ console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${effectiveConcurrency}...`);
4927
5188
  if (totalUrls > RESOURCE_CLEANUP_INTERVAL) {
4928
5189
  console.log(messageColors.processing('Browser will restart every') + ` ~${RESOURCE_CLEANUP_INTERVAL} URLs to free resources`);
4929
5190
  }
@@ -5044,10 +5305,18 @@ function setupFrameHandling(page, forceDebug) {
5044
5305
  silentMode
5045
5306
  });
5046
5307
  healthPromise.catch(() => {});
5047
- healthCheck = await Promise.race([
5048
- healthPromise,
5049
- new Promise((_, reject) => setTimeout(() => reject(new Error('Health check timeout')), 30000))
5050
- ]);
5308
+ // Capture-and-clear timer pattern (cdp.js 0772ccd, interact 6ad36e7) —
5309
+ // when healthPromise wins the race, the inline setTimeout would
5310
+ // otherwise hold reject's closure for the full 30s grace window.
5311
+ let healthTimer;
5312
+ try {
5313
+ healthCheck = await Promise.race([
5314
+ healthPromise,
5315
+ new Promise((_, reject) => { healthTimer = setTimeout(() => reject(new Error('Health check timeout')), 30000); })
5316
+ ]);
5317
+ } finally {
5318
+ if (healthTimer) clearTimeout(healthTimer);
5319
+ }
5051
5320
  } catch (healthError) {
5052
5321
  console.log(formatLogMessage('warn', `[HEALTH CHECK] Timeout, assuming restart needed`));
5053
5322
  healthCheck = { shouldRestart: true, reason: 'Health check timeout' };
@@ -5312,26 +5581,94 @@ function setupFrameHandling(page, forceDebug) {
5312
5581
  } catch {}
5313
5582
 
5314
5583
  // Per-URL timeout so a single hung processUrl can't block the batch
5315
- // forever. 75s sits comfortably above the realistic legit-page ceiling
5316
- // (nav 35s + Cloudflare adaptive ~25s + interaction ~10s + network-idle
5317
- // wait ~10s ≈ ~70s), well short of the old 120s safety net. Cuts
5318
- // hang-recovery time roughly in half when an entire batch's URLs all
5319
- // hang and we're waiting on this timeout to advance processedUrlCount.
5320
- const PER_URL_TIMEOUT_MS = 75000;
5584
+ // forever. Scaled from siteConfig.timeout + (delay + interaction) ×
5585
+ // (1 + reload) + 30s headroom, with a 75s floor.
5586
+ //
5587
+ // The (1 + reload) multiplier was missing from the previous formula
5588
+ // (13dd4fa) — `reload: 4` configs perform 5 total cycles (initial +
5589
+ // 4 reloads), each with its own delay + interaction overhead, so the
5590
+ // 80s ceiling for the user's lean config (timeout:35000, delay:15000,
5591
+ // reload:4) fired DURING the 3rd reload while the orphan still had
5592
+ // 2 more cycles + drain to go — far longer than the 8s grace could
5593
+ // bridge. Multiplying by cycle count brings the ceiling above the
5594
+ // legitimate work envelope.
5595
+ const reloadCount = task.config.reload || 0;
5596
+ // Interaction overhead per cycle must match interaction.js's actual
5597
+ // ceiling, which is now work-aware (high interact_click_count /
5598
+ // realistic_click configs legitimately run far longer than the old flat
5599
+ // 15s). Compute the same value here so the per-URL ceiling stays above
5600
+ // the real interaction envelope and can't fire mid-pass. Zero when
5601
+ // interaction is disabled for this task (no interaction cost to budget).
5602
+ const interactionOnForTask = task.config.interact === true && !disableInteract;
5603
+ const INTERACTION_OVERHEAD_MS = interactionOnForTask
5604
+ ? computeInteractionCeilingMs(createInteractionConfig(task.url, task.config))
5605
+ : 0;
5606
+ const PER_URL_TIMEOUT_MS = Math.max(
5607
+ 75000,
5608
+ (task.config.timeout || 35000)
5609
+ + ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
5610
+ + 30000
5611
+ );
5612
+ // Grace period after primary timeout — gives the orphan a chance to
5613
+ // finish drainPendingNetTools() and emit "Saving N rules despite page
5614
+ // load failure" before we abandon its result. Drain typically completes
5615
+ // in <1s with cached nettools; 8s is the safety ceiling.
5616
+ const PER_URL_GRACE_MS = 8000;
5617
+ const PER_URL_TIMEOUT_MARKER = 'PER_URL_TIMEOUT_FIRED';
5618
+
5321
5619
  const processUrlPromise = processUrl(task.url, task.config, browser);
5322
5620
  let perUrlTimer;
5323
5621
  try {
5324
5622
  return await Promise.race([
5325
5623
  processUrlPromise,
5326
5624
  new Promise((_, reject) => {
5327
- perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (75s)')), PER_URL_TIMEOUT_MS);
5625
+ perUrlTimer = setTimeout(() => {
5626
+ const e = new Error(`Per-URL timeout (${Math.round(PER_URL_TIMEOUT_MS / 1000)}s)`);
5627
+ e.code = PER_URL_TIMEOUT_MARKER;
5628
+ reject(e);
5629
+ }, PER_URL_TIMEOUT_MS);
5328
5630
  })
5329
5631
  ]);
5330
5632
  } catch (err) {
5331
- if (err && err.message === 'Per-URL timeout (75s)') {
5332
- processUrlPromise.catch(() => {});
5633
+ if (err && err.code === PER_URL_TIMEOUT_MARKER) {
5333
5634
  forceRestartFlag = true;
5334
- return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (75s)', needsImmediateRestart: true };
5635
+ // Log the timeout — the fire was invisible before; it only ended up in the
5636
+ // returned result.error field which is never printed. Makes
5637
+ // ceiling-tuning regressions visible without source-reading.
5638
+ if (forceDebug) {
5639
+ console.log(formatLogMessage('warn', `${err.message} for ${task.url} — orphan in ${PER_URL_GRACE_MS / 1000}s grace`));
5640
+ }
5641
+ // Grace period — wait briefly for the orphan to drain + recover
5642
+ // partial matches. Browser is still in a bad state (we hit the
5643
+ // primary ceiling) so the restart still fires either way; only the
5644
+ // rules payload differs.
5645
+ let graceTimer;
5646
+ try {
5647
+ const graceResult = await Promise.race([
5648
+ processUrlPromise,
5649
+ new Promise((_, reject) => {
5650
+ // Capture the timer ID so the finally can clear it when the
5651
+ // orphan wins the race — otherwise the setTimeout keeps the
5652
+ // event loop ref + closure on `reject` alive for the full
5653
+ // grace window, even though the race already settled.
5654
+ // Same leak pattern fixed in cdp.js (0772ccd) and
5655
+ // clear_sitedata (780b443).
5656
+ graceTimer = setTimeout(() => reject(new Error('Grace timeout')), PER_URL_GRACE_MS);
5657
+ })
5658
+ ]);
5659
+ if (forceDebug) {
5660
+ console.log(formatLogMessage('debug', `Grace recovered ${(graceResult && graceResult.rules ? graceResult.rules.length : 0)} rules for ${task.url}`));
5661
+ }
5662
+ return { ...graceResult, needsImmediateRestart: true };
5663
+ } catch (_) {
5664
+ if (forceDebug) {
5665
+ console.log(formatLogMessage('warn', `Grace timed out for ${task.url} — discarding orphan`));
5666
+ }
5667
+ processUrlPromise.catch(() => {});
5668
+ return { url: task.url, rules: [], success: false, error: err.message, needsImmediateRestart: true };
5669
+ } finally {
5670
+ if (graceTimer) clearTimeout(graceTimer);
5671
+ }
5335
5672
  }
5336
5673
  throw err;
5337
5674
  } finally {