@fanboynz/network-scanner 3.0.3 → 3.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +53 -0
- package/lib/adblock-rust.js +17 -4
- package/lib/adblock.js +92 -15
- package/lib/browserhealth.js +41 -100
- package/lib/cdp.js +68 -34
- package/lib/clear_sitedata.js +68 -20
- package/lib/compress.js +26 -58
- package/lib/curl.js +44 -22
- package/lib/domain-cache.js +8 -57
- package/lib/dry-run.js +9 -4
- package/lib/fingerprint.js +599 -129
- package/lib/fingerprint.md +94 -0
- package/lib/interaction.js +262 -26
- package/lib/nettools.js +47 -76
- package/lib/openvpn_vpn.js +116 -35
- package/lib/proxy.js +6 -2
- package/lib/searchstring.js +15 -237
- package/lib/smart-cache.js +9 -1
- package/lib/socks-relay.js +14 -9
- package/lib/validate_rules.js +285 -3
- package/lib/wireguard_vpn.js +64 -12
- package/nwss.js +557 -220
- package/package.json +1 -1
- package/regex-tool/index.html +321 -628
package/nwss.js
CHANGED
|
@@ -12,13 +12,13 @@ const path = require('path');
|
|
|
12
12
|
const dnsPromises = require('node:dns/promises');
|
|
13
13
|
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
14
14
|
const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
|
|
15
|
-
const { parseSearchStrings, createResponseHandler
|
|
16
|
-
const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
|
|
15
|
+
const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
|
|
16
|
+
const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROME_GREASE_BRAND } = require('./lib/fingerprint');
|
|
17
17
|
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
|
|
18
18
|
// Curl functionality (replace searchstring curl handler)
|
|
19
19
|
const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
|
|
20
20
|
// Rule validation
|
|
21
|
-
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
|
|
21
|
+
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile, normalizeSiteConfig } = require('./lib/validate_rules');
|
|
22
22
|
// CF Bypass
|
|
23
23
|
const {
|
|
24
24
|
handleCloudflareProtection,
|
|
@@ -66,7 +66,7 @@ const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
|
|
|
66
66
|
// log lines (start/completed). Same cyan as the other monitoring tags.
|
|
67
67
|
const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
68
68
|
// Enhanced mouse interaction and page simulation
|
|
69
|
-
const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
69
|
+
const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
70
70
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
71
71
|
const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
|
|
72
72
|
// Domain detection cache for performance optimization
|
|
@@ -129,15 +129,12 @@ const CONCURRENCY_LIMITS = Object.freeze({
|
|
|
129
129
|
});
|
|
130
130
|
|
|
131
131
|
// V8 Optimization: Use Map for user agent lookups instead of object
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
['firefox_linux', "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0"],
|
|
139
|
-
['safari', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"]
|
|
140
|
-
]));
|
|
132
|
+
// User-Agent strings come from the single source of truth in lib/fingerprint
|
|
133
|
+
// (USER_AGENT_COLLECTIONS, imported above) — the same map page.setUserAgent
|
|
134
|
+
// applies to the browser. The previous local duplicate had silently drifted
|
|
135
|
+
// (Chrome 146 vs the browser's 148, Firefox 148 vs 151, Safari 18.6 vs 19.5),
|
|
136
|
+
// so curl content-fetches advertised a different browser than the page did.
|
|
137
|
+
// Keep using the imported map directly so the two can never diverge again.
|
|
141
138
|
|
|
142
139
|
const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
|
|
143
140
|
|
|
@@ -776,13 +773,14 @@ Redirect Handling Options:
|
|
|
776
773
|
resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
|
|
777
774
|
interact: true/false Simulate mouse movements/clicks
|
|
778
775
|
isBrave: true/false Spoof Brave browser detection
|
|
779
|
-
userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari"
|
|
776
|
+
userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Desktop User-Agent (defaults to "chrome" if unset; set false to scan with the raw headless UA)
|
|
780
777
|
interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium)
|
|
781
778
|
delay: <milliseconds> Delay after load (default: 6000, capped at 2000ms unless delay_uncapped: true)
|
|
782
779
|
delay_uncapped: true/false Honor 'delay' up to half the per-URL timeout instead of the 2s default cap. Use for sites with setTimeout-deferred lazy ad/tracker loaders that fire well past the standard post-networkidle window
|
|
783
780
|
reload: <number> Reload page n times after load (default: 1)
|
|
784
781
|
forcereload: true/false or ["domain1.com", "domain2.com"] Force cache-clearing reload for all URLs or specific domains
|
|
785
782
|
clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
|
|
783
|
+
clear_sitedata_full_on_reload: true/false With clear_sitedata: true, also clear heavy storage (IndexedDB, WebSQL, service workers) between reloads — quick mode (cookies+cache+local/session storage) is the default for reloads; this flag promotes them to full clears at ~100-500ms latency cost per reload. Use for sites with IndexedDB/service-worker-backed session caps. Off by default.
|
|
786
784
|
subDomains: 1/0 Output full subdomains (default: 0)
|
|
787
785
|
localhost: true/false Force localhost output (127.0.0.1)
|
|
788
786
|
localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
|
|
@@ -1864,15 +1862,65 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1864
1862
|
'--log-level=3', // Fatal errors only (suppresses verbose disk logging)
|
|
1865
1863
|
'--no-service-autorun', // No background service disk activity
|
|
1866
1864
|
'--disable-domain-reliability', // No reliability monitor disk writes
|
|
1865
|
+
// Suppress Chrome's auto-update subsystem entirely in headful runs.
|
|
1866
|
+
// --disable-component-update + --disable-background-networking above
|
|
1867
|
+
// stop the network-level check, but Chrome's UI can still show the
|
|
1868
|
+
// "update available" toolbar dot / banner / "relaunch to update"
|
|
1869
|
+
// modal if Chrome has cached state from a prior check by the same
|
|
1870
|
+
// installed chrome binary. These two flags neutralize that:
|
|
1871
|
+
// simulate-outdated-no-au=DATE — the no-auto-update simulation
|
|
1872
|
+
// date is treated as DATE. Far-future date = never shows the
|
|
1873
|
+
// 'outdated' UI. Quotes around the date are required by Chrome.
|
|
1874
|
+
// check-for-update-interval=N — seconds between update checks.
|
|
1875
|
+
// 31536000 = 1 year. Even if the above somehow gets bypassed,
|
|
1876
|
+
// the check itself won't fire within any reasonable scan.
|
|
1877
|
+
// Both are no-ops in pure headless modes but matter in --headful
|
|
1878
|
+
// and headless='new' (which can render UI in some cases).
|
|
1879
|
+
'--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
|
|
1880
|
+
'--check-for-update-interval=31536000',
|
|
1867
1881
|
// PERFORMANCE: Disable non-essential Chrome features in a single flag
|
|
1868
1882
|
// IMPORTANT: Chrome only reads the LAST --disable-features flag, so combine all into one
|
|
1869
|
-
//
|
|
1870
|
-
//
|
|
1871
|
-
//
|
|
1872
|
-
//
|
|
1873
|
-
//
|
|
1874
|
-
//
|
|
1875
|
-
|
|
1883
|
+
//
|
|
1884
|
+
// Sign-in / profile suppression family (prevents the "Something went
|
|
1885
|
+
// wrong when opening your profile. Please sign out then sign in
|
|
1886
|
+
// again" popup that fires in headful when Chrome's sign-in/sync
|
|
1887
|
+
// subsystem can't make sense of our fresh-each-launch temp
|
|
1888
|
+
// userDataDir):
|
|
1889
|
+
// AccountConsistencyMirror, AccountConsistencyDice
|
|
1890
|
+
// Older Chrome's identity consistency layer. Disabling stops
|
|
1891
|
+
// the sync subsystem from initialising at startup.
|
|
1892
|
+
// ProfilePicker, EnableProfilePicker
|
|
1893
|
+
// Two names for the same Chrome feature (renamed in Chrome
|
|
1894
|
+
// 12x releases). Disabling stops the profile-picker dialog that some
|
|
1895
|
+
// Chrome versions display when launching with no recognised
|
|
1896
|
+
// profile. Was the new offender in Chrome 148 for this case.
|
|
1897
|
+
// IdentityConsistency
|
|
1898
|
+
// Chrome's identity-consistency-with-google.com checks. Tries
|
|
1899
|
+
// to read profile credentials at startup; trips the popup if
|
|
1900
|
+
// profile is fresh/empty.
|
|
1901
|
+
// SyncDisabledWithProfilePicker
|
|
1902
|
+
// Sync subsystem variant that activates when profile picker
|
|
1903
|
+
// would otherwise show. Disabling is harmless when picker is
|
|
1904
|
+
// also disabled but covers the gap if a Chrome version honors
|
|
1905
|
+
// only one of the two.
|
|
1906
|
+
// SigninInterceptBubble
|
|
1907
|
+
// Sign-in interception bubble that pops when Chrome detects
|
|
1908
|
+
// 'enterprise' sign-in patterns. Defensive.
|
|
1909
|
+
// Combined with --disable-sync + --allow-browser-signin=false
|
|
1910
|
+
// below + --profile-directory=Default flag (explicit profile name
|
|
1911
|
+
// instead of letting Chrome auto-detect/pick), this should fully
|
|
1912
|
+
// suppress sign-in popups in headful from Chrome 118 through 148+.
|
|
1913
|
+
//
|
|
1914
|
+
// ChromeWhatsNewUI: suppresses the post-update "What's New" page
|
|
1915
|
+
// that auto-opens in a new tab after Chrome installs an update —
|
|
1916
|
+
// not popunder-relevant but visually noisy in headful sessions.
|
|
1917
|
+
`--disable-features=AudioServiceOutOfProcess,VizDisplayCompositor,TranslateUI,BlinkGenPropertyTrees,Translate,BackForwardCache,AcceptCHFrame,SafeBrowsing,HttpsFirstBalancedModeAutoEnable,site-per-process,PaintHolding,AccountConsistencyMirror,AccountConsistencyDice,ProfilePicker,EnableProfilePicker,IdentityConsistency,SyncDisabledWithProfilePicker,SigninInterceptBubble,ChromeWhatsNewUI${disable_ad_tagging ? ',AdTagging' : ''}`,
|
|
1918
|
+
// Explicit profile directory — without this, Chrome may probe for
|
|
1919
|
+
// available profiles at launch and trigger the picker dialog (or
|
|
1920
|
+
// the "something went wrong" popup if no profile is found). With
|
|
1921
|
+
// a fresh temp userDataDir each launch, Chrome will create
|
|
1922
|
+
// 'Default' on its own; explicitly naming it skips the probe.
|
|
1923
|
+
'--profile-directory=Default',
|
|
1876
1924
|
'--disable-ipc-flooding-protection',
|
|
1877
1925
|
'--aggressive-cache-discard',
|
|
1878
1926
|
'--memory-pressure-off',
|
|
@@ -1931,7 +1979,20 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1931
1979
|
|
|
1932
1980
|
|
|
1933
1981
|
const pLimit = (await import('p-limit')).default;
|
|
1934
|
-
|
|
1982
|
+
// VPN connect/disconnect is per-URL (wgConnect/ovpnConnect at scan start,
|
|
1983
|
+
// wgDisconnect/ovpnDisconnect in the finally) and manipulates the SHARED
|
|
1984
|
+
// system routing table. Interface names are derived from a hash of the VPN
|
|
1985
|
+
// config and connect/disconnect is not refcounted, so two concurrent URLs
|
|
1986
|
+
// that share a VPN config resolve to the same interface and one task's
|
|
1987
|
+
// teardown rips the interface out from under the other mid-scan. Force
|
|
1988
|
+
// serial execution whenever any site uses vpn/openvpn — correctness over
|
|
1989
|
+
// throughput, and VPN scans are network-bound rather than CPU-bound anyway.
|
|
1990
|
+
const vpnInUse = sites.some(site => site.vpn || site.openvpn);
|
|
1991
|
+
const effectiveConcurrency = vpnInUse ? 1 : MAX_CONCURRENT_SITES;
|
|
1992
|
+
if (vpnInUse && MAX_CONCURRENT_SITES > 1 && (forceDebug || !silentMode)) {
|
|
1993
|
+
console.log(formatLogMessage('info', `${VPN_TAG} VPN configured — forcing concurrency 1 (was ${MAX_CONCURRENT_SITES}) to avoid routing-table races`));
|
|
1994
|
+
}
|
|
1995
|
+
const limit = pLimit(effectiveConcurrency);
|
|
1935
1996
|
|
|
1936
1997
|
const perSiteHeadful = sites.some(site => site.headful === true);
|
|
1937
1998
|
const launchHeadless = !(headfulMode || perSiteHeadful);
|
|
@@ -2689,29 +2750,65 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2689
2750
|
if (!useObscura && siteConfig.userAgent && siteConfig.userAgent.toLowerCase().includes('chrome')) {
|
|
2690
2751
|
const userAgentKey = siteConfig.userAgent.toLowerCase();
|
|
2691
2752
|
let platform = 'Windows';
|
|
2692
|
-
let platformVersion = '
|
|
2753
|
+
let platformVersion = '19.0.0'; // Win11 — MUST match fingerprint.js's userAgentData platformVersion
|
|
2693
2754
|
let arch = 'x86';
|
|
2694
2755
|
|
|
2695
2756
|
if (userAgentKey === 'chrome_mac') {
|
|
2696
2757
|
platform = 'macOS';
|
|
2697
|
-
platformVersion = '13.5.0';
|
|
2758
|
+
platformVersion = '13.5.0';
|
|
2698
2759
|
arch = 'arm';
|
|
2699
2760
|
} else if (userAgentKey === 'chrome_linux') {
|
|
2700
2761
|
platform = 'Linux';
|
|
2701
2762
|
platformVersion = '6.5.0';
|
|
2702
2763
|
arch = 'x86';
|
|
2703
2764
|
}
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2765
|
+
|
|
2766
|
+
// Derive the Chrome major version from the SAME UA string the
|
|
2767
|
+
// browser actually sends (USER_AGENT_COLLECTIONS, via
|
|
2768
|
+
// page.setUserAgent in applyUserAgentSpoofing) so Sec-CH-UA can
|
|
2769
|
+
// never drift out of sync with navigator.userAgent. The version
|
|
2770
|
+
// used to be hardcoded ('146') while the UA list moved to 148 —
|
|
2771
|
+
// a detector cross-checking UA vs Sec-CH-UA saw the mismatch.
|
|
2772
|
+
// The full-version hints carry the REAL build (major.0.BUILD) — the
|
|
2773
|
+
// reduced UA hides it, these reveal it. Build comes from
|
|
2774
|
+
// lib/fingerprint's CHROME_BUILD, the same source the JS
|
|
2775
|
+
// getHighEntropyValues spoof uses, so HTTP and JS can't disagree.
|
|
2776
|
+
const browserUa = USER_AGENT_COLLECTIONS.get(userAgentKey) || '';
|
|
2777
|
+
const chromeMajor = (browserUa.match(/Chrome\/(\d+)/) || [])[1] || '148';
|
|
2778
|
+
const fullVer = `${chromeMajor}.0.${CHROME_BUILD}`;
|
|
2779
|
+
|
|
2780
|
+
const chHeaders = {
|
|
2781
|
+
// Brand list order + grease string match real Chrome of this major
|
|
2782
|
+
// exactly (deterministic GREASE): Chromium, Google Chrome, <grease>.
|
|
2783
|
+
// Same order/grease the JS brands spoof uses, so HTTP and JS agree.
|
|
2784
|
+
'Sec-CH-UA': `"Chromium";v="${chromeMajor}", "Google Chrome";v="${chromeMajor}", "${CHROME_GREASE_BRAND}";v="99"`,
|
|
2707
2785
|
'Sec-CH-UA-Platform': `"${platform}"`,
|
|
2708
2786
|
'Sec-CH-UA-Platform-Version': `"${platformVersion}"`,
|
|
2709
2787
|
'Sec-CH-UA-Mobile': '?0',
|
|
2710
2788
|
'Sec-CH-UA-Arch': `"${arch}"`,
|
|
2711
2789
|
'Sec-CH-UA-Bitness': '"64"',
|
|
2712
|
-
'Sec-CH-UA-
|
|
2713
|
-
'Sec-CH-UA-
|
|
2714
|
-
|
|
2790
|
+
'Sec-CH-UA-WoW64': '?0',
|
|
2791
|
+
'Sec-CH-UA-Model': '""',
|
|
2792
|
+
'Sec-CH-UA-Full-Version': `"${fullVer}"`,
|
|
2793
|
+
'Sec-CH-UA-Full-Version-List': `"Chromium";v="${fullVer}", "Google Chrome";v="${fullVer}", "${CHROME_GREASE_BRAND}";v="99.0.0.0"`,
|
|
2794
|
+
// Real Chrome (128+) sends this for desktop; pairs with the
|
|
2795
|
+
// formFactors value in fingerprint.js's getHighEntropyValues spoof.
|
|
2796
|
+
'Sec-CH-UA-Form-Factors': '"Desktop"'
|
|
2797
|
+
};
|
|
2798
|
+
// Sec-CH-Device-Memory must mirror the JS navigator.deviceMemory
|
|
2799
|
+
// override (8) so a server reading BOTH can't cross-check a mismatch.
|
|
2800
|
+
// That JS override lives in applyFingerprintProtection, so it only
|
|
2801
|
+
// runs when fingerprint_protection is set — gate the header the same
|
|
2802
|
+
// way. Without this gate, a userAgent-only site (no fp_protection)
|
|
2803
|
+
// would get JS deviceMemory = the real host RAM (e.g. 32) but HTTP
|
|
2804
|
+
// = 8, a fresh mismatch. With fp off we send neither and both sides
|
|
2805
|
+
// report the native value, which is also consistent. (RAM isn't
|
|
2806
|
+
// server-observable, so spoofing it down hides datacenter specs with
|
|
2807
|
+
// nothing external to contradict — unlike rtt, which we leave native.)
|
|
2808
|
+
if (siteConfig.fingerprint_protection) {
|
|
2809
|
+
chHeaders['Sec-CH-Device-Memory'] = '8';
|
|
2810
|
+
}
|
|
2811
|
+
await page.setExtraHTTPHeaders(chHeaders);
|
|
2715
2812
|
}
|
|
2716
2813
|
} catch (fingerprintErr) {
|
|
2717
2814
|
if (fingerprintErr.message.includes('Session closed') ||
|
|
@@ -2736,7 +2833,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2736
2833
|
// Get user agent for curl if needed
|
|
2737
2834
|
let curlUserAgent = '';
|
|
2738
2835
|
if (useCurl && siteConfig.userAgent) {
|
|
2739
|
-
curlUserAgent =
|
|
2836
|
+
curlUserAgent = USER_AGENT_COLLECTIONS.get(siteConfig.userAgent.toLowerCase()) || '';
|
|
2740
2837
|
}
|
|
2741
2838
|
|
|
2742
2839
|
if (useCurl && forceDebug) {
|
|
@@ -3072,10 +3169,22 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3072
3169
|
|
|
3073
3170
|
if (capturePopups && forceDebug) {
|
|
3074
3171
|
// One-time setup-time warning if the click prerequisite isn't met.
|
|
3075
|
-
// Without clicks, capture_popups is a no-op in practice.
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3172
|
+
// Without clicks, capture_popups is a no-op in practice. Previous
|
|
3173
|
+
// version blamed `interact_clicks` for both missing-piece cases — but
|
|
3174
|
+
// when the actual culprit is `interact: 1` (number, silently disabled
|
|
3175
|
+
// by strict `=== true`), the message misled users into debugging
|
|
3176
|
+
// interact_clicks while the real problem was interact itself.
|
|
3177
|
+
// (normalizeSiteConfig now coerces interact: 1 → true with a warning,
|
|
3178
|
+
// so by the time we get here both should be booleans — but keep the
|
|
3179
|
+
// diagnostic accurate for the truly-missing case.)
|
|
3180
|
+
const interactOn = siteConfig.interact === true;
|
|
3181
|
+
const clicksOn = siteConfig.interact_clicks === true;
|
|
3182
|
+
if (!interactOn && !clicksOn) {
|
|
3183
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but neither 'interact' nor 'interact_clicks' is — set BOTH to true to fire user-gesture clicks; without them, only popups opened via in-page redirects will capture`));
|
|
3184
|
+
} else if (!interactOn) {
|
|
3185
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no fake clicks fire`));
|
|
3186
|
+
} else if (!clicksOn) {
|
|
3187
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact_clicks' is not — set interact_clicks: true to enable element-targeted clicks; without it, only random content-zone clicks fire and may miss overlay-based popunders`));
|
|
3079
3188
|
}
|
|
3080
3189
|
console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
|
|
3081
3190
|
}
|
|
@@ -3101,133 +3210,200 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3101
3210
|
// setRequestInterception(true) — page.on('request') fires for every
|
|
3102
3211
|
// request regardless of interception state, and we don't need to
|
|
3103
3212
|
// block anything on popups.
|
|
3104
|
-
|
|
3105
|
-
|
|
3213
|
+
// Evaluate ANY URL surfaced from a popup (the popup's own navigation URL
|
|
3214
|
+
// OR an in-popup request) against the same filter pipeline the main-page
|
|
3215
|
+
// request handler uses. Factored out so:
|
|
3216
|
+
// 1. attachPopupRequestCapture's `popupPage.on('request', ...)` calls
|
|
3217
|
+
// this once per in-popup request (with the request's resourceType).
|
|
3218
|
+
// 2. onTargetCreated calls this once with `target.url()` and resourceType
|
|
3219
|
+
// 'document' BEFORE attaching the request listener — catches the
|
|
3220
|
+
// popup's navigation URL itself, which fires before our listener can
|
|
3221
|
+
// attach (targetcreated → page resolve → attach is async, and the
|
|
3222
|
+
// browser dispatches the navigation immediately on window.open).
|
|
3223
|
+
// Without #2, popunder destinations whose own URL contains the
|
|
3224
|
+
// filterRegex pattern (e.g. AdsCore campaign URLs with &campaign=)
|
|
3225
|
+
// were seen-but-not-evaluated.
|
|
3226
|
+
const evaluatePopupUrl = (checkedUrl, depth, resourceType) => {
|
|
3227
|
+
try {
|
|
3228
|
+
if (!checkedUrl || checkedUrl === 'about:blank') return;
|
|
3229
|
+
let fullSubdomain = '';
|
|
3230
|
+
let checkedRootDomain = '';
|
|
3106
3231
|
try {
|
|
3107
|
-
const
|
|
3108
|
-
|
|
3109
|
-
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
|
|
3113
|
-
|
|
3114
|
-
|
|
3115
|
-
|
|
3116
|
-
|
|
3117
|
-
|
|
3118
|
-
|
|
3119
|
-
|
|
3120
|
-
|
|
3121
|
-
|
|
3122
|
-
|
|
3123
|
-
|
|
3124
|
-
|
|
3125
|
-
if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3126
|
-
_dynamicallyIgnoredDomains.add(checkedRootDomain);
|
|
3127
|
-
if (forceDebug) {
|
|
3128
|
-
console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3129
|
-
}
|
|
3130
|
-
break;
|
|
3131
|
-
}
|
|
3132
|
-
}
|
|
3133
|
-
}
|
|
3134
|
-
|
|
3135
|
-
// blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
|
|
3136
|
-
// above; populating the dynamic block Set from popup URLs lets
|
|
3137
|
-
// tracker URLs surfaced via popup chains poison their root
|
|
3138
|
-
// domain for the rest of the scan just like main-page hits do.
|
|
3139
|
-
if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
|
|
3140
|
-
for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
|
|
3141
|
-
if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3142
|
-
_dynamicallyBlockedDomains.add(checkedRootDomain);
|
|
3143
|
-
if (forceDebug) {
|
|
3144
|
-
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3145
|
-
}
|
|
3146
|
-
break;
|
|
3232
|
+
const parsedUrl = new URL(checkedUrl);
|
|
3233
|
+
fullSubdomain = parsedUrl.hostname;
|
|
3234
|
+
const pslResult = psl.parse(fullSubdomain);
|
|
3235
|
+
checkedRootDomain = pslResult.domain || fullSubdomain;
|
|
3236
|
+
} catch (_) { return; }
|
|
3237
|
+
if (!checkedRootDomain) return;
|
|
3238
|
+
|
|
3239
|
+
// ignoreDomainsByUrl — if any pattern matches this popup URL,
|
|
3240
|
+
// mark the root domain as ignored for the rest of the scan
|
|
3241
|
+
// (main page + all popups). Mirrors the main handler so a
|
|
3242
|
+
// tracker URL surfaced via popup chain has the same dampening
|
|
3243
|
+
// effect as one surfaced on the main page.
|
|
3244
|
+
if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
|
|
3245
|
+
for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
|
|
3246
|
+
if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3247
|
+
_dynamicallyIgnoredDomains.add(checkedRootDomain);
|
|
3248
|
+
if (forceDebug) {
|
|
3249
|
+
console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3147
3250
|
}
|
|
3251
|
+
break;
|
|
3148
3252
|
}
|
|
3149
3253
|
}
|
|
3254
|
+
}
|
|
3150
3255
|
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
|
|
3159
|
-
// available here, so we just return rather than abort; the
|
|
3160
|
-
// popup-request observer treats this as "don't process".
|
|
3161
|
-
if (matchesDynamicBlock(checkedRootDomain)) return;
|
|
3162
|
-
|
|
3163
|
-
// First-party / third-party gate (popup belongs to the main URL's
|
|
3164
|
-
// domain group — its OWN URL doesn't redefine first-party).
|
|
3165
|
-
const isFirstParty = firstPartyDomains.has(checkedRootDomain);
|
|
3166
|
-
if (siteConfig.firstParty === false && isFirstParty) return;
|
|
3167
|
-
if (siteConfig.thirdParty === false && !isFirstParty) return;
|
|
3168
|
-
|
|
3169
|
-
// Regex match against the site's filterRegex list
|
|
3170
|
-
const resourceType = request.resourceType();
|
|
3171
|
-
let regexMatched = false;
|
|
3172
|
-
for (const re of regexes) {
|
|
3173
|
-
if (re.test(checkedUrl)) {
|
|
3174
|
-
regexMatched = true;
|
|
3256
|
+
// blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
|
|
3257
|
+
// above; populating the dynamic block Set from popup URLs lets
|
|
3258
|
+
// tracker URLs surfaced via popup chains poison their root
|
|
3259
|
+
// domain for the rest of the scan just like main-page hits do.
|
|
3260
|
+
if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
|
|
3261
|
+
for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
|
|
3262
|
+
if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3263
|
+
_dynamicallyBlockedDomains.add(checkedRootDomain);
|
|
3175
3264
|
if (forceDebug) {
|
|
3176
|
-
console.log(formatLogMessage('debug',
|
|
3265
|
+
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3177
3266
|
}
|
|
3178
3267
|
break;
|
|
3179
3268
|
}
|
|
3180
3269
|
}
|
|
3270
|
+
}
|
|
3181
3271
|
|
|
3182
|
-
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
|
|
3195
|
-
|
|
3196
|
-
|
|
3197
|
-
|
|
3198
|
-
|
|
3199
|
-
|
|
3200
|
-
|
|
3201
|
-
|
|
3202
|
-
|
|
3203
|
-
|
|
3204
|
-
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
ignoreDomains, matchesIgnoreDomain
|
|
3210
|
-
});
|
|
3211
|
-
trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
|
|
3212
|
-
} else {
|
|
3213
|
-
// No nettools required — regex match alone counts.
|
|
3214
|
-
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
|
|
3272
|
+
// ignoreDomains gate (global; matchesIgnoreDomain also short-
|
|
3273
|
+
// circuits on _dynamicallyIgnoredDomains, so a domain we just
|
|
3274
|
+
// added above will be caught here on the same request).
|
|
3275
|
+
if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
|
|
3276
|
+
|
|
3277
|
+
// Dynamic-block gate for popup requests — early return on
|
|
3278
|
+
// matched root or any parent (parent-walk in
|
|
3279
|
+
// matchesDynamicBlock). Popups don't have a request object
|
|
3280
|
+
// available here, so we just return rather than abort; the
|
|
3281
|
+
// popup-request observer treats this as "don't process".
|
|
3282
|
+
if (matchesDynamicBlock(checkedRootDomain)) return;
|
|
3283
|
+
|
|
3284
|
+
// First-party / third-party gate (popup belongs to the main URL's
|
|
3285
|
+
// domain group — its OWN URL doesn't redefine first-party).
|
|
3286
|
+
const isFirstParty = firstPartyDomains.has(checkedRootDomain);
|
|
3287
|
+
if (siteConfig.firstParty === false && isFirstParty) return;
|
|
3288
|
+
if (siteConfig.thirdParty === false && !isFirstParty) return;
|
|
3289
|
+
|
|
3290
|
+
// Regex match against the site's filterRegex list
|
|
3291
|
+
let regexMatched = false;
|
|
3292
|
+
for (const re of regexes) {
|
|
3293
|
+
if (re.test(checkedUrl)) {
|
|
3294
|
+
regexMatched = true;
|
|
3295
|
+
if (forceDebug) {
|
|
3296
|
+
console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
|
|
3297
|
+
}
|
|
3298
|
+
break;
|
|
3215
3299
|
}
|
|
3216
|
-
}
|
|
3300
|
+
}
|
|
3301
|
+
|
|
3302
|
+
if (!regexMatched) return;
|
|
3303
|
+
|
|
3304
|
+
// hasNetTools is the same flag the main handler uses (line ~2639).
|
|
3305
|
+
// When the site config carries whois/dig terms, regex match is
|
|
3306
|
+
// not sufficient by itself — the URL must ALSO pass the whois/
|
|
3307
|
+
// dig validation before it counts. Mirrors the main handler's
|
|
3308
|
+
// behavior so 'capture popup domains that match regex/dig/whois'
|
|
3309
|
+
// means the same thing for popups as for the main page.
|
|
3310
|
+
if (hasNetTools) {
|
|
3311
|
+
const popupNetToolsHandler = createNetToolsHandler({
|
|
3312
|
+
whoisTerms, whoisOrTerms,
|
|
3313
|
+
processedWhoisDomains: globalProcessedWhoisDomains,
|
|
3314
|
+
processedDigDomains: globalProcessedDigDomains,
|
|
3315
|
+
whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
|
|
3316
|
+
whoisServer,
|
|
3317
|
+
whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
|
|
3318
|
+
debugLogFile,
|
|
3319
|
+
digTerms, digOrTerms, digRecordType,
|
|
3320
|
+
digSubdomain: siteConfig.dig_subdomain === true,
|
|
3321
|
+
dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
|
|
3322
|
+
matchedDomains, addMatchedDomain,
|
|
3323
|
+
isDomainAlreadyDetected: isLocallyDetected,
|
|
3324
|
+
onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
|
|
3325
|
+
onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
|
|
3326
|
+
cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
|
|
3327
|
+
cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
|
|
3328
|
+
currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
|
|
3329
|
+
ignoreDomains, matchesIgnoreDomain
|
|
3330
|
+
});
|
|
3331
|
+
trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
|
|
3332
|
+
} else {
|
|
3333
|
+
// No nettools required — regex match alone counts.
|
|
3334
|
+
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
|
|
3335
|
+
}
|
|
3336
|
+
} catch (_) { /* observation-only — never let a popup error escape */ }
|
|
3337
|
+
};
|
|
3338
|
+
|
|
3339
|
+
// Thin wrapper around evaluatePopupUrl for the per-request listener.
|
|
3340
|
+
// Under forceDebug also attach framenavigated + close listeners so
|
|
3341
|
+
// the popup's full lifecycle (initial nav URL, mid-popup navigations,
|
|
3342
|
+
// close) is visible in logs. Useful when investigating "I saw a
|
|
3343
|
+
// Chrome window flash on screen" — the framenavigated transitions
|
|
3344
|
+
// tell you what URL the window was showing and for how long.
|
|
3345
|
+
// Attach listeners to a popup page.
//   popupPage - page-like object exposing on(); the debug listeners also use
//               mainFrame() and url().
//   depth     - popup depth; forwarded to evaluatePopupUrl and echoed in the
//               debug log lines.
// The 'request' listener is always attached. The framenavigated / close /
// pageerror listeners are only attached when forceDebug is set; they log and
// do nothing else.
const attachPopupRequestCapture = (popupPage, depth) => {
  // Forward every request the popup issues to the popup URL evaluation.
  popupPage.on('request', (req) => {
    evaluatePopupUrl(req.url(), depth, req.resourceType());
  });

  if (!forceDebug) return;

  // Main-frame navigations only; sub-frame navigations are not logged.
  const logFrameNavigated = (frame) => {
    try {
      if (frame !== popupPage.mainFrame()) return; // main frame only
      console.log(formatLogMessage('debug', `[popup depth=${depth}] framenavigated → ${frame.url() || 'about:blank'}`));
    } catch (_) {}
  };

  // Logs the last URL the popup reported when it closes.
  const logClose = () => {
    try {
      let lastUrl = '(unknown)';
      if (popupPage.url) {
        lastUrl = popupPage.url();
      }
      console.log(formatLogMessage('debug', `[popup depth=${depth}] close (last URL: ${lastUrl})`));
    } catch (_) {}
  };

  // Logs uncaught page errors raised inside the popup.
  const logPageError = (err) => {
    try {
      console.log(formatLogMessage('debug', `[popup depth=${depth}] pageerror: ${err.message}`));
    } catch (_) {}
  };

  try {
    popupPage.on('framenavigated', logFrameNavigated);
    popupPage.on('close', logClose);
    popupPage.on('pageerror', logPageError);
  } catch (_) { /* listener attach errors aren't fatal */ }
};
|
|
3219
3369
|
|
|
3220
3370
|
const onTargetCreated = async (target) => {
|
|
3371
|
+
// Log EVERY targetcreated event under forceDebug so callers can see
|
|
3372
|
+
// the full set of targets Chromium creates during the scan — not
|
|
3373
|
+
// just the ones we capture. Useful when investigating "is that
|
|
3374
|
+
// Chrome window I saw from a popup or from somewhere else?" — if
|
|
3375
|
+
// a window opens but no targetcreated fires, it's not ours. If a
|
|
3376
|
+
// targetcreated fires for type=page but we skip-and-explain below,
|
|
3377
|
+
// the user knows why we ignored it. Captures the FULL diagnostic
|
|
3378
|
+
// surface, no behavior change.
|
|
3379
|
+
let _tType, _tUrl;
|
|
3380
|
+
if (forceDebug) {
|
|
3381
|
+
try {
|
|
3382
|
+
_tType = target.type();
|
|
3383
|
+
_tUrl = target.url() || 'about:blank';
|
|
3384
|
+
console.log(formatLogMessage('debug', `[popup] targetcreated: type=${_tType} url=${_tUrl}`));
|
|
3385
|
+
} catch (_) {}
|
|
3386
|
+
}
|
|
3387
|
+
|
|
3221
3388
|
// Short-circuit guard: if finally has already started, don't attach
|
|
3222
3389
|
// a request listener whose closure would outlive its meaningful
|
|
3223
3390
|
// scope. The race is narrow (a targetcreated firing while we're
|
|
3224
3391
|
// mid-await on target.page() across the finally boundary), but
|
|
3225
3392
|
// without this guard a late popup could push matches into
|
|
3226
3393
|
// matchedDomains for a URL whose processing has already returned.
|
|
3227
|
-
if (urlFinished)
|
|
3228
|
-
|
|
3394
|
+
if (urlFinished) {
|
|
3395
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: urlFinished=true (scan teardown in progress)`));
|
|
3396
|
+
return;
|
|
3397
|
+
}
|
|
3398
|
+
if (target.type() !== 'page') {
|
|
3399
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: non-page target type=${target.type()} (workers/service-workers/etc are not popunder candidates)`));
|
|
3400
|
+
return;
|
|
3401
|
+
}
|
|
3229
3402
|
const depth = getPopupDepth(target);
|
|
3230
|
-
if (depth < 1)
|
|
3403
|
+
if (depth < 1) {
|
|
3404
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: depth=0 — target not in opener chain of main page (likely a new browser tab opened independently, not a popunder from our scan)`));
|
|
3405
|
+
return; // Not one of ours
|
|
3406
|
+
}
|
|
3231
3407
|
if (depth > POPUP_MAX_DEPTH) {
|
|
3232
3408
|
if (forceDebug) {
|
|
3233
3409
|
console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
|
|
@@ -3237,7 +3413,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3237
3413
|
|
|
3238
3414
|
let popupPage;
|
|
3239
3415
|
try { popupPage = await target.page(); } catch (_) { return; }
|
|
3240
|
-
if (!popupPage)
|
|
3416
|
+
if (!popupPage) {
|
|
3417
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] target.page() returned null — popup not accessible as a Page object`));
|
|
3418
|
+
return;
|
|
3419
|
+
}
|
|
3241
3420
|
// Re-check after the await — the per-URL finally may have flipped
|
|
3242
3421
|
// the flag while target.page() was resolving.
|
|
3243
3422
|
if (urlFinished) {
|
|
@@ -3247,8 +3426,31 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3247
3426
|
|
|
3248
3427
|
if (forceDebug) {
|
|
3249
3428
|
console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
|
|
3429
|
+
// Window dimensions are useful for the "is the popup visible on
|
|
3430
|
+
// my screen?" question — a popup with non-zero viewport in a
|
|
3431
|
+
// headless=new launch shouldn't be visible but on some display
|
|
3432
|
+
// servers (WSLg, X11) it can briefly flash on screen. Log the
|
|
3433
|
+
// viewport so callers can correlate with what they saw.
|
|
3434
|
+
try {
|
|
3435
|
+
const vp = popupPage.viewport();
|
|
3436
|
+
if (vp) console.log(formatLogMessage('debug', `[popup depth=${depth}] viewport: ${vp.width}x${vp.height}`));
|
|
3437
|
+
} catch (_) {}
|
|
3250
3438
|
}
|
|
3251
3439
|
|
|
3440
|
+
// Evaluate the popup's own navigation URL against the same filter
|
|
3441
|
+
// pipeline used for in-popup requests. Required because targetcreated
|
|
3442
|
+
// → target.page() → on('request', ...) is async, and the browser
|
|
3443
|
+
// dispatches the popup's navigation request immediately on window.open
|
|
3444
|
+
// — by the time the listener attaches below, the navigation request
|
|
3445
|
+
// has already fired and won't be re-emitted. resourceType 'document'
|
|
3446
|
+
// mirrors what Chrome would emit for a top-level navigation request.
|
|
3447
|
+
// Without this call, AdsCore-style popunder destinations (URL contains
|
|
3448
|
+
// &campaign=, &v=, etc) were seen-but-not-evaluated: the popup was
|
|
3449
|
+
// logged but its domain never matched the filter regex, so it never
|
|
3450
|
+
// became a rule. Only secondary in-popup requests (tracking pixels,
|
|
3451
|
+
// sub-resources) ever got tested against the regex.
|
|
3452
|
+
evaluatePopupUrl(target.url(), depth, 'document');
|
|
3453
|
+
|
|
3252
3454
|
attachPopupRequestCapture(popupPage, depth);
|
|
3253
3455
|
|
|
3254
3456
|
// Auto-close after the capture window so popups don't pile up.
|
|
@@ -4322,7 +4524,26 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4322
4524
|
|
|
4323
4525
|
// Mark page as processing during interactions
|
|
4324
4526
|
updatePageUsage(page, true);
|
|
4325
|
-
|
|
4527
|
+
// Work-aware ceiling (scales with click count / realistic_click /
|
|
4528
|
+
// intensity) instead of a flat 15s, which truncated high-click
|
|
4529
|
+
// popunder configs mid-pass. Single source of truth shared with
|
|
4530
|
+
// interaction.js's own internal hard cap so the two can't disagree.
|
|
4531
|
+
const INTERACTION_HARD_TIMEOUT = computeInteractionCeilingMs(interactionConfig);
|
|
4532
|
+
|
|
4533
|
+
// Capture-and-clear timer wrapper — same fix as cdp.js (0772ccd) and
|
|
4534
|
+
// the per-URL grace (577ad66). The 3 inline Promise.race patterns
|
|
4535
|
+
// below previously used `new Promise((_, reject) => setTimeout(...))`
|
|
4536
|
+
// without capturing the timer ID, leaking the 15s timer + closure on
|
|
4537
|
+
// reject every time interaction completed inside the cap (the common
|
|
4538
|
+
// case). Centralizing avoids the same mistake recurring across the
|
|
4539
|
+
// ghost-cursor / fallback / standard branches.
|
|
4540
|
+
// Race `promise` against a hard timeout and always clear the timer afterwards.
//   promise - the work to wait for.
//   msg     - message of the Error used to reject when the timeout fires first.
//   ms      - optional timeout in milliseconds; defaults to
//             INTERACTION_HARD_TIMEOUT, so existing two-argument callers
//             behave exactly as before.
// Returns a promise that settles like `promise` if it settles first, otherwise
// rejects with Error(msg). The timer is cleared in finally() on every outcome,
// so no pending setTimeout (or its closure over `reject`) outlives the race.
const raceWithTimer = (promise, msg, ms = INTERACTION_HARD_TIMEOUT) => {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(msg)), ms);
  });
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
};
|
|
4326
4547
|
|
|
4327
4548
|
// Check if ghost-cursor mode is enabled for this site
|
|
4328
4549
|
const ghostConfig = resolveGhostCursorConfig(siteConfig, globalGhostCursor, forceDebug);
|
|
@@ -4333,60 +4554,51 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4333
4554
|
if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
|
|
4334
4555
|
const cursor = createGhostCursor(page, { forceDebug });
|
|
4335
4556
|
if (cursor) {
|
|
4336
|
-
await
|
|
4337
|
-
|
|
4338
|
-
|
|
4339
|
-
|
|
4340
|
-
|
|
4341
|
-
|
|
4342
|
-
|
|
4343
|
-
|
|
4344
|
-
|
|
4345
|
-
|
|
4346
|
-
|
|
4347
|
-
|
|
4348
|
-
|
|
4349
|
-
|
|
4350
|
-
|
|
4351
|
-
|
|
4352
|
-
|
|
4353
|
-
await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
|
|
4354
|
-
}
|
|
4355
|
-
}
|
|
4356
|
-
if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
|
|
4357
|
-
await ghostRandomMove(cursor, { forceDebug });
|
|
4557
|
+
await raceWithTimer((async () => {
|
|
4558
|
+
const viewport = page.viewport() || { width: 1200, height: 800 };
|
|
4559
|
+
const ghostDuration = ghostConfig.duration || 2000;
|
|
4560
|
+
const ghostStart = Date.now();
|
|
4561
|
+
const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
|
|
4562
|
+
|
|
4563
|
+
// Time-based Bezier mouse movements — runs for ghostDuration ms
|
|
4564
|
+
while (ghostTimeLeft() > 200) {
|
|
4565
|
+
const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
|
|
4566
|
+
const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
|
|
4567
|
+
await ghostMove(cursor, toX, toY, {
|
|
4568
|
+
moveSpeed: ghostConfig.moveSpeed,
|
|
4569
|
+
overshootThreshold: ghostConfig.overshootThreshold,
|
|
4570
|
+
forceDebug
|
|
4571
|
+
});
|
|
4572
|
+
if (ghostTimeLeft() > 100) {
|
|
4573
|
+
await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
|
|
4358
4574
|
}
|
|
4359
|
-
|
|
4360
|
-
|
|
4361
|
-
|
|
4362
|
-
|
|
4363
|
-
|
|
4364
|
-
|
|
4365
|
-
|
|
4366
|
-
}
|
|
4367
|
-
|
|
4368
|
-
|
|
4369
|
-
|
|
4370
|
-
|
|
4371
|
-
|
|
4372
|
-
|
|
4373
|
-
|
|
4374
|
-
|
|
4375
|
-
|
|
4376
|
-
|
|
4575
|
+
}
|
|
4576
|
+
if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
|
|
4577
|
+
await ghostRandomMove(cursor, { forceDebug });
|
|
4578
|
+
}
|
|
4579
|
+
if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
|
|
4580
|
+
const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
|
|
4581
|
+
const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
|
|
4582
|
+
await ghostClick(cursor, { x: clickX, y: clickY }, {
|
|
4583
|
+
hesitate: ghostConfig.hesitate,
|
|
4584
|
+
forceDebug
|
|
4585
|
+
});
|
|
4586
|
+
}
|
|
4587
|
+
if (interactionConfig.includeScrolling) {
|
|
4588
|
+
await performPageInteraction(page, currentUrl, {
|
|
4589
|
+
...interactionConfig,
|
|
4590
|
+
mouseMovements: 0,
|
|
4591
|
+
includeElementClicks: false
|
|
4592
|
+
}, forceDebug);
|
|
4593
|
+
}
|
|
4594
|
+
})(), 'ghost-cursor interaction hard timeout');
|
|
4377
4595
|
} else {
|
|
4378
4596
|
if (forceDebug) console.log(formatLogMessage('debug', '[ghost-cursor] Falling back to built-in mouse'));
|
|
4379
|
-
await
|
|
4380
|
-
performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
|
|
4381
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
|
|
4382
|
-
]);
|
|
4597
|
+
await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
|
|
4383
4598
|
}
|
|
4384
4599
|
} else {
|
|
4385
4600
|
// Standard built-in mouse interaction
|
|
4386
|
-
await
|
|
4387
|
-
performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
|
|
4388
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
|
|
4389
|
-
]);
|
|
4601
|
+
await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
|
|
4390
4602
|
}
|
|
4391
4603
|
} catch (interactTimeoutErr) {
|
|
4392
4604
|
if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
|
|
@@ -4521,8 +4733,16 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4521
4733
|
|
|
4522
4734
|
if (siteConfig.clear_sitedata === true) {
|
|
4523
4735
|
try {
|
|
4524
|
-
|
|
4525
|
-
|
|
4736
|
+
// Default reload clear is quick mode (cookies + cache +
|
|
4737
|
+
// localStorage + sessionStorage — the storage layers where
|
|
4738
|
+
// session-cap tracking typically lives). Sites that put their
|
|
4739
|
+
// session cap in IndexedDB / WebSQL / service workers can opt
|
|
4740
|
+
// into a full clear-per-reload via clear_sitedata_full_on_reload.
|
|
4741
|
+
// Costs ~100-500ms extra per reload and may unregister a
|
|
4742
|
+
// service worker the page depends on; off by default.
|
|
4743
|
+
const fullOnReload = siteConfig.clear_sitedata_full_on_reload === true;
|
|
4744
|
+
const clearResult = await clearSiteData(page, currentUrl, forceDebug, !fullOnReload);
|
|
4745
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data (${fullOnReload ? 'full' : 'quick'}) before reload #${i} for ${currentUrl}`));
|
|
4526
4746
|
} catch (reloadClearErr) {
|
|
4527
4747
|
if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
|
|
4528
4748
|
}
|
|
@@ -4536,20 +4756,26 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4536
4756
|
if (useForceReload && !reloadSuccess && !skipForceReload) {
|
|
4537
4757
|
// Attempt force reload: disable cache, reload, re-enable cache
|
|
4538
4758
|
try {
|
|
4759
|
+
// Local race-with-timer helper — capture-and-clear pattern from
|
|
4760
|
+
// cdp.js / interact (6ad36e7). Without this, every successful
|
|
4761
|
+
// setCacheEnabled() left an 8s setTimeout running with closure
|
|
4762
|
+
// on `reject` (2 leaks per reload cycle × N reload cycles).
|
|
4763
|
+
// Settle with `promise` unless `ms` milliseconds elapse first, in which case
// reject with Error(msg). The timer handle is captured so it can be cleared
// once the race settles, whichever side wins.
const raceWithTimer = (promise, msg, ms) => {
  let timerId;
  const deadline = new Promise((_resolve, reject) => {
    timerId = setTimeout(() => {
      reject(new Error(msg));
    }, ms);
  });
  const contenders = [promise, deadline];
  return Promise.race(contenders).finally(() => {
    clearTimeout(timerId);
  });
};
|
|
4770
|
+
|
|
4539
4771
|
// Timeout-protected cache disable
|
|
4540
|
-
await
|
|
4541
|
-
|
|
4542
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('Cache disable timeout')), 8000))
|
|
4543
|
-
]);
|
|
4544
|
-
|
|
4772
|
+
await raceWithTimer(page.setCacheEnabled(false), 'Cache disable timeout', 8000);
|
|
4773
|
+
|
|
4545
4774
|
// Use networkidle2 for force reload to better detect when page is actually loaded
|
|
4546
4775
|
await page.reload({ waitUntil: 'networkidle2', timeout: Math.min(timeout, 15000) });
|
|
4547
|
-
|
|
4776
|
+
|
|
4548
4777
|
// Timeout-protected cache enable
|
|
4549
|
-
await
|
|
4550
|
-
page.setCacheEnabled(true),
|
|
4551
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('Cache enable timeout')), 8000))
|
|
4552
|
-
]);
|
|
4778
|
+
await raceWithTimer(page.setCacheEnabled(true), 'Cache enable timeout', 8000);
|
|
4553
4779
|
|
|
4554
4780
|
reloadSuccess = true;
|
|
4555
4781
|
if (forceDebug) console.log(formatLogMessage('debug', `Force reload #${i} completed for ${currentUrl}`));
|
|
@@ -4644,8 +4870,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4644
4870
|
const endY = 200 + Math.floor(Math.random() * (vp.height - 400));
|
|
4645
4871
|
await humanLikeMouseMove(page, startX, startY, endX, endY, { steps: 3, curve: 0.04, jitter: 1 });
|
|
4646
4872
|
}
|
|
4647
|
-
// Content clicks to trigger document-level onclick handlers
|
|
4648
|
-
|
|
4873
|
+
// Content clicks to trigger document-level onclick handlers.
|
|
4874
|
+
// Honor siteConfig.interact_click_count so popunder-discovery configs
|
|
4875
|
+
// get the same click volume on every reload, not just the initial load.
|
|
4876
|
+
// Omit `clicks` when no override is set so performContentClicks uses
|
|
4877
|
+
// its CONTENT_CLICK.CLICK_COUNT default (single source of truth).
|
|
4878
|
+
// realistic forwards siteConfig.realistic_click; always passed
|
|
4879
|
+
// (defaults to false) so realistic mode applies to every reload's
|
|
4880
|
+
// clicks, not just the initial pass.
|
|
4881
|
+
const postReloadClickOpts = {
|
|
4882
|
+
preDelay: 200,
|
|
4883
|
+
forceDebug,
|
|
4884
|
+
realistic: !!interactionConfig.realistic
|
|
4885
|
+
};
|
|
4886
|
+
if (interactionConfig.clickCount) postReloadClickOpts.clicks = interactionConfig.clickCount;
|
|
4887
|
+
await performContentClicks(page, postReloadClickOpts);
|
|
4649
4888
|
if (forceDebug) console.log(formatLogMessage('debug', `Post-reload interaction completed for reload #${i}`));
|
|
4650
4889
|
} catch (postReloadInteractErr) {
|
|
4651
4890
|
// Non-critical — continue with remaining reloads
|
|
@@ -4870,9 +5109,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4870
5109
|
}
|
|
4871
5110
|
}
|
|
4872
5111
|
|
|
4873
|
-
// Temporarily store the pLimit function
|
|
5112
|
+
// Temporarily store the pLimit function
|
|
4874
5113
|
const originalLimit = limit;
|
|
4875
5114
|
|
|
5115
|
+
// Per-site config normalization (always runs, not gated on --validate-config).
|
|
5116
|
+
// Catches typo'd keys (whois_terms vs whois) with "did you mean" suggestions
|
|
5117
|
+
// and coerces boolean-like values (interact: 1 → interact: true) before any
|
|
5118
|
+
// downstream strict-equality check silently treats them as disabled. Mutates
|
|
5119
|
+
// each site in place so the rest of the scan sees normalized values.
|
|
5120
|
+
// Reports via console.warn so messages surface even when --silent is set.
|
|
5121
|
+
for (let i = 0; i < sites.length; i++) {
|
|
5122
|
+
const { warnings, errors } = normalizeSiteConfig(sites[i], i);
|
|
5123
|
+
for (const e of errors) console.warn(messageColors.error('⚠ ' + e));
|
|
5124
|
+
for (const w of warnings) console.warn(messageColors.warn('⚠ [config] ' + w));
|
|
5125
|
+
}
|
|
5126
|
+
|
|
4876
5127
|
// V8 Optimization: Calculate total URLs first to pre-allocate array
|
|
4877
5128
|
let totalUrls = 0;
|
|
4878
5129
|
for (const site of sites) {
|
|
@@ -4890,7 +5141,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4890
5141
|
for (const url of urlsToProcess) {
|
|
4891
5142
|
allTasks[taskIndex++] = {
|
|
4892
5143
|
url,
|
|
4893
|
-
|
|
5144
|
+
// Default userAgent to 'chrome' when a site doesn't set one. Without
|
|
5145
|
+
// it the browser sends its bundled default UA, which literally
|
|
5146
|
+
// contains "HeadlessChrome" (verified, both headless modes) — an
|
|
5147
|
+
// instant automation tell. Defaulting here (rather than at launch)
|
|
5148
|
+
// activates the whole coherent path, since UA-string spoofing, the
|
|
5149
|
+
// navigator/webdriver/plugins/userAgentData JS masking, the Sec-CH-UA
|
|
5150
|
+
// request headers, and the curl content-fetch UA all gate on
|
|
5151
|
+
// config.userAgent. Placing 'chrome' BEFORE the spread means an
|
|
5152
|
+
// explicit site value wins — including userAgent:false / null to opt
|
|
5153
|
+
// out and scan with the raw headless UA.
|
|
5154
|
+
config: { userAgent: 'chrome', ...site, _originalUrl: url },
|
|
4894
5155
|
taskId: taskIndex - 1 // For tracking
|
|
4895
5156
|
};
|
|
4896
5157
|
}
|
|
@@ -4923,7 +5184,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4923
5184
|
let urlsSinceLastCleanup = 0;
|
|
4924
5185
|
|
|
4925
5186
|
if (!silentMode && totalUrls > 0) {
|
|
4926
|
-
console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${
|
|
5187
|
+
console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${effectiveConcurrency}...`);
|
|
4927
5188
|
if (totalUrls > RESOURCE_CLEANUP_INTERVAL) {
|
|
4928
5189
|
console.log(messageColors.processing('Browser will restart every') + ` ~${RESOURCE_CLEANUP_INTERVAL} URLs to free resources`);
|
|
4929
5190
|
}
|
|
@@ -5044,10 +5305,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5044
5305
|
silentMode
|
|
5045
5306
|
});
|
|
5046
5307
|
healthPromise.catch(() => {});
|
|
5047
|
-
|
|
5048
|
-
|
|
5049
|
-
|
|
5050
|
-
|
|
5308
|
+
// Capture-and-clear timer pattern (cdp.js 0772ccd, interact 6ad36e7) —
|
|
5309
|
+
// when healthPromise wins the race, the inline setTimeout would
|
|
5310
|
+
// otherwise hold reject's closure for the full 30s grace window.
|
|
5311
|
+
let healthTimer;
|
|
5312
|
+
try {
|
|
5313
|
+
healthCheck = await Promise.race([
|
|
5314
|
+
healthPromise,
|
|
5315
|
+
new Promise((_, reject) => { healthTimer = setTimeout(() => reject(new Error('Health check timeout')), 30000); })
|
|
5316
|
+
]);
|
|
5317
|
+
} finally {
|
|
5318
|
+
if (healthTimer) clearTimeout(healthTimer);
|
|
5319
|
+
}
|
|
5051
5320
|
} catch (healthError) {
|
|
5052
5321
|
console.log(formatLogMessage('warn', `[HEALTH CHECK] Timeout, assuming restart needed`));
|
|
5053
5322
|
healthCheck = { shouldRestart: true, reason: 'Health check timeout' };
|
|
@@ -5312,26 +5581,94 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5312
5581
|
} catch {}
|
|
5313
5582
|
|
|
5314
5583
|
// Per-URL timeout so a single hung processUrl can't block the batch
|
|
5315
|
-
// forever.
|
|
5316
|
-
// (
|
|
5317
|
-
//
|
|
5318
|
-
//
|
|
5319
|
-
//
|
|
5320
|
-
|
|
5584
|
+
// forever. Scaled from siteConfig.timeout + (delay + interaction) ×
|
|
5585
|
+
// (1 + reload) + 30s headroom, with a 75s floor.
|
|
5586
|
+
//
|
|
5587
|
+
// The (1 + reload) multiplier was missing from the previous formula
|
|
5588
|
+
// (13dd4fa) — `reload: 4` configs perform 5 total cycles (initial +
|
|
5589
|
+
// 4 reloads), each with its own delay + interaction overhead, so the
|
|
5590
|
+
// 80s ceiling for the user's lean config (timeout:35000, delay:15000,
|
|
5591
|
+
// reload:4) fired DURING the 3rd reload while the orphan still had
|
|
5592
|
+
// 2 more cycles + drain to go — far longer than the 8s grace could
|
|
5593
|
+
// bridge. Multiplying by cycle count brings the ceiling above the
|
|
5594
|
+
// legitimate work envelope.
|
|
5595
|
+
const reloadCount = task.config.reload || 0;
|
|
5596
|
+
// Interaction overhead per cycle must match interaction.js's actual
|
|
5597
|
+
// ceiling, which is now work-aware (high interact_click_count /
|
|
5598
|
+
// realistic_click configs legitimately run far longer than the old flat
|
|
5599
|
+
// 15s). Compute the same value here so the per-URL ceiling stays above
|
|
5600
|
+
// the real interaction envelope and can't fire mid-pass. Zero when
|
|
5601
|
+
// interaction is disabled for this task (no interaction cost to budget).
|
|
5602
|
+
const interactionOnForTask = task.config.interact === true && !disableInteract;
|
|
5603
|
+
const INTERACTION_OVERHEAD_MS = interactionOnForTask
|
|
5604
|
+
? computeInteractionCeilingMs(createInteractionConfig(task.url, task.config))
|
|
5605
|
+
: 0;
|
|
5606
|
+
const PER_URL_TIMEOUT_MS = Math.max(
|
|
5607
|
+
75000,
|
|
5608
|
+
(task.config.timeout || 35000)
|
|
5609
|
+
+ ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
|
|
5610
|
+
+ 30000
|
|
5611
|
+
);
|
|
5612
|
+
// Grace period after primary timeout — gives the orphan a chance to
|
|
5613
|
+
// finish drainPendingNetTools() and emit "Saving N rules despite page
|
|
5614
|
+
// load failure" before we abandon its result. Drain typically completes
|
|
5615
|
+
// in <1s with cached nettools; 8s is the safety ceiling.
|
|
5616
|
+
const PER_URL_GRACE_MS = 8000;
|
|
5617
|
+
const PER_URL_TIMEOUT_MARKER = 'PER_URL_TIMEOUT_FIRED';
|
|
5618
|
+
|
|
5321
5619
|
const processUrlPromise = processUrl(task.url, task.config, browser);
|
|
5322
5620
|
let perUrlTimer;
|
|
5323
5621
|
try {
|
|
5324
5622
|
return await Promise.race([
|
|
5325
5623
|
processUrlPromise,
|
|
5326
5624
|
new Promise((_, reject) => {
|
|
5327
|
-
perUrlTimer = setTimeout(() =>
|
|
5625
|
+
perUrlTimer = setTimeout(() => {
|
|
5626
|
+
const e = new Error(`Per-URL timeout (${Math.round(PER_URL_TIMEOUT_MS / 1000)}s)`);
|
|
5627
|
+
e.code = PER_URL_TIMEOUT_MARKER;
|
|
5628
|
+
reject(e);
|
|
5629
|
+
}, PER_URL_TIMEOUT_MS);
|
|
5328
5630
|
})
|
|
5329
5631
|
]);
|
|
5330
5632
|
} catch (err) {
|
|
5331
|
-
if (err && err.
|
|
5332
|
-
processUrlPromise.catch(() => {});
|
|
5633
|
+
if (err && err.code === PER_URL_TIMEOUT_MARKER) {
|
|
5333
5634
|
forceRestartFlag = true;
|
|
5334
|
-
|
|
5635
|
+
// Log the timeout fire — was invisible before; only ended up in the
|
|
5636
|
+
// returned result.error field which is never printed. Makes
|
|
5637
|
+
// ceiling-tuning regressions visible without source-reading.
|
|
5638
|
+
if (forceDebug) {
|
|
5639
|
+
console.log(formatLogMessage('warn', `${err.message} for ${task.url} — orphan in ${PER_URL_GRACE_MS / 1000}s grace`));
|
|
5640
|
+
}
|
|
5641
|
+
// Grace period — wait briefly for the orphan to drain + recover
|
|
5642
|
+
// partial matches. Browser is still in a bad state (we hit the
|
|
5643
|
+
// primary ceiling) so the restart still fires either way; only the
|
|
5644
|
+
// rules payload differs.
|
|
5645
|
+
let graceTimer;
|
|
5646
|
+
try {
|
|
5647
|
+
const graceResult = await Promise.race([
|
|
5648
|
+
processUrlPromise,
|
|
5649
|
+
new Promise((_, reject) => {
|
|
5650
|
+
// Capture the timer ID so the finally can clear it when the
|
|
5651
|
+
// orphan wins the race — otherwise the setTimeout keeps the
|
|
5652
|
+
// event loop ref + closure on `reject` alive for the full
|
|
5653
|
+
// grace window, even though the race already settled.
|
|
5654
|
+
// Same leak pattern fixed in cdp.js (0772ccd) and
|
|
5655
|
+
// clear_sitedata (780b443).
|
|
5656
|
+
graceTimer = setTimeout(() => reject(new Error('Grace timeout')), PER_URL_GRACE_MS);
|
|
5657
|
+
})
|
|
5658
|
+
]);
|
|
5659
|
+
if (forceDebug) {
|
|
5660
|
+
console.log(formatLogMessage('debug', `Grace recovered ${(graceResult && graceResult.rules ? graceResult.rules.length : 0)} rules for ${task.url}`));
|
|
5661
|
+
}
|
|
5662
|
+
return { ...graceResult, needsImmediateRestart: true };
|
|
5663
|
+
} catch (_) {
|
|
5664
|
+
if (forceDebug) {
|
|
5665
|
+
console.log(formatLogMessage('warn', `Grace timed out for ${task.url} — discarding orphan`));
|
|
5666
|
+
}
|
|
5667
|
+
processUrlPromise.catch(() => {});
|
|
5668
|
+
return { url: task.url, rules: [], success: false, error: err.message, needsImmediateRestart: true };
|
|
5669
|
+
} finally {
|
|
5670
|
+
if (graceTimer) clearTimeout(graceTimer);
|
|
5671
|
+
}
|
|
5335
5672
|
}
|
|
5336
5673
|
throw err;
|
|
5337
5674
|
} finally {
|