@fanboynz/network-scanner 3.0.2 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/lib/adblock-rust.js +17 -4
- package/lib/adblock.js +92 -15
- package/lib/browserhealth.js +57 -28
- package/lib/cdp.js +68 -34
- package/lib/clear_sitedata.js +68 -20
- package/lib/compress.js +26 -58
- package/lib/curl.js +44 -22
- package/lib/domain-cache.js +8 -57
- package/lib/dry-run.js +9 -4
- package/lib/fingerprint.js +735 -114
- package/lib/interaction.js +262 -26
- package/lib/nettools.js +47 -76
- package/lib/openvpn_vpn.js +116 -35
- package/lib/searchstring.js +15 -237
- package/lib/validate_rules.js +285 -3
- package/lib/wireguard_vpn.js +64 -12
- package/nwss.js +529 -217
- package/package.json +1 -1
- package/regex-tool/index.html +321 -628
- package/scripts/test-stealth.js +39 -13
package/nwss.js
CHANGED
|
@@ -12,13 +12,13 @@ const path = require('path');
|
|
|
12
12
|
const dnsPromises = require('node:dns/promises');
|
|
13
13
|
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
14
14
|
const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
|
|
15
|
-
const { parseSearchStrings, createResponseHandler
|
|
16
|
-
const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
|
|
15
|
+
const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
|
|
16
|
+
const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS } = require('./lib/fingerprint');
|
|
17
17
|
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
|
|
18
18
|
// Curl functionality (replace searchstring curl handler)
|
|
19
19
|
const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
|
|
20
20
|
// Rule validation
|
|
21
|
-
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
|
|
21
|
+
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile, normalizeSiteConfig } = require('./lib/validate_rules');
|
|
22
22
|
// CF Bypass
|
|
23
23
|
const {
|
|
24
24
|
handleCloudflareProtection,
|
|
@@ -66,7 +66,7 @@ const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
|
|
|
66
66
|
// log lines (start/completed). Same cyan as the other monitoring tags.
|
|
67
67
|
const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
68
68
|
// Enhanced mouse interaction and page simulation
|
|
69
|
-
const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
69
|
+
const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
70
70
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
71
71
|
const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
|
|
72
72
|
// Domain detection cache for performance optimization
|
|
@@ -129,15 +129,12 @@ const CONCURRENCY_LIMITS = Object.freeze({
|
|
|
129
129
|
});
|
|
130
130
|
|
|
131
131
|
// V8 Optimization: user agent lookups go through a Map (USER_AGENT_COLLECTIONS) instead of an object
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
['firefox_linux', "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0"],
|
|
139
|
-
['safari', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"]
|
|
140
|
-
]));
|
|
132
|
+
// User-Agent strings come from the single source of truth in lib/fingerprint
|
|
133
|
+
// (USER_AGENT_COLLECTIONS, imported above) — the same map page.setUserAgent
|
|
134
|
+
// applies to the browser. The previous local duplicate had silently drifted
|
|
135
|
+
// (Chrome 146 vs the browser's 148, Firefox 148 vs 151, Safari 18.6 vs 19.5),
|
|
136
|
+
// so curl content-fetches advertised a different browser than the page did.
|
|
137
|
+
// Keep using the imported map directly so the two can never diverge again.
|
|
141
138
|
|
|
142
139
|
const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
|
|
143
140
|
|
|
@@ -776,13 +773,14 @@ Redirect Handling Options:
|
|
|
776
773
|
resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
|
|
777
774
|
interact: true/false Simulate mouse movements/clicks
|
|
778
775
|
isBrave: true/false Spoof Brave browser detection
|
|
779
|
-
userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari"
|
|
776
|
+
userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Desktop User-Agent (defaults to "chrome" if unset; set false to scan with the raw headless UA)
|
|
780
777
|
interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium)
|
|
781
778
|
delay: <milliseconds> Delay after load (default: 6000, capped at 2000ms unless delay_uncapped: true)
|
|
782
779
|
delay_uncapped: true/false Honor 'delay' up to half the per-URL timeout instead of the 2s default cap. Use for sites with setTimeout-deferred lazy ad/tracker loaders that fire well past the standard post-networkidle window
|
|
783
780
|
reload: <number> Reload page n times after load (default: 1)
|
|
784
781
|
forcereload: true/false or ["domain1.com", "domain2.com"] Force cache-clearing reload for all URLs or specific domains
|
|
785
782
|
clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
|
|
783
|
+
clear_sitedata_full_on_reload: true/false With clear_sitedata: true, also clear heavy storage (IndexedDB, WebSQL, service workers) between reloads — quick mode (cookies+cache+local/session storage) is the default for reloads; this flag promotes them to full clears at ~100-500ms latency cost per reload. Use for sites with IndexedDB/service-worker-backed session caps. Off by default.
|
|
786
784
|
subDomains: 1/0 Output full subdomains (default: 0)
|
|
787
785
|
localhost: true/false Force localhost output (127.0.0.1)
|
|
788
786
|
localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
|
|
@@ -1864,15 +1862,65 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1864
1862
|
'--log-level=3', // Fatal errors only (suppresses verbose disk logging)
|
|
1865
1863
|
'--no-service-autorun', // No background service disk activity
|
|
1866
1864
|
'--disable-domain-reliability', // No reliability monitor disk writes
|
|
1865
|
+
// Suppress Chrome's auto-update subsystem entirely in headful runs.
|
|
1866
|
+
// --disable-component-update + --disable-background-networking above
|
|
1867
|
+
// stop the network-level check, but Chrome's UI can still show the
|
|
1868
|
+
// "update available" toolbar dot / banner / "relaunch to update"
|
|
1869
|
+
// modal if Chrome has cached state from a prior check by the same
|
|
1870
|
+
// installed chrome binary. These two flags neutralize that:
|
|
1871
|
+
// simulate-outdated-no-au=DATE — the no-auto-update simulation
|
|
1872
|
+
// date is treated as DATE. Far-future date = never shows the
|
|
1873
|
+
// 'outdated' UI. Quotes around the date are required by Chrome.
|
|
1874
|
+
// check-for-update-interval=N — seconds between update checks.
|
|
1875
|
+
// 31536000 = 1 year. Even if the above somehow gets bypassed,
|
|
1876
|
+
// the check itself won't fire within any reasonable scan.
|
|
1877
|
+
// Both are no-ops in pure headless modes but matter in --headful
|
|
1878
|
+
// and headless='new' (which can render UI in some cases).
|
|
1879
|
+
'--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
|
|
1880
|
+
'--check-for-update-interval=31536000',
|
|
1867
1881
|
// PERFORMANCE: Disable non-essential Chrome features in a single flag
|
|
1868
1882
|
// IMPORTANT: Chrome only reads the LAST --disable-features flag, so combine all into one
|
|
1869
|
-
//
|
|
1870
|
-
//
|
|
1871
|
-
//
|
|
1872
|
-
//
|
|
1873
|
-
//
|
|
1874
|
-
//
|
|
1875
|
-
|
|
1883
|
+
//
|
|
1884
|
+
// Sign-in / profile suppression family (prevents the "Something went
|
|
1885
|
+
// wrong when opening your profile. Please sign out then sign in
|
|
1886
|
+
// again" popup that fires in headful when Chrome's sign-in/sync
|
|
1887
|
+
// subsystem can't make sense of our fresh-each-launch temp
|
|
1888
|
+
// userDataDir):
|
|
1889
|
+
// AccountConsistencyMirror, AccountConsistencyDice
|
|
1890
|
+
// Older Chrome's identity consistency layer. Disabling stops
|
|
1891
|
+
// the sync subsystem from initialising at startup.
|
|
1892
|
+
// ProfilePicker, EnableProfilePicker
|
|
1893
|
+
// Two names for the same Chrome feature (renamed in Chrome
|
|
1894
|
+
// ~120s). Disabling stops the profile-picker dialog that some
|
|
1895
|
+
// Chrome versions display when launching with no recognised
|
|
1896
|
+
// profile. Was the new offender in Chrome 148 for this case.
|
|
1897
|
+
// IdentityConsistency
|
|
1898
|
+
// Chrome's identity-consistency-with-google.com checks. Tries
|
|
1899
|
+
// to read profile credentials at startup; trips the popup if
|
|
1900
|
+
// profile is fresh/empty.
|
|
1901
|
+
// SyncDisabledWithProfilePicker
|
|
1902
|
+
// Sync subsystem variant that activates when profile picker
|
|
1903
|
+
// would otherwise show. Disabling is harmless when picker is
|
|
1904
|
+
// also disabled but covers the gap if a Chrome version honors
|
|
1905
|
+
// only one of the two.
|
|
1906
|
+
// SigninInterceptBubble
|
|
1907
|
+
// Sign-in interception bubble that pops when Chrome detects
|
|
1908
|
+
// 'enterprise' sign-in patterns. Defensive.
|
|
1909
|
+
// Combined with --disable-sync + --allow-browser-signin=false
|
|
1910
|
+
// below + --profile-directory=Default flag (explicit profile name
|
|
1911
|
+
// instead of letting Chrome auto-detect/pick), this should fully
|
|
1912
|
+
// suppress sign-in popups in headful from Chrome 118 through 148+.
|
|
1913
|
+
//
|
|
1914
|
+
// ChromeWhatsNewUI: suppresses the post-update "What's New" page
|
|
1915
|
+
// that auto-opens in a new tab after Chrome installs an update —
|
|
1916
|
+
// not popunder-relevant but visually noisy in headful sessions.
|
|
1917
|
+
`--disable-features=AudioServiceOutOfProcess,VizDisplayCompositor,TranslateUI,BlinkGenPropertyTrees,Translate,BackForwardCache,AcceptCHFrame,SafeBrowsing,HttpsFirstBalancedModeAutoEnable,site-per-process,PaintHolding,AccountConsistencyMirror,AccountConsistencyDice,ProfilePicker,EnableProfilePicker,IdentityConsistency,SyncDisabledWithProfilePicker,SigninInterceptBubble,ChromeWhatsNewUI${disable_ad_tagging ? ',AdTagging' : ''}`,
|
|
1918
|
+
// Explicit profile directory — without this, Chrome may probe for
|
|
1919
|
+
// available profiles at launch and trigger the picker dialog (or
|
|
1920
|
+
// the "something went wrong" popup if no profile is found). With
|
|
1921
|
+
// a fresh temp userDataDir each launch, Chrome will create
|
|
1922
|
+
// 'Default' on its own; explicitly naming it skips the probe.
|
|
1923
|
+
'--profile-directory=Default',
|
|
1876
1924
|
'--disable-ipc-flooding-protection',
|
|
1877
1925
|
'--aggressive-cache-discard',
|
|
1878
1926
|
'--memory-pressure-off',
|
|
@@ -1931,7 +1979,20 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1931
1979
|
|
|
1932
1980
|
|
|
1933
1981
|
const pLimit = (await import('p-limit')).default;
|
|
1934
|
-
|
|
1982
|
+
// VPN connect/disconnect is per-URL (wgConnect/ovpnConnect at scan start,
|
|
1983
|
+
// wgDisconnect/ovpnDisconnect in the finally) and manipulates the SHARED
|
|
1984
|
+
// system routing table. Interface names are derived from a hash of the VPN
|
|
1985
|
+
// config and connect/disconnect is not refcounted, so two concurrent URLs
|
|
1986
|
+
// that share a VPN config resolve to the same interface and one task's
|
|
1987
|
+
// teardown rips the interface out from under the other mid-scan. Force
|
|
1988
|
+
// serial execution whenever any site uses vpn/openvpn — correctness over
|
|
1989
|
+
// throughput, and VPN scans are network-bound rather than CPU-bound anyway.
|
|
1990
|
+
const vpnInUse = sites.some(site => site.vpn || site.openvpn);
|
|
1991
|
+
const effectiveConcurrency = vpnInUse ? 1 : MAX_CONCURRENT_SITES;
|
|
1992
|
+
if (vpnInUse && MAX_CONCURRENT_SITES > 1 && (forceDebug || !silentMode)) {
|
|
1993
|
+
console.log(formatLogMessage('info', `${VPN_TAG} VPN configured — forcing concurrency 1 (was ${MAX_CONCURRENT_SITES}) to avoid routing-table races`));
|
|
1994
|
+
}
|
|
1995
|
+
const limit = pLimit(effectiveConcurrency);
|
|
1935
1996
|
|
|
1936
1997
|
const perSiteHeadful = sites.some(site => site.headful === true);
|
|
1937
1998
|
const launchHeadless = !(headfulMode || perSiteHeadful);
|
|
@@ -2694,23 +2755,34 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2694
2755
|
|
|
2695
2756
|
if (userAgentKey === 'chrome_mac') {
|
|
2696
2757
|
platform = 'macOS';
|
|
2697
|
-
platformVersion = '13.5.0';
|
|
2758
|
+
platformVersion = '13.5.0';
|
|
2698
2759
|
arch = 'arm';
|
|
2699
2760
|
} else if (userAgentKey === 'chrome_linux') {
|
|
2700
2761
|
platform = 'Linux';
|
|
2701
2762
|
platformVersion = '6.5.0';
|
|
2702
2763
|
arch = 'x86';
|
|
2703
2764
|
}
|
|
2704
|
-
|
|
2765
|
+
|
|
2766
|
+
// Derive the Chrome major version from the SAME UA string the
|
|
2767
|
+
// browser actually sends (USER_AGENT_COLLECTIONS, via
|
|
2768
|
+
// page.setUserAgent in applyUserAgentSpoofing) so Sec-CH-UA can
|
|
2769
|
+
// never drift out of sync with navigator.userAgent. The version
|
|
2770
|
+
// used to be hardcoded ('146') while the UA list moved to 148 —
|
|
2771
|
+
// a detector cross-checking UA vs Sec-CH-UA saw the mismatch.
|
|
2772
|
+
// Chrome's UA reduction means the UA string only exposes "<major>.0.0.0".
|
|
2773
|
+
const browserUa = USER_AGENT_COLLECTIONS.get(userAgentKey) || '';
|
|
2774
|
+
const chromeMajor = (browserUa.match(/Chrome\/(\d+)/) || [])[1] || '148';
|
|
2775
|
+
const fullVer = `${chromeMajor}.0.0.0`;
|
|
2776
|
+
|
|
2705
2777
|
await page.setExtraHTTPHeaders({
|
|
2706
|
-
'Sec-CH-UA':
|
|
2778
|
+
'Sec-CH-UA': `"Not:A-Brand";v="99", "Google Chrome";v="${chromeMajor}", "Chromium";v="${chromeMajor}"`,
|
|
2707
2779
|
'Sec-CH-UA-Platform': `"${platform}"`,
|
|
2708
2780
|
'Sec-CH-UA-Platform-Version': `"${platformVersion}"`,
|
|
2709
2781
|
'Sec-CH-UA-Mobile': '?0',
|
|
2710
2782
|
'Sec-CH-UA-Arch': `"${arch}"`,
|
|
2711
2783
|
'Sec-CH-UA-Bitness': '"64"',
|
|
2712
|
-
'Sec-CH-UA-Full-Version':
|
|
2713
|
-
'Sec-CH-UA-Full-Version-List':
|
|
2784
|
+
'Sec-CH-UA-Full-Version': `"${fullVer}"`,
|
|
2785
|
+
'Sec-CH-UA-Full-Version-List': `"Not:A-Brand";v="99.0.0.0", "Google Chrome";v="${fullVer}", "Chromium";v="${fullVer}"`
|
|
2714
2786
|
});
|
|
2715
2787
|
}
|
|
2716
2788
|
} catch (fingerprintErr) {
|
|
@@ -2736,7 +2808,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2736
2808
|
// Get user agent for curl if needed
|
|
2737
2809
|
let curlUserAgent = '';
|
|
2738
2810
|
if (useCurl && siteConfig.userAgent) {
|
|
2739
|
-
curlUserAgent =
|
|
2811
|
+
curlUserAgent = USER_AGENT_COLLECTIONS.get(siteConfig.userAgent.toLowerCase()) || '';
|
|
2740
2812
|
}
|
|
2741
2813
|
|
|
2742
2814
|
if (useCurl && forceDebug) {
|
|
@@ -3072,10 +3144,22 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3072
3144
|
|
|
3073
3145
|
if (capturePopups && forceDebug) {
|
|
3074
3146
|
// One-time setup-time warning if the click prerequisite isn't met.
|
|
3075
|
-
// Without clicks, capture_popups is a no-op in practice.
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3147
|
+
// Without clicks, capture_popups is a no-op in practice. Previous
|
|
3148
|
+
// version blamed `interact_clicks` for both missing-piece cases — but
|
|
3149
|
+
// when the actual culprit is `interact: 1` (number, silently disabled
|
|
3150
|
+
// by strict `=== true`), the message misled users into debugging
|
|
3151
|
+
// interact_clicks while the real problem was interact itself.
|
|
3152
|
+
// (normalizeSiteConfig now coerces interact: 1 → true with a warning,
|
|
3153
|
+
// so by the time we get here both should be booleans — but keep the
|
|
3154
|
+
// diagnostic accurate for the truly-missing case.)
|
|
3155
|
+
const interactOn = siteConfig.interact === true;
|
|
3156
|
+
const clicksOn = siteConfig.interact_clicks === true;
|
|
3157
|
+
if (!interactOn && !clicksOn) {
|
|
3158
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but neither 'interact' nor 'interact_clicks' is — set BOTH to true to fire user-gesture clicks; without them, only popups opened via in-page redirects will capture`));
|
|
3159
|
+
} else if (!interactOn) {
|
|
3160
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no fake clicks fire`));
|
|
3161
|
+
} else if (!clicksOn) {
|
|
3162
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact_clicks' is not — set interact_clicks: true to enable element-targeted clicks; without it, only random content-zone clicks fire and may miss overlay-based popunders`));
|
|
3079
3163
|
}
|
|
3080
3164
|
console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
|
|
3081
3165
|
}
|
|
@@ -3101,133 +3185,200 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3101
3185
|
// setRequestInterception(true) — page.on('request') fires for every
|
|
3102
3186
|
// request regardless of interception state, and we don't need to
|
|
3103
3187
|
// block anything on popups.
|
|
3104
|
-
|
|
3105
|
-
|
|
3188
|
+
// Evaluate ANY URL surfaced from a popup (the popup's own navigation URL
|
|
3189
|
+
// OR an in-popup request) against the same filter pipeline the main-page
|
|
3190
|
+
// request handler uses. Factored out so:
|
|
3191
|
+
// 1. attachPopupRequestCapture's `popupPage.on('request', ...)` calls
|
|
3192
|
+
// this once per in-popup request (with the request's resourceType).
|
|
3193
|
+
// 2. onTargetCreated calls this once with `target.url()` and resourceType
|
|
3194
|
+
// 'document' BEFORE attaching the request listener — catches the
|
|
3195
|
+
// popup's navigation URL itself, which fires before our listener can
|
|
3196
|
+
// attach (targetcreated → page resolve → attach is async, and the
|
|
3197
|
+
// browser dispatches the navigation immediately on window.open).
|
|
3198
|
+
// Without #2, popunder destinations whose own URL contains the
|
|
3199
|
+
// filterRegex pattern (e.g. AdsCore campaign URLs with &campaign=)
|
|
3200
|
+
// were seen-but-not-evaluated.
|
|
3201
|
+
const evaluatePopupUrl = (checkedUrl, depth, resourceType) => {
|
|
3202
|
+
try {
|
|
3203
|
+
if (!checkedUrl || checkedUrl === 'about:blank') return;
|
|
3204
|
+
let fullSubdomain = '';
|
|
3205
|
+
let checkedRootDomain = '';
|
|
3106
3206
|
try {
|
|
3107
|
-
const
|
|
3108
|
-
|
|
3109
|
-
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
|
|
3113
|
-
|
|
3114
|
-
|
|
3115
|
-
|
|
3116
|
-
|
|
3117
|
-
|
|
3118
|
-
|
|
3119
|
-
|
|
3120
|
-
|
|
3121
|
-
|
|
3122
|
-
|
|
3123
|
-
|
|
3124
|
-
|
|
3125
|
-
if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3126
|
-
_dynamicallyIgnoredDomains.add(checkedRootDomain);
|
|
3127
|
-
if (forceDebug) {
|
|
3128
|
-
console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3129
|
-
}
|
|
3130
|
-
break;
|
|
3131
|
-
}
|
|
3132
|
-
}
|
|
3133
|
-
}
|
|
3134
|
-
|
|
3135
|
-
// blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
|
|
3136
|
-
// above; populating the dynamic block Set from popup URLs lets
|
|
3137
|
-
// tracker URLs surfaced via popup chains poison their root
|
|
3138
|
-
// domain for the rest of the scan just like main-page hits do.
|
|
3139
|
-
if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
|
|
3140
|
-
for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
|
|
3141
|
-
if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3142
|
-
_dynamicallyBlockedDomains.add(checkedRootDomain);
|
|
3143
|
-
if (forceDebug) {
|
|
3144
|
-
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3145
|
-
}
|
|
3146
|
-
break;
|
|
3207
|
+
const parsedUrl = new URL(checkedUrl);
|
|
3208
|
+
fullSubdomain = parsedUrl.hostname;
|
|
3209
|
+
const pslResult = psl.parse(fullSubdomain);
|
|
3210
|
+
checkedRootDomain = pslResult.domain || fullSubdomain;
|
|
3211
|
+
} catch (_) { return; }
|
|
3212
|
+
if (!checkedRootDomain) return;
|
|
3213
|
+
|
|
3214
|
+
// ignoreDomainsByUrl — if any pattern matches this popup URL,
|
|
3215
|
+
// mark the root domain as ignored for the rest of the scan
|
|
3216
|
+
// (main page + all popups). Mirrors the main handler so a
|
|
3217
|
+
// tracker URL surfaced via popup chain has the same dampening
|
|
3218
|
+
// effect as one surfaced on the main page.
|
|
3219
|
+
if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
|
|
3220
|
+
for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
|
|
3221
|
+
if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3222
|
+
_dynamicallyIgnoredDomains.add(checkedRootDomain);
|
|
3223
|
+
if (forceDebug) {
|
|
3224
|
+
console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3147
3225
|
}
|
|
3226
|
+
break;
|
|
3148
3227
|
}
|
|
3149
3228
|
}
|
|
3229
|
+
}
|
|
3150
3230
|
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
|
|
3159
|
-
// available here, so we just return rather than abort; the
|
|
3160
|
-
// popup-request observer treats this as "don't process".
|
|
3161
|
-
if (matchesDynamicBlock(checkedRootDomain)) return;
|
|
3162
|
-
|
|
3163
|
-
// First-party / third-party gate (popup belongs to the main URL's
|
|
3164
|
-
// domain group — its OWN URL doesn't redefine first-party).
|
|
3165
|
-
const isFirstParty = firstPartyDomains.has(checkedRootDomain);
|
|
3166
|
-
if (siteConfig.firstParty === false && isFirstParty) return;
|
|
3167
|
-
if (siteConfig.thirdParty === false && !isFirstParty) return;
|
|
3168
|
-
|
|
3169
|
-
// Regex match against the site's filterRegex list
|
|
3170
|
-
const resourceType = request.resourceType();
|
|
3171
|
-
let regexMatched = false;
|
|
3172
|
-
for (const re of regexes) {
|
|
3173
|
-
if (re.test(checkedUrl)) {
|
|
3174
|
-
regexMatched = true;
|
|
3231
|
+
// blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl
|
|
3232
|
+
// above; populating the dynamic block Set from popup URLs lets
|
|
3233
|
+
// tracker URLs surfaced via popup chains poison their root
|
|
3234
|
+
// domain for the rest of the scan just like main-page hits do.
|
|
3235
|
+
if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
|
|
3236
|
+
for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
|
|
3237
|
+
if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
|
|
3238
|
+
_dynamicallyBlockedDomains.add(checkedRootDomain);
|
|
3175
3239
|
if (forceDebug) {
|
|
3176
|
-
console.log(formatLogMessage('debug',
|
|
3240
|
+
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
|
|
3177
3241
|
}
|
|
3178
3242
|
break;
|
|
3179
3243
|
}
|
|
3180
3244
|
}
|
|
3245
|
+
}
|
|
3181
3246
|
|
|
3182
|
-
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
|
|
3195
|
-
|
|
3196
|
-
|
|
3197
|
-
|
|
3198
|
-
|
|
3199
|
-
|
|
3200
|
-
|
|
3201
|
-
|
|
3202
|
-
|
|
3203
|
-
|
|
3204
|
-
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
ignoreDomains, matchesIgnoreDomain
|
|
3210
|
-
});
|
|
3211
|
-
trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
|
|
3212
|
-
} else {
|
|
3213
|
-
// No nettools required — regex match alone counts.
|
|
3214
|
-
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
|
|
3247
|
+
// ignoreDomains gate (global; matchesIgnoreDomain also short-
|
|
3248
|
+
// circuits on _dynamicallyIgnoredDomains, so a domain we just
|
|
3249
|
+
// added above will be caught here on the same request).
|
|
3250
|
+
if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;
|
|
3251
|
+
|
|
3252
|
+
// Dynamic-block gate for popup requests — early return on
|
|
3253
|
+
// matched root or any parent (parent-walk in
|
|
3254
|
+
// matchesDynamicBlock). Popups don't have a request object
|
|
3255
|
+
// available here, so we just return rather than abort; the
|
|
3256
|
+
// popup-request observer treats this as "don't process".
|
|
3257
|
+
if (matchesDynamicBlock(checkedRootDomain)) return;
|
|
3258
|
+
|
|
3259
|
+
// First-party / third-party gate (popup belongs to the main URL's
|
|
3260
|
+
// domain group — its OWN URL doesn't redefine first-party).
|
|
3261
|
+
const isFirstParty = firstPartyDomains.has(checkedRootDomain);
|
|
3262
|
+
if (siteConfig.firstParty === false && isFirstParty) return;
|
|
3263
|
+
if (siteConfig.thirdParty === false && !isFirstParty) return;
|
|
3264
|
+
|
|
3265
|
+
// Regex match against the site's filterRegex list
|
|
3266
|
+
let regexMatched = false;
|
|
3267
|
+
for (const re of regexes) {
|
|
3268
|
+
if (re.test(checkedUrl)) {
|
|
3269
|
+
regexMatched = true;
|
|
3270
|
+
if (forceDebug) {
|
|
3271
|
+
console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
|
|
3272
|
+
}
|
|
3273
|
+
break;
|
|
3215
3274
|
}
|
|
3216
|
-
}
|
|
3275
|
+
}
|
|
3276
|
+
|
|
3277
|
+
if (!regexMatched) return;
|
|
3278
|
+
|
|
3279
|
+
// hasNetTools is the same flag the main handler uses (line ~2639).
|
|
3280
|
+
// When the site config carries whois/dig terms, regex match is
|
|
3281
|
+
// not sufficient by itself — the URL must ALSO pass the whois/
|
|
3282
|
+
// dig validation before it counts. Mirrors the main handler's
|
|
3283
|
+
// behavior so 'capture popup domains that match regex/dig/whois'
|
|
3284
|
+
// means the same thing for popups as for the main page.
|
|
3285
|
+
if (hasNetTools) {
|
|
3286
|
+
const popupNetToolsHandler = createNetToolsHandler({
|
|
3287
|
+
whoisTerms, whoisOrTerms,
|
|
3288
|
+
processedWhoisDomains: globalProcessedWhoisDomains,
|
|
3289
|
+
processedDigDomains: globalProcessedDigDomains,
|
|
3290
|
+
whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
|
|
3291
|
+
whoisServer,
|
|
3292
|
+
whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
|
|
3293
|
+
debugLogFile,
|
|
3294
|
+
digTerms, digOrTerms, digRecordType,
|
|
3295
|
+
digSubdomain: siteConfig.dig_subdomain === true,
|
|
3296
|
+
dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
|
|
3297
|
+
matchedDomains, addMatchedDomain,
|
|
3298
|
+
isDomainAlreadyDetected: isLocallyDetected,
|
|
3299
|
+
onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
|
|
3300
|
+
onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
|
|
3301
|
+
cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
|
|
3302
|
+
cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
|
|
3303
|
+
currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
|
|
3304
|
+
ignoreDomains, matchesIgnoreDomain
|
|
3305
|
+
});
|
|
3306
|
+
trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
|
|
3307
|
+
} else {
|
|
3308
|
+
// No nettools required — regex match alone counts.
|
|
3309
|
+
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
|
|
3310
|
+
}
|
|
3311
|
+
} catch (_) { /* observation-only — never let a popup error escape */ }
|
|
3312
|
+
};
|
|
3313
|
+
|
|
3314
|
+
// Thin wrapper around evaluatePopupUrl for the per-request listener.
|
|
3315
|
+
// Under forceDebug also attach framenavigated + close listeners so
|
|
3316
|
+
// the popup's full lifecycle (initial nav URL, mid-popup navigations,
|
|
3317
|
+
// close) is visible in logs. Useful when investigating "I saw a
|
|
3318
|
+
// Chrome window flash on screen" — the framenavigated transitions
|
|
3319
|
+
// tell you what URL the window was showing and for how long.
|
|
3320
|
+
const attachPopupRequestCapture = (popupPage, depth) => {
|
|
3321
|
+
popupPage.on('request', (request) => {
|
|
3322
|
+
evaluatePopupUrl(request.url(), depth, request.resourceType());
|
|
3217
3323
|
});
|
|
3324
|
+
if (forceDebug) {
|
|
3325
|
+
try {
|
|
3326
|
+
popupPage.on('framenavigated', (frame) => {
|
|
3327
|
+
try {
|
|
3328
|
+
if (frame !== popupPage.mainFrame()) return; // main frame only
|
|
3329
|
+
console.log(formatLogMessage('debug', `[popup depth=${depth}] framenavigated → ${frame.url() || 'about:blank'}`));
|
|
3330
|
+
} catch (_) {}
|
|
3331
|
+
});
|
|
3332
|
+
popupPage.on('close', () => {
|
|
3333
|
+
try {
|
|
3334
|
+
const lastUrl = popupPage.url ? popupPage.url() : '(unknown)';
|
|
3335
|
+
console.log(formatLogMessage('debug', `[popup depth=${depth}] close (last URL: ${lastUrl})`));
|
|
3336
|
+
} catch (_) {}
|
|
3337
|
+
});
|
|
3338
|
+
popupPage.on('pageerror', (err) => {
|
|
3339
|
+
try { console.log(formatLogMessage('debug', `[popup depth=${depth}] pageerror: ${err.message}`)); } catch (_) {}
|
|
3340
|
+
});
|
|
3341
|
+
} catch (_) { /* listener attach errors aren't fatal */ }
|
|
3342
|
+
}
|
|
3218
3343
|
};
|
|
3219
3344
|
|
|
3220
3345
|
const onTargetCreated = async (target) => {
|
|
3346
|
+
// Log EVERY targetcreated event under forceDebug so callers can see
|
|
3347
|
+
// the full set of targets Chromium creates during the scan — not
|
|
3348
|
+
// just the ones we capture. Useful when investigating "is that
|
|
3349
|
+
// Chrome window I saw from a popup or from somewhere else?" — if
|
|
3350
|
+
// a window opens but no targetcreated fires, it's not ours. If a
|
|
3351
|
+
// targetcreated fires for type=page but we skip-and-explain below,
|
|
3352
|
+
// the user knows why we ignored it. Captures the FULL diagnostic
|
|
3353
|
+
// surface, no behavior change.
|
|
3354
|
+
let _tType, _tUrl;
|
|
3355
|
+
if (forceDebug) {
|
|
3356
|
+
try {
|
|
3357
|
+
_tType = target.type();
|
|
3358
|
+
_tUrl = target.url() || 'about:blank';
|
|
3359
|
+
console.log(formatLogMessage('debug', `[popup] targetcreated: type=${_tType} url=${_tUrl}`));
|
|
3360
|
+
} catch (_) {}
|
|
3361
|
+
}
|
|
3362
|
+
|
|
3221
3363
|
// Short-circuit guard: if finally has already started, don't attach
|
|
3222
3364
|
// a request listener whose closure would outlive its meaningful
|
|
3223
3365
|
// scope. The race is narrow (a targetcreated firing while we're
|
|
3224
3366
|
// mid-await on target.page() across the finally boundary), but
|
|
3225
3367
|
// without this guard a late popup could push matches into
|
|
3226
3368
|
// matchedDomains for a URL whose processing has already returned.
|
|
3227
|
-
if (urlFinished)
|
|
3228
|
-
|
|
3369
|
+
if (urlFinished) {
|
|
3370
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: urlFinished=true (scan teardown in progress)`));
|
|
3371
|
+
return;
|
|
3372
|
+
}
|
|
3373
|
+
if (target.type() !== 'page') {
|
|
3374
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: non-page target type=${target.type()} (workers/service-workers/etc are not popunder candidates)`));
|
|
3375
|
+
return;
|
|
3376
|
+
}
|
|
3229
3377
|
const depth = getPopupDepth(target);
|
|
3230
|
-
if (depth < 1)
|
|
3378
|
+
if (depth < 1) {
|
|
3379
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup] skipping: depth=0 — target not in opener chain of main page (likely a new browser tab opened independently, not a popunder from our scan)`));
|
|
3380
|
+
return; // Not one of ours
|
|
3381
|
+
}
|
|
3231
3382
|
if (depth > POPUP_MAX_DEPTH) {
|
|
3232
3383
|
if (forceDebug) {
|
|
3233
3384
|
console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
|
|
@@ -3237,7 +3388,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3237
3388
|
|
|
3238
3389
|
let popupPage;
|
|
3239
3390
|
try { popupPage = await target.page(); } catch (_) { return; }
|
|
3240
|
-
if (!popupPage)
|
|
3391
|
+
if (!popupPage) {
|
|
3392
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] target.page() returned null — popup not accessible as a Page object`));
|
|
3393
|
+
return;
|
|
3394
|
+
}
|
|
3241
3395
|
// Re-check after the await — the per-URL finally may have flipped
|
|
3242
3396
|
// the flag while target.page() was resolving.
|
|
3243
3397
|
if (urlFinished) {
|
|
@@ -3247,8 +3401,31 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3247
3401
|
|
|
3248
3402
|
if (forceDebug) {
|
|
3249
3403
|
console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
|
|
3404
|
+
// Window dimensions are useful for the "is the popup visible on
|
|
3405
|
+
// my screen?" question — a popup with non-zero viewport in a
|
|
3406
|
+
// headless=new launch shouldn't be visible but on some display
|
|
3407
|
+
// servers (WSLg, X11) it can briefly flash on screen. Log the
|
|
3408
|
+
// viewport so callers can correlate with what they saw.
|
|
3409
|
+
try {
|
|
3410
|
+
const vp = popupPage.viewport();
|
|
3411
|
+
if (vp) console.log(formatLogMessage('debug', `[popup depth=${depth}] viewport: ${vp.width}x${vp.height}`));
|
|
3412
|
+
} catch (_) {}
|
|
3250
3413
|
}
|
|
3251
3414
|
|
|
3415
|
+
// Evaluate the popup's own navigation URL against the same filter
|
|
3416
|
+
// pipeline used for in-popup requests. Required because targetcreated
|
|
3417
|
+
// → target.page() → on('request', ...) is async, and the browser
|
|
3418
|
+
// dispatches the popup's navigation request immediately on window.open
|
|
3419
|
+
// — by the time the listener attaches below, the navigation request
|
|
3420
|
+
// has already fired and won't be re-emitted. resourceType 'document'
|
|
3421
|
+
// mirrors what Chrome would emit for a top-level navigation request.
|
|
3422
|
+
// Without this call, AdsCore-style popunder destinations (URL contains
|
|
3423
|
+
// &campaign=, &v=, etc) were seen-but-not-evaluated: the popup was
|
|
3424
|
+
// logged but its domain never matched the filter regex, so it never
|
|
3425
|
+
// became a rule. Only secondary in-popup requests (tracking pixels,
|
|
3426
|
+
// sub-resources) ever got tested against the regex.
|
|
3427
|
+
evaluatePopupUrl(target.url(), depth, 'document');
|
|
3428
|
+
|
|
3252
3429
|
attachPopupRequestCapture(popupPage, depth);
|
|
3253
3430
|
|
|
3254
3431
|
// Auto-close after the capture window so popups don't pile up.
|
|
@@ -4322,7 +4499,26 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4322
4499
|
|
|
4323
4500
|
// Mark page as processing during interactions
|
|
4324
4501
|
updatePageUsage(page, true);
|
|
4325
|
-
|
|
4502
|
+
// Work-aware ceiling (scales with click count / realistic_click /
|
|
4503
|
+
// intensity) instead of a flat 15s, which truncated high-click
|
|
4504
|
+
// popunder configs mid-pass. Single source of truth shared with
|
|
4505
|
+
// interaction.js's own internal hard cap so the two can't disagree.
|
|
4506
|
+
const INTERACTION_HARD_TIMEOUT = computeInteractionCeilingMs(interactionConfig);
|
|
4507
|
+
|
|
4508
|
+
// Capture-and-clear timer wrapper — same fix as cdp.js (0772ccd) and
|
|
4509
|
+
// the per-URL grace (577ad66). The 3 inline Promise.race patterns
|
|
4510
|
+
// below previously used `new Promise((_, reject) => setTimeout(...))`
|
|
4511
|
+
// without capturing the timer ID, leaking the 15s timer + closure on
|
|
4512
|
+
// reject every time interaction completed inside the cap (the common
|
|
4513
|
+
// case). Centralizing avoids the same mistake recurring across the
|
|
4514
|
+
// ghost-cursor / fallback / standard branches.
|
|
4515
|
+
// Race `promise` against the interaction hard cap.
//   promise - the in-flight interaction work to await
//   msg     - message for the Error used to reject when the cap elapses
// Resolves/rejects with whatever `promise` settles to if it settles first;
// otherwise rejects with Error(msg) after INTERACTION_HARD_TIMEOUT ms.
// Whichever side wins, the pending timer is cleared so it does not linger
// after the race has settled.
const raceWithTimer = (promise, msg) => {
  let timerId;
  const deadline = new Promise((_resolve, reject) => {
    timerId = setTimeout(() => {
      reject(new Error(msg));
    }, INTERACTION_HARD_TIMEOUT);
  });
  const cancelDeadline = () => clearTimeout(timerId);
  return Promise.race([promise, deadline]).finally(cancelDeadline);
};
|
|
4326
4522
|
|
|
4327
4523
|
// Check if ghost-cursor mode is enabled for this site
|
|
4328
4524
|
const ghostConfig = resolveGhostCursorConfig(siteConfig, globalGhostCursor, forceDebug);
|
|
@@ -4333,60 +4529,51 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4333
4529
|
if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
|
|
4334
4530
|
const cursor = createGhostCursor(page, { forceDebug });
|
|
4335
4531
|
if (cursor) {
|
|
4336
|
-
await
|
|
4337
|
-
|
|
4338
|
-
|
|
4339
|
-
|
|
4340
|
-
|
|
4341
|
-
|
|
4342
|
-
|
|
4343
|
-
|
|
4344
|
-
|
|
4345
|
-
|
|
4346
|
-
|
|
4347
|
-
|
|
4348
|
-
|
|
4349
|
-
|
|
4350
|
-
|
|
4351
|
-
|
|
4352
|
-
|
|
4353
|
-
await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
|
|
4354
|
-
}
|
|
4355
|
-
}
|
|
4356
|
-
if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
|
|
4357
|
-
await ghostRandomMove(cursor, { forceDebug });
|
|
4358
|
-
}
|
|
4359
|
-
if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
|
|
4360
|
-
const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
|
|
4361
|
-
const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
|
|
4362
|
-
await ghostClick(cursor, { x: clickX, y: clickY }, {
|
|
4363
|
-
hesitate: ghostConfig.hesitate,
|
|
4364
|
-
forceDebug
|
|
4365
|
-
});
|
|
4532
|
+
await raceWithTimer((async () => {
|
|
4533
|
+
const viewport = page.viewport() || { width: 1200, height: 800 };
|
|
4534
|
+
const ghostDuration = ghostConfig.duration || 2000;
|
|
4535
|
+
const ghostStart = Date.now();
|
|
4536
|
+
const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
|
|
4537
|
+
|
|
4538
|
+
// Time-based Bezier mouse movements — runs for ghostDuration ms
|
|
4539
|
+
while (ghostTimeLeft() > 200) {
|
|
4540
|
+
const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
|
|
4541
|
+
const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
|
|
4542
|
+
await ghostMove(cursor, toX, toY, {
|
|
4543
|
+
moveSpeed: ghostConfig.moveSpeed,
|
|
4544
|
+
overshootThreshold: ghostConfig.overshootThreshold,
|
|
4545
|
+
forceDebug
|
|
4546
|
+
});
|
|
4547
|
+
if (ghostTimeLeft() > 100) {
|
|
4548
|
+
await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
|
|
4366
4549
|
}
|
|
4367
|
-
|
|
4368
|
-
|
|
4369
|
-
|
|
4370
|
-
|
|
4371
|
-
|
|
4372
|
-
|
|
4373
|
-
|
|
4374
|
-
|
|
4375
|
-
|
|
4376
|
-
|
|
4550
|
+
}
|
|
4551
|
+
if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
|
|
4552
|
+
await ghostRandomMove(cursor, { forceDebug });
|
|
4553
|
+
}
|
|
4554
|
+
if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
|
|
4555
|
+
const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
|
|
4556
|
+
const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
|
|
4557
|
+
await ghostClick(cursor, { x: clickX, y: clickY }, {
|
|
4558
|
+
hesitate: ghostConfig.hesitate,
|
|
4559
|
+
forceDebug
|
|
4560
|
+
});
|
|
4561
|
+
}
|
|
4562
|
+
if (interactionConfig.includeScrolling) {
|
|
4563
|
+
await performPageInteraction(page, currentUrl, {
|
|
4564
|
+
...interactionConfig,
|
|
4565
|
+
mouseMovements: 0,
|
|
4566
|
+
includeElementClicks: false
|
|
4567
|
+
}, forceDebug);
|
|
4568
|
+
}
|
|
4569
|
+
})(), 'ghost-cursor interaction hard timeout');
|
|
4377
4570
|
} else {
|
|
4378
4571
|
if (forceDebug) console.log(formatLogMessage('debug', '[ghost-cursor] Falling back to built-in mouse'));
|
|
4379
|
-
await
|
|
4380
|
-
performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
|
|
4381
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
|
|
4382
|
-
]);
|
|
4572
|
+
await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
|
|
4383
4573
|
}
|
|
4384
4574
|
} else {
|
|
4385
4575
|
// Standard built-in mouse interaction
|
|
4386
|
-
await
|
|
4387
|
-
performPageInteraction(page, currentUrl, interactionConfig, forceDebug),
|
|
4388
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('interaction hard timeout')), INTERACTION_HARD_TIMEOUT))
|
|
4389
|
-
]);
|
|
4576
|
+
await raceWithTimer(performPageInteraction(page, currentUrl, interactionConfig, forceDebug), 'interaction hard timeout');
|
|
4390
4577
|
}
|
|
4391
4578
|
} catch (interactTimeoutErr) {
|
|
4392
4579
|
if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
|
|
@@ -4521,8 +4708,16 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4521
4708
|
|
|
4522
4709
|
if (siteConfig.clear_sitedata === true) {
|
|
4523
4710
|
try {
|
|
4524
|
-
|
|
4525
|
-
|
|
4711
|
+
// Default reload clear is quick mode (cookies + cache +
|
|
4712
|
+
// localStorage + sessionStorage — the storage layers where
|
|
4713
|
+
// session-cap tracking typically lives). Sites that put their
|
|
4714
|
+
// session cap in IndexedDB / WebSQL / service workers can opt
|
|
4715
|
+
// into a full clear-per-reload via clear_sitedata_full_on_reload.
|
|
4716
|
+
// Costs ~100-500ms extra per reload and may unregister a
|
|
4717
|
+
// service worker the page depends on; off by default.
|
|
4718
|
+
const fullOnReload = siteConfig.clear_sitedata_full_on_reload === true;
|
|
4719
|
+
const clearResult = await clearSiteData(page, currentUrl, forceDebug, !fullOnReload);
|
|
4720
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data (${fullOnReload ? 'full' : 'quick'}) before reload #${i} for ${currentUrl}`));
|
|
4526
4721
|
} catch (reloadClearErr) {
|
|
4527
4722
|
if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
|
|
4528
4723
|
}
|
|
@@ -4536,20 +4731,26 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4536
4731
|
if (useForceReload && !reloadSuccess && !skipForceReload) {
|
|
4537
4732
|
// Attempt force reload: disable cache, reload, re-enable cache
|
|
4538
4733
|
try {
|
|
4734
|
+
// Local race-with-timer helper — capture-and-clear pattern from
|
|
4735
|
+
// cdp.js / interact (6ad36e7). Without this, every successful
|
|
4736
|
+
// setCacheEnabled() left an 8s setTimeout running with closure
|
|
4737
|
+
// on `reject` (2 leaks per reload cycle × N reload cycles).
|
|
4738
|
+
// Race `promise` against a one-shot deadline.
//   promise - the operation to await (e.g. a cache enable/disable call)
//   msg     - message for the Error used to reject when the deadline hits
//   ms      - deadline in milliseconds
// Settles with `promise`'s outcome if it finishes first; otherwise rejects
// with Error(msg) after `ms`. The timer handle is captured so it can be
// cleared as soon as the race settles, instead of staying armed for the
// remainder of `ms`.
const raceWithTimer = (promise, msg, ms) => {
  let timerId;
  const deadline = new Promise((_resolve, reject) => {
    timerId = setTimeout(() => {
      reject(new Error(msg));
    }, ms);
  });
  const cancelDeadline = () => clearTimeout(timerId);
  return Promise.race([promise, deadline]).finally(cancelDeadline);
};
|
|
4745
|
+
|
|
4539
4746
|
// Timeout-protected cache disable
|
|
4540
|
-
await
|
|
4541
|
-
|
|
4542
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('Cache disable timeout')), 8000))
|
|
4543
|
-
]);
|
|
4544
|
-
|
|
4747
|
+
await raceWithTimer(page.setCacheEnabled(false), 'Cache disable timeout', 8000);
|
|
4748
|
+
|
|
4545
4749
|
// Use networkidle2 for force reload to better detect when page is actually loaded
|
|
4546
4750
|
await page.reload({ waitUntil: 'networkidle2', timeout: Math.min(timeout, 15000) });
|
|
4547
|
-
|
|
4751
|
+
|
|
4548
4752
|
// Timeout-protected cache enable
|
|
4549
|
-
await
|
|
4550
|
-
page.setCacheEnabled(true),
|
|
4551
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('Cache enable timeout')), 8000))
|
|
4552
|
-
]);
|
|
4753
|
+
await raceWithTimer(page.setCacheEnabled(true), 'Cache enable timeout', 8000);
|
|
4553
4754
|
|
|
4554
4755
|
reloadSuccess = true;
|
|
4555
4756
|
if (forceDebug) console.log(formatLogMessage('debug', `Force reload #${i} completed for ${currentUrl}`));
|
|
@@ -4644,8 +4845,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4644
4845
|
const endY = 200 + Math.floor(Math.random() * (vp.height - 400));
|
|
4645
4846
|
await humanLikeMouseMove(page, startX, startY, endX, endY, { steps: 3, curve: 0.04, jitter: 1 });
|
|
4646
4847
|
}
|
|
4647
|
-
// Content clicks to trigger document-level onclick handlers
|
|
4648
|
-
|
|
4848
|
+
// Content clicks to trigger document-level onclick handlers.
|
|
4849
|
+
// Honor siteConfig.interact_click_count so popunder-discovery configs
|
|
4850
|
+
// get the same click volume on every reload, not just the initial load.
|
|
4851
|
+
// Omit `clicks` when no override is set so performContentClicks uses
|
|
4852
|
+
// its CONTENT_CLICK.CLICK_COUNT default (single source of truth).
|
|
4853
|
+
// realistic forwards siteConfig.realistic_click; always passed
|
|
4854
|
+
// (defaults to false) so realistic mode applies to every reload's
|
|
4855
|
+
// clicks, not just the initial pass.
|
|
4856
|
+
const postReloadClickOpts = {
|
|
4857
|
+
preDelay: 200,
|
|
4858
|
+
forceDebug,
|
|
4859
|
+
realistic: !!interactionConfig.realistic
|
|
4860
|
+
};
|
|
4861
|
+
if (interactionConfig.clickCount) postReloadClickOpts.clicks = interactionConfig.clickCount;
|
|
4862
|
+
await performContentClicks(page, postReloadClickOpts);
|
|
4649
4863
|
if (forceDebug) console.log(formatLogMessage('debug', `Post-reload interaction completed for reload #${i}`));
|
|
4650
4864
|
} catch (postReloadInteractErr) {
|
|
4651
4865
|
// Non-critical — continue with remaining reloads
|
|
@@ -4870,9 +5084,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4870
5084
|
}
|
|
4871
5085
|
}
|
|
4872
5086
|
|
|
4873
|
-
// Temporarily store the pLimit function
|
|
5087
|
+
// Temporarily store the pLimit function
|
|
4874
5088
|
const originalLimit = limit;
|
|
4875
5089
|
|
|
5090
|
+
// Per-site config normalization (always runs, not gated on --validate-config).
|
|
5091
|
+
// Catches typo'd keys (whois_terms vs whois) with "did you mean" suggestions
|
|
5092
|
+
// and coerces boolean-like values (interact: 1 → interact: true) before any
|
|
5093
|
+
// downstream strict-equality check silently treats them as disabled. Mutates
|
|
5094
|
+
// each site in place so the rest of the scan sees normalized values.
|
|
5095
|
+
// Reports via console.warn so messages surface even when --silent is set.
|
|
5096
|
+
for (let i = 0; i < sites.length; i++) {
|
|
5097
|
+
const { warnings, errors } = normalizeSiteConfig(sites[i], i);
|
|
5098
|
+
for (const e of errors) console.warn(messageColors.error('⚠ ' + e));
|
|
5099
|
+
for (const w of warnings) console.warn(messageColors.warn('⚠ [config] ' + w));
|
|
5100
|
+
}
|
|
5101
|
+
|
|
4876
5102
|
// V8 Optimization: Calculate total URLs first to pre-allocate array
|
|
4877
5103
|
let totalUrls = 0;
|
|
4878
5104
|
for (const site of sites) {
|
|
@@ -4890,7 +5116,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4890
5116
|
for (const url of urlsToProcess) {
|
|
4891
5117
|
allTasks[taskIndex++] = {
|
|
4892
5118
|
url,
|
|
4893
|
-
|
|
5119
|
+
// Default userAgent to 'chrome' when a site doesn't set one. Without
|
|
5120
|
+
// it the browser sends its bundled default UA, which literally
|
|
5121
|
+
// contains "HeadlessChrome" (verified, both headless modes) — an
|
|
5122
|
+
// instant automation tell. Defaulting here (rather than at launch)
|
|
5123
|
+
// activates the whole coherent path, since UA-string spoofing, the
|
|
5124
|
+
// navigator/webdriver/plugins/userAgentData JS masking, the Sec-CH-UA
|
|
5125
|
+
// request headers, and the curl content-fetch UA all gate on
|
|
5126
|
+
// config.userAgent. Placing 'chrome' BEFORE the spread means an
|
|
5127
|
+
// explicit site value wins — including userAgent:false / null to opt
|
|
5128
|
+
// out and scan with the raw headless UA.
|
|
5129
|
+
config: { userAgent: 'chrome', ...site, _originalUrl: url },
|
|
4894
5130
|
taskId: taskIndex - 1 // For tracking
|
|
4895
5131
|
};
|
|
4896
5132
|
}
|
|
@@ -4923,7 +5159,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4923
5159
|
let urlsSinceLastCleanup = 0;
|
|
4924
5160
|
|
|
4925
5161
|
if (!silentMode && totalUrls > 0) {
|
|
4926
|
-
console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${
|
|
5162
|
+
console.log(`\n${messageColors.processing('Processing')} ${totalUrls} URLs with TRUE concurrency ${effectiveConcurrency}...`);
|
|
4927
5163
|
if (totalUrls > RESOURCE_CLEANUP_INTERVAL) {
|
|
4928
5164
|
console.log(messageColors.processing('Browser will restart every') + ` ~${RESOURCE_CLEANUP_INTERVAL} URLs to free resources`);
|
|
4929
5165
|
}
|
|
@@ -5044,10 +5280,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5044
5280
|
silentMode
|
|
5045
5281
|
});
|
|
5046
5282
|
healthPromise.catch(() => {});
|
|
5047
|
-
|
|
5048
|
-
|
|
5049
|
-
|
|
5050
|
-
|
|
5283
|
+
// Capture-and-clear timer pattern (cdp.js 0772ccd, interact 6ad36e7) —
|
|
5284
|
+
// when healthPromise wins the race, the inline setTimeout would
|
|
5285
|
+
// otherwise hold reject's closure for the full 30s health-check timeout.
|
|
5286
|
+
let healthTimer;
|
|
5287
|
+
try {
|
|
5288
|
+
healthCheck = await Promise.race([
|
|
5289
|
+
healthPromise,
|
|
5290
|
+
new Promise((_, reject) => { healthTimer = setTimeout(() => reject(new Error('Health check timeout')), 30000); })
|
|
5291
|
+
]);
|
|
5292
|
+
} finally {
|
|
5293
|
+
if (healthTimer) clearTimeout(healthTimer);
|
|
5294
|
+
}
|
|
5051
5295
|
} catch (healthError) {
|
|
5052
5296
|
console.log(formatLogMessage('warn', `[HEALTH CHECK] Timeout, assuming restart needed`));
|
|
5053
5297
|
healthCheck = { shouldRestart: true, reason: 'Health check timeout' };
|
|
@@ -5312,26 +5556,94 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5312
5556
|
} catch {}
|
|
5313
5557
|
|
|
5314
5558
|
// Per-URL timeout so a single hung processUrl can't block the batch
|
|
5315
|
-
// forever.
|
|
5316
|
-
// (
|
|
5317
|
-
//
|
|
5318
|
-
//
|
|
5319
|
-
//
|
|
5320
|
-
|
|
5559
|
+
// forever. Scaled from siteConfig.timeout + (delay + interaction) ×
|
|
5560
|
+
// (1 + reload) + 30s headroom, with a 75s floor.
|
|
5561
|
+
//
|
|
5562
|
+
// The (1 + reload) multiplier was missing from the previous formula
|
|
5563
|
+
// (13dd4fa) — `reload: 4` configs perform 5 total cycles (initial +
|
|
5564
|
+
// 4 reloads), each with its own delay + interaction overhead, so the
|
|
5565
|
+
// 80s ceiling for the user's lean config (timeout:35000, delay:15000,
|
|
5566
|
+
// reload:4) fired DURING the 3rd reload while the orphan still had
|
|
5567
|
+
// 2 more cycles + drain to go — far longer than the 8s grace could
|
|
5568
|
+
// bridge. Multiplying by cycle count brings the ceiling above the
|
|
5569
|
+
// legitimate work envelope.
|
|
5570
|
+
const reloadCount = task.config.reload || 0;
|
|
5571
|
+
// Interaction overhead per cycle must match interaction.js's actual
|
|
5572
|
+
// ceiling, which is now work-aware (high interact_click_count /
|
|
5573
|
+
// realistic_click configs legitimately run far longer than the old flat
|
|
5574
|
+
// 15s). Compute the same value here so the per-URL ceiling stays above
|
|
5575
|
+
// the real interaction envelope and can't fire mid-pass. Zero when
|
|
5576
|
+
// interaction is disabled for this task (no interaction cost to budget).
|
|
5577
|
+
const interactionOnForTask = task.config.interact === true && !disableInteract;
|
|
5578
|
+
const INTERACTION_OVERHEAD_MS = interactionOnForTask
|
|
5579
|
+
? computeInteractionCeilingMs(createInteractionConfig(task.url, task.config))
|
|
5580
|
+
: 0;
|
|
5581
|
+
const PER_URL_TIMEOUT_MS = Math.max(
|
|
5582
|
+
75000,
|
|
5583
|
+
(task.config.timeout || 35000)
|
|
5584
|
+
+ ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
|
|
5585
|
+
+ 30000
|
|
5586
|
+
);
|
|
5587
|
+
// Grace period after primary timeout — gives the orphan a chance to
|
|
5588
|
+
// finish drainPendingNetTools() and emit "Saving N rules despite page
|
|
5589
|
+
// load failure" before we abandon its result. Drain typically completes
|
|
5590
|
+
// in <1s with cached nettools; 8s is the safety ceiling.
|
|
5591
|
+
const PER_URL_GRACE_MS = 8000;
|
|
5592
|
+
const PER_URL_TIMEOUT_MARKER = 'PER_URL_TIMEOUT_FIRED';
|
|
5593
|
+
|
|
5321
5594
|
const processUrlPromise = processUrl(task.url, task.config, browser);
|
|
5322
5595
|
let perUrlTimer;
|
|
5323
5596
|
try {
|
|
5324
5597
|
return await Promise.race([
|
|
5325
5598
|
processUrlPromise,
|
|
5326
5599
|
new Promise((_, reject) => {
|
|
5327
|
-
perUrlTimer = setTimeout(() =>
|
|
5600
|
+
perUrlTimer = setTimeout(() => {
|
|
5601
|
+
const e = new Error(`Per-URL timeout (${Math.round(PER_URL_TIMEOUT_MS / 1000)}s)`);
|
|
5602
|
+
e.code = PER_URL_TIMEOUT_MARKER;
|
|
5603
|
+
reject(e);
|
|
5604
|
+
}, PER_URL_TIMEOUT_MS);
|
|
5328
5605
|
})
|
|
5329
5606
|
]);
|
|
5330
5607
|
} catch (err) {
|
|
5331
|
-
if (err && err.
|
|
5332
|
-
processUrlPromise.catch(() => {});
|
|
5608
|
+
if (err && err.code === PER_URL_TIMEOUT_MARKER) {
|
|
5333
5609
|
forceRestartFlag = true;
|
|
5334
|
-
|
|
5610
|
+
// Log the timeout fire (under forceDebug) — was invisible before; only ended up in the
|
|
5611
|
+
// returned result.error field which is never printed. Makes
|
|
5612
|
+
// ceiling-tuning regressions visible without source-reading.
|
|
5613
|
+
if (forceDebug) {
|
|
5614
|
+
console.log(formatLogMessage('warn', `${err.message} for ${task.url} — orphan in ${PER_URL_GRACE_MS / 1000}s grace`));
|
|
5615
|
+
}
|
|
5616
|
+
// Grace period — wait briefly for the orphan to drain + recover
|
|
5617
|
+
// partial matches. Browser is still in a bad state (we hit the
|
|
5618
|
+
// primary ceiling) so the restart still fires either way; only the
|
|
5619
|
+
// rules payload differs.
|
|
5620
|
+
let graceTimer;
|
|
5621
|
+
try {
|
|
5622
|
+
const graceResult = await Promise.race([
|
|
5623
|
+
processUrlPromise,
|
|
5624
|
+
new Promise((_, reject) => {
|
|
5625
|
+
// Capture the timer ID so the finally can clear it when the
|
|
5626
|
+
// orphan wins the race — otherwise the setTimeout keeps the
|
|
5627
|
+
// event loop ref + closure on `reject` alive for the full
|
|
5628
|
+
// grace window, even though the race already settled.
|
|
5629
|
+
// Same leak pattern fixed in cdp.js (0772ccd) and
|
|
5630
|
+
// clear_sitedata (780b443).
|
|
5631
|
+
graceTimer = setTimeout(() => reject(new Error('Grace timeout')), PER_URL_GRACE_MS);
|
|
5632
|
+
})
|
|
5633
|
+
]);
|
|
5634
|
+
if (forceDebug) {
|
|
5635
|
+
console.log(formatLogMessage('debug', `Grace recovered ${(graceResult && graceResult.rules ? graceResult.rules.length : 0)} rules for ${task.url}`));
|
|
5636
|
+
}
|
|
5637
|
+
return { ...graceResult, needsImmediateRestart: true };
|
|
5638
|
+
} catch (_) {
|
|
5639
|
+
if (forceDebug) {
|
|
5640
|
+
console.log(formatLogMessage('warn', `Grace timed out for ${task.url} — discarding orphan`));
|
|
5641
|
+
}
|
|
5642
|
+
processUrlPromise.catch(() => {});
|
|
5643
|
+
return { url: task.url, rules: [], success: false, error: err.message, needsImmediateRestart: true };
|
|
5644
|
+
} finally {
|
|
5645
|
+
if (graceTimer) clearTimeout(graceTimer);
|
|
5646
|
+
}
|
|
5335
5647
|
}
|
|
5336
5648
|
throw err;
|
|
5337
5649
|
} finally {
|