@fanboynz/network-scanner 2.0.66 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +134 -10
- package/CHANGELOG.md +135 -0
- package/CLAUDE.md +18 -7
- package/README.md +12 -4
- package/lib/adblock-rust.js +23 -18
- package/lib/adblock.js +127 -82
- package/lib/browserexit.js +210 -200
- package/lib/browserhealth.js +84 -60
- package/lib/cdp.js +103 -81
- package/lib/clear_sitedata.js +61 -159
- package/lib/cloudflare.js +579 -409
- package/lib/colorize.js +29 -12
- package/lib/compare.js +16 -8
- package/lib/compress.js +2 -1
- package/lib/curl.js +287 -220
- package/lib/domain-cache.js +87 -40
- package/lib/dry-run.js +137 -194
- package/lib/fingerprint.js +20 -18
- package/lib/flowproxy.js +391 -188
- package/lib/ghost-cursor.js +8 -7
- package/lib/grep.js +248 -171
- package/lib/ignore_similar.js +70 -124
- package/lib/interaction.js +132 -235
- package/lib/nettools.js +309 -87
- package/lib/openvpn_vpn.js +12 -11
- package/lib/output.js +92 -59
- package/lib/post-processing.js +216 -162
- package/lib/redirect.js +46 -30
- package/lib/referrer.js +158 -165
- package/lib/searchstring.js +290 -381
- package/lib/smart-cache.js +141 -91
- package/lib/socks-relay.js +8 -7
- package/lib/spawn-async.js +137 -0
- package/lib/validate_rules.js +188 -176
- package/lib/wireguard_vpn.js +111 -117
- package/nwss.js +740 -156
- package/package.json +4 -4
package/nwss.js
CHANGED
|
@@ -28,13 +28,13 @@ const {
|
|
|
28
28
|
cleanup: cleanupCloudflareCache
|
|
29
29
|
} = require('./lib/cloudflare');
|
|
30
30
|
// FP Bypass
|
|
31
|
-
const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy');
|
|
31
|
+
const { handleFlowProxyProtection, getFlowProxyTimeouts, attachFlowProxyHeaderListener } = require('./lib/flowproxy');
|
|
32
32
|
// ignore_similar rules
|
|
33
33
|
const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar');
|
|
34
34
|
// Graceful exit
|
|
35
|
-
const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit');
|
|
35
|
+
const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
|
|
36
36
|
// Whois & Dig
|
|
37
|
-
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats } = require('./lib/nettools');
|
|
37
|
+
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
|
|
38
38
|
// File compare
|
|
39
39
|
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
|
|
40
40
|
// CDP functionality
|
|
@@ -42,7 +42,29 @@ const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeo
|
|
|
42
42
|
// Post-processing cleanup
|
|
43
43
|
const { processResults } = require('./lib/post-processing');
|
|
44
44
|
// Colorize various text when used
|
|
45
|
-
const {
|
|
45
|
+
const { messageColors, formatLogMessage } = require('./lib/colorize');
|
|
46
|
+
const TIMEOUT_TAG = messageColors.processing('[TIMEOUT]');
|
|
47
|
+
const INTERACTION_TAG = messageColors.processing('[interaction]');
|
|
48
|
+
const GHOST_CURSOR_TAG = messageColors.processing('[ghost-cursor]');
|
|
49
|
+
const PROXY_TAG = messageColors.processing('[proxy]');
|
|
50
|
+
const GREP_RESPONSE_TAG = messageColors.processing('[grep-response]');
|
|
51
|
+
const IGNORE_DOMAINS_BY_URL_TAG = messageColors.processing('[ignoreDomainsByUrl]');
|
|
52
|
+
const BLOCK_DOMAINS_BY_URL_TAG = messageColors.processing('[blockDomainsByUrl]');
|
|
53
|
+
const IGNORE_SIMILAR_IGNORED_DOMAINS_TAG = messageColors.processing('[ignore_similar_ignored_domains]');
|
|
54
|
+
const IGNORE_SIMILAR_TAG = messageColors.processing('[ignore_similar]');
|
|
55
|
+
const CLEAR_SITEDATA_TAG = messageColors.processing('[clear_sitedata]');
|
|
56
|
+
const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
|
|
57
|
+
const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
|
|
58
|
+
const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
|
|
59
|
+
const VPN_TAG = messageColors.processing('[vpn]');
|
|
60
|
+
// Precomputed colored '[SmartCache]' subsystem prefix — paired with the
|
|
61
|
+
// same constant in lib/smart-cache.js so debug lines from both files
|
|
62
|
+
// produce consistently colored output. formatLogMessage only colors the
|
|
63
|
+
// [severity] tag; this constant colors the subsystem prefix.
|
|
64
|
+
const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
|
|
65
|
+
// Precomputed colored '[CONCURRENCY]' subsystem prefix for batch-throughput
|
|
66
|
+
// log lines (start/completed). Same cyan as the other monitoring tags.
|
|
67
|
+
const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
46
68
|
// Enhanced mouse interaction and page simulation
|
|
47
69
|
const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
48
70
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
@@ -158,7 +180,10 @@ function detectPuppeteerVersion() {
|
|
|
158
180
|
// Enhanced redirect handling
|
|
159
181
|
const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
|
|
160
182
|
// Ensure web browser is working correctly
|
|
161
|
-
|
|
183
|
+
// purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
|
|
184
|
+
// and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
|
|
185
|
+
// automatically — manual purging is no longer needed.
|
|
186
|
+
const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
|
|
162
187
|
|
|
163
188
|
// --- Script Configuration & Constants ---
|
|
164
189
|
const VERSION = '2.0.33'; // Script version
|
|
@@ -350,7 +375,12 @@ const dnsPrecheckTimeoutMs = 2000;
|
|
|
350
375
|
const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
|
|
351
376
|
const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
352
377
|
const DNS_NEGATIVE_CACHE_MAX = 1000;
|
|
353
|
-
let dnsPrecheckSkips = 0;
|
|
378
|
+
let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
|
|
379
|
+
let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
|
|
380
|
+
const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
|
|
381
|
+
// c-ares transient codes — read-only, hoisted out of the per-task DNS
|
|
382
|
+
// pre-check so we don't allocate a fresh Set per URL.
|
|
383
|
+
const DNS_TRANSIENT_ERRORS = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);
|
|
354
384
|
|
|
355
385
|
function dnsNegativeCacheSet(hostname, error) {
|
|
356
386
|
if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
|
|
@@ -693,7 +723,7 @@ General Options:
|
|
|
693
723
|
|
|
694
724
|
Validation Options:
|
|
695
725
|
--cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
|
|
696
|
-
--dns-cache Persist dig/whois results to disk between runs (
|
|
726
|
+
--dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each)
|
|
697
727
|
--no-dns-precheck Disable per-URL DNS resolution check before page navigation.
|
|
698
728
|
By default, URLs whose hostname doesn't resolve are skipped
|
|
699
729
|
immediately (saves ~5-15s of Puppeteer time per dead host).
|
|
@@ -707,6 +737,7 @@ Validation Options:
|
|
|
707
737
|
Global config.json options:
|
|
708
738
|
ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
|
|
709
739
|
ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
|
|
740
|
+
blockDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, ALL subsequent requests on that root domain (and subdomains) are aborted via Puppeteer for the rest of the scan
|
|
710
741
|
blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
|
|
711
742
|
whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
|
|
712
743
|
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
|
|
@@ -876,6 +907,7 @@ const {
|
|
|
876
907
|
sites = [],
|
|
877
908
|
ignoreDomains = [],
|
|
878
909
|
ignoreDomainsByUrl = [],
|
|
910
|
+
blockDomainsByUrl = [],
|
|
879
911
|
blocked: globalBlocked = [],
|
|
880
912
|
whois_delay = 3000,
|
|
881
913
|
whois_server_mode = 'random',
|
|
@@ -965,10 +997,11 @@ if (validateConfig) {
|
|
|
965
997
|
}
|
|
966
998
|
}
|
|
967
999
|
|
|
968
|
-
// Pre-compile global blocked regexes ONCE (used in every processUrl call)
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
1000
|
+
// Pre-compile global blocked regexes ONCE (used in every processUrl call).
|
|
1001
|
+
// Was: bare `.map(pattern => new RegExp(pattern))` which hard-threw at
|
|
1002
|
+
// module load on a single bad pattern, killing scan startup. Helper now
|
|
1003
|
+
// warns + skips so the rest of the config can still run.
|
|
1004
|
+
const globalBlockedRegexes = compilePatternList('blocked (global)', globalBlocked);
|
|
972
1005
|
|
|
973
1006
|
// Cache compiled regexes by pattern string — avoids recompiling same patterns across URLs
|
|
974
1007
|
const _compiledRegexCache = new Map();
|
|
@@ -987,6 +1020,44 @@ function getCompiledRegexes(patterns) {
|
|
|
987
1020
|
return arr.map(p => getCompiledRegex(p));
|
|
988
1021
|
}
|
|
989
1022
|
|
|
1023
|
+
/**
 * Compile a list of regex pattern strings, warning loudly about every entry
 * that cannot be used instead of:
 *   (a) silently dropping it (old ignoreDomainsByUrl/blockDomainsByUrl
 *       behavior) -- which made debugging "why isn't my pattern matching?"
 *       miserable, or
 *   (b) hard-throwing at module load (old `blocked` behavior) -- where one
 *       bad pattern would kill the whole scan startup.
 *
 * Each rejected entry produces exactly one warn line naming the config key
 * and the reason, which is enough to find and fix the entry in the config.
 *
 * Entries that are neither a string nor a RegExp are rejected up front:
 * the RegExp constructor would otherwise coerce them without throwing
 * (`null` becomes /null/, `undefined` becomes the match-everything /(?:)/),
 * turning a config typo into a live pattern.
 *
 * @param {string} configKey - name of the config key, for warn context
 * @param {string[]} patterns - raw regex source strings; a non-array value
 *   yields an empty result
 * @param {(p:string)=>RegExp} [compile] - compile fn (defaults to new RegExp)
 * @returns {RegExp[]} the successfully compiled regexes, in input order
 */
function compilePatternList(configKey, patterns, compile = (p) => new RegExp(p)) {
  if (!Array.isArray(patterns)) return [];
  const out = [];
  for (const p of patterns) {
    if (typeof p !== 'string' && !(p instanceof RegExp)) {
      // Describe by type rather than JSON.stringify so an odd value
      // (cyclic object, BigInt) cannot make the warning itself throw.
      const got = p === null ? 'null' : typeof p;
      console.warn(formatLogMessage('warn', `[config] ${configKey} pattern dropped (expected string, got ${got})`));
      continue;
    }
    try {
      out.push(compile(p));
    } catch (err) {
      // A custom compile fn may throw a non-Error value; fall back to its
      // string form so the warn line never reads "undefined".
      const reason = err && err.message ? err.message : String(err);
      console.warn(formatLogMessage('warn', `[config] ${configKey} pattern dropped (compile error): ${JSON.stringify(p)} -- ${reason}`));
    }
  }
  return out;
}
|
|
1054
|
+
|
|
1055
|
+
// Per-pattern match counters for the `blocked` regex (site + global,
|
|
1056
|
+
// combined). Keyed by RegExp.source so the same pattern appearing in both
|
|
1057
|
+
// site and global lists rolls up into one row. Reported at scan end so
|
|
1058
|
+
// stale patterns that match zero requests are easy to spot and prune.
|
|
1059
|
+
const _blockedPatternHits = new Map();
|
|
1060
|
+
|
|
990
1061
|
// Pre-split ignoreDomains into exact Set (O(1) lookup) and wildcard array
|
|
991
1062
|
const _ignoreDomainsExact = new Set();
|
|
992
1063
|
const _ignoreDomainsWildcard = [];
|
|
@@ -998,15 +1069,23 @@ for (const pattern of ignoreDomains) {
|
|
|
998
1069
|
}
|
|
999
1070
|
}
|
|
1000
1071
|
|
|
1001
|
-
// Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
try { return getCompiledRegex(p); } catch { return null; }
|
|
1005
|
-
}).filter(r => r)
|
|
1006
|
-
: [];
|
|
1072
|
+
// Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains.
|
|
1073
|
+
// Bad patterns warn (via compilePatternList) instead of silently dropping.
|
|
1074
|
+
const _ignoreDomainsByUrlRegexes = compilePatternList('ignoreDomainsByUrl', ignoreDomainsByUrl, getCompiledRegex);
|
|
1007
1075
|
// Runtime Set of domains marked ignored by URL pattern matches — shared across all sites in this scan
|
|
1008
1076
|
const _dynamicallyIgnoredDomains = new Set();
|
|
1009
1077
|
|
|
1078
|
+
// blockDomainsByUrl: symmetric to ignoreDomainsByUrl but for active
|
|
1079
|
+
// blocking via Puppeteer's request.abort(). When a request URL matches
|
|
1080
|
+
// one of these regex patterns, the request's root domain is added to
|
|
1081
|
+
// _dynamicallyBlockedDomains; subsequent requests on that domain (and
|
|
1082
|
+
// its subdomains, via parent-walk in matchesDynamicBlock) get aborted
|
|
1083
|
+
// before reaching the network. The triggering request itself is also
|
|
1084
|
+
// aborted -- same "gate fires immediately after trigger" semantic the
|
|
1085
|
+
// ignoreDomainsByUrl path uses for the dynamic Set short-circuit.
|
|
1086
|
+
const _blockDomainsByUrlRegexes = compilePatternList('blockDomainsByUrl', blockDomainsByUrl, getCompiledRegex);
|
|
1087
|
+
const _dynamicallyBlockedDomains = new Set();
|
|
1088
|
+
|
|
1010
1089
|
// Apply global configuration overrides with validation
|
|
1011
1090
|
// Priority: Command line args > config.json > defaults
|
|
1012
1091
|
const MAX_CONCURRENT_SITES = (() => {
|
|
@@ -1103,7 +1182,7 @@ function safeMarkDomainProcessed(domain, context, metadata) {
|
|
|
1103
1182
|
}
|
|
1104
1183
|
} catch (cacheErr) {
|
|
1105
1184
|
if (forceDebug) {
|
|
1106
|
-
console.log(formatLogMessage('debug',
|
|
1185
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
|
|
1107
1186
|
}
|
|
1108
1187
|
}
|
|
1109
1188
|
}
|
|
@@ -1417,16 +1496,58 @@ function shouldBypassCacheForUrl(url, siteConfig) {
|
|
|
1417
1496
|
// ability to use wildcards in ignoreDomains
|
|
1418
1497
|
// Cache compiled wildcard regexes to avoid recompilation on every request
|
|
1419
1498
|
const _wildcardRegexCache = new Map();
|
|
1499
|
+
|
|
1500
|
+
// Generic parent-walk helper: returns true if `domain` or any of its
// parents (one label at a time, up to and including the TLD) is present
// in `set`. Mirrors the static/dynamic parent-walk inside
// matchesIgnoreDomain but works against an arbitrary single Set --
// consumed by matchesDynamicBlock below. matchesIgnoreDomain keeps its
// inline dual-Set probe so the hot path stays single-split, but new
// single-Set consumers (block, future similar features) share this helper.
//
// A missing set or a non-string domain yields false instead of throwing,
// so a request whose hostname could not be derived is simply "not matched".
//
// The walk uses indexOf/slice on the original string rather than
// split + slice + join, so each level costs one substring and no
// intermediate arrays; the candidates probed are the same, in the same
// order (e.g. sub.ads.example.com -> ads.example.com -> example.com -> com).
function _domainOrParentInSet(set, domain) {
  if (!set || set.size === 0) return false;
  if (typeof domain !== 'string') return false;
  if (set.has(domain)) return true;
  let dot = domain.indexOf('.');
  while (dot !== -1) {
    if (set.has(domain.slice(dot + 1))) return true;
    dot = domain.indexOf('.', dot + 1);
  }
  return false;
}
|
|
1516
|
+
|
|
1517
|
+
/**
 * Block-side counterpart to the ignore gate.
 *
 * Looks `domain` up in _dynamicallyBlockedDomains, also trying each of its
 * parent domains, and reports whether any of them was recorded there by an
 * earlier blockDomainsByUrl pattern match. Intended to be consulted
 * per-request so the caller can request.abort() before the static
 * blocked-regex check runs.
 *
 * @param {string} domain - hostname of the request being evaluated
 * @returns {boolean} true when the domain or one of its parents is blocked
 */
function matchesDynamicBlock(domain) {
  const isBlocked = _domainOrParentInSet(_dynamicallyBlockedDomains, domain);
  return isBlocked;
}
|
|
1527
|
+
|
|
1420
1528
|
function matchesIgnoreDomain(domain, ignorePatterns) {
|
|
1421
|
-
//
|
|
1422
|
-
|
|
1423
|
-
//
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1529
|
+
// Both dynamic and static ignore lists are walked parent-by-parent so a
|
|
1530
|
+
// subdomain of an ignored root inherits the ignore. Previously the
|
|
1531
|
+
// dynamic check was exact-only, creating an asymmetry: a static-config
|
|
1532
|
+
// `example.com` ignored cdn.example.com transitively, but a runtime
|
|
1533
|
+
// ignoreDomainsByUrl match for the same root (stored as root via
|
|
1534
|
+
// checkedRootDomain at line ~2993) did NOT cascade -- subdomains slipped
|
|
1535
|
+
// through to dig/whois/regex despite the root being ignored. Now
|
|
1536
|
+
// unified: parts split once, shared between both Set probes.
|
|
1537
|
+
const hasDynamic = _dynamicallyIgnoredDomains.size > 0;
|
|
1538
|
+
const hasExact = _ignoreDomainsExact.size > 0;
|
|
1539
|
+
|
|
1540
|
+
if (hasDynamic || hasExact) {
|
|
1541
|
+
// Exact-domain hit on either set wins early.
|
|
1542
|
+
if (hasDynamic && _dynamicallyIgnoredDomains.has(domain)) return true;
|
|
1543
|
+
if (hasExact && _ignoreDomainsExact.has(domain)) return true;
|
|
1544
|
+
|
|
1545
|
+
// Parent-walk: sub.ads.example.com → ads.example.com → example.com
|
|
1427
1546
|
const parts = domain.split('.');
|
|
1428
1547
|
for (let i = 1; i < parts.length; i++) {
|
|
1429
|
-
|
|
1548
|
+
const parent = parts.slice(i).join('.');
|
|
1549
|
+
if (hasDynamic && _dynamicallyIgnoredDomains.has(parent)) return true;
|
|
1550
|
+
if (hasExact && _ignoreDomainsExact.has(parent)) return true;
|
|
1430
1551
|
}
|
|
1431
1552
|
}
|
|
1432
1553
|
|
|
@@ -1868,7 +1989,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1868
1989
|
wgDisconnectAll(forceDebug);
|
|
1869
1990
|
ovpnDisconnectAll(forceDebug);
|
|
1870
1991
|
cleanupCloudflareCache();
|
|
1871
|
-
purgeStaleTrackers();
|
|
1872
1992
|
try { await closeAllSocksRelays(forceDebug); } catch (_) {}
|
|
1873
1993
|
}
|
|
1874
1994
|
|
|
@@ -2020,28 +2140,46 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2020
2140
|
'Browser disconnected'
|
|
2021
2141
|
]);
|
|
2022
2142
|
|
|
2143
|
+
// Popup-capture cleanup registry — declared outside the try so the
|
|
2144
|
+
// finally block (which is a separate lexical scope from try) can see
|
|
2145
|
+
// it. Populated by the capture_popups setup block if siteConfig
|
|
2146
|
+
// .capture_popups is true; iterated in finally to deregister the
|
|
2147
|
+
// browser 'targetcreated' listener and close any tracked popup pages.
|
|
2148
|
+
const popupCleanups = [];
|
|
2149
|
+
// Race-window guard: 'targetcreated' fires synchronously, but
|
|
2150
|
+
// onTargetCreated does an `await target.page()`. If a popup target
|
|
2151
|
+
// is created right as the per-URL try block winds down, the await
|
|
2152
|
+
// can resolve AFTER finally has already iterated popupCleanups —
|
|
2153
|
+
// leaving the popup unregistered for manual cleanup (it still gets
|
|
2154
|
+
// closed by its own 3s auto-close timer, but in the meantime its
|
|
2155
|
+
// request listener could capture matches into matchedDomains for a
|
|
2156
|
+
// URL that already "finished"). The flag is set in finally and
|
|
2157
|
+
// checked at the start of onTargetCreated to short-circuit late
|
|
2158
|
+
// events cleanly.
|
|
2159
|
+
let urlFinished = false;
|
|
2160
|
+
|
|
2023
2161
|
try {
|
|
2024
2162
|
|
|
2025
2163
|
// --- Connect VPN if configured for this site ---
|
|
2026
2164
|
if (siteConfig.vpn) {
|
|
2027
2165
|
const vpnResult = await wgConnect(siteConfig, forceDebug);
|
|
2028
2166
|
if (!vpnResult.success) {
|
|
2029
|
-
console.warn(formatLogMessage('warn',
|
|
2167
|
+
console.warn(formatLogMessage('warn', `${VPN_TAG} WireGuard failed for ${currentUrl}: ${vpnResult.error}`));
|
|
2030
2168
|
return { url: currentUrl, rules: [], success: false, vpnFailed: true };
|
|
2031
2169
|
}
|
|
2032
2170
|
if (!silentMode) {
|
|
2033
2171
|
const ipInfo = vpnResult.externalIP ? ` (${vpnResult.externalIP})` : '';
|
|
2034
|
-
console.log(formatLogMessage('info',
|
|
2172
|
+
console.log(formatLogMessage('info', `${VPN_TAG} WireGuard connected via ${vpnResult.interface}${ipInfo} for ${currentUrl}`));
|
|
2035
2173
|
}
|
|
2036
2174
|
} else if (siteConfig.openvpn) {
|
|
2037
2175
|
const ovpnResult = await ovpnConnect(siteConfig, forceDebug);
|
|
2038
2176
|
if (!ovpnResult.success) {
|
|
2039
|
-
console.warn(formatLogMessage('warn',
|
|
2177
|
+
console.warn(formatLogMessage('warn', `${VPN_TAG} OpenVPN failed for ${currentUrl}: ${ovpnResult.error}`));
|
|
2040
2178
|
return { url: currentUrl, rules: [], success: false, vpnFailed: true };
|
|
2041
2179
|
}
|
|
2042
2180
|
if (!silentMode) {
|
|
2043
2181
|
const ipInfo = ovpnResult.externalIP ? ` (${ovpnResult.externalIP})` : '';
|
|
2044
|
-
console.log(formatLogMessage('info',
|
|
2182
|
+
console.log(formatLogMessage('info', `${VPN_TAG} OpenVPN connected via ${ovpnResult.connection}${ipInfo} for ${currentUrl}`));
|
|
2045
2183
|
}
|
|
2046
2184
|
}
|
|
2047
2185
|
|
|
@@ -2075,12 +2213,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2075
2213
|
const totalDelay = siteDelay + bufferTime;
|
|
2076
2214
|
|
|
2077
2215
|
if (forceDebug && hasCloudflareConfig) {
|
|
2078
|
-
console.log(formatLogMessage('debug',
|
|
2216
|
+
console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Using extended delay for Cloudflare site: ${totalDelay}ms (${siteDelay}ms + ${bufferTime}ms CF buffer)`));
|
|
2079
2217
|
}
|
|
2080
2218
|
|
|
2081
2219
|
const realtimeResult = await performRealtimeWindowCleanup(browserInstance, threshold, forceDebug, totalDelay);
|
|
2082
2220
|
if (realtimeResult.success && realtimeResult.closedCount > 0 && forceDebug) {
|
|
2083
|
-
console.log(formatLogMessage('debug',
|
|
2221
|
+
console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Cleaned ${realtimeResult.closedCount} old pages, ${realtimeResult.remainingPages} remaining`));
|
|
2084
2222
|
}
|
|
2085
2223
|
}
|
|
2086
2224
|
|
|
@@ -2091,7 +2229,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2091
2229
|
// Aggressive timeouts prevent hanging in Puppeteer 23.x while maintaining speed
|
|
2092
2230
|
|
|
2093
2231
|
page.on('console', (msg) => {
|
|
2094
|
-
if (forceDebug && msg.type() === 'error') console.log(
|
|
2232
|
+
if (forceDebug && msg.type() === 'error') console.log(formatLogMessage('debug', `Console error: ${msg.text()}`));
|
|
2095
2233
|
});
|
|
2096
2234
|
|
|
2097
2235
|
// Add page crash handler
|
|
@@ -2152,6 +2290,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2152
2290
|
const flowproxyTimeouts = getFlowProxyTimeouts(siteConfig);
|
|
2153
2291
|
page.setDefaultTimeout(Math.min(flowproxyTimeouts.pageTimeout, TIMEOUTS.DEFAULT_NAVIGATION));
|
|
2154
2292
|
page.setDefaultNavigationTimeout(Math.min(flowproxyTimeouts.navigationTimeout, TIMEOUTS.DEFAULT_PAGE));
|
|
2293
|
+
// Attach the response/header listener BEFORE navigation so the
|
|
2294
|
+
// document response's own headers (Server, Set-Cookie, X-FlowProxy-*,
|
|
2295
|
+
// etc.) are observed. The listener accumulates state in a WeakMap
|
|
2296
|
+
// keyed by page; analyzeFlowProxyProtection reads from it later.
|
|
2297
|
+
attachFlowProxyHeaderListener(page);
|
|
2155
2298
|
if (forceDebug) {
|
|
2156
2299
|
console.log(formatLogMessage('debug', `Applied flowProxy timeouts - page: ${flowproxyTimeouts.pageTimeout}ms, nav: ${flowproxyTimeouts.navigationTimeout}ms`));
|
|
2157
2300
|
}
|
|
@@ -2170,9 +2313,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2170
2313
|
if (shouldInjectEvalForPage) {
|
|
2171
2314
|
if (forceDebug) {
|
|
2172
2315
|
if (globalEvalOnDoc) {
|
|
2173
|
-
console.log(formatLogMessage('debug',
|
|
2316
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
|
|
2174
2317
|
} else { // siteConfig.evaluateOnNewDocument must be true
|
|
2175
|
-
console.log(formatLogMessage('debug',
|
|
2318
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
|
|
2176
2319
|
}
|
|
2177
2320
|
}
|
|
2178
2321
|
|
|
@@ -2193,7 +2336,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2193
2336
|
browserResponsive = true;
|
|
2194
2337
|
} catch (healthErr) {
|
|
2195
2338
|
if (forceDebug) {
|
|
2196
|
-
console.log(formatLogMessage('debug',
|
|
2339
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser health check failed: ${healthErr.message}`));
|
|
2197
2340
|
}
|
|
2198
2341
|
browserResponsive = false;
|
|
2199
2342
|
}
|
|
@@ -2292,7 +2435,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2292
2435
|
]);
|
|
2293
2436
|
evalOnDocSuccess = true;
|
|
2294
2437
|
if (forceDebug) {
|
|
2295
|
-
console.log(formatLogMessage('debug',
|
|
2438
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection successful for ${currentUrl}`));
|
|
2296
2439
|
}
|
|
2297
2440
|
} catch (fullInjectionErr) {
|
|
2298
2441
|
// Enhanced error detection for CDP issues
|
|
@@ -2303,12 +2446,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2303
2446
|
|
|
2304
2447
|
if (forceDebug) {
|
|
2305
2448
|
const errorType = isCDPError ? 'CDP/Protocol error' : 'timeout/other';
|
|
2306
|
-
console.log(formatLogMessage('debug',
|
|
2449
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection failed (${errorType}): ${fullInjectionErr.message}`));
|
|
2307
2450
|
}
|
|
2308
2451
|
|
|
2309
2452
|
// Skip fallback for CDP errors - they indicate browser communication issues
|
|
2310
2453
|
if (isCDPError) {
|
|
2311
|
-
console.warn(formatLogMessage('warn',
|
|
2454
|
+
console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} CDP communication failure - skipping injection for ${currentUrl}`));
|
|
2312
2455
|
evalOnDocSuccess = false;
|
|
2313
2456
|
} else {
|
|
2314
2457
|
|
|
@@ -2355,11 +2498,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2355
2498
|
]);
|
|
2356
2499
|
evalOnDocSuccess = true;
|
|
2357
2500
|
if (forceDebug) {
|
|
2358
|
-
console.log(formatLogMessage('debug',
|
|
2501
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection successful for ${currentUrl}`));
|
|
2359
2502
|
}
|
|
2360
2503
|
} catch (minimalInjectionErr) {
|
|
2361
2504
|
if (forceDebug) {
|
|
2362
|
-
console.log(formatLogMessage('debug',
|
|
2505
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection also failed: ${minimalInjectionErr.message}`));
|
|
2363
2506
|
}
|
|
2364
2507
|
evalOnDocSuccess = false;
|
|
2365
2508
|
}
|
|
@@ -2367,14 +2510,14 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2367
2510
|
}
|
|
2368
2511
|
} else {
|
|
2369
2512
|
if (forceDebug) {
|
|
2370
|
-
console.log(formatLogMessage('debug',
|
|
2513
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser unresponsive, skipping injection for ${currentUrl}`));
|
|
2371
2514
|
}
|
|
2372
2515
|
evalOnDocSuccess = false;
|
|
2373
2516
|
}
|
|
2374
2517
|
|
|
2375
2518
|
// Final status logging
|
|
2376
2519
|
if (!evalOnDocSuccess) {
|
|
2377
|
-
console.warn(formatLogMessage('warn',
|
|
2520
|
+
console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} All injection strategies failed for ${currentUrl} - continuing with standard request monitoring only`));
|
|
2378
2521
|
}
|
|
2379
2522
|
// Allow realtime cleanup to proceed after injection completes
|
|
2380
2523
|
if (shouldInjectEvalForPage && siteConfig.window_cleanup === "realtime") {
|
|
@@ -2403,7 +2546,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2403
2546
|
}
|
|
2404
2547
|
}, { selectors: cssBlockedSelectors });
|
|
2405
2548
|
} catch (cssErr) {
|
|
2406
|
-
console.warn(formatLogMessage('warn',
|
|
2549
|
+
console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
|
|
2407
2550
|
}
|
|
2408
2551
|
}
|
|
2409
2552
|
// --- END: CSS Element Blocking Setup ---
|
|
@@ -2460,7 +2603,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2460
2603
|
const clearResult = await clearSiteData(page, currentUrl, forceDebug);
|
|
2461
2604
|
if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data for ${currentUrl}`));
|
|
2462
2605
|
} catch (clearErr) {
|
|
2463
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
2606
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Failed for ${currentUrl}: ${clearErr.message}`));
|
|
2464
2607
|
}
|
|
2465
2608
|
}
|
|
2466
2609
|
|
|
@@ -2686,19 +2829,41 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2686
2829
|
});
|
|
2687
2830
|
}
|
|
2688
2831
|
|
|
2689
|
-
|
|
2690
|
-
|
|
2691
|
-
|
|
2832
|
+
// Per-site blocked compile -- helper warns on bad patterns instead of
|
|
2833
|
+
// throwing out of processUrl and breaking that site's scan.
|
|
2834
|
+
const blockedRegexes = compilePatternList(`blocked (site: ${siteConfig.url || 'unknown'})`, siteConfig.blocked, getCompiledRegex);
|
|
2835
|
+
|
|
2836
|
+
// Per-site escape hatch: disable_adblock turns off the two layers of
|
|
2837
|
+
// "global" ad-blocking for this URL — the adblock-rs filter-list engine
|
|
2838
|
+
// and the globalBlockedRegexes pattern list. Per-site siteConfig.blocked
|
|
2839
|
+
// is preserved (it's an explicit per-site choice, not "global" blocking).
|
|
2840
|
+
//
|
|
2841
|
+
// The use case: capture_popups + popunder/redirect chains. The global
|
|
2842
|
+
// adblock often aborts the exact requests that fire the popup or chain
|
|
2843
|
+
// to the tracker, defeating capture. Setting disable_adblock: true for
|
|
2844
|
+
// those specific URLs lets the chain play out naturally so the popup
|
|
2845
|
+
// request listener can observe the full hop sequence.
|
|
2846
|
+
const disableAdblock = siteConfig.disable_adblock === true;
|
|
2692
2847
|
|
|
2693
2848
|
// Pre-build Set for O(1) resourceType lookups (fired per request)
|
|
2694
2849
|
const allowedResourceTypesSet = Array.isArray(siteConfig.resourceTypes)
|
|
2695
2850
|
? new Set(siteConfig.resourceTypes)
|
|
2696
2851
|
: null;
|
|
2697
|
-
|
|
2698
|
-
// Combine site-specific with pre-compiled global blocked patterns
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
|
|
2852
|
+
|
|
2853
|
+
// Effective blocked-pattern list for this URL:
//   - disable_adblock: true  -> per-site patterns only (global list omitted)
//   - per-site patterns exist -> per-site patterns first, then the global ones
//   - no per-site patterns    -> the global array is reused as-is (no copy)
// Site patterns are kept in front of the global ones; the blocked-request
// debug label elsewhere in this handler derives 'site' vs 'global' from the
// position of the matching pattern in this list.
const allBlockedRegexes = (() => {
  if (disableAdblock) return blockedRegexes;
  if (blockedRegexes.length > 0) return [...blockedRegexes, ...globalBlockedRegexes];
  return globalBlockedRegexes;
})();

// Debug-only note describing what disable_adblock switched off for this URL.
if (forceDebug && disableAdblock) {
  const skippedGlobalCount = globalBlockedRegexes.length;
  const engineSuffix = (adblockEnabled && adblockMatcher) ? ' + adblock-rs engine' : '';
  console.log(formatLogMessage('debug', `[adblock] disable_adblock=true for ${currentUrl} — skipping ${skippedGlobalCount} global blocked patterns${engineSuffix} (site-level ${blockedRegexes.length} pattern(s) still apply)`));
}
|
|
2702
2867
|
|
|
2703
2868
|
/**
|
|
2704
2869
|
* Helper function to add domain to matched collection
|
|
@@ -2725,7 +2890,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2725
2890
|
const cachedSimilarity = smartCache.getCachedSimilarity(domain, existingDomain);
|
|
2726
2891
|
if (cachedSimilarity !== null && cachedSimilarity >= similarityThreshold) {
|
|
2727
2892
|
if (forceDebug) {
|
|
2728
|
-
console.log(formatLogMessage('debug',
|
|
2893
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Used cached similarity: ${domain} ~= ${existingDomain} (${cachedSimilarity}%)`));
|
|
2729
2894
|
}
|
|
2730
2895
|
return; // Skip adding this domain
|
|
2731
2896
|
}
|
|
@@ -2749,7 +2914,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2749
2914
|
|
|
2750
2915
|
if (smartCache && smartCache.shouldSkipDomain(domain, context)) {
|
|
2751
2916
|
if (forceDebug) {
|
|
2752
|
-
console.log(formatLogMessage('debug',
|
|
2917
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Skipping cached domain: ${domain}`));
|
|
2753
2918
|
}
|
|
2754
2919
|
return; // Skip adding this domain
|
|
2755
2920
|
}
|
|
@@ -2767,7 +2932,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2767
2932
|
|
|
2768
2933
|
if (similarCheck.shouldIgnore) {
|
|
2769
2934
|
if (forceDebug) {
|
|
2770
|
-
console.log(formatLogMessage('debug',
|
|
2935
|
+
console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_TAG} Skipping ${domain}: ${similarCheck.reason}`));
|
|
2771
2936
|
}
|
|
2772
2937
|
return; // Skip adding this domain
|
|
2773
2938
|
}
|
|
@@ -2783,7 +2948,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2783
2948
|
|
|
2784
2949
|
if (ignoredSimilarCheck.shouldIgnore) {
|
|
2785
2950
|
if (forceDebug) {
|
|
2786
|
-
console.log(formatLogMessage('debug',
|
|
2951
|
+
console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_IGNORED_DOMAINS_TAG} Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
|
|
2787
2952
|
}
|
|
2788
2953
|
return; // Skip adding this domain
|
|
2789
2954
|
}
|
|
@@ -2804,7 +2969,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2804
2969
|
}
|
|
2805
2970
|
} catch (cacheErr) {
|
|
2806
2971
|
if (forceDebug) {
|
|
2807
|
-
console.log(formatLogMessage('debug',
|
|
2972
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
|
|
2808
2973
|
}
|
|
2809
2974
|
}
|
|
2810
2975
|
}
|
|
@@ -2822,6 +2987,247 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2822
2987
|
}
|
|
2823
2988
|
}
|
|
2824
2989
|
|
|
2990
|
+
// === POPUP CAPTURE (opt-in via siteConfig.capture_popups: true) ===
|
|
2991
|
+
// Many ad networks fire popunders / new-tab opens (window.open, target=
|
|
2992
|
+
// "_blank") that navigate to trackers and disappear from view. Those
|
|
2993
|
+
// pages are SEPARATE Puppeteer targets — page.on('request', ...) on the
|
|
2994
|
+
// main page never sees their network traffic.
|
|
2995
|
+
//
|
|
2996
|
+
// IMPORTANT: modern Chromium blocks programmatic window.open() unless
|
|
2997
|
+
// it's triggered by a real user gesture. In practice that means
|
|
2998
|
+
// capture_popups only catches anything when the scanner is actually
|
|
2999
|
+
// clicking on the page — i.e., the site config also has
|
|
3000
|
+
// `interact: true` AND `interact_clicks: true`. Setting capture_popups
|
|
3001
|
+
// alone will register the listener but no popups will fire.
|
|
3002
|
+
//
|
|
3003
|
+
// When capture_popups is true, we attach a browser-level 'targetcreated'
|
|
3004
|
+
// listener for THIS URL only. New page targets whose opener-chain leads
|
|
3005
|
+
// back to our main page (within maxDepth levels) get a stripped-down
|
|
3006
|
+
// request listener — same regex/first-party/ignoreDomains filter as
|
|
3007
|
+
// the main handler, same addMatchedDomain() sink, same domain
|
|
3008
|
+
// detection cache, same nettools/similarity logic (all inherited via
|
|
3009
|
+
// addMatchedDomain). Cloudflare bypass, adblock-rs matching, curl/grep
|
|
3010
|
+
// content download, and request.abort() are intentionally skipped on
|
|
3011
|
+
// popups — they're observation-only.
|
|
3012
|
+
//
|
|
3013
|
+
// Each popup's request listener stays attached across in-window
|
|
3014
|
+
// navigations, so a single popup that redirects A -> B -> C captures
|
|
3015
|
+
// every hop. The capture window (default 5s, configurable per-site
|
|
3016
|
+
// via capture_popups_window_ms) is the wall-clock budget for that
|
|
3017
|
+
// chain — bump it for long redirect chains, lower it for high-popup-
|
|
3018
|
+
// rate sites where memory pressure matters more than chain coverage.
|
|
3019
|
+
// Popup capture is strictly opt-in: only a literal `true` enables it.
const capturePopups = siteConfig.capture_popups === true;

// Reads one per-site numeric override. The raw value goes through
// parseInt(..., 10) so numeric strings coming from JSON are accepted.
// Anything that does not parse to a finite integer greater than zero yields
// `fallback`, so a bad value never silently disables capture.
const popupIntSetting = (raw, fallback) => {
  const parsed = parseInt(raw, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return fallback;
  return parsed;
};

// Maximum opener-chain depth that is still captured (default 2).
const POPUP_MAX_DEPTH = popupIntSetting(siteConfig.capture_popups_max_depth, 2);
// Wall-clock budget per popup before it is auto-closed (default 5000 ms).
const POPUP_CAPTURE_WINDOW_MS = popupIntSetting(siteConfig.capture_popups_window_ms, 5000);

if (capturePopups && forceDebug) {
  // One-time setup warning: without click interaction enabled in the site
  // config, popups are unlikely to fire at all.
  if (siteConfig.interact !== true || siteConfig.interact_clicks !== true) {
    console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but interact_clicks is not — popups need user-gesture clicks to fire; expect no captures unless the page opens popups via in-page redirects`));
  }
  console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
}
|
|
3041
|
+
|
|
3042
|
+
if (capturePopups) {
|
|
3043
|
+
const mainTarget = page.target();
|
|
3044
|
+
|
|
3045
|
+
// Depth of `target` relative to mainTarget, found by following the
// target.opener() chain upwards:
//   1 = opened directly by the main page, 2 = popup-of-popup, and so on.
// Returns 0 when mainTarget is not reached. That covers targets that are
// not descendants of the main page, and also chains longer than
// POPUP_MAX_DEPTH + 2 hops, because the walk is capped so a malformed or
// very long chain cannot spin here.
const getPopupDepth = (target) => {
  const hopLimit = POPUP_MAX_DEPTH + 2;
  let ancestor = target.opener();
  for (let hops = 1; ancestor && hops <= hopLimit; hops++) {
    if (ancestor === mainTarget) return hops;
    ancestor = ancestor.opener();
  }
  return 0;
};
|
|
3058
|
+
|
|
3059
|
+
// Attach an observation-only 'request' listener to a popup page.
//
// Request interception is NOT enabled on popups: the listener only inspects
// each request (URL + resource type) and never aborts or continues it.
//
// popupPage - page object of the popup (must expose .on('request', fn))
// depth     - opener-chain depth of the popup, used only in debug output
//
// Per request, in order:
//   1. bail out if the owning URL has already finished (urlFinished)
//   2. derive hostname / root domain (unparseable URLs are ignored)
//   3. ignoreDomainsByUrl / blockDomainsByUrl triggers populate the dynamic
//      ignore / block sets
//   4. ignoreDomains gate, dynamic-block gate, first-/third-party gate
//   5. match against the site's regex list
//   6. hand the domain to the nettools handler (when hasNetTools is set) or
//      straight to addMatchedDomain
// Any error is swallowed on purpose: popup capture must never break the scan.
const attachPopupRequestCapture = (popupPage, depth) => {
  popupPage.on('request', (request) => {
    try {
      // popupPage.close() is asynchronous, so request events can still be
      // delivered after the per-URL finally block has flipped urlFinished.
      // Drop them here so a late popup request cannot touch the dynamic
      // ignore/block sets or push matches for a URL that has already returned.
      if (urlFinished) return;

      const checkedUrl = request.url();
      let fullSubdomain = '';
      let checkedRootDomain = '';
      try {
        const parsedUrl = new URL(checkedUrl);
        fullSubdomain = parsedUrl.hostname;
        const pslResult = psl.parse(fullSubdomain);
        // Fall back to the raw hostname when psl yields no registrable domain.
        checkedRootDomain = pslResult.domain || fullSubdomain;
      } catch (_) { return; }
      if (!checkedRootDomain) return;

      // ignoreDomainsByUrl trigger: the first pattern matching this popup
      // URL adds the root domain to the dynamic ignore set.
      if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
        for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
          if (_ignoreDomainsByUrlRegexes[i].test(checkedUrl)) {
            _dynamicallyIgnoredDomains.add(checkedRootDomain);
            if (forceDebug) {
              console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
            }
            break;
          }
        }
      }

      // blockDomainsByUrl trigger: same shape as the ignore trigger above,
      // but feeds the dynamic block set.
      if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
        for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
          if (_blockDomainsByUrlRegexes[i].test(checkedUrl)) {
            _dynamicallyBlockedDomains.add(checkedRootDomain);
            if (forceDebug) {
              console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source} (from popup depth=${depth})`));
            }
            break;
          }
        }
      }

      // ignoreDomains gate. NOTE(review): this relies on matchesIgnoreDomain
      // also consulting _dynamicallyIgnoredDomains so that a domain added by
      // the trigger above is skipped on this same request; confirm against
      // its definition.
      if (matchesIgnoreDomain(checkedRootDomain, ignoreDomains)) return;

      // Dynamic-block gate. Interception is not enabled on popup pages, so
      // there is nothing to abort here; a blocked domain is simply not
      // processed any further.
      if (matchesDynamicBlock(checkedRootDomain)) return;

      // First-party / third-party gate, evaluated against the main URL's
      // first-party set; the popup's own URL does not redefine first-party.
      const isFirstParty = firstPartyDomains.has(checkedRootDomain);
      if (siteConfig.firstParty === false && isFirstParty) return;
      if (siteConfig.thirdParty === false && !isFirstParty) return;

      // Regex match against the site's regex list (first hit wins).
      const resourceType = request.resourceType();
      let regexMatched = false;
      for (const re of regexes) {
        if (re.test(checkedUrl)) {
          regexMatched = true;
          if (forceDebug) {
            console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${checkedRootDomain} via ${re} (${resourceType})`));
          }
          break;
        }
      }

      if (!regexMatched) return;

      if (hasNetTools) {
        // With whois/dig terms configured, a regex hit alone does not count:
        // the domain is handed to a nettools handler, which is given
        // addMatchedDomain and decides whether to record it. The handler is
        // built per request because the cached whois/dig lookups passed in
        // are specific to this root domain.
        const popupNetToolsHandler = createNetToolsHandler({
          whoisTerms, whoisOrTerms,
          processedWhoisDomains: globalProcessedWhoisDomains,
          processedDigDomains: globalProcessedDigDomains,
          whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
          whoisServer,
          whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
          debugLogFile,
          digTerms, digOrTerms, digRecordType,
          digSubdomain: siteConfig.dig_subdomain === true,
          dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
          matchedDomains, addMatchedDomain,
          isDomainAlreadyDetected: isLocallyDetected,
          onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
          onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
          cachedWhois: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'whois') : null,
          cachedDig: smartCache ? smartCache.getCachedNetTools(checkedRootDomain, 'dig', digRecordType) : null,
          currentUrl, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, fs,
          ignoreDomains, matchesIgnoreDomain
        });
        // Deferred so the lookup does not run inside the request event.
        setImmediate(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
      } else {
        // No nettools required — regex match alone counts.
        addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
      }
    } catch (_) { /* observation-only — never let a popup error escape */ }
  });
};
|
|
3178
|
+
|
|
3179
|
+
// Browser-level 'targetcreated' listener for this URL's popup capture.
//
// For a new page target whose opener chain leads back to the main page
// within POPUP_MAX_DEPTH levels, this attaches the observation-only request
// capture, arms an auto-close timer for POPUP_CAPTURE_WINDOW_MS, and queues
// a cleanup (clear timer + close popup) in popupCleanups.
//
// The whole body is wrapped in try/catch: this is an async listener, so a
// throw would otherwise surface as an unhandled promise rejection (fatal by
// default on current Node versions) instead of reaching any caller.
const onTargetCreated = async (target) => {
  // Best-effort close; both the synchronous throw and the rejected-promise
  // paths are ignored because the popup may already be gone.
  const closePopup = (popup) => {
    try { if (!popup.isClosed()) popup.close().catch(() => {}); } catch (_) {}
  };

  try {
    // If the per-URL finally has already started, do not attach anything:
    // a late popup must not push matches for a URL that has already returned.
    if (urlFinished) return;
    if (target.type() !== 'page') return;
    const depth = getPopupDepth(target);
    if (depth < 1) return; // Not one of ours
    if (depth > POPUP_MAX_DEPTH) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
      }
      return;
    }

    let popupPage;
    try { popupPage = await target.page(); } catch (_) { return; }
    if (!popupPage) return;
    // Re-check after the await — the per-URL finally may have flipped the
    // flag while target.page() was resolving. Close the popup we fetched so
    // it does not linger.
    if (urlFinished) {
      closePopup(popupPage);
      return;
    }

    if (forceDebug) {
      console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
    }

    attachPopupRequestCapture(popupPage, depth);

    // Auto-close after the capture window so popups don't pile up. The
    // timer is unref'd (where supported) so it never keeps the process alive.
    const closeTimer = setTimeout(() => closePopup(popupPage), POPUP_CAPTURE_WINDOW_MS);
    if (typeof closeTimer.unref === 'function') closeTimer.unref();

    // Teardown for the per-URL finally: cancel the timer and close now.
    popupCleanups.push(() => {
      clearTimeout(closeTimer);
      closePopup(popupPage);
    });
  } catch (err) {
    // Observation-only: never let a popup error escape the listener.
    if (forceDebug) {
      console.log(formatLogMessage('debug', `[popup] targetcreated handler failed: ${err && err.message ? err.message : err}`));
    }
  }
};
|
|
3224
|
+
|
|
3225
|
+
// Deregistration for the browser-level popup watcher; queued in
// popupCleanups alongside the per-popup cleanups. Errors from off() are
// ignored.
const detachPopupWatcher = () => {
  try { browser.off('targetcreated', onTargetCreated); } catch (_) {}
};

browser.on('targetcreated', onTargetCreated);
popupCleanups.push(detachPopupWatcher);
|
|
3229
|
+
}
|
|
3230
|
+
|
|
2825
3231
|
// --- page.on('request', ...) Handler: Core Network Request Logic ---
|
|
2826
3232
|
// This handler is triggered for every network request made by the page.
|
|
2827
3233
|
// It decides whether to allow, block, or process the request based on:
|
|
@@ -2882,15 +3288,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2882
3288
|
console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[frame: ${isMainFrame ? 'main' : 'iframe'}] ${debugFrameUrl} → ${checkedUrl}`));
|
|
2883
3289
|
}
|
|
2884
3290
|
|
|
2885
|
-
// Apply adblock rules BEFORE expensive regex checks
|
|
2886
|
-
|
|
3291
|
+
// Apply adblock-rs filter-list rules BEFORE expensive regex checks
|
|
3292
|
+
// for better performance. Gated on !disableAdblock so per-URL configs
|
|
3293
|
+
// (e.g. for popup/redirect chain capture) can bypass it.
|
|
3294
|
+
if (!disableAdblock && adblockEnabled && adblockMatcher) {
|
|
2887
3295
|
try {
|
|
2888
3296
|
const result = adblockMatcher.shouldBlock(
|
|
2889
3297
|
checkedUrl,
|
|
2890
3298
|
currentUrl,
|
|
2891
3299
|
request.resourceType()
|
|
2892
3300
|
);
|
|
2893
|
-
|
|
3301
|
+
|
|
2894
3302
|
if (result.blocked) {
|
|
2895
3303
|
adblockStats.blocked++;
|
|
2896
3304
|
if (forceDebug) {
|
|
@@ -2924,12 +3332,41 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2924
3332
|
if (_ignoreDomainsByUrlRegexes[i].test(reqUrl)) {
|
|
2925
3333
|
_dynamicallyIgnoredDomains.add(checkedRootDomain);
|
|
2926
3334
|
if (forceDebug) {
|
|
2927
|
-
console.log(formatLogMessage('debug',
|
|
3335
|
+
console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
|
|
3336
|
+
}
|
|
3337
|
+
break;
|
|
3338
|
+
}
|
|
3339
|
+
}
|
|
3340
|
+
}
|
|
3341
|
+
|
|
3342
|
+
// blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl above.
|
|
3343
|
+
// If any pattern matches this URL, mark the root domain as blocked
|
|
3344
|
+
// for the rest of the scan. The gate immediately below catches the
|
|
3345
|
+
// triggering request itself + any future request on this domain or
|
|
3346
|
+
// its subdomains (parent-walk via matchesDynamicBlock).
|
|
3347
|
+
if (_blockDomainsByUrlRegexes.length > 0 && checkedRootDomain && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
|
|
3348
|
+
for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
|
|
3349
|
+
if (_blockDomainsByUrlRegexes[i].test(reqUrl)) {
|
|
3350
|
+
_dynamicallyBlockedDomains.add(checkedRootDomain);
|
|
3351
|
+
if (forceDebug) {
|
|
3352
|
+
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source}`));
|
|
2928
3353
|
}
|
|
2929
3354
|
break;
|
|
2930
3355
|
}
|
|
2931
3356
|
}
|
|
2932
3357
|
}
|
|
3358
|
+
// blockDomainsByUrl gate — abort if reqDomain (or a parent) is in
|
|
3359
|
+
// the dynamic block Set. Fires BEFORE the static blocked-regex
|
|
3360
|
+
// check so domain-based blocks short-circuit without paying the
|
|
3361
|
+
// per-URL regex scan. Same abort reason as the static path so
|
|
3362
|
+
// request.failure() observers see consistent metadata.
|
|
3363
|
+
if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
|
|
3364
|
+
if (forceDebug) {
|
|
3365
|
+
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
|
|
3366
|
+
}
|
|
3367
|
+
request.abort('blockedbyclient');
|
|
3368
|
+
return;
|
|
3369
|
+
}
|
|
2933
3370
|
|
|
2934
3371
|
let blockedMatchIndex = -1;
|
|
2935
3372
|
for (let i = 0; i < allBlockedRegexes.length; i++) {
|
|
@@ -2939,8 +3376,16 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2939
3376
|
}
|
|
2940
3377
|
}
|
|
2941
3378
|
if (blockedMatchIndex !== -1) {
|
|
3379
|
+
// Always track the hit, even when debug logging is off (one Map update), so the
|
|
3380
|
+
// scan-end summary can show which patterns are doing work vs.
|
|
3381
|
+
// which are stale and ready to prune. Keyed by pattern.source --
|
|
3382
|
+
// identical patterns from site + global lists roll up together,
|
|
3383
|
+
// which matches how users think about them.
|
|
3384
|
+
const matchedPatternSrc = allBlockedRegexes[blockedMatchIndex].source;
|
|
3385
|
+
_blockedPatternHits.set(matchedPatternSrc, (_blockedPatternHits.get(matchedPatternSrc) || 0) + 1);
|
|
3386
|
+
|
|
2942
3387
|
if (forceDebug) {
|
|
2943
|
-
const matchedPattern =
|
|
3388
|
+
const matchedPattern = matchedPatternSrc;
|
|
2944
3389
|
const patternSource = blockedMatchIndex < blockedRegexes.length ? 'site' : 'global';
|
|
2945
3390
|
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked]')}[${simplifiedCurrentUrl}] ${reqUrl} blocked by ${patternSource} pattern: ${matchedPattern}`));
|
|
2946
3391
|
|
|
@@ -3012,6 +3457,19 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3012
3457
|
return;
|
|
3013
3458
|
}
|
|
3014
3459
|
|
|
3460
|
+
// Early ignoreDomains gate — skip regex + dig/whois entirely for domains
|
|
3461
|
+
// in the ignoreDomains list (or dynamically-ignored ones populated by
|
|
3462
|
+
// ignoreDomainsByUrl above). Mirrors the popup handler's early gate so
|
|
3463
|
+
// the main path doesn't waste a dig/whois lookup on domains that
|
|
3464
|
+
// post-processing/output filters will strip anyway.
|
|
3465
|
+
if (matchesIgnoreDomain(reqDomain, ignoreDomains)) {
|
|
3466
|
+
if (forceDebug) {
|
|
3467
|
+
console.log(formatLogMessage('debug', `Skipping ignoreDomains match: ${reqDomain}`));
|
|
3468
|
+
}
|
|
3469
|
+
request.continue();
|
|
3470
|
+
return;
|
|
3471
|
+
}
|
|
3472
|
+
|
|
3015
3473
|
// === ENHANCED REGEX MATCHING WITH AND/OR LOGIC ===
|
|
3016
3474
|
let regexMatched = false;
|
|
3017
3475
|
let matchedRegexPattern = null;
|
|
@@ -3108,9 +3566,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3108
3566
|
dumpUrls,
|
|
3109
3567
|
matchedUrlsLogFile,
|
|
3110
3568
|
forceDebug,
|
|
3111
|
-
fs
|
|
3569
|
+
fs,
|
|
3570
|
+
ignoreDomains,
|
|
3571
|
+
matchesIgnoreDomain
|
|
3112
3572
|
});
|
|
3113
|
-
|
|
3573
|
+
|
|
3114
3574
|
// Execute nettools check asynchronously
|
|
3115
3575
|
const originalDomain = fullSubdomain;
|
|
3116
3576
|
setImmediate(() => netToolsHandler(reqDomain, originalDomain));
|
|
@@ -3184,7 +3644,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3184
3644
|
const cachedDig = smartCache ? smartCache.getCachedNetTools(reqDomain, 'dig', digRecordType) : null;
|
|
3185
3645
|
|
|
3186
3646
|
if ((cachedWhois || cachedDig) && forceDebug) {
|
|
3187
|
-
console.log(formatLogMessage('debug',
|
|
3647
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached nettools results for ${reqDomain}`));
|
|
3188
3648
|
}
|
|
3189
3649
|
|
|
3190
3650
|
// Create nettools handler with cache callbacks (if cache is enabled)
|
|
@@ -3221,9 +3681,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3221
3681
|
dumpUrls,
|
|
3222
3682
|
matchedUrlsLogFile,
|
|
3223
3683
|
forceDebug,
|
|
3224
|
-
fs
|
|
3684
|
+
fs,
|
|
3685
|
+
ignoreDomains,
|
|
3686
|
+
matchesIgnoreDomain
|
|
3225
3687
|
});
|
|
3226
|
-
|
|
3688
|
+
|
|
3227
3689
|
// Execute nettools check asynchronously
|
|
3228
3690
|
const originalDomain = fullSubdomain; // Use full subdomain for nettools
|
|
3229
3691
|
setImmediate(() => netToolsHandler(reqDomain, originalDomain));
|
|
@@ -3280,7 +3742,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3280
3742
|
}
|
|
3281
3743
|
|
|
3282
3744
|
if (cachedContent && forceDebug) {
|
|
3283
|
-
console.log(formatLogMessage('debug',
|
|
3745
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached response content for ${reqUrl.substring(0, 50)}...`));
|
|
3284
3746
|
// Process cached content instead of fetching
|
|
3285
3747
|
} else {
|
|
3286
3748
|
try {
|
|
@@ -3310,7 +3772,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3310
3772
|
forceDebug,
|
|
3311
3773
|
userAgent: curlUserAgent,
|
|
3312
3774
|
resourceType,
|
|
3313
|
-
|
|
3775
|
+
// Pass both flags separately — createGrepHandler now
|
|
3776
|
+
// applies AND logic when hasSearchStringAnd is set.
|
|
3777
|
+
// Previously OR'd into hasSearchString and the AND
|
|
3778
|
+
// patterns were silently dropped.
|
|
3779
|
+
hasSearchString,
|
|
3780
|
+
hasSearchStringAnd,
|
|
3314
3781
|
grepOptions: {
|
|
3315
3782
|
ignoreCase: true,
|
|
3316
3783
|
wholeWord: false,
|
|
@@ -3360,7 +3827,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3360
3827
|
} else if (useGrep && (hasSearchString || hasSearchStringAnd)) {
|
|
3361
3828
|
// Use grep with response handler (no curl)
|
|
3362
3829
|
if (forceDebug) {
|
|
3363
|
-
console.log(formatLogMessage('debug',
|
|
3830
|
+
console.log(formatLogMessage('debug', `${GREP_RESPONSE_TAG} Queuing ${reqUrl} for grep analysis via response handler`));
|
|
3364
3831
|
}
|
|
3365
3832
|
|
|
3366
3833
|
// Queue for grep processing via response handler
|
|
@@ -3448,7 +3915,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3448
3915
|
}
|
|
3449
3916
|
}, cssBlockedSelectors);
|
|
3450
3917
|
} catch (cssRuntimeErr) {
|
|
3451
|
-
console.warn(formatLogMessage('warn',
|
|
3918
|
+
console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
|
|
3452
3919
|
}
|
|
3453
3920
|
}
|
|
3454
3921
|
}
|
|
@@ -3760,8 +4227,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3760
4227
|
const proxyErr = proxyErrors.find(e => err.message.includes(e));
|
|
3761
4228
|
if (proxyErr) {
|
|
3762
4229
|
const info = getProxyInfo(siteConfig);
|
|
3763
|
-
console.error(formatLogMessage('error',
|
|
3764
|
-
console.error(formatLogMessage('error',
|
|
4230
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
|
|
4231
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
|
|
3765
4232
|
}
|
|
3766
4233
|
}
|
|
3767
4234
|
console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
|
|
@@ -3813,7 +4280,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3813
4280
|
try {
|
|
3814
4281
|
if (ghostConfig) {
|
|
3815
4282
|
// Ghost-cursor mode: Bezier-based mouse movements
|
|
3816
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
4283
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
|
|
3817
4284
|
const cursor = createGhostCursor(page, { forceDebug });
|
|
3818
4285
|
if (cursor) {
|
|
3819
4286
|
await Promise.race([
|
|
@@ -3851,8 +4318,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3851
4318
|
await performPageInteraction(page, currentUrl, {
|
|
3852
4319
|
...interactionConfig,
|
|
3853
4320
|
mouseMovements: 0,
|
|
3854
|
-
includeElementClicks: false
|
|
3855
|
-
includeTyping: false
|
|
4321
|
+
includeElementClicks: false
|
|
3856
4322
|
}, forceDebug);
|
|
3857
4323
|
}
|
|
3858
4324
|
})(),
|
|
@@ -3873,7 +4339,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3873
4339
|
]);
|
|
3874
4340
|
}
|
|
3875
4341
|
} catch (interactTimeoutErr) {
|
|
3876
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
4342
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
|
|
3877
4343
|
}
|
|
3878
4344
|
})();
|
|
3879
4345
|
|
|
@@ -4008,7 +4474,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4008
4474
|
const clearResult = await clearSiteData(page, currentUrl, forceDebug, true); // Quick mode for reloads
|
|
4009
4475
|
if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i} for ${currentUrl}`));
|
|
4010
4476
|
} catch (reloadClearErr) {
|
|
4011
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
4477
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
|
|
4012
4478
|
}
|
|
4013
4479
|
}
|
|
4014
4480
|
|
|
@@ -4202,8 +4668,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4202
4668
|
const proxyErr = proxyErrors.find(e => err.message.includes(e));
|
|
4203
4669
|
if (proxyErr) {
|
|
4204
4670
|
const info = getProxyInfo(siteConfig);
|
|
4205
|
-
console.error(formatLogMessage('error',
|
|
4206
|
-
console.error(formatLogMessage('error',
|
|
4671
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
|
|
4672
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
|
|
4207
4673
|
}
|
|
4208
4674
|
}
|
|
4209
4675
|
|
|
@@ -4270,17 +4736,33 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4270
4736
|
};
|
|
4271
4737
|
} finally {
|
|
4272
4738
|
// Guaranteed resource cleanup - this runs regardless of success or failure
|
|
4273
|
-
|
|
4739
|
+
|
|
4740
|
+
// Mark this URL as finished before anything else, so a 'targetcreated'
// handler that is still in flight sees the flag when it resumes and bails
// out (closing the popup it fetched, if any).
urlFinished = true;

// Popup capture teardown (only populated when siteConfig.capture_popups is
// on). Entries are either the browser.off('targetcreated', ...) call or a
// per-popup "clear timer + close page" closure. Each one runs in its own
// try/catch so a failing entry cannot stop the rest; the list is emptied
// afterwards.
for (const runCleanup of popupCleanups) {
  try { runCleanup(); } catch (_) {}
}
popupCleanups.length = 0;
|
|
4755
|
+
|
|
4274
4756
|
// Disconnect VPN for this site
|
|
4275
4757
|
if (siteConfig.vpn) {
|
|
4276
4758
|
const vpnDown = wgDisconnect(siteConfig, forceDebug);
|
|
4277
4759
|
if (vpnDown.tornDown && forceDebug) {
|
|
4278
|
-
console.log(formatLogMessage('debug',
|
|
4760
|
+
console.log(formatLogMessage('debug', `${VPN_TAG} WireGuard interface torn down for ${currentUrl}`));
|
|
4279
4761
|
}
|
|
4280
4762
|
} else if (siteConfig.openvpn) {
|
|
4281
4763
|
const ovpnDown = ovpnDisconnect(siteConfig, forceDebug);
|
|
4282
4764
|
if (ovpnDown.tornDown && forceDebug) {
|
|
4283
|
-
console.log(formatLogMessage('debug',
|
|
4765
|
+
console.log(formatLogMessage('debug', `${VPN_TAG} OpenVPN connection torn down for ${currentUrl}`));
|
|
4284
4766
|
}
|
|
4285
4767
|
}
|
|
4286
4768
|
|
|
@@ -4395,7 +4877,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4395
4877
|
let lastProcessedCount = 0;
|
|
4396
4878
|
let hangCheckCount = 0;
|
|
4397
4879
|
let forceRestartFlag = false; // Flag to trigger restart on next iteration
|
|
4398
|
-
|
|
4880
|
+
|
|
4881
|
+
// Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
|
|
4882
|
+
// only colors the [severity] tag; the '[HANG CHECK]' substring was
|
|
4883
|
+
// sitting plain inside the message string. Colored once at function
|
|
4884
|
+
// entry so the interval callback doesn't re-colorize per tick.
|
|
4885
|
+
const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
|
|
4886
|
+
|
|
4399
4887
|
const hangDetectionInterval = setInterval(() => {
|
|
4400
4888
|
// Progress check, counter, and forceRestartFlag MUST run regardless of
|
|
4401
4889
|
// debug mode — previously the entire body was gated on forceDebug, which
|
|
@@ -4406,10 +4894,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4406
4894
|
if (processedUrlCount === lastProcessedCount) {
|
|
4407
4895
|
hangCheckCount++;
|
|
4408
4896
|
if (forceDebug) {
|
|
4409
|
-
console.log(formatLogMessage('warn',
|
|
4897
|
+
console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
|
|
4410
4898
|
}
|
|
4411
4899
|
if (hangCheckCount >= 5) {
|
|
4412
|
-
console.log(formatLogMessage('error',
|
|
4900
|
+
console.log(formatLogMessage('error', `${HANG_CHECK_TAG} Hung for 2.5 minutes. Triggering emergency browser restart.`));
|
|
4413
4901
|
forceRestartFlag = true; // Set flag instead of exiting
|
|
4414
4902
|
hangCheckCount = 0; // Reset counter for next cycle
|
|
4415
4903
|
}
|
|
@@ -4422,8 +4910,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4422
4910
|
if (forceDebug) {
|
|
4423
4911
|
const currentBatch = Math.floor(currentBatchInfo.batchStart / RESOURCE_CLEANUP_INTERVAL) + 1;
|
|
4424
4912
|
const totalBatches = Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL);
|
|
4425
|
-
console.log(formatLogMessage('debug',
|
|
4426
|
-
console.log(formatLogMessage('debug',
|
|
4913
|
+
console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} Processed: ${processedUrlCount}/${totalUrls} URLs, Batch: ${currentBatch}/${totalBatches}, Current batch size: ${currentBatchInfo.batchSize}`));
|
|
4914
|
+
console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} URLs since cleanup: ${urlsSinceLastCleanup}, Recent failures: ${results.slice(-3).filter(r => !r.success).length}/3`));
|
|
4427
4915
|
}
|
|
4428
4916
|
}, 30000);
|
|
4429
4917
|
// Don't keep the event loop alive solely for the hang-check interval — the
|
|
@@ -4434,29 +4922,46 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4434
4922
|
// Process URLs in batches with exception handling
|
|
4435
4923
|
let siteGroupIndex = 0;
|
|
4436
4924
|
let currentProxyKey = ''; // Track active proxy config — '' means direct connection
|
|
4925
|
+
// Map of site-config object -> index in sites[], built once. Per-batch
|
|
4926
|
+
// grouping below uses this for O(1) lookup instead of sites.indexOf which
|
|
4927
|
+
// walked the array per task (batch=80 * sites=20 was ~1600 cmps per batch).
|
|
4928
|
+
const configToIndex = new Map();
|
|
4929
|
+
for (let i = 0; i < sites.length; i++) configToIndex.set(sites[i], i);
|
|
4437
4930
|
try {
|
|
4438
4931
|
for (let batchStart = 0; batchStart < totalUrls; batchStart += RESOURCE_CLEANUP_INTERVAL) {
|
|
4439
4932
|
const batchEnd = Math.min(batchStart + RESOURCE_CLEANUP_INTERVAL, totalUrls);
|
|
4440
4933
|
const currentBatch = allTasks.slice(batchStart, batchEnd);
|
|
4441
4934
|
|
|
4442
|
-
|
|
4443
|
-
// Group tasks by their source site configuration for window cleanup
|
|
4935
|
+
|
|
4936
|
+
// Group tasks by their source site configuration for window cleanup.
|
|
4937
|
+
// Single get-or-set replaces has + get + set (one Map lookup not two).
|
|
4938
|
+
// The `?? -1` preserves the old `sites.indexOf` semantics for a task
|
|
4939
|
+
// whose config isn't in sites[] — that case shouldn't happen, but if
|
|
4940
|
+
// it ever does the routing stays identical to the prior code's
|
|
4941
|
+
// 'site_-1' bucket rather than silently shifting to 'site_undefined'.
|
|
4444
4942
|
const tasksBySite = new Map();
|
|
4445
|
-
currentBatch.
|
|
4446
|
-
const
|
|
4447
|
-
|
|
4448
|
-
|
|
4449
|
-
|
|
4450
|
-
|
|
4451
|
-
}
|
|
4943
|
+
for (let i = 0; i < currentBatch.length; i++) {
|
|
4944
|
+
const task = currentBatch[i];
|
|
4945
|
+
const siteKey = `site_${configToIndex.get(task.config) ?? -1}`;
|
|
4946
|
+
let arr = tasksBySite.get(siteKey);
|
|
4947
|
+
if (!arr) tasksBySite.set(siteKey, arr = []);
|
|
4948
|
+
arr.push(task);
|
|
4949
|
+
}
|
|
4452
4950
|
|
|
4453
4951
|
// IMPROVED: Only check health if we have indicators of problems
|
|
4454
4952
|
let healthCheck = { shouldRestart: false, reason: null };
|
|
4455
4953
|
const recentResults = results.slice(-8); // Check more results for better pattern detection
|
|
4456
|
-
|
|
4457
|
-
|
|
4954
|
+
// Single-pass count for both failure rate and critical-error tally —
|
|
4955
|
+
// was two .filter(...).length calls allocating two intermediate arrays.
|
|
4956
|
+
let recentFailures = 0, recentCritical = 0;
|
|
4957
|
+
for (let i = 0; i < recentResults.length; i++) {
|
|
4958
|
+
const r = recentResults[i];
|
|
4959
|
+
if (!r.success) recentFailures++;
|
|
4960
|
+
if (r.needsImmediateRestart) recentCritical++;
|
|
4961
|
+
}
|
|
4962
|
+
const recentFailureRate = recentResults.length > 0 ? recentFailures / recentResults.length : 0;
|
|
4458
4963
|
const hasHighFailureRate = recentFailureRate > 0.75; // 75% failure threshold (more conservative)
|
|
4459
|
-
const hasCriticalErrors =
|
|
4964
|
+
const hasCriticalErrors = recentCritical > 2;
|
|
4460
4965
|
|
|
4461
4966
|
// Only run health checks when we have STRONG indicators of problems
|
|
4462
4967
|
if (urlsSinceLastCleanup > 15 && (
|
|
@@ -4465,15 +4970,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4465
4970
|
urlsSinceLastCleanup > RESOURCE_CLEANUP_INTERVAL * 0.9 // Very close to cleanup limit
|
|
4466
4971
|
)) {
|
|
4467
4972
|
try {
|
|
4973
|
+
// Race the health check against a 30s timeout. Attach .catch on the
|
|
4974
|
+
// health promise itself so that if the timeout wins, the still-running
|
|
4975
|
+
// monitorBrowserHealth's eventual rejection doesn't surface as an
|
|
4976
|
+
// unhandledRejection warning.
|
|
4977
|
+
const healthPromise = monitorBrowserHealth(browser, {}, {
|
|
4978
|
+
siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
|
|
4979
|
+
totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
|
|
4980
|
+
urlsSinceCleanup: urlsSinceLastCleanup,
|
|
4981
|
+
cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
|
|
4982
|
+
forceDebug,
|
|
4983
|
+
silentMode
|
|
4984
|
+
});
|
|
4985
|
+
healthPromise.catch(() => {});
|
|
4468
4986
|
healthCheck = await Promise.race([
|
|
4469
|
-
|
|
4470
|
-
siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
|
|
4471
|
-
totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
|
|
4472
|
-
urlsSinceCleanup: urlsSinceLastCleanup,
|
|
4473
|
-
cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
|
|
4474
|
-
forceDebug,
|
|
4475
|
-
silentMode
|
|
4476
|
-
}),
|
|
4987
|
+
healthPromise,
|
|
4477
4988
|
new Promise((_, reject) => setTimeout(() => reject(new Error('Health check timeout')), 30000))
|
|
4478
4989
|
]);
|
|
4479
4990
|
} catch (healthError) {
|
|
@@ -4502,8 +5013,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4502
5013
|
// timeout) bypasses the urlsSinceLastCleanup > 8 gate — a confirmed hang
|
|
4503
5014
|
// needs immediate restart even if we just cleaned up. Proactive triggers
|
|
4504
5015
|
// keep the gate to prevent thrashing.
|
|
5016
|
+
//
|
|
5017
|
+
// hasHighFailureRate is computed (and still used for the health-check
|
|
5018
|
+
// gate above) but intentionally NOT folded into proactiveRestart:
|
|
5019
|
+
// wouldExceedLimit is always true at every batch boundary with the
|
|
5020
|
+
// default RESOURCE_CLEANUP_INTERVAL == batch size, so the high-failure-
|
|
5021
|
+
// rate branch was dead code reached only at the same boundary that
|
|
5022
|
+
// wouldExceedLimit already triggers. If failure-rate ever needs to
|
|
5023
|
+
// interrupt mid-cleanup-interval, that requires interrupting the
|
|
5024
|
+
// running Promise.all — a real behavior change, not an OR addition.
|
|
4505
5025
|
const hangRecoveryRestart = forceRestartFlag;
|
|
4506
|
-
const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth
|
|
5026
|
+
const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth) && urlsSinceLastCleanup > 8;
|
|
4507
5027
|
if ((hangRecoveryRestart || proactiveRestart) && isNotLastBatch) {
|
|
4508
5028
|
let restartReason = 'Unknown';
|
|
4509
5029
|
if (forceRestartFlag) {
|
|
@@ -4511,8 +5031,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4511
5031
|
forceRestartFlag = false; // Reset the flag
|
|
4512
5032
|
} else if (shouldRestartFromHealth) {
|
|
4513
5033
|
restartReason = healthCheck.reason;
|
|
4514
|
-
} else if (hasHighFailureRate) {
|
|
4515
|
-
restartReason = `High failure rate: ${Math.round(recentFailureRate * 100)}% in recent batch`;
|
|
4516
5034
|
} else if (wouldExceedLimit) {
|
|
4517
5035
|
restartReason = `Processed ${urlsSinceLastCleanup} URLs (scheduled maintenance)`;
|
|
4518
5036
|
}
|
|
@@ -4527,7 +5045,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4527
5045
|
if (requestCacheStats.enabled && requestCacheStats.size > 0) {
|
|
4528
5046
|
const clearedCount = smartCache.clearRequestCache();
|
|
4529
5047
|
if (forceDebug) {
|
|
4530
|
-
console.log(formatLogMessage('debug',
|
|
5048
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during browser restart`));
|
|
4531
5049
|
}
|
|
4532
5050
|
}
|
|
4533
5051
|
}
|
|
@@ -4542,24 +5060,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4542
5060
|
});
|
|
4543
5061
|
|
|
4544
5062
|
// Clean up the specific user data directory
|
|
4545
|
-
if (userDataDir
|
|
4546
|
-
fs.rmSync(userDataDir, { recursive: true, force: true });
|
|
4547
|
-
if (forceDebug) console.log(formatLogMessage('debug', `Cleaned user data dir: ${userDataDir}`));
|
|
4548
|
-
}
|
|
5063
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4549
5064
|
|
|
4550
5065
|
// Additional cleanup for any remaining Chrome processes
|
|
4551
5066
|
if (removeTempFiles) {
|
|
4552
|
-
await cleanupChromeTempFiles({
|
|
4553
|
-
includeSnapTemp: true,
|
|
5067
|
+
await cleanupChromeTempFiles({
|
|
5068
|
+
includeSnapTemp: true,
|
|
4554
5069
|
forceDebug,
|
|
4555
|
-
comprehensive: true
|
|
5070
|
+
comprehensive: true
|
|
4556
5071
|
});
|
|
4557
5072
|
}
|
|
4558
5073
|
|
|
4559
5074
|
} catch (browserCloseErr) {
|
|
4560
5075
|
if (forceDebug) console.log(formatLogMessage('debug', `Browser cleanup warning: ${browserCloseErr.message}`));
|
|
4561
5076
|
}
|
|
4562
|
-
|
|
5077
|
+
|
|
4563
5078
|
// Create new browser for next batch (preserve current proxy config)
|
|
4564
5079
|
const restartProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
|
|
4565
5080
|
browser = await createBrowser(restartProxyArgs);
|
|
@@ -4567,7 +5082,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4567
5082
|
|
|
4568
5083
|
// Reset cleanup counter and add delay
|
|
4569
5084
|
urlsSinceLastCleanup = 0;
|
|
4570
|
-
purgeStaleTrackers();
|
|
4571
5085
|
await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
|
|
4572
5086
|
}
|
|
4573
5087
|
|
|
@@ -4587,9 +5101,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4587
5101
|
forceDebug, timeout: 10000, exitOnFailure: false,
|
|
4588
5102
|
cleanTempFiles: true, comprehensiveCleanup: removeTempFiles
|
|
4589
5103
|
});
|
|
4590
|
-
if (userDataDir
|
|
4591
|
-
fs.rmSync(userDataDir, { recursive: true, force: true });
|
|
4592
|
-
}
|
|
5104
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4593
5105
|
} catch (proxyRestartErr) {
|
|
4594
5106
|
if (forceDebug) console.log(formatLogMessage('debug', `Proxy switch browser cleanup: ${proxyRestartErr.message}`));
|
|
4595
5107
|
}
|
|
@@ -4601,8 +5113,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4601
5113
|
const health = await testProxy(currentBatch[0].config, 5000);
|
|
4602
5114
|
if (!health.reachable) {
|
|
4603
5115
|
const info = getProxyInfo(currentBatch[0].config);
|
|
4604
|
-
console.error(formatLogMessage('error',
|
|
4605
|
-
console.error(formatLogMessage('error',
|
|
5116
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Unreachable: ${info} — ${health.error}`));
|
|
5117
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Skipping ${currentBatch.length} URL(s) in this batch`));
|
|
4606
5118
|
const skipResults = currentBatch.map(task => ({
|
|
4607
5119
|
success: false, url: task.url, rules: [],
|
|
4608
5120
|
error: `Proxy unreachable: ${health.error}`
|
|
@@ -4620,7 +5132,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4620
5132
|
browser = await createBrowser(proxyArgs);
|
|
4621
5133
|
currentProxyKey = batchProxyKey;
|
|
4622
5134
|
urlsSinceLastCleanup = 0;
|
|
4623
|
-
purgeStaleTrackers();
|
|
4624
5135
|
await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
|
|
4625
5136
|
}
|
|
4626
5137
|
|
|
@@ -4630,7 +5141,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4630
5141
|
|
|
4631
5142
|
// Log start of concurrent processing for hang detection
|
|
4632
5143
|
if (forceDebug) {
|
|
4633
|
-
console.log(formatLogMessage('debug',
|
|
5144
|
+
console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Starting ${batchSize} concurrent tasks with limit ${MAX_CONCURRENT_SITES}`));
|
|
4634
5145
|
}
|
|
4635
5146
|
|
|
4636
5147
|
// Create tasks with timeout protection — skip domains that repeatedly timed out.
|
|
@@ -4642,7 +5153,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4642
5153
|
try {
|
|
4643
5154
|
// Short-circuit queued URLs once any URL in this batch has triggered a
|
|
4644
5155
|
// restart. Without this, the 80-URL batch in the user's hang trace
|
|
4645
|
-
// would have to fail one-by-one at
|
|
5156
|
+
// would have to fail one-by-one at 75s each (~25 min total) before
|
|
4646
5157
|
// the boundary restart could fire. Now: first hang fires the flag,
|
|
4647
5158
|
// remaining queued URLs return immediately, batch completes, restart.
|
|
4648
5159
|
if (forceRestartFlag) {
|
|
@@ -4674,10 +5185,26 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4674
5185
|
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check (cached): ${taskDomain} — ${cached.error}`));
|
|
4675
5186
|
return { url: task.url, rules: [], success: false, error: `DNS: ${cached.error}`, skipped: true };
|
|
4676
5187
|
}
|
|
5188
|
+
// Positive-resolution shortcut: dig or whois has already proven this
|
|
5189
|
+
// hostname live within their 20h cache TTL (populated either by an
|
|
5190
|
+
// earlier URL this run or by --dns-cache disk-load from a prior run).
|
|
5191
|
+
// Order matters -- negative cache (5min TTL, fresher data) wins
|
|
5192
|
+
// first, then this 20h-TTL positive index, then the actual resolve.
|
|
5193
|
+
if (domainKnownToResolve(taskDomain)) {
|
|
5194
|
+
dnsPositiveSkips++;
|
|
5195
|
+
dnsPositiveSkippedHosts.add(taskDomain);
|
|
5196
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
|
|
5197
|
+
// Fall through to navigation -- pre-check "passed" by proxy.
|
|
5198
|
+
} else {
|
|
4677
5199
|
const dnsResolve = async () => {
|
|
4678
5200
|
// resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
|
|
4679
|
-
// resolve6 so IPv6-only hosts aren't wrongly skipped.
|
|
4680
|
-
//
|
|
5201
|
+
// resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
|
|
5202
|
+
// error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
|
|
5203
|
+
// unchanged so the outer transient-retry path sees the real
|
|
5204
|
+
// resolver code and the negative cache records the right reason.
|
|
5205
|
+
// Previously a bare .catch swallowed everything and tried
|
|
5206
|
+
// resolve6, which masked transient v4-side errors behind
|
|
5207
|
+
// whatever resolve6 ended up reporting.
|
|
4681
5208
|
// 2s timeout kept as a real safety net — with c-ares off the
|
|
4682
5209
|
// threadpool it should now rarely fire.
|
|
4683
5210
|
let timer;
|
|
@@ -4686,7 +5213,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4686
5213
|
timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
|
|
4687
5214
|
});
|
|
4688
5215
|
const resolveChain = dnsPromises.resolve4(taskDomain)
|
|
4689
|
-
.catch(
|
|
5216
|
+
.catch(err => {
|
|
5217
|
+
if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
|
|
5218
|
+
return dnsPromises.resolve6(taskDomain);
|
|
5219
|
+
}
|
|
5220
|
+
throw err;
|
|
5221
|
+
});
|
|
4690
5222
|
await Promise.race([resolveChain, timeoutP]);
|
|
4691
5223
|
} finally {
|
|
4692
5224
|
if (timer) clearTimeout(timer);
|
|
@@ -4694,13 +5226,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4694
5226
|
};
|
|
4695
5227
|
// c-ares transient codes — retry once so a momentary resolver
|
|
4696
5228
|
// hiccup doesn't poison the negative cache for 5 minutes.
|
|
4697
|
-
|
|
5229
|
+
// DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
|
|
4698
5230
|
try {
|
|
4699
5231
|
try {
|
|
4700
5232
|
await dnsResolve();
|
|
4701
5233
|
} catch (firstErr) {
|
|
4702
5234
|
const code = firstErr && firstErr.code;
|
|
4703
|
-
if (
|
|
5235
|
+
if (DNS_TRANSIENT_ERRORS.has(code) || (firstErr && firstErr.message === 'DNS timeout')) {
|
|
4704
5236
|
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
|
|
4705
5237
|
await dnsResolve();
|
|
4706
5238
|
} else {
|
|
@@ -4714,26 +5246,31 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4714
5246
|
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
|
|
4715
5247
|
return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
|
|
4716
5248
|
}
|
|
5249
|
+
} // close `else` from domainKnownToResolve shortcut above
|
|
4717
5250
|
}
|
|
4718
5251
|
} catch {}
|
|
4719
5252
|
|
|
4720
5253
|
// Per-URL timeout so a single hung processUrl can't block the batch
|
|
4721
|
-
// forever.
|
|
4722
|
-
// adaptive
|
|
5254
|
+
// forever. 75s sits comfortably above the realistic legit-page ceiling
|
|
5255
|
+
// (nav 35s + Cloudflare adaptive ~25s + interaction ~10s + network-idle
|
|
5256
|
+
// wait ~10s ≈ ~70s), well short of the old 120s safety net. Cuts
|
|
5257
|
+
// hang-recovery time roughly in half when an entire batch's URLs all
|
|
5258
|
+
// hang and we're waiting on this timeout to advance processedUrlCount.
|
|
5259
|
+
const PER_URL_TIMEOUT_MS = 75000;
|
|
4723
5260
|
const processUrlPromise = processUrl(task.url, task.config, browser);
|
|
4724
5261
|
let perUrlTimer;
|
|
4725
5262
|
try {
|
|
4726
5263
|
return await Promise.race([
|
|
4727
5264
|
processUrlPromise,
|
|
4728
5265
|
new Promise((_, reject) => {
|
|
4729
|
-
perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (
|
|
5266
|
+
perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (75s)')), PER_URL_TIMEOUT_MS);
|
|
4730
5267
|
})
|
|
4731
5268
|
]);
|
|
4732
5269
|
} catch (err) {
|
|
4733
|
-
if (err && err.message === 'Per-URL timeout (
|
|
5270
|
+
if (err && err.message === 'Per-URL timeout (75s)') {
|
|
4734
5271
|
processUrlPromise.catch(() => {});
|
|
4735
5272
|
forceRestartFlag = true;
|
|
4736
|
-
return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (
|
|
5273
|
+
return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (75s)', needsImmediateRestart: true };
|
|
4737
5274
|
}
|
|
4738
5275
|
throw err;
|
|
4739
5276
|
} finally {
|
|
@@ -4749,21 +5286,29 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4749
5286
|
|
|
4750
5287
|
let batchResults;
|
|
4751
5288
|
try {
|
|
5289
|
+
// Same orphan-promise pattern as the health-check race above: if the
|
|
5290
|
+
// 10-min batch timeout wins, the still-running Promise.all keeps going
|
|
5291
|
+
// until every batchTask settles. Each individual task is already wrapped
|
|
5292
|
+
// in p-limit's error handling so unhandled rejections should not surface,
|
|
5293
|
+
// but the .catch is free belt-and-braces against future refactors that
|
|
5294
|
+
// change task internals.
|
|
5295
|
+
const batchPromise = Promise.all(batchTasks);
|
|
5296
|
+
batchPromise.catch(() => {});
|
|
4752
5297
|
batchResults = await Promise.race([
|
|
4753
|
-
|
|
4754
|
-
new Promise((_, reject) =>
|
|
5298
|
+
batchPromise,
|
|
5299
|
+
new Promise((_, reject) =>
|
|
4755
5300
|
setTimeout(() => reject(new Error('Batch timeout')), 600000) // 10 min timeout
|
|
4756
5301
|
)
|
|
4757
5302
|
]);
|
|
4758
5303
|
} catch (timeoutError) {
|
|
4759
5304
|
if (timeoutError.message.includes('timeout')) {
|
|
4760
|
-
console.log(formatLogMessage('error',
|
|
5305
|
+
console.log(formatLogMessage('error', `${TIMEOUT_TAG} Batch hung. Restarting browser.`));
|
|
4761
5306
|
try {
|
|
4762
5307
|
await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false });
|
|
5308
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4763
5309
|
const timeoutProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
|
|
4764
5310
|
browser = await createBrowser(timeoutProxyArgs);
|
|
4765
5311
|
urlsSinceLastCleanup = 0;
|
|
4766
|
-
purgeStaleTrackers();
|
|
4767
5312
|
} catch (restartErr) {
|
|
4768
5313
|
throw restartErr;
|
|
4769
5314
|
}
|
|
@@ -4800,7 +5345,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4800
5345
|
|
|
4801
5346
|
// Log completion of concurrent processing
|
|
4802
5347
|
if (forceDebug) {
|
|
4803
|
-
console.log(formatLogMessage('debug',
|
|
5348
|
+
console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Completed ${batchSize} concurrent tasks, ${batchResults.filter(r => r.success).length} successful`));
|
|
4804
5349
|
}
|
|
4805
5350
|
|
|
4806
5351
|
// Enhanced error reporting for Puppeteer 23.x
|
|
@@ -4862,7 +5407,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4862
5407
|
if (requestCacheStats.enabled && requestCacheStats.size > 0) {
|
|
4863
5408
|
const clearedCount = smartCache.clearRequestCache();
|
|
4864
5409
|
if (forceDebug) {
|
|
4865
|
-
console.log(formatLogMessage('debug',
|
|
5410
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during emergency restart`));
|
|
4866
5411
|
}
|
|
4867
5412
|
}
|
|
4868
5413
|
}
|
|
@@ -4883,17 +5428,23 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4883
5428
|
}
|
|
4884
5429
|
|
|
4885
5430
|
await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true, comprehensiveCleanup: removeTempFiles });
|
|
5431
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4886
5432
|
// Additional cleanup after emergency restart
|
|
4887
5433
|
if (removeTempFiles) {
|
|
4888
|
-
await cleanupChromeTempFiles({
|
|
4889
|
-
includeSnapTemp: true,
|
|
5434
|
+
await cleanupChromeTempFiles({
|
|
5435
|
+
includeSnapTemp: true,
|
|
4890
5436
|
forceDebug,
|
|
4891
|
-
comprehensive: true
|
|
5437
|
+
comprehensive: true
|
|
4892
5438
|
});
|
|
4893
5439
|
}
|
|
4894
5440
|
browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
|
|
4895
5441
|
urlsSinceLastCleanup = 0; // Reset counter
|
|
4896
|
-
|
|
5442
|
+
// Reset the hang-detection flag too: this restart path is triggered
|
|
5443
|
+
// by needsImmediateRestart errors, which the per-URL 75s timeout
|
|
5444
|
+
// sets in lockstep with forceRestartFlag. Without this reset, the
|
|
5445
|
+
// hang-fallback restart below would fire a SECOND back-to-back
|
|
5446
|
+
// browser restart on the same batch boundary.
|
|
5447
|
+
forceRestartFlag = false;
|
|
4897
5448
|
await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY); // Give browser time to stabilize
|
|
4898
5449
|
} catch (emergencyRestartErr) {
|
|
4899
5450
|
if (forceDebug) console.log(formatLogMessage('debug', `Emergency restart failed: ${emergencyRestartErr.message}`));
|
|
@@ -4904,9 +5455,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4904
5455
|
console.log(`\n${messageColors.fileOp('🔄 Emergency hang detection restart:')} Browser appears hung, forcing restart`);
|
|
4905
5456
|
try {
|
|
4906
5457
|
await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true });
|
|
5458
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4907
5459
|
browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
|
|
4908
5460
|
urlsSinceLastCleanup = 0;
|
|
4909
|
-
purgeStaleTrackers();
|
|
4910
5461
|
forceRestartFlag = false; // Reset flag
|
|
4911
5462
|
await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY);
|
|
4912
5463
|
if (forceDebug) console.log(formatLogMessage('debug', `Emergency hang detection restart completed`));
|
|
@@ -4955,11 +5506,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4955
5506
|
if (requestCacheStats.enabled && requestCacheStats.size > 0) {
|
|
4956
5507
|
const clearedCount = smartCache.clearRequestCache();
|
|
4957
5508
|
if (!silentMode && clearedCount > 0) {
|
|
4958
|
-
console.log(`\n
|
|
5509
|
+
console.log(`\n${messageColors.cleanup(`🗑️ Cleared request cache: ${clearedCount} entries after JSON processing`)}`);
|
|
4959
5510
|
}
|
|
4960
5511
|
if (forceDebug) {
|
|
4961
5512
|
console.log(formatLogMessage('debug',
|
|
4962
|
-
|
|
5513
|
+
`${SMART_CACHE_TAG} Request cache cleared after JSON scan completion (hit rate: ${requestCacheStats.hitRate})`
|
|
4963
5514
|
));
|
|
4964
5515
|
}
|
|
4965
5516
|
}
|
|
@@ -5031,8 +5582,42 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5031
5582
|
if (cloudflareScanStats.errorPages > 0) {
|
|
5032
5583
|
console.log(formatLogMessage('debug', `Cloudflare 5xx origin-error pages: ${cloudflareScanStats.errorPages} (no bypass possible — origin unreachable)`));
|
|
5033
5584
|
}
|
|
5034
|
-
if (dnsPrecheckEnabled && dnsPrecheckSkips > 0) {
|
|
5035
|
-
|
|
5585
|
+
if (dnsPrecheckEnabled && (dnsPrecheckSkips > 0 || dnsPositiveSkips > 0)) {
|
|
5586
|
+
// Two skip mechanisms, each with its own counter + unique-host count:
|
|
5587
|
+
// - dnsPrecheckSkips: URLs short-circuited via the NXDOMAIN-cache
|
|
5588
|
+
// (dnsNegativeCache). Unique-host count = dnsNegativeCache.size.
|
|
5589
|
+
// - dnsPositiveSkips: URLs short-circuited via dig/whois cache
|
|
5590
|
+
// proof of resolution (knownResolvedHostnames index in nettools).
|
|
5591
|
+
// Unique-host count = dnsPositiveSkippedHosts.size (this Set is
|
|
5592
|
+
// populated only on actual skip events, not on every Set add in
|
|
5593
|
+
// nettools, so it's a true per-scan visibility metric).
|
|
5594
|
+
const parts = [];
|
|
5595
|
+
if (dnsPrecheckSkips > 0) {
|
|
5596
|
+
parts.push(`${dnsPrecheckSkips} URL(s) via ${dnsNegativeCache.size} unresolvable host(s)`);
|
|
5597
|
+
}
|
|
5598
|
+
if (dnsPositiveSkips > 0) {
|
|
5599
|
+
parts.push(`${dnsPositiveSkips} URL(s) via ${dnsPositiveSkippedHosts.size} resolved host(s)`);
|
|
5600
|
+
}
|
|
5601
|
+
console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
|
|
5602
|
+
}
|
|
5603
|
+
// Blocked-pattern hit stats. Surfaces which patterns are actually
|
|
5604
|
+
// doing work this scan and (by absence) which are stale enough to
|
|
5605
|
+
// prune from config. Top 10 by hit count to keep the log scannable
|
|
5606
|
+
// on configs with dozens of patterns; full counts available via
|
|
5607
|
+
// _blockedPatternHits if needed for tooling. Fires only when at
|
|
5608
|
+
// least one pattern matched -- silent on scans with no blocks.
|
|
5609
|
+
if (_blockedPatternHits.size > 0) {
|
|
5610
|
+
let totalBlocks = 0;
|
|
5611
|
+
for (const n of _blockedPatternHits.values()) totalBlocks += n;
|
|
5612
|
+
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${_blockedPatternHits.size} pattern(s) hit ${totalBlocks} time(s) total`));
|
|
5613
|
+
const sorted = [..._blockedPatternHits.entries()].sort((a, b) => b[1] - a[1]);
|
|
5614
|
+
const top = sorted.slice(0, 10);
|
|
5615
|
+
for (const [pattern, hits] of top) {
|
|
5616
|
+
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${hits.toString().padStart(6)} × ${pattern}`));
|
|
5617
|
+
}
|
|
5618
|
+
if (sorted.length > top.length) {
|
|
5619
|
+
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ... and ${sorted.length - top.length} more pattern(s)`));
|
|
5620
|
+
}
|
|
5036
5621
|
}
|
|
5037
5622
|
// Log smart cache statistics (if cache is enabled)
|
|
5038
5623
|
// Adblock statistics
|
|
@@ -5250,7 +5835,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5250
5835
|
try { cleanupCloudflareCache(); } catch (_) {}
|
|
5251
5836
|
try { wgDisconnectAll(forceDebug); } catch (_) {}
|
|
5252
5837
|
try { ovpnDisconnectAll(forceDebug); } catch (_) {}
|
|
5253
|
-
try { purgeStaleTrackers(); } catch (_) {}
|
|
5254
5838
|
try { await closeAllSocksRelays(forceDebug); } catch (_) {}
|
|
5255
5839
|
|
|
5256
5840
|
// Clean process termination
|