@fanboynz/network-scanner 2.0.65 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +134 -10
- package/CHANGELOG.md +135 -0
- package/CLAUDE.md +18 -7
- package/README.md +12 -4
- package/lib/adblock-rust.js +23 -18
- package/lib/adblock.js +127 -82
- package/lib/browserexit.js +210 -200
- package/lib/browserhealth.js +84 -60
- package/lib/cdp.js +103 -81
- package/lib/clear_sitedata.js +61 -159
- package/lib/cloudflare.js +579 -409
- package/lib/colorize.js +29 -12
- package/lib/compare.js +16 -8
- package/lib/compress.js +2 -1
- package/lib/curl.js +287 -220
- package/lib/domain-cache.js +87 -40
- package/lib/dry-run.js +137 -194
- package/lib/fingerprint.js +20 -18
- package/lib/flowproxy.js +391 -188
- package/lib/ghost-cursor.js +8 -7
- package/lib/grep.js +248 -171
- package/lib/ignore_similar.js +70 -124
- package/lib/interaction.js +132 -235
- package/lib/nettools.js +309 -87
- package/lib/openvpn_vpn.js +12 -11
- package/lib/output.js +92 -59
- package/lib/post-processing.js +216 -162
- package/lib/proxy.js +105 -7
- package/lib/redirect.js +46 -30
- package/lib/referrer.js +158 -165
- package/lib/searchstring.js +290 -381
- package/lib/smart-cache.js +141 -91
- package/lib/socks-relay.js +267 -0
- package/lib/spawn-async.js +137 -0
- package/lib/validate_rules.js +188 -176
- package/lib/wireguard_vpn.js +111 -117
- package/nwss.js +872 -149
- package/package.json +6 -5
package/nwss.js
CHANGED
|
@@ -9,6 +9,7 @@ const fs = require('fs');
|
|
|
9
9
|
const os = require('os');
|
|
10
10
|
const psl = require('psl');
|
|
11
11
|
const path = require('path');
|
|
12
|
+
const dnsPromises = require('node:dns/promises');
|
|
12
13
|
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
13
14
|
const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
|
|
14
15
|
const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
|
|
@@ -27,13 +28,13 @@ const {
|
|
|
27
28
|
cleanup: cleanupCloudflareCache
|
|
28
29
|
} = require('./lib/cloudflare');
|
|
29
30
|
// FP Bypass
|
|
30
|
-
const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy');
|
|
31
|
+
const { handleFlowProxyProtection, getFlowProxyTimeouts, attachFlowProxyHeaderListener } = require('./lib/flowproxy');
|
|
31
32
|
// ignore_similar rules
|
|
32
33
|
const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar');
|
|
33
34
|
// Graceful exit
|
|
34
|
-
const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit');
|
|
35
|
+
const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
|
|
35
36
|
// Whois & Dig
|
|
36
|
-
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats } = require('./lib/nettools');
|
|
37
|
+
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
|
|
37
38
|
// File compare
|
|
38
39
|
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
|
|
39
40
|
// CDP functionality
|
|
@@ -41,7 +42,29 @@ const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeo
|
|
|
41
42
|
// Post-processing cleanup
|
|
42
43
|
const { processResults } = require('./lib/post-processing');
|
|
43
44
|
// Colorize various text when used
|
|
44
|
-
const {
|
|
45
|
+
const { messageColors, formatLogMessage } = require('./lib/colorize');
|
|
46
|
+
const TIMEOUT_TAG = messageColors.processing('[TIMEOUT]');
|
|
47
|
+
const INTERACTION_TAG = messageColors.processing('[interaction]');
|
|
48
|
+
const GHOST_CURSOR_TAG = messageColors.processing('[ghost-cursor]');
|
|
49
|
+
const PROXY_TAG = messageColors.processing('[proxy]');
|
|
50
|
+
const GREP_RESPONSE_TAG = messageColors.processing('[grep-response]');
|
|
51
|
+
const IGNORE_DOMAINS_BY_URL_TAG = messageColors.processing('[ignoreDomainsByUrl]');
|
|
52
|
+
const BLOCK_DOMAINS_BY_URL_TAG = messageColors.processing('[blockDomainsByUrl]');
|
|
53
|
+
const IGNORE_SIMILAR_IGNORED_DOMAINS_TAG = messageColors.processing('[ignore_similar_ignored_domains]');
|
|
54
|
+
const IGNORE_SIMILAR_TAG = messageColors.processing('[ignore_similar]');
|
|
55
|
+
const CLEAR_SITEDATA_TAG = messageColors.processing('[clear_sitedata]');
|
|
56
|
+
const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
|
|
57
|
+
const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
|
|
58
|
+
const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
|
|
59
|
+
const VPN_TAG = messageColors.processing('[vpn]');
|
|
60
|
+
// Precomputed colored '[SmartCache]' subsystem prefix — paired with the
|
|
61
|
+
// same constant in lib/smart-cache.js so debug lines from both files
|
|
62
|
+
// produce consistently colored output. formatLogMessage only colors the
|
|
63
|
+
// [severity] tag; this constant colors the subsystem prefix.
|
|
64
|
+
const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
|
|
65
|
+
// Precomputed colored '[CONCURRENCY]' subsystem prefix for batch-throughput
|
|
66
|
+
// log lines (start/completed). Same cyan as the other monitoring tags.
|
|
67
|
+
const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
45
68
|
// Enhanced mouse interaction and page simulation
|
|
46
69
|
const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
47
70
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
@@ -50,7 +73,7 @@ const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostR
|
|
|
50
73
|
const { createGlobalHelpers, getTotalDomainsSkipped, getDetectedDomainsCount } = require('./lib/domain-cache');
|
|
51
74
|
const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
|
|
52
75
|
const { clearPersistentCache } = require('./lib/smart-cache');
|
|
53
|
-
const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy } = require('./lib/proxy');
|
|
76
|
+
const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
|
|
54
77
|
// Dry run functionality
|
|
55
78
|
const { initializeDryRunCollections, addDryRunMatch, addDryRunNetTools, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
|
|
56
79
|
// Enhanced site data clearing functionality
|
|
@@ -157,7 +180,10 @@ function detectPuppeteerVersion() {
|
|
|
157
180
|
// Enhanced redirect handling
|
|
158
181
|
const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
|
|
159
182
|
// Ensure web browser is working correctly
|
|
160
|
-
|
|
183
|
+
// purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
|
|
184
|
+
// and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
|
|
185
|
+
// automatically — manual purging is no longer needed.
|
|
186
|
+
const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
|
|
161
187
|
|
|
162
188
|
// --- Script Configuration & Constants ---
|
|
163
189
|
const VERSION = '2.0.33'; // Script version
|
|
@@ -266,6 +292,13 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
266
292
|
}
|
|
267
293
|
|
|
268
294
|
const headfulMode = args.includes('--headful');
|
|
295
|
+
// Sites (esp. video/streaming) call element.requestFullscreen() on load or
|
|
296
|
+
// click. In --headful that hijacks the real Chrome window into true
|
|
297
|
+
// fullscreen, forcing a manual ESC. Neutralize the Fullscreen API by
|
|
298
|
+
// default so it can't. Harmless in headless (no screen — the API is
|
|
299
|
+
// already inert there), so default-on keeps headful consistent with the
|
|
300
|
+
// primary headless path. --allow-fullscreen restores native behavior.
|
|
301
|
+
const allowFullscreen = args.includes('--allow-fullscreen');
|
|
269
302
|
const SOURCES_FOLDER = 'sources';
|
|
270
303
|
|
|
271
304
|
let outputFile = null;
|
|
@@ -326,6 +359,36 @@ const cacheRequests = args.includes('--cache-requests');
|
|
|
326
359
|
const dnsCacheMode = args.includes('--dns-cache');
|
|
327
360
|
if (dnsCacheMode) enableDiskCache();
|
|
328
361
|
|
|
362
|
+
// DNS pre-check before page.goto() — default-on, --no-dns-precheck disables.
|
|
363
|
+
// Filters NXDOMAIN / unresolvable hostnames in <100ms before paying the
|
|
364
|
+
// ~5-15s Puppeteer + Cloudflare detection round-trip on each.
|
|
365
|
+
const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
|
|
366
|
+
const dnsPrecheckTimeoutMs = 2000;
|
|
367
|
+
|
|
368
|
+
// Per-scan cache of negative DNS lookups. OS resolvers don't always cache
|
|
369
|
+
// NXDOMAIN responses, and a scan can hit the same dead hostname many times
|
|
370
|
+
// (different URL paths on the same site). Positive results are left to the
|
|
371
|
+
// OS cache; failure-cache avoids repeated lookup latency for known-dead hosts.
|
|
372
|
+
// FIFO eviction at DNS_NEGATIVE_CACHE_MAX so pathological scans (thousands
|
|
373
|
+
// of unique dead hosts) can't grow the cache unboundedly. Same pattern as
|
|
374
|
+
// the rest of the codebase's in-memory caches.
|
|
375
|
+
const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
|
|
376
|
+
const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
377
|
+
const DNS_NEGATIVE_CACHE_MAX = 1000;
|
|
378
|
+
let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
|
|
379
|
+
let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
|
|
380
|
+
const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
|
|
381
|
+
// c-ares transient codes — read-only, hoisted out of the per-task DNS
|
|
382
|
+
// pre-check so we don't allocate a fresh Set per URL.
|
|
383
|
+
const DNS_TRANSIENT_ERRORS = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);
|
|
384
|
+
|
|
385
|
+
/**
 * Record a failed DNS lookup for `hostname` in the per-scan negative cache.
 *
 * The cache is a Map used as a FIFO queue (Map iteration order is insertion
 * order), capped at DNS_NEGATIVE_CACHE_MAX entries.
 *
 * @param {string} hostname - hostname whose lookup failed (cache key)
 * @param {*} error - failure detail stored alongside the timestamp
 */
function dnsNegativeCacheSet(hostname, error) {
  // Map#set on an existing key keeps the key's ORIGINAL insertion slot, so a
  // refreshed entry would still be first in line for eviction. Removing it
  // first re-queues it as the newest entry, and also means a refresh at
  // capacity no longer evicts an unrelated host (the map is not growing).
  dnsNegativeCache.delete(hostname);

  if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
    // Oldest entry = first key in insertion order.
    dnsNegativeCache.delete(dnsNegativeCache.keys().next().value);
  }

  dnsNegativeCache.set(hostname, { error, timestamp: Date.now() });
}
|
|
391
|
+
|
|
329
392
|
let validateRulesFile = null;
|
|
330
393
|
const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules');
|
|
331
394
|
if (validateRulesIndex !== -1 && args[validateRulesIndex + 1] && !args[validateRulesIndex + 1].startsWith('--')) {
|
|
@@ -643,6 +706,8 @@ General Options:
|
|
|
643
706
|
--custom-json <file> Use a custom config JSON file instead of config.json
|
|
644
707
|
--headful Launch browser with GUI (not headless)
|
|
645
708
|
--keep-open Keep browser open after scan completes (use with --headful)
|
|
709
|
+
--allow-fullscreen Allow sites to use the Fullscreen API. By default it is
|
|
710
|
+
neutralized so sites can't hijack the window in --headful
|
|
646
711
|
--use-puppeteer-core Use puppeteer-core with system Chrome instead of bundled Chromium
|
|
647
712
|
--use-obscura Connect to running Obscura CDP server (ws://127.0.0.1:9222 or OBSCURA_WS env)
|
|
648
713
|
Skips fingerprint injection — Obscura provides built-in stealth
|
|
@@ -658,7 +723,10 @@ General Options:
|
|
|
658
723
|
|
|
659
724
|
Validation Options:
|
|
660
725
|
--cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
|
|
661
|
-
--dns-cache Persist dig/whois results to disk between runs (
|
|
726
|
+
--dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each)
|
|
727
|
+
--no-dns-precheck Disable per-URL DNS resolution check before page navigation.
|
|
728
|
+
By default, URLs whose hostname doesn't resolve are skipped
|
|
729
|
+
immediately (saves ~5-15s of Puppeteer time per dead host).
|
|
662
730
|
--validate-config Validate config.json file and exit
|
|
663
731
|
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
|
|
664
732
|
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
|
|
@@ -669,6 +737,7 @@ Validation Options:
|
|
|
669
737
|
Global config.json options:
|
|
670
738
|
ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
|
|
671
739
|
ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
|
|
740
|
+
blockDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, ALL subsequent requests on that root domain (and subdomains) are aborted via Puppeteer for the rest of the scan
|
|
672
741
|
blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
|
|
673
742
|
whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
|
|
674
743
|
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
|
|
@@ -838,6 +907,7 @@ const {
|
|
|
838
907
|
sites = [],
|
|
839
908
|
ignoreDomains = [],
|
|
840
909
|
ignoreDomainsByUrl = [],
|
|
910
|
+
blockDomainsByUrl = [],
|
|
841
911
|
blocked: globalBlocked = [],
|
|
842
912
|
whois_delay = 3000,
|
|
843
913
|
whois_server_mode = 'random',
|
|
@@ -927,10 +997,11 @@ if (validateConfig) {
|
|
|
927
997
|
}
|
|
928
998
|
}
|
|
929
999
|
|
|
930
|
-
// Pre-compile global blocked regexes ONCE (used in every processUrl call)
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
1000
|
+
// Pre-compile global blocked regexes ONCE (used in every processUrl call).
|
|
1001
|
+
// Was: bare `.map(pattern => new RegExp(pattern))` which hard-threw at
|
|
1002
|
+
// module load on a single bad pattern, killing scan startup. Helper now
|
|
1003
|
+
// warns + skips so the rest of the config can still run.
|
|
1004
|
+
const globalBlockedRegexes = compilePatternList('blocked (global)', globalBlocked);
|
|
934
1005
|
|
|
935
1006
|
// Cache compiled regexes by pattern string — avoids recompiling same patterns across URLs
|
|
936
1007
|
const _compiledRegexCache = new Map();
|
|
@@ -949,6 +1020,44 @@ function getCompiledRegexes(patterns) {
|
|
|
949
1020
|
return arr.map(p => getCompiledRegex(p));
|
|
950
1021
|
}
|
|
951
1022
|
|
|
1023
|
+
/**
 * Compile a list of regex pattern strings, WARNING loudly on any entry that
 * cannot be used instead of:
 *   (a) silently dropping it (old ignoreDomainsByUrl/blockDomainsByUrl
 *       behavior) -- made debugging "why isn't my pattern matching?"
 *       miserable, and
 *   (b) hard-throwing at module load (old `blocked` behavior) -- one bad
 *       pattern would kill the whole scan startup.
 *
 * Entries that are neither a string nor a RegExp are also warned about and
 * skipped: `new RegExp(null)` / `new RegExp(123)` / `new RegExp({})` do not
 * throw, they quietly compile to /null/, /123/ and /[object Object]/, which
 * is never what a config author meant.
 *
 * Returns the array of successfully compiled regexes. Each skipped entry
 * produces a single warn line naming the config key, the offending value and
 * the reason.
 *
 * @param {string} configKey - name of the config key, for warn context
 * @param {string[]} patterns - raw regex source strings; a non-array yields []
 * @param {(p:string)=>RegExp} [compile] - compile fn (defaults to new RegExp)
 * @returns {RegExp[]}
 */
function compilePatternList(configKey, patterns, compile = (p) => new RegExp(p)) {
  if (!Array.isArray(patterns)) return [];
  const out = [];
  for (const p of patterns) {
    // RegExp instances are passed through to `compile` as before; anything
    // else that is not a string is a config mistake, not a pattern.
    if (typeof p !== 'string' && !(p instanceof RegExp)) {
      let kind = typeof p;
      if (p === null) kind = 'null';
      else if (Array.isArray(p)) kind = 'array';
      console.warn(formatLogMessage('warn', `[config] ${configKey} pattern dropped (expected a regex string, got ${kind})`));
      continue;
    }
    try {
      out.push(compile(p));
    } catch (err) {
      // A custom compile fn may throw a non-Error; keep the warn line useful.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(formatLogMessage('warn', `[config] ${configKey} pattern dropped (compile error): ${JSON.stringify(p)} -- ${reason}`));
    }
  }
  return out;
}
|
|
1054
|
+
|
|
1055
|
+
// Per-pattern match counters for the `blocked` regex (site + global,
|
|
1056
|
+
// combined). Keyed by RegExp.source so the same pattern appearing in both
|
|
1057
|
+
// site and global lists rolls up into one row. Reported at scan end so
|
|
1058
|
+
// stale patterns that match zero requests are easy to spot and prune.
|
|
1059
|
+
const _blockedPatternHits = new Map();
|
|
1060
|
+
|
|
952
1061
|
// Pre-split ignoreDomains into exact Set (O(1) lookup) and wildcard array
|
|
953
1062
|
const _ignoreDomainsExact = new Set();
|
|
954
1063
|
const _ignoreDomainsWildcard = [];
|
|
@@ -960,15 +1069,23 @@ for (const pattern of ignoreDomains) {
|
|
|
960
1069
|
}
|
|
961
1070
|
}
|
|
962
1071
|
|
|
963
|
-
// Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
try { return getCompiledRegex(p); } catch { return null; }
|
|
967
|
-
}).filter(r => r)
|
|
968
|
-
: [];
|
|
1072
|
+
// Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains.
|
|
1073
|
+
// Bad patterns warn (via compilePatternList) instead of silently dropping.
|
|
1074
|
+
const _ignoreDomainsByUrlRegexes = compilePatternList('ignoreDomainsByUrl', ignoreDomainsByUrl, getCompiledRegex);
|
|
969
1075
|
// Runtime Set of domains marked ignored by URL pattern matches — shared across all sites in this scan
|
|
970
1076
|
const _dynamicallyIgnoredDomains = new Set();
|
|
971
1077
|
|
|
1078
|
+
// blockDomainsByUrl: symmetric to ignoreDomainsByUrl but for active
|
|
1079
|
+
// blocking via Puppeteer's request.abort(). When a request URL matches
|
|
1080
|
+
// one of these regex patterns, the request's root domain is added to
|
|
1081
|
+
// _dynamicallyBlockedDomains; subsequent requests on that domain (and
|
|
1082
|
+
// its subdomains, via parent-walk in matchesDynamicBlock) get aborted
|
|
1083
|
+
// before reaching the network. The triggering request itself is also
|
|
1084
|
+
// aborted -- same "gate fires immediately after trigger" semantic the
|
|
1085
|
+
// ignoreDomainsByUrl path uses for the dynamic Set short-circuit.
|
|
1086
|
+
const _blockDomainsByUrlRegexes = compilePatternList('blockDomainsByUrl', blockDomainsByUrl, getCompiledRegex);
|
|
1087
|
+
const _dynamicallyBlockedDomains = new Set();
|
|
1088
|
+
|
|
972
1089
|
// Apply global configuration overrides with validation
|
|
973
1090
|
// Priority: Command line args > config.json > defaults
|
|
974
1091
|
const MAX_CONCURRENT_SITES = (() => {
|
|
@@ -1065,7 +1182,7 @@ function safeMarkDomainProcessed(domain, context, metadata) {
|
|
|
1065
1182
|
}
|
|
1066
1183
|
} catch (cacheErr) {
|
|
1067
1184
|
if (forceDebug) {
|
|
1068
|
-
console.log(formatLogMessage('debug',
|
|
1185
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
|
|
1069
1186
|
}
|
|
1070
1187
|
}
|
|
1071
1188
|
}
|
|
@@ -1379,16 +1496,58 @@ function shouldBypassCacheForUrl(url, siteConfig) {
|
|
|
1379
1496
|
// ability to use wildcards in ignoreDomains
|
|
1380
1497
|
// Cache compiled wildcard regexes to avoid recompilation on every request
|
|
1381
1498
|
const _wildcardRegexCache = new Map();
|
|
1499
|
+
|
|
1500
|
+
/**
 * Generic parent-walk helper: returns true if `domain` itself, or any of its
 * parents (dropping one leading label at a time, down to the final label),
 * is present in `set`.
 *
 * Mirrors the static/dynamic parent-walk inside matchesIgnoreDomain but is
 * usable against an arbitrary single Set. matchesIgnoreDomain keeps its own
 * inline dual-Set probe; single-Set consumers share this helper.
 *
 * @param {Set<string>} set - domains to probe
 * @param {string} domain - hostname to test, e.g. "cdn.ads.example.com"
 * @returns {boolean} true when `domain` or one of its parents is in `set`;
 *   false for an empty set or a non-string `domain`
 */
function _domainOrParentInSet(set, domain) {
  if (set.size === 0) return false;
  // Without this guard a missing hostname returned false for an empty set
  // but threw on `.split` for a non-empty one; answer "no match" uniformly.
  if (typeof domain !== 'string') return false;
  if (set.has(domain)) return true;

  // Walk dot by dot: "a.b.c" probes "b.c", then "c". Same candidates as
  // split/slice/join, without building an array and re-joining per level.
  let dot = domain.indexOf('.');
  while (dot !== -1) {
    if (set.has(domain.slice(dot + 1))) return true;
    dot = domain.indexOf('.', dot + 1);
  }
  return false;
}
|
|
1516
|
+
|
|
1517
|
+
/**
 * Block-side counterpart to the ignore gate.
 *
 * Probes _dynamicallyBlockedDomains for `domain` and each of its parents,
 * delegating the walk to _domainOrParentInSet.
 *
 * @param {string} domain - hostname of the request being evaluated
 * @returns {boolean} true if `domain` (or a parent) is in the dynamic block set
 */
function matchesDynamicBlock(domain) {
  const blockedRoots = _dynamicallyBlockedDomains;
  return _domainOrParentInSet(blockedRoots, domain);
}
|
|
1527
|
+
|
|
1382
1528
|
function matchesIgnoreDomain(domain, ignorePatterns) {
|
|
1383
|
-
//
|
|
1384
|
-
|
|
1385
|
-
//
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1529
|
+
// Both dynamic and static ignore lists are walked parent-by-parent so a
|
|
1530
|
+
// subdomain of an ignored root inherits the ignore. Previously the
|
|
1531
|
+
// dynamic check was exact-only, creating an asymmetry: a static-config
|
|
1532
|
+
// `example.com` ignored cdn.example.com transitively, but a runtime
|
|
1533
|
+
// ignoreDomainsByUrl match for the same root (stored as root via
|
|
1534
|
+
// checkedRootDomain at line ~2993) did NOT cascade -- subdomains slipped
|
|
1535
|
+
// through to dig/whois/regex despite the root being ignored. Now
|
|
1536
|
+
// unified: parts split once, shared between both Set probes.
|
|
1537
|
+
const hasDynamic = _dynamicallyIgnoredDomains.size > 0;
|
|
1538
|
+
const hasExact = _ignoreDomainsExact.size > 0;
|
|
1539
|
+
|
|
1540
|
+
if (hasDynamic || hasExact) {
|
|
1541
|
+
// Exact-domain hit on either set wins early.
|
|
1542
|
+
if (hasDynamic && _dynamicallyIgnoredDomains.has(domain)) return true;
|
|
1543
|
+
if (hasExact && _ignoreDomainsExact.has(domain)) return true;
|
|
1544
|
+
|
|
1545
|
+
// Parent-walk: sub.ads.example.com → ads.example.com → example.com
|
|
1389
1546
|
const parts = domain.split('.');
|
|
1390
1547
|
for (let i = 1; i < parts.length; i++) {
|
|
1391
|
-
|
|
1548
|
+
const parent = parts.slice(i).join('.');
|
|
1549
|
+
if (hasDynamic && _dynamicallyIgnoredDomains.has(parent)) return true;
|
|
1550
|
+
if (hasExact && _ignoreDomainsExact.has(parent)) return true;
|
|
1392
1551
|
}
|
|
1393
1552
|
}
|
|
1394
1553
|
|
|
@@ -1830,7 +1989,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1830
1989
|
wgDisconnectAll(forceDebug);
|
|
1831
1990
|
ovpnDisconnectAll(forceDebug);
|
|
1832
1991
|
cleanupCloudflareCache();
|
|
1833
|
-
|
|
1992
|
+
try { await closeAllSocksRelays(forceDebug); } catch (_) {}
|
|
1834
1993
|
}
|
|
1835
1994
|
|
|
1836
1995
|
let siteCounter = 0;
|
|
@@ -1981,28 +2140,46 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1981
2140
|
'Browser disconnected'
|
|
1982
2141
|
]);
|
|
1983
2142
|
|
|
2143
|
+
// Popup-capture cleanup registry — declared outside the try so the
|
|
2144
|
+
// finally block (which is a separate lexical scope from try) can see
|
|
2145
|
+
// it. Populated by the capture_popups setup block if siteConfig
|
|
2146
|
+
// .capture_popups is true; iterated in finally to deregister the
|
|
2147
|
+
// browser 'targetcreated' listener and close any tracked popup pages.
|
|
2148
|
+
const popupCleanups = [];
|
|
2149
|
+
// Race-window guard: 'targetcreated' fires synchronously, but
|
|
2150
|
+
// onTargetCreated does an `await target.page()`. If a popup target
|
|
2151
|
+
// is created right as the per-URL try block winds down, the await
|
|
2152
|
+
// can resolve AFTER finally has already iterated popupCleanups —
|
|
2153
|
+
// leaving the popup unregistered for manual cleanup (it still gets
|
|
2154
|
+
// closed by its own 3s auto-close timer, but in the meantime its
|
|
2155
|
+
// request listener could capture matches into matchedDomains for a
|
|
2156
|
+
// URL that already "finished"). The flag is set in finally and
|
|
2157
|
+
// checked at the start of onTargetCreated to short-circuit late
|
|
2158
|
+
// events cleanly.
|
|
2159
|
+
let urlFinished = false;
|
|
2160
|
+
|
|
1984
2161
|
try {
|
|
1985
2162
|
|
|
1986
2163
|
// --- Connect VPN if configured for this site ---
|
|
1987
2164
|
if (siteConfig.vpn) {
|
|
1988
2165
|
const vpnResult = await wgConnect(siteConfig, forceDebug);
|
|
1989
2166
|
if (!vpnResult.success) {
|
|
1990
|
-
console.warn(formatLogMessage('warn',
|
|
2167
|
+
console.warn(formatLogMessage('warn', `${VPN_TAG} WireGuard failed for ${currentUrl}: ${vpnResult.error}`));
|
|
1991
2168
|
return { url: currentUrl, rules: [], success: false, vpnFailed: true };
|
|
1992
2169
|
}
|
|
1993
2170
|
if (!silentMode) {
|
|
1994
2171
|
const ipInfo = vpnResult.externalIP ? ` (${vpnResult.externalIP})` : '';
|
|
1995
|
-
console.log(formatLogMessage('info',
|
|
2172
|
+
console.log(formatLogMessage('info', `${VPN_TAG} WireGuard connected via ${vpnResult.interface}${ipInfo} for ${currentUrl}`));
|
|
1996
2173
|
}
|
|
1997
2174
|
} else if (siteConfig.openvpn) {
|
|
1998
2175
|
const ovpnResult = await ovpnConnect(siteConfig, forceDebug);
|
|
1999
2176
|
if (!ovpnResult.success) {
|
|
2000
|
-
console.warn(formatLogMessage('warn',
|
|
2177
|
+
console.warn(formatLogMessage('warn', `${VPN_TAG} OpenVPN failed for ${currentUrl}: ${ovpnResult.error}`));
|
|
2001
2178
|
return { url: currentUrl, rules: [], success: false, vpnFailed: true };
|
|
2002
2179
|
}
|
|
2003
2180
|
if (!silentMode) {
|
|
2004
2181
|
const ipInfo = ovpnResult.externalIP ? ` (${ovpnResult.externalIP})` : '';
|
|
2005
|
-
console.log(formatLogMessage('info',
|
|
2182
|
+
console.log(formatLogMessage('info', `${VPN_TAG} OpenVPN connected via ${ovpnResult.connection}${ipInfo} for ${currentUrl}`));
|
|
2006
2183
|
}
|
|
2007
2184
|
}
|
|
2008
2185
|
|
|
@@ -2036,12 +2213,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2036
2213
|
const totalDelay = siteDelay + bufferTime;
|
|
2037
2214
|
|
|
2038
2215
|
if (forceDebug && hasCloudflareConfig) {
|
|
2039
|
-
console.log(formatLogMessage('debug',
|
|
2216
|
+
console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Using extended delay for Cloudflare site: ${totalDelay}ms (${siteDelay}ms + ${bufferTime}ms CF buffer)`));
|
|
2040
2217
|
}
|
|
2041
2218
|
|
|
2042
2219
|
const realtimeResult = await performRealtimeWindowCleanup(browserInstance, threshold, forceDebug, totalDelay);
|
|
2043
2220
|
if (realtimeResult.success && realtimeResult.closedCount > 0 && forceDebug) {
|
|
2044
|
-
console.log(formatLogMessage('debug',
|
|
2221
|
+
console.log(formatLogMessage('debug', `${REALTIME_CLEANUP_TAG} Cleaned ${realtimeResult.closedCount} old pages, ${realtimeResult.remainingPages} remaining`));
|
|
2045
2222
|
}
|
|
2046
2223
|
}
|
|
2047
2224
|
|
|
@@ -2052,7 +2229,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2052
2229
|
// Aggressive timeouts prevent hanging in Puppeteer 23.x while maintaining speed
|
|
2053
2230
|
|
|
2054
2231
|
page.on('console', (msg) => {
|
|
2055
|
-
if (forceDebug && msg.type() === 'error') console.log(
|
|
2232
|
+
if (forceDebug && msg.type() === 'error') console.log(formatLogMessage('debug', `Console error: ${msg.text()}`));
|
|
2056
2233
|
});
|
|
2057
2234
|
|
|
2058
2235
|
// Add page crash handler
|
|
@@ -2113,6 +2290,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2113
2290
|
const flowproxyTimeouts = getFlowProxyTimeouts(siteConfig);
|
|
2114
2291
|
page.setDefaultTimeout(Math.min(flowproxyTimeouts.pageTimeout, TIMEOUTS.DEFAULT_NAVIGATION));
|
|
2115
2292
|
page.setDefaultNavigationTimeout(Math.min(flowproxyTimeouts.navigationTimeout, TIMEOUTS.DEFAULT_PAGE));
|
|
2293
|
+
// Attach the response/header listener BEFORE navigation so the
|
|
2294
|
+
// document response's own headers (Server, Set-Cookie, X-FlowProxy-*,
|
|
2295
|
+
// etc.) are observed. The listener accumulates state in a WeakMap
|
|
2296
|
+
// keyed by page; analyzeFlowProxyProtection reads from it later.
|
|
2297
|
+
attachFlowProxyHeaderListener(page);
|
|
2116
2298
|
if (forceDebug) {
|
|
2117
2299
|
console.log(formatLogMessage('debug', `Applied flowProxy timeouts - page: ${flowproxyTimeouts.pageTimeout}ms, nav: ${flowproxyTimeouts.navigationTimeout}ms`));
|
|
2118
2300
|
}
|
|
@@ -2131,9 +2313,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2131
2313
|
if (shouldInjectEvalForPage) {
|
|
2132
2314
|
if (forceDebug) {
|
|
2133
2315
|
if (globalEvalOnDoc) {
|
|
2134
|
-
console.log(formatLogMessage('debug',
|
|
2316
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
|
|
2135
2317
|
} else { // siteConfig.evaluateOnNewDocument must be true
|
|
2136
|
-
console.log(formatLogMessage('debug',
|
|
2318
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
|
|
2137
2319
|
}
|
|
2138
2320
|
}
|
|
2139
2321
|
|
|
@@ -2154,7 +2336,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2154
2336
|
browserResponsive = true;
|
|
2155
2337
|
} catch (healthErr) {
|
|
2156
2338
|
if (forceDebug) {
|
|
2157
|
-
console.log(formatLogMessage('debug',
|
|
2339
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser health check failed: ${healthErr.message}`));
|
|
2158
2340
|
}
|
|
2159
2341
|
browserResponsive = false;
|
|
2160
2342
|
}
|
|
@@ -2253,7 +2435,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2253
2435
|
]);
|
|
2254
2436
|
evalOnDocSuccess = true;
|
|
2255
2437
|
if (forceDebug) {
|
|
2256
|
-
console.log(formatLogMessage('debug',
|
|
2438
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection successful for ${currentUrl}`));
|
|
2257
2439
|
}
|
|
2258
2440
|
} catch (fullInjectionErr) {
|
|
2259
2441
|
// Enhanced error detection for CDP issues
|
|
@@ -2264,12 +2446,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2264
2446
|
|
|
2265
2447
|
if (forceDebug) {
|
|
2266
2448
|
const errorType = isCDPError ? 'CDP/Protocol error' : 'timeout/other';
|
|
2267
|
-
console.log(formatLogMessage('debug',
|
|
2449
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Full injection failed (${errorType}): ${fullInjectionErr.message}`));
|
|
2268
2450
|
}
|
|
2269
2451
|
|
|
2270
2452
|
// Skip fallback for CDP errors - they indicate browser communication issues
|
|
2271
2453
|
if (isCDPError) {
|
|
2272
|
-
console.warn(formatLogMessage('warn',
|
|
2454
|
+
console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} CDP communication failure - skipping injection for ${currentUrl}`));
|
|
2273
2455
|
evalOnDocSuccess = false;
|
|
2274
2456
|
} else {
|
|
2275
2457
|
|
|
@@ -2316,11 +2498,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2316
2498
|
]);
|
|
2317
2499
|
evalOnDocSuccess = true;
|
|
2318
2500
|
if (forceDebug) {
|
|
2319
|
-
console.log(formatLogMessage('debug',
|
|
2501
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection successful for ${currentUrl}`));
|
|
2320
2502
|
}
|
|
2321
2503
|
} catch (minimalInjectionErr) {
|
|
2322
2504
|
if (forceDebug) {
|
|
2323
|
-
console.log(formatLogMessage('debug',
|
|
2505
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Minimal injection also failed: ${minimalInjectionErr.message}`));
|
|
2324
2506
|
}
|
|
2325
2507
|
evalOnDocSuccess = false;
|
|
2326
2508
|
}
|
|
@@ -2328,14 +2510,14 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2328
2510
|
}
|
|
2329
2511
|
} else {
|
|
2330
2512
|
if (forceDebug) {
|
|
2331
|
-
console.log(formatLogMessage('debug',
|
|
2513
|
+
console.log(formatLogMessage('debug', `${EVAL_ON_DOC_TAG} Browser unresponsive, skipping injection for ${currentUrl}`));
|
|
2332
2514
|
}
|
|
2333
2515
|
evalOnDocSuccess = false;
|
|
2334
2516
|
}
|
|
2335
2517
|
|
|
2336
2518
|
// Final status logging
|
|
2337
2519
|
if (!evalOnDocSuccess) {
|
|
2338
|
-
console.warn(formatLogMessage('warn',
|
|
2520
|
+
console.warn(formatLogMessage('warn', `${EVAL_ON_DOC_TAG} All injection strategies failed for ${currentUrl} - continuing with standard request monitoring only`));
|
|
2339
2521
|
}
|
|
2340
2522
|
// Allow realtime cleanup to proceed after injection completes
|
|
2341
2523
|
if (shouldInjectEvalForPage && siteConfig.window_cleanup === "realtime") {
|
|
@@ -2364,7 +2546,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2364
2546
|
}
|
|
2365
2547
|
}, { selectors: cssBlockedSelectors });
|
|
2366
2548
|
} catch (cssErr) {
|
|
2367
|
-
console.warn(formatLogMessage('warn',
|
|
2549
|
+
console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
|
|
2368
2550
|
}
|
|
2369
2551
|
}
|
|
2370
2552
|
// --- END: CSS Element Blocking Setup ---
|
|
@@ -2421,7 +2603,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2421
2603
|
const clearResult = await clearSiteData(page, currentUrl, forceDebug);
|
|
2422
2604
|
if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data for ${currentUrl}`));
|
|
2423
2605
|
} catch (clearErr) {
|
|
2424
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
2606
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Failed for ${currentUrl}: ${clearErr.message}`));
|
|
2425
2607
|
}
|
|
2426
2608
|
}
|
|
2427
2609
|
|
|
@@ -2438,6 +2620,29 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2438
2620
|
} else if (forceDebug) {
|
|
2439
2621
|
console.log(formatLogMessage('debug', `Skipping fingerprint injection — Obscura provides built-in stealth`));
|
|
2440
2622
|
}
|
|
2623
|
+
|
|
2624
|
+
// Fullscreen API lockdown, registered through page.evaluateOnNewDocument
// so the stubs are installed for documents this page loads from here on.
// Skipped entirely when allowFullscreen is set.
//
// Element.prototype.requestFullscreen is replaced by a function returning
// an already-resolved Promise; the vendor-prefixed legacy variants are
// replaced by plain no-ops. Nothing in this block ever enters fullscreen,
// so a page cannot push the real browser window fullscreen in --headful.
// A failed registration is non-fatal: it is only reported in debug mode.
if (!allowFullscreen) {
  try {
    await page.evaluateOnNewDocument(() => {
      // NOTE: this callback runs inside the page, so it must stay
      // self-contained (no references to Node-side variables).
      const noop = function () { return Promise.resolve(); };
      const legacyNoop = function () {};
      const stubTable = [
        ['requestFullscreen', noop],
        ['webkitRequestFullscreen', legacyNoop],
        ['webkitRequestFullScreen', legacyNoop],
        ['mozRequestFullScreen', legacyNoop],
        ['msRequestFullscreen', legacyNoop],
      ];
      for (const [methodName, stub] of stubTable) {
        // Each assignment is guarded on its own so one failing
        // property cannot prevent the remaining stubs from landing.
        try { Element.prototype[methodName] = stub; } catch (_) {}
      }
    });
  } catch (fsErr) {
    if (forceDebug) console.log(formatLogMessage('debug', `Fullscreen neutralization injection failed: ${fsErr.message}`));
  }
}
|
|
2441
2646
|
|
|
2442
2647
|
// Client Hints protection for Chrome user agents (skipped under Obscura — it sets its own)
|
|
2443
2648
|
if (!useObscura && siteConfig.userAgent && siteConfig.userAgent.toLowerCase().includes('chrome')) {
|
|
@@ -2624,19 +2829,41 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2624
2829
|
});
|
|
2625
2830
|
}
|
|
2626
2831
|
|
|
2627
|
-
|
|
2628
|
-
|
|
2629
|
-
|
|
2832
|
+
// Per-site blocked compile -- helper warns on bad patterns instead of
|
|
2833
|
+
// throwing out of processUrl and breaking that site's scan.
|
|
2834
|
+
const blockedRegexes = compilePatternList(`blocked (site: ${siteConfig.url || 'unknown'})`, siteConfig.blocked, getCompiledRegex);
|
|
2835
|
+
|
|
2836
|
+
// Per-site escape hatch: disable_adblock turns off the two layers of
|
|
2837
|
+
// "global" ad-blocking for this URL — the adblock-rs filter-list engine
|
|
2838
|
+
// and the globalBlockedRegexes pattern list. Per-site siteConfig.blocked
|
|
2839
|
+
// is preserved (it's an explicit per-site choice, not "global" blocking).
|
|
2840
|
+
//
|
|
2841
|
+
// The use case: capture_popups + popunder/redirect chains. The global
|
|
2842
|
+
// adblock often aborts the exact requests that fire the popup or chain
|
|
2843
|
+
// to the tracker, defeating capture. Setting disable_adblock: true for
|
|
2844
|
+
// those specific URLs lets the chain play out naturally so the popup
|
|
2845
|
+
// request listener can observe the full hop sequence.
|
|
2846
|
+
const disableAdblock = siteConfig.disable_adblock === true;
|
|
2630
2847
|
|
|
2631
2848
|
// Pre-build Set for O(1) resourceType lookups (fired per request)
|
|
2632
2849
|
const allowedResourceTypesSet = Array.isArray(siteConfig.resourceTypes)
|
|
2633
2850
|
? new Set(siteConfig.resourceTypes)
|
|
2634
2851
|
: null;
|
|
2635
|
-
|
|
2636
|
-
// Combine site-specific with pre-compiled global blocked patterns
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2852
|
+
|
|
2853
|
+
// Effective blocked-pattern list for this URL:
//   - disable_adblock set  -> only the per-site patterns
//   - no per-site patterns -> the shared global array itself (no copy)
//   - otherwise            -> per-site patterns followed by global ones
// Site patterns always come first, so an index below
// blockedRegexes.length identifies a site-level pattern.
const allBlockedRegexes = (() => {
  if (disableAdblock) return blockedRegexes;
  if (blockedRegexes.length > 0) return [...blockedRegexes, ...globalBlockedRegexes];
  return globalBlockedRegexes;
})();

// Debug-only note describing what disable_adblock switched off for this URL.
if (forceDebug && disableAdblock) {
  const skippedGlobalCount = globalBlockedRegexes.length;
  const engineSuffix = (adblockEnabled && adblockMatcher) ? ' + adblock-rs engine' : '';
  console.log(formatLogMessage('debug', `[adblock] disable_adblock=true for ${currentUrl} — skipping ${skippedGlobalCount} global blocked patterns${engineSuffix} (site-level ${blockedRegexes.length} pattern(s) still apply)`));
}
|
|
2640
2867
|
|
|
2641
2868
|
/**
|
|
2642
2869
|
* Helper function to add domain to matched collection
|
|
@@ -2663,7 +2890,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2663
2890
|
const cachedSimilarity = smartCache.getCachedSimilarity(domain, existingDomain);
|
|
2664
2891
|
if (cachedSimilarity !== null && cachedSimilarity >= similarityThreshold) {
|
|
2665
2892
|
if (forceDebug) {
|
|
2666
|
-
console.log(formatLogMessage('debug',
|
|
2893
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Used cached similarity: ${domain} ~= ${existingDomain} (${cachedSimilarity}%)`));
|
|
2667
2894
|
}
|
|
2668
2895
|
return; // Skip adding this domain
|
|
2669
2896
|
}
|
|
@@ -2687,7 +2914,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2687
2914
|
|
|
2688
2915
|
if (smartCache && smartCache.shouldSkipDomain(domain, context)) {
|
|
2689
2916
|
if (forceDebug) {
|
|
2690
|
-
console.log(formatLogMessage('debug',
|
|
2917
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Skipping cached domain: ${domain}`));
|
|
2691
2918
|
}
|
|
2692
2919
|
return; // Skip adding this domain
|
|
2693
2920
|
}
|
|
@@ -2705,7 +2932,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2705
2932
|
|
|
2706
2933
|
if (similarCheck.shouldIgnore) {
|
|
2707
2934
|
if (forceDebug) {
|
|
2708
|
-
console.log(formatLogMessage('debug',
|
|
2935
|
+
console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_TAG} Skipping ${domain}: ${similarCheck.reason}`));
|
|
2709
2936
|
}
|
|
2710
2937
|
return; // Skip adding this domain
|
|
2711
2938
|
}
|
|
@@ -2721,7 +2948,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2721
2948
|
|
|
2722
2949
|
if (ignoredSimilarCheck.shouldIgnore) {
|
|
2723
2950
|
if (forceDebug) {
|
|
2724
|
-
console.log(formatLogMessage('debug',
|
|
2951
|
+
console.log(formatLogMessage('debug', `${IGNORE_SIMILAR_IGNORED_DOMAINS_TAG} Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
|
|
2725
2952
|
}
|
|
2726
2953
|
return; // Skip adding this domain
|
|
2727
2954
|
}
|
|
@@ -2742,7 +2969,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2742
2969
|
}
|
|
2743
2970
|
} catch (cacheErr) {
|
|
2744
2971
|
if (forceDebug) {
|
|
2745
|
-
console.log(formatLogMessage('debug',
|
|
2972
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Error marking domain: ${cacheErr.message}`));
|
|
2746
2973
|
}
|
|
2747
2974
|
}
|
|
2748
2975
|
}
|
|
@@ -2760,6 +2987,247 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2760
2987
|
}
|
|
2761
2988
|
}
|
|
2762
2989
|
|
|
2990
|
+
// === POPUP CAPTURE (opt-in via siteConfig.capture_popups: true) ===
|
|
2991
|
+
// Many ad networks fire popunders / new-tab opens (window.open, target=
|
|
2992
|
+
// "_blank") that navigate to trackers and disappear from view. Those
|
|
2993
|
+
// pages are SEPARATE Puppeteer targets — page.on('request', ...) on the
|
|
2994
|
+
// main page never sees their network traffic.
|
|
2995
|
+
//
|
|
2996
|
+
// IMPORTANT: modern Chromium blocks programmatic window.open() unless
|
|
2997
|
+
// it's triggered by a real user gesture. In practice that means
|
|
2998
|
+
// capture_popups only catches anything when the scanner is actually
|
|
2999
|
+
// clicking on the page — i.e., the site config also has
|
|
3000
|
+
// `interact: true` AND `interact_clicks: true`. Setting capture_popups
|
|
3001
|
+
// alone will register the listener but no popups will fire.
|
|
3002
|
+
//
|
|
3003
|
+
// When capture_popups is true, we attach a browser-level 'targetcreated'
|
|
3004
|
+
// listener for THIS URL only. New page targets whose opener-chain leads
|
|
3005
|
+
// back to our main page (within maxDepth levels) get a stripped-down
|
|
3006
|
+
// request listener — same regex/first-party/ignoreDomains filter as
|
|
3007
|
+
// the main handler, same addMatchedDomain() sink, same domain
|
|
3008
|
+
// detection cache, same nettools/similarity logic (all inherited via
|
|
3009
|
+
// addMatchedDomain). Cloudflare bypass, adblock-rs matching, curl/grep
|
|
3010
|
+
// content download, and request.abort() are intentionally skipped on
|
|
3011
|
+
// popups — they're observation-only.
|
|
3012
|
+
//
|
|
3013
|
+
// Each popup's request listener stays attached across in-window
|
|
3014
|
+
// navigations, so a single popup that redirects A -> B -> C captures
|
|
3015
|
+
// every hop. The capture window (default 5s, configurable per-site
|
|
3016
|
+
// via capture_popups_window_ms) is the wall-clock budget for that
|
|
3017
|
+
// chain — bump it for long redirect chains, lower it for high-popup-
|
|
3018
|
+
// rate sites where memory pressure matters more than chain coverage.
|
|
3019
|
+
const capturePopups = siteConfig.capture_popups === true;
|
|
3020
|
+
// Per-site popup-capture tunables. Each raw config value is run through
// parseInt(…, 10); anything that does not parse to a finite number
// greater than zero (missing, non-numeric, 0, negative) falls back to
// the default instead of disabling capture.
const parsePopupSetting = (rawValue, fallback) => {
  const parsed = parseInt(rawValue, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
};
// Maximum opener-chain depth (relative to the main page) that is captured.
const POPUP_MAX_DEPTH = parsePopupSetting(siteConfig.capture_popups_max_depth, 2);
// Milliseconds a captured popup stays open before it is auto-closed.
const POPUP_CAPTURE_WINDOW_MS = parsePopupSetting(siteConfig.capture_popups_window_ms, 5000);
|
|
3031
|
+
|
|
3032
|
+
if (capturePopups && forceDebug) {
  // Setup-time diagnostics (debug mode only): flag a config that enables
  // popup capture without click interaction, then echo the effective
  // depth/window settings.
  const clicksMissing = siteConfig.interact !== true || siteConfig.interact_clicks !== true;
  if (clicksMissing) {
    console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but interact_clicks is not — popups need user-gesture clicks to fire; expect no captures unless the page opens popups via in-page redirects`));
  }
  console.log(formatLogMessage('debug', `[popup] capture_popups settings: maxDepth=${POPUP_MAX_DEPTH}, windowMs=${POPUP_CAPTURE_WINDOW_MS}`));
}
|
|
3041
|
+
|
|
3042
|
+
if (capturePopups) {
|
|
3043
|
+
const mainTarget = page.target();
|
|
3044
|
+
|
|
3045
|
+
// Depth of `target` relative to mainTarget, found by following the
// target.opener() chain: 1 = opened directly by the main page,
// 2 = popup-of-popup, and so on.
// Returns 0 when mainTarget is not reached — either the chain ends
// first, or it is longer than POPUP_MAX_DEPTH + 2 hops (the walk is
// bounded, so the result can exceed POPUP_MAX_DEPTH by at most 2 and
// callers can still tell "too deep" apart from "not ours").
const getPopupDepth = (target) => {
  const hopLimit = POPUP_MAX_DEPTH + 2;
  let ancestor = target.opener();
  for (let hops = 1; ancestor && hops <= hopLimit; hops++) {
    if (ancestor === mainTarget) return hops;
    ancestor = ancestor.opener();
  }
  return 0;
};
|
|
3058
|
+
|
|
3059
|
+
// Attach an observation-only 'request' listener to a popup page.
// Request interception is not enabled here and nothing is aborted or
// continued: each request is only inspected. A request that survives
// every gate is reported through addMatchedDomain — directly, or via a
// nettools handler when hasNetTools is set.
//
//   popupPage - page object of the popup (only .on('request', …) is used)
//   depth     - popup depth relative to the main page (used in debug logs)
const attachPopupRequestCapture = (popupPage, depth) => {
  // Index of the first pattern whose test() accepts `url`, or -1.
  const firstMatchingIndex = (patterns, url) => {
    for (let idx = 0; idx < patterns.length; idx++) {
      if (patterns[idx].test(url)) return idx;
    }
    return -1;
  };

  const inspectPopupRequest = (request) => {
    const requestUrl = request.url();

    // Resolve hostname + registrable domain; unparsable URLs are dropped.
    let hostname = '';
    let rootDomain = '';
    try {
      hostname = new URL(requestUrl).hostname;
      rootDomain = psl.parse(hostname).domain || hostname;
    } catch (_) {
      return;
    }
    if (!rootDomain) return;

    // ignoreDomainsByUrl trigger: a URL-pattern hit adds the root domain
    // to the shared dynamic-ignore Set.
    if (_ignoreDomainsByUrlRegexes.length > 0 && !_dynamicallyIgnoredDomains.has(rootDomain)) {
      const ignoreIdx = firstMatchingIndex(_ignoreDomainsByUrlRegexes, requestUrl);
      if (ignoreIdx !== -1) {
        _dynamicallyIgnoredDomains.add(rootDomain);
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${rootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[ignoreIdx].source} (from popup depth=${depth})`));
        }
      }
    }

    // blockDomainsByUrl trigger: same idea, feeding the shared
    // dynamic-block Set.
    if (_blockDomainsByUrlRegexes.length > 0 && !_dynamicallyBlockedDomains.has(rootDomain)) {
      const blockIdx = firstMatchingIndex(_blockDomainsByUrlRegexes, requestUrl);
      if (blockIdx !== -1) {
        _dynamicallyBlockedDomains.add(rootDomain);
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${rootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[blockIdx].source} (from popup depth=${depth})`));
        }
      }
    }

    // Gates. A dynamically blocked domain is simply not processed here;
    // the popup request itself is left alone.
    if (matchesIgnoreDomain(rootDomain, ignoreDomains)) return;
    if (matchesDynamicBlock(rootDomain)) return;

    // First-/third-party is judged against the main URL's domain group,
    // not against the popup's own URL.
    const isFirstParty = firstPartyDomains.has(rootDomain);
    if (siteConfig.firstParty === false && isFirstParty) return;
    if (siteConfig.thirdParty === false && !isFirstParty) return;

    // The URL must match at least one of the site's filter regexes.
    const resourceType = request.resourceType();
    let matchedAnyRegex = false;
    for (const candidate of regexes) {
      if (!candidate.test(requestUrl)) continue;
      matchedAnyRegex = true;
      if (forceDebug) {
        console.log(formatLogMessage('debug', `[popup depth=${depth}] Matched ${rootDomain} via ${candidate} (${resourceType})`));
      }
      break;
    }
    if (!matchedAnyRegex) return;

    if (!hasNetTools) {
      // No nettools configured — the regex match alone counts.
      addMatchedDomain(rootDomain, resourceType, hostname);
      return;
    }

    // With nettools configured, recording the domain is delegated to a
    // nettools handler (whois/dig), scheduled off the request callback.
    const popupNetToolsHandler = createNetToolsHandler({
      whoisTerms,
      whoisOrTerms,
      processedWhoisDomains: globalProcessedWhoisDomains,
      processedDigDomains: globalProcessedDigDomains,
      whoisDelay: siteConfig.whois_delay !== undefined ? siteConfig.whois_delay : whois_delay,
      whoisServer,
      whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
      debugLogFile,
      digTerms,
      digOrTerms,
      digRecordType,
      digSubdomain: siteConfig.dig_subdomain === true,
      dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
      matchedDomains,
      addMatchedDomain,
      isDomainAlreadyDetected: isLocallyDetected,
      onWhoisResult: smartCache ? (domain, result) => smartCache.cacheNetTools(domain, 'whois', result) : undefined,
      onDigResult: smartCache ? (domain, result, recordType) => smartCache.cacheNetTools(domain, 'dig', result, recordType) : undefined,
      cachedWhois: smartCache ? smartCache.getCachedNetTools(rootDomain, 'whois') : null,
      cachedDig: smartCache ? smartCache.getCachedNetTools(rootDomain, 'dig', digRecordType) : null,
      currentUrl,
      getRootDomain,
      siteConfig,
      dumpUrls,
      matchedUrlsLogFile,
      forceDebug,
      fs,
      ignoreDomains,
      matchesIgnoreDomain
    });
    setImmediate(() => popupNetToolsHandler(rootDomain, hostname));
  };

  popupPage.on('request', (request) => {
    // Observation-only: an error while inspecting a popup request must
    // never escape the listener.
    try {
      inspectPopupRequest(request);
    } catch (_) { /* intentionally ignored */ }
  });
};
|
|
3178
|
+
|
|
3179
|
+
// Browser-level 'targetcreated' handler for popup capture. Page targets
// whose opener chain reaches the main page within POPUP_MAX_DEPTH get a
// request-capture listener and are auto-closed after
// POPUP_CAPTURE_WINDOW_MS; every captured popup also registers a cleanup
// in popupCleanups that cancels the timer and closes the page.
const onTargetCreated = async (target) => {
  // Best-effort close: skips pages already closed and swallows both
  // synchronous throws and the rejection of the close() promise.
  const closeQuietly = (pageToClose) => {
    try {
      if (!pageToClose.isClosed()) pageToClose.close().catch(() => {});
    } catch (_) {}
  };

  // Once processing of this URL has finished (urlFinished), a late popup
  // must not get a listener that could still push matches for it.
  if (urlFinished || target.type() !== 'page') return;

  const depth = getPopupDepth(target);
  if (depth < 1) return; // Not a descendant of our main page
  if (depth > POPUP_MAX_DEPTH) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `[popup] Skipping depth-${depth} popup (max=${POPUP_MAX_DEPTH}): ${target.url() || 'about:blank'}`));
    }
    return;
  }

  let popupPage;
  try {
    popupPage = await target.page();
  } catch (_) {
    return;
  }
  if (!popupPage) return;

  // urlFinished may have flipped while target.page() was pending; in
  // that case just get rid of the popup.
  if (urlFinished) {
    closeQuietly(popupPage);
    return;
  }

  if (forceDebug) {
    console.log(formatLogMessage('debug', `[popup depth=${depth}] Capturing popup: ${target.url() || 'about:blank'}`));
  }

  attachPopupRequestCapture(popupPage, depth);

  // Auto-close after the capture window so popups don't pile up. The
  // timer is unref'd (when supported) so it cannot keep the process alive.
  const closeTimer = setTimeout(() => closeQuietly(popupPage), POPUP_CAPTURE_WINDOW_MS);
  if (typeof closeTimer.unref === 'function') closeTimer.unref();

  popupCleanups.push(() => {
    clearTimeout(closeTimer);
    closeQuietly(popupPage);
  });
};
|
|
3224
|
+
|
|
3225
|
+
browser.on('targetcreated', onTargetCreated);
|
|
3226
|
+
popupCleanups.push(() => {
|
|
3227
|
+
try { browser.off('targetcreated', onTargetCreated); } catch (_) {}
|
|
3228
|
+
});
|
|
3229
|
+
}
|
|
3230
|
+
|
|
2763
3231
|
// --- page.on('request', ...) Handler: Core Network Request Logic ---
|
|
2764
3232
|
// This handler is triggered for every network request made by the page.
|
|
2765
3233
|
// It decides whether to allow, block, or process the request based on:
|
|
@@ -2820,15 +3288,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2820
3288
|
console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[frame: ${isMainFrame ? 'main' : 'iframe'}] ${debugFrameUrl} → ${checkedUrl}`));
|
|
2821
3289
|
}
|
|
2822
3290
|
|
|
2823
|
-
// Apply adblock rules BEFORE expensive regex checks
|
|
2824
|
-
|
|
3291
|
+
// Apply adblock-rs filter-list rules BEFORE expensive regex checks
|
|
3292
|
+
// for better performance. Gated on !disableAdblock so per-URL configs
|
|
3293
|
+
// (e.g. for popup/redirect chain capture) can bypass it.
|
|
3294
|
+
if (!disableAdblock && adblockEnabled && adblockMatcher) {
|
|
2825
3295
|
try {
|
|
2826
3296
|
const result = adblockMatcher.shouldBlock(
|
|
2827
3297
|
checkedUrl,
|
|
2828
3298
|
currentUrl,
|
|
2829
3299
|
request.resourceType()
|
|
2830
3300
|
);
|
|
2831
|
-
|
|
3301
|
+
|
|
2832
3302
|
if (result.blocked) {
|
|
2833
3303
|
adblockStats.blocked++;
|
|
2834
3304
|
if (forceDebug) {
|
|
@@ -2862,13 +3332,42 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2862
3332
|
if (_ignoreDomainsByUrlRegexes[i].test(reqUrl)) {
|
|
2863
3333
|
_dynamicallyIgnoredDomains.add(checkedRootDomain);
|
|
2864
3334
|
if (forceDebug) {
|
|
2865
|
-
console.log(formatLogMessage('debug',
|
|
3335
|
+
console.log(formatLogMessage('debug', `${IGNORE_DOMAINS_BY_URL_TAG} ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
|
|
2866
3336
|
}
|
|
2867
3337
|
break;
|
|
2868
3338
|
}
|
|
2869
3339
|
}
|
|
2870
3340
|
}
|
|
2871
3341
|
|
|
3342
|
+
// blockDomainsByUrl trigger — symmetric to ignoreDomainsByUrl above.
|
|
3343
|
+
// If any pattern matches this URL, mark the root domain as blocked
|
|
3344
|
+
// for the rest of the scan. The gate immediately below catches the
|
|
3345
|
+
// triggering request itself + any future request on this domain or
|
|
3346
|
+
// its subdomains (parent-walk via matchesDynamicBlock).
|
|
3347
|
+
if (_blockDomainsByUrlRegexes.length > 0 && checkedRootDomain && !_dynamicallyBlockedDomains.has(checkedRootDomain)) {
|
|
3348
|
+
for (let i = 0; i < _blockDomainsByUrlRegexes.length; i++) {
|
|
3349
|
+
if (_blockDomainsByUrlRegexes[i].test(reqUrl)) {
|
|
3350
|
+
_dynamicallyBlockedDomains.add(checkedRootDomain);
|
|
3351
|
+
if (forceDebug) {
|
|
3352
|
+
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} ${checkedRootDomain} blocked — matched pattern: ${_blockDomainsByUrlRegexes[i].source}`));
|
|
3353
|
+
}
|
|
3354
|
+
break;
|
|
3355
|
+
}
|
|
3356
|
+
}
|
|
3357
|
+
}
|
|
3358
|
+
// blockDomainsByUrl gate — abort if reqDomain (or a parent) is in
|
|
3359
|
+
// the dynamic block Set. Fires BEFORE the static blocked-regex
|
|
3360
|
+
// check so domain-based blocks short-circuit without paying the
|
|
3361
|
+
// per-URL regex scan. Same abort reason as the static path so
|
|
3362
|
+
// request.failure() observers see consistent metadata.
|
|
3363
|
+
if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
|
|
3364
|
+
if (forceDebug) {
|
|
3365
|
+
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
|
|
3366
|
+
}
|
|
3367
|
+
request.abort('blockedbyclient');
|
|
3368
|
+
return;
|
|
3369
|
+
}
|
|
3370
|
+
|
|
2872
3371
|
let blockedMatchIndex = -1;
|
|
2873
3372
|
for (let i = 0; i < allBlockedRegexes.length; i++) {
|
|
2874
3373
|
if (allBlockedRegexes[i].test(reqUrl)) {
|
|
@@ -2877,8 +3376,16 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2877
3376
|
}
|
|
2878
3377
|
}
|
|
2879
3378
|
if (blockedMatchIndex !== -1) {
|
|
3379
|
+
// Always track the hit (zero-cost on the un-debug path) so the
|
|
3380
|
+
// scan-end summary can show which patterns are doing work vs.
|
|
3381
|
+
// which are stale and ready to prune. Keyed by pattern.source --
|
|
3382
|
+
// identical patterns from site + global lists roll up together,
|
|
3383
|
+
// which matches how users think about them.
|
|
3384
|
+
const matchedPatternSrc = allBlockedRegexes[blockedMatchIndex].source;
|
|
3385
|
+
_blockedPatternHits.set(matchedPatternSrc, (_blockedPatternHits.get(matchedPatternSrc) || 0) + 1);
|
|
3386
|
+
|
|
2880
3387
|
if (forceDebug) {
|
|
2881
|
-
const matchedPattern =
|
|
3388
|
+
const matchedPattern = matchedPatternSrc;
|
|
2882
3389
|
const patternSource = blockedMatchIndex < blockedRegexes.length ? 'site' : 'global';
|
|
2883
3390
|
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked]')}[${simplifiedCurrentUrl}] ${reqUrl} blocked by ${patternSource} pattern: ${matchedPattern}`));
|
|
2884
3391
|
|
|
@@ -2950,6 +3457,19 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2950
3457
|
return;
|
|
2951
3458
|
}
|
|
2952
3459
|
|
|
3460
|
+
// Early ignoreDomains gate — skip regex + dig/whois entirely for domains
|
|
3461
|
+
// in the ignoreDomains list (or dynamically-ignored ones populated by
|
|
3462
|
+
// ignoreDomainsByUrl above). Mirrors the popup handler's early gate so
|
|
3463
|
+
// the main path doesn't waste a dig/whois lookup on domains that
|
|
3464
|
+
// post-processing/output filters will strip anyway.
|
|
3465
|
+
if (matchesIgnoreDomain(reqDomain, ignoreDomains)) {
|
|
3466
|
+
if (forceDebug) {
|
|
3467
|
+
console.log(formatLogMessage('debug', `Skipping ignoreDomains match: ${reqDomain}`));
|
|
3468
|
+
}
|
|
3469
|
+
request.continue();
|
|
3470
|
+
return;
|
|
3471
|
+
}
|
|
3472
|
+
|
|
2953
3473
|
// === ENHANCED REGEX MATCHING WITH AND/OR LOGIC ===
|
|
2954
3474
|
let regexMatched = false;
|
|
2955
3475
|
let matchedRegexPattern = null;
|
|
@@ -3046,9 +3566,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3046
3566
|
dumpUrls,
|
|
3047
3567
|
matchedUrlsLogFile,
|
|
3048
3568
|
forceDebug,
|
|
3049
|
-
fs
|
|
3569
|
+
fs,
|
|
3570
|
+
ignoreDomains,
|
|
3571
|
+
matchesIgnoreDomain
|
|
3050
3572
|
});
|
|
3051
|
-
|
|
3573
|
+
|
|
3052
3574
|
// Execute nettools check asynchronously
|
|
3053
3575
|
const originalDomain = fullSubdomain;
|
|
3054
3576
|
setImmediate(() => netToolsHandler(reqDomain, originalDomain));
|
|
@@ -3122,7 +3644,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3122
3644
|
const cachedDig = smartCache ? smartCache.getCachedNetTools(reqDomain, 'dig', digRecordType) : null;
|
|
3123
3645
|
|
|
3124
3646
|
if ((cachedWhois || cachedDig) && forceDebug) {
|
|
3125
|
-
console.log(formatLogMessage('debug',
|
|
3647
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached nettools results for ${reqDomain}`));
|
|
3126
3648
|
}
|
|
3127
3649
|
|
|
3128
3650
|
// Create nettools handler with cache callbacks (if cache is enabled)
|
|
@@ -3159,9 +3681,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3159
3681
|
dumpUrls,
|
|
3160
3682
|
matchedUrlsLogFile,
|
|
3161
3683
|
forceDebug,
|
|
3162
|
-
fs
|
|
3684
|
+
fs,
|
|
3685
|
+
ignoreDomains,
|
|
3686
|
+
matchesIgnoreDomain
|
|
3163
3687
|
});
|
|
3164
|
-
|
|
3688
|
+
|
|
3165
3689
|
// Execute nettools check asynchronously
|
|
3166
3690
|
const originalDomain = fullSubdomain; // Use full subdomain for nettools
|
|
3167
3691
|
setImmediate(() => netToolsHandler(reqDomain, originalDomain));
|
|
@@ -3218,7 +3742,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3218
3742
|
}
|
|
3219
3743
|
|
|
3220
3744
|
if (cachedContent && forceDebug) {
|
|
3221
|
-
console.log(formatLogMessage('debug',
|
|
3745
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Using cached response content for ${reqUrl.substring(0, 50)}...`));
|
|
3222
3746
|
// Process cached content instead of fetching
|
|
3223
3747
|
} else {
|
|
3224
3748
|
try {
|
|
@@ -3248,7 +3772,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3248
3772
|
forceDebug,
|
|
3249
3773
|
userAgent: curlUserAgent,
|
|
3250
3774
|
resourceType,
|
|
3251
|
-
|
|
3775
|
+
// Pass both flags separately — createGrepHandler now
|
|
3776
|
+
// applies AND logic when hasSearchStringAnd is set.
|
|
3777
|
+
// Previously OR'd into hasSearchString and the AND
|
|
3778
|
+
// patterns were silently dropped.
|
|
3779
|
+
hasSearchString,
|
|
3780
|
+
hasSearchStringAnd,
|
|
3252
3781
|
grepOptions: {
|
|
3253
3782
|
ignoreCase: true,
|
|
3254
3783
|
wholeWord: false,
|
|
@@ -3298,7 +3827,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3298
3827
|
} else if (useGrep && (hasSearchString || hasSearchStringAnd)) {
|
|
3299
3828
|
// Use grep with response handler (no curl)
|
|
3300
3829
|
if (forceDebug) {
|
|
3301
|
-
console.log(formatLogMessage('debug',
|
|
3830
|
+
console.log(formatLogMessage('debug', `${GREP_RESPONSE_TAG} Queuing ${reqUrl} for grep analysis via response handler`));
|
|
3302
3831
|
}
|
|
3303
3832
|
|
|
3304
3833
|
// Queue for grep processing via response handler
|
|
@@ -3386,7 +3915,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3386
3915
|
}
|
|
3387
3916
|
}, cssBlockedSelectors);
|
|
3388
3917
|
} catch (cssRuntimeErr) {
|
|
3389
|
-
console.warn(formatLogMessage('warn',
|
|
3918
|
+
console.warn(formatLogMessage('warn', `${CSS_BLOCKED_TAG} Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
|
|
3390
3919
|
}
|
|
3391
3920
|
}
|
|
3392
3921
|
}
|
|
@@ -3698,8 +4227,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3698
4227
|
const proxyErr = proxyErrors.find(e => err.message.includes(e));
|
|
3699
4228
|
if (proxyErr) {
|
|
3700
4229
|
const info = getProxyInfo(siteConfig);
|
|
3701
|
-
console.error(formatLogMessage('error',
|
|
3702
|
-
console.error(formatLogMessage('error',
|
|
4230
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
|
|
4231
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
|
|
3703
4232
|
}
|
|
3704
4233
|
}
|
|
3705
4234
|
console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
|
|
@@ -3751,7 +4280,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3751
4280
|
try {
|
|
3752
4281
|
if (ghostConfig) {
|
|
3753
4282
|
// Ghost-cursor mode: Bezier-based mouse movements
|
|
3754
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
4283
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Using ghost-cursor for ${currentUrl}`));
|
|
3755
4284
|
const cursor = createGhostCursor(page, { forceDebug });
|
|
3756
4285
|
if (cursor) {
|
|
3757
4286
|
await Promise.race([
|
|
@@ -3789,8 +4318,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3789
4318
|
await performPageInteraction(page, currentUrl, {
|
|
3790
4319
|
...interactionConfig,
|
|
3791
4320
|
mouseMovements: 0,
|
|
3792
|
-
includeElementClicks: false
|
|
3793
|
-
includeTyping: false
|
|
4321
|
+
includeElementClicks: false
|
|
3794
4322
|
}, forceDebug);
|
|
3795
4323
|
}
|
|
3796
4324
|
})(),
|
|
@@ -3811,7 +4339,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3811
4339
|
]);
|
|
3812
4340
|
}
|
|
3813
4341
|
} catch (interactTimeoutErr) {
|
|
3814
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
4342
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} Aborted after ${INTERACTION_HARD_TIMEOUT}ms: ${interactTimeoutErr.message}`));
|
|
3815
4343
|
}
|
|
3816
4344
|
})();
|
|
3817
4345
|
|
|
@@ -3946,7 +4474,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3946
4474
|
const clearResult = await clearSiteData(page, currentUrl, forceDebug, true); // Quick mode for reloads
|
|
3947
4475
|
if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i} for ${currentUrl}`));
|
|
3948
4476
|
} catch (reloadClearErr) {
|
|
3949
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
4477
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} Before reload failed for ${currentUrl}`));
|
|
3950
4478
|
}
|
|
3951
4479
|
}
|
|
3952
4480
|
|
|
@@ -4140,8 +4668,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4140
4668
|
const proxyErr = proxyErrors.find(e => err.message.includes(e));
|
|
4141
4669
|
if (proxyErr) {
|
|
4142
4670
|
const info = getProxyInfo(siteConfig);
|
|
4143
|
-
console.error(formatLogMessage('error',
|
|
4144
|
-
console.error(formatLogMessage('error',
|
|
4671
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} ${proxyErr} — proxy: ${info} — URL: ${currentUrl}`));
|
|
4672
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Check: is the proxy running? Are credentials correct? Is the target reachable from the proxy?`));
|
|
4145
4673
|
}
|
|
4146
4674
|
}
|
|
4147
4675
|
|
|
@@ -4208,17 +4736,33 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4208
4736
|
};
|
|
4209
4737
|
} finally {
|
|
4210
4738
|
// Guaranteed resource cleanup - this runs regardless of success or failure
|
|
4211
|
-
|
|
4739
|
+
|
|
4740
|
+
// Flip the popup-capture race-window guard first so any in-flight
|
|
4741
|
+
// 'targetcreated' handler that resolves after this point sees the
|
|
4742
|
+
// flag and bails (closing its own popup if it managed to fetch one).
|
|
4743
|
+
urlFinished = true;
|
|
4744
|
+
|
|
4745
|
+
// Popup capture teardown (opt-in via siteConfig.capture_popups). Each
|
|
4746
|
+
// entry is either the browser.off('targetcreated', ...) deregistration
|
|
4747
|
+
// or a per-popup (clearTimeout + popupPage.close) cleanup. Iterate even
|
|
4748
|
+
// if one fails so the rest still run.
|
|
4749
|
+
if (popupCleanups.length) {
|
|
4750
|
+
for (const cleanup of popupCleanups) {
|
|
4751
|
+
try { cleanup(); } catch (_) {}
|
|
4752
|
+
}
|
|
4753
|
+
popupCleanups.length = 0;
|
|
4754
|
+
}
|
|
4755
|
+
|
|
4212
4756
|
// Disconnect VPN for this site
|
|
4213
4757
|
if (siteConfig.vpn) {
|
|
4214
4758
|
const vpnDown = wgDisconnect(siteConfig, forceDebug);
|
|
4215
4759
|
if (vpnDown.tornDown && forceDebug) {
|
|
4216
|
-
console.log(formatLogMessage('debug',
|
|
4760
|
+
console.log(formatLogMessage('debug', `${VPN_TAG} WireGuard interface torn down for ${currentUrl}`));
|
|
4217
4761
|
}
|
|
4218
4762
|
} else if (siteConfig.openvpn) {
|
|
4219
4763
|
const ovpnDown = ovpnDisconnect(siteConfig, forceDebug);
|
|
4220
4764
|
if (ovpnDown.tornDown && forceDebug) {
|
|
4221
|
-
console.log(formatLogMessage('debug',
|
|
4765
|
+
console.log(formatLogMessage('debug', `${VPN_TAG} OpenVPN connection torn down for ${currentUrl}`));
|
|
4222
4766
|
}
|
|
4223
4767
|
}
|
|
4224
4768
|
|
|
@@ -4300,6 +4844,19 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4300
4844
|
// Sort tasks so proxy groups are contiguous — direct connections first, then each proxy
|
|
4301
4845
|
allTasks.sort((a, b) => proxyKeyFor(a.config).localeCompare(proxyKeyFor(b.config)));
|
|
4302
4846
|
|
|
4847
|
+
// Pre-start local no-auth SOCKS5 relays for any authenticated socks5://
|
|
4848
|
+
// upstreams. Done once here (the only async step) so getProxyArgs stays a
|
|
4849
|
+
// sync lookup in the per-batch browser-launch path. Chromium can't auth
|
|
4850
|
+
// SOCKS5; the relay does the upstream auth transparently.
|
|
4851
|
+
try {
|
|
4852
|
+
const relayCount = await prepareSocksRelays(sites, forceDebug);
|
|
4853
|
+
if (relayCount > 0 && !silentMode) {
|
|
4854
|
+
console.log(messageColors.processing(`Started ${relayCount} SOCKS5 auth relay(s)`));
|
|
4855
|
+
}
|
|
4856
|
+
} catch (relayErr) {
|
|
4857
|
+
console.warn(formatLogMessage('proxy', `SOCKS5 relay setup failed: ${relayErr.message}`));
|
|
4858
|
+
}
|
|
4859
|
+
|
|
4303
4860
|
let results = [];
|
|
4304
4861
|
let processedUrlCount = 0;
|
|
4305
4862
|
let urlsSinceLastCleanup = 0;
|
|
@@ -4320,7 +4877,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4320
4877
|
let lastProcessedCount = 0;
|
|
4321
4878
|
let hangCheckCount = 0;
|
|
4322
4879
|
let forceRestartFlag = false; // Flag to trigger restart on next iteration
|
|
4323
|
-
|
|
4880
|
+
|
|
4881
|
+
// Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
|
|
4882
|
+
// only colors the [severity] tag; the '[HANG CHECK]' substring was
|
|
4883
|
+
// sitting plain inside the message string. Colored once at function
|
|
4884
|
+
// entry so the interval callback doesn't re-colorize per tick.
|
|
4885
|
+
const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
|
|
4886
|
+
|
|
4324
4887
|
const hangDetectionInterval = setInterval(() => {
|
|
4325
4888
|
// Progress check, counter, and forceRestartFlag MUST run regardless of
|
|
4326
4889
|
// debug mode — previously the entire body was gated on forceDebug, which
|
|
@@ -4331,10 +4894,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4331
4894
|
if (processedUrlCount === lastProcessedCount) {
|
|
4332
4895
|
hangCheckCount++;
|
|
4333
4896
|
if (forceDebug) {
|
|
4334
|
-
console.log(formatLogMessage('warn',
|
|
4897
|
+
console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
|
|
4335
4898
|
}
|
|
4336
4899
|
if (hangCheckCount >= 5) {
|
|
4337
|
-
console.log(formatLogMessage('error',
|
|
4900
|
+
console.log(formatLogMessage('error', `${HANG_CHECK_TAG} Hung for 2.5 minutes. Triggering emergency browser restart.`));
|
|
4338
4901
|
forceRestartFlag = true; // Set flag instead of exiting
|
|
4339
4902
|
hangCheckCount = 0; // Reset counter for next cycle
|
|
4340
4903
|
}
|
|
@@ -4347,8 +4910,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4347
4910
|
if (forceDebug) {
|
|
4348
4911
|
const currentBatch = Math.floor(currentBatchInfo.batchStart / RESOURCE_CLEANUP_INTERVAL) + 1;
|
|
4349
4912
|
const totalBatches = Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL);
|
|
4350
|
-
console.log(formatLogMessage('debug',
|
|
4351
|
-
console.log(formatLogMessage('debug',
|
|
4913
|
+
console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} Processed: ${processedUrlCount}/${totalUrls} URLs, Batch: ${currentBatch}/${totalBatches}, Current batch size: ${currentBatchInfo.batchSize}`));
|
|
4914
|
+
console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} URLs since cleanup: ${urlsSinceLastCleanup}, Recent failures: ${results.slice(-3).filter(r => !r.success).length}/3`));
|
|
4352
4915
|
}
|
|
4353
4916
|
}, 30000);
|
|
4354
4917
|
// Don't keep the event loop alive solely for the hang-check interval — the
|
|
@@ -4359,29 +4922,46 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4359
4922
|
// Process URLs in batches with exception handling
|
|
4360
4923
|
let siteGroupIndex = 0;
|
|
4361
4924
|
let currentProxyKey = ''; // Track active proxy config — '' means direct connection
|
|
4925
|
+
// Map of site-config object -> index in sites[], built once. Per-batch
|
|
4926
|
+
// grouping below uses this for O(1) lookup instead of sites.indexOf which
|
|
4927
|
+
// walked the array per task (batch=80 * sites=20 was ~1600 cmps per batch).
|
|
4928
|
+
const configToIndex = new Map();
|
|
4929
|
+
for (let i = 0; i < sites.length; i++) configToIndex.set(sites[i], i);
|
|
4362
4930
|
try {
|
|
4363
4931
|
for (let batchStart = 0; batchStart < totalUrls; batchStart += RESOURCE_CLEANUP_INTERVAL) {
|
|
4364
4932
|
const batchEnd = Math.min(batchStart + RESOURCE_CLEANUP_INTERVAL, totalUrls);
|
|
4365
4933
|
const currentBatch = allTasks.slice(batchStart, batchEnd);
|
|
4366
4934
|
|
|
4367
|
-
|
|
4368
|
-
// Group tasks by their source site configuration for window cleanup
|
|
4935
|
+
|
|
4936
|
+
// Group tasks by their source site configuration for window cleanup.
|
|
4937
|
+
// Single get-or-set replaces has + get + set (one Map lookup not two).
|
|
4938
|
+
// The `?? -1` preserves the old `sites.indexOf` semantics for a task
|
|
4939
|
+
// whose config isn't in sites[] — that case shouldn't happen, but if
|
|
4940
|
+
// it ever does the routing stays identical to the prior code's
|
|
4941
|
+
// 'site_-1' bucket rather than silently shifting to 'site_undefined'.
|
|
4369
4942
|
const tasksBySite = new Map();
|
|
4370
|
-
currentBatch.
|
|
4371
|
-
const
|
|
4372
|
-
|
|
4373
|
-
|
|
4374
|
-
|
|
4375
|
-
|
|
4376
|
-
}
|
|
4943
|
+
for (let i = 0; i < currentBatch.length; i++) {
|
|
4944
|
+
const task = currentBatch[i];
|
|
4945
|
+
const siteKey = `site_${configToIndex.get(task.config) ?? -1}`;
|
|
4946
|
+
let arr = tasksBySite.get(siteKey);
|
|
4947
|
+
if (!arr) tasksBySite.set(siteKey, arr = []);
|
|
4948
|
+
arr.push(task);
|
|
4949
|
+
}
|
|
4377
4950
|
|
|
4378
4951
|
// IMPROVED: Only check health if we have indicators of problems
|
|
4379
4952
|
let healthCheck = { shouldRestart: false, reason: null };
|
|
4380
4953
|
const recentResults = results.slice(-8); // Check more results for better pattern detection
|
|
4381
|
-
|
|
4382
|
-
|
|
4954
|
+
// Single-pass count for both failure rate and critical-error tally —
|
|
4955
|
+
// was two .filter(...).length calls allocating two intermediate arrays.
|
|
4956
|
+
let recentFailures = 0, recentCritical = 0;
|
|
4957
|
+
for (let i = 0; i < recentResults.length; i++) {
|
|
4958
|
+
const r = recentResults[i];
|
|
4959
|
+
if (!r.success) recentFailures++;
|
|
4960
|
+
if (r.needsImmediateRestart) recentCritical++;
|
|
4961
|
+
}
|
|
4962
|
+
const recentFailureRate = recentResults.length > 0 ? recentFailures / recentResults.length : 0;
|
|
4383
4963
|
const hasHighFailureRate = recentFailureRate > 0.75; // 75% failure threshold (more conservative)
|
|
4384
|
-
const hasCriticalErrors =
|
|
4964
|
+
const hasCriticalErrors = recentCritical > 2;
|
|
4385
4965
|
|
|
4386
4966
|
// Only run health checks when we have STRONG indicators of problems
|
|
4387
4967
|
if (urlsSinceLastCleanup > 15 && (
|
|
@@ -4390,15 +4970,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4390
4970
|
urlsSinceLastCleanup > RESOURCE_CLEANUP_INTERVAL * 0.9 // Very close to cleanup limit
|
|
4391
4971
|
)) {
|
|
4392
4972
|
try {
|
|
4973
|
+
// Race the health check against a 30s timeout. Attach .catch on the
|
|
4974
|
+
// health promise itself so that if the timeout wins, the still-running
|
|
4975
|
+
// monitorBrowserHealth's eventual rejection doesn't surface as an
|
|
4976
|
+
// unhandledRejection warning.
|
|
4977
|
+
const healthPromise = monitorBrowserHealth(browser, {}, {
|
|
4978
|
+
siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
|
|
4979
|
+
totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
|
|
4980
|
+
urlsSinceCleanup: urlsSinceLastCleanup,
|
|
4981
|
+
cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
|
|
4982
|
+
forceDebug,
|
|
4983
|
+
silentMode
|
|
4984
|
+
});
|
|
4985
|
+
healthPromise.catch(() => {});
|
|
4393
4986
|
healthCheck = await Promise.race([
|
|
4394
|
-
|
|
4395
|
-
siteIndex: Math.floor(batchStart / RESOURCE_CLEANUP_INTERVAL),
|
|
4396
|
-
totalSites: Math.ceil(totalUrls / RESOURCE_CLEANUP_INTERVAL),
|
|
4397
|
-
urlsSinceCleanup: urlsSinceLastCleanup,
|
|
4398
|
-
cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
|
|
4399
|
-
forceDebug,
|
|
4400
|
-
silentMode
|
|
4401
|
-
}),
|
|
4987
|
+
healthPromise,
|
|
4402
4988
|
new Promise((_, reject) => setTimeout(() => reject(new Error('Health check timeout')), 30000))
|
|
4403
4989
|
]);
|
|
4404
4990
|
} catch (healthError) {
|
|
@@ -4427,8 +5013,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4427
5013
|
// timeout) bypasses the urlsSinceLastCleanup > 8 gate — a confirmed hang
|
|
4428
5014
|
// needs immediate restart even if we just cleaned up. Proactive triggers
|
|
4429
5015
|
// keep the gate to prevent thrashing.
|
|
5016
|
+
//
|
|
5017
|
+
// hasHighFailureRate is computed (and still used for the health-check
|
|
5018
|
+
// gate above) but intentionally NOT folded into proactiveRestart:
|
|
5019
|
+
// wouldExceedLimit is always true at every batch boundary with the
|
|
5020
|
+
// default RESOURCE_CLEANUP_INTERVAL == batch size, so the high-failure-
|
|
5021
|
+
// rate branch was dead code reached only at the same boundary that
|
|
5022
|
+
// wouldExceedLimit already triggers. If failure-rate ever needs to
|
|
5023
|
+
// interrupt mid-cleanup-interval, that requires interrupting the
|
|
5024
|
+
// running Promise.all — a real behavior change, not an OR addition.
|
|
4430
5025
|
const hangRecoveryRestart = forceRestartFlag;
|
|
4431
|
-
const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth
|
|
5026
|
+
const proactiveRestart = (wouldExceedLimit || shouldRestartFromHealth) && urlsSinceLastCleanup > 8;
|
|
4432
5027
|
if ((hangRecoveryRestart || proactiveRestart) && isNotLastBatch) {
|
|
4433
5028
|
let restartReason = 'Unknown';
|
|
4434
5029
|
if (forceRestartFlag) {
|
|
@@ -4436,8 +5031,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4436
5031
|
forceRestartFlag = false; // Reset the flag
|
|
4437
5032
|
} else if (shouldRestartFromHealth) {
|
|
4438
5033
|
restartReason = healthCheck.reason;
|
|
4439
|
-
} else if (hasHighFailureRate) {
|
|
4440
|
-
restartReason = `High failure rate: ${Math.round(recentFailureRate * 100)}% in recent batch`;
|
|
4441
5034
|
} else if (wouldExceedLimit) {
|
|
4442
5035
|
restartReason = `Processed ${urlsSinceLastCleanup} URLs (scheduled maintenance)`;
|
|
4443
5036
|
}
|
|
@@ -4452,7 +5045,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4452
5045
|
if (requestCacheStats.enabled && requestCacheStats.size > 0) {
|
|
4453
5046
|
const clearedCount = smartCache.clearRequestCache();
|
|
4454
5047
|
if (forceDebug) {
|
|
4455
|
-
console.log(formatLogMessage('debug',
|
|
5048
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during browser restart`));
|
|
4456
5049
|
}
|
|
4457
5050
|
}
|
|
4458
5051
|
}
|
|
@@ -4467,24 +5060,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4467
5060
|
});
|
|
4468
5061
|
|
|
4469
5062
|
// Clean up the specific user data directory
|
|
4470
|
-
if (userDataDir
|
|
4471
|
-
fs.rmSync(userDataDir, { recursive: true, force: true });
|
|
4472
|
-
if (forceDebug) console.log(formatLogMessage('debug', `Cleaned user data dir: ${userDataDir}`));
|
|
4473
|
-
}
|
|
5063
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4474
5064
|
|
|
4475
5065
|
// Additional cleanup for any remaining Chrome processes
|
|
4476
5066
|
if (removeTempFiles) {
|
|
4477
|
-
await cleanupChromeTempFiles({
|
|
4478
|
-
includeSnapTemp: true,
|
|
5067
|
+
await cleanupChromeTempFiles({
|
|
5068
|
+
includeSnapTemp: true,
|
|
4479
5069
|
forceDebug,
|
|
4480
|
-
comprehensive: true
|
|
5070
|
+
comprehensive: true
|
|
4481
5071
|
});
|
|
4482
5072
|
}
|
|
4483
5073
|
|
|
4484
5074
|
} catch (browserCloseErr) {
|
|
4485
5075
|
if (forceDebug) console.log(formatLogMessage('debug', `Browser cleanup warning: ${browserCloseErr.message}`));
|
|
4486
5076
|
}
|
|
4487
|
-
|
|
5077
|
+
|
|
4488
5078
|
// Create new browser for next batch (preserve current proxy config)
|
|
4489
5079
|
const restartProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
|
|
4490
5080
|
browser = await createBrowser(restartProxyArgs);
|
|
@@ -4492,7 +5082,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4492
5082
|
|
|
4493
5083
|
// Reset cleanup counter and add delay
|
|
4494
5084
|
urlsSinceLastCleanup = 0;
|
|
4495
|
-
purgeStaleTrackers();
|
|
4496
5085
|
await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
|
|
4497
5086
|
}
|
|
4498
5087
|
|
|
@@ -4512,9 +5101,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4512
5101
|
forceDebug, timeout: 10000, exitOnFailure: false,
|
|
4513
5102
|
cleanTempFiles: true, comprehensiveCleanup: removeTempFiles
|
|
4514
5103
|
});
|
|
4515
|
-
if (userDataDir
|
|
4516
|
-
fs.rmSync(userDataDir, { recursive: true, force: true });
|
|
4517
|
-
}
|
|
5104
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4518
5105
|
} catch (proxyRestartErr) {
|
|
4519
5106
|
if (forceDebug) console.log(formatLogMessage('debug', `Proxy switch browser cleanup: ${proxyRestartErr.message}`));
|
|
4520
5107
|
}
|
|
@@ -4526,8 +5113,8 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4526
5113
|
const health = await testProxy(currentBatch[0].config, 5000);
|
|
4527
5114
|
if (!health.reachable) {
|
|
4528
5115
|
const info = getProxyInfo(currentBatch[0].config);
|
|
4529
|
-
console.error(formatLogMessage('error',
|
|
4530
|
-
console.error(formatLogMessage('error',
|
|
5116
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Unreachable: ${info} — ${health.error}`));
|
|
5117
|
+
console.error(formatLogMessage('error', `${PROXY_TAG} Skipping ${currentBatch.length} URL(s) in this batch`));
|
|
4531
5118
|
const skipResults = currentBatch.map(task => ({
|
|
4532
5119
|
success: false, url: task.url, rules: [],
|
|
4533
5120
|
error: `Proxy unreachable: ${health.error}`
|
|
@@ -4545,7 +5132,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4545
5132
|
browser = await createBrowser(proxyArgs);
|
|
4546
5133
|
currentProxyKey = batchProxyKey;
|
|
4547
5134
|
urlsSinceLastCleanup = 0;
|
|
4548
|
-
purgeStaleTrackers();
|
|
4549
5135
|
await fastTimeout(TIMEOUTS.BROWSER_STABILIZE_DELAY);
|
|
4550
5136
|
}
|
|
4551
5137
|
|
|
@@ -4555,7 +5141,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4555
5141
|
|
|
4556
5142
|
// Log start of concurrent processing for hang detection
|
|
4557
5143
|
if (forceDebug) {
|
|
4558
|
-
console.log(formatLogMessage('debug',
|
|
5144
|
+
console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Starting ${batchSize} concurrent tasks with limit ${MAX_CONCURRENT_SITES}`));
|
|
4559
5145
|
}
|
|
4560
5146
|
|
|
4561
5147
|
// Create tasks with timeout protection — skip domains that repeatedly timed out.
|
|
@@ -4567,7 +5153,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4567
5153
|
try {
|
|
4568
5154
|
// Short-circuit queued URLs once any URL in this batch has triggered a
|
|
4569
5155
|
// restart. Without this, the 80-URL batch in the user's hang trace
|
|
4570
|
-
// would have to fail one-by-one at
|
|
5156
|
+
// would have to fail one-by-one at 75s each (~25 min total) before
|
|
4571
5157
|
// the boundary restart could fire. Now: first hang fires the flag,
|
|
4572
5158
|
// remaining queued URLs return immediately, batch completes, restart.
|
|
4573
5159
|
if (forceRestartFlag) {
|
|
@@ -4580,25 +5166,111 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4580
5166
|
if (!silentMode) console.log(formatLogMessage('info', `Skipping ${task.url} — ${taskDomain} timed out ${DOMAIN_TIMEOUT_THRESHOLD} times`));
|
|
4581
5167
|
return { url: task.url, rules: [], success: false, error: 'Domain repeatedly timed out', skipped: true };
|
|
4582
5168
|
}
|
|
5169
|
+
|
|
5170
|
+
// DNS pre-check — fails fast on NXDOMAIN/unresolvable hosts before
|
|
5171
|
+
// we pay ~5-15s for Puppeteer navigation + Cloudflare detection.
|
|
5172
|
+
// Skips IP literals. Respects an in-memory negative cache so a dead
|
|
5173
|
+
// host hit by many URL paths only costs one DNS round-trip per TTL.
|
|
5174
|
+
//
|
|
5175
|
+
// Uses dns.resolve* (c-ares, async network I/O) NOT dns.lookup
|
|
5176
|
+
// (getaddrinfo, libuv threadpool). Under scan concurrency Puppeteer
|
|
5177
|
+
// saturates the default 4-slot threadpool with filesystem I/O, so
|
|
5178
|
+
// dns.lookup calls sit queued and blow the timeout while never
|
|
5179
|
+
// actually starting — wrongly skipping live domains. c-ares isn't
|
|
5180
|
+
// threadpool-bound so it's immune to that contention.
|
|
5181
|
+
if (dnsPrecheckEnabled && taskDomain && !/^[\d.:]+$|^\[/.test(taskDomain)) {
|
|
5182
|
+
const cached = dnsNegativeCache.get(taskDomain);
|
|
5183
|
+
if (cached && Date.now() - cached.timestamp < DNS_NEGATIVE_CACHE_TTL_MS) {
|
|
5184
|
+
dnsPrecheckSkips++;
|
|
5185
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check (cached): ${taskDomain} — ${cached.error}`));
|
|
5186
|
+
return { url: task.url, rules: [], success: false, error: `DNS: ${cached.error}`, skipped: true };
|
|
5187
|
+
}
|
|
5188
|
+
// Positive-resolution shortcut: dig or whois has already proven this
|
|
5189
|
+
// hostname live within their 20h cache TTL (populated either by an
|
|
5190
|
+
// earlier URL this run or by --dns-cache disk-load from a prior run).
|
|
5191
|
+
// Order matters -- negative cache (5min TTL, fresher data) wins
|
|
5192
|
+
// first, then this 20h-TTL positive index, then the actual resolve.
|
|
5193
|
+
if (domainKnownToResolve(taskDomain)) {
|
|
5194
|
+
dnsPositiveSkips++;
|
|
5195
|
+
dnsPositiveSkippedHosts.add(taskDomain);
|
|
5196
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
|
|
5197
|
+
// Fall through to navigation -- pre-check "passed" by proxy.
|
|
5198
|
+
} else {
|
|
5199
|
+
const dnsResolve = async () => {
|
|
5200
|
+
// resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
|
|
5201
|
+
// resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
|
|
5202
|
+
// error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
|
|
5203
|
+
// unchanged so the outer transient-retry path sees the real
|
|
5204
|
+
// resolver code and the negative cache records the right reason.
|
|
5205
|
+
// Previously a bare .catch swallowed everything and tried
|
|
5206
|
+
// resolve6, which masked transient v4-side errors behind
|
|
5207
|
+
// whatever resolve6 ended up reporting.
|
|
5208
|
+
// 2s timeout kept as a real safety net — with c-ares off the
|
|
5209
|
+
// threadpool it should now rarely fire.
|
|
5210
|
+
let timer;
|
|
5211
|
+
try {
|
|
5212
|
+
const timeoutP = new Promise((_, reject) => {
|
|
5213
|
+
timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
|
|
5214
|
+
});
|
|
5215
|
+
const resolveChain = dnsPromises.resolve4(taskDomain)
|
|
5216
|
+
.catch(err => {
|
|
5217
|
+
if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
|
|
5218
|
+
return dnsPromises.resolve6(taskDomain);
|
|
5219
|
+
}
|
|
5220
|
+
throw err;
|
|
5221
|
+
});
|
|
5222
|
+
await Promise.race([resolveChain, timeoutP]);
|
|
5223
|
+
} finally {
|
|
5224
|
+
if (timer) clearTimeout(timer);
|
|
5225
|
+
}
|
|
5226
|
+
};
|
|
5227
|
+
// c-ares transient codes — retry once so a momentary resolver
|
|
5228
|
+
// hiccup doesn't poison the negative cache for 5 minutes.
|
|
5229
|
+
// DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
|
|
5230
|
+
try {
|
|
5231
|
+
try {
|
|
5232
|
+
await dnsResolve();
|
|
5233
|
+
} catch (firstErr) {
|
|
5234
|
+
const code = firstErr && firstErr.code;
|
|
5235
|
+
if (DNS_TRANSIENT_ERRORS.has(code) || (firstErr && firstErr.message === 'DNS timeout')) {
|
|
5236
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
|
|
5237
|
+
await dnsResolve();
|
|
5238
|
+
} else {
|
|
5239
|
+
throw firstErr;
|
|
5240
|
+
}
|
|
5241
|
+
}
|
|
5242
|
+
} catch (dnsErr) {
|
|
5243
|
+
const errCode = dnsErr.code || dnsErr.message || 'DNS resolve failed';
|
|
5244
|
+
dnsNegativeCacheSet(taskDomain, errCode);
|
|
5245
|
+
dnsPrecheckSkips++;
|
|
5246
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
|
|
5247
|
+
return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
|
|
5248
|
+
}
|
|
5249
|
+
} // close `else` from domainKnownToResolve shortcut above
|
|
5250
|
+
}
|
|
4583
5251
|
} catch {}
|
|
4584
5252
|
|
|
4585
5253
|
// Per-URL timeout so a single hung processUrl can't block the batch
|
|
4586
|
-
// forever.
|
|
4587
|
-
// adaptive
|
|
5254
|
+
// forever. 75s sits comfortably above the realistic legit-page ceiling
|
|
5255
|
+
// (nav 35s + Cloudflare adaptive ~25s + interaction ~10s + network-idle
|
|
5256
|
+
// wait ~10s ≈ ~70s), well short of the old 120s safety net. Cuts
|
|
5257
|
+
// hang-recovery time roughly in half when an entire batch's URLs all
|
|
5258
|
+
// hang and we're waiting on this timeout to advance processedUrlCount.
|
|
5259
|
+
const PER_URL_TIMEOUT_MS = 75000;
|
|
4588
5260
|
const processUrlPromise = processUrl(task.url, task.config, browser);
|
|
4589
5261
|
let perUrlTimer;
|
|
4590
5262
|
try {
|
|
4591
5263
|
return await Promise.race([
|
|
4592
5264
|
processUrlPromise,
|
|
4593
5265
|
new Promise((_, reject) => {
|
|
4594
|
-
perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (
|
|
5266
|
+
perUrlTimer = setTimeout(() => reject(new Error('Per-URL timeout (75s)')), PER_URL_TIMEOUT_MS);
|
|
4595
5267
|
})
|
|
4596
5268
|
]);
|
|
4597
5269
|
} catch (err) {
|
|
4598
|
-
if (err && err.message === 'Per-URL timeout (
|
|
5270
|
+
if (err && err.message === 'Per-URL timeout (75s)') {
|
|
4599
5271
|
processUrlPromise.catch(() => {});
|
|
4600
5272
|
forceRestartFlag = true;
|
|
4601
|
-
return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (
|
|
5273
|
+
return { url: task.url, rules: [], success: false, error: 'Per-URL timeout (75s)', needsImmediateRestart: true };
|
|
4602
5274
|
}
|
|
4603
5275
|
throw err;
|
|
4604
5276
|
} finally {
|
|
@@ -4614,21 +5286,29 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4614
5286
|
|
|
4615
5287
|
let batchResults;
|
|
4616
5288
|
try {
|
|
5289
|
+
// Same orphan-promise pattern as the health-check race above: if the
|
|
5290
|
+
// 10-min batch timeout wins, the still-running Promise.all keeps going
|
|
5291
|
+
// until every batchTask settles. Each individual task is already wrapped
|
|
5292
|
+
// in p-limit's error handling so unhandled rejections should not surface,
|
|
5293
|
+
// but the .catch is free belt-and-braces against future refactors that
|
|
5294
|
+
// change task internals.
|
|
5295
|
+
const batchPromise = Promise.all(batchTasks);
|
|
5296
|
+
batchPromise.catch(() => {});
|
|
4617
5297
|
batchResults = await Promise.race([
|
|
4618
|
-
|
|
4619
|
-
new Promise((_, reject) =>
|
|
5298
|
+
batchPromise,
|
|
5299
|
+
new Promise((_, reject) =>
|
|
4620
5300
|
setTimeout(() => reject(new Error('Batch timeout')), 600000) // 10 min timeout
|
|
4621
5301
|
)
|
|
4622
5302
|
]);
|
|
4623
5303
|
} catch (timeoutError) {
|
|
4624
5304
|
if (timeoutError.message.includes('timeout')) {
|
|
4625
|
-
console.log(formatLogMessage('error',
|
|
5305
|
+
console.log(formatLogMessage('error', `${TIMEOUT_TAG} Batch hung. Restarting browser.`));
|
|
4626
5306
|
try {
|
|
4627
5307
|
await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false });
|
|
5308
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4628
5309
|
const timeoutProxyArgs = currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : [];
|
|
4629
5310
|
browser = await createBrowser(timeoutProxyArgs);
|
|
4630
5311
|
urlsSinceLastCleanup = 0;
|
|
4631
|
-
purgeStaleTrackers();
|
|
4632
5312
|
} catch (restartErr) {
|
|
4633
5313
|
throw restartErr;
|
|
4634
5314
|
}
|
|
@@ -4665,7 +5345,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4665
5345
|
|
|
4666
5346
|
// Log completion of concurrent processing
|
|
4667
5347
|
if (forceDebug) {
|
|
4668
|
-
console.log(formatLogMessage('debug',
|
|
5348
|
+
console.log(formatLogMessage('debug', `${CONCURRENCY_TAG} Completed ${batchSize} concurrent tasks, ${batchResults.filter(r => r.success).length} successful`));
|
|
4669
5349
|
}
|
|
4670
5350
|
|
|
4671
5351
|
// Enhanced error reporting for Puppeteer 23.x
|
|
@@ -4727,7 +5407,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4727
5407
|
if (requestCacheStats.enabled && requestCacheStats.size > 0) {
|
|
4728
5408
|
const clearedCount = smartCache.clearRequestCache();
|
|
4729
5409
|
if (forceDebug) {
|
|
4730
|
-
console.log(formatLogMessage('debug',
|
|
5410
|
+
console.log(formatLogMessage('debug', `${SMART_CACHE_TAG} Cleared ${clearedCount} request cache entries during emergency restart`));
|
|
4731
5411
|
}
|
|
4732
5412
|
}
|
|
4733
5413
|
}
|
|
@@ -4748,17 +5428,23 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4748
5428
|
}
|
|
4749
5429
|
|
|
4750
5430
|
await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true, comprehensiveCleanup: removeTempFiles });
|
|
5431
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4751
5432
|
// Additional cleanup after emergency restart
|
|
4752
5433
|
if (removeTempFiles) {
|
|
4753
|
-
await cleanupChromeTempFiles({
|
|
4754
|
-
includeSnapTemp: true,
|
|
5434
|
+
await cleanupChromeTempFiles({
|
|
5435
|
+
includeSnapTemp: true,
|
|
4755
5436
|
forceDebug,
|
|
4756
|
-
comprehensive: true
|
|
5437
|
+
comprehensive: true
|
|
4757
5438
|
});
|
|
4758
5439
|
}
|
|
4759
5440
|
browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
|
|
4760
5441
|
urlsSinceLastCleanup = 0; // Reset counter
|
|
4761
|
-
|
|
5442
|
+
// Reset the hang-detection flag too: this restart path is triggered
|
|
5443
|
+
// by needsImmediateRestart errors, which the per-URL 75s timeout
|
|
5444
|
+
// sets in lockstep with forceRestartFlag. Without this reset, the
|
|
5445
|
+
// hang-fallback restart below would fire a SECOND back-to-back
|
|
5446
|
+
// browser restart on the same batch boundary.
|
|
5447
|
+
forceRestartFlag = false;
|
|
4762
5448
|
await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY); // Give browser time to stabilize
|
|
4763
5449
|
} catch (emergencyRestartErr) {
|
|
4764
5450
|
if (forceDebug) console.log(formatLogMessage('debug', `Emergency restart failed: ${emergencyRestartErr.message}`));
|
|
@@ -4769,9 +5455,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4769
5455
|
console.log(`\n${messageColors.fileOp('🔄 Emergency hang detection restart:')} Browser appears hung, forcing restart`);
|
|
4770
5456
|
try {
|
|
4771
5457
|
await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true });
|
|
5458
|
+
if (userDataDir) await cleanupUserDataDir(userDataDir, forceDebug);
|
|
4772
5459
|
browser = await createBrowser(currentProxyKey ? getProxyArgs(currentBatch[0].config, forceDebug) : []);
|
|
4773
5460
|
urlsSinceLastCleanup = 0;
|
|
4774
|
-
purgeStaleTrackers();
|
|
4775
5461
|
forceRestartFlag = false; // Reset flag
|
|
4776
5462
|
await fastTimeout(TIMEOUTS.EMERGENCY_RESTART_DELAY);
|
|
4777
5463
|
if (forceDebug) console.log(formatLogMessage('debug', `Emergency hang detection restart completed`));
|
|
@@ -4820,11 +5506,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4820
5506
|
if (requestCacheStats.enabled && requestCacheStats.size > 0) {
|
|
4821
5507
|
const clearedCount = smartCache.clearRequestCache();
|
|
4822
5508
|
if (!silentMode && clearedCount > 0) {
|
|
4823
|
-
console.log(`\n
|
|
5509
|
+
console.log(`\n${messageColors.cleanup(`🗑️ Cleared request cache: ${clearedCount} entries after JSON processing`)}`);
|
|
4824
5510
|
}
|
|
4825
5511
|
if (forceDebug) {
|
|
4826
5512
|
console.log(formatLogMessage('debug',
|
|
4827
|
-
|
|
5513
|
+
`${SMART_CACHE_TAG} Request cache cleared after JSON scan completion (hit rate: ${requestCacheStats.hitRate})`
|
|
4828
5514
|
));
|
|
4829
5515
|
}
|
|
4830
5516
|
}
|
|
@@ -4896,6 +5582,43 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4896
5582
|
if (cloudflareScanStats.errorPages > 0) {
|
|
4897
5583
|
console.log(formatLogMessage('debug', `Cloudflare 5xx origin-error pages: ${cloudflareScanStats.errorPages} (no bypass possible — origin unreachable)`));
|
|
4898
5584
|
}
|
|
5585
|
+
if (dnsPrecheckEnabled && (dnsPrecheckSkips > 0 || dnsPositiveSkips > 0)) {
|
|
5586
|
+
// Two skip mechanisms, each with its own counter + unique-host count:
|
|
5587
|
+
// - dnsPrecheckSkips: URLs short-circuited via the NXDOMAIN-cache
|
|
5588
|
+
// (dnsNegativeCache). Unique-host count = dnsNegativeCache.size.
|
|
5589
|
+
// - dnsPositiveSkips: URLs short-circuited via dig/whois cache
|
|
5590
|
+
// proof of resolution (knownResolvedHostnames index in nettools).
|
|
5591
|
+
// Unique-host count = dnsPositiveSkippedHosts.size (this Set is
|
|
5592
|
+
// populated only on actual skip events, not on every Set add in
|
|
5593
|
+
// nettools, so it's a true per-scan visibility metric).
|
|
5594
|
+
const parts = [];
|
|
5595
|
+
if (dnsPrecheckSkips > 0) {
|
|
5596
|
+
parts.push(`${dnsPrecheckSkips} URL(s) via ${dnsNegativeCache.size} unresolvable host(s)`);
|
|
5597
|
+
}
|
|
5598
|
+
if (dnsPositiveSkips > 0) {
|
|
5599
|
+
parts.push(`${dnsPositiveSkips} URL(s) via ${dnsPositiveSkippedHosts.size} resolved host(s)`);
|
|
5600
|
+
}
|
|
5601
|
+
console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
|
|
5602
|
+
}
|
|
5603
|
+
// Blocked-pattern hit stats. Surfaces which patterns are actually
|
|
5604
|
+
// doing work this scan and (by absence) which are stale enough to
|
|
5605
|
+
// prune from config. Top 10 by hit count to keep the log scannable
|
|
5606
|
+
// on configs with dozens of patterns; full counts available via
|
|
5607
|
+
// _blockedPatternHits if needed for tooling. Fires only when at
|
|
5608
|
+
// least one pattern matched -- silent on scans with no blocks.
|
|
5609
|
+
if (_blockedPatternHits.size > 0) {
|
|
5610
|
+
let totalBlocks = 0;
|
|
5611
|
+
for (const n of _blockedPatternHits.values()) totalBlocks += n;
|
|
5612
|
+
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${_blockedPatternHits.size} pattern(s) hit ${totalBlocks} time(s) total`));
|
|
5613
|
+
const sorted = [..._blockedPatternHits.entries()].sort((a, b) => b[1] - a[1]);
|
|
5614
|
+
const top = sorted.slice(0, 10);
|
|
5615
|
+
for (const [pattern, hits] of top) {
|
|
5616
|
+
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ${hits.toString().padStart(6)} × ${pattern}`));
|
|
5617
|
+
}
|
|
5618
|
+
if (sorted.length > top.length) {
|
|
5619
|
+
console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked-stats]')} ... and ${sorted.length - top.length} more pattern(s)`));
|
|
5620
|
+
}
|
|
5621
|
+
}
|
|
4899
5622
|
// Log smart cache statistics (if cache is enabled)
|
|
4900
5623
|
// Adblock statistics
|
|
4901
5624
|
if (adblockEnabled) {
|
|
@@ -5112,7 +5835,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5112
5835
|
try { cleanupCloudflareCache(); } catch (_) {}
|
|
5113
5836
|
try { wgDisconnectAll(forceDebug); } catch (_) {}
|
|
5114
5837
|
try { ovpnDisconnectAll(forceDebug); } catch (_) {}
|
|
5115
|
-
try {
|
|
5838
|
+
try { await closeAllSocksRelays(forceDebug); } catch (_) {}
|
|
5116
5839
|
|
|
5117
5840
|
// Clean process termination
|
|
5118
5841
|
if (forceDebug) console.log(formatLogMessage('debug', `About to exit process...`));
|