@fanboynz/network-scanner 3.1.2 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -1
- package/CLAUDE.md +2 -1
- package/README.md +33 -5
- package/eslint.config.mjs +13 -1
- package/lib/browserhealth.js +25 -3
- package/lib/dns.js +238 -0
- package/lib/domain-cache.js +14 -127
- package/lib/ghost-cursor.js +29 -11
- package/lib/interaction.js +4 -0
- package/lib/nettools.js +154 -51
- package/lib/output.js +24 -13
- package/lib/redirect.js +4 -1
- package/lib/validate_rules.js +16 -1
- package/nwss.1 +76 -15
- package/nwss.js +356 -105
- package/package.json +1 -1
package/nwss.js
CHANGED
|
@@ -9,9 +9,9 @@ const fs = require('fs');
|
|
|
9
9
|
const os = require('os');
|
|
10
10
|
const psl = require('psl');
|
|
11
11
|
const path = require('path');
|
|
12
|
-
const
|
|
12
|
+
const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError } = require('./lib/dns');
|
|
13
13
|
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
14
|
-
const { compressMultipleFiles
|
|
14
|
+
const { compressMultipleFiles } = require('./lib/compress');
|
|
15
15
|
const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
|
|
16
16
|
const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROME_GREASE_BRAND } = require('./lib/fingerprint');
|
|
17
17
|
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
|
|
@@ -34,9 +34,7 @@ const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore
|
|
|
34
34
|
// Graceful exit
|
|
35
35
|
const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
|
|
36
36
|
// Whois & Dig
|
|
37
|
-
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
|
|
38
|
-
// File compare
|
|
39
|
-
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
|
|
37
|
+
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve, loadDiskCache, saveDiskCache, setDigResolvers } = require('./lib/nettools');
|
|
40
38
|
// CDP functionality
|
|
41
39
|
const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeout } = require('./lib/cdp');
|
|
42
40
|
// Post-processing cleanup
|
|
@@ -68,14 +66,14 @@ const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
|
68
66
|
// Enhanced mouse interaction and page simulation
|
|
69
67
|
const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
70
68
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
71
|
-
const {
|
|
69
|
+
const { createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
|
|
72
70
|
// Domain detection cache for performance optimization
|
|
73
|
-
const { createGlobalHelpers,
|
|
71
|
+
const { createGlobalHelpers, getDetectedDomainsCount } = require('./lib/domain-cache');
|
|
74
72
|
const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
|
|
75
73
|
const { clearPersistentCache } = require('./lib/smart-cache');
|
|
76
74
|
const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
|
|
77
75
|
// Dry run functionality
|
|
78
|
-
const { initializeDryRunCollections, addDryRunMatch,
|
|
76
|
+
const { initializeDryRunCollections, addDryRunMatch, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
|
|
79
77
|
// Enhanced site data clearing functionality
|
|
80
78
|
const { clearSiteData } = require('./lib/clear_sitedata');
|
|
81
79
|
// Referrer header generation
|
|
@@ -137,6 +135,7 @@ const CONCURRENCY_LIMITS = Object.freeze({
|
|
|
137
135
|
// Keep using the imported map directly so the two can never diverge again.
|
|
138
136
|
|
|
139
137
|
const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
|
|
138
|
+
const REALTIME_CLEANUP_BUFFER_MS = 25000; // Buffer added after site delay before realtime window cleanup
|
|
140
139
|
|
|
141
140
|
/**
|
|
142
141
|
* Detects the installed Puppeteer version dynamically
|
|
@@ -181,7 +180,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
|
|
|
181
180
|
// purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
|
|
182
181
|
// and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
|
|
183
182
|
// automatically — manual purging is no longer needed.
|
|
184
|
-
const { monitorBrowserHealth,
|
|
183
|
+
const { monitorBrowserHealth, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
|
|
185
184
|
|
|
186
185
|
// --- Script Configuration & Constants ---
|
|
187
186
|
const VERSION = '2.0.33'; // Script version
|
|
@@ -191,7 +190,12 @@ const startTime = Date.now();
|
|
|
191
190
|
|
|
192
191
|
// Initialize domain cache helpers with debug logging if enabled
|
|
193
192
|
const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
|
|
194
|
-
|
|
193
|
+
// Only markDomainAsDetected is used — the global cache feeds the end-of-scan
|
|
194
|
+
// "unique domains cached" stat (getDetectedDomainsCount). The skip-check
|
|
195
|
+
// (isDomainAlreadyDetected) is intentionally not wired in: cross-URL dedup is
|
|
196
|
+
// already handled by nettools' global processed-domain sets, smart-cache, and
|
|
197
|
+
// the per-URL local set, so a cache-level skip would be redundant.
|
|
198
|
+
const { markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
|
|
195
199
|
|
|
196
200
|
// Smart cache will be initialized after config is loaded
|
|
197
201
|
let smartCache = null;
|
|
@@ -232,6 +236,9 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
232
236
|
const settingsMap = {
|
|
233
237
|
output: ['-o', '--output'],
|
|
234
238
|
max_concurrent: ['--max-concurrent'],
|
|
239
|
+
cleanup_interval: ['--cleanup-interval'],
|
|
240
|
+
resource_cleanup_interval: ['--cleanup-interval'],
|
|
241
|
+
dns: ['--dns'],
|
|
235
242
|
dns_cache: ['--dns-cache'],
|
|
236
243
|
cache_requests: ['--cache-requests'],
|
|
237
244
|
dumpurls: ['--dumpurls'],
|
|
@@ -243,20 +250,25 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
243
250
|
compress_logs: ['--compress-logs'],
|
|
244
251
|
debug: ['--debug'],
|
|
245
252
|
silent: ['--silent'],
|
|
246
|
-
verbose: ['--verbose'],
|
|
247
253
|
headful: ['--headful'],
|
|
248
254
|
keep_open: ['--keep-open'],
|
|
249
255
|
dry_run: ['--dry-run'],
|
|
250
256
|
titles: ['--titles'],
|
|
251
257
|
sub_domains: ['--sub-domains'],
|
|
252
258
|
no_interact: ['--no-interact'],
|
|
259
|
+
show_dead_domains: ['--show-dead-domains'],
|
|
253
260
|
ghost_cursor: ['--ghost-cursor'],
|
|
254
261
|
plain: ['--plain'],
|
|
255
262
|
cdp: ['--cdp'],
|
|
256
263
|
dnsmasq: ['--dnsmasq'],
|
|
264
|
+
dnsmasq_old: ['--dnsmasq-old'],
|
|
257
265
|
unbound: ['--unbound'],
|
|
258
266
|
privoxy: ['--privoxy'],
|
|
259
267
|
pihole: ['--pihole'],
|
|
268
|
+
adblock_rules: ['--adblock-rules'],
|
|
269
|
+
no_dns_precheck: ['--no-dns-precheck'],
|
|
270
|
+
allow_fullscreen: ['--allow-fullscreen'],
|
|
271
|
+
load_extension: ['--load-extension'],
|
|
260
272
|
eval_on_doc: ['--eval-on-doc'],
|
|
261
273
|
use_puppeteer_core: ['--use-puppeteer-core'],
|
|
262
274
|
ignore_cache: ['--ignore-cache'],
|
|
@@ -314,7 +326,6 @@ if (compareIndex !== -1 && args[compareIndex + 1]) {
|
|
|
314
326
|
}
|
|
315
327
|
|
|
316
328
|
|
|
317
|
-
const forceVerbose = args.includes('--verbose');
|
|
318
329
|
const forceDebug = args.includes('--debug');
|
|
319
330
|
const silentMode = args.includes('--silent');
|
|
320
331
|
const showTitles = args.includes('--titles');
|
|
@@ -337,12 +348,16 @@ const disableInteract = args.includes('--no-interact');
|
|
|
337
348
|
const globalGhostCursor = args.includes('--ghost-cursor');
|
|
338
349
|
const plainOutput = args.includes('--plain');
|
|
339
350
|
const enableCDP = args.includes('--cdp');
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
const
|
|
351
|
+
// These six are reassigned to false by the incompatible-flag validation
|
|
352
|
+
// blocks below (e.g. --dnsmasq + --unbound), so they must be `let` — as
|
|
353
|
+
// `const` that fallback threw "Assignment to constant variable" the moment
|
|
354
|
+
// two conflicting output modes were combined.
|
|
355
|
+
let dnsmasqMode = args.includes('--dnsmasq');
|
|
356
|
+
let dnsmasqOldMode = args.includes('--dnsmasq-old');
|
|
357
|
+
let unboundMode = args.includes('--unbound');
|
|
343
358
|
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
|
|
344
|
-
|
|
345
|
-
|
|
359
|
+
let privoxyMode = args.includes('--privoxy');
|
|
360
|
+
let piholeMode = args.includes('--pihole');
|
|
346
361
|
const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
|
|
347
362
|
const dryRunMode = args.includes('--dry-run');
|
|
348
363
|
const compressLogs = args.includes('--compress-logs');
|
|
@@ -363,6 +378,21 @@ if (dnsCacheMode) enableDiskCache();
|
|
|
363
378
|
const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
|
|
364
379
|
const dnsPrecheckTimeoutMs = 2000;
|
|
365
380
|
|
|
381
|
+
// --show-dead-domains: collect hostnames that are definitively DEAD (do not
|
|
382
|
+
// exist / unreachable) and print them at the end of the scan so they can be
|
|
383
|
+
// pruned. Only hard signals count — NXDOMAIN/ENODATA from the pre-check and
|
|
384
|
+
// ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE from navigation. Transient
|
|
385
|
+
// failures (403/429 blocks, timeouts, Cloudflare challenges) mean the domain is
|
|
386
|
+
// ALIVE and are deliberately excluded. host -> reason (first seen).
|
|
387
|
+
const showDeadDomains = args.includes('--show-dead-domains');
|
|
388
|
+
const _deadDomains = new Map();
|
|
389
|
+
/**
 * Remember a hostname as definitively dead for the --show-dead-domains report.
 *
 * No-op unless --show-dead-domains is active. The first reason recorded for a
 * host wins; later calls for the same host are ignored.
 *
 * @param {string} urlOrHost - Absolute URL, bare hostname, or "host:port".
 * @param {string} reason - Signal that marked the host dead (e.g. an error code).
 */
function recordDeadDomain(urlOrHost, reason) {
  if (!showDeadDomains || !urlOrHost) return;

  let host;
  try {
    host = new URL(urlOrHost).hostname;
  } catch {
    // Not an absolute URL — treat the input as an already-bare host.
    host = urlOrHost;
  }

  // "host:port" is not a bare host: the URL parser either accepts it with
  // "host" as the *scheme* (leaving hostname empty, which used to drop the
  // entry silently) or rejects it (numeric hosts, which used to be stored with
  // the port attached). In both cases the part before the port is the host.
  const hostPort = /^([^\s:/]+):\d+$/.exec(urlOrHost);
  if (hostPort) host = hostPort[1];

  if (host && !_deadDomains.has(host)) _deadDomains.set(host, reason);
}
|
|
395
|
+
|
|
366
396
|
// Per-scan cache of negative DNS lookups. OS resolvers don't always cache
|
|
367
397
|
// NXDOMAIN responses, and a scan can hit the same dead hostname many times
|
|
368
398
|
// (different URL paths on the same site). Positive results are left to the
|
|
@@ -371,14 +401,67 @@ const dnsPrecheckTimeoutMs = 2000;
|
|
|
371
401
|
// of unique dead hosts) can't grow the cache unboundedly. Same pattern as
|
|
372
402
|
// the rest of the codebase's in-memory caches.
|
|
373
403
|
const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
|
|
374
|
-
const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
375
404
|
const DNS_NEGATIVE_CACHE_MAX = 1000;
|
|
405
|
+
// The negative cache holds ONLY definitive non-existence (NXDOMAIN/ENODATA) —
|
|
406
|
+
// resolver errors fail open and never enter it (see the pre-check catch), so
|
|
407
|
+
// persisting it can't silently drop a live host. Opt-in via --dns-cache: dead
|
|
408
|
+
// hosts are remembered for DNS_NEGATIVE_PERSIST_TTL_MS and reloaded next run;
|
|
409
|
+
// otherwise it's a 5-min in-memory-only cache. The persist TTL is deliberately
|
|
410
|
+
// shorter than the dig/whois positive cache (20h): a domain that doesn't exist
|
|
411
|
+
// now MAY get registered, and this is a domain-hunting scanner, so the dead
|
|
412
|
+
// ones are re-checked twice a day rather than trusted for ~a day.
|
|
413
|
+
const DNS_NEGATIVE_PERSIST_TTL_MS = 12 * 60 * 60 * 1000; // 12 hours
|
|
414
|
+
const DNS_NEGATIVE_CACHE_TTL_MS = dnsCacheMode ? DNS_NEGATIVE_PERSIST_TTL_MS : 5 * 60 * 1000;
|
|
415
|
+
const DNS_NEGATIVE_CACHE_FILE = path.join(__dirname, '.dnsnegcache');
|
|
416
|
+
if (dnsCacheMode) {
|
|
417
|
+
// Reuse the dig/whois caches' generic load/save (atomic write, TTL + size
|
|
418
|
+
// bounded). The 'exit' flush is synchronous (writeFileSync) so it fires on
|
|
419
|
+
// any exit path, mirroring nettools' dig/whois flush.
|
|
420
|
+
loadDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX);
|
|
421
|
+
process.on('exit', () => saveDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX));
|
|
422
|
+
}
|
|
376
423
|
let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
|
|
377
424
|
let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
|
|
378
425
|
const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
|
|
379
|
-
//
|
|
380
|
-
//
|
|
381
|
-
|
|
426
|
+
// DNS pre-check resolver (rotation + resolution logic lives in lib/dns.js).
|
|
427
|
+
// `--dns <ip[,ip...]>` (or a `dns` setting in .nwssconfig, mapped to the same
|
|
428
|
+
// flag) pins/rotates an explicit resolver list; otherwise the resolv.conf
|
|
429
|
+
// nameservers are rotated. Rotation spreads the c-ares burst so one server
|
|
430
|
+
// (e.g. a flaky ISP resolver) doesn't absorb every query and answer REFUSED.
|
|
431
|
+
const dnsServerIndex = args.findIndex(arg => arg === '--dns');
|
|
432
|
+
const dnsServersOverride = (dnsServerIndex !== -1 && args[dnsServerIndex + 1])
|
|
433
|
+
? parseDnsServers(args[dnsServerIndex + 1])
|
|
434
|
+
: [];
|
|
435
|
+
const dnsResolver = createRotatingResolver({ servers: dnsServersOverride, forceDebug });
|
|
436
|
+
// Route nettools' dig through the same --dns resolvers (dig otherwise uses the
|
|
437
|
+
// system /etc/resolv.conf, which on a flaky setup times out and silently drops
|
|
438
|
+
// dig-gated domains). Only when --dns is explicitly set.
|
|
439
|
+
if (dnsServersOverride.length > 0) setDigResolvers(dnsServersOverride);
|
|
440
|
+
// Circuit breaker: if resolver errors dominate, suspend the pre-check for a
|
|
441
|
+
// cooldown so a refusal storm doesn't keep hammering a broken resolver (sites
|
|
442
|
+
// still load — a suspended pre-check just proceeds to navigation).
|
|
443
|
+
const dnsBreaker = createDnsCircuitBreaker({ forceDebug });
|
|
444
|
+
if (dnsResolver.pinned && !silentMode) {
|
|
445
|
+
const how = dnsResolver.servers.length === 1 ? 'pinned to' : 'rotating';
|
|
446
|
+
console.log(formatLogMessage('info', `DNS pre-check ${how} ${dnsResolver.servers.join(', ')}`));
|
|
447
|
+
} else if (forceDebug && dnsResolver.rotates) {
|
|
448
|
+
console.log(formatLogMessage('debug', `DNS pre-check rotating ${dnsResolver.servers.length} resolv.conf nameservers: ${dnsResolver.servers.join(', ')}`));
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
// Idle-hang watchdog registry: in-flight main pages, iterable (the
|
|
452
|
+
// browserhealth page trackers are WeakMaps and can't be scanned). Registered
|
|
453
|
+
// when a task starts navigating, removed on completion. The hang check probes
|
|
454
|
+
// these ONLY while global progress is stalled and force-closes any page that is
|
|
455
|
+
// unresponsive across consecutive probes — recovering a single hung URL in ~the
|
|
456
|
+
// hang-check window instead of waiting out its full per-URL ceiling (which is
|
|
457
|
+
// the backstop). Acting only during a stall + requiring unresponsiveness avoids
|
|
458
|
+
// killing a page that's merely slow (a page in a config delay is idle but
|
|
459
|
+
// RESPONDS to a trivial evaluate; a hung one does not). Entries self-heal via
|
|
460
|
+
// isClosed() so timeout/error paths that skip the normal close can't leak.
|
|
461
|
+
const _inFlightPages = new Map(); // page -> { url, unresponsiveStrikes }
|
|
462
|
+
const PAGE_HANG_PROBE_TIMEOUT_MS = 2000; // liveness-probe (page.evaluate) cap; no response within this = hung
|
|
463
|
+
const PAGE_HANG_PROBE_INTERVAL_MS = 15000; // how often to probe in-flight pages while the scan is stalled
|
|
464
|
+
const PAGE_HANG_STRIKES_TO_KILL = 2; // consecutive HUNG probes before force-close (~30s recovery at the 15s interval)
|
|
382
465
|
|
|
383
466
|
function dnsNegativeCacheSet(hostname, error) {
|
|
384
467
|
if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
|
|
@@ -691,7 +774,6 @@ Per-config settings file (.nwssconfig):
|
|
|
691
774
|
See README.md for format details.
|
|
692
775
|
|
|
693
776
|
General Options:
|
|
694
|
-
--verbose Force verbose mode globally
|
|
695
777
|
--debug Force debug mode globally
|
|
696
778
|
--silent Suppress normal console logs
|
|
697
779
|
--titles Add ! <url> title before each site's group
|
|
@@ -721,10 +803,16 @@ General Options:
|
|
|
721
803
|
|
|
722
804
|
Validation Options:
|
|
723
805
|
--cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
|
|
724
|
-
--dns
|
|
806
|
+
--dns <ip[,ip,...]> Resolver(s) for the DNS pre-check AND nettools' dig (not Chrome nav / whois).
|
|
807
|
+
One pins all queries to it; several rotate per query. Overrides /etc/resolv.conf.
|
|
808
|
+
--dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each),
|
|
809
|
+
plus the DNS pre-check negative cache (NXDOMAIN only, 12h TTL, .dnsnegcache)
|
|
725
810
|
--no-dns-precheck Disable per-URL DNS resolution check before page navigation.
|
|
726
811
|
By default, URLs whose hostname doesn't resolve are skipped
|
|
727
812
|
immediately (saves ~5-15s of Puppeteer time per dead host).
|
|
813
|
+
--show-dead-domains At end of scan, list hostnames that did not resolve / were
|
|
814
|
+
unreachable (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
|
|
815
|
+
Excludes blocks/timeouts (those mean the domain is alive). For pruning.
|
|
728
816
|
--validate-config Validate config.json file and exit
|
|
729
817
|
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
|
|
730
818
|
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
|
|
@@ -741,7 +829,7 @@ Global config.json options:
|
|
|
741
829
|
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
|
|
742
830
|
ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
|
|
743
831
|
ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
|
|
744
|
-
max_concurrent_sites:
|
|
832
|
+
max_concurrent_sites: 6 Maximum concurrent site processing (1-50, default: 6)
|
|
745
833
|
resource_cleanup_interval: 80 Browser restart interval in URLs processed (1-1000, default: 80)
|
|
746
834
|
disable_ad_tagging: true/false Disable Chrome AdTagging to prevent ad frame throttling (default: true)
|
|
747
835
|
|
|
@@ -752,8 +840,7 @@ Per-site config.json options:
|
|
|
752
840
|
When true, ALL regex patterns must match the same URL
|
|
753
841
|
|
|
754
842
|
Redirect Handling Options:
|
|
755
|
-
|
|
756
|
-
max_redirects: 10 Maximum number of redirects to follow (default: 10)
|
|
843
|
+
max_redirects: 10 Maximum number of redirects to follow (default: 10; 0 = follow none)
|
|
757
844
|
js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
|
|
758
845
|
detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
|
|
759
846
|
redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
|
|
@@ -1525,7 +1612,12 @@ function matchesDynamicBlock(domain) {
|
|
|
1525
1612
|
return _domainOrParentInSet(_dynamicallyBlockedDomains, domain);
|
|
1526
1613
|
}
|
|
1527
1614
|
|
|
1528
|
-
|
|
1615
|
+
// `_ignorePatterns` is intentionally unused (underscore-marked): every caller
|
|
1616
|
+
// and the grep/curl/nettools/searchstring callback contract pass the ignore
|
|
1617
|
+
// list as a 2nd arg, but the ignore-state actually lives in the module-level
|
|
1618
|
+
// _dynamicallyIgnoredDomains / _ignoreDomainsExact Sets walked below. Kept in
|
|
1619
|
+
// the signature only to preserve that shared call shape.
|
|
1620
|
+
function matchesIgnoreDomain(domain, _ignorePatterns) {
|
|
1529
1621
|
// Both dynamic and static ignore lists are walked parent-by-parent so a
|
|
1530
1622
|
// subdomain of an ignored root inherits the ignore. Previously the
|
|
1531
1623
|
// dynamic check was exact-only, creating an asymmetry: a static-config
|
|
@@ -2116,22 +2208,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2116
2208
|
bypass_cache
|
|
2117
2209
|
} = siteConfig;
|
|
2118
2210
|
|
|
2119
|
-
const allowFirstParty = firstParty === true || firstParty === 1;
|
|
2120
|
-
const allowThirdParty = thirdParty === undefined || thirdParty === true || thirdParty === 1;
|
|
2121
2211
|
const perSiteSubDomains = subDomains === 1 ? true : subDomainsMode;
|
|
2122
|
-
const siteLocalhostIP = localhost || null;
|
|
2123
|
-
const cloudflarePhishBypass = cloudflare_phish === true;
|
|
2124
|
-
const cloudflareBypass = cloudflare_bypass === true;
|
|
2125
2212
|
// Add redirect and same-page loop protection
|
|
2126
|
-
|
|
2213
|
+
// Number check (not ||) so max_redirects: 0 isn't swallowed as falsy → 10.
|
|
2214
|
+
const MAX_REDIRECT_DEPTH = (typeof siteConfig.max_redirects === 'number' && siteConfig.max_redirects >= 0)
|
|
2215
|
+
? siteConfig.max_redirects : 10;
|
|
2127
2216
|
const redirectHistory = new Set();
|
|
2128
2217
|
let redirectCount = 0;
|
|
2129
2218
|
const pageLoadHistory = new Map(); // Track same-page reloads
|
|
2130
2219
|
const MAX_SAME_PAGE_LOADS = 3;
|
|
2131
2220
|
let currentPageUrl = currentUrl;
|
|
2132
2221
|
|
|
2133
|
-
const sitePrivoxy = privoxy === true;
|
|
2134
|
-
const sitePihole = pihole === true;
|
|
2135
2222
|
const flowproxyDetection = flowproxy_detection === true;
|
|
2136
2223
|
|
|
2137
2224
|
const evenBlocked = even_blocked === true;
|
|
@@ -2298,6 +2385,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2298
2385
|
|
|
2299
2386
|
// Track page for realtime cleanup
|
|
2300
2387
|
trackPageForRealtime(page);
|
|
2388
|
+
// Register with the idle-hang watchdog (force-closed if it goes
|
|
2389
|
+
// unresponsive while the whole scan has stalled).
|
|
2390
|
+
_inFlightPages.set(page, { url: currentUrl, unresponsiveStrikes: 0 });
|
|
2301
2391
|
|
|
2302
2392
|
// Mark page as actively processing
|
|
2303
2393
|
updatePageUsage(page, true);
|
|
@@ -2822,12 +2912,27 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2822
2912
|
|
|
2823
2913
|
const regexes = getCompiledRegexes(siteConfig.filterRegex);
|
|
2824
2914
|
|
|
2915
|
+
// output_regex (optional per-site): extract the rule body from each matched
|
|
2916
|
+
// URL via capture group 1 (or the whole match), so output becomes
|
|
2917
|
+
// ||<capture> (e.g. ||host/script/) instead of ||host^ — lets a stable
|
|
2918
|
+
// folder/file be blocked on a host that also serves legit content. Compiled
|
|
2919
|
+
// silently here; config-load validation (validate_rules) warns on a bad
|
|
2920
|
+
// pattern, so a throw here just disables the feature for this site.
|
|
2921
|
+
// Reuse the memoized regex compiler (same cache as filterRegex) so the
|
|
2922
|
+
// pattern compiles once per unique source, not once per URL. try/catch
|
|
2923
|
+
// because getCompiledRegexes throws on a bad pattern — config-load
|
|
2924
|
+
// validation already warned; a throw here just disables the feature.
|
|
2925
|
+
let outputRegex = null;
|
|
2926
|
+
if (siteConfig.output_regex) {
|
|
2927
|
+
try { outputRegex = getCompiledRegexes(siteConfig.output_regex)[0] || null; } catch (_) { outputRegex = null; }
|
|
2928
|
+
}
|
|
2929
|
+
|
|
2825
2930
|
// NEW: Get regex_and setting (defaults to false for backward compatibility)
|
|
2826
2931
|
const useRegexAnd = siteConfig.regex_and === true;
|
|
2827
2932
|
|
|
2828
2933
|
// Parse searchstring patterns using module
|
|
2829
2934
|
const { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd } = parseSearchStrings(siteConfig.searchstring, siteConfig.searchstring_and);
|
|
2830
|
-
|
|
2935
|
+
let useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring (reassigned to false below if curl is unavailable)
|
|
2831
2936
|
let useGrep = siteConfig.grep === true; // Grep can work independently
|
|
2832
2937
|
|
|
2833
2938
|
// Get user agent for curl if needed
|
|
@@ -3009,9 +3114,30 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3009
3114
|
* @param {string} fullSubdomain - Full subdomain for cache tracking
|
|
3010
3115
|
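* @param {string} resourceType - Resource type (for --adblock-rules mode)
* @param {string|null} [matchedUrl] - Full URL of the matching request; used only to derive the output_regex rule body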
|
|
3011
3116
|
*/
|
|
3012
|
-
function addMatchedDomain(domain, resourceType = null, fullSubdomain = null) {
|
|
3117
|
+
function addMatchedDomain(domain, resourceType = null, fullSubdomain = null, matchedUrl = null) {
|
|
3013
3118
|
// Use fullSubdomain for cache tracking if provided, otherwise fall back to domain
|
|
3014
3119
|
const cacheKey = fullSubdomain || domain;
|
|
3120
|
+
// output_regex: derive the rule body from the matched URL. Capture group 1
|
|
3121
|
+
// (or the whole match) becomes the stored key, e.g. "host/script/", which
|
|
3122
|
+
// formatDomain emits as ||host/script/ for adblock and falls back to the
|
|
3123
|
+
// bare host for domain-only formats. All similarity / dedup / smart-cache
|
|
3124
|
+
// logic below still runs on the bare host (domain); only the final stored
|
|
3125
|
+
// key changes. The capture must contain both '/' and '.' (i.e. host+path),
|
|
3126
|
+
// otherwise we keep the host so a mis-written regex can't emit garbage.
|
|
3127
|
+
let outputKey = domain;
|
|
3128
|
+
if (outputRegex && matchedUrl) {
|
|
3129
|
+
const m = matchedUrl.match(outputRegex);
|
|
3130
|
+
if (m) {
|
|
3131
|
+
const cap = (m[1] != null ? m[1] : m[0]);
|
|
3132
|
+
// Accept only a host+path shape: a '/' with a real host before it
|
|
3133
|
+
// (segment before the first '/' must contain a '.'). Rejects a
|
|
3134
|
+
// capture that accidentally includes the scheme (host part would be
|
|
3135
|
+
// "https:") or a path-only capture with no host — both fall back to
|
|
3136
|
+
// the bare-host ||host^ rule rather than emit garbage.
|
|
3137
|
+
const sl = cap ? cap.indexOf('/') : -1;
|
|
3138
|
+
if (sl > 0 && cap.slice(0, sl).includes('.')) outputKey = cap;
|
|
3139
|
+
}
|
|
3140
|
+
}
|
|
3015
3141
|
// Check if we should ignore similar domains
|
|
3016
3142
|
const ignoreSimilarEnabled = siteConfig.ignore_similar !== undefined ? siteConfig.ignore_similar : ignore_similar;
|
|
3017
3143
|
const similarityThreshold = siteConfig.ignore_similar_threshold || ignore_similar_threshold;
|
|
@@ -3113,15 +3239,15 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3113
3239
|
}
|
|
3114
3240
|
|
|
3115
3241
|
if (matchedDomains instanceof Map) {
|
|
3116
|
-
if (!matchedDomains.has(
|
|
3117
|
-
matchedDomains.set(
|
|
3242
|
+
if (!matchedDomains.has(outputKey)) {
|
|
3243
|
+
matchedDomains.set(outputKey, new Set());
|
|
3118
3244
|
}
|
|
3119
3245
|
// Only add the specific resourceType that was matched, not all types for this domain
|
|
3120
3246
|
if (resourceType) {
|
|
3121
|
-
matchedDomains.get(
|
|
3247
|
+
matchedDomains.get(outputKey).add(resourceType);
|
|
3122
3248
|
}
|
|
3123
3249
|
} else {
|
|
3124
|
-
matchedDomains.add(
|
|
3250
|
+
matchedDomains.add(outputKey);
|
|
3125
3251
|
}
|
|
3126
3252
|
}
|
|
3127
3253
|
|
|
@@ -3160,12 +3286,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3160
3286
|
// fall back to the default rather than silently disabling capture.
|
|
3161
3287
|
const POPUP_MAX_DEPTH = (() => {
|
|
3162
3288
|
const v = parseInt(siteConfig.capture_popups_max_depth, 10);
|
|
3163
|
-
return Number.isFinite(v) && v > 0 ? v :
|
|
3289
|
+
return Number.isFinite(v) && v > 0 ? v : 4;
|
|
3164
3290
|
})();
|
|
3165
3291
|
const POPUP_CAPTURE_WINDOW_MS = (() => {
|
|
3166
3292
|
const v = parseInt(siteConfig.capture_popups_window_ms, 10);
|
|
3167
3293
|
return Number.isFinite(v) && v > 0 ? v : 5000;
|
|
3168
3294
|
})();
|
|
3295
|
+
// interact_popups: click inside captured popups so they cascade to their
|
|
3296
|
+
// next ad/redirect (requires capture_popups — no popups exist otherwise).
|
|
3297
|
+
// Light pass; the request listener catches whatever the clicks surface.
|
|
3298
|
+
const interactPopups = capturePopups && siteConfig.interact_popups === true;
|
|
3299
|
+
const POPUP_INTERACT_CLICKS = 3; // enough to fire popunder/redirect SDKs (incl. SDKs that suppress the 1st/2nd click as warmup) without runaway cascades
|
|
3169
3300
|
|
|
3170
3301
|
if (capturePopups && forceDebug) {
|
|
3171
3302
|
// One-time setup-time warning if the click prerequisite isn't met.
|
|
@@ -3331,7 +3462,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3331
3462
|
trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
|
|
3332
3463
|
} else {
|
|
3333
3464
|
// No nettools required — regex match alone counts.
|
|
3334
|
-
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
|
|
3465
|
+
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain, checkedUrl);
|
|
3335
3466
|
}
|
|
3336
3467
|
} catch (_) { /* observation-only — never let a popup error escape */ }
|
|
3337
3468
|
};
|
|
@@ -3453,6 +3584,24 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3453
3584
|
|
|
3454
3585
|
attachPopupRequestCapture(popupPage, depth);
|
|
3455
3586
|
|
|
3587
|
+
// interact_popups: click inside the popup so it can cascade to its next
|
|
3588
|
+
// ad/redirect — popunder/redirect SDKs fire on a document-level click,
|
|
3589
|
+
// and a captured-but-unclicked popup only ever shows its landing URL.
|
|
3590
|
+
// Light pass (POPUP_INTERACT_CLICKS random content-zone clicks), only
|
|
3591
|
+
// on popups shallower than max depth so a clicked popup's spawned child
|
|
3592
|
+
// (depth+1) is still within the capture depth. Fire-and-forget: it must
|
|
3593
|
+
// not block onTargetCreated, and the popup may close/navigate mid-click
|
|
3594
|
+
// (performContentClicks no-ops on a closed page). The request listener
|
|
3595
|
+
// above captures whatever the clicks surface; the close timer bounds it.
|
|
3596
|
+
if (interactPopups && depth < POPUP_MAX_DEPTH && !popupPage.isClosed()) {
|
|
3597
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] interact_popups: ${POPUP_INTERACT_CLICKS} content click(s)`));
|
|
3598
|
+
performContentClicks(popupPage, {
|
|
3599
|
+
clicks: POPUP_INTERACT_CLICKS,
|
|
3600
|
+
forceDebug,
|
|
3601
|
+
realistic: siteConfig.realistic_click === true,
|
|
3602
|
+
}).catch(() => {}); // popup is transient — non-fatal
|
|
3603
|
+
}
|
|
3604
|
+
|
|
3456
3605
|
// Auto-close after the capture window so popups don't pile up.
|
|
3457
3606
|
const closeTimer = setTimeout(() => {
|
|
3458
3607
|
try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
|
|
@@ -3658,7 +3807,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3658
3807
|
wasBlocked: true
|
|
3659
3808
|
});
|
|
3660
3809
|
} else {
|
|
3661
|
-
addMatchedDomain(reqDomain, resourceType, fullSubdomain);
|
|
3810
|
+
addMatchedDomain(reqDomain, resourceType, fullSubdomain, reqUrl);
|
|
3662
3811
|
}
|
|
3663
3812
|
matchedRegexPatterns.add(evenBlockedRegexPattern);
|
|
3664
3813
|
|
|
@@ -3836,7 +3985,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3836
3985
|
isFirstParty: isFirstParty
|
|
3837
3986
|
});
|
|
3838
3987
|
} else {
|
|
3839
|
-
|
|
3988
|
+
// Pass null for fullSubdomain (not the in-scope hostname) to keep
|
|
3989
|
+
// this path's dedup key as the root domain exactly as before —
|
|
3990
|
+
// only matchedUrl is new here, for output_regex.
|
|
3991
|
+
addMatchedDomain(reqDomain, resourceType, null, reqUrl);
|
|
3840
3992
|
}
|
|
3841
3993
|
if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
|
|
3842
3994
|
if (siteConfig.verbose === 1) {
|
|
@@ -4475,12 +4627,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4475
4627
|
}
|
|
4476
4628
|
}
|
|
4477
4629
|
console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
|
|
4630
|
+
// Capture hard "dead domain" navigation errors for --show-dead-domains
|
|
4631
|
+
// (DNS doesn't resolve / host unreachable). Blocks, timeouts and CF
|
|
4632
|
+
// challenges are NOT dead — they're excluded by this match.
|
|
4633
|
+
const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE|ERR_DNS/.exec(err.message || '');
|
|
4634
|
+
if (deadNav) recordDeadDomain(currentUrl, deadNav[0]);
|
|
4478
4635
|
throw err;
|
|
4479
4636
|
}
|
|
4480
4637
|
}
|
|
4481
4638
|
}
|
|
4482
4639
|
|
|
4483
|
-
const delayMs = siteConfig.delay || DEFAULT_DELAY;
|
|
4640
|
+
const delayMs = siteConfig.delay || TIMEOUTS.DEFAULT_DELAY;
|
|
4484
4641
|
|
|
4485
4642
|
// Optimized delays for Puppeteer 23.x performance
|
|
4486
4643
|
const isFastSite = timeout <= TIMEOUTS.FAST_SITE_THRESHOLD;
|
|
@@ -4560,8 +4717,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4560
4717
|
const ghostStart = Date.now();
|
|
4561
4718
|
const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
|
|
4562
4719
|
|
|
4563
|
-
//
|
|
4564
|
-
|
|
4720
|
+
// Honor interact_click_count in ghost mode too (built-in default
|
|
4721
|
+
// is 3 — ad SDKs often swallow the 1st/2nd click as warmup). Same
|
|
4722
|
+
// default + 20-cap as the built-in content-click path. 0 when
|
|
4723
|
+
// element clicks are disabled.
|
|
4724
|
+
const ghostClickCount = interactionConfig.includeElementClicks
|
|
4725
|
+
? Math.min(Math.max(Number(siteConfig.interact_click_count) || 3, 1), 20)
|
|
4726
|
+
: 0;
|
|
4727
|
+
// Reserve part of the duration budget for those clicks so the
|
|
4728
|
+
// movement loop doesn't consume all of ghost_cursor_duration.
|
|
4729
|
+
// Capped at half the budget so movement still happens; raise
|
|
4730
|
+
// ghost_cursor_duration to fit more clicks.
|
|
4731
|
+
const clickReserveMs = Math.min(ghostClickCount * 600, ghostDuration * 0.5);
|
|
4732
|
+
|
|
4733
|
+
// Time-based Bezier mouse movements — runs for the unreserved budget
|
|
4734
|
+
while (ghostTimeLeft() > 200 + clickReserveMs) {
|
|
4565
4735
|
const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
|
|
4566
4736
|
const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
|
|
4567
4737
|
await ghostMove(cursor, toX, toY, {
|
|
@@ -4569,18 +4739,23 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4569
4739
|
overshootThreshold: ghostConfig.overshootThreshold,
|
|
4570
4740
|
forceDebug
|
|
4571
4741
|
});
|
|
4572
|
-
if (ghostTimeLeft() > 100) {
|
|
4742
|
+
if (ghostTimeLeft() > 100 + clickReserveMs) {
|
|
4573
4743
|
await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
|
|
4574
4744
|
}
|
|
4575
4745
|
}
|
|
4576
4746
|
if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
|
|
4577
4747
|
await ghostRandomMove(cursor, { forceDebug });
|
|
4578
4748
|
}
|
|
4579
|
-
|
|
4749
|
+
// interact_click_count clicks, each to a fresh content-zone point.
|
|
4750
|
+
// The time guard stops early if the budget runs out (raise
|
|
4751
|
+
// ghost_cursor_duration for more).
|
|
4752
|
+
for (let gc = 0; gc < ghostClickCount && ghostTimeLeft() > 100; gc++) {
|
|
4580
4753
|
const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
|
|
4581
4754
|
const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
|
|
4582
4755
|
await ghostClick(cursor, { x: clickX, y: clickY }, {
|
|
4583
4756
|
hesitate: ghostConfig.hesitate,
|
|
4757
|
+
page,
|
|
4758
|
+
realistic: siteConfig.realistic_click === true,
|
|
4584
4759
|
forceDebug
|
|
4585
4760
|
});
|
|
4586
4761
|
}
|
|
@@ -4895,7 +5070,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4895
5070
|
// Only add delay if we're continuing with more reloads
|
|
4896
5071
|
if (i < totalReloads) {
|
|
4897
5072
|
// Reduce delay for problematic sites
|
|
4898
|
-
const adjustedDelay = i > 1 ? Math.min(DEFAULT_DELAY, 2000) : DEFAULT_DELAY;
|
|
5073
|
+
const adjustedDelay = i > 1 ? Math.min(TIMEOUTS.DEFAULT_DELAY, 2000) : TIMEOUTS.DEFAULT_DELAY;
|
|
4899
5074
|
await fastTimeout(adjustedDelay);
|
|
4900
5075
|
}
|
|
4901
5076
|
}
|
|
@@ -5099,6 +5274,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5099
5274
|
if (!keepBrowserOpen) {
|
|
5100
5275
|
try {
|
|
5101
5276
|
untrackPage(page);
|
|
5277
|
+
_inFlightPages.delete(page);
|
|
5102
5278
|
await page.close();
|
|
5103
5279
|
if (forceDebug) console.log(formatLogMessage('debug', `Page closed for ${currentUrl}`));
|
|
5104
5280
|
} catch (pageCloseErr) {
|
|
@@ -5199,6 +5375,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5199
5375
|
let lastProcessedCount = 0;
|
|
5200
5376
|
let hangCheckCount = 0;
|
|
5201
5377
|
let forceRestartFlag = false; // Flag to trigger restart on next iteration
|
|
5378
|
+
// Largest per-URL timeout budget seen across tasks. The hang-check restart
|
|
5379
|
+
// scales to this so it can't false-fire on a legitimately-slow config (high
|
|
5380
|
+
// delay × reload × interact) whose per-URL budget exceeds a flat threshold —
|
|
5381
|
+
// the emergency restart should only fire once the per-URL timeout ITSELF has
|
|
5382
|
+
// had its chance and failed (a true browser hang).
|
|
5383
|
+
let maxPerUrlTimeoutMs = 0;
|
|
5202
5384
|
|
|
5203
5385
|
// Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
|
|
5204
5386
|
// only colors the [severity] tag; the '[HANG CHECK]' substring was
|
|
@@ -5206,6 +5388,48 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5206
5388
|
// entry so the interval callback doesn't re-colorize per tick.
|
|
5207
5389
|
const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
|
|
5208
5390
|
|
|
5391
|
+
// Idle-hang watchdog. Runs only while the scan is stalled (no URL completing).
|
|
5392
|
+
// The probe distinguishes a HUNG renderer from one that's merely NAVIGATING,
|
|
5393
|
+
// which is the key to probing aggressively without false-kills:
|
|
5394
|
+
// - evaluate resolves -> 'alive' -> reset strikes
|
|
5395
|
+
// - evaluate rejects fast (e.g. "Execution context destroyed" mid goto/
|
|
5396
|
+
// reload) -> 'navigating' -> inconclusive: neither
|
|
5397
|
+
// strike nor reset, so a
|
|
5398
|
+
// navigation can NEVER trip
|
|
5399
|
+
// the kill regardless of cadence
|
|
5400
|
+
// - no response within the cap -> 'hung' -> strike
|
|
5401
|
+
// PAGE_HANG_STRIKES_TO_KILL consecutive HUNG probes force-close the page, so the
|
|
5402
|
+
// stuck task's awaits reject and its batch completes instead of waiting out the
|
|
5403
|
+
// full per-URL ceiling. Parallel, guarded against overlap; zero overhead off a stall.
|
|
5404
|
+
let _hangProbeInProgress = false;
|
|
5405
|
+
/**
 * Probe every tracked in-flight page once and force-close the ones that stay
 * unresponsive.
 *
 * Each page gets a trivial `page.evaluate` raced against a
 * PAGE_HANG_PROBE_TIMEOUT_MS timer, producing one of three verdicts:
 *   - 'alive'      evaluate resolved            -> strike counter reset to 0
 *   - 'navigating' evaluate rejected            -> inconclusive: no strike, no reset
 *   - 'hung'       timer won (or probe threw)   -> strike counter incremented
 * When a page reaches PAGE_HANG_STRIKES_TO_KILL strikes it is removed from
 * _inFlightPages and closed.
 *
 * Re-entrancy is guarded by _hangProbeInProgress, so overlapping calls return
 * immediately. Pages are probed in parallel.
 *
 * @returns {Promise<void>} settles once every page probed in this round has a verdict.
 */
const probeInFlightPagesForHang = async () => {
  // Nothing to do if a previous round is still running or no page is tracked.
  if (_hangProbeInProgress || _inFlightPages.size === 0) return;
  _hangProbeInProgress = true;
  try {
    // Snapshot the entries: the map is mutated (delete) while probing.
    await Promise.all([..._inFlightPages.entries()].map(async ([page, info]) => {
      if (page.isClosed()) { _inFlightPages.delete(page); return; }
      let verdict;
      let hungTimer;
      try {
        verdict = await Promise.race([
          page.evaluate(() => true).then(() => 'alive', () => 'navigating'),
          new Promise(r => { hungTimer = setTimeout(() => r('hung'), PAGE_HANG_PROBE_TIMEOUT_MS); }),
        ]);
      } catch {
        // Only reachable if starting the probe itself throws synchronously.
        verdict = 'hung';
      } finally {
        // Always release the race timer. Without this, every probe that settles
        // early leaves a live timer pending for the full probe timeout, which
        // can hold the event loop open after the scan has finished.
        if (hungTimer) clearTimeout(hungTimer);
      }
      if (verdict === 'alive') { info.unresponsiveStrikes = 0; return; }
      if (verdict === 'navigating') return; // context destroyed mid-nav — not a hang; don't strike or reset
      // verdict === 'hung' — renderer gave no response within the cap
      info.unresponsiveStrikes++;
      if (info.unresponsiveStrikes >= PAGE_HANG_STRIKES_TO_KILL) {
        console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} Force-closing hung page after ${info.unresponsiveStrikes} unresponsive probes: ${info.url}`));
        _inFlightPages.delete(page);
        page.close().catch(() => {}); // stuck task's awaits reject -> task errors -> batch completes
      }
    }));
  } finally {
    _hangProbeInProgress = false;
  }
};
|
|
5432
|
+
|
|
5209
5433
|
const hangDetectionInterval = setInterval(() => {
|
|
5210
5434
|
// Progress check, counter, and forceRestartFlag MUST run regardless of
|
|
5211
5435
|
// debug mode — previously the entire body was gated on forceDebug, which
|
|
@@ -5218,8 +5442,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5218
5442
|
if (forceDebug) {
|
|
5219
5443
|
console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
|
|
5220
5444
|
}
|
|
5221
|
-
|
|
5222
|
-
|
|
5445
|
+
// The faster 15s probe interval below does surgical per-page recovery; this
|
|
5446
|
+
// 30s interval owns only the slower nuclear-restart escalation. Deadline-
|
|
5447
|
+
// aware: the restart only fires once the stall has OUTLASTED the heaviest
|
|
5448
|
+
// in-flight per-URL budget (+ grace) — i.e. the per-URL timeout itself had
|
|
5449
|
+
// its chance and failed, a true hang. A flat threshold (the old 2.5min)
|
|
5450
|
+
// false-fires on legitimately-slow configs (high delay × reload × interact)
|
|
5451
|
+
// whose per-URL budget exceeds it, restarting the browser mid-work. Floor
|
|
5452
|
+
// at 150s so light configs behave exactly as before.
|
|
5453
|
+
// +45s buffer covers the per-URL 8s orphan grace + the 30s tick granularity + slack.
|
|
5454
|
+
const restartAfterMs = Math.max(150000, maxPerUrlTimeoutMs + 45000);
|
|
5455
|
+
if (hangCheckCount * 30000 >= restartAfterMs) {
|
|
5456
|
+
console.log(formatLogMessage('error', `${HANG_CHECK_TAG} No progress for ${Math.round(hangCheckCount * 30)}s (past the ${Math.round(restartAfterMs / 1000)}s per-URL budget). Triggering emergency browser restart.`));
|
|
5223
5457
|
forceRestartFlag = true; // Set flag instead of exiting
|
|
5224
5458
|
hangCheckCount = 0; // Reset counter for next cycle
|
|
5225
5459
|
}
|
|
@@ -5241,6 +5475,22 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5241
5475
|
// cleanup, this is belt-and-suspenders in case a future refactor moves them.
|
|
5242
5476
|
hangDetectionInterval.unref();
|
|
5243
5477
|
|
|
5478
|
+
// Fast surgical recovery on its own 15s cadence (the 30s interval above owns
|
|
5479
|
+
// the slower nuclear-restart escalation). Probes in-flight pages only while
|
|
5480
|
+
// progress is stalled and force-closes confirmed-hung ones; clears strikes when
|
|
5481
|
+
// progress resumes so a fresh stall starts from zero. Starts at -1 so the very
|
|
5482
|
+
// first window is grace (processedUrlCount begins at 0).
|
|
5483
|
+
// Progress counter value seen on the previous tick. Seeded with -1 so the
// first tick takes the "progress changed" branch and never probes
// (processedUrlCount is expected to start at 0 — confirm at its declaration).
let lastProbeCount = -1;
// Per-page hang probe on its own PAGE_HANG_PROBE_INTERVAL_MS cadence: probes
// only while processedUrlCount has not moved since the previous tick, and
// clears every page's strike counter as soon as it has.
const pageHangProbeInterval = setInterval(() => {
  if (processedUrlCount === lastProbeCount) {
    // Fire-and-forget (the probe guards itself against overlap), but attach a
    // rejection handler: the probe has try/finally and no catch, so an
    // unexpected throw would otherwise surface as an unhandled rejection.
    probeInFlightPagesForHang().catch((probeErr) => {
      if (forceDebug) console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} Page hang probe failed: ${probeErr && probeErr.message}`));
    });
  } else {
    for (const info of _inFlightPages.values()) info.unresponsiveStrikes = 0;
  }
  lastProbeCount = processedUrlCount;
}, PAGE_HANG_PROBE_INTERVAL_MS);
// Don't let this timer keep the process alive on its own.
pageHangProbeInterval.unref();
|
|
5493
|
+
|
|
5244
5494
|
// Process URLs in batches with exception handling
|
|
5245
5495
|
let siteGroupIndex = 0;
|
|
5246
5496
|
let currentProxyKey = ''; // Track active proxy config — '' means direct connection
|
|
@@ -5525,58 +5775,38 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5525
5775
|
dnsPositiveSkippedHosts.add(taskDomain);
|
|
5526
5776
|
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
|
|
5527
5777
|
// Fall through to navigation -- pre-check "passed" by proxy.
|
|
5778
|
+
} else if (dnsBreaker.isTripped()) {
|
|
5779
|
+
// Resolver is in a refusal storm — pre-checking is futile and only
|
|
5780
|
+
// adds load. Skip the resolve and proceed to navigation (same effect
|
|
5781
|
+
// as a fail-open); no breaker record since no resolve happened.
|
|
5782
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check suspended (resolver circuit open) — proceeding: ${taskDomain}`));
|
|
5528
5783
|
} else {
|
|
5529
|
-
const dnsResolve = async () => {
|
|
5530
|
-
// resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
|
|
5531
|
-
// resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
|
|
5532
|
-
// error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
|
|
5533
|
-
// unchanged so the outer transient-retry path sees the real
|
|
5534
|
-
// resolver code and the negative cache records the right reason.
|
|
5535
|
-
// Previously a bare .catch swallowed everything and tried
|
|
5536
|
-
// resolve6, which masked transient v4-side errors behind
|
|
5537
|
-
// whatever resolve6 ended up reporting.
|
|
5538
|
-
// 2s timeout kept as a real safety net — with c-ares off the
|
|
5539
|
-
// threadpool it should now rarely fire.
|
|
5540
|
-
let timer;
|
|
5541
|
-
try {
|
|
5542
|
-
const timeoutP = new Promise((_, reject) => {
|
|
5543
|
-
timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
|
|
5544
|
-
});
|
|
5545
|
-
const resolveChain = dnsPromises.resolve4(taskDomain)
|
|
5546
|
-
.catch(err => {
|
|
5547
|
-
if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
|
|
5548
|
-
return dnsPromises.resolve6(taskDomain);
|
|
5549
|
-
}
|
|
5550
|
-
throw err;
|
|
5551
|
-
});
|
|
5552
|
-
await Promise.race([resolveChain, timeoutP]);
|
|
5553
|
-
} finally {
|
|
5554
|
-
if (timer) clearTimeout(timer);
|
|
5555
|
-
}
|
|
5556
|
-
};
|
|
5557
|
-
// c-ares transient codes — retry once so a momentary resolver
|
|
5558
|
-
// hiccup doesn't poison the negative cache for 5 minutes.
|
|
5559
|
-
// DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
|
|
5560
5784
|
try {
|
|
5561
|
-
|
|
5562
|
-
|
|
5563
|
-
|
|
5564
|
-
|
|
5565
|
-
|
|
5566
|
-
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
|
|
5567
|
-
await dnsResolve();
|
|
5568
|
-
} else {
|
|
5569
|
-
throw firstErr;
|
|
5570
|
-
}
|
|
5571
|
-
}
|
|
5785
|
+
// Rotates the lead nameserver per attempt and retries once on a
|
|
5786
|
+
// transient error; rejects with the final error (code intact) on
|
|
5787
|
+
// failure. See lib/dns.js.
|
|
5788
|
+
await dnsResolver.resolveHost(taskDomain, dnsPrecheckTimeoutMs);
|
|
5789
|
+
dnsBreaker.record(false); // resolved OK — resolver healthy
|
|
5572
5790
|
} catch (dnsErr) {
|
|
5573
5791
|
const errCode = dnsErr.code || dnsErr.message || 'DNS resolve failed';
|
|
5574
|
-
|
|
5575
|
-
|
|
5576
|
-
|
|
5577
|
-
|
|
5792
|
+
// Only a definitive "host does not exist / has no address" answer
|
|
5793
|
+
// (ENOTFOUND/ENODATA) justifies dropping the URL. A resolver-level
|
|
5794
|
+
// failure (EREFUSED/ESERVFAIL/ETIMEOUT/ECONNREFUSED/timeout) says
|
|
5795
|
+
// nothing about whether the domain is live — fail open: don't cache,
|
|
5796
|
+
// don't skip, let it proceed to real browser navigation (a genuinely
|
|
5797
|
+
// dead host still fails fast there).
|
|
5798
|
+
if (isNonExistenceError(errCode)) {
|
|
5799
|
+
dnsBreaker.record(false); // resolver answered NXDOMAIN — healthy
|
|
5800
|
+
dnsNegativeCacheSet(taskDomain, errCode);
|
|
5801
|
+
recordDeadDomain(taskDomain, errCode);
|
|
5802
|
+
dnsPrecheckSkips++;
|
|
5803
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
|
|
5804
|
+
return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
|
|
5805
|
+
}
|
|
5806
|
+
dnsBreaker.record(true); // resolver error — counts toward tripping the circuit
|
|
5807
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check inconclusive (${errCode}) for ${taskDomain} — proceeding (resolver issue, not a dead host)`));
|
|
5578
5808
|
}
|
|
5579
|
-
} // close `else`
|
|
5809
|
+
} // close the resolve `else` (domainKnownToResolve / circuit-open shortcuts above)
|
|
5580
5810
|
}
|
|
5581
5811
|
} catch {}
|
|
5582
5812
|
|
|
@@ -5609,6 +5839,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5609
5839
|
+ ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
|
|
5610
5840
|
+ 30000
|
|
5611
5841
|
);
|
|
5842
|
+
// Feed the hang-check restart so it never escalates before this URL's own
|
|
5843
|
+
// timeout could have fired (see maxPerUrlTimeoutMs).
|
|
5844
|
+
if (PER_URL_TIMEOUT_MS > maxPerUrlTimeoutMs) maxPerUrlTimeoutMs = PER_URL_TIMEOUT_MS;
|
|
5612
5845
|
// Grace period after primary timeout — gives the orphan a chance to
|
|
5613
5846
|
// finish drainPendingNetTools() and emit "Saving N rules despite page
|
|
5614
5847
|
// load failure" before we abandon its result. Drain typically completes
|
|
@@ -5868,11 +6101,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5868
6101
|
} catch (processingError) {
|
|
5869
6102
|
console.log(formatLogMessage('error', `Critical error: ${processingError.message}`));
|
|
5870
6103
|
clearInterval(hangDetectionInterval);
|
|
6104
|
+
clearInterval(pageHangProbeInterval);
|
|
5871
6105
|
throw processingError;
|
|
5872
6106
|
}
|
|
5873
6107
|
|
|
5874
|
-
// Clear hang detection
|
|
6108
|
+
// Clear hang detection intervals
|
|
5875
6109
|
clearInterval(hangDetectionInterval);
|
|
6110
|
+
clearInterval(pageHangProbeInterval);
|
|
5876
6111
|
|
|
5877
6112
|
// === POST-SCAN PROCESSING ===
|
|
5878
6113
|
// Clean up first-party domains and validate results
|
|
@@ -5954,7 +6189,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5954
6189
|
const totalMatches = results.reduce((sum, r) => sum + (r.rules ? r.rules.length : 0), 0);
|
|
5955
6190
|
|
|
5956
6191
|
// Debug: Show output format being used
|
|
5957
|
-
const totalDomainsSkipped = getTotalDomainsSkipped();
|
|
5958
6192
|
const detectedDomainsCount = getDetectedDomainsCount();
|
|
5959
6193
|
if (forceDebug) {
|
|
5960
6194
|
const globalOptions = {
|
|
@@ -5969,7 +6203,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5969
6203
|
};
|
|
5970
6204
|
console.log(formatLogMessage('debug', `Output format: ${getFormatDescription(globalOptions)}`));
|
|
5971
6205
|
console.log(formatLogMessage('debug', `Generated ${outputResult.totalRules} rules from ${outputResult.successfulPageLoads} successful page loads`));
|
|
5972
|
-
console.log(formatLogMessage('debug', `Performance: ${
|
|
6206
|
+
console.log(formatLogMessage('debug', `Performance: ${detectedDomainsCount} unique domains cached`));
|
|
5973
6207
|
// Cloudflare cache statistics
|
|
5974
6208
|
const cloudflareStats = getCacheStats();
|
|
5975
6209
|
if (cloudflareStats.size > 0) {
|
|
@@ -5998,6 +6232,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5998
6232
|
}
|
|
5999
6233
|
console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
|
|
6000
6234
|
}
|
|
6235
|
+
// Surface circuit-breaker activity in the end-of-scan summary (each trip
|
|
6236
|
+
// also warns in real time). Shown outside forceDebug because a resolver
|
|
6237
|
+
// refusal storm is something the operator should know happened.
|
|
6238
|
+
const dnsBreakerTrips = dnsBreaker.stats().trips;
|
|
6239
|
+
if (dnsBreakerTrips > 0 && !silentMode) {
|
|
6240
|
+
console.log(formatLogMessage('info', `DNS pre-check circuit tripped ${dnsBreakerTrips}× this scan (resolver refusal back-off)`));
|
|
6241
|
+
}
|
|
6001
6242
|
// Blocked-pattern hit stats. Surfaces which patterns are actually
|
|
6002
6243
|
// doing work this scan and (by absence) which are stale enough to
|
|
6003
6244
|
// prune from config. Top 10 by hit count to keep the log scannable
|
|
@@ -6200,8 +6441,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
6200
6441
|
} else if (outputResult.totalRules > 0 && dryRunMode) {
|
|
6201
6442
|
console.log(messageColors.success('Found') + ` ${outputResult.totalRules} total matches across all URLs`);
|
|
6202
6443
|
}
|
|
6203
|
-
|
|
6204
|
-
|
|
6444
|
+
// --show-dead-domains: list hostnames that didn't resolve / were unreachable
|
|
6445
|
+
// this scan (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
|
|
6446
|
+
// One host per line so it's greppable for pruning; reason in the trailing column.
|
|
6447
|
+
if (showDeadDomains) {
|
|
6448
|
+
if (_deadDomains.size > 0) {
|
|
6449
|
+
console.log(`\n${messageColors.warn(`Dead domains (${_deadDomains.size}) — did not resolve / unreachable:`)}`);
|
|
6450
|
+
for (const [host, reason] of [..._deadDomains].sort((a, b) => a[0].localeCompare(b[0]))) {
|
|
6451
|
+
console.log(` ${host}\t${reason}`);
|
|
6452
|
+
}
|
|
6453
|
+
} else {
|
|
6454
|
+
console.log(`\n${messageColors.success('Dead domains: none detected')}`);
|
|
6455
|
+
}
|
|
6205
6456
|
}
|
|
6206
6457
|
if (ignoreCache && forceDebug) {
|
|
6207
6458
|
console.log(messageColors.info('Cache:') + ` Smart caching was disabled`);
|