@fanboynz/network-scanner 3.1.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js CHANGED
@@ -9,9 +9,9 @@ const fs = require('fs');
9
9
  const os = require('os');
10
10
  const psl = require('psl');
11
11
  const path = require('path');
12
- const dnsPromises = require('node:dns/promises');
12
+ const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError } = require('./lib/dns');
13
13
  const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
14
- const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
14
+ const { compressMultipleFiles } = require('./lib/compress');
15
15
  const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
16
16
  const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROME_GREASE_BRAND } = require('./lib/fingerprint');
17
17
  const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
@@ -34,9 +34,7 @@ const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore
34
34
  // Graceful exit
35
35
  const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
36
36
  // Whois & Dig
37
- const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
38
- // File compare
39
- const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
37
+ const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve, loadDiskCache, saveDiskCache, setDigResolvers } = require('./lib/nettools');
40
38
  // CDP functionality
41
39
  const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeout } = require('./lib/cdp');
42
40
  // Post-processing cleanup
@@ -68,14 +66,14 @@ const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
68
66
  // Enhanced mouse interaction and page simulation
69
67
  const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
70
68
  // Optional ghost-cursor support for advanced Bezier-based mouse movements
71
- const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
69
+ const { createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
72
70
  // Domain detection cache for performance optimization
73
- const { createGlobalHelpers, getTotalDomainsSkipped, getDetectedDomainsCount } = require('./lib/domain-cache');
71
+ const { createGlobalHelpers, getDetectedDomainsCount } = require('./lib/domain-cache');
74
72
  const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
75
73
  const { clearPersistentCache } = require('./lib/smart-cache');
76
74
  const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
77
75
  // Dry run functionality
78
- const { initializeDryRunCollections, addDryRunMatch, addDryRunNetTools, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
76
+ const { initializeDryRunCollections, addDryRunMatch, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
79
77
  // Enhanced site data clearing functionality
80
78
  const { clearSiteData } = require('./lib/clear_sitedata');
81
79
  // Referrer header generation
@@ -137,6 +135,7 @@ const CONCURRENCY_LIMITS = Object.freeze({
137
135
  // Keep using the imported map directly so the two can never diverge again.
138
136
 
139
137
  const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
138
+ const REALTIME_CLEANUP_BUFFER_MS = 25000; // Buffer added after site delay before realtime window cleanup
140
139
 
141
140
  /**
142
141
  * Detects the installed Puppeteer version dynamically
@@ -181,7 +180,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
181
180
  // purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
182
181
  // and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
183
182
  // automatically — manual purging is no longer needed.
184
- const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
183
+ const { monitorBrowserHealth, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
185
184
 
186
185
  // --- Script Configuration & Constants ---
187
186
  const VERSION = '2.0.33'; // Script version
@@ -191,7 +190,12 @@ const startTime = Date.now();
191
190
 
192
191
  // Initialize domain cache helpers with debug logging if enabled
193
192
  const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
194
- const { isDomainAlreadyDetected, markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
193
+ // Only markDomainAsDetected is used — the global cache feeds the end-of-scan
194
+ // "unique domains cached" stat (getDetectedDomainsCount). The skip-check
195
+ // (isDomainAlreadyDetected) is intentionally not wired in: cross-URL dedup is
196
+ // already handled by nettools' global processed-domain sets, smart-cache, and
197
+ // the per-URL local set, so a cache-level skip would be redundant.
198
+ const { markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
195
199
 
196
200
  // Smart cache will be initialized after config is loaded
197
201
  let smartCache = null;
@@ -232,6 +236,9 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
232
236
  const settingsMap = {
233
237
  output: ['-o', '--output'],
234
238
  max_concurrent: ['--max-concurrent'],
239
+ cleanup_interval: ['--cleanup-interval'],
240
+ resource_cleanup_interval: ['--cleanup-interval'],
241
+ dns: ['--dns'],
235
242
  dns_cache: ['--dns-cache'],
236
243
  cache_requests: ['--cache-requests'],
237
244
  dumpurls: ['--dumpurls'],
@@ -243,20 +250,25 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
243
250
  compress_logs: ['--compress-logs'],
244
251
  debug: ['--debug'],
245
252
  silent: ['--silent'],
246
- verbose: ['--verbose'],
247
253
  headful: ['--headful'],
248
254
  keep_open: ['--keep-open'],
249
255
  dry_run: ['--dry-run'],
250
256
  titles: ['--titles'],
251
257
  sub_domains: ['--sub-domains'],
252
258
  no_interact: ['--no-interact'],
259
+ show_dead_domains: ['--show-dead-domains'],
253
260
  ghost_cursor: ['--ghost-cursor'],
254
261
  plain: ['--plain'],
255
262
  cdp: ['--cdp'],
256
263
  dnsmasq: ['--dnsmasq'],
264
+ dnsmasq_old: ['--dnsmasq-old'],
257
265
  unbound: ['--unbound'],
258
266
  privoxy: ['--privoxy'],
259
267
  pihole: ['--pihole'],
268
+ adblock_rules: ['--adblock-rules'],
269
+ no_dns_precheck: ['--no-dns-precheck'],
270
+ allow_fullscreen: ['--allow-fullscreen'],
271
+ load_extension: ['--load-extension'],
260
272
  eval_on_doc: ['--eval-on-doc'],
261
273
  use_puppeteer_core: ['--use-puppeteer-core'],
262
274
  ignore_cache: ['--ignore-cache'],
@@ -314,7 +326,6 @@ if (compareIndex !== -1 && args[compareIndex + 1]) {
314
326
  }
315
327
 
316
328
 
317
- const forceVerbose = args.includes('--verbose');
318
329
  const forceDebug = args.includes('--debug');
319
330
  const silentMode = args.includes('--silent');
320
331
  const showTitles = args.includes('--titles');
@@ -337,12 +348,16 @@ const disableInteract = args.includes('--no-interact');
337
348
  const globalGhostCursor = args.includes('--ghost-cursor');
338
349
  const plainOutput = args.includes('--plain');
339
350
  const enableCDP = args.includes('--cdp');
340
- const dnsmasqMode = args.includes('--dnsmasq');
341
- const dnsmasqOldMode = args.includes('--dnsmasq-old');
342
- const unboundMode = args.includes('--unbound');
351
+ // These six are reassigned to false by the incompatible-flag validation
352
+ // blocks below (e.g. --dnsmasq + --unbound), so they must be `let` — as
353
+ // `const` that fallback threw "Assignment to constant variable" the moment
354
+ // two conflicting output modes were combined.
355
+ let dnsmasqMode = args.includes('--dnsmasq');
356
+ let dnsmasqOldMode = args.includes('--dnsmasq-old');
357
+ let unboundMode = args.includes('--unbound');
343
358
  const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
344
- const privoxyMode = args.includes('--privoxy');
345
- const piholeMode = args.includes('--pihole');
359
+ let privoxyMode = args.includes('--privoxy');
360
+ let piholeMode = args.includes('--pihole');
346
361
  const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
347
362
  const dryRunMode = args.includes('--dry-run');
348
363
  const compressLogs = args.includes('--compress-logs');
@@ -363,6 +378,21 @@ if (dnsCacheMode) enableDiskCache();
363
378
  const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
364
379
  const dnsPrecheckTimeoutMs = 2000;
365
380
 
381
+ // --show-dead-domains: collect hostnames that are definitively DEAD (do not
382
+ // exist / unreachable) and print them at the end of the scan so they can be
383
+ // pruned. Only hard signals count — NXDOMAIN/ENODATA from the pre-check and
384
+ // ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE from navigation. Transient
385
+ // failures (403/429 blocks, timeouts, Cloudflare challenges) mean the domain is
386
+ // ALIVE and are deliberately excluded. host -> reason (first seen).
387
+ const showDeadDomains = args.includes('--show-dead-domains');
388
+ const _deadDomains = new Map();
389
+ function recordDeadDomain(urlOrHost, reason) {
390
+ if (!showDeadDomains || !urlOrHost) return;
391
+ let host = urlOrHost;
392
+ try { host = new URL(urlOrHost).hostname; } catch { /* already a bare host */ }
393
+ if (host && !_deadDomains.has(host)) _deadDomains.set(host, reason);
394
+ }
395
+
366
396
  // Per-scan cache of negative DNS lookups. OS resolvers don't always cache
367
397
  // NXDOMAIN responses, and a scan can hit the same dead hostname many times
368
398
  // (different URL paths on the same site). Positive results are left to the
@@ -371,14 +401,67 @@ const dnsPrecheckTimeoutMs = 2000;
371
401
  // of unique dead hosts) can't grow the cache unboundedly. Same pattern as
372
402
  // the rest of the codebase's in-memory caches.
373
403
  const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
374
- const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
375
404
  const DNS_NEGATIVE_CACHE_MAX = 1000;
405
+ // The negative cache holds ONLY definitive non-existence (NXDOMAIN/ENODATA) —
406
+ // resolver errors fail open and never enter it (see the pre-check catch), so
407
+ // persisting it can't silently drop a live host. Opt-in via --dns-cache: dead
408
+ // hosts are remembered for DNS_NEGATIVE_PERSIST_TTL_MS and reloaded next run;
409
+ // otherwise it's a 5-min in-memory-only cache. The persist TTL is deliberately
410
+ // shorter than the dig/whois positive cache (20h): a domain that doesn't exist
411
+ // now MAY get registered, and this is a domain-hunting scanner, so the dead
412
+ // ones are re-checked twice a day rather than trusted for ~a day.
413
+ const DNS_NEGATIVE_PERSIST_TTL_MS = 12 * 60 * 60 * 1000; // 12 hours
414
+ const DNS_NEGATIVE_CACHE_TTL_MS = dnsCacheMode ? DNS_NEGATIVE_PERSIST_TTL_MS : 5 * 60 * 1000;
415
+ const DNS_NEGATIVE_CACHE_FILE = path.join(__dirname, '.dnsnegcache');
416
+ if (dnsCacheMode) {
417
+ // Reuse the dig/whois caches' generic load/save (atomic write, TTL + size
418
+ // bounded). The 'exit' flush is synchronous (writeFileSync) so it fires on
419
+ // any exit path, mirroring nettools' dig/whois flush.
420
+ loadDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX);
421
+ process.on('exit', () => saveDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX));
422
+ }
376
423
  let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
377
424
  let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
378
425
  const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
379
- // c-ares transient codes read-only, hoisted out of the per-task DNS
380
- // pre-check so we don't allocate a fresh Set per URL.
381
- const DNS_TRANSIENT_ERRORS = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);
426
+ // DNS pre-check resolver (rotation + resolution logic lives in lib/dns.js).
427
+ // `--dns <ip[,ip...]>` (or a `dns` setting in .nwssconfig, mapped to the same
428
+ // flag) pins/rotates an explicit resolver list; otherwise the resolv.conf
429
+ // nameservers are rotated. Rotation spreads the c-ares burst so one server
430
+ // (e.g. a flaky ISP resolver) doesn't absorb every query and answer REFUSED.
431
+ const dnsServerIndex = args.findIndex(arg => arg === '--dns');
432
+ const dnsServersOverride = (dnsServerIndex !== -1 && args[dnsServerIndex + 1])
433
+ ? parseDnsServers(args[dnsServerIndex + 1])
434
+ : [];
435
+ const dnsResolver = createRotatingResolver({ servers: dnsServersOverride, forceDebug });
436
+ // Route nettools' dig through the same --dns resolvers (dig otherwise uses the
437
+ // system /etc/resolv.conf, which on a flaky setup times out and silently drops
438
+ // dig-gated domains). Only when --dns is explicitly set.
439
+ if (dnsServersOverride.length > 0) setDigResolvers(dnsServersOverride);
440
+ // Circuit breaker: if resolver errors dominate, suspend the pre-check for a
441
+ // cooldown so a refusal storm doesn't keep hammering a broken resolver (sites
442
+ // still load — a suspended pre-check just proceeds to navigation).
443
+ const dnsBreaker = createDnsCircuitBreaker({ forceDebug });
444
+ if (dnsResolver.pinned && !silentMode) {
445
+ const how = dnsResolver.servers.length === 1 ? 'pinned to' : 'rotating';
446
+ console.log(formatLogMessage('info', `DNS pre-check ${how} ${dnsResolver.servers.join(', ')}`));
447
+ } else if (forceDebug && dnsResolver.rotates) {
448
+ console.log(formatLogMessage('debug', `DNS pre-check rotating ${dnsResolver.servers.length} resolv.conf nameservers: ${dnsResolver.servers.join(', ')}`));
449
+ }
450
+
451
+ // Idle-hang watchdog registry: in-flight main pages, iterable (the
452
+ // browserhealth page trackers are WeakMaps and can't be scanned). Registered
453
+ // when a task starts navigating, removed on completion. The hang check probes
454
+ // these ONLY while global progress is stalled and force-closes any page that is
455
+ // unresponsive across consecutive probes — recovering a single hung URL in ~the
456
+ // hang-check window instead of waiting out its full per-URL ceiling (which is
457
+ // the backstop). Acting only during a stall + requiring unresponsiveness avoids
458
+ // killing a page that's merely slow (a page in a config delay is idle but
459
+ // RESPONDS to a trivial evaluate; a hung one does not). Entries self-heal via
460
+ // isClosed() so timeout/error paths that skip the normal close can't leak.
461
+ const _inFlightPages = new Map(); // page -> { url, unresponsiveStrikes }
462
+ const PAGE_HANG_PROBE_TIMEOUT_MS = 2000; // liveness-probe (page.evaluate) cap; no response within this = hung
463
+ const PAGE_HANG_PROBE_INTERVAL_MS = 15000; // how often to probe in-flight pages while the scan is stalled
464
+ const PAGE_HANG_STRIKES_TO_KILL = 2; // consecutive HUNG probes before force-close (~30s recovery at the 15s interval)
382
465
 
383
466
  function dnsNegativeCacheSet(hostname, error) {
384
467
  if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
@@ -691,7 +774,6 @@ Per-config settings file (.nwssconfig):
691
774
  See README.md for format details.
692
775
 
693
776
  General Options:
694
- --verbose Force verbose mode globally
695
777
  --debug Force debug mode globally
696
778
  --silent Suppress normal console logs
697
779
  --titles Add ! <url> title before each site's group
@@ -721,10 +803,16 @@ General Options:
721
803
 
722
804
  Validation Options:
723
805
  --cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
724
- --dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each)
806
+ --dns <ip[,ip,...]> Resolver(s) for the DNS pre-check AND nettools' dig (not Chrome nav / whois).
807
+ One pins all queries to it; several rotate per query. Overrides /etc/resolv.conf.
808
+ --dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each),
809
+ plus the DNS pre-check negative cache (NXDOMAIN only, 12h TTL, .dnsnegcache)
725
810
  --no-dns-precheck Disable per-URL DNS resolution check before page navigation.
726
811
  By default, URLs whose hostname doesn't resolve are skipped
727
812
  immediately (saves ~5-15s of Puppeteer time per dead host).
813
+ --show-dead-domains At end of scan, list hostnames that did not resolve / were
814
+ unreachable (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
815
+ Excludes blocks/timeouts (those mean the domain is alive). For pruning.
728
816
  --validate-config Validate config.json file and exit
729
817
  --validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
730
818
  --clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
@@ -741,7 +829,7 @@ Global config.json options:
741
829
  ignore_similar: true/false Ignore domains similar to already found domains (default: true)
742
830
  ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
743
831
  ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
744
- max_concurrent_sites: 8 Maximum concurrent site processing (1-50, default: 8)
832
+ max_concurrent_sites: 6 Maximum concurrent site processing (1-50, default: 6)
745
833
  resource_cleanup_interval: 80 Browser restart interval in URLs processed (1-1000, default: 80)
746
834
  disable_ad_tagging: true/false Disable Chrome AdTagging to prevent ad frame throttling (default: true)
747
835
 
@@ -752,8 +840,7 @@ Per-site config.json options:
752
840
  When true, ALL regex patterns must match the same URL
753
841
 
754
842
  Redirect Handling Options:
755
- follow_redirects: true/false Follow redirects to new domains (default: true)
756
- max_redirects: 10 Maximum number of redirects to follow (default: 10)
843
+ max_redirects: 10 Maximum number of redirects to follow (default: 10; 0 = follow none)
757
844
  js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
758
845
  detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
759
846
  redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
@@ -1525,7 +1612,12 @@ function matchesDynamicBlock(domain) {
1525
1612
  return _domainOrParentInSet(_dynamicallyBlockedDomains, domain);
1526
1613
  }
1527
1614
 
1528
- function matchesIgnoreDomain(domain, ignorePatterns) {
1615
+ // `_ignorePatterns` is intentionally unused (underscore-marked): every caller
1616
+ // and the grep/curl/nettools/searchstring callback contract pass the ignore
1617
+ // list as a 2nd arg, but the ignore-state actually lives in the module-level
1618
+ // _dynamicallyIgnoredDomains / _ignoreDomainsExact Sets walked below. Kept in
1619
+ // the signature only to preserve that shared call shape.
1620
+ function matchesIgnoreDomain(domain, _ignorePatterns) {
1529
1621
  // Both dynamic and static ignore lists are walked parent-by-parent so a
1530
1622
  // subdomain of an ignored root inherits the ignore. Previously the
1531
1623
  // dynamic check was exact-only, creating an asymmetry: a static-config
@@ -2116,22 +2208,17 @@ function setupFrameHandling(page, forceDebug) {
2116
2208
  bypass_cache
2117
2209
  } = siteConfig;
2118
2210
 
2119
- const allowFirstParty = firstParty === true || firstParty === 1;
2120
- const allowThirdParty = thirdParty === undefined || thirdParty === true || thirdParty === 1;
2121
2211
  const perSiteSubDomains = subDomains === 1 ? true : subDomainsMode;
2122
- const siteLocalhostIP = localhost || null;
2123
- const cloudflarePhishBypass = cloudflare_phish === true;
2124
- const cloudflareBypass = cloudflare_bypass === true;
2125
2212
  // Add redirect and same-page loop protection
2126
- const MAX_REDIRECT_DEPTH = siteConfig.max_redirects || 10;
2213
+ // Number check (not ||) so max_redirects: 0 isn't swallowed as falsy → 10.
2214
+ const MAX_REDIRECT_DEPTH = (typeof siteConfig.max_redirects === 'number' && siteConfig.max_redirects >= 0)
2215
+ ? siteConfig.max_redirects : 10;
2127
2216
  const redirectHistory = new Set();
2128
2217
  let redirectCount = 0;
2129
2218
  const pageLoadHistory = new Map(); // Track same-page reloads
2130
2219
  const MAX_SAME_PAGE_LOADS = 3;
2131
2220
  let currentPageUrl = currentUrl;
2132
2221
 
2133
- const sitePrivoxy = privoxy === true;
2134
- const sitePihole = pihole === true;
2135
2222
  const flowproxyDetection = flowproxy_detection === true;
2136
2223
 
2137
2224
  const evenBlocked = even_blocked === true;
@@ -2298,6 +2385,9 @@ function setupFrameHandling(page, forceDebug) {
2298
2385
 
2299
2386
  // Track page for realtime cleanup
2300
2387
  trackPageForRealtime(page);
2388
+ // Register with the idle-hang watchdog (force-closed if it goes
2389
+ // unresponsive while the whole scan has stalled).
2390
+ _inFlightPages.set(page, { url: currentUrl, unresponsiveStrikes: 0 });
2301
2391
 
2302
2392
  // Mark page as actively processing
2303
2393
  updatePageUsage(page, true);
@@ -2822,12 +2912,27 @@ function setupFrameHandling(page, forceDebug) {
2822
2912
 
2823
2913
  const regexes = getCompiledRegexes(siteConfig.filterRegex);
2824
2914
 
2915
+ // output_regex (optional per-site): extract the rule body from each matched
2916
+ // URL via capture group 1 (or the whole match), so output becomes
2917
+ // ||<capture> (e.g. ||host/script/) instead of ||host^ — lets a stable
2918
+ // folder/file be blocked on a host that also serves legit content. Compiled
2919
+ // silently here; config-load validation (validate_rules) warns on a bad
2920
+ // pattern, so a throw here just disables the feature for this site.
2921
+ // Reuse the memoized regex compiler (same cache as filterRegex) so the
2922
+ // pattern compiles once per unique source, not once per URL. The try/catch
2923
+ // is needed because getCompiledRegex throws on a bad pattern; as stated
2924
+ // above, that throw is swallowed and only disables the feature for this site.
2925
+ let outputRegex = null;
2926
+ if (siteConfig.output_regex) {
2927
+ try { outputRegex = getCompiledRegexes(siteConfig.output_regex)[0] || null; } catch (_) { outputRegex = null; }
2928
+ }
2929
+
2825
2930
  // NEW: Get regex_and setting (defaults to false for backward compatibility)
2826
2931
  const useRegexAnd = siteConfig.regex_and === true;
2827
2932
 
2828
2933
  // Parse searchstring patterns using module
2829
2934
  const { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd } = parseSearchStrings(siteConfig.searchstring, siteConfig.searchstring_and);
2830
- const useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring
2935
+ let useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring (reassigned to false below if curl is unavailable)
2831
2936
  let useGrep = siteConfig.grep === true; // Grep can work independently
2832
2937
 
2833
2938
  // Get user agent for curl if needed
@@ -3009,9 +3114,30 @@ function setupFrameHandling(page, forceDebug) {
3009
3114
  * @param {string} fullSubdomain - Full subdomain for cache tracking
3010
3115
  * @param {string} resourceType - Resource type (for --adblock-rules mode)
3011
3116
  */
3012
- function addMatchedDomain(domain, resourceType = null, fullSubdomain = null) {
3117
+ function addMatchedDomain(domain, resourceType = null, fullSubdomain = null, matchedUrl = null) {
3013
3118
  // Use fullSubdomain for cache tracking if provided, otherwise fall back to domain
3014
3119
  const cacheKey = fullSubdomain || domain;
3120
+ // output_regex: derive the rule body from the matched URL. Capture group 1
3121
+ // (or the whole match) becomes the stored key, e.g. "host/script/", which
3122
+ // formatDomain emits as ||host/script/ for adblock and falls back to the
3123
+ // bare host for domain-only formats. All similarity / dedup / smart-cache
3124
+ // logic below still runs on the bare host (domain); only the final stored
3125
+ // key changes. The capture must contain both '/' and '.' (i.e. host+path),
3126
+ // otherwise we keep the host so a mis-written regex can't emit garbage.
3127
+ let outputKey = domain;
3128
+ if (outputRegex && matchedUrl) {
3129
+ const m = matchedUrl.match(outputRegex);
3130
+ if (m) {
3131
+ const cap = (m[1] != null ? m[1] : m[0]);
3132
+ // Accept only a host+path shape: a '/' with a real host before it
3133
+ // (segment before the first '/' must contain a '.'). Rejects a
3134
+ // capture that accidentally includes the scheme (host part would be
3135
+ // "https:") or a path-only capture with no host — both fall back to
3136
+ // the bare-host ||host^ rule rather than emit garbage.
3137
+ const sl = cap ? cap.indexOf('/') : -1;
3138
+ if (sl > 0 && cap.slice(0, sl).includes('.')) outputKey = cap;
3139
+ }
3140
+ }
3015
3141
  // Check if we should ignore similar domains
3016
3142
  const ignoreSimilarEnabled = siteConfig.ignore_similar !== undefined ? siteConfig.ignore_similar : ignore_similar;
3017
3143
  const similarityThreshold = siteConfig.ignore_similar_threshold || ignore_similar_threshold;
@@ -3113,15 +3239,15 @@ function setupFrameHandling(page, forceDebug) {
3113
3239
  }
3114
3240
 
3115
3241
  if (matchedDomains instanceof Map) {
3116
- if (!matchedDomains.has(domain)) {
3117
- matchedDomains.set(domain, new Set());
3242
+ if (!matchedDomains.has(outputKey)) {
3243
+ matchedDomains.set(outputKey, new Set());
3118
3244
  }
3119
3245
  // Only add the specific resourceType that was matched, not all types for this domain
3120
3246
  if (resourceType) {
3121
- matchedDomains.get(domain).add(resourceType);
3247
+ matchedDomains.get(outputKey).add(resourceType);
3122
3248
  }
3123
3249
  } else {
3124
- matchedDomains.add(domain);
3250
+ matchedDomains.add(outputKey);
3125
3251
  }
3126
3252
  }
3127
3253
 
@@ -3160,12 +3286,17 @@ function setupFrameHandling(page, forceDebug) {
3160
3286
  // fall back to the default rather than silently disabling capture.
3161
3287
  const POPUP_MAX_DEPTH = (() => {
3162
3288
  const v = parseInt(siteConfig.capture_popups_max_depth, 10);
3163
- return Number.isFinite(v) && v > 0 ? v : 2;
3289
+ return Number.isFinite(v) && v > 0 ? v : 4;
3164
3290
  })();
3165
3291
  const POPUP_CAPTURE_WINDOW_MS = (() => {
3166
3292
  const v = parseInt(siteConfig.capture_popups_window_ms, 10);
3167
3293
  return Number.isFinite(v) && v > 0 ? v : 5000;
3168
3294
  })();
3295
+ // interact_popups: click inside captured popups so they cascade to their
3296
+ // next ad/redirect (requires capture_popups — no popups exist otherwise).
3297
+ // Light pass; the request listener catches whatever the clicks surface.
3298
+ const interactPopups = capturePopups && siteConfig.interact_popups === true;
3299
+ const POPUP_INTERACT_CLICKS = 3; // enough to fire popunder/redirect SDKs (incl. SDKs that suppress the 1st/2nd click as warmup) without runaway cascades
3169
3300
 
3170
3301
  if (capturePopups && forceDebug) {
3171
3302
  // One-time setup-time warning if the click prerequisite isn't met.
@@ -3331,7 +3462,7 @@ function setupFrameHandling(page, forceDebug) {
3331
3462
  trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3332
3463
  } else {
3333
3464
  // No nettools required — regex match alone counts.
3334
- addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3465
+ addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain, checkedUrl);
3335
3466
  }
3336
3467
  } catch (_) { /* observation-only — never let a popup error escape */ }
3337
3468
  };
@@ -3453,6 +3584,24 @@ function setupFrameHandling(page, forceDebug) {
3453
3584
 
3454
3585
  attachPopupRequestCapture(popupPage, depth);
3455
3586
 
3587
+ // interact_popups: click inside the popup so it can cascade to its next
3588
+ // ad/redirect — popunder/redirect SDKs fire on a document-level click,
3589
+ // and a captured-but-unclicked popup only ever shows its landing URL.
3590
+ // Light pass (POPUP_INTERACT_CLICKS random content-zone clicks), only
3591
+ // on popups shallower than max depth so a clicked popup's spawned child
3592
+ // (depth+1) is still within the capture depth. Fire-and-forget: it must
3593
+ // not block onTargetCreated, and the popup may close/navigate mid-click
3594
+ // (performContentClicks no-ops on a closed page). The request listener
3595
+ // above captures whatever the clicks surface; the close timer bounds it.
3596
+ if (interactPopups && depth < POPUP_MAX_DEPTH && !popupPage.isClosed()) {
3597
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] interact_popups: ${POPUP_INTERACT_CLICKS} content click(s)`));
3598
+ performContentClicks(popupPage, {
3599
+ clicks: POPUP_INTERACT_CLICKS,
3600
+ forceDebug,
3601
+ realistic: siteConfig.realistic_click === true,
3602
+ }).catch(() => {}); // popup is transient — non-fatal
3603
+ }
3604
+
3456
3605
  // Auto-close after the capture window so popups don't pile up.
3457
3606
  const closeTimer = setTimeout(() => {
3458
3607
  try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
@@ -3658,7 +3807,7 @@ function setupFrameHandling(page, forceDebug) {
3658
3807
  wasBlocked: true
3659
3808
  });
3660
3809
  } else {
3661
- addMatchedDomain(reqDomain, resourceType, fullSubdomain);
3810
+ addMatchedDomain(reqDomain, resourceType, fullSubdomain, reqUrl);
3662
3811
  }
3663
3812
  matchedRegexPatterns.add(evenBlockedRegexPattern);
3664
3813
 
@@ -3836,7 +3985,10 @@ function setupFrameHandling(page, forceDebug) {
3836
3985
  isFirstParty: isFirstParty
3837
3986
  });
3838
3987
  } else {
3839
- addMatchedDomain(reqDomain, resourceType);
3988
+ // Pass null for fullSubdomain (not the in-scope hostname) to keep
3989
+ // this path's dedup key as the root domain exactly as before —
3990
+ // only matchedUrl is new here, for output_regex.
3991
+ addMatchedDomain(reqDomain, resourceType, null, reqUrl);
3840
3992
  }
3841
3993
  if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
3842
3994
  if (siteConfig.verbose === 1) {
@@ -4475,12 +4627,17 @@ function setupFrameHandling(page, forceDebug) {
4475
4627
  }
4476
4628
  }
4477
4629
  console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
4630
+ // Capture hard "dead domain" navigation errors for --show-dead-domains
4631
+ // (DNS doesn't resolve / host unreachable). Blocks, timeouts and CF
4632
+ // challenges are NOT dead — they're excluded by this match.
4633
+ const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE|ERR_DNS/.exec(err.message || '');
4634
+ if (deadNav) recordDeadDomain(currentUrl, deadNav[0]);
4478
4635
  throw err;
4479
4636
  }
4480
4637
  }
4481
4638
  }
4482
4639
 
4483
- const delayMs = siteConfig.delay || DEFAULT_DELAY;
4640
+ const delayMs = siteConfig.delay || TIMEOUTS.DEFAULT_DELAY;
4484
4641
 
4485
4642
  // Optimized delays for Puppeteer 23.x performance
4486
4643
  const isFastSite = timeout <= TIMEOUTS.FAST_SITE_THRESHOLD;
@@ -4560,8 +4717,21 @@ function setupFrameHandling(page, forceDebug) {
4560
4717
  const ghostStart = Date.now();
4561
4718
  const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
4562
4719
 
4563
- // Time-based Bezier mouse movements runs for ghostDuration ms
4564
- while (ghostTimeLeft() > 200) {
4720
+ // Honor interact_click_count in ghost mode too (built-in default
4721
+ // is 3 — ad SDKs often swallow the 1st/2nd click as warmup). Same
4722
+ // default + 20-cap as the built-in content-click path. 0 when
4723
+ // element clicks are disabled.
4724
+ const ghostClickCount = interactionConfig.includeElementClicks
4725
+ ? Math.min(Math.max(Number(siteConfig.interact_click_count) || 3, 1), 20)
4726
+ : 0;
4727
+ // Reserve part of the duration budget for those clicks so the
4728
+ // movement loop doesn't consume all of ghost_cursor_duration.
4729
+ // Capped at half the budget so movement still happens; raise
4730
+ // ghost_cursor_duration to fit more clicks.
4731
+ const clickReserveMs = Math.min(ghostClickCount * 600, ghostDuration * 0.5);
4732
+
4733
+ // Time-based Bezier mouse movements — runs for the unreserved budget
4734
+ while (ghostTimeLeft() > 200 + clickReserveMs) {
4565
4735
  const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
4566
4736
  const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
4567
4737
  await ghostMove(cursor, toX, toY, {
@@ -4569,18 +4739,23 @@ function setupFrameHandling(page, forceDebug) {
4569
4739
  overshootThreshold: ghostConfig.overshootThreshold,
4570
4740
  forceDebug
4571
4741
  });
4572
- if (ghostTimeLeft() > 100) {
4742
+ if (ghostTimeLeft() > 100 + clickReserveMs) {
4573
4743
  await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
4574
4744
  }
4575
4745
  }
4576
4746
  if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
4577
4747
  await ghostRandomMove(cursor, { forceDebug });
4578
4748
  }
4579
- if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
4749
+ // interact_click_count clicks, each to a fresh content-zone point.
4750
+ // The time guard stops early if the budget runs out (raise
4751
+ // ghost_cursor_duration for more).
4752
+ for (let gc = 0; gc < ghostClickCount && ghostTimeLeft() > 100; gc++) {
4580
4753
  const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
4581
4754
  const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
4582
4755
  await ghostClick(cursor, { x: clickX, y: clickY }, {
4583
4756
  hesitate: ghostConfig.hesitate,
4757
+ page,
4758
+ realistic: siteConfig.realistic_click === true,
4584
4759
  forceDebug
4585
4760
  });
4586
4761
  }
@@ -4895,7 +5070,7 @@ function setupFrameHandling(page, forceDebug) {
4895
5070
  // Only add delay if we're continuing with more reloads
4896
5071
  if (i < totalReloads) {
4897
5072
  // Reduce delay for problematic sites
4898
- const adjustedDelay = i > 1 ? Math.min(DEFAULT_DELAY, 2000) : DEFAULT_DELAY;
5073
+ const adjustedDelay = i > 1 ? Math.min(TIMEOUTS.DEFAULT_DELAY, 2000) : TIMEOUTS.DEFAULT_DELAY;
4899
5074
  await fastTimeout(adjustedDelay);
4900
5075
  }
4901
5076
  }
@@ -5099,6 +5274,7 @@ function setupFrameHandling(page, forceDebug) {
5099
5274
  if (!keepBrowserOpen) {
5100
5275
  try {
5101
5276
  untrackPage(page);
5277
+ _inFlightPages.delete(page);
5102
5278
  await page.close();
5103
5279
  if (forceDebug) console.log(formatLogMessage('debug', `Page closed for ${currentUrl}`));
5104
5280
  } catch (pageCloseErr) {
@@ -5199,6 +5375,12 @@ function setupFrameHandling(page, forceDebug) {
5199
5375
  let lastProcessedCount = 0;
5200
5376
  let hangCheckCount = 0;
5201
5377
  let forceRestartFlag = false; // Flag to trigger restart on next iteration
5378
+ // Largest per-URL timeout budget seen across tasks. The hang-check restart
5379
+ // scales to this so it can't false-fire on a legitimately-slow config (high
5380
+ // delay × reload × interact) whose per-URL budget exceeds a flat threshold —
5381
+ // the emergency restart should only fire once the per-URL timeout ITSELF has
5382
+ // had its chance and failed (a true browser hang).
5383
+ let maxPerUrlTimeoutMs = 0;
5202
5384
 
5203
5385
  // Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
5204
5386
  // only colors the [severity] tag; the '[HANG CHECK]' substring was
@@ -5206,6 +5388,48 @@ function setupFrameHandling(page, forceDebug) {
5206
5388
  // entry so the interval callback doesn't re-colorize per tick.
5207
5389
  const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
5208
5390
 
5391
+ // Idle-hang watchdog. Runs only while the scan is stalled (no URL completing).
5392
+ // The probe distinguishes a HUNG renderer from one that's merely NAVIGATING,
5393
+ // which is the key to probing aggressively without false-kills:
5394
+ // - evaluate resolves -> 'alive' -> reset strikes
5395
+ // - evaluate rejects fast (e.g. "Execution context destroyed" mid goto/
5396
+ // reload) -> 'navigating' -> inconclusive: neither
5397
+ // strike nor reset, so a
5398
+ // navigation can NEVER trip
5399
+ // the kill regardless of cadence
5400
+ // - no response within the cap -> 'hung' -> strike
5401
+ // PAGE_HANG_STRIKES_TO_KILL consecutive HUNG probes force-close the page, so the
5402
+ // stuck task's awaits reject and its batch completes instead of waiting out the
5403
+ // full per-URL ceiling. Parallel, guarded against overlap; zero overhead off a stall.
5404
+ let _hangProbeInProgress = false;
5405
+ const probeInFlightPagesForHang = async () => {
5406
+ if (_hangProbeInProgress || _inFlightPages.size === 0) return;
5407
+ _hangProbeInProgress = true;
5408
+ try {
5409
+ await Promise.all([..._inFlightPages.entries()].map(async ([page, info]) => {
5410
+ if (page.isClosed()) { _inFlightPages.delete(page); return; }
5411
+ let verdict;
5412
+ try {
5413
+ verdict = await Promise.race([
5414
+ page.evaluate(() => true).then(() => 'alive', () => 'navigating'),
5415
+ new Promise(r => setTimeout(() => r('hung'), PAGE_HANG_PROBE_TIMEOUT_MS)),
5416
+ ]);
5417
+ } catch { verdict = 'hung'; }
5418
+ if (verdict === 'alive') { info.unresponsiveStrikes = 0; return; }
5419
+ if (verdict === 'navigating') return; // context destroyed mid-nav — not a hang; don't strike or reset
5420
+ // verdict === 'hung' — renderer gave no response within the cap
5421
+ info.unresponsiveStrikes++;
5422
+ if (info.unresponsiveStrikes >= PAGE_HANG_STRIKES_TO_KILL) {
5423
+ console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} Force-closing hung page after ${info.unresponsiveStrikes} unresponsive probes: ${info.url}`));
5424
+ _inFlightPages.delete(page);
5425
+ page.close().catch(() => {}); // stuck task's awaits reject -> task errors -> batch completes
5426
+ }
5427
+ }));
5428
+ } finally {
5429
+ _hangProbeInProgress = false;
5430
+ }
5431
+ };
5432
+
5209
5433
  const hangDetectionInterval = setInterval(() => {
5210
5434
  // Progress check, counter, and forceRestartFlag MUST run regardless of
5211
5435
  // debug mode — previously the entire body was gated on forceDebug, which
@@ -5218,8 +5442,18 @@ function setupFrameHandling(page, forceDebug) {
5218
5442
  if (forceDebug) {
5219
5443
  console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
5220
5444
  }
5221
- if (hangCheckCount >= 5) {
5222
- console.log(formatLogMessage('error', `${HANG_CHECK_TAG} Hung for 2.5 minutes. Triggering emergency browser restart.`));
5445
+ // The faster 15s probe interval below does surgical per-page recovery; this
5446
+ // 30s interval owns only the slower nuclear-restart escalation. Deadline-
5447
+ // aware: the restart only fires once the stall has OUTLASTED the heaviest
5448
+ // in-flight per-URL budget (+ grace) — i.e. the per-URL timeout itself had
5449
+ // its chance and failed, a true hang. A flat threshold (the old 2.5min)
5450
+ // false-fires on legitimately-slow configs (high delay × reload × interact)
5451
+ // whose per-URL budget exceeds it, restarting the browser mid-work. Floor
5452
+ // at 150s so light configs behave exactly as before.
5453
+ // +45s buffer covers the per-URL 8s orphan grace + the 30s tick granularity + slack.
5454
+ const restartAfterMs = Math.max(150000, maxPerUrlTimeoutMs + 45000);
5455
+ if (hangCheckCount * 30000 >= restartAfterMs) {
5456
+ console.log(formatLogMessage('error', `${HANG_CHECK_TAG} No progress for ${Math.round(hangCheckCount * 30)}s (past the ${Math.round(restartAfterMs / 1000)}s per-URL budget). Triggering emergency browser restart.`));
5223
5457
  forceRestartFlag = true; // Set flag instead of exiting
5224
5458
  hangCheckCount = 0; // Reset counter for next cycle
5225
5459
  }
@@ -5241,6 +5475,22 @@ function setupFrameHandling(page, forceDebug) {
5241
5475
  // cleanup, this is belt-and-suspenders in case a future refactor moves them.
5242
5476
  hangDetectionInterval.unref();
5243
5477
 
5478
+ // Fast surgical recovery on its own 15s cadence (the 30s interval above owns
5479
+ // the slower nuclear-restart escalation). Probes in-flight pages only while
5480
+ // progress is stalled and force-closes confirmed-hung ones; clears strikes when
5481
+ // progress resumes so a fresh stall starts from zero. Starts at -1 so the very
5482
+ // first window is grace (processedUrlCount begins at 0).
5483
+ let lastProbeCount = -1;
5484
+ const pageHangProbeInterval = setInterval(() => {
5485
+ if (processedUrlCount === lastProbeCount) {
5486
+ probeInFlightPagesForHang(); // fire-and-forget; self-guarded against overlap
5487
+ } else {
5488
+ for (const info of _inFlightPages.values()) info.unresponsiveStrikes = 0;
5489
+ }
5490
+ lastProbeCount = processedUrlCount;
5491
+ }, PAGE_HANG_PROBE_INTERVAL_MS);
5492
+ pageHangProbeInterval.unref();
5493
+
5244
5494
  // Process URLs in batches with exception handling
5245
5495
  let siteGroupIndex = 0;
5246
5496
  let currentProxyKey = ''; // Track active proxy config — '' means direct connection
@@ -5525,58 +5775,38 @@ function setupFrameHandling(page, forceDebug) {
5525
5775
  dnsPositiveSkippedHosts.add(taskDomain);
5526
5776
  if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
5527
5777
  // Fall through to navigation -- pre-check "passed" by proxy.
5778
+ } else if (dnsBreaker.isTripped()) {
5779
+ // Resolver is in a refusal storm — pre-checking is futile and only
5780
+ // adds load. Skip the resolve and proceed to navigation (same effect
5781
+ // as a fail-open); no breaker record since no resolve happened.
5782
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check suspended (resolver circuit open) — proceeding: ${taskDomain}`));
5528
5783
  } else {
5529
- const dnsResolve = async () => {
5530
- // resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
5531
- // resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
5532
- // error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
5533
- // unchanged so the outer transient-retry path sees the real
5534
- // resolver code and the negative cache records the right reason.
5535
- // Previously a bare .catch swallowed everything and tried
5536
- // resolve6, which masked transient v4-side errors behind
5537
- // whatever resolve6 ended up reporting.
5538
- // 2s timeout kept as a real safety net — with c-ares off the
5539
- // threadpool it should now rarely fire.
5540
- let timer;
5541
- try {
5542
- const timeoutP = new Promise((_, reject) => {
5543
- timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
5544
- });
5545
- const resolveChain = dnsPromises.resolve4(taskDomain)
5546
- .catch(err => {
5547
- if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
5548
- return dnsPromises.resolve6(taskDomain);
5549
- }
5550
- throw err;
5551
- });
5552
- await Promise.race([resolveChain, timeoutP]);
5553
- } finally {
5554
- if (timer) clearTimeout(timer);
5555
- }
5556
- };
5557
- // c-ares transient codes — retry once so a momentary resolver
5558
- // hiccup doesn't poison the negative cache for 5 minutes.
5559
- // DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
5560
5784
  try {
5561
- try {
5562
- await dnsResolve();
5563
- } catch (firstErr) {
5564
- const code = firstErr && firstErr.code;
5565
- if (DNS_TRANSIENT_ERRORS.has(code) || (firstErr && firstErr.message === 'DNS timeout')) {
5566
- if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
5567
- await dnsResolve();
5568
- } else {
5569
- throw firstErr;
5570
- }
5571
- }
5785
+ // Rotates the lead nameserver per attempt and retries once on a
5786
+ // transient error; rejects with the final error (code intact) on
5787
+ // failure. See lib/dns.js.
5788
+ await dnsResolver.resolveHost(taskDomain, dnsPrecheckTimeoutMs);
5789
+ dnsBreaker.record(false); // resolved OK — resolver healthy
5572
5790
  } catch (dnsErr) {
5573
5791
  const errCode = dnsErr.code || dnsErr.message || 'DNS resolve failed';
5574
- dnsNegativeCacheSet(taskDomain, errCode);
5575
- dnsPrecheckSkips++;
5576
- if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
5577
- return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
5792
+ // Only a definitive "host does not exist / has no address" answer
5793
+ // (ENOTFOUND/ENODATA) justifies dropping the URL. A resolver-level
5794
+ // failure (EREFUSED/ESERVFAIL/ETIMEOUT/ECONNREFUSED/timeout) says
5795
+ // nothing about whether the domain is live — fail open: don't cache,
5796
+ // don't skip, let it proceed to real browser navigation (a genuinely
5797
+ // dead host still fails fast there).
5798
+ if (isNonExistenceError(errCode)) {
5799
+ dnsBreaker.record(false); // resolver answered NXDOMAIN — healthy
5800
+ dnsNegativeCacheSet(taskDomain, errCode);
5801
+ recordDeadDomain(taskDomain, errCode);
5802
+ dnsPrecheckSkips++;
5803
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
5804
+ return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
5805
+ }
5806
+ dnsBreaker.record(true); // resolver error — counts toward tripping the circuit
5807
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check inconclusive (${errCode}) for ${taskDomain} — proceeding (resolver issue, not a dead host)`));
5578
5808
  }
5579
- } // close `else` from domainKnownToResolve shortcut above
5809
+ } // close the resolve `else` (domainKnownToResolve / circuit-open shortcuts above)
5580
5810
  }
5581
5811
  } catch {}
5582
5812
 
@@ -5609,6 +5839,9 @@ function setupFrameHandling(page, forceDebug) {
5609
5839
  + ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
5610
5840
  + 30000
5611
5841
  );
5842
+ // Feed the hang-check restart so it never escalates before this URL's own
5843
+ // timeout could have fired (see maxPerUrlTimeoutMs).
5844
+ if (PER_URL_TIMEOUT_MS > maxPerUrlTimeoutMs) maxPerUrlTimeoutMs = PER_URL_TIMEOUT_MS;
5612
5845
  // Grace period after primary timeout — gives the orphan a chance to
5613
5846
  // finish drainPendingNetTools() and emit "Saving N rules despite page
5614
5847
  // load failure" before we abandon its result. Drain typically completes
@@ -5868,11 +6101,13 @@ function setupFrameHandling(page, forceDebug) {
5868
6101
  } catch (processingError) {
5869
6102
  console.log(formatLogMessage('error', `Critical error: ${processingError.message}`));
5870
6103
  clearInterval(hangDetectionInterval);
6104
+ clearInterval(pageHangProbeInterval);
5871
6105
  throw processingError;
5872
6106
  }
5873
6107
 
5874
- // Clear hang detection interval
6108
+ // Clear hang detection intervals
5875
6109
  clearInterval(hangDetectionInterval);
6110
+ clearInterval(pageHangProbeInterval);
5876
6111
 
5877
6112
  // === POST-SCAN PROCESSING ===
5878
6113
  // Clean up first-party domains and validate results
@@ -5954,7 +6189,6 @@ function setupFrameHandling(page, forceDebug) {
5954
6189
  const totalMatches = results.reduce((sum, r) => sum + (r.rules ? r.rules.length : 0), 0);
5955
6190
 
5956
6191
  // Debug: Show output format being used
5957
- const totalDomainsSkipped = getTotalDomainsSkipped();
5958
6192
  const detectedDomainsCount = getDetectedDomainsCount();
5959
6193
  if (forceDebug) {
5960
6194
  const globalOptions = {
@@ -5969,7 +6203,7 @@ function setupFrameHandling(page, forceDebug) {
5969
6203
  };
5970
6204
  console.log(formatLogMessage('debug', `Output format: ${getFormatDescription(globalOptions)}`));
5971
6205
  console.log(formatLogMessage('debug', `Generated ${outputResult.totalRules} rules from ${outputResult.successfulPageLoads} successful page loads`));
5972
- console.log(formatLogMessage('debug', `Performance: ${totalDomainsSkipped} domains skipped (already detected), ${detectedDomainsCount} unique domains cached`));
6206
+ console.log(formatLogMessage('debug', `Performance: ${detectedDomainsCount} unique domains cached`));
5973
6207
  // Cloudflare cache statistics
5974
6208
  const cloudflareStats = getCacheStats();
5975
6209
  if (cloudflareStats.size > 0) {
@@ -5998,6 +6232,13 @@ function setupFrameHandling(page, forceDebug) {
5998
6232
  }
5999
6233
  console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
6000
6234
  }
6235
+ // Surface circuit-breaker activity in the end-of-scan summary (each trip
6236
+ // also warns in real time). Shown outside forceDebug because a resolver
6237
+ // refusal storm is something the operator should know happened.
6238
+ const dnsBreakerTrips = dnsBreaker.stats().trips;
6239
+ if (dnsBreakerTrips > 0 && !silentMode) {
6240
+ console.log(formatLogMessage('info', `DNS pre-check circuit tripped ${dnsBreakerTrips}× this scan (resolver refusal back-off)`));
6241
+ }
6001
6242
  // Blocked-pattern hit stats. Surfaces which patterns are actually
6002
6243
  // doing work this scan and (by absence) which are stale enough to
6003
6244
  // prune from config. Top 10 by hit count to keep the log scannable
@@ -6200,8 +6441,18 @@ function setupFrameHandling(page, forceDebug) {
6200
6441
  } else if (outputResult.totalRules > 0 && dryRunMode) {
6201
6442
  console.log(messageColors.success('Found') + ` ${outputResult.totalRules} total matches across all URLs`);
6202
6443
  }
6203
- if (totalDomainsSkipped > 0) {
6204
- console.log(messageColors.info('Performance:') + ` ${totalDomainsSkipped} domains skipped (already detected)`);
6444
+ // --show-dead-domains: list hostnames that didn't resolve / were unreachable
6445
+ // this scan (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE/ERR_DNS*).
6446
+ // One host per line so it's greppable for pruning; reason in the trailing column.
6447
+ if (showDeadDomains) {
6448
+ if (_deadDomains.size > 0) {
6449
+ console.log(`\n${messageColors.warn(`Dead domains (${_deadDomains.size}) — did not resolve / unreachable:`)}`);
6450
+ for (const [host, reason] of [..._deadDomains].sort((a, b) => a[0].localeCompare(b[0]))) {
6451
+ console.log(` ${host}\t${reason}`);
6452
+ }
6453
+ } else {
6454
+ console.log(`\n${messageColors.success('Dead domains: none detected')}`);
6455
+ }
6205
6456
  }
6206
6457
  if (ignoreCache && forceDebug) {
6207
6458
  console.log(messageColors.info('Cache:') + ` Smart caching was disabled`);