@fanboynz/network-scanner 3.1.2 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js CHANGED
@@ -9,9 +9,9 @@ const fs = require('fs');
9
9
  const os = require('os');
10
10
  const psl = require('psl');
11
11
  const path = require('path');
12
- const dnsPromises = require('node:dns/promises');
12
+ const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError } = require('./lib/dns');
13
13
  const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
14
- const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
14
+ const { compressMultipleFiles } = require('./lib/compress');
15
15
  const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
16
16
  const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROME_GREASE_BRAND } = require('./lib/fingerprint');
17
17
  const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
@@ -34,9 +34,7 @@ const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore
34
34
  // Graceful exit
35
35
  const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
36
36
  // Whois & Dig
37
- const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
38
- // File compare
39
- const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
37
+ const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve, loadDiskCache, saveDiskCache, setDigResolvers } = require('./lib/nettools');
40
38
  // CDP functionality
41
39
  const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeout } = require('./lib/cdp');
42
40
  // Post-processing cleanup
@@ -57,6 +55,7 @@ const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
57
55
  const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
58
56
  const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
59
57
  const VPN_TAG = messageColors.processing('[vpn]');
58
+ const POPUP_TAG = messageColors.processing('[popup]');
60
59
  // Precomputed colored '[SmartCache]' subsystem prefix — paired with the
61
60
  // same constant in lib/smart-cache.js so debug lines from both files
62
61
  // produce consistently colored output. formatLogMessage only colors the
@@ -68,14 +67,14 @@ const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
68
67
  // Enhanced mouse interaction and page simulation
69
68
  const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
70
69
  // Optional ghost-cursor support for advanced Bezier-based mouse movements
71
- const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
70
+ const { createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
72
71
  // Domain detection cache for performance optimization
73
- const { createGlobalHelpers, getTotalDomainsSkipped, getDetectedDomainsCount } = require('./lib/domain-cache');
72
+ const { createGlobalHelpers, getDetectedDomainsCount } = require('./lib/domain-cache');
74
73
  const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
75
74
  const { clearPersistentCache } = require('./lib/smart-cache');
76
75
  const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
77
76
  // Dry run functionality
78
- const { initializeDryRunCollections, addDryRunMatch, addDryRunNetTools, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
77
+ const { initializeDryRunCollections, addDryRunMatch, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
79
78
  // Enhanced site data clearing functionality
80
79
  const { clearSiteData } = require('./lib/clear_sitedata');
81
80
  // Referrer header generation
@@ -137,6 +136,7 @@ const CONCURRENCY_LIMITS = Object.freeze({
137
136
  // Keep using the imported map directly so the two can never diverge again.
138
137
 
139
138
  const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
139
+ const REALTIME_CLEANUP_BUFFER_MS = 25000; // Buffer added after site delay before realtime window cleanup
140
140
 
141
141
  /**
142
142
  * Detects the installed Puppeteer version dynamically
@@ -181,7 +181,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
181
181
  // purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
182
182
  // and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
183
183
  // automatically — manual purging is no longer needed.
184
- const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
184
+ const { monitorBrowserHealth, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
185
185
 
186
186
  // --- Script Configuration & Constants ---
187
187
  const VERSION = '2.0.33'; // Script version
@@ -191,7 +191,12 @@ const startTime = Date.now();
191
191
 
192
192
  // Initialize domain cache helpers with debug logging if enabled
193
193
  const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
194
- const { isDomainAlreadyDetected, markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
194
+ // Only markDomainAsDetected is used — the global cache feeds the end-of-scan
195
+ // "unique domains cached" stat (getDetectedDomainsCount). The skip-check
196
+ // (isDomainAlreadyDetected) is intentionally not wired in: cross-URL dedup is
197
+ // already handled by nettools' global processed-domain sets, smart-cache, and
198
+ // the per-URL local set, so a cache-level skip would be redundant.
199
+ const { markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
195
200
 
196
201
  // Smart cache will be initialized after config is loaded
197
202
  let smartCache = null;
@@ -232,6 +237,9 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
232
237
  const settingsMap = {
233
238
  output: ['-o', '--output'],
234
239
  max_concurrent: ['--max-concurrent'],
240
+ cleanup_interval: ['--cleanup-interval'],
241
+ resource_cleanup_interval: ['--cleanup-interval'],
242
+ dns: ['--dns'],
235
243
  dns_cache: ['--dns-cache'],
236
244
  cache_requests: ['--cache-requests'],
237
245
  dumpurls: ['--dumpurls'],
@@ -243,20 +251,25 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
243
251
  compress_logs: ['--compress-logs'],
244
252
  debug: ['--debug'],
245
253
  silent: ['--silent'],
246
- verbose: ['--verbose'],
247
254
  headful: ['--headful'],
248
255
  keep_open: ['--keep-open'],
249
256
  dry_run: ['--dry-run'],
250
257
  titles: ['--titles'],
251
258
  sub_domains: ['--sub-domains'],
252
259
  no_interact: ['--no-interact'],
260
+ show_dead_domains: ['--show-dead-domains'],
253
261
  ghost_cursor: ['--ghost-cursor'],
254
262
  plain: ['--plain'],
255
263
  cdp: ['--cdp'],
256
264
  dnsmasq: ['--dnsmasq'],
265
+ dnsmasq_old: ['--dnsmasq-old'],
257
266
  unbound: ['--unbound'],
258
267
  privoxy: ['--privoxy'],
259
268
  pihole: ['--pihole'],
269
+ adblock_rules: ['--adblock-rules'],
270
+ no_dns_precheck: ['--no-dns-precheck'],
271
+ allow_fullscreen: ['--allow-fullscreen'],
272
+ load_extension: ['--load-extension'],
260
273
  eval_on_doc: ['--eval-on-doc'],
261
274
  use_puppeteer_core: ['--use-puppeteer-core'],
262
275
  ignore_cache: ['--ignore-cache'],
@@ -314,7 +327,6 @@ if (compareIndex !== -1 && args[compareIndex + 1]) {
314
327
  }
315
328
 
316
329
 
317
- const forceVerbose = args.includes('--verbose');
318
330
  const forceDebug = args.includes('--debug');
319
331
  const silentMode = args.includes('--silent');
320
332
  const showTitles = args.includes('--titles');
@@ -337,12 +349,16 @@ const disableInteract = args.includes('--no-interact');
337
349
  const globalGhostCursor = args.includes('--ghost-cursor');
338
350
  const plainOutput = args.includes('--plain');
339
351
  const enableCDP = args.includes('--cdp');
340
- const dnsmasqMode = args.includes('--dnsmasq');
341
- const dnsmasqOldMode = args.includes('--dnsmasq-old');
342
- const unboundMode = args.includes('--unbound');
352
+ // These six are reassigned to false by the incompatible-flag validation
353
+ // blocks below (e.g. --dnsmasq + --unbound), so they must be `let` — as
354
+ // `const` that fallback threw "Assignment to constant variable" the moment
355
+ // two conflicting output modes were combined.
356
+ let dnsmasqMode = args.includes('--dnsmasq');
357
+ let dnsmasqOldMode = args.includes('--dnsmasq-old');
358
+ let unboundMode = args.includes('--unbound');
343
359
  const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
344
- const privoxyMode = args.includes('--privoxy');
345
- const piholeMode = args.includes('--pihole');
360
+ let privoxyMode = args.includes('--privoxy');
361
+ let piholeMode = args.includes('--pihole');
346
362
  const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
347
363
  const dryRunMode = args.includes('--dry-run');
348
364
  const compressLogs = args.includes('--compress-logs');
@@ -363,6 +379,25 @@ if (dnsCacheMode) enableDiskCache();
363
379
  const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
364
380
  const dnsPrecheckTimeoutMs = 2000;
365
381
 
382
+ // --show-dead-domains: collect hostnames that are definitively DEAD (do not
383
+ // exist / unreachable) and print them at the end of the scan so they can be
384
+ // pruned. Only hard signals count — NXDOMAIN/ENODATA from the pre-check and
385
+ // ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE from navigation. Transient
386
+ // failures (403/429 blocks, timeouts, Cloudflare challenges) mean the domain is
387
+ // ALIVE and are deliberately excluded. host -> reason (first seen).
388
+ const showDeadDomains = args.includes('--show-dead-domains');
389
+ const _deadDomains = new Map();
390
+ function recordDeadDomain(urlOrHost, reason) {
391
+ // Populate unconditionally — the pre-check skip reads _deadDomains to drop
392
+ // repeat URLs on a host already proven dead this run, which must work whether
393
+ // or not --show-dead-domains is set. The end-of-scan REPORT is separately
394
+ // gated on showDeadDomains, so the flag still controls output, not recording.
395
+ if (!urlOrHost) return;
396
+ let host = urlOrHost;
397
+ try { host = new URL(urlOrHost).hostname; } catch { /* already a bare host */ }
398
+ if (host && !_deadDomains.has(host)) _deadDomains.set(host, reason);
399
+ }
400
+
366
401
  // Per-scan cache of negative DNS lookups. OS resolvers don't always cache
367
402
  // NXDOMAIN responses, and a scan can hit the same dead hostname many times
368
403
  // (different URL paths on the same site). Positive results are left to the
@@ -371,14 +406,67 @@ const dnsPrecheckTimeoutMs = 2000;
371
406
  // of unique dead hosts) can't grow the cache unboundedly. Same pattern as
372
407
  // the rest of the codebase's in-memory caches.
373
408
  const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
374
- const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
375
409
  const DNS_NEGATIVE_CACHE_MAX = 1000;
410
+ // The negative cache holds ONLY definitive non-existence (NXDOMAIN/ENODATA) —
411
+ // resolver errors fail open and never enter it (see the pre-check catch), so
412
+ // persisting it can't silently drop a live host. Opt-in via --dns-cache: dead
413
+ // hosts are remembered for DNS_NEGATIVE_PERSIST_TTL_MS and reloaded next run;
414
+ // otherwise it's a 5-min in-memory-only cache. The persist TTL is deliberately
415
+ // shorter than the dig/whois positive cache (dig 20h / whois 36h): a domain that doesn't exist
416
+ // now MAY get registered, and this is a domain-hunting scanner, so the dead
417
+ // ones are re-checked twice a day rather than trusted for ~a day.
418
+ const DNS_NEGATIVE_PERSIST_TTL_MS = 12 * 60 * 60 * 1000; // 12 hours
419
+ const DNS_NEGATIVE_CACHE_TTL_MS = dnsCacheMode ? DNS_NEGATIVE_PERSIST_TTL_MS : 5 * 60 * 1000;
420
+ const DNS_NEGATIVE_CACHE_FILE = path.join(__dirname, '.dnsnegcache');
421
+ if (dnsCacheMode) {
422
+ // Reuse the dig/whois caches' generic load/save (atomic write, TTL + size
423
+ // bounded). The 'exit' flush is synchronous (writeFileSync) so it fires on
424
+ // any exit path, mirroring nettools' dig/whois flush.
425
+ loadDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX);
426
+ process.on('exit', () => saveDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX));
427
+ }
376
428
  let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
377
429
  let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
378
430
  const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
379
- // c-ares transient codes read-only, hoisted out of the per-task DNS
380
- // pre-check so we don't allocate a fresh Set per URL.
381
- const DNS_TRANSIENT_ERRORS = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);
431
+ // DNS pre-check resolver (rotation + resolution logic lives in lib/dns.js).
432
+ // `--dns <ip[,ip...]>` (or a `dns` setting in .nwssconfig, mapped to the same
433
+ // flag) pins/rotates an explicit resolver list; otherwise the resolv.conf
434
+ // nameservers are rotated. Rotation spreads the c-ares burst so one server
435
+ // (e.g. a flaky ISP resolver) doesn't absorb every query and answer REFUSED.
436
+ const dnsServerIndex = args.findIndex(arg => arg === '--dns');
437
+ const dnsServersOverride = (dnsServerIndex !== -1 && args[dnsServerIndex + 1])
438
+ ? parseDnsServers(args[dnsServerIndex + 1])
439
+ : [];
440
+ const dnsResolver = createRotatingResolver({ servers: dnsServersOverride, forceDebug });
441
+ // Route nettools' dig through the same --dns resolvers (dig otherwise uses the
442
+ // system /etc/resolv.conf, which on a flaky setup times out and silently drops
443
+ // dig-gated domains). Only when --dns is explicitly set.
444
+ if (dnsServersOverride.length > 0) setDigResolvers(dnsServersOverride);
445
+ // Circuit breaker: if resolver errors dominate, suspend the pre-check for a
446
+ // cooldown so a refusal storm doesn't keep hammering a broken resolver (sites
447
+ // still load — a suspended pre-check just proceeds to navigation).
448
+ const dnsBreaker = createDnsCircuitBreaker({ forceDebug });
449
+ if (dnsResolver.pinned && !silentMode) {
450
+ const how = dnsResolver.servers.length === 1 ? 'pinned to' : 'rotating';
451
+ console.log(formatLogMessage('info', `DNS pre-check ${how} ${dnsResolver.servers.join(', ')}`));
452
+ } else if (forceDebug && dnsResolver.rotates) {
453
+ console.log(formatLogMessage('debug', `DNS pre-check rotating ${dnsResolver.servers.length} resolv.conf nameservers: ${dnsResolver.servers.join(', ')}`));
454
+ }
455
+
456
+ // Idle-hang watchdog registry: in-flight main pages, iterable (the
457
+ // browserhealth page trackers are WeakMaps and can't be scanned). Registered
458
+ // when a task starts navigating, removed on completion. The hang check probes
459
+ // these ONLY while global progress is stalled and force-closes any page that is
460
+ // unresponsive across consecutive probes — recovering a single hung URL in ~the
461
+ // hang-check window instead of waiting out its full per-URL ceiling (which is
462
+ // the backstop). Acting only during a stall + requiring unresponsiveness avoids
463
+ // killing a page that's merely slow (a page in a config delay is idle but
464
+ // RESPONDS to a trivial evaluate; a hung one does not). Entries self-heal via
465
+ // isClosed() so timeout/error paths that skip the normal close can't leak.
466
+ const _inFlightPages = new Map(); // page -> { url, unresponsiveStrikes }
467
+ const PAGE_HANG_PROBE_TIMEOUT_MS = 2000; // liveness-probe (page.evaluate) cap; no response within this = hung
468
+ const PAGE_HANG_PROBE_INTERVAL_MS = 15000; // how often to probe in-flight pages while the scan is stalled
469
+ const PAGE_HANG_STRIKES_TO_KILL = 2; // consecutive HUNG probes before force-close (~30s recovery at the 15s interval)
382
470
 
383
471
  function dnsNegativeCacheSet(hostname, error) {
384
472
  if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
@@ -632,6 +720,9 @@ if (blockAdsIndex !== -1) {
632
720
 
633
721
  adblockEnabled = true;
634
722
  const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
723
+ // Only ever assigned the os.tmpdir() path below — never a user file — so the
724
+ // unlink in finally can never touch the caller's own lists.
725
+ let combinedTmpFile = null;
635
726
  try {
636
727
  if (engine === adblockRust) {
637
728
  // Rust wrapper accepts an array directly — no temp file needed.
@@ -640,15 +731,22 @@ if (blockAdsIndex !== -1) {
640
731
  // JS engine takes a single path; concat to a temp file when multiple lists.
641
732
  let rulesFile = rulesFiles[0];
642
733
  if (rulesFiles.length > 1) {
643
- rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
734
+ combinedTmpFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
735
+ rulesFile = combinedTmpFile;
644
736
  const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
645
737
  fs.writeFileSync(rulesFile, combined);
646
738
  }
739
+ // parseAdblockRules reads the file synchronously and in full before
740
+ // returning, so the temp copy is safe to remove immediately afterwards.
647
741
  adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
648
742
  }
649
743
  } catch (err) {
650
744
  console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
651
745
  process.exit(1);
746
+ } finally {
747
+ if (combinedTmpFile) {
748
+ try { fs.unlinkSync(combinedTmpFile); } catch { /* best effort — OS reaps tmpdir */ }
749
+ }
652
750
  }
653
751
  const stats = adblockMatcher.getStats();
654
752
  const ruleDesc = stats.total != null
@@ -691,7 +789,6 @@ Per-config settings file (.nwssconfig):
691
789
  See README.md for format details.
692
790
 
693
791
  General Options:
694
- --verbose Force verbose mode globally
695
792
  --debug Force debug mode globally
696
793
  --silent Suppress normal console logs
697
794
  --titles Add ! <url> title before each site's group
@@ -721,10 +818,16 @@ General Options:
721
818
 
722
819
  Validation Options:
723
820
  --cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
724
- --dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each)
821
+ --dns <ip[,ip,...]> Resolver(s) for the DNS pre-check AND nettools' dig (not Chrome nav / whois).
822
+ One pins all queries to it; several rotate per query. Overrides /etc/resolv.conf.
823
+ --dns-cache Persist dig/whois results to disk between runs (dig 20h / whois 36h TTL, 2000-entry cap each),
824
+ plus the DNS pre-check negative cache (NXDOMAIN only, 12h TTL, .dnsnegcache)
725
825
  --no-dns-precheck Disable per-URL DNS resolution check before page navigation.
726
826
  By default, URLs whose hostname doesn't resolve are skipped
727
827
  immediately (saves ~5-15s of Puppeteer time per dead host).
828
+ --show-dead-domains At end of scan, list hostnames that did not resolve / were
829
+ unreachable (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
830
+ Excludes blocks/timeouts (those mean the domain is alive). For pruning.
728
831
  --validate-config Validate config.json file and exit
729
832
  --validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
730
833
  --clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
@@ -741,7 +844,7 @@ Global config.json options:
741
844
  ignore_similar: true/false Ignore domains similar to already found domains (default: true)
742
845
  ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
743
846
  ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
744
- max_concurrent_sites: 8 Maximum concurrent site processing (1-50, default: 8)
847
+ max_concurrent_sites: 6 Maximum concurrent site processing (1-50, default: 6)
745
848
  resource_cleanup_interval: 80 Browser restart interval in URLs processed (1-1000, default: 80)
746
849
  disable_ad_tagging: true/false Disable Chrome AdTagging to prevent ad frame throttling (default: true)
747
850
 
@@ -752,8 +855,7 @@ Per-site config.json options:
752
855
  When true, ALL regex patterns must match the same URL
753
856
 
754
857
  Redirect Handling Options:
755
- follow_redirects: true/false Follow redirects to new domains (default: true)
756
- max_redirects: 10 Maximum number of redirects to follow (default: 10)
858
+ max_redirects: 10 Maximum number of redirects to follow (default: 10; 0 = follow none)
757
859
  js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
758
860
  detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
759
861
  redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
@@ -846,7 +948,7 @@ Advanced Options:
846
948
  whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
847
949
  dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
848
950
  dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
849
- goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"})
951
+ goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "domcontentloaded"})
850
952
  dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
851
953
  digRecordType: "A" DNS record type for dig (default: A)
852
954
 
@@ -1336,6 +1438,7 @@ if (dumpUrls) {
1336
1438
  // Avoids blocking I/O on every intercepted request in debug/dumpurls mode
1337
1439
  const _logBuffers = new Map(); // filePath -> string[]
1338
1440
  const LOG_FLUSH_INTERVAL = 2000; // Flush every 2 seconds
1441
+ const LOG_BUFFER_MAX_RETAINED = 10000; // Cap a file's retry backlog (lines) so a permanently unwritable path can't grow memory unboundedly
1339
1442
  let _logFlushTimer = null;
1340
1443
 
1341
1444
  function bufferedLogWrite(filePath, entry) {
@@ -1348,18 +1451,20 @@ function bufferedLogWrite(filePath, entry) {
1348
1451
 
1349
1452
  function flushLogBuffers() {
1350
1453
  for (const [filePath, entries] of _logBuffers) {
1351
- if (entries.length > 0) {
1352
- try {
1353
- const data = entries.join('');
1354
- entries.length = 0; // Clear buffer immediately
1355
- fs.writeFile(filePath, data, { flag: 'a' }, (err) => {
1356
- if (err) {
1357
- console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${filePath}: ${err.message}`));
1358
- }
1359
- });
1360
- } catch (err) {
1361
- console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${filePath}: ${err.message}`));
1362
- }
1454
+ if (entries.length === 0) continue;
1455
+ try {
1456
+ // Synchronous append on purpose: the batched 2s flush is small, and a
1457
+ // blocking append cannot overlap the next timer tick (it holds the event
1458
+ // loop for its duration), eliminating the interleaved concurrent-append
1459
+ // hazard of the old async fs.writeFile({flag:'a'}). Clear ONLY after the
1460
+ // write succeeds, so a transient failure retries next tick instead of
1461
+ // being silently dropped (the old code cleared before the async write
1462
+ // confirmed). Bounded so a permanently unwritable path can't grow memory.
1463
+ fs.appendFileSync(filePath, entries.join(''));
1464
+ entries.length = 0;
1465
+ } catch (err) {
1466
+ console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${filePath}: ${err.message}`));
1467
+ if (entries.length > LOG_BUFFER_MAX_RETAINED) entries.length = 0;
1363
1468
  }
1364
1469
  }
1365
1470
  }
@@ -1403,21 +1508,29 @@ if (forceDebug && globalComments) {
1403
1508
  * @param {string} url - The URL string to parse.
1404
1509
  * @returns {string} The root domain, or the original hostname if parsing fails (e.g., for IP addresses or invalid URLs), or an empty string on error.
1405
1510
  */
1406
- const _rootDomainCache = new Map();
1407
- function getRootDomain(url) {
1408
- const cached = _rootDomainCache.get(url);
1511
+ // psl.parse memoized by hostname. The request handlers parse the root domain
1512
+ // of EVERY request, and a page hits the same few hosts repeatedly (CDN,
1513
+ // analytics, ad domains) — so a hostname-keyed memo turns almost all of those
1514
+ // into Map hits instead of repeated public-suffix-list lookups. Keyed by
1515
+ // hostname (not full URL) so distinct paths/queries on one host share one
1516
+ // entry: higher hit rate, fewer + shorter keys than a URL-keyed cache.
1517
+ // psl.parse is pure and never throws (malformed input → {domain: null}), so
1518
+ // the catch is defensive only.
1519
+ const _hostRootCache = new Map();
1520
+ function rootDomainForHost(hostname) {
1521
+ if (!hostname) return '';
1522
+ const cached = _hostRootCache.get(hostname);
1409
1523
  if (cached !== undefined) return cached;
1410
- try {
1411
- const { hostname } = new URL(url);
1412
- const parsed = psl.parse(hostname);
1413
- const result = parsed.domain || hostname;
1414
- if (_rootDomainCache.size > 5000) _rootDomainCache.clear();
1415
- _rootDomainCache.set(url, result);
1416
- return result;
1417
- } catch {
1418
- _rootDomainCache.set(url, '');
1419
- return '';
1420
- }
1524
+ let result;
1525
+ try { const parsed = psl.parse(hostname); result = parsed.domain || hostname; }
1526
+ catch { result = hostname; }
1527
+ if (_hostRootCache.size > 5000) _hostRootCache.clear();
1528
+ _hostRootCache.set(hostname, result);
1529
+ return result;
1530
+ }
1531
+ function getRootDomain(url) {
1532
+ try { return rootDomainForHost(new URL(url).hostname); }
1533
+ catch { return ''; }
1421
1534
  }
1422
1535
 
1423
1536
  /**
@@ -1525,7 +1638,12 @@ function matchesDynamicBlock(domain) {
1525
1638
  return _domainOrParentInSet(_dynamicallyBlockedDomains, domain);
1526
1639
  }
1527
1640
 
1528
- function matchesIgnoreDomain(domain, ignorePatterns) {
1641
+ // `_ignorePatterns` is intentionally unused (underscore-marked): every caller
1642
+ // and the grep/curl/nettools/searchstring callback contract pass the ignore
1643
+ // list as a 2nd arg, but the ignore-state actually lives in the module-level
1644
+ // _dynamicallyIgnoredDomains / _ignoreDomainsExact Sets walked below. Kept in
1645
+ // the signature only to preserve that shared call shape.
1646
+ function matchesIgnoreDomain(domain, _ignorePatterns) {
1529
1647
  // Both dynamic and static ignore lists are walked parent-by-parent so a
1530
1648
  // subdomain of an ignored root inherits the ignore. Previously the
1531
1649
  // dynamic check was exact-only, creating an asymmetry: a static-config
@@ -1747,7 +1865,19 @@ function setupFrameHandling(page, forceDebug) {
1747
1865
 
1748
1866
  // Declare userDataDir in outer scope for cleanup access
1749
1867
  let userDataDir = null;
1750
-
1868
+
1869
+ // Browser-level decision (the browser launches once per batch, so this can't
1870
+ // be per-site): only disable Chrome's pop-up blocker when at least one site
1871
+ // actually wants popups captured. A real browser blocks non-gesture
1872
+ // window.open(), so non-popup scans keep the blocker on for stealth.
1873
+ // capture_popups scans turn it off so non-gesture popunders (document-level
1874
+ // onclick / timer SDKs) fire and get captured too — gesture-triggered
1875
+ // popups already work via the synthetic-click path regardless of this flag.
1876
+ const wantPopups = Array.isArray(sites) && sites.some(s => s && s.capture_popups === true);
1877
+ if (wantPopups && forceDebug) {
1878
+ console.log(formatLogMessage('debug', `${POPUP_TAG} capture_popups set — launching with --disable-popup-blocking (non-gesture popunders allowed)`));
1879
+ }
1880
+
1751
1881
  /**
1752
1882
  * Creates a new browser instance with consistent configuration
1753
1883
  * Uses system Chrome and temporary directories to minimize disk usage
@@ -1838,6 +1968,12 @@ function setupFrameHandling(page, forceDebug) {
1838
1968
  // Puppeteer 22.x headless mode optimization
1839
1969
  // Auto-detect best headless mode based on Puppeteer version
1840
1970
  headless: headlessMode,
1971
+ // Bypass TLS cert errors at the browser level (drives CDP
1972
+ // Security.setIgnoreCertificateErrors). Robust on new-headless Chrome,
1973
+ // where the --ignore-certificate-errors *flag* is increasingly ignored.
1974
+ // An ad/tracker scanner must reach self-signed / mismatched-cert ad and
1975
+ // embed domains; we observe traffic, we don't transmit secrets.
1976
+ acceptInsecureCerts: true,
1841
1977
  args: [
1842
1978
  // CRITICAL: Remove automation detection markers
1843
1979
  '--disable-blink-features=AutomationControlled',
@@ -1926,6 +2062,10 @@ function setupFrameHandling(page, forceDebug) {
1926
2062
  '--memory-pressure-off',
1927
2063
  '--max_old_space_size=2048', // V8 heap limit
1928
2064
  '--disable-prompt-on-repost', // Fixes form popup on page reload
2065
+ // Disable Chrome's pop-up blocker (chrome://settings/content/popups)
2066
+ // ONLY when a site wants popups captured — lets non-gesture popunders
2067
+ // fire. Gated so non-popup scans keep the blocker on for stealth.
2068
+ ...(wantPopups ? ['--disable-popup-blocking'] : []),
1929
2069
  ...(keepBrowserOpen ? [] : ['--disable-background-networking']),
1930
2070
  '--no-sandbox',
1931
2071
  '--disable-setuid-sandbox',
@@ -2116,22 +2256,17 @@ function setupFrameHandling(page, forceDebug) {
2116
2256
  bypass_cache
2117
2257
  } = siteConfig;
2118
2258
 
2119
- const allowFirstParty = firstParty === true || firstParty === 1;
2120
- const allowThirdParty = thirdParty === undefined || thirdParty === true || thirdParty === 1;
2121
2259
  const perSiteSubDomains = subDomains === 1 ? true : subDomainsMode;
2122
- const siteLocalhostIP = localhost || null;
2123
- const cloudflarePhishBypass = cloudflare_phish === true;
2124
- const cloudflareBypass = cloudflare_bypass === true;
2125
2260
  // Add redirect and same-page loop protection
2126
- const MAX_REDIRECT_DEPTH = siteConfig.max_redirects || 10;
2261
+ // Number check (not ||) so max_redirects: 0 isn't swallowed as falsy → 10.
2262
+ const MAX_REDIRECT_DEPTH = (typeof siteConfig.max_redirects === 'number' && siteConfig.max_redirects >= 0)
2263
+ ? siteConfig.max_redirects : 10;
2127
2264
  const redirectHistory = new Set();
2128
2265
  let redirectCount = 0;
2129
2266
  const pageLoadHistory = new Map(); // Track same-page reloads
2130
2267
  const MAX_SAME_PAGE_LOADS = 3;
2131
2268
  let currentPageUrl = currentUrl;
2132
2269
 
2133
- const sitePrivoxy = privoxy === true;
2134
- const sitePihole = pihole === true;
2135
2270
  const flowproxyDetection = flowproxy_detection === true;
2136
2271
 
2137
2272
  const evenBlocked = even_blocked === true;
@@ -2298,6 +2433,9 @@ function setupFrameHandling(page, forceDebug) {
2298
2433
 
2299
2434
  // Track page for realtime cleanup
2300
2435
  trackPageForRealtime(page);
2436
+ // Register with the idle-hang watchdog (force-closed if it goes
2437
+ // unresponsive while the whole scan has stalled).
2438
+ _inFlightPages.set(page, { url: currentUrl, unresponsiveStrikes: 0 });
2301
2439
 
2302
2440
  // Mark page as actively processing
2303
2441
  updatePageUsage(page, true);
@@ -2822,12 +2960,27 @@ function setupFrameHandling(page, forceDebug) {
2822
2960
 
2823
2961
  const regexes = getCompiledRegexes(siteConfig.filterRegex);
2824
2962
 
2963
+ // output_regex (optional per-site): extract the rule body from each matched
2964
+ // URL via capture group 1 (or the whole match), so output becomes
2965
+ // ||<capture> (e.g. ||host/script/) instead of ||host^ — lets a stable
2966
+ // folder/file be blocked on a host that also serves legit content. Compiled
2967
+ // silently here; config-load validation (validate_rules) warns on a bad
2968
+ // pattern, so a throw here just disables the feature for this site.
2969
+ // Reuse the memoized regex compiler (same cache as filterRegex) so the
2970
+ // pattern compiles once per unique source, not once per URL. try/catch
2971
+ // because getCompiledRegexes throws on a bad pattern — config-load
2972
+ // validation already warned; a throw here just disables the feature.
2973
+ let outputRegex = null;
2974
+ if (siteConfig.output_regex) {
2975
+ try { outputRegex = getCompiledRegexes(siteConfig.output_regex)[0] || null; } catch (_) { outputRegex = null; }
2976
+ }
2977
+
2825
2978
  // NEW: Get regex_and setting (defaults to false for backward compatibility)
2826
2979
  const useRegexAnd = siteConfig.regex_and === true;
2827
2980
 
2828
2981
  // Parse searchstring patterns using module
2829
2982
  const { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd } = parseSearchStrings(siteConfig.searchstring, siteConfig.searchstring_and);
2830
- const useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring
2983
+ let useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring (reassigned to false below if curl is unavailable)
2831
2984
  let useGrep = siteConfig.grep === true; // Grep can work independently
2832
2985
 
2833
2986
  // Get user agent for curl if needed
@@ -3009,9 +3162,30 @@ function setupFrameHandling(page, forceDebug) {
3009
3162
  * @param {string} fullSubdomain - Full subdomain for cache tracking
3010
3163
  * @param {string} resourceType - Resource type (for --adblock-rules mode)
3011
3164
  */
3012
- function addMatchedDomain(domain, resourceType = null, fullSubdomain = null) {
3165
+ function addMatchedDomain(domain, resourceType = null, fullSubdomain = null, matchedUrl = null) {
3013
3166
  // Use fullSubdomain for cache tracking if provided, otherwise fall back to domain
3014
3167
  const cacheKey = fullSubdomain || domain;
3168
+ // output_regex: derive the rule body from the matched URL. Capture group 1
3169
+ // (or the whole match) becomes the stored key, e.g. "host/script/", which
3170
+ // formatDomain emits as ||host/script/ for adblock and falls back to the
3171
+ // bare host for domain-only formats. All similarity / dedup / smart-cache
3172
+ // logic below still runs on the bare host (domain); only the final stored
3173
+ // key changes. The capture must look like host+path (a '.' before its first '/'),
3174
+ // otherwise we keep the host so a mis-written regex can't emit garbage.
3175
+ let outputKey = domain;
3176
+ if (outputRegex && matchedUrl) {
3177
+ const m = matchedUrl.match(outputRegex);
3178
+ if (m) {
3179
+ const cap = (m[1] != null ? m[1] : m[0]);
3180
+ // Accept only a host+path shape: a '/' with a real host before it
3181
+ // (segment before the first '/' must contain a '.'). Rejects a
3182
+ // capture that accidentally includes the scheme (host part would be
3183
+ // "https:") or a path-only capture with no host — both fall back to
3184
+ // the bare-host ||host^ rule rather than emit garbage.
3185
+ const sl = cap ? cap.indexOf('/') : -1;
3186
+ if (sl > 0 && cap.slice(0, sl).includes('.')) outputKey = cap;
3187
+ }
3188
+ }
3015
3189
  // Check if we should ignore similar domains
3016
3190
  const ignoreSimilarEnabled = siteConfig.ignore_similar !== undefined ? siteConfig.ignore_similar : ignore_similar;
3017
3191
  const similarityThreshold = siteConfig.ignore_similar_threshold || ignore_similar_threshold;
@@ -3113,15 +3287,15 @@ function setupFrameHandling(page, forceDebug) {
3113
3287
  }
3114
3288
 
3115
3289
  if (matchedDomains instanceof Map) {
3116
- if (!matchedDomains.has(domain)) {
3117
- matchedDomains.set(domain, new Set());
3290
+ if (!matchedDomains.has(outputKey)) {
3291
+ matchedDomains.set(outputKey, new Set());
3118
3292
  }
3119
3293
  // Only add the specific resourceType that was matched, not all types for this domain
3120
3294
  if (resourceType) {
3121
- matchedDomains.get(domain).add(resourceType);
3295
+ matchedDomains.get(outputKey).add(resourceType);
3122
3296
  }
3123
3297
  } else {
3124
- matchedDomains.add(domain);
3298
+ matchedDomains.add(outputKey);
3125
3299
  }
3126
3300
  }
3127
3301
 
@@ -3160,12 +3334,17 @@ function setupFrameHandling(page, forceDebug) {
3160
3334
  // fall back to the default rather than silently disabling capture.
3161
3335
  const POPUP_MAX_DEPTH = (() => {
3162
3336
  const v = parseInt(siteConfig.capture_popups_max_depth, 10);
3163
- return Number.isFinite(v) && v > 0 ? v : 2;
3337
+ return Number.isFinite(v) && v > 0 ? v : 4;
3164
3338
  })();
3165
3339
  const POPUP_CAPTURE_WINDOW_MS = (() => {
3166
3340
  const v = parseInt(siteConfig.capture_popups_window_ms, 10);
3167
3341
  return Number.isFinite(v) && v > 0 ? v : 5000;
3168
3342
  })();
3343
+ // interact_popups: click inside captured popups so they cascade to their
3344
+ // next ad/redirect (requires capture_popups — no popups exist otherwise).
3345
+ // Light pass; the request listener catches whatever the clicks surface.
3346
+ const interactPopups = capturePopups && siteConfig.interact_popups === true;
3347
+ const POPUP_INTERACT_CLICKS = 3; // enough to fire popunder/redirect SDKs (incl. SDKs that suppress the 1st/2nd click as warmup) without runaway cascades
3169
3348
 
3170
3349
  if (capturePopups && forceDebug) {
3171
3350
  // One-time setup-time warning if the click prerequisite isn't met.
@@ -3231,8 +3410,7 @@ function setupFrameHandling(page, forceDebug) {
3231
3410
  try {
3232
3411
  const parsedUrl = new URL(checkedUrl);
3233
3412
  fullSubdomain = parsedUrl.hostname;
3234
- const pslResult = psl.parse(fullSubdomain);
3235
- checkedRootDomain = pslResult.domain || fullSubdomain;
3413
+ checkedRootDomain = rootDomainForHost(fullSubdomain);
3236
3414
  } catch (_) { return; }
3237
3415
  if (!checkedRootDomain) return;
3238
3416
 
@@ -3331,7 +3509,7 @@ function setupFrameHandling(page, forceDebug) {
3331
3509
  trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
3332
3510
  } else {
3333
3511
  // No nettools required — regex match alone counts.
3334
- addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
3512
+ addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain, checkedUrl);
3335
3513
  }
3336
3514
  } catch (_) { /* observation-only — never let a popup error escape */ }
3337
3515
  };
@@ -3453,6 +3631,24 @@ function setupFrameHandling(page, forceDebug) {
3453
3631
 
3454
3632
  attachPopupRequestCapture(popupPage, depth);
3455
3633
 
3634
+ // interact_popups: click inside the popup so it can cascade to its next
3635
+ // ad/redirect — popunder/redirect SDKs fire on a document-level click,
3636
+ // and a captured-but-unclicked popup only ever shows its landing URL.
3637
+ // Light pass (POPUP_INTERACT_CLICKS random content-zone clicks), only
3638
+ // on popups shallower than max depth so a clicked popup's spawned child
3639
+ // (depth+1) is still within the capture depth. Fire-and-forget: it must
3640
+ // not block onTargetCreated, and the popup may close/navigate mid-click
3641
+ // (performContentClicks no-ops on a closed page). The request listener
3642
+ // above captures whatever the clicks surface; the close timer bounds it.
3643
+ if (interactPopups && depth < POPUP_MAX_DEPTH && !popupPage.isClosed()) {
3644
+ if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] interact_popups: ${POPUP_INTERACT_CLICKS} content click(s)`));
3645
+ performContentClicks(popupPage, {
3646
+ clicks: POPUP_INTERACT_CLICKS,
3647
+ forceDebug,
3648
+ realistic: siteConfig.realistic_click === true,
3649
+ }).catch(() => {}); // popup is transient — non-fatal
3650
+ }
3651
+
3456
3652
  // Auto-close after the capture window so popups don't pile up.
3457
3653
  const closeTimer = setTimeout(() => {
3458
3654
  try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
@@ -3489,30 +3685,24 @@ function setupFrameHandling(page, forceDebug) {
3489
3685
  try {
3490
3686
  const parsedUrl = new URL(checkedUrl);
3491
3687
  fullSubdomain = parsedUrl.hostname;
3492
- const pslResult = psl.parse(fullSubdomain);
3493
- checkedRootDomain = pslResult.domain || fullSubdomain;
3688
+ checkedRootDomain = rootDomainForHost(fullSubdomain);
3494
3689
  } catch (e) {}
3495
3690
 
3691
+ // Never BLOCK the top-level document (the scanned page OR a main-frame
3692
+ // redirect target). Aborting it makes the navigation never commit (page
3693
+ // stays at about:blank → navigation timeout), silently breaking any
3694
+ // scanned URL that matches our own filter lists (adblock / blocked /
3695
+ // blockDomainsByUrl) — common on adult/pirate/stream domains. This flag
3696
+ // ONLY guards the abort paths below; the request still flows through the
3697
+ // match logic, so a main-frame redirect destination (e.g. a
3698
+ // filecrypt → ad-domain hop) is still captured via filterRegex/dig/whois.
3699
+ // isNavigationRequest is true for sub-frame docs too, so the mainFrame()
3700
+ // check keeps ad iframes blockable.
3701
+ let isMainFrameDoc = false;
3702
+ try { isMainFrameDoc = request.isNavigationRequest() && request.frame() === page.mainFrame(); } catch (_) {}
3703
+
3496
3704
  // Check against ALL first-party domains (original + all redirects)
3497
3705
  const isFirstParty = checkedRootDomain && firstPartyDomains.has(checkedRootDomain);
3498
-
3499
- // Block infinite iframe loops - safely access frame URL
3500
- const frameUrl = (() => {
3501
- try {
3502
- const frame = request.frame();
3503
- return frame ? frame.url() : '';
3504
- } catch (err) {
3505
- return '';
3506
- }
3507
- })();
3508
- if (frameUrl && frameUrl.includes('creative.dmzjmp.com') &&
3509
- checkedUrl.includes('go.dmzjmp.com/api/models')) {
3510
- if (forceDebug) {
3511
- console.log(formatLogMessage('debug', `Blocking potential infinite iframe loop: ${checkedUrl}`));
3512
- }
3513
- request.abort();
3514
- return;
3515
- }
3516
3706
 
3517
3707
  // Enhanced debug logging to show which frame the request came from
3518
3708
  if (forceDebug) {
@@ -3542,7 +3732,7 @@ function setupFrameHandling(page, forceDebug) {
3542
3732
  request.resourceType()
3543
3733
  );
3544
3734
 
3545
- if (result.blocked) {
3735
+ if (result.blocked && !isMainFrameDoc) {
3546
3736
  adblockStats.blocked++;
3547
3737
  if (forceDebug) {
3548
3738
  console.log(formatLogMessage('debug', `${messageColors.blocked('[adblock]')} ${checkedUrl} (${result.reason})`));
@@ -3550,6 +3740,12 @@ function setupFrameHandling(page, forceDebug) {
3550
3740
  request.abort('blockedbyclient');
3551
3741
  return;
3552
3742
  }
3743
+ if (result.blocked && isMainFrameDoc && forceDebug) {
3744
+ // Matched a filter rule but it's the page we're scanning (or a
3745
+ // main-frame redirect target) — allow it (blocking the top-level
3746
+ // document aborts navigation). It still flows through the matcher.
3747
+ console.log(formatLogMessage('debug', `${messageColors.highlight('[adblock]')} top-level document ${checkedUrl} matched (${result.reason}) — allowed (never block the scanned page)`));
3748
+ }
3553
3749
  adblockStats.allowed++;
3554
3750
  } catch (err) { /* Silently continue on adblock errors */ }
3555
3751
  }
@@ -3603,7 +3799,7 @@ function setupFrameHandling(page, forceDebug) {
3603
3799
  // check so domain-based blocks short-circuit without paying the
3604
3800
  // per-URL regex scan. Same abort reason as the static path so
3605
3801
  // request.failure() observers see consistent metadata.
3606
- if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
3802
+ if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain) && !isMainFrameDoc) {
3607
3803
  if (forceDebug) {
3608
3804
  console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
3609
3805
  }
@@ -3618,7 +3814,7 @@ function setupFrameHandling(page, forceDebug) {
3618
3814
  break;
3619
3815
  }
3620
3816
  }
3621
- if (blockedMatchIndex !== -1) {
3817
+ if (blockedMatchIndex !== -1 && !isMainFrameDoc) {
3622
3818
  // Always track the hit (zero-cost on the un-debug path) so the
3623
3819
  // scan-end summary can show which patterns are doing work vs.
3624
3820
  // which are stale and ready to prune. Keyed by pattern.source --
@@ -3658,7 +3854,7 @@ function setupFrameHandling(page, forceDebug) {
3658
3854
  wasBlocked: true
3659
3855
  });
3660
3856
  } else {
3661
- addMatchedDomain(reqDomain, resourceType, fullSubdomain);
3857
+ addMatchedDomain(reqDomain, resourceType, fullSubdomain, reqUrl);
3662
3858
  }
3663
3859
  matchedRegexPatterns.add(evenBlockedRegexPattern);
3664
3860
 
@@ -3836,7 +4032,10 @@ function setupFrameHandling(page, forceDebug) {
3836
4032
  isFirstParty: isFirstParty
3837
4033
  });
3838
4034
  } else {
3839
- addMatchedDomain(reqDomain, resourceType);
4035
+ // Pass null for fullSubdomain (not the in-scope hostname) to keep
4036
+ // this path's dedup key as the root domain exactly as before —
4037
+ // only matchedUrl is new here, for output_regex.
4038
+ addMatchedDomain(reqDomain, resourceType, null, reqUrl);
3840
4039
  }
3841
4040
  if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
3842
4041
  if (siteConfig.verbose === 1) {
@@ -4197,15 +4396,43 @@ function setupFrameHandling(page, forceDebug) {
4197
4396
  try {
4198
4397
  navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage);
4199
4398
  } catch (navErr) {
4200
- // Only retry on genuine timeouts, not chrome-error:// redirects
4399
+ // Only handle genuine timeouts here, not chrome-error:// redirects.
4400
+ // pageUrl === 'about:blank' means the navigation never committed
4401
+ // (server never responded) — treat as a real failure, not a partial
4402
+ // page; only a page that actually reached a URL is worth observing.
4201
4403
  let pageUrl = '';
4202
4404
  try { if (!page.isClosed()) pageUrl = page.url(); } catch {}
4203
4405
  const isPopupFailure = navErr.message.includes('chrome-error://') || navErr.message.includes('invalid URL') ||
4204
4406
  pageUrl.startsWith('chrome-error://') || pageUrl === 'about:blank';
4205
4407
  if ((navErr.message.includes('timeout') || navErr.message.includes('Timeout')) && !isPopupFailure) {
4206
- if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout, retrying with waitUntil:networkidle2 for ${currentUrl}`));
4207
- const fallbackOptions = { ...gotoOptions, waitUntil: 'networkidle2', timeout: Math.min(timeout, 10000) };
4208
- navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, fallbackOptions, forceDebug, formatLogMessage);
4408
+ // The OLD fallback retried with networkidle2, which is STRICTER than the
4409
+ // domcontentloaded default, so it could never rescue a
4410
+ // domcontentloaded timeout (and Puppeteer 25 has no 'commit', i.e.
4411
+ // nothing more lenient). Two-tier recovery instead:
4412
+ // 1. If the site used a wait STRICTER than domcontentloaded, do one
4413
+ // lenient retry with domcontentloaded (it fires earlier).
4414
+ // 2. Otherwise proceed with the partially-loaded page rather than
4415
+ // discarding the URL — it exists and requests already fired
4416
+ // (captured by page.on('request')); the delay/interact phase
4417
+ // below keeps capturing. Streaming/embed/media pages routinely
4418
+ // never reach DOM-ready (a connection stays open) but their
4419
+ // ad/tracker calls fired early.
4420
+ const primaryWait = gotoOptions.waitUntil || defaultWaitUntil;
4421
+ let recovered = false;
4422
+ if (primaryWait !== 'domcontentloaded') {
4423
+ try {
4424
+ if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout (${primaryWait}), retrying with waitUntil:domcontentloaded for ${currentUrl}`));
4425
+ const fallbackOptions = { ...gotoOptions, waitUntil: 'domcontentloaded', timeout: Math.min(timeout, 15000) };
4426
+ navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, fallbackOptions, forceDebug, formatLogMessage);
4427
+ recovered = true;
4428
+ } catch (_) { /* fall through to proceed-with-partial */ }
4429
+ }
4430
+ if (!recovered) {
4431
+ let partialUrl = currentUrl;
4432
+ try { if (!page.isClosed()) partialUrl = page.url() || currentUrl; } catch {}
4433
+ if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout — proceeding with partially-loaded page for ${currentUrl}`));
4434
+ navigationResult = { finalUrl: partialUrl, redirected: false, redirectChain: [currentUrl], originalUrl: currentUrl, redirectDomains: [], httpStatus: null, cfRay: null };
4435
+ }
4209
4436
  } else {
4210
4437
  throw navErr;
4211
4438
  }
@@ -4475,12 +4702,50 @@ function setupFrameHandling(page, forceDebug) {
4475
4702
  }
4476
4703
  }
4477
4704
  console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
4705
+ // Capture hard "dead domain" navigation errors for --show-dead-domains
4706
+ // (DNS doesn't resolve / host unreachable). Blocks, timeouts and CF
4707
+ // challenges are NOT dead — they're excluded by this match.
4708
+ // Only DEFINITIVE non-existence / unreachable signals — these now drive
4709
+ // the in-scan dead-domain SKIP (not just --show-dead-domains reporting),
4710
+ // so transient DNS errors must NOT match. The bare `ERR_DNS` used to
4711
+ // catch ERR_DNS_TIMED_OUT / ERR_DNS_MALFORMED_RESPONSE / ERR_DNS_SERVER_FAILED
4712
+ // (all transient) — dropped so a slow-DNS blip can't false-skip the
4713
+ // rest of a live host's URLs.
4714
+ const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE/.exec(err.message || '');
4715
+ if (deadNav) {
4716
+ recordDeadDomain(currentUrl, deadNav[0]);
4717
+ // Corroborate-then-persist to the negative cache (.dnsnegcache with
4718
+ // --dns-cache → cross-scan skip; else in-memory). Chrome resolves via
4719
+ // the possibly-flaky SYSTEM resolver, so its ERR_NAME_NOT_RESOLVED may
4720
+ // be a glitch on a LIVE host. Re-confirm via the reliable --dns
4721
+ // resolver and cache ONLY if it ALSO returns a definitive NXDOMAIN.
4722
+ // ERR_ADDRESS_UNREACHABLE is routing (the host resolves), so the
4723
+ // resolve succeeds and it's correctly not cached. Fire-and-forget:
4724
+ // off the critical path; saveDiskCache flushes on exit.
4725
+ if (dnsPrecheckEnabled && deadNav[0] === 'ERR_NAME_NOT_RESOLVED') {
4726
+ let navHost = '';
4727
+ try { navHost = new URL(currentUrl).hostname; } catch {}
4728
+ if (navHost && !/^[\d.:]+$|^\[/.test(navHost) && !dnsNegativeCache.has(navHost)) {
4729
+ dnsResolver.resolveHost(navHost, dnsPrecheckTimeoutMs).then(
4730
+ () => { /* reliable resolver resolves it — system-resolver glitch, do NOT cache */ },
4731
+ (e) => {
4732
+ const code = (e && (e.code || e.message)) || '';
4733
+ if (isNonExistenceError(code)) {
4734
+ dnsNegativeCacheSet(navHost, code);
4735
+ recordDeadDomain(navHost, code);
4736
+ if (forceDebug) console.log(formatLogMessage('debug', `Dead domain confirmed by --dns resolver (${code}) — caching ${navHost} (skips next run with --dns-cache)`));
4737
+ }
4738
+ }
4739
+ ).catch(() => {});
4740
+ }
4741
+ }
4742
+ }
4478
4743
  throw err;
4479
4744
  }
4480
4745
  }
4481
4746
  }
4482
4747
 
4483
- const delayMs = siteConfig.delay || DEFAULT_DELAY;
4748
+ const delayMs = siteConfig.delay || TIMEOUTS.DEFAULT_DELAY;
4484
4749
 
4485
4750
  // Optimized delays for Puppeteer 23.x performance
4486
4751
  const isFastSite = timeout <= TIMEOUTS.FAST_SITE_THRESHOLD;
@@ -4560,8 +4825,21 @@ function setupFrameHandling(page, forceDebug) {
4560
4825
  const ghostStart = Date.now();
4561
4826
  const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
4562
4827
 
4563
- // Time-based Bezier mouse movements runs for ghostDuration ms
4564
- while (ghostTimeLeft() > 200) {
4828
+ // Honor interact_click_count in ghost mode too (built-in default
4829
+ // is 3 — ad SDKs often swallow the 1st/2nd click as warmup). Same
4830
+ // default + 20-cap as the built-in content-click path. 0 when
4831
+ // element clicks are disabled.
4832
+ const ghostClickCount = interactionConfig.includeElementClicks
4833
+ ? Math.min(Math.max(Number(siteConfig.interact_click_count) || 3, 1), 20)
4834
+ : 0;
4835
+ // Reserve part of the duration budget for those clicks so the
4836
+ // movement loop doesn't consume all of ghost_cursor_duration.
4837
+ // Capped at half the budget so movement still happens; raise
4838
+ // ghost_cursor_duration to fit more clicks.
4839
+ const clickReserveMs = Math.min(ghostClickCount * 600, ghostDuration * 0.5);
4840
+
4841
+ // Time-based Bezier mouse movements — runs for the unreserved budget
4842
+ while (ghostTimeLeft() > 200 + clickReserveMs) {
4565
4843
  const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
4566
4844
  const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
4567
4845
  await ghostMove(cursor, toX, toY, {
@@ -4569,18 +4847,23 @@ function setupFrameHandling(page, forceDebug) {
4569
4847
  overshootThreshold: ghostConfig.overshootThreshold,
4570
4848
  forceDebug
4571
4849
  });
4572
- if (ghostTimeLeft() > 100) {
4850
+ if (ghostTimeLeft() > 100 + clickReserveMs) {
4573
4851
  await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
4574
4852
  }
4575
4853
  }
4576
4854
  if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
4577
4855
  await ghostRandomMove(cursor, { forceDebug });
4578
4856
  }
4579
- if (interactionConfig.includeElementClicks && ghostTimeLeft() > 100) {
4857
+ // interact_click_count clicks, each to a fresh content-zone point.
4858
+ // The time guard stops early if the budget runs out (raise
4859
+ // ghost_cursor_duration for more).
4860
+ for (let gc = 0; gc < ghostClickCount && ghostTimeLeft() > 100; gc++) {
4580
4861
  const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
4581
4862
  const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
4582
4863
  await ghostClick(cursor, { x: clickX, y: clickY }, {
4583
4864
  hesitate: ghostConfig.hesitate,
4865
+ page,
4866
+ realistic: siteConfig.realistic_click === true,
4584
4867
  forceDebug
4585
4868
  });
4586
4869
  }
@@ -4895,7 +5178,7 @@ function setupFrameHandling(page, forceDebug) {
4895
5178
  // Only add delay if we're continuing with more reloads
4896
5179
  if (i < totalReloads) {
4897
5180
  // Reduce delay for problematic sites
4898
- const adjustedDelay = i > 1 ? Math.min(DEFAULT_DELAY, 2000) : DEFAULT_DELAY;
5181
+ const adjustedDelay = i > 1 ? Math.min(TIMEOUTS.DEFAULT_DELAY, 2000) : TIMEOUTS.DEFAULT_DELAY;
4899
5182
  await fastTimeout(adjustedDelay);
4900
5183
  }
4901
5184
  }
@@ -5088,7 +5371,7 @@ function setupFrameHandling(page, forceDebug) {
5088
5371
  const safeUrl = currentUrl.replace(/https?:\/\//, '').replace(/[^a-zA-Z0-9]/g, '_').substring(0, 80);
5089
5372
  const filename = `screenshots/${safeUrl}-${timestamp}.png`;
5090
5373
  try {
5091
- if (!fs.existsSync('screenshots')) fs.mkdirSync('screenshots', { recursive: true });
5374
+ fs.mkdirSync('screenshots', { recursive: true }); // recursive:true is a no-op if it already exists
5092
5375
  await page.screenshot({ path: filename, type: 'png', fullPage: true });
5093
5376
  console.log(formatLogMessage('info', `Screenshot saved: ${filename}`));
5094
5377
  } catch (screenshotErr) {
@@ -5099,6 +5382,7 @@ function setupFrameHandling(page, forceDebug) {
5099
5382
  if (!keepBrowserOpen) {
5100
5383
  try {
5101
5384
  untrackPage(page);
5385
+ _inFlightPages.delete(page);
5102
5386
  await page.close();
5103
5387
  if (forceDebug) console.log(formatLogMessage('debug', `Page closed for ${currentUrl}`));
5104
5388
  } catch (pageCloseErr) {
@@ -5199,6 +5483,12 @@ function setupFrameHandling(page, forceDebug) {
5199
5483
  let lastProcessedCount = 0;
5200
5484
  let hangCheckCount = 0;
5201
5485
  let forceRestartFlag = false; // Flag to trigger restart on next iteration
5486
+ // Largest per-URL timeout budget seen across tasks. The hang-check restart
5487
+ // scales to this so it can't false-fire on a legitimately-slow config (high
5488
+ // delay × reload × interact) whose per-URL budget exceeds a flat threshold —
5489
+ // the emergency restart should only fire once the per-URL timeout ITSELF has
5490
+ // had its chance and failed (a true browser hang).
5491
+ let maxPerUrlTimeoutMs = 0;
5202
5492
 
5203
5493
  // Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
5204
5494
  // only colors the [severity] tag; the '[HANG CHECK]' substring was
@@ -5206,6 +5496,48 @@ function setupFrameHandling(page, forceDebug) {
5206
5496
  // entry so the interval callback doesn't re-colorize per tick.
5207
5497
  const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
5208
5498
 
5499
+ // Idle-hang watchdog. Runs only while the scan is stalled (no URL completing).
5500
+ // The probe distinguishes a HUNG renderer from one that's merely NAVIGATING,
5501
+ // which is the key to probing aggressively without false-kills:
5502
+ // - evaluate resolves -> 'alive' -> reset strikes
5503
+ // - evaluate rejects fast (e.g. "Execution context destroyed" mid goto/
5504
+ // reload) -> 'navigating' -> inconclusive: neither
5505
+ // strike nor reset, so a
5506
+ // navigation can NEVER trip
5507
+ // the kill regardless of cadence
5508
+ // - no response within the cap -> 'hung' -> strike
5509
+ // PAGE_HANG_STRIKES_TO_KILL consecutive HUNG probes force-close the page, so the
5510
+ // stuck task's awaits reject and its batch completes instead of waiting out the
5511
+ // full per-URL ceiling. Parallel, guarded against overlap; zero overhead off a stall.
5512
+ let _hangProbeInProgress = false;
5513
+ const probeInFlightPagesForHang = async () => {
5514
+ if (_hangProbeInProgress || _inFlightPages.size === 0) return;
5515
+ _hangProbeInProgress = true;
5516
+ try {
5517
+ await Promise.all([..._inFlightPages.entries()].map(async ([page, info]) => {
5518
+ if (page.isClosed()) { _inFlightPages.delete(page); return; }
5519
+ let verdict;
5520
+ try {
5521
+ verdict = await Promise.race([
5522
+ page.evaluate(() => true).then(() => 'alive', () => 'navigating'),
5523
+ new Promise(r => setTimeout(() => r('hung'), PAGE_HANG_PROBE_TIMEOUT_MS)),
5524
+ ]);
5525
+ } catch { verdict = 'hung'; }
5526
+ if (verdict === 'alive') { info.unresponsiveStrikes = 0; return; }
5527
+ if (verdict === 'navigating') return; // context destroyed mid-nav — not a hang; don't strike or reset
5528
+ // verdict === 'hung' — renderer gave no response within the cap
5529
+ info.unresponsiveStrikes++;
5530
+ if (info.unresponsiveStrikes >= PAGE_HANG_STRIKES_TO_KILL) {
5531
+ console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} Force-closing hung page after ${info.unresponsiveStrikes} unresponsive probes: ${info.url}`));
5532
+ _inFlightPages.delete(page);
5533
+ page.close().catch(() => {}); // stuck task's awaits reject -> task errors -> batch completes
5534
+ }
5535
+ }));
5536
+ } finally {
5537
+ _hangProbeInProgress = false;
5538
+ }
5539
+ };
5540
+
5209
5541
  const hangDetectionInterval = setInterval(() => {
5210
5542
  // Progress check, counter, and forceRestartFlag MUST run regardless of
5211
5543
  // debug mode — previously the entire body was gated on forceDebug, which
@@ -5218,8 +5550,18 @@ function setupFrameHandling(page, forceDebug) {
5218
5550
  if (forceDebug) {
5219
5551
  console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
5220
5552
  }
5221
- if (hangCheckCount >= 5) {
5222
- console.log(formatLogMessage('error', `${HANG_CHECK_TAG} Hung for 2.5 minutes. Triggering emergency browser restart.`));
5553
+ // The faster 15s probe interval below does surgical per-page recovery; this
5554
+ // 30s interval owns only the slower nuclear-restart escalation. Deadline-
5555
+ // aware: the restart only fires once the stall has OUTLASTED the heaviest
5556
+ // in-flight per-URL budget (+ grace) — i.e. the per-URL timeout itself had
5557
+ // its chance and failed, a true hang. A flat threshold (the old 2.5min)
5558
+ // false-fires on legitimately-slow configs (high delay × reload × interact)
5559
+ // whose per-URL budget exceeds it, restarting the browser mid-work. Floor
5560
+ // at 150s so light configs behave exactly as before.
5561
+ // +45s buffer covers the per-URL 8s orphan grace + the 30s tick granularity + slack.
5562
+ const restartAfterMs = Math.max(150000, maxPerUrlTimeoutMs + 45000);
5563
+ if (hangCheckCount * 30000 >= restartAfterMs) {
5564
+ console.log(formatLogMessage('error', `${HANG_CHECK_TAG} No progress for ${Math.round(hangCheckCount * 30)}s (past the ${Math.round(restartAfterMs / 1000)}s per-URL budget). Triggering emergency browser restart.`));
5223
5565
  forceRestartFlag = true; // Set flag instead of exiting
5224
5566
  hangCheckCount = 0; // Reset counter for next cycle
5225
5567
  }
@@ -5241,6 +5583,22 @@ function setupFrameHandling(page, forceDebug) {
5241
5583
  // cleanup, this is belt-and-suspenders in case a future refactor moves them.
5242
5584
  hangDetectionInterval.unref();
5243
5585
 
5586
+ // Fast surgical recovery on its own 15s cadence (the 30s interval above owns
5587
+ // the slower nuclear-restart escalation). Probes in-flight pages only while
5588
+ // progress is stalled and force-closes confirmed-hung ones; clears strikes when
5589
+ // progress resumes so a fresh stall starts from zero. Starts at -1 so the very
5590
+ // first window is grace (processedUrlCount begins at 0).
5591
+ let lastProbeCount = -1;
5592
+ const pageHangProbeInterval = setInterval(() => {
5593
+ if (processedUrlCount === lastProbeCount) {
5594
+ probeInFlightPagesForHang(); // fire-and-forget; self-guarded against overlap
5595
+ } else {
5596
+ for (const info of _inFlightPages.values()) info.unresponsiveStrikes = 0;
5597
+ }
5598
+ lastProbeCount = processedUrlCount;
5599
+ }, PAGE_HANG_PROBE_INTERVAL_MS);
5600
+ pageHangProbeInterval.unref();
5601
+
5244
5602
  // Process URLs in batches with exception handling
5245
5603
  let siteGroupIndex = 0;
5246
5604
  let currentProxyKey = ''; // Track active proxy config — '' means direct connection
@@ -5509,6 +5867,19 @@ function setupFrameHandling(page, forceDebug) {
5509
5867
  // actually starting — wrongly skipping live domains. c-ares isn't
5510
5868
  // threadpool-bound so it's immune to that contention.
5511
5869
  if (dnsPrecheckEnabled && taskDomain && !/^[\d.:]+$|^\[/.test(taskDomain)) {
5870
+ // Already proven dead earlier THIS run — either a pre-check NXDOMAIN or
5871
+ // a prior URL's navigation hit ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE
5872
+ // (recordDeadDomain populates _deadDomains for both). Skip the repeat
5873
+ // instead of paying another fail-open navigation on a multi-URL dead
5874
+ // host (e.g. dlstreams.top?id=39/54/347). In-scan only (NOT persisted):
5875
+ // Chrome resolves via the system resolver, so a nav-level failure could
5876
+ // be a system-resolver glitch on a live host — a false "dead" must not
5877
+ // carry across runs. Cheap: a Map lookup, no DNS resolve.
5878
+ if (_deadDomains.has(taskDomain)) {
5879
+ dnsPrecheckSkips++;
5880
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check: ${taskDomain} already dead this run (${_deadDomains.get(taskDomain)}) — skipping`));
5881
+ return { url: task.url, rules: [], success: false, error: `DNS: ${_deadDomains.get(taskDomain)}`, skipped: true };
5882
+ }
5512
5883
  const cached = dnsNegativeCache.get(taskDomain);
5513
5884
  if (cached && Date.now() - cached.timestamp < DNS_NEGATIVE_CACHE_TTL_MS) {
5514
5885
  dnsPrecheckSkips++;
@@ -5525,58 +5896,38 @@ function setupFrameHandling(page, forceDebug) {
5525
5896
  dnsPositiveSkippedHosts.add(taskDomain);
5526
5897
  if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
5527
5898
  // Fall through to navigation -- pre-check "passed" by proxy.
5899
+ } else if (dnsBreaker.isTripped()) {
5900
+ // Resolver is in a refusal storm — pre-checking is futile and only
5901
+ // adds load. Skip the resolve and proceed to navigation (same effect
5902
+ // as a fail-open); no breaker record since no resolve happened.
5903
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check suspended (resolver circuit open) — proceeding: ${taskDomain}`));
5528
5904
  } else {
5529
- const dnsResolve = async () => {
5530
- // resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
5531
- // resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
5532
- // error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
5533
- // unchanged so the outer transient-retry path sees the real
5534
- // resolver code and the negative cache records the right reason.
5535
- // Previously a bare .catch swallowed everything and tried
5536
- // resolve6, which masked transient v4-side errors behind
5537
- // whatever resolve6 ended up reporting.
5538
- // 2s timeout kept as a real safety net — with c-ares off the
5539
- // threadpool it should now rarely fire.
5540
- let timer;
5541
- try {
5542
- const timeoutP = new Promise((_, reject) => {
5543
- timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
5544
- });
5545
- const resolveChain = dnsPromises.resolve4(taskDomain)
5546
- .catch(err => {
5547
- if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
5548
- return dnsPromises.resolve6(taskDomain);
5549
- }
5550
- throw err;
5551
- });
5552
- await Promise.race([resolveChain, timeoutP]);
5553
- } finally {
5554
- if (timer) clearTimeout(timer);
5555
- }
5556
- };
5557
- // c-ares transient codes — retry once so a momentary resolver
5558
- // hiccup doesn't poison the negative cache for 5 minutes.
5559
- // DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
5560
5905
  try {
5561
- try {
5562
- await dnsResolve();
5563
- } catch (firstErr) {
5564
- const code = firstErr && firstErr.code;
5565
- if (DNS_TRANSIENT_ERRORS.has(code) || (firstErr && firstErr.message === 'DNS timeout')) {
5566
- if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
5567
- await dnsResolve();
5568
- } else {
5569
- throw firstErr;
5570
- }
5571
- }
5906
+ // Rotates the lead nameserver per attempt and retries once on a
5907
+ // transient error; rejects with the final error (code intact) on
5908
+ // failure. See lib/dns.js.
5909
+ await dnsResolver.resolveHost(taskDomain, dnsPrecheckTimeoutMs);
5910
+ dnsBreaker.record(false); // resolved OK — resolver healthy
5572
5911
  } catch (dnsErr) {
5573
5912
  const errCode = dnsErr.code || dnsErr.message || 'DNS resolve failed';
5574
- dnsNegativeCacheSet(taskDomain, errCode);
5575
- dnsPrecheckSkips++;
5576
- if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
5577
- return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
5913
+ // Only a definitive "host does not exist / has no address" answer
5914
+ // (ENOTFOUND/ENODATA) justifies dropping the URL. A resolver-level
5915
+ // failure (EREFUSED/ESERVFAIL/ETIMEOUT/ECONNREFUSED/timeout) says
5916
+ // nothing about whether the domain is live — fail open: don't cache,
5917
+ // don't skip, let it proceed to real browser navigation (a genuinely
5918
+ // dead host still fails fast there).
5919
+ if (isNonExistenceError(errCode)) {
5920
+ dnsBreaker.record(false); // resolver answered NXDOMAIN — healthy
5921
+ dnsNegativeCacheSet(taskDomain, errCode);
5922
+ recordDeadDomain(taskDomain, errCode);
5923
+ dnsPrecheckSkips++;
5924
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
5925
+ return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
5926
+ }
5927
+ dnsBreaker.record(true); // resolver error — counts toward tripping the circuit
5928
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check inconclusive (${errCode}) for ${taskDomain} — proceeding (resolver issue, not a dead host)`));
5578
5929
  }
5579
- } // close `else` from domainKnownToResolve shortcut above
5930
+ } // close the resolve `else` (domainKnownToResolve / circuit-open shortcuts above)
5580
5931
  }
5581
5932
  } catch {}
5582
5933
 
@@ -5609,6 +5960,9 @@ function setupFrameHandling(page, forceDebug) {
5609
5960
  + ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
5610
5961
  + 30000
5611
5962
  );
5963
+ // Feed the hang-check restart so it never escalates before this URL's own
5964
+ // timeout could have fired (see maxPerUrlTimeoutMs).
5965
+ if (PER_URL_TIMEOUT_MS > maxPerUrlTimeoutMs) maxPerUrlTimeoutMs = PER_URL_TIMEOUT_MS;
5612
5966
  // Grace period after primary timeout — gives the orphan a chance to
5613
5967
  // finish drainPendingNetTools() and emit "Saving N rules despite page
5614
5968
  // load failure" before we abandon its result. Drain typically completes
@@ -5868,11 +6222,13 @@ function setupFrameHandling(page, forceDebug) {
5868
6222
  } catch (processingError) {
5869
6223
  console.log(formatLogMessage('error', `Critical error: ${processingError.message}`));
5870
6224
  clearInterval(hangDetectionInterval);
6225
+ clearInterval(pageHangProbeInterval);
5871
6226
  throw processingError;
5872
6227
  }
5873
6228
 
5874
- // Clear hang detection interval
6229
+ // Clear hang detection intervals
5875
6230
  clearInterval(hangDetectionInterval);
6231
+ clearInterval(pageHangProbeInterval);
5876
6232
 
5877
6233
  // === POST-SCAN PROCESSING ===
5878
6234
  // Clean up first-party domains and validate results
@@ -5954,7 +6310,6 @@ function setupFrameHandling(page, forceDebug) {
5954
6310
  const totalMatches = results.reduce((sum, r) => sum + (r.rules ? r.rules.length : 0), 0);
5955
6311
 
5956
6312
  // Debug: Show output format being used
5957
- const totalDomainsSkipped = getTotalDomainsSkipped();
5958
6313
  const detectedDomainsCount = getDetectedDomainsCount();
5959
6314
  if (forceDebug) {
5960
6315
  const globalOptions = {
@@ -5969,7 +6324,7 @@ function setupFrameHandling(page, forceDebug) {
5969
6324
  };
5970
6325
  console.log(formatLogMessage('debug', `Output format: ${getFormatDescription(globalOptions)}`));
5971
6326
  console.log(formatLogMessage('debug', `Generated ${outputResult.totalRules} rules from ${outputResult.successfulPageLoads} successful page loads`));
5972
- console.log(formatLogMessage('debug', `Performance: ${totalDomainsSkipped} domains skipped (already detected), ${detectedDomainsCount} unique domains cached`));
6327
+ console.log(formatLogMessage('debug', `Performance: ${detectedDomainsCount} unique domains cached`));
5973
6328
  // Cloudflare cache statistics
5974
6329
  const cloudflareStats = getCacheStats();
5975
6330
  if (cloudflareStats.size > 0) {
@@ -5998,6 +6353,13 @@ function setupFrameHandling(page, forceDebug) {
5998
6353
  }
5999
6354
  console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
6000
6355
  }
6356
+ // Surface circuit-breaker activity in the end-of-scan summary (each trip
6357
+ // also warns in real time). Shown outside forceDebug because a resolver
6358
+ // refusal storm is something the operator should know happened.
6359
+ const dnsBreakerTrips = dnsBreaker.stats().trips;
6360
+ if (dnsBreakerTrips > 0 && !silentMode) {
6361
+ console.log(formatLogMessage('info', `DNS pre-check circuit tripped ${dnsBreakerTrips}× this scan (resolver refusal back-off)`));
6362
+ }
6001
6363
  // Blocked-pattern hit stats. Surfaces which patterns are actually
6002
6364
  // doing work this scan and (by absence) which are stale enough to
6003
6365
  // prune from config. Top 10 by hit count to keep the log scannable
@@ -6200,8 +6562,18 @@ function setupFrameHandling(page, forceDebug) {
6200
6562
  } else if (outputResult.totalRules > 0 && dryRunMode) {
6201
6563
  console.log(messageColors.success('Found') + ` ${outputResult.totalRules} total matches across all URLs`);
6202
6564
  }
6203
- if (totalDomainsSkipped > 0) {
6204
- console.log(messageColors.info('Performance:') + ` ${totalDomainsSkipped} domains skipped (already detected)`);
6565
+ // --show-dead-domains: list hostnames that didn't resolve / were unreachable
6566
+ // this scan (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
6567
+ // One host per line so it's greppable for pruning; reason in the trailing column.
6568
+ if (showDeadDomains) {
6569
+ if (_deadDomains.size > 0) {
6570
+ console.log(`\n${messageColors.warn(`Dead domains (${_deadDomains.size}) — did not resolve / unreachable:`)}`);
6571
+ for (const [host, reason] of [..._deadDomains].sort((a, b) => a[0].localeCompare(b[0]))) {
6572
+ console.log(` ${host}\t${reason}`);
6573
+ }
6574
+ } else {
6575
+ console.log(`\n${messageColors.success('Dead domains: none detected')}`);
6576
+ }
6205
6577
  }
6206
6578
  if (ignoreCache && forceDebug) {
6207
6579
  console.log(messageColors.info('Cache:') + ` Smart caching was disabled`);