@fanboynz/network-scanner 3.2.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js CHANGED
@@ -9,7 +9,7 @@ const fs = require('fs');
9
9
  const os = require('os');
10
10
  const psl = require('psl');
11
11
  const path = require('path');
12
- const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError } = require('./lib/dns');
12
+ const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError, dohTemplatesForResolvers } = require('./lib/dns');
13
13
  const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
14
14
  const { compressMultipleFiles } = require('./lib/compress');
15
15
  const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
@@ -17,6 +17,7 @@ const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROM
17
17
  const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
18
18
  // Curl functionality (replace searchstring curl handler)
19
19
  const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
20
+ const { runProcess } = require('./lib/spawn-async');
20
21
  // Rule validation
21
22
  const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile, normalizeSiteConfig } = require('./lib/validate_rules');
22
23
  // CF Bypass
@@ -55,6 +56,7 @@ const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
55
56
  const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
56
57
  const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
57
58
  const VPN_TAG = messageColors.processing('[vpn]');
59
+ const POPUP_TAG = messageColors.processing('[popup]');
58
60
  // Precomputed colored '[SmartCache]' subsystem prefix — paired with the
59
61
  // same constant in lib/smart-cache.js so debug lines from both files
60
62
  // produce consistently colored output. formatLogMessage only colors the
@@ -64,7 +66,7 @@ const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
64
66
  // log lines (start/completed). Same cyan as the other monitoring tags.
65
67
  const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
66
68
  // Enhanced mouse interaction and page simulation
67
- const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
69
+ const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove, performTargetedClicks } = require('./lib/interaction');
68
70
  // Optional ghost-cursor support for advanced Bezier-based mouse movements
69
71
  const { createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
70
72
  // Domain detection cache for performance optimization
@@ -240,6 +242,7 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
240
242
  resource_cleanup_interval: ['--cleanup-interval'],
241
243
  dns: ['--dns'],
242
244
  dns_cache: ['--dns-cache'],
245
+ doh_disable: ['--doh-disable'],
243
246
  cache_requests: ['--cache-requests'],
244
247
  dumpurls: ['--dumpurls'],
245
248
  remove_tempfiles: ['--remove-tempfiles'],
@@ -376,7 +379,13 @@ if (dnsCacheMode) enableDiskCache();
376
379
  // Filters NXDOMAIN / unresolvable hostnames in <100ms before paying the
377
380
  // ~5-15s Puppeteer + Cloudflare detection round-trip on each.
378
381
  const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
379
- const dnsPrecheckTimeoutMs = 2000;
382
+ // 4s (was 2s): under a concurrent scan the c-ares UDP burst against the pinned
383
+ // resolvers can take >2s to answer — a tight timeout false-counted those as
384
+ // resolver errors and tripped the circuit breaker. A clean NXDOMAIN still
385
+ // returns fast (the resolver answers immediately), so the higher ceiling only
386
+ // costs time when the resolver is genuinely slow — exactly when we want to wait
387
+ // rather than false-fail. Paired with the resolver's concurrency cap below.
388
+ const dnsPrecheckTimeoutMs = 4000;
380
389
 
381
390
  // --show-dead-domains: collect hostnames that are definitively DEAD (do not
382
391
  // exist / unreachable) and print them at the end of the scan so they can be
@@ -387,7 +396,11 @@ const dnsPrecheckTimeoutMs = 2000;
387
396
  const showDeadDomains = args.includes('--show-dead-domains');
388
397
  const _deadDomains = new Map();
389
398
  function recordDeadDomain(urlOrHost, reason) {
390
- if (!showDeadDomains || !urlOrHost) return;
399
+ // Populate unconditionally: the pre-check skip reads _deadDomains to drop
400
+ // repeat URLs on a host already proven dead this run, which must work whether
401
+ // or not --show-dead-domains is set. The end-of-scan REPORT is separately
402
+ // gated on showDeadDomains, so the flag still controls output, not recording.
403
+ if (!urlOrHost) return;
391
404
  let host = urlOrHost;
392
405
  try { host = new URL(urlOrHost).hostname; } catch { /* already a bare host */ }
393
406
  if (host && !_deadDomains.has(host)) _deadDomains.set(host, reason);
@@ -407,7 +420,7 @@ const DNS_NEGATIVE_CACHE_MAX = 1000;
407
420
  // persisting it can't silently drop a live host. Opt-in via --dns-cache: dead
408
421
  // hosts are remembered for DNS_NEGATIVE_PERSIST_TTL_MS and reloaded next run;
409
422
  // otherwise it's a 5-min in-memory-only cache. The persist TTL is deliberately
410
- // shorter than the dig/whois positive cache (20h): a domain that doesn't exist
423
+ // shorter than the dig/whois positive cache (dig 20h / whois 36h): a domain that doesn't exist
411
424
  // now MAY get registered, and this is a domain-hunting scanner, so the dead
412
425
  // ones are re-checked twice a day rather than trusted for ~a day.
413
426
  const DNS_NEGATIVE_PERSIST_TTL_MS = 12 * 60 * 60 * 1000; // 12 hours
@@ -437,6 +450,31 @@ const dnsResolver = createRotatingResolver({ servers: dnsServersOverride, forceD
437
450
  // system /etc/resolv.conf, which on a flaky setup times out and silently drops
438
451
  // dig-gated domains). Only when --dns is explicitly set.
439
452
  if (dnsServersOverride.length > 0) setDigResolvers(dnsServersOverride);
453
+ // Pin Chrome's NAVIGATION resolver to the same providers via DoH. Chrome
454
+ // ignores --dns for page loads and reads /etc/resolv.conf directly, so a broken
455
+ // system resolver (e.g. one returning REFUSED) can ERR_NAME_NOT_RESOLVED a
456
+ // domain the pre-check already resolved. Mapping --dns to the matching DoH
457
+ // endpoint makes navigation use the pinned provider instead of resolv.conf.
458
+ // 'automatic' mode (not 'secure') so Chrome still falls back to system DNS if
459
+ // DoH is unreachable rather than failing the whole batch. Empty templates when
460
+ // --dns is absent or maps to no known DoH provider — Chrome keeps system DNS.
461
+ //
462
+ // Applied ONLY to direct connections (see createBrowser): when a proxy or VPN
463
+ // is active, the exit/tunnel does the resolution (remote DNS / pushed DNS), so
464
+ // pinning local DoH would be redundant and could resolve geo-split domains to
465
+ // the wrong region. In those modes Chrome defers to the proxy/VPN as before.
466
+ // --doh-disable (default false) opts out of the Chrome DoH pinning entirely —
467
+ // navigation falls back to system resolv.conf even when --dns maps to a known
468
+ // provider. The pre-check and dig still honor --dns. Use it if DoH adds
469
+ // unwanted latency, is blocked on the network, or you specifically want Chrome
470
+ // to resolve via the system path.
471
+ const dohDisabled = args.includes('--doh-disable');
472
+ const chromeDoh = dnsServersOverride.length > 0
473
+ ? dohTemplatesForResolvers(dnsServersOverride)
474
+ : { templates: '', mapped: [], unmapped: [] };
475
+ // anyVpnConfigured and the DoH startup log live inside the main IIFE below:
476
+ // `sites` is destructured from the config later in module load, so referencing
477
+ // it at this point in top-level evaluation would TDZ-throw.
440
478
  // Circuit breaker: if resolver errors dominate, suspend the pre-check for a
441
479
  // cooldown so a refusal storm doesn't keep hammering a broken resolver (sites
442
480
  // still load — a suspended pre-check just proceeds to navigation).
@@ -715,6 +753,9 @@ if (blockAdsIndex !== -1) {
715
753
 
716
754
  adblockEnabled = true;
717
755
  const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
756
+ // Only ever assigned the os.tmpdir() path below — never a user file — so the
757
+ // unlink in finally can never touch the caller's own lists.
758
+ let combinedTmpFile = null;
718
759
  try {
719
760
  if (engine === adblockRust) {
720
761
  // Rust wrapper accepts an array directly — no temp file needed.
@@ -723,15 +764,22 @@ if (blockAdsIndex !== -1) {
723
764
  // JS engine takes a single path; concat to a temp file when multiple lists.
724
765
  let rulesFile = rulesFiles[0];
725
766
  if (rulesFiles.length > 1) {
726
- rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
767
+ combinedTmpFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
768
+ rulesFile = combinedTmpFile;
727
769
  const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
728
770
  fs.writeFileSync(rulesFile, combined);
729
771
  }
772
+ // parseAdblockRules reads the file synchronously and in full before
773
+ // returning, so the temp copy is safe to remove immediately afterwards.
730
774
  adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
731
775
  }
732
776
  } catch (err) {
733
777
  console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
734
778
  process.exit(1);
779
+ } finally {
780
+ if (combinedTmpFile) {
781
+ try { fs.unlinkSync(combinedTmpFile); } catch { /* best effort — OS reaps tmpdir */ }
782
+ }
735
783
  }
736
784
  const stats = adblockMatcher.getStats();
737
785
  const ruleDesc = stats.total != null
@@ -803,9 +851,13 @@ General Options:
803
851
 
804
852
  Validation Options:
805
853
  --cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
806
- --dns <ip[,ip,...]> Resolver(s) for the DNS pre-check AND nettools' dig (not Chrome nav / whois).
807
- One pins all queries to it; several rotate per query. Overrides /etc/resolv.conf.
808
- --dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each),
854
+ --dns <ip[,ip,...]> Resolver(s) for the DNS pre-check, nettools' dig, and when they map to a
855
+ known DoH provider Chrome's page navigation via DoH on direct connections
856
+ (skipped under proxy/VPN; not whois). Overrides /etc/resolv.conf.
857
+ One pins all queries to it; several rotate per query.
858
+ --doh-disable Opt out of the Chrome-navigation DoH pinning (default: off). Chrome then
859
+ resolves via system resolv.conf; --dns still pins the pre-check and dig.
860
+ --dns-cache Persist dig/whois results to disk between runs (dig 20h / whois 36h TTL, 2000-entry cap each),
809
861
  plus the DNS pre-check negative cache (NXDOMAIN only, 12h TTL, .dnsnegcache)
810
862
  --no-dns-precheck Disable per-URL DNS resolution check before page navigation.
811
863
  By default, URLs whose hostname doesn't resolve are skipped
@@ -879,6 +931,9 @@ Redirect Handling Options:
879
931
  source: true/false Save page source HTML after load
880
932
  firstParty: true/false Allow first-party matches (default: false)
881
933
  thirdParty: true/false Allow third-party matches (default: true)
934
+ redirect_first_party: true/false Treat redirect-destination domains as first-party (default: true).
935
+ false keeps redirect targets third-party so filterRegex/dig can match
936
+ them (e.g. capturing an ad/cloak redirect's end domain)
882
937
  screenshot: true/false/\"force\" Capture screenshot (true=on failure, \"force\"=always)
883
938
  headful: true/false Launch browser with GUI for this site
884
939
  fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random"
@@ -916,6 +971,9 @@ Advanced Options:
916
971
  interact_scrolling: true/false Enable scrolling simulation (default: true)
917
972
  interact_clicks: true/false Enable element clicking simulation (default: false)
918
973
  interact_typing: true/false Enable typing simulation (default: false)
974
+ click_elements: ["sel1","sel2"] After load, click these CSS selectors in order, main frame + iframes
975
+ (organic nav / play button). Honors realistic_click + cursor_mode "ghost"; missing skipped
976
+ click_wait: <milliseconds> Per-click: max wait for the element to appear + settle/nav after (default: 5000)
919
977
  cursor_mode: "ghost" Use ghost-cursor Bezier mouse (requires: npm i ghost-cursor)
920
978
  ghost_cursor_speed: <number> Ghost-cursor speed multiplier (default: auto)
921
979
  ghost_cursor_hesitate: <milliseconds> Delay before ghost-cursor clicks (default: 50)
@@ -933,7 +991,7 @@ Advanced Options:
933
991
  whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
934
992
  dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
935
993
  dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
936
- goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"})
994
+ goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "domcontentloaded"})
937
995
  dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
938
996
  digRecordType: "A" DNS record type for dig (default: A)
939
997
 
@@ -1423,6 +1481,7 @@ if (dumpUrls) {
1423
1481
  // Avoids blocking I/O on every intercepted request in debug/dumpurls mode
1424
1482
  const _logBuffers = new Map(); // filePath -> string[]
1425
1483
  const LOG_FLUSH_INTERVAL = 2000; // Flush every 2 seconds
1484
+ const LOG_BUFFER_MAX_RETAINED = 10000; // Cap a file's retry backlog (lines) so a permanently unwritable path can't grow memory unboundedly
1426
1485
  let _logFlushTimer = null;
1427
1486
 
1428
1487
  function bufferedLogWrite(filePath, entry) {
@@ -1435,18 +1494,20 @@ function bufferedLogWrite(filePath, entry) {
1435
1494
 
1436
1495
  function flushLogBuffers() {
1437
1496
  for (const [filePath, entries] of _logBuffers) {
1438
- if (entries.length > 0) {
1439
- try {
1440
- const data = entries.join('');
1441
- entries.length = 0; // Clear buffer immediately
1442
- fs.writeFile(filePath, data, { flag: 'a' }, (err) => {
1443
- if (err) {
1444
- console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${filePath}: ${err.message}`));
1445
- }
1446
- });
1447
- } catch (err) {
1448
- console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${filePath}: ${err.message}`));
1449
- }
1497
+ if (entries.length === 0) continue;
1498
+ try {
1499
+ // Synchronous append on purpose: the batched 2s flush is small, and a
1500
+ // blocking append cannot overlap the next timer tick (it holds the event
1501
+ // loop for its duration), eliminating the interleaved concurrent-append
1502
+ // hazard of the old async fs.writeFile({flag:'a'}). Clear ONLY after the
1503
+ // write succeeds, so a transient failure retries next tick instead of
1504
+ // being silently dropped (the old code cleared before the async write
1505
+ // confirmed). Bounded so a permanently unwritable path can't grow memory.
1506
+ fs.appendFileSync(filePath, entries.join(''));
1507
+ entries.length = 0;
1508
+ } catch (err) {
1509
+ console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${filePath}: ${err.message}`));
1510
+ if (entries.length > LOG_BUFFER_MAX_RETAINED) entries.length = 0;
1450
1511
  }
1451
1512
  }
1452
1513
  }
@@ -1490,21 +1551,29 @@ if (forceDebug && globalComments) {
1490
1551
  * @param {string} url - The URL string to parse.
1491
1552
  * @returns {string} The root domain, or the original hostname if parsing fails (e.g., for IP addresses or invalid URLs), or an empty string on error.
1492
1553
  */
1493
- const _rootDomainCache = new Map();
1494
- function getRootDomain(url) {
1495
- const cached = _rootDomainCache.get(url);
1554
+ // psl.parse memoized by hostname. The request handlers parse the root domain
1555
+ // of EVERY request, and a page hits the same few hosts repeatedly (CDN,
1556
+ // analytics, ad domains) — so a hostname-keyed memo turns almost all of those
1557
+ // into Map hits instead of repeated public-suffix-list lookups. Keyed by
1558
+ // hostname (not full URL) so distinct paths/queries on one host share one
1559
+ // entry: higher hit rate, fewer + shorter keys than a URL-keyed cache.
1560
+ // psl.parse is pure and never throws (malformed input → {domain: null}), so
1561
+ // the catch is defensive only.
1562
+ const _hostRootCache = new Map();
1563
+ function rootDomainForHost(hostname) {
1564
+ if (!hostname) return '';
1565
+ const cached = _hostRootCache.get(hostname);
1496
1566
  if (cached !== undefined) return cached;
1497
- try {
1498
- const { hostname } = new URL(url);
1499
- const parsed = psl.parse(hostname);
1500
- const result = parsed.domain || hostname;
1501
- if (_rootDomainCache.size > 5000) _rootDomainCache.clear();
1502
- _rootDomainCache.set(url, result);
1503
- return result;
1504
- } catch {
1505
- _rootDomainCache.set(url, '');
1506
- return '';
1507
- }
1567
+ let result;
1568
+ try { const parsed = psl.parse(hostname); result = parsed.domain || hostname; }
1569
+ catch { result = hostname; }
1570
+ if (_hostRootCache.size > 5000) _hostRootCache.clear();
1571
+ _hostRootCache.set(hostname, result);
1572
+ return result;
1573
+ }
1574
+ function getRootDomain(url) {
1575
+ try { return rootDomainForHost(new URL(url).hostname); }
1576
+ catch { return ''; }
1508
1577
  }
1509
1578
 
1510
1579
  /**
@@ -1839,7 +1908,33 @@ function setupFrameHandling(page, forceDebug) {
1839
1908
 
1840
1909
  // Declare userDataDir in outer scope for cleanup access
1841
1910
  let userDataDir = null;
1842
-
1911
+
1912
+ // Browser-level decision (the browser launches once per batch, so this can't
1913
+ // be per-site): only disable Chrome's pop-up blocker when at least one site
1914
+ // actually wants popups captured. A real browser blocks non-gesture
1915
+ // window.open(), so non-popup scans keep the blocker on for stealth.
1916
+ // capture_popups scans turn it off so non-gesture popunders (document-level
1917
+ // onclick / timer SDKs) fire and get captured too — gesture-triggered
1918
+ // popups already work via the synthetic-click path regardless of this flag.
1919
+ const wantPopups = Array.isArray(sites) && sites.some(s => s && s.capture_popups === true);
1920
+ if (wantPopups && forceDebug) {
1921
+ console.log(formatLogMessage('debug', `${POPUP_TAG} capture_popups set — launching with --disable-popup-blocking (non-gesture popunders allowed)`));
1922
+ }
1923
+
1924
+ // DoH gate: any VPN site disables Chrome DoH (the tunnel resolves). Computed
1925
+ // here (not at module top) because `sites` is only initialized by this point.
1926
+ // Read by createBrowser's launch args; the startup log reports the decision.
1927
+ const anyVpnConfigured = Array.isArray(sites) && sites.some(s => s && (s.vpn || s.openvpn));
1928
+ if (dnsServersOverride.length > 0 && !silentMode) {
1929
+ if (dohDisabled) {
1930
+ console.log(formatLogMessage('info', `Chrome DoH disabled via --doh-disable — navigation uses system resolv.conf; --dns still pins the pre-check and dig.`));
1931
+ } else if (chromeDoh.templates) {
1932
+ console.log(formatLogMessage('info', `Chrome navigation will use DoH (automatic) on direct connections: ${chromeDoh.templates}${anyVpnConfigured ? ' — VPN configured, so it defers to VPN resolution' : ' — deferred to proxy resolution on proxied sites'}`));
1933
+ } else {
1934
+ console.warn(formatLogMessage('warn', `--dns servers (${chromeDoh.unmapped.join(', ')}) have no known DoH endpoint — Chrome navigation stays on system resolv.conf; only the pre-check and dig are pinned. Known providers: Google, Cloudflare, Quad9, OpenDNS, AdGuard, CleanBrowsing, DNS.SB, Mullvad.`));
1935
+ }
1936
+ }
1937
+
1843
1938
  /**
1844
1939
  * Creates a new browser instance with consistent configuration
1845
1940
  * Uses system Chrome and temporary directories to minimize disk usage
@@ -1930,6 +2025,12 @@ function setupFrameHandling(page, forceDebug) {
1930
2025
  // Puppeteer 22.x headless mode optimization
1931
2026
  // Auto-detect best headless mode based on Puppeteer version
1932
2027
  headless: headlessMode,
2028
+ // Bypass TLS cert errors at the browser level (drives CDP
2029
+ // Security.setIgnoreCertificateErrors). Robust on new-headless Chrome,
2030
+ // where the --ignore-certificate-errors *flag* is increasingly ignored.
2031
+ // An ad/tracker scanner must reach self-signed / mismatched-cert ad and
2032
+ // embed domains; we observe traffic, we don't transmit secrets.
2033
+ acceptInsecureCerts: true,
1933
2034
  args: [
1934
2035
  // CRITICAL: Remove automation detection markers
1935
2036
  '--disable-blink-features=AutomationControlled',
@@ -1941,6 +2042,19 @@ function setupFrameHandling(page, forceDebug) {
1941
2042
  '--use-mock-keychain',
1942
2043
  '--disable-client-side-phishing-detection',
1943
2044
  '--enable-features=NetworkService',
2045
+ // DoH for Chrome's navigation resolver when --dns maps to a known
2046
+ // provider — but ONLY on direct connections. A proxied launch carries
2047
+ // a --proxy-server in extraArgs and does its own (remote) DNS; a VPN
2048
+ // tunnels resolution. In both cases local DoH is redundant and could
2049
+ // resolve geo-split domains to the wrong region, so it's skipped and
2050
+ // Chrome defers to the proxy/VPN. 'automatic' keeps a system-DNS
2051
+ // fallback if DoH is unreachable. Flags omitted when not applicable.
2052
+ ...((chromeDoh.templates
2053
+ && !dohDisabled
2054
+ && !anyVpnConfigured
2055
+ && !extraArgs.some(a => typeof a === 'string' && a.startsWith('--proxy-server')))
2056
+ ? ['--dns-over-https-mode=automatic', `--dns-over-https-templates=${chromeDoh.templates}`]
2057
+ : []),
1944
2058
  // Disk space controls - minimal cache for scanning workloads
1945
2059
  `--disk-cache-size=${CACHE_LIMITS.DISK_CACHE_SIZE}`,
1946
2060
  `--media-cache-size=${CACHE_LIMITS.MEDIA_CACHE_SIZE}`,
@@ -2018,6 +2132,10 @@ function setupFrameHandling(page, forceDebug) {
2018
2132
  '--memory-pressure-off',
2019
2133
  '--max_old_space_size=2048', // V8 heap limit
2020
2134
  '--disable-prompt-on-repost', // Fixes form popup on page reload
2135
+ // Disable Chrome's pop-up blocker (chrome://settings/content/popups)
2136
+ // ONLY when a site wants popups captured — lets non-gesture popunders
2137
+ // fire. Gated so non-popup scans keep the blocker on for stealth.
2138
+ ...(wantPopups ? ['--disable-popup-blocking'] : []),
2021
2139
  ...(keepBrowserOpen ? [] : ['--disable-background-networking']),
2022
2140
  '--no-sandbox',
2023
2141
  '--disable-setuid-sandbox',
@@ -2420,10 +2538,18 @@ function setupFrameHandling(page, forceDebug) {
2420
2538
  page.setDefaultNavigationTimeout(Math.min(timeout, TIMEOUTS.DEFAULT_NAVIGATION));
2421
2539
  // Aggressive timeouts prevent hanging in Puppeteer 23.x while maintaining speed
2422
2540
 
2423
- page.on('console', (msg) => {
2424
- if (forceDebug && msg.type() === 'error') console.log(formatLogMessage('debug', `Console error: ${msg.text()}`));
2425
- });
2426
-
2541
+ // Only attach a console listener under --debug. Registering ANY 'console'
2542
+ // listener makes Puppeteer enable the CDP Runtime domain, which arms
2543
+ // console-based automation/DevTools traps (e.g. disable-devtool logs an
2544
+ // object with a getter and detects the inspector reading it → redirects
2545
+ // away). The body is a no-op without forceDebug, so attaching it
2546
+ // unconditionally armed that trap for zero benefit.
2547
+ if (forceDebug) {
2548
+ page.on('console', (msg) => {
2549
+ if (msg.type() === 'error') console.log(formatLogMessage('debug', `Console error: ${msg.text()}`));
2550
+ });
2551
+ }
2552
+
2427
2553
  // Add page crash handler
2428
2554
  page.on('error', (err) => {
2429
2555
  if (forceDebug) console.log(formatLogMessage('debug', `Page crashed: ${err.message}`));
@@ -3308,12 +3434,18 @@ function setupFrameHandling(page, forceDebug) {
3308
3434
  // (normalizeSiteConfig now coerces interact: 1 → true with a warning,
3309
3435
  // so by the time we get here both should be booleans — but keep the
3310
3436
  // diagnostic accurate for the truly-missing case.)
3437
+ const hasClickElements = Array.isArray(siteConfig.click_elements) && siteConfig.click_elements.length > 0;
3311
3438
  const interactOn = siteConfig.interact === true;
3312
3439
  const clicksOn = siteConfig.interact_clicks === true;
3313
- if (!interactOn && !clicksOn) {
3314
- console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but neither 'interact' nor 'interact_clicks' is — set BOTH to true to fire user-gesture clicks; without them, only popups opened via in-page redirects will capture`));
3440
+ if (hasClickElements && (!interactOn || !clicksOn)) {
3441
+ // click_elements fires its own trusted gesture clicks, so popups it
3442
+ // triggers capture regardless of interact/interact_clicks. Don't warn
3443
+ // "no clicks fire" — surface the random-click coverage gap instead.
3444
+ console.log(formatLogMessage('debug', `[popup] capture_popups: click_elements supplies targeted gesture clicks (popups they trigger WILL capture). interact=${interactOn}, interact_clicks=${clicksOn} — enable both for random content-zone click coverage of overlay popunders too`));
3445
+ } else if (!interactOn && !clicksOn) {
3446
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but neither 'interact' nor 'interact_clicks' is — set BOTH to true to fire user-gesture clicks; without them, only popups opened via in-page redirects (or click_elements) will capture`));
3315
3447
  } else if (!interactOn) {
3316
- console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no fake clicks fire`));
3448
+ console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no random fake clicks fire`));
3317
3449
  } else if (!clicksOn) {
3318
3450
  console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact_clicks' is not — set interact_clicks: true to enable element-targeted clicks; without it, only random content-zone clicks fire and may miss overlay-based popunders`));
3319
3451
  }
@@ -3362,8 +3494,7 @@ function setupFrameHandling(page, forceDebug) {
3362
3494
  try {
3363
3495
  const parsedUrl = new URL(checkedUrl);
3364
3496
  fullSubdomain = parsedUrl.hostname;
3365
- const pslResult = psl.parse(fullSubdomain);
3366
- checkedRootDomain = pslResult.domain || fullSubdomain;
3497
+ checkedRootDomain = rootDomainForHost(fullSubdomain);
3367
3498
  } catch (_) { return; }
3368
3499
  if (!checkedRootDomain) return;
3369
3500
 
@@ -3638,30 +3769,24 @@ function setupFrameHandling(page, forceDebug) {
3638
3769
  try {
3639
3770
  const parsedUrl = new URL(checkedUrl);
3640
3771
  fullSubdomain = parsedUrl.hostname;
3641
- const pslResult = psl.parse(fullSubdomain);
3642
- checkedRootDomain = pslResult.domain || fullSubdomain;
3772
+ checkedRootDomain = rootDomainForHost(fullSubdomain);
3643
3773
  } catch (e) {}
3644
3774
 
3775
+ // Never BLOCK the top-level document (the scanned page OR a main-frame
3776
+ // redirect target). Aborting it makes the navigation never commit (page
3777
+ // stays at about:blank → navigation timeout), silently breaking any
3778
+ // scanned URL that matches our own filter lists (adblock / blocked /
3779
+ // blockDomainsByUrl) — common on adult/pirate/stream domains. This flag
3780
+ // ONLY guards the abort paths below; the request still flows through the
3781
+ // match logic, so a main-frame redirect destination (e.g. a
3782
+ // filecrypt → ad-domain hop) is still captured via filterRegex/dig/whois.
3783
+ // isNavigationRequest is true for sub-frame docs too, so the mainFrame()
3784
+ // check keeps ad iframes blockable.
3785
+ let isMainFrameDoc = false;
3786
+ try { isMainFrameDoc = request.isNavigationRequest() && request.frame() === page.mainFrame(); } catch (_) {}
3787
+
3645
3788
  // Check against ALL first-party domains (original + all redirects)
3646
3789
  const isFirstParty = checkedRootDomain && firstPartyDomains.has(checkedRootDomain);
3647
-
3648
- // Block infinite iframe loops - safely access frame URL
3649
- const frameUrl = (() => {
3650
- try {
3651
- const frame = request.frame();
3652
- return frame ? frame.url() : '';
3653
- } catch (err) {
3654
- return '';
3655
- }
3656
- })();
3657
- if (frameUrl && frameUrl.includes('creative.dmzjmp.com') &&
3658
- checkedUrl.includes('go.dmzjmp.com/api/models')) {
3659
- if (forceDebug) {
3660
- console.log(formatLogMessage('debug', `Blocking potential infinite iframe loop: ${checkedUrl}`));
3661
- }
3662
- request.abort();
3663
- return;
3664
- }
3665
3790
 
3666
3791
  // Enhanced debug logging to show which frame the request came from
3667
3792
  if (forceDebug) {
@@ -3691,7 +3816,7 @@ function setupFrameHandling(page, forceDebug) {
3691
3816
  request.resourceType()
3692
3817
  );
3693
3818
 
3694
- if (result.blocked) {
3819
+ if (result.blocked && !isMainFrameDoc) {
3695
3820
  adblockStats.blocked++;
3696
3821
  if (forceDebug) {
3697
3822
  console.log(formatLogMessage('debug', `${messageColors.blocked('[adblock]')} ${checkedUrl} (${result.reason})`));
@@ -3699,6 +3824,12 @@ function setupFrameHandling(page, forceDebug) {
3699
3824
  request.abort('blockedbyclient');
3700
3825
  return;
3701
3826
  }
3827
+ if (result.blocked && isMainFrameDoc && forceDebug) {
3828
+ // Matched a filter rule but it's the page we're scanning (or a
3829
+ // main-frame redirect target) — allow it (blocking the top-level
3830
+ // document aborts navigation). It still flows through the matcher.
3831
+ console.log(formatLogMessage('debug', `${messageColors.highlight('[adblock]')} top-level document ${checkedUrl} matched (${result.reason}) — allowed (never block the scanned page)`));
3832
+ }
3702
3833
  adblockStats.allowed++;
3703
3834
  } catch (err) { /* Silently continue on adblock errors */ }
3704
3835
  }
@@ -3752,7 +3883,7 @@ function setupFrameHandling(page, forceDebug) {
3752
3883
  // check so domain-based blocks short-circuit without paying the
3753
3884
  // per-URL regex scan. Same abort reason as the static path so
3754
3885
  // request.failure() observers see consistent metadata.
3755
- if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
3886
+ if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain) && !isMainFrameDoc) {
3756
3887
  if (forceDebug) {
3757
3888
  console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
3758
3889
  }
@@ -3767,7 +3898,7 @@ function setupFrameHandling(page, forceDebug) {
3767
3898
  break;
3768
3899
  }
3769
3900
  }
3770
- if (blockedMatchIndex !== -1) {
3901
+ if (blockedMatchIndex !== -1 && !isMainFrameDoc) {
3771
3902
  // Always track the hit (zero-cost on the un-debug path) so the
3772
3903
  // scan-end summary can show which patterns are doing work vs.
3773
3904
  // which are stale and ready to prune. Keyed by pattern.source --
@@ -4349,15 +4480,114 @@ function setupFrameHandling(page, forceDebug) {
4349
4480
  try {
4350
4481
  navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage);
4351
4482
  } catch (navErr) {
4352
- // Only retry on genuine timeouts, not chrome-error:// redirects
4483
+ // Timeouts are recovered here only when genuine (not chrome-error:// redirects); ERR_TOO_MANY_REDIRECTS has its own branch below.
4484
+ // pageUrl === 'about:blank' means the navigation never committed
4485
+ // (server never responded) — treat as a real failure, not a partial
4486
+ // page; only a page that actually reached a URL is worth observing.
4353
4487
  let pageUrl = '';
4354
4488
  try { if (!page.isClosed()) pageUrl = page.url(); } catch {}
4355
4489
  const isPopupFailure = navErr.message.includes('chrome-error://') || navErr.message.includes('invalid URL') ||
4356
4490
  pageUrl.startsWith('chrome-error://') || pageUrl === 'about:blank';
4357
4491
  if ((navErr.message.includes('timeout') || navErr.message.includes('Timeout')) && !isPopupFailure) {
4358
- if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout, retrying with waitUntil:networkidle2 for ${currentUrl}`));
4359
- const fallbackOptions = { ...gotoOptions, waitUntil: 'networkidle2', timeout: Math.min(timeout, 10000) };
4360
- navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, fallbackOptions, forceDebug, formatLogMessage);
4492
+ // The OLD fallback retried with networkidle2, which is STRICTER than the
4493
+ // domcontentloaded default, so it could never rescue a
4494
+ // domcontentloaded timeout (and Puppeteer 25 has no 'commit', i.e.
4495
+ // nothing more lenient). Two-tier recovery instead:
4496
+ // 1. If the site used a wait STRICTER than domcontentloaded, do one
4497
+ // lenient retry with domcontentloaded (it fires earlier).
4498
+ // 2. Otherwise proceed with the partially-loaded page rather than
4499
+ // discarding the URL — it exists and requests already fired
4500
+ // (captured by page.on('request')); the delay/interact phase
4501
+ // below keeps capturing. Streaming/embed/media pages routinely
4502
+ // never reach DOM-ready (a connection stays open) but their
4503
+ // ad/tracker calls fired early.
4504
+ const primaryWait = gotoOptions.waitUntil || defaultWaitUntil;
4505
+ let recovered = false;
4506
+ if (primaryWait !== 'domcontentloaded') {
4507
+ try {
4508
+ if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout (${primaryWait}), retrying with waitUntil:domcontentloaded for ${currentUrl}`));
4509
+ const fallbackOptions = { ...gotoOptions, waitUntil: 'domcontentloaded', timeout: Math.min(timeout, 15000) };
4510
+ navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, fallbackOptions, forceDebug, formatLogMessage);
4511
+ recovered = true;
4512
+ } catch (_) { /* fall through to proceed-with-partial */ }
4513
+ }
4514
+ if (!recovered) {
4515
+ let partialUrl = currentUrl;
4516
+ try { if (!page.isClosed()) partialUrl = page.url() || currentUrl; } catch {}
4517
+ if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout — proceeding with partially-loaded page for ${currentUrl}`));
4518
+ navigationResult = { finalUrl: partialUrl, redirected: false, redirectChain: [currentUrl], originalUrl: currentUrl, redirectDomains: [], httpStatus: null, cfRay: null };
4519
+ }
4520
+ } else if (navErr.message.includes('ERR_TOO_MANY_REDIRECTS')) {
4521
+ // Redirect-cloaking chain exceeded Chrome's ~20-hop per-navigation
4522
+ // ceiling, so goto() rejected. Two recovery paths — they cover
4523
+ // opposite cases run-to-run, so try both:
4524
+ // 1. Browser ride-through (free): a JS/meta hop on a committed
4525
+ // intermediate page resets Chrome's counter and carries the page
4526
+ // to the end site on its own. Check if it already happened, else
4527
+ // wait briefly for it.
4528
+ // 2. curl-resolve (fallback, only if the page parked on
4529
+ // chrome-error): curl follows the chain (it gets the real chain,
4530
+ // not headless Chrome's endless loop) to the JS-handoff page;
4531
+ // navigating there directly is a SHORT hop that reaches the end
4532
+ // site. Skipped under proxy/VPN — curl runs DIRECT from the host
4533
+ // and would leak the real IP / resolve from the wrong network.
4534
+ // If neither reaches a real page, keep the chain requests already
4535
+ // captured (grouped under the original URL, never chrome-error).
4536
+ let landedUrl = '';
4537
+ const isRealPage = (u) => !!u && /^https?:\/\//.test(u) && !u.startsWith('chrome-error://') && u !== currentUrl;
4538
+
4539
+ // 1) Browser ride-through — may have completed during goto(); if not,
4540
+ // wait for the next navigation(s) to carry it through.
4541
+ try { if (!page.isClosed() && isRealPage(page.url())) landedUrl = page.url(); } catch {}
4542
+ for (let r = 0; r < 3 && !landedUrl; r++) {
4543
+ try {
4544
+ await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 8000 });
4545
+ if (!page.isClosed() && isRealPage(page.url())) landedUrl = page.url();
4546
+ } catch { break; } // no further navigation — stop waiting
4547
+ }
4548
+ if (landedUrl && forceDebug) console.log(formatLogMessage('debug', `Too many redirects — browser rode through to ${landedUrl} for ${currentUrl}`));
4549
+
4550
+ // 2) curl-resolve fallback — only if still parked (no ride-through).
4551
+ // Opt-in via the site's `curl` option: if you didn't enable curl
4552
+ // in the config, the scanner won't shell out to it here either
4553
+ // (consistent with the content-analysis `curl` gate).
4554
+ if (!landedUrl) {
4555
+ const curlResolveOk = siteConfig.curl === true && !needsProxy(siteConfig) && !anyVpnConfigured && validateCurlAvailability().isAvailable;
4556
+ if (curlResolveOk) {
4557
+ let resolvedUrl = '';
4558
+ try {
4559
+ const curlUa = USER_AGENT_COLLECTIONS.get((siteConfig.userAgent || 'chrome').toLowerCase())
4560
+ || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36';
4561
+ const cr = await runProcess('curl', ['-sL', '--max-redirs', '50', '--max-time', '20', '-o', '/dev/null', '-A', curlUa, '-w', '%{url_effective}', currentUrl], { timeout: 22000, maxStdout: 4096 });
4562
+ const u = (cr.stdout || '').trim();
4563
+ if (cr.code === 0 && /^https?:\/\//.test(u) && u !== currentUrl) resolvedUrl = u;
4564
+ } catch (_) { /* curl failed */ }
4565
+ if (resolvedUrl) {
4566
+ if (forceDebug) console.log(formatLogMessage('debug', `Too many redirects — curl resolved the chain to ${resolvedUrl}; navigating there directly for ${currentUrl}`));
4567
+ // Navigate to the resolved endpoint; the streaming/embed end page
4568
+ // often never reaches DOM-ready, so the goto may throw — either
4569
+ // way it navigated, so adopt page.url().
4570
+ try { navigationResult = await navigateWithRedirectHandling(page, resolvedUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage); } catch (_) { /* timed out — use page.url() below */ }
4571
+ try { if (!page.isClosed() && page.url() && !page.url().startsWith('chrome-error://')) landedUrl = page.url(); } catch {}
4572
+ } else if (forceDebug) {
4573
+ console.log(formatLogMessage('debug', `Too many redirects — no ride-through and curl could not resolve; keeping chain captures for ${currentUrl}`));
4574
+ }
4575
+ } else if (forceDebug) {
4576
+ const why = siteConfig.curl !== true ? 'curl not enabled (curl:false)'
4577
+ : (needsProxy(siteConfig) || anyVpnConfigured) ? 'proxy/VPN active'
4578
+ : 'curl unavailable';
4579
+ console.log(formatLogMessage('debug', `Too many redirects — no ride-through and curl-resolve skipped (${why}); keeping chain captures for ${currentUrl}`));
4580
+ }
4581
+ }
4582
+
4583
+ // navigateWithRedirectHandling may already have set navigationResult
4584
+ // (clean curl path). Otherwise build a partial from where we landed —
4585
+ // the end site if we rode through / curl'd, else the original URL with
4586
+ // the chain requests already captured.
4587
+ if (!navigationResult) {
4588
+ const fu = landedUrl || currentUrl;
4589
+ navigationResult = { finalUrl: fu, redirected: fu !== currentUrl, redirectChain: [currentUrl, fu], originalUrl: currentUrl, redirectDomains: [], httpStatus: null, cfRay: null };
4590
+ }
4361
4591
  } else {
4362
4592
  throw navErr;
4363
4593
  }
@@ -4403,17 +4633,26 @@ function setupFrameHandling(page, forceDebug) {
4403
4633
  redirectHistory.add(currentUrl);
4404
4634
  redirectHistory.add(finalUrl);
4405
4635
 
4406
- // Add redirect destination to first-party domains immediately
4407
- if (finalDomain) {
4408
- firstPartyDomains.add(finalDomain);
4409
- }
4410
-
4411
- // Also add any intermediate redirect domains as first-party
4412
- if (redirectDomains && redirectDomains.length > 0) {
4413
- redirectDomains.forEach(domain => {
4414
- const rootDomain = safeGetDomain(`http://${domain}`, false);
4415
- if (rootDomain) firstPartyDomains.add(rootDomain);
4416
- });
4636
+ // Add redirect destination (and intermediates) to first-party domains
4637
+ // so the landed site's own resources aren't captured as third-party.
4638
+ // Opt out with redirect_first_party:false — then redirect targets stay
4639
+ // THIRD-PARTY and become eligible for filterRegex/dig under
4640
+ // thirdParty:true (e.g. capturing an ad/cloak redirect's end domain).
4641
+ // The originally-scanned domain (added earlier) stays first-party.
4642
+ const redirectsAreFirstParty = siteConfig.redirect_first_party !== false;
4643
+ if (redirectsAreFirstParty) {
4644
+ if (finalDomain) {
4645
+ firstPartyDomains.add(finalDomain);
4646
+ }
4647
+ // Also add any intermediate redirect domains as first-party
4648
+ if (redirectDomains && redirectDomains.length > 0) {
4649
+ redirectDomains.forEach(domain => {
4650
+ const rootDomain = safeGetDomain(`http://${domain}`, false);
4651
+ if (rootDomain) firstPartyDomains.add(rootDomain);
4652
+ });
4653
+ }
4654
+ } else if (forceDebug) {
4655
+ console.log(formatLogMessage('debug', `redirect_first_party:false — keeping redirect target ${finalDomain} third-party for ${currentUrl}`));
4417
4656
  }
4418
4657
 
4419
4658
  if (originalDomain !== finalDomain) {
@@ -4630,13 +4869,85 @@ function setupFrameHandling(page, forceDebug) {
4630
4869
  // Capture hard "dead domain" navigation errors for --show-dead-domains
4631
4870
  // (DNS doesn't resolve / host unreachable). Blocks, timeouts and CF
4632
4871
  // challenges are NOT dead — they're excluded by this match.
4633
- const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE|ERR_DNS/.exec(err.message || '');
4634
- if (deadNav) recordDeadDomain(currentUrl, deadNav[0]);
4872
+ // Only DEFINITIVE non-existence / unreachable signals — these now drive
4873
+ // the in-scan dead-domain SKIP (not just --show-dead-domains reporting),
4874
+ // so transient DNS errors must NOT match. The bare `ERR_DNS` used to
4875
+ // catch ERR_DNS_TIMED_OUT / ERR_DNS_MALFORMED_RESPONSE / ERR_DNS_SERVER_FAILED
4876
+ // (all transient) — dropped so a slow-DNS blip can't false-skip the
4877
+ // rest of a live host's URLs.
4878
+ const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE/.exec(err.message || '');
4879
+ if (deadNav) {
4880
+ recordDeadDomain(currentUrl, deadNav[0]);
4881
+ // Corroborate-then-persist to the negative cache (.dnsnegcache with
4882
+ // --dns-cache → cross-scan skip; else in-memory). Chrome resolves via
4883
+ // the possibly-flaky SYSTEM resolver, so its ERR_NAME_NOT_RESOLVED may
4884
+ // be a glitch on a LIVE host. Re-confirm via the reliable --dns
4885
+ // resolver and cache ONLY if it ALSO returns a definitive NXDOMAIN.
4886
+ // ERR_ADDRESS_UNREACHABLE is routing (the host resolves), so the
4887
+ // resolve succeeds and it's correctly not cached. Fire-and-forget:
4888
+ // off the critical path; saveDiskCache flushes on exit.
4889
+ if (dnsPrecheckEnabled && deadNav[0] === 'ERR_NAME_NOT_RESOLVED') {
4890
+ let navHost = '';
4891
+ try { navHost = new URL(currentUrl).hostname; } catch {}
4892
+ if (navHost && !/^[\d.:]+$|^\[/.test(navHost) && !dnsNegativeCache.has(navHost)) {
4893
+ dnsResolver.resolveHost(navHost, dnsPrecheckTimeoutMs).then(
4894
+ () => { /* reliable resolver resolves it — system-resolver glitch, do NOT cache */ },
4895
+ (e) => {
4896
+ const code = (e && (e.code || e.message)) || '';
4897
+ if (isNonExistenceError(code)) {
4898
+ dnsNegativeCacheSet(navHost, code);
4899
+ recordDeadDomain(navHost, code);
4900
+ if (forceDebug) console.log(formatLogMessage('debug', `Dead domain confirmed by --dns resolver (${code}) — caching ${navHost} (skips next run with --dns-cache)`));
4901
+ }
4902
+ }
4903
+ ).catch(() => {});
4904
+ }
4905
+ }
4906
+ }
4635
4907
  throw err;
4636
4908
  }
4637
4909
  }
4638
4910
  }
4639
4911
 
4912
+ // Targeted clicks: after load, click configured CSS selectors in order
4913
+ // (e.g. a movie link, then a play button) to reach content via organic
4914
+ // navigation/gesture instead of a direct deep-load (which some sites
4915
+ // JS-redirect away). The request interceptor stays attached, so the
4916
+ // post-click page's requests flow into the same filterRegex/dig matching.
4917
+ // Reuses realistic_click for a genuine trusted gesture. Runs before the
4918
+ // delay/interact phase so those operate on the resulting page.
4919
+ if (Array.isArray(siteConfig.click_elements) && siteConfig.click_elements.length > 0 && page && !page.isClosed()) {
4920
+ // If ghost-cursor is enabled for this site (cursor_mode:"ghost" or
4921
+ // --ghost-cursor), route the targeted clicks through it — Bezier travel
4922
+ // to the element + realistic press — matching the interact phase.
4923
+ // Injected so interaction.js needn't require ghost-cursor.js (circular).
4924
+ // Falls back to performTargetedClicks' humanClick/el.click when ghost is
4925
+ // off or the package isn't installed (resolveGhostCursorConfig → null).
4926
+ let ghostClicker = null;
4927
+ const tcGhostCfg = resolveGhostCursorConfig(siteConfig, globalGhostCursor, forceDebug);
4928
+ if (tcGhostCfg) {
4929
+ const tcCursor = createGhostCursor(page, { forceDebug });
4930
+ if (tcCursor) {
4931
+ ghostClicker = (x, y) => ghostClick(tcCursor, { x, y }, {
4932
+ hesitate: tcGhostCfg.hesitate,
4933
+ page,
4934
+ realistic: siteConfig.realistic_click === true,
4935
+ forceDebug
4936
+ });
4937
+ }
4938
+ }
4939
+ try {
4940
+ await performTargetedClicks(page, siteConfig.click_elements, {
4941
+ realistic: siteConfig.realistic_click === true,
4942
+ waitMs: Math.min(Number(siteConfig.click_wait) || 5000, Math.floor(timeout / 2)),
4943
+ ghostClick: ghostClicker,
4944
+ forceDebug
4945
+ });
4946
+ } catch (clickErr) {
4947
+ if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} click_elements phase failed for ${currentUrl}: ${clickErr.message}`));
4948
+ }
4949
+ }
4950
+
4640
4951
  const delayMs = siteConfig.delay || TIMEOUTS.DEFAULT_DELAY;
4641
4952
 
4642
4953
  // Optimized delays for Puppeteer 23.x performance
@@ -4653,6 +4964,13 @@ function setupFrameHandling(page, forceDebug) {
4653
4964
  const actualDelay = siteConfig.delay_uncapped === true
4654
4965
  ? Math.min(delayMs, Math.floor(timeout / 2))
4655
4966
  : Math.min(delayMs, TIMEOUTS.NETWORK_IDLE);
4967
+ // Surface the clamp — otherwise `delay: 48000` silently running as 29000
4968
+ // (timeout/2) looks like the flag was ignored. The per-URL budget already
4969
+ // reserves the full `delay`, so the lever to honor it is a larger timeout.
4970
+ if (forceDebug && actualDelay < delayMs) {
4971
+ const ceiling = siteConfig.delay_uncapped === true ? 'timeout/2; raise timeout to lift' : 'default 2s cap; set delay_uncapped:true to lift';
4972
+ console.log(formatLogMessage('debug', `delay ${delayMs}ms clamped to ${actualDelay}ms (${ceiling}) for ${currentUrl}`));
4973
+ }
4656
4974
 
4657
4975
  // Build delay promise (networkIdle + delay + optional flowProxy delay)
4658
4976
  const delayPromise = (async () => {
@@ -4925,6 +5243,21 @@ function setupFrameHandling(page, forceDebug) {
4925
5243
 
4926
5244
  let reloadSuccess = false;
4927
5245
 
5246
+ // page.reload() can't carry a referer; when referrer_headers is set,
5247
+ // re-navigate to the current URL with it so referer-gated embeds keep
5248
+ // serving across the reload:N loop (the initial goto carries the referer,
5249
+ // but reload() drops it). Nav-only scope — subresources keep their normal
5250
+ // page-origin referer (unlike setExtraHTTPHeaders, which would force the
5251
+ // referer onto every request and can break embeds whose subresources
5252
+ // expect own-origin). A static referrer_headers string is identical each
5253
+ // reload; random/mixed modes pick a fresh value per reload.
5254
+ const reloadReferer = siteConfig.referrer_headers
5255
+ ? getReferrerForUrl(currentUrl, siteConfig.referrer_headers, siteConfig.referrer_disable, forceDebug)
5256
+ : '';
5257
+ const reloadOrReferredGoto = (opts) => reloadReferer
5258
+ ? page.goto(page.url(), { ...opts, referer: reloadReferer })
5259
+ : page.reload(opts);
5260
+
4928
5261
  // Skip force reload if browser seems unhealthy
4929
5262
  const skipForceReload = i > 2; // After 2 attempts, skip force reload
4930
5263
 
@@ -4947,7 +5280,7 @@ function setupFrameHandling(page, forceDebug) {
4947
5280
  await raceWithTimer(page.setCacheEnabled(false), 'Cache disable timeout', 8000);
4948
5281
 
4949
5282
  // Use networkidle2 for force reload to better detect when page is actually loaded
4950
- await page.reload({ waitUntil: 'networkidle2', timeout: Math.min(timeout, 15000) });
5283
+ await reloadOrReferredGoto({ waitUntil: 'networkidle2', timeout: Math.min(timeout, 15000) });
4951
5284
 
4952
5285
  // Timeout-protected cache enable
4953
5286
  await raceWithTimer(page.setCacheEnabled(true), 'Cache enable timeout', 8000);
@@ -4986,7 +5319,7 @@ function setupFrameHandling(page, forceDebug) {
4986
5319
  ? { waitUntil: 'domcontentloaded', timeout: 10000 } // Simpler after failures
4987
5320
  : { waitUntil: 'networkidle2', timeout: 15000 }; // Full wait first time
4988
5321
 
4989
- await page.reload(reloadOptions);
5322
+ await reloadOrReferredGoto(reloadOptions);
4990
5323
 
4991
5324
  if (forceDebug) console.log(formatLogMessage('debug', `Standard reload #${i} completed for ${currentUrl}`));
4992
5325
  } catch (standardReloadErr) {
@@ -5263,7 +5596,7 @@ function setupFrameHandling(page, forceDebug) {
5263
5596
  const safeUrl = currentUrl.replace(/https?:\/\//, '').replace(/[^a-zA-Z0-9]/g, '_').substring(0, 80);
5264
5597
  const filename = `screenshots/${safeUrl}-${timestamp}.png`;
5265
5598
  try {
5266
- if (!fs.existsSync('screenshots')) fs.mkdirSync('screenshots', { recursive: true });
5599
+ fs.mkdirSync('screenshots', { recursive: true }); // recursive:true is a no-op if it already exists
5267
5600
  await page.screenshot({ path: filename, type: 'png', fullPage: true });
5268
5601
  console.log(formatLogMessage('info', `Screenshot saved: ${filename}`));
5269
5602
  } catch (screenshotErr) {
@@ -5759,6 +6092,19 @@ function setupFrameHandling(page, forceDebug) {
5759
6092
  // actually starting — wrongly skipping live domains. c-ares isn't
5760
6093
  // threadpool-bound so it's immune to that contention.
5761
6094
  if (dnsPrecheckEnabled && taskDomain && !/^[\d.:]+$|^\[/.test(taskDomain)) {
6095
+ // Already proven dead earlier THIS run — either a pre-check NXDOMAIN or
6096
+ // a prior URL's navigation hit ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE
6097
+ // (recordDeadDomain populates _deadDomains for both). Skip the repeat
6098
+ // instead of paying another fail-open navigation on a multi-URL dead
6099
+ // host (e.g. dlstreams.top?id=39/54/347). In-scan only (NOT persisted):
6100
+ // Chrome resolves via the system resolver, so a nav-level failure could
6101
+ // be a system-resolver glitch on a live host — a false "dead" must not
6102
+ // carry across runs. Cheap: a Map lookup, no DNS resolve.
6103
+ if (_deadDomains.has(taskDomain)) {
6104
+ dnsPrecheckSkips++;
6105
+ if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check: ${taskDomain} already dead this run (${_deadDomains.get(taskDomain)}) — skipping`));
6106
+ return { url: task.url, rules: [], success: false, error: `DNS: ${_deadDomains.get(taskDomain)}`, skipped: true };
6107
+ }
5762
6108
  const cached = dnsNegativeCache.get(taskDomain);
5763
6109
  if (cached && Date.now() - cached.timestamp < DNS_NEGATIVE_CACHE_TTL_MS) {
5764
6110
  dnsPrecheckSkips++;
@@ -5833,10 +6179,24 @@ function setupFrameHandling(page, forceDebug) {
5833
6179
  const INTERACTION_OVERHEAD_MS = interactionOnForTask
5834
6180
  ? computeInteractionCeilingMs(createInteractionConfig(task.url, task.config))
5835
6181
  : 0;
6182
+ // click_elements runs ONCE after load (before the delay/interact/reload
6183
+ // phases): N selectors, each a settle/nav wait (click_wait, capped at
6184
+ // timeout/2 — mirror the call site) plus ~2s for scroll + the click action
6185
+ // (ghost Bezier travel is the slowest). Budget it so a heavy click chain
6186
+ // can't trip the per-URL ceiling before the work that follows it. Not
6187
+ // multiplied by reloadCount — the click phase is one-time.
6188
+ const clickEls = Array.isArray(task.config.click_elements)
6189
+ ? task.config.click_elements.filter(s => typeof s === 'string' && s.trim())
6190
+ : [];
6191
+ const clickWaitMs = clickEls.length
6192
+ ? Math.min(Number(task.config.click_wait) || 5000, Math.floor((task.config.timeout || 35000) / 2))
6193
+ : 0;
6194
+ const CLICK_ELEMENTS_OVERHEAD_MS = clickEls.length * (clickWaitMs + 2000);
5836
6195
  const PER_URL_TIMEOUT_MS = Math.max(
5837
6196
  75000,
5838
6197
  (task.config.timeout || 35000)
5839
6198
  + ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
6199
+ + CLICK_ELEMENTS_OVERHEAD_MS
5840
6200
  + 30000
5841
6201
  );
5842
6202
  // Feed the hang-check restart so it never escalates before this URL's own