@fanboynz/network-scanner 3.2.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +40 -4
- package/lib/dns.js +117 -7
- package/lib/fingerprint.js +39 -36
- package/lib/interaction.js +151 -0
- package/lib/nettools.js +7 -4
- package/lib/openvpn_vpn.js +8 -0
- package/lib/validate_rules.js +3 -3
- package/lib/wireguard_vpn.js +8 -0
- package/nwss.1 +46 -6
- package/nwss.js +449 -89
- package/package.json +1 -1
package/nwss.js
CHANGED
|
@@ -9,7 +9,7 @@ const fs = require('fs');
|
|
|
9
9
|
const os = require('os');
|
|
10
10
|
const psl = require('psl');
|
|
11
11
|
const path = require('path');
|
|
12
|
-
const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError } = require('./lib/dns');
|
|
12
|
+
const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError, dohTemplatesForResolvers } = require('./lib/dns');
|
|
13
13
|
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
14
14
|
const { compressMultipleFiles } = require('./lib/compress');
|
|
15
15
|
const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
|
|
@@ -17,6 +17,7 @@ const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROM
|
|
|
17
17
|
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
|
|
18
18
|
// Curl functionality (replace searchstring curl handler)
|
|
19
19
|
const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
|
|
20
|
+
const { runProcess } = require('./lib/spawn-async');
|
|
20
21
|
// Rule validation
|
|
21
22
|
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile, normalizeSiteConfig } = require('./lib/validate_rules');
|
|
22
23
|
// CF Bypass
|
|
@@ -55,6 +56,7 @@ const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
|
|
|
55
56
|
const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
|
|
56
57
|
const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
|
|
57
58
|
const VPN_TAG = messageColors.processing('[vpn]');
|
|
59
|
+
const POPUP_TAG = messageColors.processing('[popup]');
|
|
58
60
|
// Precomputed colored '[SmartCache]' subsystem prefix — paired with the
|
|
59
61
|
// same constant in lib/smart-cache.js so debug lines from both files
|
|
60
62
|
// produce consistently colored output. formatLogMessage only colors the
|
|
@@ -64,7 +66,7 @@ const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
|
|
|
64
66
|
// log lines (start/completed). Same cyan as the other monitoring tags.
|
|
65
67
|
const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
66
68
|
// Enhanced mouse interaction and page simulation
|
|
67
|
-
const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
69
|
+
const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove, performTargetedClicks } = require('./lib/interaction');
|
|
68
70
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
69
71
|
const { createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
|
|
70
72
|
// Domain detection cache for performance optimization
|
|
@@ -240,6 +242,7 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
240
242
|
resource_cleanup_interval: ['--cleanup-interval'],
|
|
241
243
|
dns: ['--dns'],
|
|
242
244
|
dns_cache: ['--dns-cache'],
|
|
245
|
+
doh_disable: ['--doh-disable'],
|
|
243
246
|
cache_requests: ['--cache-requests'],
|
|
244
247
|
dumpurls: ['--dumpurls'],
|
|
245
248
|
remove_tempfiles: ['--remove-tempfiles'],
|
|
@@ -376,7 +379,13 @@ if (dnsCacheMode) enableDiskCache();
|
|
|
376
379
|
// Filters NXDOMAIN / unresolvable hostnames in <100ms before paying the
|
|
377
380
|
// ~5-15s Puppeteer + Cloudflare detection round-trip on each.
|
|
378
381
|
const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
|
|
379
|
-
|
|
382
|
+
// 4s (was 2s): under a concurrent scan the c-ares UDP burst against the pinned
|
|
383
|
+
// resolvers can take >2s to answer — a tight timeout false-counted those as
|
|
384
|
+
// resolver errors and tripped the circuit breaker. A clean NXDOMAIN still
|
|
385
|
+
// returns fast (the resolver answers immediately), so the higher ceiling only
|
|
386
|
+
// costs time when the resolver is genuinely slow — exactly when we want to wait
|
|
387
|
+
// rather than false-fail. Paired with the resolver's concurrency cap below.
|
|
388
|
+
const dnsPrecheckTimeoutMs = 4000;
|
|
380
389
|
|
|
381
390
|
// --show-dead-domains: collect hostnames that are definitively DEAD (do not
|
|
382
391
|
// exist / unreachable) and print them at the end of the scan so they can be
|
|
@@ -387,7 +396,11 @@ const dnsPrecheckTimeoutMs = 2000;
|
|
|
387
396
|
const showDeadDomains = args.includes('--show-dead-domains');
|
|
388
397
|
const _deadDomains = new Map();
|
|
389
398
|
function recordDeadDomain(urlOrHost, reason) {
|
|
390
|
-
|
|
399
|
+
// Populate unconditionally — the pre-check skip reads _deadDomains to drop
|
|
400
|
+
// repeat URLs on a host already proven dead this run, which must work whether
|
|
401
|
+
// or not --show-dead-domains is set. The end-of-scan REPORT is separately
|
|
402
|
+
// gated on showDeadDomains, so the flag still controls output, not recording.
|
|
403
|
+
if (!urlOrHost) return;
|
|
391
404
|
let host = urlOrHost;
|
|
392
405
|
try { host = new URL(urlOrHost).hostname; } catch { /* already a bare host */ }
|
|
393
406
|
if (host && !_deadDomains.has(host)) _deadDomains.set(host, reason);
|
|
@@ -407,7 +420,7 @@ const DNS_NEGATIVE_CACHE_MAX = 1000;
|
|
|
407
420
|
// persisting it can't silently drop a live host. Opt-in via --dns-cache: dead
|
|
408
421
|
// hosts are remembered for DNS_NEGATIVE_PERSIST_TTL_MS and reloaded next run;
|
|
409
422
|
// otherwise it's a 5-min in-memory-only cache. The persist TTL is deliberately
|
|
410
|
-
// shorter than the dig/whois positive cache (20h): a domain that doesn't exist
|
|
423
|
+
// shorter than the dig/whois positive cache (dig 20h / whois 36h): a domain that doesn't exist
|
|
411
424
|
// now MAY get registered, and this is a domain-hunting scanner, so the dead
|
|
412
425
|
// ones are re-checked twice a day rather than trusted for ~a day.
|
|
413
426
|
const DNS_NEGATIVE_PERSIST_TTL_MS = 12 * 60 * 60 * 1000; // 12 hours
|
|
@@ -437,6 +450,31 @@ const dnsResolver = createRotatingResolver({ servers: dnsServersOverride, forceD
|
|
|
437
450
|
// system /etc/resolv.conf, which on a flaky setup times out and silently drops
|
|
438
451
|
// dig-gated domains). Only when --dns is explicitly set.
|
|
439
452
|
if (dnsServersOverride.length > 0) setDigResolvers(dnsServersOverride);
|
|
453
|
+
// Pin Chrome's NAVIGATION resolver to the same providers via DoH. Chrome
|
|
454
|
+
// ignores --dns for page loads and reads /etc/resolv.conf directly, so a broken
|
|
455
|
+
// system resolver (e.g. one returning REFUSED) can ERR_NAME_NOT_RESOLVED a
|
|
456
|
+
// domain the pre-check already resolved. Mapping --dns to the matching DoH
|
|
457
|
+
// endpoint makes navigation use the pinned provider instead of resolv.conf.
|
|
458
|
+
// 'automatic' mode (not 'secure') so Chrome still falls back to system DNS if
|
|
459
|
+
// DoH is unreachable rather than failing the whole batch. Empty templates when
|
|
460
|
+
// --dns is absent or maps to no known DoH provider — Chrome keeps system DNS.
|
|
461
|
+
//
|
|
462
|
+
// Applied ONLY to direct connections (see createBrowser): when a proxy or VPN
|
|
463
|
+
// is active, the exit/tunnel does the resolution (remote DNS / pushed DNS), so
|
|
464
|
+
// pinning local DoH would be redundant and could resolve geo-split domains to
|
|
465
|
+
// the wrong region. In those modes Chrome defers to the proxy/VPN as before.
|
|
466
|
+
// --doh-disable (default false) opts out of the Chrome DoH pinning entirely —
|
|
467
|
+
// navigation falls back to system resolv.conf even when --dns maps to a known
|
|
468
|
+
// provider. The pre-check and dig still honor --dns. Use it if DoH adds
|
|
469
|
+
// unwanted latency, is blocked on the network, or you specifically want Chrome
|
|
470
|
+
// to resolve via the system path.
|
|
471
|
+
const dohDisabled = args.includes('--doh-disable');
|
|
472
|
+
const chromeDoh = dnsServersOverride.length > 0
|
|
473
|
+
? dohTemplatesForResolvers(dnsServersOverride)
|
|
474
|
+
: { templates: '', mapped: [], unmapped: [] };
|
|
475
|
+
// anyVpnConfigured and the DoH startup log live inside the main IIFE below:
|
|
476
|
+
// `sites` is destructured from the config later in module load, so referencing
|
|
477
|
+
// it at this point in top-level evaluation would TDZ-throw.
|
|
440
478
|
// Circuit breaker: if resolver errors dominate, suspend the pre-check for a
|
|
441
479
|
// cooldown so a refusal storm doesn't keep hammering a broken resolver (sites
|
|
442
480
|
// still load — a suspended pre-check just proceeds to navigation).
|
|
@@ -715,6 +753,9 @@ if (blockAdsIndex !== -1) {
|
|
|
715
753
|
|
|
716
754
|
adblockEnabled = true;
|
|
717
755
|
const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
|
|
756
|
+
// Only ever assigned the os.tmpdir() path below — never a user file — so the
|
|
757
|
+
// unlink in finally can never touch the caller's own lists.
|
|
758
|
+
let combinedTmpFile = null;
|
|
718
759
|
try {
|
|
719
760
|
if (engine === adblockRust) {
|
|
720
761
|
// Rust wrapper accepts an array directly — no temp file needed.
|
|
@@ -723,15 +764,22 @@ if (blockAdsIndex !== -1) {
|
|
|
723
764
|
// JS engine takes a single path; concat to a temp file when multiple lists.
|
|
724
765
|
let rulesFile = rulesFiles[0];
|
|
725
766
|
if (rulesFiles.length > 1) {
|
|
726
|
-
|
|
767
|
+
combinedTmpFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
|
|
768
|
+
rulesFile = combinedTmpFile;
|
|
727
769
|
const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
|
|
728
770
|
fs.writeFileSync(rulesFile, combined);
|
|
729
771
|
}
|
|
772
|
+
// parseAdblockRules reads the file synchronously and in full before
|
|
773
|
+
// returning, so the temp copy is safe to remove immediately afterwards.
|
|
730
774
|
adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
|
|
731
775
|
}
|
|
732
776
|
} catch (err) {
|
|
733
777
|
console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
|
|
734
778
|
process.exit(1);
|
|
779
|
+
} finally {
|
|
780
|
+
if (combinedTmpFile) {
|
|
781
|
+
try { fs.unlinkSync(combinedTmpFile); } catch { /* best effort — OS reaps tmpdir */ }
|
|
782
|
+
}
|
|
735
783
|
}
|
|
736
784
|
const stats = adblockMatcher.getStats();
|
|
737
785
|
const ruleDesc = stats.total != null
|
|
@@ -803,9 +851,13 @@ General Options:
|
|
|
803
851
|
|
|
804
852
|
Validation Options:
|
|
805
853
|
--cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
|
|
806
|
-
--dns <ip[,ip,...]> Resolver(s) for the DNS pre-check
|
|
807
|
-
|
|
808
|
-
|
|
854
|
+
--dns <ip[,ip,...]> Resolver(s) for the DNS pre-check, nettools' dig, and — when they map to a
|
|
855
|
+
known DoH provider — Chrome's page navigation via DoH on direct connections
|
|
856
|
+
(skipped under proxy/VPN; not whois). Overrides /etc/resolv.conf.
|
|
857
|
+
One pins all queries to it; several rotate per query.
|
|
858
|
+
--doh-disable Opt out of the Chrome-navigation DoH pinning (default: off). Chrome then
|
|
859
|
+
resolves via system resolv.conf; --dns still pins the pre-check and dig.
|
|
860
|
+
--dns-cache Persist dig/whois results to disk between runs (dig 20h / whois 36h TTL, 2000-entry cap each),
|
|
809
861
|
plus the DNS pre-check negative cache (NXDOMAIN only, 12h TTL, .dnsnegcache)
|
|
810
862
|
--no-dns-precheck Disable per-URL DNS resolution check before page navigation.
|
|
811
863
|
By default, URLs whose hostname doesn't resolve are skipped
|
|
@@ -879,6 +931,9 @@ Redirect Handling Options:
|
|
|
879
931
|
source: true/false Save page source HTML after load
|
|
880
932
|
firstParty: true/false Allow first-party matches (default: false)
|
|
881
933
|
thirdParty: true/false Allow third-party matches (default: true)
|
|
934
|
+
redirect_first_party: true/false Treat redirect-destination domains as first-party (default: true).
|
|
935
|
+
false keeps redirect targets third-party so filterRegex/dig can match
|
|
936
|
+
them (e.g. capturing an ad/cloak redirect's end domain)
|
|
882
937
|
screenshot: true/false/\"force\" Capture screenshot (true=on failure, \"force\"=always)
|
|
883
938
|
headful: true/false Launch browser with GUI for this site
|
|
884
939
|
fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random"
|
|
@@ -916,6 +971,9 @@ Advanced Options:
|
|
|
916
971
|
interact_scrolling: true/false Enable scrolling simulation (default: true)
|
|
917
972
|
interact_clicks: true/false Enable element clicking simulation (default: false)
|
|
918
973
|
interact_typing: true/false Enable typing simulation (default: false)
|
|
974
|
+
click_elements: ["sel1","sel2"] After load, click these CSS selectors in order, main frame + iframes
|
|
975
|
+
(organic nav / play button). Honors realistic_click + cursor_mode "ghost"; missing skipped
|
|
976
|
+
click_wait: <milliseconds> Per-click: max wait for the element to appear + settle/nav after (default: 5000)
|
|
919
977
|
cursor_mode: "ghost" Use ghost-cursor Bezier mouse (requires: npm i ghost-cursor)
|
|
920
978
|
ghost_cursor_speed: <number> Ghost-cursor speed multiplier (default: auto)
|
|
921
979
|
ghost_cursor_hesitate: <milliseconds> Delay before ghost-cursor clicks (default: 50)
|
|
@@ -933,7 +991,7 @@ Advanced Options:
|
|
|
933
991
|
whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
|
|
934
992
|
dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
|
|
935
993
|
dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
|
|
936
|
-
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "
|
|
994
|
+
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "domcontentloaded"})
|
|
937
995
|
dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
|
|
938
996
|
digRecordType: "A" DNS record type for dig (default: A)
|
|
939
997
|
|
|
@@ -1423,6 +1481,7 @@ if (dumpUrls) {
|
|
|
1423
1481
|
// Avoids blocking I/O on every intercepted request in debug/dumpurls mode
|
|
1424
1482
|
const _logBuffers = new Map(); // filePath -> string[]
|
|
1425
1483
|
const LOG_FLUSH_INTERVAL = 2000; // Flush every 2 seconds
|
|
1484
|
+
const LOG_BUFFER_MAX_RETAINED = 10000; // Cap a file's retry backlog (lines) so a permanently unwritable path can't grow memory unboundedly
|
|
1426
1485
|
let _logFlushTimer = null;
|
|
1427
1486
|
|
|
1428
1487
|
function bufferedLogWrite(filePath, entry) {
|
|
@@ -1435,18 +1494,20 @@ function bufferedLogWrite(filePath, entry) {
|
|
|
1435
1494
|
|
|
1436
1495
|
/**
 * Write every non-empty buffer in `_logBuffers` to its file with one
 * synchronous append per file.
 *
 * A buffer is emptied only after its append succeeds. When the append throws,
 * a warning is printed and the entries stay queued for a later call — unless
 * the backlog for that file already exceeds LOG_BUFFER_MAX_RETAINED lines, in
 * which case the whole backlog for that file is discarded.
 *
 * @returns {void}
 */
function flushLogBuffers() {
|
|
1437
1496
|
for (const [filePath, entries] of _logBuffers) {
|
|
1438
|
-
if (entries.length
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1497
|
+
if (entries.length === 0) continue;
|
|
1498
|
+
try {
|
|
1499
|
+
// Synchronous append on purpose: the batched 2s flush is small, and a
|
|
1500
|
+
// blocking append cannot overlap the next timer tick (it holds the event
|
|
1501
|
+
// loop for its duration) — eliminating the interleaved concurrent-append
|
|
1502
|
+
// hazard of the old async fs.writeFile({flag:'a'}). Clear ONLY after the
|
|
1503
|
+
// write succeeds, so a transient failure retries next tick instead of
|
|
1504
|
+
// being silently dropped (the old code cleared before the async write
|
|
1505
|
+
// confirmed). Bounded so a permanently unwritable path can't grow memory.
|
|
1506
|
+
fs.appendFileSync(filePath, entries.join(''));
|
|
1507
|
+
entries.length = 0;
|
|
1508
|
+
} catch (err) {
|
|
1509
|
+
console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${filePath}: ${err.message}`));
|
|
1510
|
+
if (entries.length > LOG_BUFFER_MAX_RETAINED) entries.length = 0;
|
|
1450
1511
|
}
|
|
1451
1512
|
}
|
|
1452
1513
|
}
|
|
@@ -1490,21 +1551,29 @@ if (forceDebug && globalComments) {
|
|
|
1490
1551
|
* @param {string} url - The URL string to parse.
|
|
1491
1552
|
* @returns {string} The root domain; the hostname itself when psl reports no domain for it (e.g., for IP addresses); or an empty string when the URL cannot be parsed.
|
|
1492
1553
|
*/
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1554
|
+
// psl.parse memoized by hostname. The request handlers parse the root domain
|
|
1555
|
+
// of EVERY request, and a page hits the same few hosts repeatedly (CDN,
|
|
1556
|
+
// analytics, ad domains) — so a hostname-keyed memo turns almost all of those
|
|
1557
|
+
// into Map hits instead of repeated public-suffix-list lookups. Keyed by
|
|
1558
|
+
// hostname (not full URL) so distinct paths/queries on one host share one
|
|
1559
|
+
// entry: higher hit rate, fewer + shorter keys than a URL-keyed cache.
|
|
1560
|
+
// psl.parse is pure and never throws (malformed input → {domain: null}), so
|
|
1561
|
+
// the catch is defensive only.
|
|
1562
|
+
const _hostRootCache = new Map();
|
|
1563
|
+
/**
 * Resolve a hostname to its root domain, memoized per hostname in
 * `_hostRootCache`.
 *
 * The cache is reset wholesale (not evicted entry-by-entry) once it holds
 * more than 5000 hostnames, just before the new entry is stored.
 *
 * @param {string} hostname - Bare hostname (as produced by `new URL(...).hostname`).
 * @returns {string} `psl.parse(hostname).domain` when psl reports one; the
 *   hostname itself when psl reports no domain or throws; '' for falsy input.
 */
function rootDomainForHost(hostname) {
|
|
1564
|
+
if (!hostname) return '';
|
|
1565
|
+
const cached = _hostRootCache.get(hostname);
|
|
1496
1566
|
if (cached !== undefined) return cached;
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
}
|
|
1567
|
+
let result;
|
|
1568
|
+
try { const parsed = psl.parse(hostname); result = parsed.domain || hostname; }
|
|
1569
|
+
catch { result = hostname; }
|
|
1570
|
+
if (_hostRootCache.size > 5000) _hostRootCache.clear();
|
|
1571
|
+
_hostRootCache.set(hostname, result);
|
|
1572
|
+
return result;
|
|
1573
|
+
}
|
|
1574
|
+
/**
 * Extract the root domain from a full URL.
 *
 * @param {string} url - URL string; it is parsed with `new URL(url)`.
 * @returns {string} The result of `rootDomainForHost()` for the URL's
 *   hostname, or '' when parsing the URL (or the lookup) throws.
 */
function getRootDomain(url) {
|
|
1575
|
+
try { return rootDomainForHost(new URL(url).hostname); }
|
|
1576
|
+
catch { return ''; }
|
|
1508
1577
|
}
|
|
1509
1578
|
|
|
1510
1579
|
/**
|
|
@@ -1839,7 +1908,33 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1839
1908
|
|
|
1840
1909
|
// Declare userDataDir in outer scope for cleanup access
|
|
1841
1910
|
let userDataDir = null;
|
|
1842
|
-
|
|
1911
|
+
|
|
1912
|
+
// Browser-level decision (the browser launches once per batch, so this can't
|
|
1913
|
+
// be per-site): only disable Chrome's pop-up blocker when at least one site
|
|
1914
|
+
// actually wants popups captured. A real browser blocks non-gesture
|
|
1915
|
+
// window.open(), so non-popup scans keep the blocker on for stealth.
|
|
1916
|
+
// capture_popups scans turn it off so non-gesture popunders (document-level
|
|
1917
|
+
// onclick / timer SDKs) fire and get captured too — gesture-triggered
|
|
1918
|
+
// popups already work via the synthetic-click path regardless of this flag.
|
|
1919
|
+
const wantPopups = Array.isArray(sites) && sites.some(s => s && s.capture_popups === true);
|
|
1920
|
+
if (wantPopups && forceDebug) {
|
|
1921
|
+
console.log(formatLogMessage('debug', `${POPUP_TAG} capture_popups set — launching with --disable-popup-blocking (non-gesture popunders allowed)`));
|
|
1922
|
+
}
|
|
1923
|
+
|
|
1924
|
+
// DoH gate: any VPN site disables Chrome DoH (the tunnel resolves). Computed
|
|
1925
|
+
// here (not at module top) because `sites` is only initialized by this point.
|
|
1926
|
+
// Read by createBrowser's launch args; the startup log reports the decision.
|
|
1927
|
+
const anyVpnConfigured = Array.isArray(sites) && sites.some(s => s && (s.vpn || s.openvpn));
|
|
1928
|
+
if (dnsServersOverride.length > 0 && !silentMode) {
|
|
1929
|
+
if (dohDisabled) {
|
|
1930
|
+
console.log(formatLogMessage('info', `Chrome DoH disabled via --doh-disable — navigation uses system resolv.conf; --dns still pins the pre-check and dig.`));
|
|
1931
|
+
} else if (chromeDoh.templates) {
|
|
1932
|
+
console.log(formatLogMessage('info', `Chrome navigation will use DoH (automatic) on direct connections: ${chromeDoh.templates}${anyVpnConfigured ? ' — VPN configured, so it defers to VPN resolution' : ' — deferred to proxy resolution on proxied sites'}`));
|
|
1933
|
+
} else {
|
|
1934
|
+
console.warn(formatLogMessage('warn', `--dns servers (${chromeDoh.unmapped.join(', ')}) have no known DoH endpoint — Chrome navigation stays on system resolv.conf; only the pre-check and dig are pinned. Known providers: Google, Cloudflare, Quad9, OpenDNS, AdGuard, CleanBrowsing, DNS.SB, Mullvad.`));
|
|
1935
|
+
}
|
|
1936
|
+
}
|
|
1937
|
+
|
|
1843
1938
|
/**
|
|
1844
1939
|
* Creates a new browser instance with consistent configuration
|
|
1845
1940
|
* Uses system Chrome and temporary directories to minimize disk usage
|
|
@@ -1930,6 +2025,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1930
2025
|
// Puppeteer 22.x headless mode optimization
|
|
1931
2026
|
// Auto-detect best headless mode based on Puppeteer version
|
|
1932
2027
|
headless: headlessMode,
|
|
2028
|
+
// Bypass TLS cert errors at the browser level (drives CDP
|
|
2029
|
+
// Security.setIgnoreCertificateErrors). Robust on new-headless Chrome,
|
|
2030
|
+
// where the --ignore-certificate-errors *flag* is increasingly ignored.
|
|
2031
|
+
// An ad/tracker scanner must reach self-signed / mismatched-cert ad and
|
|
2032
|
+
// embed domains; we observe traffic, we don't transmit secrets.
|
|
2033
|
+
acceptInsecureCerts: true,
|
|
1933
2034
|
args: [
|
|
1934
2035
|
// CRITICAL: Remove automation detection markers
|
|
1935
2036
|
'--disable-blink-features=AutomationControlled',
|
|
@@ -1941,6 +2042,19 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1941
2042
|
'--use-mock-keychain',
|
|
1942
2043
|
'--disable-client-side-phishing-detection',
|
|
1943
2044
|
'--enable-features=NetworkService',
|
|
2045
|
+
// DoH for Chrome's navigation resolver when --dns maps to a known
|
|
2046
|
+
// provider — but ONLY on direct connections. A proxied launch carries
|
|
2047
|
+
// a --proxy-server in extraArgs and does its own (remote) DNS; a VPN
|
|
2048
|
+
// tunnels resolution. In both cases local DoH is redundant and could
|
|
2049
|
+
// resolve geo-split domains to the wrong region, so it's skipped and
|
|
2050
|
+
// Chrome defers to the proxy/VPN. 'automatic' keeps a system-DNS
|
|
2051
|
+
// fallback if DoH is unreachable. Flags omitted when not applicable.
|
|
2052
|
+
...((chromeDoh.templates
|
|
2053
|
+
&& !dohDisabled
|
|
2054
|
+
&& !anyVpnConfigured
|
|
2055
|
+
&& !extraArgs.some(a => typeof a === 'string' && a.startsWith('--proxy-server')))
|
|
2056
|
+
? ['--dns-over-https-mode=automatic', `--dns-over-https-templates=${chromeDoh.templates}`]
|
|
2057
|
+
: []),
|
|
1944
2058
|
// Disk space controls - minimal cache for scanning workloads
|
|
1945
2059
|
`--disk-cache-size=${CACHE_LIMITS.DISK_CACHE_SIZE}`,
|
|
1946
2060
|
`--media-cache-size=${CACHE_LIMITS.MEDIA_CACHE_SIZE}`,
|
|
@@ -2018,6 +2132,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2018
2132
|
'--memory-pressure-off',
|
|
2019
2133
|
'--max_old_space_size=2048', // V8 heap limit
|
|
2020
2134
|
'--disable-prompt-on-repost', // Fixes form popup on page reload
|
|
2135
|
+
// Disable Chrome's pop-up blocker (chrome://settings/content/popups)
|
|
2136
|
+
// ONLY when a site wants popups captured — lets non-gesture popunders
|
|
2137
|
+
// fire. Gated so non-popup scans keep the blocker on for stealth.
|
|
2138
|
+
...(wantPopups ? ['--disable-popup-blocking'] : []),
|
|
2021
2139
|
...(keepBrowserOpen ? [] : ['--disable-background-networking']),
|
|
2022
2140
|
'--no-sandbox',
|
|
2023
2141
|
'--disable-setuid-sandbox',
|
|
@@ -2420,10 +2538,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2420
2538
|
page.setDefaultNavigationTimeout(Math.min(timeout, TIMEOUTS.DEFAULT_NAVIGATION));
|
|
2421
2539
|
// Aggressive timeouts prevent hanging in Puppeteer 23.x while maintaining speed
|
|
2422
2540
|
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2541
|
+
// Only attach a console listener under --debug. Registering ANY 'console'
|
|
2542
|
+
// listener makes Puppeteer enable the CDP Runtime domain, which arms
|
|
2543
|
+
// console-based automation/DevTools traps (e.g. disable-devtool logs an
|
|
2544
|
+
// object with a getter and detects the inspector reading it → redirects
|
|
2545
|
+
// away). The body is a no-op without forceDebug, so attaching it
|
|
2546
|
+
// unconditionally armed that trap for zero benefit.
|
|
2547
|
+
if (forceDebug) {
|
|
2548
|
+
page.on('console', (msg) => {
|
|
2549
|
+
if (msg.type() === 'error') console.log(formatLogMessage('debug', `Console error: ${msg.text()}`));
|
|
2550
|
+
});
|
|
2551
|
+
}
|
|
2552
|
+
|
|
2427
2553
|
// Add page crash handler
|
|
2428
2554
|
page.on('error', (err) => {
|
|
2429
2555
|
if (forceDebug) console.log(formatLogMessage('debug', `Page crashed: ${err.message}`));
|
|
@@ -3308,12 +3434,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3308
3434
|
// (normalizeSiteConfig now coerces interact: 1 → true with a warning,
|
|
3309
3435
|
// so by the time we get here both should be booleans — but keep the
|
|
3310
3436
|
// diagnostic accurate for the truly-missing case.)
|
|
3437
|
+
const hasClickElements = Array.isArray(siteConfig.click_elements) && siteConfig.click_elements.length > 0;
|
|
3311
3438
|
const interactOn = siteConfig.interact === true;
|
|
3312
3439
|
const clicksOn = siteConfig.interact_clicks === true;
|
|
3313
|
-
if (!interactOn
|
|
3314
|
-
|
|
3440
|
+
if (hasClickElements && (!interactOn || !clicksOn)) {
|
|
3441
|
+
// click_elements fires its own trusted gesture clicks, so popups it
|
|
3442
|
+
// triggers capture regardless of interact/interact_clicks. Don't warn
|
|
3443
|
+
// "no clicks fire" — surface the random-click coverage gap instead.
|
|
3444
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups: click_elements supplies targeted gesture clicks (popups they trigger WILL capture). interact=${interactOn}, interact_clicks=${clicksOn} — enable both for random content-zone click coverage of overlay popunders too`));
|
|
3445
|
+
} else if (!interactOn && !clicksOn) {
|
|
3446
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but neither 'interact' nor 'interact_clicks' is — set BOTH to true to fire user-gesture clicks; without them, only popups opened via in-page redirects (or click_elements) will capture`));
|
|
3315
3447
|
} else if (!interactOn) {
|
|
3316
|
-
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no fake clicks fire`));
|
|
3448
|
+
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact' is not — set interact: true to enable the interaction loop (interact_clicks is already set); without it, no random fake clicks fire`));
|
|
3317
3449
|
} else if (!clicksOn) {
|
|
3318
3450
|
console.log(formatLogMessage('debug', `[popup] capture_popups is enabled but 'interact_clicks' is not — set interact_clicks: true to enable element-targeted clicks; without it, only random content-zone clicks fire and may miss overlay-based popunders`));
|
|
3319
3451
|
}
|
|
@@ -3362,8 +3494,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3362
3494
|
try {
|
|
3363
3495
|
const parsedUrl = new URL(checkedUrl);
|
|
3364
3496
|
fullSubdomain = parsedUrl.hostname;
|
|
3365
|
-
|
|
3366
|
-
checkedRootDomain = pslResult.domain || fullSubdomain;
|
|
3497
|
+
checkedRootDomain = rootDomainForHost(fullSubdomain);
|
|
3367
3498
|
} catch (_) { return; }
|
|
3368
3499
|
if (!checkedRootDomain) return;
|
|
3369
3500
|
|
|
@@ -3638,30 +3769,24 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3638
3769
|
try {
|
|
3639
3770
|
const parsedUrl = new URL(checkedUrl);
|
|
3640
3771
|
fullSubdomain = parsedUrl.hostname;
|
|
3641
|
-
|
|
3642
|
-
checkedRootDomain = pslResult.domain || fullSubdomain;
|
|
3772
|
+
checkedRootDomain = rootDomainForHost(fullSubdomain);
|
|
3643
3773
|
} catch (e) {}
|
|
3644
3774
|
|
|
3775
|
+
// Never BLOCK the top-level document (the scanned page OR a main-frame
|
|
3776
|
+
// redirect target). Aborting it makes the navigation never commit (page
|
|
3777
|
+
// stays at about:blank → navigation timeout), silently breaking any
|
|
3778
|
+
// scanned URL that matches our own filter lists (adblock / blocked /
|
|
3779
|
+
// blockDomainsByUrl) — common on adult/pirate/stream domains. This flag
|
|
3780
|
+
// ONLY guards the abort paths below; the request still flows through the
|
|
3781
|
+
// match logic, so a main-frame redirect destination (e.g. a
|
|
3782
|
+
// filecrypt → ad-domain hop) is still captured via filterRegex/dig/whois.
|
|
3783
|
+
// isNavigationRequest is true for sub-frame docs too, so the mainFrame()
|
|
3784
|
+
// check keeps ad iframes blockable.
|
|
3785
|
+
let isMainFrameDoc = false;
|
|
3786
|
+
try { isMainFrameDoc = request.isNavigationRequest() && request.frame() === page.mainFrame(); } catch (_) {}
|
|
3787
|
+
|
|
3645
3788
|
// Check against ALL first-party domains (original + all redirects)
|
|
3646
3789
|
const isFirstParty = checkedRootDomain && firstPartyDomains.has(checkedRootDomain);
|
|
3647
|
-
|
|
3648
|
-
// Block infinite iframe loops - safely access frame URL
|
|
3649
|
-
const frameUrl = (() => {
|
|
3650
|
-
try {
|
|
3651
|
-
const frame = request.frame();
|
|
3652
|
-
return frame ? frame.url() : '';
|
|
3653
|
-
} catch (err) {
|
|
3654
|
-
return '';
|
|
3655
|
-
}
|
|
3656
|
-
})();
|
|
3657
|
-
if (frameUrl && frameUrl.includes('creative.dmzjmp.com') &&
|
|
3658
|
-
checkedUrl.includes('go.dmzjmp.com/api/models')) {
|
|
3659
|
-
if (forceDebug) {
|
|
3660
|
-
console.log(formatLogMessage('debug', `Blocking potential infinite iframe loop: ${checkedUrl}`));
|
|
3661
|
-
}
|
|
3662
|
-
request.abort();
|
|
3663
|
-
return;
|
|
3664
|
-
}
|
|
3665
3790
|
|
|
3666
3791
|
// Enhanced debug logging to show which frame the request came from
|
|
3667
3792
|
if (forceDebug) {
|
|
@@ -3691,7 +3816,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3691
3816
|
request.resourceType()
|
|
3692
3817
|
);
|
|
3693
3818
|
|
|
3694
|
-
if (result.blocked) {
|
|
3819
|
+
if (result.blocked && !isMainFrameDoc) {
|
|
3695
3820
|
adblockStats.blocked++;
|
|
3696
3821
|
if (forceDebug) {
|
|
3697
3822
|
console.log(formatLogMessage('debug', `${messageColors.blocked('[adblock]')} ${checkedUrl} (${result.reason})`));
|
|
@@ -3699,6 +3824,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3699
3824
|
request.abort('blockedbyclient');
|
|
3700
3825
|
return;
|
|
3701
3826
|
}
|
|
3827
|
+
if (result.blocked && isMainFrameDoc && forceDebug) {
|
|
3828
|
+
// Matched a filter rule but it's the page we're scanning (or a
|
|
3829
|
+
// main-frame redirect target) — allow it (blocking the top-level
|
|
3830
|
+
// document aborts navigation). It still flows through the matcher.
|
|
3831
|
+
console.log(formatLogMessage('debug', `${messageColors.highlight('[adblock]')} top-level document ${checkedUrl} matched (${result.reason}) — allowed (never block the scanned page)`));
|
|
3832
|
+
}
|
|
3702
3833
|
adblockStats.allowed++;
|
|
3703
3834
|
} catch (err) { /* Silently continue on adblock errors */ }
|
|
3704
3835
|
}
|
|
@@ -3752,7 +3883,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3752
3883
|
// check so domain-based blocks short-circuit without paying the
|
|
3753
3884
|
// per-URL regex scan. Same abort reason as the static path so
|
|
3754
3885
|
// request.failure() observers see consistent metadata.
|
|
3755
|
-
if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
|
|
3886
|
+
if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain) && !isMainFrameDoc) {
|
|
3756
3887
|
if (forceDebug) {
|
|
3757
3888
|
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
|
|
3758
3889
|
}
|
|
@@ -3767,7 +3898,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3767
3898
|
break;
|
|
3768
3899
|
}
|
|
3769
3900
|
}
|
|
3770
|
-
if (blockedMatchIndex !== -1) {
|
|
3901
|
+
if (blockedMatchIndex !== -1 && !isMainFrameDoc) {
|
|
3771
3902
|
// Always track the hit (zero-cost on the un-debug path) so the
|
|
3772
3903
|
// scan-end summary can show which patterns are doing work vs.
|
|
3773
3904
|
// which are stale and ready to prune. Keyed by pattern.source --
|
|
@@ -4349,15 +4480,114 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4349
4480
|
try {
|
|
4350
4481
|
navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage);
|
|
4351
4482
|
} catch (navErr) {
|
|
4352
|
-
// Only
|
|
4483
|
+
// Only handle genuine timeouts here, not chrome-error:// redirects.
|
|
4484
|
+
// pageUrl === 'about:blank' means the navigation never committed
|
|
4485
|
+
// (server never responded) — treat as a real failure, not a partial
|
|
4486
|
+
// page; only a page that actually reached a URL is worth observing.
|
|
4353
4487
|
let pageUrl = '';
|
|
4354
4488
|
try { if (!page.isClosed()) pageUrl = page.url(); } catch {}
|
|
4355
4489
|
const isPopupFailure = navErr.message.includes('chrome-error://') || navErr.message.includes('invalid URL') ||
|
|
4356
4490
|
pageUrl.startsWith('chrome-error://') || pageUrl === 'about:blank';
|
|
4357
4491
|
if ((navErr.message.includes('timeout') || navErr.message.includes('Timeout')) && !isPopupFailure) {
|
|
4358
|
-
|
|
4359
|
-
|
|
4360
|
-
|
|
4492
|
+
// The OLD fallback retried with networkidle2 — STRICTER than the
|
|
4493
|
+
// domcontentloaded default, so it could never rescue a
|
|
4494
|
+
// domcontentloaded timeout (and Puppeteer 25 has no 'commit', i.e.
|
|
4495
|
+
// nothing more lenient). Two-tier recovery instead:
|
|
4496
|
+
// 1. If the site used a wait STRICTER than domcontentloaded, do one
|
|
4497
|
+
// lenient retry with domcontentloaded (it fires earlier).
|
|
4498
|
+
// 2. Otherwise proceed with the partially-loaded page rather than
|
|
4499
|
+
// discarding the URL — it exists and requests already fired
|
|
4500
|
+
// (captured by page.on('request')); the delay/interact phase
|
|
4501
|
+
// below keeps capturing. Streaming/embed/media pages routinely
|
|
4502
|
+
// never reach DOM-ready (a connection stays open) but their
|
|
4503
|
+
// ad/tracker calls fired early.
|
|
4504
|
+
const primaryWait = gotoOptions.waitUntil || defaultWaitUntil;
|
|
4505
|
+
let recovered = false;
|
|
4506
|
+
if (primaryWait !== 'domcontentloaded') {
|
|
4507
|
+
try {
|
|
4508
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout (${primaryWait}), retrying with waitUntil:domcontentloaded for ${currentUrl}`));
|
|
4509
|
+
const fallbackOptions = { ...gotoOptions, waitUntil: 'domcontentloaded', timeout: Math.min(timeout, 15000) };
|
|
4510
|
+
navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, fallbackOptions, forceDebug, formatLogMessage);
|
|
4511
|
+
recovered = true;
|
|
4512
|
+
} catch (_) { /* fall through to proceed-with-partial */ }
|
|
4513
|
+
}
|
|
4514
|
+
if (!recovered) {
|
|
4515
|
+
let partialUrl = currentUrl;
|
|
4516
|
+
try { if (!page.isClosed()) partialUrl = page.url() || currentUrl; } catch {}
|
|
4517
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout — proceeding with partially-loaded page for ${currentUrl}`));
|
|
4518
|
+
navigationResult = { finalUrl: partialUrl, redirected: false, redirectChain: [currentUrl], originalUrl: currentUrl, redirectDomains: [], httpStatus: null, cfRay: null };
|
|
4519
|
+
}
|
|
4520
|
+
} else if (navErr.message.includes('ERR_TOO_MANY_REDIRECTS')) {
|
|
4521
|
+
// Redirect-cloaking chain exceeded Chrome's ~20-hop per-navigation
|
|
4522
|
+
// ceiling, so goto() rejected. Two recovery paths — they cover
|
|
4523
|
+
// opposite cases run-to-run, so try both:
|
|
4524
|
+
// 1. Browser ride-through (free): a JS/meta hop on a committed
|
|
4525
|
+
// intermediate page resets Chrome's counter and carries the page
|
|
4526
|
+
// to the end site on its own. Check if it already happened, else
|
|
4527
|
+
// wait briefly for it.
|
|
4528
|
+
// 2. curl-resolve (fallback, only if the page parked on
|
|
4529
|
+
// chrome-error): curl follows the chain (it gets the real chain,
|
|
4530
|
+
// not headless Chrome's endless loop) to the JS-handoff page;
|
|
4531
|
+
// navigating there directly is a SHORT hop that reaches the end
|
|
4532
|
+
// site. Skipped under proxy/VPN — curl runs DIRECT from the host
|
|
4533
|
+
// and would leak the real IP / resolve from the wrong network.
|
|
4534
|
+
// If neither reaches a real page, keep the chain requests already
|
|
4535
|
+
// captured (grouped under the original URL, never chrome-error).
|
|
4536
|
+
let landedUrl = '';
|
|
4537
|
+
// True only when `u` is a committed http(s) page that differs from the URL we
// originally asked for: rejects empty values, non-http(s) schemes (including
// chrome-error://), and the unchanged starting URL.
const isRealPage = (u) => {
  if (!u) return false;
  if (!/^https?:\/\//.test(u)) return false;
  if (u.startsWith('chrome-error://')) return false;
  return u !== currentUrl;
};
|
|
4538
|
+
|
|
4539
|
+
// 1) Browser ride-through — may have completed during goto(); if not,
|
|
4540
|
+
// wait for the next navigation(s) to carry it through.
|
|
4541
|
+
try { if (!page.isClosed() && isRealPage(page.url())) landedUrl = page.url(); } catch {}
|
|
4542
|
+
for (let r = 0; r < 3 && !landedUrl; r++) {
|
|
4543
|
+
try {
|
|
4544
|
+
await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 8000 });
|
|
4545
|
+
if (!page.isClosed() && isRealPage(page.url())) landedUrl = page.url();
|
|
4546
|
+
} catch { break; } // no further navigation — stop waiting
|
|
4547
|
+
}
|
|
4548
|
+
if (landedUrl && forceDebug) console.log(formatLogMessage('debug', `Too many redirects — browser rode through to ${landedUrl} for ${currentUrl}`));
|
|
4549
|
+
|
|
4550
|
+
// 2) curl-resolve fallback — only if still parked (no ride-through).
|
|
4551
|
+
// Opt-in via the site's `curl` option: if you didn't enable curl
|
|
4552
|
+
// in the config, the scanner won't shell out to it here either
|
|
4553
|
+
// (consistent with the content-analysis `curl` gate).
|
|
4554
|
+
if (!landedUrl) {
|
|
4555
|
+
const curlResolveOk = siteConfig.curl === true && !needsProxy(siteConfig) && !anyVpnConfigured && validateCurlAvailability().isAvailable;
|
|
4556
|
+
if (curlResolveOk) {
|
|
4557
|
+
let resolvedUrl = '';
|
|
4558
|
+
try {
|
|
4559
|
+
const curlUa = USER_AGENT_COLLECTIONS.get((siteConfig.userAgent || 'chrome').toLowerCase())
|
|
4560
|
+
|| 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36';
|
|
4561
|
+
const cr = await runProcess('curl', ['-sL', '--max-redirs', '50', '--max-time', '20', '-o', '/dev/null', '-A', curlUa, '-w', '%{url_effective}', currentUrl], { timeout: 22000, maxStdout: 4096 });
|
|
4562
|
+
const u = (cr.stdout || '').trim();
|
|
4563
|
+
if (cr.code === 0 && /^https?:\/\//.test(u) && u !== currentUrl) resolvedUrl = u;
|
|
4564
|
+
} catch (_) { /* curl failed */ }
|
|
4565
|
+
if (resolvedUrl) {
|
|
4566
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Too many redirects — curl resolved the chain to ${resolvedUrl}; navigating there directly for ${currentUrl}`));
|
|
4567
|
+
// Navigate to the resolved endpoint; the streaming/embed end page
|
|
4568
|
+
// often never reaches DOM-ready, so the goto may throw — either
|
|
4569
|
+
// way it navigated, so adopt page.url().
|
|
4570
|
+
try { navigationResult = await navigateWithRedirectHandling(page, resolvedUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage); } catch (_) { /* timed out — use page.url() below */ }
|
|
4571
|
+
try { if (!page.isClosed() && page.url() && !page.url().startsWith('chrome-error://')) landedUrl = page.url(); } catch {}
|
|
4572
|
+
} else if (forceDebug) {
|
|
4573
|
+
console.log(formatLogMessage('debug', `Too many redirects — no ride-through and curl could not resolve; keeping chain captures for ${currentUrl}`));
|
|
4574
|
+
}
|
|
4575
|
+
} else if (forceDebug) {
|
|
4576
|
+
const why = siteConfig.curl !== true ? 'curl not enabled (curl:false)'
|
|
4577
|
+
: (needsProxy(siteConfig) || anyVpnConfigured) ? 'proxy/VPN active'
|
|
4578
|
+
: 'curl unavailable';
|
|
4579
|
+
console.log(formatLogMessage('debug', `Too many redirects — no ride-through and curl-resolve skipped (${why}); keeping chain captures for ${currentUrl}`));
|
|
4580
|
+
}
|
|
4581
|
+
}
|
|
4582
|
+
|
|
4583
|
+
// navigateWithRedirectHandling may already have set navigationResult
|
|
4584
|
+
// (clean curl path). Otherwise build a partial from where we landed —
|
|
4585
|
+
// the end site if we rode through / curl'd, else the original URL with
|
|
4586
|
+
// the chain requests already captured.
|
|
4587
|
+
if (!navigationResult) {
|
|
4588
|
+
const fu = landedUrl || currentUrl;
|
|
4589
|
+
// Build the partial result from wherever the page ended up. When nothing landed
// (fu === currentUrl) no redirect took place, so keep a single-entry chain rather
// than listing currentUrl twice alongside redirected:false — this matches the
// shape of the navigation-timeout partial result.
navigationResult = { finalUrl: fu, redirected: fu !== currentUrl, redirectChain: fu !== currentUrl ? [currentUrl, fu] : [currentUrl], originalUrl: currentUrl, redirectDomains: [], httpStatus: null, cfRay: null };
|
|
4590
|
+
}
|
|
4361
4591
|
} else {
|
|
4362
4592
|
throw navErr;
|
|
4363
4593
|
}
|
|
@@ -4403,17 +4633,26 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4403
4633
|
redirectHistory.add(currentUrl);
|
|
4404
4634
|
redirectHistory.add(finalUrl);
|
|
4405
4635
|
|
|
4406
|
-
// Add redirect destination to first-party domains
|
|
4407
|
-
|
|
4408
|
-
|
|
4409
|
-
|
|
4410
|
-
|
|
4411
|
-
//
|
|
4412
|
-
|
|
4413
|
-
|
|
4414
|
-
|
|
4415
|
-
|
|
4416
|
-
}
|
|
4636
|
+
// Add redirect destination (and intermediates) to first-party domains
|
|
4637
|
+
// so the landed site's own resources aren't captured as third-party.
|
|
4638
|
+
// Opt out with redirect_first_party:false — then redirect targets stay
|
|
4639
|
+
// THIRD-PARTY and become eligible for filterRegex/dig under
|
|
4640
|
+
// thirdParty:true (e.g. capturing an ad/cloak redirect's end domain).
|
|
4641
|
+
// The originally-scanned domain (added earlier) stays first-party.
|
|
4642
|
+
const redirectsAreFirstParty = siteConfig.redirect_first_party !== false;
|
|
4643
|
+
if (redirectsAreFirstParty) {
|
|
4644
|
+
if (finalDomain) {
|
|
4645
|
+
firstPartyDomains.add(finalDomain);
|
|
4646
|
+
}
|
|
4647
|
+
// Also add any intermediate redirect domains as first-party
|
|
4648
|
+
if (redirectDomains && redirectDomains.length > 0) {
|
|
4649
|
+
redirectDomains.forEach(domain => {
|
|
4650
|
+
const rootDomain = safeGetDomain(`http://${domain}`, false);
|
|
4651
|
+
if (rootDomain) firstPartyDomains.add(rootDomain);
|
|
4652
|
+
});
|
|
4653
|
+
}
|
|
4654
|
+
} else if (forceDebug) {
|
|
4655
|
+
console.log(formatLogMessage('debug', `redirect_first_party:false — keeping redirect target ${finalDomain} third-party for ${currentUrl}`));
|
|
4417
4656
|
}
|
|
4418
4657
|
|
|
4419
4658
|
if (originalDomain !== finalDomain) {
|
|
@@ -4630,13 +4869,85 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4630
4869
|
// Capture hard "dead domain" navigation errors for --show-dead-domains
|
|
4631
4870
|
// (DNS doesn't resolve / host unreachable). Blocks, timeouts and CF
|
|
4632
4871
|
// challenges are NOT dead — they're excluded by this match.
|
|
4633
|
-
|
|
4634
|
-
|
|
4872
|
+
// Only DEFINITIVE non-existence / unreachable signals — these now drive
|
|
4873
|
+
// the in-scan dead-domain SKIP (not just --show-dead-domains reporting),
|
|
4874
|
+
// so transient DNS errors must NOT match. The bare `ERR_DNS` used to
|
|
4875
|
+
// catch ERR_DNS_TIMED_OUT / ERR_DNS_MALFORMED_RESPONSE / ERR_DNS_SERVER_FAILED
|
|
4876
|
+
// (all transient) — dropped so a slow-DNS blip can't false-skip the
|
|
4877
|
+
// rest of a live host's URLs.
|
|
4878
|
+
const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE/.exec(err.message || '');
|
|
4879
|
+
if (deadNav) {
|
|
4880
|
+
recordDeadDomain(currentUrl, deadNav[0]);
|
|
4881
|
+
// Corroborate-then-persist to the negative cache (.dnsnegcache with
|
|
4882
|
+
// --dns-cache → cross-scan skip; else in-memory). Chrome resolves via
|
|
4883
|
+
// the possibly-flaky SYSTEM resolver, so its ERR_NAME_NOT_RESOLVED may
|
|
4884
|
+
// be a glitch on a LIVE host. Re-confirm via the reliable --dns
|
|
4885
|
+
// resolver and cache ONLY if it ALSO returns a definitive NXDOMAIN.
|
|
4886
|
+
// ERR_ADDRESS_UNREACHABLE is routing (the host resolves), so the
|
|
4887
|
+
// resolve succeeds and it's correctly not cached. Fire-and-forget:
|
|
4888
|
+
// off the critical path; saveDiskCache flushes on exit.
|
|
4889
|
+
if (dnsPrecheckEnabled && deadNav[0] === 'ERR_NAME_NOT_RESOLVED') {
|
|
4890
|
+
let navHost = '';
|
|
4891
|
+
try { navHost = new URL(currentUrl).hostname; } catch {}
|
|
4892
|
+
if (navHost && !/^[\d.:]+$|^\[/.test(navHost) && !dnsNegativeCache.has(navHost)) {
|
|
4893
|
+
dnsResolver.resolveHost(navHost, dnsPrecheckTimeoutMs).then(
|
|
4894
|
+
() => { /* reliable resolver resolves it — system-resolver glitch, do NOT cache */ },
|
|
4895
|
+
(e) => {
|
|
4896
|
+
const code = (e && (e.code || e.message)) || '';
|
|
4897
|
+
if (isNonExistenceError(code)) {
|
|
4898
|
+
dnsNegativeCacheSet(navHost, code);
|
|
4899
|
+
recordDeadDomain(navHost, code);
|
|
4900
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Dead domain confirmed by --dns resolver (${code}) — caching ${navHost} (skips next run with --dns-cache)`));
|
|
4901
|
+
}
|
|
4902
|
+
}
|
|
4903
|
+
).catch(() => {});
|
|
4904
|
+
}
|
|
4905
|
+
}
|
|
4906
|
+
}
|
|
4635
4907
|
throw err;
|
|
4636
4908
|
}
|
|
4637
4909
|
}
|
|
4638
4910
|
}
|
|
4639
4911
|
|
|
4912
|
+
// Targeted clicks: after load, click configured CSS selectors in order
|
|
4913
|
+
// (e.g. a movie link, then a play button) to reach content via organic
|
|
4914
|
+
// navigation/gesture instead of a direct deep-load (which some sites
|
|
4915
|
+
// JS-redirect away). The request interceptor stays attached, so the
|
|
4916
|
+
// post-click page's requests flow into the same filterRegex/dig matching.
|
|
4917
|
+
// Reuses realistic_click for a genuine trusted gesture. Runs before the
|
|
4918
|
+
// delay/interact phase so those operate on the resulting page.
|
|
4919
|
+
if (Array.isArray(siteConfig.click_elements) && siteConfig.click_elements.length > 0 && page && !page.isClosed()) {
|
|
4920
|
+
// If ghost-cursor is enabled for this site (cursor_mode:"ghost" or
|
|
4921
|
+
// --ghost-cursor), route the targeted clicks through it — Bezier travel
|
|
4922
|
+
// to the element + realistic press — matching the interact phase.
|
|
4923
|
+
// Injected so interaction.js needn't require ghost-cursor.js (circular).
|
|
4924
|
+
// Falls back to performTargetedClicks' humanClick/el.click when ghost is
|
|
4925
|
+
// off or the package isn't installed (resolveGhostCursorConfig → null).
|
|
4926
|
+
let ghostClicker = null;
|
|
4927
|
+
const tcGhostCfg = resolveGhostCursorConfig(siteConfig, globalGhostCursor, forceDebug);
|
|
4928
|
+
if (tcGhostCfg) {
|
|
4929
|
+
const tcCursor = createGhostCursor(page, { forceDebug });
|
|
4930
|
+
if (tcCursor) {
|
|
4931
|
+
ghostClicker = (x, y) => ghostClick(tcCursor, { x, y }, {
|
|
4932
|
+
hesitate: tcGhostCfg.hesitate,
|
|
4933
|
+
page,
|
|
4934
|
+
realistic: siteConfig.realistic_click === true,
|
|
4935
|
+
forceDebug
|
|
4936
|
+
});
|
|
4937
|
+
}
|
|
4938
|
+
}
|
|
4939
|
+
try {
|
|
4940
|
+
await performTargetedClicks(page, siteConfig.click_elements, {
|
|
4941
|
+
realistic: siteConfig.realistic_click === true,
|
|
4942
|
+
waitMs: Math.min(Number(siteConfig.click_wait) || 5000, Math.floor(timeout / 2)),
|
|
4943
|
+
ghostClick: ghostClicker,
|
|
4944
|
+
forceDebug
|
|
4945
|
+
});
|
|
4946
|
+
} catch (clickErr) {
|
|
4947
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${INTERACTION_TAG} click_elements phase failed for ${currentUrl}: ${clickErr.message}`));
|
|
4948
|
+
}
|
|
4949
|
+
}
|
|
4950
|
+
|
|
4640
4951
|
const delayMs = siteConfig.delay || TIMEOUTS.DEFAULT_DELAY;
|
|
4641
4952
|
|
|
4642
4953
|
// Optimized delays for Puppeteer 23.x performance
|
|
@@ -4653,6 +4964,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4653
4964
|
const actualDelay = siteConfig.delay_uncapped === true
|
|
4654
4965
|
? Math.min(delayMs, Math.floor(timeout / 2))
|
|
4655
4966
|
: Math.min(delayMs, TIMEOUTS.NETWORK_IDLE);
|
|
4967
|
+
// Surface the clamp — otherwise `delay: 48000` silently running as 29000
|
|
4968
|
+
// (timeout/2) looks like the flag was ignored. The per-URL budget already
|
|
4969
|
+
// reserves the full `delay`, so the lever to honor it is a larger timeout.
|
|
4970
|
+
if (forceDebug && actualDelay < delayMs) {
|
|
4971
|
+
const ceiling = siteConfig.delay_uncapped === true ? 'timeout/2; raise timeout to lift' : 'default 2s cap; set delay_uncapped:true to lift';
|
|
4972
|
+
console.log(formatLogMessage('debug', `delay ${delayMs}ms clamped to ${actualDelay}ms (${ceiling}) for ${currentUrl}`));
|
|
4973
|
+
}
|
|
4656
4974
|
|
|
4657
4975
|
// Build delay promise (networkIdle + delay + optional flowProxy delay)
|
|
4658
4976
|
const delayPromise = (async () => {
|
|
@@ -4925,6 +5243,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4925
5243
|
|
|
4926
5244
|
let reloadSuccess = false;
|
|
4927
5245
|
|
|
5246
|
+
// page.reload() can't carry a referer; when referrer_headers is set,
|
|
5247
|
+
// re-navigate to the current URL with it so referer-gated embeds keep
|
|
5248
|
+
// serving across the reload:N loop (the initial goto carries the referer,
|
|
5249
|
+
// but reload() drops it). Nav-only scope — subresources keep their normal
|
|
5250
|
+
// page-origin referer (unlike setExtraHTTPHeaders, which would force the
|
|
5251
|
+
// referer onto every request and can break embeds whose subresources
|
|
5252
|
+
// expect own-origin). A static referrer_headers string is identical each
|
|
5253
|
+
// reload; random/mixed modes pick a fresh value per reload.
|
|
5254
|
+
const reloadReferer = siteConfig.referrer_headers
|
|
5255
|
+
? getReferrerForUrl(currentUrl, siteConfig.referrer_headers, siteConfig.referrer_disable, forceDebug)
|
|
5256
|
+
: '';
|
|
5257
|
+
const reloadOrReferredGoto = (opts) => reloadReferer
|
|
5258
|
+
? page.goto(page.url(), { ...opts, referer: reloadReferer })
|
|
5259
|
+
: page.reload(opts);
|
|
5260
|
+
|
|
4928
5261
|
// Skip force reload if browser seems unhealthy
|
|
4929
5262
|
const skipForceReload = i > 2; // After 2 attempts, skip force reload
|
|
4930
5263
|
|
|
@@ -4947,7 +5280,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4947
5280
|
await raceWithTimer(page.setCacheEnabled(false), 'Cache disable timeout', 8000);
|
|
4948
5281
|
|
|
4949
5282
|
// Use networkidle2 for force reload to better detect when page is actually loaded
|
|
4950
|
-
await
|
|
5283
|
+
await reloadOrReferredGoto({ waitUntil: 'networkidle2', timeout: Math.min(timeout, 15000) });
|
|
4951
5284
|
|
|
4952
5285
|
// Timeout-protected cache enable
|
|
4953
5286
|
await raceWithTimer(page.setCacheEnabled(true), 'Cache enable timeout', 8000);
|
|
@@ -4986,7 +5319,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4986
5319
|
? { waitUntil: 'domcontentloaded', timeout: 10000 } // Simpler after failures
|
|
4987
5320
|
: { waitUntil: 'networkidle2', timeout: 15000 }; // Full wait first time
|
|
4988
5321
|
|
|
4989
|
-
await
|
|
5322
|
+
await reloadOrReferredGoto(reloadOptions);
|
|
4990
5323
|
|
|
4991
5324
|
if (forceDebug) console.log(formatLogMessage('debug', `Standard reload #${i} completed for ${currentUrl}`));
|
|
4992
5325
|
} catch (standardReloadErr) {
|
|
@@ -5263,7 +5596,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5263
5596
|
const safeUrl = currentUrl.replace(/https?:\/\//, '').replace(/[^a-zA-Z0-9]/g, '_').substring(0, 80);
|
|
5264
5597
|
const filename = `screenshots/${safeUrl}-${timestamp}.png`;
|
|
5265
5598
|
try {
|
|
5266
|
-
|
|
5599
|
+
fs.mkdirSync('screenshots', { recursive: true }); // recursive:true is a no-op if it already exists
|
|
5267
5600
|
await page.screenshot({ path: filename, type: 'png', fullPage: true });
|
|
5268
5601
|
console.log(formatLogMessage('info', `Screenshot saved: ${filename}`));
|
|
5269
5602
|
} catch (screenshotErr) {
|
|
@@ -5759,6 +6092,19 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5759
6092
|
// actually starting — wrongly skipping live domains. c-ares isn't
|
|
5760
6093
|
// threadpool-bound so it's immune to that contention.
|
|
5761
6094
|
if (dnsPrecheckEnabled && taskDomain && !/^[\d.:]+$|^\[/.test(taskDomain)) {
|
|
6095
|
+
// Already proven dead earlier THIS run — either a pre-check NXDOMAIN or
|
|
6096
|
+
// a prior URL's navigation hit ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE
|
|
6097
|
+
// (recordDeadDomain populates _deadDomains for both). Skip the repeat
|
|
6098
|
+
// instead of paying another fail-open navigation on a multi-URL dead
|
|
6099
|
+
// host (e.g. dlstreams.top?id=39/54/347). In-scan only (NOT persisted):
|
|
6100
|
+
// Chrome resolves via the system resolver, so a nav-level failure could
|
|
6101
|
+
// be a system-resolver glitch on a live host — a false "dead" must not
|
|
6102
|
+
// carry across runs. Cheap: a Map lookup, no DNS resolve.
|
|
6103
|
+
if (_deadDomains.has(taskDomain)) {
|
|
6104
|
+
dnsPrecheckSkips++;
|
|
6105
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check: ${taskDomain} already dead this run (${_deadDomains.get(taskDomain)}) — skipping`));
|
|
6106
|
+
return { url: task.url, rules: [], success: false, error: `DNS: ${_deadDomains.get(taskDomain)}`, skipped: true };
|
|
6107
|
+
}
|
|
5762
6108
|
const cached = dnsNegativeCache.get(taskDomain);
|
|
5763
6109
|
if (cached && Date.now() - cached.timestamp < DNS_NEGATIVE_CACHE_TTL_MS) {
|
|
5764
6110
|
dnsPrecheckSkips++;
|
|
@@ -5833,10 +6179,24 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5833
6179
|
const INTERACTION_OVERHEAD_MS = interactionOnForTask
|
|
5834
6180
|
? computeInteractionCeilingMs(createInteractionConfig(task.url, task.config))
|
|
5835
6181
|
: 0;
|
|
6182
|
+
// click_elements runs ONCE after load (before the delay/interact/reload
|
|
6183
|
+
// phases): N selectors, each a settle/nav wait (click_wait, capped at
|
|
6184
|
+
// timeout/2 — mirror the call site) plus ~2s for scroll + the click action
|
|
6185
|
+
// (ghost Bezier travel is the slowest). Budget it so a heavy click chain
|
|
6186
|
+
// can't trip the per-URL ceiling before the work that follows it. Not
|
|
6187
|
+
// multiplied by reloadCount — the click phase is one-time.
|
|
6188
|
+
const clickEls = Array.isArray(task.config.click_elements)
|
|
6189
|
+
? task.config.click_elements.filter(s => typeof s === 'string' && s.trim())
|
|
6190
|
+
: [];
|
|
6191
|
+
const clickWaitMs = clickEls.length
|
|
6192
|
+
? Math.min(Number(task.config.click_wait) || 5000, Math.floor((task.config.timeout || 35000) / 2))
|
|
6193
|
+
: 0;
|
|
6194
|
+
const CLICK_ELEMENTS_OVERHEAD_MS = clickEls.length * (clickWaitMs + 2000);
|
|
5836
6195
|
const PER_URL_TIMEOUT_MS = Math.max(
|
|
5837
6196
|
75000,
|
|
5838
6197
|
(task.config.timeout || 35000)
|
|
5839
6198
|
+ ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
|
|
6199
|
+
+ CLICK_ELEMENTS_OVERHEAD_MS
|
|
5840
6200
|
+ 30000
|
|
5841
6201
|
);
|
|
5842
6202
|
// Feed the hang-check restart so it never escalates before this URL's own
|