@fanboynz/network-scanner 3.1.2 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +52 -1
- package/CLAUDE.md +2 -1
- package/README.md +67 -7
- package/eslint.config.mjs +13 -1
- package/lib/browserhealth.js +25 -3
- package/lib/dns.js +238 -0
- package/lib/domain-cache.js +14 -127
- package/lib/ghost-cursor.js +29 -11
- package/lib/interaction.js +4 -0
- package/lib/nettools.js +157 -54
- package/lib/openvpn_vpn.js +8 -0
- package/lib/output.js +24 -13
- package/lib/redirect.js +4 -1
- package/lib/validate_rules.js +16 -1
- package/lib/wireguard_vpn.js +8 -0
- package/nwss.1 +84 -15
- package/nwss.js +536 -164
- package/package.json +1 -1
package/nwss.js
CHANGED
|
@@ -9,9 +9,9 @@ const fs = require('fs');
|
|
|
9
9
|
const os = require('os');
|
|
10
10
|
const psl = require('psl');
|
|
11
11
|
const path = require('path');
|
|
12
|
-
const
|
|
12
|
+
const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError } = require('./lib/dns');
|
|
13
13
|
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
14
|
-
const { compressMultipleFiles
|
|
14
|
+
const { compressMultipleFiles } = require('./lib/compress');
|
|
15
15
|
const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
|
|
16
16
|
const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROME_GREASE_BRAND } = require('./lib/fingerprint');
|
|
17
17
|
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
|
|
@@ -34,9 +34,7 @@ const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore
|
|
|
34
34
|
// Graceful exit
|
|
35
35
|
const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
|
|
36
36
|
// Whois & Dig
|
|
37
|
-
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
|
|
38
|
-
// File compare
|
|
39
|
-
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
|
|
37
|
+
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve, loadDiskCache, saveDiskCache, setDigResolvers } = require('./lib/nettools');
|
|
40
38
|
// CDP functionality
|
|
41
39
|
const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeout } = require('./lib/cdp');
|
|
42
40
|
// Post-processing cleanup
|
|
@@ -57,6 +55,7 @@ const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
|
|
|
57
55
|
const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
|
|
58
56
|
const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
|
|
59
57
|
const VPN_TAG = messageColors.processing('[vpn]');
|
|
58
|
+
const POPUP_TAG = messageColors.processing('[popup]');
|
|
60
59
|
// Precomputed colored '[SmartCache]' subsystem prefix — paired with the
|
|
61
60
|
// same constant in lib/smart-cache.js so debug lines from both files
|
|
62
61
|
// produce consistently colored output. formatLogMessage only colors the
|
|
@@ -68,14 +67,14 @@ const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
|
68
67
|
// Enhanced mouse interaction and page simulation
|
|
69
68
|
const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
70
69
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
71
|
-
const {
|
|
70
|
+
const { createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
|
|
72
71
|
// Domain detection cache for performance optimization
|
|
73
|
-
const { createGlobalHelpers,
|
|
72
|
+
const { createGlobalHelpers, getDetectedDomainsCount } = require('./lib/domain-cache');
|
|
74
73
|
const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
|
|
75
74
|
const { clearPersistentCache } = require('./lib/smart-cache');
|
|
76
75
|
const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
|
|
77
76
|
// Dry run functionality
|
|
78
|
-
const { initializeDryRunCollections, addDryRunMatch,
|
|
77
|
+
const { initializeDryRunCollections, addDryRunMatch, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
|
|
79
78
|
// Enhanced site data clearing functionality
|
|
80
79
|
const { clearSiteData } = require('./lib/clear_sitedata');
|
|
81
80
|
// Referrer header generation
|
|
@@ -137,6 +136,7 @@ const CONCURRENCY_LIMITS = Object.freeze({
|
|
|
137
136
|
// Keep using the imported map directly so the two can never diverge again.
|
|
138
137
|
|
|
139
138
|
const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
|
|
139
|
+
const REALTIME_CLEANUP_BUFFER_MS = 25000; // Buffer added after site delay before realtime window cleanup
|
|
140
140
|
|
|
141
141
|
/**
|
|
142
142
|
* Detects the installed Puppeteer version dynamically
|
|
@@ -181,7 +181,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
|
|
|
181
181
|
// purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
|
|
182
182
|
// and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
|
|
183
183
|
// automatically — manual purging is no longer needed.
|
|
184
|
-
const { monitorBrowserHealth,
|
|
184
|
+
const { monitorBrowserHealth, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
|
|
185
185
|
|
|
186
186
|
// --- Script Configuration & Constants ---
|
|
187
187
|
const VERSION = '2.0.33'; // Script version
|
|
@@ -191,7 +191,12 @@ const startTime = Date.now();
|
|
|
191
191
|
|
|
192
192
|
// Initialize domain cache helpers with debug logging if enabled
|
|
193
193
|
const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
|
|
194
|
-
|
|
194
|
+
// Only markDomainAsDetected is used — the global cache feeds the end-of-scan
|
|
195
|
+
// "unique domains cached" stat (getDetectedDomainsCount). The skip-check
|
|
196
|
+
// (isDomainAlreadyDetected) is intentionally not wired in: cross-URL dedup is
|
|
197
|
+
// already handled by nettools' global processed-domain sets, smart-cache, and
|
|
198
|
+
// the per-URL local set, so a cache-level skip would be redundant.
|
|
199
|
+
const { markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
|
|
195
200
|
|
|
196
201
|
// Smart cache will be initialized after config is loaded
|
|
197
202
|
let smartCache = null;
|
|
@@ -232,6 +237,9 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
232
237
|
const settingsMap = {
|
|
233
238
|
output: ['-o', '--output'],
|
|
234
239
|
max_concurrent: ['--max-concurrent'],
|
|
240
|
+
cleanup_interval: ['--cleanup-interval'],
|
|
241
|
+
resource_cleanup_interval: ['--cleanup-interval'],
|
|
242
|
+
dns: ['--dns'],
|
|
235
243
|
dns_cache: ['--dns-cache'],
|
|
236
244
|
cache_requests: ['--cache-requests'],
|
|
237
245
|
dumpurls: ['--dumpurls'],
|
|
@@ -243,20 +251,25 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
243
251
|
compress_logs: ['--compress-logs'],
|
|
244
252
|
debug: ['--debug'],
|
|
245
253
|
silent: ['--silent'],
|
|
246
|
-
verbose: ['--verbose'],
|
|
247
254
|
headful: ['--headful'],
|
|
248
255
|
keep_open: ['--keep-open'],
|
|
249
256
|
dry_run: ['--dry-run'],
|
|
250
257
|
titles: ['--titles'],
|
|
251
258
|
sub_domains: ['--sub-domains'],
|
|
252
259
|
no_interact: ['--no-interact'],
|
|
260
|
+
show_dead_domains: ['--show-dead-domains'],
|
|
253
261
|
ghost_cursor: ['--ghost-cursor'],
|
|
254
262
|
plain: ['--plain'],
|
|
255
263
|
cdp: ['--cdp'],
|
|
256
264
|
dnsmasq: ['--dnsmasq'],
|
|
265
|
+
dnsmasq_old: ['--dnsmasq-old'],
|
|
257
266
|
unbound: ['--unbound'],
|
|
258
267
|
privoxy: ['--privoxy'],
|
|
259
268
|
pihole: ['--pihole'],
|
|
269
|
+
adblock_rules: ['--adblock-rules'],
|
|
270
|
+
no_dns_precheck: ['--no-dns-precheck'],
|
|
271
|
+
allow_fullscreen: ['--allow-fullscreen'],
|
|
272
|
+
load_extension: ['--load-extension'],
|
|
260
273
|
eval_on_doc: ['--eval-on-doc'],
|
|
261
274
|
use_puppeteer_core: ['--use-puppeteer-core'],
|
|
262
275
|
ignore_cache: ['--ignore-cache'],
|
|
@@ -314,7 +327,6 @@ if (compareIndex !== -1 && args[compareIndex + 1]) {
|
|
|
314
327
|
}
|
|
315
328
|
|
|
316
329
|
|
|
317
|
-
const forceVerbose = args.includes('--verbose');
|
|
318
330
|
const forceDebug = args.includes('--debug');
|
|
319
331
|
const silentMode = args.includes('--silent');
|
|
320
332
|
const showTitles = args.includes('--titles');
|
|
@@ -337,12 +349,16 @@ const disableInteract = args.includes('--no-interact');
|
|
|
337
349
|
const globalGhostCursor = args.includes('--ghost-cursor');
|
|
338
350
|
const plainOutput = args.includes('--plain');
|
|
339
351
|
const enableCDP = args.includes('--cdp');
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
const
|
|
352
|
+
// These six are reassigned to false by the incompatible-flag validation
|
|
353
|
+
// blocks below (e.g. --dnsmasq + --unbound), so they must be `let` — as
|
|
354
|
+
// `const` that fallback threw "Assignment to constant variable" the moment
|
|
355
|
+
// two conflicting output modes were combined.
|
|
356
|
+
let dnsmasqMode = args.includes('--dnsmasq');
|
|
357
|
+
let dnsmasqOldMode = args.includes('--dnsmasq-old');
|
|
358
|
+
let unboundMode = args.includes('--unbound');
|
|
343
359
|
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
|
|
344
|
-
|
|
345
|
-
|
|
360
|
+
let privoxyMode = args.includes('--privoxy');
|
|
361
|
+
let piholeMode = args.includes('--pihole');
|
|
346
362
|
const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
|
|
347
363
|
const dryRunMode = args.includes('--dry-run');
|
|
348
364
|
const compressLogs = args.includes('--compress-logs');
|
|
@@ -363,6 +379,25 @@ if (dnsCacheMode) enableDiskCache();
|
|
|
363
379
|
const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
|
|
364
380
|
const dnsPrecheckTimeoutMs = 2000;
|
|
365
381
|
|
|
382
|
+
// --show-dead-domains: collect hostnames that are definitively DEAD (do not
|
|
383
|
+
// exist / unreachable) and print them at the end of the scan so they can be
|
|
384
|
+
// pruned. Only hard signals count — NXDOMAIN/ENODATA from the pre-check and
|
|
385
|
+
// ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE from navigation. Transient
|
|
386
|
+
// failures (403/429 blocks, timeouts, Cloudflare challenges) mean the domain is
|
|
387
|
+
// ALIVE and are deliberately excluded. host -> reason (first seen).
|
|
388
|
+
const showDeadDomains = args.includes('--show-dead-domains');
|
|
389
|
+
// host -> reason (first seen) for every hostname proven dead during this run.
const _deadDomains = new Map();

// Matches a scheme-less "host:port" string (dotted name / IPv4-looking name, or
// localhost). Needed because new URL('example.com:8080') does NOT throw: the
// parser treats "example.com" as a scheme and yields an empty hostname.
const DEAD_DOMAIN_HOST_PORT_RE = /^((?:[a-z0-9-]+\.)+[a-z0-9-]+|localhost):\d{1,5}$/i;

/**
 * Records a hostname as dead for this run, keeping the first reason seen.
 *
 * Recording is unconditional: the pre-check skip reads _deadDomains to drop
 * repeat URLs on a host already proven dead, which must work whether or not
 * --show-dead-domains is set. Only the end-of-scan report is gated on that flag.
 *
 * @param {string} urlOrHost - A full URL, a bare hostname, or a bare "host:port".
 * @param {string} reason - Why the host is considered dead (e.g. an error code).
 * @returns {void}
 */
function recordDeadDomain(urlOrHost, reason) {
  if (!urlOrHost) return;
  let host;
  try {
    host = new URL(urlOrHost).hostname;
    if (!host) {
      // Parsed, but with no authority: either a genuinely host-less URL
      // (about:blank, data:, mailto:) — which stays ignored — or a bare
      // "host:port" misread as "scheme:path". Recover the host in that case.
      const hostPort = DEAD_DOMAIN_HOST_PORT_RE.exec(urlOrHost);
      host = hostPort ? hostPort[1] : '';
    }
  } catch {
    // Not parseable as a URL — already a bare host, use it as given.
    host = urlOrHost;
  }
  if (host && !_deadDomains.has(host)) _deadDomains.set(host, reason);
}
|
|
400
|
+
|
|
366
401
|
// Per-scan cache of negative DNS lookups. OS resolvers don't always cache
|
|
367
402
|
// NXDOMAIN responses, and a scan can hit the same dead hostname many times
|
|
368
403
|
// (different URL paths on the same site). Positive results are left to the
|
|
@@ -371,14 +406,67 @@ const dnsPrecheckTimeoutMs = 2000;
|
|
|
371
406
|
// of unique dead hosts) can't grow the cache unboundedly. Same pattern as
|
|
372
407
|
// the rest of the codebase's in-memory caches.
|
|
373
408
|
const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
|
|
374
|
-
const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
375
409
|
const DNS_NEGATIVE_CACHE_MAX = 1000;
|
|
410
|
+
// The negative cache holds ONLY definitive non-existence (NXDOMAIN/ENODATA) —
|
|
411
|
+
// resolver errors fail open and never enter it (see the pre-check catch), so
|
|
412
|
+
// persisting it can't silently drop a live host. Opt-in via --dns-cache: dead
|
|
413
|
+
// hosts are remembered for DNS_NEGATIVE_PERSIST_TTL_MS and reloaded next run;
|
|
414
|
+
// otherwise it's a 5-min in-memory-only cache. The persist TTL is deliberately
|
|
415
|
+
// shorter than the dig/whois positive cache (dig 20h / whois 36h): a domain that doesn't exist
|
|
416
|
+
// now MAY get registered, and this is a domain-hunting scanner, so the dead
|
|
417
|
+
// ones are re-checked twice a day rather than trusted for ~a day.
|
|
418
|
+
const DNS_NEGATIVE_PERSIST_TTL_MS = 12 * 60 * 60 * 1000; // 12 hours
|
|
419
|
+
const DNS_NEGATIVE_CACHE_TTL_MS = dnsCacheMode ? DNS_NEGATIVE_PERSIST_TTL_MS : 5 * 60 * 1000;
|
|
420
|
+
const DNS_NEGATIVE_CACHE_FILE = path.join(__dirname, '.dnsnegcache');
|
|
421
|
+
if (dnsCacheMode) {
|
|
422
|
+
// Reuse the dig/whois caches' generic load/save (atomic write, TTL + size
|
|
423
|
+
// bounded). The 'exit' flush is synchronous (writeFileSync) so it fires on
|
|
424
|
+
// any exit path, mirroring nettools' dig/whois flush.
|
|
425
|
+
loadDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX);
|
|
426
|
+
process.on('exit', () => saveDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX));
|
|
427
|
+
}
|
|
376
428
|
let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
|
|
377
429
|
let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
|
|
378
430
|
const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
|
|
379
|
-
//
|
|
380
|
-
//
|
|
381
|
-
|
|
431
|
+
// DNS pre-check resolver (rotation + resolution logic lives in lib/dns.js).
|
|
432
|
+
// `--dns <ip[,ip...]>` (or a `dns` setting in .nwssconfig, mapped to the same
|
|
433
|
+
// flag) pins/rotates an explicit resolver list; otherwise the resolv.conf
|
|
434
|
+
// nameservers are rotated. Rotation spreads the c-ares burst so one server
|
|
435
|
+
// (e.g. a flaky ISP resolver) doesn't absorb every query and answer REFUSED.
|
|
436
|
+
const dnsServerIndex = args.findIndex(arg => arg === '--dns');
|
|
437
|
+
const dnsServersOverride = (dnsServerIndex !== -1 && args[dnsServerIndex + 1])
|
|
438
|
+
? parseDnsServers(args[dnsServerIndex + 1])
|
|
439
|
+
: [];
|
|
440
|
+
const dnsResolver = createRotatingResolver({ servers: dnsServersOverride, forceDebug });
|
|
441
|
+
// Route nettools' dig through the same --dns resolvers (dig otherwise uses the
|
|
442
|
+
// system /etc/resolv.conf, which on a flaky setup times out and silently drops
|
|
443
|
+
// dig-gated domains). Only when --dns is explicitly set.
|
|
444
|
+
if (dnsServersOverride.length > 0) setDigResolvers(dnsServersOverride);
|
|
445
|
+
// Circuit breaker: if resolver errors dominate, suspend the pre-check for a
|
|
446
|
+
// cooldown so a refusal storm doesn't keep hammering a broken resolver (sites
|
|
447
|
+
// still load — a suspended pre-check just proceeds to navigation).
|
|
448
|
+
const dnsBreaker = createDnsCircuitBreaker({ forceDebug });
|
|
449
|
+
if (dnsResolver.pinned && !silentMode) {
|
|
450
|
+
const how = dnsResolver.servers.length === 1 ? 'pinned to' : 'rotating';
|
|
451
|
+
console.log(formatLogMessage('info', `DNS pre-check ${how} ${dnsResolver.servers.join(', ')}`));
|
|
452
|
+
} else if (forceDebug && dnsResolver.rotates) {
|
|
453
|
+
console.log(formatLogMessage('debug', `DNS pre-check rotating ${dnsResolver.servers.length} resolv.conf nameservers: ${dnsResolver.servers.join(', ')}`));
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
// Idle-hang watchdog registry: in-flight main pages, iterable (the
|
|
457
|
+
// browserhealth page trackers are WeakMaps and can't be scanned). Registered
|
|
458
|
+
// when a task starts navigating, removed on completion. The hang check probes
|
|
459
|
+
// these ONLY while global progress is stalled and force-closes any page that is
|
|
460
|
+
// unresponsive across consecutive probes — recovering a single hung URL in ~the
|
|
461
|
+
// hang-check window instead of waiting out its full per-URL ceiling (which is
|
|
462
|
+
// the backstop). Acting only during a stall + requiring unresponsiveness avoids
|
|
463
|
+
// killing a page that's merely slow (a page in a config delay is idle but
|
|
464
|
+
// RESPONDS to a trivial evaluate; a hung one does not). Entries self-heal via
|
|
465
|
+
// isClosed() so timeout/error paths that skip the normal close can't leak.
|
|
466
|
+
const _inFlightPages = new Map(); // page -> { url, unresponsiveStrikes }
|
|
467
|
+
const PAGE_HANG_PROBE_TIMEOUT_MS = 2000; // liveness-probe (page.evaluate) cap; no response within this = hung
|
|
468
|
+
const PAGE_HANG_PROBE_INTERVAL_MS = 15000; // how often to probe in-flight pages while the scan is stalled
|
|
469
|
+
const PAGE_HANG_STRIKES_TO_KILL = 2; // consecutive HUNG probes before force-close (~30s recovery at the 15s interval)
|
|
382
470
|
|
|
383
471
|
function dnsNegativeCacheSet(hostname, error) {
|
|
384
472
|
if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
|
|
@@ -632,6 +720,9 @@ if (blockAdsIndex !== -1) {
|
|
|
632
720
|
|
|
633
721
|
adblockEnabled = true;
|
|
634
722
|
const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
|
|
723
|
+
// Only ever assigned the os.tmpdir() path below — never a user file — so the
|
|
724
|
+
// unlink in finally can never touch the caller's own lists.
|
|
725
|
+
let combinedTmpFile = null;
|
|
635
726
|
try {
|
|
636
727
|
if (engine === adblockRust) {
|
|
637
728
|
// Rust wrapper accepts an array directly — no temp file needed.
|
|
@@ -640,15 +731,22 @@ if (blockAdsIndex !== -1) {
|
|
|
640
731
|
// JS engine takes a single path; concat to a temp file when multiple lists.
|
|
641
732
|
let rulesFile = rulesFiles[0];
|
|
642
733
|
if (rulesFiles.length > 1) {
|
|
643
|
-
|
|
734
|
+
combinedTmpFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
|
|
735
|
+
rulesFile = combinedTmpFile;
|
|
644
736
|
const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
|
|
645
737
|
fs.writeFileSync(rulesFile, combined);
|
|
646
738
|
}
|
|
739
|
+
// parseAdblockRules reads the file synchronously and in full before
|
|
740
|
+
// returning, so the temp copy is safe to remove immediately afterwards.
|
|
647
741
|
adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
|
|
648
742
|
}
|
|
649
743
|
} catch (err) {
|
|
650
744
|
console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
|
|
651
745
|
process.exit(1);
|
|
746
|
+
} finally {
|
|
747
|
+
if (combinedTmpFile) {
|
|
748
|
+
try { fs.unlinkSync(combinedTmpFile); } catch { /* best effort — OS reaps tmpdir */ }
|
|
749
|
+
}
|
|
652
750
|
}
|
|
653
751
|
const stats = adblockMatcher.getStats();
|
|
654
752
|
const ruleDesc = stats.total != null
|
|
@@ -691,7 +789,6 @@ Per-config settings file (.nwssconfig):
|
|
|
691
789
|
See README.md for format details.
|
|
692
790
|
|
|
693
791
|
General Options:
|
|
694
|
-
--verbose Force verbose mode globally
|
|
695
792
|
--debug Force debug mode globally
|
|
696
793
|
--silent Suppress normal console logs
|
|
697
794
|
--titles Add ! <url> title before each site's group
|
|
@@ -721,10 +818,16 @@ General Options:
|
|
|
721
818
|
|
|
722
819
|
Validation Options:
|
|
723
820
|
--cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
|
|
724
|
-
--dns
|
|
821
|
+
--dns <ip[,ip,...]> Resolver(s) for the DNS pre-check AND nettools' dig (not Chrome nav / whois).
|
|
822
|
+
One pins all queries to it; several rotate per query. Overrides /etc/resolv.conf.
|
|
823
|
+
--dns-cache Persist dig/whois results to disk between runs (dig 20h / whois 36h TTL, 2000-entry cap each),
|
|
824
|
+
plus the DNS pre-check negative cache (NXDOMAIN only, 12h TTL, .dnsnegcache)
|
|
725
825
|
--no-dns-precheck Disable per-URL DNS resolution check before page navigation.
|
|
726
826
|
By default, URLs whose hostname doesn't resolve are skipped
|
|
727
827
|
immediately (saves ~5-15s of Puppeteer time per dead host).
|
|
828
|
+
--show-dead-domains At end of scan, list hostnames that did not resolve / were
|
|
829
|
+
unreachable (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
|
|
830
|
+
Excludes blocks/timeouts (those mean the domain is alive). For pruning.
|
|
728
831
|
--validate-config Validate config.json file and exit
|
|
729
832
|
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
|
|
730
833
|
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
|
|
@@ -741,7 +844,7 @@ Global config.json options:
|
|
|
741
844
|
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
|
|
742
845
|
ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
|
|
743
846
|
ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
|
|
744
|
-
max_concurrent_sites:
|
|
847
|
+
max_concurrent_sites: 6 Maximum concurrent site processing (1-50, default: 6)
|
|
745
848
|
resource_cleanup_interval: 80 Browser restart interval in URLs processed (1-1000, default: 80)
|
|
746
849
|
disable_ad_tagging: true/false Disable Chrome AdTagging to prevent ad frame throttling (default: true)
|
|
747
850
|
|
|
@@ -752,8 +855,7 @@ Per-site config.json options:
|
|
|
752
855
|
When true, ALL regex patterns must match the same URL
|
|
753
856
|
|
|
754
857
|
Redirect Handling Options:
|
|
755
|
-
|
|
756
|
-
max_redirects: 10 Maximum number of redirects to follow (default: 10)
|
|
858
|
+
max_redirects: 10 Maximum number of redirects to follow (default: 10; 0 = follow none)
|
|
757
859
|
js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
|
|
758
860
|
detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
|
|
759
861
|
redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
|
|
@@ -846,7 +948,7 @@ Advanced Options:
|
|
|
846
948
|
whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
|
|
847
949
|
dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
|
|
848
950
|
dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
|
|
849
|
-
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "
|
|
951
|
+
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "domcontentloaded"})
|
|
850
952
|
dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
|
|
851
953
|
digRecordType: "A" DNS record type for dig (default: A)
|
|
852
954
|
|
|
@@ -1336,6 +1438,7 @@ if (dumpUrls) {
|
|
|
1336
1438
|
// Avoids blocking I/O on every intercepted request in debug/dumpurls mode
|
|
1337
1439
|
const _logBuffers = new Map(); // filePath -> string[]
|
|
1338
1440
|
const LOG_FLUSH_INTERVAL = 2000; // Flush every 2 seconds
|
|
1441
|
+
const LOG_BUFFER_MAX_RETAINED = 10000; // Cap a file's retry backlog (lines) so a permanently unwritable path can't grow memory unboundedly
|
|
1339
1442
|
let _logFlushTimer = null;
|
|
1340
1443
|
|
|
1341
1444
|
function bufferedLogWrite(filePath, entry) {
|
|
@@ -1348,18 +1451,20 @@ function bufferedLogWrite(filePath, entry) {
|
|
|
1348
1451
|
|
|
1349
1452
|
/**
 * Writes every non-empty per-file log buffer to disk and empties it.
 *
 * The append is synchronous on purpose: it holds the event loop for its
 * duration, so one flush can never overlap the next timer tick and interleave
 * appends to the same file. A buffer is emptied only after its write succeeds,
 * so a failed write is retried on the next flush. If the retained backlog for a
 * file grows past LOG_BUFFER_MAX_RETAINED lines it is discarded, so a path that
 * can never be written cannot grow memory without bound.
 *
 * @returns {void}
 */
function flushLogBuffers() {
  _logBuffers.forEach((pending, targetPath) => {
    if (pending.length === 0) return;
    try {
      fs.appendFileSync(targetPath, pending.join(''));
      // Reached only when the append did not throw.
      pending.length = 0;
    } catch (err) {
      console.warn(formatLogMessage('warn', `Failed to flush log buffer to ${targetPath}: ${err.message}`));
      // Keep the lines for a retry unless the backlog is over the cap.
      if (pending.length > LOG_BUFFER_MAX_RETAINED) pending.length = 0;
    }
  });
}
|
|
@@ -1403,21 +1508,29 @@ if (forceDebug && globalComments) {
|
|
|
1403
1508
|
* @param {string} url - The URL string to parse.
|
|
1404
1509
|
* @returns {string} The root domain, or the original hostname if parsing fails (e.g., for IP addresses or invalid URLs), or an empty string on error.
|
|
1405
1510
|
*/
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1511
|
+
// psl.parse memoized by hostname. The request handlers parse the root domain
|
|
1512
|
+
// of EVERY request, and a page hits the same few hosts repeatedly (CDN,
|
|
1513
|
+
// analytics, ad domains) — so a hostname-keyed memo turns almost all of those
|
|
1514
|
+
// into Map hits instead of repeated public-suffix-list lookups. Keyed by
|
|
1515
|
+
// hostname (not full URL) so distinct paths/queries on one host share one
|
|
1516
|
+
// entry: higher hit rate, fewer + shorter keys than a URL-keyed cache.
|
|
1517
|
+
// psl.parse is pure and never throws (malformed input → {domain: null}), so
|
|
1518
|
+
// the catch is defensive only.
|
|
1519
|
+
// hostname -> root domain memo shared by every rootDomainForHost() call.
const _hostRootCache = new Map();

/**
 * Returns the registrable (root) domain for a hostname, memoized per hostname.
 *
 * Falls back to the hostname itself when psl reports no domain or throws.
 * The memo is emptied wholesale once it holds more than 5000 entries.
 *
 * @param {string} hostname - Hostname to resolve (not a full URL).
 * @returns {string} The root domain, the hostname as a fallback, or '' for empty input.
 */
function rootDomainForHost(hostname) {
  if (!hostname) return '';
  // Stored values are never undefined (root domain or the hostname itself),
  // so a presence check is equivalent to comparing the lookup to undefined.
  if (_hostRootCache.has(hostname)) return _hostRootCache.get(hostname);

  let rootDomain = hostname;
  try {
    const { domain } = psl.parse(hostname);
    if (domain) rootDomain = domain;
  } catch {
    // Defensive only: keep the hostname fallback.
  }

  // Crude size bound: drop everything rather than track recency.
  if (_hostRootCache.size > 5000) _hostRootCache.clear();
  _hostRootCache.set(hostname, rootDomain);
  return rootDomain;
}
|
|
1531
|
+
/**
 * Extracts the root domain from a URL string.
 *
 * @param {string} url - The URL string to parse.
 * @returns {string} The root domain as resolved by rootDomainForHost for the
 *   URL's hostname, or an empty string when the URL cannot be parsed.
 */
function getRootDomain(url) {
  try {
    const { hostname } = new URL(url);
    return rootDomainForHost(hostname);
  } catch {
    return '';
  }
}
|
|
1422
1535
|
|
|
1423
1536
|
/**
|
|
@@ -1525,7 +1638,12 @@ function matchesDynamicBlock(domain) {
|
|
|
1525
1638
|
return _domainOrParentInSet(_dynamicallyBlockedDomains, domain);
|
|
1526
1639
|
}
|
|
1527
1640
|
|
|
1528
|
-
|
|
1641
|
+
// `_ignorePatterns` is intentionally unused (underscore-marked): every caller
|
|
1642
|
+
// and the grep/curl/nettools/searchstring callback contract pass the ignore
|
|
1643
|
+
// list as a 2nd arg, but the ignore-state actually lives in the module-level
|
|
1644
|
+
// _dynamicallyIgnoredDomains / _ignoreDomainsExact Sets walked below. Kept in
|
|
1645
|
+
// the signature only to preserve that shared call shape.
|
|
1646
|
+
function matchesIgnoreDomain(domain, _ignorePatterns) {
|
|
1529
1647
|
// Both dynamic and static ignore lists are walked parent-by-parent so a
|
|
1530
1648
|
// subdomain of an ignored root inherits the ignore. Previously the
|
|
1531
1649
|
// dynamic check was exact-only, creating an asymmetry: a static-config
|
|
@@ -1747,7 +1865,19 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1747
1865
|
|
|
1748
1866
|
// Declare userDataDir in outer scope for cleanup access
|
|
1749
1867
|
let userDataDir = null;
|
|
1750
|
-
|
|
1868
|
+
|
|
1869
|
+
// Browser-level decision (the browser launches once per batch, so this can't
|
|
1870
|
+
// be per-site): only disable Chrome's pop-up blocker when at least one site
|
|
1871
|
+
// actually wants popups captured. A real browser blocks non-gesture
|
|
1872
|
+
// window.open(), so non-popup scans keep the blocker on for stealth.
|
|
1873
|
+
// capture_popups scans turn it off so non-gesture popunders (document-level
|
|
1874
|
+
// onclick / timer SDKs) fire and get captured too — gesture-triggered
|
|
1875
|
+
// popups already work via the synthetic-click path regardless of this flag.
|
|
1876
|
+
const wantPopups = Array.isArray(sites) && sites.some(s => s && s.capture_popups === true);
|
|
1877
|
+
if (wantPopups && forceDebug) {
|
|
1878
|
+
console.log(formatLogMessage('debug', `${POPUP_TAG} capture_popups set — launching with --disable-popup-blocking (non-gesture popunders allowed)`));
|
|
1879
|
+
}
|
|
1880
|
+
|
|
1751
1881
|
/**
|
|
1752
1882
|
* Creates a new browser instance with consistent configuration
|
|
1753
1883
|
* Uses system Chrome and temporary directories to minimize disk usage
|
|
@@ -1838,6 +1968,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1838
1968
|
// Puppeteer 22.x headless mode optimization
|
|
1839
1969
|
// Auto-detect best headless mode based on Puppeteer version
|
|
1840
1970
|
headless: headlessMode,
|
|
1971
|
+
// Bypass TLS cert errors at the browser level (drives CDP
|
|
1972
|
+
// Security.setIgnoreCertificateErrors). Robust on new-headless Chrome,
|
|
1973
|
+
// where the --ignore-certificate-errors *flag* is increasingly ignored.
|
|
1974
|
+
// An ad/tracker scanner must reach self-signed / mismatched-cert ad and
|
|
1975
|
+
// embed domains; we observe traffic, we don't transmit secrets.
|
|
1976
|
+
acceptInsecureCerts: true,
|
|
1841
1977
|
args: [
|
|
1842
1978
|
// CRITICAL: Remove automation detection markers
|
|
1843
1979
|
'--disable-blink-features=AutomationControlled',
|
|
@@ -1926,6 +2062,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1926
2062
|
'--memory-pressure-off',
|
|
1927
2063
|
'--max_old_space_size=2048', // V8 heap limit
|
|
1928
2064
|
'--disable-prompt-on-repost', // Fixes form popup on page reload
|
|
2065
|
+
// Disable Chrome's pop-up blocker (chrome://settings/content/popups)
|
|
2066
|
+
// ONLY when a site wants popups captured — lets non-gesture popunders
|
|
2067
|
+
// fire. Gated so non-popup scans keep the blocker on for stealth.
|
|
2068
|
+
...(wantPopups ? ['--disable-popup-blocking'] : []),
|
|
1929
2069
|
...(keepBrowserOpen ? [] : ['--disable-background-networking']),
|
|
1930
2070
|
'--no-sandbox',
|
|
1931
2071
|
'--disable-setuid-sandbox',
|
|
@@ -2116,22 +2256,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2116
2256
|
bypass_cache
|
|
2117
2257
|
} = siteConfig;
|
|
2118
2258
|
|
|
2119
|
-
const allowFirstParty = firstParty === true || firstParty === 1;
|
|
2120
|
-
const allowThirdParty = thirdParty === undefined || thirdParty === true || thirdParty === 1;
|
|
2121
2259
|
const perSiteSubDomains = subDomains === 1 ? true : subDomainsMode;
|
|
2122
|
-
const siteLocalhostIP = localhost || null;
|
|
2123
|
-
const cloudflarePhishBypass = cloudflare_phish === true;
|
|
2124
|
-
const cloudflareBypass = cloudflare_bypass === true;
|
|
2125
2260
|
// Add redirect and same-page loop protection
|
|
2126
|
-
|
|
2261
|
+
// Number check (not ||) so max_redirects: 0 isn't swallowed as falsy → 10.
|
|
2262
|
+
const MAX_REDIRECT_DEPTH = (typeof siteConfig.max_redirects === 'number' && siteConfig.max_redirects >= 0)
|
|
2263
|
+
? siteConfig.max_redirects : 10;
|
|
2127
2264
|
const redirectHistory = new Set();
|
|
2128
2265
|
let redirectCount = 0;
|
|
2129
2266
|
const pageLoadHistory = new Map(); // Track same-page reloads
|
|
2130
2267
|
const MAX_SAME_PAGE_LOADS = 3;
|
|
2131
2268
|
let currentPageUrl = currentUrl;
|
|
2132
2269
|
|
|
2133
|
-
const sitePrivoxy = privoxy === true;
|
|
2134
|
-
const sitePihole = pihole === true;
|
|
2135
2270
|
const flowproxyDetection = flowproxy_detection === true;
|
|
2136
2271
|
|
|
2137
2272
|
const evenBlocked = even_blocked === true;
|
|
@@ -2298,6 +2433,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2298
2433
|
|
|
2299
2434
|
// Track page for realtime cleanup
|
|
2300
2435
|
trackPageForRealtime(page);
|
|
2436
|
+
// Register with the idle-hang watchdog (force-closed if it goes
|
|
2437
|
+
// unresponsive while the whole scan has stalled).
|
|
2438
|
+
_inFlightPages.set(page, { url: currentUrl, unresponsiveStrikes: 0 });
|
|
2301
2439
|
|
|
2302
2440
|
// Mark page as actively processing
|
|
2303
2441
|
updatePageUsage(page, true);
|
|
@@ -2822,12 +2960,27 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2822
2960
|
|
|
2823
2961
|
const regexes = getCompiledRegexes(siteConfig.filterRegex);
|
|
2824
2962
|
|
|
2963
|
+
// output_regex (optional per-site): extract the rule body from each matched
|
|
2964
|
+
// URL via capture group 1 (or the whole match), so output becomes
|
|
2965
|
+
// ||<capture> (e.g. ||host/script/) instead of ||host^ — lets a stable
|
|
2966
|
+
// folder/file be blocked on a host that also serves legit content. Compiled
|
|
2967
|
+
// silently here; config-load validation (validate_rules) warns on a bad
|
|
2968
|
+
// pattern, so a throw here just disables the feature for this site.
|
|
2969
|
+
// Reuse the memoized regex compiler (same cache as filterRegex) so the
|
|
2970
|
+
// pattern compiles once per unique source, not once per URL. try/catch
|
|
2971
|
+
// because getCompiledRegexes throws on a bad pattern — config-load
|
|
2972
|
+
// validation already warned; a throw here just disables the feature.
|
|
2973
|
+
let outputRegex = null;
|
|
2974
|
+
if (siteConfig.output_regex) {
|
|
2975
|
+
try { outputRegex = getCompiledRegexes(siteConfig.output_regex)[0] || null; } catch (_) { outputRegex = null; }
|
|
2976
|
+
}
|
|
2977
|
+
|
|
2825
2978
|
// NEW: Get regex_and setting (defaults to false for backward compatibility)
|
|
2826
2979
|
const useRegexAnd = siteConfig.regex_and === true;
|
|
2827
2980
|
|
|
2828
2981
|
// Parse searchstring patterns using module
|
|
2829
2982
|
const { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd } = parseSearchStrings(siteConfig.searchstring, siteConfig.searchstring_and);
|
|
2830
|
-
|
|
2983
|
+
let useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring (reassigned to false below if curl is unavailable)
|
|
2831
2984
|
let useGrep = siteConfig.grep === true; // Grep can work independently
|
|
2832
2985
|
|
|
2833
2986
|
// Get user agent for curl if needed
|
|
@@ -3009,9 +3162,30 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3009
3162
|
* @param {string} fullSubdomain - Full subdomain for cache tracking
|
|
3010
3163
|
* @param {string} resourceType - Resource type (for --adblock-rules mode)
|
|
3011
3164
|
*/
|
|
3012
|
-
function addMatchedDomain(domain, resourceType = null, fullSubdomain = null) {
|
|
3165
|
+
function addMatchedDomain(domain, resourceType = null, fullSubdomain = null, matchedUrl = null) {
|
|
3013
3166
|
// Use fullSubdomain for cache tracking if provided, otherwise fall back to domain
|
|
3014
3167
|
const cacheKey = fullSubdomain || domain;
|
|
3168
|
+
// output_regex: derive the rule body from the matched URL. Capture group 1
|
|
3169
|
+
// (or the whole match) becomes the stored key, e.g. "host/script/", which
|
|
3170
|
+
// formatDomain emits as ||host/script/ for adblock and falls back to the
|
|
3171
|
+
// bare host for domain-only formats. All similarity / dedup / smart-cache
|
|
3172
|
+
// logic below still runs on the bare host (domain); only the final stored
|
|
3173
|
+
// key changes. The capture must have a '.' before its first '/' (i.e. host+path),
|
|
3174
|
+
// otherwise we keep the host so a mis-written regex can't emit garbage.
|
|
3175
|
+
let outputKey = domain;
|
|
3176
|
+
if (outputRegex && matchedUrl) {
|
|
3177
|
+
const m = matchedUrl.match(outputRegex);
|
|
3178
|
+
if (m) {
|
|
3179
|
+
const cap = (m[1] != null ? m[1] : m[0]);
|
|
3180
|
+
// Accept only a host+path shape: a '/' with a real host before it
|
|
3181
|
+
// (segment before the first '/' must contain a '.'). Rejects a
|
|
3182
|
+
// capture that accidentally includes the scheme (host part would be
|
|
3183
|
+
// "https:") or a path-only capture with no host — both fall back to
|
|
3184
|
+
// the bare-host ||host^ rule rather than emit garbage.
|
|
3185
|
+
const sl = cap ? cap.indexOf('/') : -1;
|
|
3186
|
+
if (sl > 0 && cap.slice(0, sl).includes('.')) outputKey = cap;
|
|
3187
|
+
}
|
|
3188
|
+
}
|
|
3015
3189
|
// Check if we should ignore similar domains
|
|
3016
3190
|
const ignoreSimilarEnabled = siteConfig.ignore_similar !== undefined ? siteConfig.ignore_similar : ignore_similar;
|
|
3017
3191
|
const similarityThreshold = siteConfig.ignore_similar_threshold || ignore_similar_threshold;
|
|
@@ -3113,15 +3287,15 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3113
3287
|
}
|
|
3114
3288
|
|
|
3115
3289
|
if (matchedDomains instanceof Map) {
|
|
3116
|
-
if (!matchedDomains.has(
|
|
3117
|
-
matchedDomains.set(
|
|
3290
|
+
if (!matchedDomains.has(outputKey)) {
|
|
3291
|
+
matchedDomains.set(outputKey, new Set());
|
|
3118
3292
|
}
|
|
3119
3293
|
// Only add the specific resourceType that was matched, not all types for this domain
|
|
3120
3294
|
if (resourceType) {
|
|
3121
|
-
matchedDomains.get(
|
|
3295
|
+
matchedDomains.get(outputKey).add(resourceType);
|
|
3122
3296
|
}
|
|
3123
3297
|
} else {
|
|
3124
|
-
matchedDomains.add(
|
|
3298
|
+
matchedDomains.add(outputKey);
|
|
3125
3299
|
}
|
|
3126
3300
|
}
|
|
3127
3301
|
|
|
@@ -3160,12 +3334,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3160
3334
|
// fall back to the default rather than silently disabling capture.
|
|
3161
3335
|
const POPUP_MAX_DEPTH = (() => {
|
|
3162
3336
|
const v = parseInt(siteConfig.capture_popups_max_depth, 10);
|
|
3163
|
-
return Number.isFinite(v) && v > 0 ? v :
|
|
3337
|
+
return Number.isFinite(v) && v > 0 ? v : 4;
|
|
3164
3338
|
})();
|
|
3165
3339
|
const POPUP_CAPTURE_WINDOW_MS = (() => {
|
|
3166
3340
|
const v = parseInt(siteConfig.capture_popups_window_ms, 10);
|
|
3167
3341
|
return Number.isFinite(v) && v > 0 ? v : 5000;
|
|
3168
3342
|
})();
|
|
3343
|
+
// interact_popups: click inside captured popups so they cascade to their
|
|
3344
|
+
// next ad/redirect (requires capture_popups — no popups exist otherwise).
|
|
3345
|
+
// Light pass; the request listener catches whatever the clicks surface.
|
|
3346
|
+
const interactPopups = capturePopups && siteConfig.interact_popups === true;
|
|
3347
|
+
const POPUP_INTERACT_CLICKS = 3; // enough to fire popunder/redirect SDKs (incl. SDKs that suppress the 1st/2nd click as warmup) without runaway cascades
|
|
3169
3348
|
|
|
3170
3349
|
if (capturePopups && forceDebug) {
|
|
3171
3350
|
// One-time setup-time warning if the click prerequisite isn't met.
|
|
@@ -3231,8 +3410,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3231
3410
|
try {
|
|
3232
3411
|
const parsedUrl = new URL(checkedUrl);
|
|
3233
3412
|
fullSubdomain = parsedUrl.hostname;
|
|
3234
|
-
|
|
3235
|
-
checkedRootDomain = pslResult.domain || fullSubdomain;
|
|
3413
|
+
checkedRootDomain = rootDomainForHost(fullSubdomain);
|
|
3236
3414
|
} catch (_) { return; }
|
|
3237
3415
|
if (!checkedRootDomain) return;
|
|
3238
3416
|
|
|
@@ -3331,7 +3509,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3331
3509
|
trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
|
|
3332
3510
|
} else {
|
|
3333
3511
|
// No nettools required — regex match alone counts.
|
|
3334
|
-
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
|
|
3512
|
+
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain, checkedUrl);
|
|
3335
3513
|
}
|
|
3336
3514
|
} catch (_) { /* observation-only — never let a popup error escape */ }
|
|
3337
3515
|
};
|
|
@@ -3453,6 +3631,24 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3453
3631
|
|
|
3454
3632
|
attachPopupRequestCapture(popupPage, depth);
|
|
3455
3633
|
|
|
3634
|
+
// interact_popups: click inside the popup so it can cascade to its next
|
|
3635
|
+
// ad/redirect — popunder/redirect SDKs fire on a document-level click,
|
|
3636
|
+
// and a captured-but-unclicked popup only ever shows its landing URL.
|
|
3637
|
+
// Light pass (POPUP_INTERACT_CLICKS random content-zone clicks), only
|
|
3638
|
+
// on popups shallower than max depth so a clicked popup's spawned child
|
|
3639
|
+
// (depth+1) is still within the capture depth. Fire-and-forget: it must
|
|
3640
|
+
// not block onTargetCreated, and the popup may close/navigate mid-click
|
|
3641
|
+
// (performContentClicks no-ops on a closed page). The request listener
|
|
3642
|
+
// above captures whatever the clicks surface; the close timer bounds it.
|
|
3643
|
+
if (interactPopups && depth < POPUP_MAX_DEPTH && !popupPage.isClosed()) {
|
|
3644
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] interact_popups: ${POPUP_INTERACT_CLICKS} content click(s)`));
|
|
3645
|
+
performContentClicks(popupPage, {
|
|
3646
|
+
clicks: POPUP_INTERACT_CLICKS,
|
|
3647
|
+
forceDebug,
|
|
3648
|
+
realistic: siteConfig.realistic_click === true,
|
|
3649
|
+
}).catch(() => {}); // popup is transient — non-fatal
|
|
3650
|
+
}
|
|
3651
|
+
|
|
3456
3652
|
// Auto-close after the capture window so popups don't pile up.
|
|
3457
3653
|
const closeTimer = setTimeout(() => {
|
|
3458
3654
|
try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
|
|
@@ -3489,30 +3685,24 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3489
3685
|
try {
|
|
3490
3686
|
const parsedUrl = new URL(checkedUrl);
|
|
3491
3687
|
fullSubdomain = parsedUrl.hostname;
|
|
3492
|
-
|
|
3493
|
-
checkedRootDomain = pslResult.domain || fullSubdomain;
|
|
3688
|
+
checkedRootDomain = rootDomainForHost(fullSubdomain);
|
|
3494
3689
|
} catch (e) {}
|
|
3495
3690
|
|
|
3691
|
+
// Never BLOCK the top-level document (the scanned page OR a main-frame
|
|
3692
|
+
// redirect target). Aborting it makes the navigation never commit (page
|
|
3693
|
+
// stays at about:blank → navigation timeout), silently breaking any
|
|
3694
|
+
// scanned URL that matches our own filter lists (adblock / blocked /
|
|
3695
|
+
// blockDomainsByUrl) — common on adult/pirate/stream domains. This flag
|
|
3696
|
+
// ONLY guards the abort paths below; the request still flows through the
|
|
3697
|
+
// match logic, so a main-frame redirect destination (e.g. a
|
|
3698
|
+
// filecrypt → ad-domain hop) is still captured via filterRegex/dig/whois.
|
|
3699
|
+
// isNavigationRequest is true for sub-frame docs too, so the mainFrame()
|
|
3700
|
+
// check keeps ad iframes blockable.
|
|
3701
|
+
let isMainFrameDoc = false;
|
|
3702
|
+
try { isMainFrameDoc = request.isNavigationRequest() && request.frame() === page.mainFrame(); } catch (_) {}
|
|
3703
|
+
|
|
3496
3704
|
// Check against ALL first-party domains (original + all redirects)
|
|
3497
3705
|
const isFirstParty = checkedRootDomain && firstPartyDomains.has(checkedRootDomain);
|
|
3498
|
-
|
|
3499
|
-
// Block infinite iframe loops - safely access frame URL
|
|
3500
|
-
const frameUrl = (() => {
|
|
3501
|
-
try {
|
|
3502
|
-
const frame = request.frame();
|
|
3503
|
-
return frame ? frame.url() : '';
|
|
3504
|
-
} catch (err) {
|
|
3505
|
-
return '';
|
|
3506
|
-
}
|
|
3507
|
-
})();
|
|
3508
|
-
if (frameUrl && frameUrl.includes('creative.dmzjmp.com') &&
|
|
3509
|
-
checkedUrl.includes('go.dmzjmp.com/api/models')) {
|
|
3510
|
-
if (forceDebug) {
|
|
3511
|
-
console.log(formatLogMessage('debug', `Blocking potential infinite iframe loop: ${checkedUrl}`));
|
|
3512
|
-
}
|
|
3513
|
-
request.abort();
|
|
3514
|
-
return;
|
|
3515
|
-
}
|
|
3516
3706
|
|
|
3517
3707
|
// Enhanced debug logging to show which frame the request came from
|
|
3518
3708
|
if (forceDebug) {
|
|
@@ -3542,7 +3732,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3542
3732
|
request.resourceType()
|
|
3543
3733
|
);
|
|
3544
3734
|
|
|
3545
|
-
if (result.blocked) {
|
|
3735
|
+
if (result.blocked && !isMainFrameDoc) {
|
|
3546
3736
|
adblockStats.blocked++;
|
|
3547
3737
|
if (forceDebug) {
|
|
3548
3738
|
console.log(formatLogMessage('debug', `${messageColors.blocked('[adblock]')} ${checkedUrl} (${result.reason})`));
|
|
@@ -3550,6 +3740,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3550
3740
|
request.abort('blockedbyclient');
|
|
3551
3741
|
return;
|
|
3552
3742
|
}
|
|
3743
|
+
if (result.blocked && isMainFrameDoc && forceDebug) {
|
|
3744
|
+
// Matched a filter rule but it's the page we're scanning (or a
|
|
3745
|
+
// main-frame redirect target) — allow it (blocking the top-level
|
|
3746
|
+
// document aborts navigation). It still flows through the matcher.
|
|
3747
|
+
console.log(formatLogMessage('debug', `${messageColors.highlight('[adblock]')} top-level document ${checkedUrl} matched (${result.reason}) — allowed (never block the scanned page)`));
|
|
3748
|
+
}
|
|
3553
3749
|
adblockStats.allowed++;
|
|
3554
3750
|
} catch (err) { /* Silently continue on adblock errors */ }
|
|
3555
3751
|
}
|
|
@@ -3603,7 +3799,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3603
3799
|
// check so domain-based blocks short-circuit without paying the
|
|
3604
3800
|
// per-URL regex scan. Same abort reason as the static path so
|
|
3605
3801
|
// request.failure() observers see consistent metadata.
|
|
3606
|
-
if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain)) {
|
|
3802
|
+
if (reqDomain && _dynamicallyBlockedDomains.size > 0 && matchesDynamicBlock(reqDomain) && !isMainFrameDoc) {
|
|
3607
3803
|
if (forceDebug) {
|
|
3608
3804
|
console.log(formatLogMessage('debug', `${BLOCK_DOMAINS_BY_URL_TAG} aborting ${reqUrl} (domain ${reqDomain} dynamically blocked)`));
|
|
3609
3805
|
}
|
|
@@ -3618,7 +3814,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3618
3814
|
break;
|
|
3619
3815
|
}
|
|
3620
3816
|
}
|
|
3621
|
-
if (blockedMatchIndex !== -1) {
|
|
3817
|
+
if (blockedMatchIndex !== -1 && !isMainFrameDoc) {
|
|
3622
3818
|
// Always track the hit (zero-cost on the un-debug path) so the
|
|
3623
3819
|
// scan-end summary can show which patterns are doing work vs.
|
|
3624
3820
|
// which are stale and ready to prune. Keyed by pattern.source --
|
|
@@ -3658,7 +3854,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3658
3854
|
wasBlocked: true
|
|
3659
3855
|
});
|
|
3660
3856
|
} else {
|
|
3661
|
-
addMatchedDomain(reqDomain, resourceType, fullSubdomain);
|
|
3857
|
+
addMatchedDomain(reqDomain, resourceType, fullSubdomain, reqUrl);
|
|
3662
3858
|
}
|
|
3663
3859
|
matchedRegexPatterns.add(evenBlockedRegexPattern);
|
|
3664
3860
|
|
|
@@ -3836,7 +4032,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3836
4032
|
isFirstParty: isFirstParty
|
|
3837
4033
|
});
|
|
3838
4034
|
} else {
|
|
3839
|
-
|
|
4035
|
+
// Pass null for fullSubdomain (not the in-scope hostname) to keep
|
|
4036
|
+
// this path's dedup key as the root domain exactly as before —
|
|
4037
|
+
// only matchedUrl is new here, for output_regex.
|
|
4038
|
+
addMatchedDomain(reqDomain, resourceType, null, reqUrl);
|
|
3840
4039
|
}
|
|
3841
4040
|
if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
|
|
3842
4041
|
if (siteConfig.verbose === 1) {
|
|
@@ -4197,15 +4396,43 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4197
4396
|
try {
|
|
4198
4397
|
navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage);
|
|
4199
4398
|
} catch (navErr) {
|
|
4200
|
-
// Only
|
|
4399
|
+
// Only handle genuine timeouts here, not chrome-error:// redirects.
|
|
4400
|
+
// pageUrl === 'about:blank' means the navigation never committed
|
|
4401
|
+
// (server never responded) — treat as a real failure, not a partial
|
|
4402
|
+
// page; only a page that actually reached a URL is worth observing.
|
|
4201
4403
|
let pageUrl = '';
|
|
4202
4404
|
try { if (!page.isClosed()) pageUrl = page.url(); } catch {}
|
|
4203
4405
|
const isPopupFailure = navErr.message.includes('chrome-error://') || navErr.message.includes('invalid URL') ||
|
|
4204
4406
|
pageUrl.startsWith('chrome-error://') || pageUrl === 'about:blank';
|
|
4205
4407
|
if ((navErr.message.includes('timeout') || navErr.message.includes('Timeout')) && !isPopupFailure) {
|
|
4206
|
-
|
|
4207
|
-
|
|
4208
|
-
|
|
4408
|
+
// The OLD fallback retried with networkidle2 — STRICTER than the
|
|
4409
|
+
// domcontentloaded default, so it could never rescue a
|
|
4410
|
+
// domcontentloaded timeout (and Puppeteer 25 has no 'commit', i.e.
|
|
4411
|
+
// nothing more lenient). Two-tier recovery instead:
|
|
4412
|
+
// 1. If the site used a wait STRICTER than domcontentloaded, do one
|
|
4413
|
+
// lenient retry with domcontentloaded (it fires earlier).
|
|
4414
|
+
// 2. Otherwise proceed with the partially-loaded page rather than
|
|
4415
|
+
// discarding the URL — it exists and requests already fired
|
|
4416
|
+
// (captured by page.on('request')); the delay/interact phase
|
|
4417
|
+
// below keeps capturing. Streaming/embed/media pages routinely
|
|
4418
|
+
// never reach DOM-ready (a connection stays open) but their
|
|
4419
|
+
// ad/tracker calls fired early.
|
|
4420
|
+
const primaryWait = gotoOptions.waitUntil || defaultWaitUntil;
|
|
4421
|
+
let recovered = false;
|
|
4422
|
+
if (primaryWait !== 'domcontentloaded') {
|
|
4423
|
+
try {
|
|
4424
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout (${primaryWait}), retrying with waitUntil:domcontentloaded for ${currentUrl}`));
|
|
4425
|
+
const fallbackOptions = { ...gotoOptions, waitUntil: 'domcontentloaded', timeout: Math.min(timeout, 15000) };
|
|
4426
|
+
navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, fallbackOptions, forceDebug, formatLogMessage);
|
|
4427
|
+
recovered = true;
|
|
4428
|
+
} catch (_) { /* fall through to proceed-with-partial */ }
|
|
4429
|
+
}
|
|
4430
|
+
if (!recovered) {
|
|
4431
|
+
let partialUrl = currentUrl;
|
|
4432
|
+
try { if (!page.isClosed()) partialUrl = page.url() || currentUrl; } catch {}
|
|
4433
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Navigation timeout — proceeding with partially-loaded page for ${currentUrl}`));
|
|
4434
|
+
navigationResult = { finalUrl: partialUrl, redirected: false, redirectChain: [currentUrl], originalUrl: currentUrl, redirectDomains: [], httpStatus: null, cfRay: null };
|
|
4435
|
+
}
|
|
4209
4436
|
} else {
|
|
4210
4437
|
throw navErr;
|
|
4211
4438
|
}
|
|
@@ -4475,12 +4702,50 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4475
4702
|
}
|
|
4476
4703
|
}
|
|
4477
4704
|
console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
|
|
4705
|
+
// Capture hard "dead domain" navigation errors for --show-dead-domains
|
|
4706
|
+
// (DNS doesn't resolve / host unreachable). Blocks, timeouts and CF
|
|
4707
|
+
// challenges are NOT dead — they're excluded by this match.
|
|
4708
|
+
// Only DEFINITIVE non-existence / unreachable signals — these now drive
|
|
4709
|
+
// the in-scan dead-domain SKIP (not just --show-dead-domains reporting),
|
|
4710
|
+
// so transient DNS errors must NOT match. The bare `ERR_DNS` used to
|
|
4711
|
+
// catch ERR_DNS_TIMED_OUT / ERR_DNS_MALFORMED_RESPONSE / ERR_DNS_SERVER_FAILED
|
|
4712
|
+
// (all transient) — dropped so a slow-DNS blip can't false-skip the
|
|
4713
|
+
// rest of a live host's URLs.
|
|
4714
|
+
const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE/.exec(err.message || '');
|
|
4715
|
+
if (deadNav) {
|
|
4716
|
+
recordDeadDomain(currentUrl, deadNav[0]);
|
|
4717
|
+
// Corroborate-then-persist to the negative cache (.dnsnegcache with
|
|
4718
|
+
// --dns-cache → cross-scan skip; else in-memory). Chrome resolves via
|
|
4719
|
+
// the possibly-flaky SYSTEM resolver, so its ERR_NAME_NOT_RESOLVED may
|
|
4720
|
+
// be a glitch on a LIVE host. Re-confirm via the reliable --dns
|
|
4721
|
+
// resolver and cache ONLY if it ALSO returns a definitive NXDOMAIN.
|
|
4722
|
+
// ERR_ADDRESS_UNREACHABLE is routing (the host resolves), so the
|
|
4723
|
+
// check below excludes it and it's never cached. Fire-and-forget:
|
|
4724
|
+
// off the critical path; saveDiskCache flushes on exit.
|
|
4725
|
+
if (dnsPrecheckEnabled && deadNav[0] === 'ERR_NAME_NOT_RESOLVED') {
|
|
4726
|
+
let navHost = '';
|
|
4727
|
+
try { navHost = new URL(currentUrl).hostname; } catch {}
|
|
4728
|
+
if (navHost && !/^[\d.:]+$|^\[/.test(navHost) && !dnsNegativeCache.has(navHost)) {
|
|
4729
|
+
dnsResolver.resolveHost(navHost, dnsPrecheckTimeoutMs).then(
|
|
4730
|
+
() => { /* reliable resolver resolves it — system-resolver glitch, do NOT cache */ },
|
|
4731
|
+
(e) => {
|
|
4732
|
+
const code = (e && (e.code || e.message)) || '';
|
|
4733
|
+
if (isNonExistenceError(code)) {
|
|
4734
|
+
dnsNegativeCacheSet(navHost, code);
|
|
4735
|
+
recordDeadDomain(navHost, code);
|
|
4736
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Dead domain confirmed by --dns resolver (${code}) — caching ${navHost} (skips next run with --dns-cache)`));
|
|
4737
|
+
}
|
|
4738
|
+
}
|
|
4739
|
+
).catch(() => {});
|
|
4740
|
+
}
|
|
4741
|
+
}
|
|
4742
|
+
}
|
|
4478
4743
|
throw err;
|
|
4479
4744
|
}
|
|
4480
4745
|
}
|
|
4481
4746
|
}
|
|
4482
4747
|
|
|
4483
|
-
const delayMs = siteConfig.delay || DEFAULT_DELAY;
|
|
4748
|
+
const delayMs = siteConfig.delay || TIMEOUTS.DEFAULT_DELAY;
|
|
4484
4749
|
|
|
4485
4750
|
// Optimized delays for Puppeteer 23.x performance
|
|
4486
4751
|
const isFastSite = timeout <= TIMEOUTS.FAST_SITE_THRESHOLD;
|
|
@@ -4560,8 +4825,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4560
4825
|
const ghostStart = Date.now();
|
|
4561
4826
|
const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
|
|
4562
4827
|
|
|
4563
|
-
//
|
|
4564
|
-
|
|
4828
|
+
// Honor interact_click_count in ghost mode too (built-in default
|
|
4829
|
+
// is 3 — ad SDKs often swallow the 1st/2nd click as warmup). Same
|
|
4830
|
+
// default + 20-cap as the built-in content-click path. 0 when
|
|
4831
|
+
// element clicks are disabled.
|
|
4832
|
+
const ghostClickCount = interactionConfig.includeElementClicks
|
|
4833
|
+
? Math.min(Math.max(Number(siteConfig.interact_click_count) || 3, 1), 20)
|
|
4834
|
+
: 0;
|
|
4835
|
+
// Reserve part of the duration budget for those clicks so the
|
|
4836
|
+
// movement loop doesn't consume all of ghost_cursor_duration.
|
|
4837
|
+
// Capped at half the budget so movement still happens; raise
|
|
4838
|
+
// ghost_cursor_duration to fit more clicks.
|
|
4839
|
+
const clickReserveMs = Math.min(ghostClickCount * 600, ghostDuration * 0.5);
|
|
4840
|
+
|
|
4841
|
+
// Time-based Bezier mouse movements — runs for the unreserved budget
|
|
4842
|
+
while (ghostTimeLeft() > 200 + clickReserveMs) {
|
|
4565
4843
|
const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
|
|
4566
4844
|
const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
|
|
4567
4845
|
await ghostMove(cursor, toX, toY, {
|
|
@@ -4569,18 +4847,23 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4569
4847
|
overshootThreshold: ghostConfig.overshootThreshold,
|
|
4570
4848
|
forceDebug
|
|
4571
4849
|
});
|
|
4572
|
-
if (ghostTimeLeft() > 100) {
|
|
4850
|
+
if (ghostTimeLeft() > 100 + clickReserveMs) {
|
|
4573
4851
|
await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
|
|
4574
4852
|
}
|
|
4575
4853
|
}
|
|
4576
4854
|
if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
|
|
4577
4855
|
await ghostRandomMove(cursor, { forceDebug });
|
|
4578
4856
|
}
|
|
4579
|
-
|
|
4857
|
+
// interact_click_count clicks, each to a fresh content-zone point.
|
|
4858
|
+
// The time guard stops early if the budget runs out (raise
|
|
4859
|
+
// ghost_cursor_duration for more).
|
|
4860
|
+
for (let gc = 0; gc < ghostClickCount && ghostTimeLeft() > 100; gc++) {
|
|
4580
4861
|
const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
|
|
4581
4862
|
const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
|
|
4582
4863
|
await ghostClick(cursor, { x: clickX, y: clickY }, {
|
|
4583
4864
|
hesitate: ghostConfig.hesitate,
|
|
4865
|
+
page,
|
|
4866
|
+
realistic: siteConfig.realistic_click === true,
|
|
4584
4867
|
forceDebug
|
|
4585
4868
|
});
|
|
4586
4869
|
}
|
|
@@ -4895,7 +5178,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4895
5178
|
// Only add delay if we're continuing with more reloads
|
|
4896
5179
|
if (i < totalReloads) {
|
|
4897
5180
|
// Reduce delay for problematic sites
|
|
4898
|
-
const adjustedDelay = i > 1 ? Math.min(DEFAULT_DELAY, 2000) : DEFAULT_DELAY;
|
|
5181
|
+
const adjustedDelay = i > 1 ? Math.min(TIMEOUTS.DEFAULT_DELAY, 2000) : TIMEOUTS.DEFAULT_DELAY;
|
|
4899
5182
|
await fastTimeout(adjustedDelay);
|
|
4900
5183
|
}
|
|
4901
5184
|
}
|
|
@@ -5088,7 +5371,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5088
5371
|
const safeUrl = currentUrl.replace(/https?:\/\//, '').replace(/[^a-zA-Z0-9]/g, '_').substring(0, 80);
|
|
5089
5372
|
const filename = `screenshots/${safeUrl}-${timestamp}.png`;
|
|
5090
5373
|
try {
|
|
5091
|
-
|
|
5374
|
+
fs.mkdirSync('screenshots', { recursive: true }); // recursive:true is a no-op if it already exists
|
|
5092
5375
|
await page.screenshot({ path: filename, type: 'png', fullPage: true });
|
|
5093
5376
|
console.log(formatLogMessage('info', `Screenshot saved: ${filename}`));
|
|
5094
5377
|
} catch (screenshotErr) {
|
|
@@ -5099,6 +5382,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5099
5382
|
if (!keepBrowserOpen) {
|
|
5100
5383
|
try {
|
|
5101
5384
|
untrackPage(page);
|
|
5385
|
+
_inFlightPages.delete(page);
|
|
5102
5386
|
await page.close();
|
|
5103
5387
|
if (forceDebug) console.log(formatLogMessage('debug', `Page closed for ${currentUrl}`));
|
|
5104
5388
|
} catch (pageCloseErr) {
|
|
@@ -5199,6 +5483,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5199
5483
|
let lastProcessedCount = 0;
|
|
5200
5484
|
let hangCheckCount = 0;
|
|
5201
5485
|
let forceRestartFlag = false; // Flag to trigger restart on next iteration
|
|
5486
|
+
// Largest per-URL timeout budget seen across tasks. The hang-check restart
|
|
5487
|
+
// scales to this so it can't false-fire on a legitimately-slow config (high
|
|
5488
|
+
// delay × reload × interact) whose per-URL budget exceeds a flat threshold —
|
|
5489
|
+
// the emergency restart should only fire once the per-URL timeout ITSELF has
|
|
5490
|
+
// had its chance and failed (a true browser hang).
|
|
5491
|
+
let maxPerUrlTimeoutMs = 0;
|
|
5202
5492
|
|
|
5203
5493
|
// Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
|
|
5204
5494
|
// only colors the [severity] tag; the '[HANG CHECK]' substring was
|
|
@@ -5206,6 +5496,48 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5206
5496
|
// entry so the interval callback doesn't re-colorize per tick.
|
|
5207
5497
|
const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
|
|
5208
5498
|
|
|
5499
|
+
// Idle-hang watchdog. Runs only while the scan is stalled (no URL completing).
|
|
5500
|
+
// The probe distinguishes a HUNG renderer from one that's merely NAVIGATING,
|
|
5501
|
+
// which is the key to probing aggressively without false-kills:
|
|
5502
|
+
// - evaluate resolves -> 'alive' -> reset strikes
|
|
5503
|
+
// - evaluate rejects fast (e.g. "Execution context destroyed" mid goto/
|
|
5504
|
+
// reload) -> 'navigating' -> inconclusive: neither
|
|
5505
|
+
// strike nor reset, so a
|
|
5506
|
+
// navigation can NEVER trip
|
|
5507
|
+
// the kill regardless of cadence
|
|
5508
|
+
// - no response within the cap -> 'hung' -> strike
|
|
5509
|
+
// PAGE_HANG_STRIKES_TO_KILL consecutive HUNG probes force-close the page, so the
|
|
5510
|
+
// stuck task's awaits reject and its batch completes instead of waiting out the
|
|
5511
|
+
// full per-URL ceiling. Parallel, guarded against overlap; zero overhead off a stall.
|
|
5512
|
+
let _hangProbeInProgress = false;
|
|
5513
|
+
const probeInFlightPagesForHang = async () => {
|
|
5514
|
+
if (_hangProbeInProgress || _inFlightPages.size === 0) return;
|
|
5515
|
+
_hangProbeInProgress = true;
|
|
5516
|
+
try {
|
|
5517
|
+
await Promise.all([..._inFlightPages.entries()].map(async ([page, info]) => {
|
|
5518
|
+
if (page.isClosed()) { _inFlightPages.delete(page); return; }
|
|
5519
|
+
let verdict;
|
|
5520
|
+
try {
|
|
5521
|
+
verdict = await Promise.race([
|
|
5522
|
+
page.evaluate(() => true).then(() => 'alive', () => 'navigating'),
|
|
5523
|
+
new Promise(r => setTimeout(() => r('hung'), PAGE_HANG_PROBE_TIMEOUT_MS)),
|
|
5524
|
+
]);
|
|
5525
|
+
} catch { verdict = 'hung'; }
|
|
5526
|
+
if (verdict === 'alive') { info.unresponsiveStrikes = 0; return; }
|
|
5527
|
+
if (verdict === 'navigating') return; // context destroyed mid-nav — not a hang; don't strike or reset
|
|
5528
|
+
// verdict === 'hung' — renderer gave no response within the cap
|
|
5529
|
+
info.unresponsiveStrikes++;
|
|
5530
|
+
if (info.unresponsiveStrikes >= PAGE_HANG_STRIKES_TO_KILL) {
|
|
5531
|
+
console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} Force-closing hung page after ${info.unresponsiveStrikes} unresponsive probes: ${info.url}`));
|
|
5532
|
+
_inFlightPages.delete(page);
|
|
5533
|
+
page.close().catch(() => {}); // stuck task's awaits reject -> task errors -> batch completes
|
|
5534
|
+
}
|
|
5535
|
+
}));
|
|
5536
|
+
} finally {
|
|
5537
|
+
_hangProbeInProgress = false;
|
|
5538
|
+
}
|
|
5539
|
+
};
|
|
5540
|
+
|
|
5209
5541
|
const hangDetectionInterval = setInterval(() => {
|
|
5210
5542
|
// Progress check, counter, and forceRestartFlag MUST run regardless of
|
|
5211
5543
|
// debug mode — previously the entire body was gated on forceDebug, which
|
|
@@ -5218,8 +5550,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5218
5550
|
if (forceDebug) {
|
|
5219
5551
|
console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
|
|
5220
5552
|
}
|
|
5221
|
-
|
|
5222
|
-
|
|
5553
|
+
// The faster 15s probe interval below does surgical per-page recovery; this
|
|
5554
|
+
// 30s interval owns only the slower nuclear-restart escalation. Deadline-
|
|
5555
|
+
// aware: the restart only fires once the stall has OUTLASTED the heaviest
|
|
5556
|
+
// in-flight per-URL budget (+ grace) — i.e. the per-URL timeout itself had
|
|
5557
|
+
// its chance and failed, a true hang. A flat threshold (the old 2.5min)
|
|
5558
|
+
// false-fires on legitimately-slow configs (high delay × reload × interact)
|
|
5559
|
+
// whose per-URL budget exceeds it, restarting the browser mid-work. Floor
|
|
5560
|
+
// at 150s so light configs behave exactly as before.
|
|
5561
|
+
// +45s buffer covers the per-URL 8s orphan grace + the 30s tick granularity + slack.
|
|
5562
|
+
const restartAfterMs = Math.max(150000, maxPerUrlTimeoutMs + 45000);
|
|
5563
|
+
if (hangCheckCount * 30000 >= restartAfterMs) {
|
|
5564
|
+
console.log(formatLogMessage('error', `${HANG_CHECK_TAG} No progress for ${Math.round(hangCheckCount * 30)}s (past the ${Math.round(restartAfterMs / 1000)}s per-URL budget). Triggering emergency browser restart.`));
|
|
5223
5565
|
forceRestartFlag = true; // Set flag instead of exiting
|
|
5224
5566
|
hangCheckCount = 0; // Reset counter for next cycle
|
|
5225
5567
|
}
|
|
@@ -5241,6 +5583,22 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5241
5583
|
// cleanup, this is belt-and-suspenders in case a future refactor moves them.
|
|
5242
5584
|
hangDetectionInterval.unref();
|
|
5243
5585
|
|
|
5586
|
+
// Per-page hang recovery on its own PAGE_HANG_PROBE_INTERVAL_MS cadence; the
// separate hangDetectionInterval owns the slower full-browser-restart
// escalation. Each tick compares processedUrlCount with the value seen on the
// previous tick:
//   - unchanged -> progress is stalled, so probe the in-flight pages
//     (probeInFlightPagesForHang force-closes pages that keep failing probes);
//   - changed   -> progress resumed, so every in-flight page's strike counter
//     is zeroed and a fresh stall starts from scratch.
// lastProbeCount starts at -1 so the first tick can never equal
// processedUrlCount (which begins at 0); the very first window is pure grace.
let lastProbeCount = -1;
const pageHangProbeInterval = setInterval(() => {
  if (processedUrlCount === lastProbeCount) {
    // Fire-and-forget (the probe guards itself against overlapping runs), but
    // never leave the promise floating: the probe has no catch of its own, so
    // anything it throws would surface as an unhandled rejection from a timer
    // callback and could take the whole scan down. The probe is best-effort,
    // so report the failure in debug mode and carry on.
    probeInFlightPagesForHang().catch((probeErr) => {
      if (forceDebug) {
        const detail = probeErr && probeErr.message ? probeErr.message : String(probeErr);
        console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} Page hang probe failed: ${detail}`));
      }
    });
  } else {
    for (const info of _inFlightPages.values()) info.unresponsiveStrikes = 0;
  }
  lastProbeCount = processedUrlCount;
}, PAGE_HANG_PROBE_INTERVAL_MS);
// unref() so this timer on its own never keeps the process alive.
pageHangProbeInterval.unref();
|
|
5601
|
+
|
|
5244
5602
|
// Process URLs in batches with exception handling
|
|
5245
5603
|
let siteGroupIndex = 0;
|
|
5246
5604
|
let currentProxyKey = ''; // Track active proxy config — '' means direct connection
|
|
@@ -5509,6 +5867,19 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5509
5867
|
// actually starting — wrongly skipping live domains. c-ares isn't
|
|
5510
5868
|
// threadpool-bound so it's immune to that contention.
|
|
5511
5869
|
if (dnsPrecheckEnabled && taskDomain && !/^[\d.:]+$|^\[/.test(taskDomain)) {
|
|
5870
|
+
// Already proven dead earlier THIS run — either a pre-check NXDOMAIN or
|
|
5871
|
+
// a prior URL's navigation hit ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE
|
|
5872
|
+
// (recordDeadDomain populates _deadDomains for both). Skip the repeat
|
|
5873
|
+
// instead of paying another fail-open navigation on a multi-URL dead
|
|
5874
|
+
// host (e.g. dlstreams.top?id=39/54/347). In-scan only (NOT persisted):
|
|
5875
|
+
// Chrome resolves via the system resolver, so a nav-level failure could
|
|
5876
|
+
// be a system-resolver glitch on a live host — a false "dead" must not
|
|
5877
|
+
// carry across runs. Cheap: a Map lookup, no DNS resolve.
|
|
5878
|
+
if (_deadDomains.has(taskDomain)) {
|
|
5879
|
+
dnsPrecheckSkips++;
|
|
5880
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check: ${taskDomain} already dead this run (${_deadDomains.get(taskDomain)}) — skipping`));
|
|
5881
|
+
return { url: task.url, rules: [], success: false, error: `DNS: ${_deadDomains.get(taskDomain)}`, skipped: true };
|
|
5882
|
+
}
|
|
5512
5883
|
const cached = dnsNegativeCache.get(taskDomain);
|
|
5513
5884
|
if (cached && Date.now() - cached.timestamp < DNS_NEGATIVE_CACHE_TTL_MS) {
|
|
5514
5885
|
dnsPrecheckSkips++;
|
|
@@ -5525,58 +5896,38 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5525
5896
|
dnsPositiveSkippedHosts.add(taskDomain);
|
|
5526
5897
|
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
|
|
5527
5898
|
// Fall through to navigation -- pre-check "passed" by proxy.
|
|
5899
|
+
} else if (dnsBreaker.isTripped()) {
|
|
5900
|
+
// Resolver is in a refusal storm — pre-checking is futile and only
|
|
5901
|
+
// adds load. Skip the resolve and proceed to navigation (same effect
|
|
5902
|
+
// as a fail-open); no breaker record since no resolve happened.
|
|
5903
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check suspended (resolver circuit open) — proceeding: ${taskDomain}`));
|
|
5528
5904
|
} else {
|
|
5529
|
-
const dnsResolve = async () => {
|
|
5530
|
-
// resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
|
|
5531
|
-
// resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
|
|
5532
|
-
// error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
|
|
5533
|
-
// unchanged so the outer transient-retry path sees the real
|
|
5534
|
-
// resolver code and the negative cache records the right reason.
|
|
5535
|
-
// Previously a bare .catch swallowed everything and tried
|
|
5536
|
-
// resolve6, which masked transient v4-side errors behind
|
|
5537
|
-
// whatever resolve6 ended up reporting.
|
|
5538
|
-
// 2s timeout kept as a real safety net — with c-ares off the
|
|
5539
|
-
// threadpool it should now rarely fire.
|
|
5540
|
-
let timer;
|
|
5541
|
-
try {
|
|
5542
|
-
const timeoutP = new Promise((_, reject) => {
|
|
5543
|
-
timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
|
|
5544
|
-
});
|
|
5545
|
-
const resolveChain = dnsPromises.resolve4(taskDomain)
|
|
5546
|
-
.catch(err => {
|
|
5547
|
-
if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
|
|
5548
|
-
return dnsPromises.resolve6(taskDomain);
|
|
5549
|
-
}
|
|
5550
|
-
throw err;
|
|
5551
|
-
});
|
|
5552
|
-
await Promise.race([resolveChain, timeoutP]);
|
|
5553
|
-
} finally {
|
|
5554
|
-
if (timer) clearTimeout(timer);
|
|
5555
|
-
}
|
|
5556
|
-
};
|
|
5557
|
-
// c-ares transient codes — retry once so a momentary resolver
|
|
5558
|
-
// hiccup doesn't poison the negative cache for 5 minutes.
|
|
5559
|
-
// DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
|
|
5560
5905
|
try {
|
|
5561
|
-
|
|
5562
|
-
|
|
5563
|
-
|
|
5564
|
-
|
|
5565
|
-
|
|
5566
|
-
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
|
|
5567
|
-
await dnsResolve();
|
|
5568
|
-
} else {
|
|
5569
|
-
throw firstErr;
|
|
5570
|
-
}
|
|
5571
|
-
}
|
|
5906
|
+
// Rotates the lead nameserver per attempt and retries once on a
|
|
5907
|
+
// transient error; rejects with the final error (code intact) on
|
|
5908
|
+
// failure. See lib/dns.js.
|
|
5909
|
+
await dnsResolver.resolveHost(taskDomain, dnsPrecheckTimeoutMs);
|
|
5910
|
+
dnsBreaker.record(false); // resolved OK — resolver healthy
|
|
5572
5911
|
} catch (dnsErr) {
|
|
5573
5912
|
const errCode = dnsErr.code || dnsErr.message || 'DNS resolve failed';
|
|
5574
|
-
|
|
5575
|
-
|
|
5576
|
-
|
|
5577
|
-
|
|
5913
|
+
// Only a definitive "host does not exist / has no address" answer
|
|
5914
|
+
// (ENOTFOUND/ENODATA) justifies dropping the URL. A resolver-level
|
|
5915
|
+
// failure (EREFUSED/ESERVFAIL/ETIMEOUT/ECONNREFUSED/timeout) says
|
|
5916
|
+
// nothing about whether the domain is live — fail open: don't cache,
|
|
5917
|
+
// don't skip, let it proceed to real browser navigation (a genuinely
|
|
5918
|
+
// dead host still fails fast there).
|
|
5919
|
+
if (isNonExistenceError(errCode)) {
|
|
5920
|
+
dnsBreaker.record(false); // resolver answered NXDOMAIN — healthy
|
|
5921
|
+
dnsNegativeCacheSet(taskDomain, errCode);
|
|
5922
|
+
recordDeadDomain(taskDomain, errCode);
|
|
5923
|
+
dnsPrecheckSkips++;
|
|
5924
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
|
|
5925
|
+
return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
|
|
5926
|
+
}
|
|
5927
|
+
dnsBreaker.record(true); // resolver error — counts toward tripping the circuit
|
|
5928
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check inconclusive (${errCode}) for ${taskDomain} — proceeding (resolver issue, not a dead host)`));
|
|
5578
5929
|
}
|
|
5579
|
-
} // close `else`
|
|
5930
|
+
} // close the resolve `else` (domainKnownToResolve / circuit-open shortcuts above)
|
|
5580
5931
|
}
|
|
5581
5932
|
} catch {}
|
|
5582
5933
|
|
|
@@ -5609,6 +5960,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5609
5960
|
+ ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
|
|
5610
5961
|
+ 30000
|
|
5611
5962
|
);
|
|
5963
|
+
// Feed the hang-check restart so it never escalates before this URL's own
|
|
5964
|
+
// timeout could have fired (see maxPerUrlTimeoutMs).
|
|
5965
|
+
if (PER_URL_TIMEOUT_MS > maxPerUrlTimeoutMs) maxPerUrlTimeoutMs = PER_URL_TIMEOUT_MS;
|
|
5612
5966
|
// Grace period after primary timeout — gives the orphan a chance to
|
|
5613
5967
|
// finish drainPendingNetTools() and emit "Saving N rules despite page
|
|
5614
5968
|
// load failure" before we abandon its result. Drain typically completes
|
|
@@ -5868,11 +6222,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5868
6222
|
} catch (processingError) {
|
|
5869
6223
|
console.log(formatLogMessage('error', `Critical error: ${processingError.message}`));
|
|
5870
6224
|
clearInterval(hangDetectionInterval);
|
|
6225
|
+
clearInterval(pageHangProbeInterval);
|
|
5871
6226
|
throw processingError;
|
|
5872
6227
|
}
|
|
5873
6228
|
|
|
5874
|
-
// Clear hang detection
|
|
6229
|
+
// Clear hang detection intervals
|
|
5875
6230
|
clearInterval(hangDetectionInterval);
|
|
6231
|
+
clearInterval(pageHangProbeInterval);
|
|
5876
6232
|
|
|
5877
6233
|
// === POST-SCAN PROCESSING ===
|
|
5878
6234
|
// Clean up first-party domains and validate results
|
|
@@ -5954,7 +6310,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5954
6310
|
const totalMatches = results.reduce((sum, r) => sum + (r.rules ? r.rules.length : 0), 0);
|
|
5955
6311
|
|
|
5956
6312
|
// Debug: Show output format being used
|
|
5957
|
-
const totalDomainsSkipped = getTotalDomainsSkipped();
|
|
5958
6313
|
const detectedDomainsCount = getDetectedDomainsCount();
|
|
5959
6314
|
if (forceDebug) {
|
|
5960
6315
|
const globalOptions = {
|
|
@@ -5969,7 +6324,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5969
6324
|
};
|
|
5970
6325
|
console.log(formatLogMessage('debug', `Output format: ${getFormatDescription(globalOptions)}`));
|
|
5971
6326
|
console.log(formatLogMessage('debug', `Generated ${outputResult.totalRules} rules from ${outputResult.successfulPageLoads} successful page loads`));
|
|
5972
|
-
console.log(formatLogMessage('debug', `Performance: ${
|
|
6327
|
+
console.log(formatLogMessage('debug', `Performance: ${detectedDomainsCount} unique domains cached`));
|
|
5973
6328
|
// Cloudflare cache statistics
|
|
5974
6329
|
const cloudflareStats = getCacheStats();
|
|
5975
6330
|
if (cloudflareStats.size > 0) {
|
|
@@ -5998,6 +6353,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5998
6353
|
}
|
|
5999
6354
|
console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
|
|
6000
6355
|
}
|
|
6356
|
+
// End-of-scan summary of DNS circuit-breaker activity (each individual trip
// is also warned about in real time). Intentionally not gated on forceDebug:
// a resolver refusal storm is something the operator should know happened.
// Only silentMode suppresses the line.
const { trips: dnsBreakerTrips } = dnsBreaker.stats();
if (!silentMode && dnsBreakerTrips > 0) {
  const tripSummary = `DNS pre-check circuit tripped ${dnsBreakerTrips}× this scan (resolver refusal back-off)`;
  console.log(formatLogMessage('info', tripSummary));
}
|
|
6001
6363
|
// Blocked-pattern hit stats. Surfaces which patterns are actually
|
|
6002
6364
|
// doing work this scan and (by absence) which are stale enough to
|
|
6003
6365
|
// prune from config. Top 10 by hit count to keep the log scannable
|
|
@@ -6200,8 +6562,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
6200
6562
|
} else if (outputResult.totalRules > 0 && dryRunMode) {
|
|
6201
6563
|
console.log(messageColors.success('Found') + ` ${outputResult.totalRules} total matches across all URLs`);
|
|
6202
6564
|
}
|
|
6203
|
-
|
|
6204
|
-
|
|
6565
|
+
// --show-dead-domains report: every hostname recorded in _deadDomains during
// this scan (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
// Output is one host per line, sorted by hostname, with the reason in a
// trailing tab-separated column so the list stays greppable for pruning.
if (showDeadDomains) {
  const deadCount = _deadDomains.size;
  if (deadCount > 0) {
    const heading = messageColors.warn(`Dead domains (${deadCount}) — did not resolve / unreachable:`);
    console.log(`\n${heading}`);
    // Spread the Map into [host, reason] pairs so sorting never touches the Map itself.
    const byHostname = (left, right) => left[0].localeCompare(right[0]);
    [..._deadDomains].sort(byHostname).forEach(([host, reason]) => {
      console.log(` ${host}\t${reason}`);
    });
  } else {
    console.log(`\n${messageColors.success('Dead domains: none detected')}`);
  }
}
|
|
6206
6578
|
if (ignoreCache && forceDebug) {
|
|
6207
6579
|
console.log(messageColors.info('Cache:') + ` Smart caching was disabled`);
|