@fanboynz/network-scanner 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/CLAUDE.md +2 -1
- package/README.md +33 -5
- package/eslint.config.mjs +13 -1
- package/lib/browserhealth.js +28 -94
- package/lib/dns.js +238 -0
- package/lib/domain-cache.js +14 -127
- package/lib/fingerprint.js +220 -97
- package/lib/fingerprint.md +94 -0
- package/lib/ghost-cursor.js +29 -11
- package/lib/interaction.js +4 -0
- package/lib/nettools.js +154 -51
- package/lib/output.js +24 -13
- package/lib/proxy.js +6 -2
- package/lib/redirect.js +4 -1
- package/lib/smart-cache.js +9 -1
- package/lib/socks-relay.js +14 -9
- package/lib/validate_rules.js +16 -1
- package/nwss.1 +76 -15
- package/nwss.js +389 -113
- package/package.json +1 -1
package/nwss.js
CHANGED
|
@@ -9,11 +9,11 @@ const fs = require('fs');
|
|
|
9
9
|
const os = require('os');
|
|
10
10
|
const psl = require('psl');
|
|
11
11
|
const path = require('path');
|
|
12
|
-
const
|
|
12
|
+
const { createRotatingResolver, createDnsCircuitBreaker, parseDnsServers, isNonExistenceError } = require('./lib/dns');
|
|
13
13
|
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
14
|
-
const { compressMultipleFiles
|
|
14
|
+
const { compressMultipleFiles } = require('./lib/compress');
|
|
15
15
|
const { parseSearchStrings, createResponseHandler } = require('./lib/searchstring');
|
|
16
|
-
const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS } = require('./lib/fingerprint');
|
|
16
|
+
const { applyAllFingerprintSpoofing, USER_AGENT_COLLECTIONS, CHROME_BUILD, CHROME_GREASE_BRAND } = require('./lib/fingerprint');
|
|
17
17
|
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
|
|
18
18
|
// Curl functionality (replace searchstring curl handler)
|
|
19
19
|
const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
|
|
@@ -34,9 +34,7 @@ const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore
|
|
|
34
34
|
// Graceful exit
|
|
35
35
|
const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
|
|
36
36
|
// Whois & Dig
|
|
37
|
-
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
|
|
38
|
-
// File compare
|
|
39
|
-
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
|
|
37
|
+
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve, loadDiskCache, saveDiskCache, setDigResolvers } = require('./lib/nettools');
|
|
40
38
|
// CDP functionality
|
|
41
39
|
const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeout } = require('./lib/cdp');
|
|
42
40
|
// Post-processing cleanup
|
|
@@ -68,14 +66,14 @@ const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
|
|
|
68
66
|
// Enhanced mouse interaction and page simulation
|
|
69
67
|
const { performPageInteraction, createInteractionConfig, computeInteractionCeilingMs, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
|
|
70
68
|
// Optional ghost-cursor support for advanced Bezier-based mouse movements
|
|
71
|
-
const {
|
|
69
|
+
const { createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
|
|
72
70
|
// Domain detection cache for performance optimization
|
|
73
|
-
const { createGlobalHelpers,
|
|
71
|
+
const { createGlobalHelpers, getDetectedDomainsCount } = require('./lib/domain-cache');
|
|
74
72
|
const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
|
|
75
73
|
const { clearPersistentCache } = require('./lib/smart-cache');
|
|
76
74
|
const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
|
|
77
75
|
// Dry run functionality
|
|
78
|
-
const { initializeDryRunCollections, addDryRunMatch,
|
|
76
|
+
const { initializeDryRunCollections, addDryRunMatch, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
|
|
79
77
|
// Enhanced site data clearing functionality
|
|
80
78
|
const { clearSiteData } = require('./lib/clear_sitedata');
|
|
81
79
|
// Referrer header generation
|
|
@@ -137,6 +135,7 @@ const CONCURRENCY_LIMITS = Object.freeze({
|
|
|
137
135
|
// Keep using the imported map directly so the two can never diverge again.
|
|
138
136
|
|
|
139
137
|
const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup
|
|
138
|
+
const REALTIME_CLEANUP_BUFFER_MS = 25000; // Buffer added after site delay before realtime window cleanup
|
|
140
139
|
|
|
141
140
|
/**
|
|
142
141
|
* Detects the installed Puppeteer version dynamically
|
|
@@ -181,7 +180,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
|
|
|
181
180
|
// purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
|
|
182
181
|
// and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
|
|
183
182
|
// automatically — manual purging is no longer needed.
|
|
184
|
-
const { monitorBrowserHealth,
|
|
183
|
+
const { monitorBrowserHealth, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
|
|
185
184
|
|
|
186
185
|
// --- Script Configuration & Constants ---
|
|
187
186
|
const VERSION = '2.0.33'; // Script version
|
|
@@ -191,7 +190,12 @@ const startTime = Date.now();
|
|
|
191
190
|
|
|
192
191
|
// Initialize domain cache helpers with debug logging if enabled
|
|
193
192
|
const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
|
|
194
|
-
|
|
193
|
+
// Only markDomainAsDetected is used — the global cache feeds the end-of-scan
|
|
194
|
+
// "unique domains cached" stat (getDetectedDomainsCount). The skip-check
|
|
195
|
+
// (isDomainAlreadyDetected) is intentionally not wired in: cross-URL dedup is
|
|
196
|
+
// already handled by nettools' global processed-domain sets, smart-cache, and
|
|
197
|
+
// the per-URL local set, so a cache-level skip would be redundant.
|
|
198
|
+
const { markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
|
|
195
199
|
|
|
196
200
|
// Smart cache will be initialized after config is loaded
|
|
197
201
|
let smartCache = null;
|
|
@@ -232,6 +236,9 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
232
236
|
const settingsMap = {
|
|
233
237
|
output: ['-o', '--output'],
|
|
234
238
|
max_concurrent: ['--max-concurrent'],
|
|
239
|
+
cleanup_interval: ['--cleanup-interval'],
|
|
240
|
+
resource_cleanup_interval: ['--cleanup-interval'],
|
|
241
|
+
dns: ['--dns'],
|
|
235
242
|
dns_cache: ['--dns-cache'],
|
|
236
243
|
cache_requests: ['--cache-requests'],
|
|
237
244
|
dumpurls: ['--dumpurls'],
|
|
@@ -243,20 +250,25 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
243
250
|
compress_logs: ['--compress-logs'],
|
|
244
251
|
debug: ['--debug'],
|
|
245
252
|
silent: ['--silent'],
|
|
246
|
-
verbose: ['--verbose'],
|
|
247
253
|
headful: ['--headful'],
|
|
248
254
|
keep_open: ['--keep-open'],
|
|
249
255
|
dry_run: ['--dry-run'],
|
|
250
256
|
titles: ['--titles'],
|
|
251
257
|
sub_domains: ['--sub-domains'],
|
|
252
258
|
no_interact: ['--no-interact'],
|
|
259
|
+
show_dead_domains: ['--show-dead-domains'],
|
|
253
260
|
ghost_cursor: ['--ghost-cursor'],
|
|
254
261
|
plain: ['--plain'],
|
|
255
262
|
cdp: ['--cdp'],
|
|
256
263
|
dnsmasq: ['--dnsmasq'],
|
|
264
|
+
dnsmasq_old: ['--dnsmasq-old'],
|
|
257
265
|
unbound: ['--unbound'],
|
|
258
266
|
privoxy: ['--privoxy'],
|
|
259
267
|
pihole: ['--pihole'],
|
|
268
|
+
adblock_rules: ['--adblock-rules'],
|
|
269
|
+
no_dns_precheck: ['--no-dns-precheck'],
|
|
270
|
+
allow_fullscreen: ['--allow-fullscreen'],
|
|
271
|
+
load_extension: ['--load-extension'],
|
|
260
272
|
eval_on_doc: ['--eval-on-doc'],
|
|
261
273
|
use_puppeteer_core: ['--use-puppeteer-core'],
|
|
262
274
|
ignore_cache: ['--ignore-cache'],
|
|
@@ -314,7 +326,6 @@ if (compareIndex !== -1 && args[compareIndex + 1]) {
|
|
|
314
326
|
}
|
|
315
327
|
|
|
316
328
|
|
|
317
|
-
const forceVerbose = args.includes('--verbose');
|
|
318
329
|
const forceDebug = args.includes('--debug');
|
|
319
330
|
const silentMode = args.includes('--silent');
|
|
320
331
|
const showTitles = args.includes('--titles');
|
|
@@ -337,12 +348,16 @@ const disableInteract = args.includes('--no-interact');
|
|
|
337
348
|
const globalGhostCursor = args.includes('--ghost-cursor');
|
|
338
349
|
const plainOutput = args.includes('--plain');
|
|
339
350
|
const enableCDP = args.includes('--cdp');
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
const
|
|
351
|
+
// These six are reassigned to false by the incompatible-flag validation
|
|
352
|
+
// blocks below (e.g. --dnsmasq + --unbound), so they must be `let` — as
|
|
353
|
+
// `const` that fallback threw "Assignment to constant variable" the moment
|
|
354
|
+
// two conflicting output modes were combined.
|
|
355
|
+
let dnsmasqMode = args.includes('--dnsmasq');
|
|
356
|
+
let dnsmasqOldMode = args.includes('--dnsmasq-old');
|
|
357
|
+
let unboundMode = args.includes('--unbound');
|
|
343
358
|
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
|
|
344
|
-
|
|
345
|
-
|
|
359
|
+
let privoxyMode = args.includes('--privoxy');
|
|
360
|
+
let piholeMode = args.includes('--pihole');
|
|
346
361
|
const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
|
|
347
362
|
const dryRunMode = args.includes('--dry-run');
|
|
348
363
|
const compressLogs = args.includes('--compress-logs');
|
|
@@ -363,6 +378,21 @@ if (dnsCacheMode) enableDiskCache();
|
|
|
363
378
|
const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
|
|
364
379
|
const dnsPrecheckTimeoutMs = 2000;
|
|
365
380
|
|
|
381
|
+
// --show-dead-domains: collect hostnames that are definitively DEAD (do not
|
|
382
|
+
// exist / unreachable) and print them at the end of the scan so they can be
|
|
383
|
+
// pruned. Only hard signals count — NXDOMAIN/ENODATA from the pre-check and
|
|
384
|
+
// ERR_NAME_NOT_RESOLVED / ERR_ADDRESS_UNREACHABLE from navigation. Transient
|
|
385
|
+
// failures (403/429 blocks, timeouts, Cloudflare challenges) mean the domain is
|
|
386
|
+
// ALIVE and are deliberately excluded. host -> reason (first seen).
|
|
387
|
+
const showDeadDomains = args.includes('--show-dead-domains');
|
|
388
|
+
const _deadDomains = new Map();
|
|
389
|
+
/**
 * Record a hostname as dead for the end-of-scan --show-dead-domains report.
 *
 * Does nothing unless --show-dead-domains was passed (showDeadDomains) and a
 * non-empty value is supplied. The first reason recorded for a host wins;
 * later calls for the same host leave the stored reason untouched.
 *
 * @param {string} urlOrHost - Full URL, bare hostname, or bare "host:port".
 * @param {string} reason - Why the host is considered dead (stored as given).
 * @returns {void}
 */
function recordDeadDomain(urlOrHost, reason) {
  if (!showDeadDomains || !urlOrHost) return;
  let host = urlOrHost;
  try {
    host = new URL(urlOrHost).hostname;
    if (!host) {
      // The WHATWG parser does NOT throw on a scheme-less "host:port": it
      // reads "host" as the scheme and yields an empty hostname, which would
      // silently drop the entry. Recover the host for that exact shape only
      // (dotted name + numeric port); other host-less URLs such as mailto:
      // or data: stay excluded, as before.
      const hostPort = /^([^\s/:]+\.[^\s/:]+):\d+$/.exec(urlOrHost);
      if (hostPort) host = hostPort[1];
    }
  } catch { /* already a bare host */ }
  if (host && !_deadDomains.has(host)) _deadDomains.set(host, reason);
}
|
|
395
|
+
|
|
366
396
|
// Per-scan cache of negative DNS lookups. OS resolvers don't always cache
|
|
367
397
|
// NXDOMAIN responses, and a scan can hit the same dead hostname many times
|
|
368
398
|
// (different URL paths on the same site). Positive results are left to the
|
|
@@ -371,14 +401,67 @@ const dnsPrecheckTimeoutMs = 2000;
|
|
|
371
401
|
// of unique dead hosts) can't grow the cache unboundedly. Same pattern as
|
|
372
402
|
// the rest of the codebase's in-memory caches.
|
|
373
403
|
const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
|
|
374
|
-
const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
375
404
|
const DNS_NEGATIVE_CACHE_MAX = 1000;
|
|
405
|
+
// The negative cache holds ONLY definitive non-existence (NXDOMAIN/ENODATA) —
|
|
406
|
+
// resolver errors fail open and never enter it (see the pre-check catch), so
|
|
407
|
+
// persisting it can't silently drop a live host. Opt-in via --dns-cache: dead
|
|
408
|
+
// hosts are remembered for DNS_NEGATIVE_PERSIST_TTL_MS and reloaded next run;
|
|
409
|
+
// otherwise it's a 5-min in-memory-only cache. The persist TTL is deliberately
|
|
410
|
+
// shorter than the dig/whois positive cache (20h): a domain that doesn't exist
|
|
411
|
+
// now MAY get registered, and this is a domain-hunting scanner, so the dead
|
|
412
|
+
// ones are re-checked twice a day rather than trusted for ~a day.
|
|
413
|
+
const DNS_NEGATIVE_PERSIST_TTL_MS = 12 * 60 * 60 * 1000; // 12 hours
|
|
414
|
+
const DNS_NEGATIVE_CACHE_TTL_MS = dnsCacheMode ? DNS_NEGATIVE_PERSIST_TTL_MS : 5 * 60 * 1000;
|
|
415
|
+
const DNS_NEGATIVE_CACHE_FILE = path.join(__dirname, '.dnsnegcache');
|
|
416
|
+
if (dnsCacheMode) {
|
|
417
|
+
// Reuse the dig/whois caches' generic load/save (atomic write, TTL + size
|
|
418
|
+
// bounded). The 'exit' flush is synchronous (writeFileSync) so it fires on
|
|
419
|
+
// any exit path, mirroring nettools' dig/whois flush.
|
|
420
|
+
loadDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX);
|
|
421
|
+
process.on('exit', () => saveDiskCache(DNS_NEGATIVE_CACHE_FILE, dnsNegativeCache, DNS_NEGATIVE_CACHE_TTL_MS, DNS_NEGATIVE_CACHE_MAX));
|
|
422
|
+
}
|
|
376
423
|
let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
|
|
377
424
|
let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
|
|
378
425
|
const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
|
|
379
|
-
//
|
|
380
|
-
//
|
|
381
|
-
|
|
426
|
+
// DNS pre-check resolver (rotation + resolution logic lives in lib/dns.js).
|
|
427
|
+
// `--dns <ip[,ip...]>` (or a `dns` setting in .nwssconfig, mapped to the same
|
|
428
|
+
// flag) pins/rotates an explicit resolver list; otherwise the resolv.conf
|
|
429
|
+
// nameservers are rotated. Rotation spreads the c-ares burst so one server
|
|
430
|
+
// (e.g. a flaky ISP resolver) doesn't absorb every query and answer REFUSED.
|
|
431
|
+
const dnsServerIndex = args.findIndex(arg => arg === '--dns');
|
|
432
|
+
const dnsServersOverride = (dnsServerIndex !== -1 && args[dnsServerIndex + 1])
|
|
433
|
+
? parseDnsServers(args[dnsServerIndex + 1])
|
|
434
|
+
: [];
|
|
435
|
+
const dnsResolver = createRotatingResolver({ servers: dnsServersOverride, forceDebug });
|
|
436
|
+
// Route nettools' dig through the same --dns resolvers (dig otherwise uses the
|
|
437
|
+
// system /etc/resolv.conf, which on a flaky setup times out and silently drops
|
|
438
|
+
// dig-gated domains). Only when --dns is explicitly set.
|
|
439
|
+
if (dnsServersOverride.length > 0) setDigResolvers(dnsServersOverride);
|
|
440
|
+
// Circuit breaker: if resolver errors dominate, suspend the pre-check for a
|
|
441
|
+
// cooldown so a refusal storm doesn't keep hammering a broken resolver (sites
|
|
442
|
+
// still load — a suspended pre-check just proceeds to navigation).
|
|
443
|
+
const dnsBreaker = createDnsCircuitBreaker({ forceDebug });
|
|
444
|
+
if (dnsResolver.pinned && !silentMode) {
|
|
445
|
+
const how = dnsResolver.servers.length === 1 ? 'pinned to' : 'rotating';
|
|
446
|
+
console.log(formatLogMessage('info', `DNS pre-check ${how} ${dnsResolver.servers.join(', ')}`));
|
|
447
|
+
} else if (forceDebug && dnsResolver.rotates) {
|
|
448
|
+
console.log(formatLogMessage('debug', `DNS pre-check rotating ${dnsResolver.servers.length} resolv.conf nameservers: ${dnsResolver.servers.join(', ')}`));
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
// Idle-hang watchdog registry: in-flight main pages, iterable (the
|
|
452
|
+
// browserhealth page trackers are WeakMaps and can't be scanned). Registered
|
|
453
|
+
// when a task starts navigating, removed on completion. The hang check probes
|
|
454
|
+
// these ONLY while global progress is stalled and force-closes any page that is
|
|
455
|
+
// unresponsive across consecutive probes — recovering a single hung URL in ~the
|
|
456
|
+
// hang-check window instead of waiting out its full per-URL ceiling (which is
|
|
457
|
+
// the backstop). Acting only during a stall + requiring unresponsiveness avoids
|
|
458
|
+
// killing a page that's merely slow (a page in a config delay is idle but
|
|
459
|
+
// RESPONDS to a trivial evaluate; a hung one does not). Entries self-heal via
|
|
460
|
+
// isClosed() so timeout/error paths that skip the normal close can't leak.
|
|
461
|
+
const _inFlightPages = new Map(); // page -> { url, unresponsiveStrikes }
|
|
462
|
+
const PAGE_HANG_PROBE_TIMEOUT_MS = 2000; // liveness-probe (page.evaluate) cap; no response within this = hung
|
|
463
|
+
const PAGE_HANG_PROBE_INTERVAL_MS = 15000; // how often to probe in-flight pages while the scan is stalled
|
|
464
|
+
const PAGE_HANG_STRIKES_TO_KILL = 2; // consecutive HUNG probes before force-close (~30s recovery at the 15s interval)
|
|
382
465
|
|
|
383
466
|
function dnsNegativeCacheSet(hostname, error) {
|
|
384
467
|
if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
|
|
@@ -691,7 +774,6 @@ Per-config settings file (.nwssconfig):
|
|
|
691
774
|
See README.md for format details.
|
|
692
775
|
|
|
693
776
|
General Options:
|
|
694
|
-
--verbose Force verbose mode globally
|
|
695
777
|
--debug Force debug mode globally
|
|
696
778
|
--silent Suppress normal console logs
|
|
697
779
|
--titles Add ! <url> title before each site's group
|
|
@@ -721,10 +803,16 @@ General Options:
|
|
|
721
803
|
|
|
722
804
|
Validation Options:
|
|
723
805
|
--cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
|
|
724
|
-
--dns
|
|
806
|
+
--dns <ip[,ip,...]> Resolver(s) for the DNS pre-check AND nettools' dig (not Chrome nav / whois).
|
|
807
|
+
One pins all queries to it; several rotate per query. Overrides /etc/resolv.conf.
|
|
808
|
+
--dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each),
|
|
809
|
+
plus the DNS pre-check negative cache (NXDOMAIN only, 12h TTL, .dnsnegcache)
|
|
725
810
|
--no-dns-precheck Disable per-URL DNS resolution check before page navigation.
|
|
726
811
|
By default, URLs whose hostname doesn't resolve are skipped
|
|
727
812
|
immediately (saves ~5-15s of Puppeteer time per dead host).
|
|
813
|
+
--show-dead-domains At end of scan, list hostnames that did not resolve / were
|
|
814
|
+
unreachable (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
|
|
815
|
+
Excludes blocks/timeouts (those mean the domain is alive). For pruning.
|
|
728
816
|
--validate-config Validate config.json file and exit
|
|
729
817
|
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
|
|
730
818
|
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
|
|
@@ -741,7 +829,7 @@ Global config.json options:
|
|
|
741
829
|
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
|
|
742
830
|
ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
|
|
743
831
|
ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
|
|
744
|
-
max_concurrent_sites:
|
|
832
|
+
max_concurrent_sites: 6 Maximum concurrent site processing (1-50, default: 6)
|
|
745
833
|
resource_cleanup_interval: 80 Browser restart interval in URLs processed (1-1000, default: 80)
|
|
746
834
|
disable_ad_tagging: true/false Disable Chrome AdTagging to prevent ad frame throttling (default: true)
|
|
747
835
|
|
|
@@ -752,8 +840,7 @@ Per-site config.json options:
|
|
|
752
840
|
When true, ALL regex patterns must match the same URL
|
|
753
841
|
|
|
754
842
|
Redirect Handling Options:
|
|
755
|
-
|
|
756
|
-
max_redirects: 10 Maximum number of redirects to follow (default: 10)
|
|
843
|
+
max_redirects: 10 Maximum number of redirects to follow (default: 10; 0 = follow none)
|
|
757
844
|
js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
|
|
758
845
|
detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
|
|
759
846
|
redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
|
|
@@ -1525,7 +1612,12 @@ function matchesDynamicBlock(domain) {
|
|
|
1525
1612
|
return _domainOrParentInSet(_dynamicallyBlockedDomains, domain);
|
|
1526
1613
|
}
|
|
1527
1614
|
|
|
1528
|
-
|
|
1615
|
+
// `_ignorePatterns` is intentionally unused (underscore-marked): every caller
|
|
1616
|
+
// and the grep/curl/nettools/searchstring callback contract pass the ignore
|
|
1617
|
+
// list as a 2nd arg, but the ignore-state actually lives in the module-level
|
|
1618
|
+
// _dynamicallyIgnoredDomains / _ignoreDomainsExact Sets walked below. Kept in
|
|
1619
|
+
// the signature only to preserve that shared call shape.
|
|
1620
|
+
function matchesIgnoreDomain(domain, _ignorePatterns) {
|
|
1529
1621
|
// Both dynamic and static ignore lists are walked parent-by-parent so a
|
|
1530
1622
|
// subdomain of an ignored root inherits the ignore. Previously the
|
|
1531
1623
|
// dynamic check was exact-only, creating an asymmetry: a static-config
|
|
@@ -2116,22 +2208,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2116
2208
|
bypass_cache
|
|
2117
2209
|
} = siteConfig;
|
|
2118
2210
|
|
|
2119
|
-
const allowFirstParty = firstParty === true || firstParty === 1;
|
|
2120
|
-
const allowThirdParty = thirdParty === undefined || thirdParty === true || thirdParty === 1;
|
|
2121
2211
|
const perSiteSubDomains = subDomains === 1 ? true : subDomainsMode;
|
|
2122
|
-
const siteLocalhostIP = localhost || null;
|
|
2123
|
-
const cloudflarePhishBypass = cloudflare_phish === true;
|
|
2124
|
-
const cloudflareBypass = cloudflare_bypass === true;
|
|
2125
2212
|
// Add redirect and same-page loop protection
|
|
2126
|
-
|
|
2213
|
+
// Number check (not ||) so max_redirects: 0 isn't swallowed as falsy → 10.
|
|
2214
|
+
const MAX_REDIRECT_DEPTH = (typeof siteConfig.max_redirects === 'number' && siteConfig.max_redirects >= 0)
|
|
2215
|
+
? siteConfig.max_redirects : 10;
|
|
2127
2216
|
const redirectHistory = new Set();
|
|
2128
2217
|
let redirectCount = 0;
|
|
2129
2218
|
const pageLoadHistory = new Map(); // Track same-page reloads
|
|
2130
2219
|
const MAX_SAME_PAGE_LOADS = 3;
|
|
2131
2220
|
let currentPageUrl = currentUrl;
|
|
2132
2221
|
|
|
2133
|
-
const sitePrivoxy = privoxy === true;
|
|
2134
|
-
const sitePihole = pihole === true;
|
|
2135
2222
|
const flowproxyDetection = flowproxy_detection === true;
|
|
2136
2223
|
|
|
2137
2224
|
const evenBlocked = even_blocked === true;
|
|
@@ -2298,6 +2385,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2298
2385
|
|
|
2299
2386
|
// Track page for realtime cleanup
|
|
2300
2387
|
trackPageForRealtime(page);
|
|
2388
|
+
// Register with the idle-hang watchdog (force-closed if it goes
|
|
2389
|
+
// unresponsive while the whole scan has stalled).
|
|
2390
|
+
_inFlightPages.set(page, { url: currentUrl, unresponsiveStrikes: 0 });
|
|
2301
2391
|
|
|
2302
2392
|
// Mark page as actively processing
|
|
2303
2393
|
updatePageUsage(page, true);
|
|
@@ -2750,7 +2840,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2750
2840
|
if (!useObscura && siteConfig.userAgent && siteConfig.userAgent.toLowerCase().includes('chrome')) {
|
|
2751
2841
|
const userAgentKey = siteConfig.userAgent.toLowerCase();
|
|
2752
2842
|
let platform = 'Windows';
|
|
2753
|
-
let platformVersion = '
|
|
2843
|
+
let platformVersion = '19.0.0'; // Win11 — MUST match fingerprint.js's userAgentData platformVersion
|
|
2754
2844
|
let arch = 'x86';
|
|
2755
2845
|
|
|
2756
2846
|
if (userAgentKey === 'chrome_mac') {
|
|
@@ -2769,21 +2859,46 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2769
2859
|
// never drift out of sync with navigator.userAgent. The version
|
|
2770
2860
|
// used to be hardcoded ('146') while the UA list moved to 148 —
|
|
2771
2861
|
// a detector cross-checking UA vs Sec-CH-UA saw the mismatch.
|
|
2772
|
-
//
|
|
2862
|
+
// The full-version hints carry the REAL build (major.0.BUILD) — the
|
|
2863
|
+
// reduced UA hides it, these reveal it. Build comes from
|
|
2864
|
+
// lib/fingerprint's CHROME_BUILD, the same source the JS
|
|
2865
|
+
// getHighEntropyValues spoof uses, so HTTP and JS can't disagree.
|
|
2773
2866
|
const browserUa = USER_AGENT_COLLECTIONS.get(userAgentKey) || '';
|
|
2774
2867
|
const chromeMajor = (browserUa.match(/Chrome\/(\d+)/) || [])[1] || '148';
|
|
2775
|
-
const fullVer = `${chromeMajor}.0
|
|
2868
|
+
const fullVer = `${chromeMajor}.0.${CHROME_BUILD}`;
|
|
2776
2869
|
|
|
2777
|
-
|
|
2778
|
-
|
|
2870
|
+
const chHeaders = {
|
|
2871
|
+
// Brand list order + grease string match real Chrome of this major
|
|
2872
|
+
// exactly (deterministic GREASE): Chromium, Google Chrome, <grease>.
|
|
2873
|
+
// Same order/grease the JS brands spoof uses, so HTTP and JS agree.
|
|
2874
|
+
'Sec-CH-UA': `"Chromium";v="${chromeMajor}", "Google Chrome";v="${chromeMajor}", "${CHROME_GREASE_BRAND}";v="99"`,
|
|
2779
2875
|
'Sec-CH-UA-Platform': `"${platform}"`,
|
|
2780
2876
|
'Sec-CH-UA-Platform-Version': `"${platformVersion}"`,
|
|
2781
2877
|
'Sec-CH-UA-Mobile': '?0',
|
|
2782
2878
|
'Sec-CH-UA-Arch': `"${arch}"`,
|
|
2783
2879
|
'Sec-CH-UA-Bitness': '"64"',
|
|
2880
|
+
'Sec-CH-UA-WoW64': '?0',
|
|
2881
|
+
'Sec-CH-UA-Model': '""',
|
|
2784
2882
|
'Sec-CH-UA-Full-Version': `"${fullVer}"`,
|
|
2785
|
-
'Sec-CH-UA-Full-Version-List': `"
|
|
2786
|
-
|
|
2883
|
+
'Sec-CH-UA-Full-Version-List': `"Chromium";v="${fullVer}", "Google Chrome";v="${fullVer}", "${CHROME_GREASE_BRAND}";v="99.0.0.0"`,
|
|
2884
|
+
// Real Chrome (128+) sends this for desktop; pairs with the
|
|
2885
|
+
// formFactors value in fingerprint.js's getHighEntropyValues spoof.
|
|
2886
|
+
'Sec-CH-UA-Form-Factors': '"Desktop"'
|
|
2887
|
+
};
|
|
2888
|
+
// Sec-CH-Device-Memory must mirror the JS navigator.deviceMemory
|
|
2889
|
+
// override (8) so a server reading BOTH can't cross-check a mismatch.
|
|
2890
|
+
// That JS override lives in applyFingerprintProtection, so it only
|
|
2891
|
+
// runs when fingerprint_protection is set — gate the header the same
|
|
2892
|
+
// way. Without this gate, a userAgent-only site (no fp_protection)
|
|
2893
|
+
// would get JS deviceMemory = the real host RAM (e.g. 32) but HTTP
|
|
2894
|
+
// = 8, a fresh mismatch. With fp off we send neither and both sides
|
|
2895
|
+
// report the native value, which is also consistent. (RAM isn't
|
|
2896
|
+
// server-observable, so spoofing it down hides datacenter specs with
|
|
2897
|
+
// nothing external to contradict — unlike rtt, which we leave native.)
|
|
2898
|
+
if (siteConfig.fingerprint_protection) {
|
|
2899
|
+
chHeaders['Sec-CH-Device-Memory'] = '8';
|
|
2900
|
+
}
|
|
2901
|
+
await page.setExtraHTTPHeaders(chHeaders);
|
|
2787
2902
|
}
|
|
2788
2903
|
} catch (fingerprintErr) {
|
|
2789
2904
|
if (fingerprintErr.message.includes('Session closed') ||
|
|
@@ -2797,12 +2912,27 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2797
2912
|
|
|
2798
2913
|
const regexes = getCompiledRegexes(siteConfig.filterRegex);
|
|
2799
2914
|
|
|
2915
|
+
// output_regex (optional per-site): extract the rule body from each matched
|
|
2916
|
+
// URL via capture group 1 (or the whole match), so output becomes
|
|
2917
|
+
// ||<capture> (e.g. ||host/script/) instead of ||host^ — lets a stable
|
|
2918
|
+
// folder/file be blocked on a host that also serves legit content. Compiled
|
|
2919
|
+
// silently here; config-load validation (validate_rules) warns on a bad
|
|
2920
|
+
// pattern, so a throw here just disables the feature for this site.
|
|
2921
|
+
// Reuse the memoized regex compiler (same cache as filterRegex) so the
|
|
2922
|
+
// pattern compiles once per unique source, not once per URL. try/catch
|
|
2923
|
+
// because getCompiledRegex throws on a bad pattern — config-load
|
|
2924
|
+
// validation already warned; a throw here just disables the feature.
|
|
2925
|
+
let outputRegex = null;
|
|
2926
|
+
if (siteConfig.output_regex) {
|
|
2927
|
+
try { outputRegex = getCompiledRegexes(siteConfig.output_regex)[0] || null; } catch (_) { outputRegex = null; }
|
|
2928
|
+
}
|
|
2929
|
+
|
|
2800
2930
|
// NEW: Get regex_and setting (defaults to false for backward compatibility)
|
|
2801
2931
|
const useRegexAnd = siteConfig.regex_and === true;
|
|
2802
2932
|
|
|
2803
2933
|
// Parse searchstring patterns using module
|
|
2804
2934
|
const { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd } = parseSearchStrings(siteConfig.searchstring, siteConfig.searchstring_and);
|
|
2805
|
-
|
|
2935
|
+
let useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring (reassigned to false below if curl is unavailable)
|
|
2806
2936
|
let useGrep = siteConfig.grep === true; // Grep can work independently
|
|
2807
2937
|
|
|
2808
2938
|
// Get user agent for curl if needed
|
|
@@ -2984,9 +3114,30 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2984
3114
|
* @param {string} fullSubdomain - Full subdomain for cache tracking
|
|
2985
3115
|
* @param {string} resourceType - Resource type (for --adblock-rules mode)
|
|
2986
3116
|
*/
|
|
2987
|
-
function addMatchedDomain(domain, resourceType = null, fullSubdomain = null) {
|
|
3117
|
+
function addMatchedDomain(domain, resourceType = null, fullSubdomain = null, matchedUrl = null) {
|
|
2988
3118
|
// Use fullSubdomain for cache tracking if provided, otherwise fall back to domain
|
|
2989
3119
|
const cacheKey = fullSubdomain || domain;
|
|
3120
|
+
// output_regex: derive the rule body from the matched URL. Capture group 1
|
|
3121
|
+
// (or the whole match) becomes the stored key, e.g. "host/script/", which
|
|
3122
|
+
// formatDomain emits as ||host/script/ for adblock and falls back to the
|
|
3123
|
+
// bare host for domain-only formats. All similarity / dedup / smart-cache
|
|
3124
|
+
// logic below still runs on the bare host (domain); only the final stored
|
|
3125
|
+
// key changes. The capture must contain both '/' and '.' (i.e. host+path),
|
|
3126
|
+
// otherwise we keep the host so a mis-written regex can't emit garbage.
|
|
3127
|
+
let outputKey = domain;
|
|
3128
|
+
if (outputRegex && matchedUrl) {
|
|
3129
|
+
const m = matchedUrl.match(outputRegex);
|
|
3130
|
+
if (m) {
|
|
3131
|
+
const cap = (m[1] != null ? m[1] : m[0]);
|
|
3132
|
+
// Accept only a host+path shape: a '/' with a real host before it
|
|
3133
|
+
// (segment before the first '/' must contain a '.'). Rejects a
|
|
3134
|
+
// capture that accidentally includes the scheme (host part would be
|
|
3135
|
+
// "https:") or a path-only capture with no host — both fall back to
|
|
3136
|
+
// the bare-host ||host^ rule rather than emit garbage.
|
|
3137
|
+
const sl = cap ? cap.indexOf('/') : -1;
|
|
3138
|
+
if (sl > 0 && cap.slice(0, sl).includes('.')) outputKey = cap;
|
|
3139
|
+
}
|
|
3140
|
+
}
|
|
2990
3141
|
// Check if we should ignore similar domains
|
|
2991
3142
|
const ignoreSimilarEnabled = siteConfig.ignore_similar !== undefined ? siteConfig.ignore_similar : ignore_similar;
|
|
2992
3143
|
const similarityThreshold = siteConfig.ignore_similar_threshold || ignore_similar_threshold;
|
|
@@ -3088,15 +3239,15 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3088
3239
|
}
|
|
3089
3240
|
|
|
3090
3241
|
if (matchedDomains instanceof Map) {
|
|
3091
|
-
if (!matchedDomains.has(
|
|
3092
|
-
matchedDomains.set(
|
|
3242
|
+
if (!matchedDomains.has(outputKey)) {
|
|
3243
|
+
matchedDomains.set(outputKey, new Set());
|
|
3093
3244
|
}
|
|
3094
3245
|
// Only add the specific resourceType that was matched, not all types for this domain
|
|
3095
3246
|
if (resourceType) {
|
|
3096
|
-
matchedDomains.get(
|
|
3247
|
+
matchedDomains.get(outputKey).add(resourceType);
|
|
3097
3248
|
}
|
|
3098
3249
|
} else {
|
|
3099
|
-
matchedDomains.add(
|
|
3250
|
+
matchedDomains.add(outputKey);
|
|
3100
3251
|
}
|
|
3101
3252
|
}
|
|
3102
3253
|
|
|
@@ -3135,12 +3286,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3135
3286
|
// fall back to the default rather than silently disabling capture.
|
|
3136
3287
|
const POPUP_MAX_DEPTH = (() => {
|
|
3137
3288
|
const v = parseInt(siteConfig.capture_popups_max_depth, 10);
|
|
3138
|
-
return Number.isFinite(v) && v > 0 ? v :
|
|
3289
|
+
return Number.isFinite(v) && v > 0 ? v : 4;
|
|
3139
3290
|
})();
|
|
3140
3291
|
const POPUP_CAPTURE_WINDOW_MS = (() => {
|
|
3141
3292
|
const v = parseInt(siteConfig.capture_popups_window_ms, 10);
|
|
3142
3293
|
return Number.isFinite(v) && v > 0 ? v : 5000;
|
|
3143
3294
|
})();
|
|
3295
|
+
// interact_popups: click inside captured popups so they cascade to their
|
|
3296
|
+
// next ad/redirect (requires capture_popups — no popups exist otherwise).
|
|
3297
|
+
// Light pass; the request listener catches whatever the clicks surface.
|
|
3298
|
+
const interactPopups = capturePopups && siteConfig.interact_popups === true;
|
|
3299
|
+
const POPUP_INTERACT_CLICKS = 3; // enough to fire popunder/redirect SDKs (incl. SDKs that suppress the 1st/2nd click as warmup) without runaway cascades
|
|
3144
3300
|
|
|
3145
3301
|
if (capturePopups && forceDebug) {
|
|
3146
3302
|
// One-time setup-time warning if the click prerequisite isn't met.
|
|
@@ -3306,7 +3462,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3306
3462
|
trackNetToolsHandler(() => popupNetToolsHandler(checkedRootDomain, fullSubdomain));
|
|
3307
3463
|
} else {
|
|
3308
3464
|
// No nettools required — regex match alone counts.
|
|
3309
|
-
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain);
|
|
3465
|
+
addMatchedDomain(checkedRootDomain, resourceType, fullSubdomain, checkedUrl);
|
|
3310
3466
|
}
|
|
3311
3467
|
} catch (_) { /* observation-only — never let a popup error escape */ }
|
|
3312
3468
|
};
|
|
@@ -3428,6 +3584,24 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3428
3584
|
|
|
3429
3585
|
attachPopupRequestCapture(popupPage, depth);
|
|
3430
3586
|
|
|
3587
|
+
// interact_popups: click inside the popup so it can cascade to its next
|
|
3588
|
+
// ad/redirect — popunder/redirect SDKs fire on a document-level click,
|
|
3589
|
+
// and a captured-but-unclicked popup only ever shows its landing URL.
|
|
3590
|
+
// Light pass (POPUP_INTERACT_CLICKS random content-zone clicks), only
|
|
3591
|
+
// on popups shallower than max depth so a clicked popup's spawned child
|
|
3592
|
+
// (depth+1) is still within the capture depth. Fire-and-forget: it must
|
|
3593
|
+
// not block onTargetCreated, and the popup may close/navigate mid-click
|
|
3594
|
+
// (performContentClicks no-ops on a closed page). The request listener
|
|
3595
|
+
// above captures whatever the clicks surface; the close timer bounds it.
|
|
3596
|
+
if (interactPopups && depth < POPUP_MAX_DEPTH && !popupPage.isClosed()) {
|
|
3597
|
+
if (forceDebug) console.log(formatLogMessage('debug', `[popup depth=${depth}] interact_popups: ${POPUP_INTERACT_CLICKS} content click(s)`));
|
|
3598
|
+
performContentClicks(popupPage, {
|
|
3599
|
+
clicks: POPUP_INTERACT_CLICKS,
|
|
3600
|
+
forceDebug,
|
|
3601
|
+
realistic: siteConfig.realistic_click === true,
|
|
3602
|
+
}).catch(() => {}); // popup is transient — non-fatal
|
|
3603
|
+
}
|
|
3604
|
+
|
|
3431
3605
|
// Auto-close after the capture window so popups don't pile up.
|
|
3432
3606
|
const closeTimer = setTimeout(() => {
|
|
3433
3607
|
try { if (!popupPage.isClosed()) popupPage.close().catch(() => {}); } catch (_) {}
|
|
@@ -3633,7 +3807,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3633
3807
|
wasBlocked: true
|
|
3634
3808
|
});
|
|
3635
3809
|
} else {
|
|
3636
|
-
addMatchedDomain(reqDomain, resourceType, fullSubdomain);
|
|
3810
|
+
addMatchedDomain(reqDomain, resourceType, fullSubdomain, reqUrl);
|
|
3637
3811
|
}
|
|
3638
3812
|
matchedRegexPatterns.add(evenBlockedRegexPattern);
|
|
3639
3813
|
|
|
@@ -3811,7 +3985,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
3811
3985
|
isFirstParty: isFirstParty
|
|
3812
3986
|
});
|
|
3813
3987
|
} else {
|
|
3814
|
-
|
|
3988
|
+
// Pass null for fullSubdomain (not the in-scope hostname) to keep
|
|
3989
|
+
// this path's dedup key as the root domain exactly as before —
|
|
3990
|
+
// only matchedUrl is new here, for output_regex.
|
|
3991
|
+
addMatchedDomain(reqDomain, resourceType, null, reqUrl);
|
|
3815
3992
|
}
|
|
3816
3993
|
if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
|
|
3817
3994
|
if (siteConfig.verbose === 1) {
|
|
@@ -4450,12 +4627,17 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4450
4627
|
}
|
|
4451
4628
|
}
|
|
4452
4629
|
console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
|
|
4630
|
+
// Capture hard "dead domain" navigation errors for --show-dead-domains
|
|
4631
|
+
// (DNS doesn't resolve / host unreachable). Blocks, timeouts and CF
|
|
4632
|
+
// challenges are NOT dead — they're excluded by this match.
|
|
4633
|
+
const deadNav = /ERR_NAME_NOT_RESOLVED|ERR_ADDRESS_UNREACHABLE|ERR_DNS/.exec(err.message || '');
|
|
4634
|
+
if (deadNav) recordDeadDomain(currentUrl, deadNav[0]);
|
|
4453
4635
|
throw err;
|
|
4454
4636
|
}
|
|
4455
4637
|
}
|
|
4456
4638
|
}
|
|
4457
4639
|
|
|
4458
|
-
const delayMs = siteConfig.delay || DEFAULT_DELAY;
|
|
4640
|
+
const delayMs = siteConfig.delay || TIMEOUTS.DEFAULT_DELAY;
|
|
4459
4641
|
|
|
4460
4642
|
// Optimized delays for Puppeteer 23.x performance
|
|
4461
4643
|
const isFastSite = timeout <= TIMEOUTS.FAST_SITE_THRESHOLD;
|
|
@@ -4535,8 +4717,21 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4535
4717
|
const ghostStart = Date.now();
|
|
4536
4718
|
const ghostTimeLeft = () => ghostDuration - (Date.now() - ghostStart);
|
|
4537
4719
|
|
|
4538
|
-
//
|
|
4539
|
-
|
|
4720
|
+
// Honor interact_click_count in ghost mode too (built-in default
|
|
4721
|
+
// is 3 — ad SDKs often swallow the 1st/2nd click as warmup). Same
|
|
4722
|
+
// default + 20-cap as the built-in content-click path. 0 when
|
|
4723
|
+
// element clicks are disabled.
|
|
4724
|
+
const ghostClickCount = interactionConfig.includeElementClicks
|
|
4725
|
+
? Math.min(Math.max(Number(siteConfig.interact_click_count) || 3, 1), 20)
|
|
4726
|
+
: 0;
|
|
4727
|
+
// Reserve part of the duration budget for those clicks so the
|
|
4728
|
+
// movement loop doesn't consume all of ghost_cursor_duration.
|
|
4729
|
+
// Capped at half the budget so movement still happens; raise
|
|
4730
|
+
// ghost_cursor_duration to fit more clicks.
|
|
4731
|
+
const clickReserveMs = Math.min(ghostClickCount * 600, ghostDuration * 0.5);
|
|
4732
|
+
|
|
4733
|
+
// Time-based Bezier mouse movements — runs for the unreserved budget
|
|
4734
|
+
while (ghostTimeLeft() > 200 + clickReserveMs) {
|
|
4540
4735
|
const toX = Math.floor(Math.random() * (viewport.width - 100)) + 50;
|
|
4541
4736
|
const toY = Math.floor(Math.random() * (viewport.height - 100)) + 50;
|
|
4542
4737
|
await ghostMove(cursor, toX, toY, {
|
|
@@ -4544,18 +4739,23 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4544
4739
|
overshootThreshold: ghostConfig.overshootThreshold,
|
|
4545
4740
|
forceDebug
|
|
4546
4741
|
});
|
|
4547
|
-
if (ghostTimeLeft() > 100) {
|
|
4742
|
+
if (ghostTimeLeft() > 100 + clickReserveMs) {
|
|
4548
4743
|
await new Promise(r => setTimeout(r, 25 + Math.random() * 75));
|
|
4549
4744
|
}
|
|
4550
4745
|
}
|
|
4551
4746
|
if (ghostTimeLeft() > 100 && Math.random() < 0.3) {
|
|
4552
4747
|
await ghostRandomMove(cursor, { forceDebug });
|
|
4553
4748
|
}
|
|
4554
|
-
|
|
4749
|
+
// interact_click_count clicks, each to a fresh content-zone point.
|
|
4750
|
+
// The time guard stops early if the budget runs out (raise
|
|
4751
|
+
// ghost_cursor_duration for more).
|
|
4752
|
+
for (let gc = 0; gc < ghostClickCount && ghostTimeLeft() > 100; gc++) {
|
|
4555
4753
|
const clickX = Math.floor(viewport.width * 0.2 + Math.random() * viewport.width * 0.6);
|
|
4556
4754
|
const clickY = Math.floor(viewport.height * 0.2 + Math.random() * viewport.height * 0.6);
|
|
4557
4755
|
await ghostClick(cursor, { x: clickX, y: clickY }, {
|
|
4558
4756
|
hesitate: ghostConfig.hesitate,
|
|
4757
|
+
page,
|
|
4758
|
+
realistic: siteConfig.realistic_click === true,
|
|
4559
4759
|
forceDebug
|
|
4560
4760
|
});
|
|
4561
4761
|
}
|
|
@@ -4870,7 +5070,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4870
5070
|
// Only add delay if we're continuing with more reloads
|
|
4871
5071
|
if (i < totalReloads) {
|
|
4872
5072
|
// Reduce delay for problematic sites
|
|
4873
|
-
const adjustedDelay = i > 1 ? Math.min(DEFAULT_DELAY, 2000) : DEFAULT_DELAY;
|
|
5073
|
+
const adjustedDelay = i > 1 ? Math.min(TIMEOUTS.DEFAULT_DELAY, 2000) : TIMEOUTS.DEFAULT_DELAY;
|
|
4874
5074
|
await fastTimeout(adjustedDelay);
|
|
4875
5075
|
}
|
|
4876
5076
|
}
|
|
@@ -5074,6 +5274,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5074
5274
|
if (!keepBrowserOpen) {
|
|
5075
5275
|
try {
|
|
5076
5276
|
untrackPage(page);
|
|
5277
|
+
_inFlightPages.delete(page);
|
|
5077
5278
|
await page.close();
|
|
5078
5279
|
if (forceDebug) console.log(formatLogMessage('debug', `Page closed for ${currentUrl}`));
|
|
5079
5280
|
} catch (pageCloseErr) {
|
|
@@ -5174,6 +5375,12 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5174
5375
|
let lastProcessedCount = 0;
|
|
5175
5376
|
let hangCheckCount = 0;
|
|
5176
5377
|
let forceRestartFlag = false; // Flag to trigger restart on next iteration
|
|
5378
|
+
// Largest per-URL timeout budget seen across tasks. The hang-check restart
|
|
5379
|
+
// scales to this so it can't false-fire on a legitimately-slow config (high
|
|
5380
|
+
// delay × reload × interact) whose per-URL budget exceeds a flat threshold —
|
|
5381
|
+
// the emergency restart should only fire once the per-URL timeout ITSELF has
|
|
5382
|
+
// had its chance and failed (a true browser hang).
|
|
5383
|
+
let maxPerUrlTimeoutMs = 0;
|
|
5177
5384
|
|
|
5178
5385
|
// Precomputed colored '[HANG CHECK]' subsystem prefix. formatLogMessage
|
|
5179
5386
|
// only colors the [severity] tag; the '[HANG CHECK]' substring was
|
|
@@ -5181,6 +5388,48 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5181
5388
|
// entry so the interval callback doesn't re-colorize per tick.
|
|
5182
5389
|
const HANG_CHECK_TAG = messageColors.processing('[HANG CHECK]');
|
|
5183
5390
|
|
|
5391
|
+
// Idle-hang watchdog. Runs only while the scan is stalled (no URL completing).
|
|
5392
|
+
// The probe distinguishes a HUNG renderer from one that's merely NAVIGATING,
|
|
5393
|
+
// which is the key to probing aggressively without false-kills:
|
|
5394
|
+
// - evaluate resolves -> 'alive' -> reset strikes
|
|
5395
|
+
// - evaluate rejects fast (e.g. "Execution context destroyed" mid goto/
|
|
5396
|
+
// reload) -> 'navigating' -> inconclusive: neither
|
|
5397
|
+
// strike nor reset, so a
|
|
5398
|
+
// navigation can NEVER trip
|
|
5399
|
+
// the kill regardless of cadence
|
|
5400
|
+
// - no response within the cap -> 'hung' -> strike
|
|
5401
|
+
// PAGE_HANG_STRIKES_TO_KILL consecutive HUNG probes force-close the page, so the
|
|
5402
|
+
// stuck task's awaits reject and its batch completes instead of waiting out the
|
|
5403
|
+
// full per-URL ceiling. Parallel, guarded against overlap; zero overhead off a stall.
|
|
5404
|
+
// Re-entrancy guard: true while a probe pass is still awaiting its pages, so a
// pass that outlives one tick is never overlapped by the next.
let _hangProbeInProgress = false;

/**
 * Probe every tracked in-flight page once and force-close the ones that have
 * stopped responding.
 *
 * Each page gets a trivial `page.evaluate` raced against a
 * PAGE_HANG_PROBE_TIMEOUT_MS cap, producing one of three verdicts:
 *   - 'alive'      evaluate resolved            -> strike counter reset to 0
 *   - 'navigating' evaluate rejected            -> inconclusive, counter untouched
 *   - 'hung'       no answer within the cap     -> one strike
 * When a page reaches PAGE_HANG_STRIKES_TO_KILL strikes it is removed from
 * `_inFlightPages` and closed. Pages that are already closed are simply
 * dropped from tracking.
 *
 * Pages are probed in parallel. The call is a no-op when a previous pass is
 * still running or when nothing is tracked.
 *
 * @returns {Promise<void>} Settles once every page in this pass has a verdict.
 */
const probeInFlightPagesForHang = async () => {
  if (_hangProbeInProgress || _inFlightPages.size === 0) return;
  _hangProbeInProgress = true;
  try {
    // Snapshot the entries so deletions during the pass don't disturb iteration.
    await Promise.all([..._inFlightPages.entries()].map(async ([page, info]) => {
      if (page.isClosed()) {
        _inFlightPages.delete(page);
        return;
      }

      let verdict;
      let capTimer;
      try {
        verdict = await Promise.race([
          // Both outcomes are mapped to a verdict, so this arm never rejects.
          page.evaluate(() => true).then(() => 'alive', () => 'navigating'),
          new Promise((resolve) => {
            capTimer = setTimeout(() => resolve('hung'), PAGE_HANG_PROBE_TIMEOUT_MS);
          }),
        ]);
      } catch {
        // Only reachable if evaluate throws synchronously; treat as no response.
        verdict = 'hung';
      } finally {
        // Release the cap timer once a verdict exists. Without this every
        // probe that answers quickly leaves a live timer pending for the
        // whole cap. clearTimeout(undefined) is a harmless no-op.
        clearTimeout(capTimer);
      }

      if (verdict === 'alive') {
        info.unresponsiveStrikes = 0;
        return;
      }
      // Context destroyed mid-navigation is not a hang: neither strike nor reset.
      if (verdict === 'navigating') return;

      // verdict === 'hung': the renderer gave no response within the cap.
      info.unresponsiveStrikes++;
      if (info.unresponsiveStrikes >= PAGE_HANG_STRIKES_TO_KILL) {
        console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} Force-closing hung page after ${info.unresponsiveStrikes} unresponsive probes: ${info.url}`));
        _inFlightPages.delete(page);
        // Not awaited: closing a hung page may itself stall. The close makes
        // the stuck task's pending awaits reject so its batch can complete.
        page.close().catch(() => {});
      }
    }));
  } finally {
    _hangProbeInProgress = false;
  }
};
|
|
5432
|
+
|
|
5184
5433
|
const hangDetectionInterval = setInterval(() => {
|
|
5185
5434
|
// Progress check, counter, and forceRestartFlag MUST run regardless of
|
|
5186
5435
|
// debug mode — previously the entire body was gated on forceDebug, which
|
|
@@ -5193,8 +5442,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5193
5442
|
if (forceDebug) {
|
|
5194
5443
|
console.log(formatLogMessage('warn', `${HANG_CHECK_TAG} No progress for ${hangCheckCount * 30}s`));
|
|
5195
5444
|
}
|
|
5196
|
-
|
|
5197
|
-
|
|
5445
|
+
// The faster 15s probe interval below does surgical per-page recovery; this
|
|
5446
|
+
// 30s interval owns only the slower nuclear-restart escalation. Deadline-
|
|
5447
|
+
// aware: the restart only fires once the stall has OUTLASTED the heaviest
|
|
5448
|
+
// in-flight per-URL budget (+ grace) — i.e. the per-URL timeout itself had
|
|
5449
|
+
// its chance and failed, a true hang. A flat threshold (the old 2.5min)
|
|
5450
|
+
// false-fires on legitimately-slow configs (high delay × reload × interact)
|
|
5451
|
+
// whose per-URL budget exceeds it, restarting the browser mid-work. Floor
|
|
5452
|
+
// at 150s so light configs behave exactly as before.
|
|
5453
|
+
// +45s buffer covers the per-URL 8s orphan grace + the 30s tick granularity + slack.
|
|
5454
|
+
const restartAfterMs = Math.max(150000, maxPerUrlTimeoutMs + 45000);
|
|
5455
|
+
if (hangCheckCount * 30000 >= restartAfterMs) {
|
|
5456
|
+
console.log(formatLogMessage('error', `${HANG_CHECK_TAG} No progress for ${Math.round(hangCheckCount * 30)}s (past the ${Math.round(restartAfterMs / 1000)}s per-URL budget). Triggering emergency browser restart.`));
|
|
5198
5457
|
forceRestartFlag = true; // Set flag instead of exiting
|
|
5199
5458
|
hangCheckCount = 0; // Reset counter for next cycle
|
|
5200
5459
|
}
|
|
@@ -5216,6 +5475,22 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5216
5475
|
// cleanup, this is belt-and-suspenders in case a future refactor moves them.
|
|
5217
5476
|
hangDetectionInterval.unref();
|
|
5218
5477
|
|
|
5478
|
+
// Fast surgical recovery on its own 15s cadence (the 30s interval above owns
|
|
5479
|
+
// the slower nuclear-restart escalation). Probes in-flight pages only while
|
|
5480
|
+
// progress is stalled and force-closes confirmed-hung ones; clears strikes when
|
|
5481
|
+
// progress resumes so a fresh stall starts from zero. Starts at -1 so the very
|
|
5482
|
+
// first window is grace (processedUrlCount begins at 0).
|
|
5483
|
+
// Value of processedUrlCount at the previous probe tick. Seeded with -1 so the
// first tick cannot compare equal (presumably the counter is never negative)
// and is therefore treated as progress rather than as a stall.
let lastProbeCount = -1;

// Per-page hang recovery timer. On each tick:
//   - no URL completed since the last tick -> probe the in-flight pages
//   - progress was made                    -> clear every page's strike count,
//     so a later stall starts counting from zero
const pageHangProbeInterval = setInterval(() => {
  if (processedUrlCount === lastProbeCount) {
    // Fire-and-forget (the probe guards itself against overlapping passes).
    // The rejection handler matters: a throw inside the probe would otherwise
    // become an unhandled rejection and could abort the whole scan.
    probeInFlightPagesForHang().catch((probeErr) => {
      if (forceDebug) {
        const reason = probeErr && probeErr.message ? probeErr.message : probeErr;
        console.log(formatLogMessage('debug', `${HANG_CHECK_TAG} Page hang probe failed: ${reason}`));
      }
    });
  } else {
    for (const info of _inFlightPages.values()) info.unresponsiveStrikes = 0;
  }
  lastProbeCount = processedUrlCount;
}, PAGE_HANG_PROBE_INTERVAL_MS);
// Don't let this timer alone keep the process alive.
pageHangProbeInterval.unref();
|
|
5493
|
+
|
|
5219
5494
|
// Process URLs in batches with exception handling
|
|
5220
5495
|
let siteGroupIndex = 0;
|
|
5221
5496
|
let currentProxyKey = ''; // Track active proxy config — '' means direct connection
|
|
@@ -5500,58 +5775,38 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5500
5775
|
dnsPositiveSkippedHosts.add(taskDomain);
|
|
5501
5776
|
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check skipped (dig/whois cache confirms resolution): ${taskDomain}`));
|
|
5502
5777
|
// Fall through to navigation -- pre-check "passed" by proxy.
|
|
5778
|
+
} else if (dnsBreaker.isTripped()) {
|
|
5779
|
+
// Resolver is in a refusal storm — pre-checking is futile and only
|
|
5780
|
+
// adds load. Skip the resolve and proceed to navigation (same effect
|
|
5781
|
+
// as a fail-open); no breaker record since no resolve happened.
|
|
5782
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check suspended (resolver circuit open) — proceeding: ${taskDomain}`));
|
|
5503
5783
|
} else {
|
|
5504
|
-
const dnsResolve = async () => {
|
|
5505
|
-
// resolve4 first; on no-IPv4 (ENODATA / ENOTFOUND) fall back to
|
|
5506
|
-
// resolve6 so IPv6-only hosts aren't wrongly skipped. ANY OTHER
|
|
5507
|
-
// error code (ESERVFAIL, ETIMEOUT, EREFUSED, etc.) propagates
|
|
5508
|
-
// unchanged so the outer transient-retry path sees the real
|
|
5509
|
-
// resolver code and the negative cache records the right reason.
|
|
5510
|
-
// Previously a bare .catch swallowed everything and tried
|
|
5511
|
-
// resolve6, which masked transient v4-side errors behind
|
|
5512
|
-
// whatever resolve6 ended up reporting.
|
|
5513
|
-
// 2s timeout kept as a real safety net — with c-ares off the
|
|
5514
|
-
// threadpool it should now rarely fire.
|
|
5515
|
-
let timer;
|
|
5516
|
-
try {
|
|
5517
|
-
const timeoutP = new Promise((_, reject) => {
|
|
5518
|
-
timer = setTimeout(() => reject(new Error('DNS timeout')), dnsPrecheckTimeoutMs);
|
|
5519
|
-
});
|
|
5520
|
-
const resolveChain = dnsPromises.resolve4(taskDomain)
|
|
5521
|
-
.catch(err => {
|
|
5522
|
-
if (err && (err.code === 'ENODATA' || err.code === 'ENOTFOUND')) {
|
|
5523
|
-
return dnsPromises.resolve6(taskDomain);
|
|
5524
|
-
}
|
|
5525
|
-
throw err;
|
|
5526
|
-
});
|
|
5527
|
-
await Promise.race([resolveChain, timeoutP]);
|
|
5528
|
-
} finally {
|
|
5529
|
-
if (timer) clearTimeout(timer);
|
|
5530
|
-
}
|
|
5531
|
-
};
|
|
5532
|
-
// c-ares transient codes — retry once so a momentary resolver
|
|
5533
|
-
// hiccup doesn't poison the negative cache for 5 minutes.
|
|
5534
|
-
// DNS_TRANSIENT_ERRORS is module-level so we don't allocate per task.
|
|
5535
5784
|
try {
|
|
5536
|
-
|
|
5537
|
-
|
|
5538
|
-
|
|
5539
|
-
|
|
5540
|
-
|
|
5541
|
-
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check transient (${code || 'timeout'}) for ${taskDomain}, retrying once`));
|
|
5542
|
-
await dnsResolve();
|
|
5543
|
-
} else {
|
|
5544
|
-
throw firstErr;
|
|
5545
|
-
}
|
|
5546
|
-
}
|
|
5785
|
+
// Rotates the lead nameserver per attempt and retries once on a
|
|
5786
|
+
// transient error; rejects with the final error (code intact) on
|
|
5787
|
+
// failure. See lib/dns.js.
|
|
5788
|
+
await dnsResolver.resolveHost(taskDomain, dnsPrecheckTimeoutMs);
|
|
5789
|
+
dnsBreaker.record(false); // resolved OK — resolver healthy
|
|
5547
5790
|
} catch (dnsErr) {
|
|
5548
5791
|
const errCode = dnsErr.code || dnsErr.message || 'DNS resolve failed';
|
|
5549
|
-
|
|
5550
|
-
|
|
5551
|
-
|
|
5552
|
-
|
|
5792
|
+
// Only a definitive "host does not exist / has no address" answer
|
|
5793
|
+
// (ENOTFOUND/ENODATA) justifies dropping the URL. A resolver-level
|
|
5794
|
+
// failure (EREFUSED/ESERVFAIL/ETIMEOUT/ECONNREFUSED/timeout) says
|
|
5795
|
+
// nothing about whether the domain is live — fail open: don't cache,
|
|
5796
|
+
// don't skip, let it proceed to real browser navigation (a genuinely
|
|
5797
|
+
// dead host still fails fast there).
|
|
5798
|
+
if (isNonExistenceError(errCode)) {
|
|
5799
|
+
dnsBreaker.record(false); // resolver answered NXDOMAIN — healthy
|
|
5800
|
+
dnsNegativeCacheSet(taskDomain, errCode);
|
|
5801
|
+
recordDeadDomain(taskDomain, errCode);
|
|
5802
|
+
dnsPrecheckSkips++;
|
|
5803
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check failed: ${taskDomain} — ${errCode}`));
|
|
5804
|
+
return { url: task.url, rules: [], success: false, error: `DNS: ${errCode}`, skipped: true };
|
|
5805
|
+
}
|
|
5806
|
+
dnsBreaker.record(true); // resolver error — counts toward tripping the circuit
|
|
5807
|
+
if (forceDebug) console.log(formatLogMessage('debug', `DNS pre-check inconclusive (${errCode}) for ${taskDomain} — proceeding (resolver issue, not a dead host)`));
|
|
5553
5808
|
}
|
|
5554
|
-
} // close `else`
|
|
5809
|
+
} // close the resolve `else` (domainKnownToResolve / circuit-open shortcuts above)
|
|
5555
5810
|
}
|
|
5556
5811
|
} catch {}
|
|
5557
5812
|
|
|
@@ -5584,6 +5839,9 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5584
5839
|
+ ((task.config.delay || 0) + INTERACTION_OVERHEAD_MS) * (1 + reloadCount)
|
|
5585
5840
|
+ 30000
|
|
5586
5841
|
);
|
|
5842
|
+
// Feed the hang-check restart so it never escalates before this URL's own
|
|
5843
|
+
// timeout could have fired (see maxPerUrlTimeoutMs).
|
|
5844
|
+
if (PER_URL_TIMEOUT_MS > maxPerUrlTimeoutMs) maxPerUrlTimeoutMs = PER_URL_TIMEOUT_MS;
|
|
5587
5845
|
// Grace period after primary timeout — gives the orphan a chance to
|
|
5588
5846
|
// finish drainPendingNetTools() and emit "Saving N rules despite page
|
|
5589
5847
|
// load failure" before we abandon its result. Drain typically completes
|
|
@@ -5843,11 +6101,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5843
6101
|
} catch (processingError) {
|
|
5844
6102
|
console.log(formatLogMessage('error', `Critical error: ${processingError.message}`));
|
|
5845
6103
|
clearInterval(hangDetectionInterval);
|
|
6104
|
+
clearInterval(pageHangProbeInterval);
|
|
5846
6105
|
throw processingError;
|
|
5847
6106
|
}
|
|
5848
6107
|
|
|
5849
|
-
// Clear hang detection
|
|
6108
|
+
// Clear hang detection intervals
|
|
5850
6109
|
clearInterval(hangDetectionInterval);
|
|
6110
|
+
clearInterval(pageHangProbeInterval);
|
|
5851
6111
|
|
|
5852
6112
|
// === POST-SCAN PROCESSING ===
|
|
5853
6113
|
// Clean up first-party domains and validate results
|
|
@@ -5929,7 +6189,6 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5929
6189
|
const totalMatches = results.reduce((sum, r) => sum + (r.rules ? r.rules.length : 0), 0);
|
|
5930
6190
|
|
|
5931
6191
|
// Debug: Show output format being used
|
|
5932
|
-
const totalDomainsSkipped = getTotalDomainsSkipped();
|
|
5933
6192
|
const detectedDomainsCount = getDetectedDomainsCount();
|
|
5934
6193
|
if (forceDebug) {
|
|
5935
6194
|
const globalOptions = {
|
|
@@ -5944,7 +6203,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5944
6203
|
};
|
|
5945
6204
|
console.log(formatLogMessage('debug', `Output format: ${getFormatDescription(globalOptions)}`));
|
|
5946
6205
|
console.log(formatLogMessage('debug', `Generated ${outputResult.totalRules} rules from ${outputResult.successfulPageLoads} successful page loads`));
|
|
5947
|
-
console.log(formatLogMessage('debug', `Performance: ${
|
|
6206
|
+
console.log(formatLogMessage('debug', `Performance: ${detectedDomainsCount} unique domains cached`));
|
|
5948
6207
|
// Cloudflare cache statistics
|
|
5949
6208
|
const cloudflareStats = getCacheStats();
|
|
5950
6209
|
if (cloudflareStats.size > 0) {
|
|
@@ -5973,6 +6232,13 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
5973
6232
|
}
|
|
5974
6233
|
console.log(formatLogMessage('debug', `DNS pre-check skipped: ${parts.join(', ')}`));
|
|
5975
6234
|
}
|
|
6235
|
+
// Surface circuit-breaker activity in the end-of-scan summary (each trip
|
|
6236
|
+
// also warns in real time). Shown outside forceDebug because a resolver
|
|
6237
|
+
// refusal storm is something the operator should know happened.
|
|
6238
|
+
const dnsBreakerTrips = dnsBreaker.stats().trips;
|
|
6239
|
+
if (dnsBreakerTrips > 0 && !silentMode) {
|
|
6240
|
+
console.log(formatLogMessage('info', `DNS pre-check circuit tripped ${dnsBreakerTrips}× this scan (resolver refusal back-off)`));
|
|
6241
|
+
}
|
|
5976
6242
|
// Blocked-pattern hit stats. Surfaces which patterns are actually
|
|
5977
6243
|
// doing work this scan and (by absence) which are stale enough to
|
|
5978
6244
|
// prune from config. Top 10 by hit count to keep the log scannable
|
|
@@ -6175,8 +6441,18 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
6175
6441
|
} else if (outputResult.totalRules > 0 && dryRunMode) {
|
|
6176
6442
|
console.log(messageColors.success('Found') + ` ${outputResult.totalRules} total matches across all URLs`);
|
|
6177
6443
|
}
|
|
6178
|
-
|
|
6179
|
-
|
|
6444
|
+
// --show-dead-domains: list hostnames that didn't resolve / were unreachable
|
|
6445
|
+
// this scan (NXDOMAIN/ENODATA + ERR_NAME_NOT_RESOLVED/ERR_ADDRESS_UNREACHABLE).
|
|
6446
|
+
// One host per line so it's greppable for pruning; reason in the trailing column.
|
|
6447
|
+
if (showDeadDomains) {
|
|
6448
|
+
if (_deadDomains.size > 0) {
|
|
6449
|
+
console.log(`\n${messageColors.warn(`Dead domains (${_deadDomains.size}) — did not resolve / unreachable:`)}`);
|
|
6450
|
+
for (const [host, reason] of [..._deadDomains].sort((a, b) => a[0].localeCompare(b[0]))) {
|
|
6451
|
+
console.log(` ${host}\t${reason}`);
|
|
6452
|
+
}
|
|
6453
|
+
} else {
|
|
6454
|
+
console.log(`\n${messageColors.success('Dead domains: none detected')}`);
|
|
6455
|
+
}
|
|
6180
6456
|
}
|
|
6181
6457
|
if (ignoreCache && forceDebug) {
|
|
6182
6458
|
console.log(messageColors.info('Cache:') + ` Smart caching was disabled`);
|