@fanboynz/network-scanner 1.0.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/nwss.js ADDED
@@ -0,0 +1,2488 @@
1
+ // === Network scanner script (nwss.js) v1.0.35 ===
2
+
3
+ // puppeteer for browser automation, fs for file system operations, psl for domain parsing.
4
+ // const pLimit = require('p-limit'); // Will be dynamically imported
5
+ const puppeteer = require('puppeteer');
6
+ const fs = require('fs');
7
+ const psl = require('psl');
8
+ const path = require('path');
9
+ const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
10
+ const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
11
+ const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
12
+ const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
13
+ const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
14
+ // Rule validation
15
+ const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
16
+ // CF Bypass
17
+ const { handleCloudflareProtection } = require('./lib/cloudflare');
18
+ // FP Bypass
19
+ const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy');
20
+ // ignore_similar rules
21
+ const { shouldIgnoreSimilarDomain } = require('./lib/ignore_similar');
22
+ // Graceful exit
23
+ const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit');
24
+ // Whois & Dig
25
+ const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability } = require('./lib/nettools');
26
+ // File compare
27
+ const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
28
+ // Colorize various text when used
29
+ const { colorize, colors, messageColors, tags, formatLogMessage } = require('./lib/colorize');
30
+ // Enhanced redirect handling
31
+ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
32
+ // Ensure web browser is working correctly
33
+ const { monitorBrowserHealth, isBrowserHealthy } = require('./lib/browserhealth');
34
+
35
// --- Script Configuration & Constants ---
const VERSION = '1.0.35'; // Script version; this is the string printed by --version
// NOTE(review): presumably the cap on sites processed in parallel - the consumer is not visible in this part of the file, confirm
const MAX_CONCURRENT_SITES = 5;
const RESOURCE_CLEANUP_INTERVAL = 80; // Close browser and restart every N sites to free resources

// Wall-clock start of this run, in epoch milliseconds (Date.now())
const startTime = Date.now();
42
+
43
// --- Command-Line Argument Parsing ---
const args = process.argv.slice(2);

// With no arguments at all, behave as if --help had been requested.
if (args.length === 0) {
  args.push('--help');
}

/**
 * Reads the value that follows a flag on the command line.
 *
 * @param {string[]} argList - Full argument list.
 * @param {number} flagIndex - Index of the flag inside argList, or -1 when the flag is absent.
 * @param {boolean} [rejectFlagLike=false] - When true, a following token starting with `--`
 *   is treated as "no value" (used for flags whose file argument is optional).
 * @returns {string|null} The value after the flag, or null when there is none.
 */
function readCliOptionValue(argList, flagIndex, rejectFlagLike = false) {
  if (flagIndex === -1) return null;
  const value = argList[flagIndex + 1];
  if (!value) return null;
  if (rejectFlagLike && value.startsWith('--')) return null;
  return value;
}

const headfulMode = args.includes('--headful');
const SOURCES_FOLDER = 'sources';

const outputIndex = args.findIndex(arg => arg === '--output' || arg === '-o');
let outputFile = readCliOptionValue(args, outputIndex);

const appendMode = args.includes('--append');

const compareIndex = args.findIndex(arg => arg === '--compare');
let compareFile = readCliOptionValue(args, compareIndex);

const forceVerbose = args.includes('--verbose');
const forceDebug = args.includes('--debug');
const silentMode = args.includes('--silent');
const showTitles = args.includes('--titles');
const dumpUrls = args.includes('--dumpurls');
const subDomainsMode = args.includes('--sub-domains');
const localhostMode = args.includes('--localhost');
const localhostModeAlt = args.includes('--localhost-0.0.0.0');
const disableInteract = args.includes('--no-interact');
const plainOutput = args.includes('--plain');
const enableCDP = args.includes('--cdp');
// The output-format flags below are `let`, not `const`: the validation code later in this
// file switches them off (`xMode = false`) when they conflict with another output mode,
// and assigning to a `const` binding throws a TypeError.
let dnsmasqMode = args.includes('--dnsmasq');
let dnsmasqOldMode = args.includes('--dnsmasq-old');
let unboundMode = args.includes('--unbound');
// '--remove-dubes' is accepted as an alias (common misspelling) of '--remove-dupes'.
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
let privoxyMode = args.includes('--privoxy');
let piholeMode = args.includes('--pihole');
const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
const dryRunMode = args.includes('--dry-run');
const compressLogs = args.includes('--compress-logs');
const removeTempFiles = args.includes('--remove-tempfiles');
const validateConfig = args.includes('--validate-config');
// `let` because it is (re)assigned below when a file argument follows the flag.
let validateRules = args.includes('--validate-rules');
const testValidation = args.includes('--test-validation');
let cleanRules = args.includes('--clean-rules');

// --validate-rules takes an optional file argument; a following token that looks like
// another flag is not treated as a file name.
const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules');
let validateRulesFile = readCliOptionValue(args, validateRulesIndex, true);
if (validateRulesFile) {
  validateRules = true; // Override the boolean if file specified
}

// --clean-rules takes an optional file argument, same convention as --validate-rules.
const cleanRulesIndex = args.findIndex(arg => arg === '--clean-rules');
let cleanRulesFile = readCliOptionValue(args, cleanRulesIndex, true);
if (cleanRulesFile) {
  cleanRules = true; // Override the boolean if file specified
}
107
+
108
const enableColors = args.includes('--color') || args.includes('--colour');
let adblockRulesMode = args.includes('--adblock-rules');

// Output-format flags are mutually exclusive. A flag used in an invalid combination is
// silently switched off (with a note in debug mode) rather than aborting the run.
// The checks run top to bottom, so a flag already switched off by an earlier check no
// longer counts as a conflict for the later ones.
// NOTE(review): the `xMode = false` assignments below require the corresponding flags to
// be declared with `let` where they are parsed.

// Validate --adblock-rules usage - ignore if used incorrectly instead of erroring
if (adblockRulesMode) {
  if (!outputFile) {
    if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: requires --output (-o) to specify an output file`));
    adblockRulesMode = false;
  } else if (localhostMode || localhostModeAlt || plainOutput || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: incompatible with localhost/plain output modes`));
    adblockRulesMode = false;
  }
}

// Validate --dnsmasq usage
if (dnsmasqMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
    // Fixed: this message previously named --dnsmasq-old, the wrong flag.
    if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq ignored: incompatible with localhost/plain/adblock-rules/dnsmasq-old output modes`));
    dnsmasqMode = false;
  }
}

// Validate --dnsmasq-old usage
if (dnsmasqOldMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || unboundMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
    dnsmasqOldMode = false;
  }
}

// Validate --unbound usage
if (unboundMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--unbound ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
    unboundMode = false;
  }
}

// Validate --privoxy usage
if (privoxyMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--privoxy ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound output modes`));
    privoxyMode = false;
  }
}

// Validate --pihole usage
if (piholeMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--pihole ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound/privoxy output modes`));
    piholeMode = false;
  }
}
161
+
162
// --- Flag combination checks ---
// Each check below reports the first invalid combination it finds on stderr and ends
// the process with exit code 1; nothing after a failing check runs.
{
  // Prints a usage error and terminates the process with a failure status.
  const rejectUsage = (message) => {
    console.error(message);
    process.exit(1);
  };

  // --compress-logs is only meaningful together with --dumpurls
  if (compressLogs && !dumpUrls) {
    rejectUsage(`❌ --compress-logs can only be used with --dumpurls`);
  }

  // --append needs a target file, and cannot be mixed with --compare or --dry-run
  if (appendMode && !outputFile) {
    rejectUsage(`❌ --append requires --output (-o) to specify an output file`);
  }
  if (appendMode && (compareFile || dryRunMode)) {
    rejectUsage(`❌ --append cannot be used with --compare or --dry-run`);
  }

  // --dry-run cannot be mixed with --compress-logs or --compare
  if (dryRunMode && (compressLogs || compareFile)) {
    rejectUsage(`❌ --dry-run cannot be used with --compress-logs or --compare`);
  }

  // --compare needs an output file and an existing comparison file
  if (compareFile && !outputFile) {
    rejectUsage(`❌ --compare requires --output (-o) to specify an output file`);
  }
  if (compareFile && !fs.existsSync(compareFile)) {
    rejectUsage(`❌ Compare file not found: ${compareFile}`);
  }
}

// --version: print the script version and stop
if (args.includes('--version')) {
  console.log(`nwss.js version ${VERSION}`);
  process.exit(0);
}
202
+
203
// Validation-only operations are handled before the main help output.
// --test-validation: run the built-in domain validation tests, report, and exit
// (status 0 when testDomainValidation() returns a truthy value, 1 otherwise).
if (testValidation) {
  console.log(`\n${messageColors.processing('Running domain validation tests...')}`);
  const allTestsPassed = testDomainValidation();
  const verdict = allTestsPassed
    ? messageColors.success('✅ All validation tests passed!')
    : messageColors.error('❌ Some validation tests failed!');
  console.log(`${verdict}`);
  process.exit(allTestsPassed ? 0 : 1);
}
215
+
216
// --validate-config: validate the configuration file, print a summary, and exit
// (status 0 when valid, 1 when invalid or when validation itself fails).
if (validateConfig) {
  console.log(`\n${messageColors.processing('Validating configuration file...')}`);
  try {
    // The module-level `config` / `sites` bindings are declared (let/const) further down
    // this file, so reading them here would throw a temporal-dead-zone ReferenceError.
    // Load the configuration directly instead, resolving the path the same way the main
    // loader does: --custom-json <file> if given, otherwise config.json.
    const customJsonIndex = args.indexOf('--custom-json');
    const configFileToValidate = (customJsonIndex !== -1 && args[customJsonIndex + 1])
      ? args[customJsonIndex + 1]
      : 'config.json';
    const configToValidate = JSON.parse(fs.readFileSync(configFileToValidate, 'utf8'));
    const sitesToValidate = Array.isArray(configToValidate.sites) ? configToValidate.sites : [];

    const validation = validateFullConfig(configToValidate, { forceDebug, silentMode });

    // Validate referrer_headers format (only the object form carries a `mode`)
    const validModes = ['random_search', 'social_media', 'direct_navigation', 'custom'];
    for (const site of sitesToValidate) {
      if (site.referrer_headers && typeof site.referrer_headers === 'object' && !Array.isArray(site.referrer_headers)) {
        if (site.referrer_headers.mode && !validModes.includes(site.referrer_headers.mode)) {
          console.warn(`⚠ Invalid referrer_headers mode: ${site.referrer_headers.mode}. Valid modes: ${validModes.join(', ')}`);
        }
      }
    }

    if (validation.isValid) {
      console.log(`${messageColors.success('✅ Configuration is valid!')}`);
      console.log(`${messageColors.info('Summary:')} ${validation.summary.validSites}/${validation.summary.totalSites} sites valid`);
      if (validation.summary.sitesWithWarnings > 0) {
        console.log(`${messageColors.warn('⚠ Warnings:')} ${validation.summary.sitesWithWarnings} sites have warnings`);
      }
      process.exit(0);
    } else {
      console.log(`${messageColors.error('❌ Configuration validation failed!')}`);
      console.log(`${messageColors.error('Errors:')} ${validation.globalErrors.length} global, ${validation.summary.sitesWithErrors} site-specific`);
      process.exit(1);
    }
  } catch (validationErr) {
    // Covers a missing or unparsable config file as well as validator failures.
    console.error(`❌ Validation failed: ${validationErr.message}`);
    process.exit(1);
  }
}
248
+
249
// --validate-rules: validate the named rule file (or, without a file argument, the
// --output / --compare files), print per-file results, and exit with 0 only if every
// file validated cleanly.
if (validateRules || validateRulesFile) {
  const filesToValidate = validateRulesFile
    ? [validateRulesFile]
    : [outputFile, compareFile].filter(Boolean);

  if (!filesToValidate.length) {
    console.error('❌ --validate-rules requires either a file argument or --output/--compare files to be specified');
    process.exit(1);
  }

  console.log(`\n${messageColors.processing('Validating rule files...')}`);
  let invalidFileCount = 0;

  for (const file of filesToValidate) {
    console.log(`\n${messageColors.info('Validating:')} ${file}`);
    try {
      const validation = validateRulesetFile(file, { forceDebug, silentMode, maxErrors: 20 });

      if (!validation.isValid) {
        console.log(`${messageColors.error('❌ Invalid:')} ${validation.stats.invalid} invalid rules out of ${validation.stats.total} total`);
        invalidFileCount += 1;
        continue;
      }

      console.log(`${messageColors.success('✅ Valid:')} ${validation.stats.valid} rules, ${validation.stats.comments} comments`);
      if (validation.duplicates.length > 0) {
        console.log(`${messageColors.warn('⚠ Duplicates:')} ${validation.duplicates.length} duplicate rules found`);
      }

      // Per-format rule counts, printed as "format(count), ..."
      const formatCounts = Object.entries(validation.stats.formats);
      if (formatCounts.length > 0) {
        const formatSummary = formatCounts.map(([f, c]) => `${f}(${c})`).join(', ');
        console.log(`${messageColors.info('Formats:')} ${formatSummary}`);
      }
    } catch (validationErr) {
      console.error(`❌ Failed to validate ${file}: ${validationErr.message}`);
      invalidFileCount += 1;
    }
  }

  if (invalidFileCount === 0) {
    console.log(`\n${messageColors.success('✅ All rule files are valid!')}`);
    process.exit(0);
  }
  console.log(`\n${messageColors.error('❌ Some rule files have validation errors!')}`);
  process.exit(1);
}
292
+
293
// --help / -h: print the usage text and exit successfully.
// Fixed: the per-site `cdp:` line carried a stray copy of the previous line's description
// ("Inject fetch/XHR interceptor in page").
if (args.includes('--help') || args.includes('-h')) {
  console.log(`Usage: node nwss.js [options]

Options:
--color, --colour Enable colored console output for status messages
-o, --output <file> Output file for rules. If omitted, prints to console
--compare <file> Remove rules that already exist in this file before output
--append Append new rules to output file instead of overwriting (requires -o)

Output Format Options:
--localhost Output as 127.0.0.1 domain.com
--localhost-0.0.0.0 Output as 0.0.0.0 domain.com
--plain Output just domains (no adblock formatting)
--dnsmasq Output as local=/domain.com/ (dnsmasq format)
--dnsmasq-old Output as server=/domain.com/ (dnsmasq old format)
--unbound Output as local-zone: "domain.com." always_null (unbound format)
--privoxy Output as { +block } .domain.com (Privoxy format)
--pihole Output as (^|\\.)domain\\.com$ (Pi-hole regex format)
--adblock-rules Generate adblock filter rules with resource type modifiers (requires -o)

General Options:
--verbose Force verbose mode globally
--debug Force debug mode globally
--silent Suppress normal console logs
--titles Add ! <url> title before each site's group
--dumpurls Dump matched URLs into matched_urls.log
--dry-run Console output only: show matching regex, titles, whois/dig/searchstring results, and adblock rules
--compress-logs Compress log files with gzip (requires --dumpurls)
--sub-domains Output full subdomains instead of collapsing to root
--no-interact Disable page interactions globally
--custom-json <file> Use a custom config JSON file instead of config.json
--headful Launch browser with GUI (not headless)
--cdp Enable Chrome DevTools Protocol logging (now per-page if enabled)
--remove-dupes Remove duplicate domains from output (only with -o)
--eval-on-doc Globally enable evaluateOnNewDocument() for Fetch/XHR interception
--help, -h Show this help menu
--version Show script version
--remove-tempfiles Remove Chrome/Puppeteer temporary files before exit

Validation Options:
--validate-config Validate config.json file and exit
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
--test-validation Run domain validation tests and exit

Global config.json options:
ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)


Per-site config.json options:
url: "site" or ["site1", "site2"] Single URL or list of URLs
filterRegex: "regex" or ["regex1", "regex2"] Patterns to match requests

Redirect Handling Options:
follow_redirects: true/false Follow redirects to new domains (default: true)
max_redirects: 10 Maximum number of redirects to follow (default: 10)
js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)

comments: "text" or ["text1", "text2"] Documentation/notes - ignored by script
searchstring: "text" or ["text1", "text2"] Text to search in response content (requires filterRegex match)
ignore_similar: true/false Override global ignore_similar setting for this site
ignore_similar_threshold: 80 Override global similarity threshold for this site
ignore_similar_ignored_domains: true/false Override global ignore_similar_ignored_domains for this site
searchstring_and: "text" or ["text1", "text2"] Text to search with AND logic - ALL terms must be present (requires filterRegex match)
curl: true/false Use curl to download content for analysis (default: false)
Note: curl respects filterRegex but ignores resourceTypes filtering
grep: true/false Use grep instead of JavaScript for pattern matching (default: false)
Note: requires curl=true, uses system grep command for faster searches
blocked: ["regex"] Regex patterns to block requests
css_blocked: ["#selector", ".class"] CSS selectors to hide elements
resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
interact: true/false Simulate mouse movements/clicks
isBrave: true/false Spoof Brave browser detection
userAgent: "chrome"|"firefox"|"safari" Custom desktop User-Agent
delay: <milliseconds> Delay after load (default: 4000)
reload: <number> Reload page n times after load (default: 1)
forcereload: true/false Force an additional reload after reloads
clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
subDomains: 1/0 Output full subdomains (default: 0)
localhost: true/false Force localhost output (127.0.0.1)
localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
dnsmasq: true/false Force dnsmasq output (local=/domain.com/)
dnsmasq_old: true/false Force dnsmasq old output (server=/domain.com/)
unbound: true/false Force unbound output (local-zone: "domain.com." always_null)
privoxy: true/false Force Privoxy output ({ +block } .domain.com)
pihole: true/false Force Pi-hole regex output ((^|\\.)domain\\.com$)
source: true/false Save page source HTML after load
firstParty: true/false Allow first-party matches (default: false)
thirdParty: true/false Allow third-party matches (default: true)
screenshot: true/false Capture screenshot on load failure
headful: true/false Launch browser with GUI for this site
fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random"
adblock_rules: true/false Generate adblock filter rules with resource types for this site
even_blocked: true/false Add matching rules even if requests are blocked (default: false)

referrer_headers: "url" or ["url1", "url2"] Set referrer header for realistic traffic sources
custom_headers: {"Header": "value"} Add custom HTTP headers to requests

Cloudflare Protection Options:
cloudflare_phish: true/false Auto-click through Cloudflare phishing warnings (default: false)
cloudflare_bypass: true/false Auto-solve Cloudflare "Verify you are human" challenges (default: false)

FlowProxy Protection Options:
flowproxy_detection: true/false Enable flowProxy protection detection and handling (default: false)
flowproxy_page_timeout: <milliseconds> Page timeout for flowProxy sites (default: 45000)
flowproxy_nav_timeout: <milliseconds> Navigation timeout for flowProxy sites (default: 45000)
flowproxy_js_timeout: <milliseconds> JavaScript challenge timeout (default: 15000)
flowproxy_delay: <milliseconds> Delay for rate limiting (default: 30000)
flowproxy_additional_delay: <milliseconds> Additional processing delay (default: 5000)

Advanced Options:
evaluateOnNewDocument: true/false Inject fetch/XHR interceptor in page (for this site)
cdp: true/false Enable CDP logging for this site
whois: ["term1", "term2"] Check whois data for ALL specified terms (AND logic)
whois-or: ["term1", "term2"] Check whois data for ANY specified term (OR logic)
whois_server_mode: "random" or "cycle" Server selection mode: random (default) or cycle through list
whois_server: "whois.domain.com" or ["server1", "server2"] Custom whois server(s) - single server or randomized list (default: system default)
whois_max_retries: 2 Maximum retry attempts per domain (default: 2)
whois_timeout_multiplier: 1.5 Timeout increase multiplier per retry (default: 1.5)
whois_use_fallback: true Add TLD-specific fallback servers (default: true)
whois_retry_on_timeout: true Retry on timeout errors (default: true)
whois_retry_on_error: false Retry on connection/other errors (default: false)
whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"})
dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
digRecordType: "A" DNS record type for dig (default: A)

Referrer Header Options:
referrer_headers: "https://google.com" Single referrer URL
referrer_headers: ["url1", "url2"] Random selection from array
referrer_headers: {"mode": "random_search", "search_terms": ["term1"]} Smart search engine traffic
referrer_headers: {"mode": "social_media"} Random social media referrers
referrer_headers: {"mode": "direct_navigation"} No referrer (direct access)
custom_headers: {"Header": "Value"} Additional HTTP headers
`);
  process.exit(0);
}
439
+
440
// --- Configuration File Loading ---
// The config path is the argument following --custom-json when present, else config.json.
const configPathIndex = args.indexOf('--custom-json');
const configPath = (configPathIndex === -1 ? undefined : args[configPathIndex + 1]) || 'config.json';

// Parsed contents of the config file; any failure to find, read or parse it is fatal.
let config;
try {
  const configExists = fs.existsSync(configPath);
  if (!configExists) {
    console.error(`❌ Config file not found: ${configPath}`);
    process.exit(1);
  }
  const usingCustomConfig = configPath !== 'config.json';
  if (forceDebug && usingCustomConfig) {
    console.log(formatLogMessage('debug', `Using custom config file: ${configPath}`));
  }
  config = JSON.parse(fs.readFileSync(configPath, 'utf8'));
} catch (e) {
  console.error(`❌ Failed to load config file (${configPath}):`, e.message);
  process.exit(1);
}

// Pull the known global settings (with their defaults) out of the config. The global
// 'comments' field is split off separately so it does not end up in otherGlobalConfig.
const {
  sites = [],
  ignoreDomains = [],
  blocked: globalBlocked = [],
  whois_delay = 3000,
  whois_server_mode = 'random',
  ignore_similar = true,
  ignore_similar_threshold = 80,
  ignore_similar_ignored_domains = true,
  comments: globalComments,
  ...otherGlobalConfig
} = config;
460
+
461
// --clean-rules is handled after the config is loaded because it needs `sites` to decide
// whether to carry on scanning afterwards. It cleans the named file (or, without a file
// argument, the --output / --compare files). Only when every file cleaned successfully,
// this is not a dry run, the output file was among the cleaned files and there are sites
// configured does execution fall through to the scan; every other path exits here.
if (cleanRules || cleanRulesFile) {
  const filesToClean = cleanRulesFile
    ? [cleanRulesFile]
    : [outputFile, compareFile].filter(Boolean);

  if (!filesToClean.length) {
    console.error('❌ --clean-rules requires either a file argument or --output/--compare files to be specified');
    process.exit(1);
  }

  console.log(`\n${messageColors.processing('Cleaning rule files...')}`);
  let anyFailure = false;
  let totalCleaned = 0;

  // Truthy when the file the scan will write to is one of the files being cleaned
  const cleaningOutputFile = outputFile && filesToClean.includes(outputFile);

  if (cleaningOutputFile && forceDebug) {
    console.log(formatLogMessage('debug', `Output file detected: will clean ${outputFile} first, then continue with scan`));
  }

  for (const file of filesToClean) {
    console.log(`\n${messageColors.info('Cleaning:')} ${file}`);

    // A missing output file is fine (the scan creates it); any other missing file is a failure
    if (!fs.existsSync(file)) {
      if (file === outputFile) {
        const modeText = appendMode ? 'created (append mode)' : 'created';
        console.log(`${messageColors.info('📄 Note:')} Output file ${file} doesn't exist yet - will be ${modeText} during scan`);
      } else {
        console.log(`${messageColors.error('❌ Failed:')} File not found: ${file}`);
        anyFailure = true;
      }
      continue;
    }

    try {
      const cleanResult = cleanRulesetFile(file, null, {
        forceDebug,
        silentMode,
        removeDuplicates: removeDupes,
        backupOriginal: true,
        dryRun: dryRunMode
      });

      if (!cleanResult.success) {
        console.log(`${messageColors.error('❌ Failed:')} ${cleanResult.error}`);
        anyFailure = true;
        continue;
      }

      if (dryRunMode) {
        // Dry run: only report what would change
        if (cleanResult.wouldModify) {
          console.log(`${messageColors.info('🔍 Dry run:')} Would remove ${cleanResult.stats.removed} lines (${cleanResult.stats.invalid} invalid, ${cleanResult.stats.duplicates} duplicates)`);
        } else {
          console.log(`${messageColors.success('✅ Dry run:')} File is already clean - no changes needed`);
        }
        continue;
      }

      if (!cleanResult.modified) {
        console.log(`${messageColors.success('✅ Clean:')} File was already valid - no changes needed`);
        continue;
      }

      console.log(`${messageColors.success('✅ Cleaned:')} Removed ${cleanResult.stats.removed} lines, preserved ${cleanResult.stats.valid} valid rules`);
      if (cleanResult.backupCreated) {
        console.log(`${messageColors.info('💾 Backup:')} Original file backed up`);
      }
      totalCleaned += cleanResult.stats.removed;

      if (cleaningOutputFile && file === outputFile) {
        console.log(`${messageColors.info('📄 Note:')} File cleaned - new rules will be ${appendMode ? 'appended' : 'written'} during scan`);
      }
    } catch (cleanErr) {
      console.error(`❌ Failed to clean ${file}: ${cleanErr.message}`);
      anyFailure = true;
    }
  }

  if (anyFailure) {
    console.log(`\n${messageColors.error('❌ Some rule files failed to clean!')}`);
    process.exit(1);
  }

  if (dryRunMode) {
    console.log(`\n${messageColors.info('🔍 Dry run completed successfully!')}`);
    process.exit(0);
  }

  console.log(`\n${messageColors.success('✅ All rule files cleaned successfully!')} Total lines removed: ${totalCleaned}`);

  // Continue with the scan only if there are sites to process and the output file was cleaned
  const hasSitesToScan = sites && sites.length > 0 && outputFile;
  if (hasSitesToScan && cleaningOutputFile) {
    const actionText = appendMode ? 'append new rules to' : 'write rules to';
    console.log(`${messageColors.info('📄 Continuing:')} Proceeding with scan to ${actionText} ${outputFile}`);
    // No exit here - execution falls through to the scan
  } else {
    process.exit(0);
  }
}
565
+
566
// Global cycling index tracker for whois server selection
let globalWhoisServerIndex = 0;

// Accumulates dry run output for file writing
let dryRunOutput = [];

// --- Log File Setup ---
// Paths stay null unless the matching flag (--debug / --dumpurls) is set.
let debugLogFile = null;
let matchedUrlsLogFile = null;
let adblockRulesLogFile = null;
if (forceDebug || dumpUrls) {
  // Make sure the logs directory exists
  const logDir = 'logs';
  if (!fs.existsSync(logDir)) {
    fs.mkdirSync(logDir, { recursive: true });
    console.log(formatLogMessage('debug', `Created logs folder: ${logDir}`));
  }

  // Shared timestamp for all log files of this run, derived from the ISO time:
  // ':' and '.' become '-', 'T' becomes '_', and the last five characters are dropped.
  const isoNow = new Date().toISOString();
  const runStamp = isoNow.replace(/[:.]/g, '-').replace('T', '_').slice(0, -5);

  if (forceDebug) {
    debugLogFile = path.join(logDir, `debug_requests_${runStamp}.log`);
    console.log(formatLogMessage('debug', `Debug requests will be logged to: ${debugLogFile}`));
  }

  if (dumpUrls) {
    matchedUrlsLogFile = path.join(logDir, `matched_urls_${runStamp}.log`);
    console.log(messageColors.processing('Matched URLs will be logged to:') + ` ${matchedUrlsLogFile}`);

    // The adblock rules log shares the same timestamp
    adblockRulesLogFile = path.join(logDir, `adblock_rules_${runStamp}.txt`);
    console.log(messageColors.processing('Adblock rules will be saved to:') + ` ${adblockRulesLogFile}`);
  }
}

// In debug mode, echo any global config comments (a single value is treated as a one-item list)
if (forceDebug && globalComments) {
  const commentList = Array.isArray(globalComments) ? globalComments : [globalComments];
  console.log(formatLogMessage('debug', `Global comments found: ${commentList.length} item(s)`));
  for (const [position, comment] of commentList.entries()) {
    console.log(formatLogMessage('debug', ` Comment ${position + 1}: ${comment}`));
  }
}
608
+ // --- Global CDP Override Logic --- [COMMENT RE-ADDED PREVIOUSLY, relevant to old logic]
609
+ // If globalCDP is not already enabled by the --cdp flag,
610
+ // check if any site in config.json has `cdp: true`. If so, enable globalCDP.
611
+ // This allows site-specific config to trigger CDP logging for the entire session.
612
+ // Note: Analysis suggests CDP should ideally be managed per-page for comprehensive logging.
613
+ // (The code block that utilized this logic for a global CDP variable has been removed
614
+ // as CDP is now handled per-page based on 'enableCDP' and 'siteConfig.cdp')
615
+
616
/**
 * Reduces a URL to its registrable (root) domain via the psl library.
 * Example: 'http://sub.example.com/path' yields 'example.com'.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {string} The domain reported by psl; the plain hostname when psl reports
 *   no domain; or '' when the URL cannot be parsed or the lookup throws.
 */
function getRootDomain(url) {
  try {
    const host = new URL(url).hostname;
    const registrable = psl.parse(host).domain;
    return registrable ? registrable : host;
  } catch {
    // Any failure (invalid URL, psl error) is reported as "no domain".
    return '';
  }
}
632
+
633
/**
 * Non-throwing domain lookup for a URL.
 *
 * @param {string} url - URL string to inspect.
 * @param {boolean} [getFullHostname=false] - true returns the full hostname;
 *   false returns the root domain as computed by getRootDomain().
 * @returns {string} Hostname or root domain, or '' when the URL is malformed.
 */
function safeGetDomain(url, getFullHostname = false) {
  try {
    // Parsing up front makes malformed URLs land in the catch below in both modes.
    const { hostname } = new URL(url);
    return getFullHostname ? hostname : getRootDomain(url);
  } catch (urlError) {
    // Malformed URLs are only reported when debugging; callers just see ''.
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Malformed URL skipped: ${url} (${urlError.message})`));
    }
    return '';
  }
}
655
+
656
/**
 * Prints the dry-run report for one URL to the console and, when the outer
 * `outputFile` setting is truthy, appends the same report (without colors)
 * to the shared `dryRunOutput` buffer followed by one empty separator line.
 *
 * @param {string} url - The URL being processed.
 * @param {Array} matchedItems - Regex matches; each item is read for `regex`, `domain`,
 *   `resourceType`, `fullUrl` and optionally `searchStringMatch` / `searchStringChecked`.
 *   A non-array value is treated as "no matches".
 * @param {Array} netToolsResults - Whois/dig matches; each item is read for `domain`, `tool`,
 *   `matchType`, `matchedTerm` and optionally `details`. A non-array value is treated as
 *   "no matches".
 * @param {string} pageTitle - Page title; ignored when not a non-blank string.
 */
function outputDryRunResults(url, matchedItems, netToolsResults, pageTitle) {
  const regexMatches = Array.isArray(matchedItems) ? matchedItems : [];
  const netToolMatches = Array.isArray(netToolsResults) ? netToolsResults : [];
  const lines = [];

  // Records the plain text for the file buffer and prints the (optionally colored) variant,
  // so the two outputs cannot drift apart.
  const emit = (plain, colored = plain) => {
    lines.push(plain);
    console.log(colored);
  };

  // Entry headers are preceded by a blank line: a separate '' entry in the file buffer,
  // a leading '\n' on the console.
  const emitEntryHeader = (position, label) => {
    lines.push('');
    lines.push(`[${position}] ${label}`);
    console.log(`\n${messageColors.highlight(`[${position}]`)} ${messageColors.match(label)}`);
  };

  // Copies this report into the shared buffer only when file output was requested.
  const flushToFileBuffer = () => {
    if (outputFile) {
      dryRunOutput.push(...lines, '');
    }
  };

  emit(
    `\n=== DRY RUN RESULTS === ${url}`,
    `\n${messageColors.scanning('=== DRY RUN RESULTS ===')} ${url}`
  );

  const trimmedTitle = typeof pageTitle === 'string' ? pageTitle.trim() : '';
  if (trimmedTitle) {
    emit(`Title: ${trimmedTitle}`, `${messageColors.info('Title:')} ${trimmedTitle}`);
  }

  if (regexMatches.length === 0 && netToolMatches.length === 0) {
    const notice = `No matching rules found on ${url}`;
    emit(notice, messageColors.warn(notice));
    flushToFileBuffer();
    return;
  }

  const totalMatches = regexMatches.length + netToolMatches.length;
  emit(`Matches found: ${totalMatches}`, `${messageColors.success('Matches found:')} ${totalMatches}`);

  regexMatches.forEach((item, index) => {
    emitEntryHeader(index + 1, 'Regex Match:');
    emit(` Pattern: ${item.regex}`);
    emit(` Domain: ${item.domain}`);
    emit(` Resource Type: ${item.resourceType}`);
    emit(` Full URL: ${item.fullUrl}`);

    // Searchstring outcome is only shown when a search actually ran for this item.
    if (item.searchStringMatch) {
      const { type, term } = item.searchStringMatch;
      emit(
        ` ✓ Searchstring Match: ${type} - "${term}"`,
        ` ${messageColors.success('✓ Searchstring Match:')} ${type} - "${term}"`
      );
    } else if (item.searchStringChecked) {
      emit(
        ` ✗ Searchstring: No matches found in content`,
        ` ${messageColors.warn('✗ Searchstring:')} No matches found in content`
      );
    }

    const adblockRule = `||${item.domain}^$${item.resourceType}`;
    emit(` Adblock Rule: ${adblockRule}`, ` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  });

  // Nettools entries continue the numbering after the regex matches.
  netToolMatches.forEach((result, index) => {
    emitEntryHeader(regexMatches.length + index + 1, 'NetTools Match:');
    emit(` Domain: ${result.domain}`);
    emit(` Tool: ${result.tool.toUpperCase()}`);
    emit(
      ` ✓ Match: ${result.matchType} - "${result.matchedTerm}"`,
      ` ${messageColors.success('✓ Match:')} ${result.matchType} - "${result.matchedTerm}"`
    );
    if (result.details) {
      emit(` Details: ${result.details}`);
    }

    // Nettools matches block the whole domain (no resource-type modifier).
    const adblockRule = `||${result.domain}^`;
    emit(` Adblock Rule: ${adblockRule}`, ` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  });

  flushToFileBuffer();
}
752
+
753
/**
 * Tests a domain against the ignoreDomains list, with wildcard support.
 *
 * - A pattern containing `*` is matched against the whole domain; each `*` stands for any
 *   run of characters and every other character is taken literally.
 * - A pattern without `*` matches when the domain ends with it.
 * - Empty or non-string patterns are skipped (an empty pattern would otherwise match
 *   every domain via `endsWith('')`).
 *
 * NOTE(review): the plain suffix rule also lets 'example.com' match 'notexample.com';
 * kept as-is for backward compatibility.
 *
 * @param {string} domain - Domain or hostname to test.
 * @param {string[]} ignorePatterns - Ignore patterns, optionally containing `*`.
 * @returns {boolean} true when at least one pattern matches.
 */
function matchesIgnoreDomain(domain, ignorePatterns) {
  if (typeof domain !== 'string' || !Array.isArray(ignorePatterns)) {
    return false;
  }

  return ignorePatterns.some((pattern) => {
    if (typeof pattern !== 'string' || pattern === '') {
      return false;
    }
    if (!pattern.includes('*')) {
      return domain.endsWith(pattern);
    }

    // Escape every regex metacharacter in the literal pieces, then rejoin them with `.*`.
    // Escaping only dots would let characters such as `+`, `(` or `[` change the meaning
    // of the pattern or make the RegExp constructor throw.
    const regexSource = pattern
      .split('*')
      .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');
    return new RegExp(`^${regexSource}$`).test(domain);
  });
}
766
+
767
/**
 * Registers 'frameattached', 'framenavigated' and 'framedetached' listeners on a page.
 * The listeners never navigate or modify frames; they only emit debug log lines, and
 * do nothing at all when `forceDebug` is falsy.
 *
 * @param {object} page - Object exposing `on(eventName, handler)`; handlers receive a frame
 *   exposing `url()` and `parentFrame()`.
 * @param {boolean} forceDebug - Enables the debug log lines.
 */
function setupFrameHandling(page, forceDebug) {
  // URL prefixes that each listener stays silent about. The three lists differ on purpose
  // to keep the log output of each event unchanged ('about:' also covers about:blank and
  // about:srcdoc).
  const ATTACH_SKIP_PREFIXES = ['about:', 'data:', 'blob:', 'chrome-error://', 'chrome-extension://'];
  const NAVIGATE_SKIP_PREFIXES = ['about:', 'data:', 'chrome-error://', 'chrome-extension://'];
  const DETACH_SKIP_PREFIXES = ['about:', 'chrome-error://', 'chrome-extension://'];

  // True for an empty URL or one starting with any of the given prefixes.
  const isSkippedUrl = (frameUrl, prefixes) =>
    !frameUrl || prefixes.some((prefix) => frameUrl.startsWith(prefix));

  const debugLog = (message) => console.log(formatLogMessage('debug', message));

  // Frame creation: classify the new child frame's URL and log what will happen to it.
  page.on('frameattached', (frame) => {
    // Only child frames are of interest, and everything below is debug-only output.
    if (!frame.parentFrame() || !forceDebug) {
      return;
    }

    try {
      const frameUrl = frame.url();
      debugLog(`New frame attached: ${frameUrl || 'about:blank'}`);

      if (isSkippedUrl(frameUrl, ATTACH_SKIP_PREFIXES)) {
        debugLog(`Skipping frame with invalid/special URL: ${frameUrl}`);
        return;
      }

      let protocol;
      try {
        ({ protocol } = new URL(frameUrl));
      } catch (urlErr) {
        debugLog(`Skipping frame with malformed URL: ${frameUrl}`);
        return;
      }

      if (protocol !== 'http:' && protocol !== 'https:') {
        debugLog(`Skipping frame with non-http protocol: ${frameUrl}`);
        return;
      }

      // Frames are deliberately NOT navigated manually (frame.goto tends to raise
      // Protocol errors); they are left to load on their own.
      debugLog(`Frame will load naturally: ${frameUrl}`);
    } catch (err) {
      // Stay quiet about the two known-noisy error kinds, report anything else.
      const message = err?.message ?? String(err);
      if (!message.includes('Cannot navigate to invalid URL') && !message.includes('Protocol error')) {
        debugLog(`Frame handling error: ${message}`);
      }
    }
  });

  // Frame navigations (monitoring only).
  page.on('framenavigated', (frame) => {
    if (!forceDebug) {
      return;
    }
    const frameUrl = frame.url();
    if (!isSkippedUrl(frameUrl, NAVIGATE_SKIP_PREFIXES)) {
      debugLog(`Frame navigated to: ${frameUrl}`);
    }
  });

  // Frame detachment (monitoring only).
  page.on('framedetached', (frame) => {
    if (!forceDebug) {
      return;
    }
    const frameUrl = frame.url();
    if (!isSkippedUrl(frameUrl, DETACH_SKIP_PREFIXES)) {
      debugLog(`Frame detached: ${frameUrl}`);
    }
  });
}
859
+
860
+ // --- Main Asynchronous IIFE (Immediately Invoked Function Expression) ---
861
+ // This is the main entry point and execution block for the network scanner script.
862
+ (async () => {
863
/**
 * Launches a browser with the scanner's fixed configuration.
 *
 * - Uses the first system Chrome/Chromium binary found at a set of well-known Linux
 *   paths; when none exists, `executablePath` is passed as null and resolution is left
 *   to puppeteer.
 * - Uses a freshly named profile directory under /tmp and records that path on the
 *   returned browser as `_nwssUserDataDir` so cleanup code can locate it.
 * - Headless mode follows the outer `launchHeadless` flag ('shell' when truthy, otherwise
 *   a visible browser).
 *
 * @returns {Promise<import('puppeteer').Browser>} Browser instance
 */
async function createBrowser() {
  // Dedicated throw-away profile directory so it can be removed completely afterwards.
  const tempUserDataDir = `/tmp/puppeteer-${Date.now()}-${Math.random().toString(36).substring(7)}`;

  // Prefer an installed system browser over a Puppeteer-managed download.
  const systemChromePaths = [
    '/usr/bin/google-chrome-stable',
    '/usr/bin/google-chrome',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/snap/bin/chromium'
  ];

  const executablePath = systemChromePaths.find((chromePath) => fs.existsSync(chromePath)) ?? null;
  if (executablePath && forceDebug) {
    console.log(formatLogMessage('debug', `Using system Chrome: ${executablePath}`));
  }

  const browser = await puppeteer.launch({
    // Use system Chrome if available to avoid downloads
    executablePath,
    // Force temporary user data directory for complete cleanup control
    userDataDir: tempUserDataDir,
    args: [
      // Disk space controls - 50MB cache limits
      '--disk-cache-size=52428800', // 50MB disk cache (50 * 1024 * 1024)
      '--media-cache-size=52428800', // 50MB media cache
      '--disable-application-cache',
      '--disable-offline-load-stale-cache',
      '--disable-background-downloads',
      '--no-first-run',
      '--disable-default-apps',
      '--disable-component-extensions-with-background-pages',
      '--disable-background-networking',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      // A switch holds a single value and the feature list is comma-separated, so all
      // disabled features go into ONE switch instead of three competing ones.
      '--disable-features=SafeBrowsing,TranslateUI,VizDisplayCompositor',
      '--disable-dev-shm-usage',
      '--disable-sync',
      '--disable-gpu',
      '--mute-audio',
      '--disable-translate',
      '--window-size=1920,1080',
      '--disable-extensions',
      '--no-default-browser-check',
      '--safebrowsing-disable-auto-update',
      // NOTE(review): this looks like a V8/Node option rather than a Chrome switch
      // (Chrome would take it via --js-flags) - confirm whether it has any effect.
      '--max_old_space_size=1024',
      '--ignore-ssl-errors',
      '--ignore-certificate-errors',
      '--ignore-certificate-errors-spki-list',
      '--ignore-certificate-errors-ca-list',
      '--disable-web-security',
      '--allow-running-insecure-content',
      '--disable-background-timer-throttling',
      '--disable-backgrounding-occluded-windows',
      '--disable-renderer-backgrounding',
      '--run-all-compositor-stages-before-draw',
      '--disable-threaded-animation',
      '--disable-threaded-scrolling',
      '--disable-checker-imaging',
      '--disable-image-animation-resync'
    ],
    headless: launchHeadless ? 'shell' : false,
    protocolTimeout: 500000
  });

  // Store the user data directory on the browser object for cleanup
  browser._nwssUserDataDir = tempUserDataDir;
  return browser;
}
946
+
947
+
948
+ const pLimit = (await import('p-limit')).default;
949
+ const limit = pLimit(MAX_CONCURRENT_SITES);
950
+
951
+ const perSiteHeadful = sites.some(site => site.headful === true);
952
+ const launchHeadless = !(headfulMode || perSiteHeadful);
953
+ // launch with no safe browsing
954
+ let browser = await createBrowser();
955
+ if (forceDebug) console.log(formatLogMessage('debug', `Launching browser with headless: ${launchHeadless}`));
956
+
957
+ // Log which headless mode is being used
958
+ if (forceDebug && launchHeadless) {
959
+ console.log(formatLogMessage('debug', `Using chrome-headless-shell for maximum performance`));
960
+ }
961
+
962
+ // Initial cleanup of any existing Chrome temp files - always comprehensive on startup
963
+ if (forceDebug) console.log(formatLogMessage('debug', 'Cleaning up any leftover temp files from previous runs...'));
964
+ await cleanupChromeTempFiles({
965
+ includeSnapTemp: true, // Always clean snap dirs on startup
966
+ forceDebug,
967
+ comprehensive: true // Always comprehensive on startup to clean leftovers
968
+ });
969
+
970
+ // Set up cleanup on process termination
971
+ process.on('SIGINT', async () => {
972
+ if (forceDebug) console.log(formatLogMessage('debug', 'SIGINT received, performing cleanup...'));
973
+ await performEmergencyCleanup();
974
+ process.exit(0);
975
+ });
976
+
977
+ process.on('SIGTERM', async () => {
978
+ if (forceDebug) console.log(formatLogMessage('debug', 'SIGTERM received, performing cleanup...'));
979
+ await performEmergencyCleanup();
980
+ process.exit(0);
981
+ });
982
+
983
/**
 * Best-effort cleanup used by the SIGINT/SIGTERM handlers.
 * Shuts the browser down through handleBrowserExit() while its process is not marked
 * as killed; otherwise only removes leftover Chrome temp files. Failures are never
 * rethrown - they are logged in debug mode and otherwise ignored.
 *
 * @returns {Promise<void>}
 */
async function performEmergencyCleanup() {
  try {
    const browserGone = !browser || browser.process()?.killed;

    if (browserGone) {
      // Browser already dead, just clean temp files
      await cleanupChromeTempFiles({
        includeSnapTemp: true,
        forceDebug,
        comprehensive: true
      });
      return;
    }

    const exitOptions = {
      forceDebug,
      timeout: 5000,
      exitOnFailure: false,
      cleanTempFiles: true,
      comprehensiveCleanup: true, // Always comprehensive on emergency
      userDataDir: browser._nwssUserDataDir
    };
    await handleBrowserExit(browser, exitOptions);
  } catch (emergencyErr) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Emergency cleanup failed: ${emergencyErr.message}`));
    }
  }
}
1007
+
1008
+ let siteCounter = 0;
1009
+ const totalUrls = sites.reduce((sum, site) => {
1010
+ const urls = Array.isArray(site.url) ? site.url.length : 1;
1011
+ return sum + urls;
1012
+ }, 0);
1013
+
1014
// --- Global CDP (Chrome DevTools Protocol) Session (historical note; the code has been removed) ---
// Previously a single CDP session was attached to the initial browser page (e.g., about:blank).
// For comprehensive network logging per scanned site, a CDP session should ideally be created
// for each new page context; the old global setup could miss some site-specific requests.
// That initial global CDP session was therefore removed: CDP is now handled on a per-page
// basis within processUrl, if enabled.
1020
+
1021
+
1022
+ // --- Global evaluateOnNewDocument for Fetch/XHR Interception ---
1023
+ // REMOVED: The old flawed global loop for evaluateOnNewDocument (Fetch/XHR interception) is removed.
1024
+ // This functionality is now correctly implemented within the processUrl function on the actual target page.
1025
+
1026
+
1027
+ /**
1028
+ * Processes a single URL: navigates to it, applies configurations (spoofing, interception),
1029
+ * monitors network requests, and extracts domains based on matching filterRegex.
1030
+ *
1031
+ * @param {string} currentUrl - The URL to scan.
1032
+ * @param {object} siteConfig - The configuration object for this specific site/URL from config.json.
1033
+ * @param {import('puppeteer').Browser} browserInstance - The shared Puppeteer browser instance.
1034
+ * @returns {Promise<object>} A promise that resolves to an object containing scan results.
1035
+ */
1036
+ async function processUrl(currentUrl, siteConfig, browserInstance) {
1037
+ const allowFirstParty = siteConfig.firstParty === 1;
1038
+ const allowThirdParty = siteConfig.thirdParty === undefined || siteConfig.thirdParty === 1;
1039
+ const perSiteSubDomains = siteConfig.subDomains === 1 ? true : subDomainsMode;
1040
+ const siteLocalhost = siteConfig.localhost === true;
1041
+ const siteLocalhostAlt = siteConfig.localhost_0_0_0_0 === true;
1042
+ const cloudflarePhishBypass = siteConfig.cloudflare_phish === true;
1043
+ const cloudflareBypass = siteConfig.cloudflare_bypass === true;
1044
+ const sitePrivoxy = siteConfig.privoxy === true;
1045
+ const sitePihole = siteConfig.pihole === true;
1046
+ const flowproxyDetection = siteConfig.flowproxy_detection === true;
1047
+
1048
+ const evenBlocked = siteConfig.even_blocked === true;
1049
+ // Log site-level comments if debug mode is enabled
1050
+ if (forceDebug && siteConfig.comments) {
1051
+ const siteComments = Array.isArray(siteConfig.comments) ? siteConfig.comments : [siteConfig.comments];
1052
+ console.log(formatLogMessage('debug', `Site comments for ${currentUrl}: ${siteComments.length} item(s)`));
1053
+ siteComments.forEach((comment, idx) =>
1054
+ console.log(formatLogMessage('debug', ` Site comment ${idx + 1}: ${comment}`))
1055
+ );
1056
+ }
1057
+
1058
+ if (siteConfig.firstParty === 0 && siteConfig.thirdParty === 0) {
1059
+ console.warn(`⚠ Skipping ${currentUrl} because both firstParty and thirdParty are disabled.`);
1060
+ return { url: currentUrl, rules: [], success: false, skipped: true };
1061
+ }
1062
+
1063
+ let page = null;
1064
+ let cdpSession = null;
1065
+ // Use Map to track domains and their resource types for --adblock-rules or --dry-run
1066
+ const matchedDomains = (adblockRulesMode || siteConfig.adblock_rules || dryRunMode) ? new Map() : new Set();
1067
+
1068
+ // Initialize dry run matches collection
1069
+ if (dryRunMode) {
1070
+ matchedDomains.set('dryRunMatches', []);
1071
+ matchedDomains.set('dryRunNetTools', []);
1072
+ matchedDomains.set('dryRunSearchString', new Map()); // Map URL to search results
1073
+ }
1074
+ const timeout = siteConfig.timeout || 30000;
1075
+
1076
+ if (!silentMode) console.log(`\n${messageColors.scanning('Scanning:')} ${currentUrl}`);
1077
+
1078
+ // Track redirect domains to exclude from matching
1079
+ let redirectDomainsToExclude = [];
1080
+
1081
+ // Track the effective current URL for first-party detection (updates after redirects)
1082
+ let effectiveCurrentUrl = currentUrl;
1083
+
1084
+ try {
1085
+ // Health check before creating new page
1086
+ const isHealthy = await isBrowserHealthy(browserInstance);
1087
+ if (!isHealthy) {
1088
+ if (forceDebug) {
1089
+ console.log(formatLogMessage('debug', `Browser health degraded before processing ${currentUrl} - forcing immediate restart`));
1090
+ }
1091
+ // Return special code to trigger immediate browser restart
1092
+ return {
1093
+ url: currentUrl,
1094
+ rules: [],
1095
+ success: false,
1096
+ needsImmediateRestart: true,
1097
+ error: 'Browser health degraded - restart required'
1098
+ };
1099
+ }
1100
+ // Check for Protocol timeout errors that indicate browser is broken
1101
+ if (browserInstance.process() && browserInstance.process().killed) {
1102
+ throw new Error('Browser process was killed - restart required');
1103
+ }
1104
+ page = await browserInstance.newPage();
1105
+
1106
+ // Set aggressive timeouts for problematic operations
1107
+ page.setDefaultTimeout(Math.min(timeout, 20000)); // Use site timeout or 20s max
1108
+ page.setDefaultNavigationTimeout(Math.min(timeout, 25000)); // Use site timeout or 25s max
1109
+ // Note: timeout variable from siteConfig.timeout || 30000 is overridden for stability
1110
+
1111
+ page.on('console', (msg) => {
1112
+ if (forceDebug && msg.type() === 'error') console.log(`[debug] Console error: ${msg.text()}`);
1113
+ });
1114
+
1115
+ // Add page crash handler
1116
+ page.on('error', (err) => {
1117
+ if (forceDebug) console.log(formatLogMessage('debug', `Page crashed: ${err.message}`));
1118
+ // Don't throw here as it might cause hanging - let the timeout handle it
1119
+ });
1120
+
1121
+ // Apply flowProxy timeouts if detection is enabled
1122
+ if (flowproxyDetection) {
1123
+ const flowproxyTimeouts = getFlowProxyTimeouts(siteConfig);
1124
+ page.setDefaultTimeout(flowproxyTimeouts.pageTimeout);
1125
+ page.setDefaultNavigationTimeout(flowproxyTimeouts.navigationTimeout);
1126
+ if (forceDebug) {
1127
+ console.log(formatLogMessage('debug', `Applied flowProxy timeouts - page: ${flowproxyTimeouts.pageTimeout}ms, nav: ${flowproxyTimeouts.navigationTimeout}ms`));
1128
+ }
1129
+ }
1130
+
1131
+ // --- START: evaluateOnNewDocument for Fetch/XHR Interception (Moved and Fixed) ---
1132
+ // This script is injected if --eval-on-doc is used or siteConfig.evaluateOnNewDocument is true.
1133
+ const shouldInjectEvalForPage = siteConfig.evaluateOnNewDocument === true || globalEvalOnDoc;
1134
+ if (shouldInjectEvalForPage) {
1135
+ if (forceDebug) {
1136
+ if (globalEvalOnDoc) {
1137
+ console.log(formatLogMessage('debug', `[evalOnDoc] Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
1138
+ } else { // siteConfig.evaluateOnNewDocument must be true
1139
+ console.log(formatLogMessage('debug', `[evalOnDoc] Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
1140
+ }
1141
+ }
1142
+ try {
1143
+ await page.evaluateOnNewDocument(() => {
1144
+ // This script intercepts and logs Fetch and XHR requests
1145
+ // from within the page context at the earliest possible moment.
1146
+ const originalFetch = window.fetch;
1147
+ window.fetch = (...args) => {
1148
+ console.log('[evalOnDoc][fetch]', args[0]); // Log fetch requests
1149
+ return originalFetch.apply(this, args);
1150
+ };
1151
+
1152
+ const originalXHROpen = XMLHttpRequest.prototype.open;
1153
+ XMLHttpRequest.prototype.open = function (method, xhrUrl) { // Renamed 'url' to 'xhrUrl' to avoid conflict
1154
+ console.log('[evalOnDoc][xhr]', xhrUrl); // Log XHR requests
1155
+ return originalXHROpen.apply(this, arguments);
1156
+ };
1157
+ });
1158
+ } catch (evalErr) {
1159
+ console.warn(formatLogMessage('warn', `[evalOnDoc] Failed to set up Fetch/XHR interception for ${currentUrl}: ${evalErr.message}`));
1160
+ }
1161
+ }
1162
+ // --- END: evaluateOnNewDocument for Fetch/XHR Interception ---
1163
+
1164
+ // --- CSS Element Blocking Setup ---
1165
+ const cssBlockedSelectors = siteConfig.css_blocked;
1166
+ if (cssBlockedSelectors && Array.isArray(cssBlockedSelectors) && cssBlockedSelectors.length > 0) {
1167
+ if (forceDebug) console.log(formatLogMessage('debug', `CSS element blocking enabled for ${currentUrl}: ${cssBlockedSelectors.join(', ')}`));
1168
+ try {
1169
+ await page.evaluateOnNewDocument(({ selectors }) => {
1170
+ // Inject CSS to hide blocked elements
1171
+ const style = document.createElement('style');
1172
+ style.type = 'text/css';
1173
+ const cssRules = selectors.map(selector => `${selector} { display: none !important; visibility: hidden !important; }`).join('\n');
1174
+ style.innerHTML = cssRules;
1175
+
1176
+ // Add the style as soon as DOM is available
1177
+ if (document.head) {
1178
+ document.head.appendChild(style);
1179
+ } else {
1180
+ document.addEventListener('DOMContentLoaded', () => document.head.appendChild(style));
1181
+ }
1182
+ }, { selectors: cssBlockedSelectors });
1183
+ } catch (cssErr) {
1184
+ console.warn(formatLogMessage('warn', `[css_blocked] Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
1185
+ }
1186
+ }
1187
+ // --- END: CSS Element Blocking Setup ---
1188
+
1189
+ // --- Per-Page CDP Setup ---
1190
+ const cdpLoggingNeededForPage = enableCDP || siteConfig.cdp === true;
1191
+ if (cdpLoggingNeededForPage) {
1192
+ if (forceDebug) {
1193
+ if (enableCDP) {
1194
+ console.log(formatLogMessage('debug', `CDP logging globally enabled by --cdp, applying to page: ${currentUrl}`));
1195
+ } else if (siteConfig.cdp === true) {
1196
+ console.log(formatLogMessage('debug', `CDP logging enabled for page ${currentUrl} via site-specific 'cdp: true' config.`));
1197
+ }
1198
+ }
1199
+ try {
1200
+ cdpSession = await page.target().createCDPSession();
1201
+ await cdpSession.send('Network.enable');
1202
+ cdpSession.on('Network.requestWillBeSent', (params) => {
1203
+ const { url: requestUrl, method } = params.request;
1204
+ const initiator = params.initiator ? params.initiator.type : 'unknown';
1205
+ let hostnameForLog = 'unknown-host';
1206
+ try {
1207
+ hostnameForLog = new URL(currentUrl).hostname;
1208
+ } catch (_) { /* ignore if currentUrl is invalid for URL parsing */ }
1209
+ console.log(formatLogMessage('debug', `[cdp][${hostnameForLog}] ${method} ${requestUrl} (initiator: ${initiator})`));
1210
+ });
1211
+ } catch (cdpErr) {
1212
+ cdpSession = null; // Reset on failure
1213
+ if (cdpErr.message.includes('Network.enable timed out') ||
1214
+ cdpErr.message.includes('Protocol error')) {
1215
+ // This indicates browser is completely broken
1216
+ throw new Error(`Browser protocol broken: ${cdpErr.message}`);
1217
+ }
1218
+ console.warn(formatLogMessage('warn', `[cdp] Failed to attach CDP session for ${currentUrl}: ${cdpErr.message}`));
1219
+ }
1220
+ }
1221
+ // --- End of Per-Page CDP Setup ---
1222
+
1223
+ await page.setRequestInterception(true);
1224
+
1225
+ // Set up frame handling to suppress invalid URL errors
1226
+ setupFrameHandling(page, forceDebug);
1227
+
1228
+ if (siteConfig.clear_sitedata === true) {
1229
+ try {
1230
+ let clearDataSession = null;
1231
+ try {
1232
+ clearDataSession = await page.target().createCDPSession();
1233
+ await clearDataSession.send('Network.clearBrowserCookies');
1234
+ await clearDataSession.send('Network.clearBrowserCache');
1235
+ } finally {
1236
+ if (clearDataSession) {
1237
+ try { await clearDataSession.detach(); } catch (detachErr) { /* ignore */ }
1238
+ }
1239
+ }
1240
+ await page.evaluate(() => {
1241
+ localStorage.clear();
1242
+ sessionStorage.clear();
1243
+ indexedDB.databases().then(dbs => dbs.forEach(db => indexedDB.deleteDatabase(db.name)));
1244
+ });
1245
+ if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data for ${currentUrl}`));
1246
+ } catch (clearErr) {
1247
+ console.warn(messageColors.warn(`[clear_sitedata failed] ${currentUrl}: ${clearErr.message}`));
1248
+ }
1249
+ }
1250
+
1251
+ // --- Apply all fingerprint spoofing (user agent, Brave, fingerprint protection) ---
1252
+ await applyAllFingerprintSpoofing(page, siteConfig, forceDebug, currentUrl);
1253
+
1254
+ const regexes = Array.isArray(siteConfig.filterRegex)
1255
+ ? siteConfig.filterRegex.map(r => new RegExp(r.replace(/^\/(.*)\/$/, '$1')))
1256
+ : siteConfig.filterRegex
1257
+ ? [new RegExp(siteConfig.filterRegex.replace(/^\/(.*)\/$/, '$1'))]
1258
+ : [];
1259
+
1260
+ // Parse searchstring patterns using module
1261
+ const { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd } = parseSearchStrings(siteConfig.searchstring, siteConfig.searchstring_and);
1262
+ const useCurl = siteConfig.curl === true; // Use curl if enabled, regardless of searchstring
1263
+ let useGrep = siteConfig.grep === true && useCurl; // Grep requires curl to be enabled
1264
+
1265
+ // Get user agent for curl if needed
1266
+ let curlUserAgent = '';
1267
+ if (useCurl && siteConfig.userAgent) {
1268
+ const userAgents = {
1269
+ chrome: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
1270
+ firefox: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
1271
+ safari: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15"
1272
+ };
1273
+ curlUserAgent = userAgents[siteConfig.userAgent.toLowerCase()] || '';
1274
+ }
1275
+
1276
+ if (useCurl && forceDebug) {
1277
+ console.log(formatLogMessage('debug', `Curl-based content analysis enabled for ${currentUrl}`));
1278
+ }
1279
+
1280
+ if (useGrep && forceDebug) {
1281
+ console.log(formatLogMessage('debug', `Grep-based pattern matching enabled for ${currentUrl}`));
1282
+ }
1283
+
1284
+ // Validate grep availability if needed
1285
+ if (useGrep && (hasSearchString || hasSearchStringAnd)) {
1286
+ const grepCheck = validateGrepAvailability();
1287
+ if (!grepCheck.isAvailable) {
1288
+ console.warn(formatLogMessage('warn', `Grep not available for ${currentUrl}: ${grepCheck.error}. Falling back to JavaScript search.`));
1289
+ useGrep = false;
1290
+ } else if (forceDebug) {
1291
+ console.log(formatLogMessage('debug', `Using grep: ${grepCheck.version}`));
1292
+ }
1293
+ }
1294
+
1295
+ // Parse whois and dig terms
1296
+ const whoisTerms = siteConfig.whois && Array.isArray(siteConfig.whois) ? siteConfig.whois : null;
1297
+ const whoisOrTerms = siteConfig['whois-or'] && Array.isArray(siteConfig['whois-or']) ? siteConfig['whois-or'] : null;
1298
+ const whoisServer = siteConfig.whois_server || null; // Parse whois_server configuration
1299
+ const digTerms = siteConfig.dig && Array.isArray(siteConfig.dig) ? siteConfig.dig : null;
1300
+ const digOrTerms = siteConfig['dig-or'] && Array.isArray(siteConfig['dig-or']) ? siteConfig['dig-or'] : null;
1301
+ const digRecordType = siteConfig.digRecordType || 'A';
1302
+ const hasNetTools = whoisTerms || whoisOrTerms || digTerms || digOrTerms;
1303
+
1304
+ // Validate nettools availability if needed
1305
+ if (hasNetTools) {
1306
+ if (whoisTerms || whoisOrTerms) {
1307
+ const whoisCheck = validateWhoisAvailability();
1308
+ if (!whoisCheck.isAvailable) {
1309
+ console.warn(formatLogMessage('warn', `Whois not available for ${currentUrl}: ${whoisCheck.error}. Skipping whois checks.`));
1310
+ siteConfig.whois = null; // Disable whois for this site
1311
+ siteConfig['whois-or'] = null; // Disable whois-or for this site
1312
+ } else if (forceDebug) {
1313
+ console.log(formatLogMessage('debug', `Using whois: ${whoisCheck.version}`));
1314
+ }
1315
+ }
1316
+
1317
+ if (digTerms || digOrTerms) {
1318
+ const digCheck = validateDigAvailability();
1319
+ if (!digCheck.isAvailable) {
1320
+ console.warn(formatLogMessage('warn', `Dig not available for ${currentUrl}: ${digCheck.error}. Skipping dig checks.`));
1321
+ siteConfig.dig = null; // Disable dig for this site
1322
+ siteConfig['dig-or'] = null; // Disable dig-or for this site
1323
+ } else if (forceDebug) {
1324
+ console.log(formatLogMessage('debug', `Using dig: ${digCheck.version}`));
1325
+ }
1326
+ }
1327
+ }
1328
+
1329
+ if (siteConfig.verbose === 1 && siteConfig.filterRegex) {
1330
+ const patterns = Array.isArray(siteConfig.filterRegex) ? siteConfig.filterRegex : [siteConfig.filterRegex];
1331
+ console.log(formatLogMessage('info', `Regex patterns for ${currentUrl}:`));
1332
+ patterns.forEach((pattern, idx) => {
1333
+ console.log(` [${idx + 1}] ${pattern}`);
1334
+ });
1335
+ }
1336
+
1337
+ if (siteConfig.verbose === 1 && (hasSearchString || hasSearchStringAnd)) {
1338
+ console.log(formatLogMessage('info', `Search strings for ${currentUrl}:`));
1339
+ if (hasSearchString) {
1340
+ console.log(` OR logic (any must match):`);
1341
+ searchStrings.forEach((searchStr, idx) => {
1342
+ console.log(` [${idx + 1}] "${searchStr}"`);
1343
+ });
1344
+ }
1345
+ if (hasSearchStringAnd) {
1346
+ console.log(` AND logic (all must match):`);
1347
+ searchStringsAnd.forEach((searchStr, idx) => {
1348
+ console.log(` [${idx + 1}] "${searchStr}"`);
1349
+ });
1350
+ }
1351
+ }
1352
+
1353
+ if (siteConfig.verbose === 1 && whoisServer) {
1354
+ if (forceDebug) {
1355
+ if (Array.isArray(whoisServer)) {
1356
+ console.log(formatLogMessage('info', `Whois servers for ${currentUrl} (randomized): [${whoisServer.join(', ')}]`));
1357
+ } else {
1358
+ console.log(formatLogMessage('info', `Whois server for ${currentUrl}: ${whoisServer}`));
1359
+ }
1360
+ }
1361
+ }
1362
+
1363
+ if (siteConfig.verbose === 1 && whoisTerms) {
1364
+ if (forceDebug) console.log(formatLogMessage('info', `Whois terms for ${currentUrl}:`));
1365
+ whoisTerms.forEach((term, idx) => {
1366
+ if (forceDebug) console.log(` [${idx + 1}] "${term}"`);
1367
+ });
1368
+ }
1369
+
1370
+ if (siteConfig.verbose === 1 && whoisOrTerms) {
1371
+ if (forceDebug) console.log(formatLogMessage('info', `Whois-or terms for ${currentUrl}:`));
1372
+ whoisOrTerms.forEach((term, idx) => {
1373
+ if (forceDebug) console.log(` [${idx + 1}] "${term}" (OR logic)`);
1374
+ });
1375
+ }
1376
+
1377
+ if (siteConfig.verbose === 1 && digTerms) {
1378
+ if (forceDebug) console.log(formatLogMessage('info', `Dig terms for ${currentUrl} (${digRecordType} records):`));
1379
+ digTerms.forEach((term, idx) => {
1380
+ if (forceDebug) console.log(` [${idx + 1}] "${term}"`);
1381
+ });
1382
+ }
1383
+
1384
+ if (siteConfig.verbose === 1 && digOrTerms) {
1385
+ if (forceDebug) console.log(formatLogMessage('info', `Dig-or terms for ${currentUrl} (${digRecordType} records):`));
1386
+ digOrTerms.forEach((term, idx) => {
1387
+ if (forceDebug) console.log(` [${idx + 1}] "${term}" (OR logic)`);
1388
+ });
1389
+ }
1390
+
1391
+ const blockedRegexes = Array.isArray(siteConfig.blocked)
1392
+ ? siteConfig.blocked.map(pattern => new RegExp(pattern))
1393
+ : [];
1394
+
1395
+ // Add global blocked patterns
1396
+ const globalBlockedRegexes = Array.isArray(globalBlocked)
1397
+ ? globalBlocked.map(pattern => new RegExp(pattern))
1398
+ : [];
1399
+ const allBlockedRegexes = [...blockedRegexes, ...globalBlockedRegexes];
1400
+
1401
/**
 * Record a matched domain in the shared `matchedDomains` collection.
 *
 * `matchedDomains` is either a Map (domain -> Set of resource types; it may
 * also hold the reserved dry-run keys listed below) or a plain Set of domains.
 *
 * A domain that has not been recorded yet must pass two optional similarity
 * filters before it is stored:
 *   1. ignore_similar - skip domains similar to ones already recorded.
 *   2. ignore_similar_ignored_domains - skip domains similar to an entry of
 *      the `ignoreDomains` list.
 * Per-site settings in `siteConfig` override the global settings.
 *
 * A domain that is already recorded skips both filters: it was accepted
 * before, so re-scanning is redundant, and this guarantees that an additional
 * resource type is still attached to it no matter how the similarity helper
 * treats an exact duplicate.
 *
 * @param {string} domain - Domain to add
 * @param {string|null} [resourceType=null] - Resource type (for --adblock-rules mode);
 *   only stored when `matchedDomains` is a Map
 * @returns {void}
 */
function addMatchedDomain(domain, resourceType = null) {
  // Keys stored in the Map for dry-run bookkeeping; they are not domains.
  const reservedDryRunKeys = ['dryRunMatches', 'dryRunNetTools', 'dryRunSearchString'];
  const storesResourceTypes = matchedDomains instanceof Map;

  // Fast path: the domain was accepted earlier, only extend its resource types.
  if (matchedDomains.has(domain)) {
    if (storesResourceTypes && resourceType) {
      // Only add the specific resourceType that was matched, not all types for this domain
      matchedDomains.get(domain).add(resourceType);
    }
    return;
  }

  const ignoreSimilarEnabled = siteConfig.ignore_similar !== undefined
    ? siteConfig.ignore_similar
    : ignore_similar;

  // Honour any explicit per-site threshold (including 0), matching how the
  // sibling ignore_similar* options are resolved; only a missing value falls
  // back to the global threshold.
  const siteThreshold = siteConfig.ignore_similar_threshold;
  const similarityThreshold = (siteThreshold !== undefined && siteThreshold !== null)
    ? siteThreshold
    : ignore_similar_threshold;

  const ignoreSimilarIgnoredDomains = siteConfig.ignore_similar_ignored_domains !== undefined
    ? siteConfig.ignore_similar_ignored_domains
    : ignore_similar_ignored_domains;

  const similarityOptions = {
    enabled: true,
    threshold: similarityThreshold,
    forceDebug
  };

  // Check if we should ignore domains similar to the ones already recorded
  if (ignoreSimilarEnabled) {
    const existingDomains = storesResourceTypes
      ? Array.from(matchedDomains.keys()).filter(key => !reservedDryRunKeys.includes(key))
      : Array.from(matchedDomains);

    const similarCheck = shouldIgnoreSimilarDomain(domain, existingDomains, similarityOptions);

    if (similarCheck.shouldIgnore) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `[ignore_similar] Skipping ${domain}: ${similarCheck.reason}`));
      }
      return; // Skip adding this domain
    }
  }

  // Check if domain is similar to any in ignoreDomains list
  if (ignoreSimilarIgnoredDomains && ignoreDomains && ignoreDomains.length > 0) {
    const ignoredSimilarCheck = shouldIgnoreSimilarDomain(domain, ignoreDomains, similarityOptions);

    if (ignoredSimilarCheck.shouldIgnore) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `[ignore_similar_ignored_domains] Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
      }
      return; // Skip adding this domain
    }
  }

  if (storesResourceTypes) {
    const resourceTypes = new Set();
    // Only add the specific resourceType that was matched, not all types for this domain
    if (resourceType) {
      resourceTypes.add(resourceType);
    }
    matchedDomains.set(domain, resourceTypes);
  } else {
    matchedDomains.add(domain);
  }
}
1459
+
1460
// --- page.on('request', ...) Handler: Core Network Request Logic ---
// This handler is triggered for every network request made by the page and
// always finishes by calling abort() or continue() on the request:
//   1. Aborts one hard-coded self-referencing iframe loop (dmzjmp).
//   2. Aborts URLs matching the site/global blocklists (`allBlockedRegexes`);
//      when `evenBlocked` is set, such URLs are still checked against the
//      filter patterns (`regexes`) so their domains can be recorded.
//   3. Otherwise checks the URL against `regexes` and, after the resourceTypes,
//      first-party/third-party and `ignoreDomains` filters, either records the
//      domain immediately or queues nettools / curl / grep checks.
page.on('request', request => {
  const reqUrl = request.url();
  const requestFrame = request.frame();

  const checkedHostname = safeGetDomain(reqUrl, true);
  // Use effectiveCurrentUrl which gets updated after redirects
  // This ensures first-party detection uses the final redirected domain
  const effectiveCurrentHostname = safeGetDomain(effectiveCurrentUrl, true);
  const isFirstParty = checkedHostname && effectiveCurrentHostname && checkedHostname === effectiveCurrentHostname;

  // One resource-type rule for both the blocked and the unblocked path:
  // a missing, non-array or empty `resourceTypes` means "no filter".
  const allowedResourceTypes = siteConfig.resourceTypes;
  const hasResourceTypeFilter = Array.isArray(allowedResourceTypes) && allowedResourceTypes.length > 0;
  const isResourceTypeAllowed = type => !hasResourceTypeFilter || allowedResourceTypes.includes(type);

  // abort()/continue() return promises. Attach a rejection handler so a late
  // failure is reported at debug level instead of becoming an unhandled
  // rejection.
  const settle = pending => {
    Promise.resolve(pending).catch(settleErr => {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `Failed to settle request ${reqUrl}: ${settleErr.message}`));
      }
    });
  };

  // Collect a regex hit for --dry-run output.
  const recordDryRunMatch = (re, domain, resourceType, extra = {}) => {
    matchedDomains.get('dryRunMatches').push({
      regex: re.source,
      domain: domain,
      resourceType: resourceType,
      fullUrl: reqUrl,
      isFirstParty: isFirstParty,
      ...extra
    });
  };

  // Print (verbose) and/or dump (dumpUrls) a confirmed match.
  const reportMatch = (re, resourceType, suffix = '') => {
    const simplifiedUrl = getRootDomain(currentUrl);
    const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
    if (siteConfig.verbose === 1) {
      console.log(formatLogMessage('match', `[${simplifiedUrl}] ${reqUrl} matched regex: ${re} and resourceType: ${resourceType}${resourceInfo}${suffix}`));
    }
    if (dumpUrls) {
      const timestamp = new Date().toISOString();
      // A failed log write must not throw out of the handler: the request
      // would then be left neither aborted nor continued.
      try {
        fs.appendFileSync(matchedUrlsLogFile, `${timestamp} [match][${simplifiedUrl}] ${reqUrl} (resourceType: ${resourceType})${resourceInfo}${suffix}\n`);
      } catch (logErr) {
        console.warn(formatLogMessage('warn', `Failed to write to matched URLs log file: ${logErr.message}`));
      }
    }
  };

  // Block infinite iframe loops
  const frameUrl = requestFrame ? requestFrame.url() : '';
  if (frameUrl && frameUrl.includes('creative.dmzjmp.com') &&
      reqUrl.includes('go.dmzjmp.com/api/models')) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Blocking potential infinite iframe loop: ${reqUrl}`));
    }
    settle(request.abort());
    return;
  }

  if (forceDebug) {
    // Enhanced debug logging to show which frame the request came from
    const debugFrameUrl = requestFrame ? requestFrame.url() : 'unknown-frame';
    const isMainFrame = requestFrame === page.mainFrame();
    console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[frame: ${isMainFrame ? 'main' : 'iframe'}] ${debugFrameUrl} → ${reqUrl}`));

    // Show --debug output and the url while its scanning
    const simplifiedUrl = getRootDomain(currentUrl);
    const timestamp = new Date().toISOString();
    const logEntry = `${timestamp} [debug req][${simplifiedUrl}] ${reqUrl}`;

    // Output to console
    console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[${simplifiedUrl}] ${reqUrl}`));

    // Output to file
    if (debugLogFile) {
      try {
        fs.appendFileSync(debugLogFile, logEntry + '\n');
      } catch (logErr) {
        console.warn(formatLogMessage('warn', `Failed to write to debug log file: ${logErr.message}`));
      }
    }
  }

  // --- Blocklist handling ---
  const blockedIndex = allBlockedRegexes.findIndex(re => re.test(reqUrl));
  if (blockedIndex !== -1) {
    if (forceDebug) {
      // allBlockedRegexes is compiled from the site patterns followed by the
      // global patterns, so the index of the matching regex also identifies
      // the pattern string and its origin - no need to recompile patterns.
      const sitePatterns = Array.isArray(siteConfig.blocked) ? siteConfig.blocked : [];
      const globalPatterns = Array.isArray(globalBlocked) ? globalBlocked : [];
      const allPatterns = [...sitePatterns, ...globalPatterns];
      const matchedPattern = allPatterns[blockedIndex] !== undefined
        ? allPatterns[blockedIndex]
        : allBlockedRegexes[blockedIndex].source;
      const patternSource = blockedIndex < sitePatterns.length ? 'site' : 'global';
      const simplifiedUrl = getRootDomain(currentUrl);
      console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked]')}[${simplifiedUrl}] ${reqUrl} blocked by ${patternSource} pattern: ${matchedPattern}`));

      // Also log to file if debug logging is enabled
      if (debugLogFile) {
        try {
          const timestamp = new Date().toISOString();
          fs.appendFileSync(debugLogFile, `${timestamp} [blocked][${simplifiedUrl}] ${reqUrl} (${patternSource} pattern: ${matchedPattern})\n`);
        } catch (logErr) {
          console.warn(formatLogMessage('warn', `Failed to write blocked domain to debug log: ${logErr.message}`));
        }
      }
    }

    // even_blocked: the request is still aborted below, but a URL that also
    // matches a filter regex has its domain recorded.
    if (evenBlocked) {
      const blockedReqDomain = safeGetDomain(reqUrl, perSiteSubDomains);
      if (blockedReqDomain && !matchesIgnoreDomain(blockedReqDomain, ignoreDomains)) {
        for (const re of regexes) {
          if (!re.test(reqUrl)) continue;

          const resourceType = request.resourceType();

          // Apply same resource type filtering as unblocked requests
          if (isResourceTypeAllowed(resourceType)) {
            if (dryRunMode) {
              recordDryRunMatch(re, blockedReqDomain, resourceType, { wasBlocked: true });
            } else {
              addMatchedDomain(blockedReqDomain, resourceType);
            }
            reportMatch(re, resourceType, ' [BLOCKED BUT ADDED]');
            break; // Only match once per URL
          }
        }
      }
    }

    settle(request.abort());
    return;
  }

  const reqDomain = safeGetDomain(reqUrl, perSiteSubDomains);

  if (!reqDomain) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Skipping request with unparseable URL: ${reqUrl}`));
    }
    settle(request.continue());
    return;
  }

  // Skip matching if this domain is one of the redirect intermediaries
  if (redirectDomainsToExclude && redirectDomainsToExclude.includes(reqDomain)) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Skipping redirect intermediary domain: ${reqDomain}`));
    }
    settle(request.continue());
    return;
  }

  const needsContentSearch = hasSearchString || hasSearchStringAnd;

  for (const re of regexes) {
    if (!re.test(reqUrl)) continue;

    // Only the first matching regex is processed; every path below ends in `break`.
    const resourceType = request.resourceType();

    // *** UNIVERSAL RESOURCE TYPE FILTER ***
    // Check resourceTypes filter FIRST, before ANY processing (nettools, searchstring, immediate matching)
    if (!isResourceTypeAllowed(resourceType)) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${allowedResourceTypes.join(', ')}]. Skipping ALL processing.`));
      }
      break; // Skip this URL entirely - doesn't match required resource types
    }

    // Check party filtering AFTER regex match but BEFORE domain processing
    if (isFirstParty && siteConfig.firstParty === false) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `Skipping first-party match: ${reqUrl} (firstParty disabled)`));
      }
      break; // Skip this URL - it's first-party but firstParty is disabled
    }
    if (!isFirstParty && siteConfig.thirdParty === false) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `Skipping third-party match: ${reqUrl} (thirdParty disabled)`));
      }
      break; // Skip this URL - it's third-party but thirdParty is disabled
    }

    // Check ignoreDomains AFTER regex match but BEFORE domain processing
    if (matchesIgnoreDomain(reqDomain, ignoreDomains)) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `Ignoring domain ${reqDomain} (matches ignoreDomains pattern)`));
      }
      break; // Skip this URL - domain is in ignore list
    }

    if (!needsContentSearch && !hasNetTools) {
      // If NO searchstring AND NO nettools are defined, match immediately (existing behavior)
      if (dryRunMode) {
        recordDryRunMatch(re, reqDomain, resourceType);
      } else {
        addMatchedDomain(reqDomain, resourceType);
      }
      reportMatch(re, resourceType);
    } else if (!needsContentSearch) {
      // Only nettools are configured (whois/dig): perform checks on the domain
      if (forceDebug) {
        console.log(formatLogMessage('debug', `${reqUrl} matched regex ${re} and resourceType ${resourceType}, queued for nettools check`));
      }

      if (dryRunMode) {
        // For dry run, we'll collect the domain for nettools checking
        recordDryRunMatch(re, reqDomain, resourceType, { needsNetToolsCheck: true });
      }

      // Create and execute nettools handler
      const netToolsHandler = createNetToolsHandler({
        whoisTerms,
        whoisOrTerms,
        whoisDelay: siteConfig.whois_delay || whois_delay, // Site-specific or global fallback
        whoisServer, // Pass whois server configuration
        whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
        debugLogFile, // Pass debug log file for whois error logging
        fs, // Pass fs module for file operations
        digTerms,
        digOrTerms,
        digRecordType,
        digSubdomain: siteConfig.dig_subdomain === true,
        // Add dry run callback for nettools results
        dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
        matchedDomains,
        addMatchedDomain,
        currentUrl,
        getRootDomain,
        siteConfig,
        dumpUrls,
        matchedUrlsLogFile,
        forceDebug
      });

      // Execute nettools check asynchronously
      const originalDomain = (new URL(reqUrl)).hostname;
      setImmediate(() => netToolsHandler(reqDomain, originalDomain));
    } else {
      // If searchstring or searchstring_and IS defined (with or without nettools), queue for content checking
      if (forceDebug) {
        const searchType = hasSearchStringAnd ? 'searchstring_and' : 'searchstring';
        console.log(formatLogMessage('debug', `${reqUrl} matched regex ${re} and resourceType ${resourceType}, queued for ${searchType} content search`));
      }
      if (dryRunMode) {
        recordDryRunMatch(re, reqDomain, resourceType, { needsSearchStringCheck: true });
      }
    }

    // If curl is enabled, download and analyze content immediately
    if (useCurl) {
      try {
        // Options shared by the grep and the plain curl handler
        const contentHandlerOptions = {
          searchStrings,
          searchStringsAnd,
          hasSearchString,
          hasSearchStringAnd,
          regexes,
          matchedDomains,
          addMatchedDomain, // Pass the helper function
          currentUrl,
          perSiteSubDomains,
          ignoreDomains,
          matchesIgnoreDomain,
          getRootDomain,
          siteConfig,
          dumpUrls,
          matchedUrlsLogFile,
          forceDebug,
          userAgent: curlUserAgent,
          resourceType
        };

        // Use grep handler if both grep and searchstring/searchstring_and are enabled
        if (useGrep && needsContentSearch) {
          const grepHandler = createGrepHandler({
            ...contentHandlerOptions,
            grepOptions: {
              ignoreCase: true,
              wholeWord: false,
              regex: false
            }
          });

          setImmediate(() => grepHandler(reqUrl));
        } else {
          // Use regular curl handler
          const curlHandler = createCurlHandler(contentHandlerOptions);

          setImmediate(() => curlHandler(reqUrl));
        }
      } catch (curlErr) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `Curl handler failed for ${reqUrl}: ${curlErr.message}`));
        }
      }
    }

    break;
  }

  settle(request.continue());
});
1795
+
1796
+ // Add response handler ONLY if searchstring/searchstring_and is defined AND neither curl nor grep is enabled
1797
+ if ((hasSearchString || hasSearchStringAnd) && !useCurl && !useGrep) {
1798
+ const responseHandler = createResponseHandler({
1799
+ searchStrings,
1800
+ searchStringsAnd,
1801
+ hasSearchStringAnd,
1802
+ regexes,
1803
+ matchedDomains,
1804
+ addMatchedDomain, // Pass the helper function
1805
+ currentUrl,
1806
+ perSiteSubDomains,
1807
+ ignoreDomains,
1808
+ matchesIgnoreDomain,
1809
+ getRootDomain,
1810
+ siteConfig,
1811
+ dumpUrls,
1812
+ matchedUrlsLogFile,
1813
+ forceDebug,
1814
+ resourceType: null // Response handler doesn't have direct access to resource type
1815
+ });
1816
+
1817
+ page.on('response', responseHandler);
1818
+ }
1819
+
1820
+ const interactEnabled = siteConfig.interact === true;
1821
+
1822
+ // --- Runtime CSS Element Blocking (Fallback) ---
1823
+ // Apply CSS blocking after page load as a fallback in case evaluateOnNewDocument didn't work
1824
+ if (cssBlockedSelectors && Array.isArray(cssBlockedSelectors) && cssBlockedSelectors.length > 0) {
1825
+ try {
1826
+ await page.evaluate((selectors) => {
1827
+ const existingStyle = document.querySelector('#css-blocker-runtime');
1828
+ if (!existingStyle) {
1829
+ const style = document.createElement('style');
1830
+ style.id = 'css-blocker-runtime';
1831
+ style.type = 'text/css';
1832
+ const cssRules = selectors.map(selector => `${selector} { display: none !important; visibility: hidden !important; }`).join('\n');
1833
+ style.innerHTML = cssRules;
1834
+ document.head.appendChild(style);
1835
+ }
1836
+ }, cssBlockedSelectors);
1837
+ } catch (cssRuntimeErr) {
1838
+ console.warn(formatLogMessage('warn', `[css_blocked] Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
1839
+ }
1840
+ }
1841
+
1842
+ try {
1843
+ // Use custom goto options if provided, otherwise default to 'load'
1844
+ // load Wait for all resources (default)
1845
+ // domcontentloaded Wait for DOM only
1846
+ // networkidle0 Wait until 0 network requests for 500ms
1847
+ // networkidle2 Wait until ≤2 network requests for 500ms
1848
+
1849
+ // Use faster defaults for sites with long timeouts to improve responsiveness
1850
+ const isFastSite = timeout <= 15000;
1851
+ const defaultWaitUntil = isFastSite ? 'load' : 'domcontentloaded';
1852
+ const defaultGotoOptions = {
1853
+ waitUntil: defaultWaitUntil,
1854
+ timeout: timeout
1855
+ };
1856
+ const gotoOptions = siteConfig.goto_options
1857
+ ? { ...defaultGotoOptions, ...siteConfig.goto_options }
1858
+ : defaultGotoOptions;
1859
+
1860
+ // Enhanced navigation with redirect handling - passes existing gotoOptions
1861
+ const navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage);
1862
+
1863
+ const { finalUrl, redirected, redirectChain, originalUrl, redirectDomains } = navigationResult;
1864
+
1865
+ // Handle redirect to new domain
1866
+ if (redirected) {
1867
+ const originalDomain = safeGetDomain(originalUrl);
1868
+ const finalDomain = safeGetDomain(finalUrl);
1869
+
1870
+ if (originalDomain !== finalDomain) {
1871
+ if (!silentMode) {
1872
+ console.log(`🔄 Redirect detected: ${originalDomain} → ${finalDomain}`);
1873
+ }
1874
+
1875
+ if (forceDebug) {
1876
+ console.log(formatLogMessage('debug', `Full redirect chain: ${redirectChain.join(' → ')}`));
1877
+ }
1878
+
1879
+ // Update currentUrl for all subsequent processing to use the final redirected URL
1880
+ currentUrl = finalUrl;
1881
+
1882
+ // IMPORTANT: Also update effectiveCurrentUrl for first-party detection
1883
+ // This ensures the request handler uses the redirected domain for party detection
1884
+ effectiveCurrentUrl = finalUrl;
1885
+
1886
+ // Update the redirect domains to exclude from matching
1887
+ if (redirectDomains && redirectDomains.length > 0) {
1888
+ redirectDomainsToExclude = redirectDomains;
1889
+
1890
+ if (forceDebug) {
1891
+ console.log(formatLogMessage('debug', `Excluding redirect domains from matching: ${redirectDomains.join(', ')}`));
1892
+ }
1893
+ }
1894
+ }
1895
+ }
1896
+
1897
+ siteCounter++;
1898
+
1899
+ // Handle all Cloudflare protections using the dedicated module
1900
+ const cloudflareResult = await handleCloudflareProtection(page, currentUrl, siteConfig, forceDebug);
1901
+
1902
+ if (!cloudflareResult.overallSuccess) {
1903
+ console.warn(`⚠ [cloudflare] Protection handling failed for ${currentUrl}:`);
1904
+ cloudflareResult.errors.forEach(error => {
1905
+ console.warn(` - ${error}`);
1906
+ });
1907
+ // Continue with scan despite Cloudflare issues
1908
+ }
1909
+
1910
+ // Handle flowProxy protection if enabled
1911
+ if (flowproxyDetection) {
1912
+ const flowproxyResult = await handleFlowProxyProtection(page, currentUrl, siteConfig, forceDebug);
1913
+
1914
+ if (flowproxyResult.flowProxyDetection.detected) {
1915
+ console.log(`🛡️ [flowproxy] FlowProxy protection detected on ${currentUrl}`);
1916
+
1917
+ if (!flowproxyResult.overallSuccess) {
1918
+ console.warn(`⚠ [flowproxy] Protection handling failed for ${currentUrl}:`);
1919
+ flowproxyResult.errors.forEach(error => {
1920
+ console.warn(` - ${error}`);
1921
+ });
1922
+ }
1923
+
1924
+ if (flowproxyResult.warnings.length > 0) {
1925
+ flowproxyResult.warnings.forEach(warning => {
1926
+ console.warn(`⚠ [flowproxy] ${warning}`);
1927
+ });
1928
+ }
1929
+ }
1930
+ }
1931
+
1932
+ console.log(formatLogMessage('info', `${messageColors.loaded('Loaded:')} (${siteCounter}/${totalUrls}) ${currentUrl}`));
1933
+ await page.evaluate(() => { console.log('Safe to evaluate on loaded page.'); });
1934
+
1935
+ // Wait for iframes to load and log them
1936
+ if (forceDebug) {
1937
+ try {
1938
+ await new Promise(resolve => setTimeout(resolve, 2000)); // Give iframes time to load
1939
+ const frames = page.frames();
1940
+ console.log(formatLogMessage('debug', `Total frames found: ${frames.length}`));
1941
+ frames.forEach((frame, index) => {
1942
+ const frameUrl = frame.url();
1943
+ if (frameUrl &&
1944
+ frameUrl !== 'about:blank' &&
1945
+ frameUrl !== 'about:srcdoc' &&
1946
+ !frameUrl.startsWith('about:') &&
1947
+ !frameUrl.startsWith('data:') &&
1948
+ !frameUrl.startsWith('chrome-error://') &&
1949
+ !frameUrl.startsWith('chrome-extension://') &&
1950
+ frame !== page.mainFrame()) {
1951
+ console.log(formatLogMessage('debug', `Iframe ${index}: ${frameUrl}`));
1952
+ }
1953
+ });
1954
+ } catch (frameDebugErr) {
1955
+ console.log(formatLogMessage('debug', `Frame debugging failed: ${frameDebugErr.message}`));
1956
+ }
1957
+ }
1958
+ } catch (err) {
1959
+ // Enhanced error handling for redirect timeouts using redirect module
1960
+ const timeoutResult = await handleRedirectTimeout(page, currentUrl, err, safeGetDomain, forceDebug, formatLogMessage);
1961
+
1962
+ if (timeoutResult.success) {
1963
+ console.log(`⚠ Partial redirect timeout recovered: ${safeGetDomain(currentUrl)} → ${safeGetDomain(timeoutResult.finalUrl)}`);
1964
+ currentUrl = timeoutResult.finalUrl; // Use the partial redirect URL
1965
+ siteCounter++;
1966
+ // Continue processing with the redirected URL instead of throwing error
1967
+ } else {
1968
+ console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
1969
+ throw err;
1970
+ }
1971
+ }
1972
+
1973
+ if (interactEnabled && !disableInteract) {
1974
+ if (forceDebug) console.log(formatLogMessage('debug', `interaction simulation enabled for ${currentUrl}`));
1975
+ const randomX = Math.floor(Math.random() * 500) + 50;
1976
+ const randomY = Math.floor(Math.random() * 500) + 50;
1977
+ await page.mouse.move(randomX, randomY, { steps: 10 });
1978
+ await page.mouse.move(randomX + 50, randomY + 50, { steps: 15 });
1979
+ await page.mouse.click(randomX + 25, randomY + 25);
1980
+ await page.hover('body');
1981
+ }
1982
+
1983
+ const delayMs = siteConfig.delay || 4000;
1984
+
1985
+ // Optimize network idle and delay times for better responsiveness
1986
+ const isFastSite = timeout <= 15000;
1987
+ const networkIdleTime = isFastSite ? 4000 : 2000; // Faster idle for slow sites
1988
+ const networkIdleTimeout = isFastSite ? timeout : Math.min(timeout / 2, 12000);
1989
+ const actualDelay = isFastSite ? delayMs : Math.min(delayMs, 2000); // Cap delay for slow sites
1990
+
1991
+ await page.waitForNetworkIdle({
1992
+ idleTime: networkIdleTime,
1993
+ timeout: networkIdleTimeout
1994
+ });
1995
+ await new Promise(resolve => setTimeout(resolve, actualDelay));
1996
+
1997
+ // Apply additional delay for flowProxy if detected
1998
+ if (flowproxyDetection) {
1999
+ const additionalDelay = siteConfig.flowproxy_additional_delay || 5000;
2000
+ if (forceDebug) console.log(formatLogMessage('debug', `Applying flowProxy additional delay: ${additionalDelay}ms`));
2001
+ await new Promise(resolve => setTimeout(resolve, additionalDelay));
2002
+ }
2003
+
2004
+ for (let i = 1; i < (siteConfig.reload || 1); i++) {
2005
+ if (siteConfig.clear_sitedata === true) {
2006
+ try {
2007
+ let reloadClearSession = null;
2008
+ try {
2009
+ reloadClearSession = await page.target().createCDPSession();
2010
+ await reloadClearSession.send('Network.clearBrowserCookies');
2011
+ await reloadClearSession.send('Network.clearBrowserCache');
2012
+ } finally {
2013
+ if (reloadClearSession) {
2014
+ try { await reloadClearSession.detach(); } catch (detachErr) { /* ignore */ }
2015
+ }
2016
+ }
2017
+ await page.evaluate(() => {
2018
+ localStorage.clear();
2019
+ sessionStorage.clear();
2020
+ indexedDB.databases().then(dbs => dbs.forEach(db => indexedDB.deleteDatabase(db.name)));
2021
+ });
2022
+ if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i + 1} for ${currentUrl}`));
2023
+ } catch (reloadClearErr) {
2024
+ console.warn(messageColors.warn(`[clear_sitedata before reload failed] ${currentUrl}: ${reloadClearErr.message}`));
2025
+ }
2026
+ }
2027
+ await page.reload({ waitUntil: 'domcontentloaded', timeout: timeout });
2028
+ await new Promise(resolve => setTimeout(resolve, delayMs));
2029
+ }
2030
+
2031
+ if (siteConfig.forcereload === true) {
2032
+ if (forceDebug) console.log(formatLogMessage('debug', `Forcing extra reload (cache disabled) for ${currentUrl}`));
2033
+ try {
2034
+ await page.setCacheEnabled(false);
2035
+ await page.reload({ waitUntil: 'domcontentloaded', timeout: timeout });
2036
+ await new Promise(resolve => setTimeout(resolve, delayMs));
2037
+ await page.setCacheEnabled(true);
2038
+ } catch (forceReloadErr) {
2039
+ console.warn(messageColors.warn(`[forcereload failed] ${currentUrl}: ${forceReloadErr.message}`));
2040
+ }
2041
+ }
2042
+
2043
+ if (dryRunMode) {
2044
+ // Get page title for dry run output
2045
+ let pageTitle = '';
2046
+ try {
2047
+ pageTitle = await page.title();
2048
+ } catch (titleErr) {
2049
+ if (forceDebug) {
2050
+ console.log(formatLogMessage('debug', `Failed to get page title for ${currentUrl}: ${titleErr.message}`));
2051
+ }
2052
+ }
2053
+
2054
+ // Get collected matches and enhance with searchstring results
2055
+ const dryRunMatches = matchedDomains.get('dryRunMatches') || [];
2056
+ const dryRunNetTools = matchedDomains.get('dryRunNetTools') || [];
2057
+ const dryRunSearchString = matchedDomains.get('dryRunSearchString') || new Map();
2058
+
2059
+ // Enhance matches with searchstring results
2060
+ const enhancedMatches = dryRunMatches.map(match => {
2061
+ const searchResult = dryRunSearchString.get(match.fullUrl);
2062
+ return {
2063
+ ...match,
2064
+ searchStringMatch: searchResult && searchResult.matched ? searchResult : null,
2065
+ searchStringChecked: match.needsSearchStringCheck
2066
+ };
2067
+ });
2068
+
2069
+ // Wait a moment for async nettools/searchstring operations to complete
2070
+ await new Promise(resolve => setTimeout(resolve, 3000)); // Increased for nettools operations
2071
+
2072
+ outputDryRunResults(currentUrl, enhancedMatches, dryRunNetTools, pageTitle);
2073
+
2074
+ return { url: currentUrl, rules: [], success: true, dryRun: true, matchCount: dryRunMatches.length + dryRunNetTools.length };
2075
+ } else {
2076
+ // Format rules using the output module
2077
+ const globalOptions = {
2078
+ localhostMode,
2079
+ localhostModeAlt,
2080
+ plainOutput,
2081
+ adblockRulesMode,
2082
+ dnsmasqMode,
2083
+ dnsmasqOldMode,
2084
+ unboundMode,
2085
+ privoxyMode,
2086
+ piholeMode
2087
+ };
2088
+ const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
2089
+
2090
+ return { url: currentUrl, rules: formattedRules, success: true };
2091
+ }
2092
+
2093
+ } catch (err) {
2094
+ // Enhanced error handling with rule preservation for partial matches
2095
+ if (err.message.includes('Protocol error') ||
2096
+ err.message.includes('Target closed') ||
2097
+ err.message.includes('Browser process was killed') ||
2098
+ err.message.includes('Browser protocol broken')) {
2099
+ console.error(formatLogMessage('error', `Critical browser error on ${currentUrl}: ${err.message}`));
2100
+ return {
2101
+ url: currentUrl,
2102
+ rules: [],
2103
+ success: false,
2104
+ needsImmediateRestart: true,
2105
+ error: err.message
2106
+ };
2107
+ }
2108
+
2109
+ // For other errors, preserve any matches we found before the error
2110
+ if (matchedDomains && (matchedDomains.size > 0 || (matchedDomains instanceof Map && matchedDomains.size > 0))) {
2111
+ const globalOptions = {
2112
+ localhostMode,
2113
+ localhostModeAlt,
2114
+ plainOutput,
2115
+ adblockRulesMode,
2116
+ dnsmasqMode,
2117
+ dnsmasqOldMode,
2118
+ unboundMode,
2119
+ privoxyMode,
2120
+ piholeMode
2121
+ };
2122
+ const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
2123
+ if (forceDebug) console.log(formatLogMessage('debug', `Saving ${formattedRules.length} rules despite page load failure`));
2124
+ return { url: currentUrl, rules: formattedRules, success: false, hasMatches: true };
2125
+ }
2126
+
2127
+ if (siteConfig.screenshot === true && page) {
2128
+ const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
2129
+ const safeUrl = currentUrl.replace(/https?:\/\//, '').replace(/[^a-zA-Z0-9]/g, '_');
2130
+ const filename = `${safeUrl}-${timestamp}.jpg`;
2131
+ try {
2132
+ await page.screenshot({ path: filename, type: 'jpeg', fullPage: true });
2133
+ if (forceDebug) console.log(formatLogMessage('debug', `Screenshot saved: ${filename}`));
2134
+ } catch (screenshotErr) {
2135
+ console.warn(messageColors.warn(`[screenshot failed] ${currentUrl}: ${screenshotErr.message}`));
2136
+ }
2137
+ }
2138
+ return { url: currentUrl, rules: [], success: false };
2139
+ } finally {
2140
+ // Guaranteed resource cleanup - this runs regardless of success or failure
2141
+
2142
+ if (cdpSession) {
2143
+ try {
2144
+ await cdpSession.detach();
2145
+ if (forceDebug) console.log(formatLogMessage('debug', `CDP session detached for ${currentUrl}`));
2146
+ } catch (cdpCleanupErr) {
2147
+ if (forceDebug) console.log(formatLogMessage('debug', `Failed to detach CDP session for ${currentUrl}: ${cdpCleanupErr.message}`));
2148
+ }
2149
+ }
2150
+ // Add small delay to allow cleanup to complete
2151
+ try {
2152
+ await new Promise(resolve => setTimeout(resolve, 100));
2153
+ } catch (delayErr) {
2154
+ // Ignore timeout errors
2155
+ }
2156
+
2157
+ if (page && !page.isClosed()) {
2158
+ // Clear page resources before closing
2159
+ try {
2160
+ await page.evaluate(() => {
2161
+ if (window.gc) window.gc(); // Force garbage collection if available
2162
+ });
2163
+ } catch (gcErr) { /* ignore */ }
2164
+
2165
+ try {
2166
+ await page.close();
2167
+ if (forceDebug) console.log(formatLogMessage('debug', `Page closed for ${currentUrl}`));
2168
+ } catch (pageCloseErr) {
2169
+ if (forceDebug) console.log(formatLogMessage('debug', `Failed to close page for ${currentUrl}: ${pageCloseErr.message}`));
2170
+ }
2171
+ }
2172
+ }
2173
+ }
2174
+
// Keep a reference to the concurrency limiter; every processUrl task created
// further down is wrapped with it.
const originalLimit = limit;

// Build one group per site entry so that browser restarts only ever happen
// between sites, never in the middle of one site's URL list.
const siteGroups = [];
let currentUrlCount = 0;

for (const siteEntry of sites) {
  // A site's `url` may be a single value or an array; normalise to an array.
  const { url } = siteEntry;
  const group = {
    config: siteEntry,
    urls: Array.isArray(url) ? url : [url]
  };
  siteGroups.push(group);
  currentUrlCount += group.urls.length;
}

// Announce the amount of work (and the restart cadence when the URL count
// is larger than the cleanup interval) unless output is silenced.
if (currentUrlCount > 0 && !silentMode) {
  const processingLabel = messageColors.processing('Processing');
  console.log(`\n${processingLabel} ${currentUrlCount} URLs across ${siteGroups.length} sites with concurrency ${MAX_CONCURRENT_SITES}...`);
  if (RESOURCE_CLEANUP_INTERVAL < currentUrlCount) {
    const restartLabel = messageColors.processing('Browser will restart every');
    console.log(`${restartLabel} ~${RESOURCE_CLEANUP_INTERVAL} URLs to free resources`);
  }
}
2197
+
// Per-URL result objects collected across every site, in processing order.
const results = [];
let processedUrlCount = 0;
// URLs handled by the current browser instance since it was (re)created.
let urlsSinceLastCleanup = 0;

// Process sites sequentially. URLs inside a site run concurrently through the
// limiter; the browser is only ever replaced between two sites.
for (let siteIndex = 0; siteIndex < siteGroups.length; siteIndex++) {
  const siteGroup = siteGroups[siteIndex];
  const siteUrlCount = siteGroup.urls.length;
  const isNotLastSite = siteIndex < siteGroups.length - 1;

  // Ask the health monitor whether the current browser should be replaced
  // before this site starts.
  const healthCheck = await monitorBrowserHealth(browser, {}, {
    siteIndex,
    totalSites: siteGroups.length,
    urlsSinceCleanup: urlsSinceLastCleanup,
    cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
    forceDebug,
    silentMode
  });

  // Independently of the monitor: two or more failures among the last three
  // results, after more than five URLs on this browser, also trigger a restart.
  const recentResults = results.slice(-3);
  const hasRecentFailures = recentResults.filter(r => !r.success).length >= 2;
  const shouldRestartFromFailures = hasRecentFailures && urlsSinceLastCleanup > 5;

  // Would running this whole site push the browser past the cleanup interval?
  const wouldExceedLimit = urlsSinceLastCleanup + siteUrlCount >= RESOURCE_CLEANUP_INTERVAL;

  // Planned restart: only when the browser has actually been used and there
  // is still another site after this one.
  if ((wouldExceedLimit || healthCheck.shouldRestart || shouldRestartFromFailures) && urlsSinceLastCleanup > 0 && isNotLastSite) {

    let restartReason = 'Unknown';
    if (healthCheck.shouldRestart) {
      restartReason = healthCheck.reason;
    } else if (shouldRestartFromFailures) {
      restartReason = 'Multiple recent failures detected';
    } else if (wouldExceedLimit) {
      restartReason = `Processed ${urlsSinceLastCleanup} URLs`;
    }

    if (!silentMode) {
      console.log(`\n${messageColors.fileOp('🔄 Browser restart triggered:')} ${restartReason}`);
    }

    // Teardown is best-effort: a failure here is only reported in debug mode
    // and never blocks the relaunch below.
    try {
      await handleBrowserExit(browser, {
        forceDebug,
        timeout: 10000,
        exitOnFailure: false,
        cleanTempFiles: true,
        comprehensiveCleanup: removeTempFiles // Respect --remove-tempfiles during restarts
      });

      // Clean up the specific user data directory
      if (userDataDir && fs.existsSync(userDataDir)) {
        fs.rmSync(userDataDir, { recursive: true, force: true });
        if (forceDebug) console.log(formatLogMessage('debug', `Cleaned user data dir: ${userDataDir}`));
      }

      // Additional temp-file cleanup when --remove-tempfiles is active
      if (removeTempFiles) {
        await cleanupChromeTempFiles({
          includeSnapTemp: true,
          forceDebug,
          comprehensive: true
        });
      }

    } catch (browserCloseErr) {
      if (forceDebug) console.log(formatLogMessage('debug', `Browser cleanup warning: ${browserCloseErr.message}`));
    }

    // Create new browser for next batch
    browser = await createBrowser();
    if (forceDebug) console.log(formatLogMessage('debug', `New browser instance created for site ${siteIndex + 1}`));

    // Reset cleanup counter and add delay
    urlsSinceLastCleanup = 0;
    await new Promise(resolve => setTimeout(resolve, 1000));
  }

  if (forceDebug) {
    console.log(formatLogMessage('debug', `Processing site ${siteIndex + 1}/${siteGroups.length}: ${siteUrlCount} URL(s) (total processed: ${processedUrlCount})`));
  }

  // Create tasks with the current browser instance and wait for all of them.
  const siteTasks = siteGroup.urls.map(url => originalLimit(() => processUrl(url, siteGroup.config, browser)));
  const siteResults = await Promise.all(siteTasks);

  // processUrl flags critical browser errors with needsImmediateRestart.
  const needsImmediateRestart = siteResults.some(r => r.needsImmediateRestart);

  results.push(...siteResults);

  processedUrlCount += siteUrlCount;
  urlsSinceLastCleanup += siteUrlCount;

  // Emergency restart after a critical browser error, unless no site is left.
  if (needsImmediateRestart && isNotLastSite) {
    if (!silentMode) {
      console.log(`\n${messageColors.fileOp('🔄 Emergency browser restart:')} Critical browser errors detected`);
    }

    // Teardown of the broken browser lives in its own try block: if it
    // throws, the relaunch below must still run, otherwise every remaining
    // site would be handed the crashed instance.
    try {
      await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true, comprehensiveCleanup: removeTempFiles });
      // Additional cleanup after emergency restart
      if (removeTempFiles) {
        await cleanupChromeTempFiles({
          includeSnapTemp: true,
          forceDebug,
          comprehensive: true
        });
      }
    } catch (emergencyCleanupErr) {
      if (forceDebug) console.log(formatLogMessage('debug', `Browser cleanup warning: ${emergencyCleanupErr.message}`));
    }

    // Relaunch. A launch failure stays non-fatal, as before: it is reported
    // in debug mode and the loop continues with the existing `browser` value.
    try {
      browser = await createBrowser();
      urlsSinceLastCleanup = 0; // Reset counter
      await new Promise(resolve => setTimeout(resolve, 2000)); // Give browser time to stabilize
    } catch (emergencyRestartErr) {
      if (forceDebug) console.log(formatLogMessage('debug', `Emergency restart failed: ${emergencyRestartErr.message}`));
    }
  }
}
2321
+
// In dry-run mode, persist the collected report lines when an output file
// was requested and there is something to write.
if (dryRunMode && outputFile && dryRunOutput.length > 0) {
  try {
    fs.writeFileSync(outputFile, dryRunOutput.join('\n'));
    if (!silentMode) {
      const savedLabel = messageColors.fileOp('📄 Dry run results saved to:');
      console.log(`${savedLabel} ${outputFile}`);
    }
  } catch (writeErr) {
    // A failed write is reported but does not abort the run.
    console.error(`❌ Failed to write dry run output to ${outputFile}: ${writeErr.message}`);
  }
}
2334
+
// Summary of the output stage; read below for the debug report and the
// final console summary.
let outputResult;

if (dryRunMode) {
  // Nothing is handed to the output module in dry-run mode, so build a
  // stand-in result from the matchCount values reported per URL.
  const dryRunMatchTotal = results.reduce((sum, r) => sum + (r.matchCount || 0), 0);
  outputResult = {
    success: true,
    successfulPageLoads: results.filter(r => r.success).length,
    totalRules: dryRunMatchTotal
  };
} else {
  // Delegate all rule output to the output module.
  outputResult = handleOutput(results, {
    outputFile,
    appendMode,
    compareFile,
    forceDebug,
    showTitles,
    removeDupes: removeDupes && outputFile,
    silentMode,
    dumpUrls,
    adblockRulesLogFile,
    ignoreDomains
  });

  if (!outputResult.success) {
    console.error(messageColors.error('❌ Failed to write output files'));
    process.exit(1);
  }
}

// Use the success count from the output stage
siteCounter = outputResult.successfulPageLoads;

// Pages that either loaded successfully or failed but still produced matches.
const pagesWithMatches = results.filter(({ success, hasMatches }) => success || hasMatches).length;
// NOTE(review): totalMatches is not read in the remainder of this script as
// visible here — confirm nothing else depends on it before removing.
const totalMatches = results.reduce((sum, { rules }) => sum + (rules ? rules.length : 0), 0);
2375
+
// Debug only: report which output format is active and how many rules the
// output stage produced.
if (forceDebug) {
  // NOTE(review): the keys below (adblockRules, dnsmasq, ...) differ from the
  // *Mode shorthand keys passed to formatRules elsewhere in this file —
  // confirm which spelling getFormatDescription expects.
  const formatDescription = getFormatDescription({
    localhostMode,
    localhostModeAlt,
    plainOutput,
    adblockRules: adblockRulesMode,
    dnsmasq: dnsmasqMode,
    dnsmasqOld: dnsmasqOldMode,
    unbound: unboundMode,
    privoxy: privoxyMode,
    pihole: piholeMode
  });
  console.log(formatLogMessage('debug', `Output format: ${formatDescription}`));

  const { totalRules, successfulPageLoads } = outputResult;
  console.log(formatLogMessage('debug', `Generated ${totalRules} rules from ${successfulPageLoads} successful page loads`));
}
2392
+
// Compress log files if --compress-logs is enabled (only meaningful when
// URL dumping produced log files and this is not a dry run).
if (compressLogs && dumpUrls && !dryRunMode) {
  // Keep only the log files that are configured and actually exist on disk.
  const filesToCompress = [debugLogFile, matchedUrlsLogFile, adblockRulesLogFile]
    .filter(logFile => logFile && fs.existsSync(logFile));

  if (filesToCompress.length > 0) {
    if (!silentMode) console.log(`\n${messageColors.compression('Compressing')} ${filesToCompress.length} log file(s)...`);
    try {
      // Second argument `true` is passed through unchanged from the original
      // call (described there as "with original file deletion").
      // Named compressionResults so it does not shadow the scan `results`.
      const compressionResults = await compressMultipleFiles(filesToCompress, true);

      if (!silentMode) {
        // Report each compressed file. No stat call is made here: the size
        // was never printed, and a throwing stat would have aborted the
        // report and mislabelled a successful compression as a failure.
        compressionResults.successful.forEach(({ original, compressed }) => {
          console.log(messageColors.success('✅ Compressed:') + ` ${path.basename(original)} → ${path.basename(compressed)}`);
        });
        // Report any compression failures
        compressionResults.failed.forEach(({ path: filePath, error }) => {
          console.warn(messageColors.warn(`⚠ Failed to compress ${path.basename(filePath)}: ${error}`));
        });
      }
    } catch (compressionErr) {
      console.warn(formatLogMessage('warn', `Log compression failed: ${compressionErr.message}`));
    }
  }
}
2425
+
// Final shutdown of the browser through the browserexit module.
if (forceDebug) {
  console.log(formatLogMessage('debug', `Starting comprehensive browser cleanup...`));
}

const cleanupResult = await handleBrowserExit(browser, {
  forceDebug,
  timeout: 10000,
  exitOnFailure: true,
  cleanTempFiles: true,
  comprehensiveCleanup: removeTempFiles, // Use --remove-tempfiles flag
  userDataDir: browser._nwssUserDataDir,
  verbose: !silentMode && removeTempFiles // Verbose only when removing temp files and not silent
});

// Debug only: summarise what the cleanup achieved and list its errors.
if (forceDebug) {
  const { success, browserClosed, tempFilesCleanedCount, userDataCleaned, errors } = cleanupResult;
  console.log(formatLogMessage('debug', `Final cleanup results: ${success ? 'success' : 'failed'}`));
  console.log(formatLogMessage('debug', `Browser closed: ${browserClosed}, Temp files cleaned: ${tempFilesCleanedCount || 0}, User data cleaned: ${userDataCleaned}`));

  for (const cleanupError of errors) {
    console.log(formatLogMessage('debug', `Cleanup error: ${cleanupError}`));
  }
}

// One more temp-file sweep to catch anything left behind. Unlike the restart
// paths, this call is not gated on removeTempFiles.
if (forceDebug) {
  console.log(formatLogMessage('debug', 'Performing final aggressive temp file cleanup...'));
}
await cleanupChromeTempFiles({
  includeSnapTemp: true,
  forceDebug,
  comprehensive: true
});
// Give filesystem time to sync
await new Promise(resolve => setTimeout(resolve, 1000));
2457
+
// Work out the elapsed wall-clock time since startTime as h/m/s.
if (forceDebug) {
  console.log(formatLogMessage('debug', `Calculating timing statistics...`));
}
const endTime = Date.now();
const durationMs = endTime - startTime;
const totalSeconds = Math.floor(durationMs / 1000);
const hours = Math.floor(totalSeconds / 3600);
const minutes = Math.floor((totalSeconds % 3600) / 60);
const seconds = totalSeconds % 60;

// Final summary: how many URLs succeeded, how long it took, and how many
// rules (or dry-run matches) were found.
if (!silentMode) {
  const completionLabel = messageColors.success(dryRunMode ? 'Dry run completed.' : 'Scan completed.');
  const elapsed = messageColors.timing(`${hours}h ${minutes}m ${seconds}s`);
  const loadedCount = outputResult.successfulPageLoads;

  // When some pages failed to load but still yielded matches, mention the
  // number of pages with matches separately.
  const summary = pagesWithMatches > loadedCount
    ? `${loadedCount} of ${totalUrls} URLs loaded successfully, ${pagesWithMatches} had matches in ${elapsed}`
    : `${loadedCount} of ${totalUrls} URLs processed successfully in ${elapsed}`;
  console.log(`\n${completionLabel} ${summary}`);

  if (outputResult.totalRules > 0) {
    if (dryRunMode) {
      console.log(`${messageColors.success('Found')} ${outputResult.totalRules} total matches across all URLs`);
    } else {
      console.log(`${messageColors.success('Generated')} ${outputResult.totalRules} unique rules`);
    }
  }
}

// Clean process termination
if (forceDebug) {
  console.log(formatLogMessage('debug', `About to exit process...`));
}
process.exit(0);
2487
+
2488
+ })();