@fanboynz/network-scanner 1.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +33 -0
- package/JSONMANUAL.md +121 -0
- package/LICENSE +674 -0
- package/README.md +357 -0
- package/config.json +74 -0
- package/lib/browserexit.js +522 -0
- package/lib/browserhealth.js +308 -0
- package/lib/cloudflare.js +660 -0
- package/lib/colorize.js +168 -0
- package/lib/compare.js +159 -0
- package/lib/compress.js +129 -0
- package/lib/fingerprint.js +613 -0
- package/lib/flowproxy.js +274 -0
- package/lib/grep.js +348 -0
- package/lib/ignore_similar.js +237 -0
- package/lib/nettools.js +1200 -0
- package/lib/output.js +633 -0
- package/lib/redirect.js +384 -0
- package/lib/searchstring.js +561 -0
- package/lib/validate_rules.js +1107 -0
- package/nwss.1 +824 -0
- package/nwss.js +2488 -0
- package/package.json +45 -0
- package/regex-samples.md +27 -0
- package/scanner-script-org.js +588 -0
package/nwss.js
ADDED
|
@@ -0,0 +1,2488 @@
|
|
|
1
|
+
// === Network scanner script (nwss.js) v1.0.35 ===
|
|
2
|
+
|
|
3
|
+
// puppeteer for browser automation, fs for file system operations, psl for domain parsing.
|
|
4
|
+
// const pLimit = require('p-limit'); // Will be dynamically imported
|
|
5
|
+
const puppeteer = require('puppeteer');
|
|
6
|
+
const fs = require('fs');
|
|
7
|
+
const psl = require('psl');
|
|
8
|
+
const path = require('path');
|
|
9
|
+
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
|
|
10
|
+
const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
|
|
11
|
+
const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
|
|
12
|
+
const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
|
|
13
|
+
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
|
|
14
|
+
// Rule validation
|
|
15
|
+
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
|
|
16
|
+
// CF Bypass
|
|
17
|
+
const { handleCloudflareProtection } = require('./lib/cloudflare');
|
|
18
|
+
// FP Bypass
|
|
19
|
+
const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy');
|
|
20
|
+
// ignore_similar rules
|
|
21
|
+
const { shouldIgnoreSimilarDomain } = require('./lib/ignore_similar');
|
|
22
|
+
// Graceful exit
|
|
23
|
+
const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit');
|
|
24
|
+
// Whois & Dig
|
|
25
|
+
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability } = require('./lib/nettools');
|
|
26
|
+
// File compare
|
|
27
|
+
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
|
|
28
|
+
// Colorize various text when used
|
|
29
|
+
const { colorize, colors, messageColors, tags, formatLogMessage } = require('./lib/colorize');
|
|
30
|
+
// Enhanced redirect handling
|
|
31
|
+
const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
|
|
32
|
+
// Ensure web browser is working correctly
|
|
33
|
+
const { monitorBrowserHealth, isBrowserHealthy } = require('./lib/browserhealth');
|
|
34
|
+
|
|
35
|
+
// --- Script Configuration & Constants ---
|
|
36
|
+
const VERSION = '1.0.35'; // Script version
|
|
37
|
+
const MAX_CONCURRENT_SITES = 5;
|
|
38
|
+
const RESOURCE_CLEANUP_INTERVAL = 80; // Close browser and restart every N sites to free resources
|
|
39
|
+
|
|
40
|
+
// get startTime
|
|
41
|
+
const startTime = Date.now();
|
|
42
|
+
|
|
43
|
+
// --- Command-Line Argument Parsing ---
|
|
44
|
+
const args = process.argv.slice(2);
|
|
45
|
+
|
|
46
|
+
if (args.length === 0) {
|
|
47
|
+
args.push('--help');
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
const headfulMode = args.includes('--headful');
|
|
51
|
+
const SOURCES_FOLDER = 'sources';
|
|
52
|
+
|
|
53
|
+
let outputFile = null;
|
|
54
|
+
const outputIndex = args.findIndex(arg => arg === '--output' || arg === '-o');
|
|
55
|
+
if (outputIndex !== -1 && args[outputIndex + 1]) {
|
|
56
|
+
outputFile = args[outputIndex + 1];
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
const appendMode = args.includes('--append');
|
|
60
|
+
|
|
61
|
+
let compareFile = null;
|
|
62
|
+
const compareIndex = args.findIndex(arg => arg === '--compare');
|
|
63
|
+
if (compareIndex !== -1 && args[compareIndex + 1]) {
|
|
64
|
+
compareFile = args[compareIndex + 1];
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
const forceVerbose = args.includes('--verbose');
|
|
69
|
+
const forceDebug = args.includes('--debug');
|
|
70
|
+
const silentMode = args.includes('--silent');
|
|
71
|
+
const showTitles = args.includes('--titles');
|
|
72
|
+
const dumpUrls = args.includes('--dumpurls');
|
|
73
|
+
const subDomainsMode = args.includes('--sub-domains');
|
|
74
|
+
const localhostMode = args.includes('--localhost');
|
|
75
|
+
const localhostModeAlt = args.includes('--localhost-0.0.0.0');
|
|
76
|
+
const disableInteract = args.includes('--no-interact');
|
|
77
|
+
const plainOutput = args.includes('--plain');
|
|
78
|
+
const enableCDP = args.includes('--cdp');
|
|
79
|
+
// Declared with `let`: the --dnsmasq validation further down resets this flag to false
// when it conflicts with another output mode (reassigning a `const` throws TypeError).
let dnsmasqMode = args.includes('--dnsmasq');
|
|
80
|
+
// Declared with `let`: the --dnsmasq-old validation further down resets this flag to false
// when it conflicts with another output mode (reassigning a `const` throws TypeError).
let dnsmasqOldMode = args.includes('--dnsmasq-old');
|
|
81
|
+
// Declared with `let`: the --unbound validation further down resets this flag to false
// when it conflicts with another output mode (reassigning a `const` throws TypeError).
let unboundMode = args.includes('--unbound');
|
|
82
|
+
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
|
|
83
|
+
// Declared with `let`: the --privoxy validation further down resets this flag to false
// when it conflicts with another output mode (reassigning a `const` throws TypeError).
let privoxyMode = args.includes('--privoxy');
|
|
84
|
+
// Declared with `let`: the --pihole validation further down resets this flag to false
// when it conflicts with another output mode (reassigning a `const` throws TypeError).
let piholeMode = args.includes('--pihole');
|
|
85
|
+
const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
|
|
86
|
+
const dryRunMode = args.includes('--dry-run');
|
|
87
|
+
const compressLogs = args.includes('--compress-logs');
|
|
88
|
+
const removeTempFiles = args.includes('--remove-tempfiles');
|
|
89
|
+
const validateConfig = args.includes('--validate-config');
|
|
90
|
+
// Declared with `let`: when --validate-rules is followed by a file argument, the parsing
// below assigns `validateRules = true` (reassigning a `const` throws TypeError).
let validateRules = args.includes('--validate-rules');
|
|
91
|
+
const testValidation = args.includes('--test-validation');
|
|
92
|
+
let cleanRules = args.includes('--clean-rules');
|
|
93
|
+
|
|
94
|
+
let validateRulesFile = null;
|
|
95
|
+
const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules');
|
|
96
|
+
if (validateRulesIndex !== -1 && args[validateRulesIndex + 1] && !args[validateRulesIndex + 1].startsWith('--')) {
|
|
97
|
+
validateRulesFile = args[validateRulesIndex + 1];
|
|
98
|
+
validateRules = true; // Override the boolean if file specified
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
let cleanRulesFile = null;
|
|
102
|
+
const cleanRulesIndex = args.findIndex(arg => arg === '--clean-rules');
|
|
103
|
+
if (cleanRulesIndex !== -1 && args[cleanRulesIndex + 1] && !args[cleanRulesIndex + 1].startsWith('--')) {
|
|
104
|
+
cleanRulesFile = args[cleanRulesIndex + 1];
|
|
105
|
+
cleanRules = true; // Override the boolean if file specified
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const enableColors = args.includes('--color') || args.includes('--colour');
|
|
109
|
+
let adblockRulesMode = args.includes('--adblock-rules');
|
|
110
|
+
|
|
111
|
+
// Validate --adblock-rules usage - ignore if used incorrectly instead of erroring
|
|
112
|
+
if (adblockRulesMode) {
|
|
113
|
+
if (!outputFile) {
|
|
114
|
+
if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: requires --output (-o) to specify an output file`));
|
|
115
|
+
adblockRulesMode = false;
|
|
116
|
+
} else if (localhostMode || localhostModeAlt || plainOutput || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
|
|
117
|
+
if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: incompatible with localhost/plain output modes`));
|
|
118
|
+
adblockRulesMode = false;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Validate --dnsmasq usage
|
|
123
|
+
if (dnsmasqMode) {
|
|
124
|
+
if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
|
|
125
|
+
// Report the flag that is actually being ignored in this branch (--dnsmasq, not --dnsmasq-old).
if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq ignored: incompatible with localhost/plain/adblock-rules/dnsmasq-old output modes`));
|
|
126
|
+
dnsmasqMode = false;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// Validate --dnsmasq-old usage
|
|
131
|
+
if (dnsmasqOldMode) {
|
|
132
|
+
if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || unboundMode || privoxyMode || piholeMode) {
|
|
133
|
+
if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
|
|
134
|
+
dnsmasqOldMode = false;
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Validate --unbound usage
|
|
139
|
+
if (unboundMode) {
|
|
140
|
+
if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || privoxyMode || piholeMode) {
|
|
141
|
+
if (forceDebug) console.log(formatLogMessage('debug', `--unbound ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
|
|
142
|
+
unboundMode = false;
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// Validate --privoxy usage
|
|
147
|
+
if (privoxyMode) {
|
|
148
|
+
if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || piholeMode) {
|
|
149
|
+
if (forceDebug) console.log(formatLogMessage('debug', `--privoxy ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound output modes`));
|
|
150
|
+
privoxyMode = false;
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// Validate --pihole usage
|
|
155
|
+
if (piholeMode) {
|
|
156
|
+
if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode) {
|
|
157
|
+
if (forceDebug) console.log(formatLogMessage('debug', `--pihole ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound/privoxy output modes`));
|
|
158
|
+
piholeMode = false;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Validate --compress-logs usage
|
|
163
|
+
if (compressLogs && !dumpUrls) {
|
|
164
|
+
console.error(`❌ --compress-logs can only be used with --dumpurls`);
|
|
165
|
+
process.exit(1);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// Validate --append usage
|
|
169
|
+
if (appendMode && !outputFile) {
|
|
170
|
+
console.error(`❌ --append requires --output (-o) to specify an output file`);
|
|
171
|
+
process.exit(1);
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
if (appendMode && (compareFile || dryRunMode)) {
|
|
175
|
+
console.error(`❌ --append cannot be used with --compare or --dry-run`);
|
|
176
|
+
process.exit(1);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// Validate --dry-run usage
|
|
180
|
+
if (dryRunMode) {
|
|
181
|
+
if (compressLogs || compareFile) {
|
|
182
|
+
console.error(`❌ --dry-run cannot be used with --compress-logs or --compare`);
|
|
183
|
+
process.exit(1);
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
// Validate --compare usage
|
|
188
|
+
if (compareFile && !outputFile) {
|
|
189
|
+
console.error(`❌ --compare requires --output (-o) to specify an output file`);
|
|
190
|
+
process.exit(1);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
if (compareFile && !fs.existsSync(compareFile)) {
|
|
194
|
+
console.error(`❌ Compare file not found: ${compareFile}`);
|
|
195
|
+
process.exit(1);
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
if (args.includes('--version')) {
|
|
199
|
+
console.log(`nwss.js version ${VERSION}`);
|
|
200
|
+
process.exit(0);
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Handle validation-only operations before main help
|
|
204
|
+
if (testValidation) {
|
|
205
|
+
console.log(`\n${messageColors.processing('Running domain validation tests...')}`);
|
|
206
|
+
const testResult = testDomainValidation();
|
|
207
|
+
if (testResult) {
|
|
208
|
+
console.log(`${messageColors.success('✅ All validation tests passed!')}`);
|
|
209
|
+
process.exit(0);
|
|
210
|
+
} else {
|
|
211
|
+
console.log(`${messageColors.error('❌ Some validation tests failed!')}`);
|
|
212
|
+
process.exit(1);
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// --validate-config: validate the configuration file, print a summary and exit.
//
// The config file is read and parsed locally in this block. The script-level
// `config` and `sites` bindings are declared further down the file, so at this
// point they are still in their temporal dead zone and reading them throws a
// ReferenceError (which previously made every --validate-config run fail).
if (validateConfig) {
  console.log(`\n${messageColors.processing('Validating configuration file...')}`);
  try {
    // Resolve the config path the same way the main loader does (--custom-json or config.json).
    const customJsonIndex = args.indexOf('--custom-json');
    const configFileToValidate = (customJsonIndex !== -1 && args[customJsonIndex + 1])
      ? args[customJsonIndex + 1]
      : 'config.json';

    if (!fs.existsSync(configFileToValidate)) {
      throw new Error(`Config file not found: ${configFileToValidate}`);
    }

    // JSON.parse errors are reported through the catch below.
    const configToValidate = JSON.parse(fs.readFileSync(configFileToValidate, 'utf8'));
    const sitesToValidate = Array.isArray(configToValidate.sites) ? configToValidate.sites : [];

    const validation = validateFullConfig(configToValidate, { forceDebug, silentMode });

    // Validate referrer_headers format (only the object form carries a "mode")
    const validModes = ['random_search', 'social_media', 'direct_navigation', 'custom'];
    for (const site of sitesToValidate) {
      if (site && site.referrer_headers && typeof site.referrer_headers === 'object' && !Array.isArray(site.referrer_headers)) {
        if (site.referrer_headers.mode && !validModes.includes(site.referrer_headers.mode)) {
          console.warn(`⚠ Invalid referrer_headers mode: ${site.referrer_headers.mode}. Valid modes: ${validModes.join(', ')}`);
        }
      }
    }

    if (validation.isValid) {
      console.log(`${messageColors.success('✅ Configuration is valid!')}`);
      console.log(`${messageColors.info('Summary:')} ${validation.summary.validSites}/${validation.summary.totalSites} sites valid`);
      if (validation.summary.sitesWithWarnings > 0) {
        console.log(`${messageColors.warn('⚠ Warnings:')} ${validation.summary.sitesWithWarnings} sites have warnings`);
      }
      process.exit(0);
    } else {
      console.log(`${messageColors.error('❌ Configuration validation failed!')}`);
      console.log(`${messageColors.error('Errors:')} ${validation.globalErrors.length} global, ${validation.summary.sitesWithErrors} site-specific`);
      process.exit(1);
    }
  } catch (validationErr) {
    // Covers a missing file, invalid JSON and errors thrown by the validator itself.
    console.error(`❌ Validation failed: ${validationErr.message}`);
    process.exit(1);
  }
}
|
|
248
|
+
|
|
249
|
+
if (validateRules || validateRulesFile) {
|
|
250
|
+
const filesToValidate = validateRulesFile ? [validateRulesFile] : [outputFile, compareFile].filter(Boolean);
|
|
251
|
+
|
|
252
|
+
if (filesToValidate.length === 0) {
|
|
253
|
+
console.error('❌ --validate-rules requires either a file argument or --output/--compare files to be specified');
|
|
254
|
+
process.exit(1);
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
console.log(`\n${messageColors.processing('Validating rule files...')}`);
|
|
258
|
+
let overallValid = true;
|
|
259
|
+
|
|
260
|
+
for (const file of filesToValidate) {
|
|
261
|
+
console.log(`\n${messageColors.info('Validating:')} ${file}`);
|
|
262
|
+
try {
|
|
263
|
+
const validation = validateRulesetFile(file, { forceDebug, silentMode, maxErrors: 20 });
|
|
264
|
+
|
|
265
|
+
if (validation.isValid) {
|
|
266
|
+
console.log(`${messageColors.success('✅ Valid:')} ${validation.stats.valid} rules, ${validation.stats.comments} comments`);
|
|
267
|
+
if (validation.duplicates.length > 0) {
|
|
268
|
+
console.log(`${messageColors.warn('⚠ Duplicates:')} ${validation.duplicates.length} duplicate rules found`);
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
if (Object.keys(validation.stats.formats).length > 0) {
|
|
272
|
+
console.log(`${messageColors.info('Formats:')} ${Object.entries(validation.stats.formats).map(([f, c]) => `${f}(${c})`).join(', ')}`);
|
|
273
|
+
}
|
|
274
|
+
} else {
|
|
275
|
+
console.log(`${messageColors.error('❌ Invalid:')} ${validation.stats.invalid} invalid rules out of ${validation.stats.total} total`);
|
|
276
|
+
overallValid = false;
|
|
277
|
+
}
|
|
278
|
+
} catch (validationErr) {
|
|
279
|
+
console.error(`❌ Failed to validate ${file}: ${validationErr.message}`);
|
|
280
|
+
overallValid = false;
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
if (overallValid) {
|
|
285
|
+
console.log(`\n${messageColors.success('✅ All rule files are valid!')}`);
|
|
286
|
+
process.exit(0);
|
|
287
|
+
} else {
|
|
288
|
+
console.log(`\n${messageColors.error('❌ Some rule files have validation errors!')}`);
|
|
289
|
+
process.exit(1);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
if (args.includes('--help') || args.includes('-h')) {
|
|
294
|
+
console.log(`Usage: node nwss.js [options]
|
|
295
|
+
|
|
296
|
+
Options:
|
|
297
|
+
--color, --colour Enable colored console output for status messages
|
|
298
|
+
-o, --output <file> Output file for rules. If omitted, prints to console
|
|
299
|
+
--compare <file> Remove rules that already exist in this file before output
|
|
300
|
+
--append Append new rules to output file instead of overwriting (requires -o)
|
|
301
|
+
|
|
302
|
+
Output Format Options:
|
|
303
|
+
--localhost Output as 127.0.0.1 domain.com
|
|
304
|
+
--localhost-0.0.0.0 Output as 0.0.0.0 domain.com
|
|
305
|
+
--plain Output just domains (no adblock formatting)
|
|
306
|
+
--dnsmasq Output as local=/domain.com/ (dnsmasq format)
|
|
307
|
+
--dnsmasq-old Output as server=/domain.com/ (dnsmasq old format)
|
|
308
|
+
--unbound Output as local-zone: "domain.com." always_null (unbound format)
|
|
309
|
+
--privoxy Output as { +block } .domain.com (Privoxy format)
|
|
310
|
+
--pihole Output as (^|\\.)domain\\.com$ (Pi-hole regex format)
|
|
311
|
+
--adblock-rules Generate adblock filter rules with resource type modifiers (requires -o)
|
|
312
|
+
|
|
313
|
+
General Options:
|
|
314
|
+
--verbose Force verbose mode globally
|
|
315
|
+
--debug Force debug mode globally
|
|
316
|
+
--silent Suppress normal console logs
|
|
317
|
+
--titles Add ! <url> title before each site's group
|
|
318
|
+
--dumpurls Dump matched URLs into matched_urls.log
|
|
319
|
+
--dry-run Console output only: show matching regex, titles, whois/dig/searchstring results, and adblock rules
|
|
320
|
+
--compress-logs Compress log files with gzip (requires --dumpurls)
|
|
321
|
+
--sub-domains Output full subdomains instead of collapsing to root
|
|
322
|
+
--no-interact Disable page interactions globally
|
|
323
|
+
--custom-json <file> Use a custom config JSON file instead of config.json
|
|
324
|
+
--headful Launch browser with GUI (not headless)
|
|
325
|
+
--cdp Enable Chrome DevTools Protocol logging (now per-page if enabled)
|
|
326
|
+
--remove-dupes Remove duplicate domains from output (only with -o)
|
|
327
|
+
--eval-on-doc Globally enable evaluateOnNewDocument() for Fetch/XHR interception
|
|
328
|
+
--help, -h Show this help menu
|
|
329
|
+
--version Show script version
|
|
330
|
+
--remove-tempfiles Remove Chrome/Puppeteer temporary files before exit
|
|
331
|
+
|
|
332
|
+
Validation Options:
|
|
333
|
+
--validate-config Validate config.json file and exit
|
|
334
|
+
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
|
|
335
|
+
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
|
|
336
|
+
--test-validation Run domain validation tests and exit
|
|
337
|
+
|
|
338
|
+
Global config.json options:
|
|
339
|
+
ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
|
|
340
|
+
blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
|
|
341
|
+
whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
|
|
342
|
+
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
|
|
343
|
+
ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
|
|
344
|
+
ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
Per-site config.json options:
|
|
348
|
+
url: "site" or ["site1", "site2"] Single URL or list of URLs
|
|
349
|
+
filterRegex: "regex" or ["regex1", "regex2"] Patterns to match requests
|
|
350
|
+
|
|
351
|
+
Redirect Handling Options:
|
|
352
|
+
follow_redirects: true/false Follow redirects to new domains (default: true)
|
|
353
|
+
max_redirects: 10 Maximum number of redirects to follow (default: 10)
|
|
354
|
+
js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
|
|
355
|
+
detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
|
|
356
|
+
redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
|
|
357
|
+
|
|
358
|
+
comments: "text" or ["text1", "text2"] Documentation/notes - ignored by script
|
|
359
|
+
searchstring: "text" or ["text1", "text2"] Text to search in response content (requires filterRegex match)
|
|
360
|
+
ignore_similar: true/false Override global ignore_similar setting for this site
|
|
361
|
+
ignore_similar_threshold: 80 Override global similarity threshold for this site
|
|
362
|
+
ignore_similar_ignored_domains: true/false Override global ignore_similar_ignored_domains for this site
|
|
363
|
+
searchstring_and: "text" or ["text1", "text2"] Text to search with AND logic - ALL terms must be present (requires filterRegex match)
|
|
364
|
+
curl: true/false Use curl to download content for analysis (default: false)
|
|
365
|
+
Note: curl respects filterRegex but ignores resourceTypes filtering
|
|
366
|
+
grep: true/false Use grep instead of JavaScript for pattern matching (default: false)
|
|
367
|
+
Note: requires curl=true, uses system grep command for faster searches
|
|
368
|
+
blocked: ["regex"] Regex patterns to block requests
|
|
369
|
+
css_blocked: ["#selector", ".class"] CSS selectors to hide elements
|
|
370
|
+
resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
|
|
371
|
+
interact: true/false Simulate mouse movements/clicks
|
|
372
|
+
isBrave: true/false Spoof Brave browser detection
|
|
373
|
+
userAgent: "chrome"|"firefox"|"safari" Custom desktop User-Agent
|
|
374
|
+
delay: <milliseconds> Delay after load (default: 4000)
|
|
375
|
+
reload: <number> Reload page n times after load (default: 1)
|
|
376
|
+
forcereload: true/false Force an additional reload after reloads
|
|
377
|
+
clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
|
|
378
|
+
subDomains: 1/0 Output full subdomains (default: 0)
|
|
379
|
+
localhost: true/false Force localhost output (127.0.0.1)
|
|
380
|
+
localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
|
|
381
|
+
dnsmasq: true/false Force dnsmasq output (local=/domain.com/)
|
|
382
|
+
dnsmasq_old: true/false Force dnsmasq old output (server=/domain.com/)
|
|
383
|
+
unbound: true/false Force unbound output (local-zone: "domain.com." always_null)
|
|
384
|
+
privoxy: true/false Force Privoxy output ({ +block } .domain.com)
|
|
385
|
+
pihole: true/false Force Pi-hole regex output ((^|\\.)domain\\.com$)
|
|
386
|
+
source: true/false Save page source HTML after load
|
|
387
|
+
firstParty: true/false Allow first-party matches (default: false)
|
|
388
|
+
thirdParty: true/false Allow third-party matches (default: true)
|
|
389
|
+
screenshot: true/false Capture screenshot on load failure
|
|
390
|
+
headful: true/false Launch browser with GUI for this site
|
|
391
|
+
fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random"
|
|
392
|
+
adblock_rules: true/false Generate adblock filter rules with resource types for this site
|
|
393
|
+
even_blocked: true/false Add matching rules even if requests are blocked (default: false)
|
|
394
|
+
|
|
395
|
+
referrer_headers: "url" or ["url1", "url2"] Set referrer header for realistic traffic sources
|
|
396
|
+
custom_headers: {"Header": "value"} Add custom HTTP headers to requests
|
|
397
|
+
|
|
398
|
+
Cloudflare Protection Options:
|
|
399
|
+
cloudflare_phish: true/false Auto-click through Cloudflare phishing warnings (default: false)
|
|
400
|
+
cloudflare_bypass: true/false Auto-solve Cloudflare "Verify you are human" challenges (default: false)
|
|
401
|
+
|
|
402
|
+
FlowProxy Protection Options:
|
|
403
|
+
flowproxy_detection: true/false Enable flowProxy protection detection and handling (default: false)
|
|
404
|
+
flowproxy_page_timeout: <milliseconds> Page timeout for flowProxy sites (default: 45000)
|
|
405
|
+
flowproxy_nav_timeout: <milliseconds> Navigation timeout for flowProxy sites (default: 45000)
|
|
406
|
+
flowproxy_js_timeout: <milliseconds> JavaScript challenge timeout (default: 15000)
|
|
407
|
+
flowproxy_delay: <milliseconds> Delay for rate limiting (default: 30000)
|
|
408
|
+
flowproxy_additional_delay: <milliseconds> Additional processing delay (default: 5000)
|
|
409
|
+
|
|
410
|
+
Advanced Options:
|
|
411
|
+
evaluateOnNewDocument: true/false Inject fetch/XHR interceptor in page (for this site)
|
|
412
|
+
cdp: true/false Enable CDP logging for this site Inject fetch/XHR interceptor in page
|
|
413
|
+
whois: ["term1", "term2"] Check whois data for ALL specified terms (AND logic)
|
|
414
|
+
whois-or: ["term1", "term2"] Check whois data for ANY specified term (OR logic)
|
|
415
|
+
whois_server_mode: "random" or "cycle" Server selection mode: random (default) or cycle through list
|
|
416
|
+
whois_server: "whois.domain.com" or ["server1", "server2"] Custom whois server(s) - single server or randomized list (default: system default)
|
|
417
|
+
whois_max_retries: 2 Maximum retry attempts per domain (default: 2)
|
|
418
|
+
whois_timeout_multiplier: 1.5 Timeout increase multiplier per retry (default: 1.5)
|
|
419
|
+
whois_use_fallback: true Add TLD-specific fallback servers (default: true)
|
|
420
|
+
whois_retry_on_timeout: true Retry on timeout errors (default: true)
|
|
421
|
+
whois_retry_on_error: false Retry on connection/other errors (default: false)
|
|
422
|
+
whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
|
|
423
|
+
dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
|
|
424
|
+
dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
|
|
425
|
+
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"})
|
|
426
|
+
dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
|
|
427
|
+
digRecordType: "A" DNS record type for dig (default: A)
|
|
428
|
+
|
|
429
|
+
Referrer Header Options:
|
|
430
|
+
referrer_headers: "https://google.com" Single referrer URL
|
|
431
|
+
referrer_headers: ["url1", "url2"] Random selection from array
|
|
432
|
+
referrer_headers: {"mode": "random_search", "search_terms": ["term1"]} Smart search engine traffic
|
|
433
|
+
referrer_headers: {"mode": "social_media"} Random social media referrers
|
|
434
|
+
referrer_headers: {"mode": "direct_navigation"} No referrer (direct access)
|
|
435
|
+
custom_headers: {"Header": "Value"} Additional HTTP headers
|
|
436
|
+
`);
|
|
437
|
+
process.exit(0);
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
// --- Configuration File Loading ---
|
|
441
|
+
const configPathIndex = args.findIndex(arg => arg === '--custom-json');
|
|
442
|
+
const configPath = (configPathIndex !== -1 && args[configPathIndex + 1]) ? args[configPathIndex + 1] : 'config.json';
|
|
443
|
+
let config;
|
|
444
|
+
try {
|
|
445
|
+
if (!fs.existsSync(configPath)) {
|
|
446
|
+
console.error(`❌ Config file not found: ${configPath}`);
|
|
447
|
+
process.exit(1);
|
|
448
|
+
}
|
|
449
|
+
if (forceDebug && configPath !== 'config.json') {
|
|
450
|
+
console.log(formatLogMessage('debug', `Using custom config file: ${configPath}`));
|
|
451
|
+
}
|
|
452
|
+
const raw = fs.readFileSync(configPath, 'utf8');
|
|
453
|
+
config = JSON.parse(raw);
|
|
454
|
+
} catch (e) {
|
|
455
|
+
console.error(`❌ Failed to load config file (${configPath}):`, e.message);
|
|
456
|
+
process.exit(1);
|
|
457
|
+
}
|
|
458
|
+
// Extract config values while ignoring 'comments' field at global and site levels
|
|
459
|
+
const { sites = [], ignoreDomains = [], blocked: globalBlocked = [], whois_delay = 3000, whois_server_mode = 'random', ignore_similar = true, ignore_similar_threshold = 80, ignore_similar_ignored_domains = true, comments: globalComments, ...otherGlobalConfig } = config;
|
|
460
|
+
|
|
461
|
+
// Handle --clean-rules after config is loaded (so we have access to sites)
|
|
462
|
+
if (cleanRules || cleanRulesFile) {
|
|
463
|
+
const filesToClean = cleanRulesFile ? [cleanRulesFile] : [outputFile, compareFile].filter(Boolean);
|
|
464
|
+
|
|
465
|
+
if (filesToClean.length === 0) {
|
|
466
|
+
console.error('❌ --clean-rules requires either a file argument or --output/--compare files to be specified');
|
|
467
|
+
process.exit(1);
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
console.log(`\n${messageColors.processing('Cleaning rule files...')}`);
|
|
471
|
+
let overallSuccess = true;
|
|
472
|
+
let totalCleaned = 0;
|
|
473
|
+
|
|
474
|
+
// Check if we're cleaning the same file we want to use for output
|
|
475
|
+
const cleaningOutputFile = outputFile && filesToClean.includes(outputFile);
|
|
476
|
+
|
|
477
|
+
if (cleaningOutputFile && forceDebug) {
|
|
478
|
+
console.log(formatLogMessage('debug', `Output file detected: will clean ${outputFile} first, then continue with scan`));
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
for (const file of filesToClean) {
|
|
482
|
+
console.log(`\n${messageColors.info('Cleaning:')} ${file}`);
|
|
483
|
+
|
|
484
|
+
// Check if file exists before trying to clean it
|
|
485
|
+
if (!fs.existsSync(file)) {
|
|
486
|
+
if (file === outputFile) {
|
|
487
|
+
// If it's the output file that doesn't exist, that's OK - we'll create it during scan
|
|
488
|
+
const modeText = appendMode ? 'created (append mode)' : 'created';
|
|
489
|
+
console.log(`${messageColors.info('📄 Note:')} Output file ${file} doesn't exist yet - will be ${modeText} during scan`);
|
|
490
|
+
continue;
|
|
491
|
+
} else {
|
|
492
|
+
// For other files (like compare files), this is an error
|
|
493
|
+
console.log(`${messageColors.error('❌ Failed:')} File not found: ${file}`);
|
|
494
|
+
overallSuccess = false;
|
|
495
|
+
continue;
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
try {
|
|
500
|
+
const cleanResult = cleanRulesetFile(file, null, {
|
|
501
|
+
forceDebug,
|
|
502
|
+
silentMode,
|
|
503
|
+
removeDuplicates: removeDupes,
|
|
504
|
+
backupOriginal: true,
|
|
505
|
+
dryRun: dryRunMode
|
|
506
|
+
});
|
|
507
|
+
|
|
508
|
+
if (cleanResult.success) {
|
|
509
|
+
if (dryRunMode) {
|
|
510
|
+
if (cleanResult.wouldModify) {
|
|
511
|
+
console.log(`${messageColors.info('🔍 Dry run:')} Would remove ${cleanResult.stats.removed} lines (${cleanResult.stats.invalid} invalid, ${cleanResult.stats.duplicates} duplicates)`);
|
|
512
|
+
} else {
|
|
513
|
+
console.log(`${messageColors.success('✅ Dry run:')} File is already clean - no changes needed`);
|
|
514
|
+
}
|
|
515
|
+
} else {
|
|
516
|
+
if (cleanResult.modified) {
|
|
517
|
+
console.log(`${messageColors.success('✅ Cleaned:')} Removed ${cleanResult.stats.removed} lines, preserved ${cleanResult.stats.valid} valid rules`);
|
|
518
|
+
if (cleanResult.backupCreated) {
|
|
519
|
+
console.log(`${messageColors.info('💾 Backup:')} Original file backed up`);
|
|
520
|
+
}
|
|
521
|
+
totalCleaned += cleanResult.stats.removed;
|
|
522
|
+
|
|
523
|
+
if (cleaningOutputFile && file === outputFile) {
|
|
524
|
+
console.log(`${messageColors.info('📄 Note:')} File cleaned - new rules will be ${appendMode ? 'appended' : 'written'} during scan`);
|
|
525
|
+
}
|
|
526
|
+
} else {
|
|
527
|
+
console.log(`${messageColors.success('✅ Clean:')} File was already valid - no changes needed`);
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
} else {
|
|
531
|
+
console.log(`${messageColors.error('❌ Failed:')} ${cleanResult.error}`);
|
|
532
|
+
overallSuccess = false;
|
|
533
|
+
}
|
|
534
|
+
} catch (cleanErr) {
|
|
535
|
+
console.error(`❌ Failed to clean ${file}: ${cleanErr.message}`);
|
|
536
|
+
overallSuccess = false;
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
// Determine if we should continue with scanning
|
|
541
|
+
const shouldContinueScanning = sites && sites.length > 0 && outputFile;
|
|
542
|
+
const cleanedOutputFileForScanning = outputFile && filesToClean.includes(outputFile);
|
|
543
|
+
|
|
544
|
+
if (overallSuccess) {
|
|
545
|
+
if (dryRunMode) {
|
|
546
|
+
console.log(`\n${messageColors.info('🔍 Dry run completed successfully!')}`);
|
|
547
|
+
process.exit(0);
|
|
548
|
+
} else {
|
|
549
|
+
console.log(`\n${messageColors.success('✅ All rule files cleaned successfully!')} Total lines removed: ${totalCleaned}`);
|
|
550
|
+
|
|
551
|
+
// Continue with scan if we have sites to process and we cleaned the output file
|
|
552
|
+
if (shouldContinueScanning && cleanedOutputFileForScanning) {
|
|
553
|
+
const actionText = appendMode ? 'append new rules to' : 'write rules to';
|
|
554
|
+
console.log(`${messageColors.info('📄 Continuing:')} Proceeding with scan to ${actionText} ${outputFile}`);
|
|
555
|
+
// Don't exit - continue with scanning
|
|
556
|
+
} else {
|
|
557
|
+
process.exit(0);
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
} else {
|
|
561
|
+
console.log(`\n${messageColors.error('❌ Some rule files failed to clean!')}`);
|
|
562
|
+
process.exit(1);
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
// Rotating index used when selecting a whois server.
let globalWhoisServerIndex = 0;

// Collects dry-run report lines so they can later be written to a file.
let dryRunOutput = [];

// --- Log File Setup ---
// Each path stays null unless the matching mode (debug / URL dump) is active.
let debugLogFile = null;
let matchedUrlsLogFile = null;
let adblockRulesLogFile = null;

if (forceDebug || dumpUrls) {
  const logDir = 'logs';

  // Ensure the log directory exists before any log path is produced.
  if (!fs.existsSync(logDir)) {
    fs.mkdirSync(logDir, { recursive: true });
    console.log(formatLogMessage('debug', `Created logs folder: ${logDir}`));
  }

  // Filename-safe timestamp derived from the ISO string:
  // ':' and '.' become '-', 'T' becomes '_', and the trailing
  // milliseconds + 'Z' (last 5 characters) are dropped.
  const isoNow = new Date().toISOString();
  const stamp = isoNow.replace(/[:.]/g, '-').replace('T', '_').slice(0, -5);

  if (forceDebug) {
    debugLogFile = path.join(logDir, `debug_requests_${stamp}.log`);
    console.log(formatLogMessage('debug', `Debug requests will be logged to: ${debugLogFile}`));
  }

  if (dumpUrls) {
    matchedUrlsLogFile = path.join(logDir, `matched_urls_${stamp}.log`);
    console.log(messageColors.processing('Matched URLs will be logged to:') + ` ${matchedUrlsLogFile}`);

    // The adblock rules file shares the timestamp of the matched-URLs log.
    adblockRulesLogFile = path.join(logDir, `adblock_rules_${stamp}.txt`);
    console.log(messageColors.processing('Adblock rules will be saved to:') + ` ${adblockRulesLogFile}`);
  }
}

// In debug mode, echo the global comments (a single value is treated as a one-item list).
if (forceDebug && globalComments) {
  const commentList = Array.isArray(globalComments) ? globalComments : [globalComments];
  console.log(formatLogMessage('debug', `Global comments found: ${commentList.length} item(s)`));
  commentList.forEach((comment, idx) => {
    console.log(formatLogMessage('debug', ` Comment ${idx + 1}: ${comment}`));
  });
}
|
|
608
|
+
// --- Global CDP Override Logic (historical note) ---
// Earlier versions enabled a session-wide globalCDP flag: if it was not already
// enabled by the --cdp flag, any site in config.json with `cdp: true` switched it on,
// so a site-specific setting triggered CDP logging for the entire session.
// Analysis suggested CDP should instead be managed per page for comprehensive logging.
// The code block that used this global CDP variable has therefore been removed;
// CDP is now handled per page, based on 'enableCDP' and 'siteConfig.cdp'.
|
|
615
|
+
|
|
616
|
+
/**
 * Resolves the root domain of a URL string with the psl library,
 * e.g. 'http://sub.example.com/path' yields 'example.com'.
 *
 * @param {string} url - The URL string to inspect.
 * @returns {string} The domain reported by psl; the plain hostname when psl
 *   reports no domain; or an empty string when anything throws (for example
 *   when the URL cannot be parsed).
 */
function getRootDomain(url) {
  let rootDomain = '';
  try {
    const host = new URL(url).hostname;
    // psl yields a falsy domain for some hosts; fall back to the hostname itself.
    rootDomain = psl.parse(host).domain || host;
  } catch {
    rootDomain = '';
  }
  return rootDomain;
}
|
|
632
|
+
|
|
633
|
+
/**
 * Extracts a hostname or root domain from a URL without throwing on bad input.
 *
 * @param {string} url - The URL string to parse.
 * @param {boolean} getFullHostname - true returns the full hostname;
 *   false (default) returns the result of getRootDomain(url).
 * @returns {string} The hostname/domain, or an empty string if the URL is invalid.
 */
function safeGetDomain(url, getFullHostname = false) {
  try {
    // Parsing first guarantees malformed URLs reach the catch block in both modes.
    const { hostname } = new URL(url);
    return getFullHostname ? hostname : getRootDomain(url);
  } catch (urlError) {
    // Malformed URLs are only reported in debug mode.
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Malformed URL skipped: ${url} (${urlError.message})`));
    }
    return '';
  }
}
|
|
655
|
+
|
|
656
|
+
/**
 * Prints the dry-run report for one URL to the console and, when outputFile
 * is set, appends a plain-text copy of it to dryRunOutput.
 *
 * @param {string} url - The URL being processed.
 * @param {Array} matchedItems - Regex matches (regex, domain, resourceType, fullUrl,
 *   plus optional searchStringMatch / searchStringChecked).
 * @param {Array} netToolsResults - Nettools matches (domain, tool, matchType,
 *   matchedTerm, optional details).
 * @param {string} pageTitle - Title of the page (if available).
 */
function outputDryRunResults(url, matchedItems, netToolsResults, pageTitle) {
  const lines = [];

  // Adds a line to the plain-text report and prints its (optionally colorized) twin.
  const emit = (plainText, consoleText = plainText) => {
    lines.push(plainText);
    console.log(consoleText);
  };

  // Copies the collected lines, plus a blank separator, into the shared buffer.
  const stashForFile = () => {
    if (outputFile) {
      dryRunOutput.push(...lines, '');
    }
  };

  emit(
    `\n=== DRY RUN RESULTS === ${url}`,
    `\n${messageColors.scanning('=== DRY RUN RESULTS ===')} ${url}`
  );

  const trimmedTitle = pageTitle && pageTitle.trim();
  if (trimmedTitle) {
    emit(`Title: ${trimmedTitle}`, `${messageColors.info('Title:')} ${trimmedTitle}`);
  }

  // Nothing matched at all: record that fact and stop.
  if (matchedItems.length === 0 && netToolsResults.length === 0) {
    const noMatchText = `No matching rules found on ${url}`;
    lines.push(noMatchText);
    stashForFile();
    console.log(messageColors.warn(noMatchText));
    return;
  }

  const totalMatches = matchedItems.length + netToolsResults.length;
  emit(`Matches found: ${totalMatches}`, `${messageColors.success('Matches found:')} ${totalMatches}`);

  // Regex matches
  for (const [index, item] of matchedItems.entries()) {
    const position = index + 1;

    // The file report uses a separate blank line; the console uses a leading newline.
    lines.push('');
    emit(
      `[${position}] Regex Match:`,
      `\n${messageColors.highlight(`[${position}]`)} ${messageColors.match('Regex Match:')}`
    );
    emit(` Pattern: ${item.regex}`);
    emit(` Domain: ${item.domain}`);
    emit(` Resource Type: ${item.resourceType}`);
    emit(` Full URL: ${item.fullUrl}`);

    // Searchstring outcome, when a search was performed for this item
    if (item.searchStringMatch) {
      const found = item.searchStringMatch;
      emit(
        ` ✓ Searchstring Match: ${found.type} - "${found.term}"`,
        ` ${messageColors.success('✓ Searchstring Match:')} ${found.type} - "${found.term}"`
      );
    } else if (item.searchStringChecked) {
      emit(
        ` ✗ Searchstring: No matches found in content`,
        ` ${messageColors.warn('✗ Searchstring:')} No matches found in content`
      );
    }

    // Adblock rule limited to the matched resource type
    const adblockRule = `||${item.domain}^$${item.resourceType}`;
    emit(` Adblock Rule: ${adblockRule}`, ` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  }

  // Nettools results continue the numbering after the regex matches
  for (const [index, result] of netToolsResults.entries()) {
    const resultIndex = matchedItems.length + index + 1;
    // Resolved before anything is printed for this entry.
    const toolLabel = result.tool.toUpperCase();

    lines.push('');
    emit(
      `[${resultIndex}] NetTools Match:`,
      `\n${messageColors.highlight(`[${resultIndex}]`)} ${messageColors.match('NetTools Match:')}`
    );
    emit(` Domain: ${result.domain}`);
    emit(` Tool: ${toolLabel}`);
    emit(
      ` ✓ Match: ${result.matchType} - "${result.matchedTerm}"`,
      ` ${messageColors.success('✓ Match:')} ${result.matchType} - "${result.matchedTerm}"`
    );
    if (result.details) {
      emit(` Details: ${result.details}`);
    }

    // Nettools matches produce a domain-wide adblock rule (no resource type)
    const adblockRule = `||${result.domain}^`;
    emit(` Adblock Rule: ${adblockRule}`, ` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  }

  stashForFile();
}
|
|
752
|
+
|
|
753
|
+
/**
 * Tests whether a domain is covered by any entry of an ignore list.
 * Supports wildcards in ignoreDomains.
 *
 * Pattern semantics:
 *  - An entry containing `*` is a wildcard pattern: each `*` matches any run of
 *    characters (including none) and the whole domain must match. All other
 *    characters are matched literally.
 *  - Any other entry matches the domain itself and its subdomains. The match is
 *    made on a label boundary, so `example.com` covers `a.example.com` but not
 *    `notexample.com`. An entry that already starts with `.` matches subdomains only.
 *  - Empty or non-string entries never match.
 *
 * @param {string} domain - Domain (hostname) to test.
 * @param {string[]} ignorePatterns - Ignore entries, optionally containing `*`.
 * @returns {boolean} true if at least one entry covers the domain; false otherwise,
 *   including when `domain` is not a string or `ignorePatterns` is not an array.
 */
function matchesIgnoreDomain(domain, ignorePatterns) {
  if (typeof domain !== 'string' || !Array.isArray(ignorePatterns)) {
    return false;
  }

  return ignorePatterns.some((pattern) => {
    // An empty pattern would otherwise match every domain via endsWith('').
    if (typeof pattern !== 'string' || pattern === '') {
      return false;
    }

    if (pattern.includes('*')) {
      // Escape every regex metacharacter except '*', then expand '*' to '.*'.
      const regexSource = pattern
        .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*');
      return new RegExp(`^${regexSource}$`).test(domain);
    }

    if (domain === pattern) {
      return true;
    }

    // Require a dot before the suffix so unrelated domains sharing a tail do not match.
    const suffix = pattern.startsWith('.') ? pattern : `.${pattern}`;
    return domain.endsWith(suffix);
  });
}
|
|
766
|
+
|
|
767
|
+
/**
 * Registers frame lifecycle listeners ('frameattached', 'framenavigated',
 * 'framedetached') on a page. The listeners only inspect frame URLs and write
 * debug output; frames are never navigated manually.
 *
 * @param {object} page - Object exposing `on(eventName, handler)` (a puppeteer page).
 * @param {boolean} forceDebug - When true, frame events are written to the debug log.
 */
function setupFrameHandling(page, forceDebug) {
  // Writes a debug line only when debug mode is on.
  const debugLog = (text) => {
    if (forceDebug) {
      console.log(formatLogMessage('debug', text));
    }
  };

  // True when the URL begins with at least one of the given prefixes.
  const startsWithAny = (frameUrl, prefixes) => prefixes.some((prefix) => frameUrl.startsWith(prefix));

  // Frame attachment: classify the frame URL, with errors suppressed.
  const onFrameAttached = async (frame) => {
    // Only child frames are of interest; the main frame has no parent.
    if (!frame.parentFrame()) {
      return;
    }

    try {
      const frameUrl = frame.url();
      debugLog(`New frame attached: ${frameUrl || 'about:blank'}`);

      // Empty and browser-internal URLs are skipped outright.
      const specialPrefixes = ['about:', 'data:', 'blob:', 'chrome-error://', 'chrome-extension://'];
      if (!frameUrl || startsWithAny(frameUrl, specialPrefixes)) {
        debugLog(`Skipping frame with invalid/special URL: ${frameUrl}`);
        return;
      }

      // The URL must parse and use http/https to be considered further.
      try {
        const { protocol } = new URL(frameUrl);
        if (protocol !== 'http:' && protocol !== 'https:') {
          debugLog(`Skipping frame with non-http protocol: ${frameUrl}`);
          return;
        }
      } catch (urlErr) {
        debugLog(`Skipping frame with malformed URL: ${frameUrl}`);
        return;
      }

      // Frames are deliberately not navigated manually (frame.goto); they load
      // naturally, since manual navigation often causes Protocol errors.
      debugLog(`Frame will load naturally: ${frameUrl}`);
    } catch (err) {
      // "Cannot navigate to invalid URL" and "Protocol error" are suppressed; others are logged.
      const suppressed =
        err.message.includes('Cannot navigate to invalid URL') ||
        err.message.includes('Protocol error');
      if (!suppressed) {
        debugLog(`Frame handling error: ${err.message}`);
      }
    }
  };

  // Frame navigation: monitoring only.
  const onFrameNavigated = (frame) => {
    const frameUrl = frame.url();
    const hiddenPrefixes = ['about:', 'data:', 'chrome-error://', 'chrome-extension://'];
    if (forceDebug && frameUrl && !startsWithAny(frameUrl, hiddenPrefixes)) {
      console.log(formatLogMessage('debug', `Frame navigated to: ${frameUrl}`));
    }
  };

  // Frame detachment: the URL is only read when debug output is wanted.
  const onFrameDetached = (frame) => {
    if (!forceDebug) {
      return;
    }
    const frameUrl = frame.url();
    const hiddenPrefixes = ['about:', 'chrome-error://', 'chrome-extension://'];
    if (frameUrl && !startsWithAny(frameUrl, hiddenPrefixes)) {
      console.log(formatLogMessage('debug', `Frame detached: ${frameUrl}`));
    }
  };

  page.on('frameattached', onFrameAttached);
  page.on('framenavigated', onFrameNavigated);
  page.on('framedetached', onFrameDetached);
}
|
|
859
|
+
|
|
860
|
+
// --- Main Asynchronous IIFE (Immediately Invoked Function Expression) ---
|
|
861
|
+
// This is the main entry point and execution block for the network scanner script.
|
|
862
|
+
(async () => {
|
|
863
|
+
/**
 * Launches a new browser instance with a consistent configuration.
 *
 * Uses the first system Chrome/Chromium binary found in a fixed list of paths
 * (executablePath stays null when none exists) and a unique temporary user data
 * directory. That directory path is stored on the returned browser as
 * `_nwssUserDataDir` so callers can clean it up later.
 *
 * @returns {Promise<import('puppeteer').Browser>} The launched browser instance.
 */
async function createBrowser() {
  // Unique temporary user data directory that we can fully control and clean up.
  const tempUserDataDir = `/tmp/puppeteer-${Date.now()}-${Math.random().toString(36).substring(7)}`;

  // Candidate system Chrome installations, checked in order, to avoid Puppeteer downloads.
  const systemChromePaths = [
    '/usr/bin/google-chrome-stable',
    '/usr/bin/google-chrome',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/snap/bin/chromium'
  ];

  const executablePath = systemChromePaths.find((chromePath) => fs.existsSync(chromePath)) ?? null;
  if (executablePath && forceDebug) {
    console.log(formatLogMessage('debug', `Using system Chrome: ${executablePath}`));
  }

  const browser = await puppeteer.launch({
    // Use system Chrome if available to avoid downloads
    executablePath,
    // Force temporary user data directory for complete cleanup control
    userDataDir: tempUserDataDir,
    args: [
      // Disk space controls - 50MB cache limits
      '--disk-cache-size=52428800', // 50MB disk cache (50 * 1024 * 1024)
      '--media-cache-size=52428800', // 50MB media cache
      '--disable-application-cache',
      '--disable-offline-load-stale-cache',
      '--disable-background-downloads',
      '--no-first-run',
      '--disable-default-apps',
      '--disable-component-extensions-with-background-pages',
      '--disable-background-networking',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      // All disabled features go in ONE switch: repeated occurrences of the same
      // switch are not guaranteed to be combined, so separate --disable-features
      // entries risk only one of them taking effect.
      '--disable-features=SafeBrowsing,TranslateUI,VizDisplayCompositor',
      '--disable-dev-shm-usage',
      '--disable-sync',
      '--disable-gpu',
      '--mute-audio',
      '--disable-translate',
      '--window-size=1920,1080',
      '--disable-extensions',
      '--no-default-browser-check',
      '--safebrowsing-disable-auto-update',
      // NOTE(review): this looks like a V8 option; Chrome normally receives V8
      // options through --js-flags. Confirm it has the intended effect.
      '--max_old_space_size=1024',
      '--ignore-ssl-errors',
      '--ignore-certificate-errors',
      '--ignore-certificate-errors-spki-list',
      '--ignore-certificate-errors-ca-list',
      '--disable-web-security',
      '--allow-running-insecure-content',
      '--disable-background-timer-throttling',
      '--disable-backgrounding-occluded-windows',
      '--disable-renderer-backgrounding',
      '--run-all-compositor-stages-before-draw',
      '--disable-threaded-animation',
      '--disable-threaded-scrolling',
      '--disable-checker-imaging',
      '--disable-image-animation-resync'
    ],
    headless: launchHeadless ? 'shell' : false,
    protocolTimeout: 500000
  });

  // Store the user data directory on the browser object for cleanup
  browser._nwssUserDataDir = tempUserDataDir;
  return browser;
}
|
|
946
|
+
|
|
947
|
+
|
|
948
|
+
const pLimit = (await import('p-limit')).default;
|
|
949
|
+
const limit = pLimit(MAX_CONCURRENT_SITES);
|
|
950
|
+
|
|
951
|
+
const perSiteHeadful = sites.some(site => site.headful === true);
|
|
952
|
+
const launchHeadless = !(headfulMode || perSiteHeadful);
|
|
953
|
+
// launch with no safe browsing
|
|
954
|
+
let browser = await createBrowser();
|
|
955
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Launching browser with headless: ${launchHeadless}`));
|
|
956
|
+
|
|
957
|
+
// Log which headless mode is being used
|
|
958
|
+
if (forceDebug && launchHeadless) {
|
|
959
|
+
console.log(formatLogMessage('debug', `Using chrome-headless-shell for maximum performance`));
|
|
960
|
+
}
|
|
961
|
+
|
|
962
|
+
// Initial cleanup of any existing Chrome temp files - always comprehensive on startup
|
|
963
|
+
if (forceDebug) console.log(formatLogMessage('debug', 'Cleaning up any leftover temp files from previous runs...'));
|
|
964
|
+
await cleanupChromeTempFiles({
|
|
965
|
+
includeSnapTemp: true, // Always clean snap dirs on startup
|
|
966
|
+
forceDebug,
|
|
967
|
+
comprehensive: true // Always comprehensive on startup to clean leftovers
|
|
968
|
+
});
|
|
969
|
+
|
|
970
|
+
// Set up cleanup on process termination
|
|
971
|
+
process.on('SIGINT', async () => {
|
|
972
|
+
if (forceDebug) console.log(formatLogMessage('debug', 'SIGINT received, performing cleanup...'));
|
|
973
|
+
await performEmergencyCleanup();
|
|
974
|
+
process.exit(0);
|
|
975
|
+
});
|
|
976
|
+
|
|
977
|
+
process.on('SIGTERM', async () => {
|
|
978
|
+
if (forceDebug) console.log(formatLogMessage('debug', 'SIGTERM received, performing cleanup...'));
|
|
979
|
+
await performEmergencyCleanup();
|
|
980
|
+
process.exit(0);
|
|
981
|
+
});
|
|
982
|
+
|
|
983
|
+
/**
 * Emergency cleanup used by the termination signal handlers.
 *
 * If the browser exists and its process is not marked killed, it is shut down
 * through handleBrowserExit with comprehensive temp-file cleanup. Otherwise only
 * the Chrome temp files are removed. Any failure is swallowed and reported in
 * debug mode only.
 *
 * @returns {Promise<void>}
 */
async function performEmergencyCleanup() {
  try {
    const browserStillRunning = browser && !browser.process()?.killed;

    if (!browserStillRunning) {
      // Browser already dead, just clean temp files
      await cleanupChromeTempFiles({
        includeSnapTemp: true,
        forceDebug,
        comprehensive: true
      });
      return;
    }

    await handleBrowserExit(browser, {
      forceDebug,
      timeout: 5000,
      exitOnFailure: false,
      cleanTempFiles: true,
      comprehensiveCleanup: true, // Always comprehensive on emergency
      userDataDir: browser._nwssUserDataDir
    });
  } catch (emergencyErr) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Emergency cleanup failed: ${emergencyErr.message}`));
    }
  }
}
|
|
1007
|
+
|
|
1008
|
+
let siteCounter = 0;
|
|
1009
|
+
const totalUrls = sites.reduce((sum, site) => {
|
|
1010
|
+
const urls = Array.isArray(site.url) ? site.url.length : 1;
|
|
1011
|
+
return sum + urls;
|
|
1012
|
+
}, 0);
|
|
1013
|
+
|
|
1014
|
+
// --- Global CDP (Chrome DevTools Protocol) Session --- [COMMENT RE-ADDED PREVIOUSLY, relevant to old logic]
|
|
1015
|
+
// NOTE: This CDP session is attached to the initial browser page (e.g., about:blank).
|
|
1016
|
+
// For comprehensive network logging per scanned site, a CDP session should ideally be
|
|
1017
|
+
// created for each new page context. This current setup might miss some site-specific requests.
|
|
1018
|
+
// (The code block for this initial global CDP session has been removed.
|
|
1019
|
+
// CDP is now handled on a per-page basis within processUrl if enabled.)
|
|
1020
|
+
|
|
1021
|
+
|
|
1022
|
+
// --- Global evaluateOnNewDocument for Fetch/XHR Interception ---
|
|
1023
|
+
// REMOVED: The old flawed global loop for evaluateOnNewDocument (Fetch/XHR interception) is removed.
|
|
1024
|
+
// This functionality is now correctly implemented within the processUrl function on the actual target page.
|
|
1025
|
+
|
|
1026
|
+
|
|
1027
|
+
/**
|
|
1028
|
+
* Processes a single URL: navigates to it, applies configurations (spoofing, interception),
|
|
1029
|
+
* monitors network requests, and extracts domains based on matching filterRegex.
|
|
1030
|
+
*
|
|
1031
|
+
* @param {string} currentUrl - The URL to scan.
|
|
1032
|
+
* @param {object} siteConfig - The configuration object for this specific site/URL from config.json.
|
|
1033
|
+
* @param {import('puppeteer').Browser} browserInstance - The shared Puppeteer browser instance.
|
|
1034
|
+
* @returns {Promise<object>} A promise that resolves to an object containing scan results.
|
|
1035
|
+
*/
|
|
1036
|
+
async function processUrl(currentUrl, siteConfig, browserInstance) {
|
|
1037
|
+
const allowFirstParty = siteConfig.firstParty === 1;
|
|
1038
|
+
const allowThirdParty = siteConfig.thirdParty === undefined || siteConfig.thirdParty === 1;
|
|
1039
|
+
const perSiteSubDomains = siteConfig.subDomains === 1 ? true : subDomainsMode;
|
|
1040
|
+
const siteLocalhost = siteConfig.localhost === true;
|
|
1041
|
+
const siteLocalhostAlt = siteConfig.localhost_0_0_0_0 === true;
|
|
1042
|
+
const cloudflarePhishBypass = siteConfig.cloudflare_phish === true;
|
|
1043
|
+
const cloudflareBypass = siteConfig.cloudflare_bypass === true;
|
|
1044
|
+
const sitePrivoxy = siteConfig.privoxy === true;
|
|
1045
|
+
const sitePihole = siteConfig.pihole === true;
|
|
1046
|
+
const flowproxyDetection = siteConfig.flowproxy_detection === true;
|
|
1047
|
+
|
|
1048
|
+
const evenBlocked = siteConfig.even_blocked === true;
|
|
1049
|
+
// Log site-level comments if debug mode is enabled
|
|
1050
|
+
if (forceDebug && siteConfig.comments) {
|
|
1051
|
+
const siteComments = Array.isArray(siteConfig.comments) ? siteConfig.comments : [siteConfig.comments];
|
|
1052
|
+
console.log(formatLogMessage('debug', `Site comments for ${currentUrl}: ${siteComments.length} item(s)`));
|
|
1053
|
+
siteComments.forEach((comment, idx) =>
|
|
1054
|
+
console.log(formatLogMessage('debug', ` Site comment ${idx + 1}: ${comment}`))
|
|
1055
|
+
);
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
if (siteConfig.firstParty === 0 && siteConfig.thirdParty === 0) {
|
|
1059
|
+
console.warn(`⚠ Skipping ${currentUrl} because both firstParty and thirdParty are disabled.`);
|
|
1060
|
+
return { url: currentUrl, rules: [], success: false, skipped: true };
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1063
|
+
let page = null;
|
|
1064
|
+
let cdpSession = null;
|
|
1065
|
+
// Use Map to track domains and their resource types for --adblock-rules or --dry-run
|
|
1066
|
+
const matchedDomains = (adblockRulesMode || siteConfig.adblock_rules || dryRunMode) ? new Map() : new Set();
|
|
1067
|
+
|
|
1068
|
+
// Initialize dry run matches collection
|
|
1069
|
+
if (dryRunMode) {
|
|
1070
|
+
matchedDomains.set('dryRunMatches', []);
|
|
1071
|
+
matchedDomains.set('dryRunNetTools', []);
|
|
1072
|
+
matchedDomains.set('dryRunSearchString', new Map()); // Map URL to search results
|
|
1073
|
+
}
|
|
1074
|
+
const timeout = siteConfig.timeout || 30000;
|
|
1075
|
+
|
|
1076
|
+
if (!silentMode) console.log(`\n${messageColors.scanning('Scanning:')} ${currentUrl}`);
|
|
1077
|
+
|
|
1078
|
+
// Track redirect domains to exclude from matching
|
|
1079
|
+
let redirectDomainsToExclude = [];
|
|
1080
|
+
|
|
1081
|
+
// Track the effective current URL for first-party detection (updates after redirects)
|
|
1082
|
+
let effectiveCurrentUrl = currentUrl;
|
|
1083
|
+
|
|
1084
|
+
try {
|
|
1085
|
+
// Health check before creating new page
|
|
1086
|
+
const isHealthy = await isBrowserHealthy(browserInstance);
|
|
1087
|
+
if (!isHealthy) {
|
|
1088
|
+
if (forceDebug) {
|
|
1089
|
+
console.log(formatLogMessage('debug', `Browser health degraded before processing ${currentUrl} - forcing immediate restart`));
|
|
1090
|
+
}
|
|
1091
|
+
// Return special code to trigger immediate browser restart
|
|
1092
|
+
return {
|
|
1093
|
+
url: currentUrl,
|
|
1094
|
+
rules: [],
|
|
1095
|
+
success: false,
|
|
1096
|
+
needsImmediateRestart: true,
|
|
1097
|
+
error: 'Browser health degraded - restart required'
|
|
1098
|
+
};
|
|
1099
|
+
}
|
|
1100
|
+
// Check for Protocol timeout errors that indicate browser is broken
|
|
1101
|
+
if (browserInstance.process() && browserInstance.process().killed) {
|
|
1102
|
+
throw new Error('Browser process was killed - restart required');
|
|
1103
|
+
}
|
|
1104
|
+
page = await browserInstance.newPage();
|
|
1105
|
+
|
|
1106
|
+
// Set aggressive timeouts for problematic operations
|
|
1107
|
+
page.setDefaultTimeout(Math.min(timeout, 20000)); // Use site timeout or 20s max
|
|
1108
|
+
page.setDefaultNavigationTimeout(Math.min(timeout, 25000)); // Use site timeout or 25s max
|
|
1109
|
+
// Note: timeout variable from siteConfig.timeout || 30000 is overridden for stability
|
|
1110
|
+
|
|
1111
|
+
page.on('console', (msg) => {
|
|
1112
|
+
if (forceDebug && msg.type() === 'error') console.log(`[debug] Console error: ${msg.text()}`);
|
|
1113
|
+
});
|
|
1114
|
+
|
|
1115
|
+
// Add page crash handler
|
|
1116
|
+
page.on('error', (err) => {
|
|
1117
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Page crashed: ${err.message}`));
|
|
1118
|
+
// Don't throw here as it might cause hanging - let the timeout handle it
|
|
1119
|
+
});
|
|
1120
|
+
|
|
1121
|
+
// Apply flowProxy timeouts if detection is enabled
|
|
1122
|
+
if (flowproxyDetection) {
|
|
1123
|
+
const flowproxyTimeouts = getFlowProxyTimeouts(siteConfig);
|
|
1124
|
+
page.setDefaultTimeout(flowproxyTimeouts.pageTimeout);
|
|
1125
|
+
page.setDefaultNavigationTimeout(flowproxyTimeouts.navigationTimeout);
|
|
1126
|
+
if (forceDebug) {
|
|
1127
|
+
console.log(formatLogMessage('debug', `Applied flowProxy timeouts - page: ${flowproxyTimeouts.pageTimeout}ms, nav: ${flowproxyTimeouts.navigationTimeout}ms`));
|
|
1128
|
+
}
|
|
1129
|
+
}
|
|
1130
|
+
|
|
1131
|
+
// --- START: evaluateOnNewDocument for Fetch/XHR Interception (Moved and Fixed) ---
|
|
1132
|
+
// This script is injected if --eval-on-doc is used or siteConfig.evaluateOnNewDocument is true.
|
|
1133
|
+
const shouldInjectEvalForPage = siteConfig.evaluateOnNewDocument === true || globalEvalOnDoc;
|
|
1134
|
+
if (shouldInjectEvalForPage) {
|
|
1135
|
+
if (forceDebug) {
|
|
1136
|
+
if (globalEvalOnDoc) {
|
|
1137
|
+
console.log(formatLogMessage('debug', `[evalOnDoc] Global Fetch/XHR interception enabled, applying to: ${currentUrl}`));
|
|
1138
|
+
} else { // siteConfig.evaluateOnNewDocument must be true
|
|
1139
|
+
console.log(formatLogMessage('debug', `[evalOnDoc] Site-specific Fetch/XHR interception enabled for: ${currentUrl}`));
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
1142
|
+
try {
|
|
1143
|
+
await page.evaluateOnNewDocument(() => {
|
|
1144
|
+
// This script intercepts and logs Fetch and XHR requests
|
|
1145
|
+
// from within the page context at the earliest possible moment.
|
|
1146
|
+
const originalFetch = window.fetch;
|
|
1147
|
+
window.fetch = (...args) => {
|
|
1148
|
+
console.log('[evalOnDoc][fetch]', args[0]); // Log fetch requests
|
|
1149
|
+
return originalFetch.apply(this, args);
|
|
1150
|
+
};
|
|
1151
|
+
|
|
1152
|
+
const originalXHROpen = XMLHttpRequest.prototype.open;
|
|
1153
|
+
XMLHttpRequest.prototype.open = function (method, xhrUrl) { // Renamed 'url' to 'xhrUrl' to avoid conflict
|
|
1154
|
+
console.log('[evalOnDoc][xhr]', xhrUrl); // Log XHR requests
|
|
1155
|
+
return originalXHROpen.apply(this, arguments);
|
|
1156
|
+
};
|
|
1157
|
+
});
|
|
1158
|
+
} catch (evalErr) {
|
|
1159
|
+
console.warn(formatLogMessage('warn', `[evalOnDoc] Failed to set up Fetch/XHR interception for ${currentUrl}: ${evalErr.message}`));
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
1162
|
+
// --- END: evaluateOnNewDocument for Fetch/XHR Interception ---
|
|
1163
|
+
|
|
1164
|
+
// --- CSS Element Blocking Setup ---
// When the site config lists `css_blocked` selectors, register a script that runs
// in every new document and injects a stylesheet hiding those selectors.
const cssBlockedSelectors = siteConfig.css_blocked;
if (cssBlockedSelectors && Array.isArray(cssBlockedSelectors) && cssBlockedSelectors.length > 0) {
  if (forceDebug) console.log(formatLogMessage('debug', `CSS element blocking enabled for ${currentUrl}: ${cssBlockedSelectors.join(', ')}`));
  try {
    await page.evaluateOnNewDocument(({ selectors }) => {
      // Build one rule per selector; both display and visibility are forced off.
      const style = document.createElement('style');
      style.type = 'text/css';
      const cssRules = selectors.map(selector => `${selector} { display: none !important; visibility: hidden !important; }`).join('\n');
      // textContent instead of innerHTML: the stylesheet is plain text, and string
      // assignment to innerHTML is rejected on pages that enforce Trusted Types.
      style.textContent = cssRules;

      // Attach to <head> when present, otherwise to the root element.
      const attachStyle = () => {
        const parent = document.head || document.documentElement;
        if (parent) {
          parent.appendChild(style);
        }
      };

      // Add the style as soon as DOM is available
      if (document.head) {
        attachStyle();
      } else {
        document.addEventListener('DOMContentLoaded', attachStyle);
      }
    }, { selectors: cssBlockedSelectors });
  } catch (cssErr) {
    // Best effort: a failure here only disables element hiding for this page.
    console.warn(formatLogMessage('warn', `[css_blocked] Failed to set up CSS element blocking for ${currentUrl}: ${cssErr.message}`));
  }
}
// --- END: CSS Element Blocking Setup ---
|
|
1188
|
+
|
|
1189
|
+
// --- Per-Page CDP Setup ---
// Attaches a raw CDP session that logs every Network.requestWillBeSent event when
// CDP logging is requested globally (enableCDP) or by this site's `cdp: true`.
const cdpLoggingNeededForPage = enableCDP || siteConfig.cdp === true;
if (cdpLoggingNeededForPage) {
  if (forceDebug) {
    if (enableCDP) {
      console.log(formatLogMessage('debug', `CDP logging globally enabled by --cdp, applying to page: ${currentUrl}`));
    } else if (siteConfig.cdp === true) {
      console.log(formatLogMessage('debug', `CDP logging enabled for page ${currentUrl} via site-specific 'cdp: true' config.`));
    }
  }
  try {
    cdpSession = await page.target().createCDPSession();
    await cdpSession.send('Network.enable');
    cdpSession.on('Network.requestWillBeSent', (params) => {
      const { url: requestUrl, method } = params.request;
      const initiator = params.initiator ? params.initiator.type : 'unknown';
      // Resolved per event (not hoisted) so the label follows later changes to currentUrl.
      let hostnameForLog = 'unknown-host';
      try {
        hostnameForLog = new URL(currentUrl).hostname;
      } catch (_) { /* ignore if currentUrl is invalid for URL parsing */ }
      console.log(formatLogMessage('debug', `[cdp][${hostnameForLog}] ${method} ${requestUrl} (initiator: ${initiator})`));
    });
  } catch (cdpErr) {
    // A session may already exist if the failure happened after createCDPSession()
    // (e.g. in Network.enable); detach it before dropping the reference so it is
    // not left attached to the target.
    if (cdpSession) {
      try { await cdpSession.detach(); } catch (detachErr) { /* ignore */ }
    }
    cdpSession = null; // Reset on failure
    if (cdpErr.message.includes('Network.enable timed out') ||
        cdpErr.message.includes('Protocol error')) {
      // This indicates browser is completely broken
      throw new Error(`Browser protocol broken: ${cdpErr.message}`);
    }
    console.warn(formatLogMessage('warn', `[cdp] Failed to attach CDP session for ${currentUrl}: ${cdpErr.message}`));
  }
}
// --- End of Per-Page CDP Setup ---
|
|
1222
|
+
|
|
1223
|
+
// Enable interception so the 'request' listener registered on this page can
// abort or continue each request.
await page.setRequestInterception(true);

// Set up frame handling to suppress invalid URL errors
setupFrameHandling(page, forceDebug);

// Optional per-site reset of cookies, cache and DOM storage.
if (siteConfig.clear_sitedata === true) {
  try {
    // Cookies and cache are cleared through a short-lived CDP session that is
    // always detached again, even when one of the commands fails.
    let clearDataSession = null;
    try {
      clearDataSession = await page.target().createCDPSession();
      await clearDataSession.send('Network.clearBrowserCookies');
      await clearDataSession.send('Network.clearBrowserCache');
    } finally {
      if (clearDataSession) {
        try { await clearDataSession.detach(); } catch (detachErr) { /* ignore */ }
      }
    }
    await page.evaluate(async () => {
      localStorage.clear();
      sessionStorage.clear();
      // Await the database listing so a rejection surfaces in the surrounding
      // catch instead of becoming an unhandled rejection inside the page.
      if (typeof indexedDB.databases === 'function') {
        const dbs = await indexedDB.databases();
        dbs.forEach(db => indexedDB.deleteDatabase(db.name));
      }
    });
    if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data for ${currentUrl}`));
  } catch (clearErr) {
    // Best effort: report and carry on with the scan.
    console.warn(messageColors.warn(`[clear_sitedata failed] ${currentUrl}: ${clearErr.message}`));
  }
}

// --- Apply all fingerprint spoofing (user agent, Brave, fingerprint protection) ---
await applyAllFingerprintSpoofing(page, siteConfig, forceDebug, currentUrl);
|
|
1253
|
+
|
|
1254
|
+
// Compile siteConfig.filterRegex (a single pattern or an array of patterns) into
// RegExp objects. A pattern written as "/body/" has its surrounding slashes removed.
const regexes = (() => {
  const compile = (pattern) => new RegExp(pattern.replace(/^\/(.*)\/$/, '$1'));
  const configured = siteConfig.filterRegex;
  if (Array.isArray(configured)) {
    return configured.map(r => compile(r));
  }
  if (configured) {
    return [compile(configured)];
  }
  return [];
})();
|
|
1259
|
+
|
|
1260
|
+
// Parse searchstring patterns using module
const parsedSearch = parseSearchStrings(siteConfig.searchstring, siteConfig.searchstring_and);
const { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd } = parsedSearch;

// curl is opt-in per site; grep is only honoured when curl is enabled as well.
const useCurl = siteConfig.curl === true;
let useGrep = useCurl && siteConfig.grep === true;

// Map the configured browser name to a user agent string for curl.
// Names not in the table leave the user agent empty.
let curlUserAgent = '';
if (useCurl && siteConfig.userAgent) {
  const userAgents = {
    chrome: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    firefox: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    safari: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15"
  };
  const browserKey = siteConfig.userAgent.toLowerCase();
  curlUserAgent = userAgents[browserKey] || '';
}

if (forceDebug) {
  if (useCurl) {
    console.log(formatLogMessage('debug', `Curl-based content analysis enabled for ${currentUrl}`));
  }
  if (useGrep) {
    console.log(formatLogMessage('debug', `Grep-based pattern matching enabled for ${currentUrl}`));
  }
}

// grep is only probed when there is something to search for; when the probe
// fails, grep is switched off for this site.
const wantsContentSearch = hasSearchString || hasSearchStringAnd;
if (useGrep && wantsContentSearch) {
  const grepCheck = validateGrepAvailability();
  if (grepCheck.isAvailable) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Using grep: ${grepCheck.version}`));
    }
  } else {
    console.warn(formatLogMessage('warn', `Grep not available for ${currentUrl}: ${grepCheck.error}. Falling back to JavaScript search.`));
    useGrep = false;
  }
}
|
|
1294
|
+
|
|
1295
|
+
// Parse whois and dig terms. Each term list is kept only when configured as an array.
// They are `let` because a failed availability check below clears them.
let whoisTerms = siteConfig.whois && Array.isArray(siteConfig.whois) ? siteConfig.whois : null;
let whoisOrTerms = siteConfig['whois-or'] && Array.isArray(siteConfig['whois-or']) ? siteConfig['whois-or'] : null;
const whoisServer = siteConfig.whois_server || null; // Parse whois_server configuration
let digTerms = siteConfig.dig && Array.isArray(siteConfig.dig) ? siteConfig.dig : null;
let digOrTerms = siteConfig['dig-or'] && Array.isArray(siteConfig['dig-or']) ? siteConfig['dig-or'] : null;
const digRecordType = siteConfig.digRecordType || 'A';
// Reflects what the site configured, evaluated before the availability checks,
// so regex matches keep going to the nettools path rather than being treated as
// unconditional matches when a tool turns out to be missing.
const hasNetTools = whoisTerms || whoisOrTerms || digTerms || digOrTerms;

// Validate nettools availability if needed
if (hasNetTools) {
  if (whoisTerms || whoisOrTerms) {
    const whoisCheck = validateWhoisAvailability();
    if (!whoisCheck.isAvailable) {
      console.warn(formatLogMessage('warn', `Whois not available for ${currentUrl}: ${whoisCheck.error}. Skipping whois checks.`));
      siteConfig.whois = null; // Disable whois for this site
      siteConfig['whois-or'] = null; // Disable whois-or for this site
      // The term lists above were captured before this check; clear them too so
      // the code that receives them does not attempt whois lookups.
      whoisTerms = null;
      whoisOrTerms = null;
    } else if (forceDebug) {
      console.log(formatLogMessage('debug', `Using whois: ${whoisCheck.version}`));
    }
  }

  if (digTerms || digOrTerms) {
    const digCheck = validateDigAvailability();
    if (!digCheck.isAvailable) {
      console.warn(formatLogMessage('warn', `Dig not available for ${currentUrl}: ${digCheck.error}. Skipping dig checks.`));
      siteConfig.dig = null; // Disable dig for this site
      siteConfig['dig-or'] = null; // Disable dig-or for this site
      // Same reasoning as for whois: clear the captured term lists as well.
      digTerms = null;
      digOrTerms = null;
    } else if (forceDebug) {
      console.log(formatLogMessage('debug', `Using dig: ${digCheck.version}`));
    }
  }
}
|
|
1328
|
+
|
|
1329
|
+
// Verbose (siteConfig.verbose === 1) dump of the matching configuration for this URL.
// Regex patterns and search strings are always listed; the whois/dig sections are
// printed only when forceDebug is also set.
if (siteConfig.verbose === 1) {
  // Prints each item on its own line, numbered from 1, followed by an optional suffix.
  const printQuoted = (items, suffix) => {
    items.forEach((item, idx) => {
      console.log(` [${idx + 1}] "${item}"${suffix}`);
    });
  };

  if (siteConfig.filterRegex) {
    const patterns = [].concat(siteConfig.filterRegex);
    console.log(formatLogMessage('info', `Regex patterns for ${currentUrl}:`));
    patterns.forEach((pattern, idx) => {
      console.log(` [${idx + 1}] ${pattern}`);
    });
  }

  if (hasSearchString || hasSearchStringAnd) {
    console.log(formatLogMessage('info', `Search strings for ${currentUrl}:`));
    if (hasSearchString) {
      console.log(` OR logic (any must match):`);
      printQuoted(searchStrings, '');
    }
    if (hasSearchStringAnd) {
      console.log(` AND logic (all must match):`);
      printQuoted(searchStringsAnd, '');
    }
  }

  if (forceDebug) {
    if (whoisServer) {
      const serverLine = Array.isArray(whoisServer)
        ? `Whois servers for ${currentUrl} (randomized): [${whoisServer.join(', ')}]`
        : `Whois server for ${currentUrl}: ${whoisServer}`;
      console.log(formatLogMessage('info', serverLine));
    }

    if (whoisTerms) {
      console.log(formatLogMessage('info', `Whois terms for ${currentUrl}:`));
      printQuoted(whoisTerms, '');
    }

    if (whoisOrTerms) {
      console.log(formatLogMessage('info', `Whois-or terms for ${currentUrl}:`));
      printQuoted(whoisOrTerms, ' (OR logic)');
    }

    if (digTerms) {
      console.log(formatLogMessage('info', `Dig terms for ${currentUrl} (${digRecordType} records):`));
      printQuoted(digTerms, '');
    }

    if (digOrTerms) {
      console.log(formatLogMessage('info', `Dig-or terms for ${currentUrl} (${digRecordType} records):`));
      printQuoted(digOrTerms, ' (OR logic)');
    }
  }
}
|
|
1390
|
+
|
|
1391
|
+
// Compile the per-site block patterns; anything other than an array yields no patterns.
const blockedRegexes = (Array.isArray(siteConfig.blocked) ? siteConfig.blocked : [])
  .map((source) => new RegExp(source));

// Add global blocked patterns
const globalBlockedRegexes = (Array.isArray(globalBlocked) ? globalBlocked : [])
  .map((source) => new RegExp(source));

// Site patterns come first, then the global ones.
const allBlockedRegexes = blockedRegexes.concat(globalBlockedRegexes);
|
|
1400
|
+
|
|
1401
|
+
/**
 * Helper function to add domain to matched collection.
 *
 * The domain is dropped (nothing is stored) when:
 *  - it is empty, null or undefined;
 *  - similarity filtering is enabled and shouldIgnoreSimilarDomain() reports it as
 *    too similar to an already collected domain;
 *  - similarity-to-ignoreDomains filtering is enabled and it is reported as too
 *    similar to an entry of ignoreDomains.
 * Site-level settings (siteConfig.ignore_similar*) take precedence over the
 * global ones.
 *
 * @param {string} domain - Domain to add
 * @param {string} resourceType - Resource type (for --adblock-rules mode)
 */
function addMatchedDomain(domain, resourceType = null) {
  // Never record an empty/unknown domain as a match.
  if (!domain) {
    return;
  }

  // Check if we should ignore similar domains
  const ignoreSimilarEnabled = siteConfig.ignore_similar !== undefined ? siteConfig.ignore_similar : ignore_similar;
  const similarityThreshold = siteConfig.ignore_similar_threshold || ignore_similar_threshold;
  const ignoreSimilarIgnoredDomains = siteConfig.ignore_similar_ignored_domains !== undefined ? siteConfig.ignore_similar_ignored_domains : ignore_similar_ignored_domains;

  if (ignoreSimilarEnabled) {
    // In Map mode the collection also holds bookkeeping keys for dry runs;
    // those are not domains and must not take part in the comparison.
    const existingDomains = matchedDomains instanceof Map
      ? Array.from(matchedDomains.keys()).filter(key => !['dryRunMatches', 'dryRunNetTools', 'dryRunSearchString'].includes(key))
      : Array.from(matchedDomains);

    const similarCheck = shouldIgnoreSimilarDomain(domain, existingDomains, {
      enabled: true,
      threshold: similarityThreshold,
      forceDebug
    });

    if (similarCheck.shouldIgnore) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `[ignore_similar] Skipping ${domain}: ${similarCheck.reason}`));
      }
      return; // Skip adding this domain
    }
  }

  // Check if domain is similar to any in ignoreDomains list
  if (ignoreSimilarIgnoredDomains && ignoreDomains && ignoreDomains.length > 0) {
    const ignoredSimilarCheck = shouldIgnoreSimilarDomain(domain, ignoreDomains, {
      enabled: true,
      threshold: similarityThreshold,
      forceDebug
    });

    if (ignoredSimilarCheck.shouldIgnore) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `[ignore_similar_ignored_domains] Skipping ${domain}: ${ignoredSimilarCheck.reason} (similar to ignoreDomains)`));
      }
      return; // Skip adding this domain
    }
  }

  if (matchedDomains instanceof Map) {
    // Map mode: domain -> Set of resource types seen for it.
    if (!matchedDomains.has(domain)) {
      matchedDomains.set(domain, new Set());
    }
    // Only add the specific resourceType that was matched, not all types for this domain
    if (resourceType) {
      matchedDomains.get(domain).add(resourceType);
    }
  } else {
    // Set mode: plain collection of domains.
    matchedDomains.add(domain);
  }
}
|
|
1459
|
+
|
|
1460
|
+
// --- page.on('request', ...) Handler: Core Network Request Logic ---
// This handler is triggered for every network request made by the page.
// It decides whether to allow, block, or process the request based on:
// - First-party/third-party status and site configuration.
// - URL matching against blocklists (`blockedRegexes`).
// - URL matching against filter patterns (`regexes`) for domain extraction.
// - Global `ignoreDomains` list.
// Every path through the handler ends in request.abort() or request.continue().
// Log-file writes are guarded so that a write error cannot leave a request
// neither aborted nor continued.
page.on('request', request => {
  const reqUrl = request.url();
  const requestFrame = request.frame();

  const checkedHostname = safeGetDomain(reqUrl, true);
  // Use effectiveCurrentUrl which gets updated after redirects
  // This ensures first-party detection uses the final redirected domain
  const effectiveCurrentHostname = safeGetDomain(effectiveCurrentUrl, true);
  const isFirstParty = checkedHostname && effectiveCurrentHostname && checkedHostname === effectiveCurrentHostname;

  // resourceTypes filter: an absent, non-array or empty list means "no filter".
  const allowedResourceTypes = siteConfig.resourceTypes;
  const hasResourceTypeFilter = Array.isArray(allowedResourceTypes) && allowedResourceTypes.length > 0;
  const isResourceTypeAllowed = (type) => !hasResourceTypeFilter || allowedResourceTypes.includes(type);

  // Appends one line to the matched-URLs log; failures are reported, never thrown.
  const appendMatchedUrlLog = (line) => {
    try {
      fs.appendFileSync(matchedUrlsLogFile, line);
    } catch (logErr) {
      console.warn(formatLogMessage('warn', `Failed to write to matched URLs log file: ${logErr.message}`));
    }
  };

  // Records a regex match that needs no further verification (no searchstring,
  // no nettools), then emits the verbose / dump-urls output for it.
  const recordImmediateMatch = (re, domain, resourceType, wasBlocked) => {
    if (dryRunMode) {
      const entry = {
        regex: re.source,
        domain: domain,
        resourceType: resourceType,
        fullUrl: reqUrl,
        isFirstParty: isFirstParty
      };
      if (wasBlocked) {
        entry.wasBlocked = true;
      }
      matchedDomains.get('dryRunMatches').push(entry);
    } else {
      addMatchedDomain(domain, resourceType);
    }

    const simplifiedUrl = getRootDomain(currentUrl);
    const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
    const blockedNote = wasBlocked ? ' [BLOCKED BUT ADDED]' : '';
    if (siteConfig.verbose === 1) {
      console.log(formatLogMessage('match', `[${simplifiedUrl}] ${reqUrl} matched regex: ${re} and resourceType: ${resourceType}${resourceInfo}${blockedNote}`));
    }
    if (dumpUrls) {
      const timestamp = new Date().toISOString();
      appendMatchedUrlLog(`${timestamp} [match][${simplifiedUrl}] ${reqUrl} (resourceType: ${resourceType})${resourceInfo}${blockedNote}\n`);
    }
  };

  // Block infinite iframe loops
  const frameUrl = requestFrame ? requestFrame.url() : '';
  if (frameUrl && frameUrl.includes('creative.dmzjmp.com') &&
      reqUrl.includes('go.dmzjmp.com/api/models')) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Blocking potential infinite iframe loop: ${reqUrl}`));
    }
    request.abort();
    return;
  }

  if (forceDebug) {
    // Enhanced debug logging to show which frame the request came from
    const debugFrameUrl = requestFrame ? requestFrame.url() : 'unknown-frame';
    const isMainFrame = requestFrame === page.mainFrame();
    console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[frame: ${isMainFrame ? 'main' : 'iframe'}] ${debugFrameUrl} → ${reqUrl}`));

    // Show --debug output and the url while its scanning
    const simplifiedUrl = getRootDomain(currentUrl);
    const timestamp = new Date().toISOString();
    const logEntry = `${timestamp} [debug req][${simplifiedUrl}] ${reqUrl}`;

    // Output to console
    console.log(formatLogMessage('debug', `${messageColors.highlight('[req]')}[${simplifiedUrl}] ${reqUrl}`));

    // Output to file
    if (debugLogFile) {
      try {
        fs.appendFileSync(debugLogFile, logEntry + '\n');
      } catch (logErr) {
        console.warn(formatLogMessage('warn', `Failed to write to debug log file: ${logErr.message}`));
      }
    }
  }

  if (allBlockedRegexes.some(re => re.test(reqUrl))) {
    if (forceDebug) {
      // Find which specific pattern matched for debug logging.
      // Both lists are guarded the same way as when the block regexes are compiled,
      // so a missing global list cannot throw here.
      const sitePatterns = Array.isArray(siteConfig.blocked) ? siteConfig.blocked : [];
      const globalPatterns = Array.isArray(globalBlocked) ? globalBlocked : [];
      const matchedPattern = [...sitePatterns, ...globalPatterns].find(pattern => new RegExp(pattern).test(reqUrl));
      const patternSource = sitePatterns.includes(matchedPattern) ? 'site' : 'global';
      const simplifiedUrl = getRootDomain(currentUrl);
      console.log(formatLogMessage('debug', `${messageColors.blocked('[blocked]')}[${simplifiedUrl}] ${reqUrl} blocked by ${patternSource} pattern: ${matchedPattern}`));

      // Also log to file if debug logging is enabled
      if (debugLogFile) {
        try {
          const timestamp = new Date().toISOString();
          fs.appendFileSync(debugLogFile, `${timestamp} [blocked][${simplifiedUrl}] ${reqUrl} (${patternSource} pattern: ${matchedPattern})\n`);
        } catch (logErr) {
          console.warn(formatLogMessage('warn', `Failed to write blocked domain to debug log: ${logErr.message}`));
        }
      }
    }

    // even_blocked: a blocked request can still contribute its domain when the
    // URL matches a filter regex. The request itself is aborted regardless.
    if (evenBlocked) {
      const blockedDomain = safeGetDomain(reqUrl, perSiteSubDomains);
      if (blockedDomain && !matchesIgnoreDomain(blockedDomain, ignoreDomains)) {
        for (const re of regexes) {
          if (re.test(reqUrl)) {
            const resourceType = request.resourceType();

            // Same resourceTypes rule as for unblocked requests
            if (isResourceTypeAllowed(resourceType)) {
              recordImmediateMatch(re, blockedDomain, resourceType, true);
              break; // Only match once per URL
            }
          }
        }
      }
    }

    request.abort();
    return;
  }

  const reqDomain = safeGetDomain(reqUrl, perSiteSubDomains);

  if (!reqDomain) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Skipping request with unparseable URL: ${reqUrl}`));
    }
    request.continue();
    return;
  }

  // Skip matching if this domain is one of the redirect intermediaries
  if (redirectDomainsToExclude && redirectDomainsToExclude.includes(reqDomain)) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Skipping redirect intermediary domain: ${reqDomain}`));
    }
    request.continue();
    return;
  }

  for (const re of regexes) {
    if (re.test(reqUrl)) {
      const resourceType = request.resourceType();

      // *** UNIVERSAL RESOURCE TYPE FILTER ***
      // Check resourceTypes filter FIRST, before ANY processing (nettools, searchstring, immediate matching)
      if (!isResourceTypeAllowed(resourceType)) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${allowedResourceTypes.join(', ')}]. Skipping ALL processing.`));
        }
        break; // Skip this URL entirely - doesn't match required resource types
      }

      // Check party filtering AFTER regex match but BEFORE domain processing
      if (isFirstParty && siteConfig.firstParty === false) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `Skipping first-party match: ${reqUrl} (firstParty disabled)`));
        }
        break; // Skip this URL - it's first-party but firstParty is disabled
      }
      if (!isFirstParty && siteConfig.thirdParty === false) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `Skipping third-party match: ${reqUrl} (thirdParty disabled)`));
        }
        break; // Skip this URL - it's third-party but thirdParty is disabled
      }

      // Check ignoreDomains AFTER regex match but BEFORE domain processing
      if (matchesIgnoreDomain(reqDomain, ignoreDomains)) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `Ignoring domain ${reqDomain} (matches ignoreDomains pattern)`));
        }
        break; // Skip this URL - domain is in ignore list
      }

      if (!hasSearchString && !hasSearchStringAnd && !hasNetTools) {
        // If NO searchstring AND NO nettools are defined, match immediately (existing behavior)
        recordImmediateMatch(re, reqDomain, resourceType, false);
      } else if (hasNetTools && !hasSearchString && !hasSearchStringAnd) {
        // If nettools are configured (whois/dig), perform checks on the domain
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${reqUrl} matched regex ${re} and resourceType ${resourceType}, queued for nettools check`));
        }

        if (dryRunMode) {
          // For dry run, we'll collect the domain for nettools checking
          matchedDomains.get('dryRunMatches').push({
            regex: re.source,
            domain: reqDomain,
            resourceType: resourceType,
            fullUrl: reqUrl,
            isFirstParty: isFirstParty,
            needsNetToolsCheck: true
          });
        }

        // Create and execute nettools handler
        const netToolsHandler = createNetToolsHandler({
          whoisTerms,
          whoisOrTerms,
          whoisDelay: siteConfig.whois_delay || whois_delay, // Site-specific or global fallback
          whoisServer, // Pass whois server configuration
          whoisServerMode: siteConfig.whois_server_mode || whois_server_mode,
          debugLogFile, // Pass debug log file for whois error logging
          fs, // Pass fs module for file operations
          digTerms,
          digOrTerms,
          digRecordType,
          digSubdomain: siteConfig.dig_subdomain === true,
          // Add dry run callback for nettools results
          dryRunCallback: dryRunMode ? createEnhancedDryRunCallback(matchedDomains, forceDebug) : null,
          matchedDomains,
          addMatchedDomain,
          currentUrl,
          getRootDomain,
          siteConfig,
          dumpUrls,
          matchedUrlsLogFile,
          forceDebug
        });

        // Execute nettools check asynchronously so the request is not held up
        const originalDomain = (new URL(reqUrl)).hostname;
        setImmediate(() => netToolsHandler(reqDomain, originalDomain));
      } else {
        // If searchstring or searchstring_and IS defined (with or without nettools), queue for content checking
        if (forceDebug) {
          const searchType = hasSearchStringAnd ? 'searchstring_and' : 'searchstring';
          console.log(formatLogMessage('debug', `${reqUrl} matched regex ${re} and resourceType ${resourceType}, queued for ${searchType} content search`));
        }
        if (dryRunMode) {
          matchedDomains.get('dryRunMatches').push({
            regex: re.source,
            domain: reqDomain,
            resourceType: resourceType,
            fullUrl: reqUrl,
            isFirstParty: isFirstParty,
            needsSearchStringCheck: true
          });
        }
      }

      // If curl is enabled, download and analyze content immediately
      if (useCurl) {
        try {
          // Options shared by the grep-based and the plain curl handler.
          const contentHandlerOptions = {
            searchStrings,
            searchStringsAnd,
            hasSearchString,
            hasSearchStringAnd,
            regexes,
            matchedDomains,
            addMatchedDomain, // Pass the helper function
            currentUrl,
            perSiteSubDomains,
            ignoreDomains,
            matchesIgnoreDomain,
            getRootDomain,
            siteConfig,
            dumpUrls,
            matchedUrlsLogFile,
            forceDebug,
            userAgent: curlUserAgent,
            resourceType
          };

          // Use grep handler if both grep and searchstring/searchstring_and are enabled
          if (useGrep && (hasSearchString || hasSearchStringAnd)) {
            const grepHandler = createGrepHandler({
              ...contentHandlerOptions,
              grepOptions: {
                ignoreCase: true,
                wholeWord: false,
                regex: false
              }
            });

            setImmediate(() => grepHandler(reqUrl));
          } else {
            // Use regular curl handler
            const curlHandler = createCurlHandler(contentHandlerOptions);

            setImmediate(() => curlHandler(reqUrl));
          }
        } catch (curlErr) {
          if (forceDebug) {
            console.log(formatLogMessage('debug', `Curl handler failed for ${reqUrl}: ${curlErr.message}`));
          }
        }
      }

      break; // Only the first matching regex is processed per request
    }
  }
  request.continue();
});
|
|
1795
|
+
|
|
1796
|
+
// Content search on responses is only wired up when a searchstring /
// searchstring_and is configured and neither curl nor grep handles the search.
if (!useCurl && !useGrep && (hasSearchString || hasSearchStringAnd)) {
  const responseHandlerOptions = {
    searchStrings,
    searchStringsAnd,
    hasSearchStringAnd,
    regexes,
    matchedDomains,
    addMatchedDomain, // helper that applies the similarity filters before storing
    currentUrl,
    perSiteSubDomains,
    ignoreDomains,
    matchesIgnoreDomain,
    getRootDomain,
    siteConfig,
    dumpUrls,
    matchedUrlsLogFile,
    forceDebug,
    resourceType: null // Response handler doesn't have direct access to resource type
  };

  page.on('response', createResponseHandler(responseHandlerOptions));
}

// Page interaction is opt-in per site.
const interactEnabled = siteConfig.interact === true;
|
|
1821
|
+
|
|
1822
|
+
// --- Runtime CSS Element Blocking (Fallback) ---
// Injects the same hide-rules directly into the document the page currently
// holds, as a fallback in case evaluateOnNewDocument didn't work.
// NOTE(review): in the visible code this runs before the navigation call further
// down, so it applies to whatever document is loaded at this point — confirm the
// ordering is intended.
if (cssBlockedSelectors && Array.isArray(cssBlockedSelectors) && cssBlockedSelectors.length > 0) {
  try {
    await page.evaluate((selectors) => {
      // The id makes the injection idempotent: a second run finds the element and does nothing.
      const existingStyle = document.querySelector('#css-blocker-runtime');
      if (!existingStyle) {
        const style = document.createElement('style');
        style.id = 'css-blocker-runtime';
        style.type = 'text/css';
        const cssRules = selectors.map(selector => `${selector} { display: none !important; visibility: hidden !important; }`).join('\n');
        // Stylesheet text is plain text; textContent avoids HTML parsing of the rules.
        style.textContent = cssRules;
        // Fall back to the root element for documents without a <head>.
        const parent = document.head || document.documentElement;
        if (parent) {
          parent.appendChild(style);
        }
      }
    }, cssBlockedSelectors);
  } catch (cssRuntimeErr) {
    // Best effort: a failure here only disables the fallback injection.
    console.warn(formatLogMessage('warn', `[css_blocked] Failed to apply runtime CSS blocking for ${currentUrl}: ${cssRuntimeErr.message}`));
  }
}
|
|
1841
|
+
|
|
1842
|
+
try {
|
|
1843
|
+
// Use custom goto options if provided; otherwise wait for 'load' when timeout <= 15000ms, or 'domcontentloaded' for longer timeouts
|
|
1844
|
+
// load Wait for all resources (default)
|
|
1845
|
+
// domcontentloaded Wait for DOM only
|
|
1846
|
+
// networkidle0 Wait until 0 network requests for 500ms
|
|
1847
|
+
// networkidle2 Wait until ≤2 network requests for 500ms
|
|
1848
|
+
|
|
1849
|
+
// Use faster defaults for sites with long timeouts to improve responsiveness
|
|
1850
|
+
const isFastSite = timeout <= 15000;
|
|
1851
|
+
const defaultWaitUntil = isFastSite ? 'load' : 'domcontentloaded';
|
|
1852
|
+
const defaultGotoOptions = {
|
|
1853
|
+
waitUntil: defaultWaitUntil,
|
|
1854
|
+
timeout: timeout
|
|
1855
|
+
};
|
|
1856
|
+
const gotoOptions = siteConfig.goto_options
|
|
1857
|
+
? { ...defaultGotoOptions, ...siteConfig.goto_options }
|
|
1858
|
+
: defaultGotoOptions;
|
|
1859
|
+
|
|
1860
|
+
// Enhanced navigation with redirect handling - passes existing gotoOptions
|
|
1861
|
+
const navigationResult = await navigateWithRedirectHandling(page, currentUrl, siteConfig, gotoOptions, forceDebug, formatLogMessage);
|
|
1862
|
+
|
|
1863
|
+
const { finalUrl, redirected, redirectChain, originalUrl, redirectDomains } = navigationResult;
|
|
1864
|
+
|
|
1865
|
+
// Handle redirect to new domain
|
|
1866
|
+
if (redirected) {
|
|
1867
|
+
const originalDomain = safeGetDomain(originalUrl);
|
|
1868
|
+
const finalDomain = safeGetDomain(finalUrl);
|
|
1869
|
+
|
|
1870
|
+
if (originalDomain !== finalDomain) {
|
|
1871
|
+
if (!silentMode) {
|
|
1872
|
+
console.log(`🔄 Redirect detected: ${originalDomain} → ${finalDomain}`);
|
|
1873
|
+
}
|
|
1874
|
+
|
|
1875
|
+
if (forceDebug) {
|
|
1876
|
+
console.log(formatLogMessage('debug', `Full redirect chain: ${redirectChain.join(' → ')}`));
|
|
1877
|
+
}
|
|
1878
|
+
|
|
1879
|
+
// Update currentUrl for all subsequent processing to use the final redirected URL
|
|
1880
|
+
currentUrl = finalUrl;
|
|
1881
|
+
|
|
1882
|
+
// IMPORTANT: Also update effectiveCurrentUrl for first-party detection
|
|
1883
|
+
// This ensures the request handler uses the redirected domain for party detection
|
|
1884
|
+
effectiveCurrentUrl = finalUrl;
|
|
1885
|
+
|
|
1886
|
+
// Update the redirect domains to exclude from matching
|
|
1887
|
+
if (redirectDomains && redirectDomains.length > 0) {
|
|
1888
|
+
redirectDomainsToExclude = redirectDomains;
|
|
1889
|
+
|
|
1890
|
+
if (forceDebug) {
|
|
1891
|
+
console.log(formatLogMessage('debug', `Excluding redirect domains from matching: ${redirectDomains.join(', ')}`));
|
|
1892
|
+
}
|
|
1893
|
+
}
|
|
1894
|
+
}
|
|
1895
|
+
}
|
|
1896
|
+
|
|
1897
|
+
siteCounter++;
|
|
1898
|
+
|
|
1899
|
+
// Handle all Cloudflare protections using the dedicated module
|
|
1900
|
+
const cloudflareResult = await handleCloudflareProtection(page, currentUrl, siteConfig, forceDebug);
|
|
1901
|
+
|
|
1902
|
+
if (!cloudflareResult.overallSuccess) {
|
|
1903
|
+
console.warn(`⚠ [cloudflare] Protection handling failed for ${currentUrl}:`);
|
|
1904
|
+
cloudflareResult.errors.forEach(error => {
|
|
1905
|
+
console.warn(` - ${error}`);
|
|
1906
|
+
});
|
|
1907
|
+
// Continue with scan despite Cloudflare issues
|
|
1908
|
+
}
|
|
1909
|
+
|
|
1910
|
+
// Handle flowProxy protection if enabled
|
|
1911
|
+
if (flowproxyDetection) {
|
|
1912
|
+
const flowproxyResult = await handleFlowProxyProtection(page, currentUrl, siteConfig, forceDebug);
|
|
1913
|
+
|
|
1914
|
+
if (flowproxyResult.flowProxyDetection.detected) {
|
|
1915
|
+
console.log(`🛡️ [flowproxy] FlowProxy protection detected on ${currentUrl}`);
|
|
1916
|
+
|
|
1917
|
+
if (!flowproxyResult.overallSuccess) {
|
|
1918
|
+
console.warn(`⚠ [flowproxy] Protection handling failed for ${currentUrl}:`);
|
|
1919
|
+
flowproxyResult.errors.forEach(error => {
|
|
1920
|
+
console.warn(` - ${error}`);
|
|
1921
|
+
});
|
|
1922
|
+
}
|
|
1923
|
+
|
|
1924
|
+
if (flowproxyResult.warnings.length > 0) {
|
|
1925
|
+
flowproxyResult.warnings.forEach(warning => {
|
|
1926
|
+
console.warn(`⚠ [flowproxy] ${warning}`);
|
|
1927
|
+
});
|
|
1928
|
+
}
|
|
1929
|
+
}
|
|
1930
|
+
}
|
|
1931
|
+
|
|
1932
|
+
console.log(formatLogMessage('info', `${messageColors.loaded('Loaded:')} (${siteCounter}/${totalUrls}) ${currentUrl}`));
|
|
1933
|
+
await page.evaluate(() => { console.log('Safe to evaluate on loaded page.'); });
|
|
1934
|
+
|
|
1935
|
+
// Wait for iframes to load and log them
|
|
1936
|
+
if (forceDebug) {
|
|
1937
|
+
try {
|
|
1938
|
+
await new Promise(resolve => setTimeout(resolve, 2000)); // Give iframes time to load
|
|
1939
|
+
const frames = page.frames();
|
|
1940
|
+
console.log(formatLogMessage('debug', `Total frames found: ${frames.length}`));
|
|
1941
|
+
frames.forEach((frame, index) => {
|
|
1942
|
+
const frameUrl = frame.url();
|
|
1943
|
+
if (frameUrl &&
|
|
1944
|
+
frameUrl !== 'about:blank' &&
|
|
1945
|
+
frameUrl !== 'about:srcdoc' &&
|
|
1946
|
+
!frameUrl.startsWith('about:') &&
|
|
1947
|
+
!frameUrl.startsWith('data:') &&
|
|
1948
|
+
!frameUrl.startsWith('chrome-error://') &&
|
|
1949
|
+
!frameUrl.startsWith('chrome-extension://') &&
|
|
1950
|
+
frame !== page.mainFrame()) {
|
|
1951
|
+
console.log(formatLogMessage('debug', `Iframe ${index}: ${frameUrl}`));
|
|
1952
|
+
}
|
|
1953
|
+
});
|
|
1954
|
+
} catch (frameDebugErr) {
|
|
1955
|
+
console.log(formatLogMessage('debug', `Frame debugging failed: ${frameDebugErr.message}`));
|
|
1956
|
+
}
|
|
1957
|
+
}
|
|
1958
|
+
} catch (err) {
|
|
1959
|
+
// Enhanced error handling for redirect timeouts using redirect module
|
|
1960
|
+
const timeoutResult = await handleRedirectTimeout(page, currentUrl, err, safeGetDomain, forceDebug, formatLogMessage);
|
|
1961
|
+
|
|
1962
|
+
if (timeoutResult.success) {
|
|
1963
|
+
console.log(`⚠ Partial redirect timeout recovered: ${safeGetDomain(currentUrl)} → ${safeGetDomain(timeoutResult.finalUrl)}`);
|
|
1964
|
+
currentUrl = timeoutResult.finalUrl; // Use the partial redirect URL
|
|
1965
|
+
siteCounter++;
|
|
1966
|
+
// Continue processing with the redirected URL instead of throwing error
|
|
1967
|
+
} else {
|
|
1968
|
+
console.error(formatLogMessage('error', `Failed on ${currentUrl}: ${err.message}`));
|
|
1969
|
+
throw err;
|
|
1970
|
+
}
|
|
1971
|
+
}
|
|
1972
|
+
|
|
1973
|
+
if (interactEnabled && !disableInteract) {
|
|
1974
|
+
if (forceDebug) console.log(formatLogMessage('debug', `interaction simulation enabled for ${currentUrl}`));
|
|
1975
|
+
const randomX = Math.floor(Math.random() * 500) + 50;
|
|
1976
|
+
const randomY = Math.floor(Math.random() * 500) + 50;
|
|
1977
|
+
await page.mouse.move(randomX, randomY, { steps: 10 });
|
|
1978
|
+
await page.mouse.move(randomX + 50, randomY + 50, { steps: 15 });
|
|
1979
|
+
await page.mouse.click(randomX + 25, randomY + 25);
|
|
1980
|
+
await page.hover('body');
|
|
1981
|
+
}
|
|
1982
|
+
|
|
1983
|
+
const delayMs = siteConfig.delay || 4000;
|
|
1984
|
+
|
|
1985
|
+
// Optimize network idle and delay times for better responsiveness
|
|
1986
|
+
const isFastSite = timeout <= 15000;
|
|
1987
|
+
const networkIdleTime = isFastSite ? 4000 : 2000; // Faster idle for slow sites
|
|
1988
|
+
const networkIdleTimeout = isFastSite ? timeout : Math.min(timeout / 2, 12000);
|
|
1989
|
+
const actualDelay = isFastSite ? delayMs : Math.min(delayMs, 2000); // Cap delay for slow sites
|
|
1990
|
+
|
|
1991
|
+
await page.waitForNetworkIdle({
|
|
1992
|
+
idleTime: networkIdleTime,
|
|
1993
|
+
timeout: networkIdleTimeout
|
|
1994
|
+
});
|
|
1995
|
+
await new Promise(resolve => setTimeout(resolve, actualDelay));
|
|
1996
|
+
|
|
1997
|
+
// Apply additional delay for flowProxy if detected
|
|
1998
|
+
if (flowproxyDetection) {
|
|
1999
|
+
const additionalDelay = siteConfig.flowproxy_additional_delay || 5000;
|
|
2000
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Applying flowProxy additional delay: ${additionalDelay}ms`));
|
|
2001
|
+
await new Promise(resolve => setTimeout(resolve, additionalDelay));
|
|
2002
|
+
}
|
|
2003
|
+
|
|
2004
|
+
for (let i = 1; i < (siteConfig.reload || 1); i++) {
|
|
2005
|
+
if (siteConfig.clear_sitedata === true) {
|
|
2006
|
+
try {
|
|
2007
|
+
let reloadClearSession = null;
|
|
2008
|
+
try {
|
|
2009
|
+
reloadClearSession = await page.target().createCDPSession();
|
|
2010
|
+
await reloadClearSession.send('Network.clearBrowserCookies');
|
|
2011
|
+
await reloadClearSession.send('Network.clearBrowserCache');
|
|
2012
|
+
} finally {
|
|
2013
|
+
if (reloadClearSession) {
|
|
2014
|
+
try { await reloadClearSession.detach(); } catch (detachErr) { /* ignore */ }
|
|
2015
|
+
}
|
|
2016
|
+
}
|
|
2017
|
+
await page.evaluate(() => {
|
|
2018
|
+
localStorage.clear();
|
|
2019
|
+
sessionStorage.clear();
|
|
2020
|
+
indexedDB.databases().then(dbs => dbs.forEach(db => indexedDB.deleteDatabase(db.name)));
|
|
2021
|
+
});
|
|
2022
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Cleared site data before reload #${i + 1} for ${currentUrl}`));
|
|
2023
|
+
} catch (reloadClearErr) {
|
|
2024
|
+
console.warn(messageColors.warn(`[clear_sitedata before reload failed] ${currentUrl}: ${reloadClearErr.message}`));
|
|
2025
|
+
}
|
|
2026
|
+
}
|
|
2027
|
+
await page.reload({ waitUntil: 'domcontentloaded', timeout: timeout });
|
|
2028
|
+
await new Promise(resolve => setTimeout(resolve, delayMs));
|
|
2029
|
+
}
|
|
2030
|
+
|
|
2031
|
+
if (siteConfig.forcereload === true) {
|
|
2032
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Forcing extra reload (cache disabled) for ${currentUrl}`));
|
|
2033
|
+
try {
|
|
2034
|
+
await page.setCacheEnabled(false);
|
|
2035
|
+
await page.reload({ waitUntil: 'domcontentloaded', timeout: timeout });
|
|
2036
|
+
await new Promise(resolve => setTimeout(resolve, delayMs));
|
|
2037
|
+
await page.setCacheEnabled(true);
|
|
2038
|
+
} catch (forceReloadErr) {
|
|
2039
|
+
console.warn(messageColors.warn(`[forcereload failed] ${currentUrl}: ${forceReloadErr.message}`));
|
|
2040
|
+
}
|
|
2041
|
+
}
|
|
2042
|
+
|
|
2043
|
+
if (dryRunMode) {
|
|
2044
|
+
// Get page title for dry run output
|
|
2045
|
+
let pageTitle = '';
|
|
2046
|
+
try {
|
|
2047
|
+
pageTitle = await page.title();
|
|
2048
|
+
} catch (titleErr) {
|
|
2049
|
+
if (forceDebug) {
|
|
2050
|
+
console.log(formatLogMessage('debug', `Failed to get page title for ${currentUrl}: ${titleErr.message}`));
|
|
2051
|
+
}
|
|
2052
|
+
}
|
|
2053
|
+
|
|
2054
|
+
// Get collected matches and enhance with searchstring results
|
|
2055
|
+
const dryRunMatches = matchedDomains.get('dryRunMatches') || [];
|
|
2056
|
+
const dryRunNetTools = matchedDomains.get('dryRunNetTools') || [];
|
|
2057
|
+
const dryRunSearchString = matchedDomains.get('dryRunSearchString') || new Map();
|
|
2058
|
+
|
|
2059
|
+
// Enhance matches with searchstring results
|
|
2060
|
+
const enhancedMatches = dryRunMatches.map(match => {
|
|
2061
|
+
const searchResult = dryRunSearchString.get(match.fullUrl);
|
|
2062
|
+
return {
|
|
2063
|
+
...match,
|
|
2064
|
+
searchStringMatch: searchResult && searchResult.matched ? searchResult : null,
|
|
2065
|
+
searchStringChecked: match.needsSearchStringCheck
|
|
2066
|
+
};
|
|
2067
|
+
});
|
|
2068
|
+
|
|
2069
|
+
// Wait a moment for async nettools/searchstring operations to complete
|
|
2070
|
+
await new Promise(resolve => setTimeout(resolve, 3000)); // Increased for nettools operations
|
|
2071
|
+
|
|
2072
|
+
outputDryRunResults(currentUrl, enhancedMatches, dryRunNetTools, pageTitle);
|
|
2073
|
+
|
|
2074
|
+
return { url: currentUrl, rules: [], success: true, dryRun: true, matchCount: dryRunMatches.length + dryRunNetTools.length };
|
|
2075
|
+
} else {
|
|
2076
|
+
// Format rules using the output module
|
|
2077
|
+
const globalOptions = {
|
|
2078
|
+
localhostMode,
|
|
2079
|
+
localhostModeAlt,
|
|
2080
|
+
plainOutput,
|
|
2081
|
+
adblockRulesMode,
|
|
2082
|
+
dnsmasqMode,
|
|
2083
|
+
dnsmasqOldMode,
|
|
2084
|
+
unboundMode,
|
|
2085
|
+
privoxyMode,
|
|
2086
|
+
piholeMode
|
|
2087
|
+
};
|
|
2088
|
+
const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
|
|
2089
|
+
|
|
2090
|
+
return { url: currentUrl, rules: formattedRules, success: true };
|
|
2091
|
+
}
|
|
2092
|
+
|
|
2093
|
+
} catch (err) {
|
|
2094
|
+
// Enhanced error handling with rule preservation for partial matches
|
|
2095
|
+
if (err.message.includes('Protocol error') ||
|
|
2096
|
+
err.message.includes('Target closed') ||
|
|
2097
|
+
err.message.includes('Browser process was killed') ||
|
|
2098
|
+
err.message.includes('Browser protocol broken')) {
|
|
2099
|
+
console.error(formatLogMessage('error', `Critical browser error on ${currentUrl}: ${err.message}`));
|
|
2100
|
+
return {
|
|
2101
|
+
url: currentUrl,
|
|
2102
|
+
rules: [],
|
|
2103
|
+
success: false,
|
|
2104
|
+
needsImmediateRestart: true,
|
|
2105
|
+
error: err.message
|
|
2106
|
+
};
|
|
2107
|
+
}
|
|
2108
|
+
|
|
2109
|
+
// For other errors, preserve any matches we found before the error
|
|
2110
|
+
if (matchedDomains && (matchedDomains.size > 0 || (matchedDomains instanceof Map && matchedDomains.size > 0))) {
|
|
2111
|
+
const globalOptions = {
|
|
2112
|
+
localhostMode,
|
|
2113
|
+
localhostModeAlt,
|
|
2114
|
+
plainOutput,
|
|
2115
|
+
adblockRulesMode,
|
|
2116
|
+
dnsmasqMode,
|
|
2117
|
+
dnsmasqOldMode,
|
|
2118
|
+
unboundMode,
|
|
2119
|
+
privoxyMode,
|
|
2120
|
+
piholeMode
|
|
2121
|
+
};
|
|
2122
|
+
const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
|
|
2123
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Saving ${formattedRules.length} rules despite page load failure`));
|
|
2124
|
+
return { url: currentUrl, rules: formattedRules, success: false, hasMatches: true };
|
|
2125
|
+
}
|
|
2126
|
+
|
|
2127
|
+
if (siteConfig.screenshot === true && page) {
|
|
2128
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
2129
|
+
const safeUrl = currentUrl.replace(/https?:\/\//, '').replace(/[^a-zA-Z0-9]/g, '_');
|
|
2130
|
+
const filename = `${safeUrl}-${timestamp}.jpg`;
|
|
2131
|
+
try {
|
|
2132
|
+
await page.screenshot({ path: filename, type: 'jpeg', fullPage: true });
|
|
2133
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Screenshot saved: ${filename}`));
|
|
2134
|
+
} catch (screenshotErr) {
|
|
2135
|
+
console.warn(messageColors.warn(`[screenshot failed] ${currentUrl}: ${screenshotErr.message}`));
|
|
2136
|
+
}
|
|
2137
|
+
}
|
|
2138
|
+
return { url: currentUrl, rules: [], success: false };
|
|
2139
|
+
} finally {
|
|
2140
|
+
// Guaranteed resource cleanup - this runs regardless of success or failure
|
|
2141
|
+
|
|
2142
|
+
if (cdpSession) {
|
|
2143
|
+
try {
|
|
2144
|
+
await cdpSession.detach();
|
|
2145
|
+
if (forceDebug) console.log(formatLogMessage('debug', `CDP session detached for ${currentUrl}`));
|
|
2146
|
+
} catch (cdpCleanupErr) {
|
|
2147
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Failed to detach CDP session for ${currentUrl}: ${cdpCleanupErr.message}`));
|
|
2148
|
+
}
|
|
2149
|
+
}
|
|
2150
|
+
// Add small delay to allow cleanup to complete
|
|
2151
|
+
try {
|
|
2152
|
+
await new Promise(resolve => setTimeout(resolve, 100));
|
|
2153
|
+
} catch (delayErr) {
|
|
2154
|
+
// Ignore timeout errors
|
|
2155
|
+
}
|
|
2156
|
+
|
|
2157
|
+
if (page && !page.isClosed()) {
|
|
2158
|
+
// Clear page resources before closing
|
|
2159
|
+
try {
|
|
2160
|
+
await page.evaluate(() => {
|
|
2161
|
+
if (window.gc) window.gc(); // Force garbage collection if available
|
|
2162
|
+
});
|
|
2163
|
+
} catch (gcErr) { /* ignore */ }
|
|
2164
|
+
|
|
2165
|
+
try {
|
|
2166
|
+
await page.close();
|
|
2167
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Page closed for ${currentUrl}`));
|
|
2168
|
+
} catch (pageCloseErr) {
|
|
2169
|
+
if (forceDebug) console.log(formatLogMessage('debug', `Failed to close page for ${currentUrl}: ${pageCloseErr.message}`));
|
|
2170
|
+
}
|
|
2171
|
+
}
|
|
2172
|
+
}
|
|
2173
|
+
}
|
|
2174
|
+
|
|
2175
|
+
// Keep a handle on the concurrency limiter (the pLimit function) used to schedule URL tasks.
const originalLimit = limit;

// Build one group per configured site so that browser restarts only ever
// happen between sites, never in the middle of one.
const siteGroups = [];
let currentUrlCount = 0;

for (const site of sites) {
  // A site may declare a single URL or a list of URLs; normalise to a list.
  const urls = Array.isArray(site.url) ? site.url : [site.url];
  siteGroups.push({ config: site, urls });
  currentUrlCount += urls.length;
}

// Announce the workload (and the restart cadence when it will actually apply).
if (!silentMode && currentUrlCount > 0) {
  console.log(`\n${messageColors.processing('Processing')} ${currentUrlCount} URLs across ${siteGroups.length} sites with concurrency ${MAX_CONCURRENT_SITES}...`);
  if (currentUrlCount > RESOURCE_CLEANUP_INTERVAL) {
    console.log(`${messageColors.processing('Browser will restart every')} ~${RESOURCE_CLEANUP_INTERVAL} URLs to free resources`);
  }
}
|
|
2197
|
+
|
|
2198
|
+
// Per-URL results accumulated across all sites, plus the counters used to
// decide when the browser should be recycled.
const results = [];
let processedUrlCount = 0;
let urlsSinceLastCleanup = 0;

// Sites are processed one after another (URLs inside a site run through the
// limiter). Between sites the browser may be restarted: on a URL budget, on a
// failed health check, after repeated failures, or after a critical error.
for (let siteIndex = 0; siteIndex < siteGroups.length; siteIndex++) {
  const siteGroup = siteGroups[siteIndex];
  const siteUrlCount = siteGroup.urls.length;
  const isNotLastSite = siteIndex < siteGroups.length - 1;

  // Check browser health before processing each site
  const healthCheck = await monitorBrowserHealth(browser, {}, {
    siteIndex,
    totalSites: siteGroups.length,
    urlsSinceCleanup: urlsSinceLastCleanup,
    cleanupInterval: RESOURCE_CLEANUP_INTERVAL,
    forceDebug,
    silentMode
  });

  // Two or more failures among the last three results, with more than five
  // URLs processed since the last restart, also counts as "unhealthy".
  const recentFailureCount = results.slice(-3).filter(r => !r.success).length;
  const shouldRestartFromFailures = recentFailureCount >= 2 && urlsSinceLastCleanup > 5;

  // Would processing this whole site reach the cleanup interval?
  const wouldExceedLimit = urlsSinceLastCleanup + siteUrlCount >= RESOURCE_CLEANUP_INTERVAL;

  // Restart only if something was processed since the last restart and there
  // is still work left after this point.
  const restartRequested = wouldExceedLimit || healthCheck.shouldRestart || shouldRestartFromFailures;
  if (restartRequested && urlsSinceLastCleanup > 0 && isNotLastSite) {
    let restartReason = 'Unknown';
    if (healthCheck.shouldRestart) {
      restartReason = healthCheck.reason;
    } else if (shouldRestartFromFailures) {
      restartReason = 'Multiple recent failures detected';
    } else if (wouldExceedLimit) {
      restartReason = `Processed ${urlsSinceLastCleanup} URLs`;
    }

    if (!silentMode) {
      console.log(`\n${messageColors.fileOp('🔄 Browser restart triggered:')} ${restartReason}`);
    }

    // Best-effort teardown; failures are only reported in debug mode.
    try {
      await handleBrowserExit(browser, {
        forceDebug,
        timeout: 10000,
        exitOnFailure: false,
        cleanTempFiles: true,
        comprehensiveCleanup: removeTempFiles // Respect --remove-tempfiles during restarts
      });

      // Clean up the specific user data directory
      if (userDataDir && fs.existsSync(userDataDir)) {
        fs.rmSync(userDataDir, { recursive: true, force: true });
        if (forceDebug) console.log(formatLogMessage('debug', `Cleaned user data dir: ${userDataDir}`));
      }

      // Additional temp-file cleanup when --remove-tempfiles is set
      if (removeTempFiles) {
        await cleanupChromeTempFiles({
          includeSnapTemp: true,
          forceDebug,
          comprehensive: true
        });
      }
    } catch (browserCloseErr) {
      if (forceDebug) console.log(formatLogMessage('debug', `Browser cleanup warning: ${browserCloseErr.message}`));
    }

    // Create new browser for next batch
    browser = await createBrowser();
    if (forceDebug) console.log(formatLogMessage('debug', `New browser instance created for site ${siteIndex + 1}`));

    // Reset cleanup counter and add delay
    urlsSinceLastCleanup = 0;
    await new Promise(resolve => setTimeout(resolve, 1000));
  }

  if (forceDebug) {
    console.log(formatLogMessage('debug', `Processing site ${siteIndex + 1}/${siteGroups.length}: ${siteUrlCount} URL(s) (total processed: ${processedUrlCount})`));
  }

  // Create tasks with the current browser instance and wait for the whole site
  const siteTasks = siteGroup.urls.map(url => originalLimit(() => processUrl(url, siteGroup.config, browser)));
  const siteResults = await Promise.all(siteTasks);

  // A result flagged needsImmediateRestart signals a critical browser error
  const needsImmediateRestart = siteResults.some(r => r.needsImmediateRestart);

  results.push(...siteResults);
  processedUrlCount += siteUrlCount;
  urlsSinceLastCleanup += siteUrlCount;

  // Force browser restart if any URL had critical errors
  if (needsImmediateRestart && isNotLastSite) {
    if (!silentMode) {
      console.log(`\n${messageColors.fileOp('🔄 Emergency browser restart:')} Critical browser errors detected`);
    }

    try {
      // Teardown is isolated from the relaunch: if cleanup throws, we must
      // still create a new browser, otherwise every remaining site would be
      // handed the exited instance.
      try {
        await handleBrowserExit(browser, { forceDebug, timeout: 5000, exitOnFailure: false, cleanTempFiles: true, comprehensiveCleanup: removeTempFiles });
        // Additional cleanup after emergency restart
        if (removeTempFiles) {
          await cleanupChromeTempFiles({
            includeSnapTemp: true,
            forceDebug,
            comprehensive: true
          });
        }
      } catch (emergencyCleanupErr) {
        if (forceDebug) console.log(formatLogMessage('debug', `Emergency cleanup warning: ${emergencyCleanupErr.message}`));
      }

      browser = await createBrowser();
      urlsSinceLastCleanup = 0; // Reset counter
      await new Promise(resolve => setTimeout(resolve, 2000)); // Give browser time to stabilize
    } catch (emergencyRestartErr) {
      if (forceDebug) console.log(formatLogMessage('debug', `Emergency restart failed: ${emergencyRestartErr.message}`));
    }
  }
}
|
|
2321
|
+
|
|
2322
|
+
// In dry-run mode, persist the collected dry-run lines when an output file
// was requested and there is something to write.
if (dryRunMode && outputFile && dryRunOutput.length > 0) {
  try {
    fs.writeFileSync(outputFile, dryRunOutput.join('\n'));
    if (!silentMode) {
      console.log(`${messageColors.fileOp('📄 Dry run results saved to:')} ${outputFile}`);
    }
  } catch (writeErr) {
    // A failed write is reported but does not abort the run.
    console.error(`❌ Failed to write dry run output to ${outputFile}: ${writeErr.message}`);
  }
}
|
|
2334
|
+
|
|
2335
|
+
// Summary of what was produced: real output in normal mode, a synthesized
// stand-in with the same fields in dry-run mode.
let outputResult;

if (dryRunMode) {
  // Nothing is written by the output module in dry-run mode; report match
  // counts in place of rule counts.
  outputResult = {
    success: true,
    successfulPageLoads: results.filter(r => r.success).length,
    totalRules: results.reduce((sum, r) => sum + (r.matchCount || 0), 0)
  };
} else {
  // Delegate all file/console output to the output module.
  outputResult = handleOutput(results, {
    outputFile,
    appendMode,
    compareFile,
    forceDebug,
    showTitles,
    removeDupes: removeDupes && outputFile,
    silentMode,
    dumpUrls,
    adblockRulesLogFile,
    ignoreDomains
  });

  // An output failure is fatal for the run.
  if (!outputResult.success) {
    console.error(messageColors.error('❌ Failed to write output files'));
    process.exit(1);
  }
}
|
|
2368
|
+
|
|
2369
|
+
// Replace the running counter with the success count reported above.
siteCounter = outputResult.successfulPageLoads;

// A page counts as "with matches" when it loaded successfully or when it
// failed but still yielded rules (hasMatches).
const pagesWithMatches = results.filter(({ success, hasMatches }) => success || hasMatches).length;
const totalMatches = results.reduce((sum, { rules }) => sum + (rules ? rules.length : 0), 0);

// Debug: report the active output format and the rule/page totals.
if (forceDebug) {
  const formatDescription = getFormatDescription({
    localhostMode,
    localhostModeAlt,
    plainOutput,
    adblockRules: adblockRulesMode,
    dnsmasq: dnsmasqMode,
    dnsmasqOld: dnsmasqOldMode,
    unbound: unboundMode,
    privoxy: privoxyMode,
    pihole: piholeMode
  });
  console.log(formatLogMessage('debug', `Output format: ${formatDescription}`));
  console.log(formatLogMessage('debug', `Generated ${outputResult.totalRules} rules from ${outputResult.successfulPageLoads} successful page loads`));
}
|
|
2392
|
+
|
|
2393
|
+
// Compress log files if --compress-logs is enabled (only meaningful when
// URL dumping produced log files and this is not a dry run).
if (compressLogs && dumpUrls && !dryRunMode) {
  // Collect the log files that are configured and actually exist on disk.
  const filesToCompress = [debugLogFile, matchedUrlsLogFile, adblockRulesLogFile]
    .filter(logFile => logFile && fs.existsSync(logFile));

  if (filesToCompress.length > 0) {
    if (!silentMode) console.log(`\n${messageColors.compression('Compressing')} ${filesToCompress.length} log file(s)...`);
    try {
      // Second argument requests deletion of the originals after compression
      // (per the original comment; confirm against lib/compress).
      // Named distinctly so it does not shadow the scan `results` array.
      const compressionResults = await compressMultipleFiles(filesToCompress, true);

      if (!silentMode) {
        // Report each compressed file. The previous unused fs.statSync() call
        // was removed: it could throw here and mislabel an already successful
        // compression as "Log compression failed".
        compressionResults.successful.forEach(({ original, compressed }) => {
          console.log(messageColors.success('✅ Compressed:') + ` ${path.basename(original)} → ${path.basename(compressed)}`);
        });
        // Report any compression failures
        compressionResults.failed.forEach(({ path: filePath, error }) => {
          console.warn(messageColors.warn(`⚠ Failed to compress ${path.basename(filePath)}: ${error}`));
        });
      }
    } catch (compressionErr) {
      console.warn(formatLogMessage('warn', `Log compression failed: ${compressionErr.message}`));
    }
  }
}
|
|
2425
|
+
|
|
2426
|
+
// Final shutdown: close the browser through the browserexit module, then run
// one more temp-file sweep.
if (forceDebug) console.log(formatLogMessage('debug', `Starting comprehensive browser cleanup...`));

const cleanupResult = await handleBrowserExit(browser, {
  forceDebug,
  timeout: 10000,
  exitOnFailure: true,
  cleanTempFiles: true,
  comprehensiveCleanup: removeTempFiles, // Use --remove-tempfiles flag
  userDataDir: browser._nwssUserDataDir,
  verbose: !silentMode && removeTempFiles // Verbose only when removing temp files and not silent
});

if (forceDebug) {
  const outcome = cleanupResult.success ? 'success' : 'failed';
  console.log(formatLogMessage('debug', `Final cleanup results: ${outcome}`));
  console.log(formatLogMessage('debug', `Browser closed: ${cleanupResult.browserClosed}, Temp files cleaned: ${cleanupResult.tempFilesCleanedCount || 0}, User data cleaned: ${cleanupResult.userDataCleaned}`));

  // List every error the cleanup reported (no-op when there are none).
  cleanupResult.errors.forEach(err => console.log(formatLogMessage('debug', `Cleanup error: ${err}`)));
}

// Final aggressive cleanup to catch any remaining temp files
if (forceDebug) console.log(formatLogMessage('debug', 'Performing final aggressive temp file cleanup...'));
await cleanupChromeTempFiles({
  includeSnapTemp: true,
  forceDebug,
  comprehensive: true
});
await new Promise(resolve => setTimeout(resolve, 1000)); // Give filesystem time to sync
|
|
2457
|
+
|
|
2458
|
+
// Work out the elapsed wall-clock time since startTime, split into h/m/s.
if (forceDebug) console.log(formatLogMessage('debug', `Calculating timing statistics...`));
const totalSeconds = Math.floor((Date.now() - startTime) / 1000);
const hours = Math.floor(totalSeconds / 3600);
const minutes = Math.floor((totalSeconds % 3600) / 60);
const seconds = totalSeconds % 60;

// Final summary report with timing and success statistics
if (!silentMode) {
  const completedLabel = messageColors.success(dryRunMode ? 'Dry run completed.' : 'Scan completed.');
  const elapsed = messageColors.timing(`${hours}h ${minutes}m ${seconds}s`);

  // Mention pages with matches separately only when some failed loads still
  // produced matches.
  if (pagesWithMatches > outputResult.successfulPageLoads) {
    console.log(`\n${completedLabel} ${outputResult.successfulPageLoads} of ${totalUrls} URLs loaded successfully, ${pagesWithMatches} had matches in ${elapsed}`);
  } else {
    console.log(`\n${completedLabel} ${outputResult.successfulPageLoads} of ${totalUrls} URLs processed successfully in ${elapsed}`);
  }

  if (outputResult.totalRules > 0) {
    if (dryRunMode) {
      console.log(messageColors.success('Found') + ` ${outputResult.totalRules} total matches across all URLs`);
    } else {
      console.log(messageColors.success('Generated') + ` ${outputResult.totalRules} unique rules`);
    }
  }
}

// Clean process termination
if (forceDebug) console.log(formatLogMessage('debug', `About to exit process...`));
process.exit(0);
|
|
2487
|
+
|
|
2488
|
+
})();
|