@fanboynz/network-scanner 1.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +33 -0
- package/JSONMANUAL.md +121 -0
- package/LICENSE +674 -0
- package/README.md +357 -0
- package/config.json +74 -0
- package/lib/browserexit.js +522 -0
- package/lib/browserhealth.js +308 -0
- package/lib/cloudflare.js +660 -0
- package/lib/colorize.js +168 -0
- package/lib/compare.js +159 -0
- package/lib/compress.js +129 -0
- package/lib/fingerprint.js +613 -0
- package/lib/flowproxy.js +274 -0
- package/lib/grep.js +348 -0
- package/lib/ignore_similar.js +237 -0
- package/lib/nettools.js +1200 -0
- package/lib/output.js +633 -0
- package/lib/redirect.js +384 -0
- package/lib/searchstring.js +561 -0
- package/lib/validate_rules.js +1107 -0
- package/nwss.1 +824 -0
- package/nwss.js +2488 -0
- package/package.json +45 -0
- package/regex-samples.md +27 -0
- package/scanner-script-org.js +588 -0
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@fanboynz/network-scanner",
|
|
3
|
+
"version": "1.0.35",
|
|
4
|
+
"description": "A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.",
|
|
5
|
+
"main": "nwss.js",
|
|
6
|
+
"scripts": {
|
|
7
|
+
"start": "node nwss.js",
|
|
8
|
+
"scan": "node nwss.js",
|
|
9
|
+
"help": "node nwss.js --help",
|
|
10
|
+
"version": "node nwss.js --version"
|
|
11
|
+
},
|
|
12
|
+
"dependencies": {
|
|
13
|
+
"p-limit": "^4.0.0",
|
|
14
|
+
"psl": "^1.9.0",
|
|
15
|
+
"puppeteer": "^22.15.0"
|
|
16
|
+
},
|
|
17
|
+
"keywords": [
|
|
18
|
+
"puppeteer",
|
|
19
|
+
"network-scanner",
|
|
20
|
+
"adblock",
|
|
21
|
+
"web-scraping",
|
|
22
|
+
"privacy",
|
|
23
|
+
"tracking-protection",
|
|
24
|
+
"filter-lists",
|
|
25
|
+
"fingerprint-spoofing",
|
|
26
|
+
"cloudflare-bypass",
|
|
27
|
+
"network-analysis"
|
|
28
|
+
],
|
|
29
|
+
"author": "FanboyNZ",
|
|
30
|
+
"license": "GPL-3.0",
|
|
31
|
+
"engines": {
|
|
32
|
+
"node": ">=16.0.0"
|
|
33
|
+
},
|
|
34
|
+
"repository": {
|
|
35
|
+
"type": "git",
|
|
36
|
+
"url": "git+https://github.com/ryanbr/network-scanner"
|
|
37
|
+
},
|
|
38
|
+
"publishConfig": {
|
|
39
|
+
"access": "public"
|
|
40
|
+
},
|
|
41
|
+
"bugs": {
|
|
42
|
+
"url": "https://github.com/ryanbr/network-scanner/issues"
|
|
43
|
+
},
|
|
44
|
+
"homepage": "https://github.com/ryanbr/network-scanner"
|
|
45
|
+
}
|
package/regex-samples.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
These are just usage examples; always review the output before using it publicly.
|
|
2
|
+
|
|
3
|
+
Validate using:
|
|
4
|
+
* https://regex101.com/
|
|
5
|
+
* https://regexr.com/
|
|
6
|
+
|
|
7
|
+
| Domain | JSON Regex |
|
|
8
|
+
|:---------------------------|:------------|
|
|
9
|
+
| `/api/test/` | `\\/api\\/test\\/` |
|
|
10
|
+
| `/rto.js` | `\\/rto\\.js` |
|
|
11
|
+
| `/rto.min.js` | `\\/rto\\.min\\.js$` |
|
|
12
|
+
| `.com/` | `\\.com\\/` |
|
|
13
|
+
| `/test/` | `\\/test\\/` |
|
|
14
|
+
| `/ab/cd.php?ev=` | `\\/ab\\/cd\\.php\\?ev=` |
|
|
15
|
+
| `/ab/cde/ow/bra?` | `\\/ab\\/cde\\/ow\\/bra\\?.*` |
|
|
16
|
+
| `dcbgh` | `dcbgh` |
|
|
17
|
+
| `/?gts_test=` | `\\/\\?gts_test=` |
|
|
18
|
+
| `abcdefghjk.top/` | `^https?:\\/\\/[a-z]{8,19}\\.top\\/$` |
|
|
19
|
+
| `abcdefghjk.top/*` | `^https?:\\/\\/[a-z]{8,19}\\.top\\/.*$` |
|
|
20
|
+
| `abcdefghjk.top/com` | `^https?:\\/\\/[a-z]{8,19}\\.(top\|com)\\/$` |
|
|
21
|
+
| `abcdefghjk.top com/*` | `^https?:\\/\\/[a-z]{8,19}\\.(top\|com)\\/.*$` |
|
|
22
|
+
| `.net/bar/` | `\\.net\\/bar\\/` |
|
|
23
|
+
| `&test_me=` | `&test_me=` |
|
|
24
|
+
| `/new/` `/test/` | `\\/(new\|test)\\/` |
|
|
25
|
+
| `.com/` or `.net/` | `\\.(com\|net)\\/` |
|
|
26
|
+
|
|
27
|
+
|
|
@@ -0,0 +1,588 @@
|
|
|
1
|
+
// === Network scanner script v0.9.0 ===
|
|
2
|
+
|
|
3
|
+
// puppeteer for browser automation, fs for file system operations, psl for domain parsing.
|
|
4
|
+
const puppeteer = require('puppeteer');
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
const psl = require('psl');
|
|
7
|
+
|
|
8
|
+
// --- Script Configuration & Constants ---
// Version string printed by the --version flag.
const VERSION = '0.9.0';

// Timestamp (ms since epoch) taken when the script is first evaluated.
const startTime = Date.now();

// Fixed fingerprint values used when spoofing is enabled but not set to 'random'.
const DEFAULT_PLATFORM = 'Win32';
const DEFAULT_TIMEZONE = 'America/New_York';

// --- Command-Line Argument Parsing ---
// process.argv is [node path, script path, ...user arguments]; keep only the user arguments.
const args = process.argv.slice(2);

// An empty command line is treated as a request for the help menu.
if (!args.length) {
  args.push('--help');
}

// --headful: run the browser with a GUI instead of headless.
const headfulMode = args.includes('--headful');
const SOURCES_FOLDER = 'sources'; // Declared, but not referenced in the visible part of the script.

// --output <file> / -o <file>: the argument following the flag is the output file.
// Stays null when the flag is absent or nothing follows it.
const outputIndex = args.findIndex((flag) => ['--output', '-o'].includes(flag));
let outputFile = outputIndex !== -1 && args[outputIndex + 1]
  ? args[outputIndex + 1]
  : null;

// Boolean switches; each is true only when the exact flag appears on the command line.
const forceVerbose = args.includes('--verbose');          // Force verbose mode globally.
const forceDebug = args.includes('--debug');              // Force debug mode globally.
const silentMode = args.includes('--silent');             // Suppress normal console logs.
const showTitles = args.includes('--titles');             // Add a "! <url>" title before each site's group.
const dumpUrls = args.includes('--dumpurls');             // Dump matched URLs into matched_urls.log.
const subDomainsMode = args.includes('--sub-domains');    // Output full subdomains instead of root domains.
const localhostMode = args.includes('--localhost');       // Output as "127.0.0.1 domain.com".
const localhostModeAlt = args.includes('--localhost-0.0.0.0'); // Output as "0.0.0.0 domain.com".
const disableInteract = args.includes('--no-interact');   // Disable page interactions globally.
const plainOutput = args.includes('--plain');             // Output just domains (no adblock formatting).
const enableCDP = args.includes('--cdp');                 // Enable Chrome DevTools Protocol logging.

// Starts out mirroring --cdp; declared with `let` because site config may switch it on later.
let globalCDP = enableCDP;

// --eval-on-doc: enable evaluateOnNewDocument() for every site.
const globalEvalOnDoc = args.includes('--eval-on-doc');
|
|
51
|
+
|
|
52
|
+
// Handle --version flag: print the script version and exit successfully.
if (args.includes('--version')) {
  console.log(`scanner-script.js version ${VERSION}`);
  process.exit(0);
}

// Handle --help or -h flag: print usage instructions and exit successfully.
// (An empty command line also ends up here because '--help' is pushed into `args`.)
// The help lines are deliberately left unindented: everything inside the template
// literal, including leading whitespace, is printed verbatim.
if (args.includes('--help') || args.includes('-h')) {
  console.log(`Usage: node scanner-script.js [options]

Options:
-o, --output <file> Output file for rules. If omitted, prints to console
--verbose Force verbose mode globally
--debug Force debug mode globally
--silent Suppress normal console logs
--titles Add ! <url> title before each site's group
--dumpurls Dump matched URLs into matched_urls.log
--sub-domains Output full subdomains instead of collapsing to root
--localhost Output as 127.0.0.1 domain.com
--localhost-0.0.0.0 Output as 0.0.0.0 domain.com
--no-interact Disable page interactions globally
--custom-json <file> Use a custom config JSON file instead of config.json
--headful Launch browser with GUI (not headless)
--plain Output just domains (no adblock formatting)
--cdp Enable Chrome DevTools Protocol logging
--eval-on-doc Globally enable evaluateOnNewDocument()
--help, -h Show this help menu
--version Show script version

Per-site config.json options:
url: "site" or ["site1", "site2"] Single URL or list of URLs
filterRegex: "regex" or ["regex1", "regex2"] Patterns to match requests
blocked: ["regex"] Regex patterns to block requests
interact: true/false Simulate mouse movements/clicks
isBrave: true/false Spoof Brave browser detection
userAgent: "chrome"|"firefox"|"safari" Custom desktop User-Agent
delay: <milliseconds> Delay after load (default: 4000)
reload: <number> Reload page n times after load (default: 1)
forcereload: true/false Force an additional reload after reloads
clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
subDomains: 1/0 Output full subdomains (default: 0)
localhost: true/false Force localhost output (127.0.0.1)
localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
source: true/false Save page source HTML after load
firstParty: true/false Allow first-party matches (default: false)
thirdParty: true/false Allow third-party matches (default: true)
screenshot: true/false Capture screenshot on load failure
headful: true/false Launch browser with GUI for this site
fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random"
evaluateOnNewDocument: true/false Inject fetch/XHR interceptor in page
cdp: true/false Enable CDP logging for this site
`);
  process.exit(0);
}
|
|
106
|
+
|
|
107
|
+
// --- Configuration File Loading ---
// The config path defaults to 'config.json'; `--custom-json <file>` overrides it
// when a non-empty argument follows the flag.
const configPathIndex = args.findIndex(arg => arg === '--custom-json');
const configPath = (configPathIndex !== -1 && args[configPathIndex + 1]) ? args[configPathIndex + 1] : 'config.json';
let config;
try {
  // A missing file gets its own message so it is not confused with a parse error.
  if (!fs.existsSync(configPath)) {
    console.error(`❌ Config file not found: ${configPath}`);
    process.exit(1); // Exit if config file is missing.
  }
  // Log if a custom config file is being used (in debug mode).
  if (forceDebug && configPath !== 'config.json') {
    console.log(`[debug] Using custom config file: ${configPath}`);
  }
  // Read and parse the JSON configuration file.
  const raw = fs.readFileSync(configPath, 'utf8');
  config = JSON.parse(raw);

  // Valid JSON is not necessarily a usable config: `null` would make the
  // destructuring below throw outside this try/catch, and a non-array `sites`
  // would crash later when it is iterated. Reject both here so the user gets
  // the same "Failed to load config file" report as for any other bad config.
  if (config === null || typeof config !== 'object' || Array.isArray(config)) {
    throw new Error('top-level JSON value must be an object');
  }
  if (config.sites !== undefined && !Array.isArray(config.sites)) {
    throw new Error('"sites" must be an array');
  }
} catch (e) {
  // Covers read errors, JSON syntax errors and the shape checks above.
  console.error(`❌ Failed to load config file (${configPath}):`, e.message);
  process.exit(1);
}
// Pull the top-level settings out of the config, defaulting each to an empty array:
// sites: array of site objects to scan.
// ignoreDomains: array of domain strings to ignore during scanning.
// globalBlocked: the config's top-level `blocked` array of regex strings.
const { sites = [], ignoreDomains = [], blocked: globalBlocked = [] } = config;
|
|
135
|
+
|
|
136
|
+
// --- Global CDP Override Logic ---
// When --cdp was not passed, any site entry with `cdp: true` turns CDP logging
// on for the whole session; otherwise it stays off.
if (!enableCDP) {
  const cdpSiteUrls = sites
    .filter((entry) => entry.cdp === true)
    .map((entry) => entry.url);

  globalCDP = cdpSiteUrls.length > 0;

  // In debug mode, report which site entries caused CDP to be enabled.
  if (forceDebug && globalCDP) {
    console.log('[debug] CDP enabled via config.json for sites:', cdpSiteUrls.join(', '));
  }
}
|
|
148
|
+
|
|
149
|
+
/**
 * Extracts the root (registrable) domain from a URL string using the psl library.
 * For example, for 'http://sub.example.com/path', it returns 'example.com'.
 *
 * IP-address hosts are returned unchanged. They must not be handed to psl:
 * it has no notion of IP literals and would treat the last two octets as
 * "sld.tld" (e.g. '192.168.1.1' -> '1.1').
 *
 * @param {string} url - The URL string to parse.
 * @returns {string} The root domain; the hostname itself when it is an IP
 *   literal or psl yields no domain; or an empty string when `url` cannot be
 *   parsed (or psl throws).
 */
function getRootDomain(url) {
  try {
    const { hostname } = new URL(url); // Throws on an invalid URL -> handled below.

    // The URL parser serialises IPv4 hosts as dotted decimal and wraps IPv6 hosts in brackets.
    const isIpLiteral = hostname.startsWith('[') || /^\d{1,3}(?:\.\d{1,3}){3}$/.test(hostname);
    if (isIpLiteral) {
      return hostname;
    }

    const parsed = psl.parse(hostname);
    // psl leaves `domain` empty for hosts it cannot split (e.g. single-label names).
    return parsed.domain || hostname;
  } catch {
    return ''; // Unparseable input: callers treat an empty domain as "ignore".
  }
}
|
|
165
|
+
|
|
166
|
+
/**
 * Builds a set of browser fingerprint values, some of them randomized, for
 * overriding navigator/screen properties in a page.
 *
 * @returns {object} The spoofed fingerprint:
 * @property {number} deviceMemory - 4 or 8, chosen at random.
 * @property {number} hardwareConcurrency - 2, 4 or 8, chosen at random.
 * @property {object} screen - Screen metrics.
 * @property {number} screen.width - Random integer in [360, 759].
 * @property {number} screen.height - Random integer in [640, 1139].
 * @property {number} screen.colorDepth - Always 24.
 * @property {string} platform - Always 'Linux x86_64'.
 * @property {string} timezone - Always 'UTC'.
 */
function getRandomFingerprint() {
  // Uniform random integer in [0, limit).
  const randomBelow = (limit) => Math.floor(Math.random() * limit);

  const deviceMemory = Math.random() < 0.5 ? 4 : 8;

  const coreChoices = [2, 4, 8];
  const hardwareConcurrency = coreChoices[randomBelow(3)];

  const width = 360 + randomBelow(400);
  const height = 640 + randomBelow(500);

  return {
    deviceMemory,
    hardwareConcurrency,
    screen: { width, height, colorDepth: 24 },
    platform: 'Linux x86_64',
    timezone: 'UTC',
  };
}
|
|
195
|
+
|
|
196
|
+
// --- Main Asynchronous IIFE (Immediately Invoked Function Expression) ---
|
|
197
|
+
// This is where the main script logic resides.
|
|
198
|
+
(async () => {
|
|
199
|
+
// --- Puppeteer Browser Launch Configuration ---
|
|
200
|
+
// Check if any site-specific config requests headful, otherwise use global headfulMode.
|
|
201
|
+
const perSiteHeadful = sites.some(site => site.headful === true);
|
|
202
|
+
// Launch headless unless global --headful or any site-specific headful is true.
|
|
203
|
+
const launchHeadless = !(headfulMode || perSiteHeadful);
|
|
204
|
+
const browser = await puppeteer.launch({
|
|
205
|
+
args: ['--no-sandbox', '--disable-setuid-sandbox'], // Common args for CI/Docker environments.
|
|
206
|
+
headless: launchHeadless,
|
|
207
|
+
protocolTimeout: 300000 // Set a higher protocol timeout (5 minutes).
|
|
208
|
+
});
|
|
209
|
+
if (forceDebug) console.log(`[debug] Launching browser with headless: ${launchHeadless}`);
|
|
210
|
+
|
|
211
|
+
// --- Site Processing Counter Setup ---
|
|
212
|
+
let siteCounter = 0; // Counts successfully loaded sites.
|
|
213
|
+
// Calculate total number of URLs to be processed for progress tracking.
|
|
214
|
+
const totalUrls = sites.reduce((sum, site) => {
|
|
215
|
+
const urls = Array.isArray(site.url) ? site.url.length : 1;
|
|
216
|
+
return sum + urls;
|
|
217
|
+
}, 0);
|
|
218
|
+
|
|
219
|
+
// --- Global CDP (Chrome DevTools Protocol) Session ---
|
|
220
|
+
// NOTE: This CDP session is attached to the initial browser page (e.g., about:blank).
|
|
221
|
+
// For comprehensive network logging per scanned site, a CDP session should ideally be
|
|
222
|
+
// created for each new page context. This current setup might miss some site-specific requests.
|
|
223
|
+
if (globalCDP && forceDebug) {
|
|
224
|
+
const [page] = await browser.pages(); // Get the initial page.
|
|
225
|
+
const cdpSession = await page.target().createCDPSession();
|
|
226
|
+
await cdpSession.send('Network.enable'); // Enable network request monitoring.
|
|
227
|
+
cdpSession.on('Network.requestWillBeSent', (params) => { // Log requests.
|
|
228
|
+
const { url, method } = params.request;
|
|
229
|
+
const initiator = params.initiator ? params.initiator.type : 'unknown';
|
|
230
|
+
console.log(`[cdp] ${method} ${url} (initiator: ${initiator})`);
|
|
231
|
+
});
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
// --- Global evaluateOnNewDocument for Fetch/XHR Interception ---
|
|
235
|
+
// This loop attempts to set up fetch/XHR interception for sites that require it.
|
|
236
|
+
// NOTE: As per analysis, this `evaluateOnNewDocument` is applied to a temporary page
|
|
237
|
+
// created here (`browser.newPage().then(...)`) which is NOT the page used for actual site navigation later.
|
|
238
|
+
// This means the interception defined here likely won't apply as intended to the target pages.
|
|
239
|
+
// This should be refactored to apply to the correct page context during site processing.
|
|
240
|
+
for (const site of sites) {
|
|
241
|
+
const shouldInjectEval = site.evaluateOnNewDocument === true || globalEvalOnDoc;
|
|
242
|
+
if (shouldInjectEval) { // Redundant debug log here was removed in previous analysis.
|
|
243
|
+
if (forceDebug) console.log(`[debug] evaluateOnNewDocument pre-injection attempt for ${site.url}`);
|
|
244
|
+
await browser.newPage().then(page => { // Creates a new, temporary page.
|
|
245
|
+
page.evaluateOnNewDocument(() => { // Script to intercept fetch and XHR.
|
|
246
|
+
const originalFetch = window.fetch;
|
|
247
|
+
window.fetch = (...args) => {
|
|
248
|
+
console.log('[evalOnDoc][fetch]', args[0]); // Log fetch requests.
|
|
249
|
+
return originalFetch.apply(this, args);
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
const originalXHR = XMLHttpRequest.prototype.open;
|
|
253
|
+
XMLHttpRequest.prototype.open = function (method, url) {
|
|
254
|
+
console.log('[evalOnDoc][xhr]', url); // Log XHR requests.
|
|
255
|
+
return originalXHR.apply(this, arguments);
|
|
256
|
+
};
|
|
257
|
+
});
|
|
258
|
+
// This temporary page is not explicitly closed here or reused.
|
|
259
|
+
});
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
const siteRules = []; // Array to store generated rules for all sites.
|
|
264
|
+
|
|
265
|
+
// --- Main Site Loop: Iterate through each site configuration from config.json ---
|
|
266
|
+
for (const site of sites) {
|
|
267
|
+
// A single site entry in config can have one or multiple URLs.
|
|
268
|
+
const urls = Array.isArray(site.url) ? site.url : [site.url];
|
|
269
|
+
|
|
270
|
+
// --- Inner URL Loop: Process each URL for the current site configuration ---
|
|
271
|
+
for (const currentUrl of urls) {
|
|
272
|
+
// --- Per-URL Variable Setup: Configure behavior for the current scan ---
|
|
273
|
+
const allowFirstParty = site.firstParty === 1; // Match first-party requests if true.
|
|
274
|
+
// Match third-party requests if true or undefined (default is true).
|
|
275
|
+
const allowThirdParty = site.thirdParty === undefined || site.thirdParty === 1;
|
|
276
|
+
// Use site-specific subdomain settings, else fallback to global subDomainsMode.
|
|
277
|
+
const perSiteSubDomains = site.subDomains === 1 ? true : subDomainsMode;
|
|
278
|
+
const siteLocalhost = site.localhost === true; // Format output as 127.0.0.1 for this site.
|
|
279
|
+
const siteLocalhostAlt = site.localhost_0_0_0_0 === true; // Format as 0.0.0.0.
|
|
280
|
+
const fingerprintSetting = site.fingerprint_protection || false; // Fingerprint spoofing setting.
|
|
281
|
+
|
|
282
|
+
// Skip if both first-party and third-party are disabled for this site.
|
|
283
|
+
if (site.firstParty === 0 && site.thirdParty === 0) {
|
|
284
|
+
console.warn(`⚠ Skipping ${currentUrl} because both firstParty and thirdParty are disabled.`);
|
|
285
|
+
continue;
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
let page; // Will hold the Puppeteer page object.
|
|
289
|
+
const matchedDomains = new Set(); // Store unique matched domains for this URL.
|
|
290
|
+
let pageLoadFailed = false; // Flag to track if page loading fails.
|
|
291
|
+
|
|
292
|
+
if (!silentMode) console.log(`\nScanning: ${currentUrl}`);
|
|
293
|
+
|
|
294
|
+
try {
|
|
295
|
+
// --- Page Setup & Spoofing ---
|
|
296
|
+
page = await browser.newPage(); // Create a new page for the current URL.
|
|
297
|
+
await page.setRequestInterception(true); // Enable request interception.
|
|
298
|
+
|
|
299
|
+
// Clear site data before navigating if enabled
|
|
300
|
+
if (site.clear_sitedata === true) {
|
|
301
|
+
try {
|
|
302
|
+
const client = await page.target().createCDPSession();
|
|
303
|
+
await client.send('Network.clearBrowserCookies');
|
|
304
|
+
await client.send('Network.clearBrowserCache');
|
|
305
|
+
await page.evaluate(() => {
|
|
306
|
+
localStorage.clear();
|
|
307
|
+
sessionStorage.clear();
|
|
308
|
+
indexedDB.databases().then(dbs => dbs.forEach(db => indexedDB.deleteDatabase(db.name)));
|
|
309
|
+
});
|
|
310
|
+
if (forceDebug) console.log(`[debug] Cleared site data for ${currentUrl}`);
|
|
311
|
+
} catch (err) {
|
|
312
|
+
console.warn(`[clear_sitedata failed] ${currentUrl}: ${err.message}`);
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// Apply User-Agent spoofing if specified in site config.
|
|
317
|
+
if (site.userAgent) {
|
|
318
|
+
if (forceDebug) console.log(`[debug] userAgent spoofing enabled for ${currentUrl}: ${site.userAgent}`);
|
|
319
|
+
const userAgents = { // Predefined User-Agent strings.
|
|
320
|
+
chrome: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
|
|
321
|
+
firefox: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0",
|
|
322
|
+
safari: "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15"
|
|
323
|
+
};
|
|
324
|
+
const ua = userAgents[site.userAgent.toLowerCase()];
|
|
325
|
+
if (ua) await page.setUserAgent(ua);
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// Apply Brave browser detection spoofing if specified.
|
|
329
|
+
if (site.isBrave) {
|
|
330
|
+
if (forceDebug) console.log(`[debug] Brave spoofing enabled for ${currentUrl}`);
|
|
331
|
+
// Inject script to make navigator.brave appear available.
|
|
332
|
+
await page.evaluateOnNewDocument(() => {
|
|
333
|
+
Object.defineProperty(navigator, 'brave', {
|
|
334
|
+
get: () => ({ isBrave: () => Promise.resolve(true) })
|
|
335
|
+
});
|
|
336
|
+
});
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
// Apply Fingerprint Protection if specified.
|
|
340
|
+
if (fingerprintSetting) {
|
|
341
|
+
if (forceDebug) console.log(`[debug] fingerprint_protection enabled for ${currentUrl}`);
|
|
342
|
+
// Use random fingerprint or predefined defaults.
|
|
343
|
+
const spoof = fingerprintSetting === 'random' ? getRandomFingerprint() : {
|
|
344
|
+
deviceMemory: 8, hardwareConcurrency: 4,
|
|
345
|
+
screen: { width: 1920, height: 1080, colorDepth: 24 },
|
|
346
|
+
platform: DEFAULT_PLATFORM, timezone: DEFAULT_TIMEZONE
|
|
347
|
+
};
|
|
348
|
+
|
|
349
|
+
try {
|
|
350
|
+
// Inject script to override various navigator and screen properties.
|
|
351
|
+
await page.evaluateOnNewDocument(({ spoof }) => {
|
|
352
|
+
Object.defineProperty(navigator, 'deviceMemory', { get: () => spoof.deviceMemory });
|
|
353
|
+
Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => spoof.hardwareConcurrency });
|
|
354
|
+
Object.defineProperty(window.screen, 'width', { get: () => spoof.screen.width });
|
|
355
|
+
Object.defineProperty(window.screen, 'height', { get: () => spoof.screen.height });
|
|
356
|
+
Object.defineProperty(window.screen, 'colorDepth', { get: () => spoof.screen.colorDepth });
|
|
357
|
+
Object.defineProperty(navigator, 'platform', { get: () => spoof.platform });
|
|
358
|
+
Intl.DateTimeFormat = class extends Intl.DateTimeFormat {
|
|
359
|
+
resolvedOptions() { return { timeZone: spoof.timezone }; }
|
|
360
|
+
};
|
|
361
|
+
}, { spoof });
|
|
362
|
+
} catch (err) {
|
|
363
|
+
console.warn(`[fingerprint spoof failed] ${currentUrl}: ${err.message}`);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
// --- Regex Compilation for Filtering and Blocking ---
|
|
368
|
+
// Compile filterRegex strings from config into RegExp objects.
|
|
369
|
+
// Handles single regex string or an array of regex strings.
|
|
370
|
+
// Removes leading/trailing slashes if present (e.g. "/regex/").
|
|
371
|
+
const regexes = Array.isArray(site.filterRegex)
|
|
372
|
+
? site.filterRegex.map(r => new RegExp(r.replace(/^\/(.*)\/$/, '$1')))
|
|
373
|
+
: site.filterRegex
|
|
374
|
+
? [new RegExp(site.filterRegex.replace(/^\/(.*)\/$/, '$1'))]
|
|
375
|
+
: [];
|
|
376
|
+
|
|
377
|
+
// verbose logging, pattern matching
|
|
378
|
+
if (site.verbose === 1 && site.filterRegex) {
|
|
379
|
+
const patterns = Array.isArray(site.filterRegex) ? site.filterRegex : [site.filterRegex];
|
|
380
|
+
console.log(`[info] Regex patterns for ${currentUrl}:`);
|
|
381
|
+
patterns.forEach((pattern, idx) => {
|
|
382
|
+
console.log(` [${idx + 1}] ${pattern}`);
|
|
383
|
+
});
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
// Compile blocked request patterns from config into RegExp objects.
|
|
387
|
+
const blockedRegexes = Array.isArray(site.blocked)
|
|
388
|
+
? site.blocked.map(pattern => new RegExp(pattern))
|
|
389
|
+
: [];
|
|
390
|
+
|
|
391
|
+
// --- page.on('request', ...) Handler: Core Network Request Logic ---
|
|
392
|
+
const pageUrl = currentUrl; // Reference to the current page's URL for first-party checks.
|
|
393
|
+
page.on('request', request => {
|
|
394
|
+
const checkedUrl = request.url();
|
|
395
|
+
// Determine if the request is first-party relative to the page's main URL.
|
|
396
|
+
const isFirstParty = new URL(checkedUrl).hostname === new URL(pageUrl).hostname;
|
|
397
|
+
|
|
398
|
+
// Skip first-party requests if first-party matching is disabled for the site.
|
|
399
|
+
if (isFirstParty && site.firstParty === false) {
|
|
400
|
+
request.continue();
|
|
401
|
+
return;
|
|
402
|
+
}
|
|
403
|
+
// Skip third-party requests if third-party matching is disabled for the site.
|
|
404
|
+
if (!isFirstParty && site.thirdParty === false) {
|
|
405
|
+
request.continue();
|
|
406
|
+
return;
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
if (forceDebug) console.log('[debug request]', request.url());
|
|
410
|
+
const reqUrl = request.url();
|
|
411
|
+
|
|
412
|
+
// Abort requests that match any of the `blockedRegexes`.
|
|
413
|
+
if (blockedRegexes.some(re => re.test(reqUrl))) {
|
|
414
|
+
request.abort();
|
|
415
|
+
return;
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
// Extract domain: full hostname or root domain based on `perSiteSubDomains`.
|
|
419
|
+
const reqDomain = perSiteSubDomains ? (new URL(reqUrl)).hostname : getRootDomain(reqUrl);
|
|
420
|
+
|
|
421
|
+
// Ignore if domain is empty or matches any entry in `ignoreDomains`.
|
|
422
|
+
if (!reqDomain || ignoreDomains.some(domain => reqDomain.endsWith(domain))) {
|
|
423
|
+
request.continue();
|
|
424
|
+
return;
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
// show verbose logging if enabled
|
|
428
|
+
for (const re of regexes) {
|
|
429
|
+
if (re.test(reqUrl)) {
|
|
430
|
+
matchedDomains.add(reqDomain);
|
|
431
|
+
if (site.verbose === 1) {
|
|
432
|
+
console.log(`[match] ${reqUrl} matched regex: ${re}`);
|
|
433
|
+
}
|
|
434
|
+
if (dumpUrls) fs.appendFileSync('matched_urls.log', `${reqUrl}\n`);
|
|
435
|
+
break;
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
request.continue(); // Allow all other requests to proceed.
|
|
440
|
+
});
|
|
441
|
+
|
|
442
|
+
// --- Page Navigation and Interaction ---
|
|
443
|
+
const interactEnabled = site.interact === true;
|
|
444
|
+
try {
|
|
445
|
+
// Navigate to the current URL.
|
|
446
|
+
await page.goto(currentUrl, { waitUntil: 'load', timeout: site.timeout || 40000 });
|
|
447
|
+
siteCounter++; // Increment successful load counter.
|
|
448
|
+
console.log(`[info] Loaded: (${siteCounter}/${totalUrls}) ${currentUrl}`);
|
|
449
|
+
// Simple evaluation to confirm page context is accessible.
|
|
450
|
+
await page.evaluate(() => { console.log('Safe to evaluate on loaded page.'); });
|
|
451
|
+
} catch (err) {
|
|
452
|
+
console.error(`[error] Failed on ${currentUrl}: ${err.message}`);
|
|
453
|
+
// Note: pageLoadFailed will be set in the outer catch if this throws.
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
// Simulate user interaction if enabled for the site and not globally disabled.
|
|
457
|
+
if (interactEnabled && !disableInteract) {
|
|
458
|
+
if (forceDebug) console.log(`[debug] interaction simulation enabled for ${currentUrl}`);
|
|
459
|
+
// Perform random mouse movements and a click.
|
|
460
|
+
const randomX = Math.floor(Math.random() * 500) + 50;
|
|
461
|
+
const randomY = Math.floor(Math.random() * 500) + 50;
|
|
462
|
+
await page.mouse.move(randomX, randomY, { steps: 10 });
|
|
463
|
+
await page.mouse.move(randomX + 50, randomY + 50, { steps: 15 });
|
|
464
|
+
await page.mouse.click(randomX + 25, randomY + 25);
|
|
465
|
+
await page.hover('body'); // Hover over body to potentially trigger events.
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
// Wait for network to be idle and then an additional fixed delay.
|
|
469
|
+
const delayMs = site.delay || 4000; // Site-specific delay, or 4000 ms (4 s) by default.
|
|
470
|
+
await page.waitForNetworkIdle({ idleTime: 4000, timeout: site.timeout || 30000 });
|
|
471
|
+
await new Promise(resolve => setTimeout(resolve, delayMs));
|
|
472
|
+
|
|
473
|
+
// Reload the page multiple times if specified in site config.
|
|
474
|
+
for (let i = 1; i < (site.reload || 1); i++) { // Default is 1 (no extra reloads).
|
|
475
|
+
if (site.clear_sitedata === true) { // If true, clear site data
|
|
476
|
+
try {
|
|
477
|
+
const client = await page.target().createCDPSession();
|
|
478
|
+
await client.send('Network.clearBrowserCookies');
|
|
479
|
+
await client.send('Network.clearBrowserCache');
|
|
480
|
+
await page.evaluate(() => {
|
|
481
|
+
localStorage.clear();
|
|
482
|
+
sessionStorage.clear();
|
|
483
|
+
indexedDB.databases().then(dbs => dbs.forEach(db => indexedDB.deleteDatabase(db.name)));
|
|
484
|
+
});
|
|
485
|
+
if (forceDebug) console.log(`[debug] Cleared site data before reload #${i + 1} for ${currentUrl}`);
|
|
486
|
+
} catch (err) {
|
|
487
|
+
console.warn(`[clear_sitedata before reload failed] ${currentUrl}: ${err.message}`);
|
|
488
|
+
}
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
await page.reload({ waitUntil: 'domcontentloaded', timeout: site.timeout || 30000 });
|
|
492
|
+
await new Promise(resolve => setTimeout(resolve, delayMs)); // Wait after each reload.
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
// Force an extra reload if specified "Shift reload website"
|
|
496
|
+
if (site.forcereload === true) {
|
|
497
|
+
if (forceDebug) console.log(`[debug] Forcing extra reload (cache disabled) for ${currentUrl}`);
|
|
498
|
+
try {
|
|
499
|
+
await page.setCacheEnabled(false);
|
|
500
|
+
await page.reload({ waitUntil: 'domcontentloaded', timeout: site.timeout || 30000 });
|
|
501
|
+
await new Promise(resolve => setTimeout(resolve, delayMs));
|
|
502
|
+
await page.setCacheEnabled(true);
|
|
503
|
+
} catch (err) {
|
|
504
|
+
console.warn(`[forcereload failed] ${currentUrl}: ${err.message}`);
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
await page.close(); // Close the page after processing.
|
|
509
|
+
} catch (err) { // --- Error Handling for Page Load/Processing ---
|
|
510
|
+
console.warn(`⚠ Failed to load or process: ${currentUrl} (${err.message})`);
|
|
511
|
+
// If screenshot on failure is enabled and page object exists.
|
|
512
|
+
if (site.screenshot === true && page) {
|
|
513
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
514
|
+
const safeUrl = currentUrl.replace(/https?:\/\//, '').replace(/[^a-zA-Z0-9]/g, '_');
|
|
515
|
+
const filename = `${safeUrl}-${timestamp}.jpg`;
|
|
516
|
+
try {
|
|
517
|
+
// Take a full-page screenshot.
|
|
518
|
+
await page.screenshot({ path: filename, type: 'jpeg', fullPage: true });
|
|
519
|
+
if (forceDebug) console.log(`[debug] Screenshot saved: ${filename}`);
|
|
520
|
+
} catch (errSc) {
|
|
521
|
+
console.warn(`[screenshot failed] ${currentUrl}: ${errSc.message}`);
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
pageLoadFailed = true; // Mark that this page failed.
|
|
525
|
+
if (page && !page.isClosed()) await page.close(); // Ensure page is closed on error.
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
// --- Output Formatting for Matched Domains ---
|
|
529
|
+
const siteMatchedDomains = []; // Store formatted rules for this specific URL.
|
|
530
|
+
matchedDomains.forEach(domain => {
|
|
531
|
+
// Basic validation for domain string.
|
|
532
|
+
if (domain.length > 6 && domain.includes('.')) {
|
|
533
|
+
// Determine if plain output (just domain) or adblock rule format is needed.
|
|
534
|
+
// site.plain defaults to false if undefined. True only if explicitly site.plain: true.
|
|
535
|
+
const sitePlainSetting = site.plain === true;
|
|
536
|
+
// Use plainOutput if global flag is set OR if site-specific plain is true.
|
|
537
|
+
const usePlain = plainOutput || sitePlainSetting;
|
|
538
|
+
|
|
539
|
+
// Format based on localhost flags or standard adblock syntax.
|
|
540
|
+
if (localhostMode || siteLocalhost) { // 127.0.0.1 format
|
|
541
|
+
siteMatchedDomains.push(usePlain ? domain : `127.0.0.1 ${domain}`);
|
|
542
|
+
} else if (localhostModeAlt || siteLocalhostAlt) { // 0.0.0.0 format
|
|
543
|
+
siteMatchedDomains.push(usePlain ? domain : `0.0.0.0 ${domain}`);
|
|
544
|
+
} else { // Standard adblocker format (e.g., ||domain.com^)
|
|
545
|
+
siteMatchedDomains.push(usePlain ? domain : `||${domain}^`);
|
|
546
|
+
}
|
|
547
|
+
}
|
|
548
|
+
});
|
|
549
|
+
|
|
550
|
+
// Store the rules collected for this URL along with the URL itself.
|
|
551
|
+
siteRules.push({ url: currentUrl, rules: siteMatchedDomains });
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
// --- Final Output Aggregation & Writing ---
|
|
556
|
+
const outputLines = []; // Array to hold all lines for the final output.
|
|
557
|
+
// Iterate through rules collected from all scanned URLs.
|
|
558
|
+
for (const { url, rules } of siteRules) {
|
|
559
|
+
if (rules.length > 0) { // Only process if there are rules for this URL.
|
|
560
|
+
// Add a title comment (e.g., "! https://example.com") if showTitles is enabled.
|
|
561
|
+
if (showTitles) outputLines.push(`! ${url}`);
|
|
562
|
+
outputLines.push(...rules); // Add the actual rules.
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
// Write the aggregated rules to the specified output file or to the console.
|
|
567
|
+
if (outputFile) {
|
|
568
|
+
fs.writeFileSync(outputFile, outputLines.join('\n') + '\n');
|
|
569
|
+
if (!silentMode) console.log(`Adblock rules saved to ${outputFile}`);
|
|
570
|
+
} else {
|
|
571
|
+
console.log(outputLines.join('\n')); // Print to console if no output file.
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
await browser.close(); // Close the browser instance.
|
|
575
|
+
// show time taken
|
|
576
|
+
const endTime = Date.now();
|
|
577
|
+
const durationMs = endTime - startTime;
|
|
578
|
+
const totalSeconds = Math.floor(durationMs / 1000);
|
|
579
|
+
const hours = Math.floor(totalSeconds / 3600);
|
|
580
|
+
const minutes = Math.floor((totalSeconds % 3600) / 60);
|
|
581
|
+
const seconds = totalSeconds % 60;
|
|
582
|
+
|
|
583
|
+
if (!silentMode) {
|
|
584
|
+
console.log(`Scan completed in ${hours}h ${minutes}m ${seconds}s`);
|
|
585
|
+
}
|
|
586
|
+
// Exit
|
|
587
|
+
process.exit(0); // Exit script successfully.
|
|
588
|
+
})();
|