@fanboynz/network-scanner 2.0.62 → 2.0.64
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/README.md +4 -2
- package/config.json +3 -3
- package/lib/adblock-rust.js +368 -0
- package/lib/output.js +35 -24
- package/nwss.js +117 -34
- package/package.json +6 -5
package/CLAUDE.md
CHANGED
|
@@ -27,7 +27,7 @@ Puppeteer-based network scanner for analyzing web traffic, generating adblock fi
|
|
|
27
27
|
|
|
28
28
|
## Tech Stack
|
|
29
29
|
|
|
30
|
-
- **Node.js** >=
|
|
30
|
+
- **Node.js** >=22.0.0
|
|
31
31
|
- **puppeteer** >=20.0.0 — Headless browser automation
|
|
32
32
|
- **psl** — Public Suffix List for domain parsing
|
|
33
33
|
- **lru-cache** — LRU cache implementation
|
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@ A Puppeteer-based tool for scanning websites to find third-party (or optionally
|
|
|
5
5
|
- Scan websites and detect matching third-party or first-party resources
|
|
6
6
|
- Output Adblock-formatted blocking rules
|
|
7
7
|
- Support for multiple filters per site
|
|
8
|
-
- Grouped titles (! <url>) before site matches
|
|
8
|
+
- Grouped titles (! <url>) before site matches, including redirect source and matching regex
|
|
9
9
|
- Ignore unwanted domains (global and per-site)
|
|
10
10
|
- Block unwanted domains during scan (simulate adblock)
|
|
11
11
|
- Support Chrome, Firefox, Safari user agents (desktop or mobile)
|
|
@@ -64,6 +64,7 @@ A Puppeteer-based tool for scanning websites to find third-party (or optionally
|
|
|
64
64
|
| `--headful` | Launch browser with GUI (not headless) |
|
|
65
65
|
| `--keep-open` | Keep browser and tabs open after scan completes (use with `--headful` for debugging) |
|
|
66
66
|
| `--use-puppeteer-core` | Use `puppeteer-core` with system Chrome instead of bundled Chromium |
|
|
67
|
+
| `--use-obscura` | Connect to running Obscura CDP server (`ws://127.0.0.1:9222` or `OBSCURA_WS` env). Skips fingerprint injection — Obscura provides built-in stealth |
|
|
67
68
|
| `--load-extension <path>` | Load unpacked Chrome extension from directory (can be used multiple times) |
|
|
68
69
|
| `--dns-cache` | Persist dig/whois results to disk between runs (14hr TTL, `.digcache`/`.whoiscache`) |
|
|
69
70
|
| `--block-ads=<files>` | Block ads using EasyList format rules (comma-separated: `easylist.txt,easyprivacy.txt`) |
|
|
@@ -448,7 +449,7 @@ node nwss.js config-clean2.json --debug # .nwssconfig + debug overri
|
|
|
448
449
|
node nwss.js config-other.json --max-concurrent 5 # no match in .nwssconfig, uses CLI flags
|
|
449
450
|
```
|
|
450
451
|
|
|
451
|
-
**Supported settings:** `output`, `max_concurrent`, `dns_cache`, `cache_requests`, `dumpurls`, `remove_tempfiles`, `color`, `remove_dupes`, `compress_logs`, `debug`, `silent`, `verbose`, `headful`, `keep_open`, `dry_run`, `titles`, `sub_domains`, `no_interact`, `ghost_cursor`, `plain`, `cdp`, `dnsmasq`, `unbound`, `privoxy`, `pihole`, `eval_on_doc`, `use_puppeteer_core`, `ignore_cache`, `clear_cache`, `block_ads`, `compare`, `localhost`, `append`.
|
|
452
|
+
**Supported settings:** `output`, `max_concurrent`, `dns_cache`, `cache_requests`, `dumpurls`, `remove_tempfiles`, `color`, `remove_dupes`, `compress_logs`, `debug`, `silent`, `verbose`, `headful`, `keep_open`, `dry_run`, `titles`, `sub_domains`, `no_interact`, `ghost_cursor`, `plain`, `cdp`, `dnsmasq`, `unbound`, `privoxy`, `pihole`, `eval_on_doc`, `use_puppeteer_core`, `use_obscura`, `ignore_cache`, `clear_cache`, `block_ads`, `compare`, `localhost`, `append`.
|
|
452
453
|
|
|
453
454
|
**Priority:** CLI flags > `.nwssconfig` > hardcoded defaults.
|
|
454
455
|
|
|
@@ -461,6 +462,7 @@ These options go at the root level of your config.json:
|
|
|
461
462
|
| Field | Values | Default | Description |
|
|
462
463
|
|:---------------------|:-------|:-------:|:------------|
|
|
463
464
|
| `ignoreDomains` | Array | - | Domains to completely ignore (supports wildcards like `*.ads.com`) |
|
|
465
|
+
| `ignoreDomainsByUrl` | Array | - | Regex patterns; if a request URL matches, the request's root domain is dynamically ignored for the rest of the scan (e.g. `["\\/jwplayer\\/", "\\/build\\/assets\\/"]`) |
|
|
464
466
|
| `blocked` | Array | - | Global regex patterns to block requests (combined with per-site blocked) |
|
|
465
467
|
| `whois_server_mode` | String | `"random"` | Default server selection mode for all sites |
|
|
466
468
|
| `ignore_similar` | Boolean | `true` | Ignore domains similar to already found domains |
|
package/config.json
CHANGED
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
"sites": [
|
|
39
39
|
{
|
|
40
40
|
"url": "https://www.anandtech.com/",
|
|
41
|
-
"filterRegex": "
|
|
41
|
+
"filterRegex": "teststring",
|
|
42
42
|
"resourceTypes": ["script", "xhr", "document"],
|
|
43
43
|
"reload": 1,
|
|
44
44
|
"timeout": 25000,
|
|
@@ -50,7 +50,7 @@
|
|
|
50
50
|
},
|
|
51
51
|
{
|
|
52
52
|
"url": "https://www.tomshardware.com/",
|
|
53
|
-
"filterRegex": "
|
|
53
|
+
"filterRegex": "anotherstrng",
|
|
54
54
|
"resourceTypes": ["all"],
|
|
55
55
|
"reload": 2,
|
|
56
56
|
"timeout": 25000,
|
|
@@ -61,7 +61,7 @@
|
|
|
61
61
|
},
|
|
62
62
|
{
|
|
63
63
|
"url": ["https://www.tomshardware.com/", "https://www.anandtech.com/"],
|
|
64
|
-
"filterRegex": "
|
|
64
|
+
"filterRegex": "morestrings",
|
|
65
65
|
"resourceTypes": ["all"],
|
|
66
66
|
"reload": 2,
|
|
67
67
|
"timeout": 25000,
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
// === Adblock Rust Engine Wrapper (adblock-rust.js) ===
|
|
2
|
+
// Drop-in replacement for ./lib/adblock that delegates matching to Brave's
|
|
3
|
+
// adblock-rust engine (npm: adblock-rs) for higher throughput on large lists.
|
|
4
|
+
//
|
|
5
|
+
// Exposes the same parseAdblockRules(filePath, options) factory and the same
|
|
6
|
+
// matcher shape ({ shouldBlock, getStats, rules }) so nwss.js can switch
|
|
7
|
+
// engines with a single require() swap.
|
|
8
|
+
|
|
9
|
+
const fs = require('fs');
|
|
10
|
+
const path = require('path');
|
|
11
|
+
const os = require('os');
|
|
12
|
+
const crypto = require('crypto');
|
|
13
|
+
|
|
14
|
+
let adblockRust = null;
|
|
15
|
+
let adblockRustVersion = null;
|
|
16
|
+
function loadAdblockRust() {
|
|
17
|
+
if (adblockRust) return adblockRust;
|
|
18
|
+
try {
|
|
19
|
+
adblockRust = require('adblock-rs');
|
|
20
|
+
// Read once for the cache key — serialized engine format is not promised
|
|
21
|
+
// stable across versions, so partitioning cache files by version means
|
|
22
|
+
// upgrades cleanly invalidate without producing confusing deserialize
|
|
23
|
+
// failures on the warm path.
|
|
24
|
+
adblockRustVersion = require('adblock-rs/package.json').version;
|
|
25
|
+
} catch (err) {
|
|
26
|
+
throw new Error(
|
|
27
|
+
"adblock-rs is not installed. Install with: npm install adblock-rs " +
|
|
28
|
+
"(requires Rust toolchain for native build). Original error: " + err.message
|
|
29
|
+
);
|
|
30
|
+
}
|
|
31
|
+
return adblockRust;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Best-effort cleanup of stale serialized engines. Filter lists change roughly
|
|
35
|
+
// monthly; cache files older than this are unlikely to be reused and only cost
|
|
36
|
+
// disk space. Runs once per cold parse and swallows all errors — cleanup
|
|
37
|
+
// failure must never block a scan.
|
|
38
|
+
function pruneOldCacheFiles(cacheDir, maxAgeMs) {
|
|
39
|
+
try {
|
|
40
|
+
const cutoff = Date.now() - maxAgeMs;
|
|
41
|
+
const files = fs.readdirSync(cacheDir);
|
|
42
|
+
for (const name of files) {
|
|
43
|
+
// Only touch our own files; `.tmp` covers stray writes from killed
|
|
44
|
+
// processes. Skip anything else (in case the dir is shared).
|
|
45
|
+
if (!name.endsWith('.bin') && !name.endsWith('.tmp')) continue;
|
|
46
|
+
const full = path.join(cacheDir, name);
|
|
47
|
+
try {
|
|
48
|
+
if (fs.statSync(full).mtimeMs < cutoff) fs.unlinkSync(full);
|
|
49
|
+
} catch (_) { /* file vanished mid-walk — fine */ }
|
|
50
|
+
}
|
|
51
|
+
} catch (_) { /* dir doesn't exist or unreadable — fine */ }
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Map Puppeteer/CDP resource type names to adblock-rust request types.
|
|
55
|
+
// Uses a null-prototype object so lookups skip the prototype chain — small but
|
|
56
|
+
// free win on a hot-path lookup that runs once per network request.
|
|
57
|
+
const RESOURCE_TYPE_MAP = Object.assign(Object.create(null), {
|
|
58
|
+
'document': 'main_frame',
|
|
59
|
+
'subdocument': 'sub_frame',
|
|
60
|
+
'stylesheet': 'stylesheet',
|
|
61
|
+
'script': 'script',
|
|
62
|
+
'image': 'image',
|
|
63
|
+
'font': 'font',
|
|
64
|
+
'media': 'media',
|
|
65
|
+
'texttrack': 'media',
|
|
66
|
+
'xhr': 'xmlhttprequest',
|
|
67
|
+
'fetch': 'xmlhttprequest',
|
|
68
|
+
'xmlhttprequest': 'xmlhttprequest',
|
|
69
|
+
'eventsource': 'other',
|
|
70
|
+
'websocket': 'websocket',
|
|
71
|
+
'manifest': 'other',
|
|
72
|
+
'signedexchange': 'other',
|
|
73
|
+
'ping': 'ping',
|
|
74
|
+
'cspviolationreport': 'other',
|
|
75
|
+
'preflight': 'other',
|
|
76
|
+
'other': 'other',
|
|
77
|
+
'': ''
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
function normalizeResourceType(type) {
|
|
81
|
+
if (!type) return '';
|
|
82
|
+
return RESOURCE_TYPE_MAP[type] || 'other';
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// Small FIFO cache keyed on (url \0 sourceUrl \0 resourceType). Despite the
|
|
86
|
+
// class name, eviction is insertion-order, not access-order — `get()` does not
|
|
87
|
+
// promote. For this workload (per-page request bursts whose working set fits
|
|
88
|
+
// in maxSize) FIFO and true LRU produce the same evictions, so the simpler
|
|
89
|
+
// path wins. If cache effectiveness becomes a concern with larger working
|
|
90
|
+
// sets, promote on hit by re-inserting (delete + set).
|
|
91
|
+
class ResultLRU {
|
|
92
|
+
constructor(maxSize) {
|
|
93
|
+
this.cache = new Map();
|
|
94
|
+
this.maxSize = maxSize;
|
|
95
|
+
}
|
|
96
|
+
get(k) { return this.cache.get(k); }
|
|
97
|
+
set(k, v) {
|
|
98
|
+
if (this.cache.size >= this.maxSize) {
|
|
99
|
+
this.cache.delete(this.cache.keys().next().value);
|
|
100
|
+
}
|
|
101
|
+
this.cache.set(k, v);
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/**
|
|
106
|
+
* Build a request-blocking matcher backed by Brave's adblock-rs engine.
|
|
107
|
+
*
|
|
108
|
+
* @param {string|string[]} filePathOrArray - One filter list path, or an array
|
|
109
|
+
* of paths to load in order. Order is significant: it affects rule
|
|
110
|
+
* precedence and the cache key.
|
|
111
|
+
* @param {object} [options]
|
|
112
|
+
* @param {boolean} [options.enableLogging=false] - Print parse + cache events.
|
|
113
|
+
* @param {number} [options.resultCacheSize=32000] - Max entries in the
|
|
114
|
+
* per-matcher result cache (FIFO eviction).
|
|
115
|
+
* @param {boolean} [options.useDiskCache=true] - Persist the compiled engine
|
|
116
|
+
* to disk and reload on next run with the same input lists + library version.
|
|
117
|
+
* @param {string} [options.cacheDir] - Directory for compiled-engine cache
|
|
118
|
+
* files. Defaults to a folder under the OS temp dir.
|
|
119
|
+
* @param {number} [options.cacheTtlMs=2592000000] - Files in cacheDir older
|
|
120
|
+
* than this are pruned during cold parse. Default 30 days.
|
|
121
|
+
* @returns {{shouldBlock: Function, getStats: Function, rules: object}}
|
|
122
|
+
*/
|
|
123
|
+
function parseAdblockRules(filePathOrArray, options = {}) {
|
|
124
|
+
const {
|
|
125
|
+
enableLogging = false,
|
|
126
|
+
resultCacheSize = 32000,
|
|
127
|
+
useDiskCache = true,
|
|
128
|
+
cacheDir = path.join(os.tmpdir(), 'nwss-adblock-rs-cache'),
|
|
129
|
+
cacheTtlMs = 30 * 24 * 60 * 60 * 1000
|
|
130
|
+
} = options;
|
|
131
|
+
const rust = loadAdblockRust();
|
|
132
|
+
|
|
133
|
+
// Accept a single path or an array of paths — caller no longer needs to
|
|
134
|
+
// materialize a temp concatenation file for multi-list scans.
|
|
135
|
+
const filePaths = Array.isArray(filePathOrArray) ? filePathOrArray : [filePathOrArray];
|
|
136
|
+
|
|
137
|
+
// Read all files up front; hash the raw bytes so the disk cache key reflects
|
|
138
|
+
// both content changes and list-order changes. Mix in the adblock-rs version
|
|
139
|
+
// so a library upgrade (which may change the serialized format) doesn't try
|
|
140
|
+
// to deserialize an incompatible blob.
|
|
141
|
+
const buffers = [];
|
|
142
|
+
const hash = crypto.createHash('sha256');
|
|
143
|
+
hash.update('adblock-rs:' + adblockRustVersion + '\0');
|
|
144
|
+
let totalBytes = 0;
|
|
145
|
+
for (const fp of filePaths) {
|
|
146
|
+
let buf;
|
|
147
|
+
try {
|
|
148
|
+
buf = fs.readFileSync(fp);
|
|
149
|
+
} catch (err) {
|
|
150
|
+
throw new Error(`Adblock rules file not found: ${fp}`);
|
|
151
|
+
}
|
|
152
|
+
buffers.push(buf);
|
|
153
|
+
hash.update(buf);
|
|
154
|
+
hash.update('\0');
|
|
155
|
+
totalBytes += buf.length;
|
|
156
|
+
}
|
|
157
|
+
const cacheKey = hash.digest('hex');
|
|
158
|
+
const cachePath = path.join(cacheDir, cacheKey + '.bin');
|
|
159
|
+
|
|
160
|
+
let engine = null;
|
|
161
|
+
let ruleCount = 0;
|
|
162
|
+
let cacheHit = false;
|
|
163
|
+
|
|
164
|
+
// Fast path: deserialize a previously-compiled engine if available.
|
|
165
|
+
// Skip the existsSync/readFileSync double-syscall pattern — let readFileSync
|
|
166
|
+
// throw ENOENT and treat it as a clean cache-miss. Avoids a redundant stat()
|
|
167
|
+
// and the TOCTOU race where the cache file could be removed between the
|
|
168
|
+
// exists check and the read.
|
|
169
|
+
if (useDiskCache) {
|
|
170
|
+
let compiled;
|
|
171
|
+
try {
|
|
172
|
+
compiled = fs.readFileSync(cachePath);
|
|
173
|
+
} catch (err) {
|
|
174
|
+
if (err.code !== 'ENOENT' && enableLogging) {
|
|
175
|
+
console.log(`[Adblock-Rust] Cache read failed (${err.message}); reparsing`);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
if (compiled) {
|
|
179
|
+
try {
|
|
180
|
+
engine = new rust.Engine(new rust.FilterSet(enableLogging), true);
|
|
181
|
+
// Avoid copying the ~10MB serialized engine when the underlying
|
|
182
|
+
// ArrayBuffer is exclusively ours (true for any read above Node's
|
|
183
|
+
// ~4KB Buffer pool threshold — i.e. always for compiled engines).
|
|
184
|
+
// Fall back to slicing only when the Buffer is a view into a pooled
|
|
185
|
+
// backing store, which would otherwise leak unrelated data.
|
|
186
|
+
const ab = (compiled.byteOffset === 0 &&
|
|
187
|
+
compiled.byteLength === compiled.buffer.byteLength)
|
|
188
|
+
? compiled.buffer
|
|
189
|
+
: compiled.buffer.slice(
|
|
190
|
+
compiled.byteOffset,
|
|
191
|
+
compiled.byteOffset + compiled.byteLength
|
|
192
|
+
);
|
|
193
|
+
engine.deserialize(ab);
|
|
194
|
+
cacheHit = true;
|
|
195
|
+
} catch (err) {
|
|
196
|
+
// Corrupt cache or version mismatch — fall through to a fresh parse.
|
|
197
|
+
engine = null;
|
|
198
|
+
if (enableLogging) {
|
|
199
|
+
console.log(`[Adblock-Rust] Cache deserialize failed (${err.message}); reparsing`);
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
if (!engine) {
|
|
206
|
+
// Slow path: parse every list. Use addFilters per-file so a single bad
|
|
207
|
+
// line in one list does not blast the whole input, and so the per-list
|
|
208
|
+
// line count is correct. Release each buffer's reference as soon as it
|
|
209
|
+
// is consumed so GC can reclaim the file bytes mid-loop instead of
|
|
210
|
+
// holding all input files (~3-5MB combined for easylist+easyprivacy)
|
|
211
|
+
// alive until the function returns.
|
|
212
|
+
const filterSet = new rust.FilterSet(enableLogging);
|
|
213
|
+
for (let i = 0; i < buffers.length; i++) {
|
|
214
|
+
const buf = buffers[i];
|
|
215
|
+
buffers[i] = null;
|
|
216
|
+
const lines = buf.toString('utf-8').split('\n');
|
|
217
|
+
for (let j = 0; j < lines.length; j++) {
|
|
218
|
+
const line = lines[j];
|
|
219
|
+
if (line.length === 0) continue;
|
|
220
|
+
if (line.charCodeAt(0) === 0x21) continue;
|
|
221
|
+
ruleCount++;
|
|
222
|
+
}
|
|
223
|
+
filterSet.addFilters(lines);
|
|
224
|
+
}
|
|
225
|
+
engine = new rust.Engine(filterSet, true);
|
|
226
|
+
|
|
227
|
+
if (useDiskCache) {
|
|
228
|
+
try {
|
|
229
|
+
fs.mkdirSync(cacheDir, { recursive: true });
|
|
230
|
+
const serialized = engine.serialize();
|
|
231
|
+
// Atomic write: writeFileSync to a per-pid tmp path then rename. If
|
|
232
|
+
// the process is killed mid-write we leave a stray .tmp file (cleaned
|
|
233
|
+
// up by the TTL prune on a future run) but the final cachePath is
|
|
234
|
+
// either complete or absent — never half-written.
|
|
235
|
+
const tmpPath = cachePath + '.' + process.pid + '.tmp';
|
|
236
|
+
fs.writeFileSync(tmpPath, Buffer.from(serialized));
|
|
237
|
+
fs.renameSync(tmpPath, cachePath);
|
|
238
|
+
// Best-effort prune of stale cache files. Done after our own write so
|
|
239
|
+
// we never delete the entry we just created.
|
|
240
|
+
pruneOldCacheFiles(cacheDir, cacheTtlMs);
|
|
241
|
+
} catch (err) {
|
|
242
|
+
if (enableLogging) {
|
|
243
|
+
console.log(`[Adblock-Rust] Cache write failed (${err.message}); continuing`);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
const stats = {
|
|
250
|
+
// When deserialized from cache we don't see the rules; report bytes instead
|
|
251
|
+
// so the startup banner remains informative.
|
|
252
|
+
total: cacheHit ? null : ruleCount,
|
|
253
|
+
bytes: totalBytes,
|
|
254
|
+
engine: 'adblock-rust',
|
|
255
|
+
fromDiskCache: cacheHit,
|
|
256
|
+
listCount: filePaths.length,
|
|
257
|
+
blocked: 0,
|
|
258
|
+
allowed: 0,
|
|
259
|
+
exceptions: 0,
|
|
260
|
+
errors: 0,
|
|
261
|
+
cacheHits: 0,
|
|
262
|
+
cacheMisses: 0
|
|
263
|
+
};
|
|
264
|
+
|
|
265
|
+
const resultCache = new ResultLRU(resultCacheSize);
|
|
266
|
+
// Hot-path optimization: shared "no_match" object — most checks return this,
|
|
267
|
+
// skip per-call object allocation. Safe because callers only read fields.
|
|
268
|
+
const NO_MATCH = Object.freeze({ blocked: false, rule: null, reason: 'no_match' });
|
|
269
|
+
// Bind once: skips the prototype property lookup for `engine.check` on every
|
|
270
|
+
// call. The adblock-rs forwarder still does an internal name concat per
|
|
271
|
+
// invocation; bypassing that further would require reaching into the native
|
|
272
|
+
// binding (engine.boxed + blocker.Engine_check), which is brittle across
|
|
273
|
+
// library versions.
|
|
274
|
+
const engineCheck = engine.check.bind(engine);
|
|
275
|
+
|
|
276
|
+
if (enableLogging) {
|
|
277
|
+
if (cacheHit) {
|
|
278
|
+
console.log(`[Adblock-Rust] Restored compiled engine from ${cachePath} (${(totalBytes/1024/1024).toFixed(2)}MB source, ${filePaths.length} list${filePaths.length>1?'s':''})`);
|
|
279
|
+
} else {
|
|
280
|
+
console.log(`[Adblock-Rust] Compiled ${ruleCount} rules from ${filePaths.length} list${filePaths.length>1?'s':''} (${(totalBytes/1024/1024).toFixed(2)}MB)`);
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
return {
|
|
285
|
+
rules: { stats },
|
|
286
|
+
|
|
287
|
+
shouldBlock(url, sourceUrl, resourceType) {
|
|
288
|
+
// Avoid default-parameter syntax in the hot path — the `||` fallbacks below
|
|
289
|
+
// are slightly cheaper for V8's argument adaptor and also map null (not just undefined) to ''.
|
|
290
|
+
const src = sourceUrl || '';
|
|
291
|
+
const rt = resourceType || '';
|
|
292
|
+
// Single null-proto object lookup; falls back to 'other' for unknown types.
|
|
293
|
+
const normType = rt ? (RESOURCE_TYPE_MAP[rt] || 'other') : '';
|
|
294
|
+
const key = url + '\0' + src + '\0' + normType;
|
|
295
|
+
const cached = resultCache.get(key);
|
|
296
|
+
if (cached !== undefined) {
|
|
297
|
+
stats.cacheHits++;
|
|
298
|
+
return cached;
|
|
299
|
+
}
|
|
300
|
+
stats.cacheMisses++;
|
|
301
|
+
|
|
302
|
+
// Narrow try/catch to the native call only — keeps the rest of the
|
|
303
|
+
// function on TurboFan's fast path and avoids exception-handler overhead
|
|
304
|
+
// on stats updates and Map operations.
|
|
305
|
+
let result;
|
|
306
|
+
try {
|
|
307
|
+
// Pass empty string (not the request URL) when source is unknown — the
|
|
308
|
+
// engine then skips first/third-party determination instead of treating
|
|
309
|
+
// the request as same-origin to itself, which would suppress
|
|
310
|
+
// $third-party rules entirely.
|
|
311
|
+
// The 4th arg MUST be true: with false adblock-rs returns a bare
|
|
312
|
+
// boolean instead of the {matched, exception, filter, important}
|
|
313
|
+
// object we read below, which silently breaks matching.
|
|
314
|
+
result = engineCheck(url, src, normType, true);
|
|
315
|
+
} catch (err) {
|
|
316
|
+
stats.errors++;
|
|
317
|
+
if (enableLogging) {
|
|
318
|
+
console.log(`[Adblock-Rust] Error checking ${url}: ${err.message}`);
|
|
319
|
+
}
|
|
320
|
+
// Don't cache errors — next call may succeed (transient native panic).
|
|
321
|
+
return { blocked: false, rule: null, reason: 'error' };
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
// engine.check is contract-bound to return an object; no null guard
|
|
325
|
+
// needed. Reading each field once into a local keeps the IC monomorphic.
|
|
326
|
+
let r;
|
|
327
|
+
if (result.matched) {
|
|
328
|
+
const exception = result.exception;
|
|
329
|
+
if (exception) {
|
|
330
|
+
stats.exceptions++;
|
|
331
|
+
r = { blocked: false, rule: exception, reason: 'whitelisted' };
|
|
332
|
+
} else {
|
|
333
|
+
stats.blocked++;
|
|
334
|
+
r = {
|
|
335
|
+
blocked: true,
|
|
336
|
+
rule: result.filter || null,
|
|
337
|
+
reason: result.important ? 'important_rule' : 'adblock_rust'
|
|
338
|
+
};
|
|
339
|
+
}
|
|
340
|
+
} else {
|
|
341
|
+
stats.allowed++;
|
|
342
|
+
r = NO_MATCH;
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
resultCache.set(key, r);
|
|
346
|
+
return r;
|
|
347
|
+
},
|
|
348
|
+
|
|
349
|
+
getStats() {
|
|
350
|
+
const total = stats.cacheHits + stats.cacheMisses;
|
|
351
|
+
const hitRate = total > 0 ? ((stats.cacheHits / total) * 100).toFixed(1) + '%' : '0%';
|
|
352
|
+
return {
|
|
353
|
+
...stats,
|
|
354
|
+
cache: {
|
|
355
|
+
hits: stats.cacheHits,
|
|
356
|
+
misses: stats.cacheMisses,
|
|
357
|
+
hitRate,
|
|
358
|
+
size: resultCache.cache.size,
|
|
359
|
+
maxSize: resultCache.maxSize
|
|
360
|
+
}
|
|
361
|
+
};
|
|
362
|
+
}
|
|
363
|
+
};
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
module.exports = {
|
|
367
|
+
parseAdblockRules
|
|
368
|
+
};
|
package/lib/output.js
CHANGED
|
@@ -324,35 +324,40 @@ function buildOutputLines(results, options = {}) {
|
|
|
324
324
|
const { showTitles = false, removeDupes = false, ignoreDomains = [], forLogFile = false } = options;
|
|
325
325
|
|
|
326
326
|
// Consolidate rules from all results, handling multiple results for same URL
|
|
327
|
-
const consolidatedRules = new Map(); // URL -> Set
|
|
327
|
+
const consolidatedRules = new Map(); // URL -> { rules: Set, originalUrl, regexes: Set }
|
|
328
328
|
let successfulPageLoads = 0;
|
|
329
|
-
|
|
329
|
+
|
|
330
330
|
results.forEach(result => {
|
|
331
331
|
if (result) {
|
|
332
332
|
if (result.success) {
|
|
333
333
|
successfulPageLoads++;
|
|
334
334
|
}
|
|
335
335
|
if (result.rules && result.rules.length > 0) {
|
|
336
|
-
// Consolidate rules by URL to handle multiple site entries for same URL
|
|
337
336
|
if (!consolidatedRules.has(result.url)) {
|
|
338
|
-
consolidatedRules.set(result.url, new Set());
|
|
337
|
+
consolidatedRules.set(result.url, { rules: new Set(), originalUrl: result.originalUrl || result.url, regexes: new Set() });
|
|
338
|
+
}
|
|
339
|
+
const entry = consolidatedRules.get(result.url);
|
|
340
|
+
result.rules.forEach(rule => entry.rules.add(rule));
|
|
341
|
+
if (Array.isArray(result.matchedRegexes)) {
|
|
342
|
+
result.matchedRegexes.forEach(rx => entry.regexes.add(rx));
|
|
343
|
+
}
|
|
344
|
+
// Prefer the original URL from any result entry whose original URL differs from its final URL
|
|
345
|
+
if (result.originalUrl && result.originalUrl !== result.url) {
|
|
346
|
+
entry.originalUrl = result.originalUrl;
|
|
339
347
|
}
|
|
340
|
-
|
|
341
|
-
// Add all rules from this result to the consolidated set
|
|
342
|
-
result.rules.forEach(rule => {
|
|
343
|
-
consolidatedRules.get(result.url).add(rule);
|
|
344
|
-
});
|
|
345
348
|
}
|
|
346
349
|
}
|
|
347
350
|
});
|
|
348
351
|
|
|
349
352
|
// Convert consolidated rules back to array format
|
|
350
353
|
const finalSiteRules = [];
|
|
351
|
-
consolidatedRules.forEach((
|
|
352
|
-
if (
|
|
353
|
-
finalSiteRules.push({
|
|
354
|
-
url: url,
|
|
355
|
-
|
|
354
|
+
consolidatedRules.forEach((entry, url) => {
|
|
355
|
+
if (entry.rules.size > 0) {
|
|
356
|
+
finalSiteRules.push({
|
|
357
|
+
url: url,
|
|
358
|
+
originalUrl: entry.originalUrl,
|
|
359
|
+
regexes: Array.from(entry.regexes),
|
|
360
|
+
rules: Array.from(entry.rules)
|
|
356
361
|
});
|
|
357
362
|
}
|
|
358
363
|
});
|
|
@@ -362,35 +367,41 @@ function buildOutputLines(results, options = {}) {
|
|
|
362
367
|
const outputLinesWithTitles = [];
|
|
363
368
|
let filteredOutCount = 0;
|
|
364
369
|
|
|
365
|
-
for (const { url, rules } of finalSiteRules) {
|
|
370
|
+
for (const { url, originalUrl, regexes, rules } of finalSiteRules) {
|
|
366
371
|
if (rules.length > 0) {
|
|
372
|
+
// Build title comments — include the redirect source (if the URL changed) and any matched regex(es)
|
|
373
|
+
const titleLines = [`! ${url}`];
|
|
374
|
+
if (originalUrl && originalUrl !== url) {
|
|
375
|
+
titleLines.push(`! Redirected from: ${originalUrl}`);
|
|
376
|
+
}
|
|
377
|
+
if (regexes && regexes.length > 0) {
|
|
378
|
+
titleLines.push(`! Regex: ${regexes.join(', ')}`);
|
|
379
|
+
}
|
|
380
|
+
|
|
367
381
|
// Regular output (for -o files and console) - only add titles if --titles flag used
|
|
368
382
|
if (showTitles) {
|
|
369
|
-
outputLines.push(
|
|
383
|
+
outputLines.push(...titleLines);
|
|
370
384
|
}
|
|
371
|
-
|
|
385
|
+
|
|
372
386
|
// Filter out ignored domains from rules
|
|
373
387
|
const filteredRules = rules.filter(rule => {
|
|
374
388
|
const domain = extractDomainFromRule(rule);
|
|
375
389
|
if (domain && matchesIgnoreDomain(domain, ignoreDomains)) {
|
|
376
390
|
filteredOutCount++;
|
|
377
|
-
|
|
378
|
-
// Log each filtered domain
|
|
379
391
|
if (options.forceDebug) {
|
|
380
392
|
console.log(formatLogMessage('debug', `[output-filter] Removed rule matching ignoreDomains: ${rule} (domain: ${domain})`));
|
|
381
393
|
} else if (!options.silentMode) {
|
|
382
394
|
console.log(formatLogMessage('info', `Filtered out: ${domain}`));
|
|
383
|
-
|
|
384
|
-
|
|
395
|
+
}
|
|
385
396
|
return false;
|
|
386
397
|
}
|
|
387
398
|
return true;
|
|
388
399
|
});
|
|
389
|
-
|
|
400
|
+
|
|
390
401
|
outputLines.push(...filteredRules);
|
|
391
|
-
|
|
402
|
+
|
|
392
403
|
// Output with titles (for auto-saved log files) - always add titles
|
|
393
|
-
outputLinesWithTitles.push(
|
|
404
|
+
outputLinesWithTitles.push(...titleLines);
|
|
394
405
|
outputLinesWithTitles.push(...filteredRules);
|
|
395
406
|
}
|
|
396
407
|
}
|
package/nwss.js
CHANGED
|
@@ -58,7 +58,8 @@ const { clearSiteData } = require('./lib/clear_sitedata');
|
|
|
58
58
|
// Referrer header generation
|
|
59
59
|
const { getReferrerForUrl, validateReferrerConfig, validateReferrerDisable } = require('./lib/referrer');
|
|
60
60
|
// Adblock rules parser
|
|
61
|
-
const
|
|
61
|
+
const adblockJs = require('./lib/adblock');
|
|
62
|
+
const adblockRust = require('./lib/adblock-rust');
|
|
62
63
|
// WireGuard VPN
|
|
63
64
|
const { connectForSite: wgConnect, disconnectForSite: wgDisconnect, disconnectAll: wgDisconnectAll, validateVpnConfig, normalizeVpnConfig } = require('./lib/wireguard_vpn');
|
|
64
65
|
// OpenVPN
|
|
@@ -185,9 +186,19 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
|
|
|
185
186
|
const nwssConfig = JSON.parse(fs.readFileSync(NWSSCONFIG_PATH, 'utf-8'));
|
|
186
187
|
// Find which config file is being used (--custom-json <file> or positional .json arg)
|
|
187
188
|
const customJsonIdx = args.findIndex(arg => arg === '--custom-json');
|
|
189
|
+
const positionalJson = (customJsonIdx === -1)
|
|
190
|
+
? args.find(a => a.endsWith('.json') && !a.startsWith('--'))
|
|
191
|
+
: null;
|
|
188
192
|
const configFilename = (customJsonIdx !== -1 && args[customJsonIdx + 1])
|
|
189
193
|
? args[customJsonIdx + 1]
|
|
190
|
-
:
|
|
194
|
+
: positionalJson;
|
|
195
|
+
|
|
196
|
+
// If a positional .json was used (not --custom-json), wire it to --custom-json
|
|
197
|
+
// so the real config loader picks it up instead of defaulting to config.json
|
|
198
|
+
if (positionalJson && customJsonIdx === -1) {
|
|
199
|
+
args.push('--custom-json', positionalJson);
|
|
200
|
+
process.argv.push('--custom-json', positionalJson);
|
|
201
|
+
}
|
|
191
202
|
|
|
192
203
|
if (configFilename && nwssConfig.configs && nwssConfig.configs[configFilename]) {
|
|
193
204
|
const settings = nwssConfig.configs[configFilename];
|
|
@@ -584,6 +595,22 @@ if (validateRules || validateRulesFile) {
|
|
|
584
595
|
}
|
|
585
596
|
}
|
|
586
597
|
|
|
598
|
+
// Parse --adblock-engine=<js|rust> (default: js). Selects the matcher backend
|
|
599
|
+
// used by --block-ads. The rust engine requires the optional adblock-rs package.
|
|
600
|
+
const adblockEngineIndex = args.findIndex(arg => arg.startsWith('--adblock-engine'));
|
|
601
|
+
let adblockEngineName = 'js';
|
|
602
|
+
if (adblockEngineIndex !== -1) {
|
|
603
|
+
const engineArg = args[adblockEngineIndex].includes('=')
|
|
604
|
+
? args[adblockEngineIndex].split('=')[1]
|
|
605
|
+
: args[adblockEngineIndex + 1];
|
|
606
|
+
if (engineArg === 'rust' || engineArg === 'js') {
|
|
607
|
+
adblockEngineName = engineArg;
|
|
608
|
+
} else {
|
|
609
|
+
console.log(`Error: --adblock-engine must be 'js' or 'rust' (got: ${engineArg})`);
|
|
610
|
+
process.exit(1);
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
587
614
|
// Parse --block-ads argument for request-level ad blocking (supports comma-separated lists)
|
|
588
615
|
const blockAdsIndex = args.findIndex(arg => arg.startsWith('--block-ads'));
|
|
589
616
|
if (blockAdsIndex !== -1) {
|
|
@@ -604,18 +631,31 @@ if (blockAdsIndex !== -1) {
|
|
|
604
631
|
}
|
|
605
632
|
}
|
|
606
633
|
|
|
607
|
-
// Concatenate multiple lists into a single temp file for the parser
|
|
608
|
-
let rulesFile = rulesFiles[0];
|
|
609
|
-
if (rulesFiles.length > 1) {
|
|
610
|
-
rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
|
|
611
|
-
const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
|
|
612
|
-
fs.writeFileSync(rulesFile, combined);
|
|
613
|
-
}
|
|
614
|
-
|
|
615
634
|
adblockEnabled = true;
|
|
616
|
-
|
|
635
|
+
const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
|
|
636
|
+
try {
|
|
637
|
+
if (engine === adblockRust) {
|
|
638
|
+
// Rust wrapper accepts an array directly — no temp file needed.
|
|
639
|
+
adblockMatcher = engine.parseAdblockRules(rulesFiles, { enableLogging: forceDebug });
|
|
640
|
+
} else {
|
|
641
|
+
// JS engine takes a single path; concat to a temp file when multiple lists.
|
|
642
|
+
let rulesFile = rulesFiles[0];
|
|
643
|
+
if (rulesFiles.length > 1) {
|
|
644
|
+
rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
|
|
645
|
+
const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
|
|
646
|
+
fs.writeFileSync(rulesFile, combined);
|
|
647
|
+
}
|
|
648
|
+
adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
|
|
649
|
+
}
|
|
650
|
+
} catch (err) {
|
|
651
|
+
console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
|
|
652
|
+
process.exit(1);
|
|
653
|
+
}
|
|
617
654
|
const stats = adblockMatcher.getStats();
|
|
618
|
-
|
|
655
|
+
const ruleDesc = stats.total != null
|
|
656
|
+
? `${stats.total} blocking rules`
|
|
657
|
+
: `compiled engine (cached)`;
|
|
658
|
+
if (!silentMode) console.log(messageColors.success(`Adblock enabled (${adblockEngineName}): Loaded ${ruleDesc} from ${rulesFiles.length} list${rulesFiles.length > 1 ? 's' : ''}`));
|
|
619
659
|
}
|
|
620
660
|
|
|
621
661
|
if (args.includes('--help') || args.includes('-h')) {
|
|
@@ -641,6 +681,9 @@ Output Format Options:
|
|
|
641
681
|
Request Blocking:
|
|
642
682
|
--block-ads=<file> Block ads/trackers using EasyList format rules (||domain.com^, /ads/*, etc)
|
|
643
683
|
Works at request-level for maximum performance
|
|
684
|
+
Supports comma-separated lists: --block-ads=easylist.txt,easyprivacy.txt
|
|
685
|
+
--adblock-engine=<js|rust> Matcher backend for --block-ads (default: js)
|
|
686
|
+
'rust' uses Brave's adblock-rs (faster on large lists; needs: npm i adblock-rs)
|
|
644
687
|
|
|
645
688
|
Per-config settings file (.nwssconfig):
|
|
646
689
|
Place a .nwssconfig file in the project root to define per-config settings.
|
|
@@ -687,6 +730,7 @@ Validation Options:
|
|
|
687
730
|
|
|
688
731
|
Global config.json options:
|
|
689
732
|
ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
|
|
733
|
+
ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
|
|
690
734
|
blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
|
|
691
735
|
whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
|
|
692
736
|
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
|
|
@@ -854,8 +898,9 @@ try {
|
|
|
854
898
|
// Extract config values while ignoring 'comments' field at global and site levels
|
|
855
899
|
const {
|
|
856
900
|
sites = [],
|
|
857
|
-
ignoreDomains = [],
|
|
858
|
-
|
|
901
|
+
ignoreDomains = [],
|
|
902
|
+
ignoreDomainsByUrl = [],
|
|
903
|
+
blocked: globalBlocked = [],
|
|
859
904
|
whois_delay = 3000,
|
|
860
905
|
whois_server_mode = 'random',
|
|
861
906
|
ignore_similar = true,
|
|
@@ -901,6 +946,15 @@ for (const pattern of ignoreDomains) {
|
|
|
901
946
|
}
|
|
902
947
|
}
|
|
903
948
|
|
|
949
|
+
// Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains
|
|
950
|
+
const _ignoreDomainsByUrlRegexes = Array.isArray(ignoreDomainsByUrl)
|
|
951
|
+
? ignoreDomainsByUrl.map(p => {
|
|
952
|
+
try { return getCompiledRegex(p); } catch { return null; }
|
|
953
|
+
}).filter(r => r)
|
|
954
|
+
: [];
|
|
955
|
+
// Runtime Set of domains marked ignored by URL pattern matches — shared across all sites in this scan
|
|
956
|
+
const _dynamicallyIgnoredDomains = new Set();
|
|
957
|
+
|
|
904
958
|
// Apply global configuration overrides with validation
|
|
905
959
|
// Priority: Command line args > config.json > defaults
|
|
906
960
|
const MAX_CONCURRENT_SITES = (() => {
|
|
@@ -1312,6 +1366,8 @@ function shouldBypassCacheForUrl(url, siteConfig) {
|
|
|
1312
1366
|
// Cache compiled wildcard regexes to avoid recompilation on every request
|
|
1313
1367
|
const _wildcardRegexCache = new Map();
|
|
1314
1368
|
function matchesIgnoreDomain(domain, ignorePatterns) {
|
|
1369
|
+
// Dynamically ignored domains (from URL pattern matches via ignoreDomainsByUrl)
|
|
1370
|
+
if (_dynamicallyIgnoredDomains.has(domain)) return true;
|
|
1315
1371
|
// Fast path: exact match or suffix match against Set (O(n) for parts, but no regex)
|
|
1316
1372
|
if (_ignoreDomainsExact.size > 0) {
|
|
1317
1373
|
if (_ignoreDomainsExact.has(domain)) return true;
|
|
@@ -1789,6 +1845,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
1789
1845
|
* @returns {Promise<object>} A promise that resolves to an object containing scan results.
|
|
1790
1846
|
*/
|
|
1791
1847
|
async function processUrl(currentUrl, siteConfig, browserInstance) {
|
|
1848
|
+
// Preserve the original URL (before any redirect) for output display
|
|
1849
|
+
const originalRequestedUrl = currentUrl;
|
|
1850
|
+
// Track regex patterns that produced matches (for title comments in output)
|
|
1851
|
+
const matchedRegexPatterns = new Set();
|
|
1792
1852
|
// V8 Optimization: Single destructuring to avoid multiple property lookups
|
|
1793
1853
|
const {
|
|
1794
1854
|
firstParty,
|
|
@@ -2553,6 +2613,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2553
2613
|
const blockedRegexes = Array.isArray(siteConfig.blocked)
|
|
2554
2614
|
? siteConfig.blocked.map(pattern => getCompiledRegex(pattern))
|
|
2555
2615
|
: [];
|
|
2616
|
+
|
|
2617
|
+
// Pre-build Set for O(1) resourceType lookups (fired per request)
|
|
2618
|
+
const allowedResourceTypesSet = Array.isArray(siteConfig.resourceTypes)
|
|
2619
|
+
? new Set(siteConfig.resourceTypes)
|
|
2620
|
+
: null;
|
|
2556
2621
|
|
|
2557
2622
|
// Combine site-specific with pre-compiled global blocked patterns
|
|
2558
2623
|
const allBlockedRegexes = blockedRegexes.length > 0
|
|
@@ -2774,9 +2839,22 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2774
2839
|
bufferedLogWrite(debugLogFile, logEntry);
|
|
2775
2840
|
}
|
|
2776
2841
|
const reqUrl = checkedUrl;
|
|
2777
|
-
|
|
2842
|
+
|
|
2778
2843
|
const reqDomain = perSiteSubDomains ? fullSubdomain : checkedRootDomain;
|
|
2779
2844
|
|
|
2845
|
+
// ignoreDomainsByUrl — if any pattern matches this URL, mark the root domain as ignored for the rest of the scan
|
|
2846
|
+
if (_ignoreDomainsByUrlRegexes.length > 0 && checkedRootDomain && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
|
|
2847
|
+
for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
|
|
2848
|
+
if (_ignoreDomainsByUrlRegexes[i].test(reqUrl)) {
|
|
2849
|
+
_dynamicallyIgnoredDomains.add(checkedRootDomain);
|
|
2850
|
+
if (forceDebug) {
|
|
2851
|
+
console.log(formatLogMessage('debug', `[ignoreDomainsByUrl] ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
|
|
2852
|
+
}
|
|
2853
|
+
break;
|
|
2854
|
+
}
|
|
2855
|
+
}
|
|
2856
|
+
}
|
|
2857
|
+
|
|
2780
2858
|
let blockedMatchIndex = -1;
|
|
2781
2859
|
for (let i = 0; i < allBlockedRegexes.length; i++) {
|
|
2782
2860
|
if (allBlockedRegexes[i].test(reqUrl)) {
|
|
@@ -2801,14 +2879,14 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2801
2879
|
if (reqDomain && !matchesIgnoreDomain(reqDomain, ignoreDomains)) {
|
|
2802
2880
|
for (const re of regexes) {
|
|
2803
2881
|
if (re.test(reqUrl)) {
|
|
2882
|
+
const evenBlockedRegexPattern = re.source;
|
|
2804
2883
|
const resourceType = request.resourceType();
|
|
2805
|
-
|
|
2884
|
+
|
|
2806
2885
|
// Apply same filtering logic as unblocked requests
|
|
2807
|
-
|
|
2808
|
-
if (!allowedResourceTypes || !Array.isArray(allowedResourceTypes) || allowedResourceTypes.includes(resourceType)) {
|
|
2886
|
+
if (!allowedResourceTypesSet || allowedResourceTypesSet.has(resourceType)) {
|
|
2809
2887
|
if (dryRunMode) {
|
|
2810
2888
|
addDryRunMatch(matchedDomains, {
|
|
2811
|
-
regex:
|
|
2889
|
+
regex: evenBlockedRegexPattern,
|
|
2812
2890
|
domain: reqDomain,
|
|
2813
2891
|
resourceType: resourceType,
|
|
2814
2892
|
fullUrl: reqUrl,
|
|
@@ -2818,10 +2896,11 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2818
2896
|
} else {
|
|
2819
2897
|
addMatchedDomain(reqDomain, resourceType, fullSubdomain);
|
|
2820
2898
|
}
|
|
2821
|
-
|
|
2899
|
+
matchedRegexPatterns.add(evenBlockedRegexPattern);
|
|
2900
|
+
|
|
2822
2901
|
if (siteConfig.verbose === 1) {
|
|
2823
2902
|
const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
|
|
2824
|
-
console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${
|
|
2903
|
+
console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${evenBlockedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
|
|
2825
2904
|
}
|
|
2826
2905
|
if (dumpUrls) {
|
|
2827
2906
|
const timestamp = new Date().toISOString();
|
|
@@ -2889,11 +2968,10 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2889
2968
|
|
|
2890
2969
|
// *** UNIVERSAL RESOURCE TYPE FILTER ***
|
|
2891
2970
|
// Check resourceTypes filter FIRST, before ANY processing (nettools, searchstring, immediate matching)
|
|
2892
|
-
|
|
2893
|
-
|
|
2894
|
-
if (!allowedResourceTypes.includes(resourceType)) {
|
|
2971
|
+
if (allowedResourceTypesSet && allowedResourceTypesSet.size > 0) {
|
|
2972
|
+
if (!allowedResourceTypesSet.has(resourceType)) {
|
|
2895
2973
|
if (forceDebug) {
|
|
2896
|
-
console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${
|
|
2974
|
+
console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${Array.from(allowedResourceTypesSet).join(', ')}]. Skipping ALL processing.`));
|
|
2897
2975
|
}
|
|
2898
2976
|
// Skip this URL entirely - doesn't match required resource types
|
|
2899
2977
|
request.continue();
|
|
@@ -2981,6 +3059,7 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
2981
3059
|
} else {
|
|
2982
3060
|
addMatchedDomain(reqDomain, resourceType);
|
|
2983
3061
|
}
|
|
3062
|
+
if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
|
|
2984
3063
|
if (siteConfig.verbose === 1) {
|
|
2985
3064
|
const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
|
|
2986
3065
|
console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${matchedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
|
|
@@ -4011,12 +4090,14 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4011
4090
|
};
|
|
4012
4091
|
const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
|
|
4013
4092
|
|
|
4014
|
-
return {
|
|
4015
|
-
url: currentUrl,
|
|
4016
|
-
|
|
4093
|
+
return {
|
|
4094
|
+
url: currentUrl,
|
|
4095
|
+
originalUrl: originalRequestedUrl,
|
|
4096
|
+
rules: formattedRules,
|
|
4017
4097
|
success: true,
|
|
4018
4098
|
finalUrl: finalUrlAfterRedirect || currentUrl,
|
|
4019
|
-
redirectDomains: redirectDomainsToExclude
|
|
4099
|
+
redirectDomains: redirectDomainsToExclude,
|
|
4100
|
+
matchedRegexes: Array.from(matchedRegexPatterns)
|
|
4020
4101
|
};
|
|
4021
4102
|
}
|
|
4022
4103
|
|
|
@@ -4072,13 +4153,15 @@ function setupFrameHandling(page, forceDebug) {
|
|
|
4072
4153
|
};
|
|
4073
4154
|
const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
|
|
4074
4155
|
if (forceDebug) console.log(formatLogMessage('debug', `Saving ${formattedRules.length} rules despite page load failure`));
|
|
4075
|
-
return {
|
|
4076
|
-
url: currentUrl,
|
|
4077
|
-
|
|
4078
|
-
|
|
4156
|
+
return {
|
|
4157
|
+
url: currentUrl,
|
|
4158
|
+
originalUrl: originalRequestedUrl,
|
|
4159
|
+
rules: formattedRules,
|
|
4160
|
+
success: false,
|
|
4079
4161
|
hasMatches: true,
|
|
4080
4162
|
finalUrl: finalUrlAfterRedirect || currentUrl,
|
|
4081
|
-
redirectDomains: redirectDomainsToExclude
|
|
4163
|
+
redirectDomains: redirectDomainsToExclude,
|
|
4164
|
+
matchedRegexes: Array.from(matchedRegexPatterns)
|
|
4082
4165
|
};
|
|
4083
4166
|
}
|
|
4084
4167
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@fanboynz/network-scanner",
|
|
3
|
-
"version": "2.0.62",
|
|
3
|
+
"version": "2.0.64",
|
|
4
4
|
"description": "A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.",
|
|
5
5
|
"main": "nwss.js",
|
|
6
6
|
"scripts": {
|
|
@@ -11,8 +11,8 @@
|
|
|
11
11
|
},
|
|
12
12
|
"dependencies": {
|
|
13
13
|
"ghost-cursor": "^1.4.2",
|
|
14
|
-
"lru-cache": "^
|
|
15
|
-
"p-limit": "^
|
|
14
|
+
"lru-cache": "^11.3.5",
|
|
15
|
+
"p-limit": "^7.3.0",
|
|
16
16
|
"psl": "^1.15.0",
|
|
17
17
|
"puppeteer": ">=20.0.0"
|
|
18
18
|
},
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
"author": "FanboyNZ",
|
|
37
37
|
"license": "GPL-3.0",
|
|
38
38
|
"engines": {
|
|
39
|
-
"node": ">=
|
|
39
|
+
"node": ">=22.0.0"
|
|
40
40
|
},
|
|
41
41
|
"repository": {
|
|
42
42
|
"type": "git",
|
|
@@ -50,10 +50,11 @@
|
|
|
50
50
|
},
|
|
51
51
|
"homepage": "https://github.com/ryanbr/network-scanner",
|
|
52
52
|
"optionalDependencies": {
|
|
53
|
+
"adblock-rs": "^0.12.3",
|
|
53
54
|
"puppeteer-core": ">=20.0.0"
|
|
54
55
|
},
|
|
55
56
|
"devDependencies": {
|
|
56
57
|
"eslint": "^10.0.2",
|
|
57
|
-
"globals": "^
|
|
58
|
+
"globals": "^17.6.0"
|
|
58
59
|
}
|
|
59
60
|
}
|