@fanboynz/network-scanner 2.0.62 → 2.0.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CLAUDE.md CHANGED
@@ -27,7 +27,7 @@ Puppeteer-based network scanner for analyzing web traffic, generating adblock fi
27
27
 
28
28
  ## Tech Stack
29
29
 
30
- - **Node.js** >=20.0.0
30
+ - **Node.js** >=22.0.0
31
31
  - **puppeteer** >=20.0.0 — Headless browser automation
32
32
  - **psl** — Public Suffix List for domain parsing
33
33
  - **lru-cache** — LRU cache implementation
package/README.md CHANGED
@@ -5,7 +5,7 @@ A Puppeteer-based tool for scanning websites to find third-party (or optionally
5
5
  - Scan websites and detect matching third-party or first-party resources
6
6
  - Output Adblock-formatted blocking rules
7
7
  - Support for multiple filters per site
8
- - Grouped titles (! <url>) before site matches
8
+ - Grouped titles (! <url>) before site matches, including redirect source and matching regex
9
9
  - Ignore unwanted domains (global and per-site)
10
10
  - Block unwanted domains during scan (simulate adblock)
11
11
  - Support Chrome, Firefox, Safari user agents (desktop or mobile)
@@ -64,6 +64,7 @@ A Puppeteer-based tool for scanning websites to find third-party (or optionally
64
64
  | `--headful` | Launch browser with GUI (not headless) |
65
65
  | `--keep-open` | Keep browser and tabs open after scan completes (use with `--headful` for debugging) |
66
66
  | `--use-puppeteer-core` | Use `puppeteer-core` with system Chrome instead of bundled Chromium |
67
+ | `--use-obscura` | Connect to running Obscura CDP server (`ws://127.0.0.1:9222` or `OBSCURA_WS` env). Skips fingerprint injection — Obscura provides built-in stealth |
67
68
  | `--load-extension <path>` | Load unpacked Chrome extension from directory (can be used multiple times) |
68
69
  | `--dns-cache` | Persist dig/whois results to disk between runs (14hr TTL, `.digcache`/`.whoiscache`) |
69
70
  | `--block-ads=<files>` | Block ads using EasyList format rules (comma-separated: `easylist.txt,easyprivacy.txt`) |
@@ -448,7 +449,7 @@ node nwss.js config-clean2.json --debug # .nwssconfig + debug overri
448
449
  node nwss.js config-other.json --max-concurrent 5 # no match in .nwssconfig, uses CLI flags
449
450
  ```
450
451
 
451
- **Supported settings:** `output`, `max_concurrent`, `dns_cache`, `cache_requests`, `dumpurls`, `remove_tempfiles`, `color`, `remove_dupes`, `compress_logs`, `debug`, `silent`, `verbose`, `headful`, `keep_open`, `dry_run`, `titles`, `sub_domains`, `no_interact`, `ghost_cursor`, `plain`, `cdp`, `dnsmasq`, `unbound`, `privoxy`, `pihole`, `eval_on_doc`, `use_puppeteer_core`, `ignore_cache`, `clear_cache`, `block_ads`, `compare`, `localhost`, `append`.
452
+ **Supported settings:** `output`, `max_concurrent`, `dns_cache`, `cache_requests`, `dumpurls`, `remove_tempfiles`, `color`, `remove_dupes`, `compress_logs`, `debug`, `silent`, `verbose`, `headful`, `keep_open`, `dry_run`, `titles`, `sub_domains`, `no_interact`, `ghost_cursor`, `plain`, `cdp`, `dnsmasq`, `unbound`, `privoxy`, `pihole`, `eval_on_doc`, `use_puppeteer_core`, `use_obscura`, `ignore_cache`, `clear_cache`, `block_ads`, `compare`, `localhost`, `append`.
452
453
 
453
454
  **Priority:** CLI flags > `.nwssconfig` > hardcoded defaults.
454
455
 
@@ -461,6 +462,7 @@ These options go at the root level of your config.json:
461
462
  | Field | Values | Default | Description |
462
463
  |:---------------------|:-------|:-------:|:------------|
463
464
  | `ignoreDomains` | Array | - | Domains to completely ignore (supports wildcards like `*.ads.com`) |
465
+ | `ignoreDomainsByUrl` | Array | - | Regex patterns; if a request URL matches, the request's root domain is dynamically ignored for the rest of the scan (e.g. `["\\/jwplayer\\/", "\\/build\\/assets\\/"]`) |
464
466
  | `blocked` | Array | - | Global regex patterns to block requests (combined with per-site blocked) |
465
467
  | `whois_server_mode` | String | `"random"` | Default server selection mode for all sites |
466
468
  | `ignore_similar` | Boolean | `true` | Ignore domains similar to already found domains |
package/config.json CHANGED
@@ -38,7 +38,7 @@
38
38
  "sites": [
39
39
  {
40
40
  "url": "https://www.anandtech.com/",
41
- "filterRegex": ".",
41
+ "filterRegex": "teststring",
42
42
  "resourceTypes": ["script", "xhr", "document"],
43
43
  "reload": 1,
44
44
  "timeout": 25000,
@@ -50,7 +50,7 @@
50
50
  },
51
51
  {
52
52
  "url": "https://www.tomshardware.com/",
53
- "filterRegex": ".",
53
+ "filterRegex": "anotherstrng",
54
54
  "resourceTypes": ["all"],
55
55
  "reload": 2,
56
56
  "timeout": 25000,
@@ -61,7 +61,7 @@
61
61
  },
62
62
  {
63
63
  "url": ["https://www.tomshardware.com/", "https://www.anandtech.com/"],
64
- "filterRegex": ".",
64
+ "filterRegex": "morestrings",
65
65
  "resourceTypes": ["all"],
66
66
  "reload": 2,
67
67
  "timeout": 25000,
@@ -0,0 +1,368 @@
1
+ // === Adblock Rust Engine Wrapper (adblock-rust.js) ===
2
+ // Drop-in replacement for ./lib/adblock that delegates matching to Brave's
3
+ // adblock-rust engine (npm: adblock-rs) for higher throughput on large lists.
4
+ //
5
+ // Exposes the same parseAdblockRules(filePath, options) factory and the same
6
+ // matcher shape ({ shouldBlock, getStats, rules }) so nwss.js can switch
7
+ // engines with a single require() swap.
8
+
9
+ const fs = require('fs');
10
+ const path = require('path');
11
+ const os = require('os');
12
+ const crypto = require('crypto');
13
+
14
// Lazily loaded `adblock-rs` module and the version string from its
// package.json. loadAdblockRust() sets both together; when loading fails both
// stay null so a later call retries instead of running with a half-set state.
let adblockRust = null;
let adblockRustVersion = null;

/**
 * Load the optional `adblock-rs` dependency on first use and memoize it.
 *
 * The package version is read at the same time because parseAdblockRules()
 * mixes it into the disk-cache key. The module and the version are committed
 * to the module-level variables only after BOTH requires succeed; otherwise a
 * failed version lookup would leave the module memoized with a null version
 * and every later call would silently skip the version read.
 *
 * @returns {object} The `adblock-rs` module.
 * @throws {Error} When either require fails. The message keeps the install
 *   hint and the original error message; the original error is attached as
 *   `cause`.
 */
function loadAdblockRust() {
  if (adblockRust) return adblockRust;

  let loadedModule;
  let loadedVersion;
  try {
    loadedModule = require('adblock-rs');
    loadedVersion = require('adblock-rs/package.json').version;
  } catch (err) {
    throw new Error(
      "adblock-rs is not installed. Install with: npm install adblock-rs " +
      "(requires Rust toolchain for native build). Original error: " + err.message,
      { cause: err }
    );
  }

  // Commit atomically now that nothing else can throw.
  adblockRust = loadedModule;
  adblockRustVersion = loadedVersion;
  return adblockRust;
}
33
+
34
/**
 * Best-effort cleanup of stale serialized-engine files in the cache directory.
 *
 * Only files whose names match what this module writes are considered:
 *   - `<64 lowercase hex chars>.bin`            (a finished cache entry)
 *   - `<64 lowercase hex chars>.bin.<pid>.tmp`  (a write that never got renamed)
 * Anything else is left alone, so pointing `cacheDir` at a shared directory
 * can never delete unrelated `.bin` / `.tmp` files.
 *
 * Every filesystem error is swallowed on purpose: a failed cleanup must never
 * block a scan.
 *
 * @param {string} cacheDir - Directory holding the cache files.
 * @param {number} maxAgeMs - Files whose mtime is older than now - maxAgeMs
 *   are deleted.
 * @returns {void}
 */
function pruneOldCacheFiles(cacheDir, maxAgeMs) {
  // Mirrors the naming used when the cache is written: sha256 hex digest
  // + '.bin', optionally followed by '.<pid>.tmp'.
  const ownCacheFile = /^[0-9a-f]{64}\.bin(?:\.\d+\.tmp)?$/;

  let names;
  try {
    names = fs.readdirSync(cacheDir);
  } catch (_) {
    // Directory doesn't exist or is unreadable: nothing to prune.
    return;
  }

  const cutoff = Date.now() - maxAgeMs;
  for (const name of names) {
    if (!ownCacheFile.test(name)) continue;
    const full = path.join(cacheDir, name);
    try {
      if (fs.statSync(full).mtimeMs < cutoff) fs.unlinkSync(full);
    } catch (_) {
      // File vanished mid-walk or cannot be removed: skip it.
    }
  }
}
53
+
54
// Translation table: resource type names as reported by Puppeteer/CDP (keys)
// to the request type names handed to adblock-rust (values). The table is a
// null-prototype object, so a lookup such as 'toString' or 'constructor' can
// never resolve to an inherited property.
const RESOURCE_TYPE_MAP = Object.create(null);
for (const [sourceType, engineType] of [
  ['document', 'main_frame'],
  ['subdocument', 'sub_frame'],
  ['stylesheet', 'stylesheet'],
  ['script', 'script'],
  ['image', 'image'],
  ['font', 'font'],
  ['media', 'media'],
  ['texttrack', 'media'],
  ['xhr', 'xmlhttprequest'],
  ['fetch', 'xmlhttprequest'],
  ['xmlhttprequest', 'xmlhttprequest'],
  ['eventsource', 'other'],
  ['websocket', 'websocket'],
  ['manifest', 'other'],
  ['signedexchange', 'other'],
  ['ping', 'ping'],
  ['cspviolationreport', 'other'],
  ['preflight', 'other'],
  ['other', 'other'],
  ['', '']
]) {
  RESOURCE_TYPE_MAP[sourceType] = engineType;
}

/**
 * Translate a Puppeteer/CDP resource type into an adblock-rust request type.
 *
 * @param {string} [type] - Resource type name; may be empty or missing.
 * @returns {string} '' when `type` is falsy, the mapped name when the type is
 *   known, and 'other' for anything else.
 */
function normalizeResourceType(type) {
  if (type) {
    const mapped = RESOURCE_TYPE_MAP[type];
    return mapped ? mapped : 'other';
  }
  return '';
}
84
+
85
/**
 * Small bounded cache with FIFO eviction.
 *
 * Despite the class name, eviction follows insertion order, not access order:
 * `get()` never promotes an entry. To get true LRU behavior, promote on hit by
 * re-inserting (delete + set).
 *
 * Two edge cases are handled explicitly:
 *   - Overwriting a key that is already cached updates it in place and evicts
 *     nothing (it keeps its original insertion slot).
 *   - A `maxSize` that is not greater than zero (0, negative, NaN, undefined)
 *     disables storage entirely, so the cache can neither grow without bound
 *     nor hold a stray single entry.
 *
 * `cache` and `maxSize` are read by getStats() and stay public.
 */
class ResultLRU {
  /**
   * @param {number} maxSize - Maximum number of entries to retain.
   */
  constructor(maxSize) {
    this.cache = new Map();
    this.maxSize = maxSize;
  }

  /**
   * @param {*} k - Cache key.
   * @returns {*} The cached value, or undefined when absent.
   */
  get(k) { return this.cache.get(k); }

  /**
   * Store a value, evicting the oldest inserted entry when a NEW key would
   * push the cache past `maxSize`.
   *
   * @param {*} k - Cache key.
   * @param {*} v - Value to store.
   * @returns {void}
   */
  set(k, v) {
    const cache = this.cache;
    if (cache.has(k)) {
      // In-place update: size does not change, so nothing needs evicting.
      cache.set(k, v);
      return;
    }
    // Negated comparison so NaN/undefined are treated as "disabled" too.
    if (!(this.maxSize > 0)) return;
    if (cache.size >= this.maxSize) {
      // Map iterates in insertion order, so the first key is the oldest.
      cache.delete(cache.keys().next().value);
    }
    cache.set(k, v);
  }
}
104
+
105
/**
 * Build a request-blocking matcher backed by Brave's adblock-rs engine.
 *
 * @param {string|string[]} filePathOrArray - One filter list path, or an array
 *   of paths to load in order. Order is significant: it affects the cache key.
 * @param {object} [options]
 * @param {boolean} [options.enableLogging=false] - Print parse + cache events.
 * @param {number} [options.resultCacheSize=32000] - Max entries in the
 *   per-matcher result cache (FIFO eviction).
 * @param {boolean} [options.useDiskCache=true] - Persist the compiled engine
 *   to disk and reload on next run with the same input lists + library version.
 * @param {string} [options.cacheDir] - Directory for compiled-engine cache
 *   files. Defaults to a folder under the OS temp dir.
 * @param {number} [options.cacheTtlMs=2592000000] - Files in cacheDir older
 *   than this are pruned during cold parse. Default 30 days.
 * @returns {{shouldBlock: Function, getStats: Function, rules: object}}
 * @throws {Error} When adblock-rs cannot be loaded, or when a list file cannot
 *   be read (the underlying fs error is attached as `cause`).
 */
function parseAdblockRules(filePathOrArray, options = {}) {
  const {
    enableLogging = false,
    resultCacheSize = 32000,
    useDiskCache = true,
    cacheDir = path.join(os.tmpdir(), 'nwss-adblock-rs-cache'),
    cacheTtlMs = 30 * 24 * 60 * 60 * 1000
  } = options;
  // Must run before the hash below: it also populates adblockRustVersion.
  const rust = loadAdblockRust();

  // Accept a single path or an array of paths.
  const filePaths = Array.isArray(filePathOrArray) ? filePathOrArray : [filePathOrArray];

  // Read all files up front and hash the raw bytes, so the disk cache key
  // reflects content changes and list-order changes. The adblock-rs version is
  // mixed in so an upgraded library never picks up a blob written by another
  // version.
  const buffers = [];
  const hash = crypto.createHash('sha256');
  hash.update('adblock-rs:' + adblockRustVersion + '\0');
  let totalBytes = 0;
  for (const fp of filePaths) {
    let buf;
    try {
      buf = fs.readFileSync(fp);
    } catch (err) {
      // Only ENOENT really means "not found"; permission errors, directories,
      // invalid paths etc. get an accurate message. The original error is kept
      // as `cause` either way.
      const message = err && err.code === 'ENOENT'
        ? `Adblock rules file not found: ${fp}`
        : `Adblock rules file could not be read: ${fp} (${err.message})`;
      throw new Error(message, { cause: err });
    }
    buffers.push(buf);
    hash.update(buf);
    hash.update('\0');
    totalBytes += buf.length;
  }
  const cacheKey = hash.digest('hex');
  const cachePath = path.join(cacheDir, cacheKey + '.bin');

  let engine = null;
  let ruleCount = 0;
  let cacheHit = false;

  // Fast path: deserialize a previously compiled engine if one exists. The
  // read is attempted directly (no existsSync first); ENOENT is a normal cache
  // miss and is not logged.
  if (useDiskCache) {
    let compiled;
    try {
      compiled = fs.readFileSync(cachePath);
    } catch (err) {
      if (err.code !== 'ENOENT' && enableLogging) {
        console.log(`[Adblock-Rust] Cache read failed (${err.message}); reparsing`);
      }
    }
    if (compiled) {
      try {
        engine = new rust.Engine(new rust.FilterSet(enableLogging), true);
        // Hand over the Buffer's ArrayBuffer directly when the Buffer spans
        // all of it; otherwise slice out exactly this Buffer's byte range so
        // no unrelated bytes from a shared backing store are passed along.
        const ab = (compiled.byteOffset === 0 &&
                    compiled.byteLength === compiled.buffer.byteLength)
          ? compiled.buffer
          : compiled.buffer.slice(
              compiled.byteOffset,
              compiled.byteOffset + compiled.byteLength
            );
        engine.deserialize(ab);
        cacheHit = true;
      } catch (err) {
        // Unusable cache blob: fall through to a fresh parse.
        engine = null;
        if (enableLogging) {
          console.log(`[Adblock-Rust] Cache deserialize failed (${err.message}); reparsing`);
        }
      }
    }
  }

  if (!engine) {
    // Slow path: parse every list, one addFilters() call per file. Each
    // buffer reference is dropped as soon as it has been decoded so the raw
    // file bytes become collectable before the loop finishes.
    const filterSet = new rust.FilterSet(enableLogging);
    for (let i = 0; i < buffers.length; i++) {
      const buf = buffers[i];
      buffers[i] = null;
      // Split on LF or CRLF. With a plain '\n' split, blank lines of a
      // CRLF-encoded list are "\r" (length 1) and would be counted as rules.
      const lines = buf.toString('utf-8').split(/\r?\n/);
      for (let j = 0; j < lines.length; j++) {
        const line = lines[j];
        if (line.length === 0) continue;
        if (line.charCodeAt(0) === 0x21) continue; // '!' starts a comment line
        ruleCount++;
      }
      filterSet.addFilters(lines);
    }
    engine = new rust.Engine(filterSet, true);

    if (useDiskCache) {
      // Non-null only while a temp file may exist that has not been renamed.
      let tmpPath = null;
      try {
        fs.mkdirSync(cacheDir, { recursive: true });
        const serialized = engine.serialize();
        // Write to a per-pid temp path, then rename into place, so cachePath
        // is either complete or absent and never half-written.
        tmpPath = cachePath + '.' + process.pid + '.tmp';
        fs.writeFileSync(tmpPath, Buffer.from(serialized));
        fs.renameSync(tmpPath, cachePath);
        tmpPath = null;
        // Prune after our own write so the fresh entry is never a candidate.
        pruneOldCacheFiles(cacheDir, cacheTtlMs);
      } catch (err) {
        // Don't leave a partial temp file behind when the write or rename
        // failed; removal itself is best-effort.
        if (tmpPath) {
          try { fs.unlinkSync(tmpPath); } catch (_) { /* nothing to remove */ }
        }
        if (enableLogging) {
          console.log(`[Adblock-Rust] Cache write failed (${err.message}); continuing`);
        }
      }
    }
  }

  const stats = {
    // When the engine came from the disk cache the lists were never parsed, so
    // no rule count exists; `bytes` still describes the input size.
    total: cacheHit ? null : ruleCount,
    bytes: totalBytes,
    engine: 'adblock-rust',
    fromDiskCache: cacheHit,
    listCount: filePaths.length,
    blocked: 0,
    allowed: 0,
    exceptions: 0,
    errors: 0,
    cacheHits: 0,
    cacheMisses: 0
  };

  const resultCache = new ResultLRU(resultCacheSize);
  // Shared frozen result for the no-match case, so those checks allocate
  // nothing. Callers must treat returned objects as read-only.
  const NO_MATCH = Object.freeze({ blocked: false, rule: null, reason: 'no_match' });
  // Bound once so shouldBlock() can call it as a plain function.
  const engineCheck = engine.check.bind(engine);

  if (enableLogging) {
    if (cacheHit) {
      console.log(`[Adblock-Rust] Restored compiled engine from ${cachePath} (${(totalBytes/1024/1024).toFixed(2)}MB source, ${filePaths.length} list${filePaths.length>1?'s':''})`);
    } else {
      console.log(`[Adblock-Rust] Compiled ${ruleCount} rules from ${filePaths.length} list${filePaths.length>1?'s':''} (${(totalBytes/1024/1024).toFixed(2)}MB)`);
    }
  }

  return {
    rules: { stats },

    /**
     * Decide whether a request should be blocked.
     *
     * @param {string} url - Request URL.
     * @param {string} [sourceUrl] - URL of the page that issued the request.
     * @param {string} [resourceType] - Puppeteer/CDP resource type name.
     * @returns {{blocked: boolean, rule: (string|null), reason: string}}
     *   `reason` is one of 'no_match', 'whitelisted', 'important_rule',
     *   'adblock_rust' or 'error'.
     */
    shouldBlock(url, sourceUrl, resourceType) {
      const src = sourceUrl || '';
      const rt = resourceType || '';
      // Unknown types fall back to 'other'; a missing type stays ''.
      const normType = rt ? (RESOURCE_TYPE_MAP[rt] || 'other') : '';
      const key = url + '\0' + src + '\0' + normType;
      const cached = resultCache.get(key);
      if (cached !== undefined) {
        stats.cacheHits++;
        return cached;
      }
      stats.cacheMisses++;

      let result;
      try {
        // An unknown source is passed as '' rather than as the request URL,
        // so the request is not presented to the engine as first-party to
        // itself.
        // The 4th argument requests the detailed result object
        // ({matched, exception, filter, important}) that is read below.
        result = engineCheck(url, src, normType, true);
      } catch (err) {
        stats.errors++;
        if (enableLogging) {
          console.log(`[Adblock-Rust] Error checking ${url}: ${err.message}`);
        }
        // Errors are not cached: the next call for this key retries.
        return { blocked: false, rule: null, reason: 'error' };
      }

      let r;
      if (result.matched) {
        const exception = result.exception;
        if (exception) {
          stats.exceptions++;
          r = { blocked: false, rule: exception, reason: 'whitelisted' };
        } else {
          stats.blocked++;
          r = {
            blocked: true,
            rule: result.filter || null,
            reason: result.important ? 'important_rule' : 'adblock_rust'
          };
        }
      } else {
        stats.allowed++;
        r = NO_MATCH;
      }

      resultCache.set(key, r);
      return r;
    },

    /**
     * Snapshot of the counters plus result-cache statistics.
     *
     * @returns {object} A copy of the stats with an added `cache` object
     *   ({hits, misses, hitRate, size, maxSize}).
     */
    getStats() {
      const total = stats.cacheHits + stats.cacheMisses;
      const hitRate = total > 0 ? ((stats.cacheHits / total) * 100).toFixed(1) + '%' : '0%';
      return {
        ...stats,
        cache: {
          hits: stats.cacheHits,
          misses: stats.cacheMisses,
          hitRate,
          size: resultCache.cache.size,
          maxSize: resultCache.maxSize
        }
      };
    }
  };
}
365
+
366
+ module.exports = {
367
+ parseAdblockRules
368
+ };
package/lib/output.js CHANGED
@@ -324,35 +324,40 @@ function buildOutputLines(results, options = {}) {
324
324
  const { showTitles = false, removeDupes = false, ignoreDomains = [], forLogFile = false } = options;
325
325
 
326
326
  // Consolidate rules from all results, handling multiple results for same URL
327
- const consolidatedRules = new Map(); // URL -> Set of rules
327
+ const consolidatedRules = new Map(); // URL -> { rules: Set, originalUrl, regexes: Set }
328
328
  let successfulPageLoads = 0;
329
-
329
+
330
330
  results.forEach(result => {
331
331
  if (result) {
332
332
  if (result.success) {
333
333
  successfulPageLoads++;
334
334
  }
335
335
  if (result.rules && result.rules.length > 0) {
336
- // Consolidate rules by URL to handle multiple site entries for same URL
337
336
  if (!consolidatedRules.has(result.url)) {
338
- consolidatedRules.set(result.url, new Set());
337
+ consolidatedRules.set(result.url, { rules: new Set(), originalUrl: result.originalUrl || result.url, regexes: new Set() });
338
+ }
339
+ const entry = consolidatedRules.get(result.url);
340
+ result.rules.forEach(rule => entry.rules.add(rule));
341
+ if (Array.isArray(result.matchedRegexes)) {
342
+ result.matchedRegexes.forEach(rx => entry.regexes.add(rx));
343
+ }
344
+ // Prefer the original URL from any result entry that has one different from final
345
+ if (result.originalUrl && result.originalUrl !== result.url) {
346
+ entry.originalUrl = result.originalUrl;
339
347
  }
340
-
341
- // Add all rules from this result to the consolidated set
342
- result.rules.forEach(rule => {
343
- consolidatedRules.get(result.url).add(rule);
344
- });
345
348
  }
346
349
  }
347
350
  });
348
351
 
349
352
  // Convert consolidated rules back to array format
350
353
  const finalSiteRules = [];
351
- consolidatedRules.forEach((rulesSet, url) => {
352
- if (rulesSet.size > 0) {
353
- finalSiteRules.push({
354
- url: url,
355
- rules: Array.from(rulesSet)
354
+ consolidatedRules.forEach((entry, url) => {
355
+ if (entry.rules.size > 0) {
356
+ finalSiteRules.push({
357
+ url: url,
358
+ originalUrl: entry.originalUrl,
359
+ regexes: Array.from(entry.regexes),
360
+ rules: Array.from(entry.rules)
356
361
  });
357
362
  }
358
363
  });
@@ -362,35 +367,41 @@ function buildOutputLines(results, options = {}) {
362
367
  const outputLinesWithTitles = [];
363
368
  let filteredOutCount = 0;
364
369
 
365
- for (const { url, rules } of finalSiteRules) {
370
+ for (const { url, originalUrl, regexes, rules } of finalSiteRules) {
366
371
  if (rules.length > 0) {
372
+ // Build title comments — include redirect source if URL changed and matched regex(es)
373
+ const titleLines = [`! ${url}`];
374
+ if (originalUrl && originalUrl !== url) {
375
+ titleLines.push(`! Redirected from: ${originalUrl}`);
376
+ }
377
+ if (regexes && regexes.length > 0) {
378
+ titleLines.push(`! Regex: ${regexes.join(', ')}`);
379
+ }
380
+
367
381
  // Regular output (for -o files and console) - only add titles if --titles flag used
368
382
  if (showTitles) {
369
- outputLines.push(`! ${url}`);
383
+ outputLines.push(...titleLines);
370
384
  }
371
-
385
+
372
386
  // Filter out ignored domains from rules
373
387
  const filteredRules = rules.filter(rule => {
374
388
  const domain = extractDomainFromRule(rule);
375
389
  if (domain && matchesIgnoreDomain(domain, ignoreDomains)) {
376
390
  filteredOutCount++;
377
-
378
- // Log each filtered domain
379
391
  if (options.forceDebug) {
380
392
  console.log(formatLogMessage('debug', `[output-filter] Removed rule matching ignoreDomains: ${rule} (domain: ${domain})`));
381
393
  } else if (!options.silentMode) {
382
394
  console.log(formatLogMessage('info', `Filtered out: ${domain}`));
383
- }
384
-
395
+ }
385
396
  return false;
386
397
  }
387
398
  return true;
388
399
  });
389
-
400
+
390
401
  outputLines.push(...filteredRules);
391
-
402
+
392
403
  // Output with titles (for auto-saved log files) - always add titles
393
- outputLinesWithTitles.push(`! ${url}`);
404
+ outputLinesWithTitles.push(...titleLines);
394
405
  outputLinesWithTitles.push(...filteredRules);
395
406
  }
396
407
  }
package/nwss.js CHANGED
@@ -58,7 +58,8 @@ const { clearSiteData } = require('./lib/clear_sitedata');
58
58
  // Referrer header generation
59
59
  const { getReferrerForUrl, validateReferrerConfig, validateReferrerDisable } = require('./lib/referrer');
60
60
  // Adblock rules parser
61
- const { parseAdblockRules } = require('./lib/adblock');
61
+ const adblockJs = require('./lib/adblock');
62
+ const adblockRust = require('./lib/adblock-rust');
62
63
  // WireGuard VPN
63
64
  const { connectForSite: wgConnect, disconnectForSite: wgDisconnect, disconnectAll: wgDisconnectAll, validateVpnConfig, normalizeVpnConfig } = require('./lib/wireguard_vpn');
64
65
  // OpenVPN
@@ -185,9 +186,19 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
185
186
  const nwssConfig = JSON.parse(fs.readFileSync(NWSSCONFIG_PATH, 'utf-8'));
186
187
  // Find which config file is being used (--custom-json <file> or positional .json arg)
187
188
  const customJsonIdx = args.findIndex(arg => arg === '--custom-json');
189
+ const positionalJson = (customJsonIdx === -1)
190
+ ? args.find(a => a.endsWith('.json') && !a.startsWith('--'))
191
+ : null;
188
192
  const configFilename = (customJsonIdx !== -1 && args[customJsonIdx + 1])
189
193
  ? args[customJsonIdx + 1]
190
- : args.find(a => a.endsWith('.json') && !a.startsWith('--'));
194
+ : positionalJson;
195
+
196
+ // If a positional .json was used (not --custom-json), wire it to --custom-json
197
+ // so the real config loader picks it up instead of defaulting to config.json
198
+ if (positionalJson && customJsonIdx === -1) {
199
+ args.push('--custom-json', positionalJson);
200
+ process.argv.push('--custom-json', positionalJson);
201
+ }
191
202
 
192
203
  if (configFilename && nwssConfig.configs && nwssConfig.configs[configFilename]) {
193
204
  const settings = nwssConfig.configs[configFilename];
@@ -584,6 +595,22 @@ if (validateRules || validateRulesFile) {
584
595
  }
585
596
  }
586
597
 
598
+ // Parse --adblock-engine=<js|rust> (default: js). Selects the matcher backend
599
+ // used by --block-ads. The rust engine requires the optional adblock-rs package.
600
+ const adblockEngineIndex = args.findIndex(arg => arg.startsWith('--adblock-engine'));
601
+ let adblockEngineName = 'js';
602
+ if (adblockEngineIndex !== -1) {
603
+ const engineArg = args[adblockEngineIndex].includes('=')
604
+ ? args[adblockEngineIndex].split('=')[1]
605
+ : args[adblockEngineIndex + 1];
606
+ if (engineArg === 'rust' || engineArg === 'js') {
607
+ adblockEngineName = engineArg;
608
+ } else {
609
+ console.log(`Error: --adblock-engine must be 'js' or 'rust' (got: ${engineArg})`);
610
+ process.exit(1);
611
+ }
612
+ }
613
+
587
614
  // Parse --block-ads argument for request-level ad blocking (supports comma-separated lists)
588
615
  const blockAdsIndex = args.findIndex(arg => arg.startsWith('--block-ads'));
589
616
  if (blockAdsIndex !== -1) {
@@ -604,18 +631,31 @@ if (blockAdsIndex !== -1) {
604
631
  }
605
632
  }
606
633
 
607
- // Concatenate multiple lists into a single temp file for the parser
608
- let rulesFile = rulesFiles[0];
609
- if (rulesFiles.length > 1) {
610
- rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
611
- const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
612
- fs.writeFileSync(rulesFile, combined);
613
- }
614
-
615
634
  adblockEnabled = true;
616
- adblockMatcher = parseAdblockRules(rulesFile, { enableLogging: forceDebug });
635
+ const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
636
+ try {
637
+ if (engine === adblockRust) {
638
+ // Rust wrapper accepts an array directly — no temp file needed.
639
+ adblockMatcher = engine.parseAdblockRules(rulesFiles, { enableLogging: forceDebug });
640
+ } else {
641
+ // JS engine takes a single path; concat to a temp file when multiple lists.
642
+ let rulesFile = rulesFiles[0];
643
+ if (rulesFiles.length > 1) {
644
+ rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
645
+ const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
646
+ fs.writeFileSync(rulesFile, combined);
647
+ }
648
+ adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
649
+ }
650
+ } catch (err) {
651
+ console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
652
+ process.exit(1);
653
+ }
617
654
  const stats = adblockMatcher.getStats();
618
- if (!silentMode) console.log(messageColors.success(`Adblock enabled: Loaded ${stats.total} blocking rules from ${rulesFiles.length} list${rulesFiles.length > 1 ? 's' : ''}`));
655
+ const ruleDesc = stats.total != null
656
+ ? `${stats.total} blocking rules`
657
+ : `compiled engine (cached)`;
658
+ if (!silentMode) console.log(messageColors.success(`Adblock enabled (${adblockEngineName}): Loaded ${ruleDesc} from ${rulesFiles.length} list${rulesFiles.length > 1 ? 's' : ''}`));
619
659
  }
620
660
 
621
661
  if (args.includes('--help') || args.includes('-h')) {
@@ -641,6 +681,9 @@ Output Format Options:
641
681
  Request Blocking:
642
682
  --block-ads=<file> Block ads/trackers using EasyList format rules (||domain.com^, /ads/*, etc)
643
683
  Works at request-level for maximum performance
684
+ Supports comma-separated lists: --block-ads=easylist.txt,easyprivacy.txt
685
+ --adblock-engine=<js|rust> Matcher backend for --block-ads (default: js)
686
+ 'rust' uses Brave's adblock-rs (faster on large lists; needs: npm i adblock-rs)
644
687
 
645
688
  Per-config settings file (.nwssconfig):
646
689
  Place a .nwssconfig file in the project root to define per-config settings.
@@ -687,6 +730,7 @@ Validation Options:
687
730
 
688
731
  Global config.json options:
689
732
  ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
733
+ ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
690
734
  blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
691
735
  whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
692
736
  ignore_similar: true/false Ignore domains similar to already found domains (default: true)
@@ -854,8 +898,9 @@ try {
854
898
  // Extract config values while ignoring 'comments' field at global and site levels
855
899
  const {
856
900
  sites = [],
857
- ignoreDomains = [],
858
- blocked: globalBlocked = [],
901
+ ignoreDomains = [],
902
+ ignoreDomainsByUrl = [],
903
+ blocked: globalBlocked = [],
859
904
  whois_delay = 3000,
860
905
  whois_server_mode = 'random',
861
906
  ignore_similar = true,
@@ -901,6 +946,15 @@ for (const pattern of ignoreDomains) {
901
946
  }
902
947
  }
903
948
 
949
+ // Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains
950
+ const _ignoreDomainsByUrlRegexes = Array.isArray(ignoreDomainsByUrl)
951
+ ? ignoreDomainsByUrl.map(p => {
952
+ try { return getCompiledRegex(p); } catch { return null; }
953
+ }).filter(r => r)
954
+ : [];
955
+ // Runtime Set of domains marked ignored by URL pattern matches — shared across all sites in this scan
956
+ const _dynamicallyIgnoredDomains = new Set();
957
+
904
958
  // Apply global configuration overrides with validation
905
959
  // Priority: Command line args > config.json > defaults
906
960
  const MAX_CONCURRENT_SITES = (() => {
@@ -1312,6 +1366,8 @@ function shouldBypassCacheForUrl(url, siteConfig) {
1312
1366
  // Cache compiled wildcard regexes to avoid recompilation on every request
1313
1367
  const _wildcardRegexCache = new Map();
1314
1368
  function matchesIgnoreDomain(domain, ignorePatterns) {
1369
+ // Dynamically ignored domains (from URL pattern matches via ignoreDomainsByUrl)
1370
+ if (_dynamicallyIgnoredDomains.has(domain)) return true;
1315
1371
  // Fast path: exact match or suffix match against Set (O(n) for parts, but no regex)
1316
1372
  if (_ignoreDomainsExact.size > 0) {
1317
1373
  if (_ignoreDomainsExact.has(domain)) return true;
@@ -1789,6 +1845,10 @@ function setupFrameHandling(page, forceDebug) {
1789
1845
  * @returns {Promise<object>} A promise that resolves to an object containing scan results.
1790
1846
  */
1791
1847
  async function processUrl(currentUrl, siteConfig, browserInstance) {
1848
+ // Preserve the original URL (before any redirect) for output display
1849
+ const originalRequestedUrl = currentUrl;
1850
+ // Track regex patterns that produced matches (for title comments in output)
1851
+ const matchedRegexPatterns = new Set();
1792
1852
  // V8 Optimization: Single destructuring to avoid multiple property lookups
1793
1853
  const {
1794
1854
  firstParty,
@@ -2553,6 +2613,11 @@ function setupFrameHandling(page, forceDebug) {
2553
2613
  const blockedRegexes = Array.isArray(siteConfig.blocked)
2554
2614
  ? siteConfig.blocked.map(pattern => getCompiledRegex(pattern))
2555
2615
  : [];
2616
+
2617
+ // Pre-build Set for O(1) resourceType lookups (fired per request)
2618
+ const allowedResourceTypesSet = Array.isArray(siteConfig.resourceTypes)
2619
+ ? new Set(siteConfig.resourceTypes)
2620
+ : null;
2556
2621
 
2557
2622
  // Combine site-specific with pre-compiled global blocked patterns
2558
2623
  const allBlockedRegexes = blockedRegexes.length > 0
@@ -2774,9 +2839,22 @@ function setupFrameHandling(page, forceDebug) {
2774
2839
  bufferedLogWrite(debugLogFile, logEntry);
2775
2840
  }
2776
2841
  const reqUrl = checkedUrl;
2777
-
2842
+
2778
2843
  const reqDomain = perSiteSubDomains ? fullSubdomain : checkedRootDomain;
2779
2844
 
2845
+ // ignoreDomainsByUrl — if any pattern matches this URL, mark the root domain as ignored for the rest of the scan
2846
+ if (_ignoreDomainsByUrlRegexes.length > 0 && checkedRootDomain && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
2847
+ for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
2848
+ if (_ignoreDomainsByUrlRegexes[i].test(reqUrl)) {
2849
+ _dynamicallyIgnoredDomains.add(checkedRootDomain);
2850
+ if (forceDebug) {
2851
+ console.log(formatLogMessage('debug', `[ignoreDomainsByUrl] ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
2852
+ }
2853
+ break;
2854
+ }
2855
+ }
2856
+ }
2857
+
2780
2858
  let blockedMatchIndex = -1;
2781
2859
  for (let i = 0; i < allBlockedRegexes.length; i++) {
2782
2860
  if (allBlockedRegexes[i].test(reqUrl)) {
@@ -2801,14 +2879,14 @@ function setupFrameHandling(page, forceDebug) {
2801
2879
  if (reqDomain && !matchesIgnoreDomain(reqDomain, ignoreDomains)) {
2802
2880
  for (const re of regexes) {
2803
2881
  if (re.test(reqUrl)) {
2882
+ const evenBlockedRegexPattern = re.source;
2804
2883
  const resourceType = request.resourceType();
2805
-
2884
+
2806
2885
  // Apply same filtering logic as unblocked requests
2807
- const allowedResourceTypes = siteConfig.resourceTypes;
2808
- if (!allowedResourceTypes || !Array.isArray(allowedResourceTypes) || allowedResourceTypes.includes(resourceType)) {
2886
+ if (!allowedResourceTypesSet || allowedResourceTypesSet.has(resourceType)) {
2809
2887
  if (dryRunMode) {
2810
2888
  addDryRunMatch(matchedDomains, {
2811
- regex: matchedRegexPattern,
2889
+ regex: evenBlockedRegexPattern,
2812
2890
  domain: reqDomain,
2813
2891
  resourceType: resourceType,
2814
2892
  fullUrl: reqUrl,
@@ -2818,10 +2896,11 @@ function setupFrameHandling(page, forceDebug) {
2818
2896
  } else {
2819
2897
  addMatchedDomain(reqDomain, resourceType, fullSubdomain);
2820
2898
  }
2821
-
2899
+ matchedRegexPatterns.add(evenBlockedRegexPattern);
2900
+
2822
2901
  if (siteConfig.verbose === 1) {
2823
2902
  const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
2824
- console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${matchedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
2903
+ console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${evenBlockedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
2825
2904
  }
2826
2905
  if (dumpUrls) {
2827
2906
  const timestamp = new Date().toISOString();
@@ -2889,11 +2968,10 @@ function setupFrameHandling(page, forceDebug) {
2889
2968
 
2890
2969
  // *** UNIVERSAL RESOURCE TYPE FILTER ***
2891
2970
  // Check resourceTypes filter FIRST, before ANY processing (nettools, searchstring, immediate matching)
2892
- const allowedResourceTypes = siteConfig.resourceTypes;
2893
- if (allowedResourceTypes && Array.isArray(allowedResourceTypes) && allowedResourceTypes.length > 0) {
2894
- if (!allowedResourceTypes.includes(resourceType)) {
2971
+ if (allowedResourceTypesSet && allowedResourceTypesSet.size > 0) {
2972
+ if (!allowedResourceTypesSet.has(resourceType)) {
2895
2973
  if (forceDebug) {
2896
- console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${allowedResourceTypes.join(', ')}]. Skipping ALL processing.`));
2974
+ console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${Array.from(allowedResourceTypesSet).join(', ')}]. Skipping ALL processing.`));
2897
2975
  }
2898
2976
  // Skip this URL entirely - doesn't match required resource types
2899
2977
  request.continue();
@@ -2981,6 +3059,7 @@ function setupFrameHandling(page, forceDebug) {
2981
3059
  } else {
2982
3060
  addMatchedDomain(reqDomain, resourceType);
2983
3061
  }
3062
+ if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
2984
3063
  if (siteConfig.verbose === 1) {
2985
3064
  const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
2986
3065
  console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${matchedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
@@ -4011,12 +4090,14 @@ function setupFrameHandling(page, forceDebug) {
4011
4090
  };
4012
4091
  const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
4013
4092
 
4014
- return {
4015
- url: currentUrl,
4016
- rules: formattedRules,
4093
+ return {
4094
+ url: currentUrl,
4095
+ originalUrl: originalRequestedUrl,
4096
+ rules: formattedRules,
4017
4097
  success: true,
4018
4098
  finalUrl: finalUrlAfterRedirect || currentUrl,
4019
- redirectDomains: redirectDomainsToExclude
4099
+ redirectDomains: redirectDomainsToExclude,
4100
+ matchedRegexes: Array.from(matchedRegexPatterns)
4020
4101
  };
4021
4102
  }
4022
4103
 
@@ -4072,13 +4153,15 @@ function setupFrameHandling(page, forceDebug) {
4072
4153
  };
4073
4154
  const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
4074
4155
  if (forceDebug) console.log(formatLogMessage('debug', `Saving ${formattedRules.length} rules despite page load failure`));
4075
- return {
4076
- url: currentUrl,
4077
- rules: formattedRules,
4078
- success: false,
4156
+ return {
4157
+ url: currentUrl,
4158
+ originalUrl: originalRequestedUrl,
4159
+ rules: formattedRules,
4160
+ success: false,
4079
4161
  hasMatches: true,
4080
4162
  finalUrl: finalUrlAfterRedirect || currentUrl,
4081
- redirectDomains: redirectDomainsToExclude
4163
+ redirectDomains: redirectDomainsToExclude,
4164
+ matchedRegexes: Array.from(matchedRegexPatterns)
4082
4165
  };
4083
4166
  }
4084
4167
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fanboynz/network-scanner",
3
- "version": "2.0.62",
3
+ "version": "2.0.64",
4
4
  "description": "A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.",
5
5
  "main": "nwss.js",
6
6
  "scripts": {
@@ -11,8 +11,8 @@
11
11
  },
12
12
  "dependencies": {
13
13
  "ghost-cursor": "^1.4.2",
14
- "lru-cache": "^10.4.3",
15
- "p-limit": "^4.0.0",
14
+ "lru-cache": "^11.3.5",
15
+ "p-limit": "^7.3.0",
16
16
  "psl": "^1.15.0",
17
17
  "puppeteer": ">=20.0.0"
18
18
  },
@@ -36,7 +36,7 @@
36
36
  "author": "FanboyNZ",
37
37
  "license": "GPL-3.0",
38
38
  "engines": {
39
- "node": ">=20.0.0"
39
+ "node": ">=22.0.0"
40
40
  },
41
41
  "repository": {
42
42
  "type": "git",
@@ -50,10 +50,11 @@
50
50
  },
51
51
  "homepage": "https://github.com/ryanbr/network-scanner",
52
52
  "optionalDependencies": {
53
+ "adblock-rs": "^0.12.3",
53
54
  "puppeteer-core": ">=20.0.0"
54
55
  },
55
56
  "devDependencies": {
56
57
  "eslint": "^10.0.2",
57
- "globals": "^16.3.0"
58
+ "globals": "^17.6.0"
58
59
  }
59
60
  }