npm - @fanboynz/network-scanner - Versions diffs - 2.0.62 → 2.0.64 - Mend

@fanboynz/network-scanner 2.0.62 → 2.0.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/CLAUDE.md CHANGED Viewed

@@ -27,7 +27,7 @@ Puppeteer-based network scanner for analyzing web traffic, generating adblock fi
 ## Tech Stack
-- **Node.js** >=20.0.0
+- **Node.js** >=22.0.0
 - **puppeteer** >=20.0.0 — Headless browser automation
 - **psl** — Public Suffix List for domain parsing
 - **lru-cache** — LRU cache implementation

package/README.md CHANGED Viewed

@@ -5,7 +5,7 @@ A Puppeteer-based tool for scanning websites to find third-party (or optionally
 - Scan websites and detect matching third-party or first-party resources
 - Output Adblock-formatted blocking rules
 - Support for multiple filters per site
-- Grouped titles (! <url>) before site matches
+- Grouped titles (! <url>) before site matches, including redirect source and matching regex
 - Ignore unwanted domains (global and per-site)
 - Block unwanted domains during scan (simulate adblock)
 - Support Chrome, Firefox, Safari user agents (desktop or mobile)
@@ -64,6 +64,7 @@ A Puppeteer-based tool for scanning websites to find third-party (or optionally
 | `--headful`                 | Launch browser with GUI (not headless) |
 | `--keep-open`               | Keep browser and tabs open after scan completes (use with `--headful` for debugging) |
 | `--use-puppeteer-core`      | Use `puppeteer-core` with system Chrome instead of bundled Chromium |
+| `--use-obscura`             | Connect to running Obscura CDP server (`ws://127.0.0.1:9222` or `OBSCURA_WS` env). Skips fingerprint injection — Obscura provides built-in stealth |
 | `--load-extension <path>`   | Load unpacked Chrome extension from directory (can be used multiple times) |
 | `--dns-cache`               | Persist dig/whois results to disk between runs (14hr TTL, `.digcache`/`.whoiscache`) |
 | `--block-ads=<files>`       | Block ads using EasyList format rules (comma-separated: `easylist.txt,easyprivacy.txt`) |
@@ -448,7 +449,7 @@ node nwss.js config-clean2.json --debug             # .nwssconfig + debug overri
 node nwss.js config-other.json --max-concurrent 5   # no match in .nwssconfig, uses CLI flags
 ```
-**Supported settings:** `output`, `max_concurrent`, `dns_cache`, `cache_requests`, `dumpurls`, `remove_tempfiles`, `color`, `remove_dupes`, `compress_logs`, `debug`, `silent`, `verbose`, `headful`, `keep_open`, `dry_run`, `titles`, `sub_domains`, `no_interact`, `ghost_cursor`, `plain`, `cdp`, `dnsmasq`, `unbound`, `privoxy`, `pihole`, `eval_on_doc`, `use_puppeteer_core`, `ignore_cache`, `clear_cache`, `block_ads`, `compare`, `localhost`, `append`.
+**Supported settings:** `output`, `max_concurrent`, `dns_cache`, `cache_requests`, `dumpurls`, `remove_tempfiles`, `color`, `remove_dupes`, `compress_logs`, `debug`, `silent`, `verbose`, `headful`, `keep_open`, `dry_run`, `titles`, `sub_domains`, `no_interact`, `ghost_cursor`, `plain`, `cdp`, `dnsmasq`, `unbound`, `privoxy`, `pihole`, `eval_on_doc`, `use_puppeteer_core`, `use_obscura`, `ignore_cache`, `clear_cache`, `block_ads`, `compare`, `localhost`, `append`.
 **Priority:** CLI flags > `.nwssconfig` > hardcoded defaults.
@@ -461,6 +462,7 @@ These options go at the root level of your config.json:
 | Field                | Values | Default | Description |
 |:---------------------|:-------|:-------:|:------------|
 | `ignoreDomains`      | Array | - | Domains to completely ignore (supports wildcards like `*.ads.com`) |
+| `ignoreDomainsByUrl` | Array | - | Regex patterns; if a request URL matches, the request's root domain is dynamically ignored for the rest of the scan (e.g. `["\\/jwplayer\\/", "\\/build\\/assets\\/"]`) |
 | `blocked`            | Array | - | Global regex patterns to block requests (combined with per-site blocked) |
 | `whois_server_mode`  | String | `"random"` | Default server selection mode for all sites |
 | `ignore_similar`     | Boolean | `true` | Ignore domains similar to already found domains |

package/config.json CHANGED Viewed

@@ -38,7 +38,7 @@
  "sites": [
     {
     "url": "https://www.anandtech.com/",
-    "filterRegex": ".",
+    "filterRegex": "teststring",
     "resourceTypes": ["script", "xhr", "document"],
     "reload": 1,
     "timeout": 25000,
@@ -50,7 +50,7 @@
   },
   {
     "url": "https://www.tomshardware.com/",
-    "filterRegex": ".",
+    "filterRegex": "anotherstrng",
     "resourceTypes": ["all"],
     "reload": 2,
     "timeout": 25000,
@@ -61,7 +61,7 @@
   },
   {
     "url": ["https://www.tomshardware.com/", "https://www.anandtech.com/"],
-    "filterRegex": ".",
+    "filterRegex": "morestrings",
     "resourceTypes": ["all"],
     "reload": 2,
     "timeout": 25000,

package/lib/adblock-rust.js ADDED Viewed

@@ -0,0 +1,368 @@
+// === Adblock Rust Engine Wrapper (adblock-rust.js) ===
+// Drop-in replacement for ./lib/adblock that delegates matching to Brave's
+// adblock-rust engine (npm: adblock-rs) for higher throughput on large lists.
+//
+// Exposes the same parseAdblockRules(filePath, options) factory and the same
+// matcher shape ({ shouldBlock, getStats, rules }) so nwss.js can switch
+// engines with a single require() swap.
+const fs = require('fs');
+const path = require('path');
+const os = require('os');
+const crypto = require('crypto');
+let adblockRust = null;
+let adblockRustVersion = null;
+/**
+ * Lazily load the optional `adblock-rs` native module and remember its version.
+ *
+ * The module and its version are published to the module-level variables
+ * together, and only after both have been read. A failure part-way through can
+ * therefore never leave `adblockRust` set while `adblockRustVersion` is still
+ * null — a state in which later calls would return early and the disk-cache
+ * key would be derived from the string 'adblock-rs:null'.
+ *
+ * @returns {object} The `adblock-rs` module exports.
+ * @throws {Error} If `adblock-rs` or its package.json cannot be required. The
+ *   underlying error is attached as `cause`.
+ */
+function loadAdblockRust() {
+  if (adblockRust) return adblockRust;
+  let loadedModule;
+  let loadedVersion;
+  try {
+    loadedModule = require('adblock-rs');
+    // Read once for the cache key — serialized engine format is not promised
+    // stable across versions, so partitioning cache files by version means
+    // upgrades cleanly invalidate without producing confusing deserialize
+    // failures on the warm path.
+    loadedVersion = require('adblock-rs/package.json').version;
+  } catch (err) {
+    throw new Error(
+      "adblock-rs is not installed. Install with: npm install adblock-rs " +
+      "(requires Rust toolchain for native build). Original error: " + err.message,
+      { cause: err }
+    );
+  }
+  // Commit both values at once so the early return above only ever fires when
+  // the version is known as well.
+  adblockRustVersion = loadedVersion;
+  adblockRust = loadedModule;
+  return adblockRust;
+}
+// Housekeeping for the compiled-engine cache directory. Deletes `.bin` and
+// `.tmp` entries whose mtime is older than `maxAgeMs`; `.tmp` covers stray
+// writes left behind by killed processes. Every failure is swallowed on
+// purpose — a cleanup problem must never block a scan.
+function pruneOldCacheFiles(cacheDir, maxAgeMs) {
+  // Only files with our own suffixes are touched, in case the dir is shared.
+  const isCacheArtifact = (entry) => entry.endsWith('.bin') || entry.endsWith('.tmp');
+  const removeIfStale = (target, threshold) => {
+    try {
+      const { mtimeMs } = fs.statSync(target);
+      if (mtimeMs < threshold) fs.unlinkSync(target);
+    } catch (_) { /* file vanished mid-walk — fine */ }
+  };
+  try {
+    const threshold = Date.now() - maxAgeMs;
+    fs.readdirSync(cacheDir)
+      .filter(isCacheArtifact)
+      .forEach((entry) => removeIfStale(path.join(cacheDir, entry), threshold));
+  } catch (_) { /* dir doesn't exist or unreadable — fine */ }
+}
+// Translation table from Puppeteer/CDP resource type names to the request
+// type strings passed to adblock-rust. The table lives on a null-prototype
+// object, so a lookup never resolves to an inherited Object.prototype member
+// (e.g. 'constructor') and skips the prototype chain on a per-request path.
+const RESOURCE_TYPE_MAP = Object.create(null);
+for (const [cdpType, engineType] of [
+  ['document', 'main_frame'],
+  ['subdocument', 'sub_frame'],
+  ['stylesheet', 'stylesheet'],
+  ['script', 'script'],
+  ['image', 'image'],
+  ['font', 'font'],
+  ['media', 'media'],
+  ['texttrack', 'media'],
+  ['xhr', 'xmlhttprequest'],
+  ['fetch', 'xmlhttprequest'],
+  ['xmlhttprequest', 'xmlhttprequest'],
+  ['eventsource', 'other'],
+  ['websocket', 'websocket'],
+  ['manifest', 'other'],
+  ['signedexchange', 'other'],
+  ['ping', 'ping'],
+  ['cspviolationreport', 'other'],
+  ['preflight', 'other'],
+  ['other', 'other'],
+  ['', '']
+]) {
+  RESOURCE_TYPE_MAP[cdpType] = engineType;
+}
+// Resolve a resource type name through the table above. A falsy input yields
+// '', and any name missing from the table yields 'other'.
+function normalizeResourceType(type) {
+  if (type) {
+    const mapped = RESOURCE_TYPE_MAP[type];
+    return mapped ? mapped : 'other';
+  }
+  return '';
+}
+// Small FIFO cache keyed on (url \0 sourceUrl \0 resourceType). Despite the
+// class name, eviction is insertion-order, not access-order — `get()` does not
+// promote. For this workload (per-page request bursts whose working set fits
+// in maxSize) FIFO and true LRU produce the same evictions, so the simpler
+// path wins. If cache effectiveness becomes a concern with larger working
+// sets, promote on hit by re-inserting (delete + set).
+class ResultLRU {
+  /**
+   * @param {number} maxSize - Maximum number of entries kept before the
+   *   oldest-inserted entry is evicted.
+   */
+  constructor(maxSize) {
+    this.cache = new Map();
+    this.maxSize = maxSize;
+  }
+  /**
+   * @param {string} k - Cache key.
+   * @returns {*} The stored value, or undefined when the key is absent.
+   */
+  get(k) { return this.cache.get(k); }
+  /**
+   * Store `v` under `k`, evicting the oldest-inserted entry when a new key
+   * would push the map past `maxSize`.
+   *
+   * @param {string} k - Cache key.
+   * @param {*} v - Value to store.
+   */
+  set(k, v) {
+    // Overwriting an existing key does not grow the map, so it must not cost
+    // an unrelated entry its slot.
+    if (!this.cache.has(k) && this.cache.size >= this.maxSize) {
+      // At capacity with nothing to evict means the capacity is zero (or
+      // negative): store nothing rather than exceeding the limit.
+      if (this.cache.size === 0) return;
+      this.cache.delete(this.cache.keys().next().value);
+    }
+    this.cache.set(k, v);
+  }
+}
+/**
+ * Build a request-blocking matcher backed by Brave's adblock-rs engine.
+ *
+ * @param {string|string[]} filePathOrArray - One filter list path, or an array
+ *   of paths to load in order. Order is significant: it affects rule
+ *   precedence and the cache key.
+ * @param {object} [options]
+ * @param {boolean} [options.enableLogging=false] - Print parse + cache events.
+ * @param {number} [options.resultCacheSize=32000] - Max entries in the
+ *   per-matcher result cache (FIFO eviction).
+ * @param {boolean} [options.useDiskCache=true] - Persist the compiled engine
+ *   to disk and reload on next run with the same input lists + library version.
+ * @param {string} [options.cacheDir] - Directory for compiled-engine cache
+ *   files. Defaults to a folder under the OS temp dir.
+ * @param {number} [options.cacheTtlMs=2592000000] - Files in cacheDir older
+ *   than this are pruned during cold parse. Default 30 days.
+ * @returns {{shouldBlock: Function, getStats: Function, rules: object}}
+ * @throws {Error} If adblock-rs cannot be loaded, or if any list file cannot
+ *   be read (the underlying fs error is attached as `cause`).
+ */
+function parseAdblockRules(filePathOrArray, options = {}) {
+  const {
+    enableLogging = false,
+    resultCacheSize = 32000,
+    useDiskCache = true,
+    cacheDir = path.join(os.tmpdir(), 'nwss-adblock-rs-cache'),
+    cacheTtlMs = 30 * 24 * 60 * 60 * 1000
+  } = options;
+  const rust = loadAdblockRust();
+  // Accept a single path or an array of paths — caller no longer needs to
+  // materialize a temp concatenation file for multi-list scans.
+  const filePaths = Array.isArray(filePathOrArray) ? filePathOrArray : [filePathOrArray];
+  // Read all files up front; hash the raw bytes so the disk cache key reflects
+  // both content changes and list-order changes. Mix in the adblock-rs version
+  // so a library upgrade (which may change the serialized format) doesn't try
+  // to deserialize an incompatible blob.
+  const buffers = [];
+  const hash = crypto.createHash('sha256');
+  hash.update('adblock-rs:' + adblockRustVersion + '\0');
+  let totalBytes = 0;
+  for (const fp of filePaths) {
+    let buf;
+    try {
+      buf = fs.readFileSync(fp);
+    } catch (err) {
+      // Only ENOENT really means "not found"; permission errors, directories
+      // passed as files, etc. get a message that says what actually happened.
+      const reason = err.code === 'ENOENT'
+        ? `Adblock rules file not found: ${fp}`
+        : `Adblock rules file could not be read: ${fp} (${err.message})`;
+      throw new Error(reason, { cause: err });
+    }
+    buffers.push(buf);
+    hash.update(buf);
+    hash.update('\0');
+    totalBytes += buf.length;
+  }
+  const cacheKey = hash.digest('hex');
+  const cachePath = path.join(cacheDir, cacheKey + '.bin');
+  let engine = null;
+  let ruleCount = 0;
+  let cacheHit = false;
+  // Fast path: deserialize a previously-compiled engine if available.
+  // Skip the existsSync/readFileSync double-syscall pattern — let readFileSync
+  // throw ENOENT and treat it as a clean cache-miss. Avoids a redundant stat()
+  // and the TOCTOU race where the cache file could be removed between the
+  // exists check and the read.
+  if (useDiskCache) {
+    let compiled;
+    try {
+      compiled = fs.readFileSync(cachePath);
+    } catch (err) {
+      if (err.code !== 'ENOENT' && enableLogging) {
+        console.log(`[Adblock-Rust] Cache read failed (${err.message}); reparsing`);
+      }
+    }
+    if (compiled) {
+      try {
+        engine = new rust.Engine(new rust.FilterSet(enableLogging), true);
+        // Avoid copying the ~10MB serialized engine when the underlying
+        // ArrayBuffer is exclusively ours (true for any read above Node's
+        // ~4KB Buffer pool threshold — i.e. always for compiled engines).
+        // Fall back to slicing only when the Buffer is a view into a pooled
+        // backing store, which would otherwise leak unrelated data.
+        const ab = (compiled.byteOffset === 0 &&
+                    compiled.byteLength === compiled.buffer.byteLength)
+          ? compiled.buffer
+          : compiled.buffer.slice(
+              compiled.byteOffset,
+              compiled.byteOffset + compiled.byteLength
+            );
+        engine.deserialize(ab);
+        cacheHit = true;
+      } catch (err) {
+        // Corrupt cache or version mismatch — fall through to a fresh parse.
+        engine = null;
+        if (enableLogging) {
+          console.log(`[Adblock-Rust] Cache deserialize failed (${err.message}); reparsing`);
+        }
+      }
+    }
+  }
+  if (!engine) {
+    // Slow path: parse every list. Use addFilters per-file so a single bad
+    // line in one list does not blast the whole input, and so the per-list
+    // line count is correct. Release each buffer's reference as soon as it
+    // is consumed so GC can reclaim the file bytes mid-loop instead of
+    // holding all input files (~3-5MB combined for easylist+easyprivacy)
+    // alive until the function returns.
+    const filterSet = new rust.FilterSet(enableLogging);
+    for (let i = 0; i < buffers.length; i++) {
+      const buf = buffers[i];
+      buffers[i] = null;
+      // Split on LF or CRLF so lists saved with Windows line endings do not
+      // hand '\r'-terminated filters to the engine or count "\r"-only blank
+      // lines as rules.
+      const lines = buf.toString('utf-8').split(/\r?\n/);
+      for (let j = 0; j < lines.length; j++) {
+        const line = lines[j];
+        if (line.length === 0) continue;
+        // 0x21 is '!' — the adblock comment marker; comments are not rules.
+        if (line.charCodeAt(0) === 0x21) continue;
+        ruleCount++;
+      }
+      filterSet.addFilters(lines);
+    }
+    engine = new rust.Engine(filterSet, true);
+    if (useDiskCache) {
+      try {
+        fs.mkdirSync(cacheDir, { recursive: true });
+        const serialized = engine.serialize();
+        // Atomic write: writeFileSync to a per-pid tmp path then rename. If
+        // the process is killed mid-write we leave a stray .tmp file (cleaned
+        // up by the TTL prune on a future run) but the final cachePath is
+        // either complete or absent — never half-written.
+        const tmpPath = cachePath + '.' + process.pid + '.tmp';
+        try {
+          fs.writeFileSync(tmpPath, Buffer.from(serialized));
+          fs.renameSync(tmpPath, cachePath);
+        } catch (writeErr) {
+          // The process is still alive, so remove the partial tmp file now
+          // instead of leaving it for the TTL prune; then report the failure
+          // through the outer handler.
+          try { fs.unlinkSync(tmpPath); } catch (_) { /* nothing to remove — fine */ }
+          throw writeErr;
+        }
+        // Best-effort prune of stale cache files. Done after our own write so
+        // we never delete the entry we just created.
+        pruneOldCacheFiles(cacheDir, cacheTtlMs);
+      } catch (err) {
+        if (enableLogging) {
+          console.log(`[Adblock-Rust] Cache write failed (${err.message}); continuing`);
+        }
+      }
+    }
+  }
+  const stats = {
+    // When deserialized from cache we don't see the rules; report bytes instead
+    // so the startup banner remains informative.
+    total: cacheHit ? null : ruleCount,
+    bytes: totalBytes,
+    engine: 'adblock-rust',
+    fromDiskCache: cacheHit,
+    listCount: filePaths.length,
+    blocked: 0,
+    allowed: 0,
+    exceptions: 0,
+    errors: 0,
+    cacheHits: 0,
+    cacheMisses: 0
+  };
+  const resultCache = new ResultLRU(resultCacheSize);
+  // Hot-path optimization: shared "no_match" object — most checks return this,
+  // skip per-call object allocation. Safe because callers only read fields.
+  const NO_MATCH = Object.freeze({ blocked: false, rule: null, reason: 'no_match' });
+  // Bind once: skips the prototype property lookup for `engine.check` on every
+  // call. The adblock-rs forwarder still does an internal name concat per
+  // invocation; bypassing that further would require reaching into the native
+  // binding (engine.boxed + blocker.Engine_check), which is brittle across
+  // library versions.
+  const engineCheck = engine.check.bind(engine);
+  if (enableLogging) {
+    if (cacheHit) {
+      console.log(`[Adblock-Rust] Restored compiled engine from ${cachePath} (${(totalBytes/1024/1024).toFixed(2)}MB source, ${filePaths.length} list${filePaths.length>1?'s':''})`);
+    } else {
+      console.log(`[Adblock-Rust] Compiled ${ruleCount} rules from ${filePaths.length} list${filePaths.length>1?'s':''} (${(totalBytes/1024/1024).toFixed(2)}MB)`);
+    }
+  }
+  return {
+    rules: { stats },
+    shouldBlock(url, sourceUrl, resourceType) {
+      // Avoid default-parameter syntax in the hot path — explicit null/undefined
+      // checks are slightly cheaper for V8's argument adaptor.
+      const src = sourceUrl || '';
+      const rt = resourceType || '';
+      // Single null-proto object lookup; falls back to 'other' for unknown types.
+      const normType = rt ? (RESOURCE_TYPE_MAP[rt] || 'other') : '';
+      const key = url + '\0' + src + '\0' + normType;
+      const cached = resultCache.get(key);
+      if (cached !== undefined) {
+        stats.cacheHits++;
+        return cached;
+      }
+      stats.cacheMisses++;
+      // Narrow try/catch to the native call only — keeps the rest of the
+      // function on TurboFan's fast path and avoids exception-handler overhead
+      // on stats updates and Map operations.
+      let result;
+      try {
+        // Pass empty string (not the request URL) when source is unknown — the
+        // engine then skips first/third-party determination instead of treating
+        // the request as same-origin to itself, which would suppress
+        // $third-party rules entirely.
+        // The 4th arg MUST be true: with false adblock-rs returns a bare
+        // boolean instead of the {matched, exception, filter, important}
+        // object we read below, which silently breaks matching.
+        result = engineCheck(url, src, normType, true);
+      } catch (err) {
+        stats.errors++;
+        if (enableLogging) {
+          console.log(`[Adblock-Rust] Error checking ${url}: ${err.message}`);
+        }
+        // Don't cache errors — next call may succeed (transient native panic).
+        return { blocked: false, rule: null, reason: 'error' };
+      }
+      // engine.check is contract-bound to return an object; no null guard
+      // needed. Reading each field once into a local keeps the IC monomorphic.
+      let r;
+      if (result.matched) {
+        const exception = result.exception;
+        if (exception) {
+          stats.exceptions++;
+          r = { blocked: false, rule: exception, reason: 'whitelisted' };
+        } else {
+          stats.blocked++;
+          r = {
+            blocked: true,
+            rule: result.filter || null,
+            reason: result.important ? 'important_rule' : 'adblock_rust'
+          };
+        }
+      } else {
+        stats.allowed++;
+        r = NO_MATCH;
+      }
+      resultCache.set(key, r);
+      return r;
+    },
+    getStats() {
+      const total = stats.cacheHits + stats.cacheMisses;
+      const hitRate = total > 0 ? ((stats.cacheHits / total) * 100).toFixed(1) + '%' : '0%';
+      return {
+        ...stats,
+        cache: {
+          hits: stats.cacheHits,
+          misses: stats.cacheMisses,
+          hitRate,
+          size: resultCache.cache.size,
+          maxSize: resultCache.maxSize
+        }
+      };
+    }
+  };
+}
+module.exports = {
+  parseAdblockRules
+};

package/lib/output.js CHANGED Viewed

@@ -324,35 +324,40 @@ function buildOutputLines(results, options = {}) {
   const { showTitles = false, removeDupes = false, ignoreDomains = [], forLogFile = false } = options;
   // Consolidate rules from all results, handling multiple results for same URL
-  const consolidatedRules = new Map(); // URL -> Set of rules
+  const consolidatedRules = new Map(); // URL -> { rules: Set, originalUrl, regexes: Set }
   let successfulPageLoads = 0;
   results.forEach(result => {
     if (result) {
       if (result.success) {
         successfulPageLoads++;
       }
       if (result.rules && result.rules.length > 0) {
-        // Consolidate rules by URL to handle multiple site entries for same URL
         if (!consolidatedRules.has(result.url)) {
-          consolidatedRules.set(result.url, new Set());
+          consolidatedRules.set(result.url, { rules: new Set(), originalUrl: result.originalUrl || result.url, regexes: new Set() });
+        }
+        const entry = consolidatedRules.get(result.url);
+        result.rules.forEach(rule => entry.rules.add(rule));
+        if (Array.isArray(result.matchedRegexes)) {
+          result.matchedRegexes.forEach(rx => entry.regexes.add(rx));
+        }
+        // Prefer the original URL from any result entry that has one different from final
+        if (result.originalUrl && result.originalUrl !== result.url) {
+          entry.originalUrl = result.originalUrl;
         }
-        // Add all rules from this result to the consolidated set
-        result.rules.forEach(rule => {
-          consolidatedRules.get(result.url).add(rule);
-        });
       }
     }
   });
   // Convert consolidated rules back to array format
   const finalSiteRules = [];
-  consolidatedRules.forEach((rulesSet, url) => {
-    if (rulesSet.size > 0) {
-      finalSiteRules.push({
-        url: url,
-        rules: Array.from(rulesSet)
+  consolidatedRules.forEach((entry, url) => {
+    if (entry.rules.size > 0) {
+      finalSiteRules.push({
+        url: url,
+        originalUrl: entry.originalUrl,
+        regexes: Array.from(entry.regexes),
+        rules: Array.from(entry.rules)
       });
     }
   });
@@ -362,35 +367,41 @@ function buildOutputLines(results, options = {}) {
   const outputLinesWithTitles = [];
   let filteredOutCount = 0;
-  for (const { url, rules } of finalSiteRules) {
+  for (const { url, originalUrl, regexes, rules } of finalSiteRules) {
     if (rules.length > 0) {
+      // Build title comments — include redirect source if URL changed and matched regex(es)
+      const titleLines = [`! ${url}`];
+      if (originalUrl && originalUrl !== url) {
+        titleLines.push(`! Redirected from: ${originalUrl}`);
+      }
+      if (regexes && regexes.length > 0) {
+        titleLines.push(`! Regex: ${regexes.join(', ')}`);
+      }
       // Regular output (for -o files and console) - only add titles if --titles flag used
       if (showTitles) {
-        outputLines.push(`! ${url}`);
+        outputLines.push(...titleLines);
       }
       // Filter out ignored domains from rules
       const filteredRules = rules.filter(rule => {
         const domain = extractDomainFromRule(rule);
         if (domain && matchesIgnoreDomain(domain, ignoreDomains)) {
           filteredOutCount++;
-          // Log each filtered domain
           if (options.forceDebug) {
             console.log(formatLogMessage('debug', `[output-filter] Removed rule matching ignoreDomains: ${rule} (domain: ${domain})`));
           } else if (!options.silentMode) {
             console.log(formatLogMessage('info', `Filtered out: ${domain}`));
-         }
+          }
           return false;
         }
         return true;
       });
       outputLines.push(...filteredRules);
       // Output with titles (for auto-saved log files) - always add titles
-      outputLinesWithTitles.push(`! ${url}`);
+      outputLinesWithTitles.push(...titleLines);
       outputLinesWithTitles.push(...filteredRules);
     }
   }

package/nwss.js CHANGED Viewed

@@ -58,7 +58,8 @@ const { clearSiteData } = require('./lib/clear_sitedata');
 // Referrer header generation
 const { getReferrerForUrl, validateReferrerConfig, validateReferrerDisable } = require('./lib/referrer');
 // Adblock rules parser
-const { parseAdblockRules } = require('./lib/adblock');
+const adblockJs = require('./lib/adblock');
+const adblockRust = require('./lib/adblock-rust');
 // WireGuard VPN
 const { connectForSite: wgConnect, disconnectForSite: wgDisconnect, disconnectAll: wgDisconnectAll, validateVpnConfig, normalizeVpnConfig } = require('./lib/wireguard_vpn');
 // OpenVPN
@@ -185,9 +186,19 @@ if (fs.existsSync(NWSSCONFIG_PATH)) {
     const nwssConfig = JSON.parse(fs.readFileSync(NWSSCONFIG_PATH, 'utf-8'));
     // Find which config file is being used (--custom-json <file> or positional .json arg)
     const customJsonIdx = args.findIndex(arg => arg === '--custom-json');
+    const positionalJson = (customJsonIdx === -1)
+      ? args.find(a => a.endsWith('.json') && !a.startsWith('--'))
+      : null;
     const configFilename = (customJsonIdx !== -1 && args[customJsonIdx + 1])
       ? args[customJsonIdx + 1]
-      : args.find(a => a.endsWith('.json') && !a.startsWith('--'));
+      : positionalJson;
+    // If a positional .json was used (not --custom-json), wire it to --custom-json
+    // so the real config loader picks it up instead of defaulting to config.json
+    if (positionalJson && customJsonIdx === -1) {
+      args.push('--custom-json', positionalJson);
+      process.argv.push('--custom-json', positionalJson);
+    }
     if (configFilename && nwssConfig.configs && nwssConfig.configs[configFilename]) {
       const settings = nwssConfig.configs[configFilename];
@@ -584,6 +595,22 @@ if (validateRules || validateRulesFile) {
   }
 }
+// Parse --adblock-engine=<js|rust> (default: js). Selects the matcher backend
+// used by --block-ads. The rust engine requires the optional adblock-rs package.
+const adblockEngineIndex = args.findIndex(arg => arg.startsWith('--adblock-engine'));
+let adblockEngineName = 'js';
+if (adblockEngineIndex !== -1) {
+  const engineArg = args[adblockEngineIndex].includes('=')
+    ? args[adblockEngineIndex].split('=')[1]
+    : args[adblockEngineIndex + 1];
+  if (engineArg === 'rust' || engineArg === 'js') {
+    adblockEngineName = engineArg;
+  } else {
+    console.log(`Error: --adblock-engine must be 'js' or 'rust' (got: ${engineArg})`);
+    process.exit(1);
+  }
+}
 // Parse --block-ads argument for request-level ad blocking (supports comma-separated lists)
 const blockAdsIndex = args.findIndex(arg => arg.startsWith('--block-ads'));
 if (blockAdsIndex !== -1) {
@@ -604,18 +631,31 @@ if (blockAdsIndex !== -1) {
     }
   }
-  // Concatenate multiple lists into a single temp file for the parser
-  let rulesFile = rulesFiles[0];
-  if (rulesFiles.length > 1) {
-    rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
-    const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
-    fs.writeFileSync(rulesFile, combined);
-  }
   adblockEnabled = true;
-  adblockMatcher = parseAdblockRules(rulesFile, { enableLogging: forceDebug });
+  const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
+  try {
+    if (engine === adblockRust) {
+      // Rust wrapper accepts an array directly — no temp file needed.
+      adblockMatcher = engine.parseAdblockRules(rulesFiles, { enableLogging: forceDebug });
+    } else {
+      // JS engine takes a single path; concat to a temp file when multiple lists.
+      let rulesFile = rulesFiles[0];
+      if (rulesFiles.length > 1) {
+        rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
+        const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
+        fs.writeFileSync(rulesFile, combined);
+      }
+      adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
+    }
+  } catch (err) {
+    console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
+    process.exit(1);
+  }
   const stats = adblockMatcher.getStats();
-  if (!silentMode) console.log(messageColors.success(`Adblock enabled: Loaded ${stats.total} blocking rules from ${rulesFiles.length} list${rulesFiles.length > 1 ? 's' : ''}`));
+  const ruleDesc = stats.total != null
+    ? `${stats.total} blocking rules`
+    : `compiled engine (cached)`;
+  if (!silentMode) console.log(messageColors.success(`Adblock enabled (${adblockEngineName}): Loaded ${ruleDesc} from ${rulesFiles.length} list${rulesFiles.length > 1 ? 's' : ''}`));
 }
 if (args.includes('--help') || args.includes('-h')) {
@@ -641,6 +681,9 @@ Output Format Options:
 Request Blocking:
   --block-ads=<file>             Block ads/trackers using EasyList format rules (||domain.com^, /ads/*, etc)
                                  Works at request-level for maximum performance
+                                 Supports comma-separated lists: --block-ads=easylist.txt,easyprivacy.txt
+  --adblock-engine=<js|rust>     Matcher backend for --block-ads (default: js)
+                                 'rust' uses Brave's adblock-rs (faster on large lists; needs: npm i adblock-rs)
 Per-config settings file (.nwssconfig):
   Place a .nwssconfig file in the project root to define per-config settings.
@@ -687,6 +730,7 @@ Validation Options:
 Global config.json options:
   ignoreDomains: ["domain.com", "*.ads.com"]     Domains to completely ignore (supports wildcards)
+  ignoreDomainsByUrl: ["regex1", "regex2"]       Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
   blocked: ["regex1", "regex2"]                   Global regex patterns to block requests (combined with per-site blocked)
   whois_server_mode: "random" or "cycle"      Default server selection mode for all sites (default: random)
   ignore_similar: true/false                      Ignore domains similar to already found domains (default: true)
@@ -854,8 +898,9 @@ try {
 // Extract config values while ignoring 'comments' field at global and site levels
 const {
   sites = [],
-  ignoreDomains = [],
-  blocked: globalBlocked = [],
+  ignoreDomains = [],
+  ignoreDomainsByUrl = [],
+  blocked: globalBlocked = [],
   whois_delay = 3000,
   whois_server_mode = 'random',
   ignore_similar = true,
@@ -901,6 +946,15 @@ for (const pattern of ignoreDomains) {
   }
 }
+// Compile ignoreDomainsByUrl patterns once — match request URLs to dynamically ignore domains
+const _ignoreDomainsByUrlRegexes = Array.isArray(ignoreDomainsByUrl)
+  ? ignoreDomainsByUrl.map(p => {
+      try { return getCompiledRegex(p); } catch { return null; }
+    }).filter(r => r)
+  : [];
+// Runtime Set of domains marked ignored by URL pattern matches — shared across all sites in this scan
+const _dynamicallyIgnoredDomains = new Set();
 // Apply global configuration overrides with validation
 // Priority: Command line args > config.json > defaults
 const MAX_CONCURRENT_SITES = (() => {
@@ -1312,6 +1366,8 @@ function shouldBypassCacheForUrl(url, siteConfig) {
 // Cache compiled wildcard regexes to avoid recompilation on every request
 const _wildcardRegexCache = new Map();
 function matchesIgnoreDomain(domain, ignorePatterns) {
+  // Dynamically ignored domains (from URL pattern matches via ignoreDomainsByUrl)
+  if (_dynamicallyIgnoredDomains.has(domain)) return true;
   // Fast path: exact match or suffix match against Set (O(n) for parts, but no regex)
   if (_ignoreDomainsExact.size > 0) {
     if (_ignoreDomainsExact.has(domain)) return true;
@@ -1789,6 +1845,10 @@ function setupFrameHandling(page, forceDebug) {
    * @returns {Promise<object>} A promise that resolves to an object containing scan results.
    */
   async function processUrl(currentUrl, siteConfig, browserInstance) {
+    // Preserve the original URL (before any redirect) for output display
+    const originalRequestedUrl = currentUrl;
+    // Track regex patterns that produced matches (for title comments in output)
+    const matchedRegexPatterns = new Set();
     // V8 Optimization: Single destructuring to avoid multiple property lookups
     const {
       firstParty,
@@ -2553,6 +2613,11 @@ function setupFrameHandling(page, forceDebug) {
       const blockedRegexes = Array.isArray(siteConfig.blocked)
         ? siteConfig.blocked.map(pattern => getCompiledRegex(pattern))
         : [];
+      // Pre-build Set for O(1) resourceType lookups (fired per request)
+      const allowedResourceTypesSet = Array.isArray(siteConfig.resourceTypes)
+        ? new Set(siteConfig.resourceTypes)
+        : null;
       // Combine site-specific with pre-compiled global blocked patterns
       const allBlockedRegexes = blockedRegexes.length > 0
@@ -2774,9 +2839,22 @@ function setupFrameHandling(page, forceDebug) {
           bufferedLogWrite(debugLogFile, logEntry);
         }
         const reqUrl = checkedUrl;
         const reqDomain = perSiteSubDomains ? fullSubdomain : checkedRootDomain;
+        // ignoreDomainsByUrl — if any pattern matches this URL, mark the root domain as ignored for the rest of the scan
+        if (_ignoreDomainsByUrlRegexes.length > 0 && checkedRootDomain && !_dynamicallyIgnoredDomains.has(checkedRootDomain)) {
+          for (let i = 0; i < _ignoreDomainsByUrlRegexes.length; i++) {
+            if (_ignoreDomainsByUrlRegexes[i].test(reqUrl)) {
+              _dynamicallyIgnoredDomains.add(checkedRootDomain);
+              if (forceDebug) {
+                console.log(formatLogMessage('debug', `[ignoreDomainsByUrl] ${checkedRootDomain} ignored — matched pattern: ${_ignoreDomainsByUrlRegexes[i].source}`));
+              }
+              break;
+            }
+          }
+        }
         let blockedMatchIndex = -1;
         for (let i = 0; i < allBlockedRegexes.length; i++) {
           if (allBlockedRegexes[i].test(reqUrl)) {
@@ -2801,14 +2879,14 @@ function setupFrameHandling(page, forceDebug) {
             if (reqDomain && !matchesIgnoreDomain(reqDomain, ignoreDomains)) {
               for (const re of regexes) {
                 if (re.test(reqUrl)) {
+                  const evenBlockedRegexPattern = re.source;
                   const resourceType = request.resourceType();
                   // Apply same filtering logic as unblocked requests
-                  const allowedResourceTypes = siteConfig.resourceTypes;
-                  if (!allowedResourceTypes || !Array.isArray(allowedResourceTypes) || allowedResourceTypes.includes(resourceType)) {
+                  if (!allowedResourceTypesSet || allowedResourceTypesSet.has(resourceType)) {
                     if (dryRunMode) {
                       addDryRunMatch(matchedDomains, {
-                        regex: matchedRegexPattern,
+                        regex: evenBlockedRegexPattern,
                         domain: reqDomain,
                         resourceType: resourceType,
                         fullUrl: reqUrl,
@@ -2818,10 +2896,11 @@ function setupFrameHandling(page, forceDebug) {
                     } else {
                       addMatchedDomain(reqDomain, resourceType, fullSubdomain);
                     }
+                    matchedRegexPatterns.add(evenBlockedRegexPattern);
                     if (siteConfig.verbose === 1) {
                       const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
-                      console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${matchedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
+                      console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${evenBlockedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
                     }
                     if (dumpUrls) {
                       const timestamp = new Date().toISOString();
@@ -2889,11 +2968,10 @@ function setupFrameHandling(page, forceDebug) {
            // *** UNIVERSAL RESOURCE TYPE FILTER ***
            // Check resourceTypes filter FIRST, before ANY processing (nettools, searchstring, immediate matching)
-           const allowedResourceTypes = siteConfig.resourceTypes;
-           if (allowedResourceTypes && Array.isArray(allowedResourceTypes) && allowedResourceTypes.length > 0) {
-             if (!allowedResourceTypes.includes(resourceType)) {
+           if (allowedResourceTypesSet && allowedResourceTypesSet.size > 0) {
+             if (!allowedResourceTypesSet.has(resourceType)) {
                if (forceDebug) {
-                 console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${allowedResourceTypes.join(', ')}]. Skipping ALL processing.`));
+                 console.log(formatLogMessage('debug', `URL ${reqUrl} matches regex but resourceType '${resourceType}' not in allowed types [${Array.from(allowedResourceTypesSet).join(', ')}]. Skipping ALL processing.`));
                }
                // Skip this URL entirely - doesn't match required resource types
                request.continue();
@@ -2981,6 +3059,7 @@ function setupFrameHandling(page, forceDebug) {
              } else {
                addMatchedDomain(reqDomain, resourceType);
              }
+             if (matchedRegexPattern) matchedRegexPatterns.add(matchedRegexPattern);
              if (siteConfig.verbose === 1) {
                const resourceInfo = (adblockRulesMode || siteConfig.adblock_rules) ? ` (${resourceType})` : '';
               console.log(formatLogMessage('match', `[${simplifiedCurrentUrl}] ${reqUrl} matched regex: ${matchedRegexPattern} and resourceType: ${resourceType}${resourceInfo}`));
@@ -4011,12 +4090,14 @@ function setupFrameHandling(page, forceDebug) {
       };
         const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
-        return {
-          url: currentUrl,
-          rules: formattedRules,
+        return {
+          url: currentUrl,
+          originalUrl: originalRequestedUrl,
+          rules: formattedRules,
           success: true,
           finalUrl: finalUrlAfterRedirect || currentUrl,
-          redirectDomains: redirectDomainsToExclude
+          redirectDomains: redirectDomainsToExclude,
+          matchedRegexes: Array.from(matchedRegexPatterns)
         };
       }
@@ -4072,13 +4153,15 @@ function setupFrameHandling(page, forceDebug) {
         };
         const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
         if (forceDebug) console.log(formatLogMessage('debug', `Saving ${formattedRules.length} rules despite page load failure`));
-        return {
-          url: currentUrl,
-          rules: formattedRules,
-          success: false,
+        return {
+          url: currentUrl,
+          originalUrl: originalRequestedUrl,
+          rules: formattedRules,
+          success: false,
           hasMatches: true,
           finalUrl: finalUrlAfterRedirect || currentUrl,
-          redirectDomains: redirectDomainsToExclude
+          redirectDomains: redirectDomainsToExclude,
+          matchedRegexes: Array.from(matchedRegexPatterns)
         };
       }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@fanboynz/network-scanner",
-  "version": "2.0.62",
+  "version": "2.0.64",
   "description": "A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.",
   "main": "nwss.js",
   "scripts": {
@@ -11,8 +11,8 @@
   },
   "dependencies": {
     "ghost-cursor": "^1.4.2",
-    "lru-cache": "^10.4.3",
-    "p-limit": "^4.0.0",
+    "lru-cache": "^11.3.5",
+    "p-limit": "^7.3.0",
     "psl": "^1.15.0",
     "puppeteer": ">=20.0.0"
   },
@@ -36,7 +36,7 @@
   "author": "FanboyNZ",
   "license": "GPL-3.0",
   "engines": {
-    "node": ">=20.0.0"
+    "node": ">=22.0.0"
   },
   "repository": {
     "type": "git",
@@ -50,10 +50,11 @@
   },
   "homepage": "https://github.com/ryanbr/network-scanner",
   "optionalDependencies": {
+    "adblock-rs": "^0.12.3",
     "puppeteer-core": ">=20.0.0"
   },
   "devDependencies": {
     "eslint": "^10.0.2",
-    "globals": "^16.3.0"
+    "globals": "^17.6.0"
   }
 }