@fanboynz/network-scanner 2.0.66 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,41 +1,165 @@
1
1
  name: Publish to NPM
2
+
2
3
  on:
3
4
  workflow_dispatch:
5
+ inputs:
6
+ version_bump:
7
+ description: 'Version bump type'
8
+ required: true
9
+ default: 'patch'
10
+ type: choice
11
+ options:
12
+ - patch
13
+ - minor
14
+ - major
15
+ - explicit
16
+ explicit_version:
17
+ description: 'Explicit version (required if version_bump = explicit, e.g. 3.0.0)'
18
+ required: false
19
+ default: ''
20
+ release_notes_source:
21
+ description: 'GitHub Release body source'
22
+ required: true
23
+ default: 'changelog'
24
+ type: choice
25
+ options:
26
+ - changelog
27
+ - manual
28
+ manual_release_notes:
29
+ description: 'Manual release notes (used if release_notes_source = manual; supports markdown)'
30
+ required: false
31
+ default: ''
32
+ prerelease:
33
+ description: 'Mark GitHub Release as pre-release'
34
+ required: false
35
+ default: false
36
+ type: boolean
4
37
 
5
38
  jobs:
6
39
  publish:
7
40
  runs-on: ubuntu-latest
8
41
  permissions:
9
42
  contents: write
10
-
43
+
11
44
  steps:
12
45
  - uses: actions/checkout@v5
13
46
  with:
14
47
  token: ${{ secrets.GITHUB_TOKEN }}
15
48
  fetch-depth: 0
16
-
49
+
17
50
  - name: Setup Node.js
18
51
  uses: actions/setup-node@v5
19
52
  with:
20
53
  node-version: '20'
21
54
  registry-url: 'https://registry.npmjs.org'
22
-
55
+
23
56
  - run: npm ci
24
57
  - run: npm run lint
25
-
58
+
26
59
  - name: Configure git
27
60
  run: |
28
61
  git config user.name "github-actions[bot]"
29
62
  git config user.email "github-actions[bot]@users.noreply.github.com"
30
-
31
- - name: Version and publish
63
+
64
+ - name: Validate explicit-version input
65
+ if: inputs.version_bump == 'explicit'
66
+ run: |
67
+ if [ -z "${{ inputs.explicit_version }}" ]; then
68
+ echo "::error::version_bump=explicit but explicit_version is empty"
69
+ exit 1
70
+ fi
71
+ # Loose semver shape check -- npm version will do the strict validation
72
+ if ! echo "${{ inputs.explicit_version }}" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+([+-].+)?$'; then
73
+ echo "::error::explicit_version '${{ inputs.explicit_version }}' does not look like semver (e.g. 3.0.0)"
74
+ exit 1
75
+ fi
76
+
77
+ - name: Bump version + promote CHANGELOG Unreleased
78
+ id: version
79
+ run: |
80
+ # Use --no-git-tag-version so we can fold the CHANGELOG promotion
81
+ # into the same commit + tag. Otherwise the auto-commit npm creates
82
+ # would not include the changelog edit (and amending afterwards
83
+ # would orphan the tag).
84
+ if [ "${{ inputs.version_bump }}" = "explicit" ]; then
85
+ npm version "${{ inputs.explicit_version }}" --no-git-tag-version
86
+ else
87
+ npm version "${{ inputs.version_bump }}" --no-git-tag-version
88
+ fi
89
+ NEW_VERSION=$(node -p "require('./package.json').version")
90
+ echo "version=$NEW_VERSION" >> "$GITHUB_OUTPUT"
91
+ DATE=$(date -u +%Y-%m-%d)
92
+ echo "date=$DATE" >> "$GITHUB_OUTPUT"
93
+
94
+ # If CHANGELOG has an [Unreleased] section, rename its header to
95
+ # the new version + today's date. Idempotent: skips if the header
96
+ # is already a versioned one (e.g. for re-runs after a failure).
97
+ if grep -q '^## \[Unreleased\]$' CHANGELOG.md; then
98
+ sed -i "s/^## \[Unreleased\]\$/## [$NEW_VERSION] - $DATE/" CHANGELOG.md
99
+ echo "Promoted [Unreleased] -> [$NEW_VERSION] - $DATE"
100
+ else
101
+ echo "::warning::No [Unreleased] section in CHANGELOG.md; skipping promotion. Release notes will use whatever already exists at [$NEW_VERSION]."
102
+ fi
103
+
104
+ # Single combined commit + tag so the tag points at HEAD containing
105
+ # BOTH the package.json bump AND the changelog promotion.
106
+ git add package.json package-lock.json CHANGELOG.md
107
+ git commit -m "$NEW_VERSION"
108
+ git tag "v$NEW_VERSION"
109
+
110
+ - name: Extract release notes from CHANGELOG
111
+ id: changelog_notes
112
+ if: inputs.release_notes_source == 'changelog'
113
+ run: |
114
+ NEW_VERSION="${{ steps.version.outputs.version }}"
115
+ # Pull every line between `## [VERSION]` and the next `## [` header.
116
+ # awk: flag=1 after the start line; flag=0 at the next ## [ header.
117
+ NOTES=$(awk -v v="$NEW_VERSION" '
118
+ $0 ~ "^## \\[" v "\\]" { flag = 1; next }
119
+ $0 ~ "^## \\[" { flag = 0 }
120
+ flag { print }
121
+ ' CHANGELOG.md)
122
+
123
+ if [ -z "$(echo "$NOTES" | tr -d '[:space:]')" ]; then
124
+ echo "::warning::No CHANGELOG section found for $NEW_VERSION -- release body will be empty"
125
+ fi
126
+
127
+ # Multi-line GITHUB_OUTPUT requires a delimited heredoc
128
+ {
129
+ echo "notes<<NOTES_EOF"
130
+ echo "$NOTES"
131
+ echo "NOTES_EOF"
132
+ } >> "$GITHUB_OUTPUT"
133
+
134
+ - name: Use manual release notes
135
+ id: manual_notes
136
+ if: inputs.release_notes_source == 'manual'
32
137
  run: |
33
- npm version patch
34
- npm publish
138
+ {
139
+ echo "notes<<NOTES_EOF"
140
+ echo "${{ inputs.manual_release_notes }}"
141
+ echo "NOTES_EOF"
142
+ } >> "$GITHUB_OUTPUT"
143
+
144
+ - name: Publish to npm
145
+ # Publish BEFORE pushing to GitHub: if npm rejects (auth/network/
146
+ # name-collision), nothing reaches GitHub and a re-run is clean.
147
+ run: npm publish
35
148
  env:
36
149
  NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
37
-
38
- - name: Push changes
150
+
151
+ - name: Push commits and tag
39
152
  run: git push --follow-tags
40
153
  env:
41
154
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
155
+
156
+ - name: Create GitHub Release
157
+ uses: softprops/action-gh-release@v2
158
+ with:
159
+ tag_name: v${{ steps.version.outputs.version }}
160
+ name: v${{ steps.version.outputs.version }}
161
+ body: ${{ inputs.release_notes_source == 'changelog' && steps.changelog_notes.outputs.notes || steps.manual_notes.outputs.notes }}
162
+ draft: false
163
+ prerelease: ${{ inputs.prerelease }}
164
+ env:
165
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
package/CHANGELOG.md CHANGED
@@ -2,6 +2,141 @@
2
2
 
3
3
  All notable changes to the Network Scanner (nwss.js) project.
4
4
 
5
+ ## [3.0.0] - 2026-05-23
6
+
7
+ ### Changed
8
+ - **Engines floor bumped**: `engines.node` from `>=22.0.0` to `>=22.12.0` to match Puppeteer 25's stable `require()`-of-ESM requirement. Anyone running on Node 22.0–22.11 will see an npm engine warning and should upgrade.
9
+ - **Puppeteer dependency floor bumped**: `puppeteer` and `puppeteer-core` from `>=20.0.0` to `>=24.0.0`. Range still permits both v24 and v25 — pick via `npm install puppeteer@24` or `npm install puppeteer@25` according to taste. Dev lockfile moved to `puppeteer@25.0.4`.
10
+ - Audit confirms no breaking-change impact from Puppeteer 25's `executablePath`/`defaultArgs` Promise return — neither is called in this codebase. `require('puppeteer')` continues to work on the now-ESM-only package thanks to Node 22.12+'s stable require-of-ESM.
11
+
12
+ ### Added
13
+ - `blockDomainsByUrl` config key (top-level) — regex patterns mirroring `ignoreDomainsByUrl` but for active blocking. A matching request URL aborts the triggering request via Puppeteer `request.abort()`, adds the request's root domain to a dynamic block set, and aborts all subsequent requests to that domain or its subdomains for the rest of the scan
14
+ - Cloudflare aggregate stats accessible via `getAggregateStats({reset})` — returns `byOutcome`, `bySolveMethod`, `maxDurationMs`, `avgDurationMs`, `failures`, `timedOut` counts; bumped on every URL regardless of debug mode
15
+ - Cloudflare per-stage timing breakdown in outcome lines: `q=Xms p=Xms c=Xms` (zero-stage suffixes omitted)
16
+ - Production-level Cloudflare outcome logs: `warn` severity for `!overallSuccess || timedOut`, `info` for 5xx origin-error pages, debug-only on success
17
+ - DNS pre-check positive-resolution shortcut — hosts already proven live by dig or whois within the cache TTL skip the c-ares pre-check via a `knownResolvedHostnames` index (also warmed at startup from disk-loaded dig/whois caches)
18
+ - DNS pre-check skip summary now reports both NXDOMAIN-cache and positive-cache savings: `DNS pre-check skipped: N URL(s) via M unresolvable host(s), N URL(s) via M resolved host(s)`
19
+ - `[blocked-stats]` per-pattern hit counters reported at scan end — surfaces which `blocked` patterns are doing work vs. which are stale
20
+ - `disable_adblock` per-site config flag to escape global ad-blocking layers
21
+ - `capture_popups` now runs whois/dig validation on matched popup URLs
22
+ - `lib/spawn-async.js` shared async-spawn helper module — consolidates 4 near-identical Promise wrappers across curl/grep/searchstring
23
+
24
+ ### Fixed
25
+ - **Security**: nettools shell-injection vector closed — `exec(string)` replaced with `execFile(cmd, args)` (no shell); config-supplied `whois_server` and `recordType` values can no longer execute commands via `$()`/backticks/etc.
26
+ - Cloudflare `detectChallengeLoop` off-by-one bug — counted the current URL against itself, tripping `>= 2` threshold one iteration early
27
+ - Cloudflare `detectChallengeLoop` threshold was unreachable with default `cloudflare_max_retries = 2`; new exact-match path catches reload-to-same-URL loops at attempt 2
28
+ - Cloudflare outcome cache namespace collision — now stored in a separate Map (was sharing keys with the detection cache, getting evicted by detection-cache pressure)
29
+ - `ignoreDomains` dynamic Set didn't cascade to subdomains — `ignoreDomainsByUrl` dynamic adds now apply parent-walk just like static config (e.g. dynamically-ignored `example.com` now also catches `cdn.example.com`)
30
+ - `blocked` / `blockDomainsByUrl` / `ignoreDomainsByUrl` regex compile failures unified — previously `blockDomainsByUrl`/`ignoreDomainsByUrl` patterns were silently dropped while `blocked` patterns threw; now all warn loudly with `[config] X pattern dropped (compile error): "..." -- regex msg` and continue
31
+ - adblock pattern-cache key mismatch — anchored patterns (`||example.com`) were missing their own cache because get/set used different keys
32
+ - grep AND-logic silently dropped non-matching rules; ENOBUFS silently truncated output on large pages
33
+ - Cloudflare debug logs rendered literal `"undefined"` when detection short-circuited on non-HTTP pages (popup → about:blank case)
34
+ - Outcome label `no_indicators` was misleading when detection short-circuited on a non-HTTP page URL; now correctly reports `skipped(non-http)`
35
+ - Cloudflare `handleLegacyCheckbox` selector list aligned with detection — dropped orphan `.cf-turnstile input[type="checkbox"]` selector that had no matching detection entry
36
+ - Cloudflare `safeWaitForNavigation` warn was unconditional; now `forceDebug`-gated (was spamming stderr on phishing-bypass nav failures in production)
37
+ - Cloudflare `enhancedParallelChallengeDetection` had zero callers — deleted
38
+ - `analyzeCloudflareChallenge` ignored managed-challenge signals (`.cf-managed-challenge`, `[data-cf-managed]`); now folded into `isChallengePresent`
39
+ - `isChallengeCompleted` double-queried the same DOM element; cached once
40
+ - Various correctness fixes across compare (inline hosts-comment stripping), curl, dry-run, flowproxy (error-path bug, cookie parsing), referrer, searchstring, validate_rules modules
41
+ - 30+ dead exports trimmed across nettools (11), cloudflare (18, then trimmed again after the refactor), adblock, adblock-rust, compare, dry-run
42
+
43
+ ### Improved
44
+ - Dig/whois cache TTL 14h → 20h, capacity 1000 → 2000 entries each — covers overnight scan-then-rescan cadence without forcing fresh lookups
45
+ - nettools disk-cache writes now atomic (tmp + rename) — a SIGKILL/OOM/power-loss mid-write no longer leaves a truncated file that wipes the cache on next load
46
+ - Corrupt `.digcache`/`.whoiscache` files surface a `[dns-cache] X was unreadable (...); starting fresh` warn instead of silently resetting
47
+ - `dnsCacheStats.freshDig`/`freshWhois` arrays capped at 1000 entries (FIFO) — no more unbounded growth on scans with thousands of unique fresh lookups
48
+ - nettools `enableDiskCache` made idempotent (uses the previously-dead `diskCacheEnabled` flag); also warms the resolved-hostnames index from loaded entries
49
+ - 200+ log sites unified through `formatLogMessage` + subsystem tags across cloudflare, adblock, adblock-rust, compare, ignore_similar, validate_rules, wireguard_vpn, dry-run, smart-cache, flowproxy, browserexit, redirect, post-processing, cdp, output, interaction modules
50
+ - Cloudflare `runWithRetries` helper extracted — verification-challenge and phishing-warning retry harnesses collapsed from ~150 lines of duplication to thin hook-driven wrappers
51
+ - Cloudflare 14-line debug block in `handleVerificationChallenge` collapsed to one structured line: `Challenge detected: turnstile=t js=f ... title="..."`
52
+ - Cloudflare timing constants pruned (4 dead, 1 dead local var); `waitForTimeout(page, ms)` renamed to `fastTimeout(ms)`, unused `page` arg dropped
53
+ - Cloudflare `attemptChallengeSolve` post-failure diagnostic + `JS challenge` body.textContent now capped (2KB) per poll — was materializing megabytes of text on content-heavy pages
54
+ - adblock-rust: zero-copy deserialize, eager buffer release, FIFOCache rename for honest naming
55
+ - `interaction.js` performance: ~350ms saved per no-click interaction, ~750ms per with-click
56
+ - nwss per-URL timeout 120s → 75s for faster hang recovery
57
+ - Popup handler honors both `ignoreDomainsByUrl` and `blockDomainsByUrl`
58
+ - Early `ignoreDomains` gate added at main request handler — skips dig/whois/regex cycles on ignored hostnames
59
+ - `--dns-cache` help text refreshed (was stale "3hr/4hr TTL"; now "20h TTL, 2000-entry cap each")
60
+
61
+ ## [2.0.66] - 2026-05-20
62
+
63
+ ### Added
64
+ - DNS pre-check before `page.goto()` to skip unresolvable hosts fast — `--no-dns-precheck` to disable
65
+ - In-process SOCKS5 auth relay so `socks5://user:pass@host` URLs work end-to-end
66
+ - socks-relay handshake-phase watchdog so stalled clients can't sit forever
67
+ - DNS pre-check EAI_AGAIN retry-once + FIFO cap on negative cache
68
+
69
+ ### Fixed
70
+ - proxy.js: SOCKS auth false-success + SOCKS4 remote-DNS footgun
71
+ - DNS pre-check was starving under scan load (`dns.lookup` queued behind Puppeteer's libuv threadpool); switched to `dns.resolve` (c-ares, no threadpool contention)
72
+ - DNS pre-check: clear the timeout timer when lookup wins the race
73
+ - Bumped `ws` override to >=8.20.1 (CVE-2026-45736, GHSA-58qx-3vcg-4xpx)
74
+
75
+ ### Improved
76
+ - Neutralize Fullscreen API so sites can't hijack the window in `--headful` mode
77
+ - socks-relay: disable Nagle + reject unoffered no-auth selection
78
+
79
+ ## [2.0.65] - 2026-05-15
80
+
81
+ ### Added
82
+ - Cloudflare 5xx origin-error page detection — recognizes `<domain> | 5xx: <reason>` titles, marks as `error_page(522)` etc. instead of treating as a bypass target
83
+ - Per-URL Cloudflare outcome summary log with cookie state + error-code signal
84
+ - HTTP status + cf-ray captured at `page.goto()` time and threaded through to the Cloudflare outcome line
85
+ - Surface Cloudflare 5xx origin-error page count in scan stats
86
+ - HANG CHECK: per-URL progress counter + per-URL timeout + short-circuit queued URLs on restart flag
87
+ - Surface adblock-rust engine stats in debug exit output
88
+
89
+ ### Fixed
90
+ - HANG CHECK detection logic was debug-gated and never fired in production
91
+ - `--validate-config` TDZ crash by moving block below config load
92
+ - Scan-exit hang: cleanups now run on normal completion (was relying on `process.exit(0)` to skip them)
93
+ - nettools: pending-lookup leak + signal-handler conflict with nwss.js cleanup
94
+ - cloudflare: null-safe error categorization, unref'd cache timer, body.textContent reuse
95
+ - Suppressed contradictory "no indicators / error page detected" log pair
96
+
97
+ ### Improved
98
+ - cloudflare: precompile skip-proto regex, combine within-category selectors, rename outcome key
99
+ - redirect.js: skip `detectCommonJSRedirects` in production, cap `outerHTML`, filter `chrome-error://`
100
+ - Cloudflare module banner + "no indicators" log deduped (was firing once per URL)
101
+ - npm update: adblock-rs, lru-cache, puppeteer patch bumps
102
+ - Removed dead `scanner-script-org.js` prototype
103
+
104
+ ## [2.0.64] - 2026-05-02
105
+
106
+ ### Added
107
+ - `--adblock-engine=rust` option using Brave's adblock-rs (faster on large filter lists; requires `npm install adblock-rs`)
108
+ - Cache hygiene: atomic write, version key, 30-day prune, JSDoc
109
+
110
+ ### Fixed
111
+ - adblock-rs always returning `no_match` (4th arg to `engine.check` was missing — caused silent total-block-failure)
112
+ - Drop existsSync before readFileSync in cache load path (avoids redundant stat + TOCTOU)
113
+
114
+ ### Improved
115
+ - Reduce wrapper memory: zero-copy deserialize, eager buffer release
116
+ - Bumped `engines.node` floor to >=22
117
+ - npm update: `p-limit` 4.0 → 7.x (ESM API unchanged), `lru-cache` 10.4 → 11.3 (drop-in), `globals` 16.5 → 17.6 (dev-dep), `eslint` patch bump
118
+ - V8 micro-opts in adblock-rs hot path (null-proto resource-type map, bound engine.check)
119
+
120
+ ## [2.0.63] - 2026-04-25
121
+
122
+ ### Added
123
+ - `ignoreDomainsByUrl` config (top-level) — regex patterns; if any request URL matches, the request's root domain is dynamically ignored for the rest of the scan
124
+ - Redirect source and matching regex now included in `adblock_rules` log titles
125
+
126
+ ### Fixed
127
+ - Positional `.json` arg was ignored by config loader (always defaulted to `config.json`)
128
+ - ReferenceError on `allowedResourceTypes` in debug log
129
+ - ReferenceError on `matchedRegexPattern` in even_blocked path
130
+
131
+ ### Improved
132
+ - Convert resourceTypes filter to Set for O(1) lookups in hot path
133
+ - Sample `config.json` filterRegex values updated
134
+
135
+ ## [2.0.62] - 2026-04-25
136
+
137
+ ### Fixed
138
+ - TypeError in `SmartCache.getStats` when `requestCache` fails to initialize
139
+
5
140
  ## [2.0.61] - 2026-03-17
6
141
 
7
142
  ### Added
package/CLAUDE.md CHANGED
@@ -4,46 +4,57 @@ Puppeteer-based network scanner for analyzing web traffic, generating adblock fi
4
4
 
5
5
  ## Project Structure
6
6
 
7
- - `nwss.js` — Main entry point (~4,600 lines). CLI args, URL processing, orchestration.
7
+ - `nwss.js` — Main entry point (~5,800 lines). CLI args, URL processing, orchestration.
8
8
  - `config.json` — Default scan configuration (sites, filters, options).
9
- - `lib/` — 28 focused, single-purpose modules:
9
+ - `lib/` — 32 focused, single-purpose modules:
10
10
  - `fingerprint.js` — Bot detection evasion (device/GPU/timezone spoofing)
11
11
  - `cloudflare.js` — Cloudflare challenge detection and solving
12
12
  - `browserhealth.js` — Memory management and browser lifecycle
13
13
  - `interaction.js` — Human-like mouse/scroll/typing simulation
14
+ - `ghost-cursor.js` — Bezier-curve cursor pathing for human-like mouse movement
14
15
  - `smart-cache.js` — Multi-layer caching with persistence
15
16
  - `nettools.js` — WHOIS/dig integration
16
17
  - `output.js` — Multi-format rule output (adblock, dnsmasq, unbound, pihole, etc.)
17
18
  - `proxy.js` — SOCKS5/HTTP proxy support
19
+ - `socks-relay.js` — Local SOCKS proxy relay/chain helper
18
20
  - `wireguard_vpn.js` / `openvpn_vpn.js` — VPN routing
19
- - `adblock.js` — Adblock filter parsing and validation
21
+ - `adblock.js` — Adblock filter parsing and validation (native JS engine)
22
+ - `adblock-rust.js` — Drop-in adblock.js replacement backed by Brave's `adblock-rs` Rust engine; same matcher shape (`shouldBlock`, `getStats`, `rules`) so callers swap with one `require()`
20
23
  - `validate_rules.js` — Domain and rule format validation
21
24
  - `colorize.js` — Console output formatting and colors
22
25
  - `domain-cache.js` — Domain detection cache for performance
23
26
  - `post-processing.js` — Result cleanup and deduplication
27
+ - `spawn-async.js` — Shared `runProcess(cmd, args, opts)` helper used by curl/grep/searchstring; resolves (never rejects) with `{code, signal, stdout, stderr, truncated, error}`, enforces timeout + stdout caps
24
28
  - `redirect.js`, `referrer.js`, `cdp.js`, `curl.js`, `grep.js`, `compare.js`, `compress.js`, `dry-run.js`, `browserexit.js`, `clear_sitedata.js`, `flowproxy.js`, `ignore_similar.js`, `searchstring.js`
25
29
  - `.github/workflows/npm-publish.yml` — Automated npm publishing
26
30
  - `nwss.1` — Man page
27
31
 
28
32
  ## Tech Stack
29
33
 
30
- - **Node.js** >=22.0.0
31
- - **puppeteer** >=20.0.0 — Headless browser automation
32
- - **psl** — Public Suffix List for domain parsing
34
+ - **Node.js** >=22.12.0 (required for stable `require()` of ESM-only puppeteer 25)
35
+ - **puppeteer** >=24.0.0 — Headless browser automation. Range permits both v24 and v25; dev lockfile is on v25.
36
+ - **psl** — Public Suffix List for domain parsing (prefer this over hand-curated TLD lists)
33
37
  - **lru-cache** — LRU cache implementation
34
38
  - **p-limit** — Concurrency limiting (dynamically imported)
39
+ - **adblock-rs** — Optional native Rust filter engine, used by `lib/adblock-rust.js`. Install with `npm install adblock-rs` (requires Rust toolchain). Not a hard dep — `lib/adblock.js` is the default.
35
40
  - **eslint** — Linting (`npm run lint`)
36
41
 
37
42
  ## Conventions
38
43
 
39
44
  - Store modular functionality in `./lib/` with focused, single-purpose modules
40
45
  - Use `messageColors` and `formatLogMessage` from `./lib/colorize` for consistent console output
46
+ - Prefix every log line with a subsystem tag, e.g. `const TAG = messageColors.processing('[adblock]');` then `formatLogMessage('warn', `${TAG} ...`)`. Keeps mixed-module output attributable; every module in `lib/` follows this — match it when adding new ones.
47
+ - Pick severities deliberately: `warn` for actual errors/failures (cache write fail, native exception), `debug` for diagnostic chatter (cache misses, parse summaries, per-match traces)
41
48
  - Implement timeout protection for all Puppeteer operations using `Promise.race` patterns
42
49
  - Handle browser lifecycle with comprehensive cleanup in try-finally blocks
43
50
  - Validate all external tool availability before use (grep, curl, whois, dig)
44
51
  - Use `forceDebug` flag for detailed logging, `silentMode` for minimal output
45
52
  - Use `Object.freeze` for constant configuration objects (TIMEOUTS, CACHE_LIMITS, CONCURRENCY_LIMITS)
46
- - Use `fastTimeout(ms)` helper instead of `node:timers/promises` for Puppeteer 22.x compatibility
53
+ - Use `fastTimeout(ms)` helper instead of `node:timers/promises` for delays — project convention since the Puppeteer 22.x `page.waitForTimeout` removal, retained as the standard for all Promise-based sleeps
54
+ - Prefer `runProcess` from `./lib/spawn-async` over bare `child_process.spawn`/`spawnSync` for new external-tool calls. It resolves (never rejects), enforces a SIGKILL timeout + stdout cap, and returns a uniform result object. `lib/wireguard_vpn.js` intentionally stays on `spawnSync` — startup-only validation paths where sync is simpler. Don't follow that exception unless you have the same justification.
55
+ - Prefer `net.isIP()` over hand-rolled IPv4/IPv6 regexes for IP validation
56
+ - For disk-cache writes use the atomic `tmpPath = path + '.' + pid + '.tmp'` + `fs.renameSync` pattern (see `lib/adblock-rust.js`) so a killed process never leaves a half-written cache file
57
+ - Keep `module.exports` minimal — trim helpers that have no external consumers (grep the repo before deciding); internal-only functions stay as functions but leave the exports surface
47
58
 
48
59
  ## Running
49
60
 
package/README.md CHANGED
@@ -66,7 +66,8 @@ A Puppeteer-based tool for scanning websites to find third-party (or optionally
66
66
  | `--use-puppeteer-core` | Use `puppeteer-core` with system Chrome instead of bundled Chromium |
67
67
  | `--use-obscura` | Connect to running Obscura CDP server (`ws://127.0.0.1:9222` or `OBSCURA_WS` env). Skips fingerprint injection — Obscura provides built-in stealth |
68
68
  | `--load-extension <path>` | Load unpacked Chrome extension from directory (can be used multiple times) |
69
- | `--dns-cache` | Persist dig/whois results to disk between runs (14hr TTL, `.digcache`/`.whoiscache`) |
69
+ | `--dns-cache` | Persist dig/whois results to disk between runs (20hr TTL, 2000-entry cap each, `.digcache`/`.whoiscache`). Disk writes are atomic (tmp + rename); corrupt cache files are detected on load with a `[dns-cache]` warn line and reset cleanly. |
70
+ | `--no-dns-precheck` | Disable per-URL DNS resolution check before page navigation. By default, hosts that dig/whois have already proven live (within the 20hr cache TTL) skip their c-ares pre-check via a positive-resolution index. |
70
71
  | `--block-ads=<files>` | Block ads using EasyList format rules (comma-separated: `easylist.txt,easyprivacy.txt`) |
71
72
  | `--cdp` | Enable Chrome DevTools Protocol logging (now per-page if enabled) |
72
73
  | `--remove-dupes` | Remove duplicate domains from output (only with `-o`) |
@@ -101,6 +102,12 @@ Example:
101
102
  "googleapis.com",
102
103
  "googletagmanager.com"
103
104
  ],
105
+ "ignoreDomainsByUrl": [
106
+ "\\/jwplayer\\/"
107
+ ],
108
+ "blockDomainsByUrl": [
109
+ "\\/tracker\\/"
110
+ ],
104
111
  "sites": [
105
112
  {
106
113
  "url": "https://example.com/",
@@ -461,9 +468,10 @@ These options go at the root level of your config.json:
461
468
 
462
469
  | Field | Values | Default | Description |
463
470
  |:---------------------|:-------|:-------:|:------------|
464
- | `ignoreDomains` | Array | - | Domains to completely ignore (supports wildcards like `*.ads.com`) |
465
- | `ignoreDomainsByUrl` | Array | - | Regex patterns; if a request URL matches, the request's root domain is dynamically ignored for the rest of the scan (e.g. `["\\/jwplayer\\/", "\\/build\\/assets\\/"]`) |
466
- | `blocked` | Array | - | Global regex patterns to block requests (combined with per-site blocked) |
471
+ | `ignoreDomains` | Array | - | Domains to completely ignore (supports wildcards like `*.ads.com`). Subdomains of any listed entry are also ignored via parent-walk (e.g. `example.com` ignores `cdn.example.com` and `a.b.example.com`). |
472
+ | `ignoreDomainsByUrl` | Array | - | Regex patterns; if a request URL matches, the request's root domain — and, by cascade, any of its subdomains — is dynamically ignored for the rest of the scan (the cascade matches the static `ignoreDomains` semantics). Example: `["\\/jwplayer\\/", "\\/build\\/assets\\/"]` |
473
+ | `blockDomainsByUrl` | Array | - | Symmetric to `ignoreDomainsByUrl` but for active blocking. Regex patterns; if a request URL matches, the request's root domain is added to a dynamic block set and ALL subsequent requests on that root (and subdomains) are aborted via Puppeteer for the rest of the scan. The triggering request itself is also aborted. Use when seeing a trigger URL is sufficient evidence the whole host is hostile. |
474
+ | `blocked` | Array | - | Global regex patterns to block requests (combined with per-site blocked). Patterns that fail to compile are warned about at scan start (`[config] blocked (global) pattern dropped (compile error): ...`) instead of crashing startup or silently disappearing. Per-pattern hit counts are reported at scan end via `[blocked-stats]` lines so stale patterns are easy to spot. |
467
475
  | `whois_server_mode` | String | `"random"` | Default server selection mode for all sites |
468
476
  | `ignore_similar` | Boolean | `true` | Ignore domains similar to already found domains |
469
477
  | `ignore_similar_threshold` | Integer | `80` | Similarity threshold percentage for ignore_similar |
@@ -10,6 +10,10 @@ const fs = require('fs');
10
10
  const path = require('path');
11
11
  const os = require('os');
12
12
  const crypto = require('crypto');
13
+ const { formatLogMessage, messageColors } = require('./colorize');
14
+ // Subsystem tag matches the project convention used by other modules
15
+ // (lib/adblock.js, flowproxy, cloudflare, curl, grep, etc.).
16
+ const ADBLOCK_RUST_TAG = messageColors.processing('[adblock-rust]');
13
17
 
14
18
  let adblockRust = null;
15
19
  let adblockRustVersion = null;
@@ -77,18 +81,19 @@ const RESOURCE_TYPE_MAP = Object.assign(Object.create(null), {
77
81
  '': ''
78
82
  });
79
83
 
80
- function normalizeResourceType(type) {
81
- if (!type) return '';
82
- return RESOURCE_TYPE_MAP[type] || 'other';
83
- }
84
+ // Removed: normalizeResourceType() helper. The hot path in shouldBlock
85
+ // inlines the (RESOURCE_TYPE_MAP[rt] || 'other') lookup directly to skip
86
+ // the function-call frame; the standalone helper had zero callers.
84
87
 
85
- // Small FIFO cache keyed on (url \0 sourceUrl \0 resourceType). Despite the
86
- // class name, eviction is insertion-order, not access-order — `get()` does not
87
- // promote. For this workload (per-page request bursts whose working set fits
88
- // in maxSize) FIFO and true LRU produce the same evictions, so the simpler
89
- // path wins. If cache effectiveness becomes a concern with larger working
90
- // sets, promote on hit by re-inserting (delete + set).
91
- class ResultLRU {
88
+ // Small FIFO cache keyed on (url \0 sourceUrl \0 resourceType). Eviction
89
+ // is insertion-order — `get()` does not promote. For this workload
90
+ // (per-page request bursts whose working set fits in maxSize) FIFO and
91
+ // true LRU produce the same evictions, so the simpler path wins. If
92
+ // cache effectiveness becomes a concern with larger working sets,
93
+ // promote on hit by re-inserting (delete + set). Renamed from ResultLRU
94
+ // since the previous name lied about the eviction policy — matches
95
+ // the FIFOCache rename in lib/adblock.js.
96
+ class FIFOCache {
92
97
  constructor(maxSize) {
93
98
  this.cache = new Map();
94
99
  this.maxSize = maxSize;
@@ -172,7 +177,7 @@ function parseAdblockRules(filePathOrArray, options = {}) {
172
177
  compiled = fs.readFileSync(cachePath);
173
178
  } catch (err) {
174
179
  if (err.code !== 'ENOENT' && enableLogging) {
175
- console.log(`[Adblock-Rust] Cache read failed (${err.message}); reparsing`);
180
+ console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Cache read failed (${err.message}); reparsing`));
176
181
  }
177
182
  }
178
183
  if (compiled) {
@@ -196,7 +201,7 @@ function parseAdblockRules(filePathOrArray, options = {}) {
196
201
  // Corrupt cache or version mismatch — fall through to a fresh parse.
197
202
  engine = null;
198
203
  if (enableLogging) {
199
- console.log(`[Adblock-Rust] Cache deserialize failed (${err.message}); reparsing`);
204
+ console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Cache deserialize failed (${err.message}); reparsing`));
200
205
  }
201
206
  }
202
207
  }
@@ -240,7 +245,7 @@ function parseAdblockRules(filePathOrArray, options = {}) {
240
245
  pruneOldCacheFiles(cacheDir, cacheTtlMs);
241
246
  } catch (err) {
242
247
  if (enableLogging) {
243
- console.log(`[Adblock-Rust] Cache write failed (${err.message}); continuing`);
248
+ console.log(formatLogMessage('warn', `${ADBLOCK_RUST_TAG} Cache write failed (${err.message}); continuing`));
244
249
  }
245
250
  }
246
251
  }
@@ -262,7 +267,7 @@ function parseAdblockRules(filePathOrArray, options = {}) {
262
267
  cacheMisses: 0
263
268
  };
264
269
 
265
- const resultCache = new ResultLRU(resultCacheSize);
270
+ const resultCache = new FIFOCache(resultCacheSize);
266
271
  // Hot-path optimization: shared "no_match" object — most checks return this,
267
272
  // skip per-call object allocation. Safe because callers only read fields.
268
273
  const NO_MATCH = Object.freeze({ blocked: false, rule: null, reason: 'no_match' });
@@ -275,9 +280,9 @@ function parseAdblockRules(filePathOrArray, options = {}) {
275
280
 
276
281
  if (enableLogging) {
277
282
  if (cacheHit) {
278
- console.log(`[Adblock-Rust] Restored compiled engine from ${cachePath} (${(totalBytes/1024/1024).toFixed(2)}MB source, ${filePaths.length} list${filePaths.length>1?'s':''})`);
283
+ console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Restored compiled engine from ${cachePath} (${(totalBytes/1024/1024).toFixed(2)}MB source, ${filePaths.length} list${filePaths.length>1?'s':''})`));
279
284
  } else {
280
- console.log(`[Adblock-Rust] Compiled ${ruleCount} rules from ${filePaths.length} list${filePaths.length>1?'s':''} (${(totalBytes/1024/1024).toFixed(2)}MB)`);
285
+ console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Compiled ${ruleCount} rules from ${filePaths.length} list${filePaths.length>1?'s':''} (${(totalBytes/1024/1024).toFixed(2)}MB)`));
281
286
  }
282
287
  }
283
288
 
@@ -315,7 +320,7 @@ function parseAdblockRules(filePathOrArray, options = {}) {
315
320
  } catch (err) {
316
321
  stats.errors++;
317
322
  if (enableLogging) {
318
- console.log(`[Adblock-Rust] Error checking ${url}: ${err.message}`);
323
+ console.log(formatLogMessage('warn', `${ADBLOCK_RUST_TAG} Error checking ${url}: ${err.message}`));
319
324
  }
320
325
  // Don't cache errors — next call may succeed (transient native panic).
321
326
  return { blocked: false, rule: null, reason: 'error' };