@fanboynz/network-scanner 3.0.3 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,11 +10,22 @@ const CLEAR_SITEDATA_TAG = messageColors.processing('[clear_sitedata]');
10
10
  * @param {Page} page - Puppeteer page instance
11
11
  * @param {string} currentUrl - URL being processed
12
12
  * @param {boolean} forceDebug - Debug logging flag
13
- * @param {boolean} quickMode - If true, only clear cache/cookies (for reloads)
13
+ * @param {boolean} quickMode - If true, skip the HEAVY storage types
14
+ * (IndexedDB, WebSQL, service workers) and the page-level fallback. Still
15
+ * clears cookies + cache + localStorage + sessionStorage, which are the
16
+ * four storage layers where session-cap tracking actually lives for
17
+ * ad/popunder networks. Used between reloads where full storage wipes
18
+ * would add unwanted latency on every cycle.
14
19
  * @returns {Promise<{success: boolean, operations: string[]}>}
15
20
  */
16
21
  async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = false) {
17
22
  let clearDataSession = null;
23
+ // Hoisted outside the try so the catch-block orphan-cleanup branch can
24
+ // reference it — same pattern as cdp.js (commit 0772ccd). Promise.race
25
+ // cannot cancel the underlying createCDPSession() call; if the 10s timer
26
+ // wins, the original promise may still resolve to a real session that
27
+ // nothing references → orphan on the browser side.
28
+ let sessionPromise = null;
18
29
  const completedOperations = [];
19
30
 
20
31
  try {
@@ -23,7 +34,7 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
23
34
  // path completed — reject was swallowed (race already settled)
24
35
  // but the timer kept the event loop reference for up to 10s.
25
36
  let timeoutTimer;
26
- const sessionPromise = page.target().createCDPSession();
37
+ sessionPromise = page.target().createCDPSession();
27
38
  const timeoutPromise = new Promise((_, reject) => {
28
39
  timeoutTimer = setTimeout(() => reject(new Error('CDP session timeout')), 10000);
29
40
  if (typeof timeoutTimer.unref === 'function') timeoutTimer.unref();
@@ -36,23 +47,46 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
36
47
 
37
48
  const origin = new URL(currentUrl).origin;
38
49
 
39
- // Always clear cache and cookies (even in quick mode)
40
- const basicOperations = [
41
- { cmd: 'Network.clearBrowserCookies', name: 'cookies' },
42
- { cmd: 'Network.clearBrowserCache', name: 'cache' }
50
+ // Always-on quick-mode batch: cookies + cache + localStorage + sessionStorage.
51
+ // All four are independent CDP methods touching different Chromium
52
+ // subsystems (Network domain vs Storage domain, distinct storage types) —
53
+ // no ordering dependency, no shared mutex. Previously these ran as 4
54
+ // sequential `await session.send(...)` calls, burning 3 extra CDP round-trips
55
+ // of pure wait time per site load; Promise.all collapses them to one
56
+ // slowest-of-four wait (~5-15ms saved per site load with clear_sitedata).
57
+ //
58
+ // localStorage/sessionStorage MUST stay in this always-on batch — they're
59
+ // where AdsCore-family popunder networks track per-session caps
60
+ // (aclibSubKey-popunder etc.; see commit 11e1f49). Skipping them in quick
61
+ // mode capped popunder discovery at ~1 capture per scan.
62
+ //
63
+ // Heavier storage types (IndexedDB, WebSQL, service workers) still gate
64
+ // on !quickMode below.
65
+ const parallelOps = [
66
+ { send: () => clearDataSession.send('Network.clearBrowserCookies'), name: 'cookies' },
67
+ { send: () => clearDataSession.send('Network.clearBrowserCache'), name: 'cache' },
68
+ { send: () => clearDataSession.send('Storage.clearDataForOrigin', { origin, storageTypes: 'local_storage' }), name: 'localStorage' },
69
+ { send: () => clearDataSession.send('Storage.clearDataForOrigin', { origin, storageTypes: 'session_storage' }), name: 'sessionStorage' }
43
70
  ];
44
-
45
- for (const op of basicOperations) {
46
- try {
47
- await clearDataSession.send(op.cmd);
48
- completedOperations.push(op.name);
49
- } catch (opErr) {
50
- if (forceDebug) {
51
- console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} ${op.name} clear failed: ${opErr.message}`));
52
- }
71
+
72
+ // Promise.all preserves input order in the results array, so iterating
73
+ // back in order gives the same completedOperations ordering as the old
74
+ // sequential loops (cookies, cache, localStorage, sessionStorage) — keeps
75
+ // debug logs stable.
76
+ const results = await Promise.all(parallelOps.map(op =>
77
+ op.send().then(
78
+ () => ({ name: op.name, ok: true }),
79
+ err => ({ name: op.name, ok: false, err })
80
+ )
81
+ ));
82
+ for (const r of results) {
83
+ if (r.ok) {
84
+ completedOperations.push(r.name);
85
+ } else if (forceDebug) {
86
+ console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} ${r.name} clear failed: ${r.err.message}`));
53
87
  }
54
88
  }
55
-
89
+
56
90
  // Full storage clearing (skip in quick mode for reloads)
57
91
  if (!quickMode) {
58
92
  // Try comprehensive storage clearing first
@@ -63,10 +97,11 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
63
97
  });
64
98
  completedOperations.push('all_storage');
65
99
  } catch (allStorageErr) {
66
- // Fallback: try individual storage types
100
+ // Fallback: try individual storage types. local_storage and
101
+ // session_storage are intentionally omitted here — they were already
102
+ // cleared by the always-on parallelOps batch earlier in this function,
103
+ // so re-clearing them would just add 2 wasted CDP roundtrips.
67
104
  const storageTypes = [
68
- { type: 'local_storage', name: 'localStorage' },
69
- { type: 'session_storage', name: 'sessionStorage' },
70
105
  { type: 'indexeddb', name: 'indexedDB' },
71
106
  { type: 'websql', name: 'webSQL' },
72
107
  { type: 'service_workers', name: 'serviceWorkers' }
@@ -95,6 +130,14 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
95
130
  return { success: completedOperations.length > 0, operations: completedOperations };
96
131
 
97
132
  } catch (cdpErr) {
133
+ // Orphan cleanup: if clearDataSession is null, the race lost (timer
134
+ // won) before the underlying createCDPSession() resolved. Attach a
135
+ // detach-on-resolve so the orphan is reaped if it arrives after we
136
+ // gave up. The outer finally only handles the case where the session
137
+ // was actually assigned. Same fix pattern as cdp.js L2.
138
+ if (!clearDataSession && sessionPromise) {
139
+ sessionPromise.then(s => s.detach().catch(() => {})).catch(() => {});
140
+ }
98
141
  if (forceDebug) {
99
142
  console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} CDP session failed: ${cdpErr.message}`));
100
143
  }
@@ -200,7 +243,12 @@ async function clearSiteDataViaPage(page, forceDebug) {
200
243
  * @param {Page} page - Puppeteer page instance
201
244
  * @param {string} currentUrl - URL being processed
202
245
  * @param {boolean} forceDebug - Debug logging flag
203
- * @param {boolean} quickMode - If true, only clear cache/cookies (for reloads)
246
+ * @param {boolean} quickMode - If true, skip heavy storage types
247
+ * (IndexedDB / WebSQL / serviceWorkers) and the page-level fallback. Still
248
+ * clears cookies + cache + localStorage + sessionStorage — the four
249
+ * storage layers where session-cap tracking actually lives for ad/
250
+ * popunder networks. Used between reloads where full clears would add
251
+ * per-cycle latency without freeing additional session state.
204
252
  * @returns {Promise<{success: boolean, operations: string[], method: string}>}
205
253
  */
206
254
  async function clearSiteData(page, currentUrl, forceDebug, quickMode = false) {
package/lib/compress.js CHANGED
@@ -4,54 +4,39 @@
4
4
  const fs = require('fs');
5
5
  const { formatLogMessage } = require('./colorize');
6
6
  const zlib = require('zlib');
7
- const path = require('path');
7
+ const { pipeline } = require('node:stream/promises');
8
8
 
9
9
  /**
10
- * Compresses a file using gzip and optionally removes the original
10
+ * Compresses a file using gzip and optionally removes the original.
11
+ * Uses stream.pipeline for automatic cleanup of all streams on any error
12
+ * (previously the manual readStream/gzipStream/writeStream wiring left the
13
+ * other two streams alive when one errored, holding their fds until GC).
11
14
  * @param {string} filePath - Path to the file to compress
12
15
  * @param {boolean} removeOriginal - Whether to remove the original file after compression
13
16
  * @returns {Promise<string>} - Path to the compressed file
14
17
  */
15
18
  async function compressFile(filePath, removeOriginal = true) {
16
- return new Promise((resolve, reject) => {
17
- const compressedPath = `${filePath}.gz`;
18
-
19
- // Create read and write streams
20
- const readStream = fs.createReadStream(filePath);
21
- const writeStream = fs.createWriteStream(compressedPath);
22
- const gzipStream = zlib.createGzip();
23
-
24
- // Handle errors
25
- const handleError = (error) => {
26
- // Clean up partial compressed file on error
27
- try {
28
- fs.unlinkSync(compressedPath);
29
- } catch (cleanupErr) {
30
- // Ignore cleanup errors
31
- }
32
- reject(error);
33
- };
34
-
35
- readStream.on('error', handleError);
36
- writeStream.on('error', handleError);
37
- gzipStream.on('error', handleError);
38
-
39
- // Handle successful completion
40
- writeStream.on('finish', () => {
41
- if (removeOriginal) {
42
- try {
43
- fs.unlinkSync(filePath);
44
- } catch (removeErr) {
45
- // If we can't remove original, still consider compression successful
46
- console.warn(formatLogMessage('warn', `Failed to remove original file ${filePath}: ${removeErr.message}`));
47
- }
48
- }
49
- resolve(compressedPath);
50
- });
51
-
52
- // Pipe the streams
53
- readStream.pipe(gzipStream).pipe(writeStream);
54
- });
19
+ const compressedPath = `${filePath}.gz`;
20
+ try {
21
+ await pipeline(
22
+ fs.createReadStream(filePath),
23
+ zlib.createGzip(),
24
+ fs.createWriteStream(compressedPath)
25
+ );
26
+ } catch (err) {
27
+ // Clean up partial compressed file on error
28
+ try { fs.unlinkSync(compressedPath); } catch { /* ignore */ }
29
+ throw err;
30
+ }
31
+ if (removeOriginal) {
32
+ try {
33
+ fs.unlinkSync(filePath);
34
+ } catch (removeErr) {
35
+ // If we can't remove the original, compression is still successful
36
+ console.warn(formatLogMessage('warn', `Failed to remove original file ${filePath}: ${removeErr.message}`));
37
+ }
38
+ }
39
+ return compressedPath;
55
40
  }
56
41
 
57
42
  /**
@@ -84,22 +69,6 @@ async function compressMultipleFiles(filePaths, removeOriginals = true) {
84
69
  return results;
85
70
  }
86
71
 
87
- /**
88
- * Gets the compression ratio of a file
89
- * @param {string} originalPath - Path to original file
90
- * @param {string} compressedPath - Path to compressed file
91
- * @returns {number} - Compression ratio (0-1, where 0.5 means 50% of original size)
92
- */
93
- function getCompressionRatio(originalPath, compressedPath) {
94
- try {
95
- const originalSize = fs.statSync(originalPath).size;
96
- const compressedSize = fs.statSync(compressedPath).size;
97
- return compressedSize / originalSize;
98
- } catch (error) {
99
- return null;
100
- }
101
- }
102
-
103
72
  /**
104
73
  * Formats file size in human readable format
105
74
  * @param {number} bytes - Size in bytes
@@ -116,6 +85,5 @@ function formatFileSize(bytes) {
116
85
  module.exports = {
117
86
  compressFile,
118
87
  compressMultipleFiles,
119
- getCompressionRatio,
120
88
  formatFileSize
121
89
  };
package/lib/curl.js CHANGED
@@ -78,7 +78,12 @@ async function downloadWithCurl(url, userAgent = '', options = {}) {
78
78
  Object.entries(customHeaders).forEach(([key, value]) => {
79
79
  curlArgs.push('-H', `${key}: ${value}`);
80
80
  });
81
- curlArgs.push(url);
81
+ // '--' end-of-options marker before the URL — defense in depth. Today
82
+ // the URL is bounded by upstream `new URL(requestUrl)` validation which
83
+ // guarantees a 'scheme://' prefix (so the string never starts with '-'),
84
+ // but '--' guards against any future caller that bypasses that check
85
+ // from accidentally letting curl reinterpret the URL as a flag.
86
+ curlArgs.push('--', url);
82
87
 
83
88
  // Shared async-spawn helper handles streaming/cap/timeout/kill plumbing.
84
89
  const result = await runProcess('curl', curlArgs, {
@@ -125,14 +130,25 @@ async function downloadWithCurl(url, userAgent = '', options = {}) {
125
130
  }
126
131
 
127
132
  /**
128
- * Searches content for patterns using JavaScript (case-insensitive)
133
+ * Searches content for patterns using JavaScript (case-insensitive).
134
+ *
135
+ * Patterns are passed in both original and pre-lowered form — the
136
+ * lowered arrays are used for matching, the original arrays for the
137
+ * user-facing return values (matchedPattern, foundPatterns,
138
+ * missingPatterns). Pre-lowering is done ONCE at handler creation
139
+ * (createCurlHandler) instead of per-call here — patterns never change
140
+ * across calls, so the previous per-call .toLowerCase() / .map() work
141
+ * was wasted on every matched URL.
142
+ *
129
143
  * @param {string} content - Content to search
130
- * @param {Array<string>} searchStrings - OR patterns (any can match)
131
- * @param {Array<string>} searchStringsAnd - AND patterns (all must match)
144
+ * @param {Array<string>} searchStrings - OR patterns (any can match), original case
145
+ * @param {Array<string>} searchStringsLower - OR patterns, pre-lowercased
146
+ * @param {Array<string>} searchStringsAnd - AND patterns, original case
147
+ * @param {Array<string>} searchStringsAndLower - AND patterns, pre-lowercased
132
148
  * @param {boolean} hasSearchStringAnd - Whether AND logic is being used
133
149
  * @returns {object} Search result with found status and matched pattern
134
150
  */
135
- function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSearchStringAnd = false) {
151
+ function searchContent(content, searchStrings, searchStringsLower, searchStringsAnd, searchStringsAndLower, hasSearchStringAnd) {
136
152
  if (!content || content.length === 0) {
137
153
  return { found: false, matchedPattern: null, matchType: null };
138
154
  }
@@ -144,13 +160,9 @@ function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSe
144
160
  // entire list to build a full missingPatterns array that's only used
145
161
  // by a debug log. Now we early-exit and report the first miss (the
146
162
  // debug log's missingPatterns.join(', ') still works with one entry).
147
- if (hasSearchStringAnd && searchStringsAnd.length > 0) {
148
- // Pre-lower patterns once was per-iteration toLowerCase before.
149
- // For a 20-pattern AND check the difference is small per call but
150
- // the pattern itself never changes between iterations of the loop.
151
- const lowered = searchStringsAnd.map(p => p.toLowerCase());
152
- for (let i = 0; i < searchStringsAnd.length; i++) {
153
- if (!lowerContent.includes(lowered[i])) {
163
+ if (hasSearchStringAnd && searchStringsAndLower.length > 0) {
164
+ for (let i = 0; i < searchStringsAndLower.length; i++) {
165
+ if (!lowerContent.includes(searchStringsAndLower[i])) {
154
166
  return {
155
167
  found: false,
156
168
  matchedPattern: null,
@@ -169,12 +181,10 @@ function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSe
169
181
  };
170
182
  }
171
183
 
172
- // Handle OR logic searchstring (any pattern can match). Same pre-lower
173
- // optimization, though OR usually short-circuits early so the savings
174
- // are smaller.
175
- if (searchStrings.length > 0) {
176
- for (let i = 0; i < searchStrings.length; i++) {
177
- if (lowerContent.includes(searchStrings[i].toLowerCase())) {
184
+ // Handle OR logic searchstring (any pattern can match).
185
+ if (searchStringsLower.length > 0) {
186
+ for (let i = 0; i < searchStringsLower.length; i++) {
187
+ if (lowerContent.includes(searchStringsLower[i])) {
178
188
  return {
179
189
  found: true,
180
190
  matchedPattern: searchStrings[i],
@@ -271,6 +281,14 @@ function createCurlHandler(config) {
271
281
  let currentRootDomain = '';
272
282
  try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}
273
283
 
284
+ // Pre-lower searchstring patterns ONCE at handler creation. Patterns
285
+ // never change across calls, so the previous per-call .toLowerCase()
286
+ // inside searchContent wasted work on every matched URL × N patterns.
287
+ // Default to [] when undefined so searchContent's `.length` checks
288
+ // don't fault — matches the prior default-arg behavior.
289
+ const searchStringsLower = (searchStrings || []).map(p => p.toLowerCase());
290
+ const searchStringsAndLower = (searchStringsAnd || []).map(p => p.toLowerCase());
291
+
274
292
  return async function curlHandler(requestUrl) {
275
293
  try {
276
294
  // Regex check FIRST — cheap filter that skips ~99% of requests.
@@ -413,11 +431,15 @@ function createCurlHandler(config) {
413
431
  }
414
432
  }
415
433
 
416
- // Search content for patterns
434
+ // Search content for patterns — pass pre-lowered patterns alongside
435
+ // originals so the matching path uses the cached lowercase form and
436
+ // the result still reports the user-facing original-case strings.
417
437
  const searchResult = searchContent(
418
- downloadResult.content,
419
- searchStrings,
420
- searchStringsAnd,
438
+ downloadResult.content,
439
+ searchStrings || [],
440
+ searchStringsLower,
441
+ searchStringsAnd || [],
442
+ searchStringsAndLower,
421
443
  hasSearchStringAnd
422
444
  );
423
445
 
@@ -199,14 +199,6 @@ class DomainCache {
199
199
  }
200
200
  }
201
201
 
202
- /**
203
- * Get all cached domains (for debugging)
204
- * @returns {Array<string>} Array of cached domains
205
- */
206
- getAllCachedDomains() {
207
- return Array.from(this.cache);
208
- }
209
-
210
202
  /**
211
203
  * Check if cache contains a specific domain (without updating stats)
212
204
  * @param {string} domain - Domain to check
@@ -216,21 +208,6 @@ class DomainCache {
216
208
  return this.cache.has(domain);
217
209
  }
218
210
 
219
- /**
220
- * Remove a specific domain from cache
221
- * @param {string} domain - Domain to remove
222
- * @returns {boolean} True if domain was removed, false if it wasn't in cache
223
- */
224
- removeDomain(domain) {
225
- const wasRemoved = this.cache.delete(domain);
226
-
227
- if (wasRemoved && this.enableLogging) {
228
- console.log(formatLogMessage('debug', `${this.logPrefix} Removed from cache: ${domain}`));
229
- }
230
-
231
- return wasRemoved;
232
- }
233
-
234
211
  /**
235
212
  * Add multiple domains to cache at once. Uses a single .size delta to
236
213
  * count actually-new entries (skipping per-domain .has() calls), and
@@ -347,38 +324,15 @@ function resetGlobalCache() {
347
324
  /**
348
325
  * Legacy wrapper functions for backward compatibility
349
326
  * These match the original function signatures from nwss.js
327
+ *
328
+ * NOTE: getTotalDomainsSkipped and getDetectedDomainsCount are the only
329
+ * ones kept — they're used directly by nwss.js for end-of-scan stats.
330
+ * Previously-defined isDomainAlreadyDetected / markDomainAsDetected /
331
+ * checkAndMark wrappers were removed: nwss.js calls those via
332
+ * createGlobalHelpers() now and repo-wide grep confirmed zero remaining
333
+ * external callers of the legacy wrappers.
350
334
  */
351
335
 
352
- /**
353
- * Check if a domain was already detected (legacy wrapper)
354
- * @param {string} domain - Domain to check
355
- * @returns {boolean} True if domain was already detected
356
- */
357
- function isDomainAlreadyDetected(domain) {
358
- const cache = getGlobalDomainCache();
359
- return cache.isDomainAlreadyDetected(domain);
360
- }
361
-
362
- /**
363
- * Mark a domain as detected (legacy wrapper)
364
- * @param {string} domain - Domain to mark as detected
365
- */
366
- function markDomainAsDetected(domain) {
367
- const cache = getGlobalDomainCache();
368
- cache.markDomainAsDetected(domain);
369
- }
370
-
371
- /**
372
- * Combined check-and-mark in one pass — one Set.has() call instead of the
373
- * two you'd pay for isDomainAlreadyDetected() + markDomainAsDetected().
374
- * @param {string} domain - Domain to check and mark
375
- * @returns {boolean} True if already detected (skip), false if new (process)
376
- */
377
- function checkAndMark(domain) {
378
- const cache = getGlobalDomainCache();
379
- return cache.checkAndMark(domain);
380
- }
381
-
382
336
  /**
383
337
  * Get total domains skipped (legacy wrapper)
384
338
  * @returns {number} Number of domains skipped
@@ -406,10 +360,7 @@ module.exports = {
406
360
  createGlobalHelpers,
407
361
  resetGlobalCache,
408
362
 
409
- // Legacy wrapper functions for backward compatibility
410
- isDomainAlreadyDetected,
411
- markDomainAsDetected,
412
- checkAndMark,
363
+ // Legacy wrappers still used by nwss.js for end-of-scan stats
413
364
  getTotalDomainsSkipped,
414
365
  getDetectedDomainsCount
415
366
  };
package/lib/dry-run.js CHANGED
@@ -222,12 +222,17 @@ function outputDryRunResults(url, matchedItems = [], netToolsResults = [], pageT
222
222
  );
223
223
  }
224
224
 
225
- // Searchstring "not found" see processDryRunResults comment
226
- // for why the positive-match branch was removed.
225
+ // searchStringChecked flag means "this match would trigger a
226
+ // searchstring validation step in live mode" — dry-run doesn't
227
+ // download content, so we can't actually run that check here.
228
+ // Surface the deferred-validation status accurately rather than
229
+ // claiming "no matches found" (which falsely implied the check
230
+ // was performed). See processDryRunResults comment for why the
231
+ // positive-match lookup branch was removed entirely.
227
232
  if (item.searchStringChecked) {
228
233
  emit(
229
- ` Searchstring: No matches found in content`,
230
- ` ${messageColors.warn(' Searchstring:')} No matches found in content`
234
+ ` Searchstring: Would require live check (skipped in dry-run)`,
235
+ ` ${messageColors.warn(' Searchstring:')} Would require live check (skipped in dry-run)`
231
236
  );
232
237
  }
233
238