npm - @fanboynz/network-scanner - Versions diffs - 3.0.3 → 3.1.0 - Mend

@fanboynz/network-scanner 3.0.3 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/lib/clear_sitedata.js CHANGED Viewed

@@ -10,11 +10,22 @@ const CLEAR_SITEDATA_TAG = messageColors.processing('[clear_sitedata]');
  * @param {Page} page - Puppeteer page instance
  * @param {string} currentUrl - URL being processed
  * @param {boolean} forceDebug - Debug logging flag
- * @param {boolean} quickMode - If true, only clear cache/cookies (for reloads)
+ * @param {boolean} quickMode - If true, skip the HEAVY storage types
+ *   (IndexedDB, WebSQL, service workers) and the page-level fallback. Still
+ *   clears cookies + cache + localStorage + sessionStorage, which are the
+ *   four storage layers where session-cap tracking actually lives for
+ *   ad/popunder networks. Used between reloads where full storage wipes
+ *   would add unwanted latency on every cycle.
  * @returns {Promise<{success: boolean, operations: string[]}>}
  */
 async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = false) {
   let clearDataSession = null;
+  // Hoisted outside the try so the catch-block orphan-cleanup branch can
+  // reference it — same pattern as cdp.js (commit 0772ccd). Promise.race
+  // cannot cancel the underlying createCDPSession() call; if the 10s timer
+  // wins, the original promise may still resolve to a real session that
+  // nothing references → orphan on the browser side.
+  let sessionPromise = null;
   const completedOperations = [];
   try {
@@ -23,7 +34,7 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
     // path completed — reject was swallowed (race already settled)
     // but the timer kept the event loop reference for up to 10s.
     let timeoutTimer;
-    const sessionPromise = page.target().createCDPSession();
+    sessionPromise = page.target().createCDPSession();
     const timeoutPromise = new Promise((_, reject) => {
       timeoutTimer = setTimeout(() => reject(new Error('CDP session timeout')), 10000);
       if (typeof timeoutTimer.unref === 'function') timeoutTimer.unref();
@@ -36,23 +47,46 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
     const origin = new URL(currentUrl).origin;
-    // Always clear cache and cookies (even in quick mode)
-    const basicOperations = [
-      { cmd: 'Network.clearBrowserCookies', name: 'cookies' },
-      { cmd: 'Network.clearBrowserCache', name: 'cache' }
+    // Always-on quick-mode batch: cookies + cache + localStorage + sessionStorage.
+    // All four are independent CDP methods touching different Chromium
+    // subsystems (Network domain vs Storage domain, distinct storage types) —
+    // no ordering dependency, no shared mutex. Previously these ran as 4
+    // sequential `await clearDataSession.send(...)` calls, each waiting on its
+    // own CDP round-trip (3 extra round-trips of pure wait time per site load);
+    // Promise.all collapses them to one slowest-of-four wait (~5-15ms saved
+    // per site load with clear_sitedata).
+    //
+    // localStorage/sessionStorage MUST stay in this always-on batch — they're
+    // where AdsCore-family popunder networks track per-session caps
+    // (aclibSubKey-popunder etc.; see commit 11e1f49). Skipping them in quick
+    // mode capped popunder discovery at ~1 capture per scan.
+    //
+    // Heavier storage types (IndexedDB, WebSQL, service workers) still gate
+    // on !quickMode below.
+    const parallelOps = [
+      { send: () => clearDataSession.send('Network.clearBrowserCookies'), name: 'cookies' },
+      { send: () => clearDataSession.send('Network.clearBrowserCache'), name: 'cache' },
+      { send: () => clearDataSession.send('Storage.clearDataForOrigin', { origin, storageTypes: 'local_storage' }), name: 'localStorage' },
+      { send: () => clearDataSession.send('Storage.clearDataForOrigin', { origin, storageTypes: 'session_storage' }), name: 'sessionStorage' }
     ];
-    for (const op of basicOperations) {
-      try {
-        await clearDataSession.send(op.cmd);
-        completedOperations.push(op.name);
-      } catch (opErr) {
-        if (forceDebug) {
-          console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} ${op.name} clear failed: ${opErr.message}`));
-        }
+    // Promise.all preserves input order in the results array, so iterating
+    // back in order gives the same completedOperations ordering as the old
+    // sequential loops (cookies, cache, localStorage, sessionStorage) — keeps
+    // debug logs stable.
+    const results = await Promise.all(parallelOps.map(op =>
+      op.send().then(
+        () => ({ name: op.name, ok: true }),
+        err => ({ name: op.name, ok: false, err })
+      )
+    ));
+    for (const r of results) {
+      if (r.ok) {
+        completedOperations.push(r.name);
+      } else if (forceDebug) {
+        console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} ${r.name} clear failed: ${r.err.message}`));
       }
     }
     // Full storage clearing (skip in quick mode for reloads)
     if (!quickMode) {
       // Try comprehensive storage clearing first
@@ -63,10 +97,11 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
         });
         completedOperations.push('all_storage');
       } catch (allStorageErr) {
-        // Fallback: try individual storage types
+        // Fallback: try individual storage types. local_storage and
+        // session_storage are intentionally omitted here — they were already
+        // cleared in the always-on `parallelOps` batch earlier in this
+        // function, so re-clearing them would just add 2 wasted CDP roundtrips.
         const storageTypes = [
-          { type: 'local_storage', name: 'localStorage' },
-          { type: 'session_storage', name: 'sessionStorage' },
           { type: 'indexeddb', name: 'indexedDB' },
           { type: 'websql', name: 'webSQL' },
           { type: 'service_workers', name: 'serviceWorkers' }
@@ -95,6 +130,14 @@ async function clearSiteDataViaCDP(page, currentUrl, forceDebug, quickMode = fal
     return { success: completedOperations.length > 0, operations: completedOperations };
   } catch (cdpErr) {
+    // Orphan cleanup: if clearDataSession is null, the race lost (timer
+    // won) before the underlying createCDPSession() resolved. Attach a
+    // detach-on-resolve so the orphan is reaped if it arrives after we
+    // gave up. The outer finally only handles the case where the session
+    // was actually assigned. Same fix pattern as cdp.js L2.
+    if (!clearDataSession && sessionPromise) {
+      sessionPromise.then(s => s.detach().catch(() => {})).catch(() => {});
+    }
     if (forceDebug) {
       console.log(formatLogMessage('debug', `${CLEAR_SITEDATA_TAG} CDP session failed: ${cdpErr.message}`));
     }
@@ -200,7 +243,12 @@ async function clearSiteDataViaPage(page, forceDebug) {
  * @param {Page} page - Puppeteer page instance
  * @param {string} currentUrl - URL being processed
  * @param {boolean} forceDebug - Debug logging flag
- * @param {boolean} quickMode - If true, only clear cache/cookies (for reloads)
+ * @param {boolean} quickMode - If true, skip heavy storage types
+ *   (IndexedDB / WebSQL / serviceWorkers) and the page-level fallback. Still
+ *   clears cookies + cache + localStorage + sessionStorage — the four
+ *   storage layers where session-cap tracking actually lives for ad/
+ *   popunder networks. Used between reloads where full clears would add
+ *   per-cycle latency without freeing additional session state.
  * @returns {Promise<{success: boolean, operations: string[], method: string}>}
  */
 async function clearSiteData(page, currentUrl, forceDebug, quickMode = false) {

package/lib/compress.js CHANGED Viewed

@@ -4,54 +4,39 @@
 const fs = require('fs');
 const { formatLogMessage } = require('./colorize');
 const zlib = require('zlib');
-const path = require('path');
+const { pipeline } = require('node:stream/promises');
 /**
- * Compresses a file using gzip and optionally removes the original
+ * Compresses a file using gzip and optionally removes the original.
+ * Uses stream.pipeline for automatic cleanup of all streams on any error
+ * (previously the manual readStream/gzipStream/writeStream wiring left the
+ * other two streams alive when one errored, holding their fds until GC).
  * @param {string} filePath - Path to the file to compress
  * @param {boolean} removeOriginal - Whether to remove the original file after compression
  * @returns {Promise<string>} - Path to the compressed file
  */
 async function compressFile(filePath, removeOriginal = true) {
-  return new Promise((resolve, reject) => {
-    const compressedPath = `${filePath}.gz`;
-    // Create read and write streams
-    const readStream = fs.createReadStream(filePath);
-    const writeStream = fs.createWriteStream(compressedPath);
-    const gzipStream = zlib.createGzip();
-    // Handle errors
-    const handleError = (error) => {
-      // Clean up partial compressed file on error
-      try {
-        fs.unlinkSync(compressedPath);
-      } catch (cleanupErr) {
-        // Ignore cleanup errors
-      }
-      reject(error);
-    };
-    readStream.on('error', handleError);
-    writeStream.on('error', handleError);
-    gzipStream.on('error', handleError);
-    // Handle successful completion
-    writeStream.on('finish', () => {
-      if (removeOriginal) {
-        try {
-          fs.unlinkSync(filePath);
-        } catch (removeErr) {
-          // If we can't remove original, still consider compression successful
-          console.warn(formatLogMessage('warn', `Failed to remove original file ${filePath}: ${removeErr.message}`));
-        }
-      }
-      resolve(compressedPath);
-    });
-    // Pipe the streams
-    readStream.pipe(gzipStream).pipe(writeStream);
-  });
+  const compressedPath = `${filePath}.gz`;
+  try {
+    await pipeline(
+      fs.createReadStream(filePath),
+      zlib.createGzip(),
+      fs.createWriteStream(compressedPath)
+    );
+  } catch (err) {
+    // Clean up partial compressed file on error
+    try { fs.unlinkSync(compressedPath); } catch { /* ignore */ }
+    throw err;
+  }
+  if (removeOriginal) {
+    try {
+      fs.unlinkSync(filePath);
+    } catch (removeErr) {
+      // If we can't remove the original, compression is still successful
+      console.warn(formatLogMessage('warn', `Failed to remove original file ${filePath}: ${removeErr.message}`));
+    }
+  }
+  return compressedPath;
 }
 /**
@@ -84,22 +69,6 @@ async function compressMultipleFiles(filePaths, removeOriginals = true) {
   return results;
 }
-/**
- * Gets the compression ratio of a file
- * @param {string} originalPath - Path to original file
- * @param {string} compressedPath - Path to compressed file
- * @returns {number} - Compression ratio (0-1, where 0.5 means 50% of original size)
- */
-function getCompressionRatio(originalPath, compressedPath) {
-  try {
-    const originalSize = fs.statSync(originalPath).size;
-    const compressedSize = fs.statSync(compressedPath).size;
-    return compressedSize / originalSize;
-  } catch (error) {
-    return null;
-  }
-}
 /**
  * Formats file size in human readable format
  * @param {number} bytes - Size in bytes
@@ -116,6 +85,5 @@ function formatFileSize(bytes) {
 module.exports = {
   compressFile,
   compressMultipleFiles,
-  getCompressionRatio,
   formatFileSize
 };

package/lib/curl.js CHANGED Viewed

@@ -78,7 +78,12 @@ async function downloadWithCurl(url, userAgent = '', options = {}) {
   Object.entries(customHeaders).forEach(([key, value]) => {
     curlArgs.push('-H', `${key}: ${value}`);
   });
-  curlArgs.push(url);
+  // '--' end-of-options marker before the URL — defense in depth. Today
+  // the URL is bounded by upstream `new URL(requestUrl)` validation which
+  // guarantees a 'scheme://' prefix (so the string never starts with '-'),
+  // but '--' ensures that a future caller which bypasses that check cannot
+  // accidentally let curl reinterpret the URL as a flag.
+  curlArgs.push('--', url);
   // Shared async-spawn helper handles streaming/cap/timeout/kill plumbing.
   const result = await runProcess('curl', curlArgs, {
@@ -125,14 +130,25 @@ async function downloadWithCurl(url, userAgent = '', options = {}) {
 }
 /**
- * Searches content for patterns using JavaScript (case-insensitive)
+ * Searches content for patterns using JavaScript (case-insensitive).
+ *
+ * Patterns are passed in both original and pre-lowered form — the
+ * lowered arrays are used for matching, the original arrays for the
+ * user-facing return values (matchedPattern, foundPatterns,
+ * missingPatterns). Pre-lowering is done ONCE at handler creation
+ * (createCurlHandler) instead of per-call here — patterns never change
+ * across calls, so the previous per-call .toLowerCase() / .map() work
+ * was wasted on every matched URL.
+ *
  * @param {string} content - Content to search
- * @param {Array<string>} searchStrings - OR patterns (any can match)
- * @param {Array<string>} searchStringsAnd - AND patterns (all must match)
+ * @param {Array<string>} searchStrings - OR patterns (any can match), original case
+ * @param {Array<string>} searchStringsLower - OR patterns, pre-lowercased
+ * @param {Array<string>} searchStringsAnd - AND patterns, original case
+ * @param {Array<string>} searchStringsAndLower - AND patterns, pre-lowercased
  * @param {boolean} hasSearchStringAnd - Whether AND logic is being used
  * @returns {object} Search result with found status and matched pattern
  */
-function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSearchStringAnd = false) {
+function searchContent(content, searchStrings, searchStringsLower, searchStringsAnd, searchStringsAndLower, hasSearchStringAnd) {
   if (!content || content.length === 0) {
     return { found: false, matchedPattern: null, matchType: null };
   }
@@ -144,13 +160,9 @@ function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSe
   // entire list to build a full missingPatterns array that's only used
   // by a debug log. Now we early-exit and report the first miss (the
   // debug log's missingPatterns.join(', ') still works with one entry).
-  if (hasSearchStringAnd && searchStringsAnd.length > 0) {
-    // Pre-lower patterns once — was per-iteration toLowerCase before.
-    // For a 20-pattern AND check the difference is small per call but
-    // the pattern itself never changes between iterations of the loop.
-    const lowered = searchStringsAnd.map(p => p.toLowerCase());
-    for (let i = 0; i < searchStringsAnd.length; i++) {
-      if (!lowerContent.includes(lowered[i])) {
+  if (hasSearchStringAnd && searchStringsAndLower.length > 0) {
+    for (let i = 0; i < searchStringsAndLower.length; i++) {
+      if (!lowerContent.includes(searchStringsAndLower[i])) {
         return {
           found: false,
           matchedPattern: null,
@@ -169,12 +181,10 @@ function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSe
     };
   }
-  // Handle OR logic searchstring (any pattern can match). Same pre-lower
-  // optimization, though OR usually short-circuits early so the savings
-  // are smaller.
-  if (searchStrings.length > 0) {
-    for (let i = 0; i < searchStrings.length; i++) {
-      if (lowerContent.includes(searchStrings[i].toLowerCase())) {
+  // Handle OR logic searchstring (any pattern can match).
+  if (searchStringsLower.length > 0) {
+    for (let i = 0; i < searchStringsLower.length; i++) {
+      if (lowerContent.includes(searchStringsLower[i])) {
         return {
           found: true,
           matchedPattern: searchStrings[i],
@@ -271,6 +281,14 @@ function createCurlHandler(config) {
   let currentRootDomain = '';
   try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}
+  // Pre-lower searchstring patterns ONCE at handler creation. Patterns
+  // never change across calls, so the previous per-call .toLowerCase()
+  // inside searchContent wasted work on every matched URL × N patterns.
+  // Default to [] when undefined so searchContent's `.length` checks
+  // don't fault — matches the prior default-arg behavior.
+  const searchStringsLower = (searchStrings || []).map(p => p.toLowerCase());
+  const searchStringsAndLower = (searchStringsAnd || []).map(p => p.toLowerCase());
   return async function curlHandler(requestUrl) {
     try {
       // Regex check FIRST — cheap filter that skips ~99% of requests.
@@ -413,11 +431,15 @@ function createCurlHandler(config) {
         }
       }
-      // Search content for patterns
+      // Search content for patterns — pass pre-lowered patterns alongside
+      // originals so the matching path uses the cached lowercase form and
+      // the result still reports the user-facing original-case strings.
       const searchResult = searchContent(
-        downloadResult.content,
-        searchStrings,
-        searchStringsAnd,
+        downloadResult.content,
+        searchStrings || [],
+        searchStringsLower,
+        searchStringsAnd || [],
+        searchStringsAndLower,
         hasSearchStringAnd
       );

package/lib/domain-cache.js CHANGED Viewed

@@ -199,14 +199,6 @@ class DomainCache {
     }
   }
-  /**
-   * Get all cached domains (for debugging)
-   * @returns {Array<string>} Array of cached domains
-   */
-  getAllCachedDomains() {
-    return Array.from(this.cache);
-  }
   /**
    * Check if cache contains a specific domain (without updating stats)
    * @param {string} domain - Domain to check
@@ -216,21 +208,6 @@ class DomainCache {
     return this.cache.has(domain);
   }
-  /**
-   * Remove a specific domain from cache
-   * @param {string} domain - Domain to remove
-   * @returns {boolean} True if domain was removed, false if it wasn't in cache
-   */
-  removeDomain(domain) {
-    const wasRemoved = this.cache.delete(domain);
-    if (wasRemoved && this.enableLogging) {
-      console.log(formatLogMessage('debug', `${this.logPrefix} Removed from cache: ${domain}`));
-    }
-    return wasRemoved;
-  }
   /**
    * Add multiple domains to cache at once. Uses a single .size delta to
    * count actually-new entries (skipping per-domain .has() calls), and
@@ -347,38 +324,15 @@ function resetGlobalCache() {
 /**
  * Legacy wrapper functions for backward compatibility
  * These match the original function signatures from nwss.js
+ *
+ * NOTE: getTotalDomainsSkipped and getDetectedDomainsCount are the only
+ * ones kept — they're used directly by nwss.js for end-of-scan stats.
+ * Previously-defined isDomainAlreadyDetected / markDomainAsDetected /
+ * checkAndMark wrappers were removed: nwss.js calls those via
+ * createGlobalHelpers() now and repo-wide grep confirmed zero remaining
+ * external callers of the legacy wrappers.
  */
-/**
- * Check if a domain was already detected (legacy wrapper)
- * @param {string} domain - Domain to check
- * @returns {boolean} True if domain was already detected
- */
-function isDomainAlreadyDetected(domain) {
-  const cache = getGlobalDomainCache();
-  return cache.isDomainAlreadyDetected(domain);
-}
-/**
- * Mark a domain as detected (legacy wrapper)
- * @param {string} domain - Domain to mark as detected
- */
-function markDomainAsDetected(domain) {
-  const cache = getGlobalDomainCache();
-  cache.markDomainAsDetected(domain);
-}
-/**
- * Combined check-and-mark in one pass — one Set.has() call instead of the
- * two you'd pay for isDomainAlreadyDetected() + markDomainAsDetected().
- * @param {string} domain - Domain to check and mark
- * @returns {boolean} True if already detected (skip), false if new (process)
- */
-function checkAndMark(domain) {
-  const cache = getGlobalDomainCache();
-  return cache.checkAndMark(domain);
-}
 /**
  * Get total domains skipped (legacy wrapper)
  * @returns {number} Number of domains skipped
@@ -406,10 +360,7 @@ module.exports = {
   createGlobalHelpers,
   resetGlobalCache,
-  // Legacy wrapper functions for backward compatibility
-  isDomainAlreadyDetected,
-  markDomainAsDetected,
-  checkAndMark,
+  // Legacy wrappers still used by nwss.js for end-of-scan stats
   getTotalDomainsSkipped,
   getDetectedDomainsCount
 };

package/lib/dry-run.js CHANGED Viewed

@@ -222,12 +222,17 @@ function outputDryRunResults(url, matchedItems = [], netToolsResults = [], pageT
           );
         }
-        // Searchstring "not found" — see processDryRunResults comment
-        // for why the positive-match branch was removed.
+        // searchStringChecked flag means "this match would trigger a
+        // searchstring validation step in live mode" — dry-run doesn't
+        // download content, so we can't actually run that check here.
+        // Surface the deferred-validation status accurately rather than
+        // claiming "no matches found" (which falsely implied the check
+        // was performed). See processDryRunResults comment for why the
+        // positive-match lookup branch was removed entirely.
         if (item.searchStringChecked) {
           emit(
-            `  ✗ Searchstring: No matches found in content`,
-            `  ${messageColors.warn('✗ Searchstring:')} No matches found in content`
+            `  ⚠ Searchstring: Would require live check (skipped in dry-run)`,
+            `  ${messageColors.warn('⚠ Searchstring:')} Would require live check (skipped in dry-run)`
           );
         }