npm - @fanboynz/network-scanner - Versions diffs - 2.0.55 → 2.0.57 - Mend

@fanboynz/network-scanner 2.0.55 → 2.0.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/.github/workflows/npm-publish.yml +3 -4
package/lib/browserhealth.js +207 -179
package/lib/cloudflare.js +117 -65
package/lib/ignore_similar.js +78 -209
package/lib/post-processing.js +282 -356
package/lib/smart-cache.js +347 -267
package/nwss.js +53 -13
package/package.json +3 -2

package/lib/ignore_similar.js CHANGED Viewed

@@ -1,30 +1,37 @@
 const { formatLogMessage } = require('./colorize');
-/**
- * IGNORE_SIMILAR MODULE
- *
- * This module implements domain similarity detection to prevent collecting
- * domains that are too similar to ones already found. It uses Levenshtein
- * distance algorithm to calculate similarity between domain base names.
- *
- * Main use case: When scanning for ad/tracker domains, prevent collecting
- * both "googleads.com" and "googlevds.com" since they're 89% similar.
- *
- * Performance consideration: This runs on every potential domain match,
- * so the algorithms need to be efficient for high-volume scanning.
- */
+// Precompiled regex (avoids recompilation per getBaseDomainName call)
+const REGEX_PROTOCOL = /^https?:\/\//;
+const REGEX_WWW = /^www\./;
+// Multi-part TLD lookup (module-level Set, O(1) instead of per-call array + O(n) .includes)
+const MULTI_PART_TLDS = new Set([
+  'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
+  'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
+  'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
+  'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
+  'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
+  'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
+  'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
+  'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
+  'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
+  'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
+  'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
+  'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
+  'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
+  'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
+  'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
+  'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
+  'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
+  'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
+  'com.eg', 'org.eg', 'or.ke'
+]);
+// 3-part TLD lookup
+const THREE_PART_TLDS = new Set(['com.au.com', 'co.uk.com']);
 /**
  * Extracts the base domain name without TLD for similarity comparison
- *
- * Examples:
- * - "ads.google.com" -> "google"
- * - "tracker.facebook.co.uk" -> "facebook"
- * - "cdn.example.org" -> "example"
- *
- * Why we do this: We want to compare the actual brand/company name part
- * of domains, not be fooled by different TLDs or subdomains.
- *
  * @param {string} domain - The domain to process
  * @returns {string} The base domain name
  */
@@ -33,259 +40,146 @@ function getBaseDomainName(domain) {
     return '';
   }
-  // Remove protocol if present (handles cases where full URLs are passed)
-  domain = domain.replace(/^https?:\/\//, '');
+  domain = domain.replace(REGEX_PROTOCOL, '');
+  domain = domain.replace(REGEX_WWW, '');
-  // Remove www prefix (standardize domain format)
-  domain = domain.replace(/^www\./, '');
-  // Split by dots and get the part before the last dot (TLD)
   const parts = domain.split('.');
   if (parts.length < 2) {
-    return domain; // Single part, return as-is (localhost, IP addresses, etc.)
+    return domain;
   }
-  /**
-   * MULTI-PART TLD HANDLING
-   *
-   * Many countries use multi-part TLDs like "co.uk", "com.au", etc.
-   * We need to account for these when extracting the base domain name.
-   *
-   * Without this logic:
-   * - "example.co.uk" would incorrectly return "co" instead of "example"
-   * - "google.com.au" would return "com" instead of "google"
-   *
-   * This extensive list covers most common multi-part TLDs worldwide.
-   */
-  const multiPartTLDs = [
-    // Common Anglo countries
-    'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
-    // Latin America
-    'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
-    // Asia-Pacific
-    'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
-    'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
-    // Central America & Africa
-    'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
-    // Europe extensions
-    'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
-    'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
-    'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
-    'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
-    // Asia-Pacific extensions detailed
-    'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
-    'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
-    'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
-    'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
-    // Americas extensions detailed
-    'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
-    'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
-    'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
-    // Central America & Caribbean
-    'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
-    'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
-    // Middle East & Africa extensions
-    'com.eg', 'org.eg', 'or.ke'
-  ];
+  // Check multi-part TLD (O(1) Set lookup instead of O(n) array scan)
+  const lastTwoParts = parts[parts.length - 2] + '.' + parts[parts.length - 1];
-  // Check if domain ends with a multi-part TLD
-  const lastTwoParts = parts.slice(-2).join('.');      // e.g., "co.uk"
-  const lastThreeParts = parts.length >= 3 ? parts.slice(-3).join('.') : ''; // e.g., "com.au.com"
-  // Handle 2-part TLDs (most common case)
-  // Example: "google.co.uk" -> parts = ["google", "co", "uk"] -> return "google"
-  if (multiPartTLDs.includes(lastTwoParts)) {
+  if (MULTI_PART_TLDS.has(lastTwoParts)) {
     return parts.length >= 3 ? parts[parts.length - 3] : parts[0];
   }
-  // Handle rare 3-part TLDs (future-proofing)
-  // This is mostly theoretical but good to have for completeness
-  if (parts.length >= 4 && lastThreeParts &&
-      ['com.au.com', 'co.uk.com'].includes(lastThreeParts)) {
-    return parts[parts.length - 4];
+  // Handle rare 3-part TLDs
+  if (parts.length >= 4) {
+    const lastThreeParts = parts[parts.length - 3] + '.' + lastTwoParts;
+    if (THREE_PART_TLDS.has(lastThreeParts)) {
+      return parts[parts.length - 4];
+    }
   }
-  // For standard TLDs, take the second-to-last part
-  // Example: "google.com" -> parts = ["google", "com"] -> return "google"
   return parts[parts.length - 2];
 }
 /**
  * Calculates similarity between two domain base names using Levenshtein distance
- *
- * The Levenshtein distance is the minimum number of single-character edits
- * (insertions, deletions, substitutions) needed to transform one string into another.
- *
- * We convert this to a percentage similarity for easier threshold comparison.
- *
- * Examples:
- * - "google" vs "googl" = 83% similar (1 deletion needed)
- * - "facebook" vs "facebo0k" = 87% similar (1 substitution needed)
- * - "amazon" vs "amaz0n" = 83% similar (1 substitution needed)
- *
- * Why this matters: Malicious domains often use typosquatting techniques
- * like character substitution, insertion, or deletion to appear legitimate.
- *
  * @param {string} domain1 - First domain base name
  * @param {string} domain2 - Second domain base name
  * @returns {number} Similarity percentage (0-100)
  */
 function calculateSimilarity(domain1, domain2) {
-  // Exact match = 100% similar (optimization for common case)
   if (domain1 === domain2) return 100;
-  // Empty strings have no similarity
   if (!domain1 || !domain2) return 0;
-  // Identify longer and shorter strings for algorithm efficiency
   const longer = domain1.length > domain2.length ? domain1 : domain2;
   const shorter = domain1.length > domain2.length ? domain2 : domain1;
-  // Edge case: empty longer string means both are empty (100% similar)
   if (longer.length === 0) return 100;
-  // Calculate edit distance using dynamic programming algorithm
   const distance = levenshteinDistance(longer, shorter);
-  // Convert to percentage: (max_length - edits_needed) / max_length * 100
-  // Higher percentage = more similar
   return Math.round(((longer.length - distance) / longer.length) * 100);
 }
 /**
- * Calculates Levenshtein distance between two strings using dynamic programming
- *
- * This is the core algorithm that powers our similarity detection.
- * Time complexity: O(m*n) where m and n are string lengths
- * Space complexity: O(m*n) for the matrix
- *
- * The algorithm builds a matrix where each cell [i,j] represents the minimum
- * edit distance between the first i characters of str1 and first j characters of str2.
- *
- * Dynamic programming recurrence relation:
- * - If characters match: matrix[i][j] = matrix[i-1][j-1] (no edit needed)
- * - If different: matrix[i][j] = 1 + min(substitution, insertion, deletion)
- *
+ * Calculates Levenshtein distance using two-row approach
+ * Same results as original, but O(min(m,n)) space instead of O(m*n)
  * @param {string} str1 - First string
  * @param {string} str2 - Second string
- * @returns {number} Edit distance (number of edits needed to transform str1 to str2)
+ * @returns {number} Edit distance
  */
 function levenshteinDistance(str1, str2) {
-  // Initialize matrix with base cases
-  const matrix = [];
+  const m = str1.length;
+  const n = str2.length;
-  // Base case: transforming empty string to str2 requires str2.length insertions
-  for (let i = 0; i <= str2.length; i++) {
-    matrix[i] = [i];
-  }
+  // Ensure we iterate over the shorter dimension for row arrays
+  if (m < n) return levenshteinDistance(str2, str1);
+  // Two rows instead of full matrix
+  let prevRow = new Array(n + 1);
+  let currRow = new Array(n + 1);
-  // Base case: transforming str1 to empty string requires str1.length deletions
-  for (let j = 0; j <= str1.length; j++) {
-    matrix[0][j] = j;
+  for (let j = 0; j <= n; j++) {
+    prevRow[j] = j;
   }
-  // Fill matrix using dynamic programming
-  for (let i = 1; i <= str2.length; i++) {
-    for (let j = 1; j <= str1.length; j++) {
-      // If characters match, no additional cost
-      if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
-        matrix[i][j] = matrix[i - 1][j - 1];
+  for (let i = 1; i <= m; i++) {
+    currRow[0] = i;
+    const ch1 = str1[i - 1];
+    for (let j = 1; j <= n; j++) {
+      if (ch1 === str2[j - 1]) {
+        currRow[j] = prevRow[j - 1];
       } else {
-        // Take minimum cost operation:
-        matrix[i][j] = Math.min(
-          matrix[i - 1][j - 1] + 1, // substitution: replace char in str1
-          matrix[i][j - 1] + 1,     // insertion: add char to str1
-          matrix[i - 1][j] + 1      // deletion: remove char from str1
-        );
+        const sub = prevRow[j - 1];
+        const ins = currRow[j - 1];
+        const del = prevRow[j];
+        currRow[j] = (sub < ins ? (sub < del ? sub : del) : (ins < del ? ins : del)) + 1;
       }
     }
+    // Swap rows
+    const temp = prevRow;
+    prevRow = currRow;
+    currRow = temp;
   }
-  // Bottom-right cell contains the final edit distance
-  return matrix[str2.length][str1.length];
+  return prevRow[n];
 }
 /**
  * Main function: Checks if a domain should be ignored based on similarity to existing domains
- *
- * This is called for every potential domain match during scanning, so it needs to be
- * efficient. The function uses early returns and optimizations to minimize processing.
- *
- * Usage workflow:
- * 1. New domain found: "g00gleads.com"
- * 2. Extract base: "g00gleads"
- * 3. Compare to existing: ["googleads", "facebook", "amazon"]
- * 4. Find "googleads" is 89% similar (above 80% threshold)
- * 5. Return shouldIgnore: true
- *
  * @param {string} newDomain - The domain to check for similarity
  * @param {Set|Array} existingDomains - Collection of already found domains
  * @param {object} options - Configuration options
- * @param {boolean} options.enabled - Whether similarity checking is enabled
- * @param {number} options.threshold - Similarity percentage threshold (0-100)
- * @param {boolean} options.forceDebug - Whether to log debug information
  * @returns {object} Result object with shouldIgnore boolean and metadata
  */
 function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
   const {
     enabled = true,
-    threshold = 80, // Default: ignore domains that are 80%+ similar
+    threshold = 80,
     forceDebug = false
   } = options;
-  // Quick exit if feature is disabled (performance optimization)
   if (!enabled) {
     return { shouldIgnore: false, reason: 'ignore_similar disabled' };
   }
-  // Validate input domain
   if (!newDomain) {
     return { shouldIgnore: false, reason: 'invalid domain' };
   }
-  // Extract base domain name for comparison
   const newBaseDomain = getBaseDomainName(newDomain);
   if (!newBaseDomain) {
     return { shouldIgnore: false, reason: 'could not extract base domain' };
   }
-  // Convert Set to Array if needed (handles both data structures)
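+  // KEEP original guard exactly as-is: Array.from converts a Set (or any other iterable/array-like) into an array; note that it throws a TypeError when existingDomains is undefined or null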
   const domainsArray = Array.isArray(existingDomains) ? existingDomains : Array.from(existingDomains);
-  // Compare against each existing domain
   for (const existingDomain of domainsArray) {
-    // Skip invalid, empty, or identical domains (optimization)
     if (!existingDomain || existingDomain === newDomain) {
       continue;
     }
-    // Extract base domain for comparison
     const existingBaseDomain = getBaseDomainName(existingDomain);
     if (!existingBaseDomain || existingBaseDomain === newBaseDomain) {
-      continue; // Skip if same base domain or extraction failed
+      continue;
     }
-    // Calculate similarity percentage
     const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
-    // Check if similarity exceeds threshold
     if (similarity >= threshold) {
-      // Debug logging for similarity matches (helps tune thresholds)
       if (forceDebug) {
         console.log(formatLogMessage('debug',
           `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
         ));
       }
-      // Return detailed similarity information for debugging/analysis
       return {
         shouldIgnore: true,
         reason: `${similarity}% similar to ${existingDomain}`,
@@ -297,24 +191,14 @@ function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
     }
   }
-  // No similar domains found - safe to add this domain
   return { shouldIgnore: false, reason: 'no similar domains found' };
 }
 /**
  * Utility function: Filters out similar domains from a collection
- *
- * This is useful for post-processing existing domain lists to remove
- * similar entries. It processes the array sequentially, comparing each
- * domain against the already-accepted domains.
- *
- * Use case: Clean up an existing blocklist by removing similar domains
- * Example: ["googleads.com", "g00gleads.com", "facebook.com"]
- *         -> ["googleads.com", "facebook.com"] (removed g00gleads as similar)
- *
  * @param {Array} domains - Array of domains to filter
- * @param {object} options - Filtering options (same as shouldIgnoreSimilarDomain)
- * @returns {object} Result with filtered domains and information about removed domains
+ * @param {object} options - Filtering options
+ * @returns {object} Result with filtered domains and removed domains
  */
 function filterSimilarDomains(domains, options = {}) {
   const {
@@ -323,33 +207,27 @@ function filterSimilarDomains(domains, options = {}) {
     forceDebug = false
   } = options;
-  // Quick exit if disabled or invalid input
   if (!enabled || !Array.isArray(domains)) {
     return { filtered: domains, removed: [] };
   }
-  const filtered = [];   // Domains to keep
-  const removed = [];    // Domains that were filtered out (for reporting)
+  const filtered = [];
+  const removed = [];
-  // Process each domain sequentially
   for (const domain of domains) {
-    // Check if this domain is similar to any already-accepted domain
     const result = shouldIgnoreSimilarDomain(domain, filtered, { enabled, threshold, forceDebug });
     if (result.shouldIgnore) {
-      // Domain is too similar - add to removed list with metadata
       removed.push({
         domain,
         reason: result.reason,
         similarTo: result.similarDomain
       });
     } else {
-      // Domain is unique enough - add to filtered list
       filtered.push(domain);
     }
   }
-  // Debug reporting for filtering results
   if (forceDebug && removed.length > 0) {
     console.log(formatLogMessage('debug',
       `[ignore_similar] Filtered out ${removed.length} similar domains`
@@ -359,15 +237,6 @@ function filterSimilarDomains(domains, options = {}) {
   return { filtered, removed };
 }
-/**
- * MODULE EXPORTS
- *
- * Public API for the ignore_similar module:
- * - getBaseDomainName: Extract base domain from full domain
- * - calculateSimilarity: Get similarity percentage between two domains
- * - shouldIgnoreSimilarDomain: Main function for real-time similarity checking
- * - filterSimilarDomains: Batch processing function for existing lists
- */
 module.exports = {
   getBaseDomainName,
   calculateSimilarity,