npm - @fanboynz/network-scanner - Versions diffs - 1.0.49 → 1.0.50 - Mend

@fanboynz/network-scanner 1.0.49 → 1.0.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/lib/ignore_similar.js CHANGED Viewed

@@ -1,7 +1,30 @@
 const { formatLogMessage } = require('./colorize');
+/**
+ * IGNORE_SIMILAR MODULE
+ *
+ * This module implements domain similarity detection to prevent collecting
+ * domains that are too similar to ones already found. It uses Levenshtein
+ * distance algorithm to calculate similarity between domain base names.
+ *
+ * Main use case: When scanning for ad/tracker domains, prevent collecting
+ * both "googleads.com" and "googlevds.com" since they're 89% similar.
+ *
+ * Performance consideration: This runs on every potential domain match,
+ * so the algorithms need to be efficient for high-volume scanning.
+ */
 /**
  * Extracts the base domain name without TLD for similarity comparison
+ *
+ * Examples:
+ * - "ads.google.com" -> "google"
+ * - "tracker.facebook.co.uk" -> "facebook"
+ * - "cdn.example.org" -> "example"
+ *
+ * Why we do this: We want to compare the actual brand/company name part
+ * of domains, not be fooled by different TLDs or subdomains.
+ *
  * @param {string} domain - The domain to process
  * @returns {string} The base domain name
  */
@@ -10,170 +33,259 @@ function getBaseDomainName(domain) {
     return '';
   }
-  // Remove protocol if present
+  // Remove protocol if present (handles cases where full URLs are passed)
   domain = domain.replace(/^https?:\/\//, '');
-  // Remove www prefix
+  // Remove www prefix (standardize domain format)
   domain = domain.replace(/^www\./, '');
   // Split by dots and get the part before the last dot (TLD)
   const parts = domain.split('.');
   if (parts.length < 2) {
-    return domain; // Single part, return as-is
+    return domain; // Single part, return as-is (e.g. localhost or other dotless hostnames; dotted IPv4 addresses are not single-part)
   }
- // Handle common multi-part TLDs (country code domains)
- const multiPartTLDs = [
-  'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
-  'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
-  'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
-  'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
-  'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
-  // Europe extensions
-  'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
-  'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
-  'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
-  'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
-  // Asia-Pacific extensions
-  'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
-  'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
-  'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
-  'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
-  // Americas extensions
-  'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
-  'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
-  'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
-  // Central America & Caribbean
-  'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
-  'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
-  // Middle East & Africa extensions
-  'com.eg', 'org.eg', 'or.ke'
- ];
- // Check if domain ends with a multi-part TLD
- const lastTwoParts = parts.slice(-2).join('.');
- const lastThreeParts = parts.length >= 3 ? parts.slice(-3).join('.') : '';
- // Handle multi-part TLDs (e.g., google.co.nz -> "google")
- if (multiPartTLDs.includes(lastTwoParts)) {
-   return parts.length >= 3 ? parts[parts.length - 3] : parts[0];
- }
- // Handle some 3-part TLDs (e.g., com.au.com if it existed)
- if (parts.length >= 4 && lastThreeParts &&
-     ['com.au.com', 'co.uk.com'].includes(lastThreeParts)) {
-   return parts[parts.length - 4];
- }
- // For standard TLDs, take the second-to-last part (e.g., google.com -> "google")
- return parts[parts.length - 2];
+  /**
+   * MULTI-PART TLD HANDLING
+   *
+   * Many countries use multi-part TLDs like "co.uk", "com.au", etc.
+   * We need to account for these when extracting the base domain name.
+   *
+   * Without this logic:
+   * - "example.co.uk" would incorrectly return "co" instead of "example"
+   * - "google.com.au" would return "com" instead of "google"
+   *
+   * This extensive list covers most common multi-part TLDs worldwide.
+   */
+  const multiPartTLDs = [
+    // Common commercial second-level domains (UK, NZ, AU, ZA, IN, JP, KR)
+    'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
+    // Latin America
+    'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
+    // Asia, Middle East & Africa
+    'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
+    'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
+    // Central America & Africa
+    'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
+    // Europe extensions
+    'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
+    'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
+    'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
+    'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
+    // Asia-Pacific extensions detailed
+    'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
+    'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
+    'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
+    'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
+    // Americas extensions detailed
+    'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
+    'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
+    'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
+    // Central America & Caribbean
+    'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
+    'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
+    // Middle East & Africa extensions
+    'com.eg', 'org.eg', 'or.ke'
+  ];
+  // Check if domain ends with a multi-part TLD
+  const lastTwoParts = parts.slice(-2).join('.');      // e.g., "co.uk"
+  const lastThreeParts = parts.length >= 3 ? parts.slice(-3).join('.') : ''; // e.g., "com.au.com"
+  // Handle 2-part TLDs (most common case)
+  // Example: "google.co.uk" -> parts = ["google", "co", "uk"] -> return "google"
+  if (multiPartTLDs.includes(lastTwoParts)) {
+    return parts.length >= 3 ? parts[parts.length - 3] : parts[0];
+  }
+  // Handle rare 3-part TLDs (future-proofing)
+  // This is mostly theoretical but good to have for completeness
+  if (parts.length >= 4 && lastThreeParts &&
+      ['com.au.com', 'co.uk.com'].includes(lastThreeParts)) {
+    return parts[parts.length - 4];
+  }
+  // For standard TLDs, take the second-to-last part
+  // Example: "google.com" -> parts = ["google", "com"] -> return "google"
+  return parts[parts.length - 2];
 }
 /**
  * Calculates similarity between two domain base names using Levenshtein distance
+ *
+ * The Levenshtein distance is the minimum number of single-character edits
+ * (insertions, deletions, substitutions) needed to transform one string into another.
+ *
+ * We convert this to a percentage similarity for easier threshold comparison.
+ *
+ * Examples:
+ * - "google" vs "googl" = 83% similar (1 deletion needed)
+ * - "facebook" vs "facebo0k" = 88% similar (1 substitution needed; 87.5 rounds up)
+ * - "amazon" vs "amaz0n" = 83% similar (1 substitution needed)
+ *
+ * Why this matters: Malicious domains often use typosquatting techniques
+ * like character substitution, insertion, or deletion to appear legitimate.
+ *
  * @param {string} domain1 - First domain base name
  * @param {string} domain2 - Second domain base name
  * @returns {number} Similarity percentage (0-100)
  */
 function calculateSimilarity(domain1, domain2) {
+  // Exact match = 100% similar (optimization for common case)
   if (domain1 === domain2) return 100;
+  // Empty strings have no similarity
   if (!domain1 || !domain2) return 0;
+  // Identify longer and shorter strings for algorithm efficiency
   const longer = domain1.length > domain2.length ? domain1 : domain2;
   const shorter = domain1.length > domain2.length ? domain2 : domain1;
+  // Edge case: empty longer string means both are empty (100% similar)
   if (longer.length === 0) return 100;
+  // Calculate edit distance using dynamic programming algorithm
   const distance = levenshteinDistance(longer, shorter);
+  // Convert to percentage: (max_length - edits_needed) / max_length * 100
+  // Higher percentage = more similar
   return Math.round(((longer.length - distance) / longer.length) * 100);
 }
 /**
- * Calculates Levenshtein distance between two strings
+ * Calculates Levenshtein distance between two strings using dynamic programming
+ *
+ * This is the core algorithm that powers our similarity detection.
+ * Time complexity: O(m*n) where m and n are string lengths
+ * Space complexity: O(m*n) for the matrix
+ *
+ * The algorithm builds a matrix where each cell [i,j] represents the minimum
+ * edit distance between the first i characters of str2 and first j characters of str1.
+ *
+ * Dynamic programming recurrence relation:
+ * - If characters match: matrix[i][j] = matrix[i-1][j-1] (no edit needed)
+ * - If different: matrix[i][j] = 1 + min(substitution, insertion, deletion)
+ *
  * @param {string} str1 - First string
- * @param {string} str2 - Second string
- * @returns {number} Edit distance
+ * @param {string} str2 - Second string
+ * @returns {number} Edit distance (number of edits needed to transform str1 to str2)
  */
 function levenshteinDistance(str1, str2) {
+  // Initialize matrix with base cases
   const matrix = [];
+  // Base case: transforming empty string to str2 requires str2.length insertions
   for (let i = 0; i <= str2.length; i++) {
     matrix[i] = [i];
   }
+  // Base case: transforming str1 to empty string requires str1.length deletions
   for (let j = 0; j <= str1.length; j++) {
     matrix[0][j] = j;
   }
+  // Fill matrix using dynamic programming
   for (let i = 1; i <= str2.length; i++) {
     for (let j = 1; j <= str1.length; j++) {
+      // If characters match, no additional cost
       if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
         matrix[i][j] = matrix[i - 1][j - 1];
       } else {
+        // Take minimum cost operation:
         matrix[i][j] = Math.min(
-          matrix[i - 1][j - 1] + 1, // substitution
-          matrix[i][j - 1] + 1,     // insertion
-          matrix[i - 1][j] + 1      // deletion
+          matrix[i - 1][j - 1] + 1, // substitution: replace char in str1
+          matrix[i][j - 1] + 1,     // deletion: remove char from str1
+          matrix[i - 1][j] + 1      // insertion: add char to str1
         );
       }
     }
   }
+  // Bottom-right cell contains the final edit distance
   return matrix[str2.length][str1.length];
 }
 /**
- * Checks if a domain should be ignored based on similarity to existing domains
- * @param {string} newDomain - The domain to check
- * @param {Set|Array} existingDomains - Collection of existing domains
- * @param {object} options - Options for similarity checking
- * @returns {object} Result object with shouldIgnore boolean and details
+ * Main function: Checks if a domain should be ignored based on similarity to existing domains
+ *
+ * This is called for every potential domain match during scanning, so it needs to be
+ * efficient. The function uses early returns and optimizations to minimize processing.
+ *
+ * Usage workflow:
+ * 1. New domain found: "g0ogleads.com"
+ * 2. Extract base: "g0ogleads"
+ * 3. Compare to existing: ["googleads", "facebook", "amazon"]
+ * 4. Find "googleads" is 89% similar (above 80% threshold)
+ * 5. Return shouldIgnore: true
+ *
+ * @param {string} newDomain - The domain to check for similarity
+ * @param {Set|Array} existingDomains - Collection of already found domains
+ * @param {object} options - Configuration options
+ * @param {boolean} options.enabled - Whether similarity checking is enabled
+ * @param {number} options.threshold - Similarity percentage threshold (0-100)
+ * @param {boolean} options.forceDebug - Whether to log debug information
+ * @returns {object} Result object with shouldIgnore boolean and metadata
  */
 function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
   const {
     enabled = true,
-    threshold = 80, // Similarity threshold percentage (80% by default)
+    threshold = 80, // Default: ignore domains that are 80%+ similar
     forceDebug = false
   } = options;
+  // Quick exit if feature is disabled (performance optimization)
   if (!enabled) {
     return { shouldIgnore: false, reason: 'ignore_similar disabled' };
   }
+  // Validate input domain
   if (!newDomain) {
     return { shouldIgnore: false, reason: 'invalid domain' };
   }
+  // Extract base domain name for comparison
   const newBaseDomain = getBaseDomainName(newDomain);
   if (!newBaseDomain) {
     return { shouldIgnore: false, reason: 'could not extract base domain' };
   }
-  // Convert Set to Array if needed
+  // Convert Set to Array if needed (handles both data structures)
   const domainsArray = Array.isArray(existingDomains) ? existingDomains : Array.from(existingDomains);
+  // Compare against each existing domain
   for (const existingDomain of domainsArray) {
+    // Skip invalid, empty, or identical domains (optimization)
     if (!existingDomain || existingDomain === newDomain) {
-      continue; // Skip empty or identical domains
+      continue;
     }
+    // Extract base domain for comparison
     const existingBaseDomain = getBaseDomainName(existingDomain);
     if (!existingBaseDomain || existingBaseDomain === newBaseDomain) {
-      continue; // Skip if same base domain or invalid
+      continue; // Skip if same base domain or extraction failed
     }
+    // Calculate similarity percentage
     const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
+    // Check if similarity exceeds threshold
     if (similarity >= threshold) {
+      // Debug logging for similarity matches (helps tune thresholds)
       if (forceDebug) {
-        console.log(formatLogMessage('debug', `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`));
+        console.log(formatLogMessage('debug',
+          `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
+        ));
       }
+      // Return detailed similarity information for debugging/analysis
       return {
         shouldIgnore: true,
         reason: `${similarity}% similar to ${existingDomain}`,
@@ -185,14 +297,24 @@ function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
     }
   }
+  // No similar domains found - safe to add this domain
   return { shouldIgnore: false, reason: 'no similar domains found' };
 }
 /**
- * Filters out similar domains from a collection
+ * Utility function: Filters out similar domains from a collection
+ *
+ * This is useful for post-processing existing domain lists to remove
+ * similar entries. It processes the array sequentially, comparing each
+ * domain against the already-accepted domains.
+ *
+ * Use case: Clean up an existing blocklist by removing similar domains
+ * Example: ["googleads.com", "g0ogleads.com", "facebook.com"]
+ *         -> ["googleads.com", "facebook.com"] (removed g0ogleads as similar)
+ *
  * @param {Array} domains - Array of domains to filter
- * @param {object} options - Filtering options
- * @returns {object} Result with filtered domains and removed domains
+ * @param {object} options - Filtering options (same as shouldIgnoreSimilarDomain)
+ * @returns {object} Result with filtered domains and information about removed domains
  */
 function filterSimilarDomains(domains, options = {}) {
   const {
@@ -201,37 +323,54 @@ function filterSimilarDomains(domains, options = {}) {
     forceDebug = false
   } = options;
+  // Quick exit if disabled or invalid input
   if (!enabled || !Array.isArray(domains)) {
     return { filtered: domains, removed: [] };
   }
-  const filtered = [];
-  const removed = [];
+  const filtered = [];   // Domains to keep
+  const removed = [];    // Domains that were filtered out (for reporting)
+  // Process each domain sequentially
   for (const domain of domains) {
+    // Check if this domain is similar to any already-accepted domain
     const result = shouldIgnoreSimilarDomain(domain, filtered, { enabled, threshold, forceDebug });
     if (result.shouldIgnore) {
+      // Domain is too similar - add to removed list with metadata
       removed.push({
         domain,
         reason: result.reason,
         similarTo: result.similarDomain
       });
     } else {
+      // Domain is unique enough - add to filtered list
       filtered.push(domain);
     }
   }
+  // Debug reporting for filtering results
   if (forceDebug && removed.length > 0) {
-    console.log(formatLogMessage('debug', `[ignore_similar] Filtered out ${removed.length} similar domains`));
+    console.log(formatLogMessage('debug',
+      `[ignore_similar] Filtered out ${removed.length} similar domains`
+    ));
   }
   return { filtered, removed };
 }
+/**
+ * MODULE EXPORTS
+ *
+ * Public API for the ignore_similar module:
+ * - getBaseDomainName: Extract base domain from full domain
+ * - calculateSimilarity: Get similarity percentage between two domains
+ * - shouldIgnoreSimilarDomain: Main function for real-time similarity checking
+ * - filterSimilarDomains: Batch processing function for existing lists
+ */
 module.exports = {
   getBaseDomainName,
   calculateSimilarity,
   shouldIgnoreSimilarDomain,
   filterSimilarDomains
-};
+};

package/lib/nettools.js CHANGED Viewed

@@ -1148,7 +1148,7 @@ function createNetToolsHandler(config) {
           }
           if (!digResult) {
-          const digResult = await digLookup(digDomain, digRecordType, 5000); // 5 second timeout for dig
+          digResult = await digLookup(digDomain, digRecordType, 5000); // 5 second timeout for dig
             // Cache the result for future use
             digResultCache.set(digCacheKey, {
@@ -1348,4 +1348,4 @@ module.exports = {
   getCommonWhoisServers,
   suggestWhoisServers,
   execWithTimeout // Export for testing
-};
+};

package/nwss.js CHANGED Viewed

@@ -1,4 +1,4 @@
-// === Network scanner script (nwss.js) v1.0.49 ===
+// === Network scanner script (nwss.js) v1.0.50 ===
 // puppeteer for browser automation, fs for file system operations, psl for domain parsing.
 // const pLimit = require('p-limit'); // Will be dynamically imported
@@ -39,7 +39,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
 const { monitorBrowserHealth, isBrowserHealthy } = require('./lib/browserhealth');
 // --- Script Configuration & Constants ---
-const VERSION = '1.0.49'; // Script version
+const VERSION = '1.0.50'; // Script version
 // get startTime
 const startTime = Date.now();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@fanboynz/network-scanner",
-  "version": "1.0.49",
+  "version": "1.0.50",
   "description": "A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.",
   "main": "nwss.js",
   "scripts": {