@fanboynz/network-scanner 2.0.55 → 2.0.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,30 +1,37 @@
1
1
  const { formatLogMessage } = require('./colorize');
2
2
 
3
- /**
4
- * IGNORE_SIMILAR MODULE
5
- *
6
- * This module implements domain similarity detection to prevent collecting
7
- * domains that are too similar to ones already found. It uses Levenshtein
8
- * distance algorithm to calculate similarity between domain base names.
9
- *
10
- * Main use case: When scanning for ad/tracker domains, prevent collecting
11
- * both "googleads.com" and "googlevds.com" since they're 89% similar.
12
- *
13
- * Performance consideration: This runs on every potential domain match,
14
- * so the algorithms need to be efficient for high-volume scanning.
15
- */
3
+ // Precompiled regex (avoids recompilation per getBaseDomainName call)
4
+ const REGEX_PROTOCOL = /^https?:\/\//;
5
+ const REGEX_WWW = /^www\./;
6
+
7
+ // Multi-part TLD lookup (module-level Set, O(1) instead of per-call array + O(n) .includes)
8
+ const MULTI_PART_TLDS = new Set([
9
+ 'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
10
+ 'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
11
+ 'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
12
+ 'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
13
+ 'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
14
+ 'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
15
+ 'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
16
+ 'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
17
+ 'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
18
+ 'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
19
+ 'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
20
+ 'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
21
+ 'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
22
+ 'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
23
+ 'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
24
+ 'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
25
+ 'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
26
+ 'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
27
+ 'com.eg', 'org.eg', 'or.ke'
28
+ ]);
29
+
30
+ // 3-part TLD lookup
31
+ const THREE_PART_TLDS = new Set(['com.au.com', 'co.uk.com']);
16
32
 
17
33
  /**
18
34
  * Extracts the base domain name without TLD for similarity comparison
19
- *
20
- * Examples:
21
- * - "ads.google.com" -> "google"
22
- * - "tracker.facebook.co.uk" -> "facebook"
23
- * - "cdn.example.org" -> "example"
24
- *
25
- * Why we do this: We want to compare the actual brand/company name part
26
- * of domains, not be fooled by different TLDs or subdomains.
27
- *
28
35
  * @param {string} domain - The domain to process
29
36
  * @returns {string} The base domain name
30
37
  */
@@ -33,259 +40,146 @@ function getBaseDomainName(domain) {
33
40
  return '';
34
41
  }
35
42
 
36
- // Remove protocol if present (handles cases where full URLs are passed)
37
- domain = domain.replace(/^https?:\/\//, '');
43
+ domain = domain.replace(REGEX_PROTOCOL, '');
44
+ domain = domain.replace(REGEX_WWW, '');
38
45
 
39
- // Remove www prefix (standardize domain format)
40
- domain = domain.replace(/^www\./, '');
41
-
42
- // Split by dots and get the part before the last dot (TLD)
43
46
  const parts = domain.split('.');
44
47
  if (parts.length < 2) {
45
- return domain; // Single part, return as-is (localhost, IP addresses, etc.)
48
+ return domain;
46
49
  }
47
50
 
48
- /**
49
- * MULTI-PART TLD HANDLING
50
- *
51
- * Many countries use multi-part TLDs like "co.uk", "com.au", etc.
52
- * We need to account for these when extracting the base domain name.
53
- *
54
- * Without this logic:
55
- * - "example.co.uk" would incorrectly return "co" instead of "example"
56
- * - "google.com.au" would return "com" instead of "google"
57
- *
58
- * This extensive list covers most common multi-part TLDs worldwide.
59
- */
60
- const multiPartTLDs = [
61
- // Common Anglo countries
62
- 'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
63
-
64
- // Latin America
65
- 'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
66
-
67
- // Asia-Pacific
68
- 'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
69
- 'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
70
-
71
- // Central America & Africa
72
- 'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
73
-
74
- // Europe extensions
75
- 'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
76
- 'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
77
- 'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
78
- 'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
79
-
80
- // Asia-Pacific extensions detailed
81
- 'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
82
- 'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
83
- 'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
84
- 'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
85
-
86
- // Americas extensions detailed
87
- 'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
88
- 'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
89
- 'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
90
-
91
- // Central America & Caribbean
92
- 'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
93
- 'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
94
-
95
- // Middle East & Africa extensions
96
- 'com.eg', 'org.eg', 'or.ke'
97
- ];
51
+ // Check multi-part TLD (O(1) Set lookup instead of O(n) array scan)
52
+ const lastTwoParts = parts[parts.length - 2] + '.' + parts[parts.length - 1];
98
53
 
99
- // Check if domain ends with a multi-part TLD
100
- const lastTwoParts = parts.slice(-2).join('.'); // e.g., "co.uk"
101
- const lastThreeParts = parts.length >= 3 ? parts.slice(-3).join('.') : ''; // e.g., "com.au.com"
102
-
103
- // Handle 2-part TLDs (most common case)
104
- // Example: "google.co.uk" -> parts = ["google", "co", "uk"] -> return "google"
105
- if (multiPartTLDs.includes(lastTwoParts)) {
54
+ if (MULTI_PART_TLDS.has(lastTwoParts)) {
106
55
  return parts.length >= 3 ? parts[parts.length - 3] : parts[0];
107
56
  }
108
57
 
109
- // Handle rare 3-part TLDs (future-proofing)
110
- // This is mostly theoretical but good to have for completeness
111
- if (parts.length >= 4 && lastThreeParts &&
112
- ['com.au.com', 'co.uk.com'].includes(lastThreeParts)) {
113
- return parts[parts.length - 4];
58
+ // Handle rare 3-part TLDs
59
+ if (parts.length >= 4) {
60
+ const lastThreeParts = parts[parts.length - 3] + '.' + lastTwoParts;
61
+ if (THREE_PART_TLDS.has(lastThreeParts)) {
62
+ return parts[parts.length - 4];
63
+ }
114
64
  }
115
65
 
116
- // For standard TLDs, take the second-to-last part
117
- // Example: "google.com" -> parts = ["google", "com"] -> return "google"
118
66
  return parts[parts.length - 2];
119
67
  }
120
68
 
121
69
  /**
122
70
  * Calculates similarity between two domain base names using Levenshtein distance
123
- *
124
- * The Levenshtein distance is the minimum number of single-character edits
125
- * (insertions, deletions, substitutions) needed to transform one string into another.
126
- *
127
- * We convert this to a percentage similarity for easier threshold comparison.
128
- *
129
- * Examples:
130
- * - "google" vs "googl" = 83% similar (1 deletion needed)
131
- * - "facebook" vs "facebo0k" = 87% similar (1 substitution needed)
132
- * - "amazon" vs "amaz0n" = 83% similar (1 substitution needed)
133
- *
134
- * Why this matters: Malicious domains often use typosquatting techniques
135
- * like character substitution, insertion, or deletion to appear legitimate.
136
- *
137
71
  * @param {string} domain1 - First domain base name
138
72
  * @param {string} domain2 - Second domain base name
139
73
  * @returns {number} Similarity percentage (0-100)
140
74
  */
141
75
  function calculateSimilarity(domain1, domain2) {
142
- // Exact match = 100% similar (optimization for common case)
143
76
  if (domain1 === domain2) return 100;
144
-
145
- // Empty strings have no similarity
146
77
  if (!domain1 || !domain2) return 0;
147
78
 
148
- // Identify longer and shorter strings for algorithm efficiency
149
79
  const longer = domain1.length > domain2.length ? domain1 : domain2;
150
80
  const shorter = domain1.length > domain2.length ? domain2 : domain1;
151
81
 
152
- // Edge case: empty longer string means both are empty (100% similar)
153
82
  if (longer.length === 0) return 100;
154
83
 
155
- // Calculate edit distance using dynamic programming algorithm
156
84
  const distance = levenshteinDistance(longer, shorter);
157
-
158
- // Convert to percentage: (max_length - edits_needed) / max_length * 100
159
- // Higher percentage = more similar
160
85
  return Math.round(((longer.length - distance) / longer.length) * 100);
161
86
  }
162
87
 
163
88
  /**
164
- * Calculates Levenshtein distance between two strings using dynamic programming
165
- *
166
- * This is the core algorithm that powers our similarity detection.
167
- * Time complexity: O(m*n) where m and n are string lengths
168
- * Space complexity: O(m*n) for the matrix
169
- *
170
- * The algorithm builds a matrix where each cell [i,j] represents the minimum
171
- * edit distance between the first i characters of str1 and first j characters of str2.
172
- *
173
- * Dynamic programming recurrence relation:
174
- * - If characters match: matrix[i][j] = matrix[i-1][j-1] (no edit needed)
175
- * - If different: matrix[i][j] = 1 + min(substitution, insertion, deletion)
176
- *
89
+ * Calculates Levenshtein distance using two-row approach
90
+ * Same results as original, but O(min(m,n)) space instead of O(m*n)
177
91
  * @param {string} str1 - First string
178
92
  * @param {string} str2 - Second string
179
- * @returns {number} Edit distance (number of edits needed to transform str1 to str2)
93
+ * @returns {number} Edit distance
180
94
  */
181
95
  function levenshteinDistance(str1, str2) {
182
- // Initialize matrix with base cases
183
- const matrix = [];
96
+ const m = str1.length;
97
+ const n = str2.length;
184
98
 
185
- // Base case: transforming empty string to str2 requires str2.length insertions
186
- for (let i = 0; i <= str2.length; i++) {
187
- matrix[i] = [i];
188
- }
99
+ // Ensure we iterate over the shorter dimension for row arrays
100
+ if (m < n) return levenshteinDistance(str2, str1);
101
+
102
+ // Two rows instead of full matrix
103
+ let prevRow = new Array(n + 1);
104
+ let currRow = new Array(n + 1);
189
105
 
190
- // Base case: transforming str1 to empty string requires str1.length deletions
191
- for (let j = 0; j <= str1.length; j++) {
192
- matrix[0][j] = j;
106
+ for (let j = 0; j <= n; j++) {
107
+ prevRow[j] = j;
193
108
  }
194
109
 
195
- // Fill matrix using dynamic programming
196
- for (let i = 1; i <= str2.length; i++) {
197
- for (let j = 1; j <= str1.length; j++) {
198
- // If characters match, no additional cost
199
- if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
200
- matrix[i][j] = matrix[i - 1][j - 1];
110
+ for (let i = 1; i <= m; i++) {
111
+ currRow[0] = i;
112
+ const ch1 = str1[i - 1];
113
+
114
+ for (let j = 1; j <= n; j++) {
115
+ if (ch1 === str2[j - 1]) {
116
+ currRow[j] = prevRow[j - 1];
201
117
  } else {
202
- // Take minimum cost operation:
203
- matrix[i][j] = Math.min(
204
- matrix[i - 1][j - 1] + 1, // substitution: replace char in str1
205
- matrix[i][j - 1] + 1, // insertion: add char to str1
206
- matrix[i - 1][j] + 1 // deletion: remove char from str1
207
- );
118
+ const sub = prevRow[j - 1];
119
+ const ins = currRow[j - 1];
120
+ const del = prevRow[j];
121
+ currRow[j] = (sub < ins ? (sub < del ? sub : del) : (ins < del ? ins : del)) + 1;
208
122
  }
209
123
  }
124
+
125
+ // Swap rows
126
+ const temp = prevRow;
127
+ prevRow = currRow;
128
+ currRow = temp;
210
129
  }
211
130
 
212
- // Bottom-right cell contains the final edit distance
213
- return matrix[str2.length][str1.length];
131
+ return prevRow[n];
214
132
  }
215
133
 
216
134
  /**
217
135
  * Main function: Checks if a domain should be ignored based on similarity to existing domains
218
- *
219
- * This is called for every potential domain match during scanning, so it needs to be
220
- * efficient. The function uses early returns and optimizations to minimize processing.
221
- *
222
- * Usage workflow:
223
- * 1. New domain found: "g00gleads.com"
224
- * 2. Extract base: "g00gleads"
225
- * 3. Compare to existing: ["googleads", "facebook", "amazon"]
226
- * 4. Find "googleads" is 89% similar (above 80% threshold)
227
- * 5. Return shouldIgnore: true
228
- *
229
136
  * @param {string} newDomain - The domain to check for similarity
230
137
  * @param {Set|Array} existingDomains - Collection of already found domains
231
138
  * @param {object} options - Configuration options
232
- * @param {boolean} options.enabled - Whether similarity checking is enabled
233
- * @param {number} options.threshold - Similarity percentage threshold (0-100)
234
- * @param {boolean} options.forceDebug - Whether to log debug information
235
139
  * @returns {object} Result object with shouldIgnore boolean and metadata
236
140
  */
237
141
  function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
238
142
  const {
239
143
  enabled = true,
240
- threshold = 80, // Default: ignore domains that are 80%+ similar
144
+ threshold = 80,
241
145
  forceDebug = false
242
146
  } = options;
243
147
 
244
- // Quick exit if feature is disabled (performance optimization)
245
148
  if (!enabled) {
246
149
  return { shouldIgnore: false, reason: 'ignore_similar disabled' };
247
150
  }
248
151
 
249
- // Validate input domain
250
152
  if (!newDomain) {
251
153
  return { shouldIgnore: false, reason: 'invalid domain' };
252
154
  }
253
155
 
254
- // Extract base domain name for comparison
255
156
  const newBaseDomain = getBaseDomainName(newDomain);
256
157
  if (!newBaseDomain) {
257
158
  return { shouldIgnore: false, reason: 'could not extract base domain' };
258
159
  }
259
160
 
260
- // Convert Set to Array if needed (handles both data structures)
161
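+ // KEEP original guard exactly as-is: Array.from converts a Set (or any other iterable/array-like) into an array; note that it throws a TypeError on undefined/null, so callers must pass a collection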
261
162
  const domainsArray = Array.isArray(existingDomains) ? existingDomains : Array.from(existingDomains);
262
163
 
263
- // Compare against each existing domain
264
164
  for (const existingDomain of domainsArray) {
265
- // Skip invalid, empty, or identical domains (optimization)
266
165
  if (!existingDomain || existingDomain === newDomain) {
267
166
  continue;
268
167
  }
269
168
 
270
- // Extract base domain for comparison
271
169
  const existingBaseDomain = getBaseDomainName(existingDomain);
272
170
  if (!existingBaseDomain || existingBaseDomain === newBaseDomain) {
273
- continue; // Skip if same base domain or extraction failed
171
+ continue;
274
172
  }
275
173
 
276
- // Calculate similarity percentage
277
174
  const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
278
175
 
279
- // Check if similarity exceeds threshold
280
176
  if (similarity >= threshold) {
281
- // Debug logging for similarity matches (helps tune thresholds)
282
177
  if (forceDebug) {
283
178
  console.log(formatLogMessage('debug',
284
179
  `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
285
180
  ));
286
181
  }
287
182
 
288
- // Return detailed similarity information for debugging/analysis
289
183
  return {
290
184
  shouldIgnore: true,
291
185
  reason: `${similarity}% similar to ${existingDomain}`,
@@ -297,24 +191,14 @@ function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
297
191
  }
298
192
  }
299
193
 
300
- // No similar domains found - safe to add this domain
301
194
  return { shouldIgnore: false, reason: 'no similar domains found' };
302
195
  }
303
196
 
304
197
  /**
305
198
  * Utility function: Filters out similar domains from a collection
306
- *
307
- * This is useful for post-processing existing domain lists to remove
308
- * similar entries. It processes the array sequentially, comparing each
309
- * domain against the already-accepted domains.
310
- *
311
- * Use case: Clean up an existing blocklist by removing similar domains
312
- * Example: ["googleads.com", "g00gleads.com", "facebook.com"]
313
- * -> ["googleads.com", "facebook.com"] (removed g00gleads as similar)
314
- *
315
199
  * @param {Array} domains - Array of domains to filter
316
- * @param {object} options - Filtering options (same as shouldIgnoreSimilarDomain)
317
- * @returns {object} Result with filtered domains and information about removed domains
200
+ * @param {object} options - Filtering options
201
+ * @returns {object} Result with filtered domains and removed domains
318
202
  */
319
203
  function filterSimilarDomains(domains, options = {}) {
320
204
  const {
@@ -323,33 +207,27 @@ function filterSimilarDomains(domains, options = {}) {
323
207
  forceDebug = false
324
208
  } = options;
325
209
 
326
- // Quick exit if disabled or invalid input
327
210
  if (!enabled || !Array.isArray(domains)) {
328
211
  return { filtered: domains, removed: [] };
329
212
  }
330
213
 
331
- const filtered = []; // Domains to keep
332
- const removed = []; // Domains that were filtered out (for reporting)
214
+ const filtered = [];
215
+ const removed = [];
333
216
 
334
- // Process each domain sequentially
335
217
  for (const domain of domains) {
336
- // Check if this domain is similar to any already-accepted domain
337
218
  const result = shouldIgnoreSimilarDomain(domain, filtered, { enabled, threshold, forceDebug });
338
219
 
339
220
  if (result.shouldIgnore) {
340
- // Domain is too similar - add to removed list with metadata
341
221
  removed.push({
342
222
  domain,
343
223
  reason: result.reason,
344
224
  similarTo: result.similarDomain
345
225
  });
346
226
  } else {
347
- // Domain is unique enough - add to filtered list
348
227
  filtered.push(domain);
349
228
  }
350
229
  }
351
230
 
352
- // Debug reporting for filtering results
353
231
  if (forceDebug && removed.length > 0) {
354
232
  console.log(formatLogMessage('debug',
355
233
  `[ignore_similar] Filtered out ${removed.length} similar domains`
@@ -359,15 +237,6 @@ function filterSimilarDomains(domains, options = {}) {
359
237
  return { filtered, removed };
360
238
  }
361
239
 
362
- /**
363
- * MODULE EXPORTS
364
- *
365
- * Public API for the ignore_similar module:
366
- * - getBaseDomainName: Extract base domain from full domain
367
- * - calculateSimilarity: Get similarity percentage between two domains
368
- * - shouldIgnoreSimilarDomain: Main function for real-time similarity checking
369
- * - filterSimilarDomains: Batch processing function for existing lists
370
- */
371
240
  module.exports = {
372
241
  getBaseDomainName,
373
242
  calculateSimilarity,