@fanboynz/network-scanner 1.0.49 → 1.0.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,30 @@
1
1
  const { formatLogMessage } = require('./colorize');
2
2
 
3
+ /**
4
+ * IGNORE_SIMILAR MODULE
5
+ *
6
+ * This module implements domain similarity detection to prevent collecting
7
+ * domains that are too similar to ones already found. It uses Levenshtein
8
+ * distance algorithm to calculate similarity between domain base names.
9
+ *
10
+ * Main use case: When scanning for ad/tracker domains, prevent collecting
11
+ * both "googleads.com" and "googlevds.com" since they're 89% similar.
12
+ *
13
+ * Performance consideration: This runs on every potential domain match,
14
+ * so the algorithms need to be efficient for high-volume scanning.
15
+ */
16
+
3
17
  /**
4
18
  * Extracts the base domain name without TLD for similarity comparison
19
+ *
20
+ * Examples:
21
+ * - "ads.google.com" -> "google"
22
+ * - "tracker.facebook.co.uk" -> "facebook"
23
+ * - "cdn.example.org" -> "example"
24
+ *
25
+ * Why we do this: We want to compare the actual brand/company name part
26
+ * of domains, not be fooled by different TLDs or subdomains.
27
+ *
5
28
  * @param {string} domain - The domain to process
6
29
  * @returns {string} The base domain name
7
30
  */
@@ -10,170 +33,259 @@ function getBaseDomainName(domain) {
10
33
  return '';
11
34
  }
12
35
 
13
- // Remove protocol if present
36
+ // Remove protocol if present (handles cases where full URLs are passed)
14
37
  domain = domain.replace(/^https?:\/\//, '');
15
38
 
16
- // Remove www prefix
39
+ // Remove www prefix (standardize domain format)
17
40
  domain = domain.replace(/^www\./, '');
18
41
 
19
42
  // Split by dots and get the part before the last dot (TLD)
20
43
  const parts = domain.split('.');
21
44
  if (parts.length < 2) {
22
- return domain; // Single part, return as-is
45
+ return domain; // Single part, return as-is (e.g. "localhost"; dotted IPv4 addresses have several parts and do not take this path)
23
46
  }
24
47
 
25
- // Handle common multi-part TLDs (country code domains)
26
- const multiPartTLDs = [
27
- 'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
28
- 'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
29
- 'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
30
- 'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
31
- 'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
32
-
33
- // Europe extensions
34
- 'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
35
- 'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
36
- 'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
37
- 'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
38
-
39
- // Asia-Pacific extensions
40
- 'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
41
- 'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
42
- 'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
43
- 'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
44
-
45
- // Americas extensions
46
- 'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
47
- 'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
48
- 'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
49
-
50
- // Central America & Caribbean
51
- 'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
52
- 'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
53
-
54
- // Middle East & Africa extensions
55
- 'com.eg', 'org.eg', 'or.ke'
56
- ];
57
-
58
- // Check if domain ends with a multi-part TLD
59
- const lastTwoParts = parts.slice(-2).join('.');
60
- const lastThreeParts = parts.length >= 3 ? parts.slice(-3).join('.') : '';
61
-
62
- // Handle multi-part TLDs (e.g., google.co.nz ? "google")
63
- if (multiPartTLDs.includes(lastTwoParts)) {
64
- return parts.length >= 3 ? parts[parts.length - 3] : parts[0];
65
- }
66
-
67
- // Handle some 3-part TLDs (e.g., com.au.com if it existed)
68
- if (parts.length >= 4 && lastThreeParts &&
69
- ['com.au.com', 'co.uk.com'].includes(lastThreeParts)) {
70
- return parts[parts.length - 4];
71
- }
72
-
73
- // For standard TLDs, take the second-to-last part (e.g., google.com ? "google")
74
- return parts[parts.length - 2];
48
+ /**
49
+ * MULTI-PART TLD HANDLING
50
+ *
51
+ * Many countries use multi-part TLDs like "co.uk", "com.au", etc.
52
+ * We need to account for these when extracting the base domain name.
53
+ *
54
+ * Without this logic:
55
+ * - "example.co.uk" would incorrectly return "co" instead of "example"
56
+ * - "google.com.au" would return "com" instead of "google"
57
+ *
58
+ * This extensive list covers most common multi-part TLDs worldwide.
59
+ */
60
+ const multiPartTLDs = [
61
+ // Common Commonwealth and East Asian country-code domains
62
+ 'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
63
+
64
+ // Latin America
65
+ 'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
66
+
67
+ // Asia-Pacific, Middle East & Africa
68
+ 'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
69
+ 'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
70
+
71
+ // Central America & Africa
72
+ 'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
73
+
74
+ // Europe extensions
75
+ 'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
76
+ 'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
77
+ 'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
78
+ 'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
79
+
80
+ // Asia-Pacific extensions detailed
81
+ 'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
82
+ 'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
83
+ 'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
84
+ 'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
85
+
86
+ // Americas extensions detailed
87
+ 'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
88
+ 'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
89
+ 'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
90
+
91
+ // Central America & Caribbean
92
+ 'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
93
+ 'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
94
+
95
+ // Middle East & Africa extensions
96
+ 'com.eg', 'org.eg', 'or.ke'
97
+ ];
98
+
99
+ // Check if domain ends with a multi-part TLD
100
+ const lastTwoParts = parts.slice(-2).join('.'); // e.g., "co.uk"
101
+ const lastThreeParts = parts.length >= 3 ? parts.slice(-3).join('.') : ''; // e.g., "com.au.com"
102
+
103
+ // Handle 2-part TLDs (most common case)
104
+ // Example: "google.co.uk" -> parts = ["google", "co", "uk"] -> return "google"
105
+ if (multiPartTLDs.includes(lastTwoParts)) {
106
+ return parts.length >= 3 ? parts[parts.length - 3] : parts[0];
107
+ }
108
+
109
+ // Handle rare 3-part TLDs (future-proofing)
110
+ // This is mostly theoretical but good to have for completeness
111
+ if (parts.length >= 4 && lastThreeParts &&
112
+ ['com.au.com', 'co.uk.com'].includes(lastThreeParts)) {
113
+ return parts[parts.length - 4];
114
+ }
115
+
116
+ // For standard TLDs, take the second-to-last part
117
+ // Example: "google.com" -> parts = ["google", "com"] -> return "google"
118
+ return parts[parts.length - 2];
75
119
  }
76
120
 
77
121
  /**
78
122
  * Calculates similarity between two domain base names using Levenshtein distance
123
+ *
124
+ * The Levenshtein distance is the minimum number of single-character edits
125
+ * (insertions, deletions, substitutions) needed to transform one string into another.
126
+ *
127
+ * We convert this to a percentage similarity for easier threshold comparison.
128
+ *
129
+ * Examples:
130
+ * - "google" vs "googl" = 83% similar (1 deletion needed)
131
+ * - "facebook" vs "facebo0k" = 88% similar (1 substitution needed; 87.5 rounds up)
132
+ * - "amazon" vs "amaz0n" = 83% similar (1 substitution needed)
133
+ *
134
+ * Why this matters: Malicious domains often use typosquatting techniques
135
+ * like character substitution, insertion, or deletion to appear legitimate.
136
+ *
79
137
  * @param {string} domain1 - First domain base name
80
138
  * @param {string} domain2 - Second domain base name
81
139
  * @returns {number} Similarity percentage (0-100)
82
140
  */
83
141
  function calculateSimilarity(domain1, domain2) {
142
+ // Exact match = 100% similar (optimization for common case)
84
143
  if (domain1 === domain2) return 100;
144
+
145
+ // Empty strings have no similarity
85
146
  if (!domain1 || !domain2) return 0;
86
147
 
148
+ // Identify the longer and shorter strings; the longer length is the denominator of the similarity ratio
87
149
  const longer = domain1.length > domain2.length ? domain1 : domain2;
88
150
  const shorter = domain1.length > domain2.length ? domain2 : domain1;
89
151
 
152
+ // Edge case: empty longer string means both are empty (100% similar)
90
153
  if (longer.length === 0) return 100;
91
154
 
155
+ // Calculate edit distance using dynamic programming algorithm
92
156
  const distance = levenshteinDistance(longer, shorter);
157
+
158
+ // Convert to percentage: (max_length - edits_needed) / max_length * 100
159
+ // Higher percentage = more similar
93
160
  return Math.round(((longer.length - distance) / longer.length) * 100);
94
161
  }
95
162
 
96
163
  /**
97
- * Calculates Levenshtein distance between two strings
164
+ * Calculates Levenshtein distance between two strings using dynamic programming
165
+ *
166
+ * This is the core algorithm that powers our similarity detection.
167
+ * Time complexity: O(m*n) where m and n are string lengths
168
+ * Space complexity: O(m*n) for the matrix
169
+ *
170
+ * The algorithm builds a matrix where each cell [i][j] represents the minimum
171
+ * edit distance between the first i characters of str2 and first j characters of str1.
172
+ *
173
+ * Dynamic programming recurrence relation:
174
+ * - If characters match: matrix[i][j] = matrix[i-1][j-1] (no edit needed)
175
+ * - If different: matrix[i][j] = 1 + min(substitution, insertion, deletion)
176
+ *
98
177
  * @param {string} str1 - First string
99
- * @param {string} str2 - Second string
100
- * @returns {number} Edit distance
178
+ * @param {string} str2 - Second string
179
+ * @returns {number} Edit distance (number of edits needed to transform str1 to str2)
101
180
  */
102
181
  function levenshteinDistance(str1, str2) {
182
+ // Initialize matrix with base cases
103
183
  const matrix = [];
104
184
 
185
+ // Base case: transforming empty string to str2 requires str2.length insertions
105
186
  for (let i = 0; i <= str2.length; i++) {
106
187
  matrix[i] = [i];
107
188
  }
108
189
 
190
+ // Base case: transforming str1 to empty string requires str1.length deletions
109
191
  for (let j = 0; j <= str1.length; j++) {
110
192
  matrix[0][j] = j;
111
193
  }
112
194
 
195
+ // Fill matrix using dynamic programming
113
196
  for (let i = 1; i <= str2.length; i++) {
114
197
  for (let j = 1; j <= str1.length; j++) {
198
+ // If characters match, no additional cost
115
199
  if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
116
200
  matrix[i][j] = matrix[i - 1][j - 1];
117
201
  } else {
202
+ // Take minimum cost operation:
118
203
  matrix[i][j] = Math.min(
119
- matrix[i - 1][j - 1] + 1, // substitution
120
- matrix[i][j - 1] + 1, // insertion
121
- matrix[i - 1][j] + 1 // deletion
204
+ matrix[i - 1][j - 1] + 1, // substitution: replace char in str1
205
+ matrix[i][j - 1] + 1, // deletion: remove char from str1
206
+ matrix[i - 1][j] + 1 // insertion: add char to str1
122
207
  );
123
208
  }
124
209
  }
125
210
  }
126
211
 
212
+ // Bottom-right cell contains the final edit distance
127
213
  return matrix[str2.length][str1.length];
128
214
  }
129
215
 
130
216
  /**
131
- * Checks if a domain should be ignored based on similarity to existing domains
132
- * @param {string} newDomain - The domain to check
133
- * @param {Set|Array} existingDomains - Collection of existing domains
134
- * @param {object} options - Options for similarity checking
135
- * @returns {object} Result object with shouldIgnore boolean and details
217
+ * Main function: Checks if a domain should be ignored based on similarity to existing domains
218
+ *
219
+ * This is called for every potential domain match during scanning, so it needs to be
220
+ * efficient. The function uses early returns and optimizations to minimize processing.
221
+ *
222
+ * Usage workflow:
223
+ * 1. New domain found: "g0ogleads.com"
224
+ * 2. Extract base: "g0ogleads"
225
+ * 3. Compare to existing: ["googleads", "facebook", "amazon"]
226
+ * 4. Find "googleads" is 89% similar (1 substitution in 9 characters, above 80% threshold)
227
+ * 5. Return shouldIgnore: true
228
+ *
229
+ * @param {string} newDomain - The domain to check for similarity
230
+ * @param {Set|Array} existingDomains - Collection of already found domains
231
+ * @param {object} options - Configuration options
232
+ * @param {boolean} options.enabled - Whether similarity checking is enabled
233
+ * @param {number} options.threshold - Similarity percentage threshold (0-100)
234
+ * @param {boolean} options.forceDebug - Whether to log debug information
235
+ * @returns {object} Result object with shouldIgnore boolean and metadata
136
236
  */
137
237
  function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
138
238
  const {
139
239
  enabled = true,
140
- threshold = 80, // Similarity threshold percentage (80% by default)
240
+ threshold = 80, // Default: ignore domains that are 80%+ similar
141
241
  forceDebug = false
142
242
  } = options;
143
243
 
244
+ // Quick exit if feature is disabled (performance optimization)
144
245
  if (!enabled) {
145
246
  return { shouldIgnore: false, reason: 'ignore_similar disabled' };
146
247
  }
147
248
 
249
+ // Validate input domain
148
250
  if (!newDomain) {
149
251
  return { shouldIgnore: false, reason: 'invalid domain' };
150
252
  }
151
253
 
254
+ // Extract base domain name for comparison
152
255
  const newBaseDomain = getBaseDomainName(newDomain);
153
256
  if (!newBaseDomain) {
154
257
  return { shouldIgnore: false, reason: 'could not extract base domain' };
155
258
  }
156
259
 
157
- // Convert Set to Array if needed
260
+ // Convert Set to Array if needed (handles both data structures)
158
261
  const domainsArray = Array.isArray(existingDomains) ? existingDomains : Array.from(existingDomains);
159
262
 
263
+ // Compare against each existing domain
160
264
  for (const existingDomain of domainsArray) {
265
+ // Skip invalid, empty, or identical domains (optimization)
161
266
  if (!existingDomain || existingDomain === newDomain) {
162
- continue; // Skip empty or identical domains
267
+ continue;
163
268
  }
164
269
 
270
+ // Extract base domain for comparison
165
271
  const existingBaseDomain = getBaseDomainName(existingDomain);
166
272
  if (!existingBaseDomain || existingBaseDomain === newBaseDomain) {
167
- continue; // Skip if same base domain or invalid
273
+ continue; // Skip if same base domain or extraction failed
168
274
  }
169
275
 
276
+ // Calculate similarity percentage
170
277
  const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
171
278
 
279
+ // Check if similarity exceeds threshold
172
280
  if (similarity >= threshold) {
281
+ // Debug logging for similarity matches (helps tune thresholds)
173
282
  if (forceDebug) {
174
- console.log(formatLogMessage('debug', `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`));
283
+ console.log(formatLogMessage('debug',
284
+ `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
285
+ ));
175
286
  }
176
287
 
288
+ // Return detailed similarity information for debugging/analysis
177
289
  return {
178
290
  shouldIgnore: true,
179
291
  reason: `${similarity}% similar to ${existingDomain}`,
@@ -185,14 +297,24 @@ function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
185
297
  }
186
298
  }
187
299
 
300
+ // No similar domains found - safe to add this domain
188
301
  return { shouldIgnore: false, reason: 'no similar domains found' };
189
302
  }
190
303
 
191
304
  /**
192
- * Filters out similar domains from a collection
305
+ * Utility function: Filters out similar domains from a collection
306
+ *
307
+ * This is useful for post-processing existing domain lists to remove
308
+ * similar entries. It processes the array sequentially, comparing each
309
+ * domain against the already-accepted domains.
310
+ *
311
+ * Use case: Clean up an existing blocklist by removing similar domains
312
+ * Example: ["googleads.com", "g0ogleads.com", "facebook.com"]
313
+ * -> ["googleads.com", "facebook.com"] (removed g0ogleads as 89% similar)
314
+ *
193
315
  * @param {Array} domains - Array of domains to filter
194
- * @param {object} options - Filtering options
195
- * @returns {object} Result with filtered domains and removed domains
316
+ * @param {object} options - Filtering options (same as shouldIgnoreSimilarDomain)
317
+ * @returns {object} Result with filtered domains and information about removed domains
196
318
  */
197
319
  function filterSimilarDomains(domains, options = {}) {
198
320
  const {
@@ -201,37 +323,54 @@ function filterSimilarDomains(domains, options = {}) {
201
323
  forceDebug = false
202
324
  } = options;
203
325
 
326
+ // Quick exit if disabled or invalid input
204
327
  if (!enabled || !Array.isArray(domains)) {
205
328
  return { filtered: domains, removed: [] };
206
329
  }
207
330
 
208
- const filtered = [];
209
- const removed = [];
331
+ const filtered = []; // Domains to keep
332
+ const removed = []; // Domains that were filtered out (for reporting)
210
333
 
334
+ // Process each domain sequentially
211
335
  for (const domain of domains) {
336
+ // Check if this domain is similar to any already-accepted domain
212
337
  const result = shouldIgnoreSimilarDomain(domain, filtered, { enabled, threshold, forceDebug });
213
338
 
214
339
  if (result.shouldIgnore) {
340
+ // Domain is too similar - add to removed list with metadata
215
341
  removed.push({
216
342
  domain,
217
343
  reason: result.reason,
218
344
  similarTo: result.similarDomain
219
345
  });
220
346
  } else {
347
+ // Domain is unique enough - add to filtered list
221
348
  filtered.push(domain);
222
349
  }
223
350
  }
224
351
 
352
+ // Debug reporting for filtering results
225
353
  if (forceDebug && removed.length > 0) {
226
- console.log(formatLogMessage('debug', `[ignore_similar] Filtered out ${removed.length} similar domains`));
354
+ console.log(formatLogMessage('debug',
355
+ `[ignore_similar] Filtered out ${removed.length} similar domains`
356
+ ));
227
357
  }
228
358
 
229
359
  return { filtered, removed };
230
360
  }
231
361
 
362
+ /**
363
+ * MODULE EXPORTS
364
+ *
365
+ * Public API for the ignore_similar module:
366
+ * - getBaseDomainName: Extract base domain from full domain
367
+ * - calculateSimilarity: Get similarity percentage between two domains
368
+ * - shouldIgnoreSimilarDomain: Main function for real-time similarity checking
369
+ * - filterSimilarDomains: Batch processing function for existing lists
370
+ */
232
371
  module.exports = {
233
372
  getBaseDomainName,
234
373
  calculateSimilarity,
235
374
  shouldIgnoreSimilarDomain,
236
375
  filterSimilarDomains
237
- };
376
+ };
package/lib/nettools.js CHANGED
@@ -1148,7 +1148,7 @@ function createNetToolsHandler(config) {
1148
1148
  }
1149
1149
 
1150
1150
  if (!digResult) {
1151
- const digResult = await digLookup(digDomain, digRecordType, 5000); // 5 second timeout for dig
1151
+ digResult = await digLookup(digDomain, digRecordType, 5000); // 5 second timeout for dig
1152
1152
 
1153
1153
  // Cache the result for future use
1154
1154
  digResultCache.set(digCacheKey, {
@@ -1348,4 +1348,4 @@ module.exports = {
1348
1348
  getCommonWhoisServers,
1349
1349
  suggestWhoisServers,
1350
1350
  execWithTimeout // Export for testing
1351
- };
1351
+ };
package/nwss.js CHANGED
@@ -1,4 +1,4 @@
1
- // === Network scanner script (nwss.js) v1.0.49 ===
1
+ // === Network scanner script (nwss.js) v1.0.50 ===
2
2
 
3
3
  // puppeteer for browser automation, fs for file system operations, psl for domain parsing.
4
4
  // const pLimit = require('p-limit'); // Will be dynamically imported
@@ -39,7 +39,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
39
39
  const { monitorBrowserHealth, isBrowserHealthy } = require('./lib/browserhealth');
40
40
 
41
41
  // --- Script Configuration & Constants ---
42
- const VERSION = '1.0.49'; // Script version
42
+ const VERSION = '1.0.50'; // Script version
43
43
 
44
44
  // get startTime
45
45
  const startTime = Date.now();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fanboynz/network-scanner",
3
- "version": "1.0.49",
3
+ "version": "1.0.50",
4
4
  "description": "A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.",
5
5
  "main": "nwss.js",
6
6
  "scripts": {