@fanboynz/network-scanner 2.0.66 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,69 +1,42 @@
1
- const { formatLogMessage } = require('./colorize');
1
+ const psl = require('psl');
2
+ const { formatLogMessage, messageColors } = require('./colorize');
3
+ const IGNORE_SIMILAR_TAG = messageColors.processing('[ignore_similar]');
2
4
 
3
- // Precompiled regex (avoids recompilation per getBaseDomainName call)
5
+ // Strip protocol before handing to psl.parse, which expects a bare
6
+ // hostname per Public Suffix List semantics. psl handles 'www.' as a
7
+ // subdomain naturally (no need for a separate strip).
4
8
  const REGEX_PROTOCOL = /^https?:\/\//;
5
- const REGEX_WWW = /^www\./;
6
-
7
- // Multi-part TLD lookup (module-level Set, O(1) instead of per-call array + O(n) .includes)
8
- const MULTI_PART_TLDS = new Set([
9
- 'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
10
- 'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
11
- 'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
12
- 'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
13
- 'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
14
- 'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
15
- 'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
16
- 'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
17
- 'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
18
- 'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
19
- 'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
20
- 'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
21
- 'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
22
- 'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
23
- 'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
24
- 'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
25
- 'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
26
- 'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
27
- 'com.eg', 'org.eg', 'or.ke'
28
- ]);
29
-
30
- // 3-part TLD lookup
31
- const THREE_PART_TLDS = new Set(['com.au.com', 'co.uk.com']);
32
9
 
33
10
  /**
34
- * Extracts the base domain name without TLD for similarity comparison
11
+ * Extracts the base domain name (sld) without TLD for similarity comparison.
12
+ *
13
+ * Uses the project's `psl` dependency — the canonical Public Suffix List
14
+ * parser, maintained against the live Mozilla list. Replaces a hand-curated
15
+ * ~80-entry MULTI_PART_TLDS Set that went stale as PSL changed, plus a
16
+ * THREE_PART_TLDS set that only listed two entries (both vanity domains
17
+ * 'com.au.com'/'co.uk.com', not real public suffixes). The rest of the
18
+ * codebase already uses psl (nwss.js, lib/post-processing.js, etc.) — this
19
+ * brings ignore_similar in line.
20
+ *
35
21
  * @param {string} domain - The domain to process
36
- * @returns {string} The base domain name
22
+ * @returns {string} The base domain name (sld), e.g. 'example' for
23
+ * 'www.example.co.uk'. Returns '' for invalid input; falls back to
24
+ * second-to-last token for hostnames psl can't parse (IPs, unlisted
25
+ * TLDs), or to the hostname itself when it contains no dot.
37
26
  */
38
27
  function getBaseDomainName(domain) {
39
28
  if (!domain || typeof domain !== 'string') {
40
29
  return '';
41
30
  }
42
-
43
- domain = domain.replace(REGEX_PROTOCOL, '');
44
- domain = domain.replace(REGEX_WWW, '');
45
-
46
- const parts = domain.split('.');
47
- if (parts.length < 2) {
48
- return domain;
31
+ const hostname = domain.replace(REGEX_PROTOCOL, '');
32
+ const parsed = psl.parse(hostname);
33
+ if (parsed && parsed.sld) {
34
+ return parsed.sld;
49
35
  }
50
-
51
- // Check multi-part TLD (O(1) Set lookup instead of O(n) array scan)
52
- const lastTwoParts = parts[parts.length - 2] + '.' + parts[parts.length - 1];
53
-
54
- if (MULTI_PART_TLDS.has(lastTwoParts)) {
55
- return parts.length >= 3 ? parts[parts.length - 3] : parts[0];
56
- }
57
-
58
- // Handle rare 3-part TLDs
59
- if (parts.length >= 4) {
60
- const lastThreeParts = parts[parts.length - 3] + '.' + lastTwoParts;
61
- if (THREE_PART_TLDS.has(lastThreeParts)) {
62
- return parts[parts.length - 4];
63
- }
64
- }
65
-
66
- return parts[parts.length - 2];
36
+ // Fallback for IPs / single-token / unparseable: best-effort
37
+ // second-to-last token (the old behavior's default branch).
38
+ const parts = hostname.split('.');
39
+ return parts.length >= 2 ? parts[parts.length - 2] : hostname;
67
40
  }
68
41
 
69
42
  /**
@@ -75,12 +48,13 @@ function getBaseDomainName(domain) {
75
48
  function calculateSimilarity(domain1, domain2) {
76
49
  if (domain1 === domain2) return 100;
77
50
  if (!domain1 || !domain2) return 0;
78
-
79
- const longer = domain1.length > domain2.length ? domain1 : domain2;
51
+
52
+ // Both inputs are non-empty different strings at this point — the
53
+ // `''` cases are handled by the two guards above. (Used to have an
54
+ // `if (longer.length === 0) return 100` here but it was unreachable.)
55
+ const longer = domain1.length > domain2.length ? domain1 : domain2;
80
56
  const shorter = domain1.length > domain2.length ? domain2 : domain1;
81
-
82
- if (longer.length === 0) return 100;
83
-
57
+
84
58
  const distance = levenshteinDistance(longer, shorter);
85
59
  return Math.round(((longer.length - distance) / longer.length) * 100);
86
60
  }
@@ -93,26 +67,29 @@ function calculateSimilarity(domain1, domain2) {
93
67
  * @returns {number} Edit distance
94
68
  */
95
69
  function levenshteinDistance(str1, str2) {
96
- const m = str1.length;
97
- const n = str2.length;
98
-
99
- // Ensure we iterate over the shorter dimension for row arrays
100
- if (m < n) return levenshteinDistance(str2, str1);
101
-
102
- // Two rows instead of full matrix
70
+ // Ensure `a` holds the longer string so the inner-loop dimension (n)
71
+ // stays small. Inline swap instead of recursive re-entry — the old
72
+ // `if (m < n) return levenshteinDistance(str2, str1)` paid a stack
73
+ // frame + re-validation for what's really just a variable rename.
74
+ let a = str1, b = str2;
75
+ if (a.length < b.length) { const t = a; a = b; b = t; }
76
+ const m = a.length;
77
+ const n = b.length;
78
+
79
+ // Two rows instead of full matrix — O(n) space instead of O(m*n).
103
80
  let prevRow = new Array(n + 1);
104
81
  let currRow = new Array(n + 1);
105
-
82
+
106
83
  for (let j = 0; j <= n; j++) {
107
84
  prevRow[j] = j;
108
85
  }
109
-
86
+
110
87
  for (let i = 1; i <= m; i++) {
111
88
  currRow[0] = i;
112
- const ch1 = str1[i - 1];
113
-
89
+ const ch1 = a[i - 1];
90
+
114
91
  for (let j = 1; j <= n; j++) {
115
- if (ch1 === str2[j - 1]) {
92
+ if (ch1 === b[j - 1]) {
116
93
  currRow[j] = prevRow[j - 1];
117
94
  } else {
118
95
  const sub = prevRow[j - 1];
@@ -121,13 +98,13 @@ function levenshteinDistance(str1, str2) {
121
98
  currRow[j] = (sub < ins ? (sub < del ? sub : del) : (ins < del ? ins : del)) + 1;
122
99
  }
123
100
  }
124
-
101
+
125
102
  // Swap rows
126
103
  const temp = prevRow;
127
104
  prevRow = currRow;
128
105
  currRow = temp;
129
106
  }
130
-
107
+
131
108
  return prevRow[n];
132
109
  }
133
110
 
@@ -165,18 +142,28 @@ function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
165
142
  if (!existingDomain || existingDomain === newDomain) {
166
143
  continue;
167
144
  }
168
-
145
+
169
146
  const existingBaseDomain = getBaseDomainName(existingDomain);
170
- if (!existingBaseDomain || existingBaseDomain === newBaseDomain) {
147
+ if (!existingBaseDomain) {
171
148
  continue;
172
149
  }
173
-
150
+
151
+ // BEHAVIOR NOTE: identical base names (e.g. google.com vs google.net)
152
+ // now count as 100% similar — calculateSimilarity returns 100 for
153
+ // matching strings, which is above any reasonable threshold. The old
154
+ // `existingBaseDomain === newBaseDomain` skip silently exempted
155
+ // same-base-different-TLD pairs, defeating the dedup purpose for the
156
+ // most common variant case (brand registrations across multiple TLDs).
157
+ // Both call sites in nwss.js (matched-dedup at ~2833, ignoreDomains
158
+ // expansion at ~2849) want this stricter behavior. No threshold <= 100
159
+ // exempts an exact match, so disable ignore_similar entirely if you
160
+ // actually want to keep brand variants.
174
161
  const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
175
162
 
176
163
  if (similarity >= threshold) {
177
164
  if (forceDebug) {
178
165
  console.log(formatLogMessage('debug',
179
- `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
166
+ `${IGNORE_SIMILAR_TAG} ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
180
167
  ));
181
168
  }
182
169
 
@@ -194,52 +181,11 @@ function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
194
181
  return { shouldIgnore: false, reason: 'no similar domains found' };
195
182
  }
196
183
 
197
- /**
198
- * Utility function: Filters out similar domains from a collection
199
- * @param {Array} domains - Array of domains to filter
200
- * @param {object} options - Filtering options
201
- * @returns {object} Result with filtered domains and removed domains
202
- */
203
- function filterSimilarDomains(domains, options = {}) {
204
- const {
205
- enabled = true,
206
- threshold = 80,
207
- forceDebug = false
208
- } = options;
209
-
210
- if (!enabled || !Array.isArray(domains)) {
211
- return { filtered: domains, removed: [] };
212
- }
213
-
214
- const filtered = [];
215
- const removed = [];
216
-
217
- for (const domain of domains) {
218
- const result = shouldIgnoreSimilarDomain(domain, filtered, { enabled, threshold, forceDebug });
219
-
220
- if (result.shouldIgnore) {
221
- removed.push({
222
- domain,
223
- reason: result.reason,
224
- similarTo: result.similarDomain
225
- });
226
- } else {
227
- filtered.push(domain);
228
- }
229
- }
230
-
231
- if (forceDebug && removed.length > 0) {
232
- console.log(formatLogMessage('debug',
233
- `[ignore_similar] Filtered out ${removed.length} similar domains`
234
- ));
235
- }
236
-
237
- return { filtered, removed };
238
- }
239
-
184
+ // Public surface used by nwss.js. getBaseDomainName + (deleted)
185
+ // filterSimilarDomains had zero external callers; getBaseDomainName
186
+ // stays as an internal helper, filterSimilarDomains is gone entirely
187
+ // (no internal callers either).
240
188
  module.exports = {
241
- getBaseDomainName,
242
189
  calculateSimilarity,
243
- shouldIgnoreSimilarDomain,
244
- filterSimilarDomains
190
+ shouldIgnoreSimilarDomain
245
191
  };