@fanboynz/network-scanner 1.0.67 → 1.0.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,607 @@
1
+ // === Post-Processing Module for Network Scanner ===
2
+ // Handles cleanup and validation of scan results after scanning is complete
3
+
4
+ const { formatLogMessage, messageColors } = require('./colorize');
5
+
6
/**
 * Safely extracts a hostname or root domain from a URL string.
 *
 * The URL is parsed first; a malformed URL yields an empty string instead of
 * throwing. The `psl` library is only loaded when a root domain is requested,
 * so full-hostname extraction keeps working even if `psl` cannot be loaded.
 *
 * @param {string} url - The URL string to parse
 * @param {boolean} getFullHostname - If true, returns full hostname; if false, returns root domain
 * @returns {string} The hostname/domain, or empty string if the URL is invalid
 *   (or, for root-domain lookups, if psl is unavailable or fails)
 */
function safeGetDomain(url, getFullHostname = false) {
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch (urlError) {
    return '';
  }

  if (getFullHostname) {
    return hostname;
  }

  try {
    // Loaded lazily: only the root-domain path depends on psl.
    const psl = require('psl');
    const parsed = psl.parse(hostname);
    // Fall back to the full hostname when psl reports no domain for it.
    return parsed.domain || hostname;
  } catch (pslError) {
    // Same contract as before: any failure on this path reads as "unknown".
    return '';
  }
}
27
+
28
/**
 * Checks a domain against the ignoreDomains patterns (exact and wildcard).
 *
 * Pattern shapes handled, in order, for each pattern:
 *  - `*.example.com` - matches `example.com` and its subdomains, provided the
 *    domain shares the pattern's root domain (as reported by safeGetDomain)
 *  - `example.*`     - matches any domain starting with `example.`
 *  - other patterns containing `*` - treated as a glob anchored at both ends,
 *    where `*` matches any run of characters
 *  - patterns without `*` - match the domain itself or any subdomain of it
 *
 * @param {string} domain - Domain to check
 * @param {Array} ignorePatterns - Array of ignore patterns (supports wildcards)
 * @param {boolean} forceDebug - Debug logging flag (not used by this function)
 * @returns {Object} Match result with shouldIgnore flag and reason
 */
function shouldIgnoreAsIgnoreDomain(domain, ignorePatterns, forceDebug) {
  if (!domain || !ignorePatterns || ignorePatterns.length === 0) {
    return { shouldIgnore: false, reason: 'No ignore patterns' };
  }

  for (const pattern of ignorePatterns) {
    // Skip entries that cannot be interpreted as a pattern instead of throwing.
    if (typeof pattern !== 'string' || pattern.length === 0) {
      continue;
    }

    if (!pattern.includes('*')) {
      // Exact pattern matching (the domain itself or one of its subdomains)
      if (domain === pattern || domain.endsWith(`.${pattern}`)) {
        return {
          shouldIgnore: true,
          reason: `Matches exact ignore pattern: ${pattern}`
        };
      }
      continue;
    }

    if (pattern.startsWith('*.')) {
      // Pattern: *.example.com
      const wildcardDomain = pattern.substring(2); // Remove "*."
      const wildcardRoot = safeGetDomain(`http://${wildcardDomain}`, false);
      const domainRoot = safeGetDomain(`http://${domain}`, false);

      // Comparing roots alone made "*.ads.example.com" ignore every host under
      // example.com; the domain must also sit at or below the wildcard's domain.
      const isWithinWildcard = domain === wildcardDomain || domain.endsWith(`.${wildcardDomain}`);

      if (wildcardRoot === domainRoot && isWithinWildcard) {
        return {
          shouldIgnore: true,
          reason: `Matches wildcard ignore pattern: ${pattern}`
        };
      }
    } else if (pattern.endsWith('.*')) {
      // Pattern: example.*
      const baseDomain = pattern.slice(0, -2); // Remove ".*"
      if (domain.startsWith(`${baseDomain}.`)) {
        return {
          shouldIgnore: true,
          reason: `Matches wildcard TLD ignore pattern: ${pattern}`
        };
      }
    } else {
      // Complex wildcard pattern.
      // Escape regex metacharacters BEFORE expanding "*": doing it the other
      // way round escapes the dot of the generated ".*" and breaks the match.
      const escapedPattern = pattern
        .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*');
      const wildcardRegex = new RegExp(`^${escapedPattern}$`);
      if (wildcardRegex.test(domain)) {
        return {
          shouldIgnore: true,
          reason: `Matches complex wildcard ignore pattern: ${pattern}`
        };
      }
    }
  }

  return { shouldIgnore: false, reason: 'No ignore pattern matches' };
}
87
+
88
/**
 * Decides whether a domain extracted from a rule belongs to the scanned site
 * (first party), including wildcard forms of the extracted domain.
 *
 * Wildcard shapes are tried first:
 *  - `*.example.com` - first-party when the root of `example.com` equals the scanned root
 *  - `example.*`     - first-party when the scanned root starts with `example.`
 *  - other `*` forms - treated as a glob (anchored at both ends) tested against the scanned root
 * If no wildcard rule matched, the extracted domain's root domain (via
 * safeGetDomain) is compared with the scanned root domain.
 *
 * @param {string} extractedDomain - Domain extracted from rule
 * @param {string} scannedRootDomain - Root domain of the scanned site
 * @param {boolean} forceDebug - Debug logging flag (not used by this function)
 * @returns {Object} Match result with shouldRemove flag and reason
 */
function shouldRemoveAsFirstParty(extractedDomain, scannedRootDomain, forceDebug) {
  if (!extractedDomain || !scannedRootDomain) {
    return { shouldRemove: false, reason: 'Missing domain data' };
  }

  // Handle wildcard patterns
  if (extractedDomain.includes('*')) {
    if (extractedDomain.startsWith('*.')) {
      // Pattern: *.example.com
      const wildcardDomain = extractedDomain.substring(2); // Remove "*."
      const wildcardRoot = safeGetDomain(`http://${wildcardDomain}`, false);

      if (wildcardRoot === scannedRootDomain) {
        return {
          shouldRemove: true,
          reason: `Wildcard subdomain pattern matches root domain (*.${wildcardRoot})`
        };
      }
    } else if (extractedDomain.endsWith('.*')) {
      // Pattern: example.*
      const baseDomain = extractedDomain.slice(0, -2); // Remove ".*"
      if (scannedRootDomain.startsWith(`${baseDomain}.`)) {
        return {
          shouldRemove: true,
          reason: `Wildcard TLD pattern matches base domain (${baseDomain}.*)`
        };
      }
    } else {
      // Pattern: sub*.example.com or other wildcard positions.
      // Escape regex metacharacters BEFORE expanding "*": doing it the other
      // way round escapes the dot of the generated ".*" and breaks the match.
      const escapedPattern = extractedDomain
        .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*');
      const wildcardRegex = new RegExp(`^${escapedPattern}$`);
      if (wildcardRegex.test(scannedRootDomain)) {
        return {
          shouldRemove: true,
          reason: `Complex wildcard pattern matches root domain (${extractedDomain})`
        };
      }
    }
  }

  // Standard exact root domain matching
  const extractedRoot = safeGetDomain(`http://${extractedDomain}`, false);
  if (extractedRoot === scannedRootDomain) {
    return {
      shouldRemove: true,
      reason: `Exact root domain match (${extractedRoot})`
    };
  }

  return { shouldRemove: false, reason: 'No first-party match detected' };
}
146
+
147
/**
 * Extracts the target domain from one formatted rule.
 *
 * Formats are probed in a fixed order; the first format whose marker is found
 * decides the result, and a rule that carries a marker but does not fit that
 * format yields null (no fallback is attempted for it).
 *
 * @param {string} rule - A single output rule
 * @returns {string|null} The extracted domain, or null when none could be found
 */
function extractDomainFromRule(rule) {
  if (rule.startsWith('||') && rule.includes('^')) {
    // ||domain.com^ format (adblock)
    const match = rule.match(/^\|\|([^/\^]+)/);
    return match ? match[1] : null;
  }

  if (rule.startsWith('127.0.0.1 ') || rule.startsWith('0.0.0.0 ')) {
    // hosts file format
    const parts = rule.split(/\s+/);
    return parts.length >= 2 && parts[1] ? parts[1] : null;
  }

  if (rule.includes('local=/')) {
    // dnsmasq format: local=/domain.com/
    const match = rule.match(/local=\/([^/]+)\//);
    return match ? match[1] : null;
  }

  if (rule.includes('server=/')) {
    // dnsmasq old format: server=/domain.com/
    const match = rule.match(/server=\/([^/]+)\//);
    return match ? match[1] : null;
  }

  if (rule.includes('local-zone:') && rule.includes('always_null')) {
    // unbound format: local-zone: "domain.com." always_null
    const match = rule.match(/local-zone:\s*"([^"]+)"/);
    if (!match) {
      return null;
    }
    // The quoted name ends with the DNS root dot; drop it so the value
    // compares equal to plain domains such as "domain.com".
    const zone = match[1].replace(/\.$/, '');
    return zone || null;
  }

  if (rule.includes('+block') && rule.includes('.')) {
    // privoxy format: { +block } .domain.com
    const match = rule.match(/\{\s*\+block\s*\}\s*\.?([^\s]+)/);
    return match ? match[1] : null;
  }

  // pi-hole regex format: (^|\.)domain\.com$
  // The capture includes the escaped TLD so the full domain is recovered.
  const piholeMatch = rule.match(/^\(\^\|\\\.\)(.+\\\.\w+)\$$/);
  if (piholeMatch) {
    // Unescape the domain
    return piholeMatch[1].replace(/\\\./g, '.');
  }

  // Try to extract any domain-like pattern as fallback
  const domainMatch = rule.match(/([a-zA-Z0-9][a-zA-Z0-9.-]*\.[a-zA-Z]{2,})/);
  return domainMatch ? domainMatch[1] : null;
}

/**
 * Prints a cleanup message, colorized when messageColors.cleanup is available.
 *
 * @param {string} message - Message to print
 */
function logCleanupMessage(message) {
  if (messageColors && messageColors.cleanup) {
    console.log(messageColors.cleanup(message));
  } else {
    console.log(message);
  }
}

/**
 * Post-scan cleanup function to remove ignoreDomains from results
 * This is a final safety net to catch any domains that should have been ignored
 *
 * Each rule's domain is extracted and checked with shouldIgnoreAsIgnoreDomain;
 * matching rules are dropped. Input objects are not mutated: results that lose
 * or keep rules after filtering are returned as shallow copies with a new
 * `rules` array, while results without rules are passed through as-is.
 *
 * @param {Array} results - Array of scan results from all sites
 * @param {Array} ignoreDomains - Array of domains/patterns to ignore
 * @param {Object} options - Options object
 * @param {boolean} options.forceDebug - Debug logging flag
 * @param {boolean} options.silentMode - Silent mode flag
 * @returns {Array} Cleaned results with ignoreDomains removed (the input itself
 *   when there are no results or no ignore patterns)
 */
function cleanupIgnoreDomains(results, ignoreDomains, options = {}) {
  const { forceDebug = false, silentMode = false } = options;

  if (!results || results.length === 0 || !ignoreDomains || ignoreDomains.length === 0) {
    return results;
  }

  if (forceDebug) {
    console.log(formatLogMessage('debug', `[ignoreDomains cleanup] Processing ${results.length} results against ${ignoreDomains.length} ignore patterns`));
  }

  const cleanedResults = [];
  let totalRulesRemoved = 0;
  let sitesAffected = 0;
  // Running totals for the summary line; these are accumulated here because
  // the returned results do not carry the list of removed rules.
  let totalWildcardCount = 0;
  let totalExactCount = 0;

  for (const result of results) {
    if (!result.rules || result.rules.length === 0) {
      cleanedResults.push(result);
      continue;
    }

    const cleanedRules = [];
    const removedRules = [];

    // Filter out rules that match ignoreDomains patterns
    for (const rule of result.rules) {
      let removed = false;

      try {
        const extractedDomain = extractDomainFromRule(rule);

        if (extractedDomain) {
          const ignoreResult = shouldIgnoreAsIgnoreDomain(extractedDomain, ignoreDomains, forceDebug);

          if (ignoreResult.shouldIgnore) {
            removedRules.push({
              rule: rule,
              domain: extractedDomain,
              reason: `ignoreDomains: ${ignoreResult.reason}`,
              matchType: ignoreResult.reason.includes('wildcard') ? 'wildcard' : 'exact'
            });
            removed = true;
          }
        }
      } catch (parseErr) {
        // Best effort: a rule that cannot be parsed is kept.
        if (forceDebug) {
          console.log(formatLogMessage('debug', `[ignoreDomains cleanup] Failed to parse rule: ${rule} - ${parseErr.message}`));
        }
      }

      if (!removed) {
        cleanedRules.push(rule);
      }
    }

    cleanedResults.push({ ...result, rules: cleanedRules });

    if (removedRules.length === 0) {
      continue;
    }

    const wildcardCount = removedRules.filter(r => r.matchType === 'wildcard').length;
    const exactCount = removedRules.length - wildcardCount;

    sitesAffected++;
    totalRulesRemoved += removedRules.length;
    totalWildcardCount += wildcardCount;
    totalExactCount += exactCount;

    if (!silentMode) {
      let cleanupMessage = `?? Removed ${removedRules.length} ignoreDomains rule(s) from ${safeGetDomain(result.url)} (final cleanup)`;
      if (wildcardCount > 0) {
        cleanupMessage += ` [${wildcardCount} wildcard, ${exactCount} exact]`;
      }
      logCleanupMessage(cleanupMessage);
    }

    if (forceDebug) {
      console.log(formatLogMessage('debug', `[ignoreDomains cleanup] Removed rules from ${result.url}:`));
      removedRules.forEach((removed, idx) => {
        console.log(formatLogMessage('debug', ` [${idx + 1}] ${removed.rule} (${removed.reason}) [${removed.matchType}]`));
      });
    }
  }

  // Summary
  if (totalRulesRemoved > 0 && !silentMode) {
    const summaryMessage = `\n?? ignoreDomains cleanup completed: Removed ${totalRulesRemoved} rules from ${sitesAffected} site(s)` +
      (totalWildcardCount > 0 ? ` [${totalWildcardCount} wildcard patterns, ${totalExactCount} exact matches]` : '');
    logCleanupMessage(summaryMessage);
  } else if (forceDebug) {
    console.log(formatLogMessage('debug', '[ignoreDomains cleanup] No ignoreDomains rules found to remove'));
  }

  return cleanedResults;
}

/**
 * Post-scan cleanup function to remove first-party domains from results
 * Only processes sites that have firstParty: false in their configuration
 *
 * Site configs are looked up by the result's `url`; a result whose url has no
 * config, or whose config does not set firstParty to exactly false, is passed
 * through unchanged.
 *
 * @param {Array} results - Array of scan results from all sites
 * @param {Array} sites - Array of site configurations (a missing value is treated as empty)
 * @param {Object} options - Options object
 * @param {boolean} options.forceDebug - Debug logging flag
 * @param {boolean} options.silentMode - Silent mode flag
 * @returns {Array} Cleaned results with conditional first-party removal
 */
function cleanupFirstPartyDomains(results, sites, options = {}) {
  const { forceDebug = false, silentMode = false } = options;

  if (!results || results.length === 0) {
    return results;
  }

  // Build mapping of URLs to their site configs
  const urlToSiteConfig = new Map();
  const siteList = Array.isArray(sites) ? sites : [];
  for (const site of siteList) {
    if (!site) {
      continue;
    }
    const urls = Array.isArray(site.url) ? site.url : [site.url];
    for (const url of urls) {
      urlToSiteConfig.set(url, site);
    }
  }

  const cleanedResults = [];
  let totalRulesRemoved = 0;
  let sitesAffected = 0;

  for (const result of results) {
    // Find the site config for this result
    const siteConfig = urlToSiteConfig.get(result.url);

    // Only clean if firstParty is explicitly set to false
    const shouldCleanFirstParty = siteConfig && siteConfig.firstParty === false;

    if (!shouldCleanFirstParty || !result.rules || result.rules.length === 0) {
      cleanedResults.push(result);
      continue;
    }

    if (forceDebug) {
      console.log(formatLogMessage('debug', `[cleanup] Processing ${result.url} (firstParty: false detected)`));
    }

    // Get the scanned domain for this specific result
    const scannedDomain = safeGetDomain(result.url, false);
    if (!scannedDomain) {
      cleanedResults.push(result);
      continue;
    }

    const cleanedRules = [];
    const removedRules = [];

    // Filter out rules that match the scanned domain
    for (const rule of result.rules) {
      let removed = false;

      try {
        const extractedDomain = extractDomainFromRule(rule);

        // Check if extracted domain is a first-party domain
        if (extractedDomain) {
          const matchResult = shouldRemoveAsFirstParty(extractedDomain, scannedDomain, forceDebug);

          if (matchResult.shouldRemove) {
            removedRules.push({
              rule: rule,
              domain: extractedDomain,
              rootDomain: scannedDomain,
              reason: `First-party: ${matchResult.reason} (firstParty: false)`,
              // Case-insensitive: one wildcard reason reads "Complex wildcard ...".
              matchType: /wildcard/i.test(matchResult.reason) ? 'wildcard' : 'exact'
            });
            removed = true;
          }
        }
      } catch (parseErr) {
        // Best effort: a rule that cannot be parsed is kept.
        if (forceDebug) {
          console.log(formatLogMessage('debug', `[cleanup] Failed to parse rule: ${rule} - ${parseErr.message}`));
        }
      }

      if (!removed) {
        cleanedRules.push(rule);
      }
    }

    cleanedResults.push({ ...result, rules: cleanedRules });

    if (removedRules.length === 0) {
      continue;
    }

    sitesAffected++;
    totalRulesRemoved += removedRules.length;

    if (!silentMode) {
      const wildcardCount = removedRules.filter(r => r.matchType === 'wildcard').length;
      const exactCount = removedRules.length - wildcardCount;

      let cleanupMessage = `?? Cleaned ${removedRules.length} first-party rule(s) from ${scannedDomain} (firstParty: false)`;
      if (wildcardCount > 0) {
        cleanupMessage += ` [${wildcardCount} wildcard, ${exactCount} exact]`;
      }
      logCleanupMessage(cleanupMessage);
    }

    if (forceDebug) {
      console.log(formatLogMessage('debug', `[cleanup] Removed rules from ${result.url}:`));
      removedRules.forEach((removed, idx) => {
        console.log(formatLogMessage('debug', ` [${idx + 1}] ${removed.rule} (${removed.reason}) [${removed.matchType}]`));
      });
    }
  }

  // Summary
  if (totalRulesRemoved > 0 && !silentMode) {
    logCleanupMessage(`\n?? First-party cleanup completed: Removed ${totalRulesRemoved} rules from ${sitesAffected} site(s) with firstParty: false`);
  } else if (forceDebug) {
    console.log(formatLogMessage('debug', '[cleanup] No first-party rules found to remove'));
  }

  return cleanedResults;
}

/**
 * Validates scan results and removes any obvious false positives
 *
 * Drops rules that are empty or not strings. When ignore patterns are given,
 * also drops rules whose extracted domain matches a pattern according to
 * shouldIgnoreAsIgnoreDomain. Matching is done on the extracted domain rather
 * than on raw substrings of the rule, so ignoring `t.co` does not remove a
 * rule for `microsoft.com`.
 *
 * @param {Array} results - Array of scan results
 * @param {Object} options - Options object
 * @param {boolean} options.forceDebug - Debug logging flag
 * @param {Array} options.ignoreDomains - Domains to ignore
 * @returns {Array} Validated results
 */
function validateScanResults(results, options = {}) {
  const { forceDebug = false, ignoreDomains = [] } = options;

  if (!results || results.length === 0) {
    return results;
  }

  // Only non-empty string patterns can be matched.
  const ignorePatterns = Array.isArray(ignoreDomains)
    ? ignoreDomains.filter(pattern => typeof pattern === 'string' && pattern.length > 0)
    : [];

  let totalValidated = 0;
  let totalRemoved = 0;

  const validatedResults = results.map(result => {
    if (!result.rules || result.rules.length === 0) {
      return result;
    }

    const validRules = result.rules.filter(rule => {
      // Basic validation - ensure rule isn't empty or malformed
      if (!rule || typeof rule !== 'string' || rule.trim().length === 0) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `[validation] Removed empty/invalid rule`));
        }
        totalRemoved++;
        return false;
      }

      // Check against ignore domains if provided
      if (ignorePatterns.length > 0) {
        const ruleDomain = extractDomainFromRule(rule);
        if (ruleDomain) {
          const ignoreResult = shouldIgnoreAsIgnoreDomain(ruleDomain, ignorePatterns, forceDebug);
          if (ignoreResult.shouldIgnore) {
            if (forceDebug) {
              console.log(formatLogMessage('debug', `[validation] Removed rule ${rule}: ${ignoreResult.reason}`));
            }
            totalRemoved++;
            return false;
          }
        }
      }

      return true;
    });

    totalValidated += result.rules.length;
    return { ...result, rules: validRules };
  });

  if (forceDebug && totalRemoved > 0) {
    console.log(formatLogMessage('debug', `[validation] Validated ${totalValidated} rules, removed ${totalRemoved} invalid rules`));
  }

  return validatedResults;
}
566
+
567
/**
 * Main post-processing function that runs all cleanup and validation steps
 *
 * Steps run in this order, each receiving the previous step's output:
 *  1. cleanupFirstPartyDomains
 *  2. cleanupIgnoreDomains (using options.ignoreDomains, or an empty list)
 *  3. validateScanResults
 *
 * Debug logging never dereferences the results directly, so an empty or
 * missing results value is handled the same way with and without forceDebug.
 *
 * @param {Array} results - Array of scan results from all sites
 * @param {Array} sites - Array of site configurations
 * @param {Object} options - Options object
 * @param {boolean} options.forceDebug - Debug logging flag
 * @param {boolean} options.silentMode - Silent mode flag
 * @param {Array} options.ignoreDomains - Domains to ignore during validation
 * @returns {Array} Fully processed and cleaned results
 */
function processResults(results, sites, options = {}) {
  // A null options argument bypasses the default parameter; normalize it here.
  const settings = options || {};
  const { forceDebug = false } = settings;

  if (forceDebug) {
    const inputCount = Array.isArray(results) ? results.length : 0;
    console.log(formatLogMessage('debug', `[post-processing] Starting post-processing of ${inputCount} results`));
  }

  // Step 1: Clean up first-party domains
  let processedResults = cleanupFirstPartyDomains(results, sites, settings);

  // Step 2: Clean up ignoreDomains (final safety net)
  processedResults = cleanupIgnoreDomains(processedResults, settings.ignoreDomains || [], settings);

  // Step 3: Validate results
  processedResults = validateScanResults(processedResults, settings);

  if (forceDebug) {
    const totalRules = Array.isArray(processedResults)
      ? processedResults.reduce((sum, r) => sum + (r && r.rules ? r.rules.length : 0), 0)
      : 0;
    console.log(formatLogMessage('debug', `[post-processing] Completed: ${totalRules} total rules remaining`));
  }

  return processedResults;
}
601
+
602
+ module.exports = {
603
+ cleanupFirstPartyDomains,
604
+ cleanupIgnoreDomains,
605
+ validateScanResults,
606
+ processResults
607
+ };
package/nwss.js CHANGED
@@ -1,4 +1,4 @@
1
- // === Network scanner script (nwss.js) v1.0.67 ===
1
+ // === Network scanner script (nwss.js) v1.0.68 ===
2
2
 
3
3
  // puppeteer for browser automation, fs for file system operations, psl for domain parsing.
4
4
  // const pLimit = require('p-limit'); // Will be dynamically imported
@@ -27,6 +27,8 @@ const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvaila
27
27
  const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
28
28
  // CDP functionality
29
29
  const { createCDPSession } = require('./lib/cdp');
30
+ // Post-processing cleanup
31
+ const { processResults } = require('./lib/post-processing');
30
32
  // Colorize various text when used
31
33
  const { colorize, colors, messageColors, tags, formatLogMessage } = require('./lib/colorize');
32
34
  // Enhanced mouse interaction and page simulation
@@ -85,7 +87,7 @@ const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/r
85
87
  const { monitorBrowserHealth, isBrowserHealthy } = require('./lib/browserhealth');
86
88
 
87
89
  // --- Script Configuration & Constants ---
88
- const VERSION = '1.0.67'; // Script version
90
+ const VERSION = '1.0.68'; // Script version
89
91
 
90
92
  // get startTime
91
93
  const startTime = Date.now();
@@ -936,6 +938,22 @@ function outputDryRunResults(url, matchedItems, netToolsResults, pageTitle) {
936
938
  }
937
939
  }
938
940
 
941
/**
 * Tells whether a URL is eligible for processing, i.e. it parses as a URL and
 * uses the http: or https: scheme.
 *
 * @param {string} url - URL to validate
 * @param {boolean} forceDebug - When true, unparsable URLs are reported via a debug log line
 * @returns {boolean} True if URL is valid for processing
 */
function shouldProcessUrl(url, forceDebug) {
  let protocol;

  try {
    ({ protocol } = new URL(url));
  } catch (err) {
    // Unparsable input is rejected rather than propagated.
    if (forceDebug) console.log(formatLogMessage('debug', `Invalid URL for processing: ${url}`));
    return false;
  }

  return ['http:', 'https:'].includes(protocol);
}
956
+
939
957
  // ability to use wildcards in ignoreDomains
940
958
  function matchesIgnoreDomain(domain, ignorePatterns) {
941
959
  return ignorePatterns.some(pattern => {
@@ -1322,11 +1340,19 @@ function setupFrameHandling(page, forceDebug) {
1322
1340
 
1323
1341
  if (!silentMode) console.log(`\n${messageColors.scanning('Scanning:')} ${currentUrl}`);
1324
1342
 
1343
+ // Track ALL domains that should be considered first-party (original + redirects)
1344
+ const firstPartyDomains = new Set();
1345
+ const originalRootDomain = safeGetDomain(currentUrl, false);
1346
+ if (originalRootDomain) {
1347
+ firstPartyDomains.add(originalRootDomain);
1348
+ }
1349
+
1325
1350
  // Track redirect domains to exclude from matching
1326
1351
  let redirectDomainsToExclude = [];
1327
1352
 
1328
- // Track the effective current URL for first-party detection (updates after redirects)
1353
+ // Track the effective current URL and final URL for first-party detection (updates after redirects)
1329
1354
  let effectiveCurrentUrl = currentUrl;
1355
+ let finalUrlAfterRedirect = null;
1330
1356
 
1331
1357
  // Enhanced error types for Puppeteer 23.x compatibility
1332
1358
  const CRITICAL_BROWSER_ERRORS = [
@@ -1838,13 +1864,9 @@ function setupFrameHandling(page, forceDebug) {
1838
1864
  const checkedUrl = request.url();
1839
1865
  const checkedHostname = safeGetDomain(checkedUrl, true);
1840
1866
  const checkedRootDomain = safeGetDomain(checkedUrl, false); // Root domain for first-party detection
1841
- // Use effectiveCurrentUrl which gets updated after redirects
1842
- // This ensures first-party detection uses the final redirected domain
1843
- const effectiveCurrentHostname = safeGetDomain(effectiveCurrentUrl, true);
1844
- const effectiveCurrentRootDomain = safeGetDomain(effectiveCurrentUrl, false); // Root domain for comparison
1845
-
1846
- // FIXED: Compare root domains instead of full hostnames for first-party detection
1847
- const isFirstParty = checkedRootDomain && effectiveCurrentRootDomain && checkedRootDomain === effectiveCurrentRootDomain;
1867
+ // Check against ALL first-party domains (original + all redirects)
1868
+ // This prevents redirect destinations from being marked as third-party
1869
+ const isFirstParty = checkedRootDomain && firstPartyDomains.has(checkedRootDomain);
1848
1870
 
1849
1871
  // Block infinite iframe loops
1850
1872
  const frameUrl = request.frame() ? request.frame().url() : '';
@@ -2000,14 +2022,6 @@ function setupFrameHandling(page, forceDebug) {
2000
2022
  }
2001
2023
  break; // Skip this URL - it's third-party but thirdParty is disabled
2002
2024
  }
2003
-
2004
- // Check ignoreDomains AFTER regex match but BEFORE domain processing
2005
- if (matchesIgnoreDomain(fullSubdomain, ignoreDomains)) {
2006
- if (forceDebug) {
2007
- console.log(formatLogMessage('debug', `Ignoring domain ${fullSubdomain} (matches ignoreDomains pattern)`));
2008
- }
2009
- break; // Skip this URL - domain is in ignore list
2010
- }
2011
2025
 
2012
2026
  // REMOVED: Check if this URL matches any blocked patterns - if so, skip detection but still continue browser blocking
2013
2027
  // This check is no longer needed here since even_blocked handles it above
@@ -2309,6 +2323,19 @@ function setupFrameHandling(page, forceDebug) {
2309
2323
  if (redirected) {
2310
2324
  const originalDomain = safeGetDomain(originalUrl);
2311
2325
  const finalDomain = safeGetDomain(finalUrl);
2326
+
2327
+ // Add redirect destination to first-party domains immediately
2328
+ if (finalDomain) {
2329
+ firstPartyDomains.add(finalDomain);
2330
+ }
2331
+
2332
+ // Also add any intermediate redirect domains as first-party
2333
+ if (redirectDomains && redirectDomains.length > 0) {
2334
+ redirectDomains.forEach(domain => {
2335
+ const rootDomain = safeGetDomain(`http://${domain}`, false);
2336
+ if (rootDomain) firstPartyDomains.add(rootDomain);
2337
+ });
2338
+ }
2312
2339
 
2313
2340
  if (originalDomain !== finalDomain) {
2314
2341
  if (!silentMode) {
@@ -2317,6 +2344,7 @@ function setupFrameHandling(page, forceDebug) {
2317
2344
 
2318
2345
  if (forceDebug) {
2319
2346
  console.log(formatLogMessage('debug', `Full redirect chain: ${redirectChain.join(' → ')}`));
2347
+ console.log(formatLogMessage('debug', `All first-party domains: ${Array.from(firstPartyDomains).join(', ')}`));
2320
2348
  }
2321
2349
 
2322
2350
  // VALIDATION: Only update currentUrl if finalUrl is a valid HTTP/HTTPS URL
@@ -2326,6 +2354,7 @@ function setupFrameHandling(page, forceDebug) {
2326
2354
 
2327
2355
  // IMPORTANT: Also update effectiveCurrentUrl for first-party detection
2328
2356
  effectiveCurrentUrl = finalUrl;
2357
+ finalUrlAfterRedirect = finalUrl;
2329
2358
 
2330
2359
  // Update the redirect domains to exclude from matching
2331
2360
  if (redirectDomains && redirectDomains.length > 0) {
@@ -2541,7 +2570,13 @@ function setupFrameHandling(page, forceDebug) {
2541
2570
  };
2542
2571
  const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
2543
2572
 
2544
- return { url: currentUrl, rules: formattedRules, success: true };
2573
+ return {
2574
+ url: currentUrl,
2575
+ rules: formattedRules,
2576
+ success: true,
2577
+ finalUrl: finalUrlAfterRedirect || currentUrl,
2578
+ redirectDomains: redirectDomainsToExclude
2579
+ };
2545
2580
  }
2546
2581
 
2547
2582
  } catch (err) {
@@ -2577,7 +2612,14 @@ function setupFrameHandling(page, forceDebug) {
2577
2612
  };
2578
2613
  const formattedRules = formatRules(matchedDomains, siteConfig, globalOptions);
2579
2614
  if (forceDebug) console.log(formatLogMessage('debug', `Saving ${formattedRules.length} rules despite page load failure`));
2580
- return { url: currentUrl, rules: formattedRules, success: false, hasMatches: true };
2615
+ return {
2616
+ url: currentUrl,
2617
+ rules: formattedRules,
2618
+ success: false,
2619
+ hasMatches: true,
2620
+ finalUrl: finalUrlAfterRedirect || currentUrl,
2621
+ redirectDomains: redirectDomainsToExclude
2622
+ };
2581
2623
  }
2582
2624
 
2583
2625
  if (siteConfig.screenshot === true && page) {
@@ -2591,7 +2633,13 @@ function setupFrameHandling(page, forceDebug) {
2591
2633
  console.warn(messageColors.warn(`[screenshot failed] ${currentUrl}: ${screenshotErr.message}`));
2592
2634
  }
2593
2635
  }
2594
- return { url: currentUrl, rules: [], success: false };
2636
+ return {
2637
+ url: currentUrl,
2638
+ rules: [],
2639
+ success: false,
2640
+ finalUrl: finalUrlAfterRedirect || currentUrl,
2641
+ redirectDomains: redirectDomainsToExclude
2642
+ };
2595
2643
  } finally {
2596
2644
  // Guaranteed resource cleanup - this runs regardless of success or failure
2597
2645
 
@@ -2640,7 +2688,7 @@ function setupFrameHandling(page, forceDebug) {
2640
2688
  }
2641
2689
  }
2642
2690
 
2643
- const results = [];
2691
+ let results = [];
2644
2692
  let processedUrlCount = 0;
2645
2693
  let urlsSinceLastCleanup = 0;
2646
2694
 
@@ -2794,6 +2842,20 @@ function setupFrameHandling(page, forceDebug) {
2794
2842
  }
2795
2843
  }
2796
2844
 
2845
+ // === POST-SCAN PROCESSING ===
2846
+ // Clean up first-party domains and validate results
2847
+ if (!dryRunMode) {
2848
+ // Always run post-processing for both firstParty cleanup and ignoreDomains safety net
2849
+ const sitesWithFirstPartyDisabled = sites.filter(site => site.firstParty === false);
2850
+ if (sitesWithFirstPartyDisabled.length > 0) {
2851
+ if (forceDebug) {
2852
+ console.log(formatLogMessage('debug', `Running post-scan processing for ${sitesWithFirstPartyDisabled.length} sites with firstParty: false`));
2853
+ }
2854
+ // Always run post-processing for ignoreDomains safety net
2855
+ results = processResults(results, sites, { forceDebug, silentMode, ignoreDomains });
2856
+ }
2857
+ }
2858
+
2797
2859
  // Handle dry run output file writing
2798
2860
  if (dryRunMode && outputFile && dryRunOutput.length > 0) {
2799
2861
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fanboynz/network-scanner",
3
- "version": "1.0.67",
3
+ "version": "1.0.68",
4
4
  "description": "A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.",
5
5
  "main": "nwss.js",
6
6
  "scripts": {