@fanboynz/network-scanner 1.0.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,561 @@
1
+ // === searchstring.js - Content Search Module ===
2
+ // Handles response content analysis for searchstring functionality
3
+
4
+ const fs = require('fs');
5
+ const { spawnSync } = require('child_process');
6
+
/**
 * Normalizes the `searchstring` / `searchstring_and` configuration values into arrays.
 *
 * Each value may be a single string, an array of strings, or absent. Entries that are
 * not strings, or that are empty / whitespace-only, are dropped so that a malformed
 * config entry can neither throw here nor end up matching every response.
 *
 * @param {string|Array<string>|undefined} searchstring - The searchstring config value (OR logic)
 * @param {string|Array<string>|undefined} searchstringAnd - The searchstring_and config value (AND logic)
 * @returns {{searchStrings: string[], searchStringsAnd: string[], hasSearchString: boolean, hasSearchStringAnd: boolean}}
 *   Normalized term lists plus flags telling whether each list is non-empty
 */
function parseSearchStrings(searchstring, searchstringAnd) {
  // Wrap a scalar into a one-element list; falsy values become an empty list.
  const toList = (value) => {
    if (Array.isArray(value)) {
      return value;
    }
    return value ? [value] : [];
  };

  // Only non-blank strings are usable search terms. The typeof check prevents
  // `.trim()` from throwing on numbers, objects, etc. coming from a config file.
  const isUsableTerm = (term) => typeof term === 'string' && term.trim().length > 0;

  const searchStrings = toList(searchstring).filter(isUsableTerm);
  const searchStringsAnd = toList(searchstringAnd).filter(isUsableTerm);

  return {
    searchStrings,
    searchStringsAnd,
    hasSearchString: searchStrings.length > 0,
    hasSearchStringAnd: searchStringsAnd.length > 0
  };
}
40
+
/**
 * Registers a matched domain, either through a caller-supplied helper or by
 * writing into the collection directly.
 *
 * @param {Set|Map} matchedDomains - Collection receiving the domain. A Set stores the
 *   domain itself; a Map stores domain -> Set of resource types.
 * @param {Function} addMatchedDomain - Optional helper; when it is a function it is
 *   called with (domain, resourceType) and the collection is left untouched here
 * @param {string} domain - Domain to add
 * @param {string} resourceType - Resource type (for --adblock-rules mode)
 */
function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourceType = null) {
  // Preferred path: delegate entirely to the helper.
  const helperAvailable = typeof addMatchedDomain === 'function';
  if (helperAvailable) {
    addMatchedDomain(domain, resourceType);
    return;
  }

  // Plain Set: only the domain is tracked.
  if (matchedDomains instanceof Set) {
    matchedDomains.add(domain);
    return;
  }

  // Anything that is neither a Set nor a Map cannot be written to.
  if (!(matchedDomains instanceof Map)) {
    console.warn('[warn] Unknown matchedDomains type, skipping domain addition');
    return;
  }

  // Map: make sure the domain has an entry, then record the resource type if one was given.
  const alreadyKnown = matchedDomains.has(domain);
  if (!alreadyKnown) {
    matchedDomains.set(domain, new Set());
  }
  if (resourceType) {
    matchedDomains.get(domain).add(resourceType);
  }
}
69
+
/**
 * Downloads the body of a URL by running the `curl` binary as a child process.
 *
 * The process is started asynchronously (execFile) so the Node.js event loop is not
 * blocked while the transfer is in progress; a synchronous spawn here would stall
 * every other handler for up to `timeout` seconds per download.
 *
 * Note: because of `--fail-with-body`, curl exits non-zero for HTTP status >= 400,
 * so such responses reject the promise even though curl prints their body.
 *
 * @param {string} url - The URL to download
 * @param {string} userAgent - User agent string to use (omitted when empty)
 * @param {number} timeout - Timeout in seconds (default: 30); applied both as curl's
 *   --max-time and as the child-process kill timeout
 * @returns {Promise<string>} Resolves with curl's stdout (the downloaded content)
 * @throws {Error} Rejects with `Curl failed for <url>: <reason>` when curl cannot be
 *   started, times out, overflows the 10MB buffer, or exits with a non-zero status
 */
async function downloadWithCurl(url, userAgent = '', timeout = 30) {
  // Function-scope require: keeps this block self-contained. Node caches the module.
  const { execFile } = require('child_process');

  const curlArgs = [
    '-s', // Silent mode
    '-L', // Follow redirects
    '--max-time', String(timeout),
    '--max-redirs', '5',
    '--fail-with-body', // Print the body on HTTP errors (exit status is still non-zero)
    '--compressed' // Accept compressed responses
  ];

  if (userAgent) {
    curlArgs.push('-H', `User-Agent: ${userAgent}`);
  }

  // Add common headers to appear more browser-like
  curlArgs.push(
    '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    '-H', 'Accept-Language: en-US,en;q=0.5',
    '-H', 'Accept-Encoding: gzip, deflate',
    '-H', 'Connection: keep-alive',
    '-H', 'Upgrade-Insecure-Requests: 1'
  );

  // `--` ends option parsing, so a URL beginning with '-' can never be read as a curl flag.
  curlArgs.push('--', url);

  const execOptions = {
    encoding: 'utf8',
    timeout: timeout * 1000,
    maxBuffer: 10 * 1024 * 1024 // 10MB max buffer
  };

  // execFile passes arguments as an array (no shell), so the URL is never shell-interpreted.
  return new Promise((resolve, reject) => {
    execFile('curl', curlArgs, execOptions, (error, stdout, stderr) => {
      if (error) {
        // A numeric code is curl's exit status; anything else (ENOENT, timeout kill,
        // maxBuffer overflow) is a process-level failure described by error.message.
        const reason = typeof error.code === 'number'
          ? `Curl exited with status ${error.code}: ${stderr}`
          : error.message;
        reject(new Error(`Curl failed for ${url}: ${reason}`));
        return;
      }
      resolve(stdout);
    });
  });
}
125
+
/**
 * Searches response content for the configured search strings (case-insensitive).
 *
 * AND logic is evaluated first: if `searchStringsAnd` is non-empty and every one of its
 * entries occurs in the content, an 'AND' result is returned. Otherwise the function
 * falls back to OR logic over `searchStrings`, where any single hit counts as a match.
 *
 * When the content type mentions "xml" (any letter case), the search also covers a copy
 * with the predefined XML entities decoded and a copy with tags replaced by spaces.
 * Entities are decoded in a single pass so that escaped text such as `&amp;quot;`
 * becomes `&quot;` and is not decoded a second time into `"`.
 *
 * @param {string} content - The response content to search (null/undefined is treated as empty)
 * @param {Array<string>} searchStrings - Array of strings to search for (OR logic)
 * @param {Array<string>} searchStringsAnd - Array of strings that must all be present (AND logic)
 * @param {string} contentType - Content type for specialized handling
 * @returns {{found: boolean, matchedString: (string|null), matchedStrings: string[], allMatches: string[], logicType: string}}
 *   For AND matches `matchedString` is the terms joined with ' AND '; for OR it is the
 *   first matching term, or null when nothing matched
 */
function searchContent(content, searchStrings, searchStringsAnd = [], contentType = '') {
  // Accept a missing body or a bare string term without throwing.
  const text = typeof content === 'string' ? content : (content == null ? '' : String(content));
  const toTermList = (value) => {
    if (Array.isArray(value)) {
      return value;
    }
    return typeof value === 'string' && value.length > 0 ? [value] : [];
  };
  const orTerms = toTermList(searchStrings);
  const andTerms = toTermList(searchStringsAnd);

  let searchableContent = text;

  // For XML content, also search decoded entities and stripped tags for better matching
  if (String(contentType || '').toLowerCase().includes('xml')) {
    const xmlEntities = { lt: '<', gt: '>', amp: '&', quot: '"', apos: "'", '#39': "'" };

    // Single pass: each entity is decoded exactly once, so no double decoding occurs.
    const decodedContent = text.replace(
      /&(lt|gt|amp|quot|apos|#39);/g,
      (entity, name) => xmlEntities[name]
    );

    // Create version with XML tags stripped for text content search
    const strippedContent = decodedContent.replace(/<[^>]*>/g, ' ');

    // Search in: original + decoded + stripped content
    searchableContent = `${text}\n${decodedContent}\n${strippedContent}`;
  }

  // Lowercase once; both the AND and OR passes reuse it.
  const lowerContent = searchableContent.toLowerCase();
  const isPresent = (term) => lowerContent.includes(term.toLowerCase());

  // Check AND logic first (more restrictive): ALL strings must be found
  if (andTerms.length > 0) {
    const foundAndStrings = andTerms.filter(isPresent);
    if (foundAndStrings.length === andTerms.length) {
      return {
        found: true,
        matchedString: foundAndStrings.join(' AND '), // Show all matched strings
        matchedStrings: foundAndStrings,
        allMatches: foundAndStrings,
        logicType: 'AND'
      };
    }
  }

  // Fall back to OR logic if AND logic didn't match or wasn't specified
  const allMatches = orTerms.filter(isPresent);

  return {
    found: allMatches.length > 0,
    matchedString: allMatches.length > 0 ? allMatches[0] : null,
    matchedStrings: allMatches,
    allMatches,
    logicType: 'OR'
  };
}
201
+
/**
 * Determines if a content type should be analyzed for search strings.
 *
 * The check is a case-insensitive substring match against a list of textual media
 * types, so header values carrying parameters (e.g. "; charset=utf-8") or unusual
 * letter case (e.g. "Text/HTML") are still recognised.
 *
 * @param {string} contentType - The response content-type header
 * @returns {boolean} True if content should be analyzed; false for empty, missing or
 *   non-string values and for types not in the list
 */
function shouldAnalyzeContentType(contentType) {
  if (typeof contentType !== 'string' || contentType.length === 0) {
    return false;
  }

  const textTypes = [
    'text/', // text/html, text/plain, text/xml, etc.
    'application/json',
    'application/javascript',
    'application/xml', // Standard XML
    'application/x-javascript',
    'application/soap+xml', // SOAP XML
    'application/rss+xml', // RSS feeds
    'application/atom+xml', // Atom feeds
    'application/xhtml+xml' // XHTML
  ];

  // Media type names are case-insensitive, so compare in lowercase.
  const normalizedType = contentType.toLowerCase();
  return textTypes.some((type) => normalizedType.includes(type));
}
224
+
/**
 * Creates a curl-based URL handler for downloading and optionally searching content.
 *
 * The returned handler processes one request URL at a time:
 *   1. ignores URLs that match none of `regexes`;
 *   2. applies the site's firstParty / thirdParty filters (by hostname equality with
 *      `currentUrl`);
 *   3. if no search strings are configured, records the domain as a match immediately;
 *   4. otherwise downloads the URL with curl and records a match only when the content
 *      satisfies the search strings.
 * The handler never rejects for URL-parsing, download or search failures; those are
 * reported through debug logging when `forceDebug` is set.
 *
 * @param {object} config - Configuration object containing all necessary parameters
 *   (search terms and flags, regexes, matched-domain collection/helper, domain helpers,
 *   site config, logging options, user agent and resource type)
 * @returns {Function} Async URL handler `(requestUrl) => Promise<void>` for curl-based
 *   content analysis
 */
function createCurlHandler(config) {
  const {
    searchStrings,
    searchStringsAnd,
    hasSearchStringAnd,
    regexes,
    matchedDomains,
    addMatchedDomain, // Helper function for adding domains
    currentUrl,
    perSiteSubDomains,
    ignoreDomains,
    matchesIgnoreDomain,
    getRootDomain,
    siteConfig,
    dumpUrls,
    matchedUrlsLogFile,
    forceDebug,
    userAgent,
    resourceType, // Resource type from request
    hasSearchString
  } = config;

  /**
   * Stores a confirmed match and emits the verbose console line and the dump-file line.
   * `consoleDetail` is appended after "(party, curl)" on the console; `fileDetail` is
   * inserted inside the parentheses of the dump-file entry.
   */
  function recordMatch(requestUrl, respDomain, isFirstParty, consoleDetail, fileDetail) {
    addDomainToCollection(matchedDomains, addMatchedDomain, respDomain, resourceType);

    const simplifiedUrl = getRootDomain(currentUrl);
    const partyType = isFirstParty ? 'first-party' : 'third-party';
    const resourceInfo = resourceType ? ` (${resourceType})` : '';

    if (siteConfig.verbose === 1) {
      console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${consoleDetail}${resourceInfo}`);
    }

    if (dumpUrls) {
      const timestamp = new Date().toISOString();
      try {
        fs.appendFileSync(matchedUrlsLogFile,
          `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl${fileDetail})${resourceInfo}\n`);
      } catch (logErr) {
        // Logging is best-effort: a failed write must not discard the match.
        console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
      }
    }
  }

  return async function curlHandler(requestUrl) {
    // Only process URLs that match our regex patterns.
    // Resetting lastIndex keeps /g and /y regexes from carrying state between calls.
    const matchesRegex = regexes.some((re) => {
      re.lastIndex = 0;
      return re.test(requestUrl);
    });
    if (!matchesRegex) return;

    // Parse URLs defensively: an unparsable URL must not reject the handler's promise.
    let requestHostname;
    let currentUrlHostname;
    let respDomain;
    try {
      requestHostname = new URL(requestUrl).hostname;
      currentUrlHostname = new URL(currentUrl).hostname;
      respDomain = perSiteSubDomains ? requestHostname : getRootDomain(requestUrl);
    } catch (urlErr) {
      if (forceDebug) {
        console.log(`[debug][curl] Skipping unparsable URL ${requestUrl}: ${urlErr.message}`);
      }
      return;
    }

    // Check if this is a first-party request (same domain as the URL being scanned)
    const isFirstParty = currentUrlHostname === requestHostname;

    // Apply first-party/third-party filtering
    if (isFirstParty && siteConfig.firstParty === false) {
      if (forceDebug) {
        console.log(`[debug][curl] Skipping first-party request (firstParty=false): ${requestUrl}`);
      }
      return;
    }

    if (!isFirstParty && siteConfig.thirdParty === false) {
      if (forceDebug) {
        console.log(`[debug][curl] Skipping third-party request (thirdParty=false): ${requestUrl}`);
      }
      return;
    }

    try {
      if (forceDebug) {
        console.log(`[debug][curl] Downloading content from: ${requestUrl}`);
      }

      // If NO searchstring is defined, match immediately (like browser behavior)
      if (!hasSearchString && !hasSearchStringAnd) {
        if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
          return;
        }
        recordMatch(requestUrl, respDomain, isFirstParty, ' matched regex', '');
        return;
      }

      // If searchstring IS defined, download and search content
      const content = await downloadWithCurl(requestUrl, userAgent, 30);

      // Check if content contains search strings (OR or AND logic).
      // curl's stdout carries no content-type, so no type-specific handling is requested.
      const { found, matchedString, logicType } = searchContent(content, searchStrings, searchStringsAnd, '');

      if (found) {
        if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
          return;
        }
        recordMatch(
          requestUrl,
          respDomain,
          isFirstParty,
          ` contains searchstring (${logicType}): "${matchedString}"`,
          `, searchstring (${logicType}): "${matchedString}"`
        );
      } else if (forceDebug) {
        const partyType = isFirstParty ? 'first-party' : 'third-party';
        console.log(`[debug][curl] ${requestUrl} (${partyType}) matched regex but no searchstring found`);
      }
    } catch (err) {
      if (forceDebug) {
        console.log(`[debug][curl] Failed to download content for ${requestUrl}: ${err.message}`);
      }
    }
  };
}
356
+
/**
 * Creates a response handler function for the given configuration.
 *
 * The returned handler inspects one response at a time:
 *   1. ignores responses whose URL matches none of `regexes`;
 *   2. applies the site's firstParty / thirdParty filters (by hostname equality with
 *      `currentUrl`);
 *   3. skips content types rejected by shouldAnalyzeContentType;
 *   4. reads the body and, if it satisfies the search strings, records the response's
 *      domain (no resource type is passed) and logs the match.
 * URL-parsing and body-reading failures are reported through debug logging when
 * `forceDebug` is set and never reject the handler's promise.
 *
 * @param {object} config - Configuration object containing all necessary parameters
 *   (search terms, regexes, matched-domain collection/helper, domain helpers, site
 *   config and logging options)
 * @returns {Function} Response handler function for page.on('response', handler)
 */
function createResponseHandler(config) {
  const {
    searchStrings,
    searchStringsAnd,
    regexes,
    matchedDomains,
    addMatchedDomain, // Helper function for adding domains
    currentUrl,
    perSiteSubDomains,
    ignoreDomains,
    matchesIgnoreDomain,
    getRootDomain,
    siteConfig,
    dumpUrls,
    matchedUrlsLogFile,
    forceDebug
  } = config;

  return async function responseHandler(response) {
    const respUrl = response.url();

    // Only process responses that match our regex patterns.
    // Resetting lastIndex keeps /g and /y regexes from carrying state between calls.
    const matchesRegex = regexes.some((re) => {
      re.lastIndex = 0;
      return re.test(respUrl);
    });
    if (!matchesRegex) return;

    // Parse URLs defensively: an unparsable URL must not reject the handler's promise.
    let responseHostname;
    let currentUrlHostname;
    let respDomain;
    try {
      responseHostname = new URL(respUrl).hostname;
      currentUrlHostname = new URL(currentUrl).hostname;
      respDomain = perSiteSubDomains ? responseHostname : getRootDomain(respUrl);
    } catch (urlErr) {
      if (forceDebug) {
        console.log(`[debug] Skipping unparsable response URL ${respUrl}: ${urlErr.message}`);
      }
      return;
    }

    // Check if this is a first-party response (same domain as the URL being scanned)
    const isFirstParty = currentUrlHostname === responseHostname;
    const partyType = isFirstParty ? 'first-party' : 'third-party';

    // Apply first-party/third-party filtering for searchstring analysis. The same
    // rules are applied here because this handler analyzes content, not just URLs.
    if (isFirstParty && siteConfig.firstParty === false) {
      if (forceDebug) {
        console.log(`[debug] Skipping first-party response for searchstring analysis (firstParty=false): ${respUrl}`);
      }
      return;
    }

    if (!isFirstParty && siteConfig.thirdParty === false) {
      if (forceDebug) {
        console.log(`[debug] Skipping third-party response for searchstring analysis (thirdParty=false): ${respUrl}`);
      }
      return;
    }

    try {
      // Only capture appropriate content types to avoid binary data
      const contentType = response.headers()['content-type'] || '';
      if (!shouldAnalyzeContentType(contentType)) {
        if (forceDebug) {
          console.log(`[debug] Skipping content analysis for ${respUrl} (content-type: ${contentType})`);
        }
        return;
      }

      const content = await response.text();

      // Check if content contains search strings (OR or AND logic)
      const { found, matchedString, logicType } = searchContent(content, searchStrings, searchStringsAnd, contentType);

      if (!found) {
        if (forceDebug) {
          console.log(`[debug] ${respUrl} (${partyType}) matched regex but no searchstring found`);
        }
        return;
      }

      if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
        return;
      }

      // Response handler doesn't have access to specific resource type
      addDomainToCollection(matchedDomains, addMatchedDomain, respDomain, null);
      const simplifiedUrl = getRootDomain(currentUrl);

      if (siteConfig.verbose === 1) {
        console.log(`[match][${simplifiedUrl}] ${respUrl} (${partyType}) contains searchstring (${logicType}): "${matchedString}"`);
      }

      if (dumpUrls) {
        const timestamp = new Date().toISOString();
        try {
          fs.appendFileSync(matchedUrlsLogFile,
            `${timestamp} [match][${simplifiedUrl}] ${respUrl} (${partyType}, searchstring (${logicType}): "${matchedString}")\n`);
        } catch (logErr) {
          // Logging is best-effort: a failed write must not discard the match.
          console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
        }
      }
    } catch (err) {
      if (forceDebug) {
        console.log(`[debug] Failed to read response content for ${respUrl}: ${err.message}`);
      }
    }
  };
}
467
+
/**
 * Checks one search option value (a string or an array of strings).
 * @param {any} value - The configured value; must not be null/undefined
 * @param {string} label - Option name used in error messages
 * @returns {string|null} Error message, or null when the value is acceptable
 */
function findSearchValueError(value, label) {
  if (typeof value === 'string') {
    return value.length === 0 ? `${label} cannot be empty string` : null;
  }

  if (!Array.isArray(value)) {
    return `${label} must be string or array of strings`;
  }

  if (value.length === 0) {
    return `${label} array cannot be empty`;
  }

  for (let i = 0; i < value.length; i++) {
    if (typeof value[i] !== 'string') {
      return `${label}[${i}] must be a string`;
    }
    if (value[i].length === 0) {
      return `${label}[${i}] cannot be empty string`;
    }
  }

  return null;
}

/**
 * Validates searchstring configuration.
 *
 * Both options are optional (undefined/null means "not configured"). Each configured
 * option must be a non-empty string or a non-empty array of non-empty strings, and the
 * two options are mutually exclusive. `searchstring` is checked first, then
 * `searchstring_and`, then the mutual-exclusion rule; the first problem found is reported.
 *
 * @param {any} searchstring - The searchstring value to validate
 * @param {any} searchstringAnd - The searchstring_and value to validate
 * @returns {{isValid: boolean, error: (string|null)}} Validation result with isValid
 *   boolean and error message (null when valid)
 */
function validateSearchString(searchstring, searchstringAnd) {
  const hasOr = searchstring !== undefined && searchstring !== null;
  const hasAnd = searchstringAnd !== undefined && searchstringAnd !== null;

  if (hasOr) {
    const orError = findSearchValueError(searchstring, 'searchstring');
    if (orError) {
      return { isValid: false, error: orError };
    }
  }

  // Validate searchstring_and regardless of whether searchstring was supplied.
  if (hasAnd) {
    const andError = findSearchValueError(searchstringAnd, 'searchstring_and');
    if (andError) {
      return { isValid: false, error: andError };
    }
  }

  // Check that both searchstring and searchstring_and aren't defined simultaneously
  if (hasOr && hasAnd) {
    return { isValid: false, error: 'Cannot use both searchstring (OR) and searchstring_and (AND) simultaneously. Choose one logic type.' };
  }

  return { isValid: true, error: null };
}
534
+
/**
 * Gets statistics about search string matches.
 *
 * Set and Map both expose `.size`, so the two collection types are handled the same
 * way. Missing or unexpected inputs produce zero counts instead of throwing.
 *
 * @param {Set|Map} matchedDomains - Set or Map of matched domains
 * @param {Array<string>} searchStrings - Array of search strings used
 * @returns {{totalMatches: number, searchStringCount: number, searchStrings: string[]}}
 *   Statistics object; `searchStrings` is a copy, so callers may mutate it freely
 */
function getSearchStats(matchedDomains, searchStrings) {
  const totalMatches = matchedDomains && typeof matchedDomains.size === 'number'
    ? matchedDomains.size
    : 0;
  const terms = Array.isArray(searchStrings) ? searchStrings : [];

  return {
    totalMatches,
    searchStringCount: terms.length,
    searchStrings: [...terms]
  };
}
550
+
551
+ module.exports = {
552
+ parseSearchStrings,
553
+ searchContent,
554
+ shouldAnalyzeContentType,
555
+ createResponseHandler,
556
+ createCurlHandler,
557
+ downloadWithCurl,
558
+ validateSearchString,
559
+ getSearchStats,
560
+ addDomainToCollection
561
+ };