@fanboynz/network-scanner 1.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +33 -0
- package/JSONMANUAL.md +121 -0
- package/LICENSE +674 -0
- package/README.md +357 -0
- package/config.json +74 -0
- package/lib/browserexit.js +522 -0
- package/lib/browserhealth.js +308 -0
- package/lib/cloudflare.js +660 -0
- package/lib/colorize.js +168 -0
- package/lib/compare.js +159 -0
- package/lib/compress.js +129 -0
- package/lib/fingerprint.js +613 -0
- package/lib/flowproxy.js +274 -0
- package/lib/grep.js +348 -0
- package/lib/ignore_similar.js +237 -0
- package/lib/nettools.js +1200 -0
- package/lib/output.js +633 -0
- package/lib/redirect.js +384 -0
- package/lib/searchstring.js +561 -0
- package/lib/validate_rules.js +1107 -0
- package/nwss.1 +824 -0
- package/nwss.js +2488 -0
- package/package.json +45 -0
- package/regex-samples.md +27 -0
- package/scanner-script-org.js +588 -0
|
@@ -0,0 +1,561 @@
|
|
|
1
|
+
// === searchstring.js - Content Search Module ===
|
|
2
|
+
// Handles response content analysis for searchstring functionality
|
|
3
|
+
|
|
4
|
+
const fs = require('fs');
|
|
5
|
+
const { spawnSync } = require('child_process');
|
|
6
|
+
|
|
7
|
+
/**
 * Normalizes the two searchstring settings into arrays plus presence flags.
 *
 * Each setting may be a single string, an array of strings, or absent. Entries
 * that are empty or whitespace-only are dropped so they cannot match every
 * document.
 *
 * @param {string|Array<string>|undefined} searchstring - The searchstring config value (OR logic)
 * @param {string|Array<string>|undefined} searchstringAnd - The searchstring_and config value (AND logic)
 * @returns {object} Object with searchStrings array, searchStringsAnd array, hasSearchString boolean, and hasSearchStringAnd boolean
 */
function parseSearchStrings(searchstring, searchstringAnd) {
  // Wraps a scalar into a one-element list, maps a falsy value to an empty
  // list, then discards blank entries.
  const toCleanList = (value) => {
    let list;
    if (Array.isArray(value)) {
      list = value;
    } else if (value) {
      list = [value];
    } else {
      list = [];
    }
    return list.filter((entry) => entry && entry.trim().length > 0);
  };

  const searchStrings = toCleanList(searchstring);
  const searchStringsAnd = toCleanList(searchstringAnd);

  return {
    searchStrings,
    searchStringsAnd,
    hasSearchString: searchStrings.length > 0,
    hasSearchStringAnd: searchStringsAnd.length > 0
  };
}
|
|
40
|
+
|
|
41
|
+
/**
 * Records a matched domain, either through a caller-supplied callback or by
 * writing into the collection directly.
 *
 * When the collection is a Map, each domain maps to a Set of resource types;
 * the Set is created on first sight of the domain and the resource type is
 * only added when one is given. A collection that is neither a Set nor a Map
 * is left untouched and a warning is printed.
 *
 * @param {Set|Map} matchedDomains - The matched domains collection
 * @param {Function} addMatchedDomain - Optional callback; when it is a function it is used instead of the collection
 * @param {string} domain - Domain to add
 * @param {string} resourceType - Resource type (for --adblock-rules mode)
 */
function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourceType = null) {
  // The callback, when present, takes over completely.
  if (typeof addMatchedDomain === 'function') {
    addMatchedDomain(domain, resourceType);
    return;
  }

  if (matchedDomains instanceof Set) {
    matchedDomains.add(domain);
    return;
  }

  if (!(matchedDomains instanceof Map)) {
    console.warn('[warn] Unknown matchedDomains type, skipping domain addition');
    return;
  }

  if (!matchedDomains.has(domain)) {
    matchedDomains.set(domain, new Set());
  }
  if (!resourceType) {
    return;
  }
  matchedDomains.get(domain).add(resourceType);
}
|
|
69
|
+
|
|
70
|
+
/**
 * Downloads a URL by running the `curl` executable with browser-like headers.
 *
 * curl is started with spawnSync, so the call blocks until curl exits or the
 * timeout elapses; the function is async only so callers get a Promise.
 *
 * Any non-zero curl exit status rejects. That includes HTTP error responses:
 * with --fail-with-body curl still writes the body to stdout but exits
 * non-zero, so that body is not returned to the caller.
 *
 * @param {string} url - The URL to download
 * @param {string} userAgent - User agent string to use; omitted from the request when empty
 * @param {number} timeout - Timeout in seconds (default: 30), applied both to curl (--max-time) and to the child process
 * @returns {Promise<string>} The downloaded content (curl's stdout decoded as UTF-8)
 * @throws {Error} Rejects with `Curl failed for <url>: <reason>` when curl cannot be started, times out, or exits non-zero
 */
async function downloadWithCurl(url, userAgent = '', timeout = 30) {
  try {
    const curlArgs = [
      '-s',                             // Silent mode (no progress meter)
      '-S',                             // ...but still print the error message, so stderr is useful on failure
      '-L',                             // Follow redirects
      '--max-time', timeout.toString(),
      '--max-redirs', '5',
      '--fail-with-body',               // Non-zero exit on HTTP errors (body is still written to stdout)
      '--compressed'                    // Accept compressed responses
    ];

    if (userAgent) {
      curlArgs.push('-H', `User-Agent: ${userAgent}`);
    }

    // Add common headers to appear more browser-like
    curlArgs.push(
      '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      '-H', 'Accept-Language: en-US,en;q=0.5',
      '-H', 'Accept-Encoding: gzip, deflate',
      '-H', 'Connection: keep-alive',
      '-H', 'Upgrade-Insecure-Requests: 1'
    );

    curlArgs.push(url);

    // Arguments are passed as an array (no shell), so the URL and user agent
    // are never interpreted by a shell.
    const result = spawnSync('curl', curlArgs, {
      encoding: 'utf8',
      timeout: timeout * 1000,
      maxBuffer: 10 * 1024 * 1024 // 10MB max buffer
    });

    // Set when curl could not be spawned, timed out, or exceeded maxBuffer.
    if (result.error) {
      throw result.error;
    }

    if (result.status !== 0) {
      const details = String(result.stderr || '').trim();
      throw new Error(`Curl exited with status ${result.status}: ${details}`);
    }

    return result.stdout;
  } catch (error) {
    throw new Error(`Curl failed for ${url}: ${error.message}`);
  }
}
|
|
125
|
+
|
|
126
|
+
/**
 * Checks whether content contains all of the AND search strings or, failing
 * that, any of the OR search strings. Matching is case-insensitive substring
 * matching.
 *
 * When the content type mentions "xml", the search also covers a copy with
 * the five predefined XML entities decoded and a copy with tags removed, so a
 * term can match text that is entity-escaped or split by markup.
 *
 * The AND list is evaluated first; if it is empty or not fully satisfied the
 * result comes from the OR list.
 *
 * @param {string} content - The response content to search
 * @param {Array<string>} searchStrings - Array of strings to search for (OR logic)
 * @param {Array<string>} searchStringsAnd - Array of strings that must all be present (AND logic)
 * @param {string} contentType - Content type for specialized handling
 * @returns {object} Object with found boolean, matchedString (first OR match, or the AND terms joined by ' AND '), matchedStrings array, allMatches array, and logicType ('AND' or 'OR')
 */
function searchContent(content, searchStrings = [], searchStringsAnd = [], contentType = '') {
  const rawContent = typeof content === 'string' ? content : String(content == null ? '' : content);
  const orTerms = Array.isArray(searchStrings) ? searchStrings : [];
  const andTerms = Array.isArray(searchStringsAnd) ? searchStringsAnd : [];
  const normalizedType = typeof contentType === 'string' ? contentType.toLowerCase() : '';

  let searchableContent = rawContent;

  if (normalizedType.includes('xml')) {
    // Decode the predefined XML entities in a single pass. One pass matters:
    // an escaped ampersand followed by "lt;" must decode to the literal text
    // of an entity, not be decoded a second time into "<".
    const entityValues = { lt: '<', gt: '>', quot: '"', apos: "'", amp: '&' };
    const decodedContent = rawContent.replace(
      /&(lt|gt|quot|apos|amp);/g,
      (match, entityName) => entityValues[entityName]
    );

    // Version with XML tags stripped for text content search
    const strippedContent = decodedContent.replace(/<[^>]*>/g, ' ');

    // Search in: original + decoded + stripped content
    searchableContent = `${rawContent}\n${decodedContent}\n${strippedContent}`;
  }

  // Lowercase once and reuse for both the AND and the OR pass.
  const lowerContent = searchableContent.toLowerCase();
  const isPresent = (term) => typeof term === 'string' && lowerContent.includes(term.toLowerCase());

  // AND logic first (more restrictive): every term must be present.
  if (andTerms.length > 0) {
    const foundAndStrings = andTerms.filter(isPresent);
    if (foundAndStrings.length === andTerms.length) {
      return {
        found: true,
        matchedString: foundAndStrings.join(' AND '), // Show all matched strings
        matchedStrings: foundAndStrings,
        allMatches: foundAndStrings,
        logicType: 'AND'
      };
    }
  }

  // OR logic: used when no AND terms were given or they did not all match.
  const allMatches = orTerms.filter(isPresent);

  return {
    found: allMatches.length > 0,
    matchedString: allMatches.length > 0 ? allMatches[0] : null,
    matchedStrings: allMatches,
    allMatches: allMatches,
    logicType: 'OR'
  };
}
|
|
201
|
+
|
|
202
|
+
/**
 * Determines if a content type should be analyzed for search strings.
 *
 * The header value is compared case-insensitively against a list of textual
 * media types; a value is accepted when it contains any of them, so
 * parameters such as "; charset=utf-8" do not interfere.
 *
 * @param {string} contentType - The response content-type header
 * @returns {boolean} True if content should be analyzed; false for empty, missing or non-string values
 */
function shouldAnalyzeContentType(contentType) {
  if (typeof contentType !== 'string' || contentType.length === 0) {
    return false;
  }

  const textTypes = [
    'text/',                   // text/html, text/plain, text/xml, etc.
    'application/json',
    'application/javascript',
    'application/xml',         // Standard XML
    'application/x-javascript',
    'application/soap+xml',    // SOAP XML
    'application/rss+xml',     // RSS feeds
    'application/atom+xml',    // Atom feeds
    'application/xhtml+xml'    // XHTML
  ];

  // Media type names are case-insensitive, so normalize before matching.
  const normalized = contentType.toLowerCase();
  return textTypes.some((type) => normalized.includes(type));
}
|
|
224
|
+
|
|
225
|
+
/**
 * Creates a curl-based URL handler for downloading and optionally searching content.
 *
 * The returned handler ignores URLs that match none of the regexes and URLs
 * excluded by siteConfig.firstParty / siteConfig.thirdParty (a request is
 * first-party when its hostname equals the hostname of currentUrl).
 * Without any search strings a regex match alone records the domain; with
 * search strings the URL is downloaded through downloadWithCurl and recorded
 * only when searchContent reports a hit. Failures inside that stage are
 * swallowed and only logged when forceDebug is set.
 *
 * @param {object} config - Configuration object containing all necessary parameters
 * @returns {Function} URL handler function for curl-based content analysis
 */
function createCurlHandler(config) {
  const {
    searchStrings,
    searchStringsAnd,
    hasSearchString,
    hasSearchStringAnd,
    regexes,
    matchedDomains,
    addMatchedDomain, // Helper function for adding domains
    currentUrl,
    perSiteSubDomains,
    ignoreDomains,
    matchesIgnoreDomain,
    getRootDomain,
    siteConfig,
    dumpUrls,
    matchedUrlsLogFile,
    forceDebug,
    userAgent,
    resourceType // Resource type from request
  } = config;

  return async function curlHandler(requestUrl) {
    // Domain that gets recorded on a match: full hostname or root domain.
    let respDomain;
    if (perSiteSubDomains) {
      respDomain = new URL(requestUrl).hostname;
    } else {
      respDomain = getRootDomain(requestUrl);
    }

    // Only process URLs that match our regex patterns
    const urlIsOfInterest = regexes.some((pattern) => pattern.test(requestUrl));
    if (!urlIsOfInterest) {
      return;
    }

    // First-party means: same hostname as the URL being scanned.
    const pageHost = new URL(currentUrl).hostname;
    const requestHost = new URL(requestUrl).hostname;
    const isFirstParty = pageHost === requestHost;
    const partyType = isFirstParty ? 'first-party' : 'third-party';
    const resourceInfo = resourceType ? ` (${resourceType})` : '';

    // Apply first-party/third-party filtering
    if (isFirstParty) {
      if (siteConfig.firstParty === false) {
        if (forceDebug) {
          console.log(`[debug][curl] Skipping first-party request (firstParty=false): ${requestUrl}`);
        }
        return;
      }
    } else if (siteConfig.thirdParty === false) {
      if (forceDebug) {
        console.log(`[debug][curl] Skipping third-party request (thirdParty=false): ${requestUrl}`);
      }
      return;
    }

    // Shared tail of both match paths: honour the ignore list, store the
    // domain, then emit the console line and/or the log-file line.
    const recordMatch = (consoleDetail, logDetail) => {
      if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
        return;
      }

      addDomainToCollection(matchedDomains, addMatchedDomain, respDomain, resourceType);
      const simplifiedUrl = getRootDomain(currentUrl);

      if (siteConfig.verbose === 1) {
        console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) ${consoleDetail}${resourceInfo}`);
      }

      if (!dumpUrls) {
        return;
      }
      const timestamp = new Date().toISOString();
      try {
        fs.appendFileSync(matchedUrlsLogFile,
          `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl${logDetail})${resourceInfo}\n`);
      } catch (logErr) {
        console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
      }
    };

    try {
      if (forceDebug) {
        console.log(`[debug][curl] Downloading content from: ${requestUrl}`);
      }

      // If NO searchstring is defined, match immediately (like browser behavior)
      const needsContentSearch = hasSearchString || hasSearchStringAnd;
      if (!needsContentSearch) {
        recordMatch('matched regex', '');
        return;
      }

      // If searchstring IS defined, download and search content
      const content = await downloadWithCurl(requestUrl, userAgent, 30);
      const { found, matchedString, logicType } = searchContent(content, searchStrings, searchStringsAnd, '');

      if (found) {
        const detail = `searchstring (${logicType}): "${matchedString}"`;
        recordMatch(`contains ${detail}`, `, ${detail}`);
      } else if (forceDebug) {
        console.log(`[debug][curl] ${requestUrl} (${partyType}) matched regex but no searchstring found`);
      }
    } catch (err) {
      if (forceDebug) {
        console.log(`[debug][curl] Failed to download content for ${requestUrl}: ${err.message}`);
      }
    }
  };
}
|
|
356
|
+
|
|
357
|
+
/**
 * Creates a response handler function for the given configuration.
 *
 * The returned handler ignores responses whose URL matches none of the
 * regexes and responses excluded by siteConfig.firstParty /
 * siteConfig.thirdParty (first-party means the response hostname equals the
 * hostname of currentUrl). For the rest it reads the body of analyzable
 * content types, runs searchContent on it and records the domain on a hit.
 * Failures while reading or searching are swallowed and only logged when
 * forceDebug is set.
 *
 * @param {object} config - Configuration object containing all necessary parameters
 * @returns {Function} Response handler function for page.on('response', handler)
 */
function createResponseHandler(config) {
  const {
    searchStrings,
    searchStringsAnd,
    regexes,
    matchedDomains,
    addMatchedDomain, // Helper function for adding domains
    currentUrl,
    perSiteSubDomains,
    ignoreDomains,
    matchesIgnoreDomain,
    getRootDomain,
    siteConfig,
    dumpUrls,
    matchedUrlsLogFile,
    forceDebug
  } = config;

  return async function responseHandler(response) {
    const respUrl = response.url();

    // Domain that gets recorded on a match: full hostname or root domain.
    let respDomain;
    if (perSiteSubDomains) {
      respDomain = new URL(respUrl).hostname;
    } else {
      respDomain = getRootDomain(respUrl);
    }

    // Only process responses that match our regex patterns
    const urlIsOfInterest = regexes.some((pattern) => pattern.test(respUrl));
    if (!urlIsOfInterest) {
      return;
    }

    // First-party means: same hostname as the URL being scanned.
    const pageHost = new URL(currentUrl).hostname;
    const responseHost = new URL(respUrl).hostname;
    const isFirstParty = pageHost === responseHost;
    const partyType = isFirstParty ? 'first-party' : 'third-party';

    // The party filter is applied here as well because this handler inspects
    // response content, not just URLs.
    if (isFirstParty) {
      if (siteConfig.firstParty === false) {
        if (forceDebug) {
          console.log(`[debug] Skipping first-party response for searchstring analysis (firstParty=false): ${respUrl}`);
        }
        return;
      }
    } else if (siteConfig.thirdParty === false) {
      if (forceDebug) {
        console.log(`[debug] Skipping third-party response for searchstring analysis (thirdParty=false): ${respUrl}`);
      }
      return;
    }

    try {
      // Only capture appropriate content types to avoid binary data
      const contentType = response.headers()['content-type'] || '';
      if (!shouldAnalyzeContentType(contentType)) {
        if (forceDebug) {
          console.log(`[debug] Skipping content analysis for ${respUrl} (content-type: ${contentType})`);
        }
        return;
      }

      const content = await response.text();

      // Check if content contains search strings (OR or AND logic)
      const { found, matchedString, logicType } = searchContent(content, searchStrings, searchStringsAnd, contentType);

      if (!found) {
        if (forceDebug) {
          console.log(`[debug] ${respUrl} (${partyType}) matched regex but no searchstring found`);
        }
        return;
      }

      if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
        return;
      }

      // No resource type is passed along for response matches.
      addDomainToCollection(matchedDomains, addMatchedDomain, respDomain, null);
      const simplifiedUrl = getRootDomain(currentUrl);

      if (siteConfig.verbose === 1) {
        console.log(`[match][${simplifiedUrl}] ${respUrl} (${partyType}) contains searchstring (${logicType}): "${matchedString}"`);
      }

      if (!dumpUrls) {
        return;
      }
      const timestamp = new Date().toISOString();
      try {
        fs.appendFileSync(matchedUrlsLogFile,
          `${timestamp} [match][${simplifiedUrl}] ${respUrl} (${partyType}, searchstring (${logicType}): "${matchedString}")\n`);
      } catch (logErr) {
        console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
      }
    } catch (err) {
      if (forceDebug) {
        console.log(`[debug] Failed to read response content for ${respUrl}: ${err.message}`);
      }
    }
  };
}
|
|
467
|
+
|
|
468
|
+
/**
 * Checks one search setting and describes the first problem found.
 *
 * @param {any} value - The configured value (string, array of strings, null or undefined)
 * @param {string} label - Setting name used as the prefix of the error message
 * @returns {string|null} Error message, or null when the value is absent or well-formed
 */
function findSearchValueError(value, label) {
  if (value === undefined || value === null) {
    return null;
  }

  if (typeof value === 'string') {
    return value.length === 0 ? `${label} cannot be empty string` : null;
  }

  if (!Array.isArray(value)) {
    return `${label} must be string or array of strings`;
  }

  if (value.length === 0) {
    return `${label} array cannot be empty`;
  }

  for (let i = 0; i < value.length; i++) {
    if (typeof value[i] !== 'string') {
      return `${label}[${i}] must be a string`;
    }
    if (value[i].length === 0) {
      return `${label}[${i}] cannot be empty string`;
    }
  }

  return null;
}

/**
 * Validates searchstring configuration.
 *
 * Both settings are optional. Each one, when present, must be a non-empty
 * string or a non-empty array of non-empty strings, and only one of the two
 * may be set. searchstring is checked first, then searchstring_and, then the
 * mutual-exclusion rule, so the first problem in that order is reported.
 *
 * @param {any} searchstring - The searchstring value to validate
 * @param {any} searchstringAnd - The searchstring_and value to validate
 * @returns {object} Validation result with isValid boolean and error message (null when valid)
 */
function validateSearchString(searchstring, searchstringAnd) {
  const searchstringError = findSearchValueError(searchstring, 'searchstring');
  if (searchstringError) {
    return { isValid: false, error: searchstringError };
  }

  const searchstringAndError = findSearchValueError(searchstringAnd, 'searchstring_and');
  if (searchstringAndError) {
    return { isValid: false, error: searchstringAndError };
  }

  // Both values passed validation, so "present" here means a usable setting.
  const hasOr = searchstring !== undefined && searchstring !== null;
  const hasAnd = searchstringAnd !== undefined && searchstringAnd !== null;
  if (hasOr && hasAnd) {
    return { isValid: false, error: 'Cannot use both searchstring (OR) and searchstring_and (AND) simultaneously. Choose one logic type.' };
  }

  return { isValid: true, error: null };
}
|
|
534
|
+
|
|
535
|
+
/**
 * Gets statistics about search string matches.
 *
 * Set and Map both expose `size`, so the collection type does not need to be
 * distinguished. A missing collection counts as zero matches and a missing
 * search string list as empty.
 *
 * @param {Set|Map} matchedDomains - Set or Map of matched domains
 * @param {Array<string>} searchStrings - Array of search strings used
 * @returns {object} Statistics object with totalMatches, searchStringCount and a copy of searchStrings
 */
function getSearchStats(matchedDomains, searchStrings) {
  const totalMatches = matchedDomains && typeof matchedDomains.size === 'number'
    ? matchedDomains.size
    : 0;

  // Copy so callers cannot mutate the configured list through the result.
  const usedStrings = searchStrings == null ? [] : [...searchStrings];

  return {
    totalMatches,
    searchStringCount: usedStrings.length,
    searchStrings: usedStrings
  };
}
|
|
550
|
+
|
|
551
|
+
module.exports = {
|
|
552
|
+
parseSearchStrings,
|
|
553
|
+
searchContent,
|
|
554
|
+
shouldAnalyzeContentType,
|
|
555
|
+
createResponseHandler,
|
|
556
|
+
createCurlHandler,
|
|
557
|
+
downloadWithCurl,
|
|
558
|
+
validateSearchString,
|
|
559
|
+
getSearchStats,
|
|
560
|
+
addDomainToCollection
|
|
561
|
+
};
|