@fanboynz/network-scanner 2.0.66 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,14 +2,19 @@
2
2
  // Handles response content analysis for searchstring functionality
3
3
 
4
4
  const fs = require('fs');
5
- const { spawnSync } = require('child_process');
5
+ const { formatLogMessage, messageColors } = require('./colorize');
6
+ const CURL_TAG = messageColors.processing('[curl]');
7
+ // responseHandler is a separate code path (Puppeteer response listener,
8
+ // not curl) — its debug output gets its own subsystem prefix so it's
9
+ // distinguishable from curl-handler logs.
10
+ const SEARCHSTRING_TAG = messageColors.processing('[searchstring]');
11
+ const { runProcess } = require('./spawn-async');
6
12
  const { grepContent } = require('./grep');
7
13
 
8
14
  // Configuration constants for search logic
9
15
  const SEARCH_CONFIG = {
10
16
  MAX_CONTENT_SIZE: 50 * 1024 * 1024, // 50MB max content size
11
- MAX_SEARCH_STRING_LENGTH: 1000,
12
- XML_ENTITY_TIMEOUT: 5000 // 5 second timeout for XML processing
17
+ MAX_SEARCH_STRING_LENGTH: 1000
13
18
  };
14
19
 
15
20
  /**
@@ -46,36 +51,6 @@ function parseSearchStrings(searchstring, searchstringAnd) {
46
51
  };
47
52
  }
48
53
 
49
- /**
50
- * Helper function to add domain to matched collection (handles both Set and Map)
51
- * @param {Set|Map} matchedDomains - The matched domains collection
52
- * @param {Function} addMatchedDomain - Optional helper function for adding domains
53
- * @param {string} domain - Domain to add
54
- * @param {string} resourceType - Resource type (for --adblock-rules mode)
55
- * @param {string} fullSubdomain - Full subdomain for cache tracking (optional)
56
- */
57
- function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourceType = null, fullSubdomain = null) {
58
- // Use helper function if provided (preferred method)
59
- if (typeof addMatchedDomain === 'function') {
60
- addMatchedDomain(domain, resourceType, fullSubdomain);
61
- return;
62
- }
63
-
64
- // Fallback: handle different collection types directly
65
- if (matchedDomains instanceof Set) {
66
- matchedDomains.add(domain);
67
- } else if (matchedDomains instanceof Map) {
68
- if (!matchedDomains.has(domain)) {
69
- matchedDomains.set(domain, new Set());
70
- }
71
- if (resourceType) {
72
- matchedDomains.get(domain).add(resourceType);
73
- }
74
- } else {
75
- console.warn('[warn] Unknown matchedDomains type, skipping domain addition');
76
- }
77
- }
78
-
79
54
  /**
80
55
  * Downloads content using curl with appropriate headers and timeout
81
56
  * @param {string} url - The URL to download
@@ -84,55 +59,42 @@ function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourc
84
59
  * @returns {Promise<string>} The downloaded content
85
60
  */
86
61
  async function downloadWithCurl(url, userAgent = '', timeout = 30) {
87
- return new Promise((resolve, reject) => {
88
- try {
89
- const curlArgs = [
90
- '-s', // Silent mode
91
- '-L', // Follow redirects
92
- '--max-time', timeout.toString(),
93
- '--max-redirs', '5',
94
- '--fail-with-body', // Return body even on HTTP errors
95
- '--max-filesize', '52428800', // 50MB limit
96
- '--range', '0-52428799', // Limit download size
97
- '--compressed', // Accept compressed responses
98
- ];
62
+ const MAX_STDOUT_BYTES = 52428800; // 50MB, matches --max-filesize below
99
63
 
100
- if (userAgent) {
101
- curlArgs.push('-H', `User-Agent: ${userAgent}`);
102
- }
64
+ const curlArgs = [
65
+ '-s',
66
+ '-L',
67
+ '--max-time', timeout.toString(),
68
+ '--max-redirs', '5',
69
+ '--fail-with-body',
70
+ '--max-filesize', '52428800',
71
+ '--range', '0-52428799',
72
+ '--compressed'
73
+ ];
74
+ if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);
75
+ curlArgs.push(
76
+ '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
77
+ '-H', 'Accept-Language: en-US,en;q=0.5',
78
+ '-H', 'Accept-Encoding: gzip, deflate',
79
+ '-H', 'Connection: keep-alive',
80
+ '-H', 'Upgrade-Insecure-Requests: 1'
81
+ );
82
+ curlArgs.push(url);
103
83
 
104
- // Add common headers to appear more browser-like
105
- curlArgs.push(
106
- '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
107
- '-H', 'Accept-Language: en-US,en;q=0.5',
108
- '-H', 'Accept-Encoding: gzip, deflate',
109
- '-H', 'Connection: keep-alive',
110
- '-H', 'Upgrade-Insecure-Requests: 1'
111
- );
112
-
113
- curlArgs.push(url);
114
-
115
- // Use spawnSync with proper argument separation
116
- const result = spawnSync('curl', curlArgs, {
117
- encoding: 'utf8',
118
- timeout: timeout * 1000,
119
- maxBuffer: 10 * 1024 * 1024, // 10MB max buffer
120
- killSignal: 'SIGTERM'
121
- });
122
-
123
- if (result.error) {
124
- throw result.error;
125
- }
126
-
127
- if (result.status !== 0) {
128
- throw new Error(`Curl exited with status ${result.status}: ${result.stderr}`);
129
- }
130
-
131
- resolve(result.stdout);
132
- } catch (error) {
133
- reject(new Error(`Curl failed for ${url}: ${error.message}`));
134
- }
84
+ // Shared async-spawn helper — same streaming/cap/timeout/kill plumbing
85
+ // that used to be ~80 lines of inline boilerplate here.
86
+ const result = await runProcess('curl', curlArgs, {
87
+ timeout: timeout * 1000,
88
+ maxStdout: MAX_STDOUT_BYTES
135
89
  });
90
+
91
+ if (result.error) throw new Error(`Curl failed for ${url}: ${result.error}`);
92
+ if (result.truncated) throw new Error(`Curl output exceeded ${MAX_STDOUT_BYTES} bytes for ${url}`);
93
+ if (result.signal) throw new Error(`Curl killed by signal ${result.signal} for ${url}`);
94
+ if (result.code !== 0) {
95
+ throw new Error(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`);
96
+ }
97
+ return result.stdout.toString('utf8');
136
98
  }
137
99
 
138
100
  /**
@@ -166,59 +128,48 @@ async function downloadWithRetry(url, userAgent = '', timeout = 30, retries = 2)
166
128
  }
167
129
  }
168
130
 
131
+ // Lookup table for the 6 named entities the previous chained-replace
132
+ // handled. Hoisted out of safeDecodeXmlEntities so the object isn't
133
+ // reallocated per call.
134
+ const NAMED_ENTITIES = Object.freeze({
135
+ '&lt;': '<', '&gt;': '>', '&amp;': '&',
136
+ '&quot;': '"', '&apos;': "'", '&#39;': "'"
137
+ });
138
+
169
139
  /**
170
- * Safely decodes XML entities with timeout protection
140
+ * Safely decodes XML entities (named + numeric decimal + numeric hex)
141
+ * in a SINGLE regex pass. The old implementation chained 8 separate
142
+ * .replace() calls, each allocating a full intermediate string — for
143
+ * 50MB content that was ~8 × 50MB ≈ 400MB of throwaway allocations per
144
+ * XML response. Also drops the previous "timeout" check, which only
145
+ * fired between regex passes (not during them) so it never actually
146
+ * bounded runtime on pathological input.
171
147
  * @param {string} content - Content to decode
172
148
  * @returns {string} Decoded content or original if processing fails
173
149
  */
174
150
  function safeDecodeXmlEntities(content) {
175
- const startTime = Date.now();
176
-
177
151
  try {
178
- let decoded = content
179
- .replace(/&lt;/g, '<')
180
- .replace(/&gt;/g, '>')
181
- .replace(/&amp;/g, '&')
182
- .replace(/&quot;/g, '"')
183
- .replace(/&#39;/g, "'")
184
- .replace(/&apos;/g, "'");
185
-
186
- // Check timeout before expensive regex operations
187
- if (Date.now() - startTime > SEARCH_CONFIG.XML_ENTITY_TIMEOUT) {
188
- console.warn('[warn] XML entity decoding timeout, using partial result');
189
- return decoded;
190
- }
191
-
192
- // Decode numeric entities (decimal)
193
- decoded = decoded.replace(/&#(\d+);/g, (match, dec) => {
194
- const num = parseInt(dec, 10);
195
- // Validate range for safety (valid Unicode range)
196
- if (num >= 0 && num <= 0x10FFFF) {
197
- return String.fromCharCode(num);
152
+ return content.replace(
153
+ /&lt;|&gt;|&amp;|&quot;|&apos;|&#39;|&#\d+;|&#x[0-9a-fA-F]+;/g,
154
+ (match) => {
155
+ // Named entity — exact match in the lookup table.
156
+ const named = NAMED_ENTITIES[match];
157
+ if (named) return named;
158
+ // Numeric entity — &#xNN; (hex) or &#NN; (decimal).
159
+ const isHex = match[2] === 'x' || match[2] === 'X';
160
+ const numStr = isHex ? match.slice(3, -1) : match.slice(2, -1);
161
+ const num = parseInt(numStr, isHex ? 16 : 10);
162
+ // String.fromCodePoint (NOT fromCharCode) — fromCharCode truncates
163
+ // to 16 bits, so &#128512; (😀, codepoint 0x1F600) would decode to
164
+ // '\uF600' (a single garbage BMP char) instead of the emoji.
165
+ // fromCodePoint handles the full Unicode range up to 0x10FFFF.
166
+ if (num >= 0 && num <= 0x10FFFF) return String.fromCodePoint(num);
167
+ return match; // out-of-range — keep original
198
168
  }
199
- return match; // Keep original if invalid
200
- });
201
-
202
- // Check timeout again
203
- if (Date.now() - startTime > SEARCH_CONFIG.XML_ENTITY_TIMEOUT) {
204
- console.warn('[warn] XML entity decoding timeout, using partial result');
205
- return decoded;
206
- }
207
-
208
- // Decode numeric entities (hexadecimal)
209
- decoded = decoded.replace(/&#x([0-9a-f]+);/gi, (match, hex) => {
210
- const num = parseInt(hex, 16);
211
- // Validate range for safety (valid Unicode range)
212
- if (num >= 0 && num <= 0x10FFFF) {
213
- return String.fromCharCode(num);
214
- }
215
- return match; // Keep original if invalid
216
- });
217
-
218
- return decoded;
169
+ );
219
170
  } catch (xmlErr) {
220
- console.warn(`[warn] XML entity decoding failed: ${xmlErr.message}`);
221
- return content; // Return original content if decoding fails
171
+ console.warn(formatLogMessage('warn', `XML entity decoding failed: ${xmlErr.message}`));
172
+ return content;
222
173
  }
223
174
  }
224
175
 
@@ -229,15 +180,12 @@ function safeDecodeXmlEntities(content) {
229
180
  */
230
181
  function safeStripTags(content) {
231
182
  try {
232
- // Limit content size for tag stripping to prevent excessive memory usage
233
- const limitedContent = content.length > SEARCH_CONFIG.MAX_CONTENT_SIZE
234
- ? content.substring(0, SEARCH_CONFIG.MAX_CONTENT_SIZE)
235
- : content;
236
-
237
- // Replace tags with spaces to preserve word boundaries
238
- return limitedContent.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ');
183
+ // No content-size cap here — searchContent already truncated to
184
+ // MAX_CONTENT_SIZE before calling, so the previous cap was a no-op.
185
+ // Replace tags with spaces to preserve word boundaries.
186
+ return content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ');
239
187
  } catch (stripErr) {
240
- console.warn(`[warn] Tag stripping failed: ${stripErr.message}`);
188
+ console.warn(formatLogMessage('warn', `Tag stripping failed: ${stripErr.message}`));
241
189
  return content;
242
190
  }
243
191
  }
@@ -251,134 +199,110 @@ function safeStripTags(content) {
251
199
  * @param {Array<string>} searchStringsAnd - Array of strings that must all be present (AND logic)
252
200
  * @param {string} contentType - Content type for specialized handling
253
201
  * @param {string} url - URL for debugging context (optional)
254
- * @returns {object} Object with found boolean, matchedString/matchedStrings, allMatches array, and logic type
202
+ * @returns {{found: boolean, matchedString: string|null, logicType: 'AND'|'OR'|'NONE', error?: string}}
255
203
  */
256
204
  function searchContent(content, searchStrings, searchStringsAnd = [], contentType = '', url = '') {
257
- // Input validation
205
+ // Input validation. Return shape carries only what callers actually
206
+ // destructure ({found, matchedString, logicType, error}); the old
207
+ // matchedStrings/allMatches/contentSize/searchableSize/processedAsXml
208
+ // fields were computed and returned but never read by any caller.
258
209
  if (!content || typeof content !== 'string') {
259
- return {
260
- found: false,
261
- matchedString: null,
262
- matchedStrings: [],
263
- allMatches: [],
264
- logicType: 'NONE',
265
- error: 'Invalid or empty content'
266
- };
210
+ return { found: false, matchedString: null, logicType: 'NONE', error: 'Invalid or empty content' };
267
211
  }
268
212
 
213
+ // Validate search strings FIRST — before paying for content truncation,
214
+ // XML entity decoding, tag stripping, and 3× lowercase. Previously these
215
+ // ran first, so a config with zero valid search strings still burned
216
+ // ~150MB of allocations on a 50MB XML response before returning empty.
217
+ const validSearchStrings = searchStrings.filter(str =>
218
+ str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH
219
+ );
220
+ const validSearchStringsAnd = searchStringsAnd.filter(str =>
221
+ str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH
222
+ );
223
+
224
+ if (validSearchStrings.length !== searchStrings.length) {
225
+ console.warn(formatLogMessage('warn', `Filtered ${searchStrings.length - validSearchStrings.length} invalid search strings`));
226
+ }
227
+ if (validSearchStringsAnd.length !== searchStringsAnd.length) {
228
+ console.warn(formatLogMessage('warn', `Filtered ${searchStringsAnd.length - validSearchStringsAnd.length} invalid AND search strings`));
229
+ }
230
+
231
+ if (validSearchStrings.length === 0 && validSearchStringsAnd.length === 0) {
232
+ return { found: false, matchedString: null, logicType: 'NONE', error: 'No valid search strings provided' };
233
+ }
234
+
269
235
  // Size check and truncation with warning
270
236
  const originalLength = content.length;
271
237
  if (originalLength > SEARCH_CONFIG.MAX_CONTENT_SIZE) {
272
238
  content = content.substring(0, SEARCH_CONFIG.MAX_CONTENT_SIZE);
273
- console.warn(`[warn] Content truncated from ${originalLength} to ${SEARCH_CONFIG.MAX_CONTENT_SIZE} chars for ${url || 'unknown URL'}`);
239
+ console.warn(formatLogMessage('warn', `Content truncated from ${originalLength} to ${SEARCH_CONFIG.MAX_CONTENT_SIZE} chars for ${url || 'unknown URL'}`));
274
240
  }
275
- let searchableContent = content;
276
241
 
277
- const isXmlContent = contentType.toLowerCase().includes('xml') ||
278
- contentType.toLowerCase().includes('html');
279
-
242
+ // For XML/HTML we search across three views — original, entity-decoded,
243
+ // tag-stripped — so encoded strings ("&amp;") and DOM-text strings
244
+ // ("body text") and raw-source strings (attribute values) all match.
245
+ //
246
+ // The previous implementation joined all three into a single 3× string
247
+ // then .toLowerCase()'d it. For a 50MB response that allocated a 150MB
248
+ // intermediate plus a 150MB lowercase copy. Now we lowercase each
249
+ // version independently and probe with `versionsIncludes()` — same
250
+ // matching semantics (a string found in ANY version still counts) but
251
+ // ~half the peak memory.
252
+ const ct = contentType.toLowerCase();
253
+ const isXmlContent = ct.includes('xml') || ct.includes('html');
254
+
255
+ let lowerVersions;
280
256
  if (isXmlContent) {
281
257
  try {
282
- // Safely decode XML entities
283
258
  const decodedContent = safeDecodeXmlEntities(content);
284
-
285
- // Safely strip tags to extract text content
286
259
  const strippedContent = safeStripTags(decodedContent);
287
-
288
- // Search in: original + decoded + stripped content
289
- // Use newlines as separators to prevent false matches across content types
290
- searchableContent = [content, decodedContent, strippedContent].join('\n');
291
-
260
+ lowerVersions = [
261
+ content.toLowerCase(),
262
+ decodedContent.toLowerCase(),
263
+ strippedContent.toLowerCase()
264
+ ];
292
265
  } catch (xmlProcessingErr) {
293
- console.warn(`[warn] XML processing failed for ${url || 'unknown URL'}: ${xmlProcessingErr.message}`);
294
- // Fall back to original content
295
- searchableContent = content;
266
+ console.warn(formatLogMessage('warn', `XML processing failed for ${url || 'unknown URL'}: ${xmlProcessingErr.message}`));
267
+ lowerVersions = [content.toLowerCase()];
296
268
  }
269
+ } else {
270
+ lowerVersions = [content.toLowerCase()];
297
271
  }
272
+
273
+ const versionsIncludes = (needleLower) => {
274
+ for (let i = 0; i < lowerVersions.length; i++) {
275
+ if (lowerVersions[i].includes(needleLower)) return true;
276
+ }
277
+ return false;
278
+ };
298
279
 
299
- // Input validation for search strings
300
- const validSearchStrings = searchStrings.filter(str =>
301
- str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH
302
- );
303
- const validSearchStringsAnd = searchStringsAnd.filter(str =>
304
- str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH
305
- );
306
-
307
- // Warn about filtered search strings
308
- if (validSearchStrings.length !== searchStrings.length) {
309
- console.warn(`[warn] Filtered ${searchStrings.length - validSearchStrings.length} invalid search strings`);
310
- }
311
- if (validSearchStringsAnd.length !== searchStringsAnd.length) {
312
- console.warn(`[warn] Filtered ${searchStringsAnd.length - validSearchStringsAnd.length} invalid AND search strings`);
313
- }
314
-
315
- // Early return if no valid search strings
316
- if (validSearchStrings.length === 0 && validSearchStringsAnd.length === 0) {
317
- return {
318
- found: false,
319
- matchedString: null,
320
- matchedStrings: [],
321
- allMatches: [],
322
- logicType: 'NONE',
323
- error: 'No valid search strings provided'
324
- };
325
- }
326
-
327
- // Pre-compute lowercase content once for better performance
328
- const lowerContent = searchableContent.toLowerCase();
329
-
330
- // Check AND logic first (more restrictive) - ALL strings must be present
331
- if (validSearchStringsAnd && validSearchStringsAnd.length > 0) {
332
- const foundAndStrings = [];
333
-
280
+ // Check AND logic first (more restrictive) — ALL strings must be present
281
+ // in at least one of the searchable versions. Loop exits early on first
282
+ // NOT-found.
283
+ if (validSearchStringsAnd.length > 0) {
284
+ let allFound = true;
334
285
  for (const searchStr of validSearchStringsAnd) {
335
- const lowerSearchStr = searchStr.toLowerCase();
336
- if (lowerContent.includes(lowerSearchStr)) {
337
- foundAndStrings.push(searchStr);
338
- } else {
339
- // Early exit if any AND string is not found
286
+ if (!versionsIncludes(searchStr.toLowerCase())) {
287
+ allFound = false;
340
288
  break;
341
289
  }
342
290
  }
343
-
344
- // AND logic: ALL valid strings must be found
345
- if (foundAndStrings.length === validSearchStringsAnd.length) {
346
- return {
347
- found: true,
348
- matchedString: foundAndStrings.join(' AND '),
349
- matchedStrings: foundAndStrings,
350
- allMatches: foundAndStrings,
351
- logicType: 'AND',
352
- contentSize: originalLength,
353
- searchableSize: searchableContent.length
354
- };
291
+ if (allFound) {
292
+ return { found: true, matchedString: validSearchStringsAnd.join(' AND '), logicType: 'AND' };
355
293
  }
356
294
  }
357
-
358
- // OR logic: ANY string can match
359
- const allMatches = [];
360
- let firstMatch = null;
361
-
295
+
296
+ // OR logic: ANY string can match. Early-exit on first hit since the
297
+ // caller only reads matchedString (the first match). Previously the
298
+ // loop ran to completion to fill an `allMatches` array no caller read.
362
299
  for (const searchStr of validSearchStrings) {
363
- const lowerSearchStr = searchStr.toLowerCase();
364
- if (lowerContent.includes(lowerSearchStr)) {
365
- allMatches.push(searchStr);
366
- if (!firstMatch) {
367
- firstMatch = searchStr;
368
- }
300
+ if (versionsIncludes(searchStr.toLowerCase())) {
301
+ return { found: true, matchedString: searchStr, logicType: 'OR' };
369
302
  }
370
303
  }
371
-
372
- return {
373
- found: allMatches.length > 0,
374
- matchedString: firstMatch,
375
- matchedStrings: allMatches,
376
- allMatches: allMatches,
377
- logicType: validSearchStrings.length > 0 ? 'OR' : 'NONE',
378
- contentSize: originalLength,
379
- searchableSize: searchableContent.length,
380
- processedAsXml: isXmlContent
381
- };
304
+
305
+ return { found: false, matchedString: null, logicType: validSearchStrings.length > 0 ? 'OR' : 'NONE' };
382
306
  }
383
307
 
384
308
  /**
@@ -440,44 +364,52 @@ function createCurlHandler(config) {
440
364
  hasSearchString
441
365
  } = config;
442
366
 
367
+ // Hoisted: currentUrl doesn't change for this handler's lifetime, so
368
+ // parsing its hostname once at handler-creation eliminates the
369
+ // per-request URL allocation.
370
+ let currentUrlHostname = '';
371
+ try { currentUrlHostname = new URL(currentUrl).hostname; } catch (_) {}
372
+
443
373
  return async function curlHandler(requestUrl) {
444
-
445
- // Only process URLs that match our regex patterns
374
+ // Regex check FIRST — cheap filter that skips ~99% of requests.
375
+ // Previously this ran AFTER a URL parse + domain-cache lookup;
376
+ // the parse is the expensive bit, so doing it after the cheap
377
+ // gate moves the cost off the hot path.
446
378
  const matchesRegex = regexes.some(re => re.test(requestUrl));
447
379
  if (!matchesRegex) return;
448
380
 
449
- // Extract domain and check if already detected (skip expensive operations)
450
- const reqDomain = perSiteSubDomains ? (new URL(requestUrl)).hostname : getRootDomain(requestUrl);
381
+ // Parse requestUrl ONCE and reuse. Was parsed 2-3 times.
382
+ let requestHostname;
383
+ try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
384
+ const reqDomain = perSiteSubDomains ? requestHostname : getRootDomain(requestUrl);
385
+
451
386
  if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(reqDomain)) {
452
387
  if (forceDebug) {
453
- console.log(`[debug][curl] Skipping already detected domain: ${reqDomain}`);
388
+ console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected domain: ${reqDomain}`));
454
389
  }
455
390
  return;
456
391
  }
457
-
458
- // Check if this is a first-party request (same domain as the URL being scanned)
459
- const currentUrlHostname = new URL(currentUrl).hostname;
460
- const requestHostname = new URL(requestUrl).hostname;
392
+
461
393
  const isFirstParty = currentUrlHostname === requestHostname;
462
394
 
463
395
  // Apply first-party/third-party filtering
464
396
  if (isFirstParty && siteConfig.firstParty === false) {
465
397
  if (forceDebug) {
466
- console.log(`[debug][curl] Skipping first-party request (firstParty=false): ${requestUrl}`);
398
+ console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty=false): ${requestUrl}`));
467
399
  }
468
400
  return;
469
401
  }
470
402
 
471
403
  if (!isFirstParty && siteConfig.thirdParty === false) {
472
404
  if (forceDebug) {
473
- console.log(`[debug][curl] Skipping third-party request (thirdParty=false): ${requestUrl}`);
405
+ console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty=false): ${requestUrl}`));
474
406
  }
475
407
  return;
476
408
  }
477
409
 
478
410
  try {
479
411
  if (forceDebug) {
480
- console.log(`[debug][curl] Downloading content from: ${requestUrl}`);
412
+ console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content from: ${requestUrl}`));
481
413
  }
482
414
 
483
415
  // If NO searchstring is defined, match immediately (like browser behavior)
@@ -486,7 +418,7 @@ function createCurlHandler(config) {
486
418
  return;
487
419
  }
488
420
 
489
- addDomainToCollection(matchedDomains, addMatchedDomain, reqDomain, resourceType);
421
+ addMatchedDomain(reqDomain, resourceType);
490
422
  const simplifiedUrl = getRootDomain(currentUrl);
491
423
 
492
424
  if (siteConfig.verbose === 1) {
@@ -503,7 +435,7 @@ function createCurlHandler(config) {
503
435
  fs.appendFileSync(matchedUrlsLogFile,
504
436
  `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`);
505
437
  } catch (logErr) {
506
- console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
438
+ console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
507
439
  }
508
440
  }
509
441
  return;
@@ -520,7 +452,7 @@ function createCurlHandler(config) {
520
452
  return;
521
453
  }
522
454
 
523
- addDomainToCollection(matchedDomains, addMatchedDomain, reqDomain, resourceType);
455
+ addMatchedDomain(reqDomain, resourceType);
524
456
  const simplifiedUrl = getRootDomain(currentUrl);
525
457
 
526
458
  if (siteConfig.verbose === 1) {
@@ -537,20 +469,20 @@ function createCurlHandler(config) {
537
469
  fs.appendFileSync(matchedUrlsLogFile,
538
470
  `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, searchstring (${logicType}): "${matchedString}")${resourceInfo}\n`);
539
471
  } catch (logErr) {
540
- console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
472
+ console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
541
473
  }
542
474
  }
543
475
  } else if (forceDebug) {
544
476
  const partyType = isFirstParty ? 'first-party' : 'third-party';
545
- console.log(`[debug][curl] ${requestUrl} (${partyType}) matched regex but no searchstring found`);
477
+ console.log(formatLogMessage('debug', `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no searchstring found`));
546
478
  if (error) {
547
- console.log(`[debug][curl] Search error: ${error}`);
479
+ console.log(formatLogMessage('debug', `${CURL_TAG} Search error: ${error}`));
548
480
  }
549
481
  }
550
482
 
551
483
  } catch (err) {
552
484
  if (forceDebug) {
553
- console.log(`[debug][curl] Failed to download content for ${requestUrl}: ${err.message}`);
485
+ console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download content for ${requestUrl}: ${err.message}`));
554
486
  }
555
487
  }
556
488
  };
@@ -582,56 +514,66 @@ function createResponseHandler(config) {
582
514
  resourceType // Will be null for response handler
583
515
  } = config;
584
516
 
517
+ // Hoisted: currentUrl doesn't change for this handler's lifetime.
518
+ // Root domain (not bare hostname) so first-party matches the definition
519
+ // used by nwss.js's main request handler AND lib/curl.js — previously
520
+ // this module used hostname equality, so cdn.example.com and
521
+ // static.example.com were classified third-party here but first-party
522
+ // by the main handler. Unified to the registrable-root rule.
523
+ let currentRootDomain = '';
524
+ try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}
525
+
585
526
  return async function responseHandler(response) {
586
527
  const respUrl = response.url();
587
- const respDomain = perSiteSubDomains ? (new URL(respUrl)).hostname : getRootDomain(respUrl);
588
-
589
- // Only process responses that match our regex patterns
590
- const fullSubdomain = (new URL(respUrl)).hostname; // Always get full subdomain for cache tracking
591
-
592
- // Skip if already detected to avoid duplicates
528
+
529
+ // Regex check FIRST — cheapest filter, eliminates ~99% of responses
530
+ // before paying for URL parses + domain-cache lookup. Previously this
531
+ // ran AFTER URL parses + isDomainAlreadyDetected; reordering moves
532
+ // the parse cost off the hot path of every subresource response.
533
+ const matchesRegex = regexes.some(re => re.test(respUrl));
534
+ if (!matchesRegex) return;
535
+
536
+ // Parse respUrl ONCE and reuse. Was parsed 2-3 times per response.
537
+ let respHostname;
538
+ try { respHostname = new URL(respUrl).hostname; } catch (_) { return; }
539
+ const fullSubdomain = respHostname; // hostname is always the full subdomain
540
+
593
541
  if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(fullSubdomain)) {
594
542
  return;
595
543
  }
596
- const matchesRegex = regexes.some(re => re.test(respUrl));
597
- if (!matchesRegex) return;
598
-
599
- // Extract domain and check if already detected (skip expensive operations)
600
- // The main request handler already filtered first-party/third-party requests
601
- // This response handler only runs for requests that passed that filter
602
- // However, we need to apply the same first-party/third-party logic here for searchstring analysis
603
- // because the response handler analyzes content, not just URLs
604
-
605
- // Apply first-party/third-party filtering for searchstring analysis
606
- // Use the exact same logic as the main request handler
544
+ // respDomain (root domain) is only needed inside the `if (found)` block
545
+ // below. Deferring the getRootDomain call avoids the URL re-parse for
546
+ // every regex-matched response whose content doesn't contain the
547
+ // searchstring — the common case on most pages.
607
548
 
608
- const currentUrlHostname = new URL(currentUrl).hostname;
609
- const responseHostname = new URL(respUrl).hostname;
610
- const isFirstParty = currentUrlHostname === responseHostname;
549
+ // First-party / third-party gate. Root-domain comparison matches the
550
+ // main handler and curl.js — old hostname comparison disagreed.
551
+ const respRootDomain = getRootDomain(respUrl);
552
+ const isFirstParty = currentRootDomain === respRootDomain;
611
553
  if (isFirstParty && siteConfig.firstParty === false) {
612
554
  if (forceDebug) {
613
- console.log(`[debug] Skipping first-party response for searchstring analysis (firstParty=false): ${respUrl}`);
555
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Skipping first-party response for searchstring analysis (firstParty=false): ${respUrl}`));
614
556
  }
615
557
  return;
616
558
  }
617
-
559
+
618
560
  if (!isFirstParty && siteConfig.thirdParty === false) {
619
561
  if (forceDebug) {
620
- console.log(`[debug] Skipping third-party response for searchstring analysis (thirdParty=false): ${respUrl}`);
562
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Skipping third-party response for searchstring analysis (thirdParty=false): ${respUrl}`));
621
563
  }
622
564
  return;
623
565
  }
624
-
566
+
625
567
  try {
626
568
  // Only capture appropriate content types to avoid binary data
627
569
  const contentType = response.headers()['content-type'] || '';
628
570
  if (!shouldAnalyzeContentType(contentType)) {
629
571
  if (forceDebug) {
630
- console.log(`[debug] Skipping content analysis for ${respUrl} (content-type: ${contentType})`);
572
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Skipping content analysis for ${respUrl} (content-type: ${contentType})`));
631
573
  }
632
574
  return;
633
575
  }
634
-
576
+
635
577
  const content = await response.text();
636
578
 
637
579
  // Cache the fetched content if callback provided
@@ -640,7 +582,7 @@ function createResponseHandler(config) {
640
582
  config.onContentFetched(respUrl, content);
641
583
  } catch (cacheErr) {
642
584
  if (forceDebug) {
643
- console.log(`[debug] Content caching failed: ${cacheErr.message}`);
585
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Content caching failed: ${cacheErr.message}`));
644
586
  }
645
587
  }
646
588
  }
@@ -677,7 +619,7 @@ function createResponseHandler(config) {
677
619
  }
678
620
  } catch (grepErr) {
679
621
  if (forceDebug) {
680
- console.log(`[debug] Grep failed for ${respUrl}, falling back to JavaScript: ${grepErr.message}`);
622
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Grep failed for ${respUrl}, falling back to JavaScript: ${grepErr.message}`));
681
623
  }
682
624
  // Fallback to JavaScript search
683
625
  searchResult = searchContent(content, searchStrings, searchStringsAnd, contentType, respUrl);
@@ -690,10 +632,13 @@ function createResponseHandler(config) {
690
632
  const { found, matchedString, logicType, error } = searchResult;
691
633
 
692
634
  if (found) {
635
+ // Reuse respRootDomain from the first-party check — was already
636
+ // computed above. Saves a second getRootDomain call per match.
637
+ const respDomain = perSiteSubDomains ? respHostname : respRootDomain;
693
638
  if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
694
639
  return;
695
640
  }
696
-
641
+
697
642
  // Response handler doesn't have access to specific resource type
698
643
  // Use the addMatchedDomain helper which handles fullSubdomain properly
699
644
  addMatchedDomain(respDomain, null, fullSubdomain);
@@ -713,138 +658,104 @@ function createResponseHandler(config) {
713
658
  fs.appendFileSync(matchedUrlsLogFile,
714
659
  `${timestamp} [match][${simplifiedUrl}] ${respUrl} (${partyType}, ${searchMethod}, searchstring (${logicType}): "${matchedString}")\n`);
715
660
  } catch (logErr) {
716
- console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
661
+ console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
717
662
  }
718
663
  }
719
664
  } else if (forceDebug) {
720
665
  const partyType = isFirstParty ? 'first-party' : 'third-party';
721
666
  const searchMethod = useGrep ? 'grep' : 'js';
722
- console.log(`[debug] ${respUrl} (${partyType}, ${searchMethod}) matched regex but no searchstring found`);
667
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} ${respUrl} (${partyType}, ${searchMethod}) matched regex but no searchstring found`));
723
668
  if (error) {
724
- console.log(`[debug] Search error: ${error}`);
669
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Search error: ${error}`));
725
670
  }
726
671
  }
727
-
672
+
728
673
  } catch (err) {
729
674
  if (forceDebug) {
730
- console.log(`[debug] Failed to read response content for ${respUrl}: ${err.message}`);
675
+ console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Failed to read response content for ${respUrl}: ${err.message}`));
731
676
  }
732
677
  }
733
678
  };
734
679
  }
735
680
 
736
681
  /**
737
- * Validates searchstring configuration
738
- * @param {any} searchstring - The searchstring value to validate
739
- * @param {any} searchstringAnd - The searchstring_and value to validate
740
- * @returns {object} Validation result with isValid boolean and error message
682
+ * Validates a single string-or-array-of-strings value against the
683
+ * shared rules: type, non-empty, per-element type/non-empty, length cap.
684
+ * Used by validateSearchString for both searchstring and searchstring_and.
685
+ *
686
+ * @param {string|Array<string>} value
687
+ * @param {string} fieldName - e.g. 'searchstring' or 'searchstring_and'
688
+ * @returns {{isValid: boolean, error: string|null}}
741
689
  */
742
- function validateSearchString(searchstring, searchstringAnd) {
743
- if (searchstring === undefined || searchstring === null) {
744
- return { isValid: true, error: null };
745
- }
746
-
747
- if (typeof searchstring === 'string') {
748
- if (searchstring.length === 0) {
749
- return { isValid: false, error: 'searchstring cannot be empty string' };
690
+ function validateSearchValue(value, fieldName) {
691
+ if (typeof value === 'string') {
692
+ if (value.length === 0) {
693
+ return { isValid: false, error: `${fieldName} cannot be empty string` };
694
+ }
695
+ if (value.length > SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH) {
696
+ return { isValid: false, error: `${fieldName} too long (max ${SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH} chars)` };
750
697
  }
751
698
  return { isValid: true, error: null };
752
699
  }
753
-
754
- if (Array.isArray(searchstring)) {
755
- if (searchstring.length === 0) {
756
- return { isValid: false, error: 'searchstring array cannot be empty' };
700
+ if (Array.isArray(value)) {
701
+ if (value.length === 0) {
702
+ return { isValid: false, error: `${fieldName} array cannot be empty` };
757
703
  }
758
-
759
- for (let i = 0; i < searchstring.length; i++) {
760
- if (typeof searchstring[i] !== 'string') {
761
- return { isValid: false, error: `searchstring[${i}] must be a string` };
704
+ for (let i = 0; i < value.length; i++) {
705
+ if (typeof value[i] !== 'string') {
706
+ return { isValid: false, error: `${fieldName}[${i}] must be a string` };
707
+ }
708
+ if (value[i].length === 0) {
709
+ return { isValid: false, error: `${fieldName}[${i}] cannot be empty string` };
762
710
  }
763
- if (searchstring[i].length === 0) {
764
- return { isValid: false, error: `searchstring[${i}] cannot be empty string` };
711
+ if (value[i].length > SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH) {
712
+ return { isValid: false, error: `${fieldName}[${i}] too long (max ${SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH} chars)` };
765
713
  }
766
714
  }
767
-
768
715
  return { isValid: true, error: null };
769
716
  }
770
-
771
- // Validate searchstring_and
772
- if (searchstringAnd !== undefined && searchstringAnd !== null) {
773
- if (typeof searchstringAnd === 'string') {
774
- if (searchstringAnd.length === 0) {
775
- return { isValid: false, error: 'searchstring_and cannot be empty string' };
776
- }
777
- } else if (Array.isArray(searchstringAnd)) {
778
- if (searchstringAnd.length === 0) {
779
- return { isValid: false, error: 'searchstring_and array cannot be empty' };
780
- }
781
-
782
- for (let i = 0; i < searchstringAnd.length; i++) {
783
- if (typeof searchstringAnd[i] !== 'string') {
784
- return { isValid: false, error: `searchstring_and[${i}] must be a string` };
785
- }
786
- if (searchstringAnd[i].length === 0) {
787
- return { isValid: false, error: `searchstring_and[${i}] cannot be empty string` };
788
- }
789
- }
790
- } else {
791
- return { isValid: false, error: 'searchstring_and must be string or array of strings' };
792
- }
717
+ return { isValid: false, error: `${fieldName} must be string or array of strings` };
718
+ }
719
+
720
+ /**
721
+ * Validates searchstring configuration. The old structure returned
722
+ * early on valid string/array searchstring, so 60+ lines of validation
723
+ * below (the both-defined check, length caps, searchstring_and type
724
+ * check) were unreachable for valid inputs — e.g. passing both
725
+ * searchstring AND searchstring_and would have passed validation
726
+ * despite the documented mutual-exclusion rule. Rewritten as a linear
727
+ * sequence of independent checks via the shared validateSearchValue
728
+ * helper so every rule actually runs.
729
+ *
730
+ * @param {any} searchstring - The searchstring value (OR logic)
731
+ * @param {any} searchstringAnd - The searchstring_and value (AND logic)
732
+ * @returns {{isValid: boolean, error: string|null}}
733
+ */
734
+ function validateSearchString(searchstring, searchstringAnd) {
735
+ const hasOR = searchstring !== undefined && searchstring !== null;
736
+ const hasAND = searchstringAnd !== undefined && searchstringAnd !== null;
737
+
738
+ // Both unset is fine — no searchstring filtering will be applied.
739
+ if (!hasOR && !hasAND) {
740
+ return { isValid: true, error: null };
793
741
  }
794
-
795
- // Check that both searchstring and searchstring_and aren't defined simultaneously
796
- if ((searchstring !== undefined && searchstring !== null) &&
797
- (searchstringAnd !== undefined && searchstringAnd !== null)) {
742
+
743
+ // Mutual exclusion: can't combine OR and AND logic in one site config.
744
+ if (hasOR && hasAND) {
798
745
  return { isValid: false, error: 'Cannot use both searchstring (OR) and searchstring_and (AND) simultaneously. Choose one logic type.' };
799
746
  }
800
747
 
801
- // Additional validation for search string length limits
802
- const validateStringLength = (str, fieldName) => {
803
- if (str.length > SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH) {
804
- return { isValid: false, error: `${fieldName} too long (max ${SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH} chars)` };
805
- }
806
- return { isValid: true };
807
- };
808
-
809
- // Validate search string lengths
810
- if (typeof searchstring === 'string') {
811
- const lengthCheck = validateStringLength(searchstring, 'searchstring');
812
- if (!lengthCheck.isValid) return lengthCheck;
813
- } else if (Array.isArray(searchstring)) {
814
- for (let i = 0; i < searchstring.length; i++) {
815
- const lengthCheck = validateStringLength(searchstring[i], `searchstring[${i}]`);
816
- if (!lengthCheck.isValid) return lengthCheck;
817
- }
748
+ if (hasOR) {
749
+ const check = validateSearchValue(searchstring, 'searchstring');
750
+ if (!check.isValid) return check;
818
751
  }
819
-
820
- // Validate AND search string lengths
821
- if (typeof searchstringAnd === 'string') {
822
- const lengthCheck = validateStringLength(searchstringAnd, 'searchstring_and');
823
- if (!lengthCheck.isValid) return lengthCheck;
824
- } else if (Array.isArray(searchstringAnd)) {
825
- for (let i = 0; i < searchstringAnd.length; i++) {
826
- const lengthCheck = validateStringLength(searchstringAnd[i], `searchstring_and[${i}]`);
827
- if (!lengthCheck.isValid) return lengthCheck;
828
- }
752
+
753
+ if (hasAND) {
754
+ const check = validateSearchValue(searchstringAnd, 'searchstring_and');
755
+ if (!check.isValid) return check;
829
756
  }
830
-
831
- return { isValid: false, error: 'searchstring must be string or array of strings' };
832
- }
833
757
 
834
- /**
835
- * Gets statistics about search string matches
836
- * @param {Set|Map} matchedDomains - Set or Map of matched domains
837
- * @param {Array<string>} searchStrings - Array of search strings used
838
- * @returns {object} Statistics object
839
- */
840
- function getSearchStats(matchedDomains, searchStrings) {
841
- const totalMatches = matchedDomains instanceof Map ? matchedDomains.size : matchedDomains.size;
842
-
843
- return {
844
- totalMatches,
845
- searchStringCount: searchStrings.length,
846
- searchStrings: [...searchStrings]
847
- };
758
+ return { isValid: true, error: null };
848
759
  }
849
760
 
850
761
  module.exports = {
@@ -856,7 +767,5 @@ module.exports = {
856
767
  createCurlHandler,
857
768
  downloadWithCurl,
858
769
  validateSearchString,
859
- getSearchStats,
860
- addDomainToCollection,
861
770
  downloadWithRetry
862
771
  };