@fanboynz/network-scanner 2.0.66 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/curl.js CHANGED
@@ -2,22 +2,36 @@
2
2
  // Handles HTTP content downloading using curl for searchstring analysis
3
3
 
4
4
  const fs = require('fs');
5
+ // spawnSync only kept for validateCurlAvailability (runs once at
6
+ // startup). Production curl downloads go through runProcess (async).
5
7
  const { spawnSync } = require('child_process');
6
- const { colorize, colors, messageColors, tags, formatLogMessage } = require('./colorize');
8
+ const { runProcess } = require('./spawn-async');
9
+ const { messageColors, formatLogMessage } = require('./colorize');
10
+ const { getReferrerForUrl } = require('./referrer');
11
+ const CURL_TAG = messageColors.processing('[curl]');
7
12
 
8
13
  // === Constants ===
9
14
  const CURL_DEFAULTS = {
10
15
  TIMEOUT_SECONDS: 30,
11
16
  MAX_REDIRECTS: 5,
12
- MAX_SIZE_BYTES: 10 * 1024 * 1024, // 10MB
13
- VALIDATION_TIMEOUT: 5000, // 5 seconds
14
- SPAWN_TIMEOUT_MULTIPLIER: 1000, // Convert seconds to milliseconds
15
- HTTP_SUCCESS_CODE: 200,
17
+ // 50MB to match lib/searchstring.js's downloadWithCurl cap — the two
18
+ // modules previously had different defaults (10MB vs 50MB) so the same
19
+ // URL could succeed or fail depending on which code path fetched it.
20
+ MAX_SIZE_BYTES: 50 * 1024 * 1024,
21
+ VALIDATION_TIMEOUT: 5000,
16
22
  CURL_SUCCESS_STATUS: 0,
17
- METADATA_PIPE_PARTS: 3, // http_code|content_type|size_download
18
23
  VERSION_LINE_INDEX: 0
19
24
  };
20
25
 
26
+ // Module-level so downloadWithCurl doesn't reallocate this closure on
27
+ // every call. No state captured — pure factory.
28
+ function errResult(msg) {
29
+ return {
30
+ content: '', httpCode: 0, contentType: 'unknown', downloadSize: 0,
31
+ success: false, error: msg
32
+ };
33
+ }
34
+
21
35
  /**
22
36
  * Downloads content using curl with browser-like headers
23
37
  * @param {string} url - The URL to download
@@ -34,90 +48,80 @@ async function downloadWithCurl(url, userAgent = '', options = {}) {
34
48
  customHeaders = {}
35
49
  } = options;
36
50
 
37
- try {
38
- const curlArgs = [
39
- '-s', // Silent mode
40
- '--max-time', timeout.toString(),
41
- '--max-redirs', maxRedirects.toString(),
42
- '--fail-with-body', // Return body even on HTTP errors
43
- '--compressed', // Accept compressed responses
44
- '--write-out', '%{http_code}|%{content_type}|%{size_download}', // Output metadata
45
- ];
46
-
47
- if (followRedirects) {
48
- curlArgs.push('-L'); // Follow redirects
49
- }
51
+ const curlArgs = [
52
+ '-s',
53
+ '--max-time', timeout.toString(),
54
+ '--max-redirs', maxRedirects.toString(),
55
+ '--fail-with-body',
56
+ '--compressed',
57
+ // Leading '\n' guarantees the metadata sits on its own line even
58
+ // when content has no trailing newline (older format had no
59
+ // separator and concatenated metadata with the last content byte).
60
+ '--write-out', '\n%{http_code}|%{content_type}|%{size_download}'
61
+ ];
50
62
 
51
- // Add user agent if provided
52
- if (userAgent) {
53
- curlArgs.push('-H', `User-Agent: ${userAgent}`);
54
- }
63
+ if (followRedirects) curlArgs.push('-L');
64
+ if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);
55
65
 
56
- // Add common browser headers
57
- curlArgs.push(
58
- '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
59
- '-H', 'Accept-Language: en-US,en;q=0.5',
60
- '-H', 'Accept-Encoding: gzip, deflate, br',
61
- '-H', 'Connection: keep-alive',
62
- '-H', 'Upgrade-Insecure-Requests: 1',
63
- '-H', 'Sec-Fetch-Dest: document',
64
- '-H', 'Sec-Fetch-Mode: navigate',
65
- '-H', 'Sec-Fetch-Site: none',
66
- '-H', 'Cache-Control: no-cache'
67
- );
68
-
69
- // Add custom headers
70
- Object.entries(customHeaders).forEach(([key, value]) => {
71
- curlArgs.push('-H', `${key}: ${value}`);
72
- });
66
+ curlArgs.push(
67
+ '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
68
+ '-H', 'Accept-Language: en-US,en;q=0.5',
69
+ '-H', 'Accept-Encoding: gzip, deflate, br',
70
+ '-H', 'Connection: keep-alive',
71
+ '-H', 'Upgrade-Insecure-Requests: 1',
72
+ '-H', 'Sec-Fetch-Dest: document',
73
+ '-H', 'Sec-Fetch-Mode: navigate',
74
+ '-H', 'Sec-Fetch-Site: none',
75
+ '-H', 'Cache-Control: no-cache'
76
+ );
73
77
 
74
- curlArgs.push(url);
78
+ Object.entries(customHeaders).forEach(([key, value]) => {
79
+ curlArgs.push('-H', `${key}: ${value}`);
80
+ });
81
+ curlArgs.push(url);
75
82
 
76
- // Execute curl
77
- const curlResult = spawnSync('curl', curlArgs, {
78
- encoding: 'utf8',
79
- timeout: timeout * CURL_DEFAULTS.SPAWN_TIMEOUT_MULTIPLIER,
80
- maxBuffer: maxSize
81
- });
82
-
83
- if (curlResult.error) {
84
- throw curlResult.error;
85
- }
86
-
87
- if (curlResult.status !== CURL_DEFAULTS.CURL_SUCCESS_STATUS) {
88
- throw new Error(`Curl exited with status ${curlResult.status}: ${curlResult.stderr}`);
89
- }
90
-
91
- const output = curlResult.stdout;
92
- const lines = output.split('\n');
93
- const metadata = lines[lines.length - 1]; // Last line contains write-out data
94
- const content = lines.slice(0, -1).join('\n'); // Everything except last line
95
-
96
- // Parse metadata
97
- const metadataParts = metadata.split('|');
98
- if (metadataParts.length !== CURL_DEFAULTS.METADATA_PIPE_PARTS) {
99
- throw new Error(`Invalid metadata format: expected ${CURL_DEFAULTS.METADATA_PIPE_PARTS} parts, got ${metadataParts.length}`);
100
- }
101
- const [httpCode, contentType, downloadSize] = metadataParts;
102
-
103
- return {
104
- content,
105
- httpCode: parseInt(httpCode) || 0,
106
- contentType: contentType || 'unknown',
107
- downloadSize: parseInt(downloadSize) || content.length,
108
- success: true
109
- };
110
-
111
- } catch (error) {
112
- return {
113
- content: '',
114
- httpCode: 0,
115
- contentType: 'unknown',
116
- downloadSize: 0,
117
- success: false,
118
- error: error.message
119
- };
83
+ // Shared async-spawn helper handles streaming/cap/timeout/kill plumbing.
84
+ const result = await runProcess('curl', curlArgs, {
85
+ timeout: timeout * 1000,
86
+ maxStdout: maxSize
87
+ });
88
+
89
+ if (result.error) return errResult(result.error);
90
+ if (result.truncated) return errResult(`Output exceeded ${maxSize} bytes`);
91
+ if (result.signal) return errResult(`Killed by signal ${result.signal}`);
92
+ if (result.code !== CURL_DEFAULTS.CURL_SUCCESS_STATUS) {
93
+ return errResult(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`);
94
+ }
95
+
96
+ const output = result.stdout.toString('utf8');
97
+ // lastIndexOf('\n') is a single O(n) scan from the end vs the old
98
+ // split('\n') + slice(0,-1) + join('\n') which was three full passes
99
+ // plus two intermediate array allocations.
100
+ const sepIdx = output.lastIndexOf('\n');
101
+ if (sepIdx === -1) return errResult('No metadata separator in curl output');
102
+
103
+ const content = output.slice(0, sepIdx);
104
+ const metadata = output.slice(sepIdx + 1);
105
+
106
+ // Split on first/last pipe so the middle (content-type) can legitimately
107
+ // contain pipes — naive split('|') with parts-count check would drop the
108
+ // whole response with 'Invalid metadata format' for such content-types.
109
+ const firstPipe = metadata.indexOf('|');
110
+ const lastPipe = metadata.lastIndexOf('|');
111
+ if (firstPipe === -1 || firstPipe === lastPipe) {
112
+ return errResult(`Invalid metadata format: missing pipes in "${metadata}"`);
120
113
  }
114
+ const httpCode = metadata.slice(0, firstPipe);
115
+ const contentType = metadata.slice(firstPipe + 1, lastPipe);
116
+ const downloadSize = metadata.slice(lastPipe + 1);
117
+
118
+ return {
119
+ content,
120
+ httpCode: parseInt(httpCode, 10) || 0,
121
+ contentType: contentType || 'unknown',
122
+ downloadSize: parseInt(downloadSize, 10) || content.length,
123
+ success: true
124
+ };
121
125
  }
122
126
 
123
127
  /**
@@ -134,58 +138,103 @@ function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSe
134
138
  }
135
139
 
136
140
  const lowerContent = content.toLowerCase();
137
-
138
- // Handle AND logic searchstring_and (all patterns must be present)
141
+
142
+ // Handle AND logic searchstring_and (all patterns must be present).
143
+ // Short-circuits on first missing pattern — the old code walked the
144
+ // entire list to build a full missingPatterns array that's only used
145
+ // by a debug log. Now we early-exit and report the first miss (the
146
+ // debug log's missingPatterns.join(', ') still works with one entry).
139
147
  if (hasSearchStringAnd && searchStringsAnd.length > 0) {
140
- const missingPatterns = [];
141
- const foundPatterns = [];
142
-
143
- for (const pattern of searchStringsAnd) {
144
- const lowerPattern = pattern.toLowerCase();
145
- if (lowerContent.includes(lowerPattern)) {
146
- foundPatterns.push(pattern);
147
- } else {
148
- missingPatterns.push(pattern);
148
+ // Pre-lower patterns once — was per-iteration toLowerCase before.
149
+ // For a 20-pattern AND check the difference is small per call but
150
+ // the pattern itself never changes between iterations of the loop.
151
+ const lowered = searchStringsAnd.map(p => p.toLowerCase());
152
+ for (let i = 0; i < searchStringsAnd.length; i++) {
153
+ if (!lowerContent.includes(lowered[i])) {
154
+ return {
155
+ found: false,
156
+ matchedPattern: null,
157
+ matchType: 'AND',
158
+ foundPatterns: searchStringsAnd.slice(0, i),
159
+ missingPatterns: [searchStringsAnd[i]]
160
+ };
149
161
  }
150
162
  }
151
-
152
- // All patterns must be found for AND logic
153
- if (missingPatterns.length === 0) {
154
- return {
155
- found: true,
156
- matchedPattern: foundPatterns.join(' AND '),
157
- matchType: 'AND',
158
- foundPatterns,
159
- missingPatterns: []
160
- };
161
- } else {
162
- return {
163
- found: false,
164
- matchedPattern: null,
165
- matchType: 'AND',
166
- foundPatterns,
167
- missingPatterns
168
- };
169
- }
163
+ return {
164
+ found: true,
165
+ matchedPattern: searchStringsAnd.join(' AND '),
166
+ matchType: 'AND',
167
+ foundPatterns: searchStringsAnd,
168
+ missingPatterns: []
169
+ };
170
170
  }
171
-
172
- // Handle OR logic searchstring (any pattern can match)
171
+
172
+ // Handle OR logic searchstring (any pattern can match). Unlike the AND
173
+ // branch, patterns are lowered one at a time inside the loop: OR usually
174
+ // short-circuits early, so pre-lowering the whole list would save little.
173
175
  if (searchStrings.length > 0) {
174
- for (const pattern of searchStrings) {
175
- const lowerPattern = pattern.toLowerCase();
176
- if (lowerContent.includes(lowerPattern)) {
177
- return {
178
- found: true,
179
- matchedPattern: pattern,
176
+ for (let i = 0; i < searchStrings.length; i++) {
177
+ if (lowerContent.includes(searchStrings[i].toLowerCase())) {
178
+ return {
179
+ found: true,
180
+ matchedPattern: searchStrings[i],
180
181
  matchType: 'OR'
181
182
  };
182
183
  }
183
184
  }
184
185
  }
185
-
186
+
186
187
  return { found: false, matchedPattern: null, matchType: null };
187
188
  }
188
189
 
190
+ /**
191
+ * Emits a match for a curl-fetched URL to both the verbose console
192
+ * (when siteConfig.verbose === 1) and the matched-URLs log file
193
+ * (when dumpUrls is true). Single source of truth for the format —
194
+ * both no-searchstring and with-searchstring match paths funnel
195
+ * through here so partyType / resourceInfo / timestamp / format
196
+ * don't drift between the two branches.
197
+ *
198
+ * @param {object} opts
199
+ * @param {string} opts.simplifiedUrl
200
+ * @param {string} opts.requestUrl
201
+ * @param {boolean} opts.isFirstParty
202
+ * @param {string|null} opts.resourceType
203
+ * @param {string|null} opts.matchInfo - null for "matched regex only"
204
+ * (no searchstring), a string like
205
+ * 'pattern: "X"' or 'patterns: 2/3'
206
+ * for searchstring matches
207
+ * @param {number|undefined} opts.verbose
208
+ * @param {boolean} opts.dumpUrls
209
+ * @param {string} opts.matchedUrlsLogFile
210
+ */
211
+ function logMatchedRequest({
212
+ simplifiedUrl, requestUrl, isFirstParty, resourceType,
213
+ matchInfo, verbose, dumpUrls, matchedUrlsLogFile
214
+ }) {
215
+ const partyType = isFirstParty ? 'first-party' : 'third-party';
216
+ const resourceInfo = resourceType ? ` (${resourceType})` : '';
217
+
218
+ if (verbose === 1) {
219
+ const verboseSuffix = matchInfo ? ` contains ${matchInfo}` : ' matched regex';
220
+ console.log(formatLogMessage('match',
221
+ `[${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${verboseSuffix}${resourceInfo}`));
222
+ }
223
+
224
+ if (dumpUrls && matchedUrlsLogFile) {
225
+ const timestamp = new Date().toISOString();
226
+ // matchInfo goes INSIDE the (party, curl, ...) parens to mirror the
227
+ // pre-refactor file format.
228
+ const fileExtra = matchInfo ? `, ${matchInfo}` : '';
229
+ try {
230
+ fs.appendFileSync(matchedUrlsLogFile,
231
+ `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl${fileExtra})${resourceInfo}\n`);
232
+ } catch (logErr) {
233
+ console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
234
+ }
235
+ }
236
+ }
237
+
189
238
  /**
190
239
  * Creates a curl-based URL handler for downloading and searching content
191
240
  * @param {object} config - Configuration object containing all necessary parameters
@@ -197,7 +246,8 @@ function createCurlHandler(config) {
197
246
  searchStringsAnd,
198
247
  hasSearchStringAnd,
199
248
  regexes,
200
- matchedDomains,
249
+ // matchedDomains intentionally not destructured — only addMatchedDomain
250
+ // is called; the underlying collection is opaque to this handler.
201
251
  addMatchedDomain,
202
252
  isDomainAlreadyDetected,
203
253
  onContentFetched,
@@ -215,101 +265,128 @@ function createCurlHandler(config) {
215
265
  hasSearchString
216
266
  } = config;
217
267
 
268
+ // Hoisted: currentUrl doesn't change for this handler's lifetime, so
269
+ // parsing its root domain once at handler-creation eliminates the
270
+ // per-request parse + getRootDomain call.
271
+ let currentRootDomain = '';
272
+ try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}
273
+
218
274
  return async function curlHandler(requestUrl) {
219
275
  try {
220
- const respDomain = perSiteSubDomains ? (new URL(requestUrl)).hostname : getRootDomain(requestUrl);
221
- const fullSubdomain = (new URL(requestUrl)).hostname; // Always get full subdomain for cache tracking
222
-
223
- // Skip if already detected to avoid duplicates
224
- if (isDomainAlreadyDetected(fullSubdomain)) {
276
+ // Regex check FIRST cheap filter that skips ~99% of requests.
277
+ // Previously this ran AFTER a URL parse + domain-cache lookup,
278
+ // paying for parses on requests we then immediately drop.
279
+ const matchesRegex = regexes.some(re => re.test(requestUrl));
280
+ if (!matchesRegex) {
225
281
  if (forceDebug) {
226
- console.log(formatLogMessage('debug', `[curl] Skipping already detected subdomain: ${fullSubdomain}`));
282
+ console.log(formatLogMessage('debug', `${CURL_TAG} URL ${requestUrl} doesn't match any regex patterns`));
227
283
  }
228
284
  return;
229
285
  }
230
-
231
- // Only process URLs that match our regex patterns
232
- const matchesRegex = regexes.some(re => re.test(requestUrl));
233
- if (!matchesRegex) {
286
+
287
+ // Parse requestUrl ONCE and reuse. The prior structure parsed it
288
+ // 4-6 times: two `new URL().hostname` calls, two dead-var
289
+ // hostname computations that were never read, plus the
290
+ // getRootDomain calls. Single parse + the cache key (fullSubdomain)
291
+ // + first-party root-domain comparison all come from this one URL
292
+ // object now.
293
+ let requestHostname;
294
+ try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
295
+ const fullSubdomain = requestHostname; // always the full subdomain
296
+
297
+ // Compute requestRootDomain ONCE — derive respDomain from it when
298
+ // perSiteSubDomains is false, and reuse it for the first-party
299
+ // check. Previously getRootDomain(requestUrl) was called twice in
300
+ // that path.
301
+ const requestRootDomain = getRootDomain(requestUrl);
302
+ const respDomain = perSiteSubDomains ? requestHostname : requestRootDomain;
303
+
304
+ // Skip if already detected to avoid duplicates
305
+ if (isDomainAlreadyDetected(fullSubdomain)) {
234
306
  if (forceDebug) {
235
- console.log(formatLogMessage('debug', `[curl] URL ${requestUrl} doesn't match any regex patterns`));
307
+ console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected subdomain: ${fullSubdomain}`));
236
308
  }
237
309
  return;
238
310
  }
239
-
240
- // Check if this is a first-party request (same domain as the URL being scanned)
241
- const currentUrlHostname = new URL(currentUrl).hostname;
242
- const requestHostname = new URL(requestUrl).hostname;
243
- const currentRootDomain = getRootDomain(currentUrl);
244
- const requestRootDomain = getRootDomain(requestUrl);
311
+
312
+ // First-party = same registrable root domain. Same definition the
313
+ // main request handler uses; matches what searchstring.js's
314
+ // responseHandler does too (post the cross-module unification).
245
315
  const isFirstParty = currentRootDomain === requestRootDomain;
246
-
247
- // Apply first-party/third-party filtering
248
- if (isFirstParty && (siteConfig.firstParty === false || siteConfig.firstParty === 0)) {
316
+
317
+ // Apply first-party/third-party filtering. `=== false` only (no
318
+ // `|| === 0`) matches lib/searchstring.js and the main request
319
+ // handler, which all treat these as boolean flags. Accepting 0 as
320
+ // "disabled" here but not elsewhere would silently disagree if a
321
+ // user ever set "firstParty": 0 in JSON config.
322
+ if (isFirstParty && siteConfig.firstParty === false) {
249
323
  if (forceDebug) {
250
- console.log(formatLogMessage('debug', `[curl] Skipping first-party request (firstParty disabled): ${requestUrl}`));
324
+ console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty disabled): ${requestUrl}`));
251
325
  }
252
326
  return;
253
327
  }
254
-
255
- if (!isFirstParty && (siteConfig.thirdParty === false || siteConfig.thirdParty === 0)) {
328
+
329
+ if (!isFirstParty && siteConfig.thirdParty === false) {
256
330
  if (forceDebug) {
257
- console.log(formatLogMessage('debug', `[curl] Skipping third-party request (thirdParty disabled): ${requestUrl}`));
331
+ console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty disabled): ${requestUrl}`));
258
332
  }
259
333
  return;
260
334
  }
261
-
335
+
262
336
  if (forceDebug) {
263
- console.log(formatLogMessage('debug', `[curl] Processing ${isFirstParty ? 'first-party' : 'third-party'} request: ${requestUrl}`));
337
+ console.log(formatLogMessage('debug', `${CURL_TAG} Processing ${isFirstParty ? 'first-party' : 'third-party'} request: ${requestUrl}`));
264
338
  }
265
-
266
- // If NO searchstring is defined, match immediately (like browser behavior)
267
- if (!hasSearchString || ((!searchStrings || !searchStrings.length) && (!searchStringsAnd || !searchStringsAnd.length))) {
339
+
340
+ // If NO searchstring is defined, match immediately (like browser
341
+ // behavior). Simplified from the prior convoluted condition
342
+ // (hasSearchString being true while both arrays are empty is
343
+ // impossible given parseSearchStrings, so the OR was redundant).
344
+ if (!hasSearchString && !hasSearchStringAnd) {
268
345
  if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
269
346
  if (forceDebug) {
270
- console.log(formatLogMessage('debug', `[curl] Domain ${respDomain} is in ignore list`));
347
+ console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} is in ignore list`));
271
348
  }
272
349
  return;
273
350
  }
274
351
 
275
352
  addMatchedDomain(respDomain, resourceType, fullSubdomain);
276
- const simplifiedUrl = getRootDomain(currentUrl);
277
-
278
- if (siteConfig.verbose === 1) {
279
- const partyType = isFirstParty ? 'first-party' : 'third-party';
280
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
281
- console.log(formatLogMessage('match', `[${simplifiedUrl}] ${requestUrl} (${partyType}, curl) matched regex${resourceInfo}`));
282
- }
283
-
284
- if (dumpUrls && matchedUrlsLogFile) {
285
- const timestamp = new Date().toISOString();
286
- const partyType = isFirstParty ? 'first-party' : 'third-party';
287
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
288
- try {
289
- fs.appendFileSync(matchedUrlsLogFile,
290
- `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`);
291
- } catch (logErr) {
292
- console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
293
- }
294
- }
353
+ logMatchedRequest({
354
+ simplifiedUrl: currentRootDomain,
355
+ requestUrl,
356
+ isFirstParty,
357
+ resourceType,
358
+ matchInfo: null, // no searchstring log says "matched regex"
359
+ verbose: siteConfig.verbose,
360
+ dumpUrls,
361
+ matchedUrlsLogFile
362
+ });
295
363
  return;
296
364
  }
297
365
 
298
366
  // If searchstring IS defined, download and search content
299
- if (hasSearchString && ((searchStrings && searchStrings.length > 0) || (searchStringsAnd && searchStringsAnd.length > 0)) && forceDebug) {
300
- console.log(formatLogMessage('debug', `[curl] Downloading content for pattern matching: ${requestUrl}`));
367
+ if ((hasSearchString || hasSearchStringAnd) && forceDebug) {
368
+ console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content for pattern matching: ${requestUrl}`));
301
369
  }
302
-
303
- // Prepare custom headers from site config
304
- const customHeaders = siteConfig.custom_headers || {};
370
+
371
+ // Prepare custom headers from site config. SHALLOW-COPY so the
372
+ // Referer assignment below doesn't mutate the underlying siteConfig
373
+ // object — the old `siteConfig.custom_headers || {}` was a reference
374
+ // (when present), so setting customHeaders['Referer'] persisted the
375
+ // first URL's random-mode referrer onto siteConfig.custom_headers,
376
+ // and every subsequent URL inherited that pinned value. Silent
377
+ // breakage of {mode:'random_search'} variation across a site's URLs.
378
+ //
379
+ // Uses getReferrerForUrl so ALL referrer modes work — the old
380
+ // inline string/array logic dropped object modes silently.
381
+ const customHeaders = { ...(siteConfig.custom_headers || {}) };
305
382
  if (siteConfig.referrer_headers) {
306
- const referrerUrl = Array.isArray(siteConfig.referrer_headers)
307
- ? siteConfig.referrer_headers[Math.floor(Math.random() * siteConfig.referrer_headers.length)]
308
- : siteConfig.referrer_headers;
309
-
310
- if (typeof referrerUrl === 'string' && referrerUrl.startsWith('http')) {
311
- customHeaders['Referer'] = referrerUrl;
312
- }
383
+ const referrerUrl = getReferrerForUrl(
384
+ requestUrl,
385
+ siteConfig.referrer_headers,
386
+ siteConfig.referrer_disable,
387
+ forceDebug
388
+ );
389
+ if (referrerUrl) customHeaders['Referer'] = referrerUrl;
313
390
  }
314
391
 
315
392
  const downloadResult = await downloadWithCurl(requestUrl, userAgent, {
@@ -320,7 +397,7 @@ function createCurlHandler(config) {
320
397
 
321
398
  if (!downloadResult.success) {
322
399
  if (forceDebug) {
323
- console.log(formatLogMessage('debug', `[curl] Failed to download ${requestUrl}: ${downloadResult.error}`));
400
+ console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download ${requestUrl}: ${downloadResult.error}`));
324
401
  }
325
402
  return;
326
403
  }
@@ -331,7 +408,7 @@ function createCurlHandler(config) {
331
408
  onContentFetched(requestUrl, downloadResult.content);
332
409
  } catch (cacheErr) {
333
410
  if (forceDebug) {
334
- console.log(formatLogMessage('debug', `[curl] Content caching failed: ${cacheErr.message}`));
411
+ console.log(formatLogMessage('debug', `${CURL_TAG} Content caching failed: ${cacheErr.message}`));
335
412
  }
336
413
  }
337
414
  }
@@ -347,54 +424,41 @@ function createCurlHandler(config) {
347
424
  if (searchResult.found) {
348
425
  if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
349
426
  if (forceDebug) {
350
- console.log(formatLogMessage('debug', `[curl] Domain ${respDomain} matches but is in ignore list`));
427
+ console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} matches but is in ignore list`));
351
428
  }
352
429
  return;
353
430
  }
354
431
 
355
432
  addMatchedDomain(respDomain, resourceType, fullSubdomain);
356
- const simplifiedUrl = getRootDomain(currentUrl);
357
-
358
- if (siteConfig.verbose === 1) {
359
- const partyType = isFirstParty ? 'first-party' : 'third-party';
360
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
361
- const matchInfo = searchResult.matchType === 'AND'
362
- ? `patterns: ${searchResult.foundPatterns.length}/${searchStringsAnd.length}`
363
- : `pattern: "${searchResult.matchedPattern}"`;
364
- console.log(formatLogMessage('match',
365
- `[${simplifiedUrl}] ${requestUrl} (${partyType}, curl) contains ${matchInfo}${resourceInfo}`));
366
- }
367
-
368
- if (dumpUrls && matchedUrlsLogFile) {
369
- const timestamp = new Date().toISOString();
370
- const partyType = isFirstParty ? 'first-party' : 'third-party';
371
- const resourceInfo = resourceType ? ` (${resourceType})` : '';
372
- const matchInfo = searchResult.matchType === 'AND'
373
- ? `patterns: ${searchResult.foundPatterns.length}/${searchStringsAnd.length}`
374
- : `pattern: "${searchResult.matchedPattern}"`;
375
- try {
376
- fs.appendFileSync(matchedUrlsLogFile,
377
- `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, ${matchInfo})${resourceInfo}\n`);
378
- } catch (logErr) {
379
- console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
380
- }
381
- }
433
+ const matchInfo = searchResult.matchType === 'AND'
434
+ ? `patterns: ${searchResult.foundPatterns.length}/${searchStringsAnd.length}`
435
+ : `pattern: "${searchResult.matchedPattern}"`;
436
+ logMatchedRequest({
437
+ simplifiedUrl: currentRootDomain,
438
+ requestUrl,
439
+ isFirstParty,
440
+ resourceType,
441
+ matchInfo,
442
+ verbose: siteConfig.verbose,
443
+ dumpUrls,
444
+ matchedUrlsLogFile
445
+ });
382
446
  } else {
383
447
  if (forceDebug) {
384
448
  const partyType = isFirstParty ? 'first-party' : 'third-party';
385
449
  if (searchResult.matchType === 'AND' && searchResult.missingPatterns) {
386
450
  console.log(formatLogMessage('debug',
387
- `[curl] ${requestUrl} (${partyType}) matched regex but missing AND patterns: ${searchResult.missingPatterns.join(', ')}`));
451
+ `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but missing AND patterns: ${searchResult.missingPatterns.join(', ')}`));
388
452
  } else {
389
453
  console.log(formatLogMessage('debug',
390
- `[curl] ${requestUrl} (${partyType}) matched regex but no search patterns found`));
454
+ `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no search patterns found`));
391
455
  }
392
456
  }
393
457
  }
394
458
 
395
459
  } catch (err) {
396
460
  if (forceDebug) {
397
- console.log(formatLogMessage('debug', `[curl] Handler failed for ${requestUrl}: ${err.message}`));
461
+ console.log(formatLogMessage('debug', `${CURL_TAG} Handler failed for ${requestUrl}: ${err.message}`));
398
462
  }
399
463
  }
400
464
  };
@@ -434,9 +498,12 @@ function validateCurlAvailability() {
434
498
  }
435
499
  }
436
500
 
501
+ // Public surface used by nwss.js (createCurlHandler + validateCurlAvailability).
502
+ // downloadWithCurl and searchContent are module-internal helpers — no external
503
+ // caller imports them from here. lib/searchstring.js has its own independently-
504
+ // defined functions of the same names, which is why a naive grep showed
505
+ // false-positive 'external uses'.
437
506
  module.exports = {
438
- downloadWithCurl,
439
- searchContent,
440
507
  createCurlHandler,
441
508
  validateCurlAvailability
442
509
  };