@fanboynz/network-scanner 2.0.66 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +134 -10
- package/CHANGELOG.md +135 -0
- package/CLAUDE.md +18 -7
- package/README.md +12 -4
- package/lib/adblock-rust.js +23 -18
- package/lib/adblock.js +127 -82
- package/lib/browserexit.js +210 -200
- package/lib/browserhealth.js +84 -60
- package/lib/cdp.js +103 -81
- package/lib/clear_sitedata.js +61 -159
- package/lib/cloudflare.js +579 -409
- package/lib/colorize.js +29 -12
- package/lib/compare.js +16 -8
- package/lib/compress.js +2 -1
- package/lib/curl.js +287 -220
- package/lib/domain-cache.js +87 -40
- package/lib/dry-run.js +137 -194
- package/lib/fingerprint.js +20 -18
- package/lib/flowproxy.js +391 -188
- package/lib/ghost-cursor.js +8 -7
- package/lib/grep.js +248 -171
- package/lib/ignore_similar.js +70 -124
- package/lib/interaction.js +132 -235
- package/lib/nettools.js +309 -87
- package/lib/openvpn_vpn.js +12 -11
- package/lib/output.js +92 -59
- package/lib/post-processing.js +216 -162
- package/lib/redirect.js +46 -30
- package/lib/referrer.js +158 -165
- package/lib/searchstring.js +290 -381
- package/lib/smart-cache.js +141 -91
- package/lib/socks-relay.js +8 -7
- package/lib/spawn-async.js +137 -0
- package/lib/validate_rules.js +188 -176
- package/lib/wireguard_vpn.js +111 -117
- package/nwss.js +740 -156
- package/package.json +4 -4
package/lib/curl.js
CHANGED
|
@@ -2,22 +2,36 @@
|
|
|
2
2
|
// Handles HTTP content downloading using curl for searchstring analysis
|
|
3
3
|
|
|
4
4
|
const fs = require('fs');
|
|
5
|
+
// spawnSync only kept for validateCurlAvailability (runs once at
|
|
6
|
+
// startup). Production curl downloads go through runProcess (async).
|
|
5
7
|
const { spawnSync } = require('child_process');
|
|
6
|
-
const {
|
|
8
|
+
const { runProcess } = require('./spawn-async');
|
|
9
|
+
const { messageColors, formatLogMessage } = require('./colorize');
|
|
10
|
+
const { getReferrerForUrl } = require('./referrer');
|
|
11
|
+
const CURL_TAG = messageColors.processing('[curl]');
|
|
7
12
|
|
|
8
13
|
// === Constants ===
|
|
9
14
|
const CURL_DEFAULTS = {
|
|
10
15
|
TIMEOUT_SECONDS: 30,
|
|
11
16
|
MAX_REDIRECTS: 5,
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
17
|
+
// 50MB to match lib/searchstring.js's downloadWithCurl cap — the two
|
|
18
|
+
// modules previously had different defaults (10MB vs 50MB) so the same
|
|
19
|
+
// URL could succeed or fail depending on which code path fetched it.
|
|
20
|
+
MAX_SIZE_BYTES: 50 * 1024 * 1024,
|
|
21
|
+
VALIDATION_TIMEOUT: 5000,
|
|
16
22
|
CURL_SUCCESS_STATUS: 0,
|
|
17
|
-
METADATA_PIPE_PARTS: 3, // http_code|content_type|size_download
|
|
18
23
|
VERSION_LINE_INDEX: 0
|
|
19
24
|
};
|
|
20
25
|
|
|
26
|
+
// Module-level so downloadWithCurl doesn't reallocate this closure on
|
|
27
|
+
// every call. No state captured — pure factory.
|
|
28
|
+
function errResult(msg) {
|
|
29
|
+
return {
|
|
30
|
+
content: '', httpCode: 0, contentType: 'unknown', downloadSize: 0,
|
|
31
|
+
success: false, error: msg
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
|
|
21
35
|
/**
|
|
22
36
|
* Downloads content using curl with browser-like headers
|
|
23
37
|
* @param {string} url - The URL to download
|
|
@@ -34,90 +48,80 @@ async function downloadWithCurl(url, userAgent = '', options = {}) {
|
|
|
34
48
|
customHeaders = {}
|
|
35
49
|
} = options;
|
|
36
50
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
curlArgs.push('-L'); // Follow redirects
|
|
49
|
-
}
|
|
51
|
+
const curlArgs = [
|
|
52
|
+
'-s',
|
|
53
|
+
'--max-time', timeout.toString(),
|
|
54
|
+
'--max-redirs', maxRedirects.toString(),
|
|
55
|
+
'--fail-with-body',
|
|
56
|
+
'--compressed',
|
|
57
|
+
// Leading '\n' guarantees the metadata sits on its own line even
|
|
58
|
+
// when content has no trailing newline (older format had no
|
|
59
|
+
// separator and concatenated metadata with the last content byte).
|
|
60
|
+
'--write-out', '\n%{http_code}|%{content_type}|%{size_download}'
|
|
61
|
+
];
|
|
50
62
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
curlArgs.push('-H', `User-Agent: ${userAgent}`);
|
|
54
|
-
}
|
|
63
|
+
if (followRedirects) curlArgs.push('-L');
|
|
64
|
+
if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);
|
|
55
65
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
);
|
|
68
|
-
|
|
69
|
-
// Add custom headers
|
|
70
|
-
Object.entries(customHeaders).forEach(([key, value]) => {
|
|
71
|
-
curlArgs.push('-H', `${key}: ${value}`);
|
|
72
|
-
});
|
|
66
|
+
curlArgs.push(
|
|
67
|
+
'-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
68
|
+
'-H', 'Accept-Language: en-US,en;q=0.5',
|
|
69
|
+
'-H', 'Accept-Encoding: gzip, deflate, br',
|
|
70
|
+
'-H', 'Connection: keep-alive',
|
|
71
|
+
'-H', 'Upgrade-Insecure-Requests: 1',
|
|
72
|
+
'-H', 'Sec-Fetch-Dest: document',
|
|
73
|
+
'-H', 'Sec-Fetch-Mode: navigate',
|
|
74
|
+
'-H', 'Sec-Fetch-Site: none',
|
|
75
|
+
'-H', 'Cache-Control: no-cache'
|
|
76
|
+
);
|
|
73
77
|
|
|
74
|
-
|
|
78
|
+
Object.entries(customHeaders).forEach(([key, value]) => {
|
|
79
|
+
curlArgs.push('-H', `${key}: ${value}`);
|
|
80
|
+
});
|
|
81
|
+
curlArgs.push(url);
|
|
75
82
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
contentType: contentType || 'unknown',
|
|
107
|
-
downloadSize: parseInt(downloadSize) || content.length,
|
|
108
|
-
success: true
|
|
109
|
-
};
|
|
110
|
-
|
|
111
|
-
} catch (error) {
|
|
112
|
-
return {
|
|
113
|
-
content: '',
|
|
114
|
-
httpCode: 0,
|
|
115
|
-
contentType: 'unknown',
|
|
116
|
-
downloadSize: 0,
|
|
117
|
-
success: false,
|
|
118
|
-
error: error.message
|
|
119
|
-
};
|
|
83
|
+
// Shared async-spawn helper handles streaming/cap/timeout/kill plumbing.
|
|
84
|
+
const result = await runProcess('curl', curlArgs, {
|
|
85
|
+
timeout: timeout * 1000,
|
|
86
|
+
maxStdout: maxSize
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
if (result.error) return errResult(result.error);
|
|
90
|
+
if (result.truncated) return errResult(`Output exceeded ${maxSize} bytes`);
|
|
91
|
+
if (result.signal) return errResult(`Killed by signal ${result.signal}`);
|
|
92
|
+
if (result.code !== CURL_DEFAULTS.CURL_SUCCESS_STATUS) {
|
|
93
|
+
return errResult(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
const output = result.stdout.toString('utf8');
|
|
97
|
+
// lastIndexOf('\n') is a single O(n) scan from the end vs the old
|
|
98
|
+
// split('\n') + slice(0,-1) + join('\n') which was three full passes
|
|
99
|
+
// plus two intermediate array allocations.
|
|
100
|
+
const sepIdx = output.lastIndexOf('\n');
|
|
101
|
+
if (sepIdx === -1) return errResult('No metadata separator in curl output');
|
|
102
|
+
|
|
103
|
+
const content = output.slice(0, sepIdx);
|
|
104
|
+
const metadata = output.slice(sepIdx + 1);
|
|
105
|
+
|
|
106
|
+
// Split on first/last pipe so the middle (content-type) can legitimately
|
|
107
|
+
// contain pipes — naive split('|') with parts-count check would drop the
|
|
108
|
+
// whole response with 'Invalid metadata format' for such content-types.
|
|
109
|
+
const firstPipe = metadata.indexOf('|');
|
|
110
|
+
const lastPipe = metadata.lastIndexOf('|');
|
|
111
|
+
if (firstPipe === -1 || firstPipe === lastPipe) {
|
|
112
|
+
return errResult(`Invalid metadata format: missing pipes in "${metadata}"`);
|
|
120
113
|
}
|
|
114
|
+
const httpCode = metadata.slice(0, firstPipe);
|
|
115
|
+
const contentType = metadata.slice(firstPipe + 1, lastPipe);
|
|
116
|
+
const downloadSize = metadata.slice(lastPipe + 1);
|
|
117
|
+
|
|
118
|
+
return {
|
|
119
|
+
content,
|
|
120
|
+
httpCode: parseInt(httpCode, 10) || 0,
|
|
121
|
+
contentType: contentType || 'unknown',
|
|
122
|
+
downloadSize: parseInt(downloadSize, 10) || content.length,
|
|
123
|
+
success: true
|
|
124
|
+
};
|
|
121
125
|
}
|
|
122
126
|
|
|
123
127
|
/**
|
|
@@ -134,58 +138,103 @@ function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSe
|
|
|
134
138
|
}
|
|
135
139
|
|
|
136
140
|
const lowerContent = content.toLowerCase();
|
|
137
|
-
|
|
138
|
-
// Handle AND logic searchstring_and (all patterns must be present)
|
|
141
|
+
|
|
142
|
+
// Handle AND logic searchstring_and (all patterns must be present).
|
|
143
|
+
// Short-circuits on first missing pattern — the old code walked the
|
|
144
|
+
// entire list to build a full missingPatterns array that's only used
|
|
145
|
+
// by a debug log. Now we early-exit and report the first miss (the
|
|
146
|
+
// debug log's missingPatterns.join(', ') still works with one entry).
|
|
139
147
|
if (hasSearchStringAnd && searchStringsAnd.length > 0) {
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
if (lowerContent.includes(
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
148
|
+
// Pre-lower patterns once — was per-iteration toLowerCase before.
|
|
149
|
+
// For a 20-pattern AND check the difference is small per call but
|
|
150
|
+
// the pattern itself never changes between iterations of the loop.
|
|
151
|
+
const lowered = searchStringsAnd.map(p => p.toLowerCase());
|
|
152
|
+
for (let i = 0; i < searchStringsAnd.length; i++) {
|
|
153
|
+
if (!lowerContent.includes(lowered[i])) {
|
|
154
|
+
return {
|
|
155
|
+
found: false,
|
|
156
|
+
matchedPattern: null,
|
|
157
|
+
matchType: 'AND',
|
|
158
|
+
foundPatterns: searchStringsAnd.slice(0, i),
|
|
159
|
+
missingPatterns: [searchStringsAnd[i]]
|
|
160
|
+
};
|
|
149
161
|
}
|
|
150
162
|
}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
foundPatterns,
|
|
159
|
-
missingPatterns: []
|
|
160
|
-
};
|
|
161
|
-
} else {
|
|
162
|
-
return {
|
|
163
|
-
found: false,
|
|
164
|
-
matchedPattern: null,
|
|
165
|
-
matchType: 'AND',
|
|
166
|
-
foundPatterns,
|
|
167
|
-
missingPatterns
|
|
168
|
-
};
|
|
169
|
-
}
|
|
163
|
+
return {
|
|
164
|
+
found: true,
|
|
165
|
+
matchedPattern: searchStringsAnd.join(' AND '),
|
|
166
|
+
matchType: 'AND',
|
|
167
|
+
foundPatterns: searchStringsAnd,
|
|
168
|
+
missingPatterns: []
|
|
169
|
+
};
|
|
170
170
|
}
|
|
171
|
-
|
|
172
|
-
// Handle OR logic searchstring (any pattern can match)
|
|
171
|
+
|
|
172
|
+
// Handle OR logic searchstring (any pattern can match). Patterns are
|
|
173
|
+
// lowercased inside the loop rather than pre-lowered, because OR returns
|
|
174
|
+
// on the first match and later patterns may never be examined.
|
|
173
175
|
if (searchStrings.length > 0) {
|
|
174
|
-
for (
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
matchedPattern: pattern,
|
|
176
|
+
for (let i = 0; i < searchStrings.length; i++) {
|
|
177
|
+
if (lowerContent.includes(searchStrings[i].toLowerCase())) {
|
|
178
|
+
return {
|
|
179
|
+
found: true,
|
|
180
|
+
matchedPattern: searchStrings[i],
|
|
180
181
|
matchType: 'OR'
|
|
181
182
|
};
|
|
182
183
|
}
|
|
183
184
|
}
|
|
184
185
|
}
|
|
185
|
-
|
|
186
|
+
|
|
186
187
|
return { found: false, matchedPattern: null, matchType: null };
|
|
187
188
|
}
|
|
188
189
|
|
|
190
|
+
/**
|
|
191
|
+
* Emits a match for a curl-fetched URL to both the verbose console
|
|
192
|
+
* (when siteConfig.verbose === 1) and the matched-URLs log file
|
|
193
|
+
* (when dumpUrls is true). Single source of truth for the format —
|
|
194
|
+
* both no-searchstring and with-searchstring match paths funnel
|
|
195
|
+
* through here so partyType / resourceInfo / timestamp / format
|
|
196
|
+
* don't drift between the two branches.
|
|
197
|
+
*
|
|
198
|
+
* @param {object} opts
|
|
199
|
+
* @param {string} opts.simplifiedUrl
|
|
200
|
+
* @param {string} opts.requestUrl
|
|
201
|
+
* @param {boolean} opts.isFirstParty
|
|
202
|
+
* @param {string|null} opts.resourceType
|
|
203
|
+
* @param {string|null} opts.matchInfo - null for "matched regex only"
|
|
204
|
+
* (no searchstring), a string like
|
|
205
|
+
* 'pattern: "X"' or 'patterns: 2/3'
|
|
206
|
+
* for searchstring matches
|
|
207
|
+
* @param {number|undefined} opts.verbose
|
|
208
|
+
* @param {boolean} opts.dumpUrls
|
|
209
|
+
* @param {string} opts.matchedUrlsLogFile
|
|
210
|
+
*/
|
|
211
|
+
function logMatchedRequest({
|
|
212
|
+
simplifiedUrl, requestUrl, isFirstParty, resourceType,
|
|
213
|
+
matchInfo, verbose, dumpUrls, matchedUrlsLogFile
|
|
214
|
+
}) {
|
|
215
|
+
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
216
|
+
const resourceInfo = resourceType ? ` (${resourceType})` : '';
|
|
217
|
+
|
|
218
|
+
if (verbose === 1) {
|
|
219
|
+
const verboseSuffix = matchInfo ? ` contains ${matchInfo}` : ' matched regex';
|
|
220
|
+
console.log(formatLogMessage('match',
|
|
221
|
+
`[${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${verboseSuffix}${resourceInfo}`));
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
if (dumpUrls && matchedUrlsLogFile) {
|
|
225
|
+
const timestamp = new Date().toISOString();
|
|
226
|
+
// matchInfo goes INSIDE the (party, curl, ...) parens to mirror the
|
|
227
|
+
// pre-refactor file format.
|
|
228
|
+
const fileExtra = matchInfo ? `, ${matchInfo}` : '';
|
|
229
|
+
try {
|
|
230
|
+
fs.appendFileSync(matchedUrlsLogFile,
|
|
231
|
+
`${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl${fileExtra})${resourceInfo}\n`);
|
|
232
|
+
} catch (logErr) {
|
|
233
|
+
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
189
238
|
/**
|
|
190
239
|
* Creates a curl-based URL handler for downloading and searching content
|
|
191
240
|
* @param {object} config - Configuration object containing all necessary parameters
|
|
@@ -197,7 +246,8 @@ function createCurlHandler(config) {
|
|
|
197
246
|
searchStringsAnd,
|
|
198
247
|
hasSearchStringAnd,
|
|
199
248
|
regexes,
|
|
200
|
-
matchedDomains
|
|
249
|
+
// matchedDomains intentionally not destructured — only addMatchedDomain
|
|
250
|
+
// is called; the underlying collection is opaque to this handler.
|
|
201
251
|
addMatchedDomain,
|
|
202
252
|
isDomainAlreadyDetected,
|
|
203
253
|
onContentFetched,
|
|
@@ -215,101 +265,128 @@ function createCurlHandler(config) {
|
|
|
215
265
|
hasSearchString
|
|
216
266
|
} = config;
|
|
217
267
|
|
|
268
|
+
// Hoisted: currentUrl doesn't change for this handler's lifetime, so
|
|
269
|
+
// parsing its root domain once at handler-creation eliminates the
|
|
270
|
+
// per-request parse + getRootDomain call.
|
|
271
|
+
let currentRootDomain = '';
|
|
272
|
+
try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}
|
|
273
|
+
|
|
218
274
|
return async function curlHandler(requestUrl) {
|
|
219
275
|
try {
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
if (
|
|
276
|
+
// Regex check FIRST — cheap filter that skips ~99% of requests.
|
|
277
|
+
// Previously this ran AFTER a URL parse + domain-cache lookup,
|
|
278
|
+
// paying for parses on requests we then immediately drop.
|
|
279
|
+
const matchesRegex = regexes.some(re => re.test(requestUrl));
|
|
280
|
+
if (!matchesRegex) {
|
|
225
281
|
if (forceDebug) {
|
|
226
|
-
console.log(formatLogMessage('debug',
|
|
282
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} URL ${requestUrl} doesn't match any regex patterns`));
|
|
227
283
|
}
|
|
228
284
|
return;
|
|
229
285
|
}
|
|
230
|
-
|
|
231
|
-
//
|
|
232
|
-
|
|
233
|
-
|
|
286
|
+
|
|
287
|
+
// Parse requestUrl ONCE and reuse. The prior structure parsed it
|
|
288
|
+
// 4-6 times: two `new URL().hostname` calls, two dead-var
|
|
289
|
+
// hostname computations that were never read, plus the
|
|
290
|
+
// getRootDomain calls. Now the hostname and the cache key (fullSubdomain)
|
|
291
|
+
// come from this one URL object; the root domain used for the first-party
|
|
292
|
+
// check is computed separately by getRootDomain(requestUrl) below.
|
|
293
|
+
let requestHostname;
|
|
294
|
+
try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
|
|
295
|
+
const fullSubdomain = requestHostname; // always the full subdomain
|
|
296
|
+
|
|
297
|
+
// Compute requestRootDomain ONCE — derive respDomain from it when
|
|
298
|
+
// perSiteSubDomains is false, and reuse it for the first-party
|
|
299
|
+
// check. Previously getRootDomain(requestUrl) was called twice in
|
|
300
|
+
// that path.
|
|
301
|
+
const requestRootDomain = getRootDomain(requestUrl);
|
|
302
|
+
const respDomain = perSiteSubDomains ? requestHostname : requestRootDomain;
|
|
303
|
+
|
|
304
|
+
// Skip if already detected to avoid duplicates
|
|
305
|
+
if (isDomainAlreadyDetected(fullSubdomain)) {
|
|
234
306
|
if (forceDebug) {
|
|
235
|
-
console.log(formatLogMessage('debug',
|
|
307
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected subdomain: ${fullSubdomain}`));
|
|
236
308
|
}
|
|
237
309
|
return;
|
|
238
310
|
}
|
|
239
|
-
|
|
240
|
-
//
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
const currentRootDomain = getRootDomain(currentUrl);
|
|
244
|
-
const requestRootDomain = getRootDomain(requestUrl);
|
|
311
|
+
|
|
312
|
+
// First-party = same registrable root domain. Same definition the
|
|
313
|
+
// main request handler uses; matches what searchstring.js's
|
|
314
|
+
// responseHandler does too (post the cross-module unification).
|
|
245
315
|
const isFirstParty = currentRootDomain === requestRootDomain;
|
|
246
|
-
|
|
247
|
-
// Apply first-party/third-party filtering
|
|
248
|
-
|
|
316
|
+
|
|
317
|
+
// Apply first-party/third-party filtering. `=== false` only (no
|
|
318
|
+
// `|| === 0`) — matches lib/searchstring.js and the main request
|
|
319
|
+
// handler, which all treat these as boolean flags. Accepting 0 as
|
|
320
|
+
// "disabled" here but not elsewhere would silently disagree if a
|
|
321
|
+
// user ever set "firstParty": 0 in JSON config.
|
|
322
|
+
if (isFirstParty && siteConfig.firstParty === false) {
|
|
249
323
|
if (forceDebug) {
|
|
250
|
-
console.log(formatLogMessage('debug',
|
|
324
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty disabled): ${requestUrl}`));
|
|
251
325
|
}
|
|
252
326
|
return;
|
|
253
327
|
}
|
|
254
|
-
|
|
255
|
-
if (!isFirstParty &&
|
|
328
|
+
|
|
329
|
+
if (!isFirstParty && siteConfig.thirdParty === false) {
|
|
256
330
|
if (forceDebug) {
|
|
257
|
-
console.log(formatLogMessage('debug',
|
|
331
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty disabled): ${requestUrl}`));
|
|
258
332
|
}
|
|
259
333
|
return;
|
|
260
334
|
}
|
|
261
|
-
|
|
335
|
+
|
|
262
336
|
if (forceDebug) {
|
|
263
|
-
console.log(formatLogMessage('debug',
|
|
337
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Processing ${isFirstParty ? 'first-party' : 'third-party'} request: ${requestUrl}`));
|
|
264
338
|
}
|
|
265
|
-
|
|
266
|
-
// If NO searchstring is defined, match immediately (like browser
|
|
267
|
-
|
|
339
|
+
|
|
340
|
+
// If NO searchstring is defined, match immediately (like browser
|
|
341
|
+
// behavior). Simplified from the prior convoluted condition
|
|
342
|
+
// (hasSearchString being true while both arrays are empty is
|
|
343
|
+
// impossible given parseSearchStrings, so the OR was redundant).
|
|
344
|
+
if (!hasSearchString && !hasSearchStringAnd) {
|
|
268
345
|
if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
|
|
269
346
|
if (forceDebug) {
|
|
270
|
-
console.log(formatLogMessage('debug',
|
|
347
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} is in ignore list`));
|
|
271
348
|
}
|
|
272
349
|
return;
|
|
273
350
|
}
|
|
274
351
|
|
|
275
352
|
addMatchedDomain(respDomain, resourceType, fullSubdomain);
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
287
|
-
const resourceInfo = resourceType ? ` (${resourceType})` : '';
|
|
288
|
-
try {
|
|
289
|
-
fs.appendFileSync(matchedUrlsLogFile,
|
|
290
|
-
`${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`);
|
|
291
|
-
} catch (logErr) {
|
|
292
|
-
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
293
|
-
}
|
|
294
|
-
}
|
|
353
|
+
logMatchedRequest({
|
|
354
|
+
simplifiedUrl: currentRootDomain,
|
|
355
|
+
requestUrl,
|
|
356
|
+
isFirstParty,
|
|
357
|
+
resourceType,
|
|
358
|
+
matchInfo: null, // no searchstring — log says "matched regex"
|
|
359
|
+
verbose: siteConfig.verbose,
|
|
360
|
+
dumpUrls,
|
|
361
|
+
matchedUrlsLogFile
|
|
362
|
+
});
|
|
295
363
|
return;
|
|
296
364
|
}
|
|
297
365
|
|
|
298
366
|
// If searchstring IS defined, download and search content
|
|
299
|
-
if (hasSearchString
|
|
300
|
-
console.log(formatLogMessage('debug',
|
|
367
|
+
if ((hasSearchString || hasSearchStringAnd) && forceDebug) {
|
|
368
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content for pattern matching: ${requestUrl}`));
|
|
301
369
|
}
|
|
302
|
-
|
|
303
|
-
// Prepare custom headers from site config
|
|
304
|
-
|
|
370
|
+
|
|
371
|
+
// Prepare custom headers from site config. SHALLOW-COPY so the
|
|
372
|
+
// Referer assignment below doesn't mutate the underlying siteConfig
|
|
373
|
+
// object — the old `siteConfig.custom_headers || {}` was a reference
|
|
374
|
+
// (when present), so setting customHeaders['Referer'] persisted the
|
|
375
|
+
// first URL's random-mode referrer onto siteConfig.custom_headers,
|
|
376
|
+
// and every subsequent URL inherited that pinned value. Silent
|
|
377
|
+
// breakage of {mode:'random_search'} variation across a site's URLs.
|
|
378
|
+
//
|
|
379
|
+
// Uses getReferrerForUrl so ALL referrer modes work — the old
|
|
380
|
+
// inline string/array logic dropped object modes silently.
|
|
381
|
+
const customHeaders = { ...(siteConfig.custom_headers || {}) };
|
|
305
382
|
if (siteConfig.referrer_headers) {
|
|
306
|
-
const referrerUrl =
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
383
|
+
const referrerUrl = getReferrerForUrl(
|
|
384
|
+
requestUrl,
|
|
385
|
+
siteConfig.referrer_headers,
|
|
386
|
+
siteConfig.referrer_disable,
|
|
387
|
+
forceDebug
|
|
388
|
+
);
|
|
389
|
+
if (referrerUrl) customHeaders['Referer'] = referrerUrl;
|
|
313
390
|
}
|
|
314
391
|
|
|
315
392
|
const downloadResult = await downloadWithCurl(requestUrl, userAgent, {
|
|
@@ -320,7 +397,7 @@ function createCurlHandler(config) {
|
|
|
320
397
|
|
|
321
398
|
if (!downloadResult.success) {
|
|
322
399
|
if (forceDebug) {
|
|
323
|
-
console.log(formatLogMessage('debug',
|
|
400
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download ${requestUrl}: ${downloadResult.error}`));
|
|
324
401
|
}
|
|
325
402
|
return;
|
|
326
403
|
}
|
|
@@ -331,7 +408,7 @@ function createCurlHandler(config) {
|
|
|
331
408
|
onContentFetched(requestUrl, downloadResult.content);
|
|
332
409
|
} catch (cacheErr) {
|
|
333
410
|
if (forceDebug) {
|
|
334
|
-
console.log(formatLogMessage('debug',
|
|
411
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Content caching failed: ${cacheErr.message}`));
|
|
335
412
|
}
|
|
336
413
|
}
|
|
337
414
|
}
|
|
@@ -347,54 +424,41 @@ function createCurlHandler(config) {
|
|
|
347
424
|
if (searchResult.found) {
|
|
348
425
|
if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
|
|
349
426
|
if (forceDebug) {
|
|
350
|
-
console.log(formatLogMessage('debug',
|
|
427
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} matches but is in ignore list`));
|
|
351
428
|
}
|
|
352
429
|
return;
|
|
353
430
|
}
|
|
354
431
|
|
|
355
432
|
addMatchedDomain(respDomain, resourceType, fullSubdomain);
|
|
356
|
-
const
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
const timestamp = new Date().toISOString();
|
|
370
|
-
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
371
|
-
const resourceInfo = resourceType ? ` (${resourceType})` : '';
|
|
372
|
-
const matchInfo = searchResult.matchType === 'AND'
|
|
373
|
-
? `patterns: ${searchResult.foundPatterns.length}/${searchStringsAnd.length}`
|
|
374
|
-
: `pattern: "${searchResult.matchedPattern}"`;
|
|
375
|
-
try {
|
|
376
|
-
fs.appendFileSync(matchedUrlsLogFile,
|
|
377
|
-
`${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, ${matchInfo})${resourceInfo}\n`);
|
|
378
|
-
} catch (logErr) {
|
|
379
|
-
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
380
|
-
}
|
|
381
|
-
}
|
|
433
|
+
const matchInfo = searchResult.matchType === 'AND'
|
|
434
|
+
? `patterns: ${searchResult.foundPatterns.length}/${searchStringsAnd.length}`
|
|
435
|
+
: `pattern: "${searchResult.matchedPattern}"`;
|
|
436
|
+
logMatchedRequest({
|
|
437
|
+
simplifiedUrl: currentRootDomain,
|
|
438
|
+
requestUrl,
|
|
439
|
+
isFirstParty,
|
|
440
|
+
resourceType,
|
|
441
|
+
matchInfo,
|
|
442
|
+
verbose: siteConfig.verbose,
|
|
443
|
+
dumpUrls,
|
|
444
|
+
matchedUrlsLogFile
|
|
445
|
+
});
|
|
382
446
|
} else {
|
|
383
447
|
if (forceDebug) {
|
|
384
448
|
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
385
449
|
if (searchResult.matchType === 'AND' && searchResult.missingPatterns) {
|
|
386
450
|
console.log(formatLogMessage('debug',
|
|
387
|
-
|
|
451
|
+
`${CURL_TAG} ${requestUrl} (${partyType}) matched regex but missing AND patterns: ${searchResult.missingPatterns.join(', ')}`));
|
|
388
452
|
} else {
|
|
389
453
|
console.log(formatLogMessage('debug',
|
|
390
|
-
|
|
454
|
+
`${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no search patterns found`));
|
|
391
455
|
}
|
|
392
456
|
}
|
|
393
457
|
}
|
|
394
458
|
|
|
395
459
|
} catch (err) {
|
|
396
460
|
if (forceDebug) {
|
|
397
|
-
console.log(formatLogMessage('debug',
|
|
461
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Handler failed for ${requestUrl}: ${err.message}`));
|
|
398
462
|
}
|
|
399
463
|
}
|
|
400
464
|
};
|
|
@@ -434,9 +498,12 @@ function validateCurlAvailability() {
|
|
|
434
498
|
}
|
|
435
499
|
}
|
|
436
500
|
|
|
501
|
+
// Public surface used by nwss.js (createCurlHandler + validateCurlAvailability).
|
|
502
|
+
// downloadWithCurl and searchContent are module-internal helpers — no external
|
|
503
|
+
// caller imports them from here. lib/searchstring.js has its own independently-
|
|
504
|
+
// defined functions of the same names, which is why a naive grep showed
|
|
505
|
+
// false-positive 'external uses'.
|
|
437
506
|
module.exports = {
|
|
438
|
-
downloadWithCurl,
|
|
439
|
-
searchContent,
|
|
440
507
|
createCurlHandler,
|
|
441
508
|
validateCurlAvailability
|
|
442
509
|
};
|