@fanboynz/network-scanner 2.0.66 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +134 -10
- package/CHANGELOG.md +135 -0
- package/CLAUDE.md +18 -7
- package/README.md +12 -4
- package/lib/adblock-rust.js +23 -18
- package/lib/adblock.js +127 -82
- package/lib/browserexit.js +210 -200
- package/lib/browserhealth.js +84 -60
- package/lib/cdp.js +103 -81
- package/lib/clear_sitedata.js +61 -159
- package/lib/cloudflare.js +579 -409
- package/lib/colorize.js +29 -12
- package/lib/compare.js +16 -8
- package/lib/compress.js +2 -1
- package/lib/curl.js +287 -220
- package/lib/domain-cache.js +87 -40
- package/lib/dry-run.js +137 -194
- package/lib/fingerprint.js +20 -18
- package/lib/flowproxy.js +391 -188
- package/lib/ghost-cursor.js +8 -7
- package/lib/grep.js +248 -171
- package/lib/ignore_similar.js +70 -124
- package/lib/interaction.js +132 -235
- package/lib/nettools.js +309 -87
- package/lib/openvpn_vpn.js +12 -11
- package/lib/output.js +92 -59
- package/lib/post-processing.js +216 -162
- package/lib/redirect.js +46 -30
- package/lib/referrer.js +158 -165
- package/lib/searchstring.js +290 -381
- package/lib/smart-cache.js +141 -91
- package/lib/socks-relay.js +8 -7
- package/lib/spawn-async.js +137 -0
- package/lib/validate_rules.js +188 -176
- package/lib/wireguard_vpn.js +111 -117
- package/nwss.js +740 -156
- package/package.json +4 -4
package/lib/searchstring.js
CHANGED
|
@@ -2,14 +2,19 @@
|
|
|
2
2
|
// Handles response content analysis for searchstring functionality
|
|
3
3
|
|
|
4
4
|
const fs = require('fs');
|
|
5
|
-
const {
|
|
5
|
+
const { formatLogMessage, messageColors } = require('./colorize');
|
|
6
|
+
const CURL_TAG = messageColors.processing('[curl]');
|
|
7
|
+
// responseHandler is a separate code path (Puppeteer response listener,
|
|
8
|
+
// not curl) — its debug output gets its own subsystem prefix so it's
|
|
9
|
+
// distinguishable from curl-handler logs.
|
|
10
|
+
const SEARCHSTRING_TAG = messageColors.processing('[searchstring]');
|
|
11
|
+
const { runProcess } = require('./spawn-async');
|
|
6
12
|
const { grepContent } = require('./grep');
|
|
7
13
|
|
|
8
14
|
// Configuration constants for search logic
|
|
9
15
|
const SEARCH_CONFIG = {
|
|
10
16
|
MAX_CONTENT_SIZE: 50 * 1024 * 1024, // 50MB max content size
|
|
11
|
-
MAX_SEARCH_STRING_LENGTH: 1000
|
|
12
|
-
XML_ENTITY_TIMEOUT: 5000 // 5 second timeout for XML processing
|
|
17
|
+
MAX_SEARCH_STRING_LENGTH: 1000
|
|
13
18
|
};
|
|
14
19
|
|
|
15
20
|
/**
|
|
@@ -46,36 +51,6 @@ function parseSearchStrings(searchstring, searchstringAnd) {
|
|
|
46
51
|
};
|
|
47
52
|
}
|
|
48
53
|
|
|
49
|
-
/**
|
|
50
|
-
* Helper function to add domain to matched collection (handles both Set and Map)
|
|
51
|
-
* @param {Set|Map} matchedDomains - The matched domains collection
|
|
52
|
-
* @param {Function} addMatchedDomain - Optional helper function for adding domains
|
|
53
|
-
* @param {string} domain - Domain to add
|
|
54
|
-
* @param {string} resourceType - Resource type (for --adblock-rules mode)
|
|
55
|
-
* @param {string} fullSubdomain - Full subdomain for cache tracking (optional)
|
|
56
|
-
*/
|
|
57
|
-
function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourceType = null, fullSubdomain = null) {
|
|
58
|
-
// Use helper function if provided (preferred method)
|
|
59
|
-
if (typeof addMatchedDomain === 'function') {
|
|
60
|
-
addMatchedDomain(domain, resourceType, fullSubdomain);
|
|
61
|
-
return;
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// Fallback: handle different collection types directly
|
|
65
|
-
if (matchedDomains instanceof Set) {
|
|
66
|
-
matchedDomains.add(domain);
|
|
67
|
-
} else if (matchedDomains instanceof Map) {
|
|
68
|
-
if (!matchedDomains.has(domain)) {
|
|
69
|
-
matchedDomains.set(domain, new Set());
|
|
70
|
-
}
|
|
71
|
-
if (resourceType) {
|
|
72
|
-
matchedDomains.get(domain).add(resourceType);
|
|
73
|
-
}
|
|
74
|
-
} else {
|
|
75
|
-
console.warn('[warn] Unknown matchedDomains type, skipping domain addition');
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
54
|
/**
|
|
80
55
|
* Downloads content using curl with appropriate headers and timeout
|
|
81
56
|
* @param {string} url - The URL to download
|
|
@@ -84,55 +59,42 @@ function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourc
|
|
|
84
59
|
* @returns {Promise<string>} The downloaded content
|
|
85
60
|
*/
|
|
86
61
|
async function downloadWithCurl(url, userAgent = '', timeout = 30) {
|
|
87
|
-
|
|
88
|
-
try {
|
|
89
|
-
const curlArgs = [
|
|
90
|
-
'-s', // Silent mode
|
|
91
|
-
'-L', // Follow redirects
|
|
92
|
-
'--max-time', timeout.toString(),
|
|
93
|
-
'--max-redirs', '5',
|
|
94
|
-
'--fail-with-body', // Return body even on HTTP errors
|
|
95
|
-
'--max-filesize', '52428800', // 50MB limit
|
|
96
|
-
'--range', '0-52428799', // Limit download size
|
|
97
|
-
'--compressed', // Accept compressed responses
|
|
98
|
-
];
|
|
62
|
+
const MAX_STDOUT_BYTES = 52428800; // 50MB, matches --max-filesize below
|
|
99
63
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
64
|
+
const curlArgs = [
|
|
65
|
+
'-s',
|
|
66
|
+
'-L',
|
|
67
|
+
'--max-time', timeout.toString(),
|
|
68
|
+
'--max-redirs', '5',
|
|
69
|
+
'--fail-with-body',
|
|
70
|
+
'--max-filesize', '52428800',
|
|
71
|
+
'--range', '0-52428799',
|
|
72
|
+
'--compressed'
|
|
73
|
+
];
|
|
74
|
+
if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);
|
|
75
|
+
curlArgs.push(
|
|
76
|
+
'-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
77
|
+
'-H', 'Accept-Language: en-US,en;q=0.5',
|
|
78
|
+
'-H', 'Accept-Encoding: gzip, deflate',
|
|
79
|
+
'-H', 'Connection: keep-alive',
|
|
80
|
+
'-H', 'Upgrade-Insecure-Requests: 1'
|
|
81
|
+
);
|
|
82
|
+
curlArgs.push(url);
|
|
103
83
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
'-H', 'Connection: keep-alive',
|
|
110
|
-
'-H', 'Upgrade-Insecure-Requests: 1'
|
|
111
|
-
);
|
|
112
|
-
|
|
113
|
-
curlArgs.push(url);
|
|
114
|
-
|
|
115
|
-
// Use spawnSync with proper argument separation
|
|
116
|
-
const result = spawnSync('curl', curlArgs, {
|
|
117
|
-
encoding: 'utf8',
|
|
118
|
-
timeout: timeout * 1000,
|
|
119
|
-
maxBuffer: 10 * 1024 * 1024, // 10MB max buffer
|
|
120
|
-
killSignal: 'SIGTERM'
|
|
121
|
-
});
|
|
122
|
-
|
|
123
|
-
if (result.error) {
|
|
124
|
-
throw result.error;
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
if (result.status !== 0) {
|
|
128
|
-
throw new Error(`Curl exited with status ${result.status}: ${result.stderr}`);
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
resolve(result.stdout);
|
|
132
|
-
} catch (error) {
|
|
133
|
-
reject(new Error(`Curl failed for ${url}: ${error.message}`));
|
|
134
|
-
}
|
|
84
|
+
// Shared async-spawn helper — same streaming/cap/timeout/kill plumbing
|
|
85
|
+
// that used to be ~80 lines of inline boilerplate here.
|
|
86
|
+
const result = await runProcess('curl', curlArgs, {
|
|
87
|
+
timeout: timeout * 1000,
|
|
88
|
+
maxStdout: MAX_STDOUT_BYTES
|
|
135
89
|
});
|
|
90
|
+
|
|
91
|
+
if (result.error) throw new Error(`Curl failed for ${url}: ${result.error}`);
|
|
92
|
+
if (result.truncated) throw new Error(`Curl output exceeded ${MAX_STDOUT_BYTES} bytes for ${url}`);
|
|
93
|
+
if (result.signal) throw new Error(`Curl killed by signal ${result.signal} for ${url}`);
|
|
94
|
+
if (result.code !== 0) {
|
|
95
|
+
throw new Error(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`);
|
|
96
|
+
}
|
|
97
|
+
return result.stdout.toString('utf8');
|
|
136
98
|
}
|
|
137
99
|
|
|
138
100
|
/**
|
|
@@ -166,59 +128,48 @@ async function downloadWithRetry(url, userAgent = '', timeout = 30, retries = 2)
|
|
|
166
128
|
}
|
|
167
129
|
}
|
|
168
130
|
|
|
131
|
+
// Lookup table for the 6 named entities the previous chained-replace
|
|
132
|
+
// handled. Hoisted out of safeDecodeXmlEntities so the object isn't
|
|
133
|
+
// reallocated per call.
|
|
134
|
+
const NAMED_ENTITIES = Object.freeze({
|
|
135
|
+
'&lt;': '<', '&gt;': '>', '&amp;': '&',
|
|
136
|
+
'&quot;': '"', '&apos;': "'", '&#39;': "'"
|
|
137
|
+
});
|
|
138
|
+
|
|
169
139
|
/**
|
|
170
|
-
* Safely decodes XML entities
|
|
140
|
+
* Safely decodes XML entities (named + numeric decimal + numeric hex)
|
|
141
|
+
* in a SINGLE regex pass. The old implementation chained 8 separate
|
|
142
|
+
* .replace() calls, each allocating a full intermediate string — for
|
|
143
|
+
* 50MB content that was ~8 × 50MB ≈ 400MB of throwaway allocations per
|
|
144
|
+
* XML response. Also drops the previous "timeout" check, which only
|
|
145
|
+
* fired between regex passes (not during them) so it never actually
|
|
146
|
+
* bounded runtime on pathological input.
|
|
171
147
|
* @param {string} content - Content to decode
|
|
172
148
|
* @returns {string} Decoded content or original if processing fails
|
|
173
149
|
*/
|
|
174
150
|
function safeDecodeXmlEntities(content) {
|
|
175
|
-
const startTime = Date.now();
|
|
176
|
-
|
|
177
151
|
try {
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
const num = parseInt(dec, 10);
|
|
195
|
-
// Validate range for safety (valid Unicode range)
|
|
196
|
-
if (num >= 0 && num <= 0x10FFFF) {
|
|
197
|
-
return String.fromCharCode(num);
|
|
152
|
+
return content.replace(
|
|
153
|
+
/&lt;|&gt;|&amp;|&quot;|&apos;|&#39;|&#\d+;|&#x[0-9a-fA-F]+;/g,
|
|
154
|
+
(match) => {
|
|
155
|
+
// Named entity — exact match in the lookup table.
|
|
156
|
+
const named = NAMED_ENTITIES[match];
|
|
157
|
+
if (named) return named;
|
|
158
|
+
// Numeric entity — &#xNN; (hex) or &#NN; (decimal).
|
|
159
|
+
const isHex = match[2] === 'x' || match[2] === 'X';
|
|
160
|
+
const numStr = isHex ? match.slice(3, -1) : match.slice(2, -1);
|
|
161
|
+
const num = parseInt(numStr, isHex ? 16 : 10);
|
|
162
|
+
// String.fromCodePoint (NOT fromCharCode) — fromCharCode truncates
|
|
163
|
+
// to 16 bits, so &#x1F600; (😀, codepoint 0x1F600) would decode to
|
|
164
|
+
// '' (a single garbage BMP char) instead of the emoji.
|
|
165
|
+
// fromCodePoint handles the full Unicode range up to 0x10FFFF.
|
|
166
|
+
if (num >= 0 && num <= 0x10FFFF) return String.fromCodePoint(num);
|
|
167
|
+
return match; // out-of-range — keep original
|
|
198
168
|
}
|
|
199
|
-
|
|
200
|
-
});
|
|
201
|
-
|
|
202
|
-
// Check timeout again
|
|
203
|
-
if (Date.now() - startTime > SEARCH_CONFIG.XML_ENTITY_TIMEOUT) {
|
|
204
|
-
console.warn('[warn] XML entity decoding timeout, using partial result');
|
|
205
|
-
return decoded;
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
// Decode numeric entities (hexadecimal)
|
|
209
|
-
decoded = decoded.replace(/&#x([0-9a-f]+);/gi, (match, hex) => {
|
|
210
|
-
const num = parseInt(hex, 16);
|
|
211
|
-
// Validate range for safety (valid Unicode range)
|
|
212
|
-
if (num >= 0 && num <= 0x10FFFF) {
|
|
213
|
-
return String.fromCharCode(num);
|
|
214
|
-
}
|
|
215
|
-
return match; // Keep original if invalid
|
|
216
|
-
});
|
|
217
|
-
|
|
218
|
-
return decoded;
|
|
169
|
+
);
|
|
219
170
|
} catch (xmlErr) {
|
|
220
|
-
console.warn(
|
|
221
|
-
return content;
|
|
171
|
+
console.warn(formatLogMessage('warn', `XML entity decoding failed: ${xmlErr.message}`));
|
|
172
|
+
return content;
|
|
222
173
|
}
|
|
223
174
|
}
|
|
224
175
|
|
|
@@ -229,15 +180,12 @@ function safeDecodeXmlEntities(content) {
|
|
|
229
180
|
*/
|
|
230
181
|
function safeStripTags(content) {
|
|
231
182
|
try {
|
|
232
|
-
//
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
// Replace tags with spaces to preserve word boundaries
|
|
238
|
-
return limitedContent.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ');
|
|
183
|
+
// No content-size cap here — searchContent already truncated to
|
|
184
|
+
// MAX_CONTENT_SIZE before calling, so the previous cap was a no-op.
|
|
185
|
+
// Replace tags with spaces to preserve word boundaries.
|
|
186
|
+
return content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ');
|
|
239
187
|
} catch (stripErr) {
|
|
240
|
-
console.warn(
|
|
188
|
+
console.warn(formatLogMessage('warn', `Tag stripping failed: ${stripErr.message}`));
|
|
241
189
|
return content;
|
|
242
190
|
}
|
|
243
191
|
}
|
|
@@ -251,134 +199,110 @@ function safeStripTags(content) {
|
|
|
251
199
|
* @param {Array<string>} searchStringsAnd - Array of strings that must all be present (AND logic)
|
|
252
200
|
* @param {string} contentType - Content type for specialized handling
|
|
253
201
|
* @param {string} url - URL for debugging context (optional)
|
|
254
|
-
* @returns {
|
|
202
|
+
* @returns {{found: boolean, matchedString: string|null, logicType: 'AND'|'OR'|'NONE', error?: string}}
|
|
255
203
|
*/
|
|
256
204
|
function searchContent(content, searchStrings, searchStringsAnd = [], contentType = '', url = '') {
|
|
257
|
-
// Input validation
|
|
205
|
+
// Input validation. Return shape carries only what callers actually
|
|
206
|
+
// destructure ({found, matchedString, logicType, error}); the old
|
|
207
|
+
// matchedStrings/allMatches/contentSize/searchableSize/processedAsXml
|
|
208
|
+
// fields were computed and returned but never read by any caller.
|
|
258
209
|
if (!content || typeof content !== 'string') {
|
|
259
|
-
return {
|
|
260
|
-
found: false,
|
|
261
|
-
matchedString: null,
|
|
262
|
-
matchedStrings: [],
|
|
263
|
-
allMatches: [],
|
|
264
|
-
logicType: 'NONE',
|
|
265
|
-
error: 'Invalid or empty content'
|
|
266
|
-
};
|
|
210
|
+
return { found: false, matchedString: null, logicType: 'NONE', error: 'Invalid or empty content' };
|
|
267
211
|
}
|
|
268
212
|
|
|
213
|
+
// Validate search strings FIRST — before paying for content truncation,
|
|
214
|
+
// XML entity decoding, tag stripping, and 3× lowercase. Previously these
|
|
215
|
+
// ran first, so a config with zero valid search strings still burned
|
|
216
|
+
// ~150MB of allocations on a 50MB XML response before returning empty.
|
|
217
|
+
const validSearchStrings = searchStrings.filter(str =>
|
|
218
|
+
str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH
|
|
219
|
+
);
|
|
220
|
+
const validSearchStringsAnd = searchStringsAnd.filter(str =>
|
|
221
|
+
str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH
|
|
222
|
+
);
|
|
223
|
+
|
|
224
|
+
if (validSearchStrings.length !== searchStrings.length) {
|
|
225
|
+
console.warn(formatLogMessage('warn', `Filtered ${searchStrings.length - validSearchStrings.length} invalid search strings`));
|
|
226
|
+
}
|
|
227
|
+
if (validSearchStringsAnd.length !== searchStringsAnd.length) {
|
|
228
|
+
console.warn(formatLogMessage('warn', `Filtered ${searchStringsAnd.length - validSearchStringsAnd.length} invalid AND search strings`));
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
if (validSearchStrings.length === 0 && validSearchStringsAnd.length === 0) {
|
|
232
|
+
return { found: false, matchedString: null, logicType: 'NONE', error: 'No valid search strings provided' };
|
|
233
|
+
}
|
|
234
|
+
|
|
269
235
|
// Size check and truncation with warning
|
|
270
236
|
const originalLength = content.length;
|
|
271
237
|
if (originalLength > SEARCH_CONFIG.MAX_CONTENT_SIZE) {
|
|
272
238
|
content = content.substring(0, SEARCH_CONFIG.MAX_CONTENT_SIZE);
|
|
273
|
-
console.warn(
|
|
239
|
+
console.warn(formatLogMessage('warn', `Content truncated from ${originalLength} to ${SEARCH_CONFIG.MAX_CONTENT_SIZE} chars for ${url || 'unknown URL'}`));
|
|
274
240
|
}
|
|
275
|
-
let searchableContent = content;
|
|
276
241
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
242
|
+
// For XML/HTML we search across three views — original, entity-decoded,
|
|
243
|
+
// tag-stripped — so encoded strings ("&amp;") and DOM-text strings
|
|
244
|
+
// ("body text") and raw-source strings (attribute values) all match.
|
|
245
|
+
//
|
|
246
|
+
// The previous implementation joined all three into a single 3× string
|
|
247
|
+
// then .toLowerCase()'d it. For a 50MB response that allocated a 150MB
|
|
248
|
+
// intermediate plus a 150MB lowercase copy. Now we lowercase each
|
|
249
|
+
// version independently and probe with `versionsIncludes()` — same
|
|
250
|
+
// matching semantics (a string found in ANY version still counts) but
|
|
251
|
+
// ~half the peak memory.
|
|
252
|
+
const ct = contentType.toLowerCase();
|
|
253
|
+
const isXmlContent = ct.includes('xml') || ct.includes('html');
|
|
254
|
+
|
|
255
|
+
let lowerVersions;
|
|
280
256
|
if (isXmlContent) {
|
|
281
257
|
try {
|
|
282
|
-
// Safely decode XML entities
|
|
283
258
|
const decodedContent = safeDecodeXmlEntities(content);
|
|
284
|
-
|
|
285
|
-
// Safely strip tags to extract text content
|
|
286
259
|
const strippedContent = safeStripTags(decodedContent);
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
260
|
+
lowerVersions = [
|
|
261
|
+
content.toLowerCase(),
|
|
262
|
+
decodedContent.toLowerCase(),
|
|
263
|
+
strippedContent.toLowerCase()
|
|
264
|
+
];
|
|
292
265
|
} catch (xmlProcessingErr) {
|
|
293
|
-
console.warn(
|
|
294
|
-
|
|
295
|
-
searchableContent = content;
|
|
266
|
+
console.warn(formatLogMessage('warn', `XML processing failed for ${url || 'unknown URL'}: ${xmlProcessingErr.message}`));
|
|
267
|
+
lowerVersions = [content.toLowerCase()];
|
|
296
268
|
}
|
|
269
|
+
} else {
|
|
270
|
+
lowerVersions = [content.toLowerCase()];
|
|
297
271
|
}
|
|
272
|
+
|
|
273
|
+
const versionsIncludes = (needleLower) => {
|
|
274
|
+
for (let i = 0; i < lowerVersions.length; i++) {
|
|
275
|
+
if (lowerVersions[i].includes(needleLower)) return true;
|
|
276
|
+
}
|
|
277
|
+
return false;
|
|
278
|
+
};
|
|
298
279
|
|
|
299
|
-
//
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
)
|
|
303
|
-
|
|
304
|
-
str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH
|
|
305
|
-
);
|
|
306
|
-
|
|
307
|
-
// Warn about filtered search strings
|
|
308
|
-
if (validSearchStrings.length !== searchStrings.length) {
|
|
309
|
-
console.warn(`[warn] Filtered ${searchStrings.length - validSearchStrings.length} invalid search strings`);
|
|
310
|
-
}
|
|
311
|
-
if (validSearchStringsAnd.length !== searchStringsAnd.length) {
|
|
312
|
-
console.warn(`[warn] Filtered ${searchStringsAnd.length - validSearchStringsAnd.length} invalid AND search strings`);
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
// Early return if no valid search strings
|
|
316
|
-
if (validSearchStrings.length === 0 && validSearchStringsAnd.length === 0) {
|
|
317
|
-
return {
|
|
318
|
-
found: false,
|
|
319
|
-
matchedString: null,
|
|
320
|
-
matchedStrings: [],
|
|
321
|
-
allMatches: [],
|
|
322
|
-
logicType: 'NONE',
|
|
323
|
-
error: 'No valid search strings provided'
|
|
324
|
-
};
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
// Pre-compute lowercase content once for better performance
|
|
328
|
-
const lowerContent = searchableContent.toLowerCase();
|
|
329
|
-
|
|
330
|
-
// Check AND logic first (more restrictive) - ALL strings must be present
|
|
331
|
-
if (validSearchStringsAnd && validSearchStringsAnd.length > 0) {
|
|
332
|
-
const foundAndStrings = [];
|
|
333
|
-
|
|
280
|
+
// Check AND logic first (more restrictive) — ALL strings must be present
|
|
281
|
+
// in at least one of the searchable versions. Loop exits early on first
|
|
282
|
+
// NOT-found.
|
|
283
|
+
if (validSearchStringsAnd.length > 0) {
|
|
284
|
+
let allFound = true;
|
|
334
285
|
for (const searchStr of validSearchStringsAnd) {
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
foundAndStrings.push(searchStr);
|
|
338
|
-
} else {
|
|
339
|
-
// Early exit if any AND string is not found
|
|
286
|
+
if (!versionsIncludes(searchStr.toLowerCase())) {
|
|
287
|
+
allFound = false;
|
|
340
288
|
break;
|
|
341
289
|
}
|
|
342
290
|
}
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
if (foundAndStrings.length === validSearchStringsAnd.length) {
|
|
346
|
-
return {
|
|
347
|
-
found: true,
|
|
348
|
-
matchedString: foundAndStrings.join(' AND '),
|
|
349
|
-
matchedStrings: foundAndStrings,
|
|
350
|
-
allMatches: foundAndStrings,
|
|
351
|
-
logicType: 'AND',
|
|
352
|
-
contentSize: originalLength,
|
|
353
|
-
searchableSize: searchableContent.length
|
|
354
|
-
};
|
|
291
|
+
if (allFound) {
|
|
292
|
+
return { found: true, matchedString: validSearchStringsAnd.join(' AND '), logicType: 'AND' };
|
|
355
293
|
}
|
|
356
294
|
}
|
|
357
|
-
|
|
358
|
-
// OR logic: ANY string can match
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
295
|
+
|
|
296
|
+
// OR logic: ANY string can match. Early-exit on first hit since the
|
|
297
|
+
// caller only reads matchedString (the first match). Previously the
|
|
298
|
+
// loop ran to completion to fill an `allMatches` array no caller read.
|
|
362
299
|
for (const searchStr of validSearchStrings) {
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
allMatches.push(searchStr);
|
|
366
|
-
if (!firstMatch) {
|
|
367
|
-
firstMatch = searchStr;
|
|
368
|
-
}
|
|
300
|
+
if (versionsIncludes(searchStr.toLowerCase())) {
|
|
301
|
+
return { found: true, matchedString: searchStr, logicType: 'OR' };
|
|
369
302
|
}
|
|
370
303
|
}
|
|
371
|
-
|
|
372
|
-
return {
|
|
373
|
-
found: allMatches.length > 0,
|
|
374
|
-
matchedString: firstMatch,
|
|
375
|
-
matchedStrings: allMatches,
|
|
376
|
-
allMatches: allMatches,
|
|
377
|
-
logicType: validSearchStrings.length > 0 ? 'OR' : 'NONE',
|
|
378
|
-
contentSize: originalLength,
|
|
379
|
-
searchableSize: searchableContent.length,
|
|
380
|
-
processedAsXml: isXmlContent
|
|
381
|
-
};
|
|
304
|
+
|
|
305
|
+
return { found: false, matchedString: null, logicType: validSearchStrings.length > 0 ? 'OR' : 'NONE' };
|
|
382
306
|
}
|
|
383
307
|
|
|
384
308
|
/**
|
|
@@ -440,44 +364,52 @@ function createCurlHandler(config) {
|
|
|
440
364
|
hasSearchString
|
|
441
365
|
} = config;
|
|
442
366
|
|
|
367
|
+
// Hoisted: currentUrl doesn't change for this handler's lifetime, so
|
|
368
|
+
// parsing its hostname once at handler-creation eliminates the
|
|
369
|
+
// per-request URL allocation.
|
|
370
|
+
let currentUrlHostname = '';
|
|
371
|
+
try { currentUrlHostname = new URL(currentUrl).hostname; } catch (_) {}
|
|
372
|
+
|
|
443
373
|
return async function curlHandler(requestUrl) {
|
|
444
|
-
|
|
445
|
-
//
|
|
374
|
+
// Regex check FIRST — cheap filter that skips ~99% of requests.
|
|
375
|
+
// Previously this ran AFTER a URL parse + domain-cache lookup;
|
|
376
|
+
// the parse is the expensive bit, so doing it after the cheap
|
|
377
|
+
// gate moves the cost off the hot path.
|
|
446
378
|
const matchesRegex = regexes.some(re => re.test(requestUrl));
|
|
447
379
|
if (!matchesRegex) return;
|
|
448
380
|
|
|
449
|
-
//
|
|
450
|
-
|
|
381
|
+
// Parse requestUrl ONCE and reuse. Was parsed 2-3 times.
|
|
382
|
+
let requestHostname;
|
|
383
|
+
try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
|
|
384
|
+
const reqDomain = perSiteSubDomains ? requestHostname : getRootDomain(requestUrl);
|
|
385
|
+
|
|
451
386
|
if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(reqDomain)) {
|
|
452
387
|
if (forceDebug) {
|
|
453
|
-
console.log(
|
|
388
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected domain: ${reqDomain}`));
|
|
454
389
|
}
|
|
455
390
|
return;
|
|
456
391
|
}
|
|
457
|
-
|
|
458
|
-
// Check if this is a first-party request (same domain as the URL being scanned)
|
|
459
|
-
const currentUrlHostname = new URL(currentUrl).hostname;
|
|
460
|
-
const requestHostname = new URL(requestUrl).hostname;
|
|
392
|
+
|
|
461
393
|
const isFirstParty = currentUrlHostname === requestHostname;
|
|
462
394
|
|
|
463
395
|
// Apply first-party/third-party filtering
|
|
464
396
|
if (isFirstParty && siteConfig.firstParty === false) {
|
|
465
397
|
if (forceDebug) {
|
|
466
|
-
console.log(
|
|
398
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty=false): ${requestUrl}`));
|
|
467
399
|
}
|
|
468
400
|
return;
|
|
469
401
|
}
|
|
470
402
|
|
|
471
403
|
if (!isFirstParty && siteConfig.thirdParty === false) {
|
|
472
404
|
if (forceDebug) {
|
|
473
|
-
console.log(
|
|
405
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty=false): ${requestUrl}`));
|
|
474
406
|
}
|
|
475
407
|
return;
|
|
476
408
|
}
|
|
477
409
|
|
|
478
410
|
try {
|
|
479
411
|
if (forceDebug) {
|
|
480
|
-
console.log(
|
|
412
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content from: ${requestUrl}`));
|
|
481
413
|
}
|
|
482
414
|
|
|
483
415
|
// If NO searchstring is defined, match immediately (like browser behavior)
|
|
@@ -486,7 +418,7 @@ function createCurlHandler(config) {
|
|
|
486
418
|
return;
|
|
487
419
|
}
|
|
488
420
|
|
|
489
|
-
|
|
421
|
+
addMatchedDomain(reqDomain, resourceType);
|
|
490
422
|
const simplifiedUrl = getRootDomain(currentUrl);
|
|
491
423
|
|
|
492
424
|
if (siteConfig.verbose === 1) {
|
|
@@ -503,7 +435,7 @@ function createCurlHandler(config) {
|
|
|
503
435
|
fs.appendFileSync(matchedUrlsLogFile,
|
|
504
436
|
`${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`);
|
|
505
437
|
} catch (logErr) {
|
|
506
|
-
console.warn(
|
|
438
|
+
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
507
439
|
}
|
|
508
440
|
}
|
|
509
441
|
return;
|
|
@@ -520,7 +452,7 @@ function createCurlHandler(config) {
|
|
|
520
452
|
return;
|
|
521
453
|
}
|
|
522
454
|
|
|
523
|
-
|
|
455
|
+
addMatchedDomain(reqDomain, resourceType);
|
|
524
456
|
const simplifiedUrl = getRootDomain(currentUrl);
|
|
525
457
|
|
|
526
458
|
if (siteConfig.verbose === 1) {
|
|
@@ -537,20 +469,20 @@ function createCurlHandler(config) {
|
|
|
537
469
|
fs.appendFileSync(matchedUrlsLogFile,
|
|
538
470
|
`${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, searchstring (${logicType}): "${matchedString}")${resourceInfo}\n`);
|
|
539
471
|
} catch (logErr) {
|
|
540
|
-
console.warn(
|
|
472
|
+
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
541
473
|
}
|
|
542
474
|
}
|
|
543
475
|
} else if (forceDebug) {
|
|
544
476
|
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
545
|
-
console.log(
|
|
477
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no searchstring found`));
|
|
546
478
|
if (error) {
|
|
547
|
-
console.log(
|
|
479
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Search error: ${error}`));
|
|
548
480
|
}
|
|
549
481
|
}
|
|
550
482
|
|
|
551
483
|
} catch (err) {
|
|
552
484
|
if (forceDebug) {
|
|
553
|
-
console.log(
|
|
485
|
+
console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download content for ${requestUrl}: ${err.message}`));
|
|
554
486
|
}
|
|
555
487
|
}
|
|
556
488
|
};
|
|
@@ -582,56 +514,66 @@ function createResponseHandler(config) {
|
|
|
582
514
|
resourceType // Will be null for response handler
|
|
583
515
|
} = config;
|
|
584
516
|
|
|
517
|
+
// Hoisted: currentUrl doesn't change for this handler's lifetime.
|
|
518
|
+
// Root domain (not bare hostname) so first-party matches the definition
|
|
519
|
+
// used by nwss.js's main request handler AND lib/curl.js — previously
|
|
520
|
+
// this module used hostname equality, so cdn.example.com and
|
|
521
|
+
// static.example.com were classified third-party here but first-party
|
|
522
|
+
// by the main handler. Unified to the registrable-root rule.
|
|
523
|
+
let currentRootDomain = '';
|
|
524
|
+
try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}
|
|
525
|
+
|
|
585
526
|
return async function responseHandler(response) {
|
|
586
527
|
const respUrl = response.url();
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
//
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
528
|
+
|
|
529
|
+
// Regex check FIRST — cheapest filter, eliminates ~99% of responses
|
|
530
|
+
// before paying for URL parses + domain-cache lookup. Previously this
|
|
531
|
+
// ran AFTER 2× URL parses + isDomainAlreadyDetected; reordering moves
|
|
532
|
+
// the parse cost off the hot path of every subresource response.
|
|
533
|
+
const matchesRegex = regexes.some(re => re.test(respUrl));
|
|
534
|
+
if (!matchesRegex) return;
|
|
535
|
+
|
|
536
|
+
// Parse respUrl ONCE and reuse. Was parsed 2-3 times per response.
|
|
537
|
+
let respHostname;
|
|
538
|
+
try { respHostname = new URL(respUrl).hostname; } catch (_) { return; }
|
|
539
|
+
const fullSubdomain = respHostname; // hostname is always the full subdomain
|
|
540
|
+
|
|
593
541
|
if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(fullSubdomain)) {
|
|
594
542
|
return;
|
|
595
543
|
}
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
//
|
|
600
|
-
// The main request handler already filtered first-party/third-party requests
|
|
601
|
-
// This response handler only runs for requests that passed that filter
|
|
602
|
-
// However, we need to apply the same first-party/third-party logic here for searchstring analysis
|
|
603
|
-
// because the response handler analyzes content, not just URLs
|
|
604
|
-
|
|
605
|
-
// Apply first-party/third-party filtering for searchstring analysis
|
|
606
|
-
// Use the exact same logic as the main request handler
|
|
544
|
+
// respDomain (root domain) is only needed inside the `if (found)` block
|
|
545
|
+
// below. Deferring the getRootDomain call avoids the URL re-parse for
|
|
546
|
+
// every regex-matched response whose content doesn't contain the
|
|
547
|
+
// searchstring — the common case on most pages.
|
|
607
548
|
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
const
|
|
549
|
+
// First-party / third-party gate. Root-domain comparison matches the
|
|
550
|
+
// main handler and curl.js — old hostname comparison disagreed.
|
|
551
|
+
const respRootDomain = getRootDomain(respUrl);
|
|
552
|
+
const isFirstParty = currentRootDomain === respRootDomain;
|
|
611
553
|
if (isFirstParty && siteConfig.firstParty === false) {
|
|
612
554
|
if (forceDebug) {
|
|
613
|
-
console.log(
|
|
555
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Skipping first-party response for searchstring analysis (firstParty=false): ${respUrl}`));
|
|
614
556
|
}
|
|
615
557
|
return;
|
|
616
558
|
}
|
|
617
|
-
|
|
559
|
+
|
|
618
560
|
if (!isFirstParty && siteConfig.thirdParty === false) {
|
|
619
561
|
if (forceDebug) {
|
|
620
|
-
console.log(
|
|
562
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Skipping third-party response for searchstring analysis (thirdParty=false): ${respUrl}`));
|
|
621
563
|
}
|
|
622
564
|
return;
|
|
623
565
|
}
|
|
624
|
-
|
|
566
|
+
|
|
625
567
|
try {
|
|
626
568
|
// Only capture appropriate content types to avoid binary data
|
|
627
569
|
const contentType = response.headers()['content-type'] || '';
|
|
628
570
|
if (!shouldAnalyzeContentType(contentType)) {
|
|
629
571
|
if (forceDebug) {
|
|
630
|
-
console.log(
|
|
572
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Skipping content analysis for ${respUrl} (content-type: ${contentType})`));
|
|
631
573
|
}
|
|
632
574
|
return;
|
|
633
575
|
}
|
|
634
|
-
|
|
576
|
+
|
|
635
577
|
const content = await response.text();
|
|
636
578
|
|
|
637
579
|
// Cache the fetched content if callback provided
|
|
@@ -640,7 +582,7 @@ function createResponseHandler(config) {
|
|
|
640
582
|
config.onContentFetched(respUrl, content);
|
|
641
583
|
} catch (cacheErr) {
|
|
642
584
|
if (forceDebug) {
|
|
643
|
-
console.log(
|
|
585
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Content caching failed: ${cacheErr.message}`));
|
|
644
586
|
}
|
|
645
587
|
}
|
|
646
588
|
}
|
|
@@ -677,7 +619,7 @@ function createResponseHandler(config) {
|
|
|
677
619
|
}
|
|
678
620
|
} catch (grepErr) {
|
|
679
621
|
if (forceDebug) {
|
|
680
|
-
console.log(
|
|
622
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Grep failed for ${respUrl}, falling back to JavaScript: ${grepErr.message}`));
|
|
681
623
|
}
|
|
682
624
|
// Fallback to JavaScript search
|
|
683
625
|
searchResult = searchContent(content, searchStrings, searchStringsAnd, contentType, respUrl);
|
|
@@ -690,10 +632,13 @@ function createResponseHandler(config) {
|
|
|
690
632
|
const { found, matchedString, logicType, error } = searchResult;
|
|
691
633
|
|
|
692
634
|
if (found) {
|
|
635
|
+
// Reuse respRootDomain from the first-party check — was already
|
|
636
|
+
// computed above. Saves a second getRootDomain call per match.
|
|
637
|
+
const respDomain = perSiteSubDomains ? respHostname : respRootDomain;
|
|
693
638
|
if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
|
|
694
639
|
return;
|
|
695
640
|
}
|
|
696
|
-
|
|
641
|
+
|
|
697
642
|
// Response handler doesn't have access to specific resource type
|
|
698
643
|
// Use the addMatchedDomain helper which handles fullSubdomain properly
|
|
699
644
|
addMatchedDomain(respDomain, null, fullSubdomain);
|
|
@@ -713,138 +658,104 @@ function createResponseHandler(config) {
|
|
|
713
658
|
fs.appendFileSync(matchedUrlsLogFile,
|
|
714
659
|
`${timestamp} [match][${simplifiedUrl}] ${respUrl} (${partyType}, ${searchMethod}, searchstring (${logicType}): "${matchedString}")\n`);
|
|
715
660
|
} catch (logErr) {
|
|
716
|
-
console.warn(
|
|
661
|
+
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
717
662
|
}
|
|
718
663
|
}
|
|
719
664
|
} else if (forceDebug) {
|
|
720
665
|
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
721
666
|
const searchMethod = useGrep ? 'grep' : 'js';
|
|
722
|
-
console.log(
|
|
667
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} ${respUrl} (${partyType}, ${searchMethod}) matched regex but no searchstring found`));
|
|
723
668
|
if (error) {
|
|
724
|
-
console.log(
|
|
669
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Search error: ${error}`));
|
|
725
670
|
}
|
|
726
671
|
}
|
|
727
|
-
|
|
672
|
+
|
|
728
673
|
} catch (err) {
|
|
729
674
|
if (forceDebug) {
|
|
730
|
-
console.log(
|
|
675
|
+
console.log(formatLogMessage('debug', `${SEARCHSTRING_TAG} Failed to read response content for ${respUrl}: ${err.message}`));
|
|
731
676
|
}
|
|
732
677
|
}
|
|
733
678
|
};
|
|
734
679
|
}
|
|
735
680
|
|
|
736
681
|
/**
|
|
737
|
-
* Validates
|
|
738
|
-
*
|
|
739
|
-
*
|
|
740
|
-
*
|
|
682
|
+
* Validates a single string-or-array-of-strings value against the
|
|
683
|
+
* shared rules: type, non-empty, per-element type/non-empty, length cap.
|
|
684
|
+
* Used by validateSearchString for both searchstring and searchstring_and.
|
|
685
|
+
*
|
|
686
|
+
* @param {string|Array<string>} value
|
|
687
|
+
* @param {string} fieldName - e.g. 'searchstring' or 'searchstring_and'
|
|
688
|
+
* @returns {{isValid: boolean, error: string|null}}
|
|
741
689
|
*/
|
|
742
|
-
function
|
|
743
|
-
if (
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
return { isValid: false, error: 'searchstring cannot be empty string' };
|
|
690
|
+
function validateSearchValue(value, fieldName) {
|
|
691
|
+
if (typeof value === 'string') {
|
|
692
|
+
if (value.length === 0) {
|
|
693
|
+
return { isValid: false, error: `${fieldName} cannot be empty string` };
|
|
694
|
+
}
|
|
695
|
+
if (value.length > SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH) {
|
|
696
|
+
return { isValid: false, error: `${fieldName} too long (max ${SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH} chars)` };
|
|
750
697
|
}
|
|
751
698
|
return { isValid: true, error: null };
|
|
752
699
|
}
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
return { isValid: false, error: 'searchstring array cannot be empty' };
|
|
700
|
+
if (Array.isArray(value)) {
|
|
701
|
+
if (value.length === 0) {
|
|
702
|
+
return { isValid: false, error: `${fieldName} array cannot be empty` };
|
|
757
703
|
}
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
704
|
+
for (let i = 0; i < value.length; i++) {
|
|
705
|
+
if (typeof value[i] !== 'string') {
|
|
706
|
+
return { isValid: false, error: `${fieldName}[${i}] must be a string` };
|
|
707
|
+
}
|
|
708
|
+
if (value[i].length === 0) {
|
|
709
|
+
return { isValid: false, error: `${fieldName}[${i}] cannot be empty string` };
|
|
762
710
|
}
|
|
763
|
-
if (
|
|
764
|
-
return { isValid: false, error:
|
|
711
|
+
if (value[i].length > SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH) {
|
|
712
|
+
return { isValid: false, error: `${fieldName}[${i}] too long (max ${SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH} chars)` };
|
|
765
713
|
}
|
|
766
714
|
}
|
|
767
|
-
|
|
768
715
|
return { isValid: true, error: null };
|
|
769
716
|
}
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
717
|
+
return { isValid: false, error: `${fieldName} must be string or array of strings` };
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
/**
|
|
721
|
+
* Validates searchstring configuration. The old structure returned
|
|
722
|
+
* early on valid string/array searchstring, so 60+ lines of validation
|
|
723
|
+
* below (the both-defined check, length caps, searchstring_and type
|
|
724
|
+
* check) were unreachable for valid inputs — e.g. passing both
|
|
725
|
+
* searchstring AND searchstring_and would have passed validation
|
|
726
|
+
* despite the documented mutual-exclusion rule. Rewritten as a linear
|
|
727
|
+
* sequence of independent checks via the shared validateSearchValue
|
|
728
|
+
* helper so every rule actually runs.
|
|
729
|
+
*
|
|
730
|
+
* @param {any} searchstring - The searchstring value (OR logic)
|
|
731
|
+
* @param {any} searchstringAnd - The searchstring_and value (AND logic)
|
|
732
|
+
* @returns {{isValid: boolean, error: string|null}}
|
|
733
|
+
*/
|
|
734
|
+
function validateSearchString(searchstring, searchstringAnd) {
|
|
735
|
+
const hasOR = searchstring !== undefined && searchstring !== null;
|
|
736
|
+
const hasAND = searchstringAnd !== undefined && searchstringAnd !== null;
|
|
737
|
+
|
|
738
|
+
// Both unset is fine — no searchstring filtering will be applied.
|
|
739
|
+
if (!hasOR && !hasAND) {
|
|
740
|
+
return { isValid: true, error: null };
|
|
793
741
|
}
|
|
794
|
-
|
|
795
|
-
//
|
|
796
|
-
if (
|
|
797
|
-
(searchstringAnd !== undefined && searchstringAnd !== null)) {
|
|
742
|
+
|
|
743
|
+
// Mutual exclusion: can't combine OR and AND logic in one site config.
|
|
744
|
+
if (hasOR && hasAND) {
|
|
798
745
|
return { isValid: false, error: 'Cannot use both searchstring (OR) and searchstring_and (AND) simultaneously. Choose one logic type.' };
|
|
799
746
|
}
|
|
800
747
|
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
if (
|
|
804
|
-
return { isValid: false, error: `${fieldName} too long (max ${SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH} chars)` };
|
|
805
|
-
}
|
|
806
|
-
return { isValid: true };
|
|
807
|
-
};
|
|
808
|
-
|
|
809
|
-
// Validate search string lengths
|
|
810
|
-
if (typeof searchstring === 'string') {
|
|
811
|
-
const lengthCheck = validateStringLength(searchstring, 'searchstring');
|
|
812
|
-
if (!lengthCheck.isValid) return lengthCheck;
|
|
813
|
-
} else if (Array.isArray(searchstring)) {
|
|
814
|
-
for (let i = 0; i < searchstring.length; i++) {
|
|
815
|
-
const lengthCheck = validateStringLength(searchstring[i], `searchstring[${i}]`);
|
|
816
|
-
if (!lengthCheck.isValid) return lengthCheck;
|
|
817
|
-
}
|
|
748
|
+
if (hasOR) {
|
|
749
|
+
const check = validateSearchValue(searchstring, 'searchstring');
|
|
750
|
+
if (!check.isValid) return check;
|
|
818
751
|
}
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
if (!lengthCheck.isValid) return lengthCheck;
|
|
824
|
-
} else if (Array.isArray(searchstringAnd)) {
|
|
825
|
-
for (let i = 0; i < searchstringAnd.length; i++) {
|
|
826
|
-
const lengthCheck = validateStringLength(searchstringAnd[i], `searchstring_and[${i}]`);
|
|
827
|
-
if (!lengthCheck.isValid) return lengthCheck;
|
|
828
|
-
}
|
|
752
|
+
|
|
753
|
+
if (hasAND) {
|
|
754
|
+
const check = validateSearchValue(searchstringAnd, 'searchstring_and');
|
|
755
|
+
if (!check.isValid) return check;
|
|
829
756
|
}
|
|
830
|
-
|
|
831
|
-
return { isValid: false, error: 'searchstring must be string or array of strings' };
|
|
832
|
-
}
|
|
833
757
|
|
|
834
|
-
|
|
835
|
-
* Gets statistics about search string matches
|
|
836
|
-
* @param {Set|Map} matchedDomains - Set or Map of matched domains
|
|
837
|
-
* @param {Array<string>} searchStrings - Array of search strings used
|
|
838
|
-
* @returns {object} Statistics object
|
|
839
|
-
*/
|
|
840
|
-
function getSearchStats(matchedDomains, searchStrings) {
|
|
841
|
-
const totalMatches = matchedDomains instanceof Map ? matchedDomains.size : matchedDomains.size;
|
|
842
|
-
|
|
843
|
-
return {
|
|
844
|
-
totalMatches,
|
|
845
|
-
searchStringCount: searchStrings.length,
|
|
846
|
-
searchStrings: [...searchStrings]
|
|
847
|
-
};
|
|
758
|
+
return { isValid: true, error: null };
|
|
848
759
|
}
|
|
849
760
|
|
|
850
761
|
module.exports = {
|
|
@@ -856,7 +767,5 @@ module.exports = {
|
|
|
856
767
|
createCurlHandler,
|
|
857
768
|
downloadWithCurl,
|
|
858
769
|
validateSearchString,
|
|
859
|
-
getSearchStats,
|
|
860
|
-
addDomainToCollection,
|
|
861
770
|
downloadWithRetry
|
|
862
771
|
};
|