@fanboynz/network-scanner 2.0.66 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +134 -10
- package/CHANGELOG.md +135 -0
- package/CLAUDE.md +18 -7
- package/README.md +12 -4
- package/lib/adblock-rust.js +23 -18
- package/lib/adblock.js +127 -82
- package/lib/browserexit.js +210 -200
- package/lib/browserhealth.js +84 -60
- package/lib/cdp.js +103 -81
- package/lib/clear_sitedata.js +61 -159
- package/lib/cloudflare.js +579 -409
- package/lib/colorize.js +29 -12
- package/lib/compare.js +16 -8
- package/lib/compress.js +2 -1
- package/lib/curl.js +287 -220
- package/lib/domain-cache.js +87 -40
- package/lib/dry-run.js +137 -194
- package/lib/fingerprint.js +20 -18
- package/lib/flowproxy.js +391 -188
- package/lib/ghost-cursor.js +8 -7
- package/lib/grep.js +248 -171
- package/lib/ignore_similar.js +70 -124
- package/lib/interaction.js +132 -235
- package/lib/nettools.js +309 -87
- package/lib/openvpn_vpn.js +12 -11
- package/lib/output.js +92 -59
- package/lib/post-processing.js +216 -162
- package/lib/redirect.js +46 -30
- package/lib/referrer.js +158 -165
- package/lib/searchstring.js +290 -381
- package/lib/smart-cache.js +141 -91
- package/lib/socks-relay.js +8 -7
- package/lib/spawn-async.js +137 -0
- package/lib/validate_rules.js +188 -176
- package/lib/wireguard_vpn.js +111 -117
- package/nwss.js +740 -156
- package/package.json +4 -4
package/lib/ghost-cursor.js
CHANGED
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
// INSTALL:
|
|
15
15
|
// npm install ghost-cursor (optional dependency)
|
|
16
16
|
|
|
17
|
-
const { formatLogMessage } = require('./colorize');
|
|
17
|
+
const { formatLogMessage, messageColors } = require('./colorize');
|
|
18
|
+
const GHOST_CURSOR_TAG = messageColors.processing('[ghost-cursor]');
|
|
18
19
|
|
|
19
20
|
let ghostCursorModule = null;
|
|
20
21
|
let ghostCursorAvailable = false;
|
|
@@ -61,7 +62,7 @@ function createGhostCursor(page, options = {}) {
|
|
|
61
62
|
return cursor;
|
|
62
63
|
} catch (err) {
|
|
63
64
|
if (forceDebug) {
|
|
64
|
-
console.log(formatLogMessage('debug',
|
|
65
|
+
console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Failed to create cursor: ${err.message}`));
|
|
65
66
|
}
|
|
66
67
|
return null;
|
|
67
68
|
}
|
|
@@ -103,13 +104,13 @@ async function ghostMove(cursor, toX, toY, options = {}) {
|
|
|
103
104
|
await cursor.moveTo({ x: toX, y: toY }, moveOpts);
|
|
104
105
|
|
|
105
106
|
if (forceDebug) {
|
|
106
|
-
console.log(formatLogMessage('debug',
|
|
107
|
+
console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Moved to (${Math.round(toX)}, ${Math.round(toY)})`));
|
|
107
108
|
}
|
|
108
109
|
|
|
109
110
|
return true;
|
|
110
111
|
} catch (err) {
|
|
111
112
|
if (forceDebug) {
|
|
112
|
-
console.log(formatLogMessage('debug',
|
|
113
|
+
console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Move failed: ${err.message}`));
|
|
113
114
|
}
|
|
114
115
|
return false;
|
|
115
116
|
}
|
|
@@ -162,13 +163,13 @@ async function ghostClick(cursor, target, options = {}) {
|
|
|
162
163
|
|
|
163
164
|
if (forceDebug) {
|
|
164
165
|
const label = typeof target === 'string' ? target : `(${Math.round(target.x)}, ${Math.round(target.y)})`;
|
|
165
|
-
console.log(formatLogMessage('debug',
|
|
166
|
+
console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Clicked ${label}`));
|
|
166
167
|
}
|
|
167
168
|
|
|
168
169
|
return true;
|
|
169
170
|
} catch (err) {
|
|
170
171
|
if (forceDebug) {
|
|
171
|
-
console.log(formatLogMessage('debug',
|
|
172
|
+
console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Click failed: ${err.message}`));
|
|
172
173
|
}
|
|
173
174
|
return false;
|
|
174
175
|
}
|
|
@@ -193,7 +194,7 @@ async function ghostRandomMove(cursor, options = {}) {
|
|
|
193
194
|
return true;
|
|
194
195
|
} catch (err) {
|
|
195
196
|
if (options.forceDebug) {
|
|
196
|
-
console.log(formatLogMessage('debug',
|
|
197
|
+
console.log(formatLogMessage('debug', `${GHOST_CURSOR_TAG} Random move failed: ${err.message}`));
|
|
197
198
|
}
|
|
198
199
|
return false;
|
|
199
200
|
}
|
package/lib/grep.js
CHANGED
|
@@ -2,33 +2,85 @@
|
|
|
2
2
|
// Alternative to searchstring.js using grep for pattern matching
|
|
3
3
|
|
|
4
4
|
const fs = require('fs');
|
|
5
|
+
// spawnSync only used for validateGrepAvailability (runs once at
|
|
6
|
+
// startup). Production grep + curl paths go through runProcess (async).
|
|
5
7
|
const { spawnSync } = require('child_process');
|
|
6
|
-
const {
|
|
8
|
+
const { runProcess } = require('./spawn-async');
|
|
9
|
+
const { messageColors, formatLogMessage } = require('./colorize');
|
|
10
|
+
const GREP_TAG = messageColors.processing('[grep]');
|
|
7
11
|
|
|
8
12
|
// === Constants ===
|
|
9
13
|
const GREP_DEFAULTS = {
|
|
10
14
|
TIMEOUT_SECONDS: 30,
|
|
11
15
|
MAX_REDIRECTS: 5,
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
16
|
+
// 50MB to match lib/curl.js and lib/searchstring.js — the three
|
|
17
|
+
// download paths previously had two different caps (10MB here, 50MB
|
|
18
|
+
// there) so the same URL could succeed via one path and fail via
|
|
19
|
+
// another.
|
|
20
|
+
MAX_SIZE_BYTES: 50 * 1024 * 1024,
|
|
21
|
+
// Cap grep's stdout collection at the input size — output can in
|
|
22
|
+
// theory exceed input (overlapping match contexts) but in practice
|
|
23
|
+
// matching lines from 50MB of content max out around that. Replaces
|
|
24
|
+
// the old 1MB MAX_BUFFER_SIZE that silently killed grep with ENOBUFS
|
|
25
|
+
// on pages with many matching lines, making the pattern silently
|
|
26
|
+
// report "not found" despite thousands of matches.
|
|
27
|
+
MAX_GREP_OUTPUT_BYTES: 50 * 1024 * 1024,
|
|
28
|
+
VALIDATION_TIMEOUT: 5000,
|
|
29
|
+
GREP_TIMEOUT: 10000,
|
|
17
30
|
DEFAULT_MAX_MATCHES: 1000,
|
|
18
31
|
GREP_SUCCESS_STATUS: 0,
|
|
19
|
-
GREP_NOT_FOUND_STATUS: 1,
|
|
20
32
|
CURL_SUCCESS_STATUS: 0,
|
|
21
33
|
VERSION_LINE_INDEX: 0
|
|
22
34
|
};
|
|
23
35
|
|
|
24
36
|
/**
|
|
25
|
-
*
|
|
37
|
+
* Run a single grep pattern against `content`, returning the result
|
|
38
|
+
* asynchronously. Uses spawn (NOT spawnSync) — same rationale as
|
|
39
|
+
* downloadAndGrep — and handles stdout buffering ourselves so we can
|
|
40
|
+
* accept output up to MAX_GREP_OUTPUT_BYTES instead of being capped
|
|
41
|
+
* at spawnSync's `maxBuffer` (which silently killed grep with ENOBUFS
|
|
42
|
+
* on pages with many matching lines).
|
|
43
|
+
*
|
|
44
|
+
* @param {string} content - Stdin content for grep
|
|
45
|
+
* @param {string} pattern - The pattern to search for
|
|
46
|
+
* @param {string[]} baseArgs - Pre-computed grep flags (-i, -F, etc.)
|
|
47
|
+
* @returns {Promise<{status: number|null, stdout: string, truncated: boolean, signal: string|null, error?: string}>}
|
|
48
|
+
*/
|
|
49
|
+
async function grepOne(content, pattern, baseArgs) {
|
|
50
|
+
// Shared async-spawn helper handles stdout cap, kill timer, error/close
|
|
51
|
+
// wiring, and stdin EPIPE swallowing. We just adapt the return shape
|
|
52
|
+
// to what grepContent expects (string stdout, status alias for code).
|
|
53
|
+
const result = await runProcess('grep', [...baseArgs, pattern], {
|
|
54
|
+
timeout: GREP_DEFAULTS.GREP_TIMEOUT,
|
|
55
|
+
maxStdout: GREP_DEFAULTS.MAX_GREP_OUTPUT_BYTES,
|
|
56
|
+
input: content,
|
|
57
|
+
collectStderr: false // grep's stderr isn't used by callers
|
|
58
|
+
});
|
|
59
|
+
return {
|
|
60
|
+
status: result.error ? -1 : result.code,
|
|
61
|
+
stdout: result.stdout.toString('utf8'),
|
|
62
|
+
truncated: result.truncated,
|
|
63
|
+
signal: result.signal,
|
|
64
|
+
error: result.error
|
|
65
|
+
};
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Searches content using grep with the provided patterns.
|
|
70
|
+
*
|
|
71
|
+
* Async — runs one spawn per pattern (sequential, not concurrent, to
|
|
72
|
+
* avoid spiking memory with N copies of `content` on grep's stdin
|
|
73
|
+
* simultaneously). The previous spawnSync-per-pattern implementation
|
|
74
|
+
* blocked the event loop for the duration of every grep call; the
|
|
75
|
+
* outer downloadAndGrep's switch to async spawn was undone by this
|
|
76
|
+
* sync inner step.
|
|
77
|
+
*
|
|
26
78
|
* @param {string} content - The content to search
|
|
27
79
|
* @param {Array<string>} searchPatterns - Array of grep patterns to search for
|
|
28
|
-
* @param {object} options - Grep options
|
|
29
|
-
* @returns {Promise<
|
|
80
|
+
* @param {object} options - Grep options (ignoreCase, wholeWord, regex, maxMatches)
|
|
81
|
+
* @returns {Promise<{found: boolean, matchedPattern: string|null, allMatches: Array<{pattern: string, matches: string[]}>}>}
|
|
30
82
|
*/
|
|
31
|
-
function grepContent(content, searchPatterns, options = {}) {
|
|
83
|
+
async function grepContent(content, searchPatterns, options = {}) {
|
|
32
84
|
const {
|
|
33
85
|
ignoreCase = true,
|
|
34
86
|
wholeWord = false,
|
|
@@ -36,60 +88,53 @@ function grepContent(content, searchPatterns, options = {}) {
|
|
|
36
88
|
maxMatches = GREP_DEFAULTS.DEFAULT_MAX_MATCHES
|
|
37
89
|
} = options;
|
|
38
90
|
|
|
39
|
-
|
|
91
|
+
// Pre-filter empty/whitespace patterns at the top instead of doing
|
|
92
|
+
// `if (!pattern || ...) continue` inside the loop. `typeof === 'string'`
|
|
93
|
+
// guard rejects non-string entries (numbers, booleans, etc.) so we
|
|
94
|
+
// don't trip TypeError on `p.trim()` for misconfigured input.
|
|
95
|
+
const validPatterns = Array.isArray(searchPatterns)
|
|
96
|
+
? searchPatterns.filter(p => typeof p === 'string' && p.trim().length > 0)
|
|
97
|
+
: [];
|
|
98
|
+
|
|
99
|
+
if (!content || validPatterns.length === 0) {
|
|
40
100
|
return { found: false, matchedPattern: null, allMatches: [] };
|
|
41
101
|
}
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
matches: result.stdout.split('\n').filter(line => line.trim().length > 0).slice(0, maxMatches)
|
|
71
|
-
});
|
|
72
|
-
|
|
73
|
-
if (!firstMatch) {
|
|
74
|
-
firstMatch = pattern;
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
} catch (grepErr) {
|
|
79
|
-
// Continue with next pattern if this one fails
|
|
80
|
-
console.warn(formatLogMessage('warn', `[grep] Pattern "${pattern}" failed: ${grepErr.message}`));
|
|
81
|
-
}
|
|
102
|
+
|
|
103
|
+
const baseArgs = ['--text', '--color=never'];
|
|
104
|
+
if (ignoreCase) baseArgs.push('-i');
|
|
105
|
+
if (wholeWord) baseArgs.push('-w');
|
|
106
|
+
if (!regex) baseArgs.push('-F');
|
|
107
|
+
|
|
108
|
+
const allMatches = [];
|
|
109
|
+
let firstMatch = null;
|
|
110
|
+
|
|
111
|
+
for (const pattern of validPatterns) {
|
|
112
|
+
const result = await grepOne(content, pattern, baseArgs);
|
|
113
|
+
if (result.error) {
|
|
114
|
+
console.warn(formatLogMessage('warn', `${GREP_TAG} Pattern "${pattern}" failed: ${result.error}`));
|
|
115
|
+
continue;
|
|
116
|
+
}
|
|
117
|
+
// Surface truncation so admins can see when grep output hit the
|
|
118
|
+
// 50MB cap — previously this was silent (the SIGTERM-on-truncation
|
|
119
|
+
// path looks the same as a normal exit to the caller).
|
|
120
|
+
if (result.truncated) {
|
|
121
|
+
console.warn(formatLogMessage('warn', `${GREP_TAG} Pattern "${pattern}" output truncated at ${GREP_DEFAULTS.MAX_GREP_OUTPUT_BYTES} bytes; results may be incomplete`));
|
|
122
|
+
}
|
|
123
|
+
// grep exit codes: 0 = found, 1 = not found, 2+ = error.
|
|
124
|
+
// Also accept truncated output — we collected enough to slice to
|
|
125
|
+
// maxMatches even though more existed beyond the cap.
|
|
126
|
+
if (result.status === GREP_DEFAULTS.GREP_SUCCESS_STATUS && result.stdout) {
|
|
127
|
+
const lines = result.stdout.split('\n').filter(line => line.trim().length > 0).slice(0, maxMatches);
|
|
128
|
+
allMatches.push({ pattern, matches: lines });
|
|
129
|
+
if (!firstMatch) firstMatch = pattern;
|
|
82
130
|
}
|
|
83
|
-
|
|
84
|
-
return {
|
|
85
|
-
found: allMatches.length > 0,
|
|
86
|
-
matchedPattern: firstMatch,
|
|
87
|
-
allMatches: allMatches
|
|
88
|
-
};
|
|
89
|
-
|
|
90
|
-
} catch (error) {
|
|
91
|
-
throw new Error(`Grep search failed: ${error.message}`);
|
|
92
131
|
}
|
|
132
|
+
|
|
133
|
+
return {
|
|
134
|
+
found: allMatches.length > 0,
|
|
135
|
+
matchedPattern: firstMatch,
|
|
136
|
+
allMatches
|
|
137
|
+
};
|
|
93
138
|
}
|
|
94
139
|
|
|
95
140
|
/**
|
|
@@ -102,74 +147,84 @@ function grepContent(content, searchPatterns, options = {}) {
|
|
|
102
147
|
* @returns {Promise<object>} Object with found boolean, matchedPattern, and content
|
|
103
148
|
*/
|
|
104
149
|
async function downloadAndGrep(url, searchPatterns, userAgent = '', grepOptions = {}, timeout = GREP_DEFAULTS.TIMEOUT_SECONDS) {
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
150
|
+
const curlArgs = [
|
|
151
|
+
'-s',
|
|
152
|
+
'-L',
|
|
153
|
+
'--max-time', timeout.toString(),
|
|
154
|
+
'--max-redirs', GREP_DEFAULTS.MAX_REDIRECTS.toString(),
|
|
155
|
+
'--fail-with-body',
|
|
156
|
+
'--compressed'
|
|
157
|
+
];
|
|
158
|
+
if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);
|
|
159
|
+
curlArgs.push(
|
|
160
|
+
'-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
161
|
+
'-H', 'Accept-Language: en-US,en;q=0.5',
|
|
162
|
+
'-H', 'Accept-Encoding: gzip, deflate',
|
|
163
|
+
'-H', 'Connection: keep-alive',
|
|
164
|
+
'-H', 'Upgrade-Insecure-Requests: 1'
|
|
165
|
+
);
|
|
166
|
+
curlArgs.push(url);
|
|
118
167
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
'-H', 'Accept-Encoding: gzip, deflate',
|
|
124
|
-
'-H', 'Connection: keep-alive',
|
|
125
|
-
'-H', 'Upgrade-Insecure-Requests: 1'
|
|
126
|
-
);
|
|
168
|
+
const result = await runProcess('curl', curlArgs, {
|
|
169
|
+
timeout: timeout * 1000,
|
|
170
|
+
maxStdout: GREP_DEFAULTS.MAX_SIZE_BYTES
|
|
171
|
+
});
|
|
127
172
|
|
|
128
|
-
|
|
173
|
+
if (result.error) throw new Error(`Download and grep failed for ${url}: ${result.error}`);
|
|
174
|
+
if (result.truncated) throw new Error(`Output exceeded ${GREP_DEFAULTS.MAX_SIZE_BYTES} bytes for ${url}`);
|
|
175
|
+
if (result.signal) throw new Error(`Curl killed by signal ${result.signal} for ${url}`);
|
|
176
|
+
if (result.code !== GREP_DEFAULTS.CURL_SUCCESS_STATUS) {
|
|
177
|
+
throw new Error(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`);
|
|
178
|
+
}
|
|
129
179
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
encoding: 'utf8',
|
|
133
|
-
timeout: timeout * GREP_DEFAULTS.SPAWN_TIMEOUT_MULTIPLIER,
|
|
134
|
-
maxBuffer: GREP_DEFAULTS.MAX_SIZE_BYTES
|
|
135
|
-
});
|
|
136
|
-
|
|
137
|
-
if (curlResult.error) {
|
|
138
|
-
throw curlResult.error;
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
if (curlResult.status !== GREP_DEFAULTS.CURL_SUCCESS_STATUS) {
|
|
142
|
-
throw new Error(`Curl exited with status ${curlResult.status}: ${curlResult.stderr}`);
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
const content = curlResult.stdout;
|
|
146
|
-
|
|
147
|
-
// Search content with grep
|
|
180
|
+
const content = result.stdout.toString('utf8');
|
|
181
|
+
try {
|
|
148
182
|
const grepResult = await grepContent(content, searchPatterns, grepOptions);
|
|
149
|
-
|
|
150
183
|
return {
|
|
151
184
|
found: grepResult.found,
|
|
152
185
|
matchedPattern: grepResult.matchedPattern,
|
|
153
186
|
allMatches: grepResult.allMatches,
|
|
154
|
-
content
|
|
187
|
+
content,
|
|
155
188
|
contentLength: content.length
|
|
156
189
|
};
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
throw new Error(`Download and grep failed for ${url}: ${error.message}`);
|
|
190
|
+
} catch (grepErr) {
|
|
191
|
+
throw new Error(`Download and grep failed for ${url}: ${grepErr.message}`);
|
|
160
192
|
}
|
|
161
193
|
}
|
|
162
194
|
|
|
163
195
|
/**
|
|
164
|
-
* Creates a grep-based URL handler for downloading and searching content
|
|
165
|
-
*
|
|
166
|
-
* @
|
|
196
|
+
* Creates a grep-based URL handler for downloading and searching content.
|
|
197
|
+
*
|
|
198
|
+
* @param {object} config
|
|
199
|
+
* @param {string[]} config.searchStrings - OR-logic patterns (any match)
|
|
200
|
+
* @param {string[]} config.searchStringsAnd - AND-logic patterns (all must match)
|
|
201
|
+
* @param {boolean} config.hasSearchString - True if searchStrings is non-empty
|
|
202
|
+
* @param {boolean} config.hasSearchStringAnd - True if searchStringsAnd is non-empty;
|
|
203
|
+
* when true, AND-logic is applied to the combined grep result
|
|
204
|
+
* @param {RegExp[]} config.regexes - URL regex patterns for the first-pass filter
|
|
205
|
+
* @param {Function} config.addMatchedDomain - Sink for matched domains
|
|
206
|
+
* @param {Function} config.isDomainAlreadyDetected - Skip-if-true predicate
|
|
207
|
+
* @param {Function} [config.onContentFetched] - Optional cache hook
|
|
208
|
+
* @param {string} config.currentUrl - The page URL being scanned
|
|
209
|
+
* @param {boolean} config.perSiteSubDomains - Track at subdomain granularity
|
|
210
|
+
* @param {string[]} config.ignoreDomains - Domain ignore list
|
|
211
|
+
* @param {Function} config.matchesIgnoreDomain - Ignore-list matcher
|
|
212
|
+
* @param {Function} config.getRootDomain - URL → registrable root domain
|
|
213
|
+
* @param {object} config.siteConfig - Per-site config (verbose, firstParty, thirdParty)
|
|
214
|
+
* @param {boolean} config.dumpUrls - Write matched URLs to file
|
|
215
|
+
* @param {string} config.matchedUrlsLogFile - Path for dumpUrls output
|
|
216
|
+
* @param {boolean} config.forceDebug
|
|
217
|
+
* @param {string} config.userAgent - Curl user agent
|
|
218
|
+
* @param {string|null} config.resourceType - Resource type for adblock-rules mode
|
|
219
|
+
* @param {object} [config.grepOptions] - Passed through to grepContent
|
|
220
|
+
* (ignoreCase, wholeWord, regex, maxMatches)
|
|
221
|
+
* @returns {Function} URL handler: async (requestUrl) => void
|
|
167
222
|
*/
|
|
168
223
|
function createGrepHandler(config) {
|
|
169
224
|
const {
|
|
170
225
|
searchStrings,
|
|
226
|
+
searchStringsAnd,
|
|
171
227
|
regexes,
|
|
172
|
-
matchedDomains,
|
|
173
228
|
addMatchedDomain,
|
|
174
229
|
isDomainAlreadyDetected,
|
|
175
230
|
onContentFetched,
|
|
@@ -185,122 +240,143 @@ function createGrepHandler(config) {
|
|
|
185
240
|
userAgent,
|
|
186
241
|
resourceType,
|
|
187
242
|
hasSearchString,
|
|
243
|
+
hasSearchStringAnd,
|
|
188
244
|
grepOptions = {}
|
|
189
245
|
} = config;
|
|
190
246
|
|
|
247
|
+
// Hoisted: currentUrl doesn't change for this handler's lifetime.
|
|
248
|
+
// Previously parsed on every single request.
|
|
249
|
+
let currentRootDomain = '';
|
|
250
|
+
let currentUrlHostname = '';
|
|
251
|
+
try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}
|
|
252
|
+
try { currentUrlHostname = new URL(currentUrl).hostname; } catch (_) {}
|
|
253
|
+
|
|
191
254
|
return async function grepHandler(requestUrl) {
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
255
|
+
// Regex check FIRST — cheap filter that skips ~99% of requests.
|
|
256
|
+
// Previously this ran AFTER URL parses and a domain-cache lookup,
|
|
257
|
+
// paying for parses on requests we then immediately drop.
|
|
258
|
+
const matchesRegex = regexes.some(re => re.test(requestUrl));
|
|
259
|
+
if (!matchesRegex) return;
|
|
260
|
+
|
|
261
|
+
// Parse requestUrl ONCE and reuse. Was parsed 4 times previously
|
|
262
|
+
// (two hostname parses + two for currentUrlHostname/requestHostname).
|
|
263
|
+
let requestHostname;
|
|
264
|
+
try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
|
|
265
|
+
const fullSubdomain = requestHostname;
|
|
266
|
+
const respDomain = perSiteSubDomains ? requestHostname : getRootDomain(requestUrl);
|
|
267
|
+
|
|
196
268
|
if (isDomainAlreadyDetected(fullSubdomain)) {
|
|
197
269
|
if (forceDebug) {
|
|
198
|
-
console.log(formatLogMessage('debug',
|
|
270
|
+
console.log(formatLogMessage('debug', `${GREP_TAG} Skipping already detected subdomain: ${fullSubdomain}`));
|
|
199
271
|
}
|
|
200
272
|
return;
|
|
201
273
|
}
|
|
202
|
-
|
|
203
|
-
// Only process URLs that match our regex patterns
|
|
204
|
-
const matchesRegex = regexes.some(re => re.test(requestUrl));
|
|
205
|
-
if (!matchesRegex) return;
|
|
206
|
-
|
|
207
|
-
// Check if this is a first-party request (same domain as the URL being scanned)
|
|
208
|
-
const currentUrlHostname = new URL(currentUrl).hostname;
|
|
209
|
-
const requestHostname = new URL(requestUrl).hostname;
|
|
274
|
+
|
|
210
275
|
const isFirstParty = currentUrlHostname === requestHostname;
|
|
211
|
-
|
|
212
|
-
// Apply first-party/third-party filtering
|
|
276
|
+
|
|
213
277
|
if (isFirstParty && siteConfig.firstParty === false) {
|
|
214
278
|
if (forceDebug) {
|
|
215
|
-
console.log(formatLogMessage('debug',
|
|
279
|
+
console.log(formatLogMessage('debug', `${GREP_TAG} Skipping first-party request (firstParty=false): ${requestUrl}`));
|
|
216
280
|
}
|
|
217
281
|
return;
|
|
218
282
|
}
|
|
219
|
-
|
|
220
283
|
if (!isFirstParty && siteConfig.thirdParty === false) {
|
|
221
284
|
if (forceDebug) {
|
|
222
|
-
console.log(formatLogMessage('debug',
|
|
285
|
+
console.log(formatLogMessage('debug', `${GREP_TAG} Skipping third-party request (thirdParty=false): ${requestUrl}`));
|
|
223
286
|
}
|
|
224
287
|
return;
|
|
225
288
|
}
|
|
226
|
-
|
|
289
|
+
|
|
227
290
|
try {
|
|
228
291
|
if (forceDebug) {
|
|
229
|
-
console.log(formatLogMessage('debug',
|
|
292
|
+
console.log(formatLogMessage('debug', `${GREP_TAG} Downloading and searching content from: ${requestUrl}`));
|
|
230
293
|
}
|
|
231
|
-
|
|
232
|
-
//
|
|
233
|
-
if (!hasSearchString) {
|
|
234
|
-
if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains))
|
|
235
|
-
return;
|
|
236
|
-
}
|
|
237
|
-
|
|
294
|
+
|
|
295
|
+
// No searchstring at all → match immediately on regex alone.
|
|
296
|
+
if (!hasSearchString && !hasSearchStringAnd) {
|
|
297
|
+
if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) return;
|
|
238
298
|
addMatchedDomain(respDomain, resourceType, fullSubdomain);
|
|
239
|
-
|
|
240
|
-
|
|
299
|
+
|
|
300
|
+
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
241
301
|
if (siteConfig.verbose === 1) {
|
|
242
|
-
|
|
243
|
-
console.log(formatLogMessage('match', `[${simplifiedUrl}] ${requestUrl} (${partyType}, grep) matched regex`));
|
|
302
|
+
console.log(formatLogMessage('match', `[${currentRootDomain}] ${requestUrl} (${partyType}, grep) matched regex`));
|
|
244
303
|
}
|
|
245
|
-
|
|
246
|
-
if (dumpUrls) {
|
|
304
|
+
if (dumpUrls && matchedUrlsLogFile) {
|
|
247
305
|
const timestamp = new Date().toISOString();
|
|
248
|
-
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
249
306
|
try {
|
|
250
|
-
fs.appendFileSync(matchedUrlsLogFile,
|
|
251
|
-
`${timestamp} [match][${
|
|
307
|
+
fs.appendFileSync(matchedUrlsLogFile,
|
|
308
|
+
`${timestamp} [match][${currentRootDomain}] ${requestUrl} (${partyType}, grep)\n`);
|
|
252
309
|
} catch (logErr) {
|
|
253
310
|
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
254
311
|
}
|
|
255
312
|
}
|
|
256
313
|
return;
|
|
257
314
|
}
|
|
258
|
-
|
|
259
|
-
// If searchstring IS defined, download and grep content
|
|
260
|
-
const result = await downloadAndGrep(requestUrl, searchStrings, userAgent, grepOptions, GREP_DEFAULTS.TIMEOUT_SECONDS);
|
|
261
315
|
|
|
262
|
-
//
|
|
316
|
+
// Combine OR + AND patterns into one grep pass. The AND-logic
|
|
317
|
+
// check below uses per-pattern attribution from
|
|
318
|
+
// grepContent.allMatches. Previously createGrepHandler only
|
|
319
|
+
// destructured `searchStrings` and ignored `searchStringsAnd`
|
|
320
|
+
// entirely — users configuring AND-only patterns with grep mode
|
|
321
|
+
// got silent zero matches.
|
|
322
|
+
const allPatterns = [
|
|
323
|
+
...(searchStrings || []),
|
|
324
|
+
...(searchStringsAnd || [])
|
|
325
|
+
];
|
|
326
|
+
const result = await downloadAndGrep(requestUrl, allPatterns, userAgent, grepOptions, GREP_DEFAULTS.TIMEOUT_SECONDS);
|
|
327
|
+
|
|
263
328
|
if (onContentFetched && result.content) {
|
|
264
329
|
try {
|
|
265
330
|
onContentFetched(requestUrl, result.content);
|
|
266
331
|
} catch (cacheErr) {
|
|
267
|
-
if (forceDebug) console.log(formatLogMessage('debug',
|
|
332
|
+
if (forceDebug) console.log(formatLogMessage('debug', `${GREP_TAG} Content caching failed: ${cacheErr.message}`));
|
|
268
333
|
}
|
|
269
334
|
}
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
335
|
+
|
|
336
|
+
// Apply OR vs AND logic. AND requires every searchStringsAnd
|
|
337
|
+
// pattern to appear in grepResult.allMatches; OR just needs
|
|
338
|
+
// anything found.
|
|
339
|
+
let matched = false;
|
|
340
|
+
let matchDescription = null;
|
|
341
|
+
|
|
342
|
+
if (hasSearchStringAnd && searchStringsAnd && searchStringsAnd.length > 0) {
|
|
343
|
+
const foundPatterns = new Set(result.allMatches.map(m => m.pattern));
|
|
344
|
+
if (searchStringsAnd.every(p => foundPatterns.has(p))) {
|
|
345
|
+
matched = true;
|
|
346
|
+
matchDescription = `patterns: ${searchStringsAnd.length}/${searchStringsAnd.length} (AND)`;
|
|
274
347
|
}
|
|
275
|
-
|
|
348
|
+
} else if (result.found) {
|
|
349
|
+
matched = true;
|
|
350
|
+
matchDescription = `pattern: "${result.matchedPattern}"`;
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
if (matched) {
|
|
354
|
+
if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) return;
|
|
276
355
|
addMatchedDomain(respDomain, resourceType, fullSubdomain);
|
|
277
|
-
|
|
278
|
-
|
|
356
|
+
|
|
357
|
+
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
358
|
+
const matchCount = result.allMatches.reduce((sum, m) => sum + m.matches.length, 0);
|
|
359
|
+
|
|
279
360
|
if (siteConfig.verbose === 1) {
|
|
280
|
-
|
|
281
|
-
const matchCount = result.allMatches.reduce((sum, match) => sum + match.matches.length, 0);
|
|
282
|
-
console.log(formatLogMessage('match', `[${simplifiedUrl}] ${requestUrl} (${partyType}, grep) contains pattern: "${result.matchedPattern}" (${matchCount} matches)`));
|
|
361
|
+
console.log(formatLogMessage('match', `[${currentRootDomain}] ${requestUrl} (${partyType}, grep) contains ${matchDescription} (${matchCount} matches)`));
|
|
283
362
|
}
|
|
284
|
-
|
|
285
|
-
if (dumpUrls) {
|
|
363
|
+
if (dumpUrls && matchedUrlsLogFile) {
|
|
286
364
|
const timestamp = new Date().toISOString();
|
|
287
|
-
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
288
|
-
const matchCount = result.allMatches.reduce((sum, match) => sum + match.matches.length, 0);
|
|
289
365
|
try {
|
|
290
|
-
fs.appendFileSync(matchedUrlsLogFile,
|
|
291
|
-
`${timestamp} [match][${
|
|
366
|
+
fs.appendFileSync(matchedUrlsLogFile,
|
|
367
|
+
`${timestamp} [match][${currentRootDomain}] ${requestUrl} (${partyType}, grep, ${matchDescription}, matches: ${matchCount})\n`);
|
|
292
368
|
} catch (logErr) {
|
|
293
369
|
console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
|
|
294
370
|
}
|
|
295
371
|
}
|
|
296
372
|
} else if (forceDebug) {
|
|
297
373
|
const partyType = isFirstParty ? 'first-party' : 'third-party';
|
|
298
|
-
console.log(formatLogMessage('debug',
|
|
374
|
+
console.log(formatLogMessage('debug', `${GREP_TAG} ${requestUrl} (${partyType}) matched regex but no patterns found`));
|
|
299
375
|
}
|
|
300
|
-
|
|
376
|
+
|
|
301
377
|
} catch (err) {
|
|
302
378
|
if (forceDebug) {
|
|
303
|
-
console.log(formatLogMessage('debug',
|
|
379
|
+
console.log(formatLogMessage('debug', `${GREP_TAG} Failed to download/grep content for ${requestUrl}: ${err.message}`));
|
|
304
380
|
}
|
|
305
381
|
}
|
|
306
382
|
};
|
|
@@ -340,9 +416,10 @@ function validateGrepAvailability() {
|
|
|
340
416
|
}
|
|
341
417
|
}
|
|
342
418
|
|
|
419
|
+
// Public surface. downloadAndGrep is module-internal (only called by
|
|
420
|
+
// createGrepHandler) — was exported but no external caller imported it.
|
|
343
421
|
module.exports = {
|
|
344
422
|
grepContent,
|
|
345
|
-
downloadAndGrep,
|
|
346
423
|
createGrepHandler,
|
|
347
424
|
validateGrepAvailability
|
|
348
425
|
};
|