npxconfuse 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +462 -0
- package/bin/cli.js +280 -0
- package/package.json +47 -0
- package/src/analyzer.js +167 -0
- package/src/extractors/js-bundle.js +147 -0
- package/src/extractors/package-json.js +162 -0
- package/src/formatters/csv.js +39 -0
- package/src/formatters/json.js +11 -0
- package/src/formatters/table.js +144 -0
- package/src/registries/npm.js +185 -0
- package/src/sources/github.js +142 -0
- package/src/sources/local.js +117 -0
- package/src/sources/web.js +182 -0
- package/src/utils/constants.js +181 -0
- package/src/utils/http.js +179 -0
- package/src/utils/logger.js +83 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { readFile, stat } from "node:fs/promises";
|
|
2
|
+
import { resolve, basename, extname } from "node:path";
|
|
3
|
+
import { glob } from "glob";
|
|
4
|
+
import logger from "../utils/logger.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Scan a local filesystem path for scannable files.
 *
 * A regular file is read and classified directly. A directory is searched
 * with glob for `package.json` manifests (plus `.js` / `.mjs` / `.cjs` files
 * when `options.deep` is set), skipping dependency, VCS, build-output and
 * coverage folders. Files that cannot be read are logged and skipped.
 *
 * @param {string} targetPath - File or directory to scan
 * @param {object} [options]
 * @param {boolean} [options.deep] - Also scan JS bundles (slower)
 * @returns {Promise<Array<{filepath: string, content: string, type: string}>>}
 * @throws {Error} When the path cannot be stat'ed (the underlying error is
 *   attached as `cause`) or is neither a regular file nor a directory.
 */
export async function scanLocal(targetPath, options = {}) {
  const absPath = resolve(targetPath);

  // Check if it's a file or directory
  let stats;
  try {
    stats = await stat(absPath);
  } catch (err) {
    // Keep the original error (code, errno, stack) reachable for callers.
    throw new Error(`Cannot access "${absPath}": ${err.message}`, {
      cause: err,
    });
  }

  if (stats.isFile()) {
    const file = await readSingleFile(absPath);
    return file ? [file] : [];
  }

  if (!stats.isDirectory()) {
    throw new Error(`"${absPath}" is not a file or directory`);
  }

  logger.info(`Scanning directory: ${absPath}`);

  // Define glob patterns
  const ignorePatterns = [
    "**/node_modules/**",
    "**/.git/**",
    "**/dist/**",
    "**/build/**",
    "**/.next/**",
    "**/coverage/**",
    "**/.nyc_output/**",
  ];

  const patterns = ["**/package.json"];

  if (options.deep) {
    patterns.push("**/*.js", "**/*.mjs", "**/*.cjs");
    logger.info(
      "Deep scan enabled — also scanning JS bundles (this may take a while)",
    );
  }

  // Run glob
  const files = await glob(patterns, {
    cwd: absPath,
    ignore: ignorePatterns,
    nodir: true,
    absolute: true,
  });

  logger.info(`Found ${files.length} files to analyze`);

  // Read each file. Classify first so nothing is read that would be dropped.
  const results = [];
  for (const filepath of files) {
    const type = classifyFile(filepath);
    if (!type) continue;

    try {
      const content = await readFile(filepath, "utf-8");
      results.push({ filepath, content, type });
    } catch (err) {
      logger.warn(`Skipping unreadable file: ${filepath} (${err.message})`);
    }
  }

  return results;
}
|
|
83
|
+
|
|
84
|
+
/**
 * Read and classify a single file.
 *
 * The file is classified from its name before it is opened, so an
 * unrecognised file is never read into memory.
 *
 * @param {string} filepath - Path of the file to read
 * @returns {Promise<{filepath: string, content: string, type: string} | null>}
 *   The file record, or `null` when the type is unrecognised or the file
 *   cannot be read (both cases are logged, not thrown).
 */
async function readSingleFile(filepath) {
  const type = classifyFile(filepath);
  if (!type) {
    logger.warn(`Unrecognized file type: ${filepath}`);
    return null;
  }

  try {
    const content = await readFile(filepath, "utf-8");
    return { filepath, content, type };
  } catch (err) {
    logger.error(`Cannot read file: ${filepath} (${err.message})`);
    return null;
  }
}
|
|
101
|
+
|
|
102
|
+
// File names handled as npm manifests. Lockfiles are included because they
// may contain useful data; npm-shrinkwrap.json uses the package-lock format.
const MANIFEST_FILENAMES = new Set([
  "package.json",
  "package-lock.json",
  "npm-shrinkwrap.json",
]);

// Extensions handled as JavaScript bundles.
const JS_BUNDLE_EXTENSIONS = new Set([".js", ".mjs", ".cjs"]);

/**
 * Classify a file by its name/extension (case-insensitive).
 *
 * @param {string} filepath - Path or bare file name
 * @returns {"package-json" | "js-bundle" | null} The file type, or `null`
 *   when the file is not one this scanner handles.
 */
function classifyFile(filepath) {
  const name = basename(filepath).toLowerCase();
  if (MANIFEST_FILENAMES.has(name)) return "package-json";

  const ext = extname(filepath).toLowerCase();
  if (JS_BUNDLE_EXTENSIONS.has(ext)) return "js-bundle";

  return null;
}
|
|
116
|
+
|
|
117
|
+
export default scanLocal;
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import pLimit from 'p-limit';
|
|
3
|
+
import { httpGet, httpGetText } from '../utils/http.js';
|
|
4
|
+
import { WEB_PROBE_PATHS, SKIP_EXTENSIONS } from '../utils/constants.js';
|
|
5
|
+
import logger from '../utils/logger.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Scan web domains for exposed package manifests and JS bundles.
 *
 * The domains file is read as UTF-8; blank lines and lines starting with `#`
 * are ignored. Each remaining entry is normalised to a URL and scanned, with
 * at most `options.concurrency` scans in flight. A failure on one domain is
 * logged at debug level and does not abort the run.
 *
 * @param {string} domainsFile - Path to file with one domain/URL per line
 * @param {object} options
 * @param {number} options.concurrency - Parallel domain scans (default 10)
 * @param {number} options.timeout - HTTP timeout in ms
 * @returns {Promise<Array<{filepath: string, content: string, type: string}>>}
 * @throws {Error} When the file contains no usable entries.
 */
export async function scanWeb(domainsFile, options = {}) {
  const raw = await readFile(domainsFile, 'utf-8');

  const domains = [];
  for (const rawLine of raw.split('\n')) {
    const entry = rawLine.trim();
    if (entry && !entry.startsWith('#')) {
      domains.push(entry);
    }
  }

  if (domains.length === 0) {
    throw new Error(`No domains found in ${domainsFile}`);
  }

  logger.info(`Loaded ${domains.length} domains from ${domainsFile}`);

  const limit = pLimit(options.concurrency || 10);
  const results = [];
  let processed = 0;

  // Scan one entry, collect its files, then report progress.
  const scanOne = async (domain) => {
    try {
      const found = await scanDomain(normalizeDomain(domain), options);
      results.push(...found);
    } catch (err) {
      logger.debug(`Error scanning ${domain}: ${err.message}`);
    }

    processed += 1;
    const isLast = processed === domains.length;
    if (isLast || processed % 20 === 0) {
      logger.info(`Progress: ${processed}/${domains.length} domains scanned (${results.length} files found)`);
    }
  };

  await Promise.all(domains.map((domain) => limit(() => scanOne(domain))));
  logger.success(`Web scan complete: ${results.length} files from ${domains.length} domains`);

  return results;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Normalize a domain string to a full URL without trailing slashes.
 *
 * Entries that already carry an `http://` or `https://` scheme (matched
 * case-insensitively) keep it; anything else is assumed to be a bare host
 * and gets `https://`. Trailing slashes are stripped in both cases because
 * callers append paths that start with `/`.
 *
 * @param {string} domain - Domain or URL, e.g. `example.com/` or `http://example.com`
 * @returns {string} URL string with no trailing slash
 */
function normalizeDomain(domain) {
  const hasScheme = /^https?:\/\//i.test(domain);
  const withScheme = hasScheme ? domain : `https://${domain}`;
  return withScheme.replace(/\/+$/, ''); // strip trailing slashes
}
|
|
65
|
+
|
|
66
|
+
/**
 * Scan a single domain for exposed package files and JS bundles.
 *
 * Step 1 requests every path in WEB_PROBE_PATHS and keeps each 200 response
 * whose body looks like JSON, so lockfile probes are recorded as well as
 * `package.json`. Step 2 fetches the root page and downloads up to 10 of the
 * scripts it references. Individual request failures are logged at debug
 * level and never abort the scan.
 *
 * @param {string} baseUrl - Base URL without trailing slash
 * @param {object} [options]
 * @param {number} [options.timeout] - HTTP timeout in ms
 * @returns {Promise<Array<{filepath: string, content: string, type: string}>>}
 */
async function scanDomain(baseUrl, options = {}) {
  const results = [];

  // ── 1. Probe known paths ──
  for (const probePath of WEB_PROBE_PATHS) {
    const url = `${baseUrl}${probePath}`;
    try {
      const response = await httpGet(url, {
        timeout: options.timeout,
        retries: 1,
      });

      if (response.status !== 200 || !response.data) continue;

      const content = typeof response.data === 'string'
        ? response.data
        : JSON.stringify(response.data);

      // Validate it looks like a real manifest (not an HTML error page)
      if (!isLikelyJson(content)) continue;

      results.push({
        filepath: url,
        content,
        type: 'package-json',
      });
      logger.debug(`Found exposed ${probePath} at ${baseUrl}`);
    } catch (err) {
      // Expected for most domains
      logger.debug(`Probe failed for ${url}: ${err.message}`);
    }
  }

  // ── 2. Fetch root page and extract JS bundle URLs ──
  try {
    const rootResponse = await httpGetText(`${baseUrl}/`, {
      timeout: options.timeout,
      retries: 1,
    });

    if (rootResponse.status === 200 && typeof rootResponse.data === 'string') {
      const scriptUrls = extractScriptUrls(rootResponse.data, baseUrl);
      const maxBundles = 10;

      for (const scriptUrl of scriptUrls.slice(0, maxBundles)) {
        // Skip non-JS assets
        const ext = scriptUrl.split('?')[0].split('#')[0].split('.').pop()?.toLowerCase();
        if (SKIP_EXTENSIONS.has(`.${ext}`)) continue;

        try {
          const bundleResponse = await httpGetText(scriptUrl, {
            timeout: options.timeout,
            retries: 0,
          });

          if (bundleResponse.status === 200 && typeof bundleResponse.data === 'string') {
            // Only process files that look like JS (not HTML error pages)
            if (bundleResponse.data.length > 100 && !bundleResponse.data.startsWith('<!')) {
              results.push({
                filepath: scriptUrl,
                content: bundleResponse.data,
                type: 'js-bundle',
              });
            }
          }
        } catch (err) {
          // Skip failed bundle fetches
          logger.debug(`Bundle fetch failed for ${scriptUrl}: ${err.message}`);
        }
      }
    }
  } catch (err) {
    // Root page fetch failed — that's fine
    logger.debug(`Root page fetch failed for ${baseUrl}: ${err.message}`);
  }

  return results;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Extract <script src="..."> URLs from HTML as absolute http(s) URLs.
 *
 * Each `src` is resolved with the URL API against `${baseUrl}/`, the address
 * the page is fetched from. That covers root-relative, protocol-relative,
 * `./` and `../` references, and relative names that merely begin with
 * "http". Values that are not valid URLs or do not resolve to http(s)
 * (`data:`, `blob:`, `javascript:` ...) are skipped.
 *
 * @param {string} html - Page markup
 * @param {string} baseUrl - Base URL of the scanned site
 * @returns {string[]} Absolute script URLs in document order
 */
function extractScriptUrls(html, baseUrl) {
  const pattern = /<script[^>]+src=["']([^"']+)["']/gi;
  const pageUrl = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;
  const urls = [];

  for (const [, src] of html.matchAll(pattern)) {
    let resolved;
    try {
      resolved = new URL(src, pageUrl);
    } catch {
      // Not a resolvable URL; nothing to fetch.
      continue;
    }

    // Skip inline data URIs, blobs and any other non-fetchable scheme
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') continue;

    urls.push(resolved.href);
  }

  return urls;
}
|
|
173
|
+
|
|
174
|
+
/**
 * Check if a string looks like JSON (not an HTML page).
 *
 * Only the first non-whitespace character is inspected; nothing is parsed.
 *
 * @param {string} content - Response body
 * @returns {boolean} True when the body opens with `{` or `[`
 */
function isLikelyJson(content) {
  const firstChar = content.trim().charAt(0);
  return firstChar === '{' || firstChar === '[';
}
|
|
181
|
+
|
|
182
|
+
export default scanWeb;
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Constants used throughout npxconfuse
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
// Node.js built-in modules to filter out during extraction.
//
// Only the bare names are listed by hand; the `node:`-prefixed form of every
// entry (including sub-paths such as `node:fs/promises`) is derived from it,
// so the two can never drift apart.
export const NODE_BUILTINS = new Set([
  ...[
    "_http_agent",
    "_http_client",
    "_http_common",
    "_http_incoming",
    "_http_outgoing",
    "_http_server",
    "_stream_duplex",
    "_stream_passthrough",
    "_stream_readable",
    "_stream_transform",
    "_stream_wrap",
    "_stream_writable",
    "_tls_common",
    "_tls_wrap",
    "assert",
    "assert/strict",
    "async_hooks",
    "buffer",
    "child_process",
    "cluster",
    "console",
    "constants",
    "crypto",
    "dgram",
    "diagnostics_channel",
    "dns",
    "dns/promises",
    "domain",
    "events",
    "fs",
    "fs/promises",
    "http",
    "http2",
    "https",
    "inspector",
    "inspector/promises",
    "module",
    "net",
    "os",
    "path",
    "path/posix",
    "path/win32",
    "perf_hooks",
    "process",
    "punycode",
    "querystring",
    "readline",
    "readline/promises",
    "repl",
    "stream",
    "stream/consumers",
    "stream/promises",
    "stream/web",
    "string_decoder",
    "sys",
    "timers",
    "timers/promises",
    "tls",
    "trace_events",
    "tty",
    "url",
    "util",
    "util/types",
    "v8",
    "vm",
    "wasi",
    "worker_threads",
    "zlib",
  ].flatMap((name) => [name, `node:${name}`]),
  // Built-ins that exist only in prefixed form. The bare names are NOT
  // built-ins and must not be filtered.
  "node:sea",
  "node:sqlite",
  "node:test",
  "node:test/reporters",
]);
|
|
118
|
+
|
|
119
|
+
// Endpoints of the public npm registry and its downloads API.
const NPM_ENDPOINTS = {
  registry: "https://registry.npmjs.org",
  downloads: "https://api.npmjs.org/downloads/point/last-month",
};

/**
 * Registry endpoints, keyed by registry name.
 */
export const REGISTRIES = { npm: NPM_ENDPOINTS };
|
|
126
|
+
|
|
127
|
+
/** Default concurrency for parallel HTTP requests. */
export const DEFAULT_CONCURRENCY = 20;

/** Default HTTP timeout, in milliseconds. */
export const DEFAULT_TIMEOUT = 10 * 1000;
|
|
132
|
+
|
|
133
|
+
/**
 * Severity levels. Each key maps to the identical string value.
 */
export const SEVERITY = Object.fromEntries(
  ["CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"].map((level) => [level, level]),
);
|
|
141
|
+
|
|
142
|
+
/**
 * Finding types: constant name mapped to its kebab-case identifier.
 */
export const FINDING_TYPE = Object.fromEntries(
  ["NPX_CONFUSION", "DEPENDENCY_CONFUSION", "BIN_MISMATCH", "NAME_CLASH"].map(
    (key) => [key, key.toLowerCase().replaceAll("_", "-")],
  ),
);
|
|
149
|
+
|
|
150
|
+
/**
 * File extensions (lower-case, with leading dot) to skip in web scraping:
 * images, fonts, media, documents and archives.
 */
export const SKIP_EXTENSIONS = new Set(
  [
    // images
    "png", "jpg", "jpeg", "gif", "svg", "ico",
    // fonts
    "woff", "woff2", "ttf", "eot",
    // media
    "mp4", "webm", "mp3",
    // documents and archives
    "pdf", "zip", "tar",
  ].map((ext) => `.${ext}`),
);
|
|
169
|
+
|
|
170
|
+
/**
 * Paths to probe on web targets. Each starts with `/` so it can be appended
 * to a base URL that has no trailing slash.
 */
export const WEB_PROBE_PATHS = [
  "package.json",
  "package-lock.json",
  "npm-shrinkwrap.json",
].map((filename) => `/${filename}`);
|
|
176
|
+
|
|
177
|
+
/**
 * GitHub API defaults.
 * NOTE(review): presumably `perPage` is the page size requested per API call
 * and `maxRepos` caps how many repositories are processed — the consumer is
 * not visible here; confirm against the GitHub source module.
 */
export const GITHUB_DEFAULTS = { perPage: 100, maxRepos: 1000 };
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import { DEFAULT_TIMEOUT } from "./constants.js";
|
|
2
|
+
import logger from "./logger.js";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* HTTP client wrapper with retry, backoff, and rate limiting.
|
|
6
|
+
* Uses Node.js built-in fetch (available in Node 18+).
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Simple token-bucket rate limiter.
 *
 * The bucket starts full, holds at most `maxTokens` and gains `refillRate`
 * tokens per second. `acquire()` resolves once a whole token has been taken,
 * so the balance never goes negative, including when many callers wait at
 * the same time.
 */
class RateLimiter {
  /**
   * @param {number} maxTokens - Bucket capacity (burst size). Values below 1
   *   or non-finite values are raised to 1 so a token can always be earned.
   * @param {number} refillRate - Tokens added per second. A non-positive or
   *   non-finite rate disables limiting.
   */
  constructor(maxTokens, refillRate) {
    this.maxTokens = Number.isFinite(maxTokens) ? Math.max(1, maxTokens) : 1;
    this.tokens = this.maxTokens;
    this.refillRate = refillRate; // tokens per second
    this.lastRefill = Date.now();
  }

  /**
   * Wait until a token is available, then consume it.
   * @returns {Promise<void>}
   */
  async acquire() {
    // Such a rate could never refill the bucket; treat it as "unlimited"
    // rather than waiting forever.
    if (!(this.refillRate > 0) || !Number.isFinite(this.refillRate)) return;

    for (;;) {
      this._refill();
      if (this.tokens >= 1) {
        this.tokens -= 1;
        return;
      }

      // Sleep for the time needed to earn the missing fraction, then
      // re-check: another waiter may have taken the token in the meantime.
      const waitMs = Math.ceil(((1 - this.tokens) / this.refillRate) * 1000);
      await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
  }

  /** Credit the tokens earned since the last refill, capped at capacity. */
  _refill() {
    const now = Date.now();
    const elapsed = (now - this.lastRefill) / 1000;
    this.tokens = Math.min(
      this.maxTokens,
      this.tokens + elapsed * this.refillRate,
    );
    this.lastRefill = now;
  }
}
|
|
41
|
+
|
|
42
|
+
// Per-host rate limiters, keyed by hostname and created on first use.
const limiters = new Map();

/**
 * Return the rate limiter for a host, creating it on first request.
 *
 * `rate` is used only when the limiter is created; later calls for the same
 * hostname return the existing limiter unchanged.
 *
 * @param {string} hostname - Host the limiter applies to
 * @param {number} [rate=30] - Bucket capacity and refill rate (requests/second)
 * @returns {RateLimiter}
 */
function getLimiter(hostname, rate = 30) {
  let limiter = limiters.get(hostname);
  if (limiter === undefined) {
    limiter = new RateLimiter(rate, rate);
    limiters.set(hostname, limiter);
  }
  return limiter;
}
|
|
51
|
+
|
|
52
|
+
// Upper bound for any single wait after a 429/5xx response, so a hostile or
// misconfigured Retry-After header cannot stall a scan.
const MAX_RETRY_WAIT_MS = 30000;

/**
 * Convert a Retry-After header value to a wait in milliseconds.
 * Accepts both forms allowed by HTTP: delay-seconds and an HTTP-date.
 *
 * @param {string | null} headerValue - Raw header value
 * @returns {number | null} Wait clamped to [0, MAX_RETRY_WAIT_MS], or `null`
 *   when the header is absent or unparseable.
 */
function parseRetryAfterMs(headerValue) {
  if (!headerValue || headerValue.trim() === "") return null;

  const seconds = Number(headerValue);
  if (Number.isFinite(seconds)) {
    return Math.min(Math.max(0, seconds * 1000), MAX_RETRY_WAIT_MS);
  }

  const retryAt = Date.parse(headerValue);
  if (!Number.isNaN(retryAt)) {
    return Math.min(Math.max(0, retryAt - Date.now()), MAX_RETRY_WAIT_MS);
  }

  return null;
}

/**
 * Release the body of a response we are not going to read.
 * Best-effort: a failure here must not change the outcome of the request.
 */
async function discardBody(response) {
  try {
    await response.body?.cancel();
  } catch {
    // Nothing useful to do; the response is being dropped anyway.
  }
}

/**
 * Fetch with retry and exponential backoff.
 *
 * - 4xx responses other than 429 are returned immediately with `data: null`.
 * - 429 and 5xx responses are retried, honouring Retry-After (seconds or
 *   HTTP-date, capped at 30 s) and otherwise backing off exponentially.
 * - Network errors and timeouts are retried with exponential backoff; once
 *   retries are exhausted the result has `status: 0` and an `error` message.
 * - The timeout covers the body download as well as the response headers.
 * - A body declared as JSON that does not parse is returned as text.
 *
 * @param {string} url
 * @param {object} options
 * @param {number} options.timeout - ms
 * @param {number} options.retries - max retry count
 * @param {number} options.rateLimit - requests per second per host
 * @param {object} options.headers - additional headers
 * @returns {Promise<{status: number, data: any, headers: Headers | null, error?: string}>}
 * @throws {TypeError} When `url` is not a valid absolute URL.
 */
export async function httpGet(url, options = {}) {
  const {
    timeout = DEFAULT_TIMEOUT,
    retries = 3,
    rateLimit = 30,
    headers = {},
  } = options;

  const parsedUrl = new URL(url);
  const limiter = getLimiter(parsedUrl.hostname, rateLimit);

  for (let attempt = 0; attempt <= retries; attempt++) {
    await limiter.acquire();

    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeout);
    let retryWaitMs;

    try {
      logger.debug(`HTTP GET ${url} (attempt ${attempt + 1})`);

      const response = await fetch(url, {
        signal: controller.signal,
        headers: {
          "User-Agent": "npxconfuse/1.0 (security-scanner)",
          Accept: "application/json",
          ...headers,
        },
      });
      const { status } = response;

      // Don't retry 4xx (except 429)
      if (status >= 400 && status < 500 && status !== 429) {
        await discardBody(response);
        return { status, data: null, headers: response.headers };
      }

      if (status === 429 || status >= 500) {
        // Retry on 429 or 5xx
        const waitMs =
          parseRetryAfterMs(response.headers.get("retry-after")) ??
          Math.min(1000 * Math.pow(2, attempt), MAX_RETRY_WAIT_MS);

        logger.debug(
          `Rate limited or server error (${status}), waiting ${waitMs}ms...`,
        );
        await discardBody(response);

        if (attempt >= retries) {
          return { status, data: null, headers: response.headers };
        }
        retryWaitMs = waitMs;
      } else {
        // Read the body while the timeout is still armed, then parse JSON
        // or keep the text.
        const contentType = response.headers.get("content-type") || "";
        const body = await response.text();
        let data = body;
        if (contentType.includes("application/json")) {
          try {
            data = JSON.parse(body);
          } catch {
            logger.debug(`Invalid JSON body from ${url}; returning raw text`);
          }
        }

        return { status, data, headers: response.headers };
      }
    } catch (err) {
      if (err.name === "AbortError") {
        logger.debug(`Request timed out: ${url}`);
      } else {
        logger.debug(`Request failed: ${url} — ${err.message}`);
      }

      if (attempt >= retries) {
        return {
          status: 0,
          data: null,
          headers: null,
          error: err.message,
        };
      }
      retryWaitMs = Math.min(1000 * Math.pow(2, attempt), 15000);
    } finally {
      clearTimeout(timer);
    }

    // Back off outside the try block so the request timer is already cleared.
    await new Promise((resolve) => setTimeout(resolve, retryWaitMs));
  }

  // Only reachable when the loop never ran (e.g. a negative `retries`).
  return {
    status: 0,
    data: null,
    headers: null,
    error: "No request attempted",
  };
}
|
|
167
|
+
|
|
168
|
+
/**
 * Fetch with HTML/text response (for web scraping).
 *
 * Delegates to httpGet with an Accept header that prefers HTML and admits
 * any content type. Caller-supplied headers, including `Accept`, take
 * precedence over that default.
 *
 * @param {string} url
 * @param {object} [options] - Same options as httpGet
 * @returns {Promise<{status: number, data: any, headers: Headers}>}
 */
export async function httpGetText(url, options = {}) {
  const { headers: callerHeaders, ...passThrough } = options;
  const headers = {
    Accept: "text/html, application/json, text/plain, */*",
    ...callerHeaders,
  };
  return httpGet(url, { ...passThrough, headers });
}
|