muaddib-scanner 2.10.19 → 2.10.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.10.19",
3
+ "version": "2.10.21",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,5 +1,6 @@
1
1
  const { NPM_PACKAGE_REGEX } = require('../shared/constants.js');
2
2
  const { debugLog } = require('../utils.js');
3
+ const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
3
4
 
4
5
  const REGISTRY_URL = 'https://registry.npmjs.org';
5
6
  const DOWNLOADS_URL = 'https://api.npmjs.org/downloads/point/last-week';
@@ -94,7 +95,12 @@ async function getPackageMetadata(packageName) {
94
95
  }
95
96
  if (!meta) {
96
97
  const registryUrl = REGISTRY_URL + '/' + encodeURIComponent(packageName);
97
- meta = await fetchWithRetry(registryUrl);
98
+ await acquireRegistrySlot();
99
+ try {
100
+ meta = await fetchWithRetry(registryUrl);
101
+ } finally {
102
+ releaseRegistrySlot();
103
+ }
98
104
  }
99
105
  if (!meta) return null;
100
106
 
@@ -121,9 +127,16 @@ async function getPackageMetadata(packageName) {
121
127
  ? SEARCH_URL + '?text=maintainer:' + encodeURIComponent(maintainer) + '&size=1'
122
128
  : null;
123
129
 
130
+ async function fetchAuthorWithSlot() {
131
+ if (!authorUrl) return null;
132
+ await acquireRegistrySlot();
133
+ try { return await fetchWithRetry(authorUrl); }
134
+ finally { releaseRegistrySlot(); }
135
+ }
136
+
124
137
  const [downloadsData, authorData] = await Promise.all([
125
- fetchWithRetry(downloadsUrl),
126
- authorUrl ? fetchWithRetry(authorUrl) : Promise.resolve(null)
138
+ fetchWithRetry(downloadsUrl), // api.npmjs.org — no semaphore needed
139
+ fetchAuthorWithSlot() // registry.npmjs.org — semaphore protected
127
140
  ]);
128
141
 
129
142
  const weeklyDownloads = downloadsData?.downloads ?? 0;
@@ -0,0 +1,54 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Centralized HTTP concurrency limiter for npm registry requests.
5
+ *
6
+ * With 16 monitor workers × 7+ HTTP requests/package, uncapped concurrency
7
+ * reaches 112+ simultaneous requests — well above npm's implicit rate limit.
8
+ * This module caps ALL registry.npmjs.org requests to a single semaphore
9
+ * so that no more than REGISTRY_SEMAPHORE_MAX requests are in-flight at once.
10
+ *
11
+ * Consumers: temporal-analysis.js, temporal-ast-diff.js, monitor.js (getNpmLatestTarball),
12
+ * npm-registry.js (fetchWithRetry to registry.npmjs.org).
13
+ * NOT covered: api.npmjs.org (different server), replicate.npmjs.com (CouchDB changes stream).
14
+ */
15
+
16
+ const REGISTRY_SEMAPHORE_MAX = 10;
17
+
18
+ const _semaphore = { active: 0, queue: [] };
19
+
20
+ function acquireRegistrySlot() {
21
+ if (_semaphore.active < REGISTRY_SEMAPHORE_MAX) {
22
+ _semaphore.active++;
23
+ return Promise.resolve();
24
+ }
25
+ return new Promise(resolve => {
26
+ _semaphore.queue.push(resolve);
27
+ });
28
+ }
29
+
30
+ function releaseRegistrySlot() {
31
+ if (_semaphore.queue.length > 0) {
32
+ const next = _semaphore.queue.shift();
33
+ next(); // Transfers slot to next waiter (active count stays the same)
34
+ } else {
35
+ _semaphore.active--;
36
+ }
37
+ }
38
+
39
+ function resetLimiter() {
40
+ _semaphore.active = 0;
41
+ _semaphore.queue.length = 0;
42
+ }
43
+
44
+ function getActiveSemaphore() {
45
+ return _semaphore;
46
+ }
47
+
48
+ module.exports = {
49
+ REGISTRY_SEMAPHORE_MAX,
50
+ acquireRegistrySlot,
51
+ releaseRegistrySlot,
52
+ resetLimiter,
53
+ getActiveSemaphore
54
+ };
@@ -1,4 +1,5 @@
1
1
  const https = require('https');
2
+ const { acquireRegistrySlot, releaseRegistrySlot, resetLimiter, getActiveSemaphore, REGISTRY_SEMAPHORE_MAX } = require('./shared/http-limiter.js');
2
3
 
3
4
  const REGISTRY_URL = 'https://registry.npmjs.org';
4
5
  const TIMEOUT_MS = 10_000;
@@ -6,9 +7,11 @@ const MAX_RESPONSE_SIZE = 50 * 1024 * 1024; // 50MB (some packages have lots of
6
7
 
7
8
  // Metadata cache: avoids duplicate HTTP requests when multiple temporal modules
8
9
  // fetch the same package metadata within a short window (monitor pipeline).
9
- const _metadataCache = new Map(); // packageName { data, fetchedAt }
10
+ // Entries with error=true are negative cache (shorter TTL) to avoid retry storms.
11
+ const _metadataCache = new Map(); // packageName → { data, fetchedAt, error? }
10
12
  const _inflightRequests = new Map(); // packageName → Promise
11
13
  const METADATA_CACHE_TTL = 5 * 60 * 1000; // 5 minutes
14
+ const NEGATIVE_CACHE_TTL = 60 * 1000; // 60 seconds for failed fetches
12
15
  const METADATA_CACHE_MAX = 200;
13
16
 
14
17
  const LIFECYCLE_SCRIPTS = [
@@ -24,9 +27,30 @@ const LIFECYCLE_SCRIPTS = [
24
27
 
25
28
/**
 * Raw HTTP fetch — always hits the npm registry. Use fetchPackageMetadata() instead,
 * which adds caching, inflight dedup, and semaphore.
 * Holds a shared registry semaphore slot for the duration of the request and
 * records failures in the negative cache so repeat lookups back off.
 * @param {string} packageName - npm package name (scoped or unscoped)
 * @returns {Promise<object>} full registry metadata
 * @throws re-throws any fetch error after recording it in the negative cache
 */
async function _fetchPackageMetadataImpl(packageName) {
  await acquireRegistrySlot();
  try {
    return await _fetchPackageMetadataHttp(packageName);
  } catch (err) {
    _recordFetchFailure(packageName);
    throw err; // caller still sees the original error
  } finally {
    releaseRegistrySlot();
  }
}

// Store a negative-cache entry (error=true, shorter TTL) for a failed fetch to
// prevent retry storms; evicts the oldest entry when the cache is at capacity.
function _recordFetchFailure(packageName) {
  if (_metadataCache.size >= METADATA_CACHE_MAX) {
    const evictKey = _metadataCache.keys().next().value;
    _metadataCache.delete(evictKey);
  }
  _metadataCache.set(packageName, { data: null, error: true, fetchedAt: Date.now() });
}
49
+
50
+ /**
51
+ * Low-level HTTP request to npm registry. No caching, no semaphore.
52
+ */
53
+ function _fetchPackageMetadataHttp(packageName) {
30
54
  const encodedName = encodeURIComponent(packageName).replace('%40', '@');
31
55
  const url = `${REGISTRY_URL}/${encodedName}`;
32
56
  const urlObj = new URL(url);
@@ -103,16 +127,23 @@ function _fetchPackageMetadataImpl(packageName) {
103
127
  }
104
128
 
105
129
  /**
106
- * Fetch full package metadata from the npm registry with caching and inflight dedup.
107
- * Multiple callers requesting the same package within 5 minutes share one HTTP request.
130
+ * Fetch full package metadata from the npm registry with caching, inflight dedup,
131
+ * negative cache, and HTTP semaphore. Multiple callers requesting the same package
132
+ * within 5 minutes share one HTTP request. Failed fetches are cached for 60s.
108
133
  * @param {string} packageName - npm package name (scoped or unscoped)
109
134
  * @returns {Promise<object>} Full registry metadata (versions, time, maintainers, etc.)
110
135
  */
111
136
  function fetchPackageMetadata(packageName) {
112
- // Check cache first (TTL-based)
137
+ // Check cache first (TTL-based, positive + negative)
113
138
  const cached = _metadataCache.get(packageName);
114
- if (cached && (Date.now() - cached.fetchedAt) < METADATA_CACHE_TTL) {
115
- return Promise.resolve(cached.data);
139
+ if (cached) {
140
+ const ttl = cached.error ? NEGATIVE_CACHE_TTL : METADATA_CACHE_TTL;
141
+ if ((Date.now() - cached.fetchedAt) < ttl) {
142
+ if (cached.error) {
143
+ return Promise.reject(new Error(`Negative cache hit for ${packageName} (failed ${Math.round((Date.now() - cached.fetchedAt) / 1000)}s ago)`));
144
+ }
145
+ return Promise.resolve(cached.data);
146
+ }
116
147
  }
117
148
 
118
149
  // Dedup inflight requests — if the same package is already being fetched, reuse that Promise
@@ -128,11 +159,12 @@ function fetchPackageMetadata(packageName) {
128
159
  }
129
160
 
130
161
/**
 * Clear the metadata cache and reset shared semaphore. Exported for tests and monitor reset.
 */
function clearMetadataCache() {
  // Drop every per-package store, then return the shared limiter to its initial state.
  for (const store of [_metadataCache, _inflightRequests]) {
    store.clear();
  }
  resetLimiter();
}
137
169
 
138
170
  /**
@@ -309,5 +341,9 @@ module.exports = {
309
341
  _metadataCache,
310
342
  _inflightRequests,
311
343
  METADATA_CACHE_TTL,
312
- METADATA_CACHE_MAX
344
+ METADATA_CACHE_MAX,
345
+ NEGATIVE_CACHE_TTL,
346
+ // Re-export shared semaphore for backward compat with existing tests
347
+ _httpSemaphore: getActiveSemaphore(),
348
+ HTTP_SEMAPHORE_MAX: REGISTRY_SEMAPHORE_MAX
313
349
  };
@@ -7,6 +7,7 @@ const walk = require('acorn-walk');
7
7
  const { findJsFiles, forEachSafeFile, debugLog } = require('./utils.js');
8
8
  const { fetchPackageMetadata, getLatestVersions } = require('./temporal-analysis.js');
9
9
  const { downloadToFile, extractTarGz, sanitizePackageName } = require('./shared/download.js');
10
+ const { acquireRegistrySlot, releaseRegistrySlot } = require('./shared/http-limiter.js');
10
11
 
11
12
  const { MAX_FILE_SIZE, getMaxFileSize, ACORN_OPTIONS, safeParse } = require('./shared/constants.js');
12
13
 
@@ -36,11 +37,21 @@ const PATTERN_SEVERITY = {
36
37
 
37
38
/**
 * Fetch version-specific metadata from npm registry.
 * Acquires a shared HTTP semaphore slot to prevent registry throttling.
 * @param {string} packageName
 * @param {string} version
 * @returns {Promise<object>}
 */
async function fetchVersionMetadata(packageName, version) {
  await acquireRegistrySlot();
  let metadata;
  try {
    metadata = await _fetchVersionMetadataHttp(packageName, version);
  } finally {
    // Slot is returned whether the request succeeded or threw.
    releaseRegistrySlot();
  }
  return metadata;
}
53
+
54
+ function _fetchVersionMetadataHttp(packageName, version) {
44
55
  const encodedName = encodeURIComponent(packageName).replace('%40', '@');
45
56
  const url = `${REGISTRY_URL}/${encodedName}/${encodeURIComponent(version)}`;
46
57
  const urlObj = new URL(url);
@@ -99,7 +110,13 @@ async function fetchPackageTarball(packageName, version) {
99
110
  let extractedDir;
100
111
  try {
101
112
  const tgzPath = path.join(tmpDir, 'package.tar.gz');
102
- await downloadToFile(tarballUrl, tgzPath);
113
+ // Tarball downloads go through the shared semaphore (npm CDN)
114
+ await acquireRegistrySlot();
115
+ try {
116
+ await downloadToFile(tarballUrl, tgzPath);
117
+ } finally {
118
+ releaseRegistrySlot();
119
+ }
103
120
  extractedDir = extractTarGz(tgzPath, tmpDir);
104
121
  } catch (err) {
105
122
  try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch (e) { debugLog('tmpDir cleanup failed:', e.message); }
package/src/utils.js CHANGED
@@ -28,8 +28,10 @@ let _filesCapped = false;
28
28
  * File content cache — read each file once, reused across all scanners in a single scan.
29
29
  * Key = absolute file path, Value = file content string.
30
30
  * Cleared between scans via clearFileListCache().
31
+ * Capped at 500 entries to prevent OOM during evaluate (200 packages sequential).
31
32
  */
32
33
  const _fileContentCache = new Map();
34
+ const _FILE_CONTENT_CACHE_MAX = 500;
33
35
 
34
36
  function setExtraExcludes(dirs, scanRoot) {
35
37
  _extraExcludedDirs = Array.isArray(dirs) ? dirs : [];
@@ -339,7 +341,8 @@ function forEachSafeFile(files, callback) {
339
341
  content = fs.readFileSync(file, 'utf8');
340
342
  } catch { continue; }
341
343
 
342
- // Cache for subsequent scanners
344
+ // Cache for subsequent scanners (evict all if over cap to prevent OOM in evaluate loops)
345
+ if (_fileContentCache.size >= _FILE_CONTENT_CACHE_MAX) _fileContentCache.clear();
343
346
  _fileContentCache.set(file, content);
344
347
  callback(file, content);
345
348
  }