muaddib-scanner 2.11.57 → 2.11.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.57",
3
+ "version": "2.11.59",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "target": "node_modules",
3
- "timestamp": "2026-06-04T20:24:41.702Z",
3
+ "timestamp": "2026-06-05T06:40:48.592Z",
4
4
  "threats": [
5
5
  {
6
6
  "type": "string_mutation_obfuscation",
@@ -445,6 +445,24 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
445
445
  // ML Phase 2a: Count JS files and detect test presence for enriched features
446
446
  const { fileCountTotal, hasTests } = countPackageFiles(extractedDir);
447
447
 
448
+ // Hoisted before the worker spawn (per-worker 429-storm fix): fetch the npm
449
+ // registry metadata ONCE on the main thread. The shared http-limiter coordinates
450
+ // it and the temporal cache is warm (npm-registry.js reads it first), so only
451
+ // weekly_downloads + author hit the network. Passed to the worker via scanContext
452
+ // so the worker's processor consumes it instead of re-fetching on its OWN module-
453
+ // level limiter — N worker_threads = N uncoordinated limiters → ~Nx npm throughput
454
+ // → 429 bursts. Also reused below (ML / first-publish / training records /
455
+ // reputation) — previously this was a SECOND main-side fetch after the worker.
456
+ let npmRegistryMeta = null;
457
+ if (ecosystem === 'npm') {
458
+ try {
459
+ const { getPackageMetadata } = require('../scanner/npm-registry.js');
460
+ npmRegistryMeta = await getPackageMetadata(name);
461
+ } catch (err) {
462
+ console.error(`[ML] npm registry fetch failed for ${name}: ${err.message}`);
463
+ }
464
+ }
465
+
448
466
  let result;
449
467
  try {
450
468
  // scanContext: feeds monitor-side info (name/version/ecosystem) and the
@@ -463,6 +481,11 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
463
481
  // the full 20-scanner pipeline (unchanged behaviour).
464
482
  scanMode: (meta && meta.scanMode) || 'full'
465
483
  };
484
+ // Hand the main-thread-fetched metadata to the worker so its processor skips
485
+ // the per-worker getPackageMetadata fetch (429-storm fix). npm only; the key
486
+ // is set even when null ("main already tried, don't refetch"). pypi leaves it
487
+ // absent so the worker takes the unchanged CLI/else-if path.
488
+ if (ecosystem === 'npm') scanContext.npmRegistryMeta = npmRegistryMeta;
466
489
  result = await runScanInWorker(extractedDir, STATIC_SCAN_TIMEOUT_MS, scanContext, signal);
467
490
  } catch (staticErr) {
468
491
  if (/static scan timeout/i.test(staticErr.message)) {
@@ -494,22 +517,11 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
494
517
  // First-publish detection: used for sandbox priority below
495
518
  const isFirstPublish = cacheTrigger && cacheTrigger.reason === 'first_publish';
496
519
 
497
- // Fetch npm registry metadata for ALL npm packages (not just those with findings).
498
- // Needed for: (1) isFirstPublishHighRisk decision, (2) ML classifier features,
499
- // (3) JSONL training records — clean packages MUST have metadata to prevent
500
- // data leakage (model learning "metadata=0 clean" instead of behavioral signals).
501
- // Cost: near-zero for npm packages because temporal checks (line ~1014) already
502
- // pre-fetch registry metadata into temporal-analysis._metadataCache, and
503
- // getPackageMetadata() reads this cache first (npm-registry.js:87-95).
504
- let npmRegistryMeta = null;
505
- if (ecosystem === 'npm') {
506
- try {
507
- const { getPackageMetadata } = require('../scanner/npm-registry.js');
508
- npmRegistryMeta = await getPackageMetadata(name);
509
- } catch (err) {
510
- console.error(`[ML] npm registry fetch failed for ${name}: ${err.message}`);
511
- }
512
- }
520
+ // npm registry metadata was fetched ONCE before the worker spawn (hoisted above
521
+ // to feed scanContext.npmRegistryMeta) and is reused here for: isFirstPublishHigh-
522
+ // Risk, ML classifier features, JSONL training records, and reputation scoring.
523
+ // Clean packages MUST carry metadata to prevent training-data leakage (model
524
+ // learning "metadata=0 clean" instead of behavioral signals).
513
525
 
514
526
  // First-publish sandbox priority: sandbox even with 0 static findings
515
527
  // if the package is from a new/unknown maintainer without a linked repository.
@@ -191,6 +191,31 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
191
191
  // 3 metadata-dependent gates (METADATA_FACTOR, MATURE_CAP, DELTA_MODE) in
192
192
  // one shot. Individual gates can still be turned off via their own =0 flag.
193
193
  if (
194
+ packageName &&
195
+ _pkgMeta &&
196
+ options &&
197
+ Object.prototype.hasOwnProperty.call(options, 'npmRegistryMeta')
198
+ ) {
199
+ // The monitor fetched the registry metadata ONCE on the main thread (shared
200
+ // http-limiter, warm temporal cache) and passed it via scanContext.npmRegistry-
201
+ // Meta. Consume it here instead of re-fetching: a per-worker getPackageMetadata()
202
+ // runs on the worker's OWN module-level limiter, so N worker_threads = N
203
+ // uncoordinated limiters → ~Nx npm throughput → 429 storms. Strict semantics —
204
+ // the key being present (even null) means "main already handled it"; a null
205
+ // value (main fetch failed) leaves the gates to no-op, identical to a failed
206
+ // fetch (best-effort metadata signals stay silent). CLI / `muaddib replay`
207
+ // never set the key → the else-if fetch path below is unchanged.
208
+ const injected = options.npmRegistryMeta;
209
+ if (injected) {
210
+ // Attach the scanned version (getPackageMetadata never sets it) so
211
+ // applyMatureStableCap can require scan_version === latest_version — a
212
+ // historical compromised version must not inherit live "stable" reputation.
213
+ if (injected.scan_version == null && packageVersion != null) {
214
+ injected.scan_version = packageVersion;
215
+ }
216
+ _pkgMeta.npmRegistryMeta = injected;
217
+ }
218
+ } else if (
194
219
  packageName &&
195
220
  _pkgMeta &&
196
221
  globalThis.process.env.MUADDIB_NO_REGISTRY_FETCH !== '1' &&
@@ -1,14 +1,16 @@
1
1
  const { NPM_PACKAGE_REGEX } = require('../shared/constants.js');
2
2
  const { debugLog } = require('../utils.js');
3
- const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
3
+ const { acquireRegistrySlot, releaseRegistrySlot, signal429 } = require('../shared/http-limiter.js');
4
4
  const { computeAdvancedRegistrySignals } = require('../integrations/registry-signals.js');
5
5
 
6
6
  const REGISTRY_URL = 'https://registry.npmjs.org';
7
7
  const DOWNLOADS_URL = 'https://api.npmjs.org/downloads/point/last-week';
8
8
  const SEARCH_URL = 'https://registry.npmjs.org/-/v1/search';
9
9
 
10
- const REQUEST_TIMEOUT = 10000; // 10 seconds
11
- const MAX_RETRIES = 3;
10
+ // Env-tunable; defaults preserve prior behavior except MAX_RETRIES (3 → 5) for more headroom
11
+ // under sustained 429s during a large evaluate burst.
12
+ const REQUEST_TIMEOUT = Math.max(1000, parseInt(process.env.MUADDIB_REGISTRY_TIMEOUT_MS, 10) || 10000); // 10s default
13
+ const MAX_RETRIES = Math.max(1, parseInt(process.env.MUADDIB_REGISTRY_RETRIES, 10) || 5);
12
14
 
13
15
  /**
14
16
  * Create a timeout signal, with fallback for older Node versions.
@@ -31,10 +33,12 @@ async function fetchWithRetry(url) {
31
33
  response = await fetch(url, { signal });
32
34
  } catch {
33
35
  cleanup();
34
- // REG-001: Retry on timeout/abort instead of returning null immediately
36
+ // REG-001: Retry on timeout/abort instead of returning null immediately.
37
+ // Jittered exponential backoff avoids synchronized retry storms across the
38
+ // (up to MUADDIB_REGISTRY_CONCURRENCY) concurrent fetches.
35
39
  if (attempt < MAX_RETRIES - 1) {
36
40
  const backoff = Math.min(1000 * Math.pow(2, attempt), 8000);
37
- await new Promise(r => setTimeout(r, backoff));
41
+ await new Promise(r => setTimeout(r, Math.round(backoff * (0.5 + Math.random() * 0.5))));
38
42
  }
39
43
  continue;
40
44
  }
@@ -48,12 +52,16 @@ async function fetchWithRetry(url) {
48
52
  return null;
49
53
  }
50
54
 
51
- // 429 = rate limit, respect Retry-After header (capped at 30s)
55
+ // 429 = rate limit. Drain the SHARED token bucket so EVERY in-flight request
56
+ // (not just this one) backs off together — fixes the thundering-herd 429 storm
57
+ // that left ~17% of packages metadata-less in a local evaluate run. Then honor
58
+ // Retry-After (capped at 30s), jittered to 50–100% of it, so retries don't re-synchronize.
52
59
  if (response.status === 429) {
53
60
  try { await response.text(); } catch (e) { debugLog('response drain failed:', e.message); }
61
+ try { signal429(); } catch { /* limiter is best-effort */ }
54
62
  const retryAfter = parseInt(response.headers.get('retry-after'), 10);
55
- const delay = Math.min(retryAfter && retryAfter > 0 ? retryAfter * 1000 : 2000, 30000);
56
- await new Promise(r => setTimeout(r, delay));
63
+ const base = Math.min(retryAfter && retryAfter > 0 ? retryAfter * 1000 : 2000, 30000);
64
+ await new Promise(r => setTimeout(r, Math.round(base * (0.5 + Math.random() * 0.5))));
57
65
  continue;
58
66
  }
59
67
 
@@ -29,6 +29,7 @@
29
29
  const fs = require('fs');
30
30
  const path = require('path');
31
31
  const https = require('https');
32
+ const { acquireRegistrySlot, releaseRegistrySlot, signal429 } = require('../shared/http-limiter.js');
32
33
 
33
34
  const TRUSTED_DEP_AGE_THRESHOLD_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
34
35
 
@@ -80,7 +81,17 @@ function httpsGet(url, timeoutMs = 30_000) {
80
81
  async function checkDepDiff(name, newVersion) {
81
82
  const findings = [];
82
83
  try {
83
- const body = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(name)}`, PACKUMENT_TIMEOUT_MS);
84
+ // Route through the shared http-limiter (concurrency + token bucket + 429
85
+ // backoff) instead of a raw uncoordinated httpsGet — this scanner runs inside
86
+ // the monitor worker_threads, where an unbounded fetch joins the per-worker
87
+ // 429 storm. finally-release keeps the semaphore balanced even on reject.
88
+ await acquireRegistrySlot();
89
+ let body;
90
+ try {
91
+ body = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(name)}`, PACKUMENT_TIMEOUT_MS);
92
+ } finally {
93
+ releaseRegistrySlot();
94
+ }
84
95
  const packument = JSON.parse(body);
85
96
 
86
97
  if (!packument.versions || !packument.time) return findings;
@@ -107,13 +118,20 @@ async function checkDepDiff(name, newVersion) {
107
118
  for (const dep of addedDeps) {
108
119
  let ageMs = null;
109
120
  try {
110
- const depBody = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(dep)}`, DEP_AGE_TIMEOUT_MS);
121
+ await acquireRegistrySlot();
122
+ let depBody;
123
+ try {
124
+ depBody = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(dep)}`, DEP_AGE_TIMEOUT_MS);
125
+ } finally {
126
+ releaseRegistrySlot();
127
+ }
111
128
  const depData = JSON.parse(depBody);
112
129
  const created = depData.time && depData.time.created;
113
130
  if (created) {
114
131
  ageMs = Date.now() - new Date(created).getTime();
115
132
  }
116
133
  } catch (err) {
134
+ if (/HTTP 429/.test(err.message)) { try { signal429(); } catch { /* limiter best-effort */ } }
117
135
  console.log(`[SCANNER] trusted-dep-diff: could not check age of dependency ${dep}: ${err.message}`);
118
136
  }
119
137
 
@@ -152,6 +170,7 @@ async function checkDepDiff(name, newVersion) {
152
170
 
153
171
  return findings;
154
172
  } catch (err) {
173
+ if (/HTTP 429/.test(err.message)) { try { signal429(); } catch { /* limiter best-effort */ } }
155
174
  console.log(`[SCANNER] trusted-dep-diff: check failed for ${name}@${newVersion}: ${err.message}`);
156
175
  return findings;
157
176
  }
@@ -4,8 +4,8 @@
4
4
  * Centralized HTTP concurrency + rate limiter for npm registry requests.
5
5
  *
6
6
  * Two layers of protection:
7
- * 1. Concurrency semaphore (REGISTRY_SEMAPHORE_MAX = 10) — caps in-flight requests
8
- * 2. Rate limiter (RATE_LIMIT_PER_SEC = 30) — caps requests/second via token bucket
7
+ * 1. Concurrency semaphore (REGISTRY_SEMAPHORE_MAX, default 20, env MUADDIB_REGISTRY_CONCURRENCY) — caps in-flight requests
8
+ * 2. Rate limiter (RATE_LIMIT_PER_SEC, default 30, env MUADDIB_REGISTRY_RATE) — caps requests/second via token bucket
9
9
  *
10
10
  * Without rate limiting, 10 concurrent slots × fast-completing requests = 100+ req/s
11
11
  * bursts that trigger npm 429 responses → exponential backoff → scan times 10s→90s.
@@ -14,8 +14,11 @@
14
14
  * NOT covered: api.npmjs.org (different server), replicate.npmjs.com (CouchDB changes stream).
15
15
  */
16
16
 
17
- const REGISTRY_SEMAPHORE_MAX = 20;
18
- const RATE_LIMIT_PER_SEC = 30;
17
+ // Env-tunable so a constrained client (e.g. local/Windows `evaluate` runs that hit npm 429s during
18
+ // the ~1644-request metadata burst over 548 packages) can dial the burst down without code edits.
19
+ // Defaults preserve prior behavior (20 in-flight / 30 req/s).
20
+ const REGISTRY_SEMAPHORE_MAX = Math.max(1, parseInt(process.env.MUADDIB_REGISTRY_CONCURRENCY, 10) || 20);
21
+ const RATE_LIMIT_PER_SEC = Math.max(1, parseInt(process.env.MUADDIB_REGISTRY_RATE, 10) || 30);
19
22
 
20
23
  // --- Concurrency semaphore ---
21
24