muaddib-scanner 2.11.58 → 2.11.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.58",
3
+ "version": "2.11.59",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "target": "node_modules",
3
- "timestamp": "2026-06-04T21:33:40.755Z",
3
+ "timestamp": "2026-06-05T06:40:48.592Z",
4
4
  "threats": [
5
5
  {
6
6
  "type": "string_mutation_obfuscation",
@@ -445,6 +445,24 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
445
445
  // ML Phase 2a: Count JS files and detect test presence for enriched features
446
446
  const { fileCountTotal, hasTests } = countPackageFiles(extractedDir);
447
447
 
448
+ // Hoisted before the worker spawn (per-worker 429-storm fix): fetch the npm
449
+ // registry metadata ONCE on the main thread. The shared http-limiter coordinates
450
+ // it and the temporal cache is warm (npm-registry.js reads it first), so only
451
+ // weekly_downloads + author hit the network. Passed to the worker via scanContext
452
+ // so the worker's processor consumes it instead of re-fetching on its OWN module-
453
+ // level limiter — N worker_threads = N uncoordinated limiters → ~Nx npm throughput
454
+ // → 429 bursts. Also reused below (ML / first-publish / training records /
455
+ // reputation) — previously this was a SECOND main-side fetch after the worker.
456
+ let npmRegistryMeta = null;
457
+ if (ecosystem === 'npm') {
458
+ try {
459
+ const { getPackageMetadata } = require('../scanner/npm-registry.js');
460
+ npmRegistryMeta = await getPackageMetadata(name);
461
+ } catch (err) {
462
+ console.error(`[ML] npm registry fetch failed for ${name}: ${err.message}`);
463
+ }
464
+ }
465
+
448
466
  let result;
449
467
  try {
450
468
  // scanContext: feeds monitor-side info (name/version/ecosystem) and the
@@ -463,6 +481,11 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
463
481
  // the full 20-scanner pipeline (unchanged behaviour).
464
482
  scanMode: (meta && meta.scanMode) || 'full'
465
483
  };
484
+ // Hand the main-thread-fetched metadata to the worker so its processor skips
485
+ // the per-worker getPackageMetadata fetch (429-storm fix). npm only; the key
486
+ // is set even when null ("main already tried, don't refetch"). pypi leaves it
487
+ // absent so the worker takes the unchanged CLI/else-if path.
488
+ if (ecosystem === 'npm') scanContext.npmRegistryMeta = npmRegistryMeta;
466
489
  result = await runScanInWorker(extractedDir, STATIC_SCAN_TIMEOUT_MS, scanContext, signal);
467
490
  } catch (staticErr) {
468
491
  if (/static scan timeout/i.test(staticErr.message)) {
@@ -494,22 +517,11 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
494
517
  // First-publish detection: used for sandbox priority below
495
518
  const isFirstPublish = cacheTrigger && cacheTrigger.reason === 'first_publish';
496
519
 
497
- // Fetch npm registry metadata for ALL npm packages (not just those with findings).
498
- // Needed for: (1) isFirstPublishHighRisk decision, (2) ML classifier features,
499
- // (3) JSONL training records — clean packages MUST have metadata to prevent
500
- // data leakage (model learning "metadata=0 clean" instead of behavioral signals).
501
- // Cost: near-zero for npm packages because temporal checks (line ~1014) already
502
- // pre-fetch registry metadata into temporal-analysis._metadataCache, and
503
- // getPackageMetadata() reads this cache first (npm-registry.js:87-95).
504
- let npmRegistryMeta = null;
505
- if (ecosystem === 'npm') {
506
- try {
507
- const { getPackageMetadata } = require('../scanner/npm-registry.js');
508
- npmRegistryMeta = await getPackageMetadata(name);
509
- } catch (err) {
510
- console.error(`[ML] npm registry fetch failed for ${name}: ${err.message}`);
511
- }
512
- }
520
+ // npm registry metadata was fetched ONCE before the worker spawn (hoisted above
521
+ // to feed scanContext.npmRegistryMeta) and is reused here for: isFirstPublishHighRisk,
522
+ // ML classifier features, JSONL training records, and reputation scoring.
523
+ // Clean packages MUST carry metadata to prevent training-data leakage (model
524
+ // learning "metadata=0 clean" instead of behavioral signals).
513
525
 
514
526
  // First-publish sandbox priority: sandbox even with 0 static findings
515
527
  // if the package is from a new/unknown maintainer without a linked repository.
@@ -191,6 +191,31 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
191
191
  // 3 metadata-dependent gates (METADATA_FACTOR, MATURE_CAP, DELTA_MODE) in
192
192
  // one shot. Individual gates can still be turned off via their own =0 flag.
193
193
  if (
194
+ packageName &&
195
+ _pkgMeta &&
196
+ options &&
197
+ Object.prototype.hasOwnProperty.call(options, 'npmRegistryMeta')
198
+ ) {
199
+ // The monitor fetched the registry metadata ONCE on the main thread (shared
200
+ // http-limiter, warm temporal cache) and passed it via scanContext.npmRegistryMeta.
201
+ // Consume it here instead of re-fetching: a per-worker getPackageMetadata()
202
+ // runs on the worker's OWN module-level limiter, so N worker_threads = N
203
+ // uncoordinated limiters → ~Nx npm throughput → 429 storms. Strict semantics —
204
+ // the key being present (even null) means "main already handled it"; a null
205
+ // value (main fetch failed) leaves the gates to no-op, identical to a failed
206
+ // fetch (best-effort metadata signals stay silent). CLI / `muaddib replay`
207
+ // never set the key → the else-if fetch path below is unchanged.
208
+ const injected = options.npmRegistryMeta;
209
+ if (injected) {
210
+ // Attach the scanned version (getPackageMetadata never sets it) so
211
+ // applyMatureStableCap can require scan_version === latest_version — a
212
+ // historical compromised version must not inherit live "stable" reputation.
213
+ if (injected.scan_version == null && packageVersion != null) {
214
+ injected.scan_version = packageVersion;
215
+ }
216
+ _pkgMeta.npmRegistryMeta = injected;
217
+ }
218
+ } else if (
194
219
  packageName &&
195
220
  _pkgMeta &&
196
221
  globalThis.process.env.MUADDIB_NO_REGISTRY_FETCH !== '1' &&
@@ -29,6 +29,7 @@
29
29
  const fs = require('fs');
30
30
  const path = require('path');
31
31
  const https = require('https');
32
+ const { acquireRegistrySlot, releaseRegistrySlot, signal429 } = require('../shared/http-limiter.js');
32
33
 
33
34
  const TRUSTED_DEP_AGE_THRESHOLD_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
34
35
 
@@ -80,7 +81,17 @@ function httpsGet(url, timeoutMs = 30_000) {
80
81
  async function checkDepDiff(name, newVersion) {
81
82
  const findings = [];
82
83
  try {
83
- const body = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(name)}`, PACKUMENT_TIMEOUT_MS);
84
+ // Route through the shared http-limiter (concurrency + token bucket + 429
85
+ // backoff) instead of a raw uncoordinated httpsGet — this scanner runs inside
86
+ // the monitor worker_threads, where an unbounded fetch joins the per-worker
87
+ // 429 storm. finally-release keeps the semaphore balanced even on reject.
88
+ await acquireRegistrySlot();
89
+ let body;
90
+ try {
91
+ body = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(name)}`, PACKUMENT_TIMEOUT_MS);
92
+ } finally {
93
+ releaseRegistrySlot();
94
+ }
84
95
  const packument = JSON.parse(body);
85
96
 
86
97
  if (!packument.versions || !packument.time) return findings;
@@ -107,13 +118,20 @@ async function checkDepDiff(name, newVersion) {
107
118
  for (const dep of addedDeps) {
108
119
  let ageMs = null;
109
120
  try {
110
- const depBody = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(dep)}`, DEP_AGE_TIMEOUT_MS);
121
+ await acquireRegistrySlot();
122
+ let depBody;
123
+ try {
124
+ depBody = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(dep)}`, DEP_AGE_TIMEOUT_MS);
125
+ } finally {
126
+ releaseRegistrySlot();
127
+ }
111
128
  const depData = JSON.parse(depBody);
112
129
  const created = depData.time && depData.time.created;
113
130
  if (created) {
114
131
  ageMs = Date.now() - new Date(created).getTime();
115
132
  }
116
133
  } catch (err) {
134
+ if (/HTTP 429/.test(err.message)) { try { signal429(); } catch { /* limiter best-effort */ } }
117
135
  console.log(`[SCANNER] trusted-dep-diff: could not check age of dependency ${dep}: ${err.message}`);
118
136
  }
119
137
 
@@ -152,6 +170,7 @@ async function checkDepDiff(name, newVersion) {
152
170
 
153
171
  return findings;
154
172
  } catch (err) {
173
+ if (/HTTP 429/.test(err.message)) { try { signal429(); } catch { /* limiter best-effort */ } }
155
174
  console.log(`[SCANNER] trusted-dep-diff: check failed for ${name}@${newVersion}: ${err.message}`);
156
175
  return findings;
157
176
  }