muaddib-scanner 2.11.58 → 2.11.59
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
package/src/monitor/queue.js
CHANGED
|
@@ -445,6 +445,24 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
445
445
|
// ML Phase 2a: Count JS files and detect test presence for enriched features
|
|
446
446
|
const { fileCountTotal, hasTests } = countPackageFiles(extractedDir);
|
|
447
447
|
|
|
448
|
+
// Hoisted before the worker spawn (per-worker 429-storm fix): fetch the npm
|
|
449
|
+
// registry metadata ONCE on the main thread. The shared http-limiter coordinates
|
|
450
|
+
// it and the temporal cache is warm (npm-registry.js reads it first), so only
|
|
451
|
+
// weekly_downloads + author hit the network. Passed to the worker via scanContext
|
|
452
|
+
// so the worker's processor consumes it instead of re-fetching on its OWN module-
|
|
453
|
+
// level limiter — N worker_threads = N uncoordinated limiters → ~Nx npm throughput
|
|
454
|
+
// → 429 bursts. Also reused below (ML / first-publish / training records /
|
|
455
|
+
// reputation) — previously this was a SECOND main-side fetch after the worker.
|
|
456
|
+
let npmRegistryMeta = null;
|
|
457
|
+
if (ecosystem === 'npm') {
|
|
458
|
+
try {
|
|
459
|
+
const { getPackageMetadata } = require('../scanner/npm-registry.js');
|
|
460
|
+
npmRegistryMeta = await getPackageMetadata(name);
|
|
461
|
+
} catch (err) {
|
|
462
|
+
console.error(`[ML] npm registry fetch failed for ${name}: ${err.message}`);
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
|
|
448
466
|
let result;
|
|
449
467
|
try {
|
|
450
468
|
// scanContext: feeds monitor-side info (name/version/ecosystem) and the
|
|
@@ -463,6 +481,11 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
463
481
|
// the full 20-scanner pipeline (unchanged behaviour).
|
|
464
482
|
scanMode: (meta && meta.scanMode) || 'full'
|
|
465
483
|
};
|
|
484
|
+
// Hand the main-thread-fetched metadata to the worker so its processor skips
|
|
485
|
+
// the per-worker getPackageMetadata fetch (429-storm fix). npm only; the key
|
|
486
|
+
// is set even when null ("main already tried, don't refetch"). pypi leaves it
|
|
487
|
+
// absent so the worker takes the unchanged CLI/else-if path.
|
|
488
|
+
if (ecosystem === 'npm') scanContext.npmRegistryMeta = npmRegistryMeta;
|
|
466
489
|
result = await runScanInWorker(extractedDir, STATIC_SCAN_TIMEOUT_MS, scanContext, signal);
|
|
467
490
|
} catch (staticErr) {
|
|
468
491
|
if (/static scan timeout/i.test(staticErr.message)) {
|
|
@@ -494,22 +517,11 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
494
517
|
// First-publish detection: used for sandbox priority below
|
|
495
518
|
const isFirstPublish = cacheTrigger && cacheTrigger.reason === 'first_publish';
|
|
496
519
|
|
|
497
|
-
//
|
|
498
|
-
//
|
|
499
|
-
//
|
|
500
|
-
//
|
|
501
|
-
//
|
|
502
|
-
// pre-fetch registry metadata into temporal-analysis._metadataCache, and
|
|
503
|
-
// getPackageMetadata() reads this cache first (npm-registry.js:87-95).
|
|
504
|
-
let npmRegistryMeta = null;
|
|
505
|
-
if (ecosystem === 'npm') {
|
|
506
|
-
try {
|
|
507
|
-
const { getPackageMetadata } = require('../scanner/npm-registry.js');
|
|
508
|
-
npmRegistryMeta = await getPackageMetadata(name);
|
|
509
|
-
} catch (err) {
|
|
510
|
-
console.error(`[ML] npm registry fetch failed for ${name}: ${err.message}`);
|
|
511
|
-
}
|
|
512
|
-
}
|
|
520
|
+
// npm registry metadata was fetched ONCE before the worker spawn (hoisted above
|
|
521
|
+
// to feed scanContext.npmRegistryMeta) and is reused here for:
|
|
522
|
+
// isFirstPublishHighRisk, ML classifier features, JSONL training records, and reputation scoring.
|
|
523
|
+
// Clean packages MUST carry metadata to prevent training-data leakage (model
|
|
524
|
+
// learning "metadata=0 → clean" instead of behavioral signals).
|
|
513
525
|
|
|
514
526
|
// First-publish sandbox priority: sandbox even with 0 static findings
|
|
515
527
|
// if the package is from a new/unknown maintainer without a linked repository.
|
|
@@ -191,6 +191,31 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
|
|
|
191
191
|
// 3 metadata-dependent gates (METADATA_FACTOR, MATURE_CAP, DELTA_MODE) in
|
|
192
192
|
// one shot. Individual gates can still be turned off via their own =0 flag.
|
|
193
193
|
if (
|
|
194
|
+
packageName &&
|
|
195
|
+
_pkgMeta &&
|
|
196
|
+
options &&
|
|
197
|
+
Object.prototype.hasOwnProperty.call(options, 'npmRegistryMeta')
|
|
198
|
+
) {
|
|
199
|
+
// The monitor fetched the registry metadata ONCE on the main thread (shared
|
|
200
|
+
// http-limiter, warm temporal cache) and passed it via
|
|
201
|
+
// scanContext.npmRegistryMeta. Consume it here instead of re-fetching: a per-worker getPackageMetadata()
|
|
202
|
+
// runs on the worker's OWN module-level limiter, so N worker_threads = N
|
|
203
|
+
// uncoordinated limiters → ~Nx npm throughput → 429 storms. Strict semantics —
|
|
204
|
+
// the key being present (even null) means "main already handled it"; a null
|
|
205
|
+
// value (main fetch failed) leaves the gates to no-op, identical to a failed
|
|
206
|
+
// fetch (best-effort metadata signals stay silent). CLI / `muaddib replay`
|
|
207
|
+
// never set the key → the else-if fetch path below is unchanged.
|
|
208
|
+
const injected = options.npmRegistryMeta;
|
|
209
|
+
if (injected) {
|
|
210
|
+
// Attach the scanned version (getPackageMetadata never sets it) so
|
|
211
|
+
// applyMatureStableCap can require scan_version === latest_version — a
|
|
212
|
+
// historical compromised version must not inherit live "stable" reputation.
|
|
213
|
+
if (injected.scan_version == null && packageVersion != null) {
|
|
214
|
+
injected.scan_version = packageVersion;
|
|
215
|
+
}
|
|
216
|
+
_pkgMeta.npmRegistryMeta = injected;
|
|
217
|
+
}
|
|
218
|
+
} else if (
|
|
194
219
|
packageName &&
|
|
195
220
|
_pkgMeta &&
|
|
196
221
|
globalThis.process.env.MUADDIB_NO_REGISTRY_FETCH !== '1' &&
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
const fs = require('fs');
|
|
30
30
|
const path = require('path');
|
|
31
31
|
const https = require('https');
|
|
32
|
+
const { acquireRegistrySlot, releaseRegistrySlot, signal429 } = require('../shared/http-limiter.js');
|
|
32
33
|
|
|
33
34
|
const TRUSTED_DEP_AGE_THRESHOLD_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
|
|
34
35
|
|
|
@@ -80,7 +81,17 @@ function httpsGet(url, timeoutMs = 30_000) {
|
|
|
80
81
|
async function checkDepDiff(name, newVersion) {
|
|
81
82
|
const findings = [];
|
|
82
83
|
try {
|
|
83
|
-
|
|
84
|
+
// Route through the shared http-limiter (concurrency + token bucket + 429
|
|
85
|
+
// backoff) instead of a raw uncoordinated httpsGet — this scanner runs inside
|
|
86
|
+
// the monitor worker_threads, where an unbounded fetch joins the per-worker
|
|
87
|
+
// 429 storm. finally-release keeps the semaphore balanced even on reject.
|
|
88
|
+
await acquireRegistrySlot();
|
|
89
|
+
let body;
|
|
90
|
+
try {
|
|
91
|
+
body = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(name)}`, PACKUMENT_TIMEOUT_MS);
|
|
92
|
+
} finally {
|
|
93
|
+
releaseRegistrySlot();
|
|
94
|
+
}
|
|
84
95
|
const packument = JSON.parse(body);
|
|
85
96
|
|
|
86
97
|
if (!packument.versions || !packument.time) return findings;
|
|
@@ -107,13 +118,20 @@ async function checkDepDiff(name, newVersion) {
|
|
|
107
118
|
for (const dep of addedDeps) {
|
|
108
119
|
let ageMs = null;
|
|
109
120
|
try {
|
|
110
|
-
|
|
121
|
+
await acquireRegistrySlot();
|
|
122
|
+
let depBody;
|
|
123
|
+
try {
|
|
124
|
+
depBody = await httpsGet(`https://registry.npmjs.org/${encodeURIComponent(dep)}`, DEP_AGE_TIMEOUT_MS);
|
|
125
|
+
} finally {
|
|
126
|
+
releaseRegistrySlot();
|
|
127
|
+
}
|
|
111
128
|
const depData = JSON.parse(depBody);
|
|
112
129
|
const created = depData.time && depData.time.created;
|
|
113
130
|
if (created) {
|
|
114
131
|
ageMs = Date.now() - new Date(created).getTime();
|
|
115
132
|
}
|
|
116
133
|
} catch (err) {
|
|
134
|
+
if (/HTTP 429/.test(err.message)) { try { signal429(); } catch { /* limiter best-effort */ } }
|
|
117
135
|
console.log(`[SCANNER] trusted-dep-diff: could not check age of dependency ${dep}: ${err.message}`);
|
|
118
136
|
}
|
|
119
137
|
|
|
@@ -152,6 +170,7 @@ async function checkDepDiff(name, newVersion) {
|
|
|
152
170
|
|
|
153
171
|
return findings;
|
|
154
172
|
} catch (err) {
|
|
173
|
+
if (/HTTP 429/.test(err.message)) { try { signal429(); } catch { /* limiter best-effort */ } }
|
|
155
174
|
console.log(`[SCANNER] trusted-dep-diff: check failed for ${name}@${newVersion}: ${err.message}`);
|
|
156
175
|
return findings;
|
|
157
176
|
}
|