muaddib-scanner 2.11.121 → 2.11.124
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
package/src/monitor/queue.js
CHANGED
|
@@ -12,7 +12,7 @@ const os = require('os');
|
|
|
12
12
|
const { Worker } = require('worker_threads');
|
|
13
13
|
const { runSandbox, tryAcquireSandboxSlot } = require('../sandbox/index.js');
|
|
14
14
|
const { sendWebhook } = require('../webhook.js');
|
|
15
|
-
const { downloadToFile, extractArchive, sanitizePackageName } = require('../shared/download.js');
|
|
15
|
+
const { downloadToFile, extractArchive, extractArchiveOffThread, sanitizePackageName } = require('../shared/download.js');
|
|
16
16
|
const { MAX_TARBALL_SIZE, getMaxFileSize } = require('../shared/constants.js');
|
|
17
17
|
const { acquireRegistrySlot, releaseRegistrySlot, awaitRateToken: awaitRateTokenForWorker, signal429: signal429ForWorker } = require('../shared/http-limiter.js');
|
|
18
18
|
const { loadCachedIOCs } = require('../ioc/updater.js');
|
|
@@ -178,6 +178,25 @@ const STATIC_SCAN_TIMEOUT_MS = 45_000; // 45s for static analysis only
|
|
|
178
178
|
const LARGE_PACKAGE_SIZE = 10 * 1024 * 1024; // 10MB
|
|
179
179
|
const RECENTLY_SCANNED_MAX = 50_000; // FIFO cap for the dedup Set (P0c — bounded resource)
|
|
180
180
|
|
|
181
|
+
// OOM fix (2026-06-19): archives larger than this (COMPRESSED, on-disk size) are
|
|
182
|
+
// extracted off the main thread in a worker, so the synchronous extractor
|
|
183
|
+
// (adm-zip extractAllTo / execFileSync tar) can no longer wedge the event loop and
|
|
184
|
+
// starve the RSS breaker / memory governor / EMERGENCY purge (all main-thread
|
|
185
|
+
// timers) → cgroup OOM. Confirmed culprit: data/loop-stalls.jsonl (extract:* up to
|
|
186
|
+
// 148s). Small archives extract inline — a worker spawn costs more than their
|
|
187
|
+
// sub-100ms extraction. Env-tunable via MUADDIB_INLINE_EXTRACT_MB.
|
|
188
|
+
const INLINE_EXTRACT_MAX_BYTES = resolveInlineExtractMaxBytes(process.env.MUADDIB_INLINE_EXTRACT_MB);

/**
 * Convert the raw MUADDIB_INLINE_EXTRACT_MB setting into a byte threshold.
 *
 * Unlike `parseInt(raw, 10) || 4`, an explicit `0` is honoured (every non-empty
 * archive is then extracted off-thread) and fractional megabytes such as `0.5`
 * are kept instead of being truncated to 0 and replaced by the default.
 * Negative values are clamped to 0; unset or unparsable values fall back to 4 MB.
 *
 * @param {string|undefined} rawMb - raw environment value, in megabytes
 * @returns {number} threshold in bytes (non-negative integer)
 */
function resolveInlineExtractMaxBytes(rawMb) {
  const DEFAULT_MB = 4;
  const parsedMb = Number.parseFloat(rawMb);
  const megabytes = Number.isFinite(parsedMb) ? Math.max(0, parsedMb) : DEFAULT_MB;
  return Math.round(megabytes * 1024 * 1024);
}
|
|
189
|
+
|
|
190
|
+
/**
 * Extract an archive, choosing the extraction strategy from its compressed size.
 *
 * Archives larger than INLINE_EXTRACT_MAX_BYTES are handed to
 * extractArchiveOffThread (worker thread); everything else goes through the
 * synchronous extractArchive on the calling thread.
 *
 * Declared `async` so the function ALWAYS returns a Promise: a synchronous
 * throw from extractArchive becomes a rejection instead of escaping before a
 * Promise exists, which keeps `await` and `.catch()` call sites uniform.
 *
 * @param {string} archivePath - path of the downloaded archive
 * @param {string} destDir - directory to extract into
 * @param {number} compressedSize - on-disk archive size in bytes (preferred over
 *   the registry's unpackedSize metadata, which is not reliable)
 * @returns {Promise<string>} whatever the selected extractor yields (the
 *   extracted package root)
 */
async function extractGated(archivePath, destDir, compressedSize) {
  if (compressedSize > INLINE_EXTRACT_MAX_BYTES) {
    return extractArchiveOffThread(archivePath, destDir);
  }
  return extractArchive(archivePath, destDir);
}
|
|
199
|
+
|
|
181
200
|
// First-publish sandbox: max pending sandbox items before deferring first-publish clean scans
|
|
182
201
|
// Prevents starving T1a sandbox capacity when many first-publish packages arrive at once
|
|
183
202
|
const FIRST_PUBLISH_SANDBOX_MAX_QUEUE = parseInt(process.env.MUADDIB_FIRST_PUBLISH_SANDBOX_MAX_QUEUE, 10) || 10;
|
|
@@ -766,7 +785,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
766
785
|
let bypassQuickScan = false;
|
|
767
786
|
try {
|
|
768
787
|
const _crumb = beginOp('extract:quickscan', { name, version, unpackedSizeMb: Math.round(unpackedSize / 1024 / 1024) });
|
|
769
|
-
try { extractedDir =
|
|
788
|
+
try { extractedDir = await extractGated(tgzPath, tmpDir, fileSize); } finally { endOp(_crumb); }
|
|
770
789
|
|
|
771
790
|
const [pkgThreats, shellThreats] = await Promise.all([
|
|
772
791
|
scanPackageJson(extractedDir),
|
|
@@ -816,7 +835,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
816
835
|
|
|
817
836
|
if (!extractedDir) {
|
|
818
837
|
const _crumb = beginOp('extract:prework', { name, version, unpackedSizeMb: Math.round((meta.unpackedSize || 0) / 1024 / 1024) });
|
|
819
|
-
try { extractedDir =
|
|
838
|
+
try { extractedDir = await extractGated(tgzPath, tmpDir, fileSize); } finally { endOp(_crumb); }
|
|
820
839
|
}
|
|
821
840
|
|
|
822
841
|
// ML Phase 2a: Count JS files and detect test presence for enriched features
|
package/src/scanner/shell.js
CHANGED
|
@@ -8,7 +8,7 @@ const SHELL_EXCLUDED_DIRS = ['node_modules', '.git', '.muaddib-cache'];
|
|
|
8
8
|
const MALICIOUS_PATTERNS = [
|
|
9
9
|
{ pattern: /curl[^\n]{0,5000}\|[^\n]{0,5000}sh/m, name: 'curl_pipe_shell', severity: 'HIGH' },
|
|
10
10
|
{ pattern: /wget[^\n]{0,5000}&&[^\n]{0,5000}chmod[^\n]{0,5000}\+x/m, name: 'wget_chmod_exec', severity: 'HIGH' },
|
|
11
|
-
{ pattern: /
|
|
11
|
+
{ pattern: /(?:ba)?sh\s+-i\s+>&\s*\/dev\/tcp/m, name: 'reverse_shell', severity: 'CRITICAL' },
|
|
12
12
|
{ pattern: /nc\s+-e\s+\/bin\/(ba)?sh/m, name: 'netcat_shell', severity: 'CRITICAL' },
|
|
13
13
|
{ pattern: /rm\s+-rf\s+(~\/|\$HOME|\/home)/m, name: 'home_deletion', severity: 'CRITICAL' },
|
|
14
14
|
{ pattern: /shred.*\$HOME/m, name: 'shred_home', severity: 'CRITICAL' },
|
|
@@ -40,13 +40,26 @@ const MALICIOUS_PATTERNS = [
|
|
|
40
40
|
|
|
41
41
|
const SHEBANG_RE = /^#!.*\b(?:ba)?sh\b/;
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
// Source files (.js/.ts/...) can embed shell reverse-shell commands inside
|
|
44
|
+
// child_process exec/execSync/spawn string args. shell.js historically scanned only
|
|
45
|
+
// .sh/shebang files, so `execSync("bash -i >& /dev/tcp/...")` in index.js was invisible
|
|
46
|
+
// (missed npx-whoami-demo, 2026-06 — a revshell that scored grs 25 with type_reverse_shell=0).
|
|
47
|
+
// Apply ONLY the unambiguous reverse-shell command patterns to source files — NOT the
|
|
48
|
+
// context-dependent ones (curl|sh, systemctl, rm -rf, base64|sh) which would false-positive
|
|
49
|
+
// on JS string literals / build tooling. FPR-gate (2026-06-19): these 4 matched 0 port-check
|
|
50
|
+
// idioms (</dev/tcp, echo >/dev/tcp, nc -z) and 0 node_modules .js files.
|
|
51
|
+
const SOURCE_SCAN_PATTERN_NAMES = new Set([
|
|
52
|
+
'reverse_shell', 'netcat_shell', 'fifo_reverse_shell', 'fifo_nc_reverse_shell'
|
|
53
|
+
]);
|
|
54
|
+
const SOURCE_SCAN_EXTENSIONS = ['.js', '.cjs', '.mjs', '.ts', '.jsx', '.tsx'];
|
|
55
|
+
|
|
56
|
+
function scanFileContent(file, content, targetPath, threats, patterns = MALICIOUS_PATTERNS) {
|
|
44
57
|
// Strip comment lines to avoid false positives on documentation
|
|
45
58
|
const activeContent = content.split(/\r?\n/)
|
|
46
59
|
.filter(line => !line.trimStart().startsWith('#'))
|
|
47
60
|
.join('\n');
|
|
48
61
|
|
|
49
|
-
for (const { pattern, name, severity } of
|
|
62
|
+
for (const { pattern, name, severity } of patterns) {
|
|
50
63
|
if (pattern.test(activeContent)) {
|
|
51
64
|
threats.push({
|
|
52
65
|
type: name,
|
|
@@ -106,6 +119,14 @@ async function scanShellScripts(targetPath) {
|
|
|
106
119
|
} catch (e) { debugLog('[SHELL] readFile error:', e?.message); }
|
|
107
120
|
}
|
|
108
121
|
|
|
122
|
+
// Pass 3: source files (.js/.ts/...) — only the unambiguous reverse-shell command
|
|
123
|
+
// patterns (revshell commands embedded in child_process exec/spawn string args).
|
|
124
|
+
const sourcePatterns = MALICIOUS_PATTERNS.filter(p => SOURCE_SCAN_PATTERN_NAMES.has(p.name));
|
|
125
|
+
const sourceFiles = findFiles(targetPath, { extensions: SOURCE_SCAN_EXTENSIONS, excludedDirs: SHELL_EXCLUDED_DIRS });
|
|
126
|
+
forEachSafeFile(sourceFiles, (file, content) => {
|
|
127
|
+
scanFileContent(file, content, targetPath, threats, sourcePatterns);
|
|
128
|
+
});
|
|
129
|
+
|
|
109
130
|
return threats;
|
|
110
131
|
}
|
|
111
132
|
|
package/src/shared/download.js
CHANGED
|
@@ -340,6 +340,52 @@ function extractArchive(archivePath, destDir, options = {}) {
|
|
|
340
340
|
throw new Error(`Unsupported archive format for ${path.basename(archivePath)}`);
|
|
341
341
|
}
|
|
342
342
|
|
|
343
|
+
// Hard cap for off-thread extraction (OOM fix). Large legit packages can take
|
|
344
|
+
// 10-30s; 120s leaves headroom while still bounding a pathological extraction
|
|
345
|
+
// (the worker is terminated past this so a runaway cannot pin RSS forever).
|
|
346
|
+
const EXTRACT_OFFTHREAD_TIMEOUT_MS = resolveExtractTimeoutMs(process.env.MUADDIB_EXTRACT_OFFTHREAD_TIMEOUT_MS);

/**
 * Convert the raw MUADDIB_EXTRACT_OFFTHREAD_TIMEOUT_MS setting into a usable
 * setTimeout delay.
 *
 * Node coerces delays below 1 or above 2^31-1 to 1 ms, so a negative or
 * oversized setting would make every off-thread extraction time out almost
 * immediately. Non-positive or unparsable values fall back to 120s; oversized
 * values are clamped to the largest delay setTimeout honours.
 *
 * @param {string|undefined} rawMs - raw environment value, in milliseconds
 * @returns {number} timeout in milliseconds, within [1, 2^31-1]
 */
function resolveExtractTimeoutMs(rawMs) {
  const DEFAULT_MS = 120_000;
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const parsedMs = Number.parseInt(rawMs, 10);
  if (!(parsedMs > 0)) return DEFAULT_MS; // also covers NaN
  return Math.min(parsedMs, MAX_TIMER_DELAY_MS);
}
|
|
347
|
+
|
|
348
|
+
/**
 * Off-main-thread variant of extractArchive.
 *
 * Spawns a worker thread running src/shared/extract-worker.js, which invokes
 * the same synchronous extractor, so the caller's event loop stays responsive
 * while the archive is unpacked. Callers are expected to gate on archive size:
 * small archives are cheaper to extract inline than to spawn a worker for.
 *
 * The returned Promise settles exactly once, on whichever happens first:
 *  - the worker posts `{ ok: true, dir }`      → resolves with `dir`
 *  - the worker posts `{ ok: false, error }`   → rejects with that message
 *  - the worker emits 'error'                  → rejects with that error
 *  - the worker exits without posting a result → rejects
 *  - `timeoutMs` elapses                       → worker is terminated, then rejects
 *
 * @param {string} archivePath
 * @param {string} destDir - must already exist
 * @param {Object} [options]
 * @param {'targz'|'zip'} [options.format] - override auto-detection
 * @param {number} [options.timeoutMs] - hard cap; worker terminated past it
 *   (defaults to EXTRACT_OFFTHREAD_TIMEOUT_MS)
 * @returns {Promise<string>} extracted package root
 */
function extractArchiveOffThread(archivePath, destDir, options = {}) {
  const { Worker } = require('worker_threads');
  const timeoutMs = Number.isFinite(options.timeoutMs) ? options.timeoutMs : EXTRACT_OFFTHREAD_TIMEOUT_MS;
  return new Promise((resolve, reject) => {
    let settled = false;
    const worker = new Worker(path.join(__dirname, 'extract-worker.js'), {
      workerData: { archivePath, destDir, format: options.format || null }
    });

    // worker.terminate() returns a Promise. Swallow a failed termination so it
    // can never surface as an unhandled rejection; the outcome reported to the
    // caller is decided by the branch that triggered the termination.
    const stopWorker = () => worker.terminate().catch(() => {});

    // First event wins; later events ('exit' after 'message', etc.) are ignored.
    const finish = (fn) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      fn();
    };

    const timer = setTimeout(() => finish(() => {
      // Reject only once termination has been attempted, so the caller does not
      // observe the timeout while the worker is still running.
      stopWorker().then(() =>
        reject(new Error(`extractArchiveOffThread timeout after ${timeoutMs}ms: ${path.basename(archivePath)}`)));
    }), timeoutMs);
    // The timer alone must not keep the process alive.
    if (timer && typeof timer.unref === 'function') timer.unref();

    worker.once('message', (msg) => finish(() => {
      stopWorker();
      if (msg && msg.ok) resolve(msg.dir);
      else reject(new Error((msg && msg.error) || 'extractArchiveOffThread: worker reported failure'));
    }));
    worker.once('error', (err) => finish(() => {
      stopWorker();
      reject(err);
    }));
    worker.once('exit', (code) => finish(() =>
      reject(new Error(`extractArchiveOffThread: worker exited (${code}) without a result`))));
  });
}
|
|
388
|
+
|
|
343
389
|
/**
|
|
344
390
|
* Backwards-compatible wrapper for the original tar.gz-only extractor.
|
|
345
391
|
* Kept because src/scanner/temporal-ast-diff.js and existing tests still
|
|
@@ -370,6 +416,7 @@ module.exports = {
|
|
|
370
416
|
downloadToFile,
|
|
371
417
|
extractTarGz,
|
|
372
418
|
extractArchive,
|
|
419
|
+
extractArchiveOffThread,
|
|
373
420
|
detectArchiveFormat,
|
|
374
421
|
sanitizePackageName,
|
|
375
422
|
isAllowedDownloadRedirect,
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
'use strict';

/**
 * Worker-thread entry point for archive extraction (OOM fix 2026-06-19).
 *
 * extractArchive() is synchronous. Per the notes shipped with this change, on
 * the main thread it held the event loop for the whole extraction (measured up
 * to 148s, data/loop-stalls.jsonl), which kept the main-thread memory defenses
 * (RSS circuit breaker, memory governor feed, EMERGENCY queue purge) from
 * firing until the process was killed at its memory limit. Running the same
 * extractor in this worker blocks only the worker's loop.
 *
 * Contract:
 *   input  - workerData = { archivePath, destDir, format }
 *   output - exactly one message to the parent:
 *              { ok: true, dir }      on success
 *              { ok: false, error }   on failure (error is a message string)
 *
 * Extraction failures are reported through that message rather than thrown.
 * Any hardening implemented by extractArchive applies here unchanged, because
 * this file only delegates to it.
 */
const { parentPort, workerData } = require('worker_threads');
const { extractArchive } = require('./download.js');

try {
  // Forward a format override only when the parent supplied one.
  const extractOptions = {};
  if (workerData && workerData.format) {
    extractOptions.format = workerData.format;
  }
  const extractedRoot = extractArchive(workerData.archivePath, workerData.destDir, extractOptions);
  parentPort.postMessage({ ok: true, dir: extractedRoot });
} catch (failure) {
  const hasMessage = Boolean(failure && failure.message);
  const error = hasMessage ? failure.message : String(failure);
  parentPort.postMessage({ ok: false, error });
}
|