muaddib-scanner 2.11.121 → 2.11.124

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.121",
3
+ "version": "2.11.124",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "target": "node_modules",
3
- "timestamp": "2026-06-19T09:42:28.283Z",
3
+ "timestamp": "2026-06-19T13:30:20.182Z",
4
4
  "threats": [
5
5
  {
6
6
  "type": "string_mutation_obfuscation",
@@ -12,7 +12,7 @@ const os = require('os');
12
12
  const { Worker } = require('worker_threads');
13
13
  const { runSandbox, tryAcquireSandboxSlot } = require('../sandbox/index.js');
14
14
  const { sendWebhook } = require('../webhook.js');
15
- const { downloadToFile, extractArchive, sanitizePackageName } = require('../shared/download.js');
15
+ const { downloadToFile, extractArchive, extractArchiveOffThread, sanitizePackageName } = require('../shared/download.js');
16
16
  const { MAX_TARBALL_SIZE, getMaxFileSize } = require('../shared/constants.js');
17
17
  const { acquireRegistrySlot, releaseRegistrySlot, awaitRateToken: awaitRateTokenForWorker, signal429: signal429ForWorker } = require('../shared/http-limiter.js');
18
18
  const { loadCachedIOCs } = require('../ioc/updater.js');
@@ -178,6 +178,25 @@ const STATIC_SCAN_TIMEOUT_MS = 45_000; // 45s for static analysis only
178
178
  const LARGE_PACKAGE_SIZE = 10 * 1024 * 1024; // 10MB
179
179
  const RECENTLY_SCANNED_MAX = 50_000; // FIFO cap for the dedup Set (P0c — bounded resource)
180
180
 
181
// OOM fix (2026-06-19): archives whose COMPRESSED, on-disk size exceeds this
// threshold are extracted off the main thread in a worker, so the synchronous
// extractor (adm-zip extractAllTo / execFileSync tar) can no longer wedge the event
// loop and starve the RSS breaker / memory governor / EMERGENCY purge (all
// main-thread timers) → cgroup OOM. Confirmed culprit: data/loop-stalls.jsonl
// (extract:* up to 148s). Small archives extract inline — a worker spawn costs more
// than their sub-100ms extraction. Env-tunable via MUADDIB_INLINE_EXTRACT_MB.

/**
 * Parse the MUADDIB_INLINE_EXTRACT_MB setting into a whole number of megabytes.
 *
 * An explicit "0" is honoured (threshold 0 = no non-empty archive extracts inline)
 * instead of being mistaken for "unset", which is what a `parseInt(...) || 4`
 * default does. Negative values are clamped to 0; they previously produced a
 * negative threshold, which had the same practical effect.
 *
 * @param {string|undefined} raw - raw environment value
 * @param {number} [fallbackMb=4] - used when raw is missing or not numeric
 * @returns {number} threshold in megabytes, never negative
 */
function parseInlineExtractMb(raw, fallbackMb = 4) {
  const megabytes = Number.parseInt(raw, 10);
  if (Number.isNaN(megabytes)) {
    return fallbackMb;
  }
  return Math.max(0, megabytes);
}

const INLINE_EXTRACT_MAX_BYTES = parseInlineExtractMb(process.env.MUADDIB_INLINE_EXTRACT_MB) * 1024 * 1024;
189
+
190
/**
 * Extract an archive inline when it is small, off the main thread otherwise.
 *
 * compressedSize is the on-disk tarball size (reliable, unlike the registry
 * unpackedSize metadata). Archives no larger than INLINE_EXTRACT_MAX_BYTES go
 * through the synchronous extractArchive; everything else is handed to
 * extractArchiveOffThread.
 *
 * Declared `async` so the "always returns a Promise" contract holds on every path:
 * a synchronous throw from extractArchive surfaces as a rejection rather than as
 * an exception thrown at the call site.
 *
 * @param {string} archivePath - archive to extract
 * @param {string} destDir - extraction destination
 * @param {number} compressedSize - on-disk archive size in bytes
 * @returns {Promise<string>} whatever the selected extractor yields
 */
async function extractGated(archivePath, destDir, compressedSize) {
  // Deliberately phrased as "fits inline" rather than "too big": an unknown size
  // (undefined / NaN) fails the comparison and therefore takes the non-blocking
  // worker path instead of silently falling back to the blocking extractor.
  const fitsInline = compressedSize <= INLINE_EXTRACT_MAX_BYTES;
  if (!fitsInline) {
    return extractArchiveOffThread(archivePath, destDir);
  }
  return extractArchive(archivePath, destDir);
}
199
+
181
200
  // First-publish sandbox: max pending sandbox items before deferring first-publish clean scans
182
201
  // Prevents starving T1a sandbox capacity when many first-publish packages arrive at once
183
202
  const FIRST_PUBLISH_SANDBOX_MAX_QUEUE = parseInt(process.env.MUADDIB_FIRST_PUBLISH_SANDBOX_MAX_QUEUE, 10) || 10;
@@ -766,7 +785,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
766
785
  let bypassQuickScan = false;
767
786
  try {
768
787
  const _crumb = beginOp('extract:quickscan', { name, version, unpackedSizeMb: Math.round(unpackedSize / 1024 / 1024) });
769
- try { extractedDir = extractArchive(tgzPath, tmpDir); } finally { endOp(_crumb); }
788
+ try { extractedDir = await extractGated(tgzPath, tmpDir, fileSize); } finally { endOp(_crumb); }
770
789
 
771
790
  const [pkgThreats, shellThreats] = await Promise.all([
772
791
  scanPackageJson(extractedDir),
@@ -816,7 +835,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
816
835
 
817
836
  if (!extractedDir) {
818
837
  const _crumb = beginOp('extract:prework', { name, version, unpackedSizeMb: Math.round((meta.unpackedSize || 0) / 1024 / 1024) });
819
- try { extractedDir = extractArchive(tgzPath, tmpDir); } finally { endOp(_crumb); }
838
+ try { extractedDir = await extractGated(tgzPath, tmpDir, fileSize); } finally { endOp(_crumb); }
820
839
  }
821
840
 
822
841
  // ML Phase 2a: Count JS files and detect test presence for enriched features
@@ -8,7 +8,7 @@ const SHELL_EXCLUDED_DIRS = ['node_modules', '.git', '.muaddib-cache'];
8
8
  const MALICIOUS_PATTERNS = [
9
9
  { pattern: /curl[^\n]{0,5000}\|[^\n]{0,5000}sh/m, name: 'curl_pipe_shell', severity: 'HIGH' },
10
10
  { pattern: /wget[^\n]{0,5000}&&[^\n]{0,5000}chmod[^\n]{0,5000}\+x/m, name: 'wget_chmod_exec', severity: 'HIGH' },
11
- { pattern: /bash\s+-i\s+>&\s+\/dev\/tcp/m, name: 'reverse_shell', severity: 'CRITICAL' },
11
+ { pattern: /(?:ba)?sh\s+-i\s+>&\s*\/dev\/tcp/m, name: 'reverse_shell', severity: 'CRITICAL' },
12
12
  { pattern: /nc\s+-e\s+\/bin\/(ba)?sh/m, name: 'netcat_shell', severity: 'CRITICAL' },
13
13
  { pattern: /rm\s+-rf\s+(~\/|\$HOME|\/home)/m, name: 'home_deletion', severity: 'CRITICAL' },
14
14
  { pattern: /shred.*\$HOME/m, name: 'shred_home', severity: 'CRITICAL' },
@@ -40,13 +40,26 @@ const MALICIOUS_PATTERNS = [
40
40
 
41
41
  const SHEBANG_RE = /^#!.*\b(?:ba)?sh\b/;
42
42
 
43
- function scanFileContent(file, content, targetPath, threats) {
43
+ // Source files (.js/.ts/...) can embed shell reverse-shell commands inside
44
+ // child_process exec/execSync/spawn string args. shell.js historically scanned only
45
+ // .sh/shebang files, so `execSync("bash -i >& /dev/tcp/...")` in index.js was invisible
46
+ // (missed npx-whoami-demo, 2026-06 — a revshell that scored grs 25 with type_reverse_shell=0).
47
+ // Apply ONLY the unambiguous reverse-shell command patterns to source files — NOT the
48
+ // context-dependent ones (curl|sh, systemctl, rm -rf, base64|sh) which would false-positive
49
+ // on JS string literals / build tooling. FPR-gate (2026-06-19): these 4 matched 0 port-check
50
+ // idioms (</dev/tcp, echo >/dev/tcp, nc -z) and 0 node_modules .js files.
51
+ const SOURCE_SCAN_PATTERN_NAMES = new Set([
52
+ 'reverse_shell', 'netcat_shell', 'fifo_reverse_shell', 'fifo_nc_reverse_shell'
53
+ ]);
54
+ const SOURCE_SCAN_EXTENSIONS = ['.js', '.cjs', '.mjs', '.ts', '.jsx', '.tsx'];
55
+
56
+ function scanFileContent(file, content, targetPath, threats, patterns = MALICIOUS_PATTERNS) {
44
57
  // Strip comment lines to avoid false positives on documentation
45
58
  const activeContent = content.split(/\r?\n/)
46
59
  .filter(line => !line.trimStart().startsWith('#'))
47
60
  .join('\n');
48
61
 
49
- for (const { pattern, name, severity } of MALICIOUS_PATTERNS) {
62
+ for (const { pattern, name, severity } of patterns) {
50
63
  if (pattern.test(activeContent)) {
51
64
  threats.push({
52
65
  type: name,
@@ -106,6 +119,14 @@ async function scanShellScripts(targetPath) {
106
119
  } catch (e) { debugLog('[SHELL] readFile error:', e?.message); }
107
120
  }
108
121
 
122
+ // Pass 3: source files (.js/.ts/...) — only the unambiguous reverse-shell command
123
+ // patterns (revshell commands embedded in child_process exec/spawn string args).
124
+ const sourcePatterns = MALICIOUS_PATTERNS.filter(p => SOURCE_SCAN_PATTERN_NAMES.has(p.name));
125
+ const sourceFiles = findFiles(targetPath, { extensions: SOURCE_SCAN_EXTENSIONS, excludedDirs: SHELL_EXCLUDED_DIRS });
126
+ forEachSafeFile(sourceFiles, (file, content) => {
127
+ scanFileContent(file, content, targetPath, threats, sourcePatterns);
128
+ });
129
+
109
130
  return threats;
110
131
  }
111
132
 
@@ -340,6 +340,52 @@ function extractArchive(archivePath, destDir, options = {}) {
340
340
  throw new Error(`Unsupported archive format for ${path.basename(archivePath)}`);
341
341
  }
342
342
 
343
/**
 * Normalise a timeout candidate into a delay that setTimeout will honour.
 *
 * Node coerces delays below 1 or above 2147483647 to 1 ms, which would make every
 * extraction "time out" immediately. Non-finite or non-positive candidates fall
 * back to fallbackMs; oversized ones are clamped to the largest usable delay.
 *
 * @param {*} value - candidate timeout in milliseconds
 * @param {number} fallbackMs - used when value is not a positive finite number
 * @returns {number} usable timeout in milliseconds
 */
function clampExtractTimeoutMs(value, fallbackMs) {
  const maxTimerDelayMs = 2_147_483_647;
  if (!Number.isFinite(value) || value <= 0) {
    return fallbackMs;
  }
  return Math.min(value, maxTimerDelayMs);
}

// Hard cap for off-thread extraction (OOM fix). Large legit packages can take
// 10-30s; 120s leaves headroom while still bounding a pathological extraction
// (the worker is terminated past this so a runaway cannot pin RSS forever).
const EXTRACT_OFFTHREAD_TIMEOUT_MS = clampExtractTimeoutMs(
  Number.parseInt(process.env.MUADDIB_EXTRACT_OFFTHREAD_TIMEOUT_MS, 10),
  120_000
);

/**
 * Off-main-thread variant of extractArchive: runs the SAME synchronous extractor
 * in a worker thread (src/shared/extract-worker.js) so the caller's event loop
 * stays responsive during extraction. See extract-worker.js header for why this
 * is the OOM fix. Same return contract as extractArchive (resolves to the
 * extracted package root); rejects on extraction error, worker crash, or timeout.
 * Callers gate on archive size — small archives extract inline (cheaper than a
 * worker spawn), large ones offload here.
 *
 * The promise settles exactly once: whichever of message / error / exit / timeout
 * arrives first wins and later events are ignored.
 *
 * @param {string} archivePath
 * @param {string} destDir - must already exist
 * @param {Object} [options]
 * @param {'targz'|'zip'} [options.format] - override auto-detection
 * @param {number} [options.timeoutMs] - hard cap; worker terminated past it.
 *   Values that are not positive finite numbers fall back to the module default.
 * @returns {Promise<string>} extracted package root
 */
function extractArchiveOffThread(archivePath, destDir, options = {}) {
  const { Worker } = require('worker_threads');
  const timeoutMs = clampExtractTimeoutMs(options.timeoutMs, EXTRACT_OFFTHREAD_TIMEOUT_MS);
  return new Promise((resolve, reject) => {
    let settled = false;
    const worker = new Worker(path.join(__dirname, 'extract-worker.js'), {
      workerData: { archivePath, destDir, format: options.format || null }
    });
    // terminate() returns a Promise. The outcome has already been decided by the
    // time it is called, so a (rare) terminate failure is deliberately ignored
    // rather than left as a floating, potentially unhandled rejection.
    const stopWorker = () => worker.terminate().catch(() => {});
    // Single-settle guard shared by all four outcome paths.
    const finish = (fn) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      fn();
    };
    const timer = setTimeout(() => finish(() => {
      // Reject only after the terminate request has completed.
      stopWorker().then(() =>
        reject(new Error(`extractArchiveOffThread timeout after ${timeoutMs}ms: ${path.basename(archivePath)}`)));
    }), timeoutMs);
    // Do not let the watchdog timer alone keep the process alive.
    if (timer && typeof timer.unref === 'function') timer.unref();
    worker.once('message', (msg) => finish(() => {
      stopWorker();
      if (msg && msg.ok) resolve(msg.dir);
      else reject(new Error((msg && msg.error) || 'extractArchiveOffThread: worker reported failure'));
    }));
    worker.once('error', (err) => finish(() => { stopWorker(); reject(err); }));
    // Reached only when the worker exits before posting a result.
    worker.once('exit', (code) => finish(() =>
      reject(new Error(`extractArchiveOffThread: worker exited (${code}) without a result`))));
  });
}
388
+
343
389
  /**
344
390
  * Backwards-compatible wrapper for the original tar.gz-only extractor.
345
391
  * Kept because src/scanner/temporal-ast-diff.js and existing tests still
@@ -370,6 +416,7 @@ module.exports = {
370
416
  downloadToFile,
371
417
  extractTarGz,
372
418
  extractArchive,
419
+ extractArchiveOffThread,
373
420
  detectArchiveFormat,
374
421
  sanitizePackageName,
375
422
  isAllowedDownloadRedirect,
@@ -0,0 +1,30 @@
1
'use strict';

/**
 * Worker-thread entry point for archive extraction (OOM fix 2026-06-19).
 *
 * Why a worker: extractArchive() is SYNCHRONOUS — adm-zip `extractAllTo` for
 * .zip/.whl, `execFileSync('tar', …)` for .tgz. Run on the main thread it blocks
 * the event loop for the whole extraction (measured: up to 148s on large
 * packages, data/loop-stalls.jsonl). While the loop is blocked, the RSS circuit
 * breaker, the memory governor's RSS feed and the EMERGENCY queue purge — all
 * main-thread `setInterval` timers (daemon.js) — cannot fire, so RSS grows to the
 * cgroup MemoryMax unchecked → kernel SIGKILL. Running the same extractor here
 * blocks only this worker's loop; the parent's loop keeps servicing those timers.
 *
 * Contract: workerData = { archivePath, destDir, format }. Exactly one message
 * is posted to the parent — { ok: true, dir } on success, { ok: false, error }
 * on failure. Extraction errors are reported through that message rather than
 * thrown (the parent also listens for a worker 'error', but one explicit message
 * keeps the failure path uniform). All extraction hardening (zip-slip, zip-bomb
 * uncompressed-size cap) lives in extractArchive and therefore applies here too.
 */
const { parentPort, workerData } = require('worker_threads');
const { extractArchive } = require('./download.js');

/** Render a thrown value as the string carried in the failure message. */
const describeError = (failure) => (failure && failure.message ? failure.message : String(failure));

try {
  // Forward `format` only when the parent supplied one, so auto-detection in
  // extractArchive stays in effect otherwise.
  const extractOptions = {};
  if (workerData && workerData.format) {
    extractOptions.format = workerData.format;
  }
  const extractedRoot = extractArchive(workerData.archivePath, workerData.destDir, extractOptions);
  parentPort.postMessage({ ok: true, dir: extractedRoot });
} catch (failure) {
  parentPort.postMessage({ ok: false, error: describeError(failure) });
}