muaddib-scanner 2.11.96 → 2.11.98
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
package/src/monitor/ingestion.js
CHANGED
|
@@ -1017,6 +1017,12 @@ async function pollNpm(state, scanQueue, stats) {
|
|
|
1017
1017
|
|
|
1018
1018
|
const PYPI_USER_AGENT = `${SELF_PACKAGE_NAME} (security-monitor; +https://github.com/DNSZLSK/muaddib)`;
|
|
1019
1019
|
|
|
1020
|
+
// A normal 15-min poll is a few dozen events; a changelog_since_serial batch
|
|
1021
|
+
// caps around ~50K. Anything this large means we are far behind — worth one
|
|
1022
|
+
// extra changelog_last_serial call to measure the GLOBAL lag (see the global
|
|
1023
|
+
// catch-up protection in pollPyPIChangelog).
|
|
1024
|
+
const PYPI_CATCHUP_PROBE_MIN_EVENTS = 10000;
|
|
1025
|
+
|
|
1020
1026
|
/**
|
|
1021
1027
|
* Build an XML-RPC methodCall envelope. PyPI accepts only <int> and <string>
|
|
1022
1028
|
* params for the methods we use (changelog_last_serial, changelog_since_serial),
|
|
@@ -1188,6 +1194,38 @@ async function pollPyPIChangelog(state, scanQueue, stats) {
|
|
|
1188
1194
|
return 0;
|
|
1189
1195
|
}
|
|
1190
1196
|
|
|
1197
|
+
// GLOBAL catch-up protection (2026-06-11 incident): the per-batch gap
|
|
1198
|
+
// below is bounded by one changelog_since_serial response (~50K events,
|
|
1199
|
+
// observed 33-43K), so it can NEVER exceed PYPI_CATCHUP_MAX (100K) — a
|
|
1200
|
+
// poller resumed from an ancient serial (a test-fixture serial leaked
|
|
1201
|
+
// into prod state) replayed YEARS of history, ~15K ancient packages per
|
|
1202
|
+
// poll, without ever tripping the skip. A full batch is the tell: probe
|
|
1203
|
+
// the registry's current serial and skip to it when the global lag is
|
|
1204
|
+
// beyond the cap. Costs one extra XML-RPC call only on full batches.
|
|
1205
|
+
if (events.length >= PYPI_CATCHUP_PROBE_MIN_EVENTS) {
|
|
1206
|
+
await acquireRegistrySlot();
|
|
1207
|
+
let curBody;
|
|
1208
|
+
try {
|
|
1209
|
+
curBody = await _deps.httpsPost(
|
|
1210
|
+
PYPI_XMLRPC_URL,
|
|
1211
|
+
buildXmlRpcCall('changelog_last_serial', []),
|
|
1212
|
+
{ 'User-Agent': PYPI_USER_AGENT },
|
|
1213
|
+
10_000
|
|
1214
|
+
);
|
|
1215
|
+
} finally {
|
|
1216
|
+
releaseRegistrySlot();
|
|
1217
|
+
}
|
|
1218
|
+
const currentSerial = parseXmlRpcInt(curBody);
|
|
1219
|
+
if (currentSerial != null && currentSerial - lastSerial > PYPI_CATCHUP_MAX) {
|
|
1220
|
+
console.warn(`[MONITOR] PyPI changelog globally behind (${currentSerial - lastSerial} serials) — skipping to current ${currentSerial}`);
|
|
1221
|
+
stats.pypiCatchupSkips = (stats.pypiCatchupSkips || 0) + 1;
|
|
1222
|
+
stats.pypiCatchupSkippedEvents = (stats.pypiCatchupSkippedEvents || 0) + (currentSerial - lastSerial);
|
|
1223
|
+
state.pypiLastSerial = currentSerial;
|
|
1224
|
+
savePypiSerial(currentSerial);
|
|
1225
|
+
return 0;
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
|
|
1191
1229
|
// Catch-up protection: if events span more than PYPI_CATCHUP_MAX serials,
|
|
1192
1230
|
// skip to the latest serial to avoid an avalanche after long downtime.
|
|
1193
1231
|
const lastEventSerial = events[events.length - 1].serial;
|
package/src/monitor/queue.js
CHANGED
|
@@ -19,6 +19,7 @@ const { loadCachedIOCs } = require('../ioc/updater.js');
|
|
|
19
19
|
const { scanPackageJson } = require('../scanner/package.js');
|
|
20
20
|
const { scanShellScripts } = require('../scanner/shell.js');
|
|
21
21
|
const { buildTrainingRecord } = require('../ml/feature-extractor.js');
|
|
22
|
+
const { appendWorkerMem } = require('./worker-mem.js');
|
|
22
23
|
const { appendRecord: appendTrainingRecord, relabelRecords } = require('../ml/jsonl-writer.js');
|
|
23
24
|
|
|
24
25
|
// From ./state.js
|
|
@@ -426,6 +427,17 @@ function runScanInWorker(extractedDir, timeoutMs, scanContext = null, signal = n
|
|
|
426
427
|
const _sc = scanContext || {};
|
|
427
428
|
_liveWorkers.set(worker, { name: _sc.name, version: _sc.version, ecosystem: _sc.ecosystem });
|
|
428
429
|
|
|
430
|
+
// Off-heap attribution (worker-mem.jsonl, gated MUADDIB_WORKER_MEM=1):
|
|
431
|
+
// process RSS around each worker's lifetime. tid captured now — after
|
|
432
|
+
// 'exit' worker.threadId becomes -1.
|
|
433
|
+
const _wmTid = worker.threadId;
|
|
434
|
+
const _wmSpawnedAt = Date.now();
|
|
435
|
+
appendWorkerMem({
|
|
436
|
+
ev: 'spawn', tid: _wmTid,
|
|
437
|
+
name: _sc.name, version: _sc.version, ecosystem: _sc.ecosystem,
|
|
438
|
+
rss: process.memoryUsage().rss
|
|
439
|
+
});
|
|
440
|
+
|
|
429
441
|
let settled = false;
|
|
430
442
|
let timer = null;
|
|
431
443
|
const done = (fn) => {
|
|
@@ -462,9 +474,19 @@ function runScanInWorker(extractedDir, timeoutMs, scanContext = null, signal = n
|
|
|
462
474
|
|
|
463
475
|
worker.on('error', (err) => done(() => reject(err)));
|
|
464
476
|
|
|
465
|
-
worker.on('exit', (code) =>
|
|
466
|
-
|
|
467
|
-
|
|
477
|
+
worker.on('exit', (code) => {
|
|
478
|
+
// 'exit' fires exactly once per worker (even after terminate/error), so
|
|
479
|
+
// it is the one reliable place to close the spawn/exit RSS pair.
|
|
480
|
+
appendWorkerMem({
|
|
481
|
+
ev: 'exit', tid: _wmTid,
|
|
482
|
+
name: _sc.name, version: _sc.version, code,
|
|
483
|
+
durMs: Date.now() - _wmSpawnedAt,
|
|
484
|
+
rss: process.memoryUsage().rss
|
|
485
|
+
});
|
|
486
|
+
done(() => {
|
|
487
|
+
if (code !== 0) reject(new Error(`Worker exited with code ${code}`));
|
|
488
|
+
});
|
|
489
|
+
});
|
|
468
490
|
});
|
|
469
491
|
}
|
|
470
492
|
|
package/src/monitor/state.js
CHANGED
|
@@ -81,7 +81,10 @@ const ALERTS_LOG_DIR = resolveWritableDir(PRIMARY_ALERTS_DIR, FALLBACK_ALERTS_DI
|
|
|
81
81
|
|
|
82
82
|
// --- npm seq constants ---
|
|
83
83
|
|
|
84
|
-
|
|
84
|
+
// Env-overridable — same prod-state-pollution guard as PYPI_SERIAL_FILE below
|
|
85
|
+
// (the npm-seq roundtrip test used to unlink the REAL file).
|
|
86
|
+
const NPM_SEQ_FILE = process.env.MUADDIB_NPM_SEQ_FILE
|
|
87
|
+
|| path.join(__dirname, '..', '..', 'data', 'npm-seq.json');
|
|
85
88
|
const CHANGES_STREAM_URL = 'https://replicate.npmjs.com/registry/_changes';
|
|
86
89
|
const CHANGES_LIMIT = 1000;
|
|
87
90
|
const CHANGES_CATCHUP_MAX = 500000; // If behind by more than 500k seqs, skip to "now"
|
|
@@ -96,7 +99,13 @@ const CHANGES_CATCHUP_MAX = 500000; // If behind by more than 500k seqs, skip to
|
|
|
96
99
|
// PYPI_CATCHUP_MAX is the staleness cap: if we are behind by more than this many
|
|
97
100
|
// serials (≈ days of activity at ~30k events/day in 2026), skip to "now" rather
|
|
98
101
|
// than fetch a monster batch. Mirrors CHANGES_CATCHUP_MAX for npm.
|
|
99
|
-
|
|
102
|
+
// Env-overridable (2026-06-11 incident): integration tests exercise the real
|
|
103
|
+
// pollPyPIChangelog with stubbed HTTP but the real savePypiSerial — a fixture
|
|
104
|
+
// serial (1002) leaked into prod state, and the next daemon boot replayed the
|
|
105
|
+
// PyPI changelog from 2011 (~15K ancient packages queued per poll). The test
|
|
106
|
+
// harness points this at a tmp file so NO test can touch prod state.
|
|
107
|
+
const PYPI_SERIAL_FILE = process.env.MUADDIB_PYPI_SERIAL_FILE
|
|
108
|
+
|| path.join(__dirname, '..', '..', 'data', 'pypi-serial.json');
|
|
100
109
|
const PYPI_XMLRPC_URL = 'https://pypi.org/pypi';
|
|
101
110
|
const PYPI_CATCHUP_MAX = 100000;
|
|
102
111
|
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Per-worker memory instrumentation (off-heap RSS attribution, 2026-06).
|
|
5
|
+
*
|
|
6
|
+
* The EMERGENCY breaker fires on process RSS while the heap sits at ~15% —
|
|
7
|
+
* the driver is off-heap (malloc arenas + tarball Buffers) and mem-trend.jsonl
|
|
8
|
+
* only samples the whole process. This module attributes memory to individual
|
|
9
|
+
* scan workers / packages so the worker_threads → child_process decision can
|
|
10
|
+
* be made on data:
|
|
11
|
+
* H1: RSS stays high AFTER workers die → arenas never returned to the OS
|
|
12
|
+
* H2: RSS peaks only WHILE workers live → concurrent in-flight peak
|
|
13
|
+
*
|
|
14
|
+
* Producers:
|
|
15
|
+
* - queue.js (parent): ev:'spawn' / ev:'exit' around each scan worker,
|
|
16
|
+
* with process-wide RSS (delta attributable per package, noisy but
|
|
17
|
+
* aggregable over 24-48h).
|
|
18
|
+
* - scan-worker.js (worker): ev:'sample' every sampleIntervalMs() with the
|
|
19
|
+
* isolate-local heapUsed/external/arrayBuffers (rss there is process-wide).
|
|
20
|
+
*
|
|
21
|
+
* Same hot-path safety rules as spill.js (2026-06-11 prod-freeze lesson):
|
|
22
|
+
* append-only, stat-gated O(1) rotation, never read the file back, never throw.
|
|
23
|
+
* OFF unless MUADDIB_WORKER_MEM=1 (staged rollout, same pattern as
|
|
24
|
+
* MUADDIB_WORKER_MAX_OLD_MB) so tests and CLI runs never touch data/.
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
const fs = require('fs');
|
|
28
|
+
const path = require('path');
|
|
29
|
+
|
|
30
|
+
const DEFAULT_FILE = path.join(__dirname, '..', '..', 'data', 'worker-mem.jsonl');
|
|
31
|
+
const DEFAULT_MAX_MB = 64; // rotate past 64MB (file + .1 = 128MB worst case, ~2.5 days at concurrency 8)
|
|
32
|
+
const DEFAULT_SAMPLE_MS = 10000; // per-worker isolate sample cadence
|
|
33
|
+
|
|
34
|
+
/**
 * Tell whether per-worker memory instrumentation is switched on.
 * Only the exact string '1' in MUADDIB_WORKER_MEM enables it; any other
 * value (or no value at all) leaves it off.
 * @returns {boolean} true when instrumentation is enabled
 */
function workerMemEnabled() {
  const { MUADDIB_WORKER_MEM: flag } = process.env;
  return flag === '1';
}
|
|
37
|
+
|
|
38
|
+
/**
 * Resolve the path of the worker-memory JSONL file.
 * A non-empty MUADDIB_WORKER_MEM_FILE takes precedence; otherwise the
 * module default (DEFAULT_FILE) is used.
 * @returns {string} path of the file records are appended to
 */
function workerMemFile() {
  const override = process.env.MUADDIB_WORKER_MEM_FILE;
  if (override) {
    return override;
  }
  return DEFAULT_FILE;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Cadence, in milliseconds, at which a scan worker should record a memory sample.
 *
 * Returns 0 when sampling is disabled: either instrumentation is off
 * (workerMemEnabled() is false) or MUADDIB_WORKER_MEM_SAMPLE_MS is explicitly 0.
 *
 * The override must be a plain non-negative decimal integer (surrounding
 * whitespace and a leading '+' are tolerated). Anything else — unset, empty,
 * negative, fractional, "1e3", "10abc" — falls back to DEFAULT_SAMPLE_MS.
 * A bare parseInt would read "1e3" as 1 and "10abc" as 10, silently turning a
 * typo into a near-continuous loop of synchronous file appends.
 *
 * @returns {number} sample interval in ms, or 0 when sampling is disabled
 */
function sampleIntervalMs() {
  // Largest delay Node timers accept; a bigger value is coerced to 1 ms by
  // setInterval, which is the opposite of what a huge setting intends.
  const MAX_TIMER_DELAY_MS = 2147483647;

  if (!workerMemEnabled()) return 0;

  const raw = String(process.env.MUADDIB_WORKER_MEM_SAMPLE_MS || '').trim();
  if (!/^\+?\d+$/.test(raw)) return DEFAULT_SAMPLE_MS;

  return Math.min(Number.parseInt(raw, 10), MAX_TIMER_DELAY_MS);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Append one instrumentation record to the worker-memory JSONL file.
 *
 * The record is the caller's entry with a `ts` field (ISO-8601, stamped here)
 * placed first, serialized as one JSON line. No-op unless workerMemEnabled().
 *
 * Bounded resource (CLAUDE.md §2): before each append the file is stat'ed and,
 * once it exceeds the cap (MUADDIB_WORKER_MEM_MAX_MB, default DEFAULT_MAX_MB),
 * renamed to `<file>.1`, replacing any previous `.1`. The file is never read
 * back on this path. If the size check or the rotation fails for any reason
 * other than "file does not exist yet", the record is dropped rather than
 * appended, so a stuck rotation cannot let the file grow without limit.
 *
 * Never throws: instrumentation must not crash the daemon or a worker.
 *
 * @param {object} entry - fields to record (e.g. ev, tid, name, version, rss)
 * @returns {boolean} true if a line was written, false if disabled or on any failure
 */
function appendWorkerMem(entry) {
  if (!workerMemEnabled()) return false;
  try {
    const file = workerMemFile();
    const maxMb = Number.parseInt(process.env.MUADDIB_WORKER_MEM_MAX_MB, 10);
    const maxBytes = (Number.isFinite(maxMb) && maxMb > 0 ? maxMb : DEFAULT_MAX_MB) * 1024 * 1024;

    try {
      if (fs.statSync(file).size > maxBytes) fs.renameSync(file, file + '.1');
    } catch (err) {
      // ENOENT just means no file yet — fine. Any other failure means the cap
      // cannot be enforced right now, so skip this record instead of growing.
      if (!err || err.code !== 'ENOENT') return false;
    }

    fs.appendFileSync(file, JSON.stringify({ ts: new Date().toISOString(), ...entry }) + '\n', 'utf8');
    return true;
  } catch { /* instrumentation must never crash the daemon or a worker */
    return false;
  }
}
|
|
71
|
+
|
|
72
|
+
module.exports = { appendWorkerMem, sampleIntervalMs, workerMemEnabled, workerMemFile };
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* parentPort.postMessage({ type: 'error', message: string })
|
|
13
13
|
*/
|
|
14
14
|
|
|
15
|
-
const { parentPort, workerData } = require('worker_threads');
|
|
15
|
+
const { parentPort, workerData, threadId } = require('worker_threads');
|
|
16
16
|
|
|
17
17
|
if (!parentPort) {
|
|
18
18
|
// Not running as a worker — exit gracefully
|
|
@@ -20,17 +20,45 @@ if (!parentPort) {
|
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
const { run } = require('../index.js');
|
|
23
|
+
const { appendWorkerMem, sampleIntervalMs } = require('../monitor/worker-mem.js');
|
|
23
24
|
|
|
24
25
|
(async () => {
|
|
26
|
+
// Off-heap attribution samples (worker-mem.jsonl): heapUsed/external/
|
|
27
|
+
// arrayBuffers are isolate-local here, rss is process-wide. The samples MUST
|
|
28
|
+
// NOT go through parentPort — the parent settles the scan promise on the
|
|
29
|
+
// first message it receives (queue.js done()), so a sample message would
|
|
30
|
+
// hang the scan forever. unref() so the timer never keeps the worker alive.
|
|
31
|
+
const scanContext = workerData.scanContext || {};
|
|
32
|
+
const everyMs = sampleIntervalMs();
|
|
33
|
+
let sampler = null;
|
|
34
|
+
if (everyMs > 0) {
|
|
35
|
+
const sampleNow = () => {
|
|
36
|
+
const m = process.memoryUsage();
|
|
37
|
+
appendWorkerMem({
|
|
38
|
+
ev: 'sample', tid: threadId,
|
|
39
|
+
name: scanContext.name, version: scanContext.version,
|
|
40
|
+
heapUsed: m.heapUsed, external: m.external, arrayBuffers: m.arrayBuffers, rss: m.rss
|
|
41
|
+
});
|
|
42
|
+
};
|
|
43
|
+
// One immediate baseline sample, deterministically: a mostly-synchronous
|
|
44
|
+
// scan (small package, sync AST walks, microtask-only awaits) can starve
|
|
45
|
+
// the event loop for its whole lifetime, so the interval alone may never
|
|
46
|
+
// fire (bit CI on 2026-06-11). The baseline also gives the per-package
|
|
47
|
+
// delta a clean starting point.
|
|
48
|
+
sampleNow();
|
|
49
|
+
sampler = setInterval(sampleNow, everyMs);
|
|
50
|
+
sampler.unref();
|
|
51
|
+
}
|
|
25
52
|
try {
|
|
26
53
|
// scanContext (optional) carries monitor-side info that opt-in scanners need
|
|
27
54
|
// (e.g. trusted-dep-diff requires package name + version to query the registry).
|
|
28
55
|
// It is spread INTO the pipeline options, but `_capture: true` always wins so
|
|
29
56
|
// the worker keeps returning the result object — never prints.
|
|
30
|
-
const scanContext = workerData.scanContext || {};
|
|
31
57
|
const result = await run(workerData.extractedDir, { ...scanContext, _capture: true });
|
|
32
58
|
parentPort.postMessage({ type: 'result', data: result });
|
|
33
59
|
} catch (err) {
|
|
34
60
|
parentPort.postMessage({ type: 'error', message: err.message || String(err) });
|
|
61
|
+
} finally {
|
|
62
|
+
if (sampler) clearInterval(sampler);
|
|
35
63
|
}
|
|
36
64
|
})();
|