muaddib-scanner 2.11.103 → 2.11.109

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.103",
3
+ "version": "2.11.109",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "target": "node_modules",
3
- "timestamp": "2026-06-12T10:16:00.853Z",
3
+ "timestamp": "2026-06-12T16:19:40.738Z",
4
4
  "threats": [
5
5
  {
6
6
  "type": "string_mutation_obfuscation",
@@ -296,7 +296,12 @@ function loadCachedIOCs() {
296
296
  try {
297
297
  const leanIOCs = JSON.parse(fs.readFileSync(LOCAL_LEAN_FILE, 'utf8'));
298
298
  mergeIOCs(merged, leanIOCs);
299
+ _leanParseFailedAt = 0; // healthy again
299
300
  } catch (e) {
301
+ // Phase D: a corrupted lean does NOT fall back to the full file — the
302
+ // scan continues with FEWER IOCs. Flag it so the degradation registry
303
+ // can alarm (ioc:lean-parse-failed) instead of one buried WARN line.
304
+ _leanParseFailedAt = Date.now();
300
305
  console.log('[WARN] Failed to load lean IOC database (iocs-lean.json): ' + e.message);
301
306
  }
302
307
  } else if (fs.existsSync(LOCAL_IOC_FILE)) {
@@ -511,6 +516,24 @@ function createLeanIOCs(fullIOCs) {
511
516
  };
512
517
  }
513
518
 
519
// Phase D (degradation registry): main-thread observable status of the lean
// projection. `missing`/`stale` re-arm the full-223MB-per-worker fallback;
// `parseFailed` means scans run with FEWER IOCs. Read by the daemon's
// degradation tick.
// Timestamp (ms) of the last failed lean parse; 0 while the lean file is healthy.
let _leanParseFailedAt = 0;

/**
 * Snapshot of the lean IOC file's health. Never throws: filesystem errors
 * leave the corresponding flag at `false`.
 *
 * @returns {{missing: boolean, stale: boolean, parseFailed: boolean, parseFailedAt: (number|null)}}
 *   - missing: the full IOC file exists but the lean file does not
 *   - stale: both exist and the lean file's mtime is older than the full file's
 *   - parseFailed / parseFailedAt: derived from `_leanParseFailedAt`
 */
function getLeanStatus() {
  let missing = false;
  let stale = false;
  try {
    const hasFull = fs.existsSync(LOCAL_IOC_FILE);
    const hasLean = fs.existsSync(LOCAL_LEAN_FILE);
    if (hasFull) {
      if (!hasLean) {
        missing = true;
      } else {
        try {
          const leanMtime = fs.statSync(LOCAL_LEAN_FILE).mtimeMs;
          const fullMtime = fs.statSync(LOCAL_IOC_FILE).mtimeMs;
          stale = leanMtime < fullMtime;
        } catch {
          // A file vanished between existsSync and statSync: not stale.
          stale = false;
        }
      }
    }
  } catch { /* status is best-effort */ }
  const parseFailed = _leanParseFailedAt > 0;
  return { missing, stale, parseFailed, parseFailedAt: _leanParseFailedAt || null };
}
536
+
514
537
  // Ensure LOCAL_LEAN_FILE exists and is at least as fresh as LOCAL_IOC_FILE.
515
538
  // Reads the 223MB full ONCE (the ~450MB parse peak) — acceptable only in a
516
539
  // long-lived process (daemon boot); NEVER call from a one-shot scan worker.
@@ -773,4 +796,4 @@ function verifyIOCHMAC(data, hmac) {
773
796
  }
774
797
  }
775
798
 
776
- module.exports = { updateIOCs, loadCachedIOCs, invalidateCache, generateCompactIOCs, expandCompactIOCs, createLeanIOCs, ensureLeanIOCFile, writeLeanIOCFile, LOCAL_LEAN_FILE, LOCAL_IOC_FILE, mergeIOCs, createOptimizedIOCs, generateIOCHMAC, verifyIOCHMAC, checkIOCStaleness, NEVER_WILDCARD, NEVER_WILDCARD_PYPI };
799
+ module.exports = { updateIOCs, loadCachedIOCs, invalidateCache, generateCompactIOCs, expandCompactIOCs, createLeanIOCs, ensureLeanIOCFile, writeLeanIOCFile, getLeanStatus, LOCAL_LEAN_FILE, LOCAL_IOC_FILE, mergeIOCs, createOptimizedIOCs, generateIOCHMAC, verifyIOCHMAC, checkIOCStaleness, NEVER_WILDCARD, NEVER_WILDCARD_PYPI };
@@ -10,7 +10,7 @@ const { loadState, saveState, loadDailyStats, saveDailyStats, purgeTarballCache,
10
10
  const { isTemporalEnabled, isTemporalAstEnabled, isTemporalPublishEnabled, isTemporalMaintainerEnabled } = require('./temporal.js');
11
11
  const { pendingGrouped, flushScopeGroup, sendDailyReport, redeliverPendingReportOnBoot, alertedPackageRules, ALERTED_PACKAGES_MAX: MAX_ALERTED_PACKAGES } = require('./webhook.js');
12
12
  const { poll, getPollBackoffMs } = require('./ingestion.js');
13
- const { ensureWorkers, drainWorkers, getTargetConcurrency, setTargetConcurrency, getActiveWorkers, terminateAllWorkers } = require('./queue.js');
13
+ const { ensureWorkers, drainWorkers, getTargetConcurrency, setTargetConcurrency, getActiveWorkers, terminateAllWorkers, getInFlightItems, computeInterruptDisposition } = require('./queue.js');
14
14
  const { computeTarget, ADJUST_INTERVAL_MS, BASE_CONCURRENCY } = require('./adaptive-concurrency.js');
15
15
  const { startHealthcheck } = require('./healthcheck.js');
16
16
  const { startDeferredWorker, stopDeferredWorker, persistDeferredQueue, restoreDeferredQueue, clearDeferredQueue } = require('./deferred-sandbox.js');
@@ -34,6 +34,14 @@ const PROCESS_LOOP_INTERVAL = 2_000; // Queue check interval when empty
34
34
  // 12 calm hours/day do the catch-up of burst-time evictions. Rate-limited to one
35
35
  // batch per interval (the main loop ticks every 2s — unthrottled it would re-spike
36
36
  // the queue in seconds). All env-tunable for the staged rollout.
37
// C7: how long the shutdown waits for in-flight scans before spilling them.
// Must stay well under systemd TimeoutStopSec (default 90s) so the ledger,
// spill and queue persist ALWAYS run before any SIGKILL.
// Override with MUADDIB_SHUTDOWN_DRAIN_MAX_MS (positive integer, milliseconds).
const SHUTDOWN_DRAIN_MAX_MS = (() => {
  const fallbackMs = 20_000;
  const parsed = parseInt(process.env.MUADDIB_SHUTDOWN_DRAIN_MAX_MS, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return fallbackMs;
  return parsed;
})();
44
+
37
45
  const SPILL_DRAIN_THRESHOLD = (() => {
38
46
  const v = parseInt(process.env.MUADDIB_SPILL_DRAIN_THRESHOLD, 10);
39
47
  return Number.isFinite(v) && v > 0 ? v : 500;
@@ -666,6 +674,23 @@ function reportStats(stats) {
666
674
  if (stats.changesStreamPackages) {
667
675
  console.log(`[MONITOR] Changes stream packages: ${stats.changesStreamPackages}`);
668
676
  }
677
+ // Phase D: active degradations in the hourly Stability block.
678
+ try {
679
+ const active = require('./degradation.js').getActiveDegradations();
680
+ if (active.length > 0) console.warn(`[MONITOR] Degradations: ${active.join(', ')}`);
681
+ } catch { /* observability only */ }
682
+ // Network-brain state (governors phase A): one line per host that has seen
683
+ // any backoff — the observation signal for the A deployment gate (AIMD
684
+ // de-escalations visible, no sustained max-level) and phase D's input.
685
+ try {
686
+ const { getBrainState } = require('../shared/http-limiter.js');
687
+ const brain = getBrainState();
688
+ const noisy = Object.entries(brain).filter(([, s]) => s.backoffCount > 0 || s.level > 0 || s.pendingWaiters > 0);
689
+ if (noisy.length > 0) {
690
+ const line = noisy.map(([h, s]) => `${h}: level=${s.level} pause=${s.pauseRemainingMs}ms 429s=${s.backoffCount} waiters=${s.pendingWaiters}`).join(' | ');
691
+ console.log(`[MONITOR] Brain: ${line}`);
692
+ }
693
+ } catch { /* observability only */ }
669
694
  if (stats.rssFallbackCount) {
670
695
  console.log(`[MONITOR] RSS fallback activations: ${stats.rssFallbackCount}`);
671
696
  }
@@ -960,9 +985,39 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
960
985
  clearInterval(concurrencyAdjustHandle);
961
986
  concurrencyAdjustHandle = null;
962
987
  }
963
- // Wait for in-flight scans to complete (soft drain)
964
- console.log(`[MONITOR] Draining ${getActiveWorkers()} active worker(s)...`);
965
- await drainWorkers();
988
+ // Bounded drain (phase C, C7). The old unbounded `await drainWorkers()`
989
+ // could outlive systemd's TimeoutStopSec (scans run up to 300s): SIGKILL
990
+ // then landed MID-drain, persistQueue never ran, and every in-flight scan
991
+ // plus up to 60s of queue mutations were lost UNLEDGERED on each manual
992
+ // restart — the exact deployment mode of this program. Drain for up to
993
+ // SHUTDOWN_DRAIN_MAX_MS, then spill the survivors (protected, bounded
994
+ // retries) so the next boot re-scans them.
995
+ console.log(`[MONITOR] Draining ${getActiveWorkers()} active worker(s) (bounded ${SHUTDOWN_DRAIN_MAX_MS / 1000}s)...`);
996
+ await Promise.race([
997
+ drainWorkers(),
998
+ new Promise(resolve => setTimeout(resolve, SHUTDOWN_DRAIN_MAX_MS).unref())
999
+ ]);
1000
+ try {
1001
+ const leftovers = getInFlightItems();
1002
+ if (leftovers.length > 0) {
1003
+ const { isSpillEnabled: spillOn, spillItems } = require('./spill.js');
1004
+ const { appendScanLedger } = require('./state.js');
1005
+ let spilledN = 0;
1006
+ for (const it of leftovers) {
1007
+ const { retries, giveUp } = computeInterruptDisposition(it);
1008
+ recentlyScanned.delete(`${it.ecosystem}/${it.name}@${it.version}`);
1009
+ if (giveUp) {
1010
+ appendScanLedger({ name: it.name, version: it.version, ecosystem: it.ecosystem, outcome: 'dropped', source: 'interrupted_max' });
1011
+ continue;
1012
+ }
1013
+ appendScanLedger({ name: it.name, version: it.version, ecosystem: it.ecosystem, outcome: 'interrupted', source: 'shutdown_drain' });
1014
+ if (spillOn() && spillItems([{ ...it, interrupted: true, interruptRetries: retries }]) === 1) spilledN++;
1015
+ }
1016
+ console.log(`[MONITOR] Shutdown: ${leftovers.length} in-flight scan(s) did not finish in time — ${spilledN} spilled for re-scan, all ledgered`);
1017
+ }
1018
+ } catch (e) {
1019
+ console.error(`[MONITOR] Shutdown in-flight spill failed: ${e.message}`);
1020
+ }
966
1021
  // Persist remaining queue items so they survive the restart
967
1022
  persistQueue(scanQueue, state);
968
1023
  saveRecentlyScanned(recentlyScanned); // Persist dedup set too (avoid re-scan storm on restart)
@@ -1043,6 +1098,7 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
1043
1098
  // This ensures new packages are ingested even while a large batch is being scanned.
1044
1099
  // Backpressure: poll() skips when queue >= 30K or memory pressure >= CRITICAL (90%).
1045
1100
  // Adaptive concurrency adjusts scan throughput to match ingestion rate.
1101
+ let _lastTemporalShedCount = 0; // phase D: temporal-shed delta tracking
1046
1102
  let pollInProgress = false;
1047
1103
  let pollStartedAt = 0;
1048
1104
  let backoffUntil = 0;
@@ -1110,6 +1166,34 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
1110
1166
  // every 5min is too slow (250 packages ingested between checks).
1111
1167
  const { level: pressureLevel, mem: currentMem, ratio: heapRatio, rssRatio } = computeMemoryPressure();
1112
1168
 
1169
+ // Phase B (memory governor): feed the admission gate the REAL process RSS
1170
+ // from this same 2s breaker loop — the governor's freeze keys on it (the
1171
+ // worker-mem disk samples are 10s-cadence and starve during sync parses).
1172
+ try { require('./memory-governor.js').updateGovernorRss(currentMem.rss); } catch { /* governor optional */ }
1173
+
1174
+ // Phase D (degradation registry): evaluate the raw degradation signals.
1175
+ // Cheap (two statSync + counters); alarms only fire on sustained
1176
+ // transitions inside tickDegradation. Fire-and-forget — the registry must
1177
+ // never block the breaker loop.
1178
+ try {
1179
+ const { getLeanStatus } = require('../ioc/updater.js');
1180
+ const { getBrainState } = require('../shared/http-limiter.js');
1181
+ const { isFrozen: govFrozen, isGovernorEnabled: govEnabled } = require('./memory-governor.js');
1182
+ const lean = getLeanStatus();
1183
+ const brain = getBrainState();
1184
+ const shedNow = stats.temporalLoadShed || 0;
1185
+ const shedActive = shedNow > _lastTemporalShedCount;
1186
+ _lastTemporalShedCount = shedNow;
1187
+ const signals = {
1188
+ 'ioc:full-fallback': lean.missing || lean.stale,
1189
+ 'ioc:lean-parse-failed': lean.parseFailed,
1190
+ 'registry:max-backoff': Object.values(brain).some(b => b.atMaxBackoff),
1191
+ 'temporal:shed': shedActive,
1192
+ 'workers:memory-floored': govEnabled() && govFrozen()
1193
+ };
1194
+ require('./degradation.js').tickDegradation(signals).catch(() => { /* best-effort */ });
1195
+ } catch { /* observability only */ }
1196
+
1113
1197
  // Top up workers ONLY when memory pressure is below HIGH.
1114
1198
  // At HIGH+, existing workers continue (they'll finish or timeout) but no new
1115
1199
  // ones are spawned. This is the core mechanism: let running scans release their
@@ -0,0 +1,182 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Degradation registry (governors program, phase D) — every degraded mode of
5
+ * the monitor becomes a NAMED state: alarmed once on entry (webhook), once on
6
+ * recovery, visible in the hourly Stability log and the daily report.
7
+ *
8
+ * Why: the failure mode of 2026-06-12 was not crashing — it was degrading
9
+ * SILENTLY. The lean-IOC fallback re-arms the per-worker RSS bomb with a
10
+ * single [WARN] line; a corrupted lean continues with FEWER IOCs (coverage
11
+ * loss, no fallback at all); temporal analysis sheds itself off above queue
12
+ * 2000 and stays off for hours; the registry sat at max backoff for an
13
+ * afternoon. None of these pages anyone unless someone greps the journal.
14
+ *
15
+ * Modeled on feed-health.js (pure decision core + persisted state file +
16
+ * edge-triggered webhook with recovery), EXTENDED with sustain durations:
17
+ * `evaluateDegradation(signals, prev, now)` tracks per-state activeSince so
18
+ * flapping signals (queue oscillating around the temporal shed threshold)
19
+ * never alarm, and only conditions sustained past their threshold do.
20
+ * Re-entry within REALARM_COOLDOWN_MS of the last alarm stays silent (the
21
+ * state still shows active everywhere) — the Discord webhook is shared with
22
+ * detection alerts and must never be flooded.
23
+ *
24
+ * The registry also feeds one defensive coupling: ensureWorkers caps the pool
25
+ * at FALLBACK_WORKER_CAP while `ioc:full-fallback` is active — each worker in
26
+ * fallback parses the full 223MB iocs.json (~450MB peak), and 12 of them is
27
+ * exactly the RSS bomb the lean projection removed.
28
+ */
29
+
30
+ const fs = require('fs');
31
+ const path = require('path');
32
+
33
// Path of the persisted registry state. MUADDIB_DEGRADATION_FILE (when set and
// non-empty) overrides the default data/degradation.json two levels up.
const STATE_FILE = process.env.MUADDIB_DEGRADATION_FILE
  ? process.env.MUADDIB_DEGRADATION_FILE
  : path.join(__dirname, '..', '..', 'data', 'degradation.json');
35
+
36
// One alarm per entry; re-entry within the cooldown stays silent.
// Override with MUADDIB_DEGRADATION_COOLDOWN_MS (integer >= 0, milliseconds);
// anything else falls back to six hours.
const REALARM_COOLDOWN_MS = (() => {
  const sixHoursMs = 6 * 3600_000;
  const parsed = parseInt(process.env.MUADDIB_DEGRADATION_COOLDOWN_MS, 10);
  const usable = Number.isFinite(parsed) && parsed >= 0;
  return usable ? parsed : sixHoursMs;
})();
41
+
42
// Sustain thresholds: how long a raw signal must hold before it IS a
// degradation. Instant for the IOC states (each affected worker spawn costs
// ~450MB / loses coverage immediately); duration-gated for the flappy ones.
// Each entry: level (alarm severity), sustainMs (hold time), desc (alarm text).
const STATE_DEFS = {
  'ioc:full-fallback': {
    level: 'RED',
    sustainMs: 0,
    desc: 'iocs-lean.json missing/stale — every worker spawn parses the FULL 223MB iocs.json (~450MB peak each)'
  },
  'ioc:lean-parse-failed': {
    level: 'RED',
    sustainMs: 0,
    desc: 'iocs-lean.json unparseable — scans run with FEWER IOCs (silent coverage loss, no fallback)'
  },
  'registry:max-backoff': {
    level: 'RED',
    sustainMs: 15 * 60_000,
    desc: 'a registry host has been at maximum backoff pause — enrichment fetches are starving'
  },
  'temporal:shed': {
    level: 'YELLOW',
    sustainMs: 30 * 60_000,
    desc: 'temporal analysis has been load-shedding continuously (queue above the shed threshold)'
  },
  'workers:memory-floored': {
    level: 'YELLOW',
    sustainMs: 10 * 60_000,
    desc: 'the memory governor has been freezing admissions — budget likely outgrown by baseline drift'
  }
};
52
+
53
/**
 * Pure decision core (exported for tests — no I/O, no Date.now()).
 *
 * For every state in `defs`, folds this tick's raw signal into the previous
 * per-state record and reports edge transitions:
 *   - raw signal truthy: `activeSince` is kept (or set to `now` on first
 *     sight). Once `now - activeSince >= sustainMs` the state is listed in
 *     `active`. The first sustained tick sets `alarmedAt` and emits an
 *     `enter` transition, unless a previous alarm is younger than
 *     `cooldownMs` (silent re-entry: still active, still owed a recovery).
 *   - raw signal falsy: emits `recover` if the state had `alarmedAt` set,
 *     and keeps only `lastAlarmAt` (the re-entry cooldown anchor).
 *
 * Stored timestamps are tested against null/undefined, never for truthiness,
 * so a legitimate timestamp of 0 (injected clock) is not mistaken for
 * "never set".
 *
 * @param {Object<string,boolean>} signals - raw signal per state name, this tick
 * @param {{states: Object}} prev - previous registry state
 * @param {number} now - timestamp ms
 * @param {Object} [defs=STATE_DEFS] - state definitions ({level, sustainMs} per name)
 * @param {number} [cooldownMs=REALARM_COOLDOWN_MS] - minimum gap between two enter alarms
 * @returns {{transitions: Array<{name, kind:'enter'|'recover', level, sinceMs}>,
 *            active: string[], nextState: {states: Object}}}
 */
function evaluateDegradation(signals, prev, now, defs = STATE_DEFS, cooldownMs = REALARM_COOLDOWN_MS) {
  const prevStates = (prev && prev.states) || {};
  const nextStates = {};
  const transitions = [];
  const active = [];

  for (const [name, def] of Object.entries(defs)) {
    const p = prevStates[name] || {};

    if (!signals[name]) {
      if (p.alarmedAt != null) {
        const sinceMs = p.activeSince != null ? now - p.activeSince : 0;
        transitions.push({ name, kind: 'recover', level: def.level, sinceMs });
      }
      // Keep lastAlarmAt across the recovery: it is the re-entry cooldown anchor.
      nextStates[name] = p.lastAlarmAt != null ? { lastAlarmAt: p.lastAlarmAt } : {};
      continue;
    }

    const activeSince = p.activeSince ?? now;
    const sustained = now - activeSince >= def.sustainMs;
    // Normalise a JSON `null` to undefined so the stored shape stays stable.
    let alarmedAt = p.alarmedAt ?? undefined;
    let lastAlarmAt = p.lastAlarmAt ?? undefined;

    if (sustained && alarmedAt === undefined) {
      // Cooldown only gates RE-entries (a previous alarm exists): a virgin
      // state must alarm immediately regardless of how small `now` is.
      if (lastAlarmAt === undefined || now - lastAlarmAt >= cooldownMs) {
        transitions.push({ name, kind: 'enter', level: def.level, sinceMs: now - activeSince });
        lastAlarmAt = now;
      }
      // Alarmed (or silently re-entered under cooldown): either way the
      // state is ACTIVE and a future recovery must be emitted.
      alarmedAt = now;
    }
    if (sustained) active.push(name);
    nextStates[name] = { activeSince, alarmedAt, lastAlarmAt };
  }

  return { transitions, active, nextState: { states: nextStates } };
}
99
+
100
// ─── Module state (daemon tick) ───

// In-memory registry state; null until first loaded from STATE_FILE.
let _state = null;
// Names of the states that were sustained-active on the latest tick.
let _active = new Set();

/**
 * Return the registry state, reading STATE_FILE on first use. An unreadable
 * or unparseable file yields an empty registry.
 */
function _loadState() {
  if (!_state) {
    try {
      const raw = fs.readFileSync(STATE_FILE, 'utf8');
      _state = JSON.parse(raw);
    } catch {
      _state = { states: {} };
    }
  }
  return _state;
}
110
+
111
/**
 * Persist `_state` to STATE_FILE by writing a sibling `.tmp` file and renaming
 * it over the target. Failures are swallowed.
 */
function _saveState() {
  const scratch = `${STATE_FILE}.tmp`;
  try {
    const serialized = JSON.stringify(_state);
    fs.writeFileSync(scratch, serialized);
    fs.renameSync(scratch, STATE_FILE);
  } catch { /* observability state is best-effort */ }
}
118
+
119
/**
 * Daemon tick: evaluate raw signals, persist, dispatch alarms for transitions.
 * Runs from the 2s loop, so the state file is only rewritten when a transition
 * occurred or the evaluated state actually differs from the previous one —
 * an unchanged tick costs no disk I/O.
 *
 * @param {Object<string,boolean>} signals - raw signal per state name
 * @param {number} [now=Date.now()] - timestamp ms (injectable for tests)
 * @param {function(Object): Promise} [dispatch=_defaultDispatch] - alarm sink;
 *   injectable for tests, defaults to the shared webhook. Its failures are
 *   swallowed: the journal line is the record.
 * @returns {Promise<{transitions: Array, active: string[]}>}
 */
async function tickDegradation(signals, now = Date.now(), dispatch = _defaultDispatch) {
  const prev = _loadState();
  const { transitions, active, nextState } = evaluateDegradation(signals, prev, now);
  // Serialized comparison: undefined fields are dropped by JSON.stringify, so
  // this matches exactly what would land on disk.
  const changed = transitions.length > 0 || JSON.stringify(prev) !== JSON.stringify(nextState);
  _state = nextState;
  _active = new Set(active);
  if (changed) _saveState();

  for (const t of transitions) {
    const def = STATE_DEFS[t.name] || {};
    const enter = t.kind === 'enter';
    console[enter ? 'warn' : 'log'](`[DEGRADATION] ${enter ? 'ENTER' : 'RECOVER'} ${t.level} ${t.name}${enter ? '' : ` (was active ${Math.round(t.sinceMs / 60000)}min)`}`);
    try {
      await dispatch({
        embeds: [{
          title: enter ? `🔻 DEGRADED: ${t.name}` : `✅ RECOVERED: ${t.name}`,
          color: enter ? (t.level === 'RED' ? 0xe74c3c : 0xf39c12) : 0x2ecc71,
          description: enter ? (def.desc || t.name) : `Degradation cleared after ${Math.round(t.sinceMs / 60000)} min.`,
          footer: { text: "MUAD'DIB degradation registry" },
          timestamp: new Date(now).toISOString()
        }]
      });
    } catch { /* alarm is best-effort — the journal line above is the record */ }
  }
  return { transitions, active };
}
152
+
153
/**
 * Default alarm sink: posts `payload` as a raw payload to the URL in
 * MUADDIB_WEBHOOK_URL. Does nothing when that variable is unset or empty.
 */
async function _defaultDispatch(payload) {
  const webhookUrl = process.env.MUADDIB_WEBHOOK_URL;
  if (webhookUrl) {
    // Required lazily, only when an alarm is actually sent.
    const { sendWebhook: send } = require('../webhook.js');
    await send(webhookUrl, payload, { rawPayload: true });
  }
}
159
+
160
/**
 * Synchronous view for couplings (ensureWorkers cap) and the daily report.
 * @returns {string[]} a fresh array of the currently active state names
 */
function getActiveDegradations() {
  return [..._active];
}
164
+
165
/**
 * @param {string} name - state name
 * @returns {boolean} whether that state was active on the latest tick
 */
function isDegraded(name) {
  const currentlyActive = _active.has(name);
  return currentlyActive;
}
168
+
169
/** Test helper: forget the in-memory registry state and the active set. */
function resetDegradation() {
  _active = new Set();
  _state = { states: {} };
}
174
+
175
+ module.exports = {
176
+ STATE_DEFS,
177
+ evaluateDegradation,
178
+ tickDegradation,
179
+ getActiveDegradations,
180
+ isDegraded,
181
+ resetDegradation
182
+ };
@@ -0,0 +1,292 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Memory governor (governors program, phase B) — global admission control for
5
+ * scan memory, by ticket.
6
+ *
7
+ * Why: every prior guard bounded a LOCAL variable while the resource is
8
+ * global. The per-worker watermark watches one isolate; the heavy-lane
9
+ * serializes packages that are individually heavy. The 2026-06-12 09:43
10
+ * EMERGENCY (RSS 96%, main heap 4%) was neither: 12 workers × ~650MB of
11
+ * sub-threshold scans (an ATO burst of SDK packages) — no individual
12
+ * threshold crossed, the AGGREGATE blew the breaker. The governor bounds the
13
+ * aggregate at ADMISSION: each scan pays a ticket sized by its weight class
14
+ * before the worker spawns; Σ outstanding tickets ≤ budget; and admissions
15
+ * freeze on REAL process RSS (sampled by the daemon's 2s breaker loop — NOT
16
+ * worker-mem samples, which land on disk every 10s and starve during the
17
+ * synchronous parses that matter).
18
+ *
19
+ * Ticket classes are FIXED (env-overridable), never learned from observed
20
+ * usage: auto-calibrated weights would be shapeable by an attacker crafting
21
+ * packages, and config-debt review (2026-06-11) bans measurement→threshold
22
+ * feedback loops.
23
+ *
24
+ * Invariants:
25
+ * - Carve-out: heavies can consume at most (budget − LIGHT_CARVEOUT); a
26
+ * light is never blocked by heavy consumption — preserves the heavy-lane
27
+ * "lights are NEVER blocked" guarantee, and an attacker publishing heavy
28
+ * packages cannot starve everyone else's scans.
29
+ * - Liveness: when frozen with ZERO outstanding tickets, one scan is
30
+ * admitted anyway (a stuck-high RSS reading must never deadlock the
31
+ * queue; the EMERGENCY breaker remains the backstop).
32
+ * - The governor is OFF unless MUADDIB_MEMORY_GOVERNOR=1 — acquire resolves
33
+ * `false` (nothing to release) and the legacy heavy-lane path applies.
34
+ *
35
+ * Same waiter contract as heavy-lane.js (abort-aware, wait-timeout, and the
36
+ * "trap #1": any waiter leaving the queue without being woken MUST splice
37
+ * itself out, or a release hands its grant to a dead waiter and leaks it).
38
+ * EMERGENCY purge + adaptive-concurrency + rssAdmissionCap are all kept as
39
+ * backstops (defense in depth) — the governor is the front gate, not a
40
+ * replacement for them.
41
+ */
42
+
43
+ const { isHeavyScan } = require('./heavy-lane.js');
44
+
45
// Env knobs (read at call time so tests can flip them around resetGovernor()):

/** @returns {boolean} true only when MUADDIB_MEMORY_GOVERNOR is exactly '1'. */
function isGovernorEnabled() {
  const flag = process.env.MUADDIB_MEMORY_GOVERNOR;
  return flag === '1';
}
49
+
50
/**
 * Read a positive integer from the environment.
 * @param {string} name - environment variable name
 * @param {number} dflt - value returned when the variable is unset, not an
 *   integer, or not strictly positive
 * @returns {number}
 */
function _envMb(name, dflt) {
  const parsed = parseInt(process.env[name], 10);
  if (Number.isFinite(parsed) && parsed > 0) return parsed;
  return dflt;
}
54
+
55
/**
 * Ticket size in MB for a weight class. Defaults: heavy 2048, medium 256,
 * anything else (light) 64; each is overridable through its env variable.
 * @param {string} cls - 'heavy' | 'medium' | any other value is treated as light
 * @returns {number}
 */
function ticketMb(cls) {
  switch (cls) {
    case 'heavy':
      return _envMb('MUADDIB_TICKET_HEAVY_MB', 2048);
    case 'medium':
      return _envMb('MUADDIB_TICKET_MEDIUM_MB', 256);
    default:
      return _envMb('MUADDIB_TICKET_LIGHT_MB', 64);
  }
}
60
+
61
// medium starts at 1 MiB of weighted JS (heavy-lane's threshold is 3 MiB —
// the band between them is where the "many at once" aggregate risk lives).
// Override with MUADDIB_TICKET_MEDIUM_THRESHOLD_BYTES (positive integer).
function mediumThresholdBytes() {
  const oneMiB = 1024 * 1024;
  const parsed = parseInt(process.env.MUADDIB_TICKET_MEDIUM_THRESHOLD_BYTES, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return oneMiB;
  return parsed;
}
67
+
68
/**
 * MB of budget that heavy tickets may never consume.
 * Default 512; override with MUADDIB_GOVERNOR_LIGHT_CARVEOUT_MB.
 */
function lightCarveoutMb() {
  const carveout = _envMb('MUADDIB_GOVERNOR_LIGHT_CARVEOUT_MB', 512);
  return carveout;
}
71
+
72
/**
 * Percentage of the RSS limit above which admissions freeze.
 * Default 75; MUADDIB_GOVERNOR_RSS_SOFT_PCT is honoured only when it parses
 * to an integer strictly between 0 and 100.
 */
function rssSoftPct() {
  const parsed = parseInt(process.env.MUADDIB_GOVERNOR_RSS_SOFT_PCT, 10);
  const inRange = Number.isFinite(parsed) && parsed > 0 && parsed < 100;
  if (inRange) return parsed;
  return 75;
}
76
+
77
/**
 * Process RSS limit in MB.
 * Default 8500; override with MUADDIB_RSS_LIMIT_MB (positive integer).
 */
function rssLimitMb() {
  const parsed = parseInt(process.env.MUADDIB_RSS_LIMIT_MB, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return 8500;
  return parsed;
}
81
+
82
// ─── State ───

// Single mutable record shared by every function in this module.
const _gov = {
  // Outstanding (granted, not yet released) tickets.
  outstandingMb: 0,
  outstandingCount: 0,
  byCls: { light: 0, medium: 0, heavy: 0 },
  heavyOutstandingMb: 0,

  // FIFO of parked waiters: {cls, mb, wake(ticket)} — see acquireMemoryTicket.
  queue: [],

  // RSS samples fed by updateGovernorRss (bytes).
  lastRssBytes: 0,
  baselineRssBytes: 0,

  // Counters.
  freezes: 0,
  granted: 0,
  denied: 0
};
96
+
97
/**
 * Pure classifier (exported for tests).
 *   - heavy: `jsWeight` is truthy and isHeavyScan(jsWeight) says so
 *   - medium: effective weight ≥ mediumThresholdBytes(), where the effective
 *     weight is `weightedJsBytes` when finite, else `totalJsBytes` (or 0)
 *   - light: everything else, including a null/undefined `jsWeight`
 *
 * @param {Object|null|undefined} jsWeight
 * @returns {{cls: string, mb: number}} the class and its ticket size in MB
 */
function classifyWeight(jsWeight) {
  const ticketFor = (cls) => ({ cls, mb: ticketMb(cls) });

  let effectiveBytes = 0;
  if (jsWeight) {
    if (isHeavyScan(jsWeight)) return ticketFor('heavy');
    effectiveBytes = Number.isFinite(jsWeight.weightedJsBytes)
      ? jsWeight.weightedJsBytes
      : (jsWeight.totalJsBytes || 0);
  }

  const isMedium = effectiveBytes >= mediumThresholdBytes();
  return ticketFor(isMedium ? 'medium' : 'light');
}
111
+
112
/**
 * Budget in MB: rssLimit×0.7 − boot baseline, floored and clamped at 0.
 * Returns 0 until the first RSS sample has set the baseline.
 */
function _budgetMb() {
  const baselineBytes = _gov.baselineRssBytes;
  if (!baselineBytes) return 0;
  const baselineMb = baselineBytes / 1024 / 1024;
  const headroomMb = rssLimitMb() * 0.7 - baselineMb;
  return Math.max(0, Math.floor(headroomMb));
}
117
+
118
/**
 * Whether admissions are frozen: the governor is enabled, an RSS sample
 * exists, and that sample (in MB) exceeds rssLimitMb × rssSoftPct%.
 * @returns {boolean}
 */
function isFrozen() {
  if (!isGovernorEnabled() || !_gov.lastRssBytes) return false;
  const rssMb = _gov.lastRssBytes / 1024 / 1024;
  const softLimitMb = rssLimitMb() * (rssSoftPct() / 100);
  return rssMb > softLimitMb;
}
123
+
124
/**
 * Admission decision for one ticket of class `cls` and size `mb`.
 * Order of checks: RSS freeze blocks everything; lights otherwise always
 * pass; a zero budget admits; then the total budget; then, for heavies only,
 * the budget minus the light carve-out.
 * @returns {boolean}
 */
function _canAdmit(cls, mb) {
  if (isFrozen()) return false;
  // Carve-out: lights are only ever blocked by the RSS freeze.
  if (cls === 'light') return true;

  const budget = _budgetMb();
  // No baseline yet (boot) — admit, breaker backstops.
  if (budget <= 0) return true;

  if (_gov.outstandingMb + mb > budget) return false;
  if (cls !== 'heavy') return true;

  const heavyCeilingMb = Math.max(0, budget - lightCarveoutMb());
  return !(_gov.heavyOutstandingMb + mb > heavyCeilingMb);
}
133
+
134
/**
 * Book a ticket: bump the outstanding totals, the per-class count, the heavy
 * total (heavies only) and the `granted` counter.
 * @returns {{cls: string, mb: number, _released: boolean}} the new ticket
 */
function _grant(cls, mb) {
  const ticket = { cls, mb, _released: false };
  _gov.granted += 1;
  _gov.outstandingCount += 1;
  _gov.outstandingMb += mb;
  _gov.byCls[cls] += 1;
  if (cls === 'heavy') {
    _gov.heavyOutstandingMb += mb;
  }
  return ticket;
}
142
+
143
/**
 * Walk the waiter queue front to back and wake every waiter that can be
 * admitted now. A blocked waiter is skipped rather than stopping the walk, so
 * a blocked heavy/medium does not park admittable waiters queued behind it.
 * The head of the queue is additionally admitted when the governor is frozen
 * with nothing outstanding (liveness).
 */
function _drainWaiters() {
  let idx = 0;
  while (idx < _gov.queue.length) {
    const waiter = _gov.queue[idx];
    const livenessPass = idx === 0 && isFrozen() && _gov.outstandingCount === 0;
    if (!livenessPass && !_canAdmit(waiter.cls, waiter.mb)) {
      idx += 1; // blocked: leave it parked, look at the next one
      continue;
    }
    // Removing the waiter shifts the rest down, so `idx` stays put.
    _gov.queue.splice(idx, 1);
    waiter.wake(_grant(waiter.cls, waiter.mb));
  }
}
160
+
161
/**
 * Acquire a memory ticket.
 *
 * Resolves `false` when the governor is disabled (nothing to release).
 * Resolves a ticket immediately when admission is possible, or when the
 * governor is frozen with nothing outstanding (liveness). Otherwise the
 * caller is parked in the FIFO queue until a drain wakes it.
 *
 * @param {string} cls - weight class; sizes the ticket through ticketMb
 * @param {{signal?: AbortSignal, maxWaitMs?: number}} [opts]
 * @returns {Promise<Object|false>} rejects with err.code='ABORT_ERR' when
 *   `signal` aborts while parked, or 'TICKET_WAIT_TIMEOUT' after `maxWaitMs`
 */
function acquireMemoryTicket(cls, opts = {}) {
  if (!isGovernorEnabled()) return Promise.resolve(false);

  const mb = ticketMb(cls);
  // Liveness: a frozen governor with nothing in flight must still move.
  const admitNow = _canAdmit(cls, mb) || (isFrozen() && _gov.outstandingCount === 0);
  if (admitNow) return Promise.resolve(_grant(cls, mb));

  if (isFrozen()) _gov.freezes += 1;
  const signal = opts.signal;
  const maxWaitMs = opts.maxWaitMs;

  return new Promise((resolve, reject) => {
    let waitTimer = null;
    let waiter = null;

    // Drop the timeout and the abort listener, whichever exist.
    function detach() {
      if (waitTimer) {
        clearTimeout(waitTimer);
        waitTimer = null;
      }
      if (signal) {
        try { signal.removeEventListener('abort', handleAbort); } catch { /* not added */ }
      }
    }

    // Trap #1 (heavy-lane.js): leaving the queue WITHOUT being woken must
    // splice the waiter out, or a future drain wakes a dead waiter and the
    // ticket leaks permanently.
    function leave(err) {
      const pos = _gov.queue.indexOf(waiter);
      if (pos === -1) return; // already woken — the grant path owns the ticket
      _gov.queue.splice(pos, 1);
      detach();
      _gov.denied += 1;
      reject(err);
    }

    function handleAbort() {
      const err = new Error('Memory-ticket wait aborted (outer scan timeout)');
      err.code = 'ABORT_ERR';
      leave(err);
    }

    waiter = {
      cls,
      mb,
      wake(ticket) {
        detach();
        resolve(ticket);
      }
    };
    _gov.queue.push(waiter);

    if (signal) {
      if (signal.aborted) {
        handleAbort();
        return;
      }
      signal.addEventListener('abort', handleAbort, { once: true });
    }

    if (Number.isFinite(maxWaitMs) && maxWaitMs > 0) {
      // NOT unref'd: a pending admission is active work (an extracted package
      // on tmp disk waiting to scan) — it must keep the process alive.
      waitTimer = setTimeout(() => {
        const err = new Error(`Memory ticket (${cls}, ${mb}MB) not acquired within ${maxWaitMs}ms`);
        err.code = 'TICKET_WAIT_TIMEOUT';
        leave(err);
      }, maxWaitMs);
    }
  });
}
219
+
220
/**
 * Return a ticket to the governor and give parked waiters a chance to run.
 * A falsy ticket (governor disabled) or an already-released ticket is a
 * no-op, so a double release cannot under-count. All counters clamp at 0.
 */
function releaseMemoryTicket(ticket) {
  if (!ticket || ticket._released) return;
  ticket._released = true;

  const { cls, mb } = ticket;
  const clampAtZero = (value) => Math.max(0, value);

  _gov.outstandingMb = clampAtZero(_gov.outstandingMb - mb);
  _gov.outstandingCount = clampAtZero(_gov.outstandingCount - 1);
  _gov.byCls[cls] = clampAtZero(_gov.byCls[cls] - 1);
  if (cls === 'heavy') {
    _gov.heavyOutstandingMb = clampAtZero(_gov.heavyOutstandingMb - mb);
  }

  _drainWaiters();
}
229
+
230
/**
 * Feed the governor the process RSS (daemon breaker loop, every 2s). The
 * FIRST sample becomes the boot baseline the budget is computed against —
 * a frozen-at-boot debt documented in the plan: the baseline drifts up over
 * days; phase D's `workers:memory-floored` state is the visibility loop.
 *
 * Non-finite or non-positive samples are ignored. Parked waiters are drained
 * only when this sample moves the governor from frozen to unfrozen.
 *
 * @param {number} rssBytes - process RSS in bytes
 */
function updateGovernorRss(rssBytes) {
  const usable = Number.isFinite(rssBytes) && rssBytes > 0;
  if (!usable) return;

  if (!_gov.baselineRssBytes) {
    _gov.baselineRssBytes = rssBytes;
  }

  const frozenBefore = isFrozen();
  _gov.lastRssBytes = rssBytes;
  if (!frozenBefore) return;
  if (!isFrozen()) _drainWaiters();
}
243
+
244
/**
 * Read-only snapshot of the governor for logs and tests.
 *
 * `byCls` is a copy, so callers cannot mutate module state through it.
 * `denied` counts waiters that left the queue by abort or wait-timeout; it
 * was tracked and reset but previously missing from the snapshot.
 *
 * @returns {{enabled: boolean, frozen: boolean, outstandingMb: number,
 *   outstandingCount: number, byCls: Object, heavyOutstandingMb: number,
 *   waiting: number, budgetMb: number, baselineRssMb: number,
 *   lastRssMb: number, granted: number, freezes: number, denied: number}}
 */
function getGovernorState() {
  return {
    enabled: isGovernorEnabled(),
    frozen: isFrozen(),
    outstandingMb: _gov.outstandingMb,
    outstandingCount: _gov.outstandingCount,
    byCls: { ..._gov.byCls },
    heavyOutstandingMb: _gov.heavyOutstandingMb,
    waiting: _gov.queue.length,
    budgetMb: _budgetMb(),
    baselineRssMb: Math.round(_gov.baselineRssBytes / 1024 / 1024),
    lastRssMb: Math.round(_gov.lastRssBytes / 1024 / 1024),
    granted: _gov.granted,
    freezes: _gov.freezes,
    denied: _gov.denied
  };
}
260
+
261
/**
 * Test helper — same role as resetHeavyLane. Wakes every parked waiter with
 * a ticket so tests never hang, then zeroes all accounting, RSS samples and
 * counters. The queue array itself is emptied in place.
 */
function resetGovernor() {
  // Empty the queue first, then wake in FIFO order.
  const parked = _gov.queue.splice(0, _gov.queue.length);
  for (const waiter of parked) {
    waiter.wake(_grant(waiter.cls, waiter.mb));
  }

  // The grants above bumped the accounting; wipe everything afterwards.
  Object.assign(_gov, {
    outstandingMb: 0,
    outstandingCount: 0,
    byCls: { light: 0, medium: 0, heavy: 0 },
    heavyOutstandingMb: 0,
    lastRssBytes: 0,
    baselineRssBytes: 0,
    freezes: 0,
    granted: 0,
    denied: 0
  });
}
281
+
282
+ module.exports = {
283
+ isGovernorEnabled,
284
+ classifyWeight,
285
+ acquireMemoryTicket,
286
+ releaseMemoryTicket,
287
+ updateGovernorRss,
288
+ isFrozen,
289
+ getGovernorState,
290
+ resetGovernor,
291
+ ticketMb
292
+ };