muaddib-scanner 2.11.104 → 2.11.109
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/{self-scan-v2.11.104.json → self-scan-v2.11.109.json} +1 -1
- package/src/ioc/updater.js +24 -1
- package/src/monitor/daemon.js +88 -4
- package/src/monitor/degradation.js +182 -0
- package/src/monitor/memory-governor.js +292 -0
- package/src/monitor/queue.js +187 -12
- package/src/monitor/scan-queue.js +1 -1
- package/src/monitor/spill.js +14 -1
- package/src/monitor/state.js +6 -2
- package/src/monitor/webhook.js +11 -0
- package/src/pipeline/scan-worker.js +24 -13
- package/src/pipeline/watermark.js +53 -0
- package/src/scanner/npm-registry.js +14 -1
- package/src/scanner/pypi-registry.js +2 -2
- package/src/shared/download.js +3 -2
- package/src/shared/http-limiter.js +376 -152
package/package.json
CHANGED
package/src/ioc/updater.js
CHANGED
|
@@ -296,7 +296,12 @@ function loadCachedIOCs() {
|
|
|
296
296
|
try {
|
|
297
297
|
const leanIOCs = JSON.parse(fs.readFileSync(LOCAL_LEAN_FILE, 'utf8'));
|
|
298
298
|
mergeIOCs(merged, leanIOCs);
|
|
299
|
+
_leanParseFailedAt = 0; // healthy again
|
|
299
300
|
} catch (e) {
|
|
301
|
+
// Phase D: a corrupted lean does NOT fall back to the full file — the
|
|
302
|
+
// scan continues with FEWER IOCs. Flag it so the degradation registry
|
|
303
|
+
// can alarm (ioc:lean-parse-failed) instead of one buried WARN line.
|
|
304
|
+
_leanParseFailedAt = Date.now();
|
|
300
305
|
console.log('[WARN] Failed to load lean IOC database (iocs-lean.json): ' + e.message);
|
|
301
306
|
}
|
|
302
307
|
} else if (fs.existsSync(LOCAL_IOC_FILE)) {
|
|
@@ -511,6 +516,24 @@ function createLeanIOCs(fullIOCs) {
|
|
|
511
516
|
};
|
|
512
517
|
}
|
|
513
518
|
|
|
519
|
+
// Phase D (degradation registry): main-thread observable status of the lean
|
|
520
|
+
// projection. `missing`/`stale` re-arm the full-223MB-per-worker fallback;
|
|
521
|
+
// `parseFailed` means scans run with FEWER IOCs. Read by the daemon's
|
|
522
|
+
// degradation tick (cheap: two statSync).
|
|
523
|
+
let _leanParseFailedAt = 0;
|
|
524
|
+
/**
 * Report the observable status of the lean IOC projection.
 *
 * - `missing`: the full IOC file exists but the lean file does not.
 * - `stale`: both files exist and the lean file's mtime is older than the
 *   full file's mtime (a failing stat is reported as not stale).
 * - `parseFailed` / `parseFailedAt`: derived from the module-level
 *   `_leanParseFailedAt` marker (0 means "no failure recorded").
 *
 * Best-effort: any filesystem error leaves the corresponding flag `false`.
 *
 * @returns {{missing: boolean, stale: boolean, parseFailed: boolean, parseFailedAt: (number|null)}}
 */
function getLeanStatus() {
  const flags = { missing: false, stale: false };
  try {
    const hasFull = fs.existsSync(LOCAL_IOC_FILE);
    const hasLean = fs.existsSync(LOCAL_LEAN_FILE);
    if (hasFull) {
      if (!hasLean) {
        flags.missing = true;
      } else {
        try {
          const leanMtimeMs = fs.statSync(LOCAL_LEAN_FILE).mtimeMs;
          const fullMtimeMs = fs.statSync(LOCAL_IOC_FILE).mtimeMs;
          flags.stale = leanMtimeMs < fullMtimeMs;
        } catch {
          flags.stale = false;
        }
      }
    }
  } catch {
    /* status is best-effort */
  }
  return {
    missing: flags.missing,
    stale: flags.stale,
    parseFailed: _leanParseFailedAt > 0,
    parseFailedAt: _leanParseFailedAt || null
  };
}
|
|
536
|
+
|
|
514
537
|
// Ensure LOCAL_LEAN_FILE exists and is at least as fresh as LOCAL_IOC_FILE.
|
|
515
538
|
// Reads the 223MB full ONCE (the ~450MB parse peak) — acceptable only in a
|
|
516
539
|
// long-lived process (daemon boot); NEVER call from a one-shot scan worker.
|
|
@@ -773,4 +796,4 @@ function verifyIOCHMAC(data, hmac) {
|
|
|
773
796
|
}
|
|
774
797
|
}
|
|
775
798
|
|
|
776
|
-
module.exports = { updateIOCs, loadCachedIOCs, invalidateCache, generateCompactIOCs, expandCompactIOCs, createLeanIOCs, ensureLeanIOCFile, writeLeanIOCFile, LOCAL_LEAN_FILE, LOCAL_IOC_FILE, mergeIOCs, createOptimizedIOCs, generateIOCHMAC, verifyIOCHMAC, checkIOCStaleness, NEVER_WILDCARD, NEVER_WILDCARD_PYPI };
|
|
799
|
+
module.exports = { updateIOCs, loadCachedIOCs, invalidateCache, generateCompactIOCs, expandCompactIOCs, createLeanIOCs, ensureLeanIOCFile, writeLeanIOCFile, getLeanStatus, LOCAL_LEAN_FILE, LOCAL_IOC_FILE, mergeIOCs, createOptimizedIOCs, generateIOCHMAC, verifyIOCHMAC, checkIOCStaleness, NEVER_WILDCARD, NEVER_WILDCARD_PYPI };
|
package/src/monitor/daemon.js
CHANGED
|
@@ -10,7 +10,7 @@ const { loadState, saveState, loadDailyStats, saveDailyStats, purgeTarballCache,
|
|
|
10
10
|
const { isTemporalEnabled, isTemporalAstEnabled, isTemporalPublishEnabled, isTemporalMaintainerEnabled } = require('./temporal.js');
|
|
11
11
|
const { pendingGrouped, flushScopeGroup, sendDailyReport, redeliverPendingReportOnBoot, alertedPackageRules, ALERTED_PACKAGES_MAX: MAX_ALERTED_PACKAGES } = require('./webhook.js');
|
|
12
12
|
const { poll, getPollBackoffMs } = require('./ingestion.js');
|
|
13
|
-
const { ensureWorkers, drainWorkers, getTargetConcurrency, setTargetConcurrency, getActiveWorkers, terminateAllWorkers } = require('./queue.js');
|
|
13
|
+
const { ensureWorkers, drainWorkers, getTargetConcurrency, setTargetConcurrency, getActiveWorkers, terminateAllWorkers, getInFlightItems, computeInterruptDisposition } = require('./queue.js');
|
|
14
14
|
const { computeTarget, ADJUST_INTERVAL_MS, BASE_CONCURRENCY } = require('./adaptive-concurrency.js');
|
|
15
15
|
const { startHealthcheck } = require('./healthcheck.js');
|
|
16
16
|
const { startDeferredWorker, stopDeferredWorker, persistDeferredQueue, restoreDeferredQueue, clearDeferredQueue } = require('./deferred-sandbox.js');
|
|
@@ -34,6 +34,14 @@ const PROCESS_LOOP_INTERVAL = 2_000; // Queue check interval when empty
|
|
|
34
34
|
// 12 calm hours/day do the catch-up of burst-time evictions. Rate-limited to one
|
|
35
35
|
// batch per interval (the main loop ticks every 2s — unthrottled it would re-spike
|
|
36
36
|
// the queue in seconds). All env-tunable for the staged rollout.
|
|
37
|
+
// C7: how long the shutdown waits for in-flight scans before spilling them.
|
|
38
|
+
// Must stay well under systemd TimeoutStopSec (default 90s) so the ledger,
|
|
39
|
+
// spill and queue persist ALWAYS run before any SIGKILL.
|
|
40
|
+
const SHUTDOWN_DRAIN_MAX_MS = (() => {
  // Env override (MUADDIB_SHUTDOWN_DRAIN_MAX_MS, milliseconds); anything that
  // does not parse to a strictly positive finite integer falls back to 20s.
  const fallbackMs = 20_000;
  const parsedMs = parseInt(process.env.MUADDIB_SHUTDOWN_DRAIN_MAX_MS, 10);
  if (!Number.isFinite(parsedMs) || parsedMs <= 0) return fallbackMs;
  return parsedMs;
})();
|
|
44
|
+
|
|
37
45
|
const SPILL_DRAIN_THRESHOLD = (() => {
|
|
38
46
|
const v = parseInt(process.env.MUADDIB_SPILL_DRAIN_THRESHOLD, 10);
|
|
39
47
|
return Number.isFinite(v) && v > 0 ? v : 500;
|
|
@@ -666,6 +674,23 @@ function reportStats(stats) {
|
|
|
666
674
|
if (stats.changesStreamPackages) {
|
|
667
675
|
console.log(`[MONITOR] Changes stream packages: ${stats.changesStreamPackages}`);
|
|
668
676
|
}
|
|
677
|
+
// Phase D: active degradations in the hourly Stability block.
|
|
678
|
+
try {
|
|
679
|
+
const active = require('./degradation.js').getActiveDegradations();
|
|
680
|
+
if (active.length > 0) console.warn(`[MONITOR] Degradations: ${active.join(', ')}`);
|
|
681
|
+
} catch { /* observability only */ }
|
|
682
|
+
// Network-brain state (governors phase A): one line per host that has seen
|
|
683
|
+
// any backoff — the observation signal for the A deployment gate (AIMD
|
|
684
|
+
// de-escalations visible, no sustained max-level) and phase D's input.
|
|
685
|
+
try {
|
|
686
|
+
const { getBrainState } = require('../shared/http-limiter.js');
|
|
687
|
+
const brain = getBrainState();
|
|
688
|
+
const noisy = Object.entries(brain).filter(([, s]) => s.backoffCount > 0 || s.level > 0 || s.pendingWaiters > 0);
|
|
689
|
+
if (noisy.length > 0) {
|
|
690
|
+
const line = noisy.map(([h, s]) => `${h}: level=${s.level} pause=${s.pauseRemainingMs}ms 429s=${s.backoffCount} waiters=${s.pendingWaiters}`).join(' | ');
|
|
691
|
+
console.log(`[MONITOR] Brain: ${line}`);
|
|
692
|
+
}
|
|
693
|
+
} catch { /* observability only */ }
|
|
669
694
|
if (stats.rssFallbackCount) {
|
|
670
695
|
console.log(`[MONITOR] RSS fallback activations: ${stats.rssFallbackCount}`);
|
|
671
696
|
}
|
|
@@ -960,9 +985,39 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
960
985
|
clearInterval(concurrencyAdjustHandle);
|
|
961
986
|
concurrencyAdjustHandle = null;
|
|
962
987
|
}
|
|
963
|
-
//
|
|
964
|
-
|
|
965
|
-
|
|
988
|
+
// Bounded drain (phase C, C7). The old unbounded `await drainWorkers()`
|
|
989
|
+
// could outlive systemd's TimeoutStopSec (scans run up to 300s): SIGKILL
|
|
990
|
+
// then landed MID-drain, persistQueue never ran, and every in-flight scan
|
|
991
|
+
// plus up to 60s of queue mutations were lost UNLEDGERED on each manual
|
|
992
|
+
// restart — the exact deployment mode of this program. Drain for up to
|
|
993
|
+
// SHUTDOWN_DRAIN_MAX_MS, then spill the survivors (protected, bounded
|
|
994
|
+
// retries) so the next boot re-scans them.
|
|
995
|
+
console.log(`[MONITOR] Draining ${getActiveWorkers()} active worker(s) (bounded ${SHUTDOWN_DRAIN_MAX_MS / 1000}s)...`);
|
|
996
|
+
await Promise.race([
|
|
997
|
+
drainWorkers(),
|
|
998
|
+
new Promise(resolve => setTimeout(resolve, SHUTDOWN_DRAIN_MAX_MS).unref())
|
|
999
|
+
]);
|
|
1000
|
+
try {
|
|
1001
|
+
const leftovers = getInFlightItems();
|
|
1002
|
+
if (leftovers.length > 0) {
|
|
1003
|
+
const { isSpillEnabled: spillOn, spillItems } = require('./spill.js');
|
|
1004
|
+
const { appendScanLedger } = require('./state.js');
|
|
1005
|
+
let spilledN = 0;
|
|
1006
|
+
for (const it of leftovers) {
|
|
1007
|
+
const { retries, giveUp } = computeInterruptDisposition(it);
|
|
1008
|
+
recentlyScanned.delete(`${it.ecosystem}/${it.name}@${it.version}`);
|
|
1009
|
+
if (giveUp) {
|
|
1010
|
+
appendScanLedger({ name: it.name, version: it.version, ecosystem: it.ecosystem, outcome: 'dropped', source: 'interrupted_max' });
|
|
1011
|
+
continue;
|
|
1012
|
+
}
|
|
1013
|
+
appendScanLedger({ name: it.name, version: it.version, ecosystem: it.ecosystem, outcome: 'interrupted', source: 'shutdown_drain' });
|
|
1014
|
+
if (spillOn() && spillItems([{ ...it, interrupted: true, interruptRetries: retries }]) === 1) spilledN++;
|
|
1015
|
+
}
|
|
1016
|
+
console.log(`[MONITOR] Shutdown: ${leftovers.length} in-flight scan(s) did not finish in time — ${spilledN} spilled for re-scan, all ledgered`);
|
|
1017
|
+
}
|
|
1018
|
+
} catch (e) {
|
|
1019
|
+
console.error(`[MONITOR] Shutdown in-flight spill failed: ${e.message}`);
|
|
1020
|
+
}
|
|
966
1021
|
// Persist remaining queue items so they survive the restart
|
|
967
1022
|
persistQueue(scanQueue, state);
|
|
968
1023
|
saveRecentlyScanned(recentlyScanned); // Persist dedup set too (avoid re-scan storm on restart)
|
|
@@ -1043,6 +1098,7 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
1043
1098
|
// This ensures new packages are ingested even while a large batch is being scanned.
|
|
1044
1099
|
// Backpressure: poll() skips when queue >= 30K or memory pressure >= CRITICAL (90%).
|
|
1045
1100
|
// Adaptive concurrency adjusts scan throughput to match ingestion rate.
|
|
1101
|
+
let _lastTemporalShedCount = 0; // phase D: temporal-shed delta tracking
|
|
1046
1102
|
let pollInProgress = false;
|
|
1047
1103
|
let pollStartedAt = 0;
|
|
1048
1104
|
let backoffUntil = 0;
|
|
@@ -1110,6 +1166,34 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
1110
1166
|
// every 5min is too slow (250 packages ingested between checks).
|
|
1111
1167
|
const { level: pressureLevel, mem: currentMem, ratio: heapRatio, rssRatio } = computeMemoryPressure();
|
|
1112
1168
|
|
|
1169
|
+
// Phase B (memory governor): feed the admission gate the REAL process RSS
|
|
1170
|
+
// from this same 2s breaker loop — the governor's freeze keys on it (the
|
|
1171
|
+
// worker-mem disk samples are 10s-cadence and starve during sync parses).
|
|
1172
|
+
try { require('./memory-governor.js').updateGovernorRss(currentMem.rss); } catch { /* governor optional */ }
|
|
1173
|
+
|
|
1174
|
+
// Phase D (degradation registry): evaluate the raw degradation signals.
|
|
1175
|
+
// Cheap (two statSync + counters); alarms only fire on sustained
|
|
1176
|
+
// transitions inside tickDegradation. Fire-and-forget — the registry must
|
|
1177
|
+
// never block the breaker loop.
|
|
1178
|
+
try {
|
|
1179
|
+
const { getLeanStatus } = require('../ioc/updater.js');
|
|
1180
|
+
const { getBrainState } = require('../shared/http-limiter.js');
|
|
1181
|
+
const { isFrozen: govFrozen, isGovernorEnabled: govEnabled } = require('./memory-governor.js');
|
|
1182
|
+
const lean = getLeanStatus();
|
|
1183
|
+
const brain = getBrainState();
|
|
1184
|
+
const shedNow = stats.temporalLoadShed || 0;
|
|
1185
|
+
const shedActive = shedNow > _lastTemporalShedCount;
|
|
1186
|
+
_lastTemporalShedCount = shedNow;
|
|
1187
|
+
const signals = {
|
|
1188
|
+
'ioc:full-fallback': lean.missing || lean.stale,
|
|
1189
|
+
'ioc:lean-parse-failed': lean.parseFailed,
|
|
1190
|
+
'registry:max-backoff': Object.values(brain).some(b => b.atMaxBackoff),
|
|
1191
|
+
'temporal:shed': shedActive,
|
|
1192
|
+
'workers:memory-floored': govEnabled() && govFrozen()
|
|
1193
|
+
};
|
|
1194
|
+
require('./degradation.js').tickDegradation(signals).catch(() => { /* best-effort */ });
|
|
1195
|
+
} catch { /* observability only */ }
|
|
1196
|
+
|
|
1113
1197
|
// Top up workers ONLY when memory pressure is below HIGH.
|
|
1114
1198
|
// At HIGH+, existing workers continue (they'll finish or timeout) but no new
|
|
1115
1199
|
// ones are spawned. This is the core mechanism: let running scans release their
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Degradation registry (governors program, phase D) — every degraded mode of
|
|
5
|
+
* the monitor becomes a NAMED state: alarmed once on entry (webhook), once on
|
|
6
|
+
* recovery, visible in the hourly Stability log and the daily report.
|
|
7
|
+
*
|
|
8
|
+
* Why: the failure mode of 2026-06-12 was not crashing — it was degrading
|
|
9
|
+
* SILENTLY. The lean-IOC fallback re-arms the per-worker RSS bomb with a
|
|
10
|
+
* single [WARN] line; a corrupted lean continues with FEWER IOCs (coverage
|
|
11
|
+
* loss, no fallback at all); temporal analysis sheds itself off above queue
|
|
12
|
+
* 2000 and stays off for hours; the registry sat at max backoff for an
|
|
13
|
+
* afternoon. None of these pages anyone unless someone greps the journal.
|
|
14
|
+
*
|
|
15
|
+
* Modeled on feed-health.js (pure decision core + persisted state file +
|
|
16
|
+
* edge-triggered webhook with recovery), EXTENDED with sustain durations:
|
|
17
|
+
* `evaluateDegradation(signals, prev, now)` tracks per-state activeSince so
|
|
18
|
+
* flapping signals (queue oscillating around the temporal shed threshold)
|
|
19
|
+
* never alarm, and only conditions sustained past their threshold do.
|
|
20
|
+
* Re-entry within REALARM_COOLDOWN_MS of the last alarm stays silent (the
|
|
21
|
+
* state still shows active everywhere) — the Discord webhook is shared with
|
|
22
|
+
* detection alerts and must never be flooded.
|
|
23
|
+
*
|
|
24
|
+
* The registry also feeds one defensive coupling: ensureWorkers caps the pool
|
|
25
|
+
* at FALLBACK_WORKER_CAP while `ioc:full-fallback` is active — each worker in
|
|
26
|
+
* fallback parses the full 223MB iocs.json (~450MB peak), and 12 of them is
|
|
27
|
+
* exactly the RSS bomb the lean projection removed.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
const fs = require('fs');
|
|
31
|
+
const path = require('path');
|
|
32
|
+
|
|
33
|
+
const STATE_FILE = process.env.MUADDIB_DEGRADATION_FILE
|
|
34
|
+
|| path.join(__dirname, '..', '..', 'data', 'degradation.json');
|
|
35
|
+
|
|
36
|
+
// One alarm per entry; re-entry within the cooldown stays silent.
|
|
37
|
+
const REALARM_COOLDOWN_MS = (() => {
  // Env override (MUADDIB_DEGRADATION_COOLDOWN_MS, milliseconds). Zero is a
  // valid value; negative or unparseable input falls back to 6 hours.
  const sixHoursMs = 6 * 3600_000;
  const parsedMs = parseInt(process.env.MUADDIB_DEGRADATION_COOLDOWN_MS, 10);
  if (!Number.isFinite(parsedMs) || parsedMs < 0) return sixHoursMs;
  return parsedMs;
})();
|
|
41
|
+
|
|
42
|
+
// Sustain thresholds: how long a raw signal must hold before it IS a
|
|
43
|
+
// degradation. Instant for the IOC states (each affected worker spawn costs
|
|
44
|
+
// ~450MB / loses coverage immediately); duration-gated for the flappy ones.
|
|
45
|
+
const STATE_DEFS = {
  // Instant (sustainMs 0): the IOC states.
  'ioc:full-fallback': {
    level: 'RED',
    sustainMs: 0,
    desc: 'iocs-lean.json missing/stale — every worker spawn parses the FULL 223MB iocs.json (~450MB peak each)'
  },
  'ioc:lean-parse-failed': {
    level: 'RED',
    sustainMs: 0,
    desc: 'iocs-lean.json unparseable — scans run with FEWER IOCs (silent coverage loss, no fallback)'
  },
  // Duration-gated: the raw signal must hold for sustainMs before it counts.
  'registry:max-backoff': {
    level: 'RED',
    sustainMs: 900_000, // 15 minutes
    desc: 'a registry host has been at maximum backoff pause — enrichment fetches are starving'
  },
  'temporal:shed': {
    level: 'YELLOW',
    sustainMs: 1_800_000, // 30 minutes
    desc: 'temporal analysis has been load-shedding continuously (queue above the shed threshold)'
  },
  'workers:memory-floored': {
    level: 'YELLOW',
    sustainMs: 600_000, // 10 minutes
    desc: 'the memory governor has been freezing admissions — budget likely outgrown by baseline drift'
  }
};
|
|
52
|
+
|
|
53
|
+
/**
 * Pure decision core (exported for tests — no I/O, no Date.now()).
 *
 * For every state in `defs` the raw signal is folded into the previous
 * per-state record:
 *  - raw signal true: `activeSince` is kept (or set to `now` on first sight);
 *    once `now - activeSince >= sustainMs` the state is reported in `active`.
 *    The first sustained tick of an episode emits an `enter` transition,
 *    unless a previous alarm happened less than `cooldownMs` ago — then the
 *    episode is entered silently (still active, still recoverable).
 *  - raw signal false: an episode that had been entered emits `recover`;
 *    only `lastAlarmAt` survives, as the re-entry cooldown anchor.
 *
 * Stored timestamps count as "set" when they are finite numbers, INCLUDING 0,
 * so the function behaves the same for any `now` a test chooses to pass.
 *
 * @param {Object<string,boolean>} signals - raw signal per state name, this tick
 * @param {{states: Object}} prev - previous registry state (null/undefined is treated as empty)
 * @param {number} now - timestamp ms
 * @param {Object<string,{level: string, sustainMs: number}>} [defs=STATE_DEFS] - state definitions to evaluate
 * @param {number} [cooldownMs=REALARM_COOLDOWN_MS] - minimum gap between two `enter` alarms of one state
 * @returns {{transitions: Array<{name, kind:'enter'|'recover', level, sinceMs}>,
 *            active: string[], nextState: {states: Object}}}
 */
function evaluateDegradation(signals, prev, now, defs = STATE_DEFS, cooldownMs = REALARM_COOLDOWN_MS) {
  const prevStates = (prev && prev.states) || {};
  const nextStates = {};
  const transitions = [];
  const active = [];
  // Truthiness would misread a legitimate timestamp of 0 as "never happened".
  const isSet = (ts) => Number.isFinite(ts);

  for (const [name, def] of Object.entries(defs)) {
    const raw = !!signals[name];
    const p = prevStates[name] || {};
    if (raw) {
      const activeSince = isSet(p.activeSince) ? p.activeSince : now;
      const sustained = now - activeSince >= def.sustainMs;
      let alarmedAt = isSet(p.alarmedAt) ? p.alarmedAt : undefined;
      let lastAlarmAt = isSet(p.lastAlarmAt) ? p.lastAlarmAt : undefined;
      if (sustained && alarmedAt === undefined) {
        // Cooldown only gates RE-entries (a previous alarm exists): a virgin
        // state must alarm immediately regardless of how small `now` is.
        if (lastAlarmAt === undefined || now - lastAlarmAt >= cooldownMs) {
          transitions.push({ name, kind: 'enter', level: def.level, sinceMs: now - activeSince });
          lastAlarmAt = now;
        }
        // Alarmed (or silently re-entered under cooldown): either way the
        // state is ACTIVE and a future recovery must be emitted.
        alarmedAt = now;
      }
      if (sustained) active.push(name);
      nextStates[name] = { activeSince, alarmedAt, lastAlarmAt };
    } else {
      if (isSet(p.alarmedAt)) {
        transitions.push({
          name,
          kind: 'recover',
          level: def.level,
          sinceMs: isSet(p.activeSince) ? now - p.activeSince : 0
        });
      }
      // Keep lastAlarmAt across the recovery: it is the re-entry cooldown anchor.
      nextStates[name] = isSet(p.lastAlarmAt) ? { lastAlarmAt: p.lastAlarmAt } : {};
    }
  }
  return { transitions, active, nextState: { states: nextStates } };
}
|
|
99
|
+
|
|
100
|
+
// ─── Module state (daemon tick) ───
|
|
101
|
+
|
|
102
|
+
let _state = null;
|
|
103
|
+
let _active = new Set();
|
|
104
|
+
|
|
105
|
+
/**
 * Return the in-memory registry state, reading STATE_FILE on first use.
 * A missing or unparseable file yields an empty registry (`{ states: {} }`).
 */
function _loadState() {
  if (!_state) {
    try {
      const serialized = fs.readFileSync(STATE_FILE, 'utf8');
      _state = JSON.parse(serialized);
    } catch {
      _state = { states: {} };
    }
  }
  return _state;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Persist the in-memory registry state: write a sibling `.tmp` file, then
 * rename it over STATE_FILE. Failures are swallowed on purpose — this is
 * observability state, not a source of truth.
 */
function _saveState() {
  try {
    const tmpPath = `${STATE_FILE}.tmp`;
    const serialized = JSON.stringify(_state);
    fs.writeFileSync(tmpPath, serialized);
    fs.renameSync(tmpPath, STATE_FILE);
  } catch {
    /* observability state is best-effort */
  }
}
|
|
118
|
+
|
|
119
|
+
/**
 * Daemon tick: evaluate raw signals, persist, dispatch alarms for transitions.
 * Cheap (a few statSync upstream + pure math here) — runs from the 2s loop.
 *
 * The state file is only rewritten when the registry state actually changed
 * (a transition, or a timestamp such as `activeSince` being set/cleared), so
 * steady-state ticks do no disk I/O.
 *
 * @param {Object<string,boolean>} signals - raw signal per state name, this tick
 * @param {number} [now=Date.now()] - timestamp ms (injectable for tests)
 * @param {function(Object): Promise} [dispatch=_defaultDispatch] - alarm sink;
 *   injectable for tests, defaults to the shared webhook. A rejected dispatch
 *   is ignored: the console line is the record.
 * @returns {Promise<{transitions: Array, active: string[]}>}
 */
async function tickDegradation(signals, now = Date.now(), dispatch = _defaultDispatch) {
  const prev = _loadState();
  const { transitions, active, nextState } = evaluateDegradation(signals, prev, now);

  // evaluateDegradation emits keys in a stable order, so an unchanged state
  // serializes identically to the previous one. NOTE: because _saveState is
  // best-effort and silent, a failed write is only retried on the next change.
  const stateChanged = transitions.length > 0
    || JSON.stringify(prev) !== JSON.stringify(nextState);

  // Swap the module state synchronously, BEFORE any await below, so a tick
  // that starts while a webhook is still in flight sees the updated registry.
  _state = nextState;
  _active = new Set(active);
  if (stateChanged) _saveState();

  for (const t of transitions) {
    const def = STATE_DEFS[t.name] || {};
    const enter = t.kind === 'enter';
    console[enter ? 'warn' : 'log'](`[DEGRADATION] ${enter ? 'ENTER' : 'RECOVER'} ${t.level} ${t.name}${enter ? '' : ` (was active ${Math.round(t.sinceMs / 60000)}min)`}`);
    try {
      await dispatch({
        embeds: [{
          title: enter ? `🔻 DEGRADED: ${t.name}` : `✅ RECOVERED: ${t.name}`,
          color: enter ? (t.level === 'RED' ? 0xe74c3c : 0xf39c12) : 0x2ecc71,
          description: enter ? (def.desc || t.name) : `Degradation cleared after ${Math.round(t.sinceMs / 60000)} min.`,
          footer: { text: "MUAD'DIB degradation registry" },
          timestamp: new Date(now).toISOString()
        }]
      });
    } catch { /* alarm is best-effort — the journal line above is the record */ }
  }
  return { transitions, active };
}
|
|
152
|
+
|
|
153
|
+
/**
 * Default alarm sink: post `payload` as a raw payload to the webhook named by
 * MUADDIB_WEBHOOK_URL. Resolves without doing anything when the variable is
 * unset or empty; the webhook module is only loaded when a URL is present.
 *
 * @param {Object} payload - webhook body, forwarded untouched
 * @returns {Promise<void>}
 */
async function _defaultDispatch(payload) {
  const webhookUrl = process.env.MUADDIB_WEBHOOK_URL;
  if (webhookUrl) {
    const { sendWebhook } = require('../webhook.js');
    await sendWebhook(webhookUrl, payload, { rawPayload: true });
  }
}
|
|
159
|
+
|
|
160
|
+
/**
 * Synchronous snapshot of the currently active degradation names, as computed
 * by the most recent tick. Returns a fresh array on every call.
 *
 * @returns {string[]}
 */
function getActiveDegradations() {
  return [..._active];
}
|
|
164
|
+
|
|
165
|
+
/**
 * Whether the named degradation was active as of the most recent tick.
 *
 * @param {string} name - state name (a key of STATE_DEFS)
 * @returns {boolean}
 */
function isDegraded(name) {
  const currentlyActive = _active.has(name);
  return currentlyActive;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Test helper: drop the in-memory registry (state and active set) back to
 * empty. Does not touch the state file on disk.
 */
function resetDegradation() {
  _active = new Set();
  _state = { states: {} };
}
|
|
174
|
+
|
|
175
|
+
module.exports = {
|
|
176
|
+
STATE_DEFS,
|
|
177
|
+
evaluateDegradation,
|
|
178
|
+
tickDegradation,
|
|
179
|
+
getActiveDegradations,
|
|
180
|
+
isDegraded,
|
|
181
|
+
resetDegradation
|
|
182
|
+
};
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Memory governor (governors program, phase B) — global admission control for
|
|
5
|
+
* scan memory, by ticket.
|
|
6
|
+
*
|
|
7
|
+
* Why: every prior guard bounded a LOCAL variable while the resource is
|
|
8
|
+
* global. The per-worker watermark watches one isolate; the heavy-lane
|
|
9
|
+
* serializes packages that are individually heavy. The 2026-06-12 09:43
|
|
10
|
+
* EMERGENCY (RSS 96%, main heap 4%) was neither: 12 workers × ~650MB of
|
|
11
|
+
* sub-threshold scans (an ATO burst of SDK packages) — no individual
|
|
12
|
+
* threshold crossed, the AGGREGATE blew the breaker. The governor bounds the
|
|
13
|
+
* aggregate at ADMISSION: each scan pays a ticket sized by its weight class
|
|
14
|
+
* before the worker spawns; Σ outstanding tickets ≤ budget; and admissions
|
|
15
|
+
* freeze on REAL process RSS (sampled by the daemon's 2s breaker loop — NOT
|
|
16
|
+
* worker-mem samples, which land on disk every 10s and starve during the
|
|
17
|
+
* synchronous parses that matter).
|
|
18
|
+
*
|
|
19
|
+
* Ticket classes are FIXED (env-overridable), never learned from observed
|
|
20
|
+
* usage: auto-calibrated weights would be shapeable by an attacker crafting
|
|
21
|
+
* packages, and config-debt review (2026-06-11) bans measurement→threshold
|
|
22
|
+
* feedback loops.
|
|
23
|
+
*
|
|
24
|
+
* Invariants:
|
|
25
|
+
* - Carve-out: heavies can consume at most (budget − LIGHT_CARVEOUT); a
|
|
26
|
+
* light is never blocked by heavy consumption — preserves the heavy-lane
|
|
27
|
+
* "lights are NEVER blocked" guarantee, and an attacker publishing heavy
|
|
28
|
+
* packages cannot starve everyone else's scans.
|
|
29
|
+
* - Liveness: when frozen with ZERO outstanding tickets, one scan is
|
|
30
|
+
* admitted anyway (a stuck-high RSS reading must never deadlock the
|
|
31
|
+
* queue; the EMERGENCY breaker remains the backstop).
|
|
32
|
+
* - The governor is OFF unless MUADDIB_MEMORY_GOVERNOR=1 — acquire resolves
|
|
33
|
+
* `false` (nothing to release) and the legacy heavy-lane path applies.
|
|
34
|
+
*
|
|
35
|
+
* Same waiter contract as heavy-lane.js (abort-aware, wait-timeout, and the
|
|
36
|
+
* "trap #1": any waiter leaving the queue without being woken MUST splice
|
|
37
|
+
* itself out, or a release hands its grant to a dead waiter and leaks it).
|
|
38
|
+
* EMERGENCY purge + adaptive-concurrency + rssAdmissionCap are all kept as
|
|
39
|
+
* backstops (defense in depth) — the governor is the front gate, not a
|
|
40
|
+
* replacement for them.
|
|
41
|
+
*/
|
|
42
|
+
|
|
43
|
+
const { isHeavyScan } = require('./heavy-lane.js');
|
|
44
|
+
|
|
45
|
+
// Env knobs (read at call time so tests can flip them around resetGovernor()):
|
|
46
|
+
/**
 * The governor is opt-in: enabled only when MUADDIB_MEMORY_GOVERNOR is exactly
 * the string '1'. Read on every call rather than cached.
 *
 * @returns {boolean}
 */
function isGovernorEnabled() {
  const flag = process.env.MUADDIB_MEMORY_GOVERNOR;
  return flag === '1';
}
|
|
49
|
+
|
|
50
|
+
/**
 * Read a positive integer from the environment.
 *
 * @param {string} name - environment variable name
 * @param {number} dflt - value returned when the variable is unset, not a
 *   number, zero or negative
 * @returns {number}
 */
function _envMb(name, dflt) {
  const parsed = parseInt(process.env[name], 10);
  if (Number.isFinite(parsed) && parsed > 0) {
    return parsed;
  }
  return dflt;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Ticket size in MB for a weight class. Each class has its own env override;
 * any class other than 'heavy' or 'medium' is priced as light.
 *
 * @param {string} cls - 'heavy' | 'medium' | anything else (light)
 * @returns {number}
 */
function ticketMb(cls) {
  switch (cls) {
    case 'heavy':
      return _envMb('MUADDIB_TICKET_HEAVY_MB', 2048);
    case 'medium':
      return _envMb('MUADDIB_TICKET_MEDIUM_MB', 256);
    default:
      return _envMb('MUADDIB_TICKET_LIGHT_MB', 64);
  }
}
|
|
60
|
+
|
|
61
|
+
/**
 * Lower bound, in bytes of weighted JS, of the medium class.
 * MUADDIB_TICKET_MEDIUM_THRESHOLD_BYTES overrides it; anything that does not
 * parse to a positive integer falls back to 1 MiB. (Heavy-lane's threshold is
 * 3 MiB — the band between the two is where the "many at once" aggregate
 * risk lives.)
 *
 * @returns {number}
 */
function mediumThresholdBytes() {
  const oneMiB = 1024 * 1024;
  const parsed = parseInt(process.env.MUADDIB_TICKET_MEDIUM_THRESHOLD_BYTES, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return oneMiB;
  return parsed;
}
|
|
67
|
+
|
|
68
|
+
/**
 * MB of budget that heavy tickets may never consume
 * (MUADDIB_GOVERNOR_LIGHT_CARVEOUT_MB, default 512).
 *
 * @returns {number}
 */
function lightCarveoutMb() {
  const carveoutMb = _envMb('MUADDIB_GOVERNOR_LIGHT_CARVEOUT_MB', 512);
  return carveoutMb;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Percentage of the RSS limit above which admissions freeze
 * (MUADDIB_GOVERNOR_RSS_SOFT_PCT). Only integers strictly between 0 and 100
 * are accepted; everything else falls back to 75.
 *
 * @returns {number}
 */
function rssSoftPct() {
  const parsed = parseInt(process.env.MUADDIB_GOVERNOR_RSS_SOFT_PCT, 10);
  const inRange = Number.isFinite(parsed) && parsed > 0 && parsed < 100;
  if (inRange) return parsed;
  return 75;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Process RSS limit in MB (MUADDIB_RSS_LIMIT_MB); anything that does not
 * parse to a positive integer falls back to 8500.
 *
 * @returns {number}
 */
function rssLimitMb() {
  const parsed = parseInt(process.env.MUADDIB_RSS_LIMIT_MB, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return 8500;
  return parsed;
}
|
|
81
|
+
|
|
82
|
+
// ─── State ───
|
|
83
|
+
|
|
84
|
+
const _gov = {
|
|
85
|
+
outstandingMb: 0,
|
|
86
|
+
outstandingCount: 0,
|
|
87
|
+
byCls: { light: 0, medium: 0, heavy: 0 },
|
|
88
|
+
heavyOutstandingMb: 0,
|
|
89
|
+
queue: [], // FIFO of {cls, mb, grant(ticket), bail-able — see acquire}
|
|
90
|
+
lastRssBytes: 0,
|
|
91
|
+
baselineRssBytes: 0,
|
|
92
|
+
freezes: 0,
|
|
93
|
+
granted: 0,
|
|
94
|
+
denied: 0
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
/**
 * Pure classifier (exported for tests).
 *
 *  - heavy: `jsWeight` is truthy and `isHeavyScan(jsWeight)` says so;
 *  - medium: the effective size — `weightedJsBytes` when it is a finite
 *    number, else `totalJsBytes` (0 when absent) — reaches
 *    `mediumThresholdBytes()`;
 *  - light: everything else, including a null/undefined `jsWeight`.
 *
 * @param {?{weightedJsBytes: number, totalJsBytes: number}} jsWeight
 * @returns {{cls: 'heavy'|'medium'|'light', mb: number}} class and ticket size
 */
function classifyWeight(jsWeight) {
  let cls = 'light';
  if (jsWeight && isHeavyScan(jsWeight)) {
    cls = 'heavy';
  } else {
    let effectiveBytes = 0;
    if (jsWeight) {
      effectiveBytes = Number.isFinite(jsWeight.weightedJsBytes)
        ? jsWeight.weightedJsBytes
        : (jsWeight.totalJsBytes || 0);
    }
    if (effectiveBytes >= mediumThresholdBytes()) {
      cls = 'medium';
    }
  }
  return { cls, mb: ticketMb(cls) };
}
|
|
111
|
+
|
|
112
|
+
/**
 * Ticket budget in MB: 70% of the RSS limit minus the recorded baseline RSS,
 * floored and clamped at 0. Returns 0 while no baseline has been recorded.
 *
 * @returns {number}
 */
function _budgetMb() {
  const baselineBytes = _gov.baselineRssBytes;
  if (!baselineBytes) return 0;
  const baselineMb = baselineBytes / 1024 / 1024;
  const headroomMb = Math.floor(rssLimitMb() * 0.7 - baselineMb);
  return Math.max(0, headroomMb);
}
|
|
117
|
+
|
|
118
|
+
/**
 * Whether admissions are frozen: the governor is enabled, an RSS sample has
 * been recorded, and that sample (in MB) is strictly above the soft
 * threshold `rssLimitMb() * rssSoftPct() / 100`.
 *
 * @returns {boolean}
 */
function isFrozen() {
  if (!isGovernorEnabled() || !_gov.lastRssBytes) return false;
  const lastRssMb = _gov.lastRssBytes / 1024 / 1024;
  const softLimitMb = rssLimitMb() * (rssSoftPct() / 100);
  return lastRssMb > softLimitMb;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Admission predicate for one ticket of class `cls` and size `mb`.
 *
 * Order of checks: frozen → deny; light → admit (carve-out: lights are only
 * ever blocked by the RSS freeze); non-positive budget → admit; total
 * outstanding + mb over budget → deny; for heavies, heavy outstanding + mb
 * over (budget − light carve-out) → deny; otherwise admit.
 *
 * NOTE(review): a budget of 0 is returned both before the first baseline and
 * when the baseline already exceeds 70% of the RSS limit; in both cases this
 * function admits (fails open) and relies on the freeze / breaker backstops.
 *
 * @param {string} cls - 'light' | 'medium' | 'heavy'
 * @param {number} mb - ticket size in MB
 * @returns {boolean}
 */
function _canAdmit(cls, mb) {
  if (isFrozen()) return false;
  if (cls === 'light') return true;

  const budgetMb = _budgetMb();
  if (budgetMb <= 0) return true; // no usable budget figure — admit, breaker backstops

  const totalAfter = _gov.outstandingMb + mb;
  if (totalAfter > budgetMb) return false;

  if (cls === 'heavy') {
    const heavyCeilingMb = Math.max(0, budgetMb - lightCarveoutMb());
    const heavyAfter = _gov.heavyOutstandingMb + mb;
    if (heavyAfter > heavyCeilingMb) return false;
  }
  return true;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Account for a newly admitted ticket and return its handle. Updates the
 * outstanding totals, the per-class count, the heavy subtotal (heavies only)
 * and the lifetime `granted` counter.
 *
 * @param {string} cls - ticket class
 * @param {number} mb - ticket size in MB
 * @returns {{cls: string, mb: number, _released: boolean}}
 */
function _grant(cls, mb) {
  const ticket = { cls, mb, _released: false };
  _gov.outstandingCount += 1;
  _gov.outstandingMb += mb;
  if (cls === 'heavy') {
    _gov.heavyOutstandingMb += mb;
  }
  _gov.byCls[cls] += 1;
  _gov.granted += 1;
  return ticket;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Walk the waiter queue front to back and wake every waiter that can be
 * admitted right now.
 *
 * A waiter that cannot be admitted is skipped, not treated as a barrier, so a
 * blocked heavy/medium never parks the lights queued behind it. The head of
 * the queue is additionally admitted when the governor is frozen and nothing
 * is outstanding (liveness). An admitted waiter is removed before it is
 * woken, so the index is only advanced past waiters that stay queued.
 */
function _drainWaiters() {
  let idx = 0;
  while (idx < _gov.queue.length) {
    const waiter = _gov.queue[idx];
    const livenessGrant = isFrozen() && _gov.outstandingCount === 0 && idx === 0;
    const admit = livenessGrant || _canAdmit(waiter.cls, waiter.mb);
    if (!admit) {
      idx += 1; // still blocked: leave it queued and look at the next waiter
      continue;
    }
    _gov.queue.splice(idx, 1);
    waiter.wake(_grant(waiter.cls, waiter.mb));
  }
}
|
|
160
|
+
|
|
161
|
+
/**
 * Acquire a memory ticket for a scan of class `cls`.
 *
 * Resolves `false` when the governor is disabled (there is nothing to
 * release). Resolves a ticket immediately when the class is admittable, or
 * when the governor is frozen with nothing outstanding (liveness). Otherwise
 * the caller is queued until a release wakes it.
 *
 * A queued caller leaves the queue early in two ways, both of which splice
 * the waiter out before rejecting:
 *  - `opts.signal` aborts (or is already aborted) → rejects with
 *    err.code = 'ABORT_ERR';
 *  - `opts.maxWaitMs` (finite, > 0) elapses → rejects with
 *    err.code = 'TICKET_WAIT_TIMEOUT'.
 *
 * @param {string} cls - 'light' | 'medium' | 'heavy'
 * @param {{signal: AbortSignal, maxWaitMs: number}} [opts] - both fields optional
 * @returns {Promise<({cls: string, mb: number, _released: boolean}|false)>}
 */
function acquireMemoryTicket(cls, opts = {}) {
  if (!isGovernorEnabled()) return Promise.resolve(false);

  const mb = ticketMb(cls);
  // Liveness: a frozen governor with nothing in flight must still move.
  const admitNow = _canAdmit(cls, mb) || (isFrozen() && _gov.outstandingCount === 0);
  if (admitNow) return Promise.resolve(_grant(cls, mb));

  if (isFrozen()) _gov.freezes += 1;

  const { signal, maxWaitMs } = opts;
  return new Promise((resolve, reject) => {
    let timer = null;

    // Undo whatever this waiter registered (timeout, abort listener).
    function cleanup() {
      if (timer) {
        clearTimeout(timer);
        timer = null;
      }
      if (signal) {
        try {
          signal.removeEventListener('abort', onAbort);
        } catch { /* not added */ }
      }
    }

    const waiter = {
      cls,
      mb,
      wake(ticket) {
        cleanup();
        resolve(ticket);
      }
    };

    // Leaving the queue WITHOUT being woken must remove the waiter, or a
    // later drain would hand a ticket to a caller that is no longer waiting.
    function bail(err) {
      const pos = _gov.queue.indexOf(waiter);
      if (pos === -1) return; // already woken — the grant path owns the ticket
      _gov.queue.splice(pos, 1);
      cleanup();
      _gov.denied += 1;
      reject(err);
    }

    function onAbort() {
      const err = new Error('Memory-ticket wait aborted (outer scan timeout)');
      err.code = 'ABORT_ERR';
      bail(err);
    }

    _gov.queue.push(waiter);

    if (signal) {
      if (signal.aborted) {
        onAbort();
        return;
      }
      signal.addEventListener('abort', onAbort, { once: true });
    }

    if (Number.isFinite(maxWaitMs) && maxWaitMs > 0) {
      // Deliberately NOT unref'd: a pending admission keeps the process alive.
      timer = setTimeout(() => {
        const err = new Error(`Memory ticket (${cls}, ${mb}MB) not acquired within ${maxWaitMs}ms`);
        err.code = 'TICKET_WAIT_TIMEOUT';
        bail(err);
      }, maxWaitMs);
    }
  });
}
|
|
219
|
+
|
|
220
|
+
/**
 * Give a ticket's reservation back to the governor and let waiters that now
 * fit proceed.
 *
 * Safe to call with the `false` that `acquireMemoryTicket` resolves when the
 * governor is disabled, and safe to call twice on the same ticket (the
 * second call is a no-op thanks to the `_released` mark).
 *
 * Malformed input can no longer corrupt the accounting: non-object values are
 * ignored, a non-finite `mb` is treated as 0, and a `cls` that is not a key
 * of `_gov.byCls` leaves the per-class counters untouched. Without these
 * guards a single bad ticket would write `NaN` into the counters, since
 * `Math.max(0, NaN)` is `NaN`.
 *
 * @param {object|false|null|undefined} ticket - ticket obtained from
 *   `acquireMemoryTicket`; expected to carry `cls` and `mb`.
 * @returns {void}
 */
function releaseMemoryTicket(ticket) {
  if (!ticket || typeof ticket !== 'object' || ticket._released) return;
  ticket._released = true;

  const mb = Number.isFinite(ticket.mb) ? ticket.mb : 0;

  // Every counter is clamped at 0 so a stale release (e.g. after a reset)
  // can never drive the accounting negative.
  _gov.outstandingMb = Math.max(0, _gov.outstandingMb - mb);
  _gov.outstandingCount = Math.max(0, _gov.outstandingCount - 1);
  if (Object.prototype.hasOwnProperty.call(_gov.byCls, ticket.cls)) {
    _gov.byCls[ticket.cls] = Math.max(0, _gov.byCls[ticket.cls] - 1);
  }
  if (ticket.cls === 'heavy') {
    _gov.heavyOutstandingMb = Math.max(0, _gov.heavyOutstandingMb - mb);
  }

  // Capacity was just freed: give parked waiters a chance.
  _drainWaiters();
}
|
|
229
|
+
|
|
230
|
+
/**
 * Record a fresh process-RSS sample (per the original note, the daemon
 * breaker loop supplies one every 2s).
 *
 * The first accepted sample is also stored as the baseline and is never
 * refreshed afterwards. The original note flags this as known debt: the
 * baseline is fixed at boot while real usage drifts upward over days, and
 * phase D's `workers:memory-floored` state is what makes that visible.
 *
 * When the new sample moves the governor from frozen to not frozen, parked
 * waiters are drained immediately.
 *
 * @param {number} rssBytes - resident set size in bytes; anything that is
 *   not a finite number greater than 0 is ignored.
 * @returns {void}
 */
function updateGovernorRss(rssBytes) {
  const usable = Number.isFinite(rssBytes) && rssBytes > 0;
  if (!usable) {
    return;
  }

  if (!_gov.baselineRssBytes) {
    _gov.baselineRssBytes = rssBytes;
  }

  // Sample the frozen state on both sides of the update to detect a thaw.
  const frozenBefore = isFrozen();
  _gov.lastRssBytes = rssBytes;
  const thawed = frozenBefore && !isFrozen();

  if (thawed) {
    _drainWaiters();
  }
}
|
|
243
|
+
|
|
244
|
+
/**
 * Snapshot of the governor for logs, metrics and tests.
 *
 * The returned object is detached from internal state: `byCls` is a shallow
 * copy, everything else is a primitive.
 *
 * Fields: `enabled`, `frozen`, `outstandingMb`, `outstandingCount`, `byCls`,
 * `heavyOutstandingMb`, `waiting` (queue length), `budgetMb`,
 * `baselineRssMb` / `lastRssMb` (bytes rounded to whole MiB), and the
 * counters `granted`, `freezes` and `denied`.
 *
 * `denied` is incremented whenever a parked acquisition is abandoned and is
 * zeroed by `resetGovernor`; it is reported here next to the other two
 * counters so abandoned waits are observable.
 *
 * @returns {object} plain-object snapshot described above.
 */
function getGovernorState() {
  const toMb = (bytes) => Math.round(bytes / 1024 / 1024);
  return {
    enabled: isGovernorEnabled(),
    frozen: isFrozen(),
    outstandingMb: _gov.outstandingMb,
    outstandingCount: _gov.outstandingCount,
    byCls: { ..._gov.byCls },
    heavyOutstandingMb: _gov.heavyOutstandingMb,
    waiting: _gov.queue.length,
    budgetMb: _budgetMb(),
    baselineRssMb: toMb(_gov.baselineRssBytes),
    lastRssMb: toMb(_gov.lastRssBytes),
    granted: _gov.granted,
    freezes: _gov.freezes,
    denied: _gov.denied
  };
}
|
|
260
|
+
|
|
261
|
+
/**
 * Test helper: put the governor back into its pristine state.
 *
 * Any waiter still parked in the queue is woken with a freshly granted
 * ticket so that a test awaiting it can never hang; the accounting those
 * grants produce is wiped again right afterwards, together with the RSS
 * samples and the statistics counters.
 *
 * @returns {void}
 */
function resetGovernor() {
  const zeroOutstanding = () => {
    _gov.outstandingMb = 0;
    _gov.outstandingCount = 0;
    _gov.byCls = { light: 0, medium: 0, heavy: 0 };
    _gov.heavyOutstandingMb = 0;
  };

  // Start from clean accounting before handing out the release grants.
  zeroOutstanding();

  while (_gov.queue.length > 0) {
    const parked = _gov.queue.shift();
    const ticket = _grant(parked.cls, parked.mb);
    parked.wake(ticket);
  }

  // The grants above bumped the counters again; clear them for good.
  zeroOutstanding();

  Object.assign(_gov, {
    lastRssBytes: 0,
    baselineRssBytes: 0,
    freezes: 0,
    granted: 0,
    denied: 0
  });
}
|
|
281
|
+
|
|
282
|
+
// Exported names of this module. Each entry is exported under its own
// identifier; classifyWeight, ticketMb, isFrozen and isGovernorEnabled are
// not defined in this part of the file.
module.exports = {
  isGovernorEnabled,
  classifyWeight,
  acquireMemoryTicket,
  releaseMemoryTicket,
  updateGovernorRss,
  isFrozen,
  getGovernorState,
  resetGovernor,
  ticketMb
};
|