muaddib-scanner 2.11.59 → 2.11.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/{self-scan-v2.11.59.json → self-scan-v2.11.60.json} +1 -1
- package/src/monitor/daemon.js +116 -3
- package/src/monitor/ingestion.js +3 -2
- package/src/monitor/queue.js +4 -2
- package/src/monitor/scan-queue.js +48 -0
- package/src/monitor/state.js +136 -8
- package/src/monitor/webhook.js +33 -14
package/package.json
CHANGED
package/src/monitor/daemon.js
CHANGED
|
@@ -5,7 +5,7 @@ const os = require('os');
|
|
|
5
5
|
const v8 = require('v8');
|
|
6
6
|
const { isDockerAvailable, SANDBOX_CONCURRENCY_MAX, killAllSandboxContainers } = require('../sandbox/index.js');
|
|
7
7
|
const { setVerboseMode, isSandboxEnabled, isCanaryEnabled, isLlmDetectiveEnabled, getLlmDetectiveMode, DOWNLOADS_CACHE_TTL } = require('./classify.js');
|
|
8
|
-
const { loadState, saveState, loadDailyStats, saveDailyStats, purgeTarballCache, getParisHour, atomicWriteFileSync, saveNpmSeq, ALERTS_FILE, runStateMigrations } = require('./state.js');
|
|
8
|
+
const { loadState, saveState, loadDailyStats, saveDailyStats, purgeTarballCache, getParisHour, atomicWriteFileSync, saveNpmSeq, ALERTS_FILE, runStateMigrations, loadRecentlyScanned, saveRecentlyScanned } = require('./state.js');
|
|
9
9
|
const { isTemporalEnabled, isTemporalAstEnabled, isTemporalPublishEnabled, isTemporalMaintainerEnabled } = require('./temporal.js');
|
|
10
10
|
const { pendingGrouped, flushScopeGroup, sendDailyReport, DAILY_REPORT_HOUR, alertedPackageRules, ALERTED_PACKAGES_MAX: MAX_ALERTED_PACKAGES } = require('./webhook.js');
|
|
11
11
|
const { poll } = require('./ingestion.js');
|
|
@@ -504,6 +504,9 @@ function reportStats(stats) {
|
|
|
504
504
|
const avg = stats.scanned > 0 ? (stats.totalTimeMs / stats.scanned / 1000).toFixed(1) : '0.0';
|
|
505
505
|
const { t1, t1a, t1b, t2, t3 } = stats.suspectByTier;
|
|
506
506
|
console.log(`[MONITOR] Stats: ${stats.scanned} scanned, ${stats.clean} clean, ${stats.suspect} suspect (T1a:${t1a} T1b:${t1b} T1:${t1} T2:${t2} T3:${t3}), ${stats.errors} error${stats.errors !== 1 ? 's' : ''}, avg ${avg}s/pkg`);
|
|
507
|
+
if (stats.temporalLoadShed || stats.queueHardDrops || (stats.restartsToday || 0) > 1) {
|
|
508
|
+
console.log(`[MONITOR] Stability: restarts(24h)=${stats.restartsToday || 0}, temporal load-shed=${stats.temporalLoadShed || 0}, queue hard-drops=${stats.queueHardDrops || 0}`);
|
|
509
|
+
}
|
|
507
510
|
if (stats.changesStreamPackages) {
|
|
508
511
|
console.log(`[MONITOR] Changes stream packages: ${stats.changesStreamPackages}`);
|
|
509
512
|
}
|
|
@@ -532,6 +535,99 @@ function isDailyReportDue(stats) {
|
|
|
532
535
|
return !hasReportBeenSentToday(stats);
|
|
533
536
|
}
|
|
534
537
|
|
|
538
|
+
// ─── P1.0 — memory-trend instrumentation ───
|
|
539
|
+
// Append one sample per memory-watchdog tick so the off-heap leak can be localised
|
|
540
|
+
// offline: rss climbing while heapUsed stays flat points at external/arrayBuffers
|
|
541
|
+
// (native tarball/AST buffers) vs liveWorkers (worker-isolate heaps) vs runscDirs
|
|
542
|
+
// (gVisor /tmp/runsc state dirs that survive `docker kill`). The heap-only breaker is
|
|
543
|
+
// blind to all three — this is the data needed to choose the P1.2/P1.3 fix.
|
|
544
|
+
// Memory-trend samples are stored as JSON Lines in data/mem-trend.jsonl,
// resolved two directories above this module.
const MEM_TREND_FILE = path.join(__dirname, '..', '..', 'data', 'mem-trend.jsonl');

// Size cap for that file (5 MiB); once exceeded, the file is renamed away
// before the next append so it never grows without bound.
const MEM_TREND_MAX_BYTES = 5 * 1024 ** 2;
|
|
546
|
+
|
|
547
|
+
/**
 * Count the entries in the gVisor runsc directory.
 *
 * The directory comes from MUADDIB_GVISOR_LOG_DIR and defaults to /tmp/runsc.
 *
 * @returns {number} Number of directory entries, or 0 when the directory is
 *   missing or cannot be read.
 */
function countRunscDirs() {
  const runscRoot = process.env.MUADDIB_GVISOR_LOG_DIR || '/tmp/runsc';
  try {
    if (!fs.existsSync(runscRoot)) {
      return 0;
    }
    const entries = fs.readdirSync(runscRoot);
    return entries.length;
  } catch {
    // Unreadable path (permissions, not a directory, ...) counts as zero.
    return 0;
  }
}
|
|
553
|
+
|
|
554
|
+
/**
 * Append one memory sample to the mem-trend JSONL file.
 *
 * Before appending, the file is renamed to `<file>.1` if it has grown past
 * MEM_TREND_MAX_BYTES. Every failure is swallowed: this is instrumentation
 * and must never take the daemon down.
 *
 * @param {object} currentMem - Memory usage sample; rss, heapUsed, heapTotal,
 *   external and arrayBuffers are read from it.
 * @param {number} liveWorkers - Value recorded as `liveWorkers`.
 * @param {number} queueLen - Value recorded as `queueLen`.
 * @returns {void}
 */
function appendMemTrend(currentMem, liveWorkers, queueLen) {
  try {
    // Rotation step: a missing file (or any stat/rename failure) is not an error.
    try {
      const { size } = fs.statSync(MEM_TREND_FILE);
      if (size > MEM_TREND_MAX_BYTES) {
        fs.renameSync(MEM_TREND_FILE, MEM_TREND_FILE + '.1');
      }
    } catch {
      /* no file yet — fine */
    }

    const { rss, heapUsed, heapTotal, external, arrayBuffers } = currentMem;
    const sample = {
      ts: new Date().toISOString(),
      rss,
      heapUsed,
      heapTotal,
      external: external || 0,
      arrayBuffers: arrayBuffers || 0,
      liveWorkers,
      queueLen,
      runscDirs: countRunscDirs(),
    };
    fs.appendFileSync(MEM_TREND_FILE, `${JSON.stringify(sample)}\n`, 'utf8');
  } catch {
    /* instrumentation must never crash the daemon */
  }
}
|
|
575
|
+
|
|
576
|
+
// ─── P2.1 / P2.4 — restart tracking + crash-loop alert ───
|
|
577
|
+
// The chronic ~10×/day OOM crash-loop went unnoticed for weeks because NOTHING counted
|
|
578
|
+
// restarts. Record each boot, expose the 24h count for the daily report, and fire an
|
|
579
|
+
// alert (journal + rate-limited webhook) when the daemon is restarting abnormally often.
|
|
580
|
+
// One JSON line per boot is appended to data/restarts.jsonl.
const RESTARTS_FILE = path.join(__dirname, '..', '..', 'data', 'restarts.jsonl');
// Upper bound on the number of lines kept in RESTARTS_FILE.
const RESTARTS_MAX_LINES = 500;
// More restarts than this within 24h is treated as a crash loop.
const CRASH_LOOP_THRESHOLD_24H = 6;
// Marker file remembering when the crash-loop webhook was last attempted.
const CRASH_LOOP_ALERT_MARKER = path.join(__dirname, '..', '..', 'data', '.crashloop-alert.json');
// Minimum spacing between crash-loop webhooks (6 hours).
const CRASH_LOOP_ALERT_INTERVAL_MS = 6 * 60 * 60 * 1000;

/**
 * Count the boots recorded in RESTARTS_FILE whose timestamp falls inside the
 * trailing window.
 *
 * @param {number} [windowMs=86400000] - Window length in milliseconds.
 * @returns {number} Matching line count; 0 when the file is missing or
 *   unreadable. Blank, unparseable or undated lines are ignored.
 */
function countRecentRestarts(windowMs = 24 * 3600 * 1000) {
  try {
    if (!fs.existsSync(RESTARTS_FILE)) {
      return 0;
    }
    const cutoff = Date.now() - windowMs;
    const rows = fs.readFileSync(RESTARTS_FILE, 'utf8').split('\n');
    return rows.reduce((total, row) => {
      if (!row) {
        return total;
      }
      try {
        const bootedAt = new Date(JSON.parse(row).ts).getTime();
        // An invalid date yields NaN, which never satisfies >= and is skipped.
        return bootedAt >= cutoff ? total + 1 : total;
      } catch {
        /* skip bad line */
        return total;
      }
    }, 0);
  } catch {
    return 0;
  }
}
|
|
598
|
+
|
|
599
|
+
/**
 * Fire the crash-loop webhook, at most once per CRASH_LOOP_ALERT_INTERVAL_MS.
 *
 * The time of the previous attempt is read from CRASH_LOOP_ALERT_MARKER. The
 * marker is rewritten BEFORE the send is started, so a failed delivery still
 * consumes the rate-limit slot. Nothing here may throw: alerting must never
 * block boot.
 *
 * NOTE(review): the payload is passed to sendWebhook without any options —
 * confirm sendWebhook accepts a bare `{ content }` object in that form.
 *
 * @param {number} count24h - Restart count reported in the alert text.
 * @returns {void}
 */
function maybeSendCrashLoopWebhook(count24h) {
  try {
    let lastAlertTs = 0;
    try {
      const marker = JSON.parse(fs.readFileSync(CRASH_LOOP_ALERT_MARKER, 'utf8'));
      lastAlertTs = marker.ts || 0;
    } catch {
      /* no marker */
    }

    const elapsedMs = Date.now() - lastAlertTs;
    if (elapsedMs < CRASH_LOOP_ALERT_INTERVAL_MS) {
      return; // rate-limited
    }

    const { getWebhookUrl, sendWebhook } = require('../webhook.js');
    const configuredUrl = typeof getWebhookUrl === 'function' ? getWebhookUrl() : null;
    const url = configuredUrl || process.env.MUADDIB_WEBHOOK_URL;
    if (!url) {
      return;
    }

    const markerBody = JSON.stringify({ ts: Date.now(), count24h });
    atomicWriteFileSync(CRASH_LOOP_ALERT_MARKER, markerBody);

    const payload = { content: `🚨 MUAD'DIB crash-loop: ${count24h} restarts in the last 24h (threshold ${CRASH_LOOP_THRESHOLD_24H}). Likely OOM — check data/mem-trend.jsonl (rss vs external/arrayBuffers).` };
    // Fire-and-forget: a rejected send is deliberately ignored.
    Promise.resolve(sendWebhook(url, payload)).catch(() => { /* best-effort */ });
  } catch {
    /* never block boot on alerting */
  }
}
|
|
612
|
+
|
|
613
|
+
/**
 * Record this boot in RESTARTS_FILE and report how often the daemon has
 * restarted in the last 24 hours.
 *
 * Appends `{ ts, pid }` as one JSON line, trims the file to its newest
 * RESTARTS_MAX_LINES lines, then counts recent boots. Above
 * CRASH_LOOP_THRESHOLD_24H it logs an error and asks for the rate-limited
 * webhook; otherwise it logs a normal boot line. All file I/O is best-effort.
 *
 * @returns {number} Restart count over the last 24h as read back from disk
 *   (0 if the file could not be written or read).
 */
function recordRestart() {
  try {
    const bootLine = JSON.stringify({ ts: new Date().toISOString(), pid: process.pid });
    fs.appendFileSync(RESTARTS_FILE, `${bootLine}\n`, 'utf8');
    try {
      const lines = fs.readFileSync(RESTARTS_FILE, 'utf8').split('\n').filter(Boolean);
      if (lines.length > RESTARTS_MAX_LINES) {
        // Trim atomically: a plain truncate-and-rewrite killed mid-write would
        // wipe the restart history this function exists to keep.
        atomicWriteFileSync(RESTARTS_FILE, `${lines.slice(-RESTARTS_MAX_LINES).join('\n')}\n`);
      }
    } catch {
      /* trim best-effort */
    }
  } catch {
    /* best-effort: never block boot on telemetry */
  }

  const count24h = countRecentRestarts();
  if (count24h > CRASH_LOOP_THRESHOLD_24H) {
    console.error(`[MONITOR] CRASH-LOOP ALERT: ${count24h} restarts in the last 24h (threshold ${CRASH_LOOP_THRESHOLD_24H}) — daemon restarting abnormally often (OOM?). Check data/mem-trend.jsonl.`);
    maybeSendCrashLoopWebhook(count24h);
  } else {
    console.log(`[MONITOR] BOOT: restart #${count24h} in the last 24h (pid ${process.pid})`);
  }
  return count24h;
}
|
|
630
|
+
|
|
535
631
|
async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downloadsCache, scanQueue, sandboxAvailableRef) {
|
|
536
632
|
if (options && options.verbose) {
|
|
537
633
|
setVerboseMode(true);
|
|
@@ -543,8 +639,13 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
543
639
|
cleanupOrphanTmpDirs();
|
|
544
640
|
// Kill orphan sandbox containers from previous crash (npm-audit-* prefix)
|
|
545
641
|
cleanupOrphanContainers();
|
|
546
|
-
// Clean up stale gVisor runtime dirs (runsc leak — caused 61GB disk fill in prod)
|
|
547
|
-
|
|
642
|
+
// Clean up stale gVisor runtime dirs (runsc leak — caused 61GB disk fill in prod).
|
|
643
|
+
// At boot the previous process (often OOM-killed mid-scan in the ~10×/day crash-loop)
|
|
644
|
+
// owns NO live container, so every runsc dir is an orphan → clear them ALL (age 0),
|
|
645
|
+
// not just those >1h old. The hourly call below keeps the default age for live runtime.
|
|
646
|
+
cleanupRunscOrphans(0);
|
|
647
|
+
// P2.1/P2.4: record this boot, expose the 24h restart count, alert if crash-looping.
|
|
648
|
+
stats.restartsToday = recordRestart();
|
|
548
649
|
// Layer 3: Purge expired cached tarballs on startup
|
|
549
650
|
purgeTarballCache();
|
|
550
651
|
// Purge archived tarballs older than MUADDIB_ARCHIVE_RETENTION_DAYS (default 7).
|
|
@@ -668,6 +769,10 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
668
769
|
console.log(`[MONITOR] ${restoredCount} packages pre-loaded from previous session`);
|
|
669
770
|
}
|
|
670
771
|
|
|
772
|
+
// Restore the dedup Set so the restored backlog isn't re-scanned from scratch
|
|
773
|
+
// (an empty dedup set after each of ~10 daily restarts = thousands of wasted re-scans).
|
|
774
|
+
loadRecentlyScanned(recentlyScanned);
|
|
775
|
+
|
|
671
776
|
// Restore deferred sandbox queue from previous run
|
|
672
777
|
const deferredRestored = restoreDeferredQueue();
|
|
673
778
|
if (deferredRestored > 0) {
|
|
@@ -697,6 +802,7 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
697
802
|
await drainWorkers();
|
|
698
803
|
// Persist remaining queue items so they survive the restart
|
|
699
804
|
persistQueue(scanQueue, state);
|
|
805
|
+
saveRecentlyScanned(recentlyScanned); // Persist dedup set too (avoid re-scan storm on restart)
|
|
700
806
|
// Stop deferred sandbox worker and persist its queue
|
|
701
807
|
stopDeferredWorker();
|
|
702
808
|
persistDeferredQueue();
|
|
@@ -787,6 +893,7 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
787
893
|
queuePersistHandle = setInterval(() => {
|
|
788
894
|
if (!running) return;
|
|
789
895
|
persistQueue(scanQueue, state);
|
|
896
|
+
saveRecentlyScanned(recentlyScanned); // Piggyback: persist dedup set on the same 60s interval
|
|
790
897
|
persistDeferredQueue(); // Piggyback: persist deferred sandbox queue on same interval
|
|
791
898
|
}, QUEUE_PERSIST_INTERVAL);
|
|
792
899
|
|
|
@@ -824,6 +931,8 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
824
931
|
const pctUsed = (heapRatio * 100).toFixed(0);
|
|
825
932
|
const levelName = Object.keys(MEMORY_PRESSURE_LEVELS).find(k => MEMORY_PRESSURE_LEVELS[k] === pressureLevel) || 'UNKNOWN';
|
|
826
933
|
console.log(`[MONITOR] MEMORY: heap=${heapUsedMB}MB/${heapLimitMB}MB (${pctUsed}%), rss=${rssMB}MB (${(rssRatio * 100).toFixed(0)}%/${RSS_LIMIT_MB}MB), queue=${scanQueue.length}, dedup=${recentlyScanned.size}, downloads=${downloadsCache.size}, alerts=${alertedPackageRules.size}, dailyAlerts=${dailyAlerts.length}, pressure=${levelName}`);
|
|
934
|
+
// P1.0: persist the same sample as a time series for offline leak localisation.
|
|
935
|
+
appendMemTrend(currentMem, getActiveWorkers(), scanQueue.length);
|
|
827
936
|
|
|
828
937
|
// Graduated response at HIGH+
|
|
829
938
|
if (pressureLevel >= MEMORY_PRESSURE_LEVELS.HIGH) {
|
|
@@ -881,6 +990,10 @@ module.exports = {
|
|
|
881
990
|
sleep,
|
|
882
991
|
persistQueue,
|
|
883
992
|
restoreQueue,
|
|
993
|
+
appendMemTrend,
|
|
994
|
+
countRunscDirs,
|
|
995
|
+
recordRestart,
|
|
996
|
+
countRecentRestarts,
|
|
884
997
|
POLL_INTERVAL,
|
|
885
998
|
PROCESS_LOOP_INTERVAL,
|
|
886
999
|
QUEUE_WARNING_THRESHOLD,
|
package/src/monitor/ingestion.js
CHANGED
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
const https = require('https');
|
|
11
11
|
const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
|
|
12
12
|
const { loadCachedIOCs } = require('../ioc/updater.js');
|
|
13
|
+
const { enqueueScan } = require('./scan-queue.js');
|
|
13
14
|
const {
|
|
14
15
|
saveNpmSeq, CHANGES_STREAM_URL, CHANGES_LIMIT, CHANGES_CATCHUP_MAX,
|
|
15
16
|
savePypiSerial, PYPI_XMLRPC_URL, PYPI_CATCHUP_MAX
|
|
@@ -523,7 +524,7 @@ async function preResolveNpmBatch(items, stats, scanQueue) {
|
|
|
523
524
|
// already done. Items keep their original order because chunks complete
|
|
524
525
|
// sequentially.
|
|
525
526
|
if (scanQueue) {
|
|
526
|
-
for (const item of chunk) scanQueue
|
|
527
|
+
for (const item of chunk) enqueueScan(scanQueue, item, stats);
|
|
527
528
|
}
|
|
528
529
|
}
|
|
529
530
|
if (stats) {
|
|
@@ -566,7 +567,7 @@ async function preResolvePyPIBatch(items, stats, scanQueue) {
|
|
|
566
567
|
}
|
|
567
568
|
}));
|
|
568
569
|
if (scanQueue) {
|
|
569
|
-
for (const item of chunk) scanQueue
|
|
570
|
+
for (const item of chunk) enqueueScan(scanQueue, item, stats);
|
|
570
571
|
}
|
|
571
572
|
}
|
|
572
573
|
if (stats) {
|
package/src/monitor/queue.js
CHANGED
|
@@ -77,6 +77,7 @@ const {
|
|
|
77
77
|
|
|
78
78
|
// From ./ingestion.js
|
|
79
79
|
const { getNpmLatestTarball, getPyPITarballUrl } = require('./ingestion.js');
|
|
80
|
+
const { enqueueScan } = require('./scan-queue.js');
|
|
80
81
|
|
|
81
82
|
// From ./tarball-archive.js
|
|
82
83
|
const { archiveSuspectTarball } = require('./tarball-archive.js');
|
|
@@ -1255,7 +1256,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1255
1256
|
if (!recent || !recent.tarball || !recent.version) continue;
|
|
1256
1257
|
const dedupeKey = `${item.name}@${recent.version}`;
|
|
1257
1258
|
if (recentlyScanned.has(dedupeKey)) continue;
|
|
1258
|
-
scanQueue
|
|
1259
|
+
enqueueScan(scanQueue, {
|
|
1259
1260
|
name: item.name,
|
|
1260
1261
|
version: recent.version,
|
|
1261
1262
|
ecosystem: 'npm',
|
|
@@ -1264,7 +1265,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1264
1265
|
registryScripts: recent.scripts || null,
|
|
1265
1266
|
atoSignal: item.atoSignal === true,
|
|
1266
1267
|
isATOBurstExtra: true,
|
|
1267
|
-
});
|
|
1268
|
+
}, stats);
|
|
1268
1269
|
}
|
|
1269
1270
|
|
|
1270
1271
|
// Fast-track decision: large packages (>15MB) with no lifecycle scripts and no IOC match.
|
|
@@ -1377,6 +1378,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1377
1378
|
publishResult = pubRes.status === 'fulfilled' ? pubRes.value : null;
|
|
1378
1379
|
maintainerResult = maintRes.status === 'fulfilled' ? maintRes.value : null;
|
|
1379
1380
|
} else if (skipTemporal && item.ecosystem === 'npm' && !item.fastTrack) {
|
|
1381
|
+
stats.temporalLoadShed = (stats.temporalLoadShed || 0) + 1; // P2.2: count the coverage degradation
|
|
1380
1382
|
console.log(`[MONITOR] TEMPORAL LOAD-SHED: ${item.name}@${item.version} (queue=${scanQueue.length} > ${TEMPORAL_LOAD_SHED_THRESHOLD})`);
|
|
1381
1383
|
}
|
|
1382
1384
|
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared bounded enqueue for the scan queue.
|
|
3
|
+
*
|
|
4
|
+
* CLAUDE.md §2 (bounded resources): every in-memory structure needs an explicit max.
|
|
5
|
+
* The scan queue had none — ingestion pushed straight into a plain array, so a
|
|
6
|
+
* backpressure gap or the burst-publish path could grow it without bound. enqueueScan
|
|
7
|
+
* caps it at MAX_SCAN_QUEUE and drops the OLDEST item when full (newest packages are the
|
|
8
|
+
* most likely to still exist on the registry for a later re-scan — the same policy as
|
|
9
|
+
* the EMERGENCY queue truncation in daemon.js). Drops are counted (stats.queueHardDrops)
|
|
10
|
+
* and logged (rate-limited) so a coverage loss can't hide — CLAUDE.md "no silent caps".
|
|
11
|
+
*
|
|
12
|
+
* Lives in its own module so both ingestion.js and queue.js can import it without a
|
|
13
|
+
* circular require (queue.js already requires ingestion.js).
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
// Hard ceiling on live queue growth. Sits above the 30K soft-backpressure threshold
|
|
17
|
+
// (ingestion.js pauses polling at 30K), so it only fires if backpressure is bypassed
|
|
18
|
+
// (e.g. the burst path) or breaks. Env-tunable for ops.
|
|
19
|
+
// Hard cap on the scan queue: MUADDIB_MAX_SCAN_QUEUE when it parses to a
// positive integer, otherwise 50,000.
const MAX_SCAN_QUEUE = (() => {
  const parsed = Number.parseInt(process.env.MUADDIB_MAX_SCAN_QUEUE, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : 50_000;
})();

// Minimum spacing between QUEUE_HARD_DROP log lines, and when the last one was emitted.
const HARD_DROP_LOG_INTERVAL_MS = 10_000;
let _lastHardDropLog = 0;

/**
 * Push an item onto the scan queue while enforcing the hard cap.
 *
 * If adding the item would take the queue past `max`, the OLDEST items are
 * removed first so the queue ends at `max` entries. The whole overflow is
 * trimmed in one step, so a queue that was already above the cap (for example
 * after the cap was lowered) is brought back under it instead of staying
 * oversized forever. Each removed item is counted in `stats.queueHardDrops`,
 * and a warning is logged at most once per HARD_DROP_LOG_INTERVAL_MS.
 *
 * @param {Array} scanQueue - Queue to mutate in place.
 * @param {*} item - Item to append.
 * @param {object} [stats] - Optional stats object; `queueHardDrops` is incremented.
 * @param {number} [max=MAX_SCAN_QUEUE] - Capacity (overridable for tests).
 * @returns {boolean} true iff at least one item was actually dropped.
 */
function enqueueScan(scanQueue, item, stats, max = MAX_SCAN_QUEUE) {
  // Number of items that must go so that length === max after the push.
  const overflow = scanQueue.length - max + 1;
  // splice reports what it really removed, so an empty queue never counts as a drop.
  const removed = overflow > 0 ? scanQueue.splice(0, overflow).length : 0;

  if (removed > 0) {
    if (stats) stats.queueHardDrops = (stats.queueHardDrops || 0) + removed;
    const now = Date.now();
    if (now - _lastHardDropLog > HARD_DROP_LOG_INTERVAL_MS) {
      _lastHardDropLog = now;
      console.warn(`[MONITOR] QUEUE_HARD_DROP: scan queue at cap ${max} — dropping oldest item(s) (total dropped this session: ${stats ? stats.queueHardDrops : '?'}). Ingestion is outrunning scanning.`);
    }
  }

  scanQueue.push(item);
  return removed > 0;
}
|
|
47
|
+
|
|
48
|
+
module.exports = { enqueueScan, MAX_SCAN_QUEUE };
|
package/src/monitor/state.js
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
const fs = require('fs');
|
|
7
7
|
const path = require('path');
|
|
8
|
+
const { isMainThread, threadId } = require('worker_threads');
|
|
8
9
|
const { sanitizePackageName } = require('../shared/download.js');
|
|
9
10
|
|
|
10
11
|
// --- File path constants ---
|
|
@@ -19,6 +20,7 @@ const DETECTIONS_FILE_LEGACY = path.join(__dirname, '..', '..', 'data', 'detecti
|
|
|
19
20
|
const SCAN_STATS_FILE = path.join(__dirname, '..', '..', 'data', 'scan-stats.json');
|
|
20
21
|
const LAST_DAILY_REPORT_FILE = path.join(__dirname, '..', '..', 'data', 'last-daily-report.json');
|
|
21
22
|
const DAILY_STATS_FILE = path.join(__dirname, '..', '..', 'data', 'daily-stats.json');
|
|
23
|
+
const RECENTLY_SCANNED_FILE = path.join(__dirname, '..', '..', 'data', 'recently-scanned.json');
|
|
22
24
|
const TEMPORAL_DETECTIONS_FILE = path.join(__dirname, '..', '..', 'data', 'temporal-detections.jsonl');
|
|
23
25
|
const TEMPORAL_DETECTIONS_FILE_LEGACY = path.join(__dirname, '..', '..', 'data', 'temporal-detections.json');
|
|
24
26
|
|
|
@@ -43,13 +45,21 @@ const FALLBACK_ALERTS_DIR = path.join(require('os').tmpdir(), 'muaddib-alerts');
|
|
|
43
45
|
* Try to ensure a directory exists and is writable. Returns the usable path
|
|
44
46
|
* or a fallback path if the primary is read-only / permission-denied.
|
|
45
47
|
*/
|
|
46
|
-
function resolveWritableDir(primary, fallback) {
|
|
48
|
+
function resolveWritableDir(primary, fallback, isMain = isMainThread) {
|
|
47
49
|
try {
|
|
48
50
|
fs.mkdirSync(primary, { recursive: true });
|
|
49
|
-
//
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
51
|
+
// Only the MAIN thread writes reports/alerts. Each of the up-to-16 scan worker
|
|
52
|
+
// threads also loads this module (via the transitive require chain), so if they
|
|
53
|
+
// all ran the probe they'd race on the shared path and throw ENOENT on unlink
|
|
54
|
+
// (8 such errors/day in prod). Workers skip the probe — the main thread's is enough.
|
|
55
|
+
if (isMain) {
|
|
56
|
+
// Unique name per process+thread so overlapping processes (restart storms) and
|
|
57
|
+
// any future multi-thread probing can't collide. force:true on removal tolerates
|
|
58
|
+
// an already-gone probe (the very race this fixes) instead of throwing ENOENT.
|
|
59
|
+
const probe = path.join(primary, `.write-test-${process.pid}-${threadId}`);
|
|
60
|
+
fs.writeFileSync(probe, '', 'utf8');
|
|
61
|
+
fs.rmSync(probe, { force: true });
|
|
62
|
+
}
|
|
53
63
|
return primary;
|
|
54
64
|
} catch (err) {
|
|
55
65
|
if (err.code === 'EROFS' || err.code === 'EACCES' || err.code === 'EPERM') {
|
|
@@ -1075,6 +1085,66 @@ function maybePersistDailyStats(stats, dailyAlerts) {
|
|
|
1075
1085
|
}
|
|
1076
1086
|
}
|
|
1077
1087
|
|
|
1088
|
+
// --- Daily report headline reconciliation (crash-safe) ---
|
|
1089
|
+
//
|
|
1090
|
+
// A restart-storm around the daily-report hour can zero/corrupt the in-memory
|
|
1091
|
+
// `stats` counter (the monitor was OOM-restarted ~10×/day in prod), producing a
|
|
1092
|
+
// report like "scanned=5" while ~44k packages were actually scanned that day.
|
|
1093
|
+
// scan-stats.json's `stats.total_scanned` is a MONOTONIC all-time counter, written
|
|
1094
|
+
// atomically on every scan and NEVER reset — so "scans since the last report" is a
|
|
1095
|
+
// restart-proof delta. We persist that counter as a per-report baseline and floor
|
|
1096
|
+
// the published headline at the delta, so a report can never under-count below what
|
|
1097
|
+
// really happened. No-op on healthy days (in-memory counter >= delta).
|
|
1098
|
+
|
|
1099
|
+
/**
 * Take a snapshot of the all-time scan-stats totals so it can be stored as the
 * baseline at report time; a later report derives "since last report" as the
 * difference from this snapshot.
 *
 * @returns {{total_scanned: number, clean: number, suspect: number}} Current
 *   totals, each defaulting to 0 when absent.
 */
function captureScanStatsBaseline() {
  const { total_scanned, clean, suspect } = loadScanStats().stats || {};
  return {
    total_scanned: total_scanned || 0,
    clean: clean || 0,
    suspect: suspect || 0,
  };
}
|
|
1111
|
+
|
|
1112
|
+
/**
 * Raise the in-memory daily headline (scanned / clean / suspect) to the
 * durable scan-stats delta accumulated since the last report, when the
 * in-memory counter has clearly lost increments.
 *
 * Values are only ever raised, never lowered. Nothing happens when there is no
 * stored baseline (first report, missing or corrupt file), when the durable
 * delta is 100 or less, or when the in-memory count is at least 80% of it.
 *
 * @param {object} stats - Daily stats; `scanned`, `clean`, `suspect` may be mutated.
 * @returns {{applied: boolean, floor: number, before: *}} Whether a correction
 *   was applied, the durable scanned delta, and `stats.scanned` on entry.
 */
function reconcileDailyHeadline(stats) {
  const summary = { applied: false, floor: 0, before: stats.scanned };

  let baseline = null;
  try {
    const raw = fs.readFileSync(LAST_DAILY_REPORT_FILE, 'utf8');
    baseline = JSON.parse(raw).scanStatsBaseline;
  } catch {
    /* no file / corrupt — no baseline, treat as first report */
  }
  const hasBaseline = Boolean(baseline) && typeof baseline.total_scanned === 'number';
  if (!hasBaseline) {
    return summary;
  }

  const durable = loadScanStats().stats || {};
  const scannedDelta = Math.max(0, (durable.total_scanned || 0) - baseline.total_scanned);
  const cleanDelta = Math.max(0, (durable.clean || 0) - (baseline.clean || 0));
  const suspectDelta = Math.max(0, (durable.suspect || 0) - (baseline.suspect || 0));
  summary.floor = scannedDelta;

  // Act only on significant loss: in-memory below 80% of the durable delta.
  // Smaller gaps are treated as ordinary drift between the two counters.
  const LOSS_FLOOR_RATIO = 0.8;
  const significantLoss = scannedDelta > 100 && stats.scanned < scannedDelta * LOSS_FLOOR_RATIO;
  if (!significantLoss) {
    return summary;
  }

  console.warn(`[MONITOR] DAILY RECONCILE: in-memory scanned=${stats.scanned} ≪ durable scan-stats delta=${scannedDelta} (restart-storm counter loss) — publishing durable count`);
  stats.scanned = scannedDelta;
  if (cleanDelta > stats.clean) {
    stats.clean = cleanDelta;
  }
  if (suspectDelta > stats.suspect) {
    stats.suspect = suspectDelta;
  }
  summary.applied = true;
  return summary;
}
|
|
1147
|
+
|
|
1078
1148
|
// --- Daily report date persistence ---
|
|
1079
1149
|
|
|
1080
1150
|
/**
|
|
@@ -1092,11 +1162,15 @@ function loadLastDailyReportDate() {
|
|
|
1092
1162
|
}
|
|
1093
1163
|
|
|
1094
1164
|
/**
|
|
1095
|
-
* Persist the date of the last daily report sent (YYYY-MM-DD)
|
|
1165
|
+
* Persist the date of the last daily report sent (YYYY-MM-DD), and optionally the
|
|
1166
|
+
* monotonic scan-stats baseline captured at that moment (used by the next report's
|
|
1167
|
+
* crash-safe headline reconciliation). Baseline is optional for backward compat.
|
|
1096
1168
|
*/
|
|
1097
|
-
function saveLastDailyReportDate(dateStr) {
|
|
1169
|
+
function saveLastDailyReportDate(dateStr, scanStatsBaseline) {
|
|
1098
1170
|
try {
|
|
1099
|
-
|
|
1171
|
+
const payload = { lastReportDate: dateStr };
|
|
1172
|
+
if (scanStatsBaseline) payload.scanStatsBaseline = scanStatsBaseline;
|
|
1173
|
+
atomicWriteFileSync(LAST_DAILY_REPORT_FILE, JSON.stringify(payload, null, 2));
|
|
1100
1174
|
} catch (err) {
|
|
1101
1175
|
console.error(`[MONITOR] Failed to save last daily report date: ${err.message}`);
|
|
1102
1176
|
}
|
|
@@ -1136,6 +1210,56 @@ function getParisDateString() {
|
|
|
1136
1210
|
return formatter.format(new Date());
|
|
1137
1211
|
}
|
|
1138
1212
|
|
|
1213
|
+
// --- recentlyScanned dedup-set persistence (survives restarts → no re-scan storm) ---
|
|
1214
|
+
//
|
|
1215
|
+
// The dedup Set is in-memory only, so every restart starts it empty and re-scans the
|
|
1216
|
+
// whole restored backlog (wasted work — the monitor OOM-restarts ~10×/day). We persist
|
|
1217
|
+
// the keys alongside the queue so the dedup survives. Entries are timestampless (the Set
|
|
1218
|
+
// is FIFO-capped and cleared at each daily report, so it holds at most ~24h of keys), so
|
|
1219
|
+
// freshness is guarded at the whole-file level with a savedAt — same shape as queue-state.
|
|
1220
|
+
// Maximum number of dedup keys written to (and restored from) disk.
const RECENTLY_SCANNED_PERSIST_MAX = 50_000;
// A persisted dedup file older than this (24h) is treated as stale.
const RECENTLY_SCANNED_MAX_AGE_MS = 24 * 60 * 60 * 1000;

/**
 * Persist the dedup Set to RECENTLY_SCANNED_FILE.
 *
 * An empty or missing Set removes the file instead. Otherwise the newest
 * RECENTLY_SCANNED_PERSIST_MAX keys (in iteration order) are written
 * atomically together with a `savedAt` timestamp. Failures are logged, not thrown.
 *
 * @param {Set<string>} recentlyScanned - Dedup keys to persist.
 * @returns {void}
 */
function saveRecentlyScanned(recentlyScanned) {
  try {
    const nothingToSave = !recentlyScanned || recentlyScanned.size === 0;
    if (nothingToSave) {
      try { fs.unlinkSync(RECENTLY_SCANNED_FILE); } catch {}
      return;
    }

    const allKeys = Array.from(recentlyScanned);
    const keys = allKeys.length > RECENTLY_SCANNED_PERSIST_MAX
      ? allKeys.slice(-RECENTLY_SCANNED_PERSIST_MAX)
      : allKeys;
    const snapshot = { savedAt: new Date().toISOString(), count: keys.length, keys };
    atomicWriteFileSync(RECENTLY_SCANNED_FILE, JSON.stringify(snapshot));
  } catch (err) {
    console.error(`[MONITOR] Failed to persist recentlyScanned: ${err.message}`);
  }
}
|
|
1236
|
+
|
|
1237
|
+
/**
 * Restore the dedup Set on boot by adding persisted keys into the passed Set
 * in place.
 *
 * Safe no-op (returns 0) when the file is missing, unparseable, has the wrong
 * shape, carries an unparseable `savedAt`, or is older than
 * RECENTLY_SCANNED_MAX_AGE_MS. A stale file is deleted.
 *
 * @param {Set<string>} recentlyScanned - Set to populate.
 * @returns {number} Number of keys read from the file and added.
 */
function loadRecentlyScanned(recentlyScanned) {
  try {
    if (!fs.existsSync(RECENTLY_SCANNED_FILE)) return 0;
    const data = JSON.parse(fs.readFileSync(RECENTLY_SCANNED_FILE, 'utf8'));
    if (!data || !Array.isArray(data.keys) || !data.savedAt) return 0;

    const ageMs = Date.now() - new Date(data.savedAt).getTime();
    // An invalid savedAt yields NaN, and `NaN > limit` is false, so without
    // this guard a corrupt timestamp would be accepted as fresh.
    if (!Number.isFinite(ageMs)) {
      console.log('[MONITOR] recentlyScanned state has an invalid savedAt — ignoring');
      return 0;
    }
    if (ageMs > RECENTLY_SCANNED_MAX_AGE_MS) {
      console.log(`[MONITOR] recentlyScanned state expired (${Math.round(ageMs / 3600000)}h old) — ignoring`);
      try { fs.unlinkSync(RECENTLY_SCANNED_FILE); } catch {}
      return 0;
    }

    let keys = data.keys;
    if (keys.length > RECENTLY_SCANNED_PERSIST_MAX) keys = keys.slice(-RECENTLY_SCANNED_PERSIST_MAX);
    for (const key of keys) recentlyScanned.add(key);
    console.log(`[MONITOR] Restored ${keys.length} dedup keys from previous session (no re-scan storm)`);
    return keys.length;
  } catch (err) {
    console.log(`[MONITOR] WARNING: could not restore recentlyScanned: ${err.message}`);
    return 0;
  }
}
|
|
1262
|
+
|
|
1139
1263
|
// --- Raw state loader (CLI report helpers) ---
|
|
1140
1264
|
|
|
1141
1265
|
// --- JSONL migration (one-shot, idempotent) ---
|
|
@@ -1320,9 +1444,13 @@ module.exports = {
|
|
|
1320
1444
|
saveDailyStats,
|
|
1321
1445
|
resetDailyStats,
|
|
1322
1446
|
maybePersistDailyStats,
|
|
1447
|
+
captureScanStatsBaseline,
|
|
1448
|
+
reconcileDailyHeadline,
|
|
1323
1449
|
loadLastDailyReportDate,
|
|
1324
1450
|
saveLastDailyReportDate,
|
|
1325
1451
|
hasReportBeenSentToday,
|
|
1452
|
+
saveRecentlyScanned,
|
|
1453
|
+
loadRecentlyScanned,
|
|
1326
1454
|
getParisHour,
|
|
1327
1455
|
getParisDateString,
|
|
1328
1456
|
loadStateRaw
|
package/src/monitor/webhook.js
CHANGED
|
@@ -20,6 +20,8 @@ const {
|
|
|
20
20
|
loadDetections,
|
|
21
21
|
saveLastDailyReportDate,
|
|
22
22
|
resetDailyStats,
|
|
23
|
+
reconcileDailyHeadline,
|
|
24
|
+
captureScanStatsBaseline,
|
|
23
25
|
saveScanMemory,
|
|
24
26
|
shouldSuppressByMemory,
|
|
25
27
|
recordScanMemory,
|
|
@@ -1019,6 +1021,7 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
|
|
|
1019
1021
|
...((stats.sandboxDeferred || stats.deferredProcessed || stats.deferredExpired)
|
|
1020
1022
|
? [{ name: 'Deferred Sandbox', value: `Enqueued: ${stats.sandboxDeferred || 0} | Processed: ${stats.deferredProcessed || 0} | Expired: ${stats.deferredExpired || 0}`, inline: false }]
|
|
1021
1023
|
: []),
|
|
1024
|
+
{ name: 'Stability', value: `Restarts (24h): ${stats.restartsToday || 0} | Temporal load-shed: ${stats.temporalLoadShed || 0} | Queue hard-drops: ${stats.queueHardDrops || 0}`, inline: false },
|
|
1022
1025
|
{ name: 'System', value: healthText, inline: false }
|
|
1023
1026
|
],
|
|
1024
1027
|
footer: {
|
|
@@ -1037,6 +1040,11 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
|
|
|
1037
1040
|
* @param {Map} downloadsCache - In-memory downloads cache (will be cleared)
|
|
1038
1041
|
*/
|
|
1039
1042
|
async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCache) {
|
|
1043
|
+
// Crash-safe headline: a restart-storm around report time can zero the in-memory
|
|
1044
|
+
// counter (the monitor OOM-restarts ~10×/day). Floor scanned/clean/suspect at the
|
|
1045
|
+
// durable scan-stats delta so we never publish "5" when ~44k were really scanned.
|
|
1046
|
+
reconcileDailyHeadline(stats);
|
|
1047
|
+
|
|
1040
1048
|
// Never send an empty report (0 scanned — restart with no work done)
|
|
1041
1049
|
if (stats.scanned === 0) {
|
|
1042
1050
|
console.log('[MONITOR] Daily report skipped (0 packages scanned)');
|
|
@@ -1048,7 +1056,9 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
|
|
|
1048
1056
|
// recorded on disk and prevents duplicate reports on next startup.
|
|
1049
1057
|
const today = getParisDateString();
|
|
1050
1058
|
stats.lastDailyReportDate = today;
|
|
1051
|
-
|
|
1059
|
+
// Persist the monotonic scan-stats counter as the baseline for the NEXT report's
|
|
1060
|
+
// delta. Written before the (now last) webhook so a mid-send kill can't double-count.
|
|
1061
|
+
saveLastDailyReportDate(today, captureScanStatsBaseline());
|
|
1052
1062
|
|
|
1053
1063
|
const payload = buildDailyReportEmbed(stats, dailyAlerts);
|
|
1054
1064
|
|
|
@@ -1068,22 +1078,12 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
|
|
|
1068
1078
|
deferredProcessed: stats.deferredProcessed || 0,
|
|
1069
1079
|
deferredExpired: stats.deferredExpired || 0,
|
|
1070
1080
|
changesStreamPackages: stats.changesStreamPackages || 0,
|
|
1081
|
+
restartsToday: stats.restartsToday || 0,
|
|
1082
|
+
temporalLoadShed: stats.temporalLoadShed || 0,
|
|
1083
|
+
queueHardDrops: stats.queueHardDrops || 0,
|
|
1071
1084
|
topSuspects: dailyAlerts.slice().sort((a, b) => (b.score || 0) - (a.score || 0) || b.findingsCount - a.findingsCount).slice(0, 10)
|
|
1072
1085
|
});
|
|
1073
1086
|
|
|
1074
|
-
// Send webhook only if configured
|
|
1075
|
-
const url = getWebhookUrl();
|
|
1076
|
-
if (url) {
|
|
1077
|
-
try {
|
|
1078
|
-
await sendWebhook(url, payload, { rawPayload: true });
|
|
1079
|
-
console.log('[MONITOR] Daily report sent');
|
|
1080
|
-
} catch (err) {
|
|
1081
|
-
console.error(`[MONITOR] Daily report webhook failed: ${err.message}`);
|
|
1082
|
-
}
|
|
1083
|
-
} else {
|
|
1084
|
-
console.log('[MONITOR] Daily report persisted locally (no webhook URL configured)');
|
|
1085
|
-
}
|
|
1086
|
-
|
|
1087
1087
|
// Reset daily counters
|
|
1088
1088
|
stats.scanned = 0;
|
|
1089
1089
|
stats.clean = 0;
|
|
@@ -1122,6 +1122,8 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
|
|
|
1122
1122
|
stats.pypiCatchupSkips = 0;
|
|
1123
1123
|
stats.pypiWheelsScanned = 0;
|
|
1124
1124
|
stats.pypiSkippedNoArchive = 0;
|
|
1125
|
+
stats.temporalLoadShed = 0;
|
|
1126
|
+
stats.queueHardDrops = 0;
|
|
1125
1127
|
stats.rssFallbackCount = 0;
|
|
1126
1128
|
dailyAlerts.length = 0;
|
|
1127
1129
|
recentlyScanned.clear();
|
|
@@ -1132,9 +1134,26 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
|
|
|
1132
1134
|
}
|
|
1133
1135
|
pendingGrouped.clear();
|
|
1134
1136
|
downloadsCache.clear();
|
|
1137
|
+
// Reset the durable daily-stats counter. Done BEFORE the (now last) webhook so a
|
|
1138
|
+
// SIGKILL during the send can't leave the counter un-reset (which would double-count
|
|
1139
|
+
// into the next day's report). loadDailyStats() treats the absent file as zeros.
|
|
1135
1140
|
resetDailyStats();
|
|
1136
1141
|
// C3: Flush scan memory to disk on daily reset (ensures no data loss)
|
|
1137
1142
|
saveScanMemory();
|
|
1143
|
+
|
|
1144
|
+
// Send webhook LAST (best-effort). The reset + baseline above are already durable,
|
|
1145
|
+
// so a kill during the send loses only the Discord ping — never the accounting.
|
|
1146
|
+
const url = getWebhookUrl();
|
|
1147
|
+
if (url) {
|
|
1148
|
+
try {
|
|
1149
|
+
await sendWebhook(url, payload, { rawPayload: true });
|
|
1150
|
+
console.log('[MONITOR] Daily report sent');
|
|
1151
|
+
} catch (err) {
|
|
1152
|
+
console.error(`[MONITOR] Daily report webhook failed: ${err.message}`);
|
|
1153
|
+
}
|
|
1154
|
+
} else {
|
|
1155
|
+
console.log('[MONITOR] Daily report persisted locally (no webhook URL configured)');
|
|
1156
|
+
}
|
|
1138
1157
|
}
|
|
1139
1158
|
|
|
1140
1159
|
// --- CLI report helpers (muaddib report --now / --status) ---
|