muaddib-scanner 2.11.59 → 2.11.62

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.59",
3
+ "version": "2.11.62",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "target": "node_modules",
3
- "timestamp": "2026-06-05T06:40:48.592Z",
3
+ "timestamp": "2026-06-05T19:51:12.540Z",
4
4
  "threats": [
5
5
  {
6
6
  "type": "string_mutation_obfuscation",
@@ -5,7 +5,7 @@ const os = require('os');
5
5
  const v8 = require('v8');
6
6
  const { isDockerAvailable, SANDBOX_CONCURRENCY_MAX, killAllSandboxContainers } = require('../sandbox/index.js');
7
7
  const { setVerboseMode, isSandboxEnabled, isCanaryEnabled, isLlmDetectiveEnabled, getLlmDetectiveMode, DOWNLOADS_CACHE_TTL } = require('./classify.js');
8
- const { loadState, saveState, loadDailyStats, saveDailyStats, purgeTarballCache, getParisHour, atomicWriteFileSync, saveNpmSeq, ALERTS_FILE, runStateMigrations } = require('./state.js');
8
+ const { loadState, saveState, loadDailyStats, saveDailyStats, purgeTarballCache, getParisHour, atomicWriteFileSync, saveNpmSeq, ALERTS_FILE, runStateMigrations, loadRecentlyScanned, saveRecentlyScanned } = require('./state.js');
9
9
  const { isTemporalEnabled, isTemporalAstEnabled, isTemporalPublishEnabled, isTemporalMaintainerEnabled } = require('./temporal.js');
10
10
  const { pendingGrouped, flushScopeGroup, sendDailyReport, DAILY_REPORT_HOUR, alertedPackageRules, ALERTED_PACKAGES_MAX: MAX_ALERTED_PACKAGES } = require('./webhook.js');
11
11
  const { poll } = require('./ingestion.js');
@@ -504,6 +504,9 @@ function reportStats(stats) {
504
504
  const avg = stats.scanned > 0 ? (stats.totalTimeMs / stats.scanned / 1000).toFixed(1) : '0.0';
505
505
  const { t1, t1a, t1b, t2, t3 } = stats.suspectByTier;
506
506
  console.log(`[MONITOR] Stats: ${stats.scanned} scanned, ${stats.clean} clean, ${stats.suspect} suspect (T1a:${t1a} T1b:${t1b} T1:${t1} T2:${t2} T3:${t3}), ${stats.errors} error${stats.errors !== 1 ? 's' : ''}, avg ${avg}s/pkg`);
507
+ if (stats.temporalLoadShed || stats.queueHardDrops || (stats.restartsToday || 0) > 1) {
508
+ console.log(`[MONITOR] Stability: restarts(24h)=${stats.restartsToday || 0}, temporal load-shed=${stats.temporalLoadShed || 0}, queue hard-drops=${stats.queueHardDrops || 0}`);
509
+ }
507
510
  if (stats.changesStreamPackages) {
508
511
  console.log(`[MONITOR] Changes stream packages: ${stats.changesStreamPackages}`);
509
512
  }
@@ -532,6 +535,99 @@ function isDailyReportDue(stats) {
532
535
  return !hasReportBeenSentToday(stats);
533
536
  }
534
537
 
538
+ // ─── P1.0 — memory-trend instrumentation ───
539
+ // Append one sample per memory-watchdog tick so the off-heap leak can be localised
540
+ // offline: rss climbing while heapUsed stays flat points at external/arrayBuffers
541
+ // (native tarball/AST buffers) vs liveWorkers (worker-isolate heaps) vs runscDirs
542
+ // (gVisor /tmp/runsc state dirs that survive `docker kill`). The heap-only breaker is
543
+ // blind to all three — this is the data needed to choose the P1.2/P1.3 fix.
544
+ const MEM_TREND_FILE = path.join(__dirname, '..', '..', 'data', 'mem-trend.jsonl');
545
+ const MEM_TREND_MAX_BYTES = 5 * 1024 * 1024; // bounded: truncate-rotate past 5MB
546
+
547
/**
 * Count the entries directly inside the gVisor runsc directory.
 *
 * The directory is taken from MUADDIB_GVISOR_LOG_DIR when that variable is set to a
 * non-empty value, otherwise `/tmp/runsc`.
 *
 * @returns {number} Number of directory entries; 0 when the directory does not exist
 *   or any filesystem call throws.
 */
function countRunscDirs() {
  try {
    const runscRoot = process.env.MUADDIB_GVISOR_LOG_DIR || '/tmp/runsc';
    if (!fs.existsSync(runscRoot)) {
      return 0;
    }
    const entries = fs.readdirSync(runscRoot);
    return entries.length;
  } catch {
    // Telemetry helper: any fs failure is reported as "nothing there".
    return 0;
  }
}
553
+
554
/**
 * Append one memory sample as a JSON line to MEM_TREND_FILE.
 *
 * Before appending, the file is renamed to `<file>.1` when it has grown past
 * MEM_TREND_MAX_BYTES, which keeps the live file bounded. Every failure (missing file,
 * rename error, append error, unreadable `currentMem`) is swallowed: this is
 * instrumentation and must never take the daemon down.
 *
 * @param {Object} currentMem - Memory usage sample; rss, heapUsed, heapTotal, external
 *   and arrayBuffers are read from it (the last two default to 0 when falsy).
 * @param {*} liveWorkers - Written to the sample as-is.
 * @param {*} queueLen - Written to the sample as-is.
 * @returns {void}
 */
function appendMemTrend(currentMem, liveWorkers, queueLen) {
  try {
    // Rotation step: a stat/rename failure (typically "no file yet") is not an error.
    try {
      const { size } = fs.statSync(MEM_TREND_FILE);
      if (size > MEM_TREND_MAX_BYTES) {
        fs.renameSync(MEM_TREND_FILE, MEM_TREND_FILE + '.1');
      }
    } catch {
      /* no file yet — fine */
    }

    const ts = new Date().toISOString();
    const { rss, heapUsed, heapTotal, external, arrayBuffers } = currentMem;
    const sample = {
      ts,
      rss,
      heapUsed,
      heapTotal,
      external: external || 0,
      arrayBuffers: arrayBuffers || 0,
      liveWorkers,
      queueLen,
      runscDirs: countRunscDirs(),
    };
    fs.appendFileSync(MEM_TREND_FILE, `${JSON.stringify(sample)}\n`, 'utf8');
  } catch {
    /* instrumentation must never crash the daemon */
  }
}
575
+
576
+ // ─── P2.1 / P2.4 — restart tracking + crash-loop alert ───
577
+ // The chronic ~10×/day OOM crash-loop went unnoticed for weeks because NOTHING counted
578
+ // restarts. Record each boot, expose the 24h count for the daily report, and fire an
579
+ // alert (journal + rate-limited webhook) when the daemon is restarting abnormally often.
580
+ const RESTARTS_FILE = path.join(__dirname, '..', '..', 'data', 'restarts.jsonl');
581
+ const RESTARTS_MAX_LINES = 500; // bounded resource (CLAUDE.md §2)
582
+ const CRASH_LOOP_THRESHOLD_24H = 6; // restarts/24h above this = alert
583
+ const CRASH_LOOP_ALERT_MARKER = path.join(__dirname, '..', '..', 'data', '.crashloop-alert.json');
584
+ const CRASH_LOOP_ALERT_INTERVAL_MS = 6 * 3600 * 1000; // webhook at most once per 6h
585
+
586
/**
 * Count the boots recorded in RESTARTS_FILE whose `ts` falls inside the trailing window.
 *
 * The file is read as newline-delimited JSON. Empty lines, lines that fail to parse and
 * lines whose `ts` is not a valid date are ignored.
 *
 * @param {number} [windowMs=86400000] - Size of the look-back window in milliseconds.
 * @returns {number} Matching restart count; 0 when the file is missing or unreadable.
 */
function countRecentRestarts(windowMs = 24 * 3600 * 1000) {
  try {
    if (!fs.existsSync(RESTARTS_FILE)) return 0;
    const oldestAllowed = Date.now() - windowMs;

    // True only for a well-formed record stamped at or after the window start.
    const isRecentRecord = (rawLine) => {
      if (!rawLine) return false;
      try {
        return new Date(JSON.parse(rawLine).ts).getTime() >= oldestAllowed;
      } catch {
        return false; // skip bad line
      }
    };

    return fs
      .readFileSync(RESTARTS_FILE, 'utf8')
      .split('\n')
      .filter(isRecentRecord)
      .length;
  } catch {
    return 0;
  }
}
598
+
599
/**
 * Send the crash-loop webhook alert, at most once per CRASH_LOOP_ALERT_INTERVAL_MS.
 *
 * The time of the last alert is kept in CRASH_LOOP_ALERT_MARKER. The marker is written
 * BEFORE the webhook is dispatched, so a failing endpoint is still rate-limited and is
 * not retried on every boot of a crash-looping daemon. The send itself is
 * fire-and-forget; no failure in this function is allowed to propagate.
 *
 * @param {number} count24h - Restarts counted over the last 24h (reported in the alert).
 * @returns {void}
 */
function maybeSendCrashLoopWebhook(count24h) {
  try {
    let last = 0;
    try {
      last = JSON.parse(fs.readFileSync(CRASH_LOOP_ALERT_MARKER, 'utf8')).ts || 0;
    } catch { /* no marker */ }
    if (Date.now() - last < CRASH_LOOP_ALERT_INTERVAL_MS) return; // rate-limited

    const rootWebhook = require('../webhook.js');
    // The sibling monitor modules take getWebhookUrl from ./webhook.js and only
    // sendWebhook from ../webhook.js, so look in the monitor-level module first and
    // keep the root module as a fallback in case it exposes the getter too.
    let monitorWebhook = {};
    try { monitorWebhook = require('./webhook.js') || {}; } catch { /* fall back below */ }
    const getWebhookUrl = [monitorWebhook.getWebhookUrl, rootWebhook.getWebhookUrl]
      .find((candidate) => typeof candidate === 'function');

    const url = (getWebhookUrl && getWebhookUrl()) || process.env.MUADDIB_WEBHOOK_URL;
    if (!url) return;

    atomicWriteFileSync(CRASH_LOOP_ALERT_MARKER, JSON.stringify({ ts: Date.now(), count24h }));
    const payload = { content: `🚨 MUAD'DIB crash-loop: ${count24h} restarts in the last 24h (threshold ${CRASH_LOOP_THRESHOLD_24H}). Likely OOM — check data/mem-trend.jsonl (rss vs external/arrayBuffers).` };
    Promise.resolve(rootWebhook.sendWebhook(url, payload)).catch(() => { /* best-effort */ });
  } catch { /* never block boot on alerting */ }
}
612
+
613
/**
 * Record this boot in RESTARTS_FILE and report how often the daemon restarted recently.
 *
 * Appends one `{ ts, pid }` JSON line, trims the file to the newest RESTARTS_MAX_LINES
 * lines, then counts the restarts of the last 24h. Above CRASH_LOOP_THRESHOLD_24H it
 * logs a CRASH-LOOP ALERT and triggers the rate-limited webhook; otherwise it logs a
 * plain BOOT line. Recording is best-effort: a filesystem failure never blocks boot,
 * it only means this boot may be missing from the count.
 *
 * @returns {number} Restarts counted in the last 24h (as returned by countRecentRestarts).
 */
function recordRestart() {
  try {
    const bootRecord = JSON.stringify({ ts: new Date().toISOString(), pid: process.pid });
    fs.appendFileSync(RESTARTS_FILE, bootRecord + '\n', 'utf8');
    try {
      const lines = fs.readFileSync(RESTARTS_FILE, 'utf8').split('\n').filter(Boolean);
      if (lines.length > RESTARTS_MAX_LINES) {
        // Rewrite through the shared atomic writer (same helper as the crash-loop
        // marker) rather than a plain writeFileSync: a kill in the middle of a
        // truncating rewrite — likely in the very crash-loop being tracked — would
        // otherwise wipe the restart history and reset the 24h count.
        atomicWriteFileSync(RESTARTS_FILE, lines.slice(-RESTARTS_MAX_LINES).join('\n') + '\n');
      }
    } catch { /* trim best-effort */ }
  } catch { /* best-effort: never block boot on telemetry */ }

  const count24h = countRecentRestarts();
  if (count24h > CRASH_LOOP_THRESHOLD_24H) {
    console.error(`[MONITOR] CRASH-LOOP ALERT: ${count24h} restarts in the last 24h (threshold ${CRASH_LOOP_THRESHOLD_24H}) — daemon restarting abnormally often (OOM?). Check data/mem-trend.jsonl.`);
    maybeSendCrashLoopWebhook(count24h);
  } else {
    console.log(`[MONITOR] BOOT: restart #${count24h} in the last 24h (pid ${process.pid})`);
  }
  return count24h;
}
630
+
535
631
  async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downloadsCache, scanQueue, sandboxAvailableRef) {
536
632
  if (options && options.verbose) {
537
633
  setVerboseMode(true);
@@ -543,8 +639,13 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
543
639
  cleanupOrphanTmpDirs();
544
640
  // Kill orphan sandbox containers from previous crash (npm-audit-* prefix)
545
641
  cleanupOrphanContainers();
546
- // Clean up stale gVisor runtime dirs (runsc leak — caused 61GB disk fill in prod)
547
- cleanupRunscOrphans();
642
+ // Clean up stale gVisor runtime dirs (runsc leak — caused 61GB disk fill in prod).
643
+ // At boot the previous process (often OOM-killed mid-scan in the ~10×/day crash-loop)
644
+ // owns NO live container, so every runsc dir is an orphan → clear them ALL (age 0),
645
+ // not just those >1h old. The hourly call below keeps the default age for live runtime.
646
+ cleanupRunscOrphans(0);
647
+ // P2.1/P2.4: record this boot, expose the 24h restart count, alert if crash-looping.
648
+ stats.restartsToday = recordRestart();
548
649
  // Layer 3: Purge expired cached tarballs on startup
549
650
  purgeTarballCache();
550
651
  // Purge archived tarballs older than MUADDIB_ARCHIVE_RETENTION_DAYS (default 7).
@@ -668,6 +769,10 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
668
769
  console.log(`[MONITOR] ${restoredCount} packages pre-loaded from previous session`);
669
770
  }
670
771
 
772
+ // Restore the dedup Set so the restored backlog isn't re-scanned from scratch
773
+ // (an empty dedup set after each of ~10 daily restarts = thousands of wasted re-scans).
774
+ loadRecentlyScanned(recentlyScanned);
775
+
671
776
  // Restore deferred sandbox queue from previous run
672
777
  const deferredRestored = restoreDeferredQueue();
673
778
  if (deferredRestored > 0) {
@@ -697,6 +802,7 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
697
802
  await drainWorkers();
698
803
  // Persist remaining queue items so they survive the restart
699
804
  persistQueue(scanQueue, state);
805
+ saveRecentlyScanned(recentlyScanned); // Persist dedup set too (avoid re-scan storm on restart)
700
806
  // Stop deferred sandbox worker and persist its queue
701
807
  stopDeferredWorker();
702
808
  persistDeferredQueue();
@@ -787,6 +893,7 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
787
893
  queuePersistHandle = setInterval(() => {
788
894
  if (!running) return;
789
895
  persistQueue(scanQueue, state);
896
+ saveRecentlyScanned(recentlyScanned); // Piggyback: persist dedup set on the same 60s interval
790
897
  persistDeferredQueue(); // Piggyback: persist deferred sandbox queue on same interval
791
898
  }, QUEUE_PERSIST_INTERVAL);
792
899
 
@@ -824,6 +931,8 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
824
931
  const pctUsed = (heapRatio * 100).toFixed(0);
825
932
  const levelName = Object.keys(MEMORY_PRESSURE_LEVELS).find(k => MEMORY_PRESSURE_LEVELS[k] === pressureLevel) || 'UNKNOWN';
826
933
  console.log(`[MONITOR] MEMORY: heap=${heapUsedMB}MB/${heapLimitMB}MB (${pctUsed}%), rss=${rssMB}MB (${(rssRatio * 100).toFixed(0)}%/${RSS_LIMIT_MB}MB), queue=${scanQueue.length}, dedup=${recentlyScanned.size}, downloads=${downloadsCache.size}, alerts=${alertedPackageRules.size}, dailyAlerts=${dailyAlerts.length}, pressure=${levelName}`);
934
+ // P1.0: persist the same sample as a time series for offline leak localisation.
935
+ appendMemTrend(currentMem, getActiveWorkers(), scanQueue.length);
827
936
 
828
937
  // Graduated response at HIGH+
829
938
  if (pressureLevel >= MEMORY_PRESSURE_LEVELS.HIGH) {
@@ -881,6 +990,10 @@ module.exports = {
881
990
  sleep,
882
991
  persistQueue,
883
992
  restoreQueue,
993
+ appendMemTrend,
994
+ countRunscDirs,
995
+ recordRestart,
996
+ countRecentRestarts,
884
997
  POLL_INTERVAL,
885
998
  PROCESS_LOOP_INTERVAL,
886
999
  QUEUE_WARNING_THRESHOLD,
@@ -18,7 +18,7 @@ const { runSandbox } = require('../sandbox/index.js');
18
18
  const { isCanaryEnabled, TIER1_TYPES } = require('./classify.js');
19
19
  const { getWebhookUrl, alertedPackageRules, persistAlert, buildAlertData } = require('./webhook.js');
20
20
  const { sendWebhook } = require('../webhook.js');
21
- const { atomicWriteFileSync } = require('./state.js');
21
+ const { atomicWriteFileSync, markSandboxed } = require('./state.js');
22
22
 
23
23
  // ── Constants ──
24
24
  const DEFERRED_QUEUE_MAX = 500;
@@ -200,6 +200,7 @@ async function processDeferredItem(stats) {
200
200
  const canary = isCanaryEnabled();
201
201
  // maxRuns=1: deferred items are T1b/T2, time bomb detection (3 runs) is a luxury.
202
202
  // 90s instead of 270s per item → 3× faster deferred queue drain.
203
+ markSandboxed(item.name); // stamp for sandbox-revalidation cadence (matches the synchronous path)
203
204
  sandboxResult = await runSandbox(item.name, { canary, skipSemaphore: true, maxRuns: 1, signal: ac.signal });
204
205
  console.log(`[DEFERRED] SANDBOX COMPLETE: ${key} -> score=${sandboxResult.score}, severity=${sandboxResult.severity}`);
205
206
  } catch (err) {
@@ -10,6 +10,7 @@
10
10
  const https = require('https');
11
11
  const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
12
12
  const { loadCachedIOCs } = require('../ioc/updater.js');
13
+ const { enqueueScan } = require('./scan-queue.js');
13
14
  const {
14
15
  saveNpmSeq, CHANGES_STREAM_URL, CHANGES_LIMIT, CHANGES_CATCHUP_MAX,
15
16
  savePypiSerial, PYPI_XMLRPC_URL, PYPI_CATCHUP_MAX
@@ -523,7 +524,7 @@ async function preResolveNpmBatch(items, stats, scanQueue) {
523
524
  // already done. Items keep their original order because chunks complete
524
525
  // sequentially.
525
526
  if (scanQueue) {
526
- for (const item of chunk) scanQueue.push(item);
527
+ for (const item of chunk) enqueueScan(scanQueue, item, stats);
527
528
  }
528
529
  }
529
530
  if (stats) {
@@ -566,7 +567,7 @@ async function preResolvePyPIBatch(items, stats, scanQueue) {
566
567
  }
567
568
  }));
568
569
  if (scanQueue) {
569
- for (const item of chunk) scanQueue.push(item);
570
+ for (const item of chunk) enqueueScan(scanQueue, item, stats);
570
571
  }
571
572
  }
572
573
  if (stats) {
@@ -33,7 +33,10 @@ const {
33
33
  appendAlert,
34
34
  getParisHour,
35
35
  hasReportBeenSentToday,
36
- MAX_DAILY_ALERTS
36
+ MAX_DAILY_ALERTS,
37
+ loadScanMemory,
38
+ shouldSuppressByMemory,
39
+ markSandboxed
37
40
  } = require('./state.js');
38
41
 
39
42
  // From ./classify.js
@@ -77,6 +80,7 @@ const {
77
80
 
78
81
  // From ./ingestion.js
79
82
  const { getNpmLatestTarball, getPyPITarballUrl } = require('./ingestion.js');
83
+ const { enqueueScan } = require('./scan-queue.js');
80
84
 
81
85
  // From ./tarball-archive.js
82
86
  const { archiveSuspectTarball } = require('./tarball-archive.js');
@@ -141,6 +145,29 @@ function computeSandboxScoreThreshold(envValue) {
141
145
  }
142
146
  const SANDBOX_SCORE_THRESHOLD = computeSandboxScoreThreshold(process.env.MUADDIB_SANDBOX_SCORE_THRESHOLD);
143
147
 
148
+ // --- Sandbox waste-cut (v2.11.6x): skip sandbox time that yields no new verdict ---
149
+ // Two skip paths, both detection-safe, applied BEFORE the tier sandbox decision:
150
+ // (1) memory match — re-sandboxing a package whose static result is equivalent to a
151
+ // remembered scan produces nothing the webhook wouldn't already memory-suppress.
152
+ // The dominant waste source is restart-replay: recentlyScanned is in-memory (lost on
153
+ // restart) but scan-memory persists 30d, so the changes-stream backlog gets
154
+ // re-sandboxed then suppressed. We skip, but re-sandbox at most once per
155
+ // SANDBOX_REVALIDATE_MS so runtime/canary coverage is retained on a slow cadence.
156
+ // (2) native binary shard — platform-specific prebuilt packages (os/cpu constrained or
157
+ // name like `*-linux-x64`) with trivial JS hang the sandbox install and always time
158
+ // out INCONCLUSIVE. Same guard rails as the large-low-signal skip (queue.js ~768):
159
+ // any lifecycle script, HIGH/CRITICAL finding, or temporal signal → sandbox runs.
160
+ const SANDBOX_REVALIDATE_MS = (() => {
161
+ const v = parseInt(process.env.MUADDIB_SANDBOX_REVALIDATE_MS, 10);
162
+ return Number.isFinite(v) && v >= 0 ? v : 7 * 24 * 60 * 60 * 1000; // default 7 days
163
+ })();
164
+ // npm platform-shard naming: <scope>/<pkg>-<os>-<arch>[-<libc/abi>] (esbuild/swc/turbo pattern).
165
+ const NATIVE_SHARD_NAME_RE = /-(linux|darwin|win32|freebsd|openbsd|android|sunos|aix)-(x64|arm64|arm|ia32|ppc64|s390x|riscv64|loong64|mips64el)(-(gnu|gnueabihf|musl|eabi|eabihf|msvc))?$/;
166
+ const LIFECYCLE_SCRIPT_KEYS = ['preinstall', 'install', 'postinstall', 'prepare', 'prepublish', 'prepublishOnly', 'preuninstall', 'uninstall', 'postuninstall'];
167
+ // A genuine prebuilt shard is a thin wrapper around a binary (index.js + index.d.ts at most).
168
+ // More JS than this means real logic → not a pure shard → don't skip.
169
+ const NATIVE_SHARD_MAX_JS_FILES = 3;
170
+
144
171
  // --- Bundled tooling false-positive filter ---
145
172
 
146
173
  const KNOWN_BUNDLED_FILES = ['yarn.js', 'webpack.js', 'terser.js', 'esbuild.js', 'polyfills.js'];
@@ -231,6 +258,88 @@ function countPackageFiles(dir) {
231
258
  return { fileCountTotal, hasTests };
232
259
  }
233
260
 
261
+ /**
262
+ * Pure classifier: is this a prebuilt native-binary platform shard (the kind that
263
+ * hangs the sandbox install and always times out INCONCLUSIVE)? No I/O — the parsed
264
+ * package.json manifest is passed in so this is unit-testable. Mirrors the extracted
265
+ * pure helpers computeWorkersToSpawn / computeTarget.
266
+ *
267
+ * A package is a shard when it declares a platform constraint (npm `os`/`cpu`) OR its
268
+ * name matches the `*-<os>-<arch>` convention, AND it carries only a trivial amount of
269
+ * JS (a real shard is a thin wrapper around a binary). hasLifecycleScripts is returned
270
+ * separately so the caller can keep sandboxing shards that DO run install hooks — the
271
+ * actual supply-chain vector.
272
+ *
273
+ * @param {string} name - Package name
274
+ * @param {number} fileCountTotal - JS/TS file count from countPackageFiles
275
+ * @param {Object|null} manifest - Parsed package.json (or null if unreadable)
276
+ * @returns {{ isShard: boolean, hasLifecycleScripts: boolean }}
277
+ */
278
+ function classifyNativeShard(name, fileCountTotal, manifest) {
279
+ const m = manifest || {};
280
+ const scripts = (m.scripts && typeof m.scripts === 'object') ? m.scripts : {};
281
+ const hasLifecycleScripts = LIFECYCLE_SCRIPT_KEYS.some(
282
+ k => typeof scripts[k] === 'string' && scripts[k].trim().length > 0
283
+ );
284
+ const platformConstrained =
285
+ (Array.isArray(m.os) && m.os.length > 0) ||
286
+ (Array.isArray(m.cpu) && m.cpu.length > 0);
287
+ const nameMatches = NATIVE_SHARD_NAME_RE.test(name || '');
288
+ const lowJs = (fileCountTotal || 0) <= NATIVE_SHARD_MAX_JS_FILES;
289
+ return { isShard: (platformConstrained || nameMatches) && lowJs, hasLifecycleScripts };
290
+ }
291
+
292
/**
 * Pure decision: should the sandbox be skipped entirely for this package, before the
 * tier-level run/defer/gate logic? Returns a skip descriptor, or null to let the normal
 * path decide. No I/O — every input is precomputed by the caller.
 *
 * Skip paths, checked in this order:
 *  - skip-memory: `memorySuppress` is true AND `lastSandboxAt` is a number that lies
 *    within the last `revalidateMs` milliseconds. A missing, stale or FUTURE-dated
 *    `lastSandboxAt` does not qualify: a timestamp ahead of `now` (clock step, corrupted
 *    entry) cannot be trusted as proof of a recent run, so the decision fails toward
 *    running the sandbox instead of skipping it until the clock catches up.
 *  - skip-native: `isNativeShard` is true and none of `hasLifecycleScripts`,
 *    `hasHighOrCritical`, `hasTemporal` is set.
 *
 * @param {Object} ctx
 * @param {boolean} ctx.memorySuppress - whether the scan-memory match would suppress the webhook
 * @param {number} [ctx.lastSandboxAt] - timestamp (ms) of the last real sandbox run
 * @param {number} ctx.now - current time (ms)
 * @param {number} ctx.revalidateMs - maximum age (ms) of a sandbox run that still allows a skip
 * @param {boolean} ctx.isNativeShard
 * @param {boolean} ctx.hasLifecycleScripts
 * @param {boolean} ctx.hasHighOrCritical
 * @param {boolean} ctx.hasTemporal
 * @returns {{ action: 'skip-memory'|'skip-native', reason: string } | null}
 */
function shouldSkipSandbox(ctx) {
  const {
    memorySuppress, lastSandboxAt, now, revalidateMs,
    isNativeShard, hasLifecycleScripts, hasHighOrCritical, hasTemporal
  } = ctx;

  // (1) Memory match — skip only if we sandboxed it recently (else revalidate).
  if (memorySuppress && typeof lastSandboxAt === 'number') {
    const elapsedMs = now - lastSandboxAt;
    // elapsedMs < 0 means the stamp is in the future: never treat that as "recent".
    if (elapsedMs >= 0 && elapsedMs < revalidateMs) {
      const days = (elapsedMs / 86_400_000).toFixed(1);
      return { action: 'skip-memory', reason: `memory match, last sandbox ${days}d ago` };
    }
    // fall through — a stale or untrusted stamp revalidates via the normal path
  }

  // (2) Native binary shard — only when no other signal asks for a sandbox run.
  if (isNativeShard && !hasLifecycleScripts && !hasHighOrCritical && !hasTemporal) {
    return { action: 'skip-native', reason: 'native binary shard, no lifecycle' };
  }

  return null;
}
342
+
234
343
  /**
235
344
  * Run the static scan in a Worker thread with a hard timeout.
236
345
  * worker.terminate() calls V8::TerminateExecution which can interrupt
@@ -790,7 +899,35 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
790
899
  (tier === 2 && riskScore >= SANDBOX_SCORE_THRESHOLD && scanQueue.length < 50)
791
900
  );
792
901
 
793
- if (shouldSandbox) {
902
+ // Waste-cut: skip the sandbox (run AND defer) when re-running it yields no new
903
+ // verdict — a memory match the webhook would suppress anyway (dominant cost:
904
+ // restart-replay of the changes-stream backlog), or a native binary shard that
905
+ // just hangs the install. Both detection-safe (see shouldSkipSandbox). Cheap:
906
+ // one package.json read + a scan-memory lookup.
907
+ let shardManifest = null;
908
+ try {
909
+ shardManifest = JSON.parse(fs.readFileSync(path.join(extractedDir, 'package.json'), 'utf8'));
910
+ } catch { /* unreadable manifest → classifyNativeShard treats it as non-shard */ }
911
+ const { isShard: isNativeShard, hasLifecycleScripts: shardHasLifecycle } =
912
+ classifyNativeShard(name, fileCountTotal, shardManifest);
913
+ const memEntry = loadScanMemory()[name];
914
+ const sandboxSkip = (isSandboxEnabled() && sandboxAvailable) ? shouldSkipSandbox({
915
+ memorySuppress: shouldSuppressByMemory(name, result).suppress,
916
+ lastSandboxAt: memEntry && memEntry.lastSandboxAt,
917
+ now: Date.now(),
918
+ revalidateMs: SANDBOX_REVALIDATE_MS,
919
+ isNativeShard,
920
+ hasLifecycleScripts: shardHasLifecycle,
921
+ hasHighOrCritical: hasHighOrCriticalFinding,
922
+ hasTemporal: hasTemporalSignal
923
+ }) : null;
924
+
925
+ if (sandboxSkip) {
926
+ console.log(`[MONITOR] SANDBOX SKIP (${sandboxSkip.reason}): ${name}@${version}`);
927
+ stats.sandboxWasteSkipped = (stats.sandboxWasteSkipped || 0) + 1;
928
+ if (sandboxSkip.action === 'skip-memory') stats.sandboxSkipMemory = (stats.sandboxSkipMemory || 0) + 1;
929
+ else stats.sandboxSkipNative = (stats.sandboxSkipNative || 0) + 1;
930
+ } else if (shouldSandbox) {
794
931
  try {
795
932
  const canary = isCanaryEnabled();
796
933
  const maxRuns = tier === '1a' ? undefined : 1;
@@ -798,11 +935,13 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
798
935
  if (tier === '1a') {
799
936
  // T1a: mandatory sandbox — block-wait (high-confidence threats MUST get sandbox)
800
937
  console.log(`[MONITOR] SANDBOX: launching for ${name}@${version}${canary ? ' (canary: on)' : ''}...`);
938
+ markSandboxed(name); // stamp before the await: an aborted/inconclusive run still spent the time
801
939
  sandboxResult = await runSandbox(name, { canary, maxRuns, signal });
802
940
  } else if (tryAcquireSandboxSlot()) {
803
941
  // T1b/T2: non-blocking — slot acquired atomically, run with skipSemaphore
804
942
  const reason = tier === 2 ? ' (T2, queue low)' : ' (T1b, conditional)';
805
943
  console.log(`[MONITOR] SANDBOX${reason}: launching for ${name}@${version}${canary ? ' (canary: on)' : ''}...`);
944
+ markSandboxed(name); // stamp before the await: an aborted/inconclusive run still spent the time
806
945
  sandboxResult = await runSandbox(name, { canary, maxRuns, skipSemaphore: true, signal });
807
946
  } else {
808
947
  // T1b/T2: all sandbox slots busy — defer instead of blocking worker
@@ -1255,7 +1394,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
1255
1394
  if (!recent || !recent.tarball || !recent.version) continue;
1256
1395
  const dedupeKey = `${item.name}@${recent.version}`;
1257
1396
  if (recentlyScanned.has(dedupeKey)) continue;
1258
- scanQueue.push({
1397
+ enqueueScan(scanQueue, {
1259
1398
  name: item.name,
1260
1399
  version: recent.version,
1261
1400
  ecosystem: 'npm',
@@ -1264,7 +1403,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
1264
1403
  registryScripts: recent.scripts || null,
1265
1404
  atoSignal: item.atoSignal === true,
1266
1405
  isATOBurstExtra: true,
1267
- });
1406
+ }, stats);
1268
1407
  }
1269
1408
 
1270
1409
  // Fast-track decision: large packages (>15MB) with no lifecycle scripts and no IOC match.
@@ -1377,6 +1516,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
1377
1516
  publishResult = pubRes.status === 'fulfilled' ? pubRes.value : null;
1378
1517
  maintainerResult = maintRes.status === 'fulfilled' ? maintRes.value : null;
1379
1518
  } else if (skipTemporal && item.ecosystem === 'npm' && !item.fastTrack) {
1519
+ stats.temporalLoadShed = (stats.temporalLoadShed || 0) + 1; // P2.2: count the coverage degradation
1380
1520
  console.log(`[MONITOR] TEMPORAL LOAD-SHED: ${item.name}@${item.version} (queue=${scanQueue.length} > ${TEMPORAL_LOAD_SHED_THRESHOLD})`);
1381
1521
  }
1382
1522
 
@@ -1528,6 +1668,7 @@ module.exports = {
1528
1668
  FIRST_PUBLISH_SANDBOX_ENABLED,
1529
1669
  SANDBOX_SCORE_THRESHOLD,
1530
1670
  computeSandboxScoreThreshold,
1671
+ SANDBOX_REVALIDATE_MS,
1531
1672
  KNOWN_BUNDLED_FILES,
1532
1673
  KNOWN_BUNDLED_PATHS,
1533
1674
  ML_EXCLUDED_DIRS,
@@ -1548,6 +1689,8 @@ module.exports = {
1548
1689
  isBundledToolingOnly,
1549
1690
  recordTrainingSample,
1550
1691
  countPackageFiles,
1692
+ classifyNativeShard,
1693
+ shouldSkipSandbox,
1551
1694
  runScanInWorker,
1552
1695
  scanPackage,
1553
1696
  timeoutPromise,
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Shared bounded enqueue for the scan queue.
3
+ *
4
+ * CLAUDE.md §2 (bounded resources): every in-memory structure needs an explicit max.
5
+ * The scan queue had none — ingestion pushed straight into a plain array, so a
6
+ * backpressure gap or the burst-publish path could grow it without bound. enqueueScan
7
+ * caps it at MAX_SCAN_QUEUE and drops the OLDEST item when full (newest packages are the
8
+ * most likely to still exist on the registry for a later re-scan — the same policy as
9
+ * the EMERGENCY queue truncation in daemon.js). Drops are counted (stats.queueHardDrops)
10
+ * and logged (rate-limited) so a coverage loss can't hide — CLAUDE.md "no silent caps".
11
+ *
12
+ * Lives in its own module so both ingestion.js and queue.js can import it without a
13
+ * circular require (queue.js already requires ingestion.js).
14
+ */
15
+
16
+ // Hard ceiling on live queue growth. Sits above the 30K soft-backpressure threshold
17
+ // (ingestion.js pauses polling at 30K), so it only fires if backpressure is bypassed
18
+ // (e.g. the burst path) or breaks. Env-tunable for ops.
19
+ const MAX_SCAN_QUEUE = (() => {
20
+ const v = parseInt(process.env.MUADDIB_MAX_SCAN_QUEUE, 10);
21
+ return Number.isFinite(v) && v > 0 ? v : 50_000;
22
+ })();
23
+
24
+ const HARD_DROP_LOG_INTERVAL_MS = 10_000;
25
+ let _lastHardDropLog = 0;
26
+
27
+ /**
28
+ * Push an item onto the scan queue, enforcing the hard cap by dropping the oldest item
29
+ * when at capacity. `max` defaults to MAX_SCAN_QUEUE (overridable for tests). Returns
30
+ * true iff an item was dropped to make room.
31
+ */
32
+ function enqueueScan(scanQueue, item, stats, max = MAX_SCAN_QUEUE) {
33
+ let dropped = false;
34
+ if (scanQueue.length >= max) {
35
+ scanQueue.shift(); // drop oldest
36
+ dropped = true;
37
+ if (stats) stats.queueHardDrops = (stats.queueHardDrops || 0) + 1;
38
+ const now = Date.now();
39
+ if (now - _lastHardDropLog > HARD_DROP_LOG_INTERVAL_MS) {
40
+ _lastHardDropLog = now;
41
+ console.warn(`[MONITOR] QUEUE_HARD_DROP: scan queue at cap ${max} — dropping oldest item(s) (total dropped this session: ${stats ? stats.queueHardDrops : '?'}). Ingestion is outrunning scanning.`);
42
+ }
43
+ }
44
+ scanQueue.push(item);
45
+ return dropped;
46
+ }
47
+
48
+ module.exports = { enqueueScan, MAX_SCAN_QUEUE };
@@ -5,6 +5,7 @@
5
5
 
6
6
  const fs = require('fs');
7
7
  const path = require('path');
8
+ const { isMainThread, threadId } = require('worker_threads');
8
9
  const { sanitizePackageName } = require('../shared/download.js');
9
10
 
10
11
  // --- File path constants ---
@@ -19,6 +20,7 @@ const DETECTIONS_FILE_LEGACY = path.join(__dirname, '..', '..', 'data', 'detecti
19
20
  const SCAN_STATS_FILE = path.join(__dirname, '..', '..', 'data', 'scan-stats.json');
20
21
  const LAST_DAILY_REPORT_FILE = path.join(__dirname, '..', '..', 'data', 'last-daily-report.json');
21
22
  const DAILY_STATS_FILE = path.join(__dirname, '..', '..', 'data', 'daily-stats.json');
23
+ const RECENTLY_SCANNED_FILE = path.join(__dirname, '..', '..', 'data', 'recently-scanned.json');
22
24
  const TEMPORAL_DETECTIONS_FILE = path.join(__dirname, '..', '..', 'data', 'temporal-detections.jsonl');
23
25
  const TEMPORAL_DETECTIONS_FILE_LEGACY = path.join(__dirname, '..', '..', 'data', 'temporal-detections.json');
24
26
 
@@ -43,13 +45,21 @@ const FALLBACK_ALERTS_DIR = path.join(require('os').tmpdir(), 'muaddib-alerts');
43
45
  * Try to ensure a directory exists and is writable. Returns the usable path
44
46
  * or a fallback path if the primary is read-only / permission-denied.
45
47
  */
46
- function resolveWritableDir(primary, fallback) {
48
+ function resolveWritableDir(primary, fallback, isMain = isMainThread) {
47
49
  try {
48
50
  fs.mkdirSync(primary, { recursive: true });
49
- // Verify writability with a probe file
50
- const probe = path.join(primary, '.write-test');
51
- fs.writeFileSync(probe, '', 'utf8');
52
- fs.unlinkSync(probe);
51
+ // Only the MAIN thread writes reports/alerts. Each of the up-to-16 scan worker
52
+ // threads also loads this module (via the transitive require chain), so if they
53
+ // all ran the probe they'd race on the shared path and throw ENOENT on unlink
54
+ // (8 such errors/day in prod). Workers skip the probe — the main thread's is enough.
55
+ if (isMain) {
56
+ // Unique name per process+thread so overlapping processes (restart storms) and
57
+ // any future multi-thread probing can't collide. force:true on removal tolerates
58
+ // an already-gone probe (the very race this fixes) instead of throwing ENOENT.
59
+ const probe = path.join(primary, `.write-test-${process.pid}-${threadId}`);
60
+ fs.writeFileSync(probe, '', 'utf8');
61
+ fs.rmSync(probe, { force: true });
62
+ }
53
63
  return primary;
54
64
  } catch (err) {
55
65
  if (err.code === 'EROFS' || err.code === 'EACCES' || err.code === 'EPERM') {
@@ -298,7 +308,14 @@ function saveScanMemory() {
298
308
  */
299
309
  function recordScanMemory(name, score, types, hcTypes) {
300
310
  const store = loadScanMemory();
311
+ // Read-modify-write: preserve fields set out-of-band (notably lastSandboxAt,
312
+ // stamped by markSandboxed when a real sandbox runs) so a record at webhook time
313
+ // does NOT clobber the sandbox-revalidation timestamp the sandbox-skip decision
314
+ // reads. Without this, every webhook record would reset lastSandboxAt and the
315
+ // 7-day canary-revalidation cadence would never settle.
316
+ const prev = store[name] || {};
301
317
  store[name] = {
318
+ ...prev,
302
319
  score,
303
320
  types: types.sort(),
304
321
  hcTypes: hcTypes.sort(),
@@ -306,6 +323,24 @@ function recordScanMemory(name, score, types, hcTypes) {
306
323
  };
307
324
  }
308
325
 
326
/**
 * Record that a real sandbox run was just performed for a package by writing
 * `lastSandboxAt` onto its scan-memory entry.
 *
 * All fields already present on the entry are kept. `timestamp` is filled in only
 * when the entry does not have a truthy one yet (e.g. the package was sandboxed
 * before any scan was recorded for it), so the entry always carries a timestamp.
 * Only the object returned by loadScanMemory() is mutated here; this function
 * does not write anything to disk itself.
 *
 * @param {string} name - Package name
 * @param {number} [at] - Timestamp in ms; any falsy value falls back to Date.now()
 */
function markSandboxed(name, at) {
  const memory = loadScanMemory();
  const stampedAt = at ? at : Date.now();
  const existing = memory[name] ? memory[name] : {};

  // Copy first, then stamp: earlier fields survive, lastSandboxAt is overwritten.
  const entry = { ...existing, lastSandboxAt: stampedAt };
  entry.timestamp = existing.timestamp || stampedAt;

  memory[name] = entry;
}
343
+
309
344
  /**
310
345
  * Check if a webhook should be suppressed based on scan memory.
311
346
  * Returns { suppress: boolean, reason?: string }.
@@ -1075,6 +1110,66 @@ function maybePersistDailyStats(stats, dailyAlerts) {
1075
1110
  }
1076
1111
  }
1077
1112
 
1113
+ // --- Daily report headline reconciliation (crash-safe) ---
1114
+ //
1115
+ // A restart-storm around the daily-report hour can zero/corrupt the in-memory
1116
+ // `stats` counter (the monitor was OOM-restarted ~10×/day in prod), producing a
1117
+ // report like "scanned=5" while ~44k packages were actually scanned that day.
1118
+ // scan-stats.json's `stats.total_scanned` is a MONOTONIC all-time counter, written
1119
+ // atomically on every scan and NEVER reset — so "scans since the last report" is a
1120
+ // restart-proof delta. We persist that counter as a per-report baseline and floor
1121
+ // the published headline at the delta, so a report can never under-count below what
1122
+ // really happened. No-op on healthy days (in-memory counter >= 80% of the delta, or delta <= 100).
1123
+
1124
/**
 * Read the current scan-stats totals and return the three counters that a daily
 * report persists as its baseline. The next report subtracts this baseline from
 * the then-current totals to get "since last report" figures.
 *
 * A missing `stats` object, or a missing/falsy counter inside it, is reported as 0.
 *
 * @returns {{ total_scanned: number, clean: number, suspect: number }}
 */
function captureScanStatsBaseline() {
  const { stats } = loadScanStats();
  const totals = stats || {};

  const baseline = {};
  for (const field of ['total_scanned', 'clean', 'suspect']) {
    baseline[field] = totals[field] || 0;
  }
  return baseline;
}
1136
+
1137
/**
 * Raise the in-memory daily headline (scanned/clean/suspect) to the durable
 * scan-stats delta since the last report when the in-memory counter has clearly
 * lost increments. Mutates `stats` UPWARD only; never lowers a value.
 *
 * The correction is applied only when BOTH hold:
 *   - the durable delta is > 100 scans, and
 *   - the in-memory `stats.scanned` is below 80% of that delta.
 * A missing or non-finite in-memory counter (undefined / NaN) is treated as 0, so
 * a corrupted counter is reconciled instead of silently slipping past the
 * comparison.
 *
 * Safe no-op when LAST_DAILY_REPORT_FILE is missing/corrupt or holds no numeric
 * `scanStatsBaseline.total_scanned` (first report ever).
 *
 * @param {{ scanned: number, clean: number, suspect: number }} stats - In-memory
 *   daily counters; mutated in place when the correction applies.
 * @returns {{ applied: boolean, floor: number, before: * }} `applied` is true when
 *   `stats` was changed, `floor` is the durable scanned delta (0 when there is no
 *   baseline), `before` is the value of `stats.scanned` on entry.
 */
function reconcileDailyHeadline(stats) {
  const summary = { applied: false, floor: 0, before: stats.scanned };

  let baseline = null;
  try {
    baseline = JSON.parse(fs.readFileSync(LAST_DAILY_REPORT_FILE, 'utf8')).scanStatsBaseline;
  } catch { /* no file / corrupt — no baseline, treat as first report */ }
  if (!baseline || typeof baseline.total_scanned !== 'number') return summary;

  const cur = loadScanStats().stats || {};
  // Clamp at 0 so a baseline that is ahead of the current totals never yields a
  // negative delta.
  const dScanned = Math.max(0, (cur.total_scanned || 0) - baseline.total_scanned);
  const dClean = Math.max(0, (cur.clean || 0) - (baseline.clean || 0));
  const dSuspect = Math.max(0, (cur.suspect || 0) - (baseline.suspect || 0));
  summary.floor = dScanned;

  // Trigger on SIGNIFICANT loss (in-memory below 80% of the durable delta), not on
  // small differences between the two counters.
  const LOSS_FLOOR_RATIO = 0.8;
  // Comparisons against undefined/NaN are always false, which would skip the
  // correction exactly when the counter is corrupted — normalise to 0 first.
  const asCount = (value) => (Number.isFinite(value) ? value : 0);

  if (dScanned > 100 && asCount(stats.scanned) < dScanned * LOSS_FLOOR_RATIO) {
    console.warn(`[MONITOR] DAILY RECONCILE: in-memory scanned=${stats.scanned} ≪ durable scan-stats delta=${dScanned} (restart-storm counter loss) — publishing durable count`);
    stats.scanned = dScanned;
    if (dClean > asCount(stats.clean)) stats.clean = dClean;
    if (dSuspect > asCount(stats.suspect)) stats.suspect = dSuspect;
    summary.applied = true;
  }
  return summary;
}
1172
+
1078
1173
  // --- Daily report date persistence ---
1079
1174
 
1080
1175
  /**
@@ -1092,11 +1187,15 @@ function loadLastDailyReportDate() {
1092
1187
  }
1093
1188
 
1094
1189
  /**
1095
- * Persist the date of the last daily report sent (YYYY-MM-DD).
1190
+ * Persist the date of the last daily report sent (YYYY-MM-DD), and optionally the
1191
+ * monotonic scan-stats baseline captured at that moment (used by the next report's
1192
+ * crash-safe headline reconciliation). Baseline is optional for backward compat.
1096
1193
  */
1097
- function saveLastDailyReportDate(dateStr) {
1194
+ function saveLastDailyReportDate(dateStr, scanStatsBaseline) {
1098
1195
  try {
1099
- atomicWriteFileSync(LAST_DAILY_REPORT_FILE, JSON.stringify({ lastReportDate: dateStr }, null, 2));
1196
+ const payload = { lastReportDate: dateStr };
1197
+ if (scanStatsBaseline) payload.scanStatsBaseline = scanStatsBaseline;
1198
+ atomicWriteFileSync(LAST_DAILY_REPORT_FILE, JSON.stringify(payload, null, 2));
1100
1199
  } catch (err) {
1101
1200
  console.error(`[MONITOR] Failed to save last daily report date: ${err.message}`);
1102
1201
  }
@@ -1136,6 +1235,56 @@ function getParisDateString() {
1136
1235
  return formatter.format(new Date());
1137
1236
  }
1138
1237
 
1238
// --- recentlyScanned dedup-set persistence ---
//
// The dedup keys are written to RECENTLY_SCANNED_FILE so that they can be restored
// after a restart instead of starting from an empty Set. Individual keys carry no
// timestamp; freshness is tracked for the file as a whole through `savedAt`.
// NOTE(review): per the original author's note the cap mirrors RECENTLY_SCANNED_MAX
// in queue.js — that constant is not visible here, confirm they stay in sync.
const RECENTLY_SCANNED_PERSIST_MAX = 50_000; // maximum number of keys written / restored
const RECENTLY_SCANNED_MAX_AGE_MS = 24 * 60 * 60 * 1000; // a file older than 24h is discarded

/**
 * Persist the dedup Set to RECENTLY_SCANNED_FILE.
 *
 * An empty or missing Set removes the file (best-effort; a failed unlink is
 * ignored). Otherwise at most the last RECENTLY_SCANNED_PERSIST_MAX keys, in
 * iteration order, are written together with `savedAt` (ISO string) and `count`.
 * Any failure is logged and swallowed — this function never throws.
 *
 * @param {Set<string>} recentlyScanned - Dedup keys to persist
 */
function saveRecentlyScanned(recentlyScanned) {
  try {
    const isEmpty = !recentlyScanned || recentlyScanned.size === 0;
    if (isEmpty) {
      try {
        fs.unlinkSync(RECENTLY_SCANNED_FILE);
      } catch {
        // Nothing to remove (or not removable) — deliberately ignored.
      }
      return;
    }

    const allKeys = Array.from(recentlyScanned);
    // Keep the tail: the most recently inserted keys when the Set is over the cap.
    const keys = allKeys.length > RECENTLY_SCANNED_PERSIST_MAX
      ? allKeys.slice(-RECENTLY_SCANNED_PERSIST_MAX)
      : allKeys;

    const snapshot = { savedAt: new Date().toISOString(), count: keys.length, keys };
    atomicWriteFileSync(RECENTLY_SCANNED_FILE, JSON.stringify(snapshot));
  } catch (err) {
    console.error(`[MONITOR] Failed to persist recentlyScanned: ${err.message}`);
  }
}
1261
+
1262
/**
 * Restore the dedup Set on boot by adding the persisted keys into the passed Set
 * in place.
 *
 * Safe no-op (returns 0, Set untouched) when the file is missing, is not valid
 * JSON, lacks a `keys` array or `savedAt`, has a `savedAt` that does not parse as
 * a date, or is older than RECENTLY_SCANNED_MAX_AGE_MS. A stale file is also
 * deleted (best-effort). At most the last RECENTLY_SCANNED_PERSIST_MAX keys are
 * restored. Never throws.
 *
 * @param {Set<string>} recentlyScanned - Set to populate in place
 * @returns {number} Number of keys read from the file and added to the Set
 */
function loadRecentlyScanned(recentlyScanned) {
  try {
    if (!fs.existsSync(RECENTLY_SCANNED_FILE)) return 0;
    const data = JSON.parse(fs.readFileSync(RECENTLY_SCANNED_FILE, 'utf8'));
    if (!data || !Array.isArray(data.keys) || !data.savedAt) return 0;

    const savedAtMs = new Date(data.savedAt).getTime();
    // An unparseable savedAt yields NaN, and `NaN > MAX_AGE` is false — without this
    // guard a file with a garbage timestamp would be restored regardless of its age.
    if (Number.isNaN(savedAtMs)) {
      console.log('[MONITOR] recentlyScanned state has an invalid savedAt — ignoring');
      return 0;
    }

    const ageMs = Date.now() - savedAtMs;
    if (ageMs > RECENTLY_SCANNED_MAX_AGE_MS) {
      console.log(`[MONITOR] recentlyScanned state expired (${Math.round(ageMs / 3600000)}h old) — ignoring`);
      try { fs.unlinkSync(RECENTLY_SCANNED_FILE); } catch { /* best-effort cleanup */ }
      return 0;
    }

    let keys = data.keys;
    // Keep the tail so an oversized file cannot grow the Set past the cap.
    if (keys.length > RECENTLY_SCANNED_PERSIST_MAX) keys = keys.slice(-RECENTLY_SCANNED_PERSIST_MAX);
    for (const k of keys) recentlyScanned.add(k);
    console.log(`[MONITOR] Restored ${keys.length} dedup keys from previous session (no re-scan storm)`);
    return keys.length;
  } catch (err) {
    console.log(`[MONITOR] WARNING: could not restore recentlyScanned: ${err.message}`);
    return 0;
  }
}
1287
+
1139
1288
  // --- Raw state loader (CLI report helpers) ---
1140
1289
 
1141
1290
  // --- JSONL migration (one-shot, idempotent) ---
@@ -1292,6 +1441,7 @@ module.exports = {
1292
1441
  loadScanMemory,
1293
1442
  saveScanMemory,
1294
1443
  recordScanMemory,
1444
+ markSandboxed,
1295
1445
  shouldSuppressByMemory,
1296
1446
  loadTarballCacheIndex,
1297
1447
  saveTarballCacheIndex,
@@ -1320,9 +1470,13 @@ module.exports = {
1320
1470
  saveDailyStats,
1321
1471
  resetDailyStats,
1322
1472
  maybePersistDailyStats,
1473
+ captureScanStatsBaseline,
1474
+ reconcileDailyHeadline,
1323
1475
  loadLastDailyReportDate,
1324
1476
  saveLastDailyReportDate,
1325
1477
  hasReportBeenSentToday,
1478
+ saveRecentlyScanned,
1479
+ loadRecentlyScanned,
1326
1480
  getParisHour,
1327
1481
  getParisDateString,
1328
1482
  loadStateRaw
@@ -20,6 +20,8 @@ const {
20
20
  loadDetections,
21
21
  saveLastDailyReportDate,
22
22
  resetDailyStats,
23
+ reconcileDailyHeadline,
24
+ captureScanStatsBaseline,
23
25
  saveScanMemory,
24
26
  shouldSuppressByMemory,
25
27
  recordScanMemory,
@@ -1019,6 +1021,7 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
1019
1021
  ...((stats.sandboxDeferred || stats.deferredProcessed || stats.deferredExpired)
1020
1022
  ? [{ name: 'Deferred Sandbox', value: `Enqueued: ${stats.sandboxDeferred || 0} | Processed: ${stats.deferredProcessed || 0} | Expired: ${stats.deferredExpired || 0}`, inline: false }]
1021
1023
  : []),
1024
+ { name: 'Stability', value: `Restarts (24h): ${stats.restartsToday || 0} | Temporal load-shed: ${stats.temporalLoadShed || 0} | Queue hard-drops: ${stats.queueHardDrops || 0}`, inline: false },
1022
1025
  { name: 'System', value: healthText, inline: false }
1023
1026
  ],
1024
1027
  footer: {
@@ -1037,6 +1040,11 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
1037
1040
  * @param {Map} downloadsCache - In-memory downloads cache (will be cleared)
1038
1041
  */
1039
1042
  async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCache) {
1043
+ // Crash-safe headline: a restart-storm around report time can zero the in-memory
1044
+ // counter (the monitor OOM-restarts ~10×/day). Floor scanned/clean/suspect at the
1045
+ // durable scan-stats delta so we never publish "5" when ~44k were really scanned.
1046
+ reconcileDailyHeadline(stats);
1047
+
1040
1048
  // Never send an empty report (0 scanned — restart with no work done)
1041
1049
  if (stats.scanned === 0) {
1042
1050
  console.log('[MONITOR] Daily report skipped (0 packages scanned)');
@@ -1048,7 +1056,9 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
1048
1056
  // recorded on disk and prevents duplicate reports on next startup.
1049
1057
  const today = getParisDateString();
1050
1058
  stats.lastDailyReportDate = today;
1051
- saveLastDailyReportDate(today);
1059
+ // Persist the monotonic scan-stats counter as the baseline for the NEXT report's
1060
+ // delta. Written before the (now last) webhook so a mid-send kill can't double-count.
1061
+ saveLastDailyReportDate(today, captureScanStatsBaseline());
1052
1062
 
1053
1063
  const payload = buildDailyReportEmbed(stats, dailyAlerts);
1054
1064
 
@@ -1068,22 +1078,12 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
1068
1078
  deferredProcessed: stats.deferredProcessed || 0,
1069
1079
  deferredExpired: stats.deferredExpired || 0,
1070
1080
  changesStreamPackages: stats.changesStreamPackages || 0,
1081
+ restartsToday: stats.restartsToday || 0,
1082
+ temporalLoadShed: stats.temporalLoadShed || 0,
1083
+ queueHardDrops: stats.queueHardDrops || 0,
1071
1084
  topSuspects: dailyAlerts.slice().sort((a, b) => (b.score || 0) - (a.score || 0) || b.findingsCount - a.findingsCount).slice(0, 10)
1072
1085
  });
1073
1086
 
1074
- // Send webhook only if configured
1075
- const url = getWebhookUrl();
1076
- if (url) {
1077
- try {
1078
- await sendWebhook(url, payload, { rawPayload: true });
1079
- console.log('[MONITOR] Daily report sent');
1080
- } catch (err) {
1081
- console.error(`[MONITOR] Daily report webhook failed: ${err.message}`);
1082
- }
1083
- } else {
1084
- console.log('[MONITOR] Daily report persisted locally (no webhook URL configured)');
1085
- }
1086
-
1087
1087
  // Reset daily counters
1088
1088
  stats.scanned = 0;
1089
1089
  stats.clean = 0;
@@ -1122,6 +1122,8 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
1122
1122
  stats.pypiCatchupSkips = 0;
1123
1123
  stats.pypiWheelsScanned = 0;
1124
1124
  stats.pypiSkippedNoArchive = 0;
1125
+ stats.temporalLoadShed = 0;
1126
+ stats.queueHardDrops = 0;
1125
1127
  stats.rssFallbackCount = 0;
1126
1128
  dailyAlerts.length = 0;
1127
1129
  recentlyScanned.clear();
@@ -1132,9 +1134,26 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
1132
1134
  }
1133
1135
  pendingGrouped.clear();
1134
1136
  downloadsCache.clear();
1137
+ // Reset the durable daily-stats counter. Done BEFORE the (now last) webhook so a
1138
+ // SIGKILL during the send can't leave the counter un-reset (which would double-count
1139
+ // into the next day's report). loadDailyStats() treats the absent file as zeros.
1135
1140
  resetDailyStats();
1136
1141
  // C3: Flush scan memory to disk on daily reset (ensures no data loss)
1137
1142
  saveScanMemory();
1143
+
1144
+ // Send webhook LAST (best-effort). The reset + baseline above are already durable,
1145
+ // so a kill during the send loses only the Discord ping — never the accounting.
1146
+ const url = getWebhookUrl();
1147
+ if (url) {
1148
+ try {
1149
+ await sendWebhook(url, payload, { rawPayload: true });
1150
+ console.log('[MONITOR] Daily report sent');
1151
+ } catch (err) {
1152
+ console.error(`[MONITOR] Daily report webhook failed: ${err.message}`);
1153
+ }
1154
+ } else {
1155
+ console.log('[MONITOR] Daily report persisted locally (no webhook URL configured)');
1156
+ }
1138
1157
  }
1139
1158
 
1140
1159
  // --- CLI report helpers (muaddib report --now / --status) ---