muaddib-scanner 2.11.75 → 2.11.77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,36 +24,129 @@ const MAX_SCAN_QUEUE = (() => {
24
24
  const HARD_DROP_LOG_INTERVAL_MS = 10_000;
25
25
  let _lastHardDropLog = 0;
26
26
 
27
// Protection predicate for queue eviction. An item counts as "protected" when it carries
// at least one truthy targeting flag: isIOCMatch, isBurst, firstPublish, atoSignal or
// isATOBurstExtra (known-malicious, burst/ATO and first-publish scans). Eviction prefers
// the oldest item for which this returns false; protected items are only dropped as a
// fallback, and such drops are ledgered under a distinct source.
function _isProtected(item) {
  // Missing/falsy queue slots are never protected.
  if (!item) return false;
  return Boolean(
    item.isIOCMatch ||
    item.isBurst ||
    item.firstPublish ||
    item.atoSignal ||
    item.isATOBurstExtra
  );
}
34
+
35
// Size of the head-window searched for an unprotected eviction victim. Bounding the search
// keeps each eviction O(window) under sustained overflow. Overridable through the
// MUADDIB_PROTECTED_EVICTION_SCAN_MAX environment variable; any value that does not parse
// to a positive number falls back to 1024.
const PROTECTED_EVICTION_SCAN_MAX = (() => {
  const fallbackWindow = 1024;
  const parsed = Number.parseInt(process.env.MUADDIB_PROTECTED_EVICTION_SCAN_MAX, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return fallbackWindow;
  return parsed;
})();
42
+
27
43
  /**
28
- * Push an item onto the scan queue, enforcing the hard cap by dropping the oldest item
29
- * when at capacity. `max` defaults to MAX_SCAN_QUEUE (overridable for tests). Returns
30
- * true iff an item was dropped to make room.
44
+ * Push an item onto the scan queue, enforcing the hard cap when at capacity. Evicts the
45
+ * oldest UNPROTECTED item (within a bounded head-window), falling back to strict-oldest if
46
+ * that window is all-protected. `max` defaults to MAX_SCAN_QUEUE (overridable for tests).
47
+ * Returns true iff an item was dropped to make room.
31
48
  */
32
49
  function enqueueScan(scanQueue, item, stats, max = MAX_SCAN_QUEUE) {
33
50
  let dropped = false;
34
51
  if (scanQueue.length >= max) {
35
- const evicted = scanQueue.shift(); // drop oldest
52
+ // Victim = oldest unprotected item within the bounded head-window; else strict oldest.
53
+ let victimIdx = -1;
54
+ const scanLimit = Math.min(scanQueue.length, PROTECTED_EVICTION_SCAN_MAX);
55
+ for (let i = 0; i < scanLimit; i++) {
56
+ if (!_isProtected(scanQueue[i])) { victimIdx = i; break; }
57
+ }
58
+ const protectedFallback = victimIdx === -1;
59
+ const evicted = protectedFallback ? scanQueue.shift() : scanQueue.splice(victimIdx, 1)[0];
36
60
  dropped = true;
37
61
  if (stats) stats.queueHardDrops = (stats.queueHardDrops || 0) + 1;
38
62
  // Phase 0a: record the dropped item so a coverage loss keeps an identity — answers
39
63
  // "which versions were never scanned" (e.g. the Miasma 72s/96-version burst). Lazy
40
64
  // require avoids any top-level coupling with state.js; best-effort, never throws.
65
+ // A dropped PROTECTED item (all-protected head-window) gets a distinct source so the
66
+ // rare case stays visible in the 0b ledger rollup.
41
67
  try {
42
68
  if (evicted && evicted.name) {
43
69
  require('./state.js').appendScanLedger({
44
70
  name: evicted.name, version: evicted.version, ecosystem: evicted.ecosystem,
45
- outcome: 'dropped', source: 'queue_cap'
71
+ outcome: 'dropped', source: protectedFallback ? 'queue_cap_protected' : 'queue_cap'
46
72
  });
47
73
  }
48
74
  } catch { /* ledger is best-effort */ }
49
75
  const now = Date.now();
50
76
  if (now - _lastHardDropLog > HARD_DROP_LOG_INTERVAL_MS) {
51
77
  _lastHardDropLog = now;
52
- console.warn(`[MONITOR] QUEUE_HARD_DROP: scan queue at cap ${max} — dropping oldest item(s) (total dropped this session: ${stats ? stats.queueHardDrops : '?'}). Ingestion is outrunning scanning.`);
78
+ console.warn(`[MONITOR] QUEUE_HARD_DROP: scan queue at cap ${max} — dropping ${protectedFallback ? 'OLDEST (head-window all protected)' : 'oldest unprotected'} item(s) (total dropped this session: ${stats ? stats.queueHardDrops : '?'}). Ingestion is outrunning scanning.`);
53
79
  }
54
80
  }
55
81
  scanQueue.push(item);
56
82
  return dropped;
57
83
  }
58
84
 
59
- module.exports = { enqueueScan, MAX_SCAN_QUEUE };
85
/**
 * Bulk-evict the scan queue down to `targetKeep` items, using the same protection
 * predicate as enqueueScan and ledgering every dropped item that has a name. Intended as
 * the single eviction path for the daemon's emergency memory breaker, instead of a raw
 * `splice(0, n)`.
 *
 * Selection: the oldest UNPROTECTED items are dropped first; protected items (oldest
 * first) are dropped only when there are not enough unprotected ones to reach the target.
 *
 * The queue is compacted in place with a write pointer (insertion order preserved), so any
 * other holder of the same array reference sees the mutation. Ledger writes are
 * best-effort: a failing or unavailable ledger never aborts the eviction.
 *
 * `targetKeep` is truncated toward zero and clamped at 0. NaN/undefined are treated as 0.
 * Values at or above 2^31, including Infinity, are honoured as "keep that many"; an int32
 * coercion (`| 0`) would wrap them to 0 or a negative number and empty the whole queue.
 *
 * @param {Array<object>} scanQueue - Queue array, mutated in place.
 * @param {number} targetKeep - Number of items to keep.
 * @param {string} [source='bulk_evict'] - Ledger `source`; protected drops get a
 *   `_protected` suffix so they stay distinguishable in the rollup.
 * @param {?function(object): void} [ledgerFn=null] - Ledger sink (injectable for tests);
 *   defaults to `appendScanLedger` from './state.js'.
 * @returns {{dropped:number, droppedProtected:number}}
 */
function evictFromScanQueueBulk(scanQueue, targetKeep, source = 'bulk_evict', ledgerFn = null) {
  const before = scanQueue.length;
  const requestedKeep = Number(targetKeep);
  const keep = Number.isNaN(requestedKeep) ? 0 : Math.max(0, Math.trunc(requestedKeep));
  if (before <= keep) return { dropped: 0, droppedProtected: 0 };
  const toDrop = before - keep;

  // Victim set: oldest unprotected first, then (only if short) oldest protected.
  const dropSet = new Set();
  for (let i = 0; i < before && dropSet.size < toDrop; i++) {
    if (!_isProtected(scanQueue[i])) dropSet.add(i);
  }
  let droppedProtected = 0;
  if (dropSet.size < toDrop) {
    // The first pass ran to the end, so every unprotected index is already marked; the
    // indices still unmarked are all protected and are dropped oldest-first as a last resort.
    for (let i = 0; i < before && dropSet.size < toDrop; i++) {
      if (!dropSet.has(i)) { dropSet.add(i); droppedProtected++; }
    }
  }

  // Resolve the ledger sink once rather than once per evicted item.
  let appendLedger = ledgerFn;
  if (!appendLedger) {
    try { appendLedger = require('./state.js').appendScanLedger; } catch { appendLedger = null; }
  }

  // Compact survivors in place, ledgering each evicted item as it is skipped.
  let w = 0;
  for (let r = 0; r < before; r++) {
    if (dropSet.has(r)) {
      const item = scanQueue[r];
      if (appendLedger && item && item.name) {
        try {
          appendLedger({
            name: item.name, version: item.version, ecosystem: item.ecosystem,
            outcome: 'dropped',
            source: _isProtected(item) ? `${source}_protected` : source
          });
        } catch { /* ledger is best-effort — must never break the breaker */ }
      }
    } else {
      scanQueue[w++] = scanQueue[r];
    }
  }
  scanQueue.length = w;

  return { dropped: toDrop, droppedProtected };
}
151
+
152
+ module.exports = { enqueueScan, evictFromScanQueueBulk, isProtected: _isProtected, MAX_SCAN_QUEUE };
@@ -972,7 +972,7 @@ let _scanLedgerAppendedSinceCompact = 0;
972
972
  const SCAN_LEDGER_OUTCOMES = new Set([
973
973
  'clean', 'clean_low_signal', 'clean_tooling', 'suspect', 'ml_clean', 'llm_benign',
974
974
  'sandbox_inconclusive', 'sandbox_unconfirmed', 'confirmed',
975
- 'static_timeout', 'size_skip', 'dropped'
975
+ 'static_timeout', 'size_skip', 'dropped', 'error'
976
976
  ]);
977
977
 
978
978
  /**
@@ -1453,6 +1453,27 @@ function getParisDateString() {
1453
1453
  return formatter.format(new Date());
1454
1454
  }
1455
1455
 
1456
+ // Hour (Europe/Paris) at/after which the once-daily report may fire. Single source of
1457
+ // truth — imported by webhook.js, daemon.js and queue.js (each previously redefined it,
1458
+ // and webhook.js still re-exports it for back-compat).
1459
+ const DAILY_REPORT_HOUR = 8; // 08:00 Paris time (Europe/Paris)
1460
+
1461
/**
 * Single "is the daily report due?" gate.
 *
 * Returns false while the current Paris hour is below DAILY_REPORT_HOUR. From that hour
 * onward it returns true until hasReportBeenSentToday(stats) reports a send, so a report
 * missed at the exact hour can still fire later the same day.
 *
 * The early-hours refusal is deliberate: hasReportBeenSentToday() keys off the Paris
 * calendar date, so a fire before the window would stamp the new day's date and suppress
 * that day's real report.
 *
 * @param {object} stats - Passed through to hasReportBeenSentToday().
 * @returns {boolean} true when the report should fire now.
 */
function isDailyReportDue(stats) {
  const beforeWindow = getParisHour() < DAILY_REPORT_HOUR;
  // The sent-today check is only consulted once the window has opened.
  return beforeWindow ? false : !hasReportBeenSentToday(stats);
}
1476
+
1456
1477
  // --- recentlyScanned dedup-set persistence (survives restarts → no re-scan storm) ---
1457
1478
  //
1458
1479
  // The dedup Set is in-memory only, so every restart starts it empty and re-scans the
@@ -1703,5 +1724,7 @@ module.exports = {
1703
1724
  loadRecentlyScanned,
1704
1725
  getParisHour,
1705
1726
  getParisDateString,
1727
+ DAILY_REPORT_HOUR,
1728
+ isDailyReportDue,
1706
1729
  loadStateRaw
1707
1730
  };
@@ -16,6 +16,7 @@ const {
16
16
  DAILY_REPORTS_LOG_DIR,
17
17
  getParisDateString,
18
18
  getParisHour,
19
+ DAILY_REPORT_HOUR,
19
20
  loadScanStats,
20
21
  loadDetections,
21
22
  saveLastDailyReportDate,
@@ -60,7 +61,8 @@ const HIGH_INTENT_TYPES = new Set([
60
61
  'remote_code_load', 'obfuscation_detected'
61
62
  ]);
62
63
 
63
- const DAILY_REPORT_HOUR = 8; // 08:00 Paris time (Europe/Paris)
64
+ // DAILY_REPORT_HOUR (=8) is imported from state.js (single source of truth) and
65
+ // re-exported below for back-compat (monitor.js / tests import it via webhook).
64
66
 
65
67
  // --- Webhook alerting ---
66
68
 
@@ -240,6 +242,43 @@ async function sendCampaignPreAlert(name, campaign, ecosystem = 'npm') {
240
242
  await sendWebhook(url, buildCampaignPreAlertEmbed(name, campaign, ecosystem), { rawPayload: true });
241
243
  }
242
244
 
245
/**
 * Layer 1c: Build the burst pre-alert webhook payload (pure — no network). Exported for
 * tests. Describes a package that published several versions in a short window (possible
 * account takeover / "Miasma" burst-publish). Amber colour (0xf39c12).
 *
 * The clock is read once, so the human-readable footer and the machine `timestamp`
 * always describe the same instant.
 *
 * @param {string} name - Package name
 * @param {number} count - Number of versions seen in the burst window
 * @param {string} [ecosystem='npm'] - Ecosystem label, also passed to registryLink()
 * @returns {{embeds: Array<object>}} Payload containing a single embed.
 */
function buildBurstPreAlertEmbed(name, count, ecosystem = 'npm') {
  const isoNow = new Date().toISOString();
  // "2024-01-02T03:04:05.678Z" -> "2024-01-02 03:04:05 UTC"
  const footerStamp = isoNow.replace('T', ' ').replace(/\.\d+Z$/, ' UTC');
  return {
    embeds: [{
      title: '⚠️ BURST PRE-ALERT — Rapid Multi-Version Publish',
      color: 0xf39c12,
      fields: [
        { name: 'Package', value: `[${ecosystem}/${name}](${registryLink(ecosystem, name)})`, inline: true },
        { name: 'Versions', value: `${count} in a short window`, inline: true },
        { name: 'Detection', value: 'Burst-publish (possible ATO / Miasma)', inline: true },
        { name: 'Status', value: 'Multiple versions published rapidly — every version queued for scan and protected from queue-cap eviction. Treat as suspect until verdicts land.', inline: false }
      ],
      footer: {
        text: `MUAD'DIB Burst Pre-Alert | ${footerStamp}`
      },
      timestamp: isoNow
    }]
  };
}
271
+
272
/**
 * Layer 1c: Send the burst pre-alert for `name` to the configured webhook. Does nothing
 * when getWebhookUrl() yields no URL. Callers are expected to dedupe per name/window so a
 * burst pings once rather than once per version.
 *
 * @param {string} name - Package name
 * @param {number} count - Number of versions seen in the burst window
 * @param {string} [ecosystem='npm'] - Ecosystem forwarded to the embed builder
 * @returns {Promise<void>}
 */
async function sendBurstPreAlert(name, count, ecosystem = 'npm') {
  const webhookUrl = getWebhookUrl();
  if (webhookUrl) {
    const payload = buildBurstPreAlertEmbed(name, count, ecosystem);
    await sendWebhook(webhookUrl, payload, { rawPayload: true });
  }
}
281
+
243
282
  /**
244
283
  * Check if a specific package@version matches a versioned IOC entry.
245
284
  * Returns the matching IOC entry or null.
@@ -1115,6 +1154,14 @@ function buildDailyReportEmbed(stats, dailyAlerts, ledgerRollup) {
1115
1154
  * @param {Map} downloadsCache - In-memory downloads cache (will be cleared)
1116
1155
  */
1117
1156
  async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCache) {
1157
+ // Dead-zone guard (defense in depth): never send or stamp before the 08:00 Paris window.
1158
+ // The scheduled gate (isDailyReportDue) already excludes 00:00–07:59, but an ungated /
1159
+ // manual / test caller firing at e.g. 00:43 would otherwise write-ahead the NEW day's date
1160
+ // (below) and suppress that day's real report. This makes the early stamp impossible.
1161
+ if (getParisHour() < DAILY_REPORT_HOUR) {
1162
+ console.log(`[MONITOR] Daily report suppressed: before ${DAILY_REPORT_HOUR}:00 Paris (hour=${getParisHour()})`);
1163
+ return;
1164
+ }
1118
1165
  // Crash-safe headline: a restart-storm around report time can zero the in-memory
1119
1166
  // counter (the monitor OOM-restarts ~10×/day). Floor scanned/clean/suspect at the
1120
1167
  // durable scan-stats delta so we never publish "5" when ~44k were really scanned.
@@ -1134,6 +1181,10 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
1134
1181
  // Persist the monotonic scan-stats counter as the baseline for the NEXT report's
1135
1182
  // delta. Written before the (now last) webhook so a mid-send kill can't double-count.
1136
1183
  saveLastDailyReportDate(today, captureScanStatsBaseline());
1184
+ // Observability: the success path previously logged nothing, which made the late-fire bug
1185
+ // invisible in the journal. Log the stamped date + the actual Paris hour (an on-time 08:00
1186
+ // fire vs a catch-up at hour 14 are now distinguishable) + the headline count.
1187
+ console.log(`[MONITOR] Daily report firing for ${today} (hour=${getParisHour()} Paris, scanned=${stats.scanned})`);
1137
1188
 
1138
1189
  // Phase 0b: compute the ledger rollup ONCE so the embed shows exactly the numbers
1139
1190
  // we persist (no double-scan, no drift between Discord and the on-disk metrics).
@@ -1328,16 +1379,23 @@ async function sendReportNow(stats) {
1328
1379
  return { sent: false, message: `Webhook failed: ${err.message}` };
1329
1380
  }
1330
1381
 
1331
- // Update lastDailyReportDate on disk
1332
- const today = getParisDateString();
1333
- const stateRaw = loadStateRaw();
1334
- const state = {
1335
- npmLastPackage: stateRaw.npmLastPackage || '',
1336
- pypiLastPackage: stateRaw.pypiLastPackage || ''
1337
- };
1338
- stats.lastDailyReportDate = today;
1339
- saveState(state, stats);
1340
- saveLastDailyReportDate(today);
1382
+ // Update lastDailyReportDate on disk — but ONLY at/after 08:00 Paris. A manual report run
1383
+ // before 08:00 is a deliberate operator override (we still SEND it), but it must NOT stamp
1384
+ // today's date: hasReportBeenSentToday() keys off the Paris calendar date, so an early
1385
+ // stamp would suppress that day's scheduled 08:00 report (the exact failure we're fixing).
1386
+ if (getParisHour() >= DAILY_REPORT_HOUR) {
1387
+ const today = getParisDateString();
1388
+ const stateRaw = loadStateRaw();
1389
+ const state = {
1390
+ npmLastPackage: stateRaw.npmLastPackage || '',
1391
+ pypiLastPackage: stateRaw.pypiLastPackage || ''
1392
+ };
1393
+ stats.lastDailyReportDate = today;
1394
+ saveState(state, stats);
1395
+ saveLastDailyReportDate(today);
1396
+ } else {
1397
+ console.log(`[MONITOR] Manual report sent; not stamping (before ${DAILY_REPORT_HOUR}:00 Paris — the scheduled report will still fire today)`);
1398
+ }
1341
1399
 
1342
1400
  return { sent: true, message: 'Daily report sent' };
1343
1401
  }
@@ -1399,6 +1457,8 @@ module.exports = {
1399
1457
  sendIOCPreAlert,
1400
1458
  buildCampaignPreAlertEmbed,
1401
1459
  sendCampaignPreAlert,
1460
+ buildBurstPreAlertEmbed,
1461
+ sendBurstPreAlert,
1402
1462
  matchVersionedIOC,
1403
1463
  computeRiskLevel,
1404
1464
  computeRiskScore,
@@ -121,6 +121,14 @@ function _fetchPackageMetadataHttp(packageName) {
121
121
  return;
122
122
  }
123
123
 
124
+ if (res.statusCode === 429) {
125
+ res.resume();
126
+ // Coordinated backoff on the shared registry limiter — the temporal scanners must
127
+ // signal 429 like the metadata path, not hammer through a rate limit (CLAUDE.md storm).
128
+ try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
129
+ reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}`));
130
+ return;
131
+ }
124
132
  if (res.statusCode < 200 || res.statusCode >= 300) {
125
133
  res.resume();
126
134
  reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}`));
@@ -71,6 +71,11 @@ function _fetchVersionMetadataHttp(packageName, version) {
71
71
  res.resume();
72
72
  return reject(new Error(`Version ${version} not found for package ${packageName}`));
73
73
  }
74
+ if (res.statusCode === 429) {
75
+ res.resume();
76
+ try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
77
+ return reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}@${version}`));
78
+ }
74
79
  if (res.statusCode < 200 || res.statusCode >= 300) {
75
80
  res.resume();
76
81
  return reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}@${version}`));
package/.dockerignore DELETED
@@ -1,7 +0,0 @@
1
- node_modules
2
- .git
3
- datasets
4
- tests
5
- metrics
6
- .muaddib-cache
7
- *.md
package/.env.example DELETED
@@ -1,43 +0,0 @@
1
- # MUAD'DIB environment variables — template
2
- # Copy to .env (local dev) or /opt/muaddib/.env (VPS) and fill in real values.
3
- # .env files are gitignored. NEVER commit a real token.
4
-
5
- # ----------------------------------------------------------------------------
6
- # Threat-feed API tokens (all OPTIONAL — scrapers degrade gracefully if absent)
7
- # ----------------------------------------------------------------------------
8
-
9
- # OpenSourceMalware.com — community-verified threat intel
10
- # Free tier: 60 req/min, /query-latest gives 100 most recent threats per ecosystem.
11
- # Sign up + generate at: https://opensourcemalware.com/auth → profile → API Tokens
12
- # Format: osm_<random-32+chars>
13
- # Used by: src/ioc/scraper.js → scrapeOSMQueryLatest()
14
- OSM_API_TOKEN=
15
-
16
- # ----------------------------------------------------------------------------
17
- # Webhook destinations (optional — monitor alerts)
18
- # ----------------------------------------------------------------------------
19
-
20
- # Discord webhook for monitor alerts (P1/P2/P3 triage)
21
- # DISCORD_WEBHOOK_URL=
22
-
23
- # ----------------------------------------------------------------------------
24
- # FPR plan gates — DEFAULT ON since v2.11.9 (no need to set these unless opting OUT)
25
- # ----------------------------------------------------------------------------
26
- # Measured impact on the v2.11.4 evaluation corpus (1054 packages):
27
- # FPR curated 15.6% -> 9.36% (-6.24 pp), FPR random 7.0% -> 2.0% (-5.00 pp).
28
- # TPR@3 / TPR@20 / ADR strictly unchanged.
29
- #
30
- # Opt-OUT individual gates (uncomment + set to 0):
31
- # MUADDIB_FN_REACHABILITY=0 # function-level reachability gating
32
- # MUADDIB_DECAY=0 # group score decay on bundled outputs
33
- # MUADDIB_MATURE_CAP=0 # cap mature, well-trafficked packages at MEDIUM
34
- # MUADDIB_METADATA_FACTOR=0 # registry signals -> reputation multiplier
35
- # MUADDIB_DELTA_MODE=0 # delta scoring against prior versions
36
- #
37
- # Skip ALL network fetches (npm registry packument + GitHub Releases IOC
38
- # bootstrap) in one shot. Disables MATURE_CAP + METADATA_FACTOR + DELTA_MODE
39
- # at the per-scan level AND the first-run IOC database download. Useful for:
40
- # - air-gap / offline CI environments
41
- # - test runners (set automatically by tests/run-tests.js)
42
- # - perf-critical batch scans where you've pre-warmed the IOC cache
43
- # MUADDIB_NO_REGISTRY_FETCH=1