npm - muaddib-scanner - Versions diffs - 2.11.75 → 2.11.77 - Mend

muaddib-scanner 2.11.75 → 2.11.77

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (31)

package/.githooks/pre-commit +18 -0
package/README.md +15 -6
package/package.json +1 -2
package/{self-scan-v2.11.75.json → self-scan-v2.11.77.json} +1 -1
package/src/commands/safe-install.js +8 -3
package/src/monitor/daemon.js +34 -22
package/src/monitor/ingestion.js +43 -6
package/src/monitor/queue.js +120 -21
package/src/monitor/scan-queue.js +100 -7
package/src/monitor/state.js +24 -1
package/src/monitor/webhook.js +71 -11
package/src/scanner/temporal-analysis.js +8 -0
package/src/scanner/temporal-ast-diff.js +5 -0
package/.dockerignore +0 -7
package/.env.example +0 -43
package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
package/ml-retrain/auto-labeler/labeler.py +0 -256
package/ml-retrain/auto-labeler/npm_checker.py +0 -228
package/ml-retrain/auto-labeler/ossf_index.py +0 -178
package/ml-retrain/auto-labeler/requirements.txt +0 -1
package/ml-retrain/confusion-matrix.png +0 -0
package/ml-retrain/model-trees-retrained.js +0 -12
package/ml-retrain/retrain-report.json +0 -225
package/ml-retrain/retrain.py +0 -974
package/sbom.json +0 -0
package/src/ml/train-bundler-detector.py +0 -725
package/src/ml/train-xgboost.py +0 -957
package/tools/export-model-js.py +0 -160
package/tools/requirements-ml.txt +0 -5
package/tools/train-classifier.py +0 -333

package/src/monitor/scan-queue.js CHANGED Viewed

@@ -24,36 +24,129 @@ const MAX_SCAN_QUEUE = (() => {
 const HARD_DROP_LOG_INTERVAL_MS = 10_000;
 let _lastHardDropLog = 0;
+// Phase 2b: classes we never want to drop blindly when the queue caps out — the
+// specifically-targeted scans (known-malicious, burst/ATO, first-publish). Eviction drops
+// the oldest UNPROTECTED item instead; only if a bounded head-window is entirely protected
+// do we fall back to strict-oldest (still ledgered, with a distinct source).
+function _isProtected(item) {
+  return !!(item && (item.isIOCMatch || item.isBurst || item.firstPublish || item.atoSignal || item.isATOBurstExtra));
+}
+// How far from the head we scan for an unprotected victim. Protected items are a small
+// fraction of the flood, so a victim is almost always found within a few slots; the bound
+// keeps eviction O(window) under sustained overflow (CLAUDE.md §2 bounded resources).
+const PROTECTED_EVICTION_SCAN_MAX = (() => {
+  const v = parseInt(process.env.MUADDIB_PROTECTED_EVICTION_SCAN_MAX, 10);
+  return Number.isFinite(v) && v > 0 ? v : 1024;
+})();
 /**
- * Push an item onto the scan queue, enforcing the hard cap by dropping the oldest item
- * when at capacity. `max` defaults to MAX_SCAN_QUEUE (overridable for tests). Returns
- * true iff an item was dropped to make room.
+ * Push an item onto the scan queue, enforcing the hard cap when at capacity. Evicts the
+ * oldest UNPROTECTED item (within a bounded head-window), falling back to strict-oldest if
+ * that window is all-protected. `max` defaults to MAX_SCAN_QUEUE (overridable for tests).
+ * Returns true iff an item was dropped to make room.
  */
 function enqueueScan(scanQueue, item, stats, max = MAX_SCAN_QUEUE) {
   let dropped = false;
   if (scanQueue.length >= max) {
-    const evicted = scanQueue.shift(); // drop oldest
+    // Victim = oldest unprotected item within the bounded head-window; else strict oldest.
+    let victimIdx = -1;
+    const scanLimit = Math.min(scanQueue.length, PROTECTED_EVICTION_SCAN_MAX);
+    for (let i = 0; i < scanLimit; i++) {
+      if (!_isProtected(scanQueue[i])) { victimIdx = i; break; }
+    }
+    const protectedFallback = victimIdx === -1;
+    const evicted = protectedFallback ? scanQueue.shift() : scanQueue.splice(victimIdx, 1)[0];
     dropped = true;
     if (stats) stats.queueHardDrops = (stats.queueHardDrops || 0) + 1;
     // Phase 0a: record the dropped item so a coverage loss keeps an identity — answers
     // "which versions were never scanned" (e.g. the Miasma 72s/96-version burst). Lazy
     // require avoids any top-level coupling with state.js; best-effort, never throws.
+    // A dropped PROTECTED item (all-protected head-window) gets a distinct source so the
+    // rare case stays visible in the 0b ledger rollup.
     try {
       if (evicted && evicted.name) {
         require('./state.js').appendScanLedger({
           name: evicted.name, version: evicted.version, ecosystem: evicted.ecosystem,
-          outcome: 'dropped', source: 'queue_cap'
+          outcome: 'dropped', source: protectedFallback ? 'queue_cap_protected' : 'queue_cap'
         });
       }
     } catch { /* ledger is best-effort */ }
     const now = Date.now();
     if (now - _lastHardDropLog > HARD_DROP_LOG_INTERVAL_MS) {
       _lastHardDropLog = now;
-      console.warn(`[MONITOR] QUEUE_HARD_DROP: scan queue at cap ${max} — dropping oldest item(s) (total dropped this session: ${stats ? stats.queueHardDrops : '?'}). Ingestion is outrunning scanning.`);
+      console.warn(`[MONITOR] QUEUE_HARD_DROP: scan queue at cap ${max} — dropping ${protectedFallback ? 'OLDEST (head-window all protected)' : 'oldest unprotected'} item(s) (total dropped this session: ${stats ? stats.queueHardDrops : '?'}). Ingestion is outrunning scanning.`);
     }
   }
   scanQueue.push(item);
   return dropped;
 }
-module.exports = { enqueueScan, MAX_SCAN_QUEUE };
+/**
+ * Bulk-evict the scan queue down to `targetKeep`, honoring the SAME protection predicate
+ * as enqueueScan and ledgering EVERY dropped item — the single-source-of-truth eviction
+ * the daemon's EMERGENCY memory breaker must use instead of a raw `splice(0, n)`.
+ *
+ * Selection: drop the oldest UNPROTECTED items first; only dip into protected items
+ * (oldest-first) if there aren't enough unprotected ones to reach the target. This keeps
+ * IOC-match / burst / first-publish / ATO scans alive through a memory emergency, exactly
+ * like the per-item cap path — closing the gap where the v2.10.88 circuit breaker silently
+ * dropped protected scans (CLAUDE.md "ne jamais perdre de scan" / "no silent caps").
+ *
+ * In-place compaction (write-pointer, O(n), preserves insertion order, no giant spread) so
+ * the daemon (which holds the same array reference) sees the mutation. Best-effort ledger;
+ * never throws. `ledgerFn` is injectable for tests; defaults to state.appendScanLedger.
+ *
+ * @returns {{dropped:number, droppedProtected:number}}
+ */
+function evictFromScanQueueBulk(scanQueue, targetKeep, source = 'bulk_evict', ledgerFn = null) {
+  const before = scanQueue.length;
+  const keep = Math.max(0, targetKeep | 0);
+  if (before <= keep) return { dropped: 0, droppedProtected: 0 };
+  const toDrop = before - keep;
+  // Victim set: oldest unprotected first, then (only if short) oldest protected.
+  const dropSet = new Set();
+  for (let i = 0; i < before && dropSet.size < toDrop; i++) {
+    if (!_isProtected(scanQueue[i])) dropSet.add(i);
+  }
+  let droppedProtected = 0;
+  if (dropSet.size < toDrop) {
+    // Not enough unprotected items: every unprotected one is already marked, so the
+    // remaining oldest-first items are protected — drop them as a last resort.
+    for (let i = 0; i < before && dropSet.size < toDrop; i++) {
+      if (!dropSet.has(i)) { dropSet.add(i); droppedProtected++; }
+    }
+  }
+  // Resolve the ledger sink once (per-call require would be 500+ lookups under emergency).
+  let appendLedger = ledgerFn;
+  if (!appendLedger) {
+    try { appendLedger = require('./state.js').appendScanLedger; } catch { appendLedger = null; }
+  }
+  // Compact survivors in place, ledgering each evicted item with an identity-preserving
+  // source (protected drops get a distinct suffix so the rare case stays visible in the rollup).
+  let w = 0;
+  for (let r = 0; r < before; r++) {
+    if (dropSet.has(r)) {
+      const item = scanQueue[r];
+      if (appendLedger && item && item.name) {
+        try {
+          appendLedger({
+            name: item.name, version: item.version, ecosystem: item.ecosystem,
+            outcome: 'dropped',
+            source: _isProtected(item) ? `${source}_protected` : source
+          });
+        } catch { /* ledger is best-effort — must never break the breaker */ }
+      }
+    } else {
+      scanQueue[w++] = scanQueue[r];
+    }
+  }
+  scanQueue.length = w;
+  return { dropped: toDrop, droppedProtected };
+}
+module.exports = { enqueueScan, evictFromScanQueueBulk, isProtected: _isProtected, MAX_SCAN_QUEUE };

package/src/monitor/state.js CHANGED Viewed

@@ -972,7 +972,7 @@ let _scanLedgerAppendedSinceCompact = 0;
 const SCAN_LEDGER_OUTCOMES = new Set([
   'clean', 'clean_low_signal', 'clean_tooling', 'suspect', 'ml_clean', 'llm_benign',
   'sandbox_inconclusive', 'sandbox_unconfirmed', 'confirmed',
-  'static_timeout', 'size_skip', 'dropped'
+  'static_timeout', 'size_skip', 'dropped', 'error'
 ]);
 /**
@@ -1453,6 +1453,27 @@ function getParisDateString() {
   return formatter.format(new Date());
 }
+// Hour (Europe/Paris) at/after which the once-daily report may fire. Single source of
+// truth — imported by webhook.js, daemon.js and queue.js (each previously redefined it,
+// and webhook.js still re-exports it for back-compat).
+const DAILY_REPORT_HOUR = 8; // 08:00 Paris time (Europe/Paris)
+/**
+ * Canonical "is the daily report due?" predicate — the ONE gate, defined here in state.js
+ * (a leaf module that daemon.js and queue.js already import, so no require cycle).
+ *
+ * Catch-up semantics: fire at OR AFTER 08:00 Paris, so a missed 08:00 (e.g. the daemon was
+ * down/OOM-restarting at that minute) still fires later the SAME day — losing a whole day
+ * was the old daemon.js `hour === 8` behaviour. But NEVER fire during the 00:00–07:59 Paris
+ * "dead zone": a fire then stamps the NEW day's date before its 08:00 window and, because
+ * hasReportBeenSentToday() keys off the Paris CALENDAR date, permanently suppresses that
+ * day's real report. Replaces the two divergent copies (daemon.js `!== 8`, queue.js `< 8`).
+ */
+function isDailyReportDue(stats) {
+  if (getParisHour() < DAILY_REPORT_HOUR) return false;
+  return !hasReportBeenSentToday(stats);
+}
 // --- recentlyScanned dedup-set persistence (survives restarts → no re-scan storm) ---
 //
 // The dedup Set is in-memory only, so every restart starts it empty and re-scans the
@@ -1703,5 +1724,7 @@ module.exports = {
   loadRecentlyScanned,
   getParisHour,
   getParisDateString,
+  DAILY_REPORT_HOUR,
+  isDailyReportDue,
   loadStateRaw
 };

package/src/monitor/webhook.js CHANGED Viewed

@@ -16,6 +16,7 @@ const {
   DAILY_REPORTS_LOG_DIR,
   getParisDateString,
   getParisHour,
+  DAILY_REPORT_HOUR,
   loadScanStats,
   loadDetections,
   saveLastDailyReportDate,
@@ -60,7 +61,8 @@ const HIGH_INTENT_TYPES = new Set([
   'remote_code_load', 'obfuscation_detected'
 ]);
-const DAILY_REPORT_HOUR = 8; // 08:00 Paris time (Europe/Paris)
+// DAILY_REPORT_HOUR (=8) is imported from state.js (single source of truth) and
+// re-exported below for back-compat (monitor.js / tests import it via webhook).
 // --- Webhook alerting ---
@@ -240,6 +242,43 @@ async function sendCampaignPreAlert(name, campaign, ecosystem = 'npm') {
   await sendWebhook(url, buildCampaignPreAlertEmbed(name, campaign, ecosystem), { rawPayload: true });
 }
+/**
+ * Layer 1c: Build the burst pre-alert embed (pure — no network). Exported for tests.
+ * Fires when ≥K versions of one package land in a short window (account-takeover /
+ * "Miasma" burst-publish). Amber to distinguish from IOC (red) and campaign (orange).
+ * @param {string} name - Package name
+ * @param {number} count - Number of versions seen in the burst window
+ * @param {string} [ecosystem='npm'] - 'npm' | 'pypi' | 'crates' (link target)
+ */
+function buildBurstPreAlertEmbed(name, count, ecosystem = 'npm') {
+  return {
+    embeds: [{
+      title: '⚠️ BURST PRE-ALERT — Rapid Multi-Version Publish',
+      color: 0xf39c12,
+      fields: [
+        { name: 'Package', value: `[${ecosystem}/${name}](${registryLink(ecosystem, name)})`, inline: true },
+        { name: 'Versions', value: `${count} in a short window`, inline: true },
+        { name: 'Detection', value: 'Burst-publish (possible ATO / Miasma)', inline: true },
+        { name: 'Status', value: 'Multiple versions published rapidly — every version queued for scan and protected from queue-cap eviction. Treat as suspect until verdicts land.', inline: false }
+      ],
+      footer: {
+        text: `MUAD'DIB Burst Pre-Alert | ${new Date().toISOString().replace('T', ' ').replace(/\.\d+Z$/, ' UTC')}`
+      },
+      timestamp: new Date().toISOString()
+    }]
+  };
+}
+/**
+ * Layer 1c: Send a burst pre-alert webhook. Fire-and-forget; callers dedupe per
+ * name/window so a burst pings once, not once per version.
+ */
+async function sendBurstPreAlert(name, count, ecosystem = 'npm') {
+  const url = getWebhookUrl();
+  if (!url) return;
+  await sendWebhook(url, buildBurstPreAlertEmbed(name, count, ecosystem), { rawPayload: true });
+}
 /**
  * Check if a specific package@version matches a versioned IOC entry.
  * Returns the matching IOC entry or null.
@@ -1115,6 +1154,14 @@ function buildDailyReportEmbed(stats, dailyAlerts, ledgerRollup) {
  * @param {Map} downloadsCache - In-memory downloads cache (will be cleared)
  */
 async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCache) {
+  // Dead-zone guard (defense in depth): never send or stamp before the 08:00 Paris window.
+  // The scheduled gate (isDailyReportDue) already excludes 00:00–07:59, but an ungated /
+  // manual / test caller firing at e.g. 00:43 would otherwise write-ahead the NEW day's date
+  // (below) and suppress that day's real report. This makes the early stamp impossible.
+  if (getParisHour() < DAILY_REPORT_HOUR) {
+    console.log(`[MONITOR] Daily report suppressed: before ${DAILY_REPORT_HOUR}:00 Paris (hour=${getParisHour()})`);
+    return;
+  }
   // Crash-safe headline: a restart-storm around report time can zero the in-memory
   // counter (the monitor OOM-restarts ~10×/day). Floor scanned/clean/suspect at the
   // durable scan-stats delta so we never publish "5" when ~44k were really scanned.
@@ -1134,6 +1181,10 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
   // Persist the monotonic scan-stats counter as the baseline for the NEXT report's
   // delta. Written before the (now last) webhook so a mid-send kill can't double-count.
   saveLastDailyReportDate(today, captureScanStatsBaseline());
+  // Observability: the success path previously logged nothing, which made the late-fire bug
+  // invisible in the journal. Log the stamped date + the actual Paris hour (an on-time 08:00
+  // fire vs a catch-up at hour 14 are now distinguishable) + the headline count.
+  console.log(`[MONITOR] Daily report firing for ${today} (hour=${getParisHour()} Paris, scanned=${stats.scanned})`);
   // Phase 0b: compute the ledger rollup ONCE so the embed shows exactly the numbers
   // we persist (no double-scan, no drift between Discord and the on-disk metrics).
@@ -1328,16 +1379,23 @@ async function sendReportNow(stats) {
     return { sent: false, message: `Webhook failed: ${err.message}` };
   }
-  // Update lastDailyReportDate on disk
-  const today = getParisDateString();
-  const stateRaw = loadStateRaw();
-  const state = {
-    npmLastPackage: stateRaw.npmLastPackage || '',
-    pypiLastPackage: stateRaw.pypiLastPackage || ''
-  };
-  stats.lastDailyReportDate = today;
-  saveState(state, stats);
-  saveLastDailyReportDate(today);
+  // Update lastDailyReportDate on disk — but ONLY at/after 08:00 Paris. A manual report run
+  // before 08:00 is a deliberate operator override (we still SEND it), but it must NOT stamp
+  // today's date: hasReportBeenSentToday() keys off the Paris calendar date, so an early
+  // stamp would suppress that day's scheduled 08:00 report (the exact failure we're fixing).
+  if (getParisHour() >= DAILY_REPORT_HOUR) {
+    const today = getParisDateString();
+    const stateRaw = loadStateRaw();
+    const state = {
+      npmLastPackage: stateRaw.npmLastPackage || '',
+      pypiLastPackage: stateRaw.pypiLastPackage || ''
+    };
+    stats.lastDailyReportDate = today;
+    saveState(state, stats);
+    saveLastDailyReportDate(today);
+  } else {
+    console.log(`[MONITOR] Manual report sent; not stamping (before ${DAILY_REPORT_HOUR}:00 Paris — the scheduled report will still fire today)`);
+  }
   return { sent: true, message: 'Daily report sent' };
 }
@@ -1399,6 +1457,8 @@ module.exports = {
   sendIOCPreAlert,
   buildCampaignPreAlertEmbed,
   sendCampaignPreAlert,
+  buildBurstPreAlertEmbed,
+  sendBurstPreAlert,
   matchVersionedIOC,
   computeRiskLevel,
   computeRiskScore,

package/src/scanner/temporal-analysis.js CHANGED Viewed

@@ -121,6 +121,14 @@ function _fetchPackageMetadataHttp(packageName) {
         return;
       }
+      if (res.statusCode === 429) {
+        res.resume();
+        // Coordinated backoff on the shared registry limiter — the temporal scanners must
+        // signal 429 like the metadata path, not hammer through a rate limit (CLAUDE.md storm).
+        try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
+        reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}`));
+        return;
+      }
       if (res.statusCode < 200 || res.statusCode >= 300) {
         res.resume();
         reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}`));

package/src/scanner/temporal-ast-diff.js CHANGED Viewed

@@ -71,6 +71,11 @@ function _fetchVersionMetadataHttp(packageName, version) {
         res.resume();
         return reject(new Error(`Version ${version} not found for package ${packageName}`));
       }
+      if (res.statusCode === 429) {
+        res.resume();
+        try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
+        return reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}@${version}`));
+      }
       if (res.statusCode < 200 || res.statusCode >= 300) {
         res.resume();
         return reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}@${version}`));

package/.dockerignore DELETED Viewed

@@ -1,7 +0,0 @@
-node_modules
-.git
-datasets
-tests
-metrics
-.muaddib-cache
-*.md

package/.env.example DELETED Viewed

@@ -1,43 +0,0 @@
-# MUAD'DIB environment variables — template
-# Copy to .env (local dev) or /opt/muaddib/.env (VPS) and fill in real values.
-# .env files are gitignored. NEVER commit a real token.
-# ----------------------------------------------------------------------------
-# Threat-feed API tokens (all OPTIONAL — scrapers degrade gracefully if absent)
-# ----------------------------------------------------------------------------
-# OpenSourceMalware.com — community-verified threat intel
-# Free tier: 60 req/min, /query-latest gives 100 most recent threats per ecosystem.
-# Sign up + generate at: https://opensourcemalware.com/auth → profile → API Tokens
-# Format: osm_<random-32+chars>
-# Used by: src/ioc/scraper.js → scrapeOSMQueryLatest()
-OSM_API_TOKEN=
-# ----------------------------------------------------------------------------
-# Webhook destinations (optional — monitor alerts)
-# ----------------------------------------------------------------------------
-# Discord webhook for monitor alerts (P1/P2/P3 triage)
-# DISCORD_WEBHOOK_URL=
-# ----------------------------------------------------------------------------
-# FPR plan gates — DEFAULT ON since v2.11.9 (no need to set these unless opting OUT)
-# ----------------------------------------------------------------------------
-# Measured impact on the v2.11.4 evaluation corpus (1054 packages):
-#   FPR curated 15.6% -> 9.36% (-6.24 pp), FPR random 7.0% -> 2.0% (-5.00 pp).
-#   TPR@3 / TPR@20 / ADR strictly unchanged.
-#
-# Opt-OUT individual gates (uncomment + set to 0):
-# MUADDIB_FN_REACHABILITY=0   # function-level reachability gating
-# MUADDIB_DECAY=0             # group score decay on bundled outputs
-# MUADDIB_MATURE_CAP=0        # cap mature, well-trafficked packages at MEDIUM
-# MUADDIB_METADATA_FACTOR=0   # registry signals -> reputation multiplier
-# MUADDIB_DELTA_MODE=0        # delta scoring against prior versions
-#
-# Skip ALL network fetches (npm registry packument + GitHub Releases IOC
-# bootstrap) in one shot. Disables MATURE_CAP + METADATA_FACTOR + DELTA_MODE
-# at the per-scan level AND the first-run IOC database download. Useful for:
-#   - air-gap / offline CI environments
-#   - test runners (set automatically by tests/run-tests.js)
-#   - perf-critical batch scans where you've pre-warmed the IOC cache
-# MUADDIB_NO_REGISTRY_FETCH=1