npm - muaddib-scanner - Versions diffs - 2.11.38 → 2.11.39 - Mend

muaddib-scanner 2.11.38 → 2.11.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/package.json +1 -1
package/{self-scan-v2.11.38.json → self-scan-v2.11.39.json} +1 -1
package/src/ml/jsonl-writer.js +64 -6
package/src/monitor/classify.js +8 -2
package/src/monitor/ingestion.js +29 -9
package/src/monitor/queue.js +29 -4
package/src/monitor/state.js +22 -0
package/src/monitor/webhook.js +29 -4
package/src/shared/download.js +97 -6

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "muaddib-scanner",
-  "version": "2.11.38",
+  "version": "2.11.39",
   "description": "Supply-chain threat detection & response for npm & PyPI/Python",
   "main": "src/index.js",
   "bin": {

package/{self-scan-v2.11.38.json → self-scan-v2.11.39.json} RENAMED Viewed

@@ -1,6 +1,6 @@
 {
   "target": "node_modules",
-  "timestamp": "2026-05-24T22:20:18.999Z",
+  "timestamp": "2026-05-25T08:33:11.787Z",
   "threats": [
     {
       "type": "string_mutation_obfuscation",

package/src/ml/jsonl-writer.js CHANGED Viewed

@@ -18,12 +18,19 @@ const DEFAULT_TRAINING_FILE = path.join(__dirname, '..', '..', 'data', 'ml-train
 let TRAINING_FILE = DEFAULT_TRAINING_FILE;
 const MAX_JSONL_SIZE = 100 * 1024 * 1024; // 100MB rotation threshold
+// In-memory line counter. null = needs recompute (cold boot, file rewrite, or
+// path swap). Maintained incrementally by appendRecord, reset to 0 by
+// maybeRotate, and invalidated by relabelRecords, setTrainingFile and
+// resetTrainingFile. Prior to this cache, getStats read the entire JSONL
+// into RAM on every daily report (a 72MB allocation for ~30K records).
+let _cachedLineCount = null;
 /**
  * Override the training file path (for testing).
  * @param {string} filePath - new file path
  */
 function setTrainingFile(filePath) {
   TRAINING_FILE = filePath;
+  _cachedLineCount = null; // different file → recompute on next getStats
 }
 /**
@@ -31,6 +38,7 @@ function setTrainingFile(filePath) {
  */
 function resetTrainingFile() {
   TRAINING_FILE = DEFAULT_TRAINING_FILE;
+  _cachedLineCount = null;
 }
 /**
@@ -49,6 +57,7 @@ function appendRecord(record) {
     const line = JSON.stringify(record) + '\n';
     fs.appendFileSync(TRAINING_FILE, line, 'utf8');
+    if (_cachedLineCount !== null) _cachedLineCount++;
   } catch (err) {
     // Non-fatal: JSONL export failure should never crash the monitor
     // Log permission errors so they are visible in journalctl (was silent before v2.10.27)
@@ -73,6 +82,7 @@ function maybeRotate() {
     const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
     const rotatedName = TRAINING_FILE.replace('.jsonl', `-${timestamp}.jsonl`);
     fs.renameSync(TRAINING_FILE, rotatedName);
+    _cachedLineCount = 0; // fresh file starts empty
     console.log(`[ML] Rotated training file → ${path.basename(rotatedName)} (${(stat.size / 1024 / 1024).toFixed(1)}MB)`);
   } catch (err) {
     console.error(`[ML] Rotation failed: ${err.message}`);
@@ -107,25 +117,71 @@ function readRecords() {
 }
 /**
- * Get stats about the current JSONL file.
+ * Stream-count records in a file using 64KB chunks. Counts non-empty
+ * logical records: each line (terminated by `\n`, or a final unterminated
+ * line) that contains at least one byte other than space, tab, or CR.
+ * Approximates the old split('\n') + trim() count while avoiding the
+ * full-file readFileSync. Returns 0 if the file cannot be opened.
+ *
+ * @param {string} filePath
+ * @returns {number}
+ */
+function countLinesStreaming(filePath) {
+  const BUFFER_SIZE = 64 * 1024;
+  let fd;
+  try {
+    fd = fs.openSync(filePath, 'r');
+  } catch {
+    return 0;
+  }
+  try {
+    const buf = Buffer.alloc(BUFFER_SIZE);
+    let count = 0;
+    let sawContent = false;
+    let bytesRead;
+    while ((bytesRead = fs.readSync(fd, buf, 0, BUFFER_SIZE, null)) > 0) {
+      for (let i = 0; i < bytesRead; i++) {
+        const b = buf[i];
+        if (b === 0x0A) {            // '\n'
+          if (sawContent) count++;
+          sawContent = false;
+        } else if (b !== 0x20 && b !== 0x09 && b !== 0x0D) {
+          // any non-whitespace byte (space, tab, CR are still whitespace)
+          sawContent = true;
+        }
+      }
+    }
+    if (sawContent) count++; // trailing record without final newline
+    return count;
+  } finally {
+    try { fs.closeSync(fd); } catch {}
+  }
+}
+/**
+ * Get stats about the current JSONL file. Uses an in-memory line counter
+ * that is maintained incrementally by appendRecord and invalidated by
+ * rewrite operations — so getStats is O(1) on the hot path of the daily
+ * report (previously O(file size) via readFileSync on a 72MB+ file).
+ *
  * @returns {{ recordCount: number, fileSizeBytes: number, fileSizeMB: string }}
  */
 function getStats() {
   try {
     if (!fs.existsSync(TRAINING_FILE)) {
+      _cachedLineCount = 0;
       return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
     }
     const stat = fs.statSync(TRAINING_FILE);
-    // Count lines without reading the entire file into memory
-    const content = fs.readFileSync(TRAINING_FILE, 'utf8');
-    const lineCount = content.split('\n').filter(l => l.trim()).length;
+    if (_cachedLineCount === null) {
+      _cachedLineCount = countLinesStreaming(TRAINING_FILE);
+    }
     return {
-      recordCount: lineCount,
+      recordCount: _cachedLineCount,
       fileSizeBytes: stat.size,
       fileSizeMB: (stat.size / 1024 / 1024).toFixed(1)
     };
   } catch {
-    return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
+    return { recordCount: _cachedLineCount || 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
   }
 }
@@ -183,6 +239,8 @@ function relabelRecords(packageName, newLabel, sandboxFindingCount, manualReview
     if (updated > 0) {
       fs.writeFileSync(TRAINING_FILE, newLines.join('\n'), 'utf8');
+      // File was rewritten — line count cache must be recomputed on next read.
+      _cachedLineCount = null;
       console.log(`[ML] Relabeled ${updated} records for ${packageName} → ${newLabel}`);
     }
     return updated;

package/src/monitor/classify.js CHANGED Viewed

@@ -225,11 +225,15 @@ function isSuspectClassification(result) {
 /**
  * Classify an error into a category for the daily report breakdown.
  * @param {Error} err
- * @returns {'too_large'|'tar_failed'|'http_error'|'static_timeout'|'timeout'|'other'}
+ * @returns {'too_large'|'tar_failed'|'archive_failed'|'unsupported_format'|'http_error'|'static_timeout'|'timeout'|'other'}
  */
 function classifyError(err) {
   const msg = (err && err.message) || '';
-  if (/too large|tarball too large/i.test(msg)) return 'too_large';
+  if (/too large|tarball too large|exceeds \d+/i.test(msg)) return 'too_large';
+  // Wheel/zip extraction failures must NOT be lumped with tar failures —
+  // they were the dominant noise before adm-zip dispatch.
+  if (/unsupported archive format/i.test(msg)) return 'unsupported_format';
+  if (/zip[\s_-]|wheel|whl\b/i.test(msg)) return 'archive_failed';
   if (/tar\b|extract/i.test(msg)) return 'tar_failed';
   if (/HTTP [45]\d\d|HTTP \d{3}/i.test(msg)) return 'http_error';
   if (/static scan timeout/i.test(msg)) return 'static_timeout';
@@ -257,6 +261,8 @@ function formatErrorBreakdown(total, byType) {
   const parts = [];
   if (byType.http_error > 0) parts.push(`HTTP: ${byType.http_error}`);
   if (byType.tar_failed > 0) parts.push(`tar: ${byType.tar_failed}`);
+  if (byType.archive_failed > 0) parts.push(`zip: ${byType.archive_failed}`);
+  if (byType.unsupported_format > 0) parts.push(`unsupported: ${byType.unsupported_format}`);
   if (byType.too_large > 0) parts.push(`too large: ${byType.too_large}`);
   if (byType.timeout > 0) parts.push(`timeout: ${byType.timeout}`);
   if (byType.static_timeout > 0) parts.push(`static: ${byType.static_timeout}`);

package/src/monitor/ingestion.js CHANGED Viewed

@@ -46,7 +46,8 @@ let consecutivePollErrors = 0;
 // `ingestion._deps.httpsPost = fakePost` and have it take effect inside
 // pollPyPIChangelog. Kept tiny on purpose — only network I/O lives here.
 const _deps = {
-  httpsPost: null // populated below once httpsPost is defined
+  httpsPost: null, // populated below once httpsPost is defined
+  httpsGet: null   // populated below; used by npm pollers so tests can stub
 };
 function getConsecutivePollErrors() {
@@ -131,6 +132,7 @@ function httpsPost(url, body, headers = {}, timeoutMs = 30_000) {
 }
 _deps.httpsPost = httpsPost;
+_deps.httpsGet = httpsGet;
 async function getWeeklyDownloads(packageName) {
   const cached = downloadsCache.get(packageName);
@@ -162,7 +164,7 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
   const url = packageVersion
     ? `https://pypi.org/pypi/${encodeURIComponent(packageName)}/${encodeURIComponent(packageVersion)}/json`
     : `https://pypi.org/pypi/${encodeURIComponent(packageName)}/json`;
-  const body = await httpsGet(url);
+  const body = await _deps.httpsGet(url);
   let data;
   try {
     data = JSON.parse(body);
@@ -177,8 +179,11 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
   // Fallback: any .tar.gz
   const tarGz = urls.find(u => u.url && u.url.endsWith('.tar.gz'));
   if (tarGz) return { url: tarGz.url, version };
-  // Fallback: first available file
-  if (urls.length > 0 && urls[0].url) return { url: urls[0].url, version };
+  // Fallback: wheel (.whl) or .zip — extracted via adm-zip (extractArchive
+  // in shared/download.js, called from queue.js), not tar.
+  // Legacy .egg / .tar.bz2 / .exe installers intentionally NOT returned —
+  // they were the cause of ~2773 tar_failed/day before this fix.
+  const wheel = urls.find(u => u.url && (u.url.endsWith('.whl') || u.url.endsWith('.zip')));
+  if (wheel) return { url: wheel.url, version };
   return { url: null, version };
 }
@@ -405,7 +410,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
     // First run: initialize to current seq ("now") via root endpoint
     if (lastSeq == null) {
-      const infoBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
+      const infoBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
       const info = JSON.parse(infoBody);
       const currentSeq = info.update_seq;
       if (currentSeq == null) {
@@ -423,13 +428,13 @@ async function pollNpmChanges(state, scanQueue, stats) {
     const url = `${CHANGES_STREAM_URL}?since=${lastSeq}&limit=${CHANGES_LIMIT}`;
     let body, data;
     try {
-      body = await httpsGet(url, 60000);
+      body = await _deps.httpsGet(url, 60000);
       data = JSON.parse(body);
     } catch (fetchErr) {
       // Invalid seq (stale from pre-migration CouchDB) or transient error — re-init to current seq
       console.warn(`[MONITOR] Changes stream fetch failed (${fetchErr.message}) — attempting seq re-init`);
       try {
-        const reinitBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
+        const reinitBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
         const reinitData = JSON.parse(reinitBody);
         if (reinitData.update_seq != null) {
           state.npmLastSeq = reinitData.update_seq;
@@ -450,7 +455,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
     // Catch-up protection: if too far behind, skip to current
     if (data.results.length === CHANGES_LIMIT) {
-      const currentSeqBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
+      const currentSeqBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
       const currentSeqData = JSON.parse(currentSeqBody);
       const currentSeq = currentSeqData.update_seq;
       if (typeof currentSeq === 'number' && typeof data.last_seq === 'number' &&
@@ -459,12 +464,22 @@ async function pollNpmChanges(state, scanQueue, stats) {
         console.warn(`[MONITOR] Changes stream too far behind (${gap} changes) — skipping to current`);
         stats.npmCatchupSkips = (stats.npmCatchupSkips || 0) + 1;
         stats.npmCatchupSkippedSeqs = (stats.npmCatchupSkippedSeqs || 0) + gap;
+        // Catch-up gap = events we know happened but chose to skip. They must
+        // appear in the coverage denominator so the daily report exposes the
+        // gap as low coverage (and the catch-up line explains why).
+        stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + gap;
         state.npmLastSeq = currentSeq;
         saveNpmSeq(currentSeq);
         return 0;
       }
     }
+    // IMPORTANT: count raw events BEFORE filtering — otherwise the coverage
+    // denominator is biased (matches "events we queued", not "events npm
+    // emitted"). The filters below drop _design/self/@types/deleted, but
+    // those were still real changes-stream events.
+    stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + data.results.length;
     let queued = 0;
     for (const change of data.results) {
       // Skip deleted packages
@@ -584,7 +599,7 @@ async function pollNpmRss(state, scanQueue, stats) {
     await acquireRegistrySlot();
     let body;
     try {
-      body = await httpsGet(url);
+      body = await _deps.httpsGet(url);
     } finally {
       releaseRegistrySlot();
     }
@@ -603,6 +618,11 @@ async function pollNpmRss(state, scanQueue, stats) {
       }
     }
+    // Mirror pollNpmChanges: count raw events BEFORE per-package filters
+    // so the coverage denominator stays accurate when the changes stream
+    // falls back to RSS.
+    stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + newPackages.length;
     for (const name of newPackages) {
       if (name === SELF_PACKAGE_NAME) {
         console.log(`[MONITOR] SKIPPED (self): ${name}`);

package/src/monitor/queue.js CHANGED Viewed

@@ -13,7 +13,7 @@ const { Worker } = require('worker_threads');
 const { run } = require('../index.js');
 const { runSandbox, isDockerAvailable, tryAcquireSandboxSlot, SANDBOX_CONCURRENCY_MAX } = require('../sandbox/index.js');
 const { sendWebhook } = require('../webhook.js');
-const { downloadToFile, extractTarGz, sanitizePackageName } = require('../shared/download.js');
+const { downloadToFile, extractTarGz, extractArchive, sanitizePackageName } = require('../shared/download.js');
 const { MAX_TARBALL_SIZE } = require('../shared/constants.js');
 const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
 const { loadCachedIOCs } = require('../ioc/updater.js');
@@ -294,10 +294,22 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
     if (metaSize > MAX_TARBALL_SIZE) {
       console.log(`[MONITOR] SIZE_REJECT: ${name}@${version} — metadata size ${(metaSize / 1024 / 1024).toFixed(1)}MB exceeds ${(MAX_TARBALL_SIZE / 1024 / 1024).toFixed(0)}MB limit (skipped without download)`);
       stats.scanned++;
+      stats.totalTimeMs += Date.now() - startTime;
       return;
     }
-    const tgzPath = path.join(tmpDir, 'package.tar.gz');
+    // Pick the local filename extension from the URL because extractArchive
+    // dispatches on the on-disk extension (not on magic bytes). URLs ending
+    // in .whl or .zip are saved as package.whl (adm-zip); npm .tgz tarballs
+    // and .tar.gz sdists are saved as package.tar.gz (tar). Anything else
+    // also falls through to .tar.gz (ingestion now returns null for
+    // unsupported types, so this branch is a defensive default rather than
+    // a real fallback).
+    const urlLower = (tarballUrl || '').toLowerCase();
+    const isWheel = urlLower.endsWith('.whl') || urlLower.endsWith('.zip');
+    const archiveExt = isWheel ? '.whl' : '.tar.gz';
+    const tgzPath = path.join(tmpDir, `package${archiveExt}`);
+    if (isWheel && ecosystem === 'pypi') {
+      stats.pypiWheelsScanned = (stats.pypiWheelsScanned || 0) + 1;
+    }
     // Layer 3: Check tarball cache before downloading
     const cacheKey = tarballCacheKey(name, version);
@@ -338,6 +350,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
     if (fileSize > MAX_TARBALL_SIZE) {
       console.log(`[MONITOR] SKIP: ${name}@${version} — tarball too large (${(fileSize / 1024 / 1024).toFixed(1)}MB)`);
       stats.scanned++;
+      stats.totalTimeMs += Date.now() - startTime;
       return;
     }
@@ -365,7 +378,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
         let bypassQuickScan = false;
         try {
           alreadyExtracted = true;
-          extractedDir = extractTarGz(tgzPath, tmpDir);
+          extractedDir = extractArchive(tgzPath, tmpDir);
           const [pkgThreats, shellThreats] = await Promise.all([
             scanPackageJson(extractedDir),
@@ -382,6 +395,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
           } else {
             console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, quick scan clean)`);
             stats.scanned++;
+            stats.totalTimeMs += Date.now() - startTime;
             stats.clean++;
             updateScanStats('clean');
             return;
@@ -402,6 +416,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
           } else {
             console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, extract failed)`);
             stats.scanned++;
+            stats.totalTimeMs += Date.now() - startTime;
             stats.clean++;
             updateScanStats('clean');
             return;
@@ -411,7 +426,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
     }
     if (!extractedDir) {
-      extractedDir = extractTarGz(tgzPath, tmpDir);
+      extractedDir = extractArchive(tgzPath, tmpDir);
     }
     // ML Phase 2a: Count JS files and detect test presence for enriched features
@@ -1169,6 +1184,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
     try {
       const pypiInfo = await getPyPITarballUrl(item.name, item.version || '');
       if (!pypiInfo.url) {
+        // No sdist / .tar.gz / wheel — likely a legacy egg or msi-only
+        // release. Clean skip: do NOT touch stats.scanned or stats.errors
+        // (those would distort the Commit 1 coverage ratios). The dedicated
+        // pypiSkippedNoArchive counter surfaces volume in the daily report.
+        stats.pypiSkippedNoArchive = (stats.pypiSkippedNoArchive || 0) + 1;
         console.log(`[MONITOR] SKIP: ${item.name} — no tarball URL found on PyPI`);
         return;
       }
@@ -1205,6 +1225,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
     return;
   }
   recentlyScanned.add(dedupeKey);
+  // Coverage numerator: one count per unique (ecosystem, name, version) that
+  // reaches a scan attempt. Excludes ATO burst extras that lose the dedup
+  // race, retries, size-cap rejections — those inflate stats.scanned but
+  // would distort the "% of publishes we covered" reading.
+  stats.uniqueScanAttempts = (stats.uniqueScanAttempts || 0) + 1;
   // Abort check: if timeout fired during URL resolution or dedup, bail out
   if (signal && signal.aborted) return;

package/src/monitor/state.js CHANGED Viewed

@@ -991,6 +991,8 @@ function loadDailyStats(stats, dailyAlerts) {
       if (data.errorsByType) {
         stats.errorsByType.too_large = data.errorsByType.too_large || 0;
         stats.errorsByType.tar_failed = data.errorsByType.tar_failed || 0;
+        stats.errorsByType.archive_failed = data.errorsByType.archive_failed || 0;
+        stats.errorsByType.unsupported_format = data.errorsByType.unsupported_format || 0;
         stats.errorsByType.http_error = data.errorsByType.http_error || 0;
         stats.errorsByType.timeout = data.errorsByType.timeout || 0;
         stats.errorsByType.static_timeout = data.errorsByType.static_timeout || 0;
@@ -1001,6 +1003,16 @@ function loadDailyStats(stats, dailyAlerts) {
       stats.llmAnalyzed = data.llmAnalyzed || 0;
       stats.llmSuppressed = data.llmSuppressed || 0;
       stats.changesStreamPackages = data.changesStreamPackages || 0;
+      stats.uniqueScanAttempts = data.uniqueScanAttempts || 0;
+      stats.npmPublishEventsSeen = data.npmPublishEventsSeen || 0;
+      stats.pypiChangelogPackages = data.pypiChangelogPackages || 0;
+      stats.pypiChangelogEvents = data.pypiChangelogEvents || 0;
+      stats.npmCatchupSkippedSeqs = data.npmCatchupSkippedSeqs || 0;
+      stats.npmCatchupSkips = data.npmCatchupSkips || 0;
+      stats.pypiCatchupSkippedEvents = data.pypiCatchupSkippedEvents || 0;
+      stats.pypiCatchupSkips = data.pypiCatchupSkips || 0;
+      stats.pypiWheelsScanned = data.pypiWheelsScanned || 0;
+      stats.pypiSkippedNoArchive = data.pypiSkippedNoArchive || 0;
       if (Array.isArray(data.dailyAlerts)) {
         const restored = data.dailyAlerts.slice(-MAX_DAILY_ALERTS);
         dailyAlerts.length = 0;
@@ -1029,6 +1041,16 @@ function saveDailyStats(stats, dailyAlerts) {
       llmAnalyzed: stats.llmAnalyzed || 0,
       llmSuppressed: stats.llmSuppressed || 0,
       changesStreamPackages: stats.changesStreamPackages || 0,
+      uniqueScanAttempts: stats.uniqueScanAttempts || 0,
+      npmPublishEventsSeen: stats.npmPublishEventsSeen || 0,
+      pypiChangelogPackages: stats.pypiChangelogPackages || 0,
+      pypiChangelogEvents: stats.pypiChangelogEvents || 0,
+      npmCatchupSkippedSeqs: stats.npmCatchupSkippedSeqs || 0,
+      npmCatchupSkips: stats.npmCatchupSkips || 0,
+      pypiCatchupSkippedEvents: stats.pypiCatchupSkippedEvents || 0,
+      pypiCatchupSkips: stats.pypiCatchupSkips || 0,
+      pypiWheelsScanned: stats.pypiWheelsScanned || 0,
+      pypiSkippedNoArchive: stats.pypiSkippedNoArchive || 0,
       dailyAlerts: dailyAlerts.slice()
     };
     atomicWriteFileSync(DAILY_STATS_FILE, JSON.stringify(data, null, 2));

package/src/monitor/webhook.js CHANGED Viewed

@@ -855,11 +855,24 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
   const avg = stats.scanned > 0 ? (stats.totalTimeMs / stats.scanned / 1000).toFixed(1) : '0.0';
   // --- Coverage estimation ---
-  // changesStreamPackages = total versions seen from npm changes stream (≈ published today)
-  const published = stats.changesStreamPackages || 0;
+  // Numerator: unique (ecosystem, name, version) tuples that reached a scan
+  // attempt (post-dedup). Denominator: raw publish events seen on either
+  // changes stream BEFORE per-package filtering, plus npm catch-up gaps and
+  // PyPI publish events that survived per-(name,version) dedup. This stays
+  // bounded near 100% — old "scanned/changesStreamPackages" was racing PyPI
+  // scans and ATO burst extras against an npm-only denominator.
+  const attempted = stats.uniqueScanAttempts || 0;
+  const npmPub = stats.npmPublishEventsSeen || 0;
+  const pypiPub = stats.pypiChangelogPackages || 0;
+  const published = npmPub + pypiPub;
+  const coverageRatio = published > 0 ? (attempted / published * 100).toFixed(0) : '0';
+  const catchupSkipped = (stats.npmCatchupSkippedSeqs || 0) + (stats.pypiCatchupSkippedEvents || 0);
+  const opsSuffix = catchupSkipped > 0
+    ? `\nOps: ${stats.scanned} | Catch-up skip: ${catchupSkipped}`
+    : `\nOps: ${stats.scanned}`;
   const coverageText = published > 0
-    ? `${stats.scanned}/${published} (${(stats.scanned / published * 100).toFixed(0)}%)`
-    : `${stats.scanned} scanned`;
+    ? `${attempted}/${published} (${coverageRatio}%)${opsSuffix}`
+    : `${attempted} attempted${opsSuffix}`;
   // --- Timeouts ---
   const staticTimeouts = (stats.errorsByType && stats.errorsByType.static_timeout) || 0;
@@ -1019,6 +1032,8 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
   stats.errors = 0;
   stats.errorsByType.too_large = 0;
   stats.errorsByType.tar_failed = 0;
+  stats.errorsByType.archive_failed = 0;
+  stats.errorsByType.unsupported_format = 0;
   stats.errorsByType.http_error = 0;
   stats.errorsByType.timeout = 0;
   stats.errorsByType.static_timeout = 0;
@@ -1033,6 +1048,16 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
   // Reset LLM detective internal stats
   try { require('../ml/llm-detective.js').resetStats(); } catch {}
   stats.changesStreamPackages = 0;
+  stats.uniqueScanAttempts = 0;
+  stats.npmPublishEventsSeen = 0;
+  stats.pypiChangelogPackages = 0;
+  stats.pypiChangelogEvents = 0;
+  stats.npmCatchupSkippedSeqs = 0;
+  stats.npmCatchupSkips = 0;
+  stats.pypiCatchupSkippedEvents = 0;
+  stats.pypiCatchupSkips = 0;
+  stats.pypiWheelsScanned = 0;
+  stats.pypiSkippedNoArchive = 0;
   stats.rssFallbackCount = 0;
   dailyAlerts.length = 0;
   recentlyScanned.clear();

package/src/shared/download.js CHANGED Viewed

@@ -2,6 +2,7 @@ const https = require('https');
 const fs = require('fs');
 const path = require('path');
 const { execFileSync } = require('child_process');
+const AdmZip = require('adm-zip');
 const { MAX_TARBALL_SIZE, DOWNLOAD_TIMEOUT } = require('./constants.js');
 // Allowed redirect domains for tarball downloads (SSRF protection)
@@ -221,13 +222,30 @@ function downloadToFile(url, destPath, timeoutMs = DOWNLOAD_TIMEOUT) {
 }
 /**
- * Extract a .tar.gz to a directory. Returns the package root.
- * Uses execFileSync (no shell) to prevent command injection.
- * @param {string} tgzPath - Path to the .tar.gz file
- * @param {string} destDir - Destination directory
- * @returns {string} Path to extracted package root
+ * Detect archive format from a path/URL extension.
+ * URL-derived names are reliable enough here: PyPI's `urls[].packagetype`
+ * + filename are authoritative, npm tarballs are always `.tgz`. Returns
+ * 'targz', 'zip', or 'unknown'. Callers either pass an `options.format`
+ * override or trust this detection.
+ *
+ * @param {string} archivePath - Path or URL ending in the archive filename
+ * @returns {'targz'|'zip'|'unknown'}
  */
-function extractTarGz(tgzPath, destDir) {
+function detectArchiveFormat(archivePath) {
+  if (typeof archivePath !== 'string') return 'unknown';
+  const lower = archivePath.toLowerCase();
+  if (lower.endsWith('.tar.gz') || lower.endsWith('.tgz')) return 'targz';
+  if (lower.endsWith('.whl') || lower.endsWith('.zip')) return 'zip';
+  return 'unknown';
+}
+/**
+ * Extract a tar.gz tarball with the system `tar` binary. Used for npm
+ * tarballs and PyPI sdists. Internal implementation — call extractArchive
+ * for new code; extractTarGz remains as a thin wrapper for the existing
+ * scanner/temporal-ast-diff.js callsite.
+ */
+function _extractTarGzImpl(tgzPath, destDir) {
   // Use cwd + relative paths so C: never appears in tar arguments
   // (GNU tar treats C: as remote host, bsdtar doesn't support --force-local)
   const tgzDir = path.dirname(path.resolve(tgzPath));
@@ -258,6 +276,77 @@ function extractTarGz(tgzPath, destDir) {
   return destDir;
 }
+/**
+ * Extract a ZIP archive (PyPI wheels, generic zips) to a directory.
+ * adm-zip is already a runtime dependency (used by src/ioc/scraper.js).
+ *
+ * Two hardening layers before extraction touches disk:
+ *  1. zip-slip: resolve each entry path against destDir and reject anything
+ *     that escapes. path.resolve normalizes ../, mixed separators, and
+ *     absolute paths in a single pass.
+ *  2. size cap: sum of the uncompressed entry sizes declared in the entry
+ *     headers must not exceed MAX_TARBALL_SIZE — defends against zip bombs
+ *     that pass tarball size checks but expand into multi-GB on disk.
+ */
+function _extractZipImpl(zipPath, destDir) {
+  const zip = new AdmZip(zipPath);
+  const entries = zip.getEntries();
+  const resolvedDest = path.resolve(destDir);
+  let totalUncompressed = 0;
+  for (const entry of entries) {
+    totalUncompressed += (entry.header && entry.header.size) || 0;
+    if (totalUncompressed > MAX_TARBALL_SIZE) {
+      throw new Error(
+        `Zip extract refused: total uncompressed size ${totalUncompressed} exceeds ${MAX_TARBALL_SIZE}`
+      );
+    }
+    const target = path.resolve(destDir, entry.entryName);
+    if (target !== resolvedDest && !target.startsWith(resolvedDest + path.sep)) {
+      throw new Error(`Unsafe zip entry escapes destDir: ${entry.entryName}`);
+    }
+  }
+  zip.extractAllTo(destDir, /* overwrite */ true);
+  // Wheels carry a flat layout (no leading `package/`); collapse into the
+  // single top-level dir if there is exactly one (matches sdist behavior so
+  // the scanner pipeline can treat the result uniformly).
+  try {
+    const top = fs.readdirSync(destDir);
+    if (top.length === 1) {
+      const single = path.join(destDir, top[0]);
+      const stat = fs.lstatSync(single);
+      if (!stat.isSymbolicLink() && stat.isDirectory()) return single;
+    }
+  } catch { /* ignore — fall back to destDir */ }
+  return destDir;
+}
+/**
+ * Extract an archive to a directory, dispatching on file extension.
+ * Supports `.tar.gz` / `.tgz` (tar) and `.whl` / `.zip` (adm-zip).
+ *
+ * @param {string} archivePath - Path to the archive on disk
+ * @param {string} destDir - Destination directory (must exist)
+ * @param {Object} [options]
+ * @param {'targz'|'zip'} [options.format] - override auto-detection
+ * @returns {string} Path to extracted package root
+ * @throws {Error} when the format is unknown or extraction fails
+ */
+function extractArchive(archivePath, destDir, options = {}) {
+  const format = options.format || detectArchiveFormat(archivePath);
+  if (format === 'targz') return _extractTarGzImpl(archivePath, destDir);
+  if (format === 'zip') return _extractZipImpl(archivePath, destDir);
+  throw new Error(`Unsupported archive format for ${path.basename(archivePath)}`);
+}
+/**
+ * Backwards-compatible wrapper for the original tar.gz-only extractor.
+ * Kept because src/scanner/temporal-ast-diff.js and existing tests still
+ * import it by name. New code should call extractArchive instead.
+ */
+function extractTarGz(tgzPath, destDir) {
+  return _extractTarGzImpl(tgzPath, destDir);
+}
 /**
  * Sanitize a package name for use in temporary directory names.
  * Removes path traversal sequences, slashes, and @ symbols.
@@ -277,6 +366,8 @@ function sanitizePackageName(packageName) {
 module.exports = {
   downloadToFile,
   extractTarGz,
+  extractArchive,
+  detectArchiveFormat,
   sanitizePackageName,
   isAllowedDownloadRedirect,
   normalizeHostname,