muaddib-scanner 2.11.38 → 2.11.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.38",
3
+ "version": "2.11.39",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "target": "node_modules",
3
- "timestamp": "2026-05-24T22:20:18.999Z",
3
+ "timestamp": "2026-05-25T08:33:11.787Z",
4
4
  "threats": [
5
5
  {
6
6
  "type": "string_mutation_obfuscation",
@@ -18,12 +18,19 @@ const DEFAULT_TRAINING_FILE = path.join(__dirname, '..', '..', 'data', 'ml-train
18
18
  let TRAINING_FILE = DEFAULT_TRAINING_FILE;
19
19
  const MAX_JSONL_SIZE = 100 * 1024 * 1024; // 100MB rotation threshold
20
20
 
21
+ // In-memory line counter. null = needs recompute (cold boot, file rewrite, or
22
+ // path swap). Maintained incrementally by appendRecord and invalidated by
23
+ // relabelRecords and setTrainingFile. Prior to this cache, getStats read the
24
+ // entire JSONL into RAM on every daily report (72MB allocation × ~30K records).
25
+ let _cachedLineCount = null;
26
+
21
27
  /**
22
28
  * Override the training file path (for testing).
23
29
  * @param {string} filePath - new file path
24
30
  */
25
31
  function setTrainingFile(filePath) {
26
32
  TRAINING_FILE = filePath;
33
+ _cachedLineCount = null; // different file → recompute on next getStats
27
34
  }
28
35
 
29
36
  /**
@@ -31,6 +38,7 @@ function setTrainingFile(filePath) {
31
38
  */
32
39
  function resetTrainingFile() {
33
40
  TRAINING_FILE = DEFAULT_TRAINING_FILE;
41
+ _cachedLineCount = null;
34
42
  }
35
43
 
36
44
  /**
@@ -49,6 +57,7 @@ function appendRecord(record) {
49
57
 
50
58
  const line = JSON.stringify(record) + '\n';
51
59
  fs.appendFileSync(TRAINING_FILE, line, 'utf8');
60
+ if (_cachedLineCount !== null) _cachedLineCount++;
52
61
  } catch (err) {
53
62
  // Non-fatal: JSONL export failure should never crash the monitor
54
63
  // Log permission errors so they are visible in journalctl (was silent before v2.10.27)
@@ -73,6 +82,7 @@ function maybeRotate() {
73
82
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
74
83
  const rotatedName = TRAINING_FILE.replace('.jsonl', `-${timestamp}.jsonl`);
75
84
  fs.renameSync(TRAINING_FILE, rotatedName);
85
+ _cachedLineCount = 0; // fresh file starts empty
76
86
  console.log(`[ML] Rotated training file → ${path.basename(rotatedName)} (${(stat.size / 1024 / 1024).toFixed(1)}MB)`);
77
87
  } catch (err) {
78
88
  console.error(`[ML] Rotation failed: ${err.message}`);
@@ -107,25 +117,71 @@ function readRecords() {
107
117
  }
108
118
 
109
119
  /**
110
- * Get stats about the current JSONL file.
120
+ * Stream-count newlines in a file using 64KB chunks. Counts non-empty
121
+ * logical records: each line — `\n`-terminated, or an unterminated final
122
+ * line — with at least one non-whitespace byte. Matches the old split-based count
123
+ * while avoiding the full-file readFileSync.
124
+ *
125
+ * @param {string} filePath
126
+ * @returns {number}
127
+ */
128
function countLinesStreaming(filePath) {
  // Counts records (lines holding at least one non-whitespace byte) without
  // loading the whole file: one 64KB buffer is reused for every read.
  const BUFFER_SIZE = 64 * 1024;
  let fd;
  try {
    fd = fs.openSync(filePath, 'r');
  } catch {
    // A file that cannot be opened is reported as "no records", not an error.
    return 0;
  }
  try {
    const buf = Buffer.alloc(BUFFER_SIZE);
    let count = 0;
    // True once the current line has shown a non-whitespace byte. Declared
    // outside the read loop so a line spanning two chunks is counted once.
    let sawContent = false;
    let bytesRead;
    // position = null → read from the descriptor's current offset and advance it.
    while ((bytesRead = fs.readSync(fd, buf, 0, BUFFER_SIZE, null)) > 0) {
      for (let i = 0; i < bytesRead; i++) {
        const b = buf[i];
        if (b === 0x0A) { // '\n' closes the current line
          if (sawContent) count++;
          sawContent = false;
        } else if (b !== 0x20 && (b < 0x09 || b > 0x0D)) {
          // Anything except space and the 0x09–0x0D control range (tab, VT,
          // FF, CR) is content. VT and FF are included so the result agrees
          // with the previous `line.trim()` filter for single-byte
          // whitespace. Multi-byte Unicode whitespace (NBSP, BOM, …) is
          // still treated as content.
          sawContent = true;
        }
      }
    }
    if (sawContent) count++; // trailing record without final newline
    return count;
  } finally {
    try { fs.closeSync(fd); } catch { /* best-effort close */ }
  }
}
159
+
160
+ /**
161
+ * Get stats about the current JSONL file. Uses an in-memory line counter
162
+ * that is maintained incrementally by appendRecord and invalidated by
163
+ * rewrite operations — so getStats is O(1) on the hot path of the daily
164
+ * report (previously O(file size) via readFileSync on a 72MB+ file).
165
+ *
111
166
  * @returns {{ recordCount: number, fileSizeBytes: number, fileSizeMB: string }}
112
167
  */
113
168
  function getStats() {
114
169
  try {
115
170
  if (!fs.existsSync(TRAINING_FILE)) {
171
+ _cachedLineCount = 0;
116
172
  return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
117
173
  }
118
174
  const stat = fs.statSync(TRAINING_FILE);
119
- // Count lines without reading the entire file into memory
120
- const content = fs.readFileSync(TRAINING_FILE, 'utf8');
121
- const lineCount = content.split('\n').filter(l => l.trim()).length;
175
+ if (_cachedLineCount === null) {
176
+ _cachedLineCount = countLinesStreaming(TRAINING_FILE);
177
+ }
122
178
  return {
123
- recordCount: lineCount,
179
+ recordCount: _cachedLineCount,
124
180
  fileSizeBytes: stat.size,
125
181
  fileSizeMB: (stat.size / 1024 / 1024).toFixed(1)
126
182
  };
127
183
  } catch {
128
- return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
184
+ return { recordCount: _cachedLineCount || 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
129
185
  }
130
186
  }
131
187
 
@@ -183,6 +239,8 @@ function relabelRecords(packageName, newLabel, sandboxFindingCount, manualReview
183
239
 
184
240
  if (updated > 0) {
185
241
  fs.writeFileSync(TRAINING_FILE, newLines.join('\n'), 'utf8');
242
+ // File was rewritten — line count cache must be recomputed on next read.
243
+ _cachedLineCount = null;
186
244
  console.log(`[ML] Relabeled ${updated} records for ${packageName} → ${newLabel}`);
187
245
  }
188
246
  return updated;
@@ -225,11 +225,15 @@ function isSuspectClassification(result) {
225
225
  /**
226
226
  * Classify an error into a category for the daily report breakdown.
227
227
  * @param {Error} err
228
- * @returns {'too_large'|'tar_failed'|'http_error'|'static_timeout'|'timeout'|'other'}
228
+ * @returns {'too_large'|'tar_failed'|'archive_failed'|'unsupported_format'|'http_error'|'static_timeout'|'timeout'|'other'}
229
229
  */
230
230
  function classifyError(err) {
231
231
  const msg = (err && err.message) || '';
232
- if (/too large|tarball too large/i.test(msg)) return 'too_large';
232
+ if (/too large|tarball too large|exceeds \d+/i.test(msg)) return 'too_large';
233
+ // Wheel/zip extraction failures must NOT be lumped with tar failures —
234
+ // they were the dominant noise before adm-zip dispatch.
235
+ if (/unsupported archive format/i.test(msg)) return 'unsupported_format';
236
+ if (/zip[\s_-]|wheel|whl\b/i.test(msg)) return 'archive_failed';
233
237
  if (/tar\b|extract/i.test(msg)) return 'tar_failed';
234
238
  if (/HTTP [45]\d\d|HTTP \d{3}/i.test(msg)) return 'http_error';
235
239
  if (/static scan timeout/i.test(msg)) return 'static_timeout';
@@ -257,6 +261,8 @@ function formatErrorBreakdown(total, byType) {
257
261
  const parts = [];
258
262
  if (byType.http_error > 0) parts.push(`HTTP: ${byType.http_error}`);
259
263
  if (byType.tar_failed > 0) parts.push(`tar: ${byType.tar_failed}`);
264
+ if (byType.archive_failed > 0) parts.push(`zip: ${byType.archive_failed}`);
265
+ if (byType.unsupported_format > 0) parts.push(`unsupported: ${byType.unsupported_format}`);
260
266
  if (byType.too_large > 0) parts.push(`too large: ${byType.too_large}`);
261
267
  if (byType.timeout > 0) parts.push(`timeout: ${byType.timeout}`);
262
268
  if (byType.static_timeout > 0) parts.push(`static: ${byType.static_timeout}`);
@@ -46,7 +46,8 @@ let consecutivePollErrors = 0;
46
46
  // `ingestion._deps.httpsPost = fakePost` and have it take effect inside
47
47
  // pollPyPIChangelog. Kept tiny on purpose — only network I/O lives here.
48
48
  const _deps = {
49
- httpsPost: null // populated below once httpsPost is defined
49
+ httpsPost: null, // populated below once httpsPost is defined
50
+ httpsGet: null // populated below; used by npm pollers so tests can stub
50
51
  };
51
52
 
52
53
  function getConsecutivePollErrors() {
@@ -131,6 +132,7 @@ function httpsPost(url, body, headers = {}, timeoutMs = 30_000) {
131
132
  }
132
133
 
133
134
  _deps.httpsPost = httpsPost;
135
+ _deps.httpsGet = httpsGet;
134
136
 
135
137
  async function getWeeklyDownloads(packageName) {
136
138
  const cached = downloadsCache.get(packageName);
@@ -162,7 +164,7 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
162
164
  const url = packageVersion
163
165
  ? `https://pypi.org/pypi/${encodeURIComponent(packageName)}/${encodeURIComponent(packageVersion)}/json`
164
166
  : `https://pypi.org/pypi/${encodeURIComponent(packageName)}/json`;
165
- const body = await httpsGet(url);
167
+ const body = await _deps.httpsGet(url);
166
168
  let data;
167
169
  try {
168
170
  data = JSON.parse(body);
@@ -177,8 +179,11 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
177
179
  // Fallback: any .tar.gz
178
180
  const tarGz = urls.find(u => u.url && u.url.endsWith('.tar.gz'));
179
181
  if (tarGz) return { url: tarGz.url, version };
180
- // Fallback: first available file
181
- if (urls.length > 0 && urls[0].url) return { url: urls[0].url, version };
182
+ // Fallback: wheel (.whl) or .zip archive — extracted via adm-zip in queue.js, not tar.
183
+ // Legacy .egg / .tar.bz2 / .exe installers are intentionally NOT returned —
184
+ // they were the cause of ~2773 tar_failed/day before this fix.
185
+ const wheel = urls.find(u => u.url && (u.url.endsWith('.whl') || u.url.endsWith('.zip')));
186
+ if (wheel) return { url: wheel.url, version };
182
187
  return { url: null, version };
183
188
  }
184
189
 
@@ -405,7 +410,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
405
410
 
406
411
  // First run: initialize to current seq ("now") via root endpoint
407
412
  if (lastSeq == null) {
408
- const infoBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
413
+ const infoBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
409
414
  const info = JSON.parse(infoBody);
410
415
  const currentSeq = info.update_seq;
411
416
  if (currentSeq == null) {
@@ -423,13 +428,13 @@ async function pollNpmChanges(state, scanQueue, stats) {
423
428
  const url = `${CHANGES_STREAM_URL}?since=${lastSeq}&limit=${CHANGES_LIMIT}`;
424
429
  let body, data;
425
430
  try {
426
- body = await httpsGet(url, 60000);
431
+ body = await _deps.httpsGet(url, 60000);
427
432
  data = JSON.parse(body);
428
433
  } catch (fetchErr) {
429
434
  // Invalid seq (stale from pre-migration CouchDB) or transient error — re-init to current seq
430
435
  console.warn(`[MONITOR] Changes stream fetch failed (${fetchErr.message}) — attempting seq re-init`);
431
436
  try {
432
- const reinitBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
437
+ const reinitBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
433
438
  const reinitData = JSON.parse(reinitBody);
434
439
  if (reinitData.update_seq != null) {
435
440
  state.npmLastSeq = reinitData.update_seq;
@@ -450,7 +455,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
450
455
 
451
456
  // Catch-up protection: if too far behind, skip to current
452
457
  if (data.results.length === CHANGES_LIMIT) {
453
- const currentSeqBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
458
+ const currentSeqBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
454
459
  const currentSeqData = JSON.parse(currentSeqBody);
455
460
  const currentSeq = currentSeqData.update_seq;
456
461
  if (typeof currentSeq === 'number' && typeof data.last_seq === 'number' &&
@@ -459,12 +464,22 @@ async function pollNpmChanges(state, scanQueue, stats) {
459
464
  console.warn(`[MONITOR] Changes stream too far behind (${gap} changes) — skipping to current`);
460
465
  stats.npmCatchupSkips = (stats.npmCatchupSkips || 0) + 1;
461
466
  stats.npmCatchupSkippedSeqs = (stats.npmCatchupSkippedSeqs || 0) + gap;
467
+ // Catch-up gap = events we know happened but chose to skip. They must
468
+ // appear in the coverage denominator so the daily report exposes the
469
+ // gap as low coverage (and the catch-up line explains why).
470
+ stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + gap;
462
471
  state.npmLastSeq = currentSeq;
463
472
  saveNpmSeq(currentSeq);
464
473
  return 0;
465
474
  }
466
475
  }
467
476
 
477
+ // IMPORTANT: count raw events BEFORE filtering — otherwise the coverage
478
+ // denominator is biased (matches "events we queued", not "events npm
479
+ // emitted"). The filters below drop _design/self/@types/deleted, but
480
+ // those were still real changes-stream events.
481
+ stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + data.results.length;
482
+
468
483
  let queued = 0;
469
484
  for (const change of data.results) {
470
485
  // Skip deleted packages
@@ -584,7 +599,7 @@ async function pollNpmRss(state, scanQueue, stats) {
584
599
  await acquireRegistrySlot();
585
600
  let body;
586
601
  try {
587
- body = await httpsGet(url);
602
+ body = await _deps.httpsGet(url);
588
603
  } finally {
589
604
  releaseRegistrySlot();
590
605
  }
@@ -603,6 +618,11 @@ async function pollNpmRss(state, scanQueue, stats) {
603
618
  }
604
619
  }
605
620
 
621
+ // Mirror pollNpmChanges: count raw events BEFORE per-package filters
622
+ // so the coverage denominator stays accurate when the changes stream
623
+ // falls back to RSS.
624
+ stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + newPackages.length;
625
+
606
626
  for (const name of newPackages) {
607
627
  if (name === SELF_PACKAGE_NAME) {
608
628
  console.log(`[MONITOR] SKIPPED (self): ${name}`);
@@ -13,7 +13,7 @@ const { Worker } = require('worker_threads');
13
13
  const { run } = require('../index.js');
14
14
  const { runSandbox, isDockerAvailable, tryAcquireSandboxSlot, SANDBOX_CONCURRENCY_MAX } = require('../sandbox/index.js');
15
15
  const { sendWebhook } = require('../webhook.js');
16
- const { downloadToFile, extractTarGz, sanitizePackageName } = require('../shared/download.js');
16
+ const { downloadToFile, extractTarGz, extractArchive, sanitizePackageName } = require('../shared/download.js');
17
17
  const { MAX_TARBALL_SIZE } = require('../shared/constants.js');
18
18
  const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
19
19
  const { loadCachedIOCs } = require('../ioc/updater.js');
@@ -294,10 +294,22 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
294
294
  if (metaSize > MAX_TARBALL_SIZE) {
295
295
  console.log(`[MONITOR] SIZE_REJECT: ${name}@${version} — metadata size ${(metaSize / 1024 / 1024).toFixed(1)}MB exceeds ${(MAX_TARBALL_SIZE / 1024 / 1024).toFixed(0)}MB limit (skipped without download)`);
296
296
  stats.scanned++;
297
+ stats.totalTimeMs += Date.now() - startTime;
297
298
  return;
298
299
  }
299
300
 
300
- const tgzPath = path.join(tmpDir, 'package.tar.gz');
301
+ // Pick the local filename extension from the URL so adm-zip / tar both
302
+ // read the magic correctly. PyPI wheels arrive as .whl, npm tarballs as
303
+ // .tgz, sdists as .tar.gz. Anything else falls through to .tar.gz
304
+ // (ingestion now returns null for unsupported types, so this branch is
305
+ // a defensive default rather than a real fallback).
306
+ const urlLower = (tarballUrl || '').toLowerCase();
307
+ const isWheel = urlLower.endsWith('.whl') || urlLower.endsWith('.zip');
308
+ const archiveExt = isWheel ? '.whl' : '.tar.gz';
309
+ const tgzPath = path.join(tmpDir, `package${archiveExt}`);
310
+ if (isWheel && ecosystem === 'pypi') {
311
+ stats.pypiWheelsScanned = (stats.pypiWheelsScanned || 0) + 1;
312
+ }
301
313
 
302
314
  // Layer 3: Check tarball cache before downloading
303
315
  const cacheKey = tarballCacheKey(name, version);
@@ -338,6 +350,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
338
350
  if (fileSize > MAX_TARBALL_SIZE) {
339
351
  console.log(`[MONITOR] SKIP: ${name}@${version} — tarball too large (${(fileSize / 1024 / 1024).toFixed(1)}MB)`);
340
352
  stats.scanned++;
353
+ stats.totalTimeMs += Date.now() - startTime;
341
354
  return;
342
355
  }
343
356
 
@@ -365,7 +378,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
365
378
  let bypassQuickScan = false;
366
379
  try {
367
380
  alreadyExtracted = true;
368
- extractedDir = extractTarGz(tgzPath, tmpDir);
381
+ extractedDir = extractArchive(tgzPath, tmpDir);
369
382
 
370
383
  const [pkgThreats, shellThreats] = await Promise.all([
371
384
  scanPackageJson(extractedDir),
@@ -382,6 +395,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
382
395
  } else {
383
396
  console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, quick scan clean)`);
384
397
  stats.scanned++;
398
+ stats.totalTimeMs += Date.now() - startTime;
385
399
  stats.clean++;
386
400
  updateScanStats('clean');
387
401
  return;
@@ -402,6 +416,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
402
416
  } else {
403
417
  console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, extract failed)`);
404
418
  stats.scanned++;
419
+ stats.totalTimeMs += Date.now() - startTime;
405
420
  stats.clean++;
406
421
  updateScanStats('clean');
407
422
  return;
@@ -411,7 +426,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
411
426
  }
412
427
 
413
428
  if (!extractedDir) {
414
- extractedDir = extractTarGz(tgzPath, tmpDir);
429
+ extractedDir = extractArchive(tgzPath, tmpDir);
415
430
  }
416
431
 
417
432
  // ML Phase 2a: Count JS files and detect test presence for enriched features
@@ -1169,6 +1184,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
1169
1184
  try {
1170
1185
  const pypiInfo = await getPyPITarballUrl(item.name, item.version || '');
1171
1186
  if (!pypiInfo.url) {
1187
+ // No sdist / .tar.gz / wheel — likely a legacy egg or msi-only
1188
+ // release. Clean skip: do NOT touch stats.scanned or stats.errors
1189
+ // (those would distort the Commit 1 coverage ratios). The dedicated
1190
+ // pypiSkippedNoArchive counter surfaces volume in the daily report.
1191
+ stats.pypiSkippedNoArchive = (stats.pypiSkippedNoArchive || 0) + 1;
1172
1192
  console.log(`[MONITOR] SKIP: ${item.name} — no tarball URL found on PyPI`);
1173
1193
  return;
1174
1194
  }
@@ -1205,6 +1225,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
1205
1225
  return;
1206
1226
  }
1207
1227
  recentlyScanned.add(dedupeKey);
1228
+ // Coverage numerator: one count per unique (ecosystem, name, version) that
1229
+ // reaches a scan attempt. Excludes ATO burst extras that lose the dedup
1230
+ // race, retries, size-cap rejections — those inflate stats.scanned but
1231
+ // would distort the "% of publishes we covered" reading.
1232
+ stats.uniqueScanAttempts = (stats.uniqueScanAttempts || 0) + 1;
1208
1233
 
1209
1234
  // Abort check: if timeout fired during URL resolution or dedup, bail out
1210
1235
  if (signal && signal.aborted) return;
@@ -991,6 +991,8 @@ function loadDailyStats(stats, dailyAlerts) {
991
991
  if (data.errorsByType) {
992
992
  stats.errorsByType.too_large = data.errorsByType.too_large || 0;
993
993
  stats.errorsByType.tar_failed = data.errorsByType.tar_failed || 0;
994
+ stats.errorsByType.archive_failed = data.errorsByType.archive_failed || 0;
995
+ stats.errorsByType.unsupported_format = data.errorsByType.unsupported_format || 0;
994
996
  stats.errorsByType.http_error = data.errorsByType.http_error || 0;
995
997
  stats.errorsByType.timeout = data.errorsByType.timeout || 0;
996
998
  stats.errorsByType.static_timeout = data.errorsByType.static_timeout || 0;
@@ -1001,6 +1003,16 @@ function loadDailyStats(stats, dailyAlerts) {
1001
1003
  stats.llmAnalyzed = data.llmAnalyzed || 0;
1002
1004
  stats.llmSuppressed = data.llmSuppressed || 0;
1003
1005
  stats.changesStreamPackages = data.changesStreamPackages || 0;
1006
+ stats.uniqueScanAttempts = data.uniqueScanAttempts || 0;
1007
+ stats.npmPublishEventsSeen = data.npmPublishEventsSeen || 0;
1008
+ stats.pypiChangelogPackages = data.pypiChangelogPackages || 0;
1009
+ stats.pypiChangelogEvents = data.pypiChangelogEvents || 0;
1010
+ stats.npmCatchupSkippedSeqs = data.npmCatchupSkippedSeqs || 0;
1011
+ stats.npmCatchupSkips = data.npmCatchupSkips || 0;
1012
+ stats.pypiCatchupSkippedEvents = data.pypiCatchupSkippedEvents || 0;
1013
+ stats.pypiCatchupSkips = data.pypiCatchupSkips || 0;
1014
+ stats.pypiWheelsScanned = data.pypiWheelsScanned || 0;
1015
+ stats.pypiSkippedNoArchive = data.pypiSkippedNoArchive || 0;
1004
1016
  if (Array.isArray(data.dailyAlerts)) {
1005
1017
  const restored = data.dailyAlerts.slice(-MAX_DAILY_ALERTS);
1006
1018
  dailyAlerts.length = 0;
@@ -1029,6 +1041,16 @@ function saveDailyStats(stats, dailyAlerts) {
1029
1041
  llmAnalyzed: stats.llmAnalyzed || 0,
1030
1042
  llmSuppressed: stats.llmSuppressed || 0,
1031
1043
  changesStreamPackages: stats.changesStreamPackages || 0,
1044
+ uniqueScanAttempts: stats.uniqueScanAttempts || 0,
1045
+ npmPublishEventsSeen: stats.npmPublishEventsSeen || 0,
1046
+ pypiChangelogPackages: stats.pypiChangelogPackages || 0,
1047
+ pypiChangelogEvents: stats.pypiChangelogEvents || 0,
1048
+ npmCatchupSkippedSeqs: stats.npmCatchupSkippedSeqs || 0,
1049
+ npmCatchupSkips: stats.npmCatchupSkips || 0,
1050
+ pypiCatchupSkippedEvents: stats.pypiCatchupSkippedEvents || 0,
1051
+ pypiCatchupSkips: stats.pypiCatchupSkips || 0,
1052
+ pypiWheelsScanned: stats.pypiWheelsScanned || 0,
1053
+ pypiSkippedNoArchive: stats.pypiSkippedNoArchive || 0,
1032
1054
  dailyAlerts: dailyAlerts.slice()
1033
1055
  };
1034
1056
  atomicWriteFileSync(DAILY_STATS_FILE, JSON.stringify(data, null, 2));
@@ -855,11 +855,24 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
855
855
  const avg = stats.scanned > 0 ? (stats.totalTimeMs / stats.scanned / 1000).toFixed(1) : '0.0';
856
856
 
857
857
  // --- Coverage estimation ---
858
- // changesStreamPackages = total versions seen from npm changes stream (≈ published today)
859
- const published = stats.changesStreamPackages || 0;
858
+ // Numerator: unique (ecosystem, name, version) tuples that reached a scan
859
+ // attempt (post-dedup). Denominator: raw publish events seen on either
860
+ // changes stream BEFORE per-package filtering, plus npm catch-up gaps and
861
+ // PyPI publish events that survived per-(name,version) dedup. This stays
862
+ // bounded near 100% — old "scanned/changesStreamPackages" was racing PyPI
863
+ // scans and ATO burst extras against an npm-only denominator.
864
+ const attempted = stats.uniqueScanAttempts || 0;
865
+ const npmPub = stats.npmPublishEventsSeen || 0;
866
+ const pypiPub = stats.pypiChangelogPackages || 0;
867
+ const published = npmPub + pypiPub;
868
+ const coverageRatio = published > 0 ? (attempted / published * 100).toFixed(0) : '0';
869
+ const catchupSkipped = (stats.npmCatchupSkippedSeqs || 0) + (stats.pypiCatchupSkippedEvents || 0);
870
+ const opsSuffix = catchupSkipped > 0
871
+ ? `\nOps: ${stats.scanned} | Catch-up skip: ${catchupSkipped}`
872
+ : `\nOps: ${stats.scanned}`;
860
873
  const coverageText = published > 0
861
- ? `${stats.scanned}/${published} (${(stats.scanned / published * 100).toFixed(0)}%)`
862
- : `${stats.scanned} scanned`;
874
+ ? `${attempted}/${published} (${coverageRatio}%)${opsSuffix}`
875
+ : `${attempted} attempted${opsSuffix}`;
863
876
 
864
877
  // --- Timeouts ---
865
878
  const staticTimeouts = (stats.errorsByType && stats.errorsByType.static_timeout) || 0;
@@ -1019,6 +1032,8 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
1019
1032
  stats.errors = 0;
1020
1033
  stats.errorsByType.too_large = 0;
1021
1034
  stats.errorsByType.tar_failed = 0;
1035
+ stats.errorsByType.archive_failed = 0;
1036
+ stats.errorsByType.unsupported_format = 0;
1022
1037
  stats.errorsByType.http_error = 0;
1023
1038
  stats.errorsByType.timeout = 0;
1024
1039
  stats.errorsByType.static_timeout = 0;
@@ -1033,6 +1048,16 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
1033
1048
  // Reset LLM detective internal stats
1034
1049
  try { require('../ml/llm-detective.js').resetStats(); } catch {}
1035
1050
  stats.changesStreamPackages = 0;
1051
+ stats.uniqueScanAttempts = 0;
1052
+ stats.npmPublishEventsSeen = 0;
1053
+ stats.pypiChangelogPackages = 0;
1054
+ stats.pypiChangelogEvents = 0;
1055
+ stats.npmCatchupSkippedSeqs = 0;
1056
+ stats.npmCatchupSkips = 0;
1057
+ stats.pypiCatchupSkippedEvents = 0;
1058
+ stats.pypiCatchupSkips = 0;
1059
+ stats.pypiWheelsScanned = 0;
1060
+ stats.pypiSkippedNoArchive = 0;
1036
1061
  stats.rssFallbackCount = 0;
1037
1062
  dailyAlerts.length = 0;
1038
1063
  recentlyScanned.clear();
@@ -2,6 +2,7 @@ const https = require('https');
2
2
  const fs = require('fs');
3
3
  const path = require('path');
4
4
  const { execFileSync } = require('child_process');
5
+ const AdmZip = require('adm-zip');
5
6
  const { MAX_TARBALL_SIZE, DOWNLOAD_TIMEOUT } = require('./constants.js');
6
7
 
7
8
  // Allowed redirect domains for tarball downloads (SSRF protection)
@@ -221,13 +222,30 @@ function downloadToFile(url, destPath, timeoutMs = DOWNLOAD_TIMEOUT) {
221
222
  }
222
223
 
223
224
  /**
224
- * Extract a .tar.gz to a directory. Returns the package root.
225
- * Uses execFileSync (no shell) to prevent command injection.
226
- * @param {string} tgzPath - Path to the .tar.gz file
227
- * @param {string} destDir - Destination directory
228
- * @returns {string} Path to extracted package root
225
+ * Detect archive format from a path/URL extension.
226
+ * URL-derived names are reliable enough here: PyPI's `urls[].packagetype`
227
+ * + filename are authoritative, npm tarballs are always `.tgz`. Returns
228
+ * 'targz', 'zip', or 'unknown'. Callers either pass an `options.format`
229
+ * override or trust this detection.
230
+ *
231
+ * @param {string} archivePath - Path or URL ending in the archive filename
232
+ * @returns {'targz'|'zip'|'unknown'}
229
233
  */
230
- function extractTarGz(tgzPath, destDir) {
234
function detectArchiveFormat(archivePath) {
  // Non-string input (null, undefined, Buffer, …) can never match a suffix.
  if (typeof archivePath !== 'string') return 'unknown';

  // Suffix matching is case-insensitive and looks at the very end of the
  // string only; anything after the filename defeats the match.
  const name = archivePath.toLowerCase();
  const hasSuffix = (suffix) => name.endsWith(suffix);

  if (['.tar.gz', '.tgz'].some(hasSuffix)) return 'targz';
  if (['.whl', '.zip'].some(hasSuffix)) return 'zip';
  return 'unknown';
}
241
+
242
+ /**
243
+ * Extract a tar.gz tarball with the system `tar` binary. Used for npm
244
+ * tarballs and PyPI sdists. Internal implementation — call extractArchive
245
+ * for new code; extractTarGz remains as a thin wrapper for the existing
246
+ * scanner/temporal-ast-diff.js callsite.
247
+ */
248
+ function _extractTarGzImpl(tgzPath, destDir) {
231
249
  // Use cwd + relative paths so C: never appears in tar arguments
232
250
  // (GNU tar treats C: as remote host, bsdtar doesn't support --force-local)
233
251
  const tgzDir = path.dirname(path.resolve(tgzPath));
@@ -258,6 +276,77 @@ function extractTarGz(tgzPath, destDir) {
258
276
  return destDir;
259
277
  }
260
278
 
279
/**
 * Extract a ZIP archive (PyPI wheels, generic zips) into a directory using
 * adm-zip.
 *
 * Every entry is validated before anything is written to disk:
 *   1. size cap: the running sum of declared uncompressed sizes must not
 *      exceed MAX_TARBALL_SIZE (zip-bomb guard).
 *   2. zip-slip: each entry name is resolved against destDir and rejected if
 *      the result lands outside it.
 *
 * After extraction, if every entry of the archive lives under one single
 * top-level name and that name is a real directory on disk, that directory
 * is returned; otherwise destDir is returned. The top-level names come from
 * the archive's own entries rather than from a listing of destDir, so files
 * already present in destDir (for instance the archive itself, when it was
 * downloaded into the extraction directory) do not prevent the collapse.
 *
 * @param {string} zipPath - Path to the zip/wheel file on disk
 * @param {string} destDir - Destination directory
 * @returns {string} The single top-level directory of the archive, or destDir
 * @throws {Error} when the size cap is exceeded or an entry escapes destDir
 */
function _extractZipImpl(zipPath, destDir) {
  const zip = new AdmZip(zipPath);
  const resolvedDest = path.resolve(destDir);
  const topLevelNames = new Set();
  let totalUncompressed = 0;

  for (const entry of zip.getEntries()) {
    totalUncompressed += (entry.header && entry.header.size) || 0;
    if (totalUncompressed > MAX_TARBALL_SIZE) {
      throw new Error(
        `Zip extract refused: total uncompressed size ${totalUncompressed} exceeds ${MAX_TARBALL_SIZE}`
      );
    }
    const target = path.resolve(destDir, entry.entryName);
    if (target !== resolvedDest && !target.startsWith(resolvedDest + path.sep)) {
      throw new Error(`Unsafe zip entry escapes destDir: ${entry.entryName}`);
    }
    if (target !== resolvedDest) {
      // First path segment below destDir, e.g. "pkg-1.0" for "pkg-1.0/src/a.py".
      topLevelNames.add(path.relative(resolvedDest, target).split(path.sep)[0]);
    }
  }

  zip.extractAllTo(destDir, /* overwrite */ true);

  if (topLevelNames.size === 1) {
    try {
      const [onlyName] = topLevelNames;
      const single = path.join(destDir, onlyName);
      const stat = fs.lstatSync(single);
      // lstat (not stat) so a symlink is never followed out of destDir.
      if (!stat.isSymbolicLink() && stat.isDirectory()) return single;
    } catch { /* ignore — fall back to destDir */ }
  }
  return destDir;
}
322
+
323
/**
 * Extract an archive to a directory, dispatching on format.
 * Supports `.tar.gz` / `.tgz` (tar) and `.whl` / `.zip` (adm-zip).
 *
 * The format comes from `options.format` when given, otherwise from the
 * extension of `archivePath` via detectArchiveFormat.
 *
 * @param {string} archivePath - Path to the archive on disk
 * @param {string} destDir - Destination directory (must exist)
 * @param {Object} [options]
 * @param {'targz'|'zip'} [options.format] - override auto-detection
 * @returns {string} Path to extracted package root
 * @throws {Error} when the format is unknown or extraction fails
 */
function extractArchive(archivePath, destDir, options = {}) {
  // `options &&` tolerates an explicit null (the default only covers undefined).
  const format = (options && options.format) || detectArchiveFormat(archivePath);
  if (format === 'targz') return _extractTarGzImpl(archivePath, destDir);
  if (format === 'zip') return _extractZipImpl(archivePath, destDir);
  // path.basename throws a TypeError on non-strings; keep the documented
  // "Unsupported archive format" error for that case too.
  const label = typeof archivePath === 'string' ? path.basename(archivePath) : String(archivePath);
  throw new Error(`Unsupported archive format for ${label}`);
}
340
+
341
/**
 * Backwards-compatible wrapper for the original tar.gz-only extractor.
 * Kept because src/scanner/temporal-ast-diff.js and existing tests still
 * import it by name. New code should call extractArchive instead.
 *
 * @param {string} tgzPath - Path to the .tar.gz file
 * @param {string} destDir - Destination directory
 * @returns {string} Whatever _extractTarGzImpl returns (path to the extracted package root)
 */
function extractTarGz(tgzPath, destDir) {
  // Pure delegation: no format detection happens on this path.
  return _extractTarGzImpl(tgzPath, destDir);
}
349
+
261
350
  /**
262
351
  * Sanitize a package name for use in temporary directory names.
263
352
  * Removes path traversal sequences, slashes, and @ symbols.
@@ -277,6 +366,8 @@ function sanitizePackageName(packageName) {
277
366
  module.exports = {
278
367
  downloadToFile,
279
368
  extractTarGz,
369
+ extractArchive,
370
+ detectArchiveFormat,
280
371
  sanitizePackageName,
281
372
  isAllowedDownloadRedirect,
282
373
  normalizeHostname,