muaddib-scanner 2.11.14 → 2.11.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/muaddib.js CHANGED
@@ -297,6 +297,7 @@ if (command === 'version' || command === '--version' || command === '-v') {
297
297
  if (wantHelp) showHelp('watch');
298
298
  watch(target);
299
299
  } else if (command === 'update') {
300
+ if (wantHelp) showHelp('update');
300
301
  updateIOCs().then(() => {
301
302
  process.exit(0);
302
303
  }).catch(err => {
@@ -304,6 +305,7 @@ if (command === 'version' || command === '--version' || command === '-v') {
304
305
  process.exit(1);
305
306
  });
306
307
  } else if (command === 'scrape') {
308
+ if (wantHelp) showHelp('scrape');
307
309
  runScraper().then(result => {
308
310
  console.log(`[OK] ${result.added} new IOCs (total: ${result.total})`);
309
311
  process.exit(0);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.14",
3
+ "version": "2.11.16",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -46,7 +46,7 @@
46
46
  "node": ">=18.0.0"
47
47
  },
48
48
  "dependencies": {
49
- "@inquirer/prompts": "8.4.2",
49
+ "@inquirer/prompts": "8.4.3",
50
50
  "acorn": "8.16.0",
51
51
  "acorn-walk": "8.3.5",
52
52
  "adm-zip": "0.5.17",
@@ -10,7 +10,10 @@
10
10
  const https = require('https');
11
11
  const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
12
12
  const { loadCachedIOCs } = require('../ioc/updater.js');
13
- const { loadNpmSeq, saveNpmSeq, CHANGES_STREAM_URL, CHANGES_LIMIT, CHANGES_CATCHUP_MAX } = require('./state.js');
13
+ const {
14
+ loadNpmSeq, saveNpmSeq, CHANGES_STREAM_URL, CHANGES_LIMIT, CHANGES_CATCHUP_MAX,
15
+ savePypiSerial, PYPI_XMLRPC_URL, PYPI_CATCHUP_MAX
16
+ } = require('./state.js');
14
17
  const { sendIOCPreAlert } = require('./webhook.js');
15
18
  const { evaluateCacheTrigger, POPULAR_THRESHOLD, downloadsCache, DOWNLOADS_CACHE_TTL } = require('./classify.js');
16
19
 
@@ -22,6 +25,14 @@ const POLL_MAX_BACKOFF = 960_000; // 16 minutes max backoff
22
25
  // --- Mutable state ---
23
26
  let consecutivePollErrors = 0;
24
27
 
28
+ // Test seam: code paths that need to be stubbed in tests call these through
29
+ // `_deps` instead of the bare module-local name, so a test can swap
30
+ // `ingestion._deps.httpsPost = fakePost` and have it take effect inside
31
+ // pollPyPIChangelog. Kept tiny on purpose — only network I/O lives here.
32
+ const _deps = {
33
+ httpsPost: null // populated below once httpsPost is defined
34
+ };
35
+
25
36
  function getConsecutivePollErrors() {
26
37
  return consecutivePollErrors;
27
38
  }
@@ -64,6 +75,47 @@ function httpsGet(url, timeoutMs = 30_000) {
64
75
  });
65
76
  }
66
77
 
78
/**
 * Send an HTTPS POST and resolve with the response body decoded as UTF-8.
 *
 * The request defaults to `Content-Type: text/xml` with a `Content-Length`
 * computed from `body`; entries in `headers` are spread last and therefore
 * override those defaults.
 *
 * @param {string} url - Absolute URL to POST to.
 * @param {string|Buffer} body - Request payload.
 * @param {Object} [headers={}] - Extra request headers.
 * @param {number} [timeoutMs=30_000] - Value passed as the request `timeout` option.
 * @returns {Promise<string>} Response body; rejects on a status outside 200-299,
 *   on a request/response error, or when the request emits `timeout`.
 */
function httpsPost(url, body, headers = {}, timeoutMs = 30_000) {
  return new Promise((resolve, reject) => {
    const target = new URL(url);

    const requestHeaders = {
      'Content-Type': 'text/xml',
      'Content-Length': Buffer.byteLength(body),
      ...headers
    };

    const handleResponse = (res) => {
      if (res.statusCode < 200 || res.statusCode >= 300) {
        // Drain the socket so it can be released, then report the status.
        res.resume();
        reject(new Error(`HTTP ${res.statusCode} for POST ${url}`));
        return;
      }
      const received = [];
      res.on('data', (piece) => {
        received.push(piece);
      });
      res.on('end', () => {
        resolve(Buffer.concat(received).toString('utf8'));
      });
      res.on('error', reject);
    };

    const req = https.request(
      {
        method: 'POST',
        hostname: target.hostname,
        port: target.port || 443,
        path: `${target.pathname}${target.search || ''}`,
        timeout: timeoutMs,
        headers: requestHeaders
      },
      handleResponse
    );

    req.on('error', reject);
    req.on('timeout', () => {
      // The 'timeout' event does not abort the request by itself.
      req.destroy();
      reject(new Error(`Timeout for POST ${url}`));
    });

    req.write(body);
    req.end();
  });
}
116
+
117
+ _deps.httpsPost = httpsPost;
118
+
67
119
  async function getWeeklyDownloads(packageName) {
68
120
  const cached = downloadsCache.get(packageName);
69
121
  if (cached && (Date.now() - cached.fetchedAt) < DOWNLOADS_CACHE_TTL) {
@@ -186,8 +238,13 @@ function getNpmTarballUrl(pkgData) {
186
238
  return (pkgData.dist && pkgData.dist.tarball) || null;
187
239
  }
188
240
 
189
- async function getPyPITarballUrl(packageName) {
190
- const url = `https://pypi.org/pypi/${encodeURIComponent(packageName)}/json`;
241
+ async function getPyPITarballUrl(packageName, packageVersion = '') {
242
+ // Per-version endpoint when we know the version (e.g. from the XML-RPC changelog)
243
+ // guarantees we scan the artifact that just landed, not whatever became "latest"
244
+ // between event detection and scan. Falls back to /pypi/<name>/json (latest) otherwise.
245
+ const url = packageVersion
246
+ ? `https://pypi.org/pypi/${encodeURIComponent(packageName)}/${encodeURIComponent(packageVersion)}/json`
247
+ : `https://pypi.org/pypi/${encodeURIComponent(packageName)}/json`;
191
248
  const body = await httpsGet(url);
192
249
  let data;
193
250
  try {
@@ -195,7 +252,7 @@ async function getPyPITarballUrl(packageName) {
195
252
  } catch (e) {
196
253
  throw new Error(`Invalid JSON from PyPI for ${packageName}: ${e.message}`);
197
254
  }
198
- const version = (data.info && data.info.version) || '';
255
+ const version = (data.info && data.info.version) || packageVersion || '';
199
256
  const urls = data.urls || [];
200
257
  // Prefer sdist (.tar.gz)
201
258
  const sdist = urls.find(u => u.packagetype === 'sdist' && u.url);
@@ -386,7 +443,10 @@ async function pollNpmChanges(state, scanQueue, stats) {
386
443
  const currentSeq = currentSeqData.update_seq;
387
444
  if (typeof currentSeq === 'number' && typeof data.last_seq === 'number' &&
388
445
  (currentSeq - data.last_seq) > CHANGES_CATCHUP_MAX) {
389
- console.warn(`[MONITOR] Changes stream too far behind (${currentSeq - lastSeq} changes) — skipping to current`);
446
+ const gap = currentSeq - lastSeq;
447
+ console.warn(`[MONITOR] Changes stream too far behind (${gap} changes) — skipping to current`);
448
+ stats.npmCatchupSkips = (stats.npmCatchupSkips || 0) + 1;
449
+ stats.npmCatchupSkippedSeqs = (stats.npmCatchupSkippedSeqs || 0) + gap;
390
450
  state.npmLastSeq = currentSeq;
391
451
  saveNpmSeq(currentSeq);
392
452
  return 0;
@@ -590,13 +650,271 @@ async function pollNpm(state, scanQueue, stats) {
590
650
 
591
651
  // --- PyPI polling ---
592
652
 
653
+ const PYPI_USER_AGENT = `${SELF_PACKAGE_NAME} (security-monitor; +https://github.com/DNSZLSK/muaddib)`;
654
+
593
655
  /**
594
- * Poll PyPI RSS feed for new packages.
656
+ * Build an XML-RPC methodCall envelope. PyPI accepts only <int> and <string>
657
+ * params for the methods we use (changelog_last_serial, changelog_since_serial),
658
+ * so this builder is deliberately minimal.
659
+ */
660
+ function buildXmlRpcCall(method, params) {
661
+ const paramXml = params.map((p) => {
662
+ if (typeof p === 'number' && Number.isInteger(p)) {
663
+ return `<param><value><int>${p}</int></value></param>`;
664
+ }
665
+ if (typeof p === 'string') {
666
+ // Method names + serial numbers only — no user-supplied strings reach this path.
667
+ const escaped = p.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
668
+ return `<param><value><string>${escaped}</string></value></param>`;
669
+ }
670
+ throw new Error(`Unsupported XML-RPC param type: ${typeof p}`);
671
+ }).join('');
672
+ return `<?xml version="1.0"?><methodCall><methodName>${method}</methodName><params>${paramXml}</params></methodCall>`;
673
+ }
674
+
675
/**
 * Parse a PyPI `changelog_since_serial` XML-RPC response.
 *
 * Expected shape: an outer array whose items are 5-element arrays
 *   [ NAME (string), VERSION (string, may be empty), TIMESTAMP (int),
 *     ACTION (string), SERIAL (int) ]
 *
 * Scalar encodings accepted per value:
 *   - <string>text</string> and the self-closing <string/> (empty string)
 *   - <int>n</int> and its XML-RPC synonym <i4>n</i4>
 *   - <nil/> (kept as a positional placeholder; a nil VERSION becomes '')
 * NOTE(review): servers that marshal a missing version as <nil/> rather than an
 * empty string are presumed here — confirm against a live PyPI response.
 * Without the placeholder such tuples would yield only 4 values and be dropped,
 * hiding their serial from the caller.
 *
 * @param {string} xml - Raw response body.
 * @returns {Array<{name: string, version: string, timestamp: number, action: string, serial: number}>}
 *   Parsed events in document order. Returns [] for non-string input, a body
 *   without <methodResponse>, or a <fault> response. Tuples that do not have
 *   exactly five recognised values of the right types are skipped.
 */
function parseXmlRpcChangelog(xml) {
  const out = [];
  if (typeof xml !== 'string' || !xml.includes('<methodResponse>')) return out;
  if (xml.includes('<fault>')) return out; // fault → caller should treat as failure

  // Strip the outer <array><data>…</data></array> wrapper first so the tuple
  // regex only ever sees the inner tuples.
  const outerArrayStart = xml.indexOf('<array>');
  if (outerArrayStart === -1) return out;
  const outerDataStart = xml.indexOf('<data>', outerArrayStart);
  if (outerDataStart === -1) return out;
  const outerDataEnd = xml.lastIndexOf('</data>');
  if (outerDataEnd === -1 || outerDataEnd <= outerDataStart) return out;
  const body = xml.slice(outerDataStart + '<data>'.length, outerDataEnd);

  // Reverse of the escaping applied when building requests; &amp; goes last so
  // "&amp;lt;" decodes to "&lt;" rather than "<".
  const decodeXml = (text) => text
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&amp;/g, '&');

  const tupleRegex = /<value>\s*<array>\s*<data>([\s\S]*?)<\/data>\s*<\/array>\s*<\/value>/g;
  // Groups: 1 = string content, 2 = integer digits, 3 = self-closing <string/>.
  // A match with no group set is <nil/>.
  const valueRegex = /<value>\s*(?:<string>([\s\S]*?)<\/string>|<(?:int|i4)>\s*(-?\d+)\s*<\/(?:int|i4)>|(<string\s*\/>)|<nil\s*\/>)\s*<\/value>/g;

  for (const tuple of body.matchAll(tupleRegex)) {
    const values = [];
    for (const v of tuple[1].matchAll(valueRegex)) {
      if (v[1] !== undefined) {
        values.push(decodeXml(v[1]));
      } else if (v[2] !== undefined) {
        values.push(parseInt(v[2], 10));
      } else if (v[3] !== undefined) {
        values.push('');
      } else {
        values.push(null);
      }
    }
    if (values.length !== 5) continue;

    const [name, version, timestamp, action, serial] = values;
    if (typeof name !== 'string' || typeof action !== 'string' ||
        typeof timestamp !== 'number' || typeof serial !== 'number') continue;

    out.push({
      name,
      version: typeof version === 'string' ? version : '',
      timestamp,
      action,
      serial
    });
  }
  return out;
}
733
+
734
/**
 * Extract the first integer value from an XML-RPC response
 * (used for `changelog_last_serial`).
 *
 * Accepts both `<int>` and its XML-RPC synonym `<i4>`. Digit strings that do
 * not fit a JavaScript safe integer are rejected rather than silently rounded.
 *
 * @param {string} xml - Raw response body.
 * @returns {number|null} The integer, or null for non-string input, a `<fault>`
 *   response, a body without an integer value, or an out-of-range number.
 */
function parseXmlRpcInt(xml) {
  if (typeof xml !== 'string' || xml.includes('<fault>')) return null;
  const m = xml.match(/<value>\s*<(int|i4)>\s*(-?\d+)\s*<\/\1>\s*<\/value>/);
  if (!m) return null;
  const parsed = Number.parseInt(m[2], 10);
  return Number.isSafeInteger(parsed) ? parsed : null;
}
742
+
743
/**
 * Tell whether a changelog event points at newly published, scannable content.
 *
 * Accepted:
 *   - the exact action "new release"
 *   - any action that starts with "add " and contains " file "
 *     (e.g. "add source file …", "add py3 file …")
 *
 * Rejected:
 *   - events with an empty/missing version
 *   - non-string actions
 *   - every other action text (removals, yanks, "create", owner changes, …)
 *
 * @param {*} action - Action text from the changelog tuple.
 * @param {*} version - Version from the changelog tuple.
 * @returns {boolean} True when the release should be queued for scanning.
 */
function isPypiScannableAction(action, version) {
  const hasVersion = Boolean(version);
  if (!hasVersion || typeof action !== 'string') {
    return false;
  }

  const isFileUpload = action.startsWith('add ') && action.includes(' file ');
  return action === 'new release' || isFileUpload;
}
764
+
765
/**
 * Poll the PyPI changelog over XML-RPC (primary PyPI ingestion path).
 *
 * Flow:
 *   1. When `state.pypiLastSerial` is null/undefined, ask the server for its
 *      current serial, store it, and return 0 without queuing anything.
 *   2. Otherwise request every event after the stored serial.
 *   3. A body that is not an XML-RPC methodResponse, or that carries a fault,
 *      is reported as a failure (-1) so the caller can fall back to RSS.
 *   4. If the newest serial is more than PYPI_CATCHUP_MAX ahead, jump to it
 *      without queuing (counted in `stats.pypiCatchupSkips`).
 *   5. Otherwise queue one scan item per distinct (name, version) whose action
 *      is scannable, then persist the newest serial.
 *
 * @param {Object} state - Monitor state; reads and writes `pypiLastSerial`.
 * @param {Array} scanQueue - Mutable scan queue; items are pushed onto it.
 * @param {Object} stats - Mutable stats object; counters are incremented.
 * @returns {Promise<number>} Number of packages queued, or -1 on any error.
 */
async function pollPyPIChangelog(state, scanQueue, stats) {
  try {
    const lastSerial = state.pypiLastSerial;

    // First run: anchor to "now" rather than replaying history.
    if (lastSerial == null) {
      await acquireRegistrySlot();
      let initBody;
      try {
        initBody = await _deps.httpsPost(
          PYPI_XMLRPC_URL,
          buildXmlRpcCall('changelog_last_serial', []),
          { 'User-Agent': PYPI_USER_AGENT },
          10_000
        );
      } finally {
        releaseRegistrySlot();
      }
      const current = parseXmlRpcInt(initBody);
      if (current == null) {
        console.warn('[MONITOR] PyPI changelog init: no serial in response');
        return -1;
      }
      state.pypiLastSerial = current;
      savePypiSerial(current);
      console.log(`[MONITOR] PyPI changelog initialized at serial ${current}`);
      return 0;
    }

    await acquireRegistrySlot();
    let body;
    try {
      body = await _deps.httpsPost(
        PYPI_XMLRPC_URL,
        buildXmlRpcCall('changelog_since_serial', [lastSerial]),
        { 'User-Agent': PYPI_USER_AGENT },
        60_000
      );
    } finally {
      releaseRegistrySlot();
    }

    // A 2xx body that is not XML-RPC at all (empty body, HTML error page, …)
    // parses to zero events; treat it as a failure instead of "nothing new",
    // otherwise the poller would sit on a stale serial without ever falling back.
    if (typeof body !== 'string' || !body.includes('<methodResponse>')) {
      console.error('[MONITOR] PyPI changelog returned a malformed XML-RPC response — falling back to RSS');
      return -1;
    }
    if (body.includes('<fault>')) {
      console.error('[MONITOR] PyPI changelog returned XML-RPC fault — falling back to RSS');
      return -1;
    }

    const events = parseXmlRpcChangelog(body);
    if (events.length === 0) return 0;

    // Newest serial in the batch, computed without assuming the events are sorted.
    let maxSerial = lastSerial;
    for (const ev of events) {
      if (ev.serial > maxSerial) maxSerial = ev.serial;
    }

    // Catch-up protection: if the batch spans more than PYPI_CATCHUP_MAX serials,
    // skip to the newest serial instead of queuing an avalanche.
    const gap = maxSerial - lastSerial;
    if (gap > PYPI_CATCHUP_MAX) {
      console.warn(`[MONITOR] PyPI changelog too far behind (${gap} events) — skipping to current`);
      stats.pypiCatchupSkips = (stats.pypiCatchupSkips || 0) + 1;
      stats.pypiCatchupSkippedEvents = (stats.pypiCatchupSkippedEvents || 0) + gap;
      state.pypiLastSerial = maxSerial;
      savePypiSerial(maxSerial);
      return 0;
    }

    // The IOC set is loaded lazily and at most once per batch; the loop below is
    // synchronous, so the set cannot change between events.
    let wildcardIOCs = null;
    let iocLoadAttempted = false;
    const getWildcardIOCs = () => {
      if (!iocLoadAttempted) {
        iocLoadAttempted = true;
        try {
          const iocs = loadCachedIOCs();
          wildcardIOCs = (iocs && iocs.wildcardPackages) || null;
        } catch {
          // IOC load failure is non-fatal: events are still queued, just unflagged.
          wildcardIOCs = null;
        }
      }
      return wildcardIOCs;
    };

    // Dedupe (name, version) within the batch: one release usually emits several
    // events (new release + file uploads), but there is only one thing to scan.
    const seen = new Set();
    let queued = 0;

    for (const ev of events) {
      if (!isPypiScannableAction(ev.action, ev.version)) continue;

      const key = `${ev.name}@${ev.version}`;
      if (seen.has(key)) continue;
      seen.add(key);

      // Never queue this scanner's own package name.
      if (ev.name === SELF_PACKAGE_NAME) continue;

      // IOC pre-alert for known-malicious PyPI packages.
      let isKnownIOC = false;
      try {
        const wildcards = getWildcardIOCs();
        // Entries are looked up both namespaced ("pypi:<name>") and bare.
        isKnownIOC = Boolean(
          wildcards && (wildcards.has(`pypi:${ev.name}`) || wildcards.has(ev.name))
        );
        if (isKnownIOC) {
          console.log(`[MONITOR] IOC PRE-ALERT (pypi): ${ev.name} — known malicious package`);
          stats.iocPreAlerts = (stats.iocPreAlerts || 0) + 1;
          sendIOCPreAlert(ev.name).catch(err => {
            console.error(`[MONITOR] IOC pre-alert webhook failed for ${ev.name}: ${err.message}`);
          });
        }
      } catch { /* IOC lookup / alert dispatch failure is non-fatal */ }

      scanQueue.push({
        name: ev.name,
        version: ev.version,
        ecosystem: 'pypi',
        tarballUrl: null, // resolved later, at scan time
        isIOCMatch: isKnownIOC
      });
      queued++;
    }

    // Persist the serial in memory and, when it moved, in the dedicated file.
    state.pypiLastSerial = maxSerial;
    if (maxSerial !== lastSerial) {
      savePypiSerial(maxSerial);
    }

    if (queued > 0) {
      console.log(`[MONITOR] PyPI changelog: ${queued} packages queued (serial ${lastSerial} → ${maxSerial}, ${events.length} events)`);
    }
    stats.pypiChangelogPackages = (stats.pypiChangelogPackages || 0) + queued;
    stats.pypiChangelogEvents = (stats.pypiChangelogEvents || 0) + events.length;

    return queued;
  } catch (err) {
    console.error(`[MONITOR] PyPI changelog error: ${err.message} — falling back to RSS`);
    return -1;
  }
}
908
+
909
+ /**
910
+ * Poll PyPI RSS feed (legacy fallback).
911
+ * Only covers newly-registered packages (first-ever publish) and is capped at ~40 items —
912
+ * a single burst can silently lose events. Used only when the XML-RPC changelog fails.
595
913
  *
596
914
  * @param {Object} state - Monitor state object (pypiLastPackage)
597
915
  * @param {Array} scanQueue - Mutable scan queue array
598
916
  */
599
- async function pollPyPI(state, scanQueue) {
917
+ async function pollPyPIRss(state, scanQueue) {
600
918
  const url = 'https://pypi.org/rss/packages.xml';
601
919
 
602
920
  try {
@@ -620,7 +938,7 @@ async function pollPyPI(state, scanQueue) {
620
938
  }
621
939
 
622
940
  for (const name of newPackages) {
623
- console.log(`[MONITOR] New pypi: ${name}`);
941
+ console.log(`[MONITOR] New pypi (rss): ${name}`);
624
942
  // Queue PyPI packages — tarball URL resolved during scan
625
943
  scanQueue.push({
626
944
  name,
@@ -637,11 +955,28 @@ async function pollPyPI(state, scanQueue) {
637
955
 
638
956
  return newPackages.length;
639
957
  } catch (err) {
640
- console.error(`[MONITOR] PyPI poll error: ${err.message}`);
958
+ console.error(`[MONITOR] PyPI RSS poll error: ${err.message}`);
641
959
  return -1;
642
960
  }
643
961
  }
644
962
 
963
/**
 * Poll PyPI, preferring the XML-RPC changelog and falling back to the RSS feed.
 *
 * The changelog poller is tried first; any result that is not >= 0 is treated
 * as a failure, counted in `stats.pypiRssFallbackCount`, and answered by the
 * RSS poller instead.
 *
 * @param {Object} state - Monitor state object
 * @param {Array} scanQueue - Mutable scan queue array
 * @param {Object} [stats={}] - Mutable stats object
 * @returns {Promise<number>} Result of whichever poller produced the answer
 */
async function pollPyPI(state, scanQueue, stats = {}) {
  const changelogResult = await pollPyPIChangelog(state, scanQueue, stats);
  const changelogSucceeded = changelogResult >= 0;

  if (!changelogSucceeded) {
    console.log('[MONITOR] Using RSS fallback for PyPI');
    stats.pypiRssFallbackCount = (stats.pypiRssFallbackCount || 0) + 1;
    return pollPyPIRss(state, scanQueue);
  }

  return changelogResult;
}
979
+
645
980
  // --- Main poll orchestrator ---
646
981
 
647
982
  /**
@@ -686,7 +1021,7 @@ async function poll(state, scanQueue, stats) {
686
1021
 
687
1022
  const [npmCount, pypiCount] = await Promise.all([
688
1023
  pollNpm(state, scanQueue, stats),
689
- pollPyPI(state, scanQueue)
1024
+ pollPyPI(state, scanQueue, stats)
690
1025
  ]);
691
1026
 
692
1027
  // Track consecutive poll failures for backoff
@@ -718,6 +1053,7 @@ module.exports = {
718
1053
 
719
1054
  // HTTP helpers
720
1055
  httpsGet,
1056
+ httpsPost,
721
1057
  getWeeklyDownloads,
722
1058
  checkTrustedDepDiff,
723
1059
  TRUSTED_DEP_AGE_THRESHOLD_MS,
@@ -731,6 +1067,12 @@ module.exports = {
731
1067
  parseNpmRss,
732
1068
  parsePyPIRss,
733
1069
 
1070
+ // XML-RPC (PyPI changelog)
1071
+ buildXmlRpcCall,
1072
+ parseXmlRpcChangelog,
1073
+ parseXmlRpcInt,
1074
+ isPypiScannableAction,
1075
+
734
1076
  // CouchDB doc extraction
735
1077
  extractTarballFromDoc,
736
1078
 
@@ -738,6 +1080,11 @@ module.exports = {
738
1080
  pollNpmChanges,
739
1081
  pollNpmRss,
740
1082
  pollNpm,
1083
+ pollPyPIChangelog,
1084
+ pollPyPIRss,
741
1085
  pollPyPI,
742
- poll
1086
+ poll,
1087
+
1088
+ // Test seam — see _deps definition near the top of this file.
1089
+ _deps
743
1090
  };
@@ -1138,7 +1138,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
1138
1138
  }
1139
1139
  if (item.ecosystem === 'pypi' && !item.tarballUrl) {
1140
1140
  try {
1141
- const pypiInfo = await getPyPITarballUrl(item.name);
1141
+ const pypiInfo = await getPyPITarballUrl(item.name, item.version || '');
1142
1142
  if (!pypiInfo.url) {
1143
1143
  console.log(`[MONITOR] SKIP: ${item.name} — no tarball URL found on PyPI`);
1144
1144
  return;
@@ -76,6 +76,20 @@ const CHANGES_STREAM_URL = 'https://replicate.npmjs.com/registry/_changes';
76
76
  const CHANGES_LIMIT = 1000;
77
77
  const CHANGES_CATCHUP_MAX = 500000; // If behind by more than 500k seqs, skip to "now"
78
78
 
79
+ // --- PyPI serial constants ---
80
+ //
81
+ // PyPI's XML-RPC changelog endpoint is the canonical equivalent of npm's CouchDB
82
+ // `_changes` stream: every package event (release, file upload, removal, owner
83
+ // change…) gets a strictly monotonic integer "serial". `changelog_since_serial(n)`
84
+ // returns every event with serial > n, letting us resume losslessly across restarts.
85
+ //
86
+ // PYPI_CATCHUP_MAX is the staleness cap: if we are behind by more than this many
87
+ // serials (≈ days of activity at ~30k events/day in 2026), skip to "now" rather
88
+ // than fetch a monster batch. Mirrors CHANGES_CATCHUP_MAX for npm.
89
+ const PYPI_SERIAL_FILE = path.join(__dirname, '..', '..', 'data', 'pypi-serial.json');
90
+ const PYPI_XMLRPC_URL = 'https://pypi.org/pypi';
91
+ const PYPI_CATCHUP_MAX = 100000;
92
+
79
93
  // --- Scan memory constants ---
80
94
 
81
95
  const SCAN_MEMORY_FILE = path.join(__dirname, '..', '..', 'data', 'scan-memory.json');
@@ -191,6 +205,37 @@ function saveNpmSeq(seq) {
191
205
  atomicWriteFileSync(NPM_SEQ_FILE, JSON.stringify({ lastSeq: seq, updatedAt: new Date().toISOString() }, null, 2));
192
206
  }
193
207
 
208
+ // --- PyPI serial persistence ---
209
+
210
/**
 * Load the last processed PyPI changelog serial from its dedicated file.
 *
 * Only a non-negative safe integer is accepted. A fractional or negative
 * `lastSerial` (e.g. from a hand-edited or corrupted file) cannot be encoded
 * as an XML-RPC <int> parameter, so it is treated the same as a missing file.
 *
 * @returns {number|null} The stored serial, or null when the file is absent,
 *   unreadable, not valid JSON, or does not hold a usable serial.
 */
function loadPypiSerial() {
  try {
    if (fs.existsSync(PYPI_SERIAL_FILE)) {
      const data = JSON.parse(fs.readFileSync(PYPI_SERIAL_FILE, 'utf8'));
      if (Number.isSafeInteger(data.lastSerial) && data.lastSerial >= 0) {
        return data.lastSerial;
      }
    }
  } catch (err) {
    console.warn(`[MONITOR] Failed to load PyPI serial: ${err.message}`);
  }
  return null;
}
227
+
228
+ /**
229
+ * Persist the last processed PyPI changelog serial to a dedicated file.
230
+ * Atomic write (crash-safe). Also mirrored in monitor-state.json via saveState().
231
+ */
232
+ function savePypiSerial(serial) {
233
+ atomicWriteFileSync(
234
+ PYPI_SERIAL_FILE,
235
+ JSON.stringify({ lastSerial: serial, updatedAt: new Date().toISOString() }, null, 2)
236
+ );
237
+ }
238
+
194
239
  // --- C3: Scan Memory Management ---
195
240
 
196
241
  /**
@@ -649,10 +694,16 @@ function loadState(stats) {
649
694
  return {
650
695
  npmLastPackage: typeof state.npmLastPackage === 'string' ? state.npmLastPackage : '',
651
696
  pypiLastPackage: typeof state.pypiLastPackage === 'string' ? state.pypiLastPackage : '',
652
- npmLastSeq: state.npmLastSeq != null ? state.npmLastSeq : loadNpmSeq()
697
+ npmLastSeq: state.npmLastSeq != null ? state.npmLastSeq : loadNpmSeq(),
698
+ pypiLastSerial: state.pypiLastSerial != null ? state.pypiLastSerial : loadPypiSerial()
653
699
  };
654
700
  } catch {
655
- return { npmLastPackage: '', pypiLastPackage: '', npmLastSeq: loadNpmSeq() };
701
+ return {
702
+ npmLastPackage: '',
703
+ pypiLastPackage: '',
704
+ npmLastSeq: loadNpmSeq(),
705
+ pypiLastSerial: loadPypiSerial()
706
+ };
656
707
  }
657
708
  }
658
709
 
@@ -1180,6 +1231,9 @@ module.exports = {
1180
1231
  CHANGES_STREAM_URL,
1181
1232
  CHANGES_LIMIT,
1182
1233
  CHANGES_CATCHUP_MAX,
1234
+ PYPI_SERIAL_FILE,
1235
+ PYPI_XMLRPC_URL,
1236
+ PYPI_CATCHUP_MAX,
1183
1237
  SCAN_MEMORY_FILE,
1184
1238
  SCAN_MEMORY_EXPIRY_MS,
1185
1239
  MAX_MEMORY_ENTRIES,
@@ -1211,6 +1265,8 @@ module.exports = {
1211
1265
  atomicWriteFileSync,
1212
1266
  loadNpmSeq,
1213
1267
  saveNpmSeq,
1268
+ loadPypiSerial,
1269
+ savePypiSerial,
1214
1270
  loadScanMemory,
1215
1271
  saveScanMemory,
1216
1272
  recordScanMemory,
@@ -142,8 +142,12 @@ function getEntryPoints(packagePath) {
142
142
  candidates.push(pkg.module);
143
143
  }
144
144
 
145
- // Lifecycle scripts: extract .js files from preinstall/install/postinstall/prepare
146
- const lifecycleKeys = ['preinstall', 'install', 'postinstall', 'prepare'];
145
+ // Lifecycle scripts: extract .js files from npm lifecycle hooks
146
+ const lifecycleKeys = [
147
+ 'preinstall', 'install', 'postinstall', 'prepare',
148
+ 'prepack', 'postpack', 'prepublishOnly', 'prepublish',
149
+ 'preuninstall', 'uninstall', 'postuninstall'
150
+ ];
147
151
  if (pkg.scripts) {
148
152
  for (const key of lifecycleKeys) {
149
153
  if (typeof pkg.scripts[key] === 'string') {
package/src/scoring.js CHANGED
@@ -1186,7 +1186,7 @@ function applyFPReductions(threats, reachableFiles, packageName, packageDeps, re
1186
1186
  // MUST run AFTER benign_lifecycle reduction to correctly detect LOW lifecycle_script.
1187
1187
  const LIFECYCLE_GUARD_TYPES = new Set([
1188
1188
  'obfuscation_detected', 'dynamic_require', 'dangerous_call_function',
1189
- 'dangerous_call_eval', 'staged_payload'
1189
+ 'dangerous_call_eval', 'staged_payload', 'env_access'
1190
1190
  ]);
1191
1191
 
1192
1192
  const lifecycleThreats = threats.filter(t => t.type === 'lifecycle_script');