muaddib-scanner 2.11.14 → 2.11.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/muaddib.js +2 -0
- package/package.json +2 -2
- package/src/monitor/ingestion.js +358 -11
- package/src/monitor/queue.js +1 -1
- package/src/monitor/state.js +58 -2
- package/src/scanner/reachability.js +6 -2
- package/src/scoring.js +1 -1
package/bin/muaddib.js
CHANGED
|
@@ -297,6 +297,7 @@ if (command === 'version' || command === '--version' || command === '-v') {
|
|
|
297
297
|
if (wantHelp) showHelp('watch');
|
|
298
298
|
watch(target);
|
|
299
299
|
} else if (command === 'update') {
|
|
300
|
+
if (wantHelp) showHelp('update');
|
|
300
301
|
updateIOCs().then(() => {
|
|
301
302
|
process.exit(0);
|
|
302
303
|
}).catch(err => {
|
|
@@ -304,6 +305,7 @@ if (command === 'version' || command === '--version' || command === '-v') {
|
|
|
304
305
|
process.exit(1);
|
|
305
306
|
});
|
|
306
307
|
} else if (command === 'scrape') {
|
|
308
|
+
if (wantHelp) showHelp('scrape');
|
|
307
309
|
runScraper().then(result => {
|
|
308
310
|
console.log(`[OK] ${result.added} new IOCs (total: ${result.total})`);
|
|
309
311
|
process.exit(0);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "muaddib-scanner",
|
|
3
|
-
"version": "2.11.14",
|
|
3
|
+
"version": "2.11.16",
|
|
4
4
|
"description": "Supply-chain threat detection & response for npm & PyPI/Python",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
"node": ">=18.0.0"
|
|
47
47
|
},
|
|
48
48
|
"dependencies": {
|
|
49
|
-
"@inquirer/prompts": "8.4.
|
|
49
|
+
"@inquirer/prompts": "8.4.3",
|
|
50
50
|
"acorn": "8.16.0",
|
|
51
51
|
"acorn-walk": "8.3.5",
|
|
52
52
|
"adm-zip": "0.5.17",
|
package/src/monitor/ingestion.js
CHANGED
|
@@ -10,7 +10,10 @@
|
|
|
10
10
|
const https = require('https');
|
|
11
11
|
const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
|
|
12
12
|
const { loadCachedIOCs } = require('../ioc/updater.js');
|
|
13
|
-
const {
|
|
13
|
+
const {
|
|
14
|
+
loadNpmSeq, saveNpmSeq, CHANGES_STREAM_URL, CHANGES_LIMIT, CHANGES_CATCHUP_MAX,
|
|
15
|
+
savePypiSerial, PYPI_XMLRPC_URL, PYPI_CATCHUP_MAX
|
|
16
|
+
} = require('./state.js');
|
|
14
17
|
const { sendIOCPreAlert } = require('./webhook.js');
|
|
15
18
|
const { evaluateCacheTrigger, POPULAR_THRESHOLD, downloadsCache, DOWNLOADS_CACHE_TTL } = require('./classify.js');
|
|
16
19
|
|
|
@@ -22,6 +25,14 @@ const POLL_MAX_BACKOFF = 960_000; // 16 minutes max backoff
|
|
|
22
25
|
// --- Mutable state ---
|
|
23
26
|
let consecutivePollErrors = 0;
|
|
24
27
|
|
|
28
|
+
// Test seam: code paths that need to be stubbed in tests call these through
// `_deps` instead of the bare module-local name, so a test can swap
// `ingestion._deps.httpsPost = fakePost` and have it take effect inside
// pollPyPIChangelog. Kept tiny on purpose — only network I/O lives here.
//
// The lookup happens at call time (`_deps.httpsPost(...)`), which is why
// replacing the property after module load is enough to redirect the call.
const _deps = {
  httpsPost: null // populated below once httpsPost is defined
};
|
|
35
|
+
|
|
25
36
|
function getConsecutivePollErrors() {
|
|
26
37
|
return consecutivePollErrors;
|
|
27
38
|
}
|
|
@@ -64,6 +75,47 @@ function httpsGet(url, timeoutMs = 30_000) {
|
|
|
64
75
|
});
|
|
65
76
|
}
|
|
66
77
|
|
|
78
|
+
/**
 * Send an HTTPS POST and resolve with the response body decoded as UTF-8.
 *
 * Lives in the ingestion module because the PyPI XML-RPC client is its only
 * caller. The default Content-Type is text/xml; entries in `headers` override
 * the defaults because they are spread last.
 *
 * @param {string} url - Absolute https URL to post to
 * @param {string|Buffer} body - Request payload (its byte length is sent as Content-Length)
 * @param {Object} [headers] - Extra request headers, merged over the defaults
 * @param {number} [timeoutMs] - Socket timeout in milliseconds
 * @returns {Promise<string>} Response body; rejects on non-2xx status, timeout, or transport error
 */
function httpsPost(url, body, headers = {}, timeoutMs = 30_000) {
  return new Promise((resolve, reject) => {
    // An invalid URL throws here and surfaces as a rejection of this promise.
    const target = new URL(url);

    const requestHeaders = {
      'Content-Type': 'text/xml',
      'Content-Length': Buffer.byteLength(body),
      ...headers
    };

    const onResponse = (res) => {
      if (res.statusCode < 200 || res.statusCode >= 300) {
        // Drain the socket so it can be released, then fail.
        res.resume();
        return reject(new Error(`HTTP ${res.statusCode} for POST ${url}`));
      }
      const received = [];
      res.on('data', (piece) => received.push(piece));
      res.on('end', () => resolve(Buffer.concat(received).toString('utf8')));
      res.on('error', reject);
    };

    const req = https.request({
      method: 'POST',
      hostname: target.hostname,
      port: target.port || 443,
      path: target.pathname + (target.search || ''),
      timeout: timeoutMs,
      headers: requestHeaders
    }, onResponse);

    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error(`Timeout for POST ${url}`));
    });

    req.write(body);
    req.end();
  });
}
|
|
116
|
+
|
|
117
|
+
_deps.httpsPost = httpsPost;
|
|
118
|
+
|
|
67
119
|
async function getWeeklyDownloads(packageName) {
|
|
68
120
|
const cached = downloadsCache.get(packageName);
|
|
69
121
|
if (cached && (Date.now() - cached.fetchedAt) < DOWNLOADS_CACHE_TTL) {
|
|
@@ -186,8 +238,13 @@ function getNpmTarballUrl(pkgData) {
|
|
|
186
238
|
return (pkgData.dist && pkgData.dist.tarball) || null;
|
|
187
239
|
}
|
|
188
240
|
|
|
189
|
-
async function getPyPITarballUrl(packageName) {
|
|
190
|
-
|
|
241
|
+
async function getPyPITarballUrl(packageName, packageVersion = '') {
|
|
242
|
+
// Per-version endpoint when we know the version (e.g. from the XML-RPC changelog) —
|
|
243
|
+
// guarantees we scan the artifact that just landed, not whatever became "latest"
|
|
244
|
+
// between event detection and scan. Falls back to /pypi/<name>/json (latest) otherwise.
|
|
245
|
+
const url = packageVersion
|
|
246
|
+
? `https://pypi.org/pypi/${encodeURIComponent(packageName)}/${encodeURIComponent(packageVersion)}/json`
|
|
247
|
+
: `https://pypi.org/pypi/${encodeURIComponent(packageName)}/json`;
|
|
191
248
|
const body = await httpsGet(url);
|
|
192
249
|
let data;
|
|
193
250
|
try {
|
|
@@ -195,7 +252,7 @@ async function getPyPITarballUrl(packageName) {
|
|
|
195
252
|
} catch (e) {
|
|
196
253
|
throw new Error(`Invalid JSON from PyPI for ${packageName}: ${e.message}`);
|
|
197
254
|
}
|
|
198
|
-
const version = (data.info && data.info.version) || '';
|
|
255
|
+
const version = (data.info && data.info.version) || packageVersion || '';
|
|
199
256
|
const urls = data.urls || [];
|
|
200
257
|
// Prefer sdist (.tar.gz)
|
|
201
258
|
const sdist = urls.find(u => u.packagetype === 'sdist' && u.url);
|
|
@@ -386,7 +443,10 @@ async function pollNpmChanges(state, scanQueue, stats) {
|
|
|
386
443
|
const currentSeq = currentSeqData.update_seq;
|
|
387
444
|
if (typeof currentSeq === 'number' && typeof data.last_seq === 'number' &&
|
|
388
445
|
(currentSeq - data.last_seq) > CHANGES_CATCHUP_MAX) {
|
|
389
|
-
|
|
446
|
+
const gap = currentSeq - lastSeq;
|
|
447
|
+
console.warn(`[MONITOR] Changes stream too far behind (${gap} changes) — skipping to current`);
|
|
448
|
+
stats.npmCatchupSkips = (stats.npmCatchupSkips || 0) + 1;
|
|
449
|
+
stats.npmCatchupSkippedSeqs = (stats.npmCatchupSkippedSeqs || 0) + gap;
|
|
390
450
|
state.npmLastSeq = currentSeq;
|
|
391
451
|
saveNpmSeq(currentSeq);
|
|
392
452
|
return 0;
|
|
@@ -590,13 +650,271 @@ async function pollNpm(state, scanQueue, stats) {
|
|
|
590
650
|
|
|
591
651
|
// --- PyPI polling ---
|
|
592
652
|
|
|
653
|
+
const PYPI_USER_AGENT = `${SELF_PACKAGE_NAME} (security-monitor; +https://github.com/DNSZLSK/muaddib)`;
|
|
654
|
+
|
|
593
655
|
/**
 * Build an XML-RPC methodCall envelope. PyPI accepts only <int> and <string>
 * params for the methods we use (changelog_last_serial, changelog_since_serial),
 * so this builder is deliberately minimal.
 *
 * String params have the three XML-significant characters (& < >) replaced by
 * their predefined entities so the envelope stays well-formed. `&` is escaped
 * first so the ampersands introduced by the other two replacements are not
 * escaped a second time.
 *
 * @param {string} method - XML-RPC method name (inserted verbatim; callers pass fixed literals)
 * @param {Array<number|string>} params - Positional params; integers and strings only
 * @returns {string} Serialized <methodCall> document
 * @throws {Error} If a param is neither an integer nor a string
 */
function buildXmlRpcCall(method, params) {
  const paramXml = params.map((p) => {
    if (typeof p === 'number' && Number.isInteger(p)) {
      return `<param><value><int>${p}</int></value></param>`;
    }
    if (typeof p === 'string') {
      // Method names + serial numbers only today, but escape anyway so the
      // builder cannot emit malformed XML if a caller ever passes free text.
      const escaped = p
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
      return `<param><value><string>${escaped}</string></value></param>`;
    }
    throw new Error(`Unsupported XML-RPC param type: ${typeof p}`);
  }).join('');
  return `<?xml version="1.0"?><methodCall><methodName>${method}</methodName><params>${paramXml}</params></methodCall>`;
}
|
|
674
|
+
|
|
675
|
+
/**
 * Parse a PyPI changelog_since_serial response.
 *
 * Response shape (per https://warehouse.pypa.io/api-reference/xml-rpc.html):
 *   <array><data>
 *     <value><array><data>
 *       <value><string>NAME</string></value>      <!-- index 0 -->
 *       <value><string>VERSION</string></value>   <!-- index 1, may be empty or <nil/> -->
 *       <value><int>TIMESTAMP</int></value>       <!-- index 2 -->
 *       <value><string>ACTION</string></value>    <!-- index 3 -->
 *       <value><int>SERIAL</int></value>          <!-- index 4 -->
 *     </data></array></value>
 *     ...
 *   </data></array>
 *
 * String members have the predefined XML entities (&lt; &gt; &amp;) decoded.
 * A <nil/> member is read as null; a null VERSION is normalized to '' so the
 * event (and its serial) is still reported instead of the tuple being dropped.
 *
 * @param {string} xml - Raw XML-RPC response body
 * @returns {Array<{name: string, version: string, timestamp: number, action: string, serial: number}>}
 *   Parsed events in document order. Returns [] for non-string input, a
 *   response without <methodResponse>, or a <fault>. Invalid tuples are skipped
 *   silently — partial data is better than dropping the whole batch.
 */
function parseXmlRpcChangelog(xml) {
  const out = [];
  if (typeof xml !== 'string' || !xml.includes('<methodResponse>')) return out;
  if (xml.includes('<fault>')) return out; // PyPI fault → caller should treat as failure

  // The response is a nested array: outer <array><data>...inner tuples...</data></array>.
  // We strip the outer wrapper first so the inner-tuple regex can't accidentally
  // greedy-match across the outer boundary (which would swallow tuple #1).
  const outerArrayStart = xml.indexOf('<array>');
  if (outerArrayStart === -1) return out;
  const outerDataStart = xml.indexOf('<data>', outerArrayStart);
  if (outerDataStart === -1) return out;
  const outerDataEnd = xml.lastIndexOf('</data>');
  if (outerDataEnd === -1 || outerDataEnd <= outerDataStart) return out;
  const body = xml.slice(outerDataStart + '<data>'.length, outerDataEnd);

  // Reverse of the escaping applied to outgoing strings. `&amp;` is decoded
  // last so that "&amp;lt;" becomes the literal text "&lt;", not "<".
  const decodeEntities = (text) => text
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&amp;/g, '&');

  // Each tuple inside `body` is exactly: <value><array><data>...</data></array></value>
  const tupleRegex = /<value>\s*<array>\s*<data>([\s\S]*?)<\/data>\s*<\/array>\s*<\/value>/g;
  let m;
  while ((m = tupleRegex.exec(body)) !== null) {
    const inner = m[1];
    const values = [];
    // Fresh /g regex per tuple so lastIndex always starts at 0.
    const valRegex = /<value>\s*(?:<string>([\s\S]*?)<\/string>|<int>(-?\d+)<\/int>|(<nil\s*\/>))\s*<\/value>/g;
    let v;
    while ((v = valRegex.exec(inner)) !== null) {
      if (v[1] !== undefined) {
        values.push(decodeEntities(v[1]));
      } else if (v[2] !== undefined) {
        values.push(parseInt(v[2], 10));
      } else {
        values.push(null); // <nil/>
      }
    }
    if (values.length !== 5) continue;
    const [name, version, timestamp, action, serial] = values;
    if (typeof name !== 'string' || typeof action !== 'string' ||
        typeof timestamp !== 'number' || typeof serial !== 'number') continue;
    out.push({ name, version: typeof version === 'string' ? version : '', timestamp, action, serial });
  }
  return out;
}
|
|
733
|
+
|
|
734
|
+
/**
 * Extract the first <value><int>…</int></value> from an XML-RPC response
 * (used for changelog_last_serial).
 *
 * @param {string} xml - Raw XML-RPC response body
 * @returns {number|null} The integer, or null when the input is not a string,
 *   contains a <fault>, or has no integer value
 */
function parseXmlRpcInt(xml) {
  if (typeof xml !== 'string') return null;
  if (xml.includes('<fault>')) return null;

  const found = /<value>\s*<int>(-?\d+)<\/int>\s*<\/value>/.exec(xml);
  if (found === null) return null;
  return parseInt(found[1], 10);
}
|
|
742
|
+
|
|
743
|
+
/**
 * Tell whether a PyPI changelog event points at newly published content
 * that is worth scanning.
 *
 * Accepted:
 *   - "new release"                          → a version was created
 *   - any action shaped "add … file …"       → an sdist or wheel was uploaded
 *     (e.g. "add source file …", "add py3 file …", "add cp… file …")
 *
 * Rejected:
 *   - anything with an empty version         → package-level administrative event
 *   - "remove …", "yank release", "unyank release", "create",
 *     owner changes ("add Owner", "remove Owner", "accepted Owner")
 *   - non-string actions
 *
 * @param {*} action - Action text from the changelog tuple
 * @param {*} version - Version from the changelog tuple (falsy means "no version")
 * @returns {boolean} true when the release should be queued for scanning
 */
function isPypiScannableAction(action, version) {
  const usable = Boolean(version) && typeof action === 'string';
  if (!usable) return false;

  const isFileUpload = action.startsWith('add ') && action.includes(' file ');
  return action === 'new release' || isFileUpload;
}
|
|
764
|
+
|
|
765
|
+
/**
 * Poll PyPI changelog via XML-RPC (primary path).
 * Equivalent of pollNpmChanges: strictly monotonic serial, lossless resume.
 *
 * Steps:
 *   1. No stored serial → ask PyPI for the current serial, persist it, queue nothing.
 *   2. Otherwise fetch every event after the stored serial.
 *   3. If the span to the last returned event exceeds PYPI_CATCHUP_MAX, jump to
 *      that serial without queuing anything.
 *   4. Otherwise queue one scan item per distinct (name, version) that carries a
 *      scannable action, then persist the highest serial seen.
 *
 * Every failure (thrown error, missing serial on init, XML-RPC fault) is logged
 * and reported as -1; this function does not throw.
 *
 * @param {Object} state - Monitor state (pypiLastSerial)
 * @param {Array} scanQueue - Mutable scan queue array
 * @param {Object} stats - Mutable stats object
 * @returns {Promise<number>} Number of packages queued, or -1 on error
 */
async function pollPyPIChangelog(state, scanQueue, stats) {
  try {
    let lastSerial = state.pypiLastSerial;

    // First run: anchor to "now" rather than replaying months of history
    if (lastSerial == null) {
      await acquireRegistrySlot();
      let initBody;
      try {
        // Called through _deps so tests can substitute the network call.
        initBody = await _deps.httpsPost(
          PYPI_XMLRPC_URL,
          buildXmlRpcCall('changelog_last_serial', []),
          { 'User-Agent': PYPI_USER_AGENT },
          10_000
        );
      } finally {
        // Always give the slot back, even when the request rejects.
        releaseRegistrySlot();
      }
      const current = parseXmlRpcInt(initBody);
      if (current == null) {
        console.warn('[MONITOR] PyPI changelog init: no serial in response');
        return -1;
      }
      state.pypiLastSerial = current;
      savePypiSerial(current);
      console.log(`[MONITOR] PyPI changelog initialized at serial ${current}`);
      return 0;
    }

    await acquireRegistrySlot();
    let body;
    try {
      body = await _deps.httpsPost(
        PYPI_XMLRPC_URL,
        buildXmlRpcCall('changelog_since_serial', [lastSerial]),
        { 'User-Agent': PYPI_USER_AGENT },
        60_000
      );
    } finally {
      releaseRegistrySlot();
    }

    const events = parseXmlRpcChangelog(body);
    if (events.length === 0) {
      // Either nothing happened or the response was a fault — distinguish.
      if (body && body.includes('<fault>')) {
        console.error('[MONITOR] PyPI changelog returned XML-RPC fault — falling back to RSS');
        return -1;
      }
      return 0;
    }

    // Catch-up protection: if events span more than PYPI_CATCHUP_MAX serials,
    // skip to the latest serial to avoid an avalanche after long downtime.
    // NOTE(review): takes the final array element as the newest event, i.e.
    // assumes the response is ordered by ascending serial — confirm.
    const lastEventSerial = events[events.length - 1].serial;
    const gap = lastEventSerial - lastSerial;
    if (gap > PYPI_CATCHUP_MAX) {
      console.warn(`[MONITOR] PyPI changelog too far behind (${gap} events) — skipping to current`);
      stats.pypiCatchupSkips = (stats.pypiCatchupSkips || 0) + 1;
      stats.pypiCatchupSkippedEvents = (stats.pypiCatchupSkippedEvents || 0) + gap;
      state.pypiLastSerial = lastEventSerial;
      savePypiSerial(lastEventSerial);
      return 0;
    }

    // Dedupe (name, version) within the batch: a single release usually emits
    // multiple events (new release + add source file + add wheel files…), but
    // there's only one thing to scan.
    const seen = new Set();
    let queued = 0;
    let maxSerial = lastSerial;

    for (const ev of events) {
      // Track the serial for every event, including ones filtered out below,
      // so non-scannable events still advance the resume point.
      if (ev.serial > maxSerial) maxSerial = ev.serial;

      if (!isPypiScannableAction(ev.action, ev.version)) continue;

      const key = `${ev.name}@${ev.version}`;
      if (seen.has(key)) continue;
      seen.add(key);

      // Skip self (mirror of the npm path — defensive even though we don't publish to PyPI)
      if (ev.name === SELF_PACKAGE_NAME) continue;

      // IOC pre-alert for known-malicious PyPI packages
      let isKnownIOC = false;
      try {
        const iocs = loadCachedIOCs();
        // PyPI IOCs are namespaced "pypi:<name>" in the wildcardPackages set
        const pypiKey = `pypi:${ev.name}`;
        // Both the namespaced key and the bare name are checked.
        isKnownIOC = iocs.wildcardPackages && (
          iocs.wildcardPackages.has(pypiKey) || iocs.wildcardPackages.has(ev.name)
        );
        if (isKnownIOC) {
          console.log(`[MONITOR] IOC PRE-ALERT (pypi): ${ev.name} — known malicious package`);
          stats.iocPreAlerts = (stats.iocPreAlerts || 0) + 1;
          // Not awaited: a webhook failure is logged and never blocks queuing.
          sendIOCPreAlert(ev.name).catch(err => {
            console.error(`[MONITOR] IOC pre-alert webhook failed for ${ev.name}: ${err.message}`);
          });
        }
      } catch { /* IOC load failure is non-fatal */ }

      scanQueue.push({
        name: ev.name,
        version: ev.version,
        ecosystem: 'pypi',
        tarballUrl: null, // resolved lazily via getPyPITarballUrl()
        isIOCMatch: isKnownIOC
      });
      queued++;
    }

    // Persist the serial both in memory and on disk before returning.
    // daemon.js also flushes state.json after the queue is saved, but writing the
    // dedicated serial file here means a crash between the two flush points costs
    // at most one poll of replay — and re-queuing the same (name, version) is
    // handled idempotently by the scan-memory dedupe downstream.
    state.pypiLastSerial = maxSerial;
    if (maxSerial !== lastSerial) {
      savePypiSerial(maxSerial);
    }

    if (queued > 0) {
      console.log(`[MONITOR] PyPI changelog: ${queued} packages queued (serial ${lastSerial} → ${maxSerial}, ${events.length} events)`);
    }
    stats.pypiChangelogPackages = (stats.pypiChangelogPackages || 0) + queued;
    stats.pypiChangelogEvents = (stats.pypiChangelogEvents || 0) + events.length;

    return queued;
  } catch (err) {
    console.error(`[MONITOR] PyPI changelog error: ${err.message} — falling back to RSS`);
    return -1;
  }
}
|
|
908
|
+
|
|
909
|
+
/**
|
|
910
|
+
* Poll PyPI RSS feed (legacy fallback).
|
|
911
|
+
* Only covers newly-registered packages (first-ever publish) and is capped at ~40 items —
|
|
912
|
+
* a single burst can silently lose events. Used only when the XML-RPC changelog fails.
|
|
595
913
|
*
|
|
596
914
|
* @param {Object} state - Monitor state object (pypiLastPackage)
|
|
597
915
|
* @param {Array} scanQueue - Mutable scan queue array
|
|
598
916
|
*/
|
|
599
|
-
async function pollPyPI(state, scanQueue) {
|
|
917
|
+
async function pollPyPIRss(state, scanQueue) {
|
|
600
918
|
const url = 'https://pypi.org/rss/packages.xml';
|
|
601
919
|
|
|
602
920
|
try {
|
|
@@ -620,7 +938,7 @@ async function pollPyPI(state, scanQueue) {
|
|
|
620
938
|
}
|
|
621
939
|
|
|
622
940
|
for (const name of newPackages) {
|
|
623
|
-
console.log(`[MONITOR] New pypi: ${name}`);
|
|
941
|
+
console.log(`[MONITOR] New pypi (rss): ${name}`);
|
|
624
942
|
// Queue PyPI packages — tarball URL resolved during scan
|
|
625
943
|
scanQueue.push({
|
|
626
944
|
name,
|
|
@@ -637,11 +955,28 @@ async function pollPyPI(state, scanQueue) {
|
|
|
637
955
|
|
|
638
956
|
return newPackages.length;
|
|
639
957
|
} catch (err) {
|
|
640
|
-
console.error(`[MONITOR] PyPI poll error: ${err.message}`);
|
|
958
|
+
console.error(`[MONITOR] PyPI RSS poll error: ${err.message}`);
|
|
641
959
|
return -1;
|
|
642
960
|
}
|
|
643
961
|
}
|
|
644
962
|
|
|
963
|
+
/**
 * Entry point for PyPI polling.
 *
 * Tries the XML-RPC changelog first (lossless, sees new versions). When that
 * reports failure, records the fallback in `stats` and polls the RSS feed
 * instead (new registrations only, lossy on bursts).
 *
 * @param {Object} state - Monitor state object
 * @param {Array} scanQueue - Mutable scan queue array
 * @param {Object} stats - Mutable stats object
 * @returns {Promise<number>} Result of whichever poller produced the answer
 */
async function pollPyPI(state, scanQueue, stats = {}) {
  const changelogResult = await pollPyPIChangelog(state, scanQueue, stats);

  // Anything that is not a non-negative count is treated as "changelog failed".
  const changelogFailed = !(changelogResult >= 0);
  if (changelogFailed) {
    console.log('[MONITOR] Using RSS fallback for PyPI');
    stats.pypiRssFallbackCount = (stats.pypiRssFallbackCount || 0) + 1;
    return pollPyPIRss(state, scanQueue);
  }

  return changelogResult;
}
|
|
979
|
+
|
|
645
980
|
// --- Main poll orchestrator ---
|
|
646
981
|
|
|
647
982
|
/**
|
|
@@ -686,7 +1021,7 @@ async function poll(state, scanQueue, stats) {
|
|
|
686
1021
|
|
|
687
1022
|
const [npmCount, pypiCount] = await Promise.all([
|
|
688
1023
|
pollNpm(state, scanQueue, stats),
|
|
689
|
-
pollPyPI(state, scanQueue)
|
|
1024
|
+
pollPyPI(state, scanQueue, stats)
|
|
690
1025
|
]);
|
|
691
1026
|
|
|
692
1027
|
// Track consecutive poll failures for backoff
|
|
@@ -718,6 +1053,7 @@ module.exports = {
|
|
|
718
1053
|
|
|
719
1054
|
// HTTP helpers
|
|
720
1055
|
httpsGet,
|
|
1056
|
+
httpsPost,
|
|
721
1057
|
getWeeklyDownloads,
|
|
722
1058
|
checkTrustedDepDiff,
|
|
723
1059
|
TRUSTED_DEP_AGE_THRESHOLD_MS,
|
|
@@ -731,6 +1067,12 @@ module.exports = {
|
|
|
731
1067
|
parseNpmRss,
|
|
732
1068
|
parsePyPIRss,
|
|
733
1069
|
|
|
1070
|
+
// XML-RPC (PyPI changelog)
|
|
1071
|
+
buildXmlRpcCall,
|
|
1072
|
+
parseXmlRpcChangelog,
|
|
1073
|
+
parseXmlRpcInt,
|
|
1074
|
+
isPypiScannableAction,
|
|
1075
|
+
|
|
734
1076
|
// CouchDB doc extraction
|
|
735
1077
|
extractTarballFromDoc,
|
|
736
1078
|
|
|
@@ -738,6 +1080,11 @@ module.exports = {
|
|
|
738
1080
|
pollNpmChanges,
|
|
739
1081
|
pollNpmRss,
|
|
740
1082
|
pollNpm,
|
|
1083
|
+
pollPyPIChangelog,
|
|
1084
|
+
pollPyPIRss,
|
|
741
1085
|
pollPyPI,
|
|
742
|
-
poll
|
|
1086
|
+
poll,
|
|
1087
|
+
|
|
1088
|
+
// Test seam — see _deps definition near the top of this file.
|
|
1089
|
+
_deps
|
|
743
1090
|
};
|
package/src/monitor/queue.js
CHANGED
|
@@ -1138,7 +1138,7 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1138
1138
|
}
|
|
1139
1139
|
if (item.ecosystem === 'pypi' && !item.tarballUrl) {
|
|
1140
1140
|
try {
|
|
1141
|
-
const pypiInfo = await getPyPITarballUrl(item.name);
|
|
1141
|
+
const pypiInfo = await getPyPITarballUrl(item.name, item.version || '');
|
|
1142
1142
|
if (!pypiInfo.url) {
|
|
1143
1143
|
console.log(`[MONITOR] SKIP: ${item.name} — no tarball URL found on PyPI`);
|
|
1144
1144
|
return;
|
package/src/monitor/state.js
CHANGED
|
@@ -76,6 +76,20 @@ const CHANGES_STREAM_URL = 'https://replicate.npmjs.com/registry/_changes';
|
|
|
76
76
|
const CHANGES_LIMIT = 1000;
|
|
77
77
|
const CHANGES_CATCHUP_MAX = 500000; // If behind by more than 500k seqs, skip to "now"
|
|
78
78
|
|
|
79
|
+
// --- PyPI serial constants ---
|
|
80
|
+
//
|
|
81
|
+
// PyPI's XML-RPC changelog endpoint is the canonical equivalent of npm's CouchDB
|
|
82
|
+
// `_changes` stream: every package event (release, file upload, removal, owner
|
|
83
|
+
// change…) gets a strictly monotonic integer "serial". `changelog_since_serial(n)`
|
|
84
|
+
// returns every event with serial > n, letting us resume losslessly across restarts.
|
|
85
|
+
//
|
|
86
|
+
// PYPI_CATCHUP_MAX is the staleness cap: if we are behind by more than this many
|
|
87
|
+
// serials (≈ days of activity at ~30k events/day in 2026), skip to "now" rather
|
|
88
|
+
// than fetch a monster batch. Mirrors CHANGES_CATCHUP_MAX for npm.
|
|
89
|
+
const PYPI_SERIAL_FILE = path.join(__dirname, '..', '..', 'data', 'pypi-serial.json');
|
|
90
|
+
const PYPI_XMLRPC_URL = 'https://pypi.org/pypi';
|
|
91
|
+
const PYPI_CATCHUP_MAX = 100000;
|
|
92
|
+
|
|
79
93
|
// --- Scan memory constants ---
|
|
80
94
|
|
|
81
95
|
const SCAN_MEMORY_FILE = path.join(__dirname, '..', '..', 'data', 'scan-memory.json');
|
|
@@ -191,6 +205,37 @@ function saveNpmSeq(seq) {
|
|
|
191
205
|
atomicWriteFileSync(NPM_SEQ_FILE, JSON.stringify({ lastSeq: seq, updatedAt: new Date().toISOString() }, null, 2));
|
|
192
206
|
}
|
|
193
207
|
|
|
208
|
+
// --- PyPI serial persistence ---
|
|
209
|
+
|
|
210
|
+
/**
 * Read the last processed PyPI changelog serial from its dedicated file.
 *
 * @returns {number|null} The stored serial, or null when the file is missing,
 *   unreadable, not valid JSON, or does not hold a finite numeric `lastSerial`
 *   (null triggers "now" initialization in the poller).
 */
function loadPypiSerial() {
  try {
    if (!fs.existsSync(PYPI_SERIAL_FILE)) return null;

    const raw = fs.readFileSync(PYPI_SERIAL_FILE, 'utf8');
    const parsed = JSON.parse(raw);
    const candidate = parsed.lastSerial;

    if (typeof candidate === 'number' && Number.isFinite(candidate)) {
      return candidate;
    }
    return null;
  } catch (err) {
    console.warn(`[MONITOR] Failed to load PyPI serial: ${err.message}`);
    return null;
  }
}
|
|
227
|
+
|
|
228
|
+
/**
 * Write the last processed PyPI changelog serial to its dedicated file,
 * together with the time of the write.
 * Atomic write (crash-safe). Also mirrored in monitor-state.json via saveState().
 *
 * @param {number} serial - Serial to persist
 */
function savePypiSerial(serial) {
  const record = { lastSerial: serial, updatedAt: new Date().toISOString() };
  const serialized = JSON.stringify(record, null, 2);
  atomicWriteFileSync(PYPI_SERIAL_FILE, serialized);
}
|
|
238
|
+
|
|
194
239
|
// --- C3: Scan Memory Management ---
|
|
195
240
|
|
|
196
241
|
/**
|
|
@@ -649,10 +694,16 @@ function loadState(stats) {
|
|
|
649
694
|
return {
|
|
650
695
|
npmLastPackage: typeof state.npmLastPackage === 'string' ? state.npmLastPackage : '',
|
|
651
696
|
pypiLastPackage: typeof state.pypiLastPackage === 'string' ? state.pypiLastPackage : '',
|
|
652
|
-
npmLastSeq: state.npmLastSeq != null ? state.npmLastSeq : loadNpmSeq()
|
|
697
|
+
npmLastSeq: state.npmLastSeq != null ? state.npmLastSeq : loadNpmSeq(),
|
|
698
|
+
pypiLastSerial: state.pypiLastSerial != null ? state.pypiLastSerial : loadPypiSerial()
|
|
653
699
|
};
|
|
654
700
|
} catch {
|
|
655
|
-
return {
|
|
701
|
+
return {
|
|
702
|
+
npmLastPackage: '',
|
|
703
|
+
pypiLastPackage: '',
|
|
704
|
+
npmLastSeq: loadNpmSeq(),
|
|
705
|
+
pypiLastSerial: loadPypiSerial()
|
|
706
|
+
};
|
|
656
707
|
}
|
|
657
708
|
}
|
|
658
709
|
|
|
@@ -1180,6 +1231,9 @@ module.exports = {
|
|
|
1180
1231
|
CHANGES_STREAM_URL,
|
|
1181
1232
|
CHANGES_LIMIT,
|
|
1182
1233
|
CHANGES_CATCHUP_MAX,
|
|
1234
|
+
PYPI_SERIAL_FILE,
|
|
1235
|
+
PYPI_XMLRPC_URL,
|
|
1236
|
+
PYPI_CATCHUP_MAX,
|
|
1183
1237
|
SCAN_MEMORY_FILE,
|
|
1184
1238
|
SCAN_MEMORY_EXPIRY_MS,
|
|
1185
1239
|
MAX_MEMORY_ENTRIES,
|
|
@@ -1211,6 +1265,8 @@ module.exports = {
|
|
|
1211
1265
|
atomicWriteFileSync,
|
|
1212
1266
|
loadNpmSeq,
|
|
1213
1267
|
saveNpmSeq,
|
|
1268
|
+
loadPypiSerial,
|
|
1269
|
+
savePypiSerial,
|
|
1214
1270
|
loadScanMemory,
|
|
1215
1271
|
saveScanMemory,
|
|
1216
1272
|
recordScanMemory,
|
|
@@ -142,8 +142,12 @@ function getEntryPoints(packagePath) {
|
|
|
142
142
|
candidates.push(pkg.module);
|
|
143
143
|
}
|
|
144
144
|
|
|
145
|
-
// Lifecycle scripts: extract .js files from
|
|
146
|
-
const lifecycleKeys = [
|
|
145
|
+
// Lifecycle scripts: extract .js files from npm lifecycle hooks
|
|
146
|
+
const lifecycleKeys = [
|
|
147
|
+
'preinstall', 'install', 'postinstall', 'prepare',
|
|
148
|
+
'prepack', 'postpack', 'prepublishOnly', 'prepublish',
|
|
149
|
+
'preuninstall', 'uninstall', 'postuninstall'
|
|
150
|
+
];
|
|
147
151
|
if (pkg.scripts) {
|
|
148
152
|
for (const key of lifecycleKeys) {
|
|
149
153
|
if (typeof pkg.scripts[key] === 'string') {
|
package/src/scoring.js
CHANGED
|
@@ -1186,7 +1186,7 @@ function applyFPReductions(threats, reachableFiles, packageName, packageDeps, re
|
|
|
1186
1186
|
// MUST run AFTER benign_lifecycle reduction to correctly detect LOW lifecycle_script.
|
|
1187
1187
|
const LIFECYCLE_GUARD_TYPES = new Set([
|
|
1188
1188
|
'obfuscation_detected', 'dynamic_require', 'dangerous_call_function',
|
|
1189
|
-
'dangerous_call_eval', 'staged_payload'
|
|
1189
|
+
'dangerous_call_eval', 'staged_payload', 'env_access'
|
|
1190
1190
|
]);
|
|
1191
1191
|
|
|
1192
1192
|
const lifecycleThreats = threats.filter(t => t.type === 'lifecycle_script');
|