muaddib-scanner 2.10.63 → 2.10.65
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/muaddib.js +30 -0
- package/package.json +4 -4
- package/src/ml/classifier.js +12 -5
- package/src/monitor/auto-labeler.js +344 -0
- package/src/monitor/daemon.js +107 -7
- package/src/monitor/ingestion.js +8 -0
- package/src/monitor/queue.js +57 -13
package/bin/muaddib.js
CHANGED
|
@@ -687,6 +687,36 @@ if (command === 'version' || command === '--version' || command === '-v') {
|
|
|
687
687
|
console.log('Usage: muaddib report --now | --status');
|
|
688
688
|
process.exit(1);
|
|
689
689
|
}
|
|
690
|
+
} else if (command === 'relabel') {
|
|
691
|
+
if (wantHelp) {
|
|
692
|
+
console.log('Usage: muaddib relabel [--input <path>] [--output <path>] [--dry-run]');
|
|
693
|
+
console.log('');
|
|
694
|
+
console.log('Auto-relabel ML training data by checking registry takedown status.');
|
|
695
|
+
console.log('Verifies each package against npm/PyPI registries:');
|
|
696
|
+
console.log(' - npm 0.0.1-security → confirmed_malicious');
|
|
697
|
+
console.log(' - HTTP 404 + score >= 50 → confirmed_malicious');
|
|
698
|
+
console.log(' - Alive > 30 days + score < 20 → confirmed_benign');
|
|
699
|
+
console.log('');
|
|
700
|
+
console.log('Options:');
|
|
701
|
+
console.log(' --input <path> Input JSONL file (default: data/ml-training.jsonl)');
|
|
702
|
+
console.log(' --output <path> Output JSONL file (default: data/ml-training-relabeled.jsonl)');
|
|
703
|
+
console.log(' --dry-run Log changes without writing');
|
|
704
|
+
process.exit(0);
|
|
705
|
+
}
|
|
706
|
+
const { relabelDataset } = require('../src/monitor/auto-labeler.js');
|
|
707
|
+
let inputPath, outputPath;
|
|
708
|
+
for (let i = 0; i < options.length; i++) {
|
|
709
|
+
if (options[i] === '--input' && options[i + 1]) { inputPath = options[++i]; }
|
|
710
|
+
else if (options[i] === '--output' && options[i + 1]) { outputPath = options[++i]; }
|
|
711
|
+
}
|
|
712
|
+
const dryRun = options.includes('--dry-run');
|
|
713
|
+
relabelDataset({ input: inputPath, output: outputPath, dryRun }).then(summary => {
|
|
714
|
+
console.log(JSON.stringify(summary, null, 2));
|
|
715
|
+
process.exit(0);
|
|
716
|
+
}).catch(err => {
|
|
717
|
+
console.error('[ERROR]', err.message);
|
|
718
|
+
process.exit(1);
|
|
719
|
+
});
|
|
690
720
|
} else if (command === 'help') {
|
|
691
721
|
// muaddib help <command> — show per-command help
|
|
692
722
|
const helpCmd = options.filter(o => !o.startsWith('-'))[0];
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "muaddib-scanner",
|
|
3
|
-
"version": "2.10.
|
|
3
|
+
"version": "2.10.65",
|
|
4
4
|
"description": "Supply-chain threat detection & response for npm & PyPI/Python",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -44,10 +44,10 @@
|
|
|
44
44
|
"node": ">=18.0.0"
|
|
45
45
|
},
|
|
46
46
|
"dependencies": {
|
|
47
|
-
"@inquirer/prompts": "8.
|
|
47
|
+
"@inquirer/prompts": "8.4.1",
|
|
48
48
|
"acorn": "8.16.0",
|
|
49
49
|
"acorn-walk": "8.3.5",
|
|
50
|
-
"adm-zip": "0.5.
|
|
50
|
+
"adm-zip": "0.5.17",
|
|
51
51
|
"js-yaml": "4.1.1"
|
|
52
52
|
},
|
|
53
53
|
"overrides": {
|
|
@@ -55,7 +55,7 @@
|
|
|
55
55
|
},
|
|
56
56
|
"devDependencies": {
|
|
57
57
|
"@eslint/js": "10.0.1",
|
|
58
|
-
"eslint": "10.
|
|
58
|
+
"eslint": "10.2.0",
|
|
59
59
|
"eslint-plugin-security": "^4.0.0",
|
|
60
60
|
"globals": "17.4.0"
|
|
61
61
|
}
|
package/src/ml/classifier.js
CHANGED
|
@@ -326,21 +326,28 @@ function classifyPackage(result, meta) {
|
|
|
326
326
|
return { prediction: 'bypass', probability: 1, reason: 'high_confidence_threat' };
|
|
327
327
|
}
|
|
328
328
|
|
|
329
|
-
// Guard rail 2b: bundler model
|
|
329
|
+
// Guard rail 2b: bundler model — LOG-ONLY mode
|
|
330
|
+
// DISABLED (2026-04-08): Model semi-collapsed — gives p≈0.37 for both bundler FPs
|
|
331
|
+
// and real malware (identical output despite 11/19 features diverging). Cannot
|
|
332
|
+
// discriminate. Safe (nothing filtered at threshold 0.1) but useless.
|
|
333
|
+
// Disabled until retrained alongside ML1 on corrected JSONL data.
|
|
330
334
|
if (isBundlerModelAvailable()) {
|
|
331
335
|
const bundlerVec = buildBundlerFeatureVector(result, meta);
|
|
332
336
|
const bundlerResult = predictBundler(bundlerVec);
|
|
333
|
-
|
|
337
|
+
// Log-only: record prediction for retraining validation
|
|
338
|
+
const roundedP = Math.round(bundlerResult.probability * 1000) / 1000;
|
|
339
|
+
// When retrained and validated, remove the 'false &&' guard below.
|
|
340
|
+
if (false && bundlerResult.prediction === 'clean') {
|
|
334
341
|
return {
|
|
335
342
|
prediction: 'fp_bundler',
|
|
336
|
-
probability:
|
|
343
|
+
probability: roundedP,
|
|
337
344
|
reason: 'ml_bundler_clean'
|
|
338
345
|
};
|
|
339
346
|
}
|
|
340
347
|
return {
|
|
341
348
|
prediction: 'bypass',
|
|
342
|
-
probability:
|
|
343
|
-
reason: 'ml_bundler_malicious'
|
|
349
|
+
probability: roundedP,
|
|
350
|
+
reason: bundlerResult.prediction === 'clean' ? 'ml_bundler_clean_disabled' : 'ml_bundler_malicious'
|
|
344
351
|
};
|
|
345
352
|
}
|
|
346
353
|
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Auto-labeler — registry takedown-based ML training label correction.
|
|
5
|
+
*
|
|
6
|
+
* Verifies packages in the JSONL training dataset against npm/PyPI registries:
|
|
7
|
+
* - npm `0.0.1-security` replacement → confirmed_malicious (npm Security takedown)
|
|
8
|
+
* - HTTP 404 + high score → confirmed_malicious (removed, high conviction)
|
|
9
|
+
* - HTTP 404 + low score → removed_unlabeled (removed, unknown intent)
|
|
10
|
+
* - Alive > 30 days + low score → confirmed_benign (survival heuristic)
|
|
11
|
+
* - Alive > 30 days + moderate score → likely_benign
|
|
12
|
+
*
|
|
13
|
+
* Never modifies the input JSONL — writes a new file.
|
|
14
|
+
* Reuses the shared HTTP semaphore to avoid starving monitor scans.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const fs = require('fs');
|
|
18
|
+
const path = require('path');
|
|
19
|
+
const https = require('https');
|
|
20
|
+
const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
|
|
21
|
+
const { atomicWriteFileSync } = require('./state.js');
|
|
22
|
+
|
|
23
|
+
const DEFAULT_INPUT = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
|
|
24
|
+
const DEFAULT_OUTPUT = path.join(__dirname, '..', '..', 'data', 'ml-training-relabeled.jsonl');
|
|
25
|
+
const DEFAULT_DELAY_MS = 200; // 5 req/s max — gentle on registries
|
|
26
|
+
const SURVIVAL_DAYS = 30;
|
|
27
|
+
|
|
28
|
+
// Labels eligible for auto-relabeling
|
|
29
|
+
const RELABELABLE = new Set(['suspect', 'ml_clean', 'unconfirmed', 'clean']);
|
|
30
|
+
|
|
31
|
+
// --- HTTP helper (minimal, avoids circular deps with ingestion.js) ---
|
|
32
|
+
|
|
33
|
+
function httpsGetJson(url, timeoutMs = 15000) {
|
|
34
|
+
return new Promise((resolve, reject) => {
|
|
35
|
+
const req = https.get(url, { timeout: timeoutMs }, (res) => {
|
|
36
|
+
if (res.statusCode === 404) {
|
|
37
|
+
res.resume();
|
|
38
|
+
return resolve({ _httpStatus: 404 });
|
|
39
|
+
}
|
|
40
|
+
if (res.statusCode < 200 || res.statusCode >= 300) {
|
|
41
|
+
res.resume();
|
|
42
|
+
return reject(new Error(`HTTP ${res.statusCode} for ${url}`));
|
|
43
|
+
}
|
|
44
|
+
const chunks = [];
|
|
45
|
+
res.on('data', (chunk) => chunks.push(chunk));
|
|
46
|
+
res.on('end', () => {
|
|
47
|
+
try {
|
|
48
|
+
const body = Buffer.concat(chunks).toString('utf8');
|
|
49
|
+
resolve(JSON.parse(body));
|
|
50
|
+
} catch (err) {
|
|
51
|
+
reject(new Error(`JSON parse error for ${url}: ${err.message}`));
|
|
52
|
+
}
|
|
53
|
+
});
|
|
54
|
+
res.on('error', reject);
|
|
55
|
+
});
|
|
56
|
+
req.on('error', reject);
|
|
57
|
+
req.on('timeout', () => {
|
|
58
|
+
req.destroy();
|
|
59
|
+
reject(new Error(`Timeout for ${url}`));
|
|
60
|
+
});
|
|
61
|
+
});
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
function sleep(ms) {
|
|
65
|
+
return new Promise(resolve => setTimeout(resolve, ms));
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// --- Registry status checks ---
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Check npm registry status for a package.
|
|
72
|
+
* @param {string} name - package name
|
|
73
|
+
* @returns {Promise<{status: string, latestVersion?: string, detail?: string}>}
|
|
74
|
+
*/
|
|
75
|
+
async function checkNpmStatus(name) {
|
|
76
|
+
await acquireRegistrySlot();
|
|
77
|
+
try {
|
|
78
|
+
const data = await httpsGetJson(`https://registry.npmjs.org/${encodeURIComponent(name)}`);
|
|
79
|
+
|
|
80
|
+
if (data._httpStatus === 404) {
|
|
81
|
+
return { status: 'removed' };
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
const latest = data['dist-tags'] && data['dist-tags'].latest;
|
|
85
|
+
if (latest === '0.0.1-security') {
|
|
86
|
+
return { status: 'security_takedown', latestVersion: latest };
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
return { status: 'alive', latestVersion: latest || 'unknown' };
|
|
90
|
+
} catch (err) {
|
|
91
|
+
return { status: 'error', detail: err.message };
|
|
92
|
+
} finally {
|
|
93
|
+
releaseRegistrySlot();
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Check PyPI registry status for a package.
|
|
99
|
+
* @param {string} name - package name
|
|
100
|
+
* @returns {Promise<{status: string, detail?: string}>}
|
|
101
|
+
*/
|
|
102
|
+
async function checkPyPIStatus(name) {
|
|
103
|
+
try {
|
|
104
|
+
const data = await httpsGetJson(`https://pypi.org/pypi/${encodeURIComponent(name)}/json`);
|
|
105
|
+
|
|
106
|
+
if (data._httpStatus === 404) {
|
|
107
|
+
return { status: 'removed' };
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
return { status: 'alive' };
|
|
111
|
+
} catch (err) {
|
|
112
|
+
return { status: 'error', detail: err.message };
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
// --- Label computation ---
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Compute the new label for a record based on registry status.
|
|
120
|
+
*
|
|
121
|
+
* Guards:
|
|
122
|
+
* - security_takedown → always confirmed_malicious
|
|
123
|
+
* - removed + score >= 50 → confirmed_malicious (high conviction)
|
|
124
|
+
* - removed + score < 50 → removed_unlabeled (don't train on uncertain data)
|
|
125
|
+
* - alive + age >= 30d + score < 20 → confirmed_benign
|
|
126
|
+
* - alive + age >= 30d + score 20-34 → likely_benign
|
|
127
|
+
* - alive + age >= 30d + score >= 35 → no change (sleeper risk)
|
|
128
|
+
* - alive + age < 30d → no change (too early)
|
|
129
|
+
*
|
|
130
|
+
* @param {Object} record - JSONL training record (must have: score, timestamp, label)
|
|
131
|
+
* @param {{status: string}} registryStatus - from checkNpmStatus/checkPyPIStatus
|
|
132
|
+
* @returns {{label: string, source: string} | null} new label or null if no change
|
|
133
|
+
*/
|
|
134
|
+
function computeNewLabel(record, registryStatus) {
|
|
135
|
+
const { status } = registryStatus;
|
|
136
|
+
const score = record.score || 0;
|
|
137
|
+
|
|
138
|
+
// Already confirmed — don't re-label
|
|
139
|
+
if (record.label === 'confirmed_malicious' || record.label === 'confirmed_benign' ||
|
|
140
|
+
record.label === 'fp' || record.label === 'confirmed') {
|
|
141
|
+
return null;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// --- Takedown signals ---
|
|
145
|
+
if (status === 'security_takedown') {
|
|
146
|
+
return { label: 'confirmed_malicious', source: 'npm_security_takedown' };
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
if (status === 'removed') {
|
|
150
|
+
if (score >= 50) {
|
|
151
|
+
return { label: 'confirmed_malicious', source: 'registry_removed_high_score' };
|
|
152
|
+
}
|
|
153
|
+
return { label: 'removed_unlabeled', source: 'registry_removed_low_score' };
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
// --- Survival signals ---
|
|
157
|
+
if (status === 'alive') {
|
|
158
|
+
const recordAge = record.timestamp
|
|
159
|
+
? (Date.now() - new Date(record.timestamp).getTime()) / (1000 * 60 * 60 * 24)
|
|
160
|
+
: 0;
|
|
161
|
+
|
|
162
|
+
if (recordAge >= SURVIVAL_DAYS) {
|
|
163
|
+
if (score < 20) {
|
|
164
|
+
return { label: 'confirmed_benign', source: 'survival_30d' };
|
|
165
|
+
}
|
|
166
|
+
if (score >= 20 && score < 35) {
|
|
167
|
+
return { label: 'likely_benign', source: 'survival_30d_moderate' };
|
|
168
|
+
}
|
|
169
|
+
// score >= 35: no change (sleeper risk)
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
return null;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// --- Dataset relabeling ---
|
|
177
|
+
|
|
178
|
+
/**
|
|
179
|
+
* Read JSONL, check each unique package against registries, write relabeled output.
|
|
180
|
+
*
|
|
181
|
+
* @param {Object} [options]
|
|
182
|
+
* @param {string} [options.input] - input JSONL path
|
|
183
|
+
* @param {string} [options.output] - output JSONL path
|
|
184
|
+
* @param {boolean} [options.dryRun] - log changes without writing
|
|
185
|
+
* @param {number} [options.delayMs] - ms between registry requests
|
|
186
|
+
* @returns {Promise<Object>} summary stats
|
|
187
|
+
*/
|
|
188
|
+
async function relabelDataset(options = {}) {
|
|
189
|
+
const inputPath = options.input || DEFAULT_INPUT;
|
|
190
|
+
const outputPath = options.output || DEFAULT_OUTPUT;
|
|
191
|
+
const dryRun = options.dryRun || false;
|
|
192
|
+
const delayMs = options.delayMs != null ? options.delayMs : DEFAULT_DELAY_MS;
|
|
193
|
+
|
|
194
|
+
// 1. Read records
|
|
195
|
+
if (!fs.existsSync(inputPath)) {
|
|
196
|
+
throw new Error(`Input file not found: ${inputPath}`);
|
|
197
|
+
}
|
|
198
|
+
const content = fs.readFileSync(inputPath, 'utf8');
|
|
199
|
+
const lines = content.split('\n');
|
|
200
|
+
const records = [];
|
|
201
|
+
for (let i = 0; i < lines.length; i++) {
|
|
202
|
+
const line = lines[i].trim();
|
|
203
|
+
if (!line) continue;
|
|
204
|
+
try {
|
|
205
|
+
records.push({ idx: i, data: JSON.parse(line), raw: lines[i] });
|
|
206
|
+
} catch {
|
|
207
|
+
records.push({ idx: i, data: null, raw: lines[i] });
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
// 2. Extract unique packages eligible for relabeling
|
|
212
|
+
const packageMap = new Map(); // key → { name, ecosystem, score, timestamp, indices[] }
|
|
213
|
+
for (const rec of records) {
|
|
214
|
+
if (!rec.data) continue;
|
|
215
|
+
if (!RELABELABLE.has(rec.data.label)) continue;
|
|
216
|
+
const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
|
|
217
|
+
if (!packageMap.has(key)) {
|
|
218
|
+
packageMap.set(key, {
|
|
219
|
+
name: rec.data.name,
|
|
220
|
+
ecosystem: rec.data.ecosystem || 'npm',
|
|
221
|
+
score: rec.data.score || 0,
|
|
222
|
+
timestamp: rec.data.timestamp,
|
|
223
|
+
indices: []
|
|
224
|
+
});
|
|
225
|
+
}
|
|
226
|
+
packageMap.get(key).indices.push(rec.idx);
|
|
227
|
+
// Use highest score seen for this package
|
|
228
|
+
if ((rec.data.score || 0) > packageMap.get(key).score) {
|
|
229
|
+
packageMap.get(key).score = rec.data.score;
|
|
230
|
+
}
|
|
231
|
+
// Use earliest timestamp
|
|
232
|
+
if (rec.data.timestamp && (!packageMap.get(key).timestamp || rec.data.timestamp < packageMap.get(key).timestamp)) {
|
|
233
|
+
packageMap.get(key).timestamp = rec.data.timestamp;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
console.log(`[RELABEL] ${records.length} records, ${packageMap.size} unique packages to check`);
|
|
238
|
+
|
|
239
|
+
// 3. Check each package against registry
|
|
240
|
+
const summary = {
|
|
241
|
+
checked: 0,
|
|
242
|
+
relabeled_malicious: 0,
|
|
243
|
+
relabeled_benign: 0,
|
|
244
|
+
relabeled_likely_benign: 0,
|
|
245
|
+
removed_unlabeled: 0,
|
|
246
|
+
unchanged: 0,
|
|
247
|
+
errors: 0,
|
|
248
|
+
records_updated: 0
|
|
249
|
+
};
|
|
250
|
+
|
|
251
|
+
const labelChanges = new Map(); // packageKey → { label, source }
|
|
252
|
+
|
|
253
|
+
for (const [key, pkg] of packageMap) {
|
|
254
|
+
let registryStatus;
|
|
255
|
+
try {
|
|
256
|
+
if (pkg.ecosystem === 'npm') {
|
|
257
|
+
registryStatus = await checkNpmStatus(pkg.name);
|
|
258
|
+
} else if (pkg.ecosystem === 'pypi') {
|
|
259
|
+
registryStatus = await checkPyPIStatus(pkg.name);
|
|
260
|
+
} else {
|
|
261
|
+
summary.unchanged++;
|
|
262
|
+
summary.checked++;
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
265
|
+
} catch (err) {
|
|
266
|
+
summary.errors++;
|
|
267
|
+
summary.checked++;
|
|
268
|
+
continue;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
if (registryStatus.status === 'error') {
|
|
272
|
+
summary.errors++;
|
|
273
|
+
summary.checked++;
|
|
274
|
+
if (delayMs > 0) await sleep(delayMs);
|
|
275
|
+
continue;
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
const newLabel = computeNewLabel(pkg, registryStatus);
|
|
279
|
+
summary.checked++;
|
|
280
|
+
|
|
281
|
+
if (newLabel) {
|
|
282
|
+
labelChanges.set(key, newLabel);
|
|
283
|
+
if (newLabel.label === 'confirmed_malicious') summary.relabeled_malicious++;
|
|
284
|
+
else if (newLabel.label === 'confirmed_benign') summary.relabeled_benign++;
|
|
285
|
+
else if (newLabel.label === 'likely_benign') summary.relabeled_likely_benign++;
|
|
286
|
+
else if (newLabel.label === 'removed_unlabeled') summary.removed_unlabeled++;
|
|
287
|
+
|
|
288
|
+
if (dryRun) {
|
|
289
|
+
console.log(`[RELABEL] DRY-RUN: ${key} → ${newLabel.label} (${newLabel.source}, score=${pkg.score}, status=${registryStatus.status})`);
|
|
290
|
+
}
|
|
291
|
+
} else {
|
|
292
|
+
summary.unchanged++;
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
if (delayMs > 0) await sleep(delayMs);
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
// 4. Apply label changes to records
|
|
299
|
+
const outputLines = [];
|
|
300
|
+
for (const rec of records) {
|
|
301
|
+
if (!rec.data) {
|
|
302
|
+
outputLines.push(rec.raw);
|
|
303
|
+
continue;
|
|
304
|
+
}
|
|
305
|
+
const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
|
|
306
|
+
const change = labelChanges.get(key);
|
|
307
|
+
if (change && RELABELABLE.has(rec.data.label)) {
|
|
308
|
+
rec.data.label = change.label;
|
|
309
|
+
rec.data.relabel_source = change.source;
|
|
310
|
+
rec.data.relabel_timestamp = new Date().toISOString();
|
|
311
|
+
outputLines.push(JSON.stringify(rec.data));
|
|
312
|
+
summary.records_updated++;
|
|
313
|
+
} else {
|
|
314
|
+
outputLines.push(rec.raw);
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
// 5. Write output
|
|
319
|
+
if (!dryRun) {
|
|
320
|
+
const dir = path.dirname(outputPath);
|
|
321
|
+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
322
|
+
atomicWriteFileSync(outputPath, outputLines.join('\n'));
|
|
323
|
+
console.log(`[RELABEL] Written ${outputLines.length} records to ${path.basename(outputPath)} (${summary.records_updated} updated)`);
|
|
324
|
+
} else {
|
|
325
|
+
console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated`);
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
console.log(`[RELABEL] Summary: ${summary.relabeled_malicious} malicious, ${summary.relabeled_benign} benign, ${summary.relabeled_likely_benign} likely_benign, ${summary.removed_unlabeled} removed_unlabeled, ${summary.unchanged} unchanged, ${summary.errors} errors`);
|
|
329
|
+
|
|
330
|
+
return summary;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
module.exports = {
|
|
334
|
+
checkNpmStatus,
|
|
335
|
+
checkPyPIStatus,
|
|
336
|
+
computeNewLabel,
|
|
337
|
+
relabelDataset,
|
|
338
|
+
// Constants (for testing)
|
|
339
|
+
RELABELABLE,
|
|
340
|
+
SURVIVAL_DAYS,
|
|
341
|
+
DEFAULT_INPUT,
|
|
342
|
+
DEFAULT_OUTPUT,
|
|
343
|
+
DEFAULT_DELAY_MS
|
|
344
|
+
};
|
package/src/monitor/daemon.js
CHANGED
|
@@ -3,10 +3,10 @@ const fs = require('fs');
|
|
|
3
3
|
const path = require('path');
|
|
4
4
|
const os = require('os');
|
|
5
5
|
const { isDockerAvailable, SANDBOX_CONCURRENCY_MAX } = require('../sandbox/index.js');
|
|
6
|
-
const { setVerboseMode, isSandboxEnabled, isCanaryEnabled, isLlmDetectiveEnabled, getLlmDetectiveMode } = require('./classify.js');
|
|
6
|
+
const { setVerboseMode, isSandboxEnabled, isCanaryEnabled, isLlmDetectiveEnabled, getLlmDetectiveMode, DOWNLOADS_CACHE_TTL } = require('./classify.js');
|
|
7
7
|
const { loadState, saveState, loadDailyStats, saveDailyStats, purgeTarballCache, getParisHour, atomicWriteFileSync } = require('./state.js');
|
|
8
8
|
const { isTemporalEnabled, isTemporalAstEnabled, isTemporalPublishEnabled, isTemporalMaintainerEnabled } = require('./temporal.js');
|
|
9
|
-
const { pendingGrouped, flushScopeGroup, sendDailyReport, DAILY_REPORT_HOUR } = require('./webhook.js');
|
|
9
|
+
const { pendingGrouped, flushScopeGroup, sendDailyReport, DAILY_REPORT_HOUR, alertedPackageRules } = require('./webhook.js');
|
|
10
10
|
const { poll } = require('./ingestion.js');
|
|
11
11
|
const { processQueue, SCAN_CONCURRENCY } = require('./queue.js');
|
|
12
12
|
const { startHealthcheck } = require('./healthcheck.js');
|
|
@@ -19,6 +19,7 @@ const QUEUE_PERSIST_INTERVAL = 60_000; // Persist queue to disk every 60s
|
|
|
19
19
|
const QUEUE_STATE_FILE = path.join(__dirname, '..', '..', 'data', 'queue-state.json');
|
|
20
20
|
const QUEUE_STATE_MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24h expiry
|
|
21
21
|
const MAX_QUEUE_PERSIST_SIZE = 100_000; // Don't persist if queue > 100K items
|
|
22
|
+
const MAX_SCAN_QUEUE = 10_000; // Backpressure: skip polling when queue exceeds this
|
|
22
23
|
|
|
23
24
|
function sleep(ms) {
|
|
24
25
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
@@ -87,13 +88,18 @@ function restoreQueue(scanQueue) {
|
|
|
87
88
|
return 0;
|
|
88
89
|
}
|
|
89
90
|
|
|
90
|
-
// Restore items
|
|
91
|
-
|
|
91
|
+
// Restore items (cap at MAX_SCAN_QUEUE to prevent OOM from stale persisted queues)
|
|
92
|
+
let items = data.items;
|
|
93
|
+
if (items.length > MAX_SCAN_QUEUE) {
|
|
94
|
+
console.log(`[MONITOR] Truncating restored queue from ${items.length} to ${MAX_SCAN_QUEUE} items`);
|
|
95
|
+
items = items.slice(0, MAX_SCAN_QUEUE);
|
|
96
|
+
}
|
|
97
|
+
const count = items.length;
|
|
92
98
|
if (count === 0) {
|
|
93
99
|
try { fs.unlinkSync(QUEUE_STATE_FILE); } catch {}
|
|
94
100
|
return 0;
|
|
95
101
|
}
|
|
96
|
-
scanQueue.push(...data.items);
|
|
102
|
+
scanQueue.push(...items);
|
|
97
103
|
console.log(`[MONITOR] Restored ${count} packages from queue state (saved at ${data.savedAt})`);
|
|
98
104
|
|
|
99
105
|
// Delete after successful restore
|
|
@@ -231,6 +237,49 @@ function checkDiskSpace() {
|
|
|
231
237
|
}
|
|
232
238
|
}
|
|
233
239
|
|
|
240
|
+
// --- Memory management ---
|
|
241
|
+
|
|
242
|
+
const MAX_RECENTLY_SCANNED = 50_000;
|
|
243
|
+
const MAX_ALERTED_PACKAGES = 5_000;
|
|
244
|
+
|
|
245
|
+
/**
|
|
246
|
+
* Prune in-memory caches to prevent unbounded growth between daily resets.
|
|
247
|
+
* Called hourly from the main loop. Targets:
|
|
248
|
+
* - recentlyScanned: Set used for 24h dedup (no TTL, only cleared at daily report)
|
|
249
|
+
* - downloadsCache: Map with 24h TTL but no proactive eviction
|
|
250
|
+
* - alertedPackageRules: Map for webhook dedup (only cleared at daily report)
|
|
251
|
+
*/
|
|
252
|
+
function pruneMemoryCaches(recentlyScanned, downloadsCache, alertedPackageRules) {
|
|
253
|
+
let pruned = 0;
|
|
254
|
+
|
|
255
|
+
// 1. recentlyScanned — cap size (FIFO semantics: oldest entries are irrelevant)
|
|
256
|
+
if (recentlyScanned.size > MAX_RECENTLY_SCANNED) {
|
|
257
|
+
console.log(`[MONITOR] PRUNE: recentlyScanned ${recentlyScanned.size} > ${MAX_RECENTLY_SCANNED} — clearing`);
|
|
258
|
+
recentlyScanned.clear();
|
|
259
|
+
pruned++;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
// 2. downloadsCache — evict entries past 24h TTL
|
|
263
|
+
const now = Date.now();
|
|
264
|
+
for (const [key, entry] of downloadsCache) {
|
|
265
|
+
if (now - entry.fetchedAt > DOWNLOADS_CACHE_TTL) {
|
|
266
|
+
downloadsCache.delete(key);
|
|
267
|
+
pruned++;
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
// 3. alertedPackageRules — cap size
|
|
272
|
+
if (alertedPackageRules.size > MAX_ALERTED_PACKAGES) {
|
|
273
|
+
console.log(`[MONITOR] PRUNE: alertedPackageRules ${alertedPackageRules.size} > ${MAX_ALERTED_PACKAGES} — clearing`);
|
|
274
|
+
alertedPackageRules.clear();
|
|
275
|
+
pruned++;
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
if (pruned > 0) {
|
|
279
|
+
console.log(`[MONITOR] PRUNE: ${pruned} cache entries/collections pruned`);
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
|
|
234
283
|
function reportStats(stats) {
|
|
235
284
|
const avg = stats.scanned > 0 ? (stats.totalTimeMs / stats.scanned / 1000).toFixed(1) : '0.0';
|
|
236
285
|
const { t1, t1a, t1b, t2, t3 } = stats.suspectByTier;
|
|
@@ -432,6 +481,12 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
432
481
|
let pollInProgress = false;
|
|
433
482
|
pollIntervalHandle = setInterval(async () => {
|
|
434
483
|
if (!running || pollInProgress) return;
|
|
484
|
+
// Backpressure: skip poll when queue is too deep.
|
|
485
|
+
// CouchDB seq is NOT advanced — next poll resumes from the same point. No packages lost.
|
|
486
|
+
if (scanQueue.length >= MAX_SCAN_QUEUE) {
|
|
487
|
+
console.log(`[MONITOR] BACKPRESSURE: skipping poll (queue ${scanQueue.length} >= ${MAX_SCAN_QUEUE})`);
|
|
488
|
+
return;
|
|
489
|
+
}
|
|
435
490
|
pollInProgress = true;
|
|
436
491
|
try {
|
|
437
492
|
await poll(state, scanQueue, stats);
|
|
@@ -460,21 +515,62 @@ async function startMonitor(options, stats, dailyAlerts, recentlyScanned, downlo
|
|
|
460
515
|
// Consumes scanQueue independently of polling. Workers inside processQueue
|
|
461
516
|
// check scanQueue.length > 0 after each item, so items added by a concurrent
|
|
462
517
|
// poll are picked up immediately by running workers.
|
|
518
|
+
const MEMORY_LOG_INTERVAL = 300_000; // 5 minutes
|
|
519
|
+
const MEMORY_PRESSURE_THRESHOLD = 0.85; // 85% heap usage triggers emergency prune
|
|
520
|
+
let lastMemoryLogTime = Date.now();
|
|
521
|
+
|
|
463
522
|
while (running) {
|
|
464
523
|
if (scanQueue.length > 0) {
|
|
465
524
|
await processQueue(scanQueue, stats, dailyAlerts, recentlyScanned, downloadsCache, sandboxAvailableRef.value);
|
|
466
525
|
}
|
|
467
526
|
|
|
468
|
-
//
|
|
527
|
+
// ─── Memory watchdog (every 5 min) ───
|
|
528
|
+
if (Date.now() - lastMemoryLogTime >= MEMORY_LOG_INTERVAL) {
|
|
529
|
+
const mem = process.memoryUsage();
|
|
530
|
+
const heapUsedMB = (mem.heapUsed / 1024 / 1024).toFixed(0);
|
|
531
|
+
const heapTotalMB = (mem.heapTotal / 1024 / 1024).toFixed(0);
|
|
532
|
+
const rssMB = (mem.rss / 1024 / 1024).toFixed(0);
|
|
533
|
+
console.log(`[MONITOR] MEMORY: heap=${heapUsedMB}MB/${heapTotalMB}MB, rss=${rssMB}MB, queue=${scanQueue.length}, dedup=${recentlyScanned.size}, downloads=${downloadsCache.size}, alerts=${alertedPackageRules.size}`);
|
|
534
|
+
|
|
535
|
+
// Emergency prune under memory pressure
|
|
536
|
+
if (mem.heapUsed / mem.heapTotal > MEMORY_PRESSURE_THRESHOLD) {
|
|
537
|
+
console.error(`[MONITOR] MEMORY PRESSURE: heap at ${((mem.heapUsed / mem.heapTotal) * 100).toFixed(0)}% — emergency prune`);
|
|
538
|
+
recentlyScanned.clear();
|
|
539
|
+
downloadsCache.clear();
|
|
540
|
+
alertedPackageRules.clear();
|
|
541
|
+
// Force GC if available (requires --expose-gc)
|
|
542
|
+
if (global.gc) {
|
|
543
|
+
global.gc();
|
|
544
|
+
console.log('[MONITOR] Forced garbage collection');
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
lastMemoryLogTime = Date.now();
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
// Hourly stats report + cache purge + runsc cleanup + memory pruning
|
|
469
551
|
if (Date.now() - stats.lastReportTime >= 3600_000) {
|
|
470
552
|
reportStats(stats);
|
|
471
553
|
purgeTarballCache();
|
|
472
554
|
cleanupRunscOrphans();
|
|
555
|
+
pruneMemoryCaches(recentlyScanned, downloadsCache, alertedPackageRules);
|
|
473
556
|
}
|
|
474
557
|
|
|
475
558
|
// Daily webhook report at 08:00 Paris time
|
|
476
559
|
if (isDailyReportDue(stats)) {
|
|
477
560
|
await sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCache);
|
|
561
|
+
// Auto-relabel JSONL training data after daily report (once per day).
|
|
562
|
+
// Checks registry takedown status for unconfirmed packages.
|
|
563
|
+
try {
|
|
564
|
+
const { relabelDataset } = require('./auto-labeler.js');
|
|
565
|
+
const summary = await relabelDataset({});
|
|
566
|
+
const totalRelabeled = summary.relabeled_malicious + summary.relabeled_benign + summary.relabeled_likely_benign;
|
|
567
|
+
if (totalRelabeled > 0) {
|
|
568
|
+
console.log(`[MONITOR] Auto-relabel: ${summary.relabeled_malicious} malicious, ${summary.relabeled_benign} benign, ${summary.relabeled_likely_benign} likely_benign (${summary.checked} checked)`);
|
|
569
|
+
}
|
|
570
|
+
} catch (err) {
|
|
571
|
+
// Non-fatal: relabel failure must never crash the monitor
|
|
572
|
+
console.error(`[MONITOR] Auto-relabel failed: ${err.message}`);
|
|
573
|
+
}
|
|
478
574
|
}
|
|
479
575
|
|
|
480
576
|
// Short pause before re-checking queue — yields event loop for poll interval
|
|
@@ -499,5 +595,9 @@ module.exports = {
|
|
|
499
595
|
QUEUE_PERSIST_INTERVAL,
|
|
500
596
|
QUEUE_STATE_FILE,
|
|
501
597
|
QUEUE_STATE_MAX_AGE_MS,
|
|
502
|
-
MAX_QUEUE_PERSIST_SIZE
|
|
598
|
+
MAX_QUEUE_PERSIST_SIZE,
|
|
599
|
+
MAX_SCAN_QUEUE,
|
|
600
|
+
pruneMemoryCaches,
|
|
601
|
+
MAX_RECENTLY_SCANNED,
|
|
602
|
+
MAX_ALERTED_PACKAGES
|
|
503
603
|
};
|
package/src/monitor/ingestion.js
CHANGED
|
@@ -644,6 +644,14 @@ async function pollPyPI(state, scanQueue) {
|
|
|
644
644
|
* @param {Object} stats - Mutable stats object
|
|
645
645
|
*/
|
|
646
646
|
async function poll(state, scanQueue, stats) {
|
|
647
|
+
// Backpressure: skip ingestion when queue is saturated.
|
|
648
|
+
// CouchDB seq and PyPI lastPackage are NOT advanced — next poll resumes from same point.
|
|
649
|
+
const MAX_SCAN_QUEUE = 10_000;
|
|
650
|
+
if (scanQueue.length >= MAX_SCAN_QUEUE) {
|
|
651
|
+
console.log(`[MONITOR] BACKPRESSURE: skipping poll (queue ${scanQueue.length} >= ${MAX_SCAN_QUEUE})`);
|
|
652
|
+
return;
|
|
653
|
+
}
|
|
654
|
+
|
|
647
655
|
const timestamp = new Date().toISOString().slice(0, 19).replace('T', ' ');
|
|
648
656
|
console.log(`[MONITOR] ${timestamp} — polling registries...`);
|
|
649
657
|
|
package/src/monitor/queue.js
CHANGED
|
@@ -413,11 +413,15 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
413
413
|
// First-publish detection: used for sandbox priority below
|
|
414
414
|
const isFirstPublish = cacheTrigger && cacheTrigger.reason === 'first_publish';
|
|
415
415
|
|
|
416
|
-
//
|
|
417
|
-
//
|
|
418
|
-
//
|
|
416
|
+
// Fetch npm registry metadata for ALL npm packages (not just those with findings).
|
|
417
|
+
// Needed for: (1) isFirstPublishHighRisk decision, (2) ML classifier features,
|
|
418
|
+
// (3) JSONL training records — clean packages MUST have metadata to prevent
|
|
419
|
+
// data leakage (model learning "metadata=0 → clean" instead of behavioral signals).
|
|
420
|
+
// Cost: near-zero for npm packages because temporal checks (line ~1014) already
|
|
421
|
+
// pre-fetch registry metadata into temporal-analysis._metadataCache, and
|
|
422
|
+
// getPackageMetadata() reads this cache first (npm-registry.js:87-95).
|
|
419
423
|
let npmRegistryMeta = null;
|
|
420
|
-
if (
|
|
424
|
+
if (ecosystem === 'npm') {
|
|
421
425
|
try {
|
|
422
426
|
const { getPackageMetadata } = require('../scanner/npm-registry.js');
|
|
423
427
|
npmRegistryMeta = await getPackageMetadata(name);
|
|
@@ -589,19 +593,43 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
589
593
|
console.log(`[MONITOR] FINDINGS: ${name}@${version} → ${formatFindings(result)}`);
|
|
590
594
|
|
|
591
595
|
// ML Phase 2: classifier filter for T1 zone (score 20-34)
|
|
592
|
-
// Reduces FP webhook noise by filtering clean packages before sandbox/webhook.
|
|
593
596
|
// Guard rails in classifyPackage() ensure HC types and high-score packages are never suppressed.
|
|
594
597
|
// Hoisted so trySendWebhook can use ML result to prevent suppression (p >= 0.90).
|
|
595
|
-
//
|
|
598
|
+
//
|
|
599
|
+
// DISABLED (2026-04-08): Model has collapsed — predicts p≈0.002 for ALL inputs (always "clean"),
|
|
600
|
+
// including clearly malicious patterns (lifecycle+exec+staged_payload). This suppresses real
|
|
601
|
+
// threats as ml_clean (false negatives). Disabled until model is retrained on corrected JSONL
|
|
602
|
+
// data with balanced labels. The classifier still runs in LOG-ONLY mode to collect data for
|
|
603
|
+
// retraining validation, but its prediction is never used for filtering.
|
|
604
|
+
//
|
|
605
|
+
// Guards added: ecosystem === 'npm' (PyPI has no npm registry metadata),
|
|
606
|
+
// npmRegistryMeta fallback fetch (ensure metadata is never null for ML features).
|
|
596
607
|
let mlResult = null;
|
|
597
608
|
const riskScore = result.summary.riskScore || 0;
|
|
598
|
-
if ((tier === '1a' || tier === '1b') && riskScore >= 20 && riskScore < 35) {
|
|
609
|
+
if ((tier === '1a' || tier === '1b') && riskScore >= 20 && riskScore < 35 && ecosystem === 'npm') {
|
|
599
610
|
try {
|
|
600
611
|
const { classifyPackage, isModelAvailable } = require('../ml/classifier.js');
|
|
601
612
|
if (isModelAvailable()) {
|
|
613
|
+
// Defensive: ensure npmRegistryMeta is fetched (should already be from line ~420,
|
|
614
|
+
// but network failures can silently leave it null)
|
|
615
|
+
if (!npmRegistryMeta) {
|
|
616
|
+
try {
|
|
617
|
+
const { getPackageMetadata } = require('../scanner/npm-registry.js');
|
|
618
|
+
npmRegistryMeta = await getPackageMetadata(name);
|
|
619
|
+
if (!npmRegistryMeta) {
|
|
620
|
+
console.warn(`[ML] Registry metadata unavailable for ${name} — ML features will be zero-filled`);
|
|
621
|
+
}
|
|
622
|
+
} catch (fetchErr) {
|
|
623
|
+
console.warn(`[ML] Registry metadata fetch failed for ${name}: ${fetchErr.message}`);
|
|
624
|
+
}
|
|
625
|
+
}
|
|
602
626
|
const enrichedMeta = { npmRegistryMeta, fileCountTotal, hasTests, unpackedSize: meta.unpackedSize, registryMeta: meta };
|
|
603
627
|
mlResult = classifyPackage(result, enrichedMeta);
|
|
604
|
-
|
|
628
|
+
// LOG-ONLY: record ML prediction for retraining data but do NOT filter.
|
|
629
|
+
// When model is retrained and validated, remove the 'true ||' guard below.
|
|
630
|
+
console.log(`[MONITOR] ML LOG-ONLY: ${name}@${version} (prediction=${mlResult.prediction}, p=${mlResult.probability}, score=${riskScore})`);
|
|
631
|
+
if (false && mlResult.prediction === 'clean') {
|
|
632
|
+
// DISABLED: model collapsed (p≈0.002 for all inputs). Re-enable after retrain.
|
|
605
633
|
console.log(`[MONITOR] ML CLEAN: ${name}@${version} (p=${mlResult.probability}, score=${riskScore})`);
|
|
606
634
|
stats.mlFiltered++;
|
|
607
635
|
stats.scanned++;
|
|
@@ -612,8 +640,6 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
612
640
|
recordTrainingSample(result, { name, version, ecosystem, label: 'ml_clean', tier, registryMeta: meta, unpackedSize: meta.unpackedSize, npmRegistryMeta, fileCountTotal, hasTests });
|
|
613
641
|
return { sandboxResult: null, mlFiltered: true, tier };
|
|
614
642
|
}
|
|
615
|
-
// Not clean — proceed normally
|
|
616
|
-
console.log(`[MONITOR] ML SUSPECT: ${name}@${version} (p=${mlResult.probability}, reason=${mlResult.reason})`);
|
|
617
643
|
}
|
|
618
644
|
} catch (err) {
|
|
619
645
|
// Non-fatal: ML failure must never block the scan pipeline
|
|
@@ -865,10 +891,18 @@ function isDailyReportDue(stats) {
|
|
|
865
891
|
* Encapsulates the full per-package flow: scan -> sandbox -> reputation -> webhook.
|
|
866
892
|
*/
|
|
867
893
|
async function processQueueItem(item, stats, dailyAlerts, recentlyScanned, downloadsCache, scanQueue, sandboxAvailable) {
|
|
894
|
+
// AbortController: signals the scan to stop after timeout.
|
|
895
|
+
// Prevents zombie scans from continuing expensive work (HTTP, sandbox) in the background.
|
|
896
|
+
const controller = new AbortController();
|
|
897
|
+
const timeoutId = setTimeout(() => controller.abort(), SCAN_TIMEOUT_MS);
|
|
868
898
|
try {
|
|
869
899
|
await Promise.race([
|
|
870
|
-
resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned, downloadsCache, scanQueue, sandboxAvailable),
|
|
871
|
-
|
|
900
|
+
resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned, downloadsCache, scanQueue, sandboxAvailable, controller.signal),
|
|
901
|
+
new Promise((_, reject) => {
|
|
902
|
+
controller.signal.addEventListener('abort', () => {
|
|
903
|
+
reject(new Error(`Scan timeout after ${SCAN_TIMEOUT_MS / 1000}s`));
|
|
904
|
+
}, { once: true });
|
|
905
|
+
})
|
|
872
906
|
]);
|
|
873
907
|
} catch (err) {
|
|
874
908
|
recordError(err, stats);
|
|
@@ -900,6 +934,8 @@ async function processQueueItem(item, stats, dailyAlerts, recentlyScanned, downl
|
|
|
900
934
|
console.error(`[MONITOR] IOC fallback webhook failed: ${webhookErr.message}`);
|
|
901
935
|
}
|
|
902
936
|
}
|
|
937
|
+
} finally {
|
|
938
|
+
clearTimeout(timeoutId);
|
|
903
939
|
}
|
|
904
940
|
maybePersistDailyStats(stats, dailyAlerts);
|
|
905
941
|
|
|
@@ -942,7 +978,9 @@ async function processQueue(scanQueue, stats, dailyAlerts, recentlyScanned, down
|
|
|
942
978
|
* For npm packages, tarballUrl is already set from the registry response.
|
|
943
979
|
* For PyPI packages, we need to fetch the JSON API to get the tarball URL.
|
|
944
980
|
*/
|
|
945
|
-
async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned, downloadsCache, scanQueue, sandboxAvailable) {
|
|
981
|
+
async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned, downloadsCache, scanQueue, sandboxAvailable, signal) {
|
|
982
|
+
if (signal && signal.aborted) return;
|
|
983
|
+
|
|
946
984
|
if (item.ecosystem === 'npm' && !item.tarballUrl) {
|
|
947
985
|
try {
|
|
948
986
|
const npmInfo = await getNpmLatestTarball(item.name);
|
|
@@ -1001,6 +1039,9 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1001
1039
|
}
|
|
1002
1040
|
recentlyScanned.add(dedupeKey);
|
|
1003
1041
|
|
|
1042
|
+
// Abort check: if timeout fired during URL resolution or dedup, bail out
|
|
1043
|
+
if (signal && signal.aborted) return;
|
|
1044
|
+
|
|
1004
1045
|
// Temporal analysis: check for sudden lifecycle script changes (npm only)
|
|
1005
1046
|
// Webhooks are deferred until after sandbox confirms the threat
|
|
1006
1047
|
let temporalResult = null;
|
|
@@ -1023,6 +1064,9 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1023
1064
|
maintainerResult = maintRes.status === 'fulfilled' ? maintRes.value : null;
|
|
1024
1065
|
}
|
|
1025
1066
|
|
|
1067
|
+
// Abort check: if timeout fired during temporal checks, skip the expensive scan
|
|
1068
|
+
if (signal && signal.aborted) return;
|
|
1069
|
+
|
|
1026
1070
|
const scanResult = await scanPackage(item.name, item.version, item.ecosystem, item.tarballUrl, {
|
|
1027
1071
|
unpackedSize: item.unpackedSize || 0,
|
|
1028
1072
|
registryScripts: item.registryScripts || null,
|