muaddib-scanner 2.10.65 → 2.10.67
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/monitor/auto-labeler.js +121 -102
package/package.json
CHANGED
|
@@ -18,11 +18,10 @@ const fs = require('fs');
|
|
|
18
18
|
const path = require('path');
|
|
19
19
|
const https = require('https');
|
|
20
20
|
const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
|
|
21
|
-
const { atomicWriteFileSync } = require('./state.js');
|
|
22
21
|
|
|
23
22
|
const DEFAULT_INPUT = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
|
|
24
23
|
const DEFAULT_OUTPUT = path.join(__dirname, '..', '..', 'data', 'ml-training-relabeled.jsonl');
|
|
25
|
-
const DEFAULT_DELAY_MS =
|
|
24
|
+
const DEFAULT_DELAY_MS = 50; // 20 req/s — CLI one-shot, no monitor slot sharing needed
|
|
26
25
|
const SURVIVAL_DAYS = 30;
|
|
27
26
|
|
|
28
27
|
// Labels eligible for auto-relabeling
|
|
@@ -72,8 +71,8 @@ function sleep(ms) {
|
|
|
72
71
|
* @param {string} name - package name
|
|
73
72
|
* @returns {Promise<{status: string, latestVersion?: string, detail?: string}>}
|
|
74
73
|
*/
|
|
75
|
-
async function checkNpmStatus(name) {
|
|
76
|
-
await acquireRegistrySlot();
|
|
74
|
+
async function checkNpmStatus(name, options = {}) {
|
|
75
|
+
if (!options.skipSemaphore) await acquireRegistrySlot();
|
|
77
76
|
try {
|
|
78
77
|
const data = await httpsGetJson(`https://registry.npmjs.org/${encodeURIComponent(name)}`);
|
|
79
78
|
|
|
@@ -90,7 +89,7 @@ async function checkNpmStatus(name) {
|
|
|
90
89
|
} catch (err) {
|
|
91
90
|
return { status: 'error', detail: err.message };
|
|
92
91
|
} finally {
|
|
93
|
-
releaseRegistrySlot();
|
|
92
|
+
if (!options.skipSemaphore) releaseRegistrySlot();
|
|
94
93
|
}
|
|
95
94
|
}
|
|
96
95
|
|
|
@@ -191,52 +190,42 @@ async function relabelDataset(options = {}) {
|
|
|
191
190
|
const dryRun = options.dryRun || false;
|
|
192
191
|
const delayMs = options.delayMs != null ? options.delayMs : DEFAULT_DELAY_MS;
|
|
193
192
|
|
|
194
|
-
// 1.
|
|
193
|
+
// 1. Build package map from input (records freed after block scope)
|
|
195
194
|
if (!fs.existsSync(inputPath)) {
|
|
196
195
|
throw new Error(`Input file not found: ${inputPath}`);
|
|
197
196
|
}
|
|
198
|
-
|
|
199
|
-
const
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
const
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
timestamp
|
|
223
|
-
|
|
224
|
-
});
|
|
225
|
-
}
|
|
226
|
-
packageMap.get(key).indices.push(rec.idx);
|
|
227
|
-
// Use highest score seen for this package
|
|
228
|
-
if ((rec.data.score || 0) > packageMap.get(key).score) {
|
|
229
|
-
packageMap.get(key).score = rec.data.score;
|
|
230
|
-
}
|
|
231
|
-
// Use earliest timestamp
|
|
232
|
-
if (rec.data.timestamp && (!packageMap.get(key).timestamp || rec.data.timestamp < packageMap.get(key).timestamp)) {
|
|
233
|
-
packageMap.get(key).timestamp = rec.data.timestamp;
|
|
197
|
+
let recordCount = 0;
|
|
198
|
+
const packageMap = new Map(); // key → { name, ecosystem, score, timestamp }
|
|
199
|
+
{
|
|
200
|
+
const content = fs.readFileSync(inputPath, 'utf8');
|
|
201
|
+
const lines = content.split('\n');
|
|
202
|
+
for (let i = 0; i < lines.length; i++) {
|
|
203
|
+
const line = lines[i].trim();
|
|
204
|
+
if (!line) continue;
|
|
205
|
+
recordCount++;
|
|
206
|
+
let data;
|
|
207
|
+
try { data = JSON.parse(line); } catch { continue; }
|
|
208
|
+
if (!RELABELABLE.has(data.label)) continue;
|
|
209
|
+
const key = `${data.ecosystem || 'npm'}/${data.name}`;
|
|
210
|
+
if (!packageMap.has(key)) {
|
|
211
|
+
packageMap.set(key, {
|
|
212
|
+
name: data.name,
|
|
213
|
+
ecosystem: data.ecosystem || 'npm',
|
|
214
|
+
score: data.score || 0,
|
|
215
|
+
timestamp: data.timestamp
|
|
216
|
+
});
|
|
217
|
+
}
|
|
218
|
+
const pkg = packageMap.get(key);
|
|
219
|
+
if ((data.score || 0) > pkg.score) pkg.score = data.score;
|
|
220
|
+
if (data.timestamp && (!pkg.timestamp || data.timestamp < pkg.timestamp)) {
|
|
221
|
+
pkg.timestamp = data.timestamp;
|
|
222
|
+
}
|
|
234
223
|
}
|
|
235
|
-
}
|
|
224
|
+
} // content, lines — eligible for GC before registry checks
|
|
236
225
|
|
|
237
|
-
console.log(`[RELABEL] ${
|
|
226
|
+
console.log(`[RELABEL] ${recordCount} records, ${packageMap.size} unique packages to check`);
|
|
238
227
|
|
|
239
|
-
//
|
|
228
|
+
// 2. Check each package against registry (crash-resilient)
|
|
240
229
|
const summary = {
|
|
241
230
|
checked: 0,
|
|
242
231
|
relabeled_malicious: 0,
|
|
@@ -247,85 +236,115 @@ async function relabelDataset(options = {}) {
|
|
|
247
236
|
errors: 0,
|
|
248
237
|
records_updated: 0
|
|
249
238
|
};
|
|
250
|
-
|
|
251
239
|
const labelChanges = new Map(); // packageKey → { label, source }
|
|
240
|
+
let registryError = null;
|
|
252
241
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
242
|
+
try {
|
|
243
|
+
const total = packageMap.size;
|
|
244
|
+
for (const [key, pkg] of packageMap) {
|
|
245
|
+
const t0 = Date.now();
|
|
246
|
+
let registryStatus;
|
|
247
|
+
try {
|
|
248
|
+
if (pkg.ecosystem === 'npm') {
|
|
249
|
+
registryStatus = await checkNpmStatus(pkg.name, { skipSemaphore: true });
|
|
250
|
+
} else if (pkg.ecosystem === 'pypi') {
|
|
251
|
+
registryStatus = await checkPyPIStatus(pkg.name);
|
|
252
|
+
} else {
|
|
253
|
+
summary.unchanged++;
|
|
254
|
+
summary.checked++;
|
|
255
|
+
continue;
|
|
256
|
+
}
|
|
257
|
+
} catch (err) {
|
|
258
|
+
summary.errors++;
|
|
262
259
|
summary.checked++;
|
|
260
|
+
console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${err.message}`);
|
|
263
261
|
continue;
|
|
264
262
|
}
|
|
265
|
-
} catch (err) {
|
|
266
|
-
summary.errors++;
|
|
267
|
-
summary.checked++;
|
|
268
|
-
continue;
|
|
269
|
-
}
|
|
270
263
|
|
|
271
|
-
|
|
272
|
-
|
|
264
|
+
if (registryStatus.status === 'error') {
|
|
265
|
+
summary.errors++;
|
|
266
|
+
summary.checked++;
|
|
267
|
+
console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${registryStatus.detail}`);
|
|
268
|
+
if (delayMs > 0) await sleep(delayMs);
|
|
269
|
+
continue;
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
const newLabel = computeNewLabel(pkg, registryStatus);
|
|
273
273
|
summary.checked++;
|
|
274
|
-
|
|
275
|
-
continue;
|
|
276
|
-
}
|
|
274
|
+
console.log(`[RELABEL] ${key} → ${newLabel ? newLabel.label : 'unchanged'} (${registryStatus.status}, ${Date.now() - t0}ms)`);
|
|
277
275
|
|
|
278
|
-
|
|
279
|
-
|
|
276
|
+
if (summary.checked % 100 === 0) {
|
|
277
|
+
console.log(`[RELABEL] Progress: ${summary.checked}/${total} checked`);
|
|
278
|
+
}
|
|
280
279
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
280
|
+
if (newLabel) {
|
|
281
|
+
labelChanges.set(key, newLabel);
|
|
282
|
+
if (newLabel.label === 'confirmed_malicious') summary.relabeled_malicious++;
|
|
283
|
+
else if (newLabel.label === 'confirmed_benign') summary.relabeled_benign++;
|
|
284
|
+
else if (newLabel.label === 'likely_benign') summary.relabeled_likely_benign++;
|
|
285
|
+
else if (newLabel.label === 'removed_unlabeled') summary.removed_unlabeled++;
|
|
287
286
|
|
|
288
|
-
|
|
289
|
-
|
|
287
|
+
if (dryRun) {
|
|
288
|
+
console.log(`[RELABEL] DRY-RUN: ${key} → ${newLabel.label} (${newLabel.source}, score=${pkg.score}, status=${registryStatus.status})`);
|
|
289
|
+
}
|
|
290
|
+
} else {
|
|
291
|
+
summary.unchanged++;
|
|
290
292
|
}
|
|
291
|
-
} else {
|
|
292
|
-
summary.unchanged++;
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
if (delayMs > 0) await sleep(delayMs);
|
|
296
|
-
}
|
|
297
293
|
|
|
298
|
-
|
|
299
|
-
const outputLines = [];
|
|
300
|
-
for (const rec of records) {
|
|
301
|
-
if (!rec.data) {
|
|
302
|
-
outputLines.push(rec.raw);
|
|
303
|
-
continue;
|
|
304
|
-
}
|
|
305
|
-
const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
|
|
306
|
-
const change = labelChanges.get(key);
|
|
307
|
-
if (change && RELABELABLE.has(rec.data.label)) {
|
|
308
|
-
rec.data.label = change.label;
|
|
309
|
-
rec.data.relabel_source = change.source;
|
|
310
|
-
rec.data.relabel_timestamp = new Date().toISOString();
|
|
311
|
-
outputLines.push(JSON.stringify(rec.data));
|
|
312
|
-
summary.records_updated++;
|
|
313
|
-
} else {
|
|
314
|
-
outputLines.push(rec.raw);
|
|
294
|
+
if (delayMs > 0) await sleep(delayMs);
|
|
315
295
|
}
|
|
296
|
+
} catch (err) {
|
|
297
|
+
registryError = err;
|
|
298
|
+
console.error(`[RELABEL] Registry check interrupted at ${summary.checked}/${packageMap.size}: ${err.message}`);
|
|
316
299
|
}
|
|
317
300
|
|
|
318
|
-
//
|
|
301
|
+
// 3. Stream output: re-read input, apply collected labelChanges, write line by line
|
|
319
302
|
if (!dryRun) {
|
|
320
303
|
const dir = path.dirname(outputPath);
|
|
321
304
|
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
322
|
-
|
|
323
|
-
|
|
305
|
+
const tmpPath = outputPath + '.tmp';
|
|
306
|
+
const ws = fs.createWriteStream(tmpPath);
|
|
307
|
+
const relabelTs = new Date().toISOString();
|
|
308
|
+
const inputContent = fs.readFileSync(inputPath, 'utf8');
|
|
309
|
+
const inputLines = inputContent.split('\n');
|
|
310
|
+
let linesWritten = 0;
|
|
311
|
+
for (let i = 0; i < inputLines.length; i++) {
|
|
312
|
+
const raw = inputLines[i];
|
|
313
|
+
const trimmed = raw.trim();
|
|
314
|
+
let outputLine = raw;
|
|
315
|
+
if (trimmed) {
|
|
316
|
+
try {
|
|
317
|
+
const data = JSON.parse(trimmed);
|
|
318
|
+
const key = `${data.ecosystem || 'npm'}/${data.name}`;
|
|
319
|
+
const change = labelChanges.get(key);
|
|
320
|
+
if (change && RELABELABLE.has(data.label)) {
|
|
321
|
+
data.label = change.label;
|
|
322
|
+
data.relabel_source = change.source;
|
|
323
|
+
data.relabel_timestamp = relabelTs;
|
|
324
|
+
outputLine = JSON.stringify(data);
|
|
325
|
+
summary.records_updated++;
|
|
326
|
+
}
|
|
327
|
+
} catch { /* unparseable line — keep as-is */ }
|
|
328
|
+
linesWritten++;
|
|
329
|
+
}
|
|
330
|
+
ws.write(outputLine);
|
|
331
|
+
if (i < inputLines.length - 1) ws.write('\n');
|
|
332
|
+
}
|
|
333
|
+
await new Promise((resolve, reject) => {
|
|
334
|
+
ws.on('finish', resolve);
|
|
335
|
+
ws.on('error', reject);
|
|
336
|
+
ws.end();
|
|
337
|
+
});
|
|
338
|
+
fs.renameSync(tmpPath, outputPath);
|
|
339
|
+
console.log(`[RELABEL] Written ${linesWritten} records to ${path.basename(outputPath)} (${summary.records_updated} updated${registryError ? ', PARTIAL' : ''})`);
|
|
324
340
|
} else {
|
|
325
|
-
console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated`);
|
|
341
|
+
console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated${registryError ? ' (PARTIAL)' : ''}`);
|
|
326
342
|
}
|
|
327
343
|
|
|
328
344
|
console.log(`[RELABEL] Summary: ${summary.relabeled_malicious} malicious, ${summary.relabeled_benign} benign, ${summary.relabeled_likely_benign} likely_benign, ${summary.removed_unlabeled} removed_unlabeled, ${summary.unchanged} unchanged, ${summary.errors} errors`);
|
|
345
|
+
if (registryError) {
|
|
346
|
+
console.error(`[RELABEL] WARNING: Partial results — registry check failed after ${summary.checked}/${packageMap.size} packages`);
|
|
347
|
+
}
|
|
329
348
|
|
|
330
349
|
return summary;
|
|
331
350
|
}
|