muaddib-scanner 2.10.66 → 2.10.67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/monitor/auto-labeler.js +117 -107
package/package.json
CHANGED
package/src/monitor/auto-labeler.js
CHANGED
|
@@ -18,7 +18,6 @@ const fs = require('fs');
|
|
|
18
18
|
const path = require('path');
|
|
19
19
|
const https = require('https');
|
|
20
20
|
const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
|
|
21
|
-
const { atomicWriteFileSync } = require('./state.js');
|
|
22
21
|
|
|
23
22
|
const DEFAULT_INPUT = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
|
|
24
23
|
const DEFAULT_OUTPUT = path.join(__dirname, '..', '..', 'data', 'ml-training-relabeled.jsonl');
|
|
@@ -191,52 +190,42 @@ async function relabelDataset(options = {}) {
|
|
|
191
190
|
const dryRun = options.dryRun || false;
|
|
192
191
|
const delayMs = options.delayMs != null ? options.delayMs : DEFAULT_DELAY_MS;
|
|
193
192
|
|
|
194
|
-
// 1.
|
|
193
|
+
// 1. Build package map from input (records freed after block scope)
|
|
195
194
|
if (!fs.existsSync(inputPath)) {
|
|
196
195
|
throw new Error(`Input file not found: ${inputPath}`);
|
|
197
196
|
}
|
|
198
|
-
|
|
199
|
-
const
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
const
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
timestamp
|
|
223
|
-
|
|
224
|
-
});
|
|
225
|
-
}
|
|
226
|
-
packageMap.get(key).indices.push(rec.idx);
|
|
227
|
-
// Use highest score seen for this package
|
|
228
|
-
if ((rec.data.score || 0) > packageMap.get(key).score) {
|
|
229
|
-
packageMap.get(key).score = rec.data.score;
|
|
230
|
-
}
|
|
231
|
-
// Use earliest timestamp
|
|
232
|
-
if (rec.data.timestamp && (!packageMap.get(key).timestamp || rec.data.timestamp < packageMap.get(key).timestamp)) {
|
|
233
|
-
packageMap.get(key).timestamp = rec.data.timestamp;
|
|
197
|
+
let recordCount = 0;
|
|
198
|
+
const packageMap = new Map(); // key → { name, ecosystem, score, timestamp }
|
|
199
|
+
{
|
|
200
|
+
const content = fs.readFileSync(inputPath, 'utf8');
|
|
201
|
+
const lines = content.split('\n');
|
|
202
|
+
for (let i = 0; i < lines.length; i++) {
|
|
203
|
+
const line = lines[i].trim();
|
|
204
|
+
if (!line) continue;
|
|
205
|
+
recordCount++;
|
|
206
|
+
let data;
|
|
207
|
+
try { data = JSON.parse(line); } catch { continue; }
|
|
208
|
+
if (!RELABELABLE.has(data.label)) continue;
|
|
209
|
+
const key = `${data.ecosystem || 'npm'}/${data.name}`;
|
|
210
|
+
if (!packageMap.has(key)) {
|
|
211
|
+
packageMap.set(key, {
|
|
212
|
+
name: data.name,
|
|
213
|
+
ecosystem: data.ecosystem || 'npm',
|
|
214
|
+
score: data.score || 0,
|
|
215
|
+
timestamp: data.timestamp
|
|
216
|
+
});
|
|
217
|
+
}
|
|
218
|
+
const pkg = packageMap.get(key);
|
|
219
|
+
if ((data.score || 0) > pkg.score) pkg.score = data.score;
|
|
220
|
+
if (data.timestamp && (!pkg.timestamp || data.timestamp < pkg.timestamp)) {
|
|
221
|
+
pkg.timestamp = data.timestamp;
|
|
222
|
+
}
|
|
234
223
|
}
|
|
235
|
-
}
|
|
224
|
+
} // content, lines — eligible for GC before registry checks
|
|
236
225
|
|
|
237
|
-
console.log(`[RELABEL] ${
|
|
226
|
+
console.log(`[RELABEL] ${recordCount} records, ${packageMap.size} unique packages to check`);
|
|
238
227
|
|
|
239
|
-
//
|
|
228
|
+
// 2. Check each package against registry (crash-resilient)
|
|
240
229
|
const summary = {
|
|
241
230
|
checked: 0,
|
|
242
231
|
relabeled_malicious: 0,
|
|
@@ -247,94 +236,115 @@ async function relabelDataset(options = {}) {
|
|
|
247
236
|
errors: 0,
|
|
248
237
|
records_updated: 0
|
|
249
238
|
};
|
|
250
|
-
|
|
251
239
|
const labelChanges = new Map(); // packageKey → { label, source }
|
|
240
|
+
let registryError = null;
|
|
252
241
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
const
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
242
|
+
try {
|
|
243
|
+
const total = packageMap.size;
|
|
244
|
+
for (const [key, pkg] of packageMap) {
|
|
245
|
+
const t0 = Date.now();
|
|
246
|
+
let registryStatus;
|
|
247
|
+
try {
|
|
248
|
+
if (pkg.ecosystem === 'npm') {
|
|
249
|
+
registryStatus = await checkNpmStatus(pkg.name, { skipSemaphore: true });
|
|
250
|
+
} else if (pkg.ecosystem === 'pypi') {
|
|
251
|
+
registryStatus = await checkPyPIStatus(pkg.name);
|
|
252
|
+
} else {
|
|
253
|
+
summary.unchanged++;
|
|
254
|
+
summary.checked++;
|
|
255
|
+
continue;
|
|
256
|
+
}
|
|
257
|
+
} catch (err) {
|
|
258
|
+
summary.errors++;
|
|
264
259
|
summary.checked++;
|
|
260
|
+
console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${err.message}`);
|
|
265
261
|
continue;
|
|
266
262
|
}
|
|
267
|
-
} catch (err) {
|
|
268
|
-
summary.errors++;
|
|
269
|
-
summary.checked++;
|
|
270
|
-
console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${err.message}`);
|
|
271
|
-
continue;
|
|
272
|
-
}
|
|
273
263
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
264
|
+
if (registryStatus.status === 'error') {
|
|
265
|
+
summary.errors++;
|
|
266
|
+
summary.checked++;
|
|
267
|
+
console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${registryStatus.detail}`);
|
|
268
|
+
if (delayMs > 0) await sleep(delayMs);
|
|
269
|
+
continue;
|
|
270
|
+
}
|
|
281
271
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
272
|
+
const newLabel = computeNewLabel(pkg, registryStatus);
|
|
273
|
+
summary.checked++;
|
|
274
|
+
console.log(`[RELABEL] ${key} → ${newLabel ? newLabel.label : 'unchanged'} (${registryStatus.status}, ${Date.now() - t0}ms)`);
|
|
285
275
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
276
|
+
if (summary.checked % 100 === 0) {
|
|
277
|
+
console.log(`[RELABEL] Progress: ${summary.checked}/${total} checked`);
|
|
278
|
+
}
|
|
289
279
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
280
|
+
if (newLabel) {
|
|
281
|
+
labelChanges.set(key, newLabel);
|
|
282
|
+
if (newLabel.label === 'confirmed_malicious') summary.relabeled_malicious++;
|
|
283
|
+
else if (newLabel.label === 'confirmed_benign') summary.relabeled_benign++;
|
|
284
|
+
else if (newLabel.label === 'likely_benign') summary.relabeled_likely_benign++;
|
|
285
|
+
else if (newLabel.label === 'removed_unlabeled') summary.removed_unlabeled++;
|
|
296
286
|
|
|
297
|
-
|
|
298
|
-
|
|
287
|
+
if (dryRun) {
|
|
288
|
+
console.log(`[RELABEL] DRY-RUN: ${key} → ${newLabel.label} (${newLabel.source}, score=${pkg.score}, status=${registryStatus.status})`);
|
|
289
|
+
}
|
|
290
|
+
} else {
|
|
291
|
+
summary.unchanged++;
|
|
299
292
|
}
|
|
300
|
-
} else {
|
|
301
|
-
summary.unchanged++;
|
|
302
|
-
}
|
|
303
293
|
|
|
304
|
-
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
// 4. Apply label changes to records
|
|
308
|
-
const outputLines = [];
|
|
309
|
-
for (const rec of records) {
|
|
310
|
-
if (!rec.data) {
|
|
311
|
-
outputLines.push(rec.raw);
|
|
312
|
-
continue;
|
|
313
|
-
}
|
|
314
|
-
const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
|
|
315
|
-
const change = labelChanges.get(key);
|
|
316
|
-
if (change && RELABELABLE.has(rec.data.label)) {
|
|
317
|
-
rec.data.label = change.label;
|
|
318
|
-
rec.data.relabel_source = change.source;
|
|
319
|
-
rec.data.relabel_timestamp = new Date().toISOString();
|
|
320
|
-
outputLines.push(JSON.stringify(rec.data));
|
|
321
|
-
summary.records_updated++;
|
|
322
|
-
} else {
|
|
323
|
-
outputLines.push(rec.raw);
|
|
294
|
+
if (delayMs > 0) await sleep(delayMs);
|
|
324
295
|
}
|
|
296
|
+
} catch (err) {
|
|
297
|
+
registryError = err;
|
|
298
|
+
console.error(`[RELABEL] Registry check interrupted at ${summary.checked}/${packageMap.size}: ${err.message}`);
|
|
325
299
|
}
|
|
326
300
|
|
|
327
|
-
//
|
|
301
|
+
// 3. Stream output: re-read input, apply collected labelChanges, write line by line
|
|
328
302
|
if (!dryRun) {
|
|
329
303
|
const dir = path.dirname(outputPath);
|
|
330
304
|
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
331
|
-
|
|
332
|
-
|
|
305
|
+
const tmpPath = outputPath + '.tmp';
|
|
306
|
+
const ws = fs.createWriteStream(tmpPath);
|
|
307
|
+
const relabelTs = new Date().toISOString();
|
|
308
|
+
const inputContent = fs.readFileSync(inputPath, 'utf8');
|
|
309
|
+
const inputLines = inputContent.split('\n');
|
|
310
|
+
let linesWritten = 0;
|
|
311
|
+
for (let i = 0; i < inputLines.length; i++) {
|
|
312
|
+
const raw = inputLines[i];
|
|
313
|
+
const trimmed = raw.trim();
|
|
314
|
+
let outputLine = raw;
|
|
315
|
+
if (trimmed) {
|
|
316
|
+
try {
|
|
317
|
+
const data = JSON.parse(trimmed);
|
|
318
|
+
const key = `${data.ecosystem || 'npm'}/${data.name}`;
|
|
319
|
+
const change = labelChanges.get(key);
|
|
320
|
+
if (change && RELABELABLE.has(data.label)) {
|
|
321
|
+
data.label = change.label;
|
|
322
|
+
data.relabel_source = change.source;
|
|
323
|
+
data.relabel_timestamp = relabelTs;
|
|
324
|
+
outputLine = JSON.stringify(data);
|
|
325
|
+
summary.records_updated++;
|
|
326
|
+
}
|
|
327
|
+
} catch { /* unparseable line — keep as-is */ }
|
|
328
|
+
linesWritten++;
|
|
329
|
+
}
|
|
330
|
+
ws.write(outputLine);
|
|
331
|
+
if (i < inputLines.length - 1) ws.write('\n');
|
|
332
|
+
}
|
|
333
|
+
await new Promise((resolve, reject) => {
|
|
334
|
+
ws.on('finish', resolve);
|
|
335
|
+
ws.on('error', reject);
|
|
336
|
+
ws.end();
|
|
337
|
+
});
|
|
338
|
+
fs.renameSync(tmpPath, outputPath);
|
|
339
|
+
console.log(`[RELABEL] Written ${linesWritten} records to ${path.basename(outputPath)} (${summary.records_updated} updated${registryError ? ', PARTIAL' : ''})`);
|
|
333
340
|
} else {
|
|
334
|
-
console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated`);
|
|
341
|
+
console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated${registryError ? ' (PARTIAL)' : ''}`);
|
|
335
342
|
}
|
|
336
343
|
|
|
337
344
|
console.log(`[RELABEL] Summary: ${summary.relabeled_malicious} malicious, ${summary.relabeled_benign} benign, ${summary.relabeled_likely_benign} likely_benign, ${summary.removed_unlabeled} removed_unlabeled, ${summary.unchanged} unchanged, ${summary.errors} errors`);
|
|
345
|
+
if (registryError) {
|
|
346
|
+
console.error(`[RELABEL] WARNING: Partial results — registry check failed after ${summary.checked}/${packageMap.size} packages`);
|
|
347
|
+
}
|
|
338
348
|
|
|
339
349
|
return summary;
|
|
340
350
|
}
|