muaddib-scanner 2.10.65 → 2.10.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.10.65",
3
+ "version": "2.10.67",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -18,11 +18,10 @@ const fs = require('fs');
18
18
  const path = require('path');
19
19
  const https = require('https');
20
20
  const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
21
- const { atomicWriteFileSync } = require('./state.js');
22
21
 
23
22
  const DEFAULT_INPUT = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
24
23
  const DEFAULT_OUTPUT = path.join(__dirname, '..', '..', 'data', 'ml-training-relabeled.jsonl');
25
- const DEFAULT_DELAY_MS = 200; // 5 req/s max — gentle on registries
24
+ const DEFAULT_DELAY_MS = 50; // 20 req/s — CLI one-shot, no monitor slot sharing needed
26
25
  const SURVIVAL_DAYS = 30;
27
26
 
28
27
  // Labels eligible for auto-relabeling
@@ -72,8 +71,8 @@ function sleep(ms) {
72
71
  * @param {string} name - package name
73
72
  * @returns {Promise<{status: string, latestVersion?: string, detail?: string}>}
74
73
  */
75
- async function checkNpmStatus(name) {
76
- await acquireRegistrySlot();
74
+ async function checkNpmStatus(name, options = {}) {
75
+ if (!options.skipSemaphore) await acquireRegistrySlot();
77
76
  try {
78
77
  const data = await httpsGetJson(`https://registry.npmjs.org/${encodeURIComponent(name)}`);
79
78
 
@@ -90,7 +89,7 @@ async function checkNpmStatus(name) {
90
89
  } catch (err) {
91
90
  return { status: 'error', detail: err.message };
92
91
  } finally {
93
- releaseRegistrySlot();
92
+ if (!options.skipSemaphore) releaseRegistrySlot();
94
93
  }
95
94
  }
96
95
 
@@ -191,52 +190,42 @@ async function relabelDataset(options = {}) {
191
190
  const dryRun = options.dryRun || false;
192
191
  const delayMs = options.delayMs != null ? options.delayMs : DEFAULT_DELAY_MS;
193
192
 
194
- // 1. Read records
193
+ // 1. Build package map from input (records freed after block scope)
195
194
  if (!fs.existsSync(inputPath)) {
196
195
  throw new Error(`Input file not found: ${inputPath}`);
197
196
  }
198
- const content = fs.readFileSync(inputPath, 'utf8');
199
- const lines = content.split('\n');
200
- const records = [];
201
- for (let i = 0; i < lines.length; i++) {
202
- const line = lines[i].trim();
203
- if (!line) continue;
204
- try {
205
- records.push({ idx: i, data: JSON.parse(line), raw: lines[i] });
206
- } catch {
207
- records.push({ idx: i, data: null, raw: lines[i] });
208
- }
209
- }
210
-
211
- // 2. Extract unique packages eligible for relabeling
212
- const packageMap = new Map(); // key → { name, ecosystem, score, timestamp, indices[] }
213
- for (const rec of records) {
214
- if (!rec.data) continue;
215
- if (!RELABELABLE.has(rec.data.label)) continue;
216
- const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
217
- if (!packageMap.has(key)) {
218
- packageMap.set(key, {
219
- name: rec.data.name,
220
- ecosystem: rec.data.ecosystem || 'npm',
221
- score: rec.data.score || 0,
222
- timestamp: rec.data.timestamp,
223
- indices: []
224
- });
225
- }
226
- packageMap.get(key).indices.push(rec.idx);
227
- // Use highest score seen for this package
228
- if ((rec.data.score || 0) > packageMap.get(key).score) {
229
- packageMap.get(key).score = rec.data.score;
230
- }
231
- // Use earliest timestamp
232
- if (rec.data.timestamp && (!packageMap.get(key).timestamp || rec.data.timestamp < packageMap.get(key).timestamp)) {
233
- packageMap.get(key).timestamp = rec.data.timestamp;
197
+ let recordCount = 0;
198
+ const packageMap = new Map(); // key → { name, ecosystem, score, timestamp }
199
+ {
200
+ const content = fs.readFileSync(inputPath, 'utf8');
201
+ const lines = content.split('\n');
202
+ for (let i = 0; i < lines.length; i++) {
203
+ const line = lines[i].trim();
204
+ if (!line) continue;
205
+ recordCount++;
206
+ let data;
207
+ try { data = JSON.parse(line); } catch { continue; }
208
+ if (!RELABELABLE.has(data.label)) continue;
209
+ const key = `${data.ecosystem || 'npm'}/${data.name}`;
210
+ if (!packageMap.has(key)) {
211
+ packageMap.set(key, {
212
+ name: data.name,
213
+ ecosystem: data.ecosystem || 'npm',
214
+ score: data.score || 0,
215
+ timestamp: data.timestamp
216
+ });
217
+ }
218
+ const pkg = packageMap.get(key);
219
+ if ((data.score || 0) > pkg.score) pkg.score = data.score;
220
+ if (data.timestamp && (!pkg.timestamp || data.timestamp < pkg.timestamp)) {
221
+ pkg.timestamp = data.timestamp;
222
+ }
234
223
  }
235
- }
224
+ } // content, lines — eligible for GC before registry checks
236
225
 
237
- console.log(`[RELABEL] ${records.length} records, ${packageMap.size} unique packages to check`);
226
+ console.log(`[RELABEL] ${recordCount} records, ${packageMap.size} unique packages to check`);
238
227
 
239
- // 3. Check each package against registry
228
+ // 2. Check each package against registry (crash-resilient)
240
229
  const summary = {
241
230
  checked: 0,
242
231
  relabeled_malicious: 0,
@@ -247,85 +236,115 @@ async function relabelDataset(options = {}) {
247
236
  errors: 0,
248
237
  records_updated: 0
249
238
  };
250
-
251
239
  const labelChanges = new Map(); // packageKey → { label, source }
240
+ let registryError = null;
252
241
 
253
- for (const [key, pkg] of packageMap) {
254
- let registryStatus;
255
- try {
256
- if (pkg.ecosystem === 'npm') {
257
- registryStatus = await checkNpmStatus(pkg.name);
258
- } else if (pkg.ecosystem === 'pypi') {
259
- registryStatus = await checkPyPIStatus(pkg.name);
260
- } else {
261
- summary.unchanged++;
242
+ try {
243
+ const total = packageMap.size;
244
+ for (const [key, pkg] of packageMap) {
245
+ const t0 = Date.now();
246
+ let registryStatus;
247
+ try {
248
+ if (pkg.ecosystem === 'npm') {
249
+ registryStatus = await checkNpmStatus(pkg.name, { skipSemaphore: true });
250
+ } else if (pkg.ecosystem === 'pypi') {
251
+ registryStatus = await checkPyPIStatus(pkg.name);
252
+ } else {
253
+ summary.unchanged++;
254
+ summary.checked++;
255
+ continue;
256
+ }
257
+ } catch (err) {
258
+ summary.errors++;
262
259
  summary.checked++;
260
+ console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${err.message}`);
263
261
  continue;
264
262
  }
265
- } catch (err) {
266
- summary.errors++;
267
- summary.checked++;
268
- continue;
269
- }
270
263
 
271
- if (registryStatus.status === 'error') {
272
- summary.errors++;
264
+ if (registryStatus.status === 'error') {
265
+ summary.errors++;
266
+ summary.checked++;
267
+ console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${registryStatus.detail}`);
268
+ if (delayMs > 0) await sleep(delayMs);
269
+ continue;
270
+ }
271
+
272
+ const newLabel = computeNewLabel(pkg, registryStatus);
273
273
  summary.checked++;
274
- if (delayMs > 0) await sleep(delayMs);
275
- continue;
276
- }
274
+ console.log(`[RELABEL] ${key} ${newLabel ? newLabel.label : 'unchanged'} (${registryStatus.status}, ${Date.now() - t0}ms)`);
277
275
 
278
- const newLabel = computeNewLabel(pkg, registryStatus);
279
- summary.checked++;
276
+ if (summary.checked % 100 === 0) {
277
+ console.log(`[RELABEL] Progress: ${summary.checked}/${total} checked`);
278
+ }
280
279
 
281
- if (newLabel) {
282
- labelChanges.set(key, newLabel);
283
- if (newLabel.label === 'confirmed_malicious') summary.relabeled_malicious++;
284
- else if (newLabel.label === 'confirmed_benign') summary.relabeled_benign++;
285
- else if (newLabel.label === 'likely_benign') summary.relabeled_likely_benign++;
286
- else if (newLabel.label === 'removed_unlabeled') summary.removed_unlabeled++;
280
+ if (newLabel) {
281
+ labelChanges.set(key, newLabel);
282
+ if (newLabel.label === 'confirmed_malicious') summary.relabeled_malicious++;
283
+ else if (newLabel.label === 'confirmed_benign') summary.relabeled_benign++;
284
+ else if (newLabel.label === 'likely_benign') summary.relabeled_likely_benign++;
285
+ else if (newLabel.label === 'removed_unlabeled') summary.removed_unlabeled++;
287
286
 
288
- if (dryRun) {
289
- console.log(`[RELABEL] DRY-RUN: ${key} → ${newLabel.label} (${newLabel.source}, score=${pkg.score}, status=${registryStatus.status})`);
287
+ if (dryRun) {
288
+ console.log(`[RELABEL] DRY-RUN: ${key} → ${newLabel.label} (${newLabel.source}, score=${pkg.score}, status=${registryStatus.status})`);
289
+ }
290
+ } else {
291
+ summary.unchanged++;
290
292
  }
291
- } else {
292
- summary.unchanged++;
293
- }
294
-
295
- if (delayMs > 0) await sleep(delayMs);
296
- }
297
293
 
298
- // 4. Apply label changes to records
299
- const outputLines = [];
300
- for (const rec of records) {
301
- if (!rec.data) {
302
- outputLines.push(rec.raw);
303
- continue;
304
- }
305
- const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
306
- const change = labelChanges.get(key);
307
- if (change && RELABELABLE.has(rec.data.label)) {
308
- rec.data.label = change.label;
309
- rec.data.relabel_source = change.source;
310
- rec.data.relabel_timestamp = new Date().toISOString();
311
- outputLines.push(JSON.stringify(rec.data));
312
- summary.records_updated++;
313
- } else {
314
- outputLines.push(rec.raw);
294
+ if (delayMs > 0) await sleep(delayMs);
315
295
  }
296
+ } catch (err) {
297
+ registryError = err;
298
+ console.error(`[RELABEL] Registry check interrupted at ${summary.checked}/${packageMap.size}: ${err.message}`);
316
299
  }
317
300
 
318
- // 5. Write output
301
+ // 3. Stream output: re-read input, apply collected labelChanges, write line by line
319
302
  if (!dryRun) {
320
303
  const dir = path.dirname(outputPath);
321
304
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
322
- atomicWriteFileSync(outputPath, outputLines.join('\n'));
323
- console.log(`[RELABEL] Written ${outputLines.length} records to ${path.basename(outputPath)} (${summary.records_updated} updated)`);
305
+ const tmpPath = outputPath + '.tmp';
306
+ const ws = fs.createWriteStream(tmpPath);
307
+ const relabelTs = new Date().toISOString();
308
+ const inputContent = fs.readFileSync(inputPath, 'utf8');
309
+ const inputLines = inputContent.split('\n');
310
+ let linesWritten = 0;
311
+ for (let i = 0; i < inputLines.length; i++) {
312
+ const raw = inputLines[i];
313
+ const trimmed = raw.trim();
314
+ let outputLine = raw;
315
+ if (trimmed) {
316
+ try {
317
+ const data = JSON.parse(trimmed);
318
+ const key = `${data.ecosystem || 'npm'}/${data.name}`;
319
+ const change = labelChanges.get(key);
320
+ if (change && RELABELABLE.has(data.label)) {
321
+ data.label = change.label;
322
+ data.relabel_source = change.source;
323
+ data.relabel_timestamp = relabelTs;
324
+ outputLine = JSON.stringify(data);
325
+ summary.records_updated++;
326
+ }
327
+ } catch { /* unparseable line — keep as-is */ }
328
+ linesWritten++;
329
+ }
330
+ ws.write(outputLine);
331
+ if (i < inputLines.length - 1) ws.write('\n');
332
+ }
333
+ await new Promise((resolve, reject) => {
334
+ ws.on('finish', resolve);
335
+ ws.on('error', reject);
336
+ ws.end();
337
+ });
338
+ fs.renameSync(tmpPath, outputPath);
339
+ console.log(`[RELABEL] Written ${linesWritten} records to ${path.basename(outputPath)} (${summary.records_updated} updated${registryError ? ', PARTIAL' : ''})`);
324
340
  } else {
325
- console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated`);
341
+ console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated${registryError ? ' (PARTIAL)' : ''}`);
326
342
  }
327
343
 
328
344
  console.log(`[RELABEL] Summary: ${summary.relabeled_malicious} malicious, ${summary.relabeled_benign} benign, ${summary.relabeled_likely_benign} likely_benign, ${summary.removed_unlabeled} removed_unlabeled, ${summary.unchanged} unchanged, ${summary.errors} errors`);
345
+ if (registryError) {
346
+ console.error(`[RELABEL] WARNING: Partial results — registry check failed after ${summary.checked}/${packageMap.size} packages`);
347
+ }
329
348
 
330
349
  return summary;
331
350
  }