muaddib-scanner 2.10.66 → 2.10.67

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.10.66",
3
+ "version": "2.10.67",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -18,7 +18,6 @@ const fs = require('fs');
18
18
  const path = require('path');
19
19
  const https = require('https');
20
20
  const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
21
- const { atomicWriteFileSync } = require('./state.js');
22
21
 
23
22
  const DEFAULT_INPUT = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
24
23
  const DEFAULT_OUTPUT = path.join(__dirname, '..', '..', 'data', 'ml-training-relabeled.jsonl');
@@ -191,52 +190,42 @@ async function relabelDataset(options = {}) {
191
190
  const dryRun = options.dryRun || false;
192
191
  const delayMs = options.delayMs != null ? options.delayMs : DEFAULT_DELAY_MS;
193
192
 
194
- // 1. Read records
193
+ // 1. Build package map from input (records freed after block scope)
195
194
  if (!fs.existsSync(inputPath)) {
196
195
  throw new Error(`Input file not found: ${inputPath}`);
197
196
  }
198
- const content = fs.readFileSync(inputPath, 'utf8');
199
- const lines = content.split('\n');
200
- const records = [];
201
- for (let i = 0; i < lines.length; i++) {
202
- const line = lines[i].trim();
203
- if (!line) continue;
204
- try {
205
- records.push({ idx: i, data: JSON.parse(line), raw: lines[i] });
206
- } catch {
207
- records.push({ idx: i, data: null, raw: lines[i] });
208
- }
209
- }
210
-
211
- // 2. Extract unique packages eligible for relabeling
212
- const packageMap = new Map(); // key { name, ecosystem, score, timestamp, indices[] }
213
- for (const rec of records) {
214
- if (!rec.data) continue;
215
- if (!RELABELABLE.has(rec.data.label)) continue;
216
- const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
217
- if (!packageMap.has(key)) {
218
- packageMap.set(key, {
219
- name: rec.data.name,
220
- ecosystem: rec.data.ecosystem || 'npm',
221
- score: rec.data.score || 0,
222
- timestamp: rec.data.timestamp,
223
- indices: []
224
- });
225
- }
226
- packageMap.get(key).indices.push(rec.idx);
227
- // Use highest score seen for this package
228
- if ((rec.data.score || 0) > packageMap.get(key).score) {
229
- packageMap.get(key).score = rec.data.score;
230
- }
231
- // Use earliest timestamp
232
- if (rec.data.timestamp && (!packageMap.get(key).timestamp || rec.data.timestamp < packageMap.get(key).timestamp)) {
233
- packageMap.get(key).timestamp = rec.data.timestamp;
197
+ let recordCount = 0;
198
+ const packageMap = new Map(); // key → { name, ecosystem, score, timestamp }
199
+ {
200
+ const content = fs.readFileSync(inputPath, 'utf8');
201
+ const lines = content.split('\n');
202
+ for (let i = 0; i < lines.length; i++) {
203
+ const line = lines[i].trim();
204
+ if (!line) continue;
205
+ recordCount++;
206
+ let data;
207
+ try { data = JSON.parse(line); } catch { continue; }
208
+ if (!RELABELABLE.has(data.label)) continue;
209
+ const key = `${data.ecosystem || 'npm'}/${data.name}`;
210
+ if (!packageMap.has(key)) {
211
+ packageMap.set(key, {
212
+ name: data.name,
213
+ ecosystem: data.ecosystem || 'npm',
214
+ score: data.score || 0,
215
+ timestamp: data.timestamp
216
+ });
217
+ }
218
+ const pkg = packageMap.get(key);
219
+ if ((data.score || 0) > pkg.score) pkg.score = data.score;
220
+ if (data.timestamp && (!pkg.timestamp || data.timestamp < pkg.timestamp)) {
221
+ pkg.timestamp = data.timestamp;
222
+ }
234
223
  }
235
- }
224
+ } // content, lines — eligible for GC before registry checks
236
225
 
237
- console.log(`[RELABEL] ${records.length} records, ${packageMap.size} unique packages to check`);
226
+ console.log(`[RELABEL] ${recordCount} records, ${packageMap.size} unique packages to check`);
238
227
 
239
- // 3. Check each package against registry
228
+ // 2. Check each package against registry (crash-resilient)
240
229
  const summary = {
241
230
  checked: 0,
242
231
  relabeled_malicious: 0,
@@ -247,94 +236,115 @@ async function relabelDataset(options = {}) {
247
236
  errors: 0,
248
237
  records_updated: 0
249
238
  };
250
-
251
239
  const labelChanges = new Map(); // packageKey → { label, source }
240
+ let registryError = null;
252
241
 
253
- const total = packageMap.size;
254
- for (const [key, pkg] of packageMap) {
255
- const t0 = Date.now();
256
- let registryStatus;
257
- try {
258
- if (pkg.ecosystem === 'npm') {
259
- registryStatus = await checkNpmStatus(pkg.name, { skipSemaphore: true });
260
- } else if (pkg.ecosystem === 'pypi') {
261
- registryStatus = await checkPyPIStatus(pkg.name);
262
- } else {
263
- summary.unchanged++;
242
+ try {
243
+ const total = packageMap.size;
244
+ for (const [key, pkg] of packageMap) {
245
+ const t0 = Date.now();
246
+ let registryStatus;
247
+ try {
248
+ if (pkg.ecosystem === 'npm') {
249
+ registryStatus = await checkNpmStatus(pkg.name, { skipSemaphore: true });
250
+ } else if (pkg.ecosystem === 'pypi') {
251
+ registryStatus = await checkPyPIStatus(pkg.name);
252
+ } else {
253
+ summary.unchanged++;
254
+ summary.checked++;
255
+ continue;
256
+ }
257
+ } catch (err) {
258
+ summary.errors++;
264
259
  summary.checked++;
260
+ console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${err.message}`);
265
261
  continue;
266
262
  }
267
- } catch (err) {
268
- summary.errors++;
269
- summary.checked++;
270
- console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${err.message}`);
271
- continue;
272
- }
273
263
 
274
- if (registryStatus.status === 'error') {
275
- summary.errors++;
276
- summary.checked++;
277
- console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${registryStatus.detail}`);
278
- if (delayMs > 0) await sleep(delayMs);
279
- continue;
280
- }
264
+ if (registryStatus.status === 'error') {
265
+ summary.errors++;
266
+ summary.checked++;
267
+ console.log(`[RELABEL] ${key} → error (${Date.now() - t0}ms): ${registryStatus.detail}`);
268
+ if (delayMs > 0) await sleep(delayMs);
269
+ continue;
270
+ }
281
271
 
282
- const newLabel = computeNewLabel(pkg, registryStatus);
283
- summary.checked++;
284
- console.log(`[RELABEL] ${key} → ${newLabel ? newLabel.label : 'unchanged'} (${registryStatus.status}, ${Date.now() - t0}ms)`);
272
+ const newLabel = computeNewLabel(pkg, registryStatus);
273
+ summary.checked++;
274
+ console.log(`[RELABEL] ${key} → ${newLabel ? newLabel.label : 'unchanged'} (${registryStatus.status}, ${Date.now() - t0}ms)`);
285
275
 
286
- if (summary.checked % 100 === 0) {
287
- console.log(`[RELABEL] Progress: ${summary.checked}/${total} checked`);
288
- }
276
+ if (summary.checked % 100 === 0) {
277
+ console.log(`[RELABEL] Progress: ${summary.checked}/${total} checked`);
278
+ }
289
279
 
290
- if (newLabel) {
291
- labelChanges.set(key, newLabel);
292
- if (newLabel.label === 'confirmed_malicious') summary.relabeled_malicious++;
293
- else if (newLabel.label === 'confirmed_benign') summary.relabeled_benign++;
294
- else if (newLabel.label === 'likely_benign') summary.relabeled_likely_benign++;
295
- else if (newLabel.label === 'removed_unlabeled') summary.removed_unlabeled++;
280
+ if (newLabel) {
281
+ labelChanges.set(key, newLabel);
282
+ if (newLabel.label === 'confirmed_malicious') summary.relabeled_malicious++;
283
+ else if (newLabel.label === 'confirmed_benign') summary.relabeled_benign++;
284
+ else if (newLabel.label === 'likely_benign') summary.relabeled_likely_benign++;
285
+ else if (newLabel.label === 'removed_unlabeled') summary.removed_unlabeled++;
296
286
 
297
- if (dryRun) {
298
- console.log(`[RELABEL] DRY-RUN: ${key} → ${newLabel.label} (${newLabel.source}, score=${pkg.score}, status=${registryStatus.status})`);
287
+ if (dryRun) {
288
+ console.log(`[RELABEL] DRY-RUN: ${key} → ${newLabel.label} (${newLabel.source}, score=${pkg.score}, status=${registryStatus.status})`);
289
+ }
290
+ } else {
291
+ summary.unchanged++;
299
292
  }
300
- } else {
301
- summary.unchanged++;
302
- }
303
293
 
304
- if (delayMs > 0) await sleep(delayMs);
305
- }
306
-
307
- // 4. Apply label changes to records
308
- const outputLines = [];
309
- for (const rec of records) {
310
- if (!rec.data) {
311
- outputLines.push(rec.raw);
312
- continue;
313
- }
314
- const key = `${rec.data.ecosystem || 'npm'}/${rec.data.name}`;
315
- const change = labelChanges.get(key);
316
- if (change && RELABELABLE.has(rec.data.label)) {
317
- rec.data.label = change.label;
318
- rec.data.relabel_source = change.source;
319
- rec.data.relabel_timestamp = new Date().toISOString();
320
- outputLines.push(JSON.stringify(rec.data));
321
- summary.records_updated++;
322
- } else {
323
- outputLines.push(rec.raw);
294
+ if (delayMs > 0) await sleep(delayMs);
324
295
  }
296
+ } catch (err) {
297
+ registryError = err;
298
+ console.error(`[RELABEL] Registry check interrupted at ${summary.checked}/${packageMap.size}: ${err.message}`);
325
299
  }
326
300
 
327
- // 5. Write output
301
+ // 3. Stream output: re-read input, apply collected labelChanges, write line by line
328
302
  if (!dryRun) {
329
303
  const dir = path.dirname(outputPath);
330
304
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
331
- atomicWriteFileSync(outputPath, outputLines.join('\n'));
332
- console.log(`[RELABEL] Written ${outputLines.length} records to ${path.basename(outputPath)} (${summary.records_updated} updated)`);
305
+ const tmpPath = outputPath + '.tmp';
306
+ const ws = fs.createWriteStream(tmpPath);
307
+ const relabelTs = new Date().toISOString();
308
+ const inputContent = fs.readFileSync(inputPath, 'utf8');
309
+ const inputLines = inputContent.split('\n');
310
+ let linesWritten = 0;
311
+ for (let i = 0; i < inputLines.length; i++) {
312
+ const raw = inputLines[i];
313
+ const trimmed = raw.trim();
314
+ let outputLine = raw;
315
+ if (trimmed) {
316
+ try {
317
+ const data = JSON.parse(trimmed);
318
+ const key = `${data.ecosystem || 'npm'}/${data.name}`;
319
+ const change = labelChanges.get(key);
320
+ if (change && RELABELABLE.has(data.label)) {
321
+ data.label = change.label;
322
+ data.relabel_source = change.source;
323
+ data.relabel_timestamp = relabelTs;
324
+ outputLine = JSON.stringify(data);
325
+ summary.records_updated++;
326
+ }
327
+ } catch { /* unparseable line — keep as-is */ }
328
+ linesWritten++;
329
+ }
330
+ ws.write(outputLine);
331
+ if (i < inputLines.length - 1) ws.write('\n');
332
+ }
333
+ await new Promise((resolve, reject) => {
334
+ ws.on('finish', resolve);
335
+ ws.on('error', reject);
336
+ ws.end();
337
+ });
338
+ fs.renameSync(tmpPath, outputPath);
339
+ console.log(`[RELABEL] Written ${linesWritten} records to ${path.basename(outputPath)} (${summary.records_updated} updated${registryError ? ', PARTIAL' : ''})`);
333
340
  } else {
334
- console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated`);
341
+ console.log(`[RELABEL] DRY-RUN complete: ${summary.records_updated} records would be updated${registryError ? ' (PARTIAL)' : ''}`);
335
342
  }
336
343
 
337
344
  console.log(`[RELABEL] Summary: ${summary.relabeled_malicious} malicious, ${summary.relabeled_benign} benign, ${summary.relabeled_likely_benign} likely_benign, ${summary.removed_unlabeled} removed_unlabeled, ${summary.unchanged} unchanged, ${summary.errors} errors`);
345
+ if (registryError) {
346
+ console.error(`[RELABEL] WARNING: Partial results — registry check failed after ${summary.checked}/${packageMap.size} packages`);
347
+ }
338
348
 
339
349
  return summary;
340
350
  }