npm - muaddib-scanner - Versions diffs - 2.8.5 → 2.8.7 - Mend

muaddib-scanner 2.8.5 → 2.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/src/ml/feature-extractor.js +214 -0
package/src/ml/jsonl-writer.js +181 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "muaddib-scanner",
-  "version": "2.8.5",
+  "version": "2.8.7",
   "description": "Supply-chain threat detection & response for npm & PyPI/Python",
   "main": "src/index.js",
   "bin": {

package/src/ml/feature-extractor.js ADDED Viewed

@@ -0,0 +1,214 @@
+'use strict';
+/**
+ * ML Feature Extractor — extracts numeric/boolean features from scan results
+ * for ML classifier training (Phase 1 of FPR reduction pipeline).
+ *
+ * Features are designed to capture the discriminative signals between true
+ * positives and false positives: threat composition, severity distribution,
+ * scoring breakdown, and package metadata.
+ *
+ * Output: flat object with numeric/boolean values suitable for XGBoost/RF.
+ */
+// Most frequent threat types seen in production (roughly 95% of findings).
+// extractFeatures() emits one `type_<name>` count per entry, in this order;
+// every type that is not listed here is summed into `type_other`.
+const TOP_THREAT_TYPES = [
+  'suspicious_dataflow', 'env_access', 'sensitive_string',
+  'dangerous_call_eval', 'dangerous_call_exec', 'dangerous_call_function',
+  'obfuscation_detected', 'high_entropy_string',
+  'dynamic_require', 'dynamic_import',
+  'lifecycle_script', 'typosquat_detected',
+  'staged_payload', 'staged_binary_payload',
+  'network_require', 'sandbox_evasion', 'credential_regex_harvest',
+  'remote_code_load', 'suspicious_domain', 'prototype_hook',
+  'intent_credential_exfil', 'intent_command_exfil',
+  'cross_file_dataflow', 'module_compile', 'crypto_decipher',
+  'env_charcode_reconstruction', 'lifecycle_shell_pipe',
+  'curl_exec', 'reverse_shell', 'binary_dropper', 'mcp_config_injection'
+];
+// Set view of the same names for constant-time membership checks.
+const TOP_THREAT_TYPES_SET = new Set(TOP_THREAT_TYPES);
+/**
+ * Extract a flat ML feature vector from a scan result.
+ *
+ * Missing inputs are tolerated: an absent result, summary, threats list or
+ * meta object yields zero-valued features (reputation_factor defaults to 1.0).
+ * Keys are inserted in a fixed order, so serialized records share one layout.
+ *
+ * @param {Object} result - scan result from run() with { threats, summary }
+ * @param {Object} meta - package metadata; only `unpackedSize` and
+ *   `registryMeta` ({ unpackedSize, dependencies, devDependencies }) are read here
+ * @returns {Object} null-prototype object of numeric features (flags are 0/1)
+ */
+function extractFeatures(result, meta) {
+  const features = Object.create(null);
+  const summary = (result && result.summary) || {};
+  // A non-array `threats` or null entries would otherwise throw below.
+  const rawThreats = result && result.threats;
+  const threats = Array.isArray(rawThreats) ? rawThreats.filter(Boolean) : [];
+  // --- Scoring features ---
+  features.score = summary.riskScore || 0;
+  features.max_file_score = summary.maxFileScore || 0;
+  features.package_score = summary.packageScore || 0;
+  features.global_risk_score = summary.globalRiskScore || 0;
+  // --- Severity counts ---
+  features.count_total = summary.total || 0;
+  features.count_critical = summary.critical || 0;
+  features.count_high = summary.high || 0;
+  features.count_medium = summary.medium || 0;
+  features.count_low = summary.low || 0;
+  // --- Per-type tally: one pass over the threats feeds every type feature ---
+  // Null prototype so a type named e.g. 'constructor' cannot hit inherited keys.
+  const typeCounts = Object.create(null);
+  for (const t of threats) {
+    typeCounts[t.type] = (typeCounts[t.type] || 0) + 1;
+  }
+  const seenTypes = Object.keys(typeCounts);
+  // --- Distinct threat types ---
+  features.distinct_threat_types = seenTypes.length;
+  // --- Per-type counts (one feature per TOP_THREAT_TYPES entry) ---
+  for (const type of TOP_THREAT_TYPES) {
+    features[`type_${type}`] = typeCounts[type] || 0;
+  }
+  // Aggregate count for types not in top list
+  let otherCount = 0;
+  for (const type of seenTypes) {
+    if (!TOP_THREAT_TYPES_SET.has(type)) {
+      otherCount += typeCounts[type];
+    }
+  }
+  features.type_other = otherCount;
+  // --- Boolean behavioral signals (0/1), answered from the tally ---
+  const hasAny = (...types) => (types.some(type => typeCounts[type] > 0) ? 1 : 0);
+  features.has_lifecycle_script = hasAny('lifecycle_script', 'lifecycle_shell_pipe');
+  features.has_network_access = hasAny('network_require', 'remote_code_load', 'curl_exec', 'suspicious_dataflow');
+  features.has_obfuscation = hasAny('obfuscation_detected', 'high_entropy_string', 'js_obfuscation_pattern');
+  features.has_env_access = hasAny('env_access', 'env_charcode_reconstruction');
+  features.has_eval = hasAny('dangerous_call_eval', 'dangerous_call_function');
+  features.has_staged_payload = hasAny('staged_payload', 'staged_binary_payload');
+  features.has_typosquat = hasAny('typosquat_detected', 'pypi_typosquat_detected');
+  features.has_ioc_match = hasAny('known_malicious_package', 'known_malicious_hash', 'pypi_malicious_package');
+  features.has_intent_pair = hasAny('intent_credential_exfil', 'intent_command_exfil');
+  // Tally keys are always strings, so startsWith is safe even for odd `type` values.
+  features.has_sandbox_finding = seenTypes.some(type => type.startsWith('sandbox_')) ? 1 : 0;
+  // --- File distribution features ---
+  const fileScoreValues = Object.values(summary.fileScores || {});
+  features.file_count_with_threats = fileScoreValues.length;
+  features.file_score_mean = fileScoreValues.length > 0
+    ? Math.round(fileScoreValues.reduce((a, b) => a + b, 0) / fileScoreValues.length)
+    : 0;
+  // Folded pairwise instead of Math.max(...values): spreading a very large
+  // array can exceed the engine's argument limit and throw a RangeError.
+  features.file_score_max = fileScoreValues.length > 0
+    ? fileScoreValues.reduce((a, b) => Math.max(a, b), -Infinity)
+    : 0;
+  // --- Severity concentration: ratio of CRITICAL+HIGH vs total ---
+  features.severity_ratio_high = features.count_total > 0
+    ? Math.round(((features.count_critical + features.count_high) / features.count_total) * 100) / 100
+    : 0;
+  // --- Points concentration: first breakdown entry's points vs score ---
+  // NOTE(review): reads breakdown[0] as the largest contributor, i.e. assumes the
+  // breakdown is sorted by points descending — confirm against the scorer.
+  const breakdown = Array.isArray(summary.breakdown) ? summary.breakdown : [];
+  const firstPoints = breakdown.length > 0 && breakdown[0] ? Number(breakdown[0].points) : 0;
+  // A missing or non-numeric `points` must not leak undefined/NaN into the vector.
+  const topPoints = Number.isFinite(firstPoints) ? firstPoints : 0;
+  features.max_single_points = topPoints;
+  features.points_concentration = features.score > 0
+    ? Math.round((topPoints / features.score) * 100) / 100
+    : 0;
+  // --- Package metadata (from registry) ---
+  const registry = (meta && meta.registryMeta) || {};
+  features.unpacked_size_bytes = (meta && meta.unpackedSize) || registry.unpackedSize || 0;
+  features.dep_count = countDeps(registry.dependencies);
+  features.dev_dep_count = countDeps(registry.devDependencies);
+  // --- Reputation factor (if computed by monitor) ---
+  // An explicit 0 is a real value and is kept; only a missing factor becomes 1.0.
+  const reputation = summary.reputationFactor;
+  features.reputation_factor = reputation === 0 ? 0 : (reputation || 1.0);
+  return features;
+}
+/**
+ * Turn a registry `dependencies` value into a dependency count.
+ * A number is taken as an already-computed count, an object contributes its
+ * number of own enumerable keys, and anything else (including null) is 0.
+ */
+function countDeps(deps) {
+  switch (typeof deps) {
+    case 'number':
+      // 0 and NaN are falsy and both collapse to 0.
+      return deps ? deps : 0;
+    case 'object':
+      return deps === null ? 0 : Object.keys(deps).length;
+    default:
+      return 0;
+  }
+}
+/**
+ * Build a complete JSONL record for a scanned package.
+ *
+ * The record holds identity fields, the label/tier, every feature returned by
+ * extractFeatures(), and two sandbox fields. A missing `params` is treated as
+ * an empty object so that all defaults apply.
+ *
+ * @param {Object} result - scan result from run()
+ * @param {Object} [params] - { name, version, ecosystem, unpackedSize, registryMeta, label, tier, sandboxResult }
+ * @returns {Object} null-prototype record with metadata + features + label
+ */
+function buildTrainingRecord(result, params) {
+  // `params || {}` keeps a null/undefined argument from throwing on destructure.
+  const {
+    name, version, ecosystem,
+    unpackedSize, registryMeta,
+    label, tier, sandboxResult
+  } = params || {};
+  const features = extractFeatures(result, {
+    name, version, ecosystem,
+    unpackedSize, registryMeta
+  });
+  const record = Object.create(null);
+  // --- Identity (not features, for traceability) ---
+  record.name = name || '';
+  record.version = version || '';
+  record.ecosystem = ecosystem || 'npm';
+  record.timestamp = new Date().toISOString();
+  // --- Label ---
+  // 'clean' = no findings or T3 only
+  // 'suspect' = T1/T2 (pending manual review)
+  // 'confirmed' = manually confirmed malicious
+  // 'fp' = manually confirmed false positive
+  record.label = label || 'suspect';
+  record.tier = tier || null;
+  // --- Features ---
+  Object.assign(record, features);
+  // --- Sandbox score (if available) ---
+  record.sandbox_score = (sandboxResult && sandboxResult.score) || 0;
+  // Only an array has a meaningful length; any other shape would give
+  // `undefined`, which JSON.stringify silently drops from the record.
+  const findings = sandboxResult && sandboxResult.findings;
+  record.sandbox_finding_count = Array.isArray(findings) ? findings.length : 0;
+  return record;
+}
+// Public API of the feature extractor: the two builder functions plus the
+// threat-type list and its Set view.
+module.exports = {
+  extractFeatures,
+  buildTrainingRecord,
+  TOP_THREAT_TYPES,
+  TOP_THREAT_TYPES_SET
+};

package/src/ml/jsonl-writer.js ADDED Viewed

@@ -0,0 +1,181 @@
+'use strict';
+/**
+ * JSONL Writer — appends training records to data/ml-training.jsonl.
+ *
+ * One JSON object per line, newline-delimited (JSONL format).
+ * Uses append mode for crash-safe incremental writes.
+ * Auto-creates data/ directory if missing.
+ *
+ * File rotation: when the file exceeds MAX_JSONL_SIZE (100MB),
+ * it is renamed to ml-training-{timestamp}.jsonl and a fresh file starts.
+ */
+const fs = require('fs');
+const path = require('path');
+// Default output path: data/ml-training.jsonl, two directories above this file.
+const DEFAULT_TRAINING_FILE = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
+// Path currently in use; declared with `let` because setTrainingFile() and
+// resetTrainingFile() reassign it.
+let TRAINING_FILE = DEFAULT_TRAINING_FILE;
+const MAX_JSONL_SIZE = 100 * 1024 * 1024; // 100MB rotation threshold
+/**
+ * Override the training file path (for testing).
+ * The value is stored as given, with no validation or normalization; every
+ * later read, append, rotation and relabel in this module uses it.
+ * @param {string} filePath - new file path
+ */
+function setTrainingFile(filePath) {
+  TRAINING_FILE = filePath;
+}
+/**
+ * Reset the training file path to the default (DEFAULT_TRAINING_FILE),
+ * undoing any earlier setTrainingFile() call.
+ */
+function resetTrainingFile() {
+  TRAINING_FILE = DEFAULT_TRAINING_FILE;
+}
+/**
+ * Append a single record to the JSONL training file.
+ *
+ * Best-effort by design: this function never throws. Records that cannot be
+ * serialized to JSON text are skipped so they cannot corrupt the file, and
+ * filesystem errors are logged (or ignored, for read-only/permission errors).
+ *
+ * @param {Object} record - training record from buildTrainingRecord()
+ */
+function appendRecord(record) {
+  try {
+    // JSON.stringify returns undefined (not a string) for undefined, functions
+    // and symbols; appending that would write the literal line "undefined".
+    const json = JSON.stringify(record);
+    if (typeof json !== 'string') {
+      console.error('[ML] Skipping JSONL record: value is not JSON-serializable');
+      return;
+    }
+    // Recursive mkdir is a no-op when the directory already exists, so no
+    // separate existence check is needed.
+    fs.mkdirSync(path.dirname(TRAINING_FILE), { recursive: true });
+    // Rotate if file is too large
+    maybeRotate();
+    fs.appendFileSync(TRAINING_FILE, `${json}\n`, 'utf8');
+  } catch (err) {
+    // Non-fatal: JSONL export failure should never crash the monitor
+    if (err.code === 'EROFS' || err.code === 'EACCES' || err.code === 'EPERM') {
+      // Read-only filesystem or missing permission — silently skip
+      // (same pattern as atomicWriteFileSync)
+      return;
+    }
+    console.error(`[ML] Failed to append JSONL record: ${err.message}`);
+  }
+}
+/**
+ * Rotate the training file if it has reached MAX_JSONL_SIZE.
+ *
+ * The file is renamed to `<name>-<ISO timestamp><ext>` in the same directory
+ * (e.g. ml-training-2024-01-01T00-00-00-000Z.jsonl). The next append then
+ * starts a fresh file. Never throws; failures are logged.
+ */
+function maybeRotate() {
+  try {
+    let stat;
+    try {
+      stat = fs.statSync(TRAINING_FILE);
+    } catch (err) {
+      // Nothing to rotate yet. Stat directly instead of existsSync + statSync
+      // so the file cannot disappear between the two calls.
+      if (err.code === 'ENOENT' || err.code === 'ENOTDIR') return;
+      throw err;
+    }
+    if (stat.size < MAX_JSONL_SIZE) return;
+    // ':' and '.' are replaced so the timestamp is a valid filename everywhere.
+    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+    // Build the name from parsed path parts. A plain string replace of
+    // '.jsonl' rewrites the FIRST occurrence in the whole path, which breaks
+    // on a directory containing '.jsonl' and turns a file without that
+    // extension into a rename onto itself.
+    const { dir, name, ext } = path.parse(TRAINING_FILE);
+    const rotatedName = path.join(dir, `${name}-${timestamp}${ext}`);
+    fs.renameSync(TRAINING_FILE, rotatedName);
+    console.log(`[ML] Rotated training file → ${path.basename(rotatedName)} (${(stat.size / 1024 / 1024).toFixed(1)}MB)`);
+  } catch (err) {
+    console.error(`[ML] Rotation failed: ${err.message}`);
+  }
+}
+/**
+ * Read all records from the current JSONL file.
+ * Useful for offline analysis and model training.
+ *
+ * Blank lines are ignored. Lines that are not valid JSON are skipped with a
+ * warning that names their 1-based line number in the file. Falsy JSON values
+ * (null, 0, false, "") are dropped as well. Never throws: a read failure is
+ * logged and yields an empty array. The whole file is loaded into memory.
+ *
+ * @returns {Object[]} array of parsed records
+ */
+function readRecords() {
+  try {
+    if (!fs.existsSync(TRAINING_FILE)) return [];
+    const records = [];
+    const lines = fs.readFileSync(TRAINING_FILE, 'utf8').split('\n');
+    // Walk the unfiltered lines so `index` is the real position in the file;
+    // numbering after dropping blank lines would misreport the bad line.
+    lines.forEach((line, index) => {
+      if (!line.trim()) return;
+      try {
+        const parsed = JSON.parse(line);
+        if (parsed) records.push(parsed);
+      } catch {
+        console.warn(`[ML] Skipping malformed JSONL line ${index + 1}`);
+      }
+    });
+    return records;
+  } catch (err) {
+    console.error(`[ML] Failed to read JSONL: ${err.message}`);
+    return [];
+  }
+}
+/**
+ * Get stats about the current JSONL file.
+ *
+ * Records are counted as non-blank lines. The file is scanned in fixed-size
+ * chunks, so memory use stays constant however large the file has grown.
+ * Never throws: a missing or unreadable file reports all zeros.
+ *
+ * @returns {{ recordCount: number, fileSizeBytes: number, fileSizeMB: string }}
+ */
+function getStats() {
+  let fd;
+  try {
+    fd = fs.openSync(TRAINING_FILE, 'r');
+    const stat = fs.fstatSync(fd);
+    const chunk = Buffer.allocUnsafe(64 * 1024);
+    let recordCount = 0;
+    // True once the current line has shown a byte that is not ASCII whitespace.
+    let lineHasContent = false;
+    let bytesRead = fs.readSync(fd, chunk, 0, chunk.length, null);
+    while (bytesRead > 0) {
+      for (let i = 0; i < bytesRead; i++) {
+        const byte = chunk[i];
+        if (byte === 0x0a) {
+          if (lineHasContent) recordCount++;
+          lineHasContent = false;
+        } else if (byte !== 0x20 && (byte < 0x09 || byte > 0x0d)) {
+          // Anything other than space, tab, CR, VT or FF makes the line count.
+          lineHasContent = true;
+        }
+      }
+      bytesRead = fs.readSync(fd, chunk, 0, chunk.length, null);
+    }
+    // A final line without a trailing newline is still a record.
+    if (lineHasContent) recordCount++;
+    return {
+      recordCount,
+      fileSizeBytes: stat.size,
+      fileSizeMB: (stat.size / 1024 / 1024).toFixed(1)
+    };
+  } catch {
+    return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
+  } finally {
+    if (fd !== undefined) {
+      try {
+        fs.closeSync(fd);
+      } catch {
+        // Best effort: the stats were already computed or defaulted.
+      }
+    }
+  }
+}
+/**
+ * Update the label of records matching a given package name.
+ * Used when manual confirmation (fp/confirmed) is applied retroactively.
+ *
+ * Only the current training file is rewritten, and only when at least one
+ * record changed. Lines that are blank or not valid JSON are kept verbatim.
+ * The new content goes to a temporary sibling file that is then renamed over
+ * the original, so an interrupted write cannot leave a truncated file.
+ * Never throws: failures are logged and reported as 0 updates.
+ *
+ * NOTE: records appended by another writer between the read and the rename
+ * are lost; callers should not relabel while appends are in flight.
+ *
+ * @param {string} packageName - package name to relabel
+ * @param {string} newLabel - 'fp' or 'confirmed'
+ * @returns {number} number of records updated
+ */
+function relabelRecords(packageName, newLabel) {
+  try {
+    if (!fs.existsSync(TRAINING_FILE)) return 0;
+    const content = fs.readFileSync(TRAINING_FILE, 'utf8');
+    const lines = content.split('\n');
+    let updated = 0;
+    const newLines = lines.map(line => {
+      if (!line.trim()) return line;
+      try {
+        const record = JSON.parse(line);
+        if (record.name === packageName && record.label !== newLabel) {
+          record.label = newLabel;
+          updated++;
+          return JSON.stringify(record);
+        }
+        return line;
+      } catch {
+        // Malformed line (or a JSON `null`): leave it untouched.
+        return line;
+      }
+    });
+    if (updated > 0) {
+      // Same directory as the target so the rename stays on one filesystem.
+      const tmpFile = `${TRAINING_FILE}.${process.pid}.tmp`;
+      try {
+        fs.writeFileSync(tmpFile, newLines.join('\n'), 'utf8');
+        fs.renameSync(tmpFile, TRAINING_FILE);
+      } catch (err) {
+        try {
+          fs.unlinkSync(tmpFile);
+        } catch {
+          // Temp file was never created or is already gone.
+        }
+        throw err;
+      }
+      console.log(`[ML] Relabeled ${updated} records for ${packageName} → ${newLabel}`);
+    }
+    return updated;
+  } catch (err) {
+    console.error(`[ML] Failed to relabel records: ${err.message}`);
+    return 0;
+  }
+}
+// Public API of the JSONL writer.
+module.exports = {
+  appendRecord,
+  readRecords,
+  getStats,
+  relabelRecords,
+  maybeRotate,
+  // Getter rather than a plain property so importers always read the current
+  // path, including after setTrainingFile()/resetTrainingFile().
+  get TRAINING_FILE() { return TRAINING_FILE; },
+  setTrainingFile,
+  resetTrainingFile,
+  MAX_JSONL_SIZE
+};