muaddib-scanner 2.8.6 → 2.8.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/index.js +27 -0
- package/src/ml/feature-extractor.js +214 -0
- package/src/ml/jsonl-writer.js +187 -0
- package/src/response/playbooks.js +4 -0
- package/src/rules/index.js +12 -0
- package/src/scanner/ast-detectors.js +18 -0
- package/src/scoring.js +4 -2
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "muaddib-scanner",
|
|
3
|
-
"version": "2.8.6",
|
|
3
|
+
"version": "2.8.8",
|
|
4
4
|
"description": "Supply-chain threat detection & response for npm & PyPI/Python",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -44,7 +44,7 @@
|
|
|
44
44
|
"node": ">=18.0.0"
|
|
45
45
|
},
|
|
46
46
|
"dependencies": {
|
|
47
|
-
"@inquirer/prompts": "8.3.
|
|
47
|
+
"@inquirer/prompts": "8.3.2",
|
|
48
48
|
"acorn": "8.16.0",
|
|
49
49
|
"acorn-walk": "8.3.5",
|
|
50
50
|
"adm-zip": "0.5.16",
|
package/src/index.js
CHANGED
|
@@ -567,6 +567,33 @@ async function run(targetPath, options = {}) {
|
|
|
567
567
|
}
|
|
568
568
|
} catch { /* graceful fallback */ }
|
|
569
569
|
|
|
570
|
+
// Cross-scanner compound: detached_process + suspicious_dataflow in same file
|
|
571
|
+
// Catches cases where credential flow is detected by dataflow scanner, not AST scanner
|
|
572
|
+
{
|
|
573
|
+
const fileMap = Object.create(null);
|
|
574
|
+
for (const t of deduped) {
|
|
575
|
+
if (t.file) {
|
|
576
|
+
if (!fileMap[t.file]) fileMap[t.file] = [];
|
|
577
|
+
fileMap[t.file].push(t);
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
for (const file of Object.keys(fileMap)) {
|
|
581
|
+
const fileThreats = fileMap[file];
|
|
582
|
+
const hasDetached = fileThreats.some(t => t.type === 'detached_process');
|
|
583
|
+
const hasCredFlow = fileThreats.some(t => t.type === 'suspicious_dataflow');
|
|
584
|
+
const alreadyCompound = fileThreats.some(t => t.type === 'detached_credential_exfil');
|
|
585
|
+
if (hasDetached && hasCredFlow && !alreadyCompound) {
|
|
586
|
+
deduped.push({
|
|
587
|
+
type: 'detached_credential_exfil',
|
|
588
|
+
severity: 'CRITICAL',
|
|
589
|
+
message: 'Detached process + credential dataflow — background exfiltration (cross-scanner compound).',
|
|
590
|
+
file,
|
|
591
|
+
count: 1
|
|
592
|
+
});
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
|
|
570
597
|
// FP reduction: legitimate frameworks produce high volumes of certain threat types.
|
|
571
598
|
// A malware package typically has 1-3 occurrences, not dozens.
|
|
572
599
|
applyFPReductions(deduped, reachableFiles, packageName, packageDeps);
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* ML Feature Extractor — extracts numeric/boolean features from scan results
|
|
5
|
+
* for ML classifier training (Phase 1 of FPR reduction pipeline).
|
|
6
|
+
*
|
|
7
|
+
* Features are designed to capture the discriminative signals between true
|
|
8
|
+
* positives and false positives: threat composition, severity distribution,
|
|
9
|
+
* scoring breakdown, and package metadata.
|
|
10
|
+
*
|
|
11
|
+
* Output: flat object with numeric/boolean values suitable for XGBoost/RF.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// Top threat types by frequency in production (covers ~95% of all findings).
|
|
15
|
+
// Types not in this list are aggregated into `threat_type_other`.
|
|
16
|
+
const TOP_THREAT_TYPES = [
|
|
17
|
+
'suspicious_dataflow',
|
|
18
|
+
'env_access',
|
|
19
|
+
'sensitive_string',
|
|
20
|
+
'dangerous_call_eval',
|
|
21
|
+
'dangerous_call_exec',
|
|
22
|
+
'dangerous_call_function',
|
|
23
|
+
'obfuscation_detected',
|
|
24
|
+
'high_entropy_string',
|
|
25
|
+
'dynamic_require',
|
|
26
|
+
'dynamic_import',
|
|
27
|
+
'lifecycle_script',
|
|
28
|
+
'typosquat_detected',
|
|
29
|
+
'staged_payload',
|
|
30
|
+
'staged_binary_payload',
|
|
31
|
+
'network_require',
|
|
32
|
+
'sandbox_evasion',
|
|
33
|
+
'credential_regex_harvest',
|
|
34
|
+
'remote_code_load',
|
|
35
|
+
'suspicious_domain',
|
|
36
|
+
'prototype_hook',
|
|
37
|
+
'intent_credential_exfil',
|
|
38
|
+
'intent_command_exfil',
|
|
39
|
+
'cross_file_dataflow',
|
|
40
|
+
'module_compile',
|
|
41
|
+
'crypto_decipher',
|
|
42
|
+
'env_charcode_reconstruction',
|
|
43
|
+
'lifecycle_shell_pipe',
|
|
44
|
+
'curl_exec',
|
|
45
|
+
'reverse_shell',
|
|
46
|
+
'binary_dropper',
|
|
47
|
+
'mcp_config_injection'
|
|
48
|
+
];
|
|
49
|
+
|
|
50
|
+
const TOP_THREAT_TYPES_SET = new Set(TOP_THREAT_TYPES);
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Extract ML features from a scan result object.
|
|
54
|
+
*
|
|
55
|
+
* @param {Object} result - scan result from run() with { threats, summary }
|
|
56
|
+
* @param {Object} meta - package metadata { name, version, ecosystem, unpackedSize, registryMeta }
|
|
57
|
+
* @returns {Object} flat feature vector with numeric/boolean values
|
|
58
|
+
*/
|
|
59
|
+
function extractFeatures(result, meta) {
|
|
60
|
+
const features = Object.create(null);
|
|
61
|
+
const threats = (result && result.threats) || [];
|
|
62
|
+
const summary = (result && result.summary) || {};
|
|
63
|
+
|
|
64
|
+
// --- Scoring features ---
|
|
65
|
+
features.score = summary.riskScore || 0;
|
|
66
|
+
features.max_file_score = summary.maxFileScore || 0;
|
|
67
|
+
features.package_score = summary.packageScore || 0;
|
|
68
|
+
features.global_risk_score = summary.globalRiskScore || 0;
|
|
69
|
+
|
|
70
|
+
// --- Severity counts ---
|
|
71
|
+
features.count_total = summary.total || 0;
|
|
72
|
+
features.count_critical = summary.critical || 0;
|
|
73
|
+
features.count_high = summary.high || 0;
|
|
74
|
+
features.count_medium = summary.medium || 0;
|
|
75
|
+
features.count_low = summary.low || 0;
|
|
76
|
+
|
|
77
|
+
// --- Distinct threat types ---
|
|
78
|
+
const distinctTypes = new Set(threats.map(t => t.type));
|
|
79
|
+
features.distinct_threat_types = distinctTypes.size;
|
|
80
|
+
|
|
81
|
+
// --- Per-type counts (top 31 types) ---
|
|
82
|
+
const typeCounts = Object.create(null);
|
|
83
|
+
for (const t of threats) {
|
|
84
|
+
typeCounts[t.type] = (typeCounts[t.type] || 0) + 1;
|
|
85
|
+
}
|
|
86
|
+
for (const type of TOP_THREAT_TYPES) {
|
|
87
|
+
features[`type_${type}`] = typeCounts[type] || 0;
|
|
88
|
+
}
|
|
89
|
+
// Aggregate count for types not in top list
|
|
90
|
+
let otherCount = 0;
|
|
91
|
+
for (const [type, count] of Object.entries(typeCounts)) {
|
|
92
|
+
if (!TOP_THREAT_TYPES_SET.has(type)) {
|
|
93
|
+
otherCount += count;
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
features.type_other = otherCount;
|
|
97
|
+
|
|
98
|
+
// --- Boolean behavioral signals ---
|
|
99
|
+
features.has_lifecycle_script = threats.some(t => t.type === 'lifecycle_script' || t.type === 'lifecycle_shell_pipe') ? 1 : 0;
|
|
100
|
+
features.has_network_access = threats.some(t =>
|
|
101
|
+
t.type === 'network_require' || t.type === 'remote_code_load' ||
|
|
102
|
+
t.type === 'curl_exec' || t.type === 'suspicious_dataflow'
|
|
103
|
+
) ? 1 : 0;
|
|
104
|
+
features.has_obfuscation = threats.some(t =>
|
|
105
|
+
t.type === 'obfuscation_detected' || t.type === 'high_entropy_string' ||
|
|
106
|
+
t.type === 'js_obfuscation_pattern'
|
|
107
|
+
) ? 1 : 0;
|
|
108
|
+
features.has_env_access = threats.some(t => t.type === 'env_access' || t.type === 'env_charcode_reconstruction') ? 1 : 0;
|
|
109
|
+
features.has_eval = threats.some(t => t.type === 'dangerous_call_eval' || t.type === 'dangerous_call_function') ? 1 : 0;
|
|
110
|
+
features.has_staged_payload = threats.some(t => t.type === 'staged_payload' || t.type === 'staged_binary_payload') ? 1 : 0;
|
|
111
|
+
features.has_typosquat = threats.some(t => t.type === 'typosquat_detected' || t.type === 'pypi_typosquat_detected') ? 1 : 0;
|
|
112
|
+
features.has_ioc_match = threats.some(t => t.type === 'known_malicious_package' || t.type === 'known_malicious_hash' || t.type === 'pypi_malicious_package') ? 1 : 0;
|
|
113
|
+
features.has_intent_pair = threats.some(t => t.type === 'intent_credential_exfil' || t.type === 'intent_command_exfil') ? 1 : 0;
|
|
114
|
+
features.has_sandbox_finding = threats.some(t => t.type && t.type.startsWith('sandbox_')) ? 1 : 0;
|
|
115
|
+
|
|
116
|
+
// --- File distribution features ---
|
|
117
|
+
const fileScores = summary.fileScores || {};
|
|
118
|
+
const fileScoreValues = Object.values(fileScores);
|
|
119
|
+
features.file_count_with_threats = fileScoreValues.length;
|
|
120
|
+
features.file_score_mean = fileScoreValues.length > 0
|
|
121
|
+
? Math.round(fileScoreValues.reduce((a, b) => a + b, 0) / fileScoreValues.length)
|
|
122
|
+
: 0;
|
|
123
|
+
features.file_score_max = fileScoreValues.length > 0
|
|
124
|
+
? Math.max(...fileScoreValues)
|
|
125
|
+
: 0;
|
|
126
|
+
|
|
127
|
+
// --- Severity concentration: ratio of CRITICAL+HIGH vs total ---
|
|
128
|
+
features.severity_ratio_high = features.count_total > 0
|
|
129
|
+
? Math.round(((features.count_critical + features.count_high) / features.count_total) * 100) / 100
|
|
130
|
+
: 0;
|
|
131
|
+
|
|
132
|
+
// --- Points concentration: max single-threat points vs score ---
|
|
133
|
+
const breakdown = summary.breakdown || [];
|
|
134
|
+
features.max_single_points = breakdown.length > 0 ? breakdown[0].points : 0;
|
|
135
|
+
features.points_concentration = features.score > 0 && breakdown.length > 0
|
|
136
|
+
? Math.round((breakdown[0].points / features.score) * 100) / 100
|
|
137
|
+
: 0;
|
|
138
|
+
|
|
139
|
+
// --- Package metadata (from registry) ---
|
|
140
|
+
const registry = (meta && meta.registryMeta) || {};
|
|
141
|
+
features.unpacked_size_bytes = (meta && meta.unpackedSize) || registry.unpackedSize || 0;
|
|
142
|
+
features.dep_count = countDeps(registry.dependencies);
|
|
143
|
+
features.dev_dep_count = countDeps(registry.devDependencies);
|
|
144
|
+
|
|
145
|
+
// --- Reputation factor (if computed by monitor) ---
|
|
146
|
+
features.reputation_factor = summary.reputationFactor || 1.0;
|
|
147
|
+
|
|
148
|
+
return features;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Count dependencies from a registry metadata dependencies object.
|
|
153
|
+
* Handles both object format ({name: version}) and number.
|
|
154
|
+
*/
|
|
155
|
+
function countDeps(deps) {
|
|
156
|
+
if (!deps) return 0;
|
|
157
|
+
if (typeof deps === 'number') return deps;
|
|
158
|
+
if (typeof deps === 'object') return Object.keys(deps).length;
|
|
159
|
+
return 0;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
/**
|
|
163
|
+
* Build a complete JSONL record for a scanned package.
|
|
164
|
+
*
|
|
165
|
+
* @param {Object} result - scan result from run()
|
|
166
|
+
* @param {Object} params - { name, version, ecosystem, unpackedSize, registryMeta, label, tier, sandboxResult }
|
|
167
|
+
* @returns {Object} complete record with metadata + features + label
|
|
168
|
+
*/
|
|
169
|
+
function buildTrainingRecord(result, params) {
|
|
170
|
+
const {
|
|
171
|
+
name, version, ecosystem,
|
|
172
|
+
unpackedSize, registryMeta,
|
|
173
|
+
label, tier, sandboxResult
|
|
174
|
+
} = params;
|
|
175
|
+
|
|
176
|
+
const features = extractFeatures(result, {
|
|
177
|
+
name, version, ecosystem,
|
|
178
|
+
unpackedSize, registryMeta
|
|
179
|
+
});
|
|
180
|
+
|
|
181
|
+
const record = Object.create(null);
|
|
182
|
+
|
|
183
|
+
// --- Identity (not features, for traceability) ---
|
|
184
|
+
record.name = name || '';
|
|
185
|
+
record.version = version || '';
|
|
186
|
+
record.ecosystem = ecosystem || 'npm';
|
|
187
|
+
record.timestamp = new Date().toISOString();
|
|
188
|
+
|
|
189
|
+
// --- Label ---
|
|
190
|
+
// 'clean' = no findings or T3 only
|
|
191
|
+
// 'suspect' = T1/T2 (pending manual review)
|
|
192
|
+
// 'confirmed' = manually confirmed malicious
|
|
193
|
+
// 'fp' = manually confirmed false positive
|
|
194
|
+
record.label = label || 'suspect';
|
|
195
|
+
record.tier = tier || null;
|
|
196
|
+
|
|
197
|
+
// --- Features ---
|
|
198
|
+
Object.assign(record, features);
|
|
199
|
+
|
|
200
|
+
// --- Sandbox score (if available) ---
|
|
201
|
+
record.sandbox_score = (sandboxResult && sandboxResult.score) || 0;
|
|
202
|
+
record.sandbox_finding_count = (sandboxResult && sandboxResult.findings)
|
|
203
|
+
? sandboxResult.findings.length
|
|
204
|
+
: 0;
|
|
205
|
+
|
|
206
|
+
return record;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
module.exports = {
|
|
210
|
+
extractFeatures,
|
|
211
|
+
buildTrainingRecord,
|
|
212
|
+
TOP_THREAT_TYPES,
|
|
213
|
+
TOP_THREAT_TYPES_SET
|
|
214
|
+
};
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* JSONL Writer — appends training records to data/ml-training.jsonl.
|
|
5
|
+
*
|
|
6
|
+
* One JSON object per line, newline-delimited (JSONL format).
|
|
7
|
+
* Uses append mode for crash-safe incremental writes.
|
|
8
|
+
* Auto-creates data/ directory if missing.
|
|
9
|
+
*
|
|
10
|
+
* File rotation: when the file exceeds MAX_JSONL_SIZE (100MB),
|
|
11
|
+
* it is renamed to ml-training-{timestamp}.jsonl and a fresh file starts.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
const fs = require('fs');
const path = require('path');

// Default output path: <repo-root>/data/ml-training.jsonl (resolved relative
// to src/ml/, two levels up from this file).
const DEFAULT_TRAINING_FILE = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
// Mutable so tests can redirect output via setTrainingFile()/resetTrainingFile().
let TRAINING_FILE = DEFAULT_TRAINING_FILE;
const MAX_JSONL_SIZE = 100 * 1024 * 1024; // 100MB rotation threshold
|
|
20
|
+
|
|
21
|
+
/**
 * Override the training file path (for testing).
 * Takes effect for all subsequent append/read/rotate/relabel calls.
 * @param {string} filePath - new file path
 */
function setTrainingFile(filePath) {
  TRAINING_FILE = filePath;
}

/**
 * Reset the training file path to the default
 * (data/ml-training.jsonl under the package root).
 */
function resetTrainingFile() {
  TRAINING_FILE = DEFAULT_TRAINING_FILE;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Append one training record to the JSONL file (one JSON object per line).
 * Creates the data/ directory on first use and rotates an oversized file
 * before writing. Failures are non-fatal: read-only filesystems are
 * silently ignored, anything else is logged — JSONL export must never
 * crash the monitor.
 * @param {Object} record - training record from buildTrainingRecord()
 */
function appendRecord(record) {
  try {
    const targetDir = path.dirname(TRAINING_FILE);
    if (!fs.existsSync(targetDir)) {
      fs.mkdirSync(targetDir, { recursive: true });
    }

    // Keep the active file under MAX_JSONL_SIZE before appending.
    maybeRotate();

    fs.appendFileSync(TRAINING_FILE, JSON.stringify(record) + '\n', 'utf8');
  } catch (err) {
    const silentCodes = ['EROFS', 'EACCES', 'EPERM'];
    if (silentCodes.includes(err.code)) {
      // Read-only filesystem — silently skip (same pattern as atomicWriteFileSync)
      return;
    }
    console.error(`[ML] Failed to append JSONL record: ${err.message}`);
  }
}
|
|
61
|
+
|
|
62
|
+
/**
 * Rename the active JSONL file to ml-training-{timestamp}.jsonl once it
 * grows past MAX_JSONL_SIZE, so the next append starts a fresh file.
 * Errors are logged and swallowed — a failed rotation must not block writes.
 */
function maybeRotate() {
  try {
    if (!fs.existsSync(TRAINING_FILE)) return;

    const { size } = fs.statSync(TRAINING_FILE);
    if (size < MAX_JSONL_SIZE) return;

    // ISO timestamp with ':' and '.' replaced, so it is filename-safe.
    const stamp = new Date().toISOString().replace(/[:.]/g, '-');
    const rotated = TRAINING_FILE.replace('.jsonl', `-${stamp}.jsonl`);
    fs.renameSync(TRAINING_FILE, rotated);
    console.log(`[ML] Rotated training file → ${path.basename(rotated)} (${(size / 1024 / 1024).toFixed(1)}MB)`);
  } catch (err) {
    console.error(`[ML] Rotation failed: ${err.message}`);
  }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Load and parse every record in the current JSONL file.
 * Malformed lines are warned about and skipped; any I/O error yields [].
 * NOTE(review): the warned "line N" indexes non-empty lines only, so it can
 * differ from the physical file line when blank lines are present — confirm
 * whether callers rely on physical line numbers before changing.
 * @returns {Object[]} array of parsed records
 */
function readRecords() {
  try {
    if (!fs.existsSync(TRAINING_FILE)) return [];

    const candidates = fs
      .readFileSync(TRAINING_FILE, 'utf8')
      .split('\n')
      .filter(line => line.trim());

    const records = [];
    candidates.forEach((line, idx) => {
      try {
        records.push(JSON.parse(line));
      } catch {
        console.warn(`[ML] Skipping malformed JSONL line ${idx + 1}`);
      }
    });
    return records;
  } catch (err) {
    console.error(`[ML] Failed to read JSONL: ${err.message}`);
    return [];
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Get stats about the current JSONL file.
 * @returns {{ recordCount: number, fileSizeBytes: number, fileSizeMB: string }}
 */
function getStats() {
  try {
    if (!fs.existsSync(TRAINING_FILE)) {
      return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
    }
    const stat = fs.statSync(TRAINING_FILE);
    // Count lines via fixed-size chunked reads: the previous implementation
    // claimed to avoid loading the whole file but readFileSync'd up to 100MB.
    const lineCount = countNonEmptyLines(TRAINING_FILE);
    return {
      recordCount: lineCount,
      fileSizeBytes: stat.size,
      fileSizeMB: (stat.size / 1024 / 1024).toFixed(1)
    };
  } catch {
    // Any I/O failure degrades to empty stats; stats must never throw.
    return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
  }
}

/**
 * Count non-empty lines in a file without loading it into memory,
 * reading in 64KB chunks. A line counts when it contains at least one
 * byte other than space/tab/CR (JSONL records always do; lines made of
 * exotic Unicode whitespace only would be counted — acceptable here).
 * A final line without a trailing newline is counted.
 * @param {string} filePath
 * @returns {number} number of non-empty lines
 * @throws propagates fs errors to the caller
 */
function countNonEmptyLines(filePath) {
  const CHUNK = 64 * 1024;
  const buf = Buffer.alloc(CHUNK);
  const fd = fs.openSync(filePath, 'r');
  let count = 0;
  let lineHasContent = false;
  try {
    let bytesRead;
    while ((bytesRead = fs.readSync(fd, buf, 0, CHUNK, null)) > 0) {
      for (let i = 0; i < bytesRead; i++) {
        const b = buf[i];
        if (b === 0x0a) { // '\n' — end of line
          if (lineHasContent) count++;
          lineHasContent = false;
        } else if (b !== 0x0d && b !== 0x20 && b !== 0x09) { // not CR/space/tab
          lineHasContent = true;
        }
      }
    }
  } finally {
    fs.closeSync(fd);
  }
  if (lineHasContent) count++; // last line lacked a trailing newline
  return count;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Update the label of records matching a given package name.
 * Used when manual confirmation (fp/confirmed) is applied retroactively.
 *
 * Defense-in-depth: a 'confirmed' label is refused unless the sandbox
 * actually produced findings, so the training set never contains a
 * "confirmed malicious" record with zero dynamic evidence.
 *
 * @param {string} packageName - package name to relabel
 * @param {string} newLabel - 'fp' or 'confirmed'
 * @param {number} [sandboxFindingCount] - number of sandbox findings (must be > 0 for 'confirmed')
 * @returns {number} number of records updated
 */
function relabelRecords(packageName, newLabel, sandboxFindingCount) {
  // Defense-in-depth: never write 'confirmed' without real sandbox findings
  if (newLabel === 'confirmed' && (!sandboxFindingCount || sandboxFindingCount === 0)) {
    console.warn(`[ML] BLOCKED relabel to 'confirmed' for ${packageName}: sandbox_finding_count=${sandboxFindingCount || 0}`);
    return 0;
  }
  try {
    if (!fs.existsSync(TRAINING_FILE)) return 0;
    const lines = fs.readFileSync(TRAINING_FILE, 'utf8').split('\n');
    const { newLines, updated } = applyRelabel(lines, packageName, newLabel);

    if (updated > 0) {
      // Atomic replace: write to a temp file then rename, so a crash
      // mid-write cannot truncate the training set (a bare writeFileSync
      // over a ~100MB file would leave a partial file on failure).
      const tmpFile = `${TRAINING_FILE}.tmp`;
      fs.writeFileSync(tmpFile, newLines.join('\n'), 'utf8');
      fs.renameSync(tmpFile, TRAINING_FILE);
      console.log(`[ML] Relabeled ${updated} records for ${packageName} → ${newLabel}`);
    }
    return updated;
  } catch (err) {
    console.error(`[ML] Failed to relabel records: ${err.message}`);
    return 0;
  }
}

/**
 * Pure relabeling pass over raw JSONL lines.
 * Blank and unparseable lines pass through untouched; records already
 * carrying the new label are not counted as updates.
 * @param {string[]} lines - raw lines from the JSONL file
 * @param {string} packageName - package name to match against record.name
 * @param {string} newLabel - label to assign
 * @returns {{ newLines: string[], updated: number }}
 */
function applyRelabel(lines, packageName, newLabel) {
  let updated = 0;
  const newLines = lines.map(line => {
    if (!line.trim()) return line;
    try {
      const record = JSON.parse(line);
      if (record.name === packageName && record.label !== newLabel) {
        record.label = newLabel;
        updated++;
        return JSON.stringify(record);
      }
      return line;
    } catch {
      return line; // leave malformed lines exactly as found
    }
  });
  return { newLines, updated };
}
|
|
176
|
+
|
|
177
|
+
// Public API. TRAINING_FILE is exposed as a getter so callers always observe
// the current (possibly test-overridden) path instead of a stale copy taken
// at require() time.
module.exports = {
  appendRecord,
  readRecords,
  getStats,
  relabelRecords,
  maybeRotate,
  get TRAINING_FILE() { return TRAINING_FILE; },
  setTrainingFile,
  resetTrainingFile,
  MAX_JSONL_SIZE
};
|
|
@@ -501,6 +501,10 @@ const PLAYBOOKS = {
|
|
|
501
501
|
'CRITIQUE: Un Proxy JavaScript avec trap set/get/apply est combine avec un appel reseau. ' +
|
|
502
502
|
'Technique d\'interception: le Proxy capture toutes les ecritures de proprietes (credentials, tokens, config) ' +
|
|
503
503
|
'et les exfiltre via HTTPS/fetch/dgram. Supprimer le package. Auditer tous les modules qui importent ce package.',
|
|
504
|
+
detached_credential_exfil:
|
|
505
|
+
'CRITIQUE: Process detache avec acces aux credentials et exfiltration reseau. ' +
|
|
506
|
+
'Technique DPRK/Lazarus: le process fils survit au parent (detached:true, unref()) et exfiltre des secrets en arriere-plan. ' +
|
|
507
|
+
'Supprimer le package immediatement. Regenerer tous les tokens/credentials. Auditer les process en cours d\'execution.',
|
|
504
508
|
intent_credential_exfil:
|
|
505
509
|
'CRITIQUE: Coherence d\'intention detectee — lecture de credentials combinee avec exfiltration reseau. ' +
|
|
506
510
|
'Pattern multi-fichier DPRK/Lazarus: chaque fichier semble legitime individuellement mais le package ' +
|
package/src/rules/index.js
CHANGED
|
@@ -1408,6 +1408,18 @@ const RULES = {
|
|
|
1408
1408
|
},
|
|
1409
1409
|
|
|
1410
1410
|
// Intent Graph rules (v2.6.0)
|
|
1411
|
+
detached_credential_exfil: {
|
|
1412
|
+
id: 'MUADDIB-AST-047',
|
|
1413
|
+
name: 'Detached Process Credential Exfiltration',
|
|
1414
|
+
severity: 'CRITICAL',
|
|
1415
|
+
confidence: 'high',
|
|
1416
|
+
description: 'Process detache (survit au parent) avec acces aux credentials et appel reseau — technique DPRK/Lazarus pour exfiltrer des secrets en arriere-plan',
|
|
1417
|
+
references: [
|
|
1418
|
+
'https://attack.mitre.org/techniques/T1041/',
|
|
1419
|
+
'https://www.cisa.gov/news-events/cybersecurity-advisories/aa22-108a'
|
|
1420
|
+
],
|
|
1421
|
+
mitre: 'T1041'
|
|
1422
|
+
},
|
|
1411
1423
|
intent_credential_exfil: {
|
|
1412
1424
|
id: 'MUADDIB-INTENT-001',
|
|
1413
1425
|
name: 'Intent Credential Exfiltration',
|
|
@@ -2187,6 +2187,24 @@ function handlePostWalk(ctx) {
|
|
|
2187
2187
|
file: ctx.relFile
|
|
2188
2188
|
});
|
|
2189
2189
|
}
|
|
2190
|
+
|
|
2191
|
+
// DPRK/Lazarus compound: detached background process + credential env access + network
|
|
2192
|
+
// Pattern: spawn({detached:true}) reads secrets then exfils via network.
|
|
2193
|
+
// This combination is never legitimate — daemons don't read API keys and send them out.
|
|
2194
|
+
const hasDetachedInFile = ctx.threats.some(t =>
|
|
2195
|
+
t.file === ctx.relFile && t.type === 'detached_process'
|
|
2196
|
+
);
|
|
2197
|
+
const hasSensitiveEnvInFile = ctx.threats.some(t =>
|
|
2198
|
+
t.file === ctx.relFile && t.type === 'env_access'
|
|
2199
|
+
);
|
|
2200
|
+
if (hasDetachedInFile && hasSensitiveEnvInFile && ctx.hasNetworkCallInFile) {
|
|
2201
|
+
ctx.threats.push({
|
|
2202
|
+
type: 'detached_credential_exfil',
|
|
2203
|
+
severity: 'CRITICAL',
|
|
2204
|
+
message: 'Detached process + sensitive env access + network call — credential exfiltration via background process (DPRK/Lazarus evasion pattern).',
|
|
2205
|
+
file: ctx.relFile
|
|
2206
|
+
});
|
|
2207
|
+
}
|
|
2190
2208
|
}
|
|
2191
2209
|
|
|
2192
2210
|
function handleWithStatement(node, ctx) {
|
package/src/scoring.js
CHANGED
|
@@ -153,7 +153,8 @@ const DIST_EXEMPT_TYPES = new Set([
|
|
|
153
153
|
'download_exec_binary', // download + chmod + exec (binary dropper)
|
|
154
154
|
'cross_file_dataflow', // credential read → network exfil across files
|
|
155
155
|
'staged_eval_decode', // eval(atob(...)) (explicit payload staging)
|
|
156
|
-
'reverse_shell'
|
|
156
|
+
'reverse_shell', // net.Socket + connect + pipe (always malicious)
|
|
157
|
+
'detached_credential_exfil' // detached process + credential exfil (DPRK/Lazarus)
|
|
157
158
|
// P6: remote_code_load and proxy_data_intercept removed — in bundled dist/ files,
|
|
158
159
|
// fetch + eval co-occurrence is coincidental (bundler combines HTTP client + template compilation).
|
|
159
160
|
// fetch_decrypt_exec (fetch+decrypt+eval triple) remains exempt — never coincidental.
|
|
@@ -196,7 +197,8 @@ const REACHABILITY_EXEMPT_TYPES = new Set([
|
|
|
196
197
|
'cross_file_dataflow',
|
|
197
198
|
'typosquat_detected', 'pypi_typosquat_detected',
|
|
198
199
|
'pypi_malicious_package',
|
|
199
|
-
'ai_config_injection', 'ai_config_injection_compound'
|
|
200
|
+
'ai_config_injection', 'ai_config_injection_compound',
|
|
201
|
+
'detached_credential_exfil' // DPRK/Lazarus: invoked via lifecycle, not require/import
|
|
200
202
|
]);
|
|
201
203
|
|
|
202
204
|
// Custom class prototypes that HTTP frameworks legitimately extend.
|