muaddib-scanner 2.8.6 → 2.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.8.6",
3
+ "version": "2.8.7",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -0,0 +1,214 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * ML Feature Extractor — extracts numeric/boolean features from scan results
5
+ * for ML classifier training (Phase 1 of FPR reduction pipeline).
6
+ *
7
+ * Features are designed to capture the discriminative signals between true
8
+ * positives and false positives: threat composition, severity distribution,
9
+ * scoring breakdown, and package metadata.
10
+ *
11
+ * Output: flat object with numeric/boolean values suitable for XGBoost/RF.
12
+ */
13
+
14
// Top threat types by frequency in production (covers ~95% of all findings).
// Types not in this list are aggregated into `type_other`.
const TOP_THREAT_TYPES = [
  'suspicious_dataflow',
  'env_access',
  'sensitive_string',
  'dangerous_call_eval',
  'dangerous_call_exec',
  'dangerous_call_function',
  'obfuscation_detected',
  'high_entropy_string',
  'dynamic_require',
  'dynamic_import',
  'lifecycle_script',
  'typosquat_detected',
  'staged_payload',
  'staged_binary_payload',
  'network_require',
  'sandbox_evasion',
  'credential_regex_harvest',
  'remote_code_load',
  'suspicious_domain',
  'prototype_hook',
  'intent_credential_exfil',
  'intent_command_exfil',
  'cross_file_dataflow',
  'module_compile',
  'crypto_decipher',
  'env_charcode_reconstruction',
  'lifecycle_shell_pipe',
  'curl_exec',
  'reverse_shell',
  'binary_dropper',
  'mcp_config_injection'
];

const TOP_THREAT_TYPES_SET = new Set(TOP_THREAT_TYPES);

/**
 * Extract ML features from a scan result object.
 *
 * @param {Object} result - scan result from run() with { threats, summary }
 * @param {Object} meta - package metadata { name, version, ecosystem, unpackedSize, registryMeta }
 * @returns {Object} flat feature vector with numeric/boolean values
 */
function extractFeatures(result, meta) {
  // Null-prototype object: feature names come from threat-type strings, so
  // guard against prototype-key collisions (e.g. a type named 'constructor').
  const features = Object.create(null);
  const threats = (result && result.threats) || [];
  const summary = (result && result.summary) || {};

  // --- Scoring features ---
  features.score = summary.riskScore || 0;
  features.max_file_score = summary.maxFileScore || 0;
  features.package_score = summary.packageScore || 0;
  features.global_risk_score = summary.globalRiskScore || 0;

  // --- Severity counts ---
  features.count_total = summary.total || 0;
  features.count_critical = summary.critical || 0;
  features.count_high = summary.high || 0;
  features.count_medium = summary.medium || 0;
  features.count_low = summary.low || 0;

  // --- Distinct threat types ---
  const distinctTypes = new Set(threats.map(t => t.type));
  features.distinct_threat_types = distinctTypes.size;

  // --- Per-type counts (one feature per TOP_THREAT_TYPES entry) ---
  const typeCounts = Object.create(null);
  for (const t of threats) {
    typeCounts[t.type] = (typeCounts[t.type] || 0) + 1;
  }
  for (const type of TOP_THREAT_TYPES) {
    features[`type_${type}`] = typeCounts[type] || 0;
  }
  // Aggregate count for types not in top list
  let otherCount = 0;
  for (const [type, count] of Object.entries(typeCounts)) {
    if (!TOP_THREAT_TYPES_SET.has(type)) {
      otherCount += count;
    }
  }
  features.type_other = otherCount;

  // --- Boolean behavioral signals (1/0 for classifier friendliness) ---
  features.has_lifecycle_script = threats.some(t => t.type === 'lifecycle_script' || t.type === 'lifecycle_shell_pipe') ? 1 : 0;
  features.has_network_access = threats.some(t =>
    t.type === 'network_require' || t.type === 'remote_code_load' ||
    t.type === 'curl_exec' || t.type === 'suspicious_dataflow'
  ) ? 1 : 0;
  features.has_obfuscation = threats.some(t =>
    t.type === 'obfuscation_detected' || t.type === 'high_entropy_string' ||
    t.type === 'js_obfuscation_pattern'
  ) ? 1 : 0;
  features.has_env_access = threats.some(t => t.type === 'env_access' || t.type === 'env_charcode_reconstruction') ? 1 : 0;
  features.has_eval = threats.some(t => t.type === 'dangerous_call_eval' || t.type === 'dangerous_call_function') ? 1 : 0;
  features.has_staged_payload = threats.some(t => t.type === 'staged_payload' || t.type === 'staged_binary_payload') ? 1 : 0;
  features.has_typosquat = threats.some(t => t.type === 'typosquat_detected' || t.type === 'pypi_typosquat_detected') ? 1 : 0;
  features.has_ioc_match = threats.some(t => t.type === 'known_malicious_package' || t.type === 'known_malicious_hash' || t.type === 'pypi_malicious_package') ? 1 : 0;
  features.has_intent_pair = threats.some(t => t.type === 'intent_credential_exfil' || t.type === 'intent_command_exfil') ? 1 : 0;
  features.has_sandbox_finding = threats.some(t => t.type && t.type.startsWith('sandbox_')) ? 1 : 0;

  // --- File distribution features ---
  const fileScores = summary.fileScores || {};
  const fileScoreValues = Object.values(fileScores);
  features.file_count_with_threats = fileScoreValues.length;
  features.file_score_mean = fileScoreValues.length > 0
    ? Math.round(fileScoreValues.reduce((a, b) => a + b, 0) / fileScoreValues.length)
    : 0;
  features.file_score_max = fileScoreValues.length > 0
    ? Math.max(...fileScoreValues)
    : 0;

  // --- Severity concentration: ratio of CRITICAL+HIGH vs total (2dp) ---
  features.severity_ratio_high = features.count_total > 0
    ? Math.round(((features.count_critical + features.count_high) / features.count_total) * 100) / 100
    : 0;

  // --- Points concentration: max single-threat points vs score (2dp) ---
  // NOTE(review): assumes summary.breakdown is sorted by points descending —
  // confirm against the scorer that produces it.
  const breakdown = summary.breakdown || [];
  features.max_single_points = breakdown.length > 0 ? breakdown[0].points : 0;
  features.points_concentration = features.score > 0 && breakdown.length > 0
    ? Math.round((breakdown[0].points / features.score) * 100) / 100
    : 0;

  // --- Package metadata (from registry) ---
  const registry = (meta && meta.registryMeta) || {};
  features.unpacked_size_bytes = (meta && meta.unpackedSize) || registry.unpackedSize || 0;
  features.dep_count = countDeps(registry.dependencies);
  features.dev_dep_count = countDeps(registry.devDependencies);

  // --- Reputation factor (if computed by monitor) ---
  // Fix: use an explicit null/undefined check instead of `|| 1.0`, which
  // silently replaced a legitimate factor of 0 with the neutral 1.0.
  features.reputation_factor =
    (summary.reputationFactor === undefined || summary.reputationFactor === null)
      ? 1.0
      : summary.reputationFactor;

  return features;
}

/**
 * Count dependencies from a registry metadata dependencies object.
 * Handles both object format ({name: version}) and a plain number.
 *
 * @param {Object|number|null|undefined} deps - dependencies map or count
 * @returns {number} dependency count (0 for missing/unrecognized input)
 */
function countDeps(deps) {
  if (!deps) return 0;
  if (typeof deps === 'number') return deps;
  if (typeof deps === 'object') return Object.keys(deps).length;
  return 0;
}
161
+
162
/**
 * Build a complete JSONL record for a scanned package.
 *
 * Combines identity fields (for traceability), the training label, the
 * extracted feature vector, and sandbox results into one flat object.
 *
 * @param {Object} result - scan result from run()
 * @param {Object} params - { name, version, ecosystem, unpackedSize, registryMeta, label, tier, sandboxResult }
 * @returns {Object} complete record with metadata + features + label
 */
function buildTrainingRecord(result, params) {
  const {
    name, version, ecosystem,
    unpackedSize, registryMeta,
    label, tier, sandboxResult
  } = params;

  // Null-prototype object so record keys can never collide with Object.prototype.
  const record = Object.create(null);

  // --- Identity (not features, for traceability) ---
  record.name = name || '';
  record.version = version || '';
  record.ecosystem = ecosystem || 'npm';
  record.timestamp = new Date().toISOString();

  // --- Label taxonomy ---
  //   'clean'     — no findings or T3 only
  //   'suspect'   — T1/T2 (pending manual review)
  //   'confirmed' — manually confirmed malicious
  //   'fp'        — manually confirmed false positive
  record.label = label || 'suspect';
  record.tier = tier || null;

  // --- Features (flattened into the record after the identity/label keys) ---
  const featureVector = extractFeatures(result, {
    name, version, ecosystem,
    unpackedSize, registryMeta
  });
  Object.assign(record, featureVector);

  // --- Sandbox score (if a dynamic-analysis run was performed) ---
  record.sandbox_score = (sandboxResult && sandboxResult.score) || 0;
  record.sandbox_finding_count = (sandboxResult && sandboxResult.findings)
    ? sandboxResult.findings.length
    : 0;

  return record;
}
208
+
209
// Public API of the feature-extraction module.
module.exports.extractFeatures = extractFeatures;
module.exports.buildTrainingRecord = buildTrainingRecord;
module.exports.TOP_THREAT_TYPES = TOP_THREAT_TYPES;
module.exports.TOP_THREAT_TYPES_SET = TOP_THREAT_TYPES_SET;
@@ -0,0 +1,181 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * JSONL Writer — appends training records to data/ml-training.jsonl.
5
+ *
6
+ * One JSON object per line, newline-delimited (JSONL format).
7
+ * Uses append mode for crash-safe incremental writes.
8
+ * Auto-creates data/ directory if missing.
9
+ *
10
+ * File rotation: when the file exceeds MAX_JSONL_SIZE (100MB),
11
+ * it is renamed to ml-training-{timestamp}.jsonl and a fresh file starts.
12
+ */
13
+
14
+ const fs = require('fs');
15
+ const path = require('path');
16
+
17
// Default on-disk location: <package root>/data/ml-training.jsonl.
const DEFAULT_TRAINING_FILE = path.join(__dirname, '..', '..', 'data', 'ml-training.jsonl');
// Mutable module state: tests redirect output via setTrainingFile()/resetTrainingFile().
let TRAINING_FILE = DEFAULT_TRAINING_FILE;
const MAX_JSONL_SIZE = 100 * 1024 * 1024; // 100MB rotation threshold
20
+
21
/**
 * Point the writer at a different JSONL file (primarily for testing).
 * Takes effect for all subsequent reads/writes until resetTrainingFile().
 *
 * @param {string} filePath - path to use from now on
 */
function setTrainingFile(filePath) {
  TRAINING_FILE = filePath;
}
28
+
29
/**
 * Restore the default training file path (undoes setTrainingFile()).
 */
function resetTrainingFile() {
  TRAINING_FILE = DEFAULT_TRAINING_FILE;
}
35
+
36
/**
 * Append a single record to the JSONL training file.
 *
 * Best-effort: any failure is logged (or silently skipped on read-only
 * filesystems) so that JSONL export can never crash the monitor.
 *
 * @param {Object} record - training record from buildTrainingRecord()
 */
function appendRecord(record) {
  try {
    // Recursive mkdir is a no-op when the directory already exists, so the
    // previous existsSync() pre-check was redundant (and a TOCTOU race).
    fs.mkdirSync(path.dirname(TRAINING_FILE), { recursive: true });

    // Rotate first so this record lands in a fresh file once over the cap.
    maybeRotate();

    const line = JSON.stringify(record) + '\n';
    fs.appendFileSync(TRAINING_FILE, line, 'utf8');
  } catch (err) {
    // Non-fatal: JSONL export failure should never crash the monitor
    if (err.code === 'EROFS' || err.code === 'EACCES' || err.code === 'EPERM') {
      // Read-only filesystem — silently skip (same pattern as atomicWriteFileSync)
      return;
    }
    console.error(`[ML] Failed to append JSONL record: ${err.message}`);
  }
}
61
+
62
/**
 * Rotate the JSONL file if it exceeds MAX_JSONL_SIZE.
 * Renames it to <name>-{ISO timestamp}{ext} so a fresh file starts.
 *
 * Best-effort: rotation failures are logged and otherwise ignored.
 */
function maybeRotate() {
  try {
    if (!fs.existsSync(TRAINING_FILE)) return;
    const stat = fs.statSync(TRAINING_FILE);
    if (stat.size < MAX_JSONL_SIZE) return;

    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    // Insert the timestamp before the extension. Using path.extname instead
    // of String.replace('.jsonl', ...) avoids corrupting paths that contain
    // '.jsonl' somewhere other than at the end (e.g. in a directory name),
    // and still rotates correctly when the file was overridden to a
    // non-.jsonl name via setTrainingFile().
    const ext = path.extname(TRAINING_FILE);
    const rotatedName = ext
      ? `${TRAINING_FILE.slice(0, -ext.length)}-${timestamp}${ext}`
      : `${TRAINING_FILE}-${timestamp}`;
    fs.renameSync(TRAINING_FILE, rotatedName);
    console.log(`[ML] Rotated training file → ${path.basename(rotatedName)} (${(stat.size / 1024 / 1024).toFixed(1)}MB)`);
  } catch (err) {
    console.error(`[ML] Rotation failed: ${err.message}`);
  }
}
80
+
81
/**
 * Load every record from the current JSONL file.
 * Malformed lines are skipped with a warning; any read error yields [].
 * Useful for offline analysis and model training.
 *
 * @returns {Object[]} array of parsed records
 */
function readRecords() {
  try {
    if (!fs.existsSync(TRAINING_FILE)) return [];
    const rawLines = fs.readFileSync(TRAINING_FILE, 'utf8').split('\n');
    const records = [];
    // Index counts non-blank lines only, matching the original warning numbering.
    let lineNo = 0;
    for (const rawLine of rawLines) {
      if (!rawLine.trim()) continue;
      lineNo++;
      try {
        records.push(JSON.parse(rawLine));
      } catch {
        console.warn(`[ML] Skipping malformed JSONL line ${lineNo}`);
      }
    }
    return records;
  } catch (err) {
    console.error(`[ML] Failed to read JSONL: ${err.message}`);
    return [];
  }
}
107
+
108
/**
 * Summarize the current JSONL file.
 *
 * NOTE: the record count is obtained by reading the whole file and counting
 * non-blank lines — acceptable given the 100MB rotation cap, but not a
 * streaming count.
 *
 * @returns {{ recordCount: number, fileSizeBytes: number, fileSizeMB: string }}
 */
function getStats() {
  try {
    if (!fs.existsSync(TRAINING_FILE)) {
      return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
    }
    const { size } = fs.statSync(TRAINING_FILE);
    const recordCount = fs
      .readFileSync(TRAINING_FILE, 'utf8')
      .split('\n')
      .filter(l => l.trim())
      .length;
    return {
      recordCount,
      fileSizeBytes: size,
      fileSizeMB: (size / 1024 / 1024).toFixed(1)
    };
  } catch {
    return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
  }
}
130
+
131
/**
 * Update the label of records matching a given package name.
 * Used when manual confirmation (fp/confirmed) is applied retroactively.
 *
 * The rewritten file is staged to a sibling temp file and renamed into
 * place, so a crash mid-write cannot truncate the training set (same
 * crash-safety goal as atomicWriteFileSync / the append-mode writer).
 *
 * @param {string} packageName - package name to relabel
 * @param {string} newLabel - 'fp' or 'confirmed'
 * @returns {number} number of records updated
 */
function relabelRecords(packageName, newLabel) {
  try {
    if (!fs.existsSync(TRAINING_FILE)) return 0;
    const content = fs.readFileSync(TRAINING_FILE, 'utf8');
    const lines = content.split('\n');
    let updated = 0;
    const newLines = lines.map(line => {
      if (!line.trim()) return line;
      try {
        const record = JSON.parse(line);
        if (record.name === packageName && record.label !== newLabel) {
          record.label = newLabel;
          updated++;
          return JSON.stringify(record);
        }
        return line;
      } catch {
        // Leave malformed lines untouched; readRecords() skips them anyway.
        return line;
      }
    });

    if (updated > 0) {
      // Atomic replace: write to a temp file in the same directory, then
      // rename over the original so readers never see a half-written file.
      const tmpFile = `${TRAINING_FILE}.tmp`;
      fs.writeFileSync(tmpFile, newLines.join('\n'), 'utf8');
      fs.renameSync(tmpFile, TRAINING_FILE);
      console.log(`[ML] Relabeled ${updated} records for ${packageName} → ${newLabel}`);
    }
    return updated;
  } catch (err) {
    console.error(`[ML] Failed to relabel records: ${err.message}`);
    return 0;
  }
}
170
+
171
module.exports = {
  appendRecord,
  readRecords,
  getStats,
  relabelRecords,
  maybeRotate,
  // Live getter: always reflects the current path, including test overrides
  // made via setTrainingFile() after this module was first required.
  get TRAINING_FILE() { return TRAINING_FILE; },
  setTrainingFile,
  resetTrainingFile,
  MAX_JSONL_SIZE
};