muaddib-scanner 2.11.76 → 2.11.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/.githooks/pre-commit +18 -0
  2. package/README.md +15 -6
  3. package/bin/muaddib.js +18 -4
  4. package/package.json +1 -2
  5. package/{self-scan-v2.11.76.json → self-scan-v2.11.78.json} +1 -1
  6. package/src/commands/interactive.js +5 -6
  7. package/src/commands/safe-install.js +19 -19
  8. package/src/ioc/scraper.js +46 -10
  9. package/src/monitor/daemon.js +39 -28
  10. package/src/monitor/ingestion.js +32 -2
  11. package/src/monitor/queue.js +84 -21
  12. package/src/monitor/scan-queue.js +68 -1
  13. package/src/monitor/state.js +24 -1
  14. package/src/monitor/webhook.js +32 -11
  15. package/src/output/formatter.js +3 -4
  16. package/src/pipeline/executor.js +9 -1
  17. package/src/runtime/daemon.js +27 -28
  18. package/src/runtime/watch.js +7 -7
  19. package/src/sandbox/index.js +11 -9
  20. package/src/scanner/temporal-analysis.js +8 -0
  21. package/src/scanner/temporal-ast-diff.js +5 -0
  22. package/src/utils.js +60 -1
  23. package/.dockerignore +0 -7
  24. package/.env.example +0 -43
  25. package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
  26. package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
  27. package/ml-retrain/auto-labeler/labeler.py +0 -256
  28. package/ml-retrain/auto-labeler/npm_checker.py +0 -228
  29. package/ml-retrain/auto-labeler/ossf_index.py +0 -178
  30. package/ml-retrain/auto-labeler/requirements.txt +0 -1
  31. package/ml-retrain/confusion-matrix.png +0 -0
  32. package/ml-retrain/model-trees-retrained.js +0 -12
  33. package/ml-retrain/retrain-report.json +0 -225
  34. package/ml-retrain/retrain.py +0 -974
  35. package/sbom.json +0 -0
  36. package/src/ml/train-bundler-detector.py +0 -725
  37. package/src/ml/train-xgboost.py +0 -957
  38. package/tools/export-model-js.py +0 -160
  39. package/tools/requirements-ml.txt +0 -5
  40. package/tools/train-classifier.py +0 -333
@@ -1,974 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- MUAD'DIB ML Retrain — Auto-Label Ground Truth
4
-
5
- Builds training dataset by crossing auto-labels.json with ml-training-merged.jsonl,
6
- adds Datadog malicious corpus, trains XGBoost with grid search, exports model + report.
7
-
8
- Usage:
9
- python ml-retrain/retrain.py --full
10
- python ml-retrain/retrain.py --build-dataset # Step 1 only
11
- python ml-retrain/retrain.py --train-only # Skip dataset build, use cached
12
-
13
- Environment:
14
- MUADDIB_DATA Override data directory (default: /opt/muaddib/data)
15
- """
16
-
17
- import argparse
18
- import json
19
- import os
20
- import sys
21
- import time
22
- from datetime import datetime, timezone
23
- from pathlib import Path
24
-
25
- import numpy as np
26
- import pandas as pd
27
- from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
28
- from sklearn.metrics import (
29
- precision_score, recall_score, f1_score, confusion_matrix,
30
- roc_auc_score, precision_recall_curve, roc_curve
31
- )
32
- import xgboost as xgb
33
-
34
- # ── Paths ──
35
# Data root; the MUADDIB_DATA environment variable overrides the default.
MUADDIB_DATA = Path(os.getenv("MUADDIB_DATA", "/opt/muaddib/data"))
# Alert files are looked up in "logs/alerts" next to the data directory.
MUADDIB_ALERTS = MUADDIB_DATA.parent.joinpath("logs", "alerts")

# All generated artifacts are written to the directory containing this script.
BASE_DIR = Path(__file__).parent
OUTPUT_DIR = BASE_DIR
DATASET_CACHE = OUTPUT_DIR.joinpath("retrain-dataset.jsonl")
REPORT_PATH = OUTPUT_DIR.joinpath("retrain-report.json")
MODEL_OUTPUT = OUTPUT_DIR.joinpath("model-trees-retrained.js")
CONFUSION_MATRIX_PATH = OUTPUT_DIR.joinpath("confusion-matrix.png")
43
-
44
- # ── 87 hardcoded features — exact copy from train-xgboost.py ──
45
# Record keys that identify a sample rather than describe it.
IDENTITY_COLS = {'name', 'version', 'ecosystem', 'timestamp', 'label', 'tier'}

# Ordered feature vector layout. The position of each name is significant:
# exported tree nodes refer to features by index into this list.
FEATURE_NAMES = [
    # Scores
    'score', 'max_file_score', 'package_score', 'global_risk_score',
    # Severity counts
    'count_total', 'count_critical', 'count_high', 'count_medium', 'count_low',
    'distinct_threat_types',
    # Per-threat-type counts
    'type_suspicious_dataflow', 'type_env_access', 'type_sensitive_string',
    'type_dangerous_call_eval', 'type_dangerous_call_exec',
    'type_dangerous_call_function', 'type_obfuscation_detected',
    'type_high_entropy_string', 'type_dynamic_require', 'type_dynamic_import',
    'type_lifecycle_script', 'type_typosquat_detected', 'type_staged_payload',
    'type_staged_binary_payload', 'type_network_require', 'type_sandbox_evasion',
    'type_credential_regex_harvest', 'type_remote_code_load',
    'type_suspicious_domain', 'type_prototype_hook',
    'type_intent_credential_exfil', 'type_intent_command_exfil',
    'type_cross_file_dataflow', 'type_module_compile', 'type_crypto_decipher',
    'type_env_charcode_reconstruction', 'type_lifecycle_shell_pipe',
    'type_curl_exec', 'type_reverse_shell', 'type_binary_dropper',
    'type_mcp_config_injection',
    'type_vm_code_execution', 'type_vm_dynamic_code',
    'type_dangerous_constructor', 'type_module_load_bypass',
    'type_require_process_mainmodule', 'type_proxy_globalthis_intercept',
    'type_reflect_bind_code_execution',
    'type_known_malicious_package', 'type_known_malicious_hash',
    'type_unicode_invisible_injection', 'type_blockchain_c2_resolution',
    'type_dangerous_exec', 'type_node_inline_exec',
    'type_js_obfuscation_pattern',
    'type_suspicious_module_sink', 'type_wasm_host_sink',
    'type_other',
    # Boolean behavioral signals
    'has_lifecycle_script', 'has_network_access', 'has_obfuscation',
    'has_env_access', 'has_eval', 'has_staged_payload', 'has_typosquat',
    'has_ioc_match', 'has_intent_pair', 'has_sandbox_finding',
    # File distribution and severity concentration
    'file_count_with_threats', 'file_score_mean', 'file_score_max',
    'severity_ratio_high', 'max_single_points', 'points_concentration',
    # Package / registry metadata
    'unpacked_size_bytes', 'dep_count', 'dev_dep_count',
    'reputation_factor',
    'package_age_days', 'weekly_downloads', 'version_count',
    'author_package_count', 'has_repository', 'readme_size',
    'file_count_total', 'has_tests', 'threat_density',
]

# Explicit check instead of a bare `assert`: assert statements are removed
# under `python -O`, which would let a mis-sized or duplicated feature list
# silently shift every column. AssertionError is kept as the exception type.
if len(FEATURE_NAMES) != 87 or len(set(FEATURE_NAMES)) != len(FEATURE_NAMES):
    raise AssertionError(
        f"FEATURE_NAMES must hold 87 unique names, got {len(FEATURE_NAMES)} "
        f"({len(set(FEATURE_NAMES))} unique)"
    )
85
-
86
- # ── Grid search param space ──
87
# Hyperparameter values explored by the grid search (3 x 3 x 3 combinations).
PARAM_GRID = dict(
    max_depth=[4, 6, 8],
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
)

# XGBoost settings shared by every training run; the searched parameters and
# scale_pos_weight are layered on top of these by the training functions.
XGB_BASE_PARAMS = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    seed=42,
    verbosity=0,
)
105
-
106
-
107
- # ── TOP_THREAT_TYPES — mirrors feature-extractor.js ──
108
# Threat types that get a dedicated `type_<name>` count feature, in feature
# order. Any other threat type is folded into `type_other`.
TOP_THREAT_TYPES = [
    'suspicious_dataflow',
    'env_access',
    'sensitive_string',
    'dangerous_call_eval',
    'dangerous_call_exec',
    'dangerous_call_function',
    'obfuscation_detected',
    'high_entropy_string',
    'dynamic_require',
    'dynamic_import',
    'lifecycle_script',
    'typosquat_detected',
    'staged_payload',
    'staged_binary_payload',
    'network_require',
    'sandbox_evasion',
    'credential_regex_harvest',
    'remote_code_load',
    'suspicious_domain',
    'prototype_hook',
    'intent_credential_exfil',
    'intent_command_exfil',
    'cross_file_dataflow',
    'module_compile',
    'crypto_decipher',
    'env_charcode_reconstruction',
    'lifecycle_shell_pipe',
    'curl_exec',
    'reverse_shell',
    'binary_dropper',
    'mcp_config_injection',
    'vm_code_execution',
    'vm_dynamic_code',
    'dangerous_constructor',
    'module_load_bypass',
    'require_process_mainmodule',
    'proxy_globalthis_intercept',
    'reflect_bind_code_execution',
    'known_malicious_package',
    'known_malicious_hash',
    'unicode_invisible_injection',
    'blockchain_c2_resolution',
    'dangerous_exec',
    'node_inline_exec',
    'js_obfuscation_pattern',
    'suspicious_module_sink',
    'wasm_host_sink',
]
# Set view of the same names for O(1) membership tests.
TOP_THREAT_TYPES_SET = set(TOP_THREAT_TYPES)
128
-
129
-
130
def extract_features_from_alert(alert):
    """Build the 87-feature record for one alert document.

    Python port of feature-extractor.js, used to recover feature vectors for
    confirmed_malicious packages that have an alert file but no JSONL record.

    Args:
        alert: Parsed alert JSON. Reads the "summary" mapping and the
            "threats" list; both default to empty when absent.

    Returns:
        dict mapping feature name to value. Registry metadata features are
        not present in alerts and are filled with 0 (reputation_factor 1.0).
    """
    summary = alert.get("summary", {})
    threats = alert.get("threats", [])

    # Tally threats per type; entries with a missing or empty type are ignored.
    per_type = {}
    for threat in threats:
        kind = threat.get('type', '')
        if kind:
            per_type[kind] = per_type.get(kind, 0) + 1
    seen_types = set(per_type)

    def has_any(*kinds):
        """Return 1 when at least one of `kinds` was observed, else 0."""
        return int(not seen_types.isdisjoint(kinds))

    # Scores, severity counts and type diversity.
    feat = {
        'score': summary.get('riskScore', 0),
        'max_file_score': summary.get('maxFileScore', 0),
        'package_score': summary.get('packageScore', 0),
        'global_risk_score': summary.get('globalRiskScore', 0),
        'count_total': summary.get('total', 0),
        'count_critical': summary.get('critical', 0),
        'count_high': summary.get('high', 0),
        'count_medium': summary.get('medium', 0),
        'count_low': summary.get('low', 0),
        'distinct_threat_types': len(seen_types),
    }

    # One count per tracked threat type; everything else lands in type_other.
    for kind in TOP_THREAT_TYPES:
        feat[f'type_{kind}'] = per_type.get(kind, 0)
    feat['type_other'] = sum(
        count for kind, count in per_type.items()
        if kind not in TOP_THREAT_TYPES_SET
    )

    # Boolean behavioral signals derived from the observed threat types.
    feat.update({
        'has_lifecycle_script': has_any('lifecycle_script', 'lifecycle_shell_pipe'),
        'has_network_access': has_any('network_require', 'remote_code_load', 'curl_exec', 'suspicious_dataflow'),
        'has_obfuscation': has_any('obfuscation_detected', 'high_entropy_string', 'js_obfuscation_pattern'),
        'has_env_access': has_any('env_access', 'env_charcode_reconstruction'),
        'has_eval': has_any('dangerous_call_eval', 'dangerous_call_function'),
        'has_staged_payload': has_any('staged_payload', 'staged_binary_payload'),
        'has_typosquat': has_any('typosquat_detected', 'pypi_typosquat_detected'),
        'has_ioc_match': has_any('known_malicious_package', 'known_malicious_hash', 'pypi_malicious_package', 'dependency_ioc_match'),
        'has_intent_pair': has_any('intent_credential_exfil', 'intent_command_exfil'),
        'has_sandbox_finding': int(any(kind.startswith('sandbox_') for kind in seen_types)),
    })

    # File distribution: only a dict-shaped fileScores contributes values.
    file_scores = summary.get('fileScores', {})
    scores = list(file_scores.values()) if isinstance(file_scores, dict) else []
    feat['file_count_with_threats'] = len(scores)
    feat['file_score_mean'] = round(sum(scores) / len(scores)) if scores else 0
    feat['file_score_max'] = max(scores) if scores else 0

    # Severity concentration. max(..., 1) guards the divisions against zero.
    severe = feat['count_critical'] + feat['count_high']
    feat['severity_ratio_high'] = round(severe / max(feat['count_total'], 1), 2)
    breakdown = summary.get('breakdown', [])
    # Uses the first breakdown entry — presumably the largest contributor; confirm ordering upstream.
    feat['max_single_points'] = breakdown[0].get('points', 0) if breakdown else 0
    if feat['score'] > 0:
        feat['points_concentration'] = round(
            feat['max_single_points'] / max(feat['score'], 1), 2)
    else:
        feat['points_concentration'] = 0

    # Package metadata is not available in alerts; use neutral defaults.
    feat.update({
        'unpacked_size_bytes': 0,
        'dep_count': 0,
        'dev_dep_count': 0,
        'reputation_factor': 1.0,
        'package_age_days': 0,
        'weekly_downloads': 0,
        'version_count': 0,
        'author_package_count': 0,
        'has_repository': 0,
        'readme_size': 0,
        'file_count_total': 0,
        'has_tests': 0,
    })
    feat['threat_density'] = round(
        feat['count_total'] / max(feat['file_count_with_threats'], 1), 2)

    return feat
214
-
215
-
216
def load_alert_index(alerts_dir):
    """Build an index of alert files keyed by 'name@version'.

    Each `*.json` file in `alerts_dir` is parsed. Its "target" field is
    expected to look like "npm/package-name@version"; everything after the
    first "/" becomes the key. When several alerts share a key, the one with
    the highest summary.riskScore is kept.

    Unusable files are skipped instead of aborting the whole build: unreadable
    files, invalid JSON, invalid UTF-8, or a top-level value that is not an
    object. Files are visited in sorted order so that equal-score ties always
    resolve to the same file.

    Args:
        alerts_dir: Directory containing alert JSON files (str or Path).

    Returns:
        dict: { "name@version": alert_dict }. Empty when the directory does
        not exist.
    """
    alerts_dir = Path(alerts_dir)
    if not alerts_dir.is_dir():
        return {}

    def risk_score(entry):
        """Return the numeric riskScore of an alert, or 0 when missing/invalid."""
        summary = entry.get("summary")
        score = summary.get("riskScore") if isinstance(summary, dict) else None
        return score if isinstance(score, (int, float)) else 0

    index = {}
    for filepath in sorted(alerts_dir.glob("*.json")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                alert = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            continue
        if not isinstance(alert, dict):
            continue

        target = alert.get("target", "")
        # target format: "ecosystem/package-name@version"
        if not isinstance(target, str) or "/" not in target or "@" not in target:
            continue
        pkg_ver = target.split("/", 1)[1]  # "name@version"

        # Keep the alert with the highest score for each package.
        existing = index.get(pkg_ver)
        if existing is None or risk_score(alert) > risk_score(existing):
            index[pkg_ver] = alert

    return index
243
-
244
-
245
- # ══════════════════════════════════════════════════════════════
246
- # Step 1: Build training dataset
247
- # ══════════════════════════════════════════════════════════════
248
-
249
def load_jsonl(filepath):
    """Read a JSON Lines file, tolerating malformed lines.

    Blank lines are ignored. A line that is not valid JSON is counted, reported
    with a warning on stdout, and skipped.

    Args:
        filepath: Path of the UTF-8 encoded JSONL file.

    Returns:
        (records, skipped): the parsed values in file order and the number of
        malformed lines that were dropped.
    """
    parsed = []
    bad_lines = 0
    with open(filepath, 'r', encoding='utf-8') as handle:
        for number, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    parsed.append(json.loads(text))
                except json.JSONDecodeError:
                    bad_lines += 1
                    print(f" [WARN] Skipping malformed line {number} in {filepath}")
    return parsed, bad_lines
264
-
265
-
266
def build_dataset():
    """Assemble the labelled training set and cache it as JSONL.

    Sources, processed in this order (first occurrence of a (name, version)
    pair wins):
      1. ml-training-merged.jsonl, relabelled through auto-labels.json;
      2. ml-training-datadog-full.jsonl, every record labelled malicious;
      3. confirmed_malicious auto-labels without a record so far, rebuilt from
         the alert index when an alert file exists.

    Exits the process with status 1 when one of the three input files is
    missing. Writes the result to DATASET_CACHE.

    Returns:
        (dataset, stats): the list of record dicts and a dict of per-source
        counters.
    """
    rule = "=" * 70
    print(rule)
    print("[Step 1] Building training dataset from auto-labels + merged JSONL")
    print(rule)

    def require(path):
        """Return `path`, or report it missing on stderr and exit(1)."""
        if not path.is_file():
            print(f"ERROR: {path} not found", file=sys.stderr)
            sys.exit(1)
        return path

    # Auto-labels
    labels_file = require(MUADDIB_DATA / "auto-labels.json")
    with open(labels_file, 'r', encoding='utf-8') as fh:
        auto_labels = json.load(fh)
    labels_map = auto_labels.get("labels", {})
    print(f" Auto-labels loaded: {len(labels_map)} entries")
    print(f" Summary: {auto_labels.get('summary', {})}")

    # Merged JSONL
    merged_records, merged_skipped = load_jsonl(require(MUADDIB_DATA / "ml-training-merged.jsonl"))
    print(f" Merged JSONL: {len(merged_records)} records ({merged_skipped} corrupted, skipped)")

    # Datadog malicious corpus
    datadog_records, datadog_skipped = load_jsonl(require(MUADDIB_DATA / "ml-training-datadog-full.jsonl"))
    print(f" Datadog JSONL: {len(datadog_records)} records ({datadog_skipped} corrupted, skipped)")

    # Alerts, used to recover confirmed_malicious packages lacking a JSONL record
    alert_index = load_alert_index(MUADDIB_ALERTS)
    print(f" Alert index: {len(alert_index)} packages")

    dataset = []
    stats = {
        "confirmed_malicious_jsonl": 0,
        "confirmed_malicious_alert": 0,
        "confirmed_malicious_no_features": 0,
        "likely_malicious_excluded": 0,
        "unconfirmed_as_clean": 0,
        "pending_excluded": 0,
        "clean_no_match": 0,
        "datadog_malicious": 0,
    }
    seen = set()  # (name, version) pairs already decided

    # --- 1. merged records, relabelled through the auto-labels ---
    for rec in merged_records:
        name = rec.get("name", "")
        version = rec.get("version", "")
        ident = (name, version)
        if ident in seen:
            continue

        verdict = labels_map.get(f"{name}@{version}", {}).get("auto_label", "")

        # Ambiguous signal: excluded from training, but still marked as seen.
        if verdict == "likely_malicious":
            stats["likely_malicious_excluded"] += 1
            seen.add(ident)
            continue
        # Too recent for a reliable label: excluded, but still marked as seen.
        if verdict == "pending":
            stats["pending_excluded"] += 1
            seen.add(ident)
            continue

        if verdict == "confirmed_malicious":
            label = "malicious"
            source = "auto-label:confirmed+jsonl"
            counter = "confirmed_malicious_jsonl"
        elif verdict == "unconfirmed":
            # Suspect not confirmed after 7+ days → treated as clean for training
            label = "clean"
            source = "auto-label:unconfirmed"
            counter = "unconfirmed_as_clean"
        else:
            # No auto-label: keep only records whose original label means clean.
            # Anything else is skipped without being marked as seen.
            original = rec.get("label", "")
            if original not in ("clean", "fp", "ml_clean"):
                continue
            label = "clean"
            source = f"original:{original}"
            counter = "clean_no_match"

        rec["label"] = label
        rec["_retrain_source"] = source
        dataset.append(rec)
        stats[counter] += 1
        seen.add(ident)

    # --- 2. Datadog malicious corpus ---
    for rec in datadog_records:
        ident = (rec.get("name", ""), rec.get("version", ""))
        if ident in seen:
            continue
        rec["label"] = "malicious"
        rec["_retrain_source"] = "datadog"
        dataset.append(rec)
        stats["datadog_malicious"] += 1
        seen.add(ident)

    # --- 3. confirmed_malicious labels that only have an alert file ---
    for key, label_info in labels_map.items():
        # Keys are "name@version"; anything else cannot be split.
        if label_info.get("auto_label") != "confirmed_malicious" or "@" not in key:
            continue
        name, version = key.rsplit("@", 1)
        ident = (name, version)
        if ident in seen:
            continue

        alert = alert_index.get(key)
        if alert:
            dataset.append({
                "name": name, "version": version, "ecosystem": "npm",
                "label": "malicious",
                "_retrain_source": "auto-label:confirmed+alert",
                **extract_features_from_alert(alert),
            })
            stats["confirmed_malicious_alert"] += 1
        else:
            stats["confirmed_malicious_no_features"] += 1
        seen.add(ident)

    print(f"\n Dataset construction:")
    for counter, value in stats.items():
        print(f" {counter}: {value}")

    n_malicious = sum(r["label"] == "malicious" for r in dataset)
    n_clean = sum(r["label"] == "clean" for r in dataset)
    print(f"\n Final dataset: {len(dataset)} samples")
    print(f" Malicious: {n_malicious}")
    print(f" Clean: {n_clean}")
    print(f" Ratio (clean/malicious): {n_clean / max(n_malicious, 1):.2f}")

    # Cache the dataset, one compact JSON document per line.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(DATASET_CACHE, 'w', encoding='utf-8') as out:
        for rec in dataset:
            out.write(json.dumps(rec, separators=(',', ':')) + '\n')
    print(f" Cached to {DATASET_CACHE}")

    return dataset, stats
433
-
434
-
435
- # ══════════════════════════════════════════════════════════════
436
- # Step 2: Feature alignment
437
- # ══════════════════════════════════════════════════════════════
438
-
439
def align_features(dataset):
    """Project records onto the FEATURE_NAMES layout and build X, y.

    Missing or None feature values become 0 (not -1) to prevent data leakage:
    XGBoost learns split directions for missing values, so -1 in one source and
    real values in another would act as a source-identity signal.

    Args:
        dataset: Iterable of record dicts, each carrying a "label" key.

    Returns:
        (X, y): a DataFrame with one float column per FEATURE_NAMES entry and
        an int array where 1 means label == "malicious" and 0 anything else.
    """
    print("\n" + "=" * 70)
    print("[Step 2] Aligning 87 features (0 for missing — no leakage)")
    print("=" * 70)

    def as_number(value):
        """Coerce a raw feature value to float, mapping None to 0."""
        return float(0 if value is None else value)

    matrix = []
    targets = []
    for rec in dataset:
        matrix.append([as_number(rec.get(column, 0)) for column in FEATURE_NAMES])
        targets.append(int(rec["label"] == "malicious"))

    X = pd.DataFrame(matrix, columns=FEATURE_NAMES)
    y = np.array(targets, dtype=int)

    n_clean = int((y == 0).sum())
    n_malicious = int((y == 1).sum())
    print(f" Feature matrix: {X.shape[0]} x {X.shape[1]}")
    print(f" Class distribution: {n_clean} clean, {n_malicious} malicious")

    return X, y
470
-
471
-
472
def filter_leaky_features(X, y, min_coverage=0.001):
    """Drop dead features and source-identity leaks.

    A feature is removed when it is:
      - DEAD: non-zero in fewer than `min_coverage` of all samples;
      - LEAKY: non-zero in >= 99% of one class and in fewer than
        `min_coverage` of the other (a proxy for the data source rather than
        a malware signal).

    Ported from train-xgboost.py filter_leaky_features().

    Args:
        X: Feature DataFrame containing every FEATURE_NAMES column.
        y: Label array aligned with X (0 = clean, 1 = malicious).
        min_coverage: Minimum non-zero fraction, 0.001 (0.1%) by default.

    Returns:
        (X_active, active): X restricted to the retained columns, and the list
        of retained feature names in FEATURE_NAMES order.
    """
    print("\n" + "=" * 70)
    print("[Step 2b] Filtering dead / leaky features")
    print("=" * 70)

    clean_rows = y == 0
    malicious_rows = y == 1
    n_clean = int(clean_rows.sum())
    n_malicious = int(malicious_rows.sum())
    n_all = n_clean + n_malicious

    active, dead, leaky = [], [], []

    for name in FEATURE_NAMES:
        # Fraction of non-zero values per class and overall; max(..., 1)
        # keeps the division defined when a class is empty.
        clean_rate = float((X.loc[clean_rows, name] != 0).sum()) / max(n_clean, 1)
        malicious_rate = float((X.loc[malicious_rows, name] != 0).sum()) / max(n_malicious, 1)
        overall_rate = float((X[name] != 0).sum()) / max(n_all, 1)

        only_clean = clean_rate >= 0.99 and malicious_rate < min_coverage
        only_malicious = malicious_rate >= 0.99 and clean_rate < min_coverage

        if overall_rate < min_coverage:
            dead.append(name)
        elif only_clean or only_malicious:
            leaky.append(name)
        else:
            active.append(name)

    if dead:
        print(f" DEAD ({len(dead)}): {', '.join(dead)}")
    if leaky:
        print(f" LEAKY ({len(leaky)}): {', '.join(leaky)}")
    print(f" Active: {len(active)} / {len(FEATURE_NAMES)}")

    return X[active], active
517
-
518
-
519
- # ══════════════════════════════════════════════════════════════
520
- # Step 3: Grid search + training
521
- # ══════════════════════════════════════════════════════════════
522
-
523
def grid_search(X_train, y_train, active_features, scale_pos_weight):
    """Grid search over PARAM_GRID with 3-fold stratified CV.

    Each combination is scored by the mean F1 (threshold 0.5) over the folds.
    Every fold model is trained with early stopping (20 rounds) evaluated on
    that fold's validation split.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training label array aligned with X_train.
        active_features: Feature names passed to xgb.DMatrix.
        scale_pos_weight: Class-imbalance weight forwarded to XGBoost.

    Returns:
        (best_params, results): the best parameter dict and the list of all
        evaluated combinations (each with a 'mean_f1' key), sorted by
        descending F1. `best_params` is always set when the grid is non-empty,
        even if every combination scores F1 == 0.
    """
    print("\n" + "=" * 70)
    print("[Step 3] Grid search (3-fold CV)")
    print("=" * 70)

    param_combinations = list(ParameterGrid(PARAM_GRID))
    print(f" {len(param_combinations)} combinations to evaluate")

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    best_f1 = 0.0
    best_params = None
    results = []

    for i, params in enumerate(param_combinations):
        xgb_params = {
            **XGB_BASE_PARAMS,
            'max_depth': params['max_depth'],
            'learning_rate': params['learning_rate'],
            'scale_pos_weight': scale_pos_weight,
        }
        n_est = params['n_estimators']

        fold_f1s = []
        for train_idx, val_idx in skf.split(X_train, y_train):
            X_tr = X_train.iloc[train_idx]
            X_va = X_train.iloc[val_idx]
            y_tr = y_train[train_idx]
            y_va = y_train[val_idx]

            dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=active_features)
            dval = xgb.DMatrix(X_va, label=y_va, feature_names=active_features)

            model = xgb.train(
                xgb_params, dtrain, num_boost_round=n_est,
                evals=[(dval, 'val')], verbose_eval=False,
                early_stopping_rounds=20
            )

            probs = model.predict(dval)
            preds = (probs >= 0.5).astype(int)
            fold_f1s.append(f1_score(y_va, preds, zero_division=0))

        mean_f1 = float(np.mean(fold_f1s))
        results.append({**params, 'mean_f1': mean_f1})

        # The first combination always becomes the provisional best. Without
        # this, a run where every F1 is 0 would leave best_params as None and
        # crash the later stages that subscript it.
        is_best = best_params is None or mean_f1 > best_f1
        marker = " ← BEST" if is_best else ""
        if is_best:
            best_f1 = mean_f1
            best_params = params

        # Log the first combination, every 9th, and anything within 0.001 of the best.
        if (i + 1) % 9 == 0 or i == 0 or mean_f1 > best_f1 - 0.001:
            print(f" [{i + 1:2d}/{len(param_combinations)}] "
                  f"depth={params['max_depth']} est={params['n_estimators']} "
                  f"lr={params['learning_rate']} → F1={mean_f1:.4f}{marker}")

    print(f"\n Best params: {best_params} (F1={best_f1:.4f})")

    # Sort all results by F1, best first.
    results.sort(key=lambda x: x['mean_f1'], reverse=True)

    return best_params, results
585
-
586
-
587
def train_final(X_train, y_train, active_features, best_params, scale_pos_weight):
    """Fit the final booster using the best grid-search parameters.

    The training data is split 90/10 (stratified, seed 42). The 10% part is
    used only as the early-stopping evaluation set (20 rounds of patience).

    Args:
        X_train: Training feature DataFrame.
        y_train: Training label array aligned with X_train.
        active_features: Feature names passed to xgb.DMatrix.
        best_params: Dict with 'max_depth', 'learning_rate' and 'n_estimators'.
        scale_pos_weight: Class-imbalance weight forwarded to XGBoost.

    Returns:
        The trained booster returned by xgb.train.
    """
    print("\n" + "=" * 70)
    print("[Step 4] Training final model with best params")
    print("=" * 70)

    booster_params = dict(
        XGB_BASE_PARAMS,
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        scale_pos_weight=scale_pos_weight,
    )
    max_rounds = best_params['n_estimators']

    # Internal 90/10 split: the smaller part drives early stopping only.
    fit_X, stop_X, fit_y, stop_y = train_test_split(
        X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
    )
    fit_matrix = xgb.DMatrix(fit_X, label=fit_y, feature_names=active_features)
    stop_matrix = xgb.DMatrix(stop_X, label=stop_y, feature_names=active_features)

    model = xgb.train(
        booster_params,
        fit_matrix,
        num_boost_round=max_rounds,
        evals=[(stop_matrix, 'early_stop')],
        verbose_eval=False,
        early_stopping_rounds=20,
    )

    # Fall back to the requested round count if the booster exposes no best_iteration.
    best_round = getattr(model, 'best_iteration', max_rounds)
    print(f" Best iteration: {best_round}")

    return model
618
-
619
-
620
def optimize_threshold(model, X_train, y_train, active_features,
                       best_params=None, scale_pos_weight=1.0):
    """Pick the decision threshold from out-of-fold predictions (5-fold CV).

    A fresh model is trained per fold with `best_params`; its predictions on
    the held-out fold are collected for every training sample. Thresholds from
    0.10 to 0.90 (step 0.01) are then swept and the one with the highest
    precision among those reaching recall >= 93.9% is returned. If none
    qualifies, 0.5 is used.

    Args:
        model: Unused here; kept for interface compatibility.
        X_train: Training feature DataFrame.
        y_train: Training label array aligned with X_train.
        active_features: Feature names passed to xgb.DMatrix.
        best_params: Dict with 'max_depth', 'learning_rate', 'n_estimators'.
            Required despite the None default.
        scale_pos_weight: Class-imbalance weight forwarded to XGBoost.

    Returns:
        (threshold, metrics): the chosen threshold (rounded to 2 decimals) and
        a dict with out-of-fold 'precision', 'recall' and 'f1' at it.

    Raises:
        ValueError: if `best_params` is None.
    """
    if best_params is None:
        # Fail with a clear message instead of "'NoneType' is not subscriptable".
        raise ValueError("optimize_threshold() requires best_params")

    print("\n" + "=" * 70)
    print("[Step 5] Threshold optimization (5-fold CV, recall >= 93.9%)")
    print("=" * 70)

    # Identical for every fold, so built once.
    fold_params = {
        **XGB_BASE_PARAMS,
        'max_depth': best_params['max_depth'],
        'learning_rate': best_params['learning_rate'],
        'scale_pos_weight': scale_pos_weight,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    all_probs = np.zeros(len(y_train))

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr = X_train.iloc[train_idx]
        X_va = X_train.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_va = y_train[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=active_features)
        dval = xgb.DMatrix(X_va, label=y_va, feature_names=active_features)

        fold_model = xgb.train(
            fold_params,
            dtrain, num_boost_round=best_params['n_estimators'],
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )
        all_probs[val_idx] = fold_model.predict(dval)

    # Sweep thresholds; None means "no threshold met the recall floor yet".
    best_threshold = None
    best_precision = 0.0

    for raw_t in np.arange(0.10, 0.91, 0.01):
        # np.arange accumulates float error (e.g. 0.15000000000000002); round
        # so the evaluated and returned thresholds are clean 2-decimal values.
        t = round(float(raw_t), 2)
        preds = (all_probs >= t).astype(int)
        r = recall_score(y_train, preds, zero_division=0)
        p = precision_score(y_train, preds, zero_division=0)
        if r >= 0.939 and p > best_precision:
            best_precision = p
            best_threshold = t

    if best_threshold is None:
        print(" [WARN] No threshold achieves recall >= 93.9%, using 0.5")
        best_threshold = 0.5

    preds = (all_probs >= best_threshold).astype(int)
    p = float(precision_score(y_train, preds, zero_division=0))
    r = float(recall_score(y_train, preds, zero_division=0))
    f1 = float(f1_score(y_train, preds, zero_division=0))

    print(f" Optimal threshold: {best_threshold:.3f}")
    print(f" CV metrics: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    return best_threshold, {'precision': p, 'recall': r, 'f1': f1}
678
-
679
-
680
- # ══════════════════════════════════════════════════════════════
681
- # Step 6: Evaluate on holdout
682
- # ══════════════════════════════════════════════════════════════
683
-
684
def evaluate_holdout(model, X_test, y_test, active_features, threshold):
    """Evaluate the trained model on the held-out test set.

    Prints precision, recall, F1, AUC-ROC, FPR, the confusion matrix and the
    top 20 features by gain.

    Args:
        model: Trained booster (must provide predict() and get_score()).
        X_test: Holdout feature DataFrame.
        y_test: Holdout label array (0 = clean, 1 = malicious).
        active_features: Feature names passed to xgb.DMatrix.
        threshold: Probability at or above which a sample is predicted malicious.

    Returns:
        (metrics, probs): a dict of rounded metrics, raw confusion counts and
        the top-20 feature list, plus the predicted probabilities.
    """
    print("\n" + "=" * 70)
    print(f"[Step 6] Holdout evaluation (threshold={threshold:.3f})")
    print("=" * 70)

    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=active_features)
    probs = model.predict(dtest)

    preds = (probs >= threshold).astype(int)
    p = precision_score(y_test, preds, zero_division=0)
    r = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)
    # Pin the label set so the matrix is always 2x2. Without `labels`, a
    # holdout where only one class appears yields a 1x1 matrix and the
    # four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    tn, fp_count, fn, tp = cm.ravel()

    # AUC-ROC is undefined when y_test holds a single class; report 0.0 then.
    try:
        auc = roc_auc_score(y_test, probs)
    except ValueError:
        auc = 0.0

    # FPR / TPR, with max(..., 1) guarding empty classes.
    fpr = fp_count / max(fp_count + tn, 1)
    tpr = tp / max(tp + fn, 1)

    print(f" Precision: {p:.4f}")
    print(f" Recall/TPR: {r:.4f}")
    print(f" F1: {f1:.4f}")
    print(f" AUC-ROC: {auc:.4f}")
    print(f" FPR: {fpr:.4f}")
    print(f" Confusion matrix:")
    print(f" TN={tn:>6d} FP={fp_count:>6d}")
    print(f" FN={fn:>6d} TP={tp:>6d}")

    # Feature importance by gain, highest first.
    importance = model.get_score(importance_type='gain')
    sorted_imp = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print(f"\n Top 20 features (gain):")
    for i, (name, val) in enumerate(sorted_imp[:20]):
        print(f" {i + 1:2d}. {name:40s} {val:.4f}")

    return {
        'precision': round(float(p), 4),
        'recall': round(float(r), 4),
        'f1': round(float(f1), 4),
        'auc_roc': round(float(auc), 4),
        'fpr': round(float(fpr), 4),
        'tpr': round(float(tpr), 4),
        'confusion_matrix': cm.tolist(),
        'tp': int(tp), 'fp': int(fp_count),
        'fn': int(fn), 'tn': int(tn),
        'top_20_features': [(name, round(val, 4)) for name, val in sorted_imp[:20]],
    }, probs
738
-
739
-
740
- # ══════════════════════════════════════════════════════════════
741
- # Step 7: Export
742
- # ══════════════════════════════════════════════════════════════
743
-
744
def save_confusion_matrix_png(y_test, preds, output_path):
    """Render the confusion matrix of `preds` vs `y_test` to a PNG file.

    matplotlib is an optional dependency: when it cannot be imported, a
    warning is printed and nothing is written.

    Args:
        y_test: True labels (0 = clean, 1 = malicious).
        preds: Predicted labels aligned with y_test.
        output_path: Destination of the PNG file.
    """
    # Only the optional import is guarded; rendering errors still propagate.
    try:
        import matplotlib
        matplotlib.use('Agg')  # file-only backend, no display required
        import matplotlib.pyplot as plt
    except ImportError:
        print(" [WARN] matplotlib not available — skipping confusion matrix PNG")
        return

    # Pin the label set so the matrix is 2x2 even if one class is absent;
    # otherwise the cm[i, j] annotations below raise IndexError.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
    fig.colorbar(im, ax=ax)

    classes = ['Clean (0)', 'Malicious (1)']
    ax.set(xticks=[0, 1], yticks=[0, 1],
           xticklabels=classes, yticklabels=classes,
           ylabel='True label', xlabel='Predicted label',
           title='MUAD\'DIB Retrained Model — Confusion Matrix')

    # Cell counts: white text on dark cells, black text on light ones.
    for i in range(2):
        for j in range(2):
            ax.text(j, i, f'{cm[i, j]:,}',
                    ha='center', va='center',
                    color='white' if cm[i, j] > cm.max() / 2 else 'black',
                    fontsize=16)

    fig.tight_layout()
    fig.savefig(output_path, dpi=150)
    # Close this figure explicitly rather than whichever one is "current".
    plt.close(fig)
    print(f" Confusion matrix saved to {output_path}")
776
-
777
-
778
def convert_tree(tree_json, nodes, feature_map):
    """Flatten one XGBoost JSON tree into `nodes` (pre-order) and return its index.

    Each flattened node is a dict with keys:
      'f': feature index (-1 for a leaf, or for a split name missing from
           `feature_map`),
      't': split threshold rounded to 6 decimals (0 for a leaf),
      'y' / 'n': indices in `nodes` of the yes / no child (0 for a leaf),
      'v': leaf value rounded to 6 decimals (0 for a split node).

    Args:
        tree_json: Node dict from the XGBoost JSON dump.
        nodes: Output list, appended to in place.
        feature_map: Mapping from feature name to feature index.

    Returns:
        Index in `nodes` of the node created for `tree_json`.
    """
    slot = len(nodes)
    nodes.append(None)  # reserve this node's position before descending

    if 'leaf' in tree_json:
        nodes[slot] = {'f': -1, 't': 0, 'y': 0, 'n': 0,
                       'v': round(tree_json['leaf'], 6)}
        return slot

    feature_idx = feature_map.get(tree_json.get('split', ''), -1)
    threshold = tree_json.get('split_condition', 0)
    kids = tree_json.get('children', [])
    yes_id = tree_json.get('yes', 0)
    no_id = tree_json.get('no', 0)

    # Match children by node id; the yes branch takes precedence on a clash.
    yes_branch = None
    no_branch = None
    for kid in kids:
        kid_id = kid.get('nodeid')
        if kid_id == yes_id:
            yes_branch = kid
        elif kid_id == no_id:
            no_branch = kid

    # Positional fallback when the ids did not match anything.
    if yes_branch is None and kids:
        yes_branch = kids[0]
    if no_branch is None and len(kids) > 1:
        no_branch = kids[1]

    # A missing branch points back at this node. The yes subtree is emitted first.
    yes_idx = slot
    if yes_branch:
        yes_idx = convert_tree(yes_branch, nodes, feature_map)
    no_idx = slot
    if no_branch:
        no_idx = convert_tree(no_branch, nodes, feature_map)

    nodes[slot] = {'f': feature_idx, 't': round(threshold, 6),
                   'y': yes_idx, 'n': no_idx, 'v': 0}
    return slot
811
-
812
-
813
def export_model_js(model, features, threshold, cv_metrics, holdout_metrics, output_path):
    """Export the trained model as a CommonJS module in model-trees.js format.

    Args:
        model: Object exposing ``get_dump(dump_format='json')``, which returns
            one JSON string per tree.
        features: Ordered feature names; a name's position is the feature
            index written into the tree nodes.
        threshold: Decision threshold stored alongside the trees.
        cv_metrics: Dict providing 'precision', 'recall' and 'f1' (written
            into the header comment).
        holdout_metrics: Dict providing 'precision', 'recall', 'f1' and
            'auc_roc' (written into the header comment).
        output_path: Destination file; ``str`` and ``Path`` are both accepted.
    """
    # The size report below needs Path.stat(); previously a str path crashed
    # here after the file had already been written.
    output_path = Path(output_path)

    print("\n" + "=" * 70)
    print(f"[Step 7] Exporting model to {output_path}")
    print("=" * 70)

    feature_map = {name: idx for idx, name in enumerate(features)}

    js_trees = []
    for tree_str in model.get_dump(dump_format='json'):
        nodes = []
        convert_tree(json.loads(tree_str), nodes, feature_map)
        js_trees.append(nodes)
    total_nodes = sum(len(nodes) for nodes in js_trees)

    js_model = {
        'version': 1,
        'features': features,
        # Builtin float so the value is JSON-serializable even if a caller
        # hands in a non-float numeric scalar (NOTE(review): the type that
        # optimize_threshold returns is not visible here).
        'threshold': float(threshold),
        'trees': js_trees,
    }

    now = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    js_content = "\n".join([
        "'use strict';",
        "",
        "/**",
        f" * XGBoost model trees — auto-generated by ml-retrain/retrain.py ({now})",
        f" * {len(js_trees)} trees, {len(features)} features, threshold={threshold}",
        f" * CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}",
        f" * Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}",
        f" * AUC-ROC: {holdout_metrics['auc_roc']:.3f}",
        " * DO NOT EDIT MANUALLY",
        " */",
        "",
        f"module.exports = {json.dumps(js_model, separators=(',', ':'))};",
        "",
    ])

    with output_path.open('w', encoding='utf-8') as f:
        f.write(js_content)

    size_kb = output_path.stat().st_size / 1024
    print(f" Trees: {len(js_trees)}, nodes: {total_nodes}")
    print(f" Features: {len(features)}, threshold: {threshold:.3f}")
    print(f" File: {size_kb:.1f} KB")
857
-
858
-
859
def save_report(dataset_stats, best_params, grid_results, cv_metrics,
                holdout_metrics, active_features):
    """Write the retrain summary to ``REPORT_PATH`` as indented JSON.

    The report holds a UTC generation timestamp, the dataset stats, the best
    hyperparameters, the first five grid-search results, the CV and holdout
    metrics, and the list of active features. Values that ``json`` cannot
    encode natively are written via ``str()``.
    """
    generated_at = datetime.now(timezone.utc).isoformat()
    top_results = grid_results[:5]
    summary = dict(
        generated_at=generated_at,
        dataset=dataset_stats,
        best_hyperparams=best_params,
        grid_search_top5=top_results,
        cv_metrics=cv_metrics,
        holdout_metrics=holdout_metrics,
        active_features=active_features,
    )
    with open(REPORT_PATH, 'w', encoding='utf-8') as out:
        json.dump(summary, out, indent=2, default=str)
    print(f" Report saved to {REPORT_PATH}")
874
-
875
-
876
- # ══════════════════════════════════════════════════════════════
877
- # Main
878
- # ══════════════════════════════════════════════════════════════
879
-
880
def run_full(dataset=None, dataset_stats=None):
    """Run the retrain pipeline (steps 1-7) and print a summary.

    Args:
        dataset: Optional pre-built dataset. When ``None`` (the default) the
            dataset and its stats are produced by ``build_dataset()``.
        dataset_stats: Stats stored in the report when ``dataset`` is given;
            ignored otherwise. Defaults to an empty dict in that case. It is
            only forwarded to ``save_report``.
    """
    start = time.time()

    # Step 1: Build dataset (skipped when the caller supplies one)
    if dataset is None:
        dataset, dataset_stats = build_dataset()
    elif dataset_stats is None:
        dataset_stats = {}

    # Step 2: Feature alignment
    X, y = align_features(dataset)
    X, active_features = filter_leaky_features(X, y)

    # Stratified 80/20 split
    print("\n Stratified 80/20 split (seed=42)...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    print(f" Train: {len(X_train)} ({int((y_train == 0).sum())} clean, "
          f"{int((y_train == 1).sum())} malicious)")
    print(f" Test: {len(X_test)} ({int((y_test == 0).sum())} clean, "
          f"{int((y_test == 1).sum())} malicious)")

    # Ratio of clean to malicious training samples; the max(..., 1) avoids a
    # division by zero when there are no malicious samples.
    scale_pos_weight = float((y_train == 0).sum()) / max(float((y_train == 1).sum()), 1)
    print(f" scale_pos_weight: {scale_pos_weight:.2f}")

    # Step 3: Grid search
    best_params, grid_results = grid_search(
        X_train, y_train, active_features, scale_pos_weight)

    # Step 4: Train final model
    model = train_final(X_train, y_train, active_features, best_params, scale_pos_weight)

    # Step 5: Threshold optimization
    threshold, cv_metrics = optimize_threshold(
        model, X_train, y_train, active_features,
        best_params=best_params, scale_pos_weight=scale_pos_weight)

    # Step 6: Holdout evaluation
    holdout_metrics, probs = evaluate_holdout(
        model, X_test, y_test, active_features, threshold)

    # Step 7: Export
    export_model_js(model, active_features, threshold,
                    cv_metrics, holdout_metrics, MODEL_OUTPUT)

    preds = (probs >= threshold).astype(int)
    save_confusion_matrix_png(y_test, preds, CONFUSION_MATRIX_PATH)
    save_report(dataset_stats, best_params, grid_results,
                cv_metrics, holdout_metrics, active_features)

    elapsed = time.time() - start
    print("\n" + "=" * 70)
    print(f"RETRAIN COMPLETE ({elapsed:.0f}s)")
    print("=" * 70)
    print(f" Dataset: {len(dataset)} samples")
    print(f" Features: {len(active_features)}")
    print(f" Best: depth={best_params['max_depth']} "
          f"est={best_params['n_estimators']} lr={best_params['learning_rate']}")
    print(f" Threshold: {threshold:.3f}")
    print(f" Holdout: P={holdout_metrics['precision']:.3f} "
          f"R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f} "
          f"AUC={holdout_metrics['auc_roc']:.3f}")
    print(f" Model: {MODEL_OUTPUT}")
    print(f" Report: {REPORT_PATH}")


def main():
    """Parse the command line and run the selected retrain mode.

    Exactly one of ``--full``, ``--build-dataset`` or ``--train-only`` is
    required. ``--data-dir`` overrides ``MUADDIB_DATA`` and re-derives
    ``MUADDIB_ALERTS`` from it. ``--train-only`` exits with status 1 when the
    cached dataset file does not exist.
    """
    global MUADDIB_DATA, MUADDIB_ALERTS

    parser = argparse.ArgumentParser(description="MUAD'DIB ML Retrain")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--full', action='store_true', help='Run all steps')
    group.add_argument('--build-dataset', action='store_true', help='Step 1 only')
    group.add_argument('--train-only', action='store_true',
                       help='Train from cached dataset')
    parser.add_argument('--data-dir', help='Override MUADDIB_DATA path')
    args = parser.parse_args()

    if args.data_dir:
        MUADDIB_DATA = Path(args.data_dir)
        MUADDIB_ALERTS = MUADDIB_DATA.parent / "logs" / "alerts"

    if args.full:
        run_full()
    elif args.build_dataset:
        build_dataset()
    elif args.train_only:
        if not DATASET_CACHE.is_file():
            print(f"ERROR: Cached dataset not found at {DATASET_CACHE}", file=sys.stderr)
            print("Run --build-dataset or --full first", file=sys.stderr)
            sys.exit(1)
        records, _ = load_jsonl(DATASET_CACHE)
        # Placeholder stats for the report, since build_dataset() is skipped.
        cached_stats = {
            'source': 'cache',
            'cache_path': str(DATASET_CACHE),
            'total': len(records),
        }
        # Train from the cached records. Previously they were loaded and then
        # discarded, and run_full() rebuilt the dataset from scratch.
        # NOTE(review): assumes the cached records have the same shape as the
        # dataset returned by build_dataset() — confirm.
        run_full(dataset=records, dataset_stats=cached_stats)
971
-
972
-
973
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()