muaddib-scanner 2.11.76 → 2.11.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-commit +18 -0
- package/README.md +15 -6
- package/bin/muaddib.js +18 -4
- package/package.json +1 -2
- package/{self-scan-v2.11.76.json → self-scan-v2.11.78.json} +1 -1
- package/src/commands/interactive.js +5 -6
- package/src/commands/safe-install.js +19 -19
- package/src/ioc/scraper.js +46 -10
- package/src/monitor/daemon.js +39 -28
- package/src/monitor/ingestion.js +32 -2
- package/src/monitor/queue.js +84 -21
- package/src/monitor/scan-queue.js +68 -1
- package/src/monitor/state.js +24 -1
- package/src/monitor/webhook.js +32 -11
- package/src/output/formatter.js +3 -4
- package/src/pipeline/executor.js +9 -1
- package/src/runtime/daemon.js +27 -28
- package/src/runtime/watch.js +7 -7
- package/src/sandbox/index.js +11 -9
- package/src/scanner/temporal-analysis.js +8 -0
- package/src/scanner/temporal-ast-diff.js +5 -0
- package/src/utils.js +60 -1
- package/.dockerignore +0 -7
- package/.env.example +0 -43
- package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
- package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
- package/ml-retrain/auto-labeler/labeler.py +0 -256
- package/ml-retrain/auto-labeler/npm_checker.py +0 -228
- package/ml-retrain/auto-labeler/ossf_index.py +0 -178
- package/ml-retrain/auto-labeler/requirements.txt +0 -1
- package/ml-retrain/confusion-matrix.png +0 -0
- package/ml-retrain/model-trees-retrained.js +0 -12
- package/ml-retrain/retrain-report.json +0 -225
- package/ml-retrain/retrain.py +0 -974
- package/sbom.json +0 -0
- package/src/ml/train-bundler-detector.py +0 -725
- package/src/ml/train-xgboost.py +0 -957
- package/tools/export-model-js.py +0 -160
- package/tools/requirements-ml.txt +0 -5
- package/tools/train-classifier.py +0 -333
package/ml-retrain/retrain.py
DELETED
|
@@ -1,974 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
MUAD'DIB ML Retrain — Auto-Label Ground Truth
|
|
4
|
-
|
|
5
|
-
Builds training dataset by crossing auto-labels.json with ml-training-merged.jsonl,
|
|
6
|
-
adds Datadog malicious corpus, trains XGBoost with grid search, exports model + report.
|
|
7
|
-
|
|
8
|
-
Usage:
|
|
9
|
-
python ml-retrain/retrain.py --full
|
|
10
|
-
python ml-retrain/retrain.py --build-dataset # Step 1 only
|
|
11
|
-
python ml-retrain/retrain.py --train-only # Skip dataset build, use cached
|
|
12
|
-
|
|
13
|
-
Environment:
|
|
14
|
-
MUADDIB_DATA Override data directory (default: /opt/muaddib/data)
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
import argparse
|
|
18
|
-
import json
|
|
19
|
-
import os
|
|
20
|
-
import sys
|
|
21
|
-
import time
|
|
22
|
-
from datetime import datetime, timezone
|
|
23
|
-
from pathlib import Path
|
|
24
|
-
|
|
25
|
-
import numpy as np
|
|
26
|
-
import pandas as pd
|
|
27
|
-
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
|
|
28
|
-
from sklearn.metrics import (
|
|
29
|
-
precision_score, recall_score, f1_score, confusion_matrix,
|
|
30
|
-
roc_auc_score, precision_recall_curve, roc_curve
|
|
31
|
-
)
|
|
32
|
-
import xgboost as xgb
|
|
33
|
-
|
|
34
|
-
# ── Paths ──
# Data directory; the MUADDIB_DATA environment variable overrides the default.
MUADDIB_DATA = Path(os.getenv("MUADDIB_DATA", "/opt/muaddib/data"))
# Alert files are looked up in logs/alerts beside the data directory.
MUADDIB_ALERTS = MUADDIB_DATA.parent.joinpath("logs", "alerts")

# Every generated artifact is written into the directory holding this script.
BASE_DIR = Path(__file__).parent
OUTPUT_DIR = BASE_DIR
DATASET_CACHE = OUTPUT_DIR.joinpath("retrain-dataset.jsonl")
REPORT_PATH = OUTPUT_DIR.joinpath("retrain-report.json")
MODEL_OUTPUT = OUTPUT_DIR.joinpath("model-trees-retrained.js")
CONFUSION_MATRIX_PATH = OUTPUT_DIR.joinpath("confusion-matrix.png")
|
|
43
|
-
|
|
44
|
-
# ── 87 hardcoded features — exact copy from train-xgboost.py ──
# Record fields that are not model features (none of them is in FEATURE_NAMES).
IDENTITY_COLS = {'name', 'version', 'ecosystem', 'timestamp', 'label', 'tier'}

# Column order matters: it defines the layout of the feature matrix.
FEATURE_NAMES = [
    # Scores
    'score',
    'max_file_score',
    'package_score',
    'global_risk_score',
    # Severity counts
    'count_total',
    'count_critical',
    'count_high',
    'count_medium',
    'count_low',
    'distinct_threat_types',
    # Per-threat-type counts
    'type_suspicious_dataflow',
    'type_env_access',
    'type_sensitive_string',
    'type_dangerous_call_eval',
    'type_dangerous_call_exec',
    'type_dangerous_call_function',
    'type_obfuscation_detected',
    'type_high_entropy_string',
    'type_dynamic_require',
    'type_dynamic_import',
    'type_lifecycle_script',
    'type_typosquat_detected',
    'type_staged_payload',
    'type_staged_binary_payload',
    'type_network_require',
    'type_sandbox_evasion',
    'type_credential_regex_harvest',
    'type_remote_code_load',
    'type_suspicious_domain',
    'type_prototype_hook',
    'type_intent_credential_exfil',
    'type_intent_command_exfil',
    'type_cross_file_dataflow',
    'type_module_compile',
    'type_crypto_decipher',
    'type_env_charcode_reconstruction',
    'type_lifecycle_shell_pipe',
    'type_curl_exec',
    'type_reverse_shell',
    'type_binary_dropper',
    'type_mcp_config_injection',
    'type_vm_code_execution',
    'type_vm_dynamic_code',
    'type_dangerous_constructor',
    'type_module_load_bypass',
    'type_require_process_mainmodule',
    'type_proxy_globalthis_intercept',
    'type_reflect_bind_code_execution',
    'type_known_malicious_package',
    'type_known_malicious_hash',
    'type_unicode_invisible_injection',
    'type_blockchain_c2_resolution',
    'type_dangerous_exec',
    'type_node_inline_exec',
    'type_js_obfuscation_pattern',
    'type_suspicious_module_sink',
    'type_wasm_host_sink',
    'type_other',
    # Boolean behavioral signals
    'has_lifecycle_script',
    'has_network_access',
    'has_obfuscation',
    'has_env_access',
    'has_eval',
    'has_staged_payload',
    'has_typosquat',
    'has_ioc_match',
    'has_intent_pair',
    'has_sandbox_finding',
    # File distribution
    'file_count_with_threats',
    'file_score_mean',
    'file_score_max',
    # Severity concentration
    'severity_ratio_high',
    'max_single_points',
    'points_concentration',
    # Package / registry metadata
    'unpacked_size_bytes',
    'dep_count',
    'dev_dep_count',
    'reputation_factor',
    'package_age_days',
    'weekly_downloads',
    'version_count',
    'author_package_count',
    'has_repository',
    'readme_size',
    'file_count_total',
    'has_tests',
    'threat_density',
]
assert len(FEATURE_NAMES) == 87
|
|
85
|
-
|
|
86
|
-
# ── Grid search param space ──
# Every combination of these three lists is evaluated by grid_search().
PARAM_GRID = dict(
    max_depth=[4, 6, 8],
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
)

# Booster parameters shared by every training run; the tuned values
# (max_depth, learning_rate, scale_pos_weight) are layered on top per run.
XGB_BASE_PARAMS = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    seed=42,
    verbosity=0,
)
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
# ── TOP_THREAT_TYPES — mirrors feature-extractor.js ──
# Threat types that get a dedicated `type_<name>` feature; any other type is
# folded into `type_other`. Order matches the type_* run in FEATURE_NAMES.
TOP_THREAT_TYPES = """
    suspicious_dataflow env_access sensitive_string
    dangerous_call_eval dangerous_call_exec dangerous_call_function
    obfuscation_detected high_entropy_string dynamic_require
    dynamic_import lifecycle_script typosquat_detected staged_payload
    staged_binary_payload network_require sandbox_evasion
    credential_regex_harvest remote_code_load suspicious_domain
    prototype_hook intent_credential_exfil intent_command_exfil
    cross_file_dataflow module_compile crypto_decipher
    env_charcode_reconstruction lifecycle_shell_pipe curl_exec
    reverse_shell binary_dropper mcp_config_injection
    vm_code_execution vm_dynamic_code dangerous_constructor
    module_load_bypass require_process_mainmodule
    proxy_globalthis_intercept reflect_bind_code_execution
    known_malicious_package known_malicious_hash
    unicode_invisible_injection blockchain_c2_resolution
    dangerous_exec node_inline_exec js_obfuscation_pattern
    suspicious_module_sink wasm_host_sink
""".split()
# Set form for O(1) membership tests.
TOP_THREAT_TYPES_SET = set(TOP_THREAT_TYPES)
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def extract_features_from_alert(alert):
    """Derive the 87 ML features from one parsed alert JSON document.

    Python port of feature-extractor.js. It rebuilds feature vectors for
    confirmed_malicious packages that have an alert file but no JSONL
    record. Registry metadata is not present in alerts, so those features
    are filled with fixed defaults (0, or 1.0 for reputation_factor).

    Args:
        alert: dict with an optional "summary" dict and "threats" list.

    Returns:
        dict mapping feature name to value, inserted in feature order.
    """
    summary = alert.get("summary", {})
    threats = alert.get("threats", [])

    # Scoring and severity counts are copied straight from the summary.
    feat = {
        'score': summary.get('riskScore', 0),
        'max_file_score': summary.get('maxFileScore', 0),
        'package_score': summary.get('packageScore', 0),
        'global_risk_score': summary.get('globalRiskScore', 0),
        'count_total': summary.get('total', 0),
        'count_critical': summary.get('critical', 0),
        'count_high': summary.get('high', 0),
        'count_medium': summary.get('medium', 0),
        'count_low': summary.get('low', 0),
    }

    # Occurrences per threat type; threats without a type are ignored.
    type_counts = {}
    for threat in threats:
        kind = threat.get('type', '')
        if not kind:
            continue
        type_counts[kind] = type_counts.get(kind, 0) + 1

    # The keys of type_counts are exactly the distinct non-empty types.
    feat['distinct_threat_types'] = len(type_counts)

    # One counter per tracked type; everything else lands in type_other.
    for kind in TOP_THREAT_TYPES:
        feat[f'type_{kind}'] = type_counts.get(kind, 0)
    feat['type_other'] = sum(
        count for kind, count in type_counts.items()
        if kind not in TOP_THREAT_TYPES_SET
    )

    # Boolean behavioral signals: 1 when any listed type is present.
    present = set(type_counts)
    signal_groups = (
        ('has_lifecycle_script', {'lifecycle_script', 'lifecycle_shell_pipe'}),
        ('has_network_access', {'network_require', 'remote_code_load', 'curl_exec', 'suspicious_dataflow'}),
        ('has_obfuscation', {'obfuscation_detected', 'high_entropy_string', 'js_obfuscation_pattern'}),
        ('has_env_access', {'env_access', 'env_charcode_reconstruction'}),
        ('has_eval', {'dangerous_call_eval', 'dangerous_call_function'}),
        ('has_staged_payload', {'staged_payload', 'staged_binary_payload'}),
        ('has_typosquat', {'typosquat_detected', 'pypi_typosquat_detected'}),
        ('has_ioc_match', {'known_malicious_package', 'known_malicious_hash', 'pypi_malicious_package', 'dependency_ioc_match'}),
        ('has_intent_pair', {'intent_credential_exfil', 'intent_command_exfil'}),
    )
    for flag, members in signal_groups:
        feat[flag] = int(not present.isdisjoint(members))
    feat['has_sandbox_finding'] = int(any(kind.startswith('sandbox_') for kind in present))

    # File distribution (fileScores is only used when it is a dict).
    file_scores = summary.get('fileScores', {})
    per_file = list(file_scores.values()) if isinstance(file_scores, dict) else []
    n_files = len(per_file)
    feat['file_count_with_threats'] = n_files
    if per_file:
        feat['file_score_mean'] = round(sum(per_file) / n_files)
        feat['file_score_max'] = max(per_file)
    else:
        feat['file_score_mean'] = 0
        feat['file_score_max'] = 0

    # Severity concentration
    severe = feat['count_critical'] + feat['count_high']
    feat['severity_ratio_high'] = round(severe / max(feat['count_total'], 1), 2)
    breakdown = summary.get('breakdown', [])
    # Uses the first breakdown entry — presumably the list is sorted by
    # points descending upstream; TODO confirm against feature-extractor.js.
    feat['max_single_points'] = breakdown[0].get('points', 0) if breakdown else 0
    if feat['score'] > 0:
        feat['points_concentration'] = round(
            feat['max_single_points'] / max(feat['score'], 1), 2)
    else:
        feat['points_concentration'] = 0

    # Package metadata — not available in alerts, use fixed defaults.
    feat.update({
        'unpacked_size_bytes': 0,
        'dep_count': 0,
        'dev_dep_count': 0,
        'reputation_factor': 1.0,
        'package_age_days': 0,
        'weekly_downloads': 0,
        'version_count': 0,
        'author_package_count': 0,
        'has_repository': 0,
        'readme_size': 0,
        'file_count_total': 0,
        'has_tests': 0,
    })
    feat['threat_density'] = round(feat['count_total'] / max(n_files, 1), 2)

    return feat
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
def _alert_risk_score(alert):
    """Return the numeric summary.riskScore of an alert dict, or 0 if absent/invalid."""
    summary = alert.get("summary")
    score = summary.get("riskScore", 0) if isinstance(summary, dict) else 0
    return score if isinstance(score, (int, float)) else 0


def load_alert_index(alerts_dir):
    """Build an index of alert files keyed by 'name@version'.

    Every ``*.json`` file directly inside ``alerts_dir`` is parsed. When
    several alerts refer to the same package version, the one with the
    highest ``summary.riskScore`` is kept. Files are visited in sorted
    order, so a tie is always resolved in favour of the first file name.

    Files that cannot be used are skipped instead of aborting the run:
    unreadable files, invalid JSON, invalid UTF-8, a top-level JSON value
    that is not an object, or a missing / non-string ``target``.

    Args:
        alerts_dir: directory containing alert JSON files (str or Path).

    Returns:
        dict: { "name@version": alert_dict }. Empty when ``alerts_dir`` is
        not a directory.
    """
    alerts_dir = Path(alerts_dir)
    if not alerts_dir.is_dir():
        return {}

    index = {}
    best_scores = {}
    for filepath in sorted(alerts_dir.glob("*.json")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                alert = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError (file is not valid UTF-8).
            continue

        if not isinstance(alert, dict):
            continue

        target = alert.get("target")
        # target format: "npm/package-name@version"
        if not isinstance(target, str) or "/" not in target or "@" not in target:
            continue

        # Drop the ecosystem prefix; scoped names keep their own "/".
        pkg_ver = target.split("/", 1)[1]  # "name@version"
        new_score = _alert_risk_score(alert)
        # Keep the alert with the highest score for each package
        if pkg_ver not in index or new_score > best_scores[pkg_ver]:
            index[pkg_ver] = alert
            best_scores[pkg_ver] = new_score

    return index
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
# ══════════════════════════════════════════════════════════════
|
|
246
|
-
# Step 1: Build training dataset
|
|
247
|
-
# ══════════════════════════════════════════════════════════════
|
|
248
|
-
|
|
249
|
-
def load_jsonl(filepath):
    """Read a JSON Lines file, tolerating malformed lines.

    Blank lines are ignored. A line that is not valid JSON is reported on
    stdout with its 1-based line number and counted, but does not stop
    the load.

    Args:
        filepath: path of the JSONL file to read (UTF-8).

    Returns:
        tuple: (list of parsed values in file order, number of skipped lines).
    """
    parsed = []
    bad_lines = 0
    with open(filepath, 'r', encoding='utf-8') as handle:
        for lineno, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    value = json.loads(text)
                except json.JSONDecodeError:
                    bad_lines += 1
                    print(f" [WARN] Skipping malformed line {lineno} in {filepath}")
                else:
                    parsed.append(value)
    return parsed, bad_lines
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
def build_dataset():
    """Assemble the labelled training set and cache it as JSONL.

    Sources, processed in this order (first occurrence of a
    (name, version) pair wins):
      1. ml-training-merged.jsonl records, relabelled through
         auto-labels.json:
           confirmed_malicious -> "malicious"
           unconfirmed         -> "clean"
           likely_malicious / pending -> excluded
           no auto-label       -> kept as "clean" only when the original
                                  label is clean / fp / ml_clean
      2. ml-training-datadog-full.jsonl records, all labelled "malicious".
      3. confirmed_malicious auto-labels with no record so far, rebuilt
         from an alert file when one exists.

    Exits the process with status 1 if any of the three input files is
    missing. Writes the result to DATASET_CACHE.

    Returns:
        tuple: (list of record dicts, dict of construction counters).
    """
    banner = "=" * 70
    print(banner)
    print("[Step 1] Building training dataset from auto-labels + merged JSONL")
    print(banner)

    def require_file(path):
        # Mandatory input: report and abort when it is absent.
        if not path.is_file():
            print(f"ERROR: {path} not found", file=sys.stderr)
            sys.exit(1)
        return path

    # Load auto-labels
    auto_labels_path = require_file(MUADDIB_DATA / "auto-labels.json")
    with open(auto_labels_path, 'r', encoding='utf-8') as f:
        auto_labels = json.load(f)
    labels_map = auto_labels.get("labels", {})
    print(f" Auto-labels loaded: {len(labels_map)} entries")
    print(f" Summary: {auto_labels.get('summary', {})}")

    # Load merged JSONL
    merged_path = require_file(MUADDIB_DATA / "ml-training-merged.jsonl")
    merged_records, merged_skipped = load_jsonl(merged_path)
    print(f" Merged JSONL: {len(merged_records)} records ({merged_skipped} corrupted, skipped)")

    # Load Datadog malicious
    datadog_path = require_file(MUADDIB_DATA / "ml-training-datadog-full.jsonl")
    datadog_records, datadog_skipped = load_jsonl(datadog_path)
    print(f" Datadog JSONL: {len(datadog_records)} records ({datadog_skipped} corrupted, skipped)")

    # Load alert index for recovering confirmed_malicious without JSONL records
    alert_index = load_alert_index(MUADDIB_ALERTS)
    print(f" Alert index: {len(alert_index)} packages")

    dataset = []
    stats = dict.fromkeys([
        "confirmed_malicious_jsonl",
        "confirmed_malicious_alert",
        "confirmed_malicious_no_features",
        "likely_malicious_excluded",
        "unconfirmed_as_clean",
        "pending_excluded",
        "clean_no_match",
        "datadog_malicious",
    ], 0)
    seen = set()  # (name, version) dedup

    # Auto-labels that drop a record from training, and the counter to bump:
    #   likely_malicious — ambiguous signal
    #   pending          — too recent for a reliable label
    excluded_counters = {
        "likely_malicious": "likely_malicious_excluded",
        "pending": "pending_excluded",
    }

    # Cross merged records with auto-labels
    for rec in merged_records:
        name = rec.get("name", "")
        version = rec.get("version", "")
        dedup_key = (name, version)
        if dedup_key in seen:
            continue

        auto_label = labels_map.get(f"{name}@{version}", {}).get("auto_label", "")

        if auto_label in excluded_counters:
            stats[excluded_counters[auto_label]] += 1
            seen.add(dedup_key)
            continue

        if auto_label == "confirmed_malicious":
            label = "malicious"
            source = "auto-label:confirmed+jsonl"
            counter = "confirmed_malicious_jsonl"
        elif auto_label == "unconfirmed":
            # Suspect not confirmed after 7+ days → treat as clean for training
            label = "clean"
            source = "auto-label:unconfirmed"
            counter = "unconfirmed_as_clean"
        else:
            # No match in auto-labels — use original label
            orig_label = rec.get("label", "")
            if orig_label not in ("clean", "fp", "ml_clean"):
                # Skip suspect/unconfirmed/unknown without auto-label match.
                # Not marked as seen, so a later source may still supply it.
                continue
            label = "clean"
            source = f"original:{orig_label}"
            counter = "clean_no_match"

        rec["label"] = label
        rec["_retrain_source"] = source
        dataset.append(rec)
        stats[counter] += 1
        seen.add(dedup_key)

    # Add Datadog malicious corpus
    for rec in datadog_records:
        dedup_key = (rec.get("name", ""), rec.get("version", ""))
        if dedup_key in seen:
            continue
        rec["label"] = "malicious"
        rec["_retrain_source"] = "datadog"
        dataset.append(rec)
        stats["datadog_malicious"] += 1
        seen.add(dedup_key)

    # Recover confirmed_malicious that have alert files but no JSONL record
    for key, label_info in labels_map.items():
        # key is "name@version"
        if label_info.get("auto_label") != "confirmed_malicious" or "@" not in key:
            continue
        name, version = key.rsplit("@", 1)
        dedup_key = (name, version)
        if dedup_key in seen:
            continue

        alert = alert_index.get(key)
        if not alert:
            stats["confirmed_malicious_no_features"] += 1
        else:
            rec = {
                "name": name, "version": version, "ecosystem": "npm",
                "label": "malicious",
                "_retrain_source": "auto-label:confirmed+alert",
            }
            rec.update(extract_features_from_alert(alert))
            dataset.append(rec)
            stats["confirmed_malicious_alert"] += 1

        seen.add(dedup_key)

    print("\n Dataset construction:")
    for counter_name, count in stats.items():
        print(f" {counter_name}: {count}")

    n_malicious = sum(r["label"] == "malicious" for r in dataset)
    n_clean = sum(r["label"] == "clean" for r in dataset)
    print(f"\n Final dataset: {len(dataset)} samples")
    print(f" Malicious: {n_malicious}")
    print(f" Clean: {n_clean}")
    print(f" Ratio (clean/malicious): {n_clean / max(n_malicious, 1):.2f}")

    # Cache dataset
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(DATASET_CACHE, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(rec, separators=(',', ':')) + '\n' for rec in dataset
        )
    print(f" Cached to {DATASET_CACHE}")

    return dataset, stats
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
# ══════════════════════════════════════════════════════════════
|
|
436
|
-
# Step 2: Feature alignment
|
|
437
|
-
# ══════════════════════════════════════════════════════════════
|
|
438
|
-
|
|
439
|
-
def align_features(dataset):
    """Project records onto the 87 FEATURE_NAMES columns and build X, y.

    Missing or None features become 0 (not -1) to prevent data leakage:
    XGBoost learns split directions for missing values, so -1 in one source
    and real values in another creates a perfect source-identity signal.

    Args:
        dataset: iterable of record dicts, each carrying a "label" key.

    Returns:
        tuple: (X, y) where X is a DataFrame with FEATURE_NAMES columns
        holding floats and y is an int array (1 = malicious, 0 = otherwise).
    """
    print("\n" + "=" * 70)
    print("[Step 2] Aligning 87 features (0 for missing — no leakage)")
    print("=" * 70)

    rows = []
    targets = []
    for rec in dataset:
        raw_values = (rec.get(column, 0) for column in FEATURE_NAMES)
        rows.append([float(0 if value is None else value) for value in raw_values])
        targets.append(int(rec["label"] == "malicious"))

    X = pd.DataFrame(rows, columns=FEATURE_NAMES)
    y = np.array(targets, dtype=int)

    n_rows, n_cols = X.shape[0], X.shape[1]
    print(f" Feature matrix: {n_rows} x {n_cols}")
    print(f" Class distribution: {int((y == 0).sum())} clean, {int((y == 1).sum())} malicious")

    return X, y
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
def filter_leaky_features(X, y, min_coverage=0.001):
    """Drop features that are dead or that merely identify the data source.

    For each name in FEATURE_NAMES the share of non-zero values is computed
    over all rows and per class. A feature is removed when it is:
      - DEAD: non-zero in fewer than ``min_coverage`` of ALL samples
      - LEAKY: non-zero in >= 99% of one class AND fewer than
        ``min_coverage`` of the other (proxy for data source, not malware
        signal)

    Ported from train-xgboost.py filter_leaky_features().

    Args:
        X: DataFrame containing every FEATURE_NAMES column.
        y: label array (0 = clean, 1 = malicious), aligned with X.
        min_coverage: non-zero share below which a feature counts as absent.

    Returns:
        tuple: (X restricted to the kept columns, list of kept names).
    """
    print("\n" + "=" * 70)
    print("[Step 2b] Filtering dead / leaky features")
    print("=" * 70)

    neg_mask = y == 0
    pos_mask = y == 1
    n_neg = int(neg_mask.sum())
    n_pos = int(pos_mask.sum())
    n_total = n_neg + n_pos

    def nonzero_share(column, row_count):
        # Fraction of non-zero entries; the denominator is floored at 1.
        return float((column != 0).sum()) / max(row_count, 1)

    active, dead, leaky = [], [], []

    for feat in FEATURE_NAMES:
        neg_nz = nonzero_share(X.loc[neg_mask, feat], n_neg)
        pos_nz = nonzero_share(X.loc[pos_mask, feat], n_pos)
        all_nz = nonzero_share(X[feat], n_total)

        only_in_clean = neg_nz >= 0.99 and pos_nz < min_coverage
        only_in_malicious = pos_nz >= 0.99 and neg_nz < min_coverage

        if all_nz < min_coverage:
            bucket = dead
        elif only_in_clean or only_in_malicious:
            bucket = leaky
        else:
            bucket = active
        bucket.append(feat)

    if dead:
        print(f" DEAD ({len(dead)}): {', '.join(dead)}")
    if leaky:
        print(f" LEAKY ({len(leaky)}): {', '.join(leaky)}")
    print(f" Active: {len(active)} / {len(FEATURE_NAMES)}")

    return X[active], active
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
# ══════════════════════════════════════════════════════════════
|
|
520
|
-
# Step 3: Grid search + training
|
|
521
|
-
# ══════════════════════════════════════════════════════════════
|
|
522
|
-
|
|
523
|
-
def grid_search(X_train, y_train, active_features, scale_pos_weight):
    """Grid search over PARAM_GRID with 3-fold stratified CV for speed.

    For every combination a booster is trained on each fold with early
    stopping against that fold's validation part, and the fold is scored
    with F1 at a fixed 0.5 probability threshold. The combination with the
    highest mean F1 wins; on ties the earliest combination is kept.

    Args:
        X_train: DataFrame of training features (columns = active_features).
        y_train: integer label array aligned with X_train.
        active_features: feature names passed to xgb.DMatrix.
        scale_pos_weight: class-imbalance weight forwarded to XGBoost.

    Returns:
        tuple: (best_params dict from PARAM_GRID, list of all results as
        dicts with the params plus 'mean_f1', sorted by mean_f1 descending).
    """
    print("\n" + "=" * 70)
    print("[Step 3] Grid search (3-fold CV)")
    print("=" * 70)

    param_combinations = list(ParameterGrid(PARAM_GRID))
    print(f" {len(param_combinations)} combinations to evaluate")

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # Start below any reachable F1 so the first combination is always
    # selected. With a 0 start and a strict ">" test, a run in which every
    # combination scores F1 == 0 left best_params as None and the callers
    # then crashed on best_params['max_depth'].
    best_f1 = float('-inf')
    best_params = None
    results = []

    for i, params in enumerate(param_combinations):
        xgb_params = {
            **XGB_BASE_PARAMS,
            'max_depth': params['max_depth'],
            'learning_rate': params['learning_rate'],
            'scale_pos_weight': scale_pos_weight,
        }
        n_est = params['n_estimators']

        fold_f1s = []
        for train_idx, val_idx in skf.split(X_train, y_train):
            X_tr = X_train.iloc[train_idx]
            X_va = X_train.iloc[val_idx]
            y_tr = y_train[train_idx]
            y_va = y_train[val_idx]

            dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=active_features)
            dval = xgb.DMatrix(X_va, label=y_va, feature_names=active_features)

            # n_estimators is an upper bound: training stops once the
            # validation logloss has not improved for 20 rounds.
            model = xgb.train(
                xgb_params, dtrain, num_boost_round=n_est,
                evals=[(dval, 'val')], verbose_eval=False,
                early_stopping_rounds=20
            )

            probs = model.predict(dval)
            preds = (probs >= 0.5).astype(int)
            fold_f1s.append(f1_score(y_va, preds, zero_division=0))

        mean_f1 = np.mean(fold_f1s)
        results.append({**params, 'mean_f1': mean_f1})

        is_new_best = mean_f1 > best_f1
        marker = " ← BEST" if is_new_best else ""
        if is_new_best:
            best_f1 = mean_f1
            best_params = params

        # Log the first run, every 9th run, and any run within 0.001 of the best.
        if (i + 1) % 9 == 0 or i == 0 or mean_f1 > best_f1 - 0.001:
            print(f" [{i + 1:2d}/{len(param_combinations)}] "
                  f"depth={params['max_depth']} est={params['n_estimators']} "
                  f"lr={params['learning_rate']} → F1={mean_f1:.4f}{marker}")

    print(f"\n Best params: {best_params} (F1={best_f1:.4f})")

    # Sort all results by F1
    results.sort(key=lambda x: x['mean_f1'], reverse=True)

    return best_params, results
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
def train_final(X_train, y_train, active_features, best_params, scale_pos_weight):
    """Fit the final booster using the hyperparameters chosen by grid search.

    A stratified 10% slice of the training data is held back and used only
    as the early-stopping monitor; the booster is fitted on the other 90%.

    Args:
        X_train: DataFrame of training features (columns = active_features).
        y_train: label array aligned with X_train.
        active_features: feature names passed to xgb.DMatrix.
        best_params: dict with 'max_depth', 'learning_rate', 'n_estimators'.
        scale_pos_weight: class-imbalance weight forwarded to XGBoost.

    Returns:
        The trained booster returned by xgb.train.
    """
    print("\n" + "=" * 70)
    print("[Step 4] Training final model with best params")
    print("=" * 70)

    xgb_params = dict(XGB_BASE_PARAMS)
    xgb_params.update(
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        scale_pos_weight=scale_pos_weight,
    )
    max_rounds = best_params['n_estimators']

    # Internal 90/10 for early stopping
    X_fit, X_stop, y_fit, y_stop = train_test_split(
        X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
    )
    fit_matrix = xgb.DMatrix(X_fit, label=y_fit, feature_names=active_features)
    stop_matrix = xgb.DMatrix(X_stop, label=y_stop, feature_names=active_features)

    model = xgb.train(
        xgb_params,
        fit_matrix,
        num_boost_round=max_rounds,
        evals=[(stop_matrix, 'early_stop')],
        verbose_eval=False,
        early_stopping_rounds=20,
    )

    # Fall back to the round budget if the booster exposes no best_iteration.
    best_round = getattr(model, 'best_iteration', max_rounds)
    print(f" Best iteration: {best_round}")

    return model
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
def optimize_threshold(model, X_train, y_train, active_features,
                       best_params=None, scale_pos_weight=1.0):
    """5-fold CV to find the optimal threshold (maximize precision at recall >= 93.9%).

    Out-of-fold probabilities are produced by retraining one booster per
    fold with ``best_params``. Thresholds from 0.10 to 0.90 in steps of
    0.01 are then swept over those probabilities; among thresholds whose
    recall is at least 0.939 the one with the highest precision wins (the
    lowest such threshold on ties). If none qualifies, 0.5 is used.

    Args:
        model: accepted for interface compatibility; not used, because the
            out-of-fold predictions come from per-fold boosters.
        X_train: DataFrame of training features (columns = active_features).
        y_train: label array aligned with X_train.
        active_features: feature names passed to xgb.DMatrix.
        best_params: dict with 'max_depth', 'learning_rate', 'n_estimators'.
            Required despite the None default.
        scale_pos_weight: class-imbalance weight forwarded to XGBoost.

    Returns:
        tuple: (threshold, {'precision', 'recall', 'f1'}) where the metrics
        are computed on the out-of-fold predictions at that threshold.

    Raises:
        ValueError: if ``best_params`` is None.
    """
    # The signature defaults best_params to None but the body needs it;
    # fail with a clear message instead of "'NoneType' is not subscriptable".
    if best_params is None:
        raise ValueError(
            "optimize_threshold requires best_params "
            "(max_depth, learning_rate, n_estimators)"
        )

    print("\n" + "=" * 70)
    print("[Step 5] Threshold optimization (5-fold CV, recall >= 93.9%)")
    print("=" * 70)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    all_probs = np.zeros(len(y_train))

    # Loop-invariant: every fold trains with the same parameters.
    fold_params = {
        **XGB_BASE_PARAMS,
        'max_depth': best_params['max_depth'],
        'learning_rate': best_params['learning_rate'],
        'scale_pos_weight': scale_pos_weight,
    }

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr = X_train.iloc[train_idx]
        X_va = X_train.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_va = y_train[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=active_features)
        dval = xgb.DMatrix(X_va, label=y_va, feature_names=active_features)

        fold_model = xgb.train(
            fold_params,
            dtrain, num_boost_round=best_params['n_estimators'],
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )
        # Each sample is scored by the one booster that did not train on it.
        all_probs[val_idx] = fold_model.predict(dval)

    # Sweep thresholds
    best_threshold = 0.5
    best_precision = 0.0

    for t in np.arange(0.10, 0.91, 0.01):
        preds = (all_probs >= t).astype(int)
        r = recall_score(y_train, preds, zero_division=0)
        p = precision_score(y_train, preds, zero_division=0)
        if r >= 0.939 and p > best_precision:
            best_precision = p
            best_threshold = float(t)

    if best_precision == 0.0:
        print(" [WARN] No threshold achieves recall >= 93.9%, using 0.5")
        best_threshold = 0.5

    preds = (all_probs >= best_threshold).astype(int)
    p = precision_score(y_train, preds, zero_division=0)
    r = recall_score(y_train, preds, zero_division=0)
    f1 = f1_score(y_train, preds, zero_division=0)

    print(f" Optimal threshold: {best_threshold:.3f}")
    print(f" CV metrics: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    return best_threshold, {'precision': p, 'recall': r, 'f1': f1}
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
# ══════════════════════════════════════════════════════════════
|
|
681
|
-
# Step 6: Evaluate on holdout
|
|
682
|
-
# ══════════════════════════════════════════════════════════════
|
|
683
|
-
|
|
684
|
-
def evaluate_holdout(model, X_test, y_test, active_features, threshold):
    """Score the trained booster on the held-out test set and report metrics.

    Args:
        model: Trained booster; must provide ``predict`` and ``get_score``.
        X_test: Held-out feature matrix.
        y_test: Held-out binary labels (0 = clean, 1 = malicious).
        active_features: Feature names passed to the ``DMatrix``.
        threshold: Probability cut-off; scores >= ``threshold`` are predicted 1.

    Returns:
        A ``(metrics, probs)`` tuple. ``metrics`` is a dict with rounded
        ``precision``, ``recall``, ``f1``, ``auc_roc``, ``fpr``, ``tpr``, the
        raw ``confusion_matrix`` (nested list), the individual ``tp``/``fp``/
        ``fn``/``tn`` counts and ``top_20_features`` (by gain). ``probs`` is
        the array of predicted probabilities for ``X_test``.
    """
    print("\n" + "=" * 70)
    print(f"[Step 6] Holdout evaluation (threshold={threshold:.3f})")
    print("=" * 70)

    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=active_features)
    probs = model.predict(dtest)

    preds = (probs >= threshold).astype(int)
    p = precision_score(y_test, preds, zero_division=0)
    r = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)

    # Pin both labels so the matrix is always 2x2. Without this, a holdout
    # where only one class appears in both y_test and preds yields a 1x1
    # matrix and the four-way unpack below raises ValueError.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    tn, fp_count, fn, tp = cm.ravel()

    # AUC-ROC is undefined when y_test holds a single class; report 0.0 then.
    try:
        auc = roc_auc_score(y_test, probs)
    except ValueError:
        auc = 0.0

    # FPR / TPR, with the denominator clamped to avoid division by zero.
    fpr = fp_count / max(fp_count + tn, 1)
    tpr = tp / max(tp + fn, 1)

    print(f" Precision: {p:.4f}")
    print(f" Recall/TPR: {r:.4f}")
    print(f" F1: {f1:.4f}")
    print(f" AUC-ROC: {auc:.4f}")
    print(f" FPR: {fpr:.4f}")
    print(" Confusion matrix:")
    print(f" TN={tn:>6d} FP={fp_count:>6d}")
    print(f" FN={fn:>6d} TP={tp:>6d}")

    # Feature importance, highest gain first.
    importance = model.get_score(importance_type='gain')
    sorted_imp = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    top_20 = sorted_imp[:20]
    print("\n Top 20 features (gain):")
    for rank, (name, val) in enumerate(top_20, start=1):
        print(f" {rank:2d}. {name:40s} {val:.4f}")

    metrics = {
        'precision': round(float(p), 4),
        'recall': round(float(r), 4),
        'f1': round(float(f1), 4),
        'auc_roc': round(float(auc), 4),
        'fpr': round(float(fpr), 4),
        'tpr': round(float(tpr), 4),
        'confusion_matrix': cm.tolist(),
        'tp': int(tp), 'fp': int(fp_count),
        'fn': int(fn), 'tn': int(tn),
        'top_20_features': [(name, round(val, 4)) for name, val in top_20],
    }
    return metrics, probs
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
# ══════════════════════════════════════════════════════════════
|
|
741
|
-
# Step 7: Export
|
|
742
|
-
# ══════════════════════════════════════════════════════════════
|
|
743
|
-
|
|
744
|
-
def save_confusion_matrix_png(y_test, preds, output_path):
    """Render the binary confusion matrix for ``preds`` and save it as a PNG.

    Best-effort: if matplotlib (or something it imports) is unavailable, a
    warning is printed and nothing is written.

    Args:
        y_test: True binary labels (0 = clean, 1 = malicious).
        preds: Predicted binary labels, same length as ``y_test``.
        output_path: Destination of the PNG file.
    """
    try:
        import matplotlib
        matplotlib.use('Agg')  # headless backend; no display required
        import matplotlib.pyplot as plt

        # Pin both labels so the matrix is always 2x2; otherwise single-class
        # input gives a 1x1 matrix and the cell annotations below raise
        # IndexError.
        cm = confusion_matrix(y_test, preds, labels=[0, 1])
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
            fig.colorbar(im, ax=ax)

            classes = ['Clean (0)', 'Malicious (1)']
            ax.set(xticks=[0, 1], yticks=[0, 1],
                   xticklabels=classes, yticklabels=classes,
                   ylabel='True label', xlabel='Predicted label',
                   title="MUAD'DIB Retrained Model — Confusion Matrix")

            # Text annotations: white on dark cells, black on light ones.
            half_max = cm.max() / 2
            for i in range(2):
                for j in range(2):
                    ax.text(j, i, f'{cm[i, j]:,}',
                            ha='center', va='center',
                            color='white' if cm[i, j] > half_max else 'black',
                            fontsize=16)

            fig.tight_layout()
            fig.savefig(output_path, dpi=150)
        finally:
            # Close this specific figure even when drawing or saving fails,
            # so a failed export does not leak it.
            plt.close(fig)
        print(f" Confusion matrix saved to {output_path}")
    except ImportError:
        print(" [WARN] matplotlib not available — skipping confusion matrix PNG")
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
def convert_tree(tree_json, nodes, feature_map):
    """Flatten one XGBoost JSON tree into ``nodes`` in pre-order.

    Each entry appended to ``nodes`` is a dict with keys ``f`` (feature index,
    ``-1`` for a leaf or for a split name missing from ``feature_map``), ``t``
    (split threshold), ``y`` / ``n`` (positions in ``nodes`` of the yes / no
    child) and ``v`` (leaf value, ``0`` for split nodes).

    Args:
        tree_json: Dict for the current node, as parsed from the JSON dump.
        nodes: Output list; mutated in place.
        feature_map: Mapping from feature name to feature index.

    Returns:
        The position in ``nodes`` at which this node was stored.
    """
    slot = len(nodes)
    nodes.append(None)  # reserve our position before descending into children

    # Leaves carry only a value.
    if 'leaf' in tree_json:
        nodes[slot] = {'f': -1, 't': 0, 'y': 0, 'n': 0,
                       'v': round(tree_json['leaf'], 6)}
        return slot

    feature_pos = feature_map.get(tree_json.get('split', ''), -1)
    cut = tree_json.get('split_condition', 0)
    kids = tree_json.get('children', [])
    yes_id = tree_json.get('yes', 0)
    no_id = tree_json.get('no', 0)

    # Match children to branches by node id; a yes-match takes precedence.
    branch_yes = None
    branch_no = None
    for kid in kids:
        kid_id = kid.get('nodeid')
        if kid_id == yes_id:
            branch_yes = kid
        elif kid_id == no_id:
            branch_no = kid

    # Positional fallback when ids did not resolve a branch.
    if branch_yes is None and kids:
        branch_yes = kids[0]
    if branch_no is None and len(kids) > 1:
        branch_no = kids[1]

    # The yes subtree is laid out before the no subtree; a missing branch
    # points back at this node.
    if branch_yes:
        yes_pos = convert_tree(branch_yes, nodes, feature_map)
    else:
        yes_pos = slot
    if branch_no:
        no_pos = convert_tree(branch_no, nodes, feature_map)
    else:
        no_pos = slot

    nodes[slot] = {'f': feature_pos, 't': round(cut, 6),
                   'y': yes_pos, 'n': no_pos, 'v': 0}
    return slot
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
def export_model_js(model, features, threshold, cv_metrics, holdout_metrics, output_path):
    """Export the trained model as a CommonJS module in model-trees.js format.

    Every tree from ``model.get_dump(dump_format='json')`` is flattened with
    ``convert_tree`` and the result is written as ``module.exports = {...};``
    preceded by a header comment summarising the metrics.

    Args:
        model: Trained booster; must provide ``get_dump``.
        features: Ordered feature names; a feature's position is its index in
            the exported trees.
        threshold: Decision threshold stored alongside the trees.
        cv_metrics: Dict with ``precision``, ``recall`` and ``f1``.
        holdout_metrics: Dict with ``precision``, ``recall``, ``f1`` and
            ``auc_roc``.
        output_path: Destination file; a ``str`` or a ``pathlib.Path``.
    """
    print("\n" + "=" * 70)
    print(f"[Step 7] Exporting model to {output_path}")
    print("=" * 70)

    # Accept plain string paths too: the size report below needs Path.stat(),
    # which previously failed on a str *after* the file had been written.
    output_path = Path(output_path)

    feature_map = {name: idx for idx, name in enumerate(features)}

    js_trees = []
    for tree_str in model.get_dump(dump_format='json'):
        nodes = []
        convert_tree(json.loads(tree_str), nodes, feature_map)
        js_trees.append(nodes)
    total_nodes = sum(len(nodes) for nodes in js_trees)

    js_model = {
        'version': 1,
        'features': features,
        'threshold': threshold,
        'trees': js_trees,
    }

    now = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    js_content = "".join([
        "'use strict';\n\n",
        "/**\n",
        f" * XGBoost model trees — auto-generated by ml-retrain/retrain.py ({now})\n",
        f" * {len(js_trees)} trees, {len(features)} features, threshold={threshold}\n",
        f" * CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}\n",
        f" * Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}\n",
        f" * AUC-ROC: {holdout_metrics['auc_roc']:.3f}\n",
        " * DO NOT EDIT MANUALLY\n",
        " */\n\n",
        # Compact separators keep the generated file small.
        f"module.exports = {json.dumps(js_model, separators=(',', ':'))};\n",
    ])

    with output_path.open('w', encoding='utf-8') as f:
        f.write(js_content)

    size_kb = output_path.stat().st_size / 1024
    print(f" Trees: {len(js_trees)}, nodes: {total_nodes}")
    print(f" Features: {len(features)}, threshold: {threshold:.3f}")
    print(f" File: {size_kb:.1f} KB")
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
def save_report(dataset_stats, best_params, grid_results, cv_metrics,
                holdout_metrics, active_features):
    """Write the retrain summary to ``REPORT_PATH`` as indented JSON.

    Only the first five entries of ``grid_results`` are kept. Values the JSON
    encoder cannot serialise natively are written through ``str()``.
    """
    payload = dict(
        generated_at=datetime.now(timezone.utc).isoformat(),
        dataset=dataset_stats,
        best_hyperparams=best_params,
        grid_search_top5=grid_results[:5],
        cv_metrics=cv_metrics,
        holdout_metrics=holdout_metrics,
        active_features=active_features,
    )
    with open(REPORT_PATH, 'w', encoding='utf-8') as out:
        json.dump(payload, out, indent=2, default=str)
    print(f" Report saved to {REPORT_PATH}")
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
# ══════════════════════════════════════════════════════════════
|
|
877
|
-
# Main
|
|
878
|
-
# ══════════════════════════════════════════════════════════════
|
|
879
|
-
|
|
880
|
-
def run_full(dataset=None, dataset_stats=None):
    """Run the retrain pipeline: feature alignment through export and report.

    Args:
        dataset: Pre-built dataset records. When ``None`` (the default) the
            dataset is built with ``build_dataset()``, and the stats returned
            by that call are used.
        dataset_stats: Stats to embed in the report when ``dataset`` is
            supplied. Defaults to a minimal ``{"total": len(dataset)}``.
    """
    start = time.time()

    # Step 1: Build dataset (skipped when the caller supplies one)
    if dataset is None:
        dataset, dataset_stats = build_dataset()
    elif dataset_stats is None:
        dataset_stats = {"total": len(dataset)}

    # Step 2: Feature alignment
    X, y = align_features(dataset)
    X, active_features = filter_leaky_features(X, y)

    # Stratified 80/20 split
    print("\n Stratified 80/20 split (seed=42)...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    train_clean = int((y_train == 0).sum())
    train_malicious = int((y_train == 1).sum())
    print(f" Train: {len(X_train)} ({train_clean} clean, "
          f"{train_malicious} malicious)")
    print(f" Test: {len(X_test)} ({int((y_test == 0).sum())} clean, "
          f"{int((y_test == 1).sum())} malicious)")

    # Clean-to-malicious ratio; denominator clamped so an all-clean training
    # split does not divide by zero.
    scale_pos_weight = float(train_clean) / max(float(train_malicious), 1)
    print(f" scale_pos_weight: {scale_pos_weight:.2f}")

    # Step 3: Grid search
    best_params, grid_results = grid_search(
        X_train, y_train, active_features, scale_pos_weight)

    # Step 4: Train final model
    model = train_final(X_train, y_train, active_features, best_params, scale_pos_weight)

    # Step 5: Threshold optimization
    threshold, cv_metrics = optimize_threshold(
        model, X_train, y_train, active_features,
        best_params=best_params, scale_pos_weight=scale_pos_weight)

    # Step 6: Holdout evaluation
    holdout_metrics, probs = evaluate_holdout(
        model, X_test, y_test, active_features, threshold)

    # Step 7: Export
    export_model_js(model, active_features, threshold,
                    cv_metrics, holdout_metrics, MODEL_OUTPUT)

    preds = (probs >= threshold).astype(int)
    save_confusion_matrix_png(y_test, preds, CONFUSION_MATRIX_PATH)
    save_report(dataset_stats, best_params, grid_results,
                cv_metrics, holdout_metrics, active_features)

    elapsed = time.time() - start
    print("\n" + "=" * 70)
    print(f"RETRAIN COMPLETE ({elapsed:.0f}s)")
    print("=" * 70)
    print(f" Dataset: {len(dataset)} samples")
    print(f" Features: {len(active_features)}")
    print(f" Best: depth={best_params['max_depth']} "
          f"est={best_params['n_estimators']} lr={best_params['learning_rate']}")
    print(f" Threshold: {threshold:.3f}")
    print(f" Holdout: P={holdout_metrics['precision']:.3f} "
          f"R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f} "
          f"AUC={holdout_metrics['auc_roc']:.3f}")
    print(f" Model: {MODEL_OUTPUT}")
    print(f" Report: {REPORT_PATH}")


def main():
    """Parse the command line and run the selected retrain mode.

    Exactly one of ``--full``, ``--build-dataset`` or ``--train-only`` is
    required. ``--data-dir`` overrides the module-level ``MUADDIB_DATA`` and
    ``MUADDIB_ALERTS`` paths before anything runs. ``--train-only`` exits with
    status 1 when the cached dataset file is missing.
    """
    # Declared up front: --data-dir rebinds these module-level paths.
    global MUADDIB_DATA, MUADDIB_ALERTS

    parser = argparse.ArgumentParser(description="MUAD'DIB ML Retrain")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--full', action='store_true', help='Run all steps')
    group.add_argument('--build-dataset', action='store_true', help='Step 1 only')
    group.add_argument('--train-only', action='store_true',
                       help='Train from cached dataset')
    parser.add_argument('--data-dir', help='Override MUADDIB_DATA path')
    args = parser.parse_args()

    if args.data_dir:
        MUADDIB_DATA = Path(args.data_dir)
        MUADDIB_ALERTS = MUADDIB_DATA.parent / "logs" / "alerts"

    if args.full:
        run_full()
    elif args.build_dataset:
        build_dataset()
    elif args.train_only:
        if not DATASET_CACHE.is_file():
            print(f"ERROR: Cached dataset not found at {DATASET_CACHE}", file=sys.stderr)
            print("Run --build-dataset or --full first", file=sys.stderr)
            sys.exit(1)
        records, _ = load_jsonl(DATASET_CACHE)
        # Train on the cached records. Previously they were loaded and then
        # discarded, and run_full() rebuilt the dataset from scratch, which
        # made --train-only behave like --full.
        # NOTE(review): assumes the cached records have the same shape as the
        # dataset returned by build_dataset() — confirm against that function.
        # The stats below are a minimal stand-in for the report, since the
        # build step that normally produces them is skipped.
        cached_stats = {
            "source": "cache",
            "cache_path": str(DATASET_CACHE),
            "total": len(records),
        }
        run_full(dataset=records, dataset_stats=cached_stats)


if __name__ == '__main__':
    main()
|