muaddib-scanner 2.11.76 → 2.11.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-commit +18 -0
- package/README.md +15 -6
- package/package.json +1 -2
- package/{self-scan-v2.11.76.json → self-scan-v2.11.77.json} +1 -1
- package/src/commands/safe-install.js +8 -3
- package/src/monitor/daemon.js +34 -22
- package/src/monitor/ingestion.js +32 -2
- package/src/monitor/queue.js +84 -21
- package/src/monitor/scan-queue.js +68 -1
- package/src/monitor/state.js +24 -1
- package/src/monitor/webhook.js +32 -11
- package/src/scanner/temporal-analysis.js +8 -0
- package/src/scanner/temporal-ast-diff.js +5 -0
- package/.dockerignore +0 -7
- package/.env.example +0 -43
- package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
- package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
- package/ml-retrain/auto-labeler/labeler.py +0 -256
- package/ml-retrain/auto-labeler/npm_checker.py +0 -228
- package/ml-retrain/auto-labeler/ossf_index.py +0 -178
- package/ml-retrain/auto-labeler/requirements.txt +0 -1
- package/ml-retrain/confusion-matrix.png +0 -0
- package/ml-retrain/model-trees-retrained.js +0 -12
- package/ml-retrain/retrain-report.json +0 -225
- package/ml-retrain/retrain.py +0 -974
- package/sbom.json +0 -0
- package/src/ml/train-bundler-detector.py +0 -725
- package/src/ml/train-xgboost.py +0 -957
- package/tools/export-model-js.py +0 -160
- package/tools/requirements-ml.txt +0 -5
- package/tools/train-classifier.py +0 -333
|
@@ -1,725 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
MUAD'DIB Bundler Detector Training Pipeline — single-source JSONL (ML2)
|
|
4
|
-
|
|
5
|
-
Trains a binary XGBoost classifier to distinguish bundler false positives
|
|
6
|
-
from true malicious packages in the high-score zone (score >= 35).
|
|
7
|
-
|
|
8
|
-
Unlike ML1 (train-xgboost.py) which uses dual sources (monitor + Datadog),
|
|
9
|
-
this model uses a SINGLE source (monitor JSONL) for both classes:
|
|
10
|
-
- Class 0 (clean/bundler FP): label 'fp' AND score >= 35
|
|
11
|
-
- Class 1 (malicious): HC threat types present AND score >= 35 AND label not 'fp'/'unconfirmed'
|
|
12
|
-
|
|
13
|
-
This avoids cross-source leakage (unless --positives-extra is used) — both classes share the same
|
|
14
|
-
feature distribution from the monitor pipeline.
|
|
15
|
-
|
|
16
|
-
Features excluded at training time (always 0 at inference due to guard rails):
|
|
17
|
-
- type_reverse_shell, type_binary_dropper, type_staged_binary_payload
|
|
18
|
-
- has_typosquat, has_ioc_match
|
|
19
|
-
|
|
20
|
-
Exports directly to model-bundler.js.
|
|
21
|
-
|
|
22
|
-
Usage:
|
|
23
|
-
python src/ml/train-bundler-detector.py \\
|
|
24
|
-
--input data/ml-training.jsonl \\
|
|
25
|
-
--output src/ml/model-bundler.js \\
|
|
26
|
-
--top-features 30
|
|
27
|
-
|
|
28
|
-
# Optional: add Datadog positives for class 1 augmentation
|
|
29
|
-
python src/ml/train-bundler-detector.py \\
|
|
30
|
-
--input data/ml-training.jsonl \\
|
|
31
|
-
--positives-extra data/ml-training-datadog-full.jsonl \\
|
|
32
|
-
--output src/ml/model-bundler.js
|
|
33
|
-
|
|
34
|
-
Dependencies: see tools/requirements-ml.txt
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
|
-
import argparse
|
|
38
|
-
import json
|
|
39
|
-
import sys
|
|
40
|
-
from pathlib import Path
|
|
41
|
-
|
|
42
|
-
import numpy as np
|
|
43
|
-
import pandas as pd
|
|
44
|
-
import shap
|
|
45
|
-
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
46
|
-
from sklearn.metrics import (
|
|
47
|
-
precision_score, recall_score, f1_score, confusion_matrix
|
|
48
|
-
)
|
|
49
|
-
import xgboost as xgb
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
# --- Constants ---
|
|
53
|
-
|
|
54
|
-
# Identity columns to exclude from features.
# NOTE(review): not referenced by any function visible in this file; the
# feature set is driven by FEATURE_NAMES instead — confirm whether still needed.
IDENTITY_COLS = {'name', 'version', 'ecosystem', 'timestamp', 'label', 'tier'}

# Minimum samples per class; load_and_prepare() aborts with exit code 1 when
# either class has fewer records than this.
MIN_SAMPLES = 50

# Score threshold for the bundler detector zone (default of --score-threshold).
SCORE_THRESHOLD = 35

# HC threat types used to construct the positive class (all in TOP_THREAT_TYPES).
# has_hc_type() treats a record as a positive candidate when any of these
# per-type counts is > 0. 'type_reverse_shell' is used here for labelling only:
# it is also listed in INFERENCE_EXCLUDED_FEATURES, so it never becomes a
# training feature.
BUNDLER_HC_TYPES = [
    'type_intent_credential_exfil',
    'type_intent_command_exfil',
    'type_lifecycle_shell_pipe',
    'type_reverse_shell',
    'type_cross_file_dataflow',
]

# Features to EXCLUDE from training — always 0 at inference because the
# classifier's guard rail 2a intercepts these types before the bundler model.
# TRAINABLE_FEATURES is FEATURE_NAMES minus this set.
INFERENCE_EXCLUDED_FEATURES = {
    'type_reverse_shell',  # in HC_TYPES → intercepted by guard rail
    'type_binary_dropper',  # in HC_TYPES → intercepted by guard rail
    'type_staged_binary_payload',  # in HC_TYPES → intercepted by guard rail
    'has_typosquat',  # typosquat_detected in HC_TYPES
    'has_ioc_match',  # known_malicious_* in HC_TYPES
}
|
|
81
|
-
|
|
82
|
-
# XGBoost hyperparameters (same base as ML1).
# Every training call in this file copies this dict and adds
# 'scale_pos_weight' for the current class ratio.
XGB_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,  # slightly shallower than ML1 (smaller dataset expected)
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'seed': 42,
    'verbosity': 0,
}

# Upper bound on boosting rounds for CV folds and the final model; both use
# early stopping (20 rounds), so fewer trees may be the best iteration.
N_ESTIMATORS = 200
# Number of stratified folds used by cross_validate().
N_FOLDS = 5
|
|
100
|
-
|
|
101
|
-
# Hardcoded 87 features — exact copy of feature-extractor.js output keys
# v2.10.32: expanded from 71 to 87 (16 new type_* features for code exec bypasses,
# IoC, GlassWorm, obfuscation, module graph). New features are 0 in pre-existing
# JSONL records; SHAP handles sparsity gracefully.
# The list order defines the column order of the training matrix built in
# align_features(), so do not reorder entries.
FEATURE_NAMES = [
    # Scoring (4)
    'score', 'max_file_score', 'package_score', 'global_risk_score',
    # Severity counts (5)
    'count_total', 'count_critical', 'count_high', 'count_medium', 'count_low',
    # Distinct types (1)
    'distinct_threat_types',
    # Per-type counts (47 TOP_THREAT_TYPES + 1 other = 48)
    # --- Original 31 ---
    'type_suspicious_dataflow', 'type_env_access', 'type_sensitive_string',
    'type_dangerous_call_eval', 'type_dangerous_call_exec',
    'type_dangerous_call_function', 'type_obfuscation_detected',
    'type_high_entropy_string', 'type_dynamic_require', 'type_dynamic_import',
    'type_lifecycle_script', 'type_typosquat_detected', 'type_staged_payload',
    'type_staged_binary_payload', 'type_network_require', 'type_sandbox_evasion',
    'type_credential_regex_harvest', 'type_remote_code_load',
    'type_suspicious_domain', 'type_prototype_hook',
    'type_intent_credential_exfil', 'type_intent_command_exfil',
    'type_cross_file_dataflow', 'type_module_compile', 'type_crypto_decipher',
    'type_env_charcode_reconstruction', 'type_lifecycle_shell_pipe',
    'type_curl_exec', 'type_reverse_shell', 'type_binary_dropper',
    'type_mcp_config_injection',
    # --- Code execution bypasses (v2.9.x–v2.10.x) ---
    'type_vm_code_execution', 'type_vm_dynamic_code',
    'type_dangerous_constructor', 'type_module_load_bypass',
    'type_require_process_mainmodule', 'type_proxy_globalthis_intercept',
    'type_reflect_bind_code_execution',
    # --- IoC / supply chain ---
    'type_known_malicious_package', 'type_known_malicious_hash',
    # --- GlassWorm ---
    'type_unicode_invisible_injection', 'type_blockchain_c2_resolution',
    # --- Shell / exec ---
    'type_dangerous_exec', 'type_node_inline_exec',
    # --- Obfuscation ---
    'type_js_obfuscation_pattern',
    # --- Module graph / WASM ---
    'type_suspicious_module_sink', 'type_wasm_host_sink',
    # --- Aggregated ---
    'type_other',
    # Boolean behavioral signals (10)
    'has_lifecycle_script', 'has_network_access', 'has_obfuscation',
    'has_env_access', 'has_eval', 'has_staged_payload', 'has_typosquat',
    'has_ioc_match', 'has_intent_pair', 'has_sandbox_finding',
    # File distribution (3)
    'file_count_with_threats', 'file_score_mean', 'file_score_max',
    # Severity concentration (3)
    'severity_ratio_high', 'max_single_points', 'points_concentration',
    # Package metadata (3)
    'unpacked_size_bytes', 'dep_count', 'dev_dep_count',
    # Reputation (1)
    'reputation_factor',
    # Enriched registry metadata (9) — Phase 2a
    'package_age_days', 'weekly_downloads', 'version_count',
    'author_package_count', 'has_repository', 'readme_size',
    'file_count_total', 'has_tests', 'threat_density',
]

# Import-time guard: fails fast if the list above is edited without keeping
# the documented total of 87 entries.
assert len(FEATURE_NAMES) == 87, f"Expected 87 features, got {len(FEATURE_NAMES)}"

# Features available for training (after excluding inference-blocked features).
# Preserves FEATURE_NAMES order.
TRAINABLE_FEATURES = [f for f in FEATURE_NAMES if f not in INFERENCE_EXCLUDED_FEATURES]
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
# --- Data loading ---
|
|
169
|
-
|
|
170
|
-
def load_jsonl(filepath: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are ignored. A line that is not valid JSON, or that is valid
    JSON but not an object (e.g. an array or a bare number), is skipped with a
    warning on stderr so that callers can rely on every returned element
    supporting ``.get()``.

    Args:
        filepath: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        The parsed JSON objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f" [WARN] Skipping malformed line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            # Downstream code calls record.get(...); anything that is not a
            # JSON object would crash there, so drop it here instead.
            if not isinstance(record, dict):
                print(f" [WARN] Skipping non-object line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            records.append(record)
    return records
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
def has_hc_type(record: dict) -> bool:
    """Return True if any BUNDLER_HC_TYPES count in *record* is greater than 0.

    A missing key and an explicit JSON ``null`` are both treated as a count of
    0, matching how align_features() handles ``None`` feature values.

    Args:
        record: One parsed JSONL record (feature name -> value).

    Returns:
        True when at least one high-confidence threat type has a positive count.
    """
    # `or 0` maps None (JSON null) to 0 so the comparison cannot raise TypeError.
    return any((record.get(hc_type) or 0) > 0 for hc_type in BUNDLER_HC_TYPES)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
def load_and_prepare(args) -> tuple:
    """
    Step 1: Load monitor JSONL and split into bundler classes.

    Class 0 (bundler FP): label == 'fp' AND score >= score_threshold
    Class 1 (malicious): HC type present AND score >= score_threshold
                         AND label not in ('fp', 'unconfirmed')

    When ``args.positives_extra`` names an existing file, its records that are
    in the score zone and carry an HC type are appended to class 1 (no label
    filter is applied to them). A path that does not exist is reported on
    stderr and ignored.

    Args:
        args: Parsed CLI namespace; reads ``input``, ``positives_extra`` and
            ``score_threshold``.

    Returns: (negatives, positives)

    Exits the process with status 1 when either class has fewer than
    MIN_SAMPLES records.
    """
    score_threshold = args.score_threshold

    def in_score_zone(record: dict) -> bool:
        # A missing or null score counts as 0 rather than raising TypeError.
        return (record.get('score') or 0) >= score_threshold

    print("=" * 60)
    print("[Step 1/8] Loading JSONL data...")
    print("=" * 60)

    records = load_jsonl(args.input)
    print(f" Input file: {len(records)} total records")

    # Count label distribution
    label_counts = {}
    for r in records:
        lbl = r.get('label', 'unknown')
        label_counts[lbl] = label_counts.get(lbl, 0) + 1
    print(f" Label distribution: {label_counts}")

    # Filter to score >= score_threshold
    high_score = [r for r in records if in_score_zone(r)]
    print(f" Records with score >= {score_threshold}: {len(high_score)}")

    # Class 0: FP labels with high score (bundler false positives)
    # Exclude 'unconfirmed' — not manually reviewed, may be contaminated (C1 remediation)
    negatives = [r for r in high_score if r.get('label') == 'fp']
    print(f" Class 0 (bundler FP): {len(negatives)}")

    # Class 1: records with HC types and high score
    # Accept any label (suspect, confirmed, malicious) — the HC type is the signal
    # Exclude 'unconfirmed' and 'fp' from positives
    positives = [r for r in high_score
                 if has_hc_type(r) and r.get('label') not in ('fp', 'unconfirmed')]
    print(f" Class 1 (HC malicious): {len(positives)}")

    # Optional: augment positives from extra file
    if args.positives_extra:
        if Path(args.positives_extra).exists():
            extra_records = load_jsonl(args.positives_extra)
            extra_high = [r for r in extra_records
                          if in_score_zone(r) and has_hc_type(r)]
            print(f" Extra positives from {args.positives_extra}: {len(extra_high)}")
            positives.extend(extra_high)
            print(f" Class 1 total (with extra): {len(positives)}")
        else:
            # The user asked for augmentation explicitly; say why none happened.
            print(f" [WARN] --positives-extra file not found, ignoring: "
                  f"{args.positives_extra}",
                  file=sys.stderr)

    if len(negatives) < MIN_SAMPLES:
        print(f"\nERROR: Need >= {MIN_SAMPLES} negatives (bundler FPs with score >= {score_threshold}), "
              f"got {len(negatives)}",
              file=sys.stderr)
        print(" Try lowering the score threshold with --score-threshold",
              file=sys.stderr)
        sys.exit(1)
    if len(positives) < MIN_SAMPLES:
        print(f"\nERROR: Need >= {MIN_SAMPLES} positives (HC types with score >= {score_threshold}), "
              f"got {len(positives)}",
              file=sys.stderr)
        print(" Try: --positives-extra data/ml-training-datadog-full.jsonl",
              file=sys.stderr)
        sys.exit(1)

    # Safe division: positives has at least MIN_SAMPLES entries at this point.
    ratio = len(negatives) / len(positives)
    print(f"\n Negatives: {len(negatives)}")
    print(f" Positives: {len(positives)}")
    print(f" Ratio (neg/pos): {ratio:.2f}")

    return negatives, positives
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
def align_features(negatives: list, positives: list) -> tuple:
    """
    Step 2: Project every record onto TRAINABLE_FEATURES (FEATURE_NAMES minus
    INFERENCE_EXCLUDED_FEATURES).

    Negatives come first (label 0), positives second (label 1). A feature that
    is missing from a record, or present as None, becomes 0.0.

    Returns: (X, y, stats)
        X: DataFrame with one float column per trainable feature.
        y: int array of class labels aligned with the rows of X.
        stats: dict with keys n_total, n_neg, n_pos, n_features.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(f"[Step 2/8] Aligning {len(TRAINABLE_FEATURES)} trainable features "
          f"({len(INFERENCE_EXCLUDED_FEATURES)} excluded)...")
    print(banner)

    print(f" Excluded features: {', '.join(sorted(INFERENCE_EXCLUDED_FEATURES))}")

    def to_row(record):
        # One float per trainable feature, in TRAINABLE_FEATURES order.
        cells = []
        for feat in TRAINABLE_FEATURES:
            raw = record.get(feat, 0)
            cells.append(float(0 if raw is None else raw))
        return cells

    labelled = [(rec, 0) for rec in negatives] + [(rec, 1) for rec in positives]

    X = pd.DataFrame([to_row(rec) for rec, _ in labelled],
                     columns=TRAINABLE_FEATURES)
    y = np.array([cls for _, cls in labelled], dtype=int)

    print(f" Feature matrix: {X.shape[0]} samples x {X.shape[1]} features")

    stats = {
        'n_total': len(X),
        'n_neg': int((y == 0).sum()),
        'n_pos': int((y == 1).sum()),
        'n_features': len(TRAINABLE_FEATURES),
    }
    return X, y, stats
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
def split_data(X: pd.DataFrame, y: np.ndarray) -> tuple:
    """
    Step 3: Stratified 80/20 train/test split with a fixed seed (42).

    Returns: (X_train, X_test, y_train, y_test)
    """
    banner = "=" * 60
    print("\n" + banner)
    print("[Step 3/8] Stratified train/test split (80/20, seed=42)...")
    print(banner)

    parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = parts

    # Report the size and class balance of each partition.
    for tag, features, labels in (("Train", X_train, y_train),
                                  ("Test", X_test, y_test)):
        neg = int((labels == 0).sum())
        pos = int((labels == 1).sum())
        print(f" {tag}: {len(features)} ({neg} neg, {pos} pos)")

    return X_train, X_test, y_train, y_test
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
def train_preliminary_and_shap(X_train: pd.DataFrame, y_train: np.ndarray,
                               scale_pos_weight: float,
                               top_k: int = 30) -> list:
    """
    Step 4: Preliminary training + SHAP feature selection.

    Trains a 100-round booster on every column of X_train, ranks the columns
    by mean absolute SHAP value over the training rows, and keeps the first
    ``top_k`` of that ranking (or all of them when fewer exist).

    Returns: list of selected feature names, most important first.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(f"[Step 4/8] Preliminary training + SHAP (top {top_k} from "
          f"{len(TRAINABLE_FEATURES)} features)...")
    print(banner)

    booster_params = dict(XGB_PARAMS)
    booster_params['scale_pos_weight'] = scale_pos_weight
    train_matrix = xgb.DMatrix(X_train, label=y_train,
                               feature_names=list(X_train.columns))
    prelim = xgb.train(booster_params, train_matrix, num_boost_round=100)

    shap_values = shap.TreeExplainer(prelim).shap_values(X_train)
    mean_abs_shap = np.abs(shap_values).mean(axis=0)

    # Highest mean |SHAP| first.
    ranked = sorted(zip(X_train.columns, mean_abs_shap),
                    key=lambda pair: pair[1], reverse=True)

    print(f"\n Top 20 features by SHAP importance:")
    for rank, (feat, weight) in enumerate(ranked[:20], start=1):
        print(f" {rank:2d}. {feat:40s} {weight:.6f}")

    # Cap to available features if fewer than top_k
    effective_k = min(top_k, len(ranked))
    ranked_names = [feat for feat, _ in ranked]
    selected = ranked_names[:effective_k]
    dropped = ranked_names[effective_k:]

    if dropped:
        suffix = " ..." if len(dropped) > 10 else ""
        print(f"\n Dropped {len(dropped)} features: {', '.join(dropped[:10])}"
              + suffix)

    return selected
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
def cross_validate(X_train: pd.DataFrame, y_train: np.ndarray,
                   selected_features: list,
                   scale_pos_weight: float) -> dict:
    """
    Step 5: Stratified N_FOLDS-fold CV on the selected features, followed by
    decision-threshold tuning on the out-of-fold predictions.

    Threshold policy: among thresholds 0.10..0.90 (step 0.01), pick the one
    with the highest recall on the malicious class whose precision is still
    >= 0.80; on ties the lowest such threshold wins. If none qualifies, fall
    back to 0.5. Missing real malware (false negatives) is treated as worse
    than flagging a bundler, hence recall is the quantity maximized.

    Returns: dict with keys threshold, precision, recall, f1 (all at the
    chosen threshold, over out-of-fold predictions), fold_metrics (per-fold
    metrics at 0.5) and confusion_matrix ([[TN, FP], [FN, TP]]).
    """
    print("\n" + "=" * 60)
    print(f"[Step 5/8] {N_FOLDS}-fold stratified CV ({len(selected_features)} features)...")
    print("=" * 60)

    X_sel = X_train[selected_features]
    params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    fold_metrics = []
    # Out-of-fold probabilities; every index is written exactly once because
    # the validation folds partition the training rows, so y_train itself is
    # the matching label vector.
    all_probs = np.zeros(len(y_train))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_sel, y_train), start=1):
        X_tr = X_sel.iloc[train_idx]
        X_va = X_sel.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_va = y_train[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=selected_features)
        dval = xgb.DMatrix(X_va, label=y_va, feature_names=selected_features)

        model = xgb.train(
            params, dtrain, num_boost_round=N_ESTIMATORS,
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )

        probs = model.predict(dval)
        all_probs[val_idx] = probs

        # Per-fold metrics are reported at the default 0.5 cut-off.
        preds = (probs >= 0.5).astype(int)
        p = precision_score(y_va, preds, zero_division=0)
        r = recall_score(y_va, preds, zero_division=0)
        f1 = f1_score(y_va, preds, zero_division=0)
        fold_metrics.append({'precision': p, 'recall': r, 'f1': f1})
        print(f" Fold {fold}: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    # Tune the threshold: maximize recall subject to precision >= 0.80.
    print(f"\n Optimizing threshold (maximize recall on malicious, precision >= 80%)...")
    thresholds = np.arange(0.10, 0.91, 0.01)
    best_threshold = 0.5
    best_recall = 0.0

    for t in thresholds:
        preds = (all_probs >= t).astype(int)
        r = recall_score(y_train, preds, zero_division=0)
        p = precision_score(y_train, preds, zero_division=0)
        # Strict '>' keeps the first (lowest) threshold among equal recalls.
        if p >= 0.80 and r > best_recall:
            best_recall = r
            best_threshold = float(t)

    if best_recall == 0.0:
        print(f" [WARN] No threshold achieves precision >= 80%")
        print(f" Using default threshold=0.5")
        best_threshold = 0.5

    final_preds = (all_probs >= best_threshold).astype(int)
    final_p = precision_score(y_train, final_preds, zero_division=0)
    final_r = recall_score(y_train, final_preds, zero_division=0)
    final_f1 = f1_score(y_train, final_preds, zero_division=0)
    # labels=[0, 1] pins the matrix to 2x2 so the indexing below is always valid.
    cm = confusion_matrix(y_train, final_preds, labels=[0, 1])

    print(f"\n Optimal threshold: {best_threshold:.2f}")
    print(f" CV metrics: P={final_p:.3f} R={final_r:.3f} F1={final_f1:.3f}")
    print(f" Confusion matrix:")
    print(f" TN={cm[0][0]} FP={cm[0][1]}")
    print(f" FN={cm[1][0]} TP={cm[1][1]}")

    return {
        'threshold': round(best_threshold, 3),
        'precision': round(float(final_p), 4),
        'recall': round(float(final_r), 4),
        'f1': round(float(final_f1), 4),
        'fold_metrics': fold_metrics,
        'confusion_matrix': cm.tolist()
    }
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
def train_final_model(X_train: pd.DataFrame, y_train: np.ndarray,
                      selected_features: list,
                      scale_pos_weight: float) -> xgb.Booster:
    """
    Step 6: Fit the final booster on the training set.

    A stratified 10% slice of the training rows (seed 42) is held back only to
    drive early stopping (patience 20 rounds, at most N_ESTIMATORS rounds);
    the remaining 90% is what the trees are fitted on.

    Returns: the trained xgb.Booster.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(f"[Step 6/8] Training final model ({len(selected_features)} features)...")
    print(banner)

    booster_params = dict(XGB_PARAMS)
    booster_params['scale_pos_weight'] = scale_pos_weight

    fit_X, stop_X, fit_y, stop_y = train_test_split(
        X_train[selected_features], y_train,
        test_size=0.1, stratify=y_train, random_state=42
    )

    fit_matrix = xgb.DMatrix(fit_X, label=fit_y, feature_names=selected_features)
    stop_matrix = xgb.DMatrix(stop_X, label=stop_y, feature_names=selected_features)

    model = xgb.train(
        booster_params, fit_matrix, num_boost_round=N_ESTIMATORS,
        evals=[(stop_matrix, 'early_stop')], verbose_eval=False,
        early_stopping_rounds=20
    )

    # Fall back to the round budget if the booster exposes no best_iteration.
    best_round = getattr(model, 'best_iteration', N_ESTIMATORS)
    print(f" Best iteration: {best_round}")

    return model
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
def evaluate_holdout(model: xgb.Booster, X_test: pd.DataFrame,
                     y_test: np.ndarray, selected_features: list,
                     threshold: float) -> dict:
    """
    Step 7: Evaluate on holdout test set.

    Predicts probabilities for ``X_test[selected_features]``, binarizes them
    at ``threshold`` and reports precision, recall, F1 and the confusion
    matrix. Also prints the gain-based feature importance of the model and a
    warning when the metrics look too good (possible leakage).

    Returns: dict with keys precision, recall, f1, confusion_matrix
    ([[TN, FP], [FN, TP]]), tp, fp, fn, tn.
    """
    print("\n" + "=" * 60)
    print(f"[Step 7/8] Holdout evaluation (threshold={threshold:.3f})...")
    print("=" * 60)

    X_sel = X_test[selected_features]
    dtest = xgb.DMatrix(X_sel, label=y_test, feature_names=selected_features)
    probs = model.predict(dtest)

    preds = (probs >= threshold).astype(int)
    p = precision_score(y_test, preds, zero_division=0)
    r = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)
    # labels=[0, 1] guarantees a 2x2 matrix, so the 4-way unpack below cannot
    # fail when only one class appears in both y_test and preds.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])

    tn, fp_count, fn, tp = cm.ravel()

    print(f" Precision: {p:.3f}")
    print(f" Recall: {r:.3f}")
    print(f" F1: {f1:.3f}")
    print(f" Confusion matrix:")
    print(f" TN={tn} FP={fp_count}")
    print(f" FN={fn} TP={tp}")

    # Sanity check: perfect metrics = likely leakage
    if p == 1.0 and r == 1.0:
        print(f"\n [WARNING] Perfect precision AND recall — possible data leakage!")
    elif f1 > 0.99:
        print(f"\n [WARNING] F1 > 0.99 — verify no leakage")

    # Feature importance
    importance = model.get_score(importance_type='gain')
    sorted_imp = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print(f"\n Top 20 features (gain-based):")
    for i, (name, val) in enumerate(sorted_imp[:20]):
        print(f" {i + 1:2d}. {name:40s} {val:.4f}")

    return {
        'precision': round(float(p), 4),
        'recall': round(float(r), 4),
        'f1': round(float(f1), 4),
        'confusion_matrix': cm.tolist(),
        'tp': int(tp), 'fp': int(fp_count),
        'fn': int(fn), 'tn': int(tn)
    }
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
def convert_tree(tree_json: dict, nodes: list, feature_map: dict) -> int:
    """
    Recursively convert an XGBoost tree JSON node to flat array format.
    Same format as model-trees.js.

    Each node becomes a dict appended to ``nodes``:
      - leaf:  {'f': -1, 't': 0, 'y': 0, 'n': 0, 'v': <leaf value, 6 dp>}
      - split: {'f': <feature index>, 't': <threshold, 6 dp>,
                'y': <index of yes-child>, 'n': <index of no-child>, 'v': 0}
    Nodes are laid out in pre-order (node, yes-subtree, no-subtree).

    Args:
        tree_json: One node of an XGBoost JSON tree dump.
        nodes: Output list, mutated in place.
        feature_map: Feature name -> index in the exported feature list.

    Returns:
        Index in ``nodes`` of the node created for ``tree_json``.

    Raises:
        ValueError: If a split node names a feature missing from
            ``feature_map``. Mapping it to -1 would make the split
            indistinguishable from a leaf ('f': -1) in the exported model.
            ``nodes`` may hold a ``None`` placeholder when this is raised.
    """
    idx = len(nodes)
    # Reserve this node's slot first so children receive later indices.
    nodes.append(None)

    if 'leaf' in tree_json:
        nodes[idx] = {
            'f': -1,
            't': 0,
            'y': 0,
            'n': 0,
            'v': round(tree_json['leaf'], 6)
        }
        return idx

    split_feature = tree_json.get('split', '')
    if split_feature not in feature_map:
        raise ValueError(
            f"Split feature {split_feature!r} is not in the exported feature list")
    feature_idx = feature_map[split_feature]
    threshold = tree_json.get('split_condition', 0)

    children = tree_json.get('children', [])
    yes_child = tree_json.get('yes', 0)
    no_child = tree_json.get('no', 0)

    # Match children by node id; the dump does not promise yes/no ordering.
    yes_tree = None
    no_tree = None
    for child in children:
        if child.get('nodeid') == yes_child:
            yes_tree = child
        elif child.get('nodeid') == no_child:
            no_tree = child

    # Positional fallback when ids did not match.
    if yes_tree is None and len(children) > 0:
        yes_tree = children[0]
    if no_tree is None and len(children) > 1:
        no_tree = children[1]

    # NOTE(review): a split node with a missing child points back at itself
    # (idx); confirm the JS evaluator cannot loop on such a node.
    yes_idx = convert_tree(yes_tree, nodes, feature_map) if yes_tree else idx
    no_idx = convert_tree(no_tree, nodes, feature_map) if no_tree else idx

    nodes[idx] = {
        'f': feature_idx,
        't': round(threshold, 6),
        'y': yes_idx,
        'n': no_idx,
        'v': 0
    }

    return idx
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
def export_model_bundler_js(model: xgb.Booster, selected_features: list,
                            threshold: float, output_path: str,
                            cv_metrics: dict, holdout_metrics: dict):
    """
    Step 8: Export model directly to model-bundler.js.

    Dumps every tree of ``model`` as JSON, flattens each with convert_tree()
    and writes a CommonJS module exporting
    ``{version, features, threshold, trees}``. The header comment records the
    CV and holdout metrics. The parent directory of ``output_path`` is created
    if needed so a finished training run is not lost to a missing folder.

    Args:
        model: Trained booster to export.
        selected_features: Feature names; their order defines the feature
            indices stored in the tree nodes.
        threshold: Decision threshold stored in the exported module.
        output_path: Destination .js file.
        cv_metrics: Needs keys 'precision', 'recall', 'f1'.
        holdout_metrics: Needs keys 'precision', 'recall', 'f1'.
    """
    print("\n" + "=" * 60)
    print(f"[Step 8/8] Exporting to {output_path}...")
    print("=" * 60)

    # NOTE(review): get_dump() returns all boosted trees, including any built
    # after the early-stopping best iteration, and the booster's base_score is
    # not exported — confirm the JS evaluator reproduces model.predict().
    trees_dump = model.get_dump(dump_format='json')
    feature_map = {name: idx for idx, name in enumerate(selected_features)}

    js_trees = []
    total_nodes = 0
    for tree_str in trees_dump:
        tree_json = json.loads(tree_str)
        nodes = []
        convert_tree(tree_json, nodes, feature_map)
        js_trees.append(nodes)
        total_nodes += len(nodes)

    js_model = {
        'version': 1,
        'features': selected_features,
        'threshold': threshold,
        'trees': js_trees
    }

    js_content = "".join([
        "'use strict';\n\n",
        "/**\n",
        " * Bundler detector model trees — auto-generated by src/ml/train-bundler-detector.py\n",
        f" * {len(js_trees)} trees, {len(selected_features)} features, threshold={threshold}\n",
        f" * CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}\n",
        f" * Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}\n",
        " * DO NOT EDIT MANUALLY\n",
        " */\n\n",
        f"module.exports = {json.dumps(js_model, separators=(',', ':'))};\n",
    ])

    out_file = Path(output_path)
    # Create the destination folder up front instead of failing at open().
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(js_content)

    size_kb = out_file.stat().st_size / 1024
    print(f" Trees: {len(js_trees)}")
    print(f" Total nodes: {total_nodes}")
    print(f" Features: {len(selected_features)}")
    print(f" Threshold: {threshold:.3f}")
    print(f" File size: {size_kb:.1f} KB")
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
def main():
    """Parse the CLI arguments and run the eight training steps in order.

    Exits with status 1 when the --input file does not exist. After export it
    prints a summary and warns when holdout F1 > 0.99 or holdout
    recall/precision fall below 0.80.
    """
    parser = argparse.ArgumentParser(
        description="Train MUAD'DIB bundler detector model (single-source JSONL)")
    option_specs = (
        ('--input', {
            'required': True,
            'help': 'Path to monitor JSONL (all labels)'}),
        ('--positives-extra', {
            'default': None,
            'help': 'Optional extra positives JSONL (Datadog) for class 1 augmentation'}),
        ('--output', {
            'default': 'src/ml/model-bundler.js',
            'help': 'Output JS file path (default: src/ml/model-bundler.js)'}),
        ('--top-features', {
            'type': int, 'default': 30,
            'help': 'Number of top SHAP features to select (default: 30)'}),
        ('--score-threshold', {
            'type': int, 'default': SCORE_THRESHOLD,
            'help': f'Minimum score for both classes (default: {SCORE_THRESHOLD})'}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    if not Path(args.input).exists():
        print(f"ERROR: Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    # Steps 1-2: load records and build the feature matrix.
    negatives, positives = load_and_prepare(args)
    X, y, stats = align_features(negatives, positives)

    # Class imbalance weight (negatives per positive).
    n_neg, n_pos = stats['n_neg'], stats['n_pos']
    scale_pos_weight = n_neg / max(n_pos, 1)
    print(f"\n scale_pos_weight: {scale_pos_weight:.2f}")

    # Step 3: train/test split.
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Step 4: preliminary model + SHAP feature selection.
    selected = train_preliminary_and_shap(
        X_train, y_train, scale_pos_weight, top_k=args.top_features)

    # Step 5: cross-validation and threshold tuning.
    cv_metrics = cross_validate(X_train, y_train, selected, scale_pos_weight)
    chosen_threshold = cv_metrics['threshold']

    # Step 6: final model.
    final_model = train_final_model(X_train, y_train, selected, scale_pos_weight)

    # Step 7: holdout evaluation at the CV-tuned threshold.
    holdout_metrics = evaluate_holdout(
        final_model, X_test, y_test, selected, chosen_threshold)

    # Step 8: export.
    export_model_bundler_js(
        final_model, selected, chosen_threshold,
        args.output, cv_metrics, holdout_metrics)

    # Summary
    summary_lines = (
        "\n" + "=" * 60,
        "BUNDLER DETECTOR TRAINING COMPLETE",
        "=" * 60,
        f" Samples: {n_neg} negatives (bundler FP) + {n_pos} positives (HC malicious) = {n_neg + n_pos}",
        f" Features: {len(selected)} selected (from {len(TRAINABLE_FEATURES)} trainable / {len(FEATURE_NAMES)} total)",
        f" Excluded features: {', '.join(sorted(INFERENCE_EXCLUDED_FEATURES))}",
        f" Threshold: {cv_metrics['threshold']:.3f}",
        f" CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}",
        f" Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}",
        f" Output: {args.output}",
    )
    for text in summary_lines:
        print(text)

    # Warnings
    holdout_f1 = holdout_metrics['f1']
    holdout_recall = holdout_metrics['recall']
    holdout_precision = holdout_metrics['precision']
    if holdout_f1 > 0.99:
        print(f"\n [WARNING] F1 > 0.99 — verify no data leakage")
    if holdout_recall < 0.80:
        print(f"\n [WARNING] Holdout recall {holdout_recall:.3f} < 80%")
    if holdout_precision < 0.80:
        print(f" [WARNING] Holdout precision {holdout_precision:.3f} < 80%")
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
# Run the training pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|