muaddib-scanner 2.11.76 → 2.11.77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,725 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- MUAD'DIB Bundler Detector Training Pipeline — single-source JSONL (ML2)
4
-
5
- Trains a binary XGBoost classifier to distinguish bundler false positives
6
- from true malicious packages in the high-score zone (score >= 35).
7
-
8
- Unlike ML1 (train-xgboost.py) which uses dual sources (monitor + Datadog),
9
- this model uses a SINGLE source (monitor JSONL) for both classes:
10
- - Class 0 (clean/bundler FP): label 'fp' AND score >= 35
11
- - Class 1 (malicious): HC threat types present AND score >= 35
12
-
13
- This avoids cross-source leakage entirely — both classes share the same
14
- feature distribution from the monitor pipeline.
15
-
16
- Features excluded at training time (always 0 at inference due to guard rails):
17
- - type_reverse_shell, type_binary_dropper, type_staged_binary_payload
18
- - has_typosquat, has_ioc_match
19
-
20
- Exports directly to model-bundler.js.
21
-
22
- Usage:
23
- python src/ml/train-bundler-detector.py \\
24
- --input data/ml-training.jsonl \\
25
- --output src/ml/model-bundler.js \\
26
- --top-features 30
27
-
28
- # Optional: add Datadog positives for class 1 augmentation
29
- python src/ml/train-bundler-detector.py \\
30
- --input data/ml-training.jsonl \\
31
- --positives-extra data/ml-training-datadog-full.jsonl \\
32
- --output src/ml/model-bundler.js
33
-
34
- Dependencies: see tools/requirements-ml.txt
35
- """
36
-
37
- import argparse
38
- import json
39
- import sys
40
- from pathlib import Path
41
-
42
- import numpy as np
43
- import pandas as pd
44
- import shap
45
- from sklearn.model_selection import train_test_split, StratifiedKFold
46
- from sklearn.metrics import (
47
- precision_score, recall_score, f1_score, confusion_matrix
48
- )
49
- import xgboost as xgb
50
-
51
-
52
- # --- Constants ---
53
-
54
# Identity columns to exclude from features
IDENTITY_COLS = {'name', 'version', 'ecosystem', 'timestamp', 'label', 'tier'}

# Minimum samples per class
MIN_SAMPLES = 50

# Score threshold for the bundler detector zone
SCORE_THRESHOLD = 35

# HC threat types used to construct the positive class (all in TOP_THREAT_TYPES)
BUNDLER_HC_TYPES = [
    'type_intent_credential_exfil',
    'type_intent_command_exfil',
    'type_lifecycle_shell_pipe',
    'type_reverse_shell',
    'type_cross_file_dataflow',
]

# Features to EXCLUDE from training — always 0 at inference because the
# classifier's guard rail 2a intercepts these types before the bundler model
INFERENCE_EXCLUDED_FEATURES = {
    'type_reverse_shell',          # in HC_TYPES → intercepted by guard rail
    'type_binary_dropper',         # in HC_TYPES → intercepted by guard rail
    'type_staged_binary_payload',  # in HC_TYPES → intercepted by guard rail
    'has_typosquat',               # typosquat_detected in HC_TYPES
    'has_ioc_match',               # known_malicious_* in HC_TYPES
}

# XGBoost hyperparameters (same base as ML1)
XGB_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,  # slightly shallower than ML1 (smaller dataset expected)
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'seed': 42,
    'verbosity': 0,
}

N_ESTIMATORS = 200
N_FOLDS = 5

# Hardcoded 87 features — exact copy of feature-extractor.js output keys
# v2.10.32: expanded from 71 to 87 (16 new type_* features for code exec bypasses,
# IoC, GlassWorm, obfuscation, module graph). New features are 0 in pre-existing
# JSONL records; SHAP handles sparsity gracefully.
FEATURE_NAMES = [
    # Scoring (4)
    'score', 'max_file_score', 'package_score', 'global_risk_score',
    # Severity counts (5)
    'count_total', 'count_critical', 'count_high', 'count_medium', 'count_low',
    # Distinct types (1)
    'distinct_threat_types',
    # Per-type counts (47 TOP_THREAT_TYPES + 1 other = 48)
    # --- Original 31 ---
    'type_suspicious_dataflow', 'type_env_access', 'type_sensitive_string',
    'type_dangerous_call_eval', 'type_dangerous_call_exec',
    'type_dangerous_call_function', 'type_obfuscation_detected',
    'type_high_entropy_string', 'type_dynamic_require', 'type_dynamic_import',
    'type_lifecycle_script', 'type_typosquat_detected', 'type_staged_payload',
    'type_staged_binary_payload', 'type_network_require', 'type_sandbox_evasion',
    'type_credential_regex_harvest', 'type_remote_code_load',
    'type_suspicious_domain', 'type_prototype_hook',
    'type_intent_credential_exfil', 'type_intent_command_exfil',
    'type_cross_file_dataflow', 'type_module_compile', 'type_crypto_decipher',
    'type_env_charcode_reconstruction', 'type_lifecycle_shell_pipe',
    'type_curl_exec', 'type_reverse_shell', 'type_binary_dropper',
    'type_mcp_config_injection',
    # --- Code execution bypasses (v2.9.x–v2.10.x) ---
    'type_vm_code_execution', 'type_vm_dynamic_code',
    'type_dangerous_constructor', 'type_module_load_bypass',
    'type_require_process_mainmodule', 'type_proxy_globalthis_intercept',
    'type_reflect_bind_code_execution',
    # --- IoC / supply chain ---
    'type_known_malicious_package', 'type_known_malicious_hash',
    # --- GlassWorm ---
    'type_unicode_invisible_injection', 'type_blockchain_c2_resolution',
    # --- Shell / exec ---
    'type_dangerous_exec', 'type_node_inline_exec',
    # --- Obfuscation ---
    'type_js_obfuscation_pattern',
    # --- Module graph / WASM ---
    'type_suspicious_module_sink', 'type_wasm_host_sink',
    # --- Aggregated ---
    'type_other',
    # Boolean behavioral signals (10)
    'has_lifecycle_script', 'has_network_access', 'has_obfuscation',
    'has_env_access', 'has_eval', 'has_staged_payload', 'has_typosquat',
    'has_ioc_match', 'has_intent_pair', 'has_sandbox_finding',
    # File distribution (3)
    'file_count_with_threats', 'file_score_mean', 'file_score_max',
    # Severity concentration (3)
    'severity_ratio_high', 'max_single_points', 'points_concentration',
    # Package metadata (3)
    'unpacked_size_bytes', 'dep_count', 'dev_dep_count',
    # Reputation (1)
    'reputation_factor',
    # Enriched registry metadata (9) — Phase 2a
    'package_age_days', 'weekly_downloads', 'version_count',
    'author_package_count', 'has_repository', 'readme_size',
    'file_count_total', 'has_tests', 'threat_density',
]


def _check_feature_schema() -> None:
    """Fail fast at import time if the hardcoded feature schema is inconsistent.

    Uses explicit raises instead of ``assert`` so the checks still run under
    ``python -O``. AssertionError is kept as the exception type so the failure
    mode matches the previous bare assert.
    """
    if len(FEATURE_NAMES) != 87:
        raise AssertionError(f"Expected 87 features, got {len(FEATURE_NAMES)}")

    # A duplicate paired with a missing name would keep the length at 87, and
    # duplicate names would produce duplicate DataFrame columns downstream.
    seen = set()
    duplicates = set()
    for feature in FEATURE_NAMES:
        if feature in seen:
            duplicates.add(feature)
        seen.add(feature)
    if duplicates:
        raise AssertionError(
            f"Duplicate feature names: {', '.join(sorted(duplicates))}")

    # A typo in either list would otherwise be silently ignored.
    unknown = (INFERENCE_EXCLUDED_FEATURES | set(BUNDLER_HC_TYPES)) - seen
    if unknown:
        raise AssertionError(
            f"Names not present in FEATURE_NAMES: {', '.join(sorted(unknown))}")


_check_feature_schema()

# Features available for training (after excluding inference-blocked features)
TRAINABLE_FEATURES = [f for f in FEATURE_NAMES if f not in INFERENCE_EXCLUDED_FEATURES]
166
-
167
-
168
- # --- Data loading ---
169
-
170
def load_jsonl(filepath: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than a JSON object, are skipped with a warning on stderr
    so that every returned record supports ``.get``.

    Args:
        filepath: Path to a UTF-8 encoded JSONL file.

    Returns:
        The decoded object records, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f" [WARN] Skipping malformed line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            # Valid JSON that is not an object (list, number, string, null)
            # would crash later on record.get(...), so drop it here.
            if not isinstance(record, dict):
                print(f" [WARN] Skipping non-object record on line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            records.append(record)
    return records
184
-
185
-
186
def has_hc_type(record: dict, hc_types=None) -> bool:
    """Check if a record has any HC threat type with a non-zero count.

    Args:
        record: One JSONL record mapping feature names to values.
        hc_types: Optional iterable of feature names to inspect. Defaults to
            BUNDLER_HC_TYPES when omitted.

    Returns:
        True if at least one inspected feature has a value greater than 0.
    """
    types = BUNDLER_HC_TYPES if hc_types is None else hc_types
    # A missing key or an explicit JSON null both count as 0; comparing None
    # with an int would raise TypeError.
    return any((record.get(hc_type) or 0) > 0 for hc_type in types)
192
-
193
-
194
def load_and_prepare(args) -> tuple:
    """
    Step 1: Load monitor JSONL and split into bundler classes.

    Class 0 (bundler FP): label == 'fp' AND score >= score_threshold
    Class 1 (malicious): HC type present AND score >= score_threshold,
    excluding records labelled 'fp' or 'unconfirmed'.

    Records from ``args.positives_extra`` (if given) are added to class 1 when
    they are in the score zone and carry an HC type; their labels are not
    inspected.

    Args:
        args: Parsed CLI namespace; reads ``input``, ``positives_extra`` and
            ``score_threshold``.

    Returns: (negatives, positives)

    Exits the process with status 1 when either class has fewer than
    MIN_SAMPLES records.
    """
    score_threshold = args.score_threshold

    def in_score_zone(record: dict) -> bool:
        # A missing or JSON-null score counts as 0; comparing None with an
        # int would raise TypeError.
        return (record.get('score') or 0) >= score_threshold

    print("=" * 60)
    print("[Step 1/8] Loading JSONL data...")
    print("=" * 60)

    records = load_jsonl(args.input)
    print(f" Input file: {len(records)} total records")

    # Count label distribution
    label_counts = {}
    for r in records:
        lbl = r.get('label', 'unknown')
        label_counts[lbl] = label_counts.get(lbl, 0) + 1
    print(f" Label distribution: {label_counts}")

    # Filter to score >= score_threshold
    high_score = [r for r in records if in_score_zone(r)]
    print(f" Records with score >= {score_threshold}: {len(high_score)}")

    # Class 0: FP labels with high score (bundler false positives)
    # Exclude 'unconfirmed' — not manually reviewed, may be contaminated (C1 remediation)
    negatives = [r for r in high_score if r.get('label') == 'fp']
    print(f" Class 0 (bundler FP): {len(negatives)}")

    # Class 1: records with HC types and high score
    # Accept any label (suspect, confirmed, malicious) — the HC type is the signal
    # Exclude 'unconfirmed' and 'fp' from positives
    positives = [r for r in high_score
                 if has_hc_type(r) and r.get('label') not in ('fp', 'unconfirmed')]
    print(f" Class 1 (HC malicious): {len(positives)}")

    # Optional: augment positives from extra file
    if args.positives_extra:
        if Path(args.positives_extra).exists():
            extra_records = load_jsonl(args.positives_extra)
            extra_high = [r for r in extra_records
                          if in_score_zone(r) and has_hc_type(r)]
            print(f" Extra positives from {args.positives_extra}: {len(extra_high)}")
            positives.extend(extra_high)
            print(f" Class 1 total (with extra): {len(positives)}")
        else:
            # Tell the user the requested augmentation did not happen instead
            # of silently training on a smaller positive class.
            print(f" [WARN] --positives-extra file not found, skipping: {args.positives_extra}",
                  file=sys.stderr)

    if len(negatives) < MIN_SAMPLES:
        print(f"\nERROR: Need >= {MIN_SAMPLES} negatives (bundler FPs with score >= {score_threshold}), "
              f"got {len(negatives)}",
              file=sys.stderr)
        print(" Try lowering the score threshold with --score-threshold",
              file=sys.stderr)
        sys.exit(1)
    if len(positives) < MIN_SAMPLES:
        print(f"\nERROR: Need >= {MIN_SAMPLES} positives (HC types with score >= {score_threshold}), "
              f"got {len(positives)}",
              file=sys.stderr)
        print(" Try: --positives-extra data/ml-training-datadog-full.jsonl",
              file=sys.stderr)
        sys.exit(1)

    ratio = len(negatives) / len(positives)
    print(f"\n Negatives: {len(negatives)}")
    print(f" Positives: {len(positives)}")
    print(f" Ratio (neg/pos): {ratio:.2f}")

    return negatives, positives
264
-
265
-
266
def align_features(negatives: list, positives: list) -> tuple:
    """
    Step 2: Project every record onto TRAINABLE_FEATURES.

    Negatives are labelled 0 and positives 1, negatives first. A feature that
    is missing from a record, or present as None, becomes 0.0.

    Returns: (X, y, stats) where X is a DataFrame with one column per
    trainable feature, y is an int array of labels and stats holds the
    sample and feature counts.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"[Step 2/8] Aligning {len(TRAINABLE_FEATURES)} trainable features "
          f"({len(INFERENCE_EXCLUDED_FEATURES)} excluded)...")
    print(rule)

    print(f" Excluded features: {', '.join(sorted(INFERENCE_EXCLUDED_FEATURES))}")

    def feature_value(rec: dict, name: str) -> float:
        raw = rec.get(name, 0)
        return float(0 if raw is None else raw)

    labelled = [(rec, 0) for rec in negatives] + [(rec, 1) for rec in positives]
    matrix = [[feature_value(rec, name) for name in TRAINABLE_FEATURES]
              for rec, _ in labelled]
    targets = [target for _, target in labelled]

    X = pd.DataFrame(matrix, columns=TRAINABLE_FEATURES)
    y = np.array(targets, dtype=int)

    print(f" Feature matrix: {X.shape[0]} samples x {X.shape[1]} features")

    stats = {
        'n_total': len(X),
        'n_neg': int((y == 0).sum()),
        'n_pos': int((y == 1).sum()),
        'n_features': len(TRAINABLE_FEATURES),
    }
    return X, y, stats
314
-
315
-
316
def split_data(X: pd.DataFrame, y: np.ndarray) -> tuple:
    """
    Step 3: Stratified 80/20 train/test split with a fixed seed of 42.

    Returns: (X_train, X_test, y_train, y_test)
    """
    rule = "=" * 60
    print("\n" + rule)
    print("[Step 3/8] Stratified train/test split (80/20, seed=42)...")
    print(rule)

    parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = parts

    def class_counts(labels: np.ndarray) -> tuple:
        return int((labels == 0).sum()), int((labels == 1).sum())

    train_neg, train_pos = class_counts(y_train)
    test_neg, test_pos = class_counts(y_test)
    print(f" Train: {len(X_train)} ({train_neg} neg, {train_pos} pos)")
    print(f" Test: {len(X_test)} ({test_neg} neg, {test_pos} pos)")

    return X_train, X_test, y_train, y_test
334
-
335
-
336
def train_preliminary_and_shap(X_train: pd.DataFrame, y_train: np.ndarray,
                               scale_pos_weight: float,
                               top_k: int = 30) -> list:
    """
    Step 4: Preliminary training + SHAP feature selection.

    Trains a 100-round preliminary booster on all columns of X_train, ranks
    the columns by mean absolute SHAP value over the training rows and keeps
    the top ``top_k``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        scale_pos_weight: Class-imbalance weight merged into XGB_PARAMS.
        top_k: Number of features to keep; capped at the number of columns.

    Returns:
        Selected feature names, most important first.

    Raises:
        ValueError: If ``top_k`` is less than 1.
    """
    # A zero top_k would select nothing and a negative one would slice from
    # the end of the ranking, silently dropping the least important features.
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    print("\n" + "=" * 60)
    print(f"[Step 4/8] Preliminary training + SHAP (top {top_k} from "
          f"{len(TRAINABLE_FEATURES)} features)...")
    print("=" * 60)

    params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=list(X_train.columns))
    prelim = xgb.train(params, dtrain, num_boost_round=100)

    explainer = shap.TreeExplainer(prelim)
    shap_values = explainer.shap_values(X_train)

    mean_abs_shap = np.abs(shap_values).mean(axis=0)
    importance = sorted(zip(X_train.columns, mean_abs_shap),
                        key=lambda x: x[1], reverse=True)

    print("\n Top 20 features by SHAP importance:")
    for i, (name, val) in enumerate(importance[:20]):
        print(f" {i + 1:2d}. {name:40s} {val:.6f}")

    # Cap to available features if fewer than top_k
    effective_k = min(top_k, len(importance))
    selected = [name for name, _ in importance[:effective_k]]

    dropped = [name for name, _ in importance[effective_k:]]
    if dropped:
        print(f"\n Dropped {len(dropped)} features: {', '.join(dropped[:10])}"
              + (" ..." if len(dropped) > 10 else ""))

    return selected
372
-
373
-
374
def cross_validate(X_train: pd.DataFrame, y_train: np.ndarray,
                   selected_features: list,
                   scale_pos_weight: float) -> dict:
    """
    Step 5: Stratified N_FOLDS-fold CV on the selected features.

    Each fold trains with early stopping against its own validation fold and
    stores that fold's predicted probabilities. The decision threshold is then
    chosen on the pooled out-of-fold probabilities: the threshold in
    [0.10, 0.90] with the highest recall among those whose precision is at
    least 0.80. If none qualifies, 0.5 is used.

    Returns:
        Dict with the chosen ``threshold``, pooled ``precision``/``recall``/
        ``f1`` at that threshold, per-fold metrics at 0.5 (``fold_metrics``)
        and the pooled ``confusion_matrix``.
    """
    print("\n" + "=" * 60)
    print(f"[Step 5/8] {N_FOLDS}-fold stratified CV ({len(selected_features)} features)...")
    print("=" * 60)

    X_sel = X_train[selected_features]
    params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    fold_metrics = []
    all_probs = np.zeros(len(y_train))
    all_labels = np.zeros(len(y_train))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_sel, y_train)):
        X_tr = X_sel.iloc[train_idx]
        X_va = X_sel.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_va = y_train[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=selected_features)
        dval = xgb.DMatrix(X_va, label=y_va, feature_names=selected_features)

        model = xgb.train(
            params, dtrain, num_boost_round=N_ESTIMATORS,
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )

        # Out-of-fold probabilities, pooled for threshold selection below.
        probs = model.predict(dval)
        all_probs[val_idx] = probs
        all_labels[val_idx] = y_va

        preds = (probs >= 0.5).astype(int)
        p = precision_score(y_va, preds, zero_division=0)
        r = recall_score(y_va, preds, zero_division=0)
        f1 = f1_score(y_va, preds, zero_division=0)
        fold_metrics.append({'precision': p, 'recall': r, 'f1': f1})
        print(f" Fold {fold + 1}: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    # Threshold selection: maximize recall subject to precision >= 0.80.
    # For the bundler detector, false negatives (missing real malware) are
    # worse than false positives (flagging a bundler as malicious).
    print("\n Optimizing threshold (maximize recall on malicious, precision >= 80%)...")
    thresholds = np.arange(0.10, 0.91, 0.01)
    best_threshold = 0.5
    best_recall = 0.0

    for t in thresholds:
        preds = (all_probs >= t).astype(int)
        r = recall_score(all_labels, preds, zero_division=0)
        p = precision_score(all_labels, preds, zero_division=0)
        # Strict '>' keeps the lowest threshold among equal-recall candidates.
        if p >= 0.80 and r > best_recall:
            best_recall = r
            best_threshold = float(t)

    if best_recall == 0.0:
        print(" [WARN] No threshold achieves precision >= 80%")
        print(" Using default threshold=0.5")
        best_threshold = 0.5

    final_preds = (all_probs >= best_threshold).astype(int)
    final_p = precision_score(all_labels, final_preds, zero_division=0)
    final_r = recall_score(all_labels, final_preds, zero_division=0)
    final_f1 = f1_score(all_labels, final_preds, zero_division=0)
    cm = confusion_matrix(all_labels, final_preds)

    print(f"\n Optimal threshold: {best_threshold:.2f}")
    print(f" CV metrics: P={final_p:.3f} R={final_r:.3f} F1={final_f1:.3f}")
    print(" Confusion matrix:")
    print(f" TN={cm[0][0]} FP={cm[0][1]}")
    print(f" FN={cm[1][0]} TP={cm[1][1]}")

    return {
        'threshold': round(best_threshold, 3),
        'precision': round(float(final_p), 4),
        'recall': round(float(final_r), 4),
        'f1': round(float(final_f1), 4),
        'fold_metrics': fold_metrics,
        'confusion_matrix': cm.tolist()
    }
461
-
462
-
463
def train_final_model(X_train: pd.DataFrame, y_train: np.ndarray,
                      selected_features: list,
                      scale_pos_weight: float) -> xgb.Booster:
    """
    Step 6: Fit the final booster on the training set.

    A stratified 10% slice of the training data is held out and used only as
    the early-stopping evaluation set; the remaining 90% is fitted.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"[Step 6/8] Training final model ({len(selected_features)} features)...")
    print(rule)

    fit_X, stop_X, fit_y, stop_y = train_test_split(
        X_train[selected_features], y_train,
        test_size=0.1, stratify=y_train, random_state=42
    )

    fit_matrix = xgb.DMatrix(fit_X, label=fit_y, feature_names=selected_features)
    stop_matrix = xgb.DMatrix(stop_X, label=stop_y, feature_names=selected_features)

    booster_params = dict(XGB_PARAMS)
    booster_params['scale_pos_weight'] = scale_pos_weight

    model = xgb.train(
        booster_params, fit_matrix, num_boost_round=N_ESTIMATORS,
        evals=[(stop_matrix, 'early_stop')], verbose_eval=False,
        early_stopping_rounds=20
    )

    # Fall back to the configured round count when no best iteration is set.
    best_round = getattr(model, 'best_iteration', N_ESTIMATORS)
    print(f" Best iteration: {best_round}")

    return model
493
-
494
-
495
def evaluate_holdout(model: xgb.Booster, X_test: pd.DataFrame,
                     y_test: np.ndarray, selected_features: list,
                     threshold: float) -> dict:
    """
    Step 7: Evaluate on holdout test set.

    Args:
        model: Trained booster.
        X_test: Holdout features; only ``selected_features`` columns are used.
        y_test: Holdout labels (0/1).
        selected_features: Feature names the model was trained on.
        threshold: Probability at or above which a sample is predicted 1.

    Returns:
        Dict with rounded ``precision``/``recall``/``f1``, the 2x2
        ``confusion_matrix`` and its ``tp``/``fp``/``fn``/``tn`` cells.
    """
    print("\n" + "=" * 60)
    print(f"[Step 7/8] Holdout evaluation (threshold={threshold:.3f})...")
    print("=" * 60)

    X_sel = X_test[selected_features]
    dtest = xgb.DMatrix(X_sel, label=y_test, feature_names=selected_features)
    probs = model.predict(dtest)

    preds = (probs >= threshold).astype(int)
    p = precision_score(y_test, preds, zero_division=0)
    r = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)
    # Pin both labels so the matrix is always 2x2; otherwise the 4-way unpack
    # below fails when only one class occurs in y_test and preds.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])

    tn, fp_count, fn, tp = cm.ravel()

    print(f" Precision: {p:.3f}")
    print(f" Recall: {r:.3f}")
    print(f" F1: {f1:.3f}")
    print(" Confusion matrix:")
    print(f" TN={tn} FP={fp_count}")
    print(f" FN={fn} TP={tp}")

    # Sanity check: perfect metrics = likely leakage
    if p == 1.0 and r == 1.0:
        print("\n [WARNING] Perfect precision AND recall — possible data leakage!")
    elif f1 > 0.99:
        print("\n [WARNING] F1 > 0.99 — verify no leakage")

    # Feature importance
    importance = model.get_score(importance_type='gain')
    sorted_imp = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print("\n Top 20 features (gain-based):")
    for i, (name, val) in enumerate(sorted_imp[:20]):
        print(f" {i + 1:2d}. {name:40s} {val:.4f}")

    return {
        'precision': round(float(p), 4),
        'recall': round(float(r), 4),
        'f1': round(float(f1), 4),
        'confusion_matrix': cm.tolist(),
        'tp': int(tp), 'fp': int(fp_count),
        'fn': int(fn), 'tn': int(tn)
    }
545
-
546
-
547
def convert_tree(tree_json: dict, nodes: list, feature_map: dict) -> int:
    """
    Recursively convert an XGBoost tree JSON node to flat array format.
    Same format as model-trees.js.

    Each node is appended to ``nodes`` as a dict with keys:
      f: feature index for a split, -1 for a leaf
      t: split threshold (0 for a leaf)
      y / n: indices in ``nodes`` of the 'yes' / 'no' children (0 for a leaf)
      v: leaf value (0 for a split)

    Args:
        tree_json: One node of the parsed tree dump.
        nodes: Output list, mutated in place.
        feature_map: Maps feature name to feature index.

    Returns:
        Index in ``nodes`` of the node created for ``tree_json``.

    Raises:
        ValueError: If a split names a feature missing from ``feature_map``
            or does not have both children. Either case would otherwise
            export a silently corrupt tree.
    """
    idx = len(nodes)
    nodes.append(None)  # reserve this slot so children get later indices

    if 'leaf' in tree_json:
        nodes[idx] = {
            'f': -1,
            't': 0,
            'y': 0,
            'n': 0,
            'v': round(tree_json['leaf'], 6)
        }
        return idx

    split_feature = tree_json.get('split', '')
    if split_feature not in feature_map:
        # -1 is the leaf sentinel, so emitting it for a split would make the
        # node indistinguishable from a zero-valued leaf.
        raise ValueError(
            f"Split feature {split_feature!r} is not in the feature map")
    feature_idx = feature_map[split_feature]
    threshold = tree_json.get('split_condition', 0)

    children = tree_json.get('children', [])
    yes_child = tree_json.get('yes', 0)
    no_child = tree_json.get('no', 0)

    yes_tree = None
    no_tree = None
    for child in children:
        if child.get('nodeid') == yes_child:
            yes_tree = child
        elif child.get('nodeid') == no_child:
            no_tree = child

    # Fall back to positional order when node ids do not match.
    if yes_tree is None and len(children) > 0:
        yes_tree = children[0]
    if no_tree is None and len(children) > 1:
        no_tree = children[1]

    if not yes_tree or not no_tree:
        # A missing child used to become a self-referencing index.
        raise ValueError(
            f"Split node {tree_json.get('nodeid')!r} does not have both children")

    yes_idx = convert_tree(yes_tree, nodes, feature_map)
    no_idx = convert_tree(no_tree, nodes, feature_map)

    nodes[idx] = {
        'f': feature_idx,
        't': round(threshold, 6),
        'y': yes_idx,
        'n': no_idx,
        'v': 0
    }

    return idx
597
-
598
-
599
def export_model_bundler_js(model: xgb.Booster, selected_features: list,
                            threshold: float, output_path: str,
                            cv_metrics: dict, holdout_metrics: dict):
    """
    Step 8: Export model directly to model-bundler.js.

    Dumps every tree of ``model`` as JSON, flattens each with convert_tree and
    writes a CommonJS module exporting
    ``{version, features, threshold, trees}`` preceded by a header comment
    with the CV and holdout metrics.

    Args:
        model: Trained booster.
        selected_features: Feature names, in the index order used by the trees.
        threshold: Decision threshold stored in the exported model.
        output_path: Destination JS file; missing parent directories are
            created.
        cv_metrics: Needs 'precision', 'recall' and 'f1'.
        holdout_metrics: Needs 'precision', 'recall' and 'f1'.
    """
    print("\n" + "=" * 60)
    print(f"[Step 8/8] Exporting to {output_path}...")
    print("=" * 60)

    trees_dump = model.get_dump(dump_format='json')
    feature_map = {name: idx for idx, name in enumerate(selected_features)}

    js_trees = []
    for tree_str in trees_dump:
        nodes = []
        convert_tree(json.loads(tree_str), nodes, feature_map)
        js_trees.append(nodes)
    total_nodes = sum(len(nodes) for nodes in js_trees)

    js_model = {
        'version': 1,
        'features': selected_features,
        'threshold': threshold,
        'trees': js_trees
    }

    js_lines = [
        "'use strict';",
        "",
        "/**",
        " * Bundler detector model trees — auto-generated by src/ml/train-bundler-detector.py",
        f" * {len(js_trees)} trees, {len(selected_features)} features, threshold={threshold}",
        f" * CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}",
        f" * Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}",
        " * DO NOT EDIT MANUALLY",
        " */",
        "",
        f"module.exports = {json.dumps(js_model, separators=(',', ':'))};",
        "",  # trailing newline
    ]
    js_content = "\n".join(js_lines)

    # Create the destination directory up front so a missing folder does not
    # throw away a finished training run at the very last step.
    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)

    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(js_content)

    size_kb = out_file.stat().st_size / 1024
    print(f" Trees: {len(js_trees)}")
    print(f" Total nodes: {total_nodes}")
    print(f" Features: {len(selected_features)}")
    print(f" Threshold: {threshold:.3f}")
    print(f" File size: {size_kb:.1f} KB")
647
-
648
-
649
def main():
    """CLI entry point: run the 8-step bundler detector training pipeline.

    Parses arguments, then loads data, aligns features, splits, selects
    features with SHAP, cross-validates to pick a threshold, trains the final
    model, evaluates on the holdout set, exports the JS model and prints a
    summary. Exits with an error for a missing input file or an invalid
    --top-features value.
    """
    parser = argparse.ArgumentParser(
        description='Train MUAD\'DIB bundler detector model (single-source JSONL)')
    parser.add_argument('--input', required=True,
                        help='Path to monitor JSONL (all labels)')
    parser.add_argument('--positives-extra', default=None,
                        help='Optional extra positives JSONL (Datadog) for class 1 augmentation')
    parser.add_argument('--output', default='src/ml/model-bundler.js',
                        help='Output JS file path (default: src/ml/model-bundler.js)')
    parser.add_argument('--top-features', type=int, default=30,
                        help='Number of top SHAP features to select (default: 30)')
    parser.add_argument('--score-threshold', type=int, default=SCORE_THRESHOLD,
                        help=f'Minimum score for both classes (default: {SCORE_THRESHOLD})')
    args = parser.parse_args()

    # Reject values that would select no features (0) or silently slice from
    # the end of the SHAP ranking (negative).
    if args.top_features < 1:
        parser.error(f"--top-features must be >= 1, got {args.top_features}")

    if not Path(args.input).exists():
        print(f"ERROR: Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    # Step 1: Load data
    negatives, positives = load_and_prepare(args)

    # Step 2: Align features
    X, y, stats = align_features(negatives, positives)

    # Class imbalance weight
    n_neg = stats['n_neg']
    n_pos = stats['n_pos']
    scale_pos_weight = n_neg / max(n_pos, 1)
    print(f"\n scale_pos_weight: {scale_pos_weight:.2f}")

    # Step 3: Train/test split
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Step 4: Preliminary + SHAP
    selected = train_preliminary_and_shap(
        X_train, y_train, scale_pos_weight,
        top_k=args.top_features)

    # Step 5: Cross-validation
    cv_metrics = cross_validate(X_train, y_train, selected, scale_pos_weight)

    # Step 6: Final model
    final_model = train_final_model(X_train, y_train, selected, scale_pos_weight)

    # Step 7: Holdout evaluation
    holdout_metrics = evaluate_holdout(
        final_model, X_test, y_test, selected, cv_metrics['threshold'])

    # Step 8: Export
    export_model_bundler_js(
        final_model, selected, cv_metrics['threshold'],
        args.output, cv_metrics, holdout_metrics)

    # Summary
    print("\n" + "=" * 60)
    print("BUNDLER DETECTOR TRAINING COMPLETE")
    print("=" * 60)
    print(f" Samples: {n_neg} negatives (bundler FP) + {n_pos} positives (HC malicious) = {n_neg + n_pos}")
    print(f" Features: {len(selected)} selected (from {len(TRAINABLE_FEATURES)} trainable / {len(FEATURE_NAMES)} total)")
    print(f" Excluded features: {', '.join(sorted(INFERENCE_EXCLUDED_FEATURES))}")
    print(f" Threshold: {cv_metrics['threshold']:.3f}")
    print(f" CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}")
    print(f" Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}")
    print(f" Output: {args.output}")

    # Warnings
    if holdout_metrics['f1'] > 0.99:
        print("\n [WARNING] F1 > 0.99 — verify no data leakage")
    if holdout_metrics['recall'] < 0.80:
        print(f"\n [WARNING] Holdout recall {holdout_metrics['recall']:.3f} < 80%")
    if holdout_metrics['precision'] < 0.80:
        print(f" [WARNING] Holdout precision {holdout_metrics['precision']:.3f} < 80%")


if __name__ == '__main__':
    main()