muaddib-scanner 2.11.76 → 2.11.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/.githooks/pre-commit +18 -0
  2. package/README.md +15 -6
  3. package/bin/muaddib.js +18 -4
  4. package/package.json +1 -2
  5. package/{self-scan-v2.11.76.json → self-scan-v2.11.78.json} +1 -1
  6. package/src/commands/interactive.js +5 -6
  7. package/src/commands/safe-install.js +19 -19
  8. package/src/ioc/scraper.js +46 -10
  9. package/src/monitor/daemon.js +39 -28
  10. package/src/monitor/ingestion.js +32 -2
  11. package/src/monitor/queue.js +84 -21
  12. package/src/monitor/scan-queue.js +68 -1
  13. package/src/monitor/state.js +24 -1
  14. package/src/monitor/webhook.js +32 -11
  15. package/src/output/formatter.js +3 -4
  16. package/src/pipeline/executor.js +9 -1
  17. package/src/runtime/daemon.js +27 -28
  18. package/src/runtime/watch.js +7 -7
  19. package/src/sandbox/index.js +11 -9
  20. package/src/scanner/temporal-analysis.js +8 -0
  21. package/src/scanner/temporal-ast-diff.js +5 -0
  22. package/src/utils.js +60 -1
  23. package/.dockerignore +0 -7
  24. package/.env.example +0 -43
  25. package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
  26. package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
  27. package/ml-retrain/auto-labeler/labeler.py +0 -256
  28. package/ml-retrain/auto-labeler/npm_checker.py +0 -228
  29. package/ml-retrain/auto-labeler/ossf_index.py +0 -178
  30. package/ml-retrain/auto-labeler/requirements.txt +0 -1
  31. package/ml-retrain/confusion-matrix.png +0 -0
  32. package/ml-retrain/model-trees-retrained.js +0 -12
  33. package/ml-retrain/retrain-report.json +0 -225
  34. package/ml-retrain/retrain.py +0 -974
  35. package/sbom.json +0 -0
  36. package/src/ml/train-bundler-detector.py +0 -725
  37. package/src/ml/train-xgboost.py +0 -957
  38. package/tools/export-model-js.py +0 -160
  39. package/tools/requirements-ml.txt +0 -5
  40. package/tools/train-classifier.py +0 -333
@@ -1,957 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- MUAD'DIB XGBoost Training Pipeline — dual-source JSONL
4
-
5
- Trains a binary XGBoost classifier on two JSONL files:
6
- - negatives: monitor output (labels clean/fp → 0)
7
- - positives: Datadog malware corpus (label malicious → 1)
8
-
9
- Exports directly to model-trees.js (no intermediate model.json).
10
-
11
- Usage:
12
- python src/ml/train-xgboost.py \
13
- --negatives data/ml-training.jsonl \
14
- --positives data/ml-training-datadog.jsonl \
15
- --output src/ml/model-trees.js \
16
- --top-features 40
17
-
18
- Dependencies: see tools/requirements-ml.txt
19
- """
20
-
21
- import argparse
22
- import json
23
- import sys
24
- from pathlib import Path
25
-
26
- import numpy as np
27
- import pandas as pd
28
- import shap
29
- from sklearn.model_selection import train_test_split, StratifiedKFold
30
- from sklearn.metrics import (
31
- precision_score, recall_score, f1_score, confusion_matrix
32
- )
33
- import xgboost as xgb
34
-
35
-
36
# --- Constants ---

# Identity columns to exclude from features
# NOTE(review): not referenced by any function visible in this part of the
# file — confirm it is still used further down before relying on it.
IDENTITY_COLS = {'name', 'version', 'ecosystem', 'timestamp', 'label', 'tier'}

# Minimum samples per class
# load_and_prepare() exits with status 1 when either class has fewer records.
MIN_SAMPLES = 50

# XGBoost hyperparameters (aligned with tools/train-classifier.py)
# Shared base for the preliminary, cross-validation and final models; each
# caller adds its own 'scale_pos_weight' on top of a copy of this dict.
XGB_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'seed': 42,
    'verbosity': 0,
}

# Upper bound on boosting rounds for the CV folds and the final model
# (both also pass early_stopping_rounds=20).
N_ESTIMATORS = 200
# Number of StratifiedKFold splits used by cross_validate().
N_FOLDS = 5
62
-
63
# Hardcoded 87 features — exact copy of feature-extractor.js output keys
# v2.10.32: expanded from 71 to 87 (16 new type_* features for code exec bypasses,
# IoC, GlassWorm, obfuscation, module graph). New features are 0 in pre-existing
# JSONL records; SHAP handles sparsity gracefully.
#
# The order of this list defines the column order of the feature matrix built
# by align_features().
FEATURE_NAMES = [
    # Scoring (4)
    'score', 'max_file_score', 'package_score', 'global_risk_score',
    # Severity counts (5)
    'count_total', 'count_critical', 'count_high', 'count_medium', 'count_low',
    # Distinct types (1)
    'distinct_threat_types',
    # Per-type counts (47 TOP_THREAT_TYPES + 1 other = 48)
    # --- Original 31 ---
    'type_suspicious_dataflow', 'type_env_access', 'type_sensitive_string',
    'type_dangerous_call_eval', 'type_dangerous_call_exec',
    'type_dangerous_call_function', 'type_obfuscation_detected',
    'type_high_entropy_string', 'type_dynamic_require', 'type_dynamic_import',
    'type_lifecycle_script', 'type_typosquat_detected', 'type_staged_payload',
    'type_staged_binary_payload', 'type_network_require', 'type_sandbox_evasion',
    'type_credential_regex_harvest', 'type_remote_code_load',
    'type_suspicious_domain', 'type_prototype_hook',
    'type_intent_credential_exfil', 'type_intent_command_exfil',
    'type_cross_file_dataflow', 'type_module_compile', 'type_crypto_decipher',
    'type_env_charcode_reconstruction', 'type_lifecycle_shell_pipe',
    'type_curl_exec', 'type_reverse_shell', 'type_binary_dropper',
    'type_mcp_config_injection',
    # --- Code execution bypasses (v2.9.x–v2.10.x) ---
    'type_vm_code_execution', 'type_vm_dynamic_code',
    'type_dangerous_constructor', 'type_module_load_bypass',
    'type_require_process_mainmodule', 'type_proxy_globalthis_intercept',
    'type_reflect_bind_code_execution',
    # --- IoC / supply chain ---
    'type_known_malicious_package', 'type_known_malicious_hash',
    # --- GlassWorm ---
    'type_unicode_invisible_injection', 'type_blockchain_c2_resolution',
    # --- Shell / exec ---
    'type_dangerous_exec', 'type_node_inline_exec',
    # --- Obfuscation ---
    'type_js_obfuscation_pattern',
    # --- Module graph / WASM ---
    'type_suspicious_module_sink', 'type_wasm_host_sink',
    # --- Aggregated ---
    'type_other',
    # Boolean behavioral signals (10)
    'has_lifecycle_script', 'has_network_access', 'has_obfuscation',
    'has_env_access', 'has_eval', 'has_staged_payload', 'has_typosquat',
    'has_ioc_match', 'has_intent_pair', 'has_sandbox_finding',
    # File distribution (3)
    'file_count_with_threats', 'file_score_mean', 'file_score_max',
    # Severity concentration (3)
    'severity_ratio_high', 'max_single_points', 'points_concentration',
    # Package metadata (3)
    'unpacked_size_bytes', 'dep_count', 'dev_dep_count',
    # Reputation (1)
    'reputation_factor',
    # Enriched registry metadata (9) — Phase 2a
    'package_age_days', 'weekly_downloads', 'version_count',
    'author_package_count', 'has_repository', 'readme_size',
    'file_count_total', 'has_tests', 'threat_density',
]

# Validate the list with explicit raises rather than `assert`: asserts are
# removed when Python runs with -O, which would let a miscounted or duplicated
# feature list through silently.
if len(FEATURE_NAMES) != 87:
    raise ValueError(f"Expected 87 features, got {len(FEATURE_NAMES)}")
if len(set(FEATURE_NAMES)) != len(FEATURE_NAMES):
    raise ValueError("FEATURE_NAMES contains duplicate entries")
125
-
126
# Features to exclude: metadata/source-identity proxies that differ between
# monitor (negatives) and Datadog (positives) for non-behavioral reasons.
# See corrected retrain plan for full justification of each exclusion.
#
# Every name below also appears in FEATURE_NAMES (14 entries in total).
# NOTE(review): the code that applies this set ("Step 2a" in the comments of
# later functions) is not visible in this part of the file — confirm there.
EXCLUDED_METADATA = {
    # Score features — direct label leak by construction of the labeling
    # pipeline in monitor/queue.js: all negatives (label='clean') have
    # score<20 by definition (0 findings or T3-only), while all positives
    # (Datadog malware corpus) have score≥20. Leaving these as model features
    # gives xgboost a trivial shortcut: split on 'score', ignore all
    # behavioral signals. The model then fails to generalize on high-score
    # legitimate packages (playwright-core, @salesforce/cli, webpack, ...)
    # because it never had to learn the behavioral signature.
    # Removing them forces the model to learn from type_*/has_*/severity_ratio_*
    # — the actual behavioral ground truth. count_* severity counts are kept
    # because they reflect threat distribution, not aggregated risk.
    'score', 'max_file_score', 'package_score', 'global_risk_score',
    # npm registry metadata — always 0 in Datadog positives (not fetched),
    # 8-13% non-zero in monitor negatives → source leak
    'package_age_days', 'weekly_downloads', 'version_count',
    'author_package_count', 'has_repository', 'readme_size',
    # Derived from corrupted npm metadata (age_days, version_count, downloads).
    # Currently zero-variance (always 1.0) but becomes a leak when future
    # records have actual computed values.
    'reputation_factor',
    # Package-level metadata not from behavioral scan —
    # 88-95% non-zero in negatives, 0% in positives → massive source proxy
    'unpacked_size_bytes', 'file_count_total',
    # 13% non-zero in negatives, 0% in positives → source proxy
    'has_tests',
}
156
-
157
-
158
- # --- Data loading ---
159
-
160
def load_jsonl(filepath: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are ignored. Lines that are not valid JSON, or that are valid
    JSON but not an object (e.g. a bare number or an array), are skipped with
    a warning on stderr so that callers can safely use ``record.get(...)`` on
    every returned element.

    Args:
        filepath: path of the UTF-8 encoded JSONL file to read.

    Returns:
        The parsed records, in file order.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f" [WARN] Skipping malformed line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            # Downstream code treats every record as a mapping; drop anything else
            # here instead of failing later with an AttributeError.
            if not isinstance(record, dict):
                print(f" [WARN] Skipping non-object line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            records.append(record)
    return records
174
-
175
-
176
def load_and_prepare(args) -> tuple:
    """
    Step 1: Load the two JSONL files and select the training records.

    Args:
        args: object exposing ``negatives`` and ``positives`` attributes, each
            a path handed to load_jsonl().

    Returns: (negatives, positives) — two lists of record dicts. Negatives are
        restricted to the labels in VALID_NEGATIVE_LABELS; every record of the
        positives file is kept regardless of its label.

    Exits the process with status 1 when either list has fewer than
    MIN_SAMPLES records.
    """
    def _label_histogram(records):
        # Count records per 'label' value; a missing label counts as 'unknown'.
        counts = {}
        for r in records:
            lbl = r.get('label', 'unknown')
            counts[lbl] = counts.get(lbl, 0) + 1
        return counts

    print("=" * 60)
    print("[Step 1/8] Loading JSONL data...")
    print("=" * 60)

    # Load negatives (clean/fp → 0)
    neg_records = load_jsonl(args.negatives)
    print(f" Negatives file: {len(neg_records)} total records")

    # Filter negatives: accept three label classes, all with verified ground truth.
    #   'clean'          — scanner found 0 findings or T3-only (passive signals).
    #                      Labelled by monitor/queue.js, low-score by construction.
    #   'fp'             — manually reviewed false positive. Requires manualReview=true
    #                      flag in jsonl-writer.js (defense-in-depth vs C1 contamination).
    #   'curated_benign' — hand-curated popular legitimate packages scanned via
    #                      scripts/scan-benign-training.js. These are the ONLY source
    #                      of high-score negatives: playwright-core, webpack, next,
    #                      @salesforce/cli, etc. trip behavioral heuristics while
    #                      being verifiably benign. Without them the model has no
    #                      high-score negatives and cannot generalize to complex
    #                      legitimate tooling.
    # Explicitly EXCLUDED: 'suspect', 'unconfirmed', 'ml_clean', 'llm_benign',
    # 'likely_benign', 'removed_unlabeled' — these are either uncertain or were
    # auto-labelled by sandbox-clean heuristics (see C1 remediation: 8176 records
    # contaminated before the manualReview gate was added).
    neg_label_counts = _label_histogram(neg_records)
    print(f" Negative label distribution: {neg_label_counts}")

    VALID_NEGATIVE_LABELS = ('clean', 'fp', 'curated_benign')
    negatives = [r for r in neg_records if r.get('label') in VALID_NEGATIVE_LABELS]
    # The per-label totals are already in the histogram; no need to rescan.
    n_clean = neg_label_counts.get('clean', 0)
    n_fp = neg_label_counts.get('fp', 0)
    n_curated = neg_label_counts.get('curated_benign', 0)
    n_unconfirmed = neg_label_counts.get('unconfirmed', 0)
    print(f" Kept {len(negatives)} negatives "
          f"(clean={n_clean}, fp={n_fp}, curated_benign={n_curated})")
    if n_unconfirmed > 0:
        print(f" Excluded {n_unconfirmed} 'unconfirmed' records (not manually reviewed)")

    # Load positives (malicious → 1)
    pos_records = load_jsonl(args.positives)
    print(f" Positives file: {len(pos_records)} total records")

    pos_label_counts = _label_histogram(pos_records)
    print(f" Positive label distribution: {pos_label_counts}")

    # All entries of the positives file are treated as malicious; the label
    # histogram above is informational only and does not filter anything.
    positives = pos_records

    if len(negatives) < MIN_SAMPLES:
        print(f"ERROR: Need >= {MIN_SAMPLES} negatives, got {len(negatives)}",
              file=sys.stderr)
        sys.exit(1)
    if len(positives) < MIN_SAMPLES:
        print(f"ERROR: Need >= {MIN_SAMPLES} positives, got {len(positives)}",
              file=sys.stderr)
        sys.exit(1)

    # Safe division: the MIN_SAMPLES guard above guarantees positives is non-empty.
    ratio = len(negatives) / len(positives)
    print(f"\n Negatives: {len(negatives)}")
    print(f" Positives: {len(positives)}")
    print(f" Ratio (neg/pos): {ratio:.2f}")

    return negatives, positives
250
-
251
-
252
def align_features(negatives: list, positives: list) -> tuple:
    """
    Step 2: Project every record onto the hardcoded FEATURE_NAMES columns.

    Negatives receive label 0 and positives label 1, in that order. A feature
    that is absent from a record, or present with value None, becomes 0.0 and
    is tallied as "missing" in the coverage report printed to stdout.

    Returns: (X: pd.DataFrame, y: np.ndarray, stats: dict)
    """
    print("\n" + "=" * 60)
    print("[Step 2/8] Aligning 87 features...")
    print("=" * 60)

    # Attach the binary label to each record, negatives first.
    labelled = [(rec, 0) for rec in negatives] + [(rec, 1) for rec in positives]

    rows = []
    labels = []
    # Feature-cell tallies, keyed by class label.
    present = {0: 0, 1: 0}
    missing = {0: 0, 1: 0}

    for record, label in labelled:
        row = []
        for feat in FEATURE_NAMES:
            raw = record.get(feat)
            if raw is None:
                # Absent key and explicit None are both treated as zero.
                missing[label] += 1
                row.append(float(0))
            else:
                present[label] += 1
                row.append(float(raw))
        rows.append(row)
        labels.append(label)

    X = pd.DataFrame(rows, columns=FEATURE_NAMES)
    y = np.array(labels, dtype=int)

    n_neg = int((y == 0).sum())
    n_pos = int((y == 1).sum())

    neg_present, neg_missing = present[0], missing[0]
    pos_present, pos_missing = present[1], missing[1]

    print(f" Feature matrix: {X.shape[0]} samples x {X.shape[1]} features")
    print(f" Negatives: {neg_present} present, {neg_missing} missing "
          f"({neg_present / max(neg_present + neg_missing, 1) * 100:.1f}% coverage)")
    print(f" Positives: {pos_present} present, {pos_missing} missing "
          f"({pos_present / max(pos_present + pos_missing, 1) * 100:.1f}% coverage)")

    stats = {
        'n_total': len(X),
        'n_neg': n_neg,
        'n_pos': n_pos,
        'n_features': len(FEATURE_NAMES),
    }

    return X, y, stats
321
-
322
-
323
def filter_leaky_features(X: pd.DataFrame, y: np.ndarray,
                          min_coverage: float = 0.001,
                          leak_threshold: float = 0.99) -> tuple:
    """
    Step 2b: Remove dead features and source-identity leaks.

    A feature is dropped ONLY if:
    - DEAD: non-zero in < min_coverage of ALL samples (both classes combined)
    - LEAKY: non-zero in >= leak_threshold of one class AND < min_coverage of
      the other (proxy for data source, not malware signal)

    Features that are 0% in negatives but high (yet below leak_threshold) in
    positives are KEPT — that's discriminative, not leaky (e.g., count_critical,
    type_* features are legitimately 0 in clean packages).

    Args:
        X: feature matrix, one column per candidate feature.
        y: binary labels aligned with the rows of X (0 = negative, 1 = positive).
        min_coverage: non-zero fraction below which a feature counts as absent
            (default 0.001 = 0.1%).
        leak_threshold: non-zero fraction at or above which a feature counts as
            saturated in a class (default 0.99 = 99%).

    Returns: (X_filtered, active_features) — X restricted to the retained
        columns, and the retained column names in their original order.
    """
    print("\n" + "=" * 60)
    print("[Step 2b/8] Filtering dead/leaky features...")
    print("=" * 60)

    # Accept any array-like for y; a plain list would make `y == 0` a scalar.
    y = np.asarray(y)
    neg_mask = y == 0
    pos_mask = y == 1
    n_neg = int(neg_mask.sum())
    n_pos = int(pos_mask.sum())
    n_total = n_neg + n_pos

    retained = []
    excluded = []

    # Iterate over columns actually present in X (metadata may have been
    # dropped by Step 2a before this function is called).
    available_features = list(X.columns)

    print(f"\n {'Feature':<40s} {'Neg%':>6s} {'Pos%':>6s} {'All%':>6s} {'Status'}")
    print(f" {'-' * 40} {'-' * 6} {'-' * 6} {'-' * 6} {'-' * 8}")

    for feat in available_features:
        # max(..., 1) keeps the fractions defined when a class is empty.
        neg_nonzero = float((X.loc[neg_mask, feat] != 0).sum()) / max(n_neg, 1)
        pos_nonzero = float((X.loc[pos_mask, feat] != 0).sum()) / max(n_pos, 1)
        all_nonzero = float((X[feat] != 0).sum()) / max(n_total, 1)

        if all_nonzero < min_coverage:
            # DEAD: feature is near-zero across ALL samples — no signal at all
            status = 'DEAD'
        elif neg_nonzero >= leak_threshold and pos_nonzero < min_coverage:
            # LEAKY: saturated in negatives, absent in positives
            status = 'LEAK'
        elif pos_nonzero >= leak_threshold and neg_nonzero < min_coverage:
            # LEAKY: saturated in positives, absent in negatives
            status = 'LEAK'
        else:
            status = 'KEEP'

        if status == 'KEEP':
            retained.append(feat)
        else:
            excluded.append(feat)

        print(f" {feat:<40s} {neg_nonzero * 100:5.1f}% {pos_nonzero * 100:5.1f}% "
              f"{all_nonzero * 100:5.1f}% {status}")

    print(f"\n Retained: {len(retained)}/{len(available_features)} features")
    if excluded:
        print(f" Excluded ({len(excluded)}): {', '.join(excluded)}")

    X_filtered = X[retained]
    return X_filtered, retained
390
-
391
-
392
def source_discrimination_diagnostic(X: pd.DataFrame, y: np.ndarray,
                                     active_features: list) -> None:
    """
    Step 2c: Source discrimination diagnostic (LOG-ONLY, non-blocking).

    Trains a small throwaway XGBoost model on a 70/30 split of the active
    features, prints its accuracy/precision/recall, then ranks features by
    mean absolute SHAP value and prints the top 10 with a classification flag.
    Nothing is returned and no state is modified.

    DESIGN NOTE: This test cannot function as a hard gate when source labels
    are perfectly confounded with class labels (all negatives = monitor,
    all positives = Datadog). In that case, legitimate behavioral features
    (score, count_critical, type_*) will dominate the discriminator because
    malware genuinely behaves differently from clean packages — this is
    signal, not leak.

    A true source discrimination test would require either:
    (a) positives re-scanned through our own pipeline, or
    (b) negatives and positives from the SAME source.

    This diagnostic still serves a purpose: it flags NON-BEHAVIORAL features
    that shouldn't appear in the top discriminators. If metadata features
    (unpacked_size_bytes, file_count_total, etc.) appear despite being
    excluded in Step 2a, something is wrong.

    The real validation happens in shadow deployment on live production data.
    """
    print("\n" + "=" * 60)
    print("[Step 2c/8] Source discrimination diagnostic (log-only)...")
    print("=" * 60)
    print(" NOTE: source=Datadog correlates 100% with label=malicious.")
    print(" This diagnostic checks for non-behavioral features in the")
    print(" top discriminators, NOT for overall accuracy (which will")
    print(" always be high due to the source/label confound).")

    X_active = X[active_features]

    # 70/30 split with different seed to avoid overlap with main split
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_active, y, test_size=0.3, stratify=y, random_state=99
    )

    # Shallow model — depth=3, 50 rounds, no class weighting
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 3,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'seed': 99,
        'verbosity': 0,
    }

    dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=active_features)
    dtest = xgb.DMatrix(X_te, label=y_te, feature_names=active_features)

    model = xgb.train(params, dtrain, num_boost_round=50)
    probs = model.predict(dtest)
    # Fixed 0.5 cut-off: this model is only a diagnostic, no threshold tuning.
    preds = (probs >= 0.5).astype(int)
    accuracy = float((preds == y_te).mean())

    p = precision_score(y_te, preds, zero_division=0)
    r = recall_score(y_te, preds, zero_division=0)

    print(f"\n Discrimination accuracy: {accuracy:.3f} (P={p:.3f} R={r:.3f})")
    print(f" (Expected to be high due to source/label confound)")

    # SHAP analysis — the diagnostic value is in WHICH features dominate
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_te)
    mean_abs_shap = np.abs(shap_values).mean(axis=0)
    # Pair each feature with its mean |SHAP| and rank from most to least important.
    importance = sorted(zip(active_features, mean_abs_shap),
                        key=lambda x: x[1], reverse=True)

    # Known behavioral features that SHOULD dominate (malware scores higher)
    # NOTE(review): the four *score features listed here are also in
    # EXCLUDED_METADATA, where they are described as a direct label leak. If
    # one of them reached this function it would be flagged "expected", not
    # LEAK — confirm which classification is intended.
    EXPECTED_BEHAVIORAL = {
        'score', 'global_risk_score', 'max_file_score', 'package_score',
        'count_total', 'count_critical', 'count_high', 'count_medium',
        'count_low', 'distinct_threat_types', 'severity_ratio_high',
        'max_single_points', 'points_concentration', 'file_count_with_threats',
        'file_score_mean', 'file_score_max', 'threat_density',
    }
    # Features that should NOT appear (already excluded, but sanity check)
    # NOTE(review): 'dep_count' and 'dev_dep_count' are listed here but are not
    # part of EXCLUDED_METADATA — verify whether they should be.
    EXCLUDED_CHECK = {
        'unpacked_size_bytes', 'file_count_total', 'has_tests',
        'dep_count', 'dev_dep_count', 'reputation_factor',
        'package_age_days', 'weekly_downloads', 'version_count',
        'author_package_count', 'has_repository', 'readme_size',
    }

    print(f"\n Top 10 features driving discrimination:")
    has_leak = False
    for i, (name, val) in enumerate(importance[:10]):
        # EXCLUDED_CHECK is tested first, so a leak flag always wins.
        if name in EXCLUDED_CHECK:
            flag = " *** LEAK — should have been excluded in Step 2a!"
            has_leak = True
        elif name in EXPECTED_BEHAVIORAL:
            flag = " (expected — behavioral)"
        elif name.startswith('type_') or name.startswith('has_'):
            flag = " (behavioral signal)"
        else:
            flag = ""
        print(f" {i + 1:2d}. {name:40s} {val:.6f}{flag}")

    # Log-only outcome: a detected leak prints a warning but does not raise.
    if has_leak:
        print(f"\n [WARNING] Non-behavioral features found in top discriminators!")
        print(f" Check EXCLUDED_METADATA — some metadata features leaked through.")
    else:
        print(f"\n [OK] Top discriminators are all behavioral features.")
        print(f" No metadata/source-proxy leak detected.")
498
-
499
-
500
def split_data(X: pd.DataFrame, y: np.ndarray) -> tuple:
    """
    Step 3: Split the data 80/20 into train and holdout sets.

    The split is stratified on y and uses a fixed random_state of 42. The
    class make-up of each part is printed to stdout.

    Returns: (X_train, X_test, y_train, y_test)
    """
    banner = "=" * 60
    print("\n" + banner)
    print("[Step 3/8] Stratified train/test split (80/20, seed=42)...")
    print(banner)

    parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = parts

    # Per-class sizes of each partition, for the log lines below.
    train_neg = int((y_train == 0).sum())
    train_pos = int((y_train == 1).sum())
    test_neg = int((y_test == 0).sum())
    test_pos = int((y_test == 1).sum())

    print(f" Train: {len(X_train)} ({train_neg} neg, "
          f"{train_pos} pos)")
    print(f" Test: {len(X_test)} ({test_neg} neg, "
          f"{test_pos} pos)")

    return X_train, X_test, y_train, y_test
520
-
521
-
522
def train_preliminary_and_shap(X_train: pd.DataFrame, y_train: np.ndarray,
                               scale_pos_weight: float,
                               active_features: list,
                               top_k: int = 40) -> list:
    """
    Step 4: Fit a preliminary model and keep the top_k features by SHAP.

    A 100-round booster is trained on the active features using XGB_PARAMS
    plus the given scale_pos_weight. Features are then ranked by their mean
    absolute SHAP value over the training rows.

    Returns: list of selected feature names, most important first
    """
    print("\n" + "=" * 60)
    print(f"[Step 4/8] Preliminary training + SHAP (top {top_k} from {len(active_features)} features)...")
    print("=" * 60)

    X_active = X_train[active_features]
    params = dict(XGB_PARAMS)
    params['scale_pos_weight'] = scale_pos_weight
    prelim = xgb.train(
        params,
        xgb.DMatrix(X_active, label=y_train, feature_names=active_features),
        num_boost_round=100,
    )

    # SHAP
    shap_values = shap.TreeExplainer(prelim).shap_values(X_active)
    mean_abs_shap = np.abs(shap_values).mean(axis=0)

    # (name, importance) pairs, highest importance first.
    ranked = sorted(zip(active_features, mean_abs_shap),
                    key=lambda pair: pair[1], reverse=True)

    print(f"\n Top 20 features by SHAP importance:")
    for rank, (name, val) in enumerate(ranked[:20], start=1):
        print(f" {rank:2d}. {name:40s} {val:.6f}")

    ranked_names = [name for name, _ in ranked]
    selected = ranked_names[:top_k]

    # Show which features were dropped
    dropped = ranked_names[top_k:]
    if dropped:
        suffix = " ..." if len(dropped) > 10 else ""
        print(f"\n Dropped {len(dropped)} features: {', '.join(dropped[:10])}"
              + suffix)

    return selected
561
-
562
-
563
def cross_validate(X_train: pd.DataFrame, y_train: np.ndarray,
                   selected_features: list,
                   scale_pos_weight: float) -> dict:
    """
    Step 5: 5-fold stratified CV on selected features.
    Optimize threshold: maximize precision under recall >= 93.9%.

    Each fold trains a booster with XGB_PARAMS plus scale_pos_weight and
    writes its validation probabilities into a shared out-of-fold array; the
    decision threshold is then chosen once over all out-of-fold predictions.

    Returns: dict with threshold, precision, recall, f1, fold_metrics,
        confusion_matrix (the last as nested lists)
    """
    print("\n" + "=" * 60)
    print(f"[Step 5/8] 5-fold stratified CV ({len(selected_features)} features)...")
    print("=" * 60)

    X_sel = X_train[selected_features]
    params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    fold_metrics = []
    # Out-of-fold buffers: every training row is filled exactly once, by the
    # fold in which it is part of the validation split.
    all_probs = np.zeros(len(y_train))
    all_labels = np.zeros(len(y_train))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_sel, y_train)):
        X_tr = X_sel.iloc[train_idx]
        X_va = X_sel.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_va = y_train[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=selected_features)
        dval = xgb.DMatrix(X_va, label=y_va, feature_names=selected_features)

        # NOTE(review): the validation fold both drives early stopping and is
        # scored below, so per-fold metrics may be optimistic — confirm this
        # is acceptable.
        model = xgb.train(
            params, dtrain, num_boost_round=N_ESTIMATORS,
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )

        probs = model.predict(dval)
        all_probs[val_idx] = probs
        all_labels[val_idx] = y_va

        # Per-fold metrics are reported at a fixed 0.5 cut-off.
        preds = (probs >= 0.5).astype(int)
        p = precision_score(y_va, preds, zero_division=0)
        r = recall_score(y_va, preds, zero_division=0)
        f1 = f1_score(y_va, preds, zero_division=0)
        fold_metrics.append({'precision': p, 'recall': r, 'f1': f1})
        print(f" Fold {fold + 1}: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    # Optimize threshold: maximize precision while maintaining recall >= 93.9%
    print(f"\n Optimizing threshold (recall >= 93.9%)...")
    # Candidate thresholds from 0.10 upward in steps of 0.01.
    thresholds = np.arange(0.10, 0.91, 0.01)
    best_threshold = 0.5
    best_precision = 0.0

    for t in thresholds:
        preds = (all_probs >= t).astype(int)
        r = recall_score(all_labels, preds, zero_division=0)
        p = precision_score(all_labels, preds, zero_division=0)
        # Strict '>' means the lowest threshold wins among equal precisions.
        if r >= 0.939 and p > best_precision:
            best_precision = p
            best_threshold = float(t)

    # If no threshold meets recall constraint, warn and use 0.5
    # (best_precision is still 0.0 only when the loop never updated it).
    if best_precision == 0.0:
        print(f" [WARN] No threshold achieves recall >= 93.9%")
        print(f" Using default threshold=0.5")
        best_threshold = 0.5
        final_preds = (all_probs >= 0.5).astype(int)
    else:
        final_preds = (all_probs >= best_threshold).astype(int)

    final_p = precision_score(all_labels, final_preds, zero_division=0)
    final_r = recall_score(all_labels, final_preds, zero_division=0)
    final_f1 = f1_score(all_labels, final_preds, zero_division=0)
    cm = confusion_matrix(all_labels, final_preds)

    print(f"\n Optimal threshold: {best_threshold:.2f}")
    print(f" CV metrics: P={final_p:.3f} R={final_r:.3f} F1={final_f1:.3f}")
    print(f" Confusion matrix:")
    # Indexing assumes a 2x2 matrix, i.e. both classes occur in labels/preds.
    print(f" TN={cm[0][0]} FP={cm[0][1]}")
    print(f" FN={cm[1][0]} TP={cm[1][1]}")

    return {
        'threshold': round(best_threshold, 3),
        'precision': round(float(final_p), 4),
        'recall': round(float(final_r), 4),
        'f1': round(float(final_f1), 4),
        'fold_metrics': fold_metrics,
        'confusion_matrix': cm.tolist()
    }
652
-
653
-
654
def train_final_model(X_train: pd.DataFrame, y_train: np.ndarray,
                      selected_features: list,
                      scale_pos_weight: float) -> xgb.Booster:
    """
    Step 6: Fit the final booster on the selected features.

    The train set is split 90/10 (stratified, random_state=42); the 90% part
    is used for fitting and the 10% part only as the early-stopping monitor.

    Returns: the trained xgb.Booster
    """
    print("\n" + "=" * 60)
    print(f"[Step 6/8] Training final model ({len(selected_features)} features)...")
    print("=" * 60)

    params = dict(XGB_PARAMS)
    params['scale_pos_weight'] = scale_pos_weight

    # Internal 90/10 split for early stopping
    X_fit, X_stop, y_fit, y_stop = train_test_split(
        X_train[selected_features], y_train,
        test_size=0.1, stratify=y_train, random_state=42,
    )

    fit_matrix = xgb.DMatrix(X_fit, label=y_fit, feature_names=selected_features)
    stop_matrix = xgb.DMatrix(X_stop, label=y_stop, feature_names=selected_features)

    model = xgb.train(
        params,
        fit_matrix,
        num_boost_round=N_ESTIMATORS,
        evals=[(stop_matrix, 'early_stop')],
        verbose_eval=False,
        early_stopping_rounds=20,
    )

    # Fall back to the round cap when the booster exposes no best_iteration.
    best_round = getattr(model, 'best_iteration', N_ESTIMATORS)
    print(f" Best iteration: {best_round}")

    return model
685
-
686
-
687
def evaluate_holdout(model: xgb.Booster, X_test: pd.DataFrame,
                     y_test: np.ndarray, selected_features: list,
                     threshold: float) -> dict:
    """
    Step 7: Evaluate on holdout test set.

    Scores the holdout rows with the given booster, applies `threshold` to
    obtain binary predictions and prints precision, recall, F1, the confusion
    matrix and the top 20 features by gain. Missing the recall (93.9%) or
    precision (95%) target only prints a warning; nothing is raised.

    Args:
        model: trained booster.
        X_test: holdout features; must contain every column in selected_features.
        y_test: holdout binary labels (0/1).
        selected_features: feature names, in the order the model was trained on.
        threshold: probability at or above which a sample is predicted positive.

    Returns: dict with precision, recall, f1 (rounded to 4 decimals),
        confusion_matrix (nested lists) and the tp/fp/fn/tn counts.
    """
    print("\n" + "=" * 60)
    print(f"[Step 7/8] Holdout evaluation (threshold={threshold:.3f})...")
    print("=" * 60)

    X_sel = X_test[selected_features]
    dtest = xgb.DMatrix(X_sel, label=y_test, feature_names=selected_features)
    probs = model.predict(dtest)

    preds = (probs >= threshold).astype(int)
    p = precision_score(y_test, preds, zero_division=0)
    r = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)
    # Pin both labels so the matrix is always 2x2; without this a holdout in
    # which only one class occurs (in truth and predictions) yields a 1x1
    # matrix and the 4-way unpack below fails.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])

    tn, fp_count, fn, tp = cm.ravel()

    print(f" Precision: {p:.3f}")
    print(f" Recall: {r:.3f}")
    print(f" F1: {f1:.3f}")
    print(f" Confusion matrix:")
    print(f" TN={tn} FP={fp_count}")
    print(f" FN={fn} TP={tp}")

    # Target verification (warn-only: a miss is reported, not raised)
    if r < 0.939:
        print(f"\n [WARNING] Recall {r:.3f} < 93.9% target!")
    else:
        print(f"\n [PASS] Recall >= 93.9%")

    if p < 0.95:
        print(f" [WARNING] Precision {p:.3f} < 95% target!")
    else:
        print(f" [PASS] Precision >= 95%")

    # Feature importance (gain-based)
    importance = model.get_score(importance_type='gain')
    sorted_imp = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print(f"\n Top 20 features (gain-based):")
    for i, (name, val) in enumerate(sorted_imp[:20]):
        print(f" {i + 1:2d}. {name:40s} {val:.4f}")

    return {
        'precision': round(float(p), 4),
        'recall': round(float(r), 4),
        'f1': round(float(f1), 4),
        'confusion_matrix': cm.tolist(),
        'tp': int(tp), 'fp': int(fp_count),
        'fn': int(fn), 'tn': int(tn)
    }
742
-
743
-
744
def convert_tree(tree_json: dict, nodes: list, feature_map: dict) -> int:
    """
    Recursively convert an XGBoost tree JSON node to flat array format.
    Reused from tools/export-model-js.py.

    Each node is appended to `nodes` as a dict with keys:
        'f': feature index (-1 marks a leaf)
        't': split threshold, rounded to 6 decimals (0 for a leaf)
        'y': index in `nodes` of the "yes" child (0 for a leaf)
        'n': index in `nodes` of the "no" child (0 for a leaf)
        'v': leaf value, rounded to 6 decimals (0 for a split node)
    A node is placed before its subtrees, and the "yes" subtree before the
    "no" subtree.

    Args:
        tree_json: one node of a tree dump (parsed JSON), leaf or split.
        nodes: output list, extended in place.
        feature_map: feature name -> feature index.

    Returns: the index in `nodes` of the node created for `tree_json`.

    Raises:
        ValueError: if a split node names a feature that is not in
            `feature_map`, or does not have both children. Exporting such a
            node would silently corrupt the model: an unknown feature would be
            written as f=-1 (the leaf marker) and a missing child as an index
            pointing back at the node itself.
    """
    idx = len(nodes)

    if 'leaf' in tree_json:
        nodes.append({
            'f': -1,
            't': 0,
            'y': 0,
            'n': 0,
            'v': round(tree_json['leaf'], 6)
        })
        return idx

    split_feature = tree_json.get('split', '')
    if split_feature not in feature_map:
        raise ValueError(
            f"Split feature {split_feature!r} is not in the exported feature list"
        )
    feature_idx = feature_map[split_feature]
    threshold = tree_json.get('split_condition', 0)

    children = tree_json.get('children', [])
    yes_child = tree_json.get('yes', 0)
    no_child = tree_json.get('no', 0)

    # Match children to the branch ids declared on the split node.
    yes_tree = None
    no_tree = None
    for child in children:
        if child.get('nodeid') == yes_child:
            yes_tree = child
        elif child.get('nodeid') == no_child:
            no_tree = child

    # Best-effort fallback when node ids do not match: rely on child order.
    if yes_tree is None and len(children) > 0:
        yes_tree = children[0]
    if no_tree is None and len(children) > 1:
        no_tree = children[1]

    if not yes_tree or not no_tree:
        raise ValueError(
            f"Split node on {split_feature!r} does not have both children"
        )

    # Reserve this node's slot first so its index precedes its subtrees'.
    nodes.append(None)
    yes_idx = convert_tree(yes_tree, nodes, feature_map)
    no_idx = convert_tree(no_tree, nodes, feature_map)

    nodes[idx] = {
        'f': feature_idx,
        't': round(threshold, 6),
        'y': yes_idx,
        'n': no_idx,
        'v': 0
    }

    return idx
794
-
795
-
796
def export_model_trees_js(model: xgb.Booster, selected_features: list,
                          threshold: float, output_path: str,
                          cv_metrics: dict, holdout_metrics: dict):
    """
    Step 8: Write the trained model to ``output_path`` as a JS module.

    The booster is dumped as JSON, every tree is flattened with
    ``convert_tree`` and the result is emitted as a single
    ``module.exports = {...};`` statement preceded by a generated header
    comment that records tree/feature counts, the threshold, and the
    CV / holdout precision, recall and F1.

    Args:
        model: Trained booster; only ``get_dump(dump_format='json')`` is used.
        selected_features: Feature names; list position becomes the feature
            index stored in the exported trees.
        threshold: Decision threshold stored in the exported object.
        output_path: Destination file, written as UTF-8 (overwritten).
        cv_metrics: Dict providing ``precision``, ``recall`` and ``f1``.
        holdout_metrics: Dict providing ``precision``, ``recall`` and ``f1``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"[Step 8/8] Exporting to {output_path}...")
    print(rule)

    # Dump every tree as JSON text and map feature names to positions.
    raw_trees = model.get_dump(dump_format='json')
    feature_index = {name: pos for pos, name in enumerate(selected_features)}

    # Flatten each tree into its own node array.
    flat_trees = []
    for raw in raw_trees:
        flat = []
        convert_tree(json.loads(raw), flat, feature_index)
        flat_trees.append(flat)
    node_total = sum(len(flat) for flat in flat_trees)

    # Object that becomes the module's export.
    js_model = {
        'version': 1,
        'features': selected_features,
        'threshold': threshold,
        'trees': flat_trees,
    }

    # Assemble the module text: strict-mode pragma, header comment, export.
    js_content = "".join([
        "'use strict';\n\n",
        "/**\n",
        " * XGBoost model trees — auto-generated by src/ml/train-xgboost.py\n",
        f" * {len(flat_trees)} trees, {len(selected_features)} features, threshold={threshold}\n",
        f" * CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}\n",
        f" * Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}\n",
        " * DO NOT EDIT MANUALLY\n",
        " */\n\n",
        f"module.exports = {json.dumps(js_model, separators=(',', ':'))};\n",
    ])

    target = Path(output_path)
    target.write_text(js_content, encoding='utf-8')

    size_kb = target.stat().st_size / 1024
    print(f" Trees: {len(flat_trees)}")
    print(f" Total nodes: {node_total}")
    print(f" Features: {len(selected_features)}")
    print(f" Threshold: {threshold:.3f}")
    print(f" File size: {size_kb:.1f} KB")
848
-
849
-
850
def main():
    """
    Command-line entry point: run the eight-step training pipeline.

    Parses the CLI options, checks that both input JSONL paths exist (exits
    with status 1 otherwise), then loads and aligns the data, drops the
    excluded metadata features, optionally filters leaky features, runs the
    source-discrimination diagnostic, trains and evaluates the model, exports
    it as a JS module and prints a summary with target warnings.
    """
    cli = argparse.ArgumentParser(
        description="Train MUAD'DIB XGBoost classifier (dual-source JSONL)")
    cli.add_argument(
        '--negatives', required=True,
        help='Path to negatives JSONL (clean/fp labels)')
    cli.add_argument(
        '--positives', required=True,
        help='Path to positives JSONL (malicious labels)')
    cli.add_argument(
        '--output', default='src/ml/model-trees-shadow.js',
        help='Output JS file path (default: src/ml/model-trees-shadow.js)')
    cli.add_argument(
        '--top-features', type=int, default=50,
        help='Number of top SHAP features to select (default: 50)')
    cli.add_argument(
        '--common-only', action=argparse.BooleanOptionalAction, default=True,
        help='Only use features with >=1%% non-zero coverage in BOTH sources (default: on)')
    cli.add_argument(
        '--skip-gate', action='store_true',
        help='Skip source discrimination gate (dangerous — use only for debugging)')
    args = cli.parse_args()

    # Validate inputs: negatives first, then positives; stop at the first miss.
    required_inputs = (
        (args.negatives, f"ERROR: Negatives file not found: {args.negatives}"),
        (args.positives, f"ERROR: Positives file not found: {args.positives}"),
    )
    for input_path, complaint in required_inputs:
        if not Path(input_path).exists():
            print(complaint, file=sys.stderr)
            sys.exit(1)

    # Step 1: Load data
    negatives, positives = load_and_prepare(args)

    # Step 2: Align features
    X, y, stats = align_features(negatives, positives)

    # Step 2a: Drop known metadata/source-proxy features BEFORE the leak
    # filter. They differ between sources for non-behavioral reasons and would
    # let the model learn source identity instead of malicious behavior.
    metadata_cols = []
    remaining_features = []
    for feature in FEATURE_NAMES:
        if feature in EXCLUDED_METADATA:
            metadata_cols.append(feature)
        else:
            remaining_features.append(feature)
    X = X.drop(columns=metadata_cols, errors='ignore')
    print(f"\n [Step 2a] Excluded {len(metadata_cols)} metadata features: "
          f"{', '.join(metadata_cols)}")
    print(f" Remaining: {len(remaining_features)} features")

    # Step 2b: Filter dead/leaky features (on the remaining behavioral ones)
    if args.common_only:
        X, active_features = filter_leaky_features(X, y)
    else:
        active_features = list(remaining_features)

    # Step 2c: Source discrimination diagnostic (log-only).
    # NOT a hard gate — source label is 100% confounded with class label
    # (all positives = Datadog, all negatives = monitor), so behavioral
    # features will always dominate the discriminator. The diagnostic
    # checks that no METADATA features leaked through Step 2a.
    if args.skip_gate:
        print("\n [Step 2c] Source discrimination diagnostic SKIPPED (--skip-gate)")
    else:
        source_discrimination_diagnostic(X, y, active_features)

    # Class imbalance weight (denominator floored at 1 to avoid dividing by 0)
    neg_count = stats['n_neg']
    pos_count = stats['n_pos']
    scale_pos_weight = neg_count / max(pos_count, 1)
    print(f"\n scale_pos_weight: {scale_pos_weight:.2f}")

    # Step 3: Train/test split
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Step 4: Preliminary + SHAP
    selected = train_preliminary_and_shap(
        X_train, y_train, scale_pos_weight, active_features,
        top_k=args.top_features)

    # Step 5: Cross-validation
    cv_metrics = cross_validate(X_train, y_train, selected, scale_pos_weight)
    threshold = cv_metrics['threshold']

    # Step 6: Final model
    final_model = train_final_model(X_train, y_train, selected, scale_pos_weight)

    # Step 7: Holdout evaluation
    holdout_metrics = evaluate_holdout(
        final_model, X_test, y_test, selected, threshold)

    # Step 8: Export
    export_model_trees_js(
        final_model, selected, threshold,
        args.output, cv_metrics, holdout_metrics)

    # Summary
    rule = "=" * 60
    print("\n" + rule)
    print("TRAINING COMPLETE")
    print(rule)
    print(f" Samples: {neg_count} negatives + {pos_count} positives = {neg_count + pos_count}")
    print(f" Features: {len(selected)} selected (from {len(active_features)} active / "
          f"{len(FEATURE_NAMES)} total, {len(EXCLUDED_METADATA)} metadata excluded)")
    print(f" Threshold: {threshold:.3f}")
    print(f" CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}")
    print(f" Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}")
    print(f" Output: {args.output}")

    # Warnings: reported only, the exit status is not affected.
    holdout_recall = holdout_metrics['recall']
    holdout_precision = holdout_metrics['precision']
    if holdout_recall < 0.939:
        print(f"\n [WARNING] Holdout recall {holdout_recall:.3f} < 93.9% target")
    if holdout_precision < 0.95:
        print(f" [WARNING] Holdout precision {holdout_precision:.3f} < 95% target")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()