muaddib-scanner 2.11.76 → 2.11.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-commit +18 -0
- package/README.md +15 -6
- package/bin/muaddib.js +18 -4
- package/package.json +1 -2
- package/{self-scan-v2.11.76.json → self-scan-v2.11.78.json} +1 -1
- package/src/commands/interactive.js +5 -6
- package/src/commands/safe-install.js +19 -19
- package/src/ioc/scraper.js +46 -10
- package/src/monitor/daemon.js +39 -28
- package/src/monitor/ingestion.js +32 -2
- package/src/monitor/queue.js +84 -21
- package/src/monitor/scan-queue.js +68 -1
- package/src/monitor/state.js +24 -1
- package/src/monitor/webhook.js +32 -11
- package/src/output/formatter.js +3 -4
- package/src/pipeline/executor.js +9 -1
- package/src/runtime/daemon.js +27 -28
- package/src/runtime/watch.js +7 -7
- package/src/sandbox/index.js +11 -9
- package/src/scanner/temporal-analysis.js +8 -0
- package/src/scanner/temporal-ast-diff.js +5 -0
- package/src/utils.js +60 -1
- package/.dockerignore +0 -7
- package/.env.example +0 -43
- package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
- package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
- package/ml-retrain/auto-labeler/labeler.py +0 -256
- package/ml-retrain/auto-labeler/npm_checker.py +0 -228
- package/ml-retrain/auto-labeler/ossf_index.py +0 -178
- package/ml-retrain/auto-labeler/requirements.txt +0 -1
- package/ml-retrain/confusion-matrix.png +0 -0
- package/ml-retrain/model-trees-retrained.js +0 -12
- package/ml-retrain/retrain-report.json +0 -225
- package/ml-retrain/retrain.py +0 -974
- package/sbom.json +0 -0
- package/src/ml/train-bundler-detector.py +0 -725
- package/src/ml/train-xgboost.py +0 -957
- package/tools/export-model-js.py +0 -160
- package/tools/requirements-ml.txt +0 -5
- package/tools/train-classifier.py +0 -333
package/src/ml/train-xgboost.py
DELETED
|
@@ -1,957 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
MUAD'DIB XGBoost Training Pipeline — dual-source JSONL
|
|
4
|
-
|
|
5
|
-
Trains a binary XGBoost classifier on two JSONL files:
|
|
6
|
-
- negatives: monitor output (labels clean/fp → 0)
|
|
7
|
-
- positives: Datadog malware corpus (label malicious → 1)
|
|
8
|
-
|
|
9
|
-
Exports directly to model-trees.js (no intermediate model.json).
|
|
10
|
-
|
|
11
|
-
Usage:
|
|
12
|
-
python src/ml/train-xgboost.py \
|
|
13
|
-
--negatives data/ml-training.jsonl \
|
|
14
|
-
--positives data/ml-training-datadog.jsonl \
|
|
15
|
-
--output src/ml/model-trees.js \
|
|
16
|
-
--top-features 40
|
|
17
|
-
|
|
18
|
-
Dependencies: see tools/requirements-ml.txt
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
import argparse
|
|
22
|
-
import json
|
|
23
|
-
import sys
|
|
24
|
-
from pathlib import Path
|
|
25
|
-
|
|
26
|
-
import numpy as np
|
|
27
|
-
import pandas as pd
|
|
28
|
-
import shap
|
|
29
|
-
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
30
|
-
from sklearn.metrics import (
|
|
31
|
-
precision_score, recall_score, f1_score, confusion_matrix
|
|
32
|
-
)
|
|
33
|
-
import xgboost as xgb
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
# --- Constants ---

# Record keys that identify or label a sample rather than describe its
# behavior; these are not model features.
IDENTITY_COLS = {
    "name",
    "version",
    "ecosystem",
    "timestamp",
    "label",
    "tier",
}

# Smallest number of records each class must contribute before training runs.
MIN_SAMPLES = 50

# Base XGBoost booster parameters (kept aligned with tools/train-classifier.py).
# The training steps copy this dict and add 'scale_pos_weight' on top of it.
XGB_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 5,
    "gamma": 0.1,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "seed": 42,
    "verbosity": 0,
}

# Upper bound on boosting rounds for the CV folds and the final model.
N_ESTIMATORS = 200
# Number of stratified folds used by cross-validation.
N_FOLDS = 5
|
|
62
|
-
|
|
63
|
-
# The 87 model features, hardcoded as an exact copy of the keys produced by
# feature-extractor.js. Order matters: it defines the column order of the
# training matrix.
# v2.10.32 grew the list from 71 to 87 by adding 16 type_* features (code
# execution bypasses, IoC, GlassWorm, obfuscation, module graph). Records
# written before that release simply carry 0 for the new columns; SHAP
# handles that sparsity gracefully.
_FEATURE_GROUPS = (
    # Scoring (4)
    ('score', 'max_file_score', 'package_score', 'global_risk_score'),
    # Severity counts (5)
    ('count_total', 'count_critical', 'count_high', 'count_medium', 'count_low'),
    # Distinct types (1)
    ('distinct_threat_types',),
    # Per-type counts (47 TOP_THREAT_TYPES + 1 other = 48)
    # --- Original 31 ---
    (
        'type_suspicious_dataflow',
        'type_env_access',
        'type_sensitive_string',
        'type_dangerous_call_eval',
        'type_dangerous_call_exec',
        'type_dangerous_call_function',
        'type_obfuscation_detected',
        'type_high_entropy_string',
        'type_dynamic_require',
        'type_dynamic_import',
        'type_lifecycle_script',
        'type_typosquat_detected',
        'type_staged_payload',
        'type_staged_binary_payload',
        'type_network_require',
        'type_sandbox_evasion',
        'type_credential_regex_harvest',
        'type_remote_code_load',
        'type_suspicious_domain',
        'type_prototype_hook',
        'type_intent_credential_exfil',
        'type_intent_command_exfil',
        'type_cross_file_dataflow',
        'type_module_compile',
        'type_crypto_decipher',
        'type_env_charcode_reconstruction',
        'type_lifecycle_shell_pipe',
        'type_curl_exec',
        'type_reverse_shell',
        'type_binary_dropper',
        'type_mcp_config_injection',
    ),
    # --- Code execution bypasses (v2.9.x–v2.10.x) ---
    (
        'type_vm_code_execution',
        'type_vm_dynamic_code',
        'type_dangerous_constructor',
        'type_module_load_bypass',
        'type_require_process_mainmodule',
        'type_proxy_globalthis_intercept',
        'type_reflect_bind_code_execution',
    ),
    # --- IoC / supply chain ---
    ('type_known_malicious_package', 'type_known_malicious_hash'),
    # --- GlassWorm ---
    ('type_unicode_invisible_injection', 'type_blockchain_c2_resolution'),
    # --- Shell / exec ---
    ('type_dangerous_exec', 'type_node_inline_exec'),
    # --- Obfuscation ---
    ('type_js_obfuscation_pattern',),
    # --- Module graph / WASM ---
    ('type_suspicious_module_sink', 'type_wasm_host_sink'),
    # --- Aggregated ---
    ('type_other',),
    # Boolean behavioral signals (10)
    (
        'has_lifecycle_script',
        'has_network_access',
        'has_obfuscation',
        'has_env_access',
        'has_eval',
        'has_staged_payload',
        'has_typosquat',
        'has_ioc_match',
        'has_intent_pair',
        'has_sandbox_finding',
    ),
    # File distribution (3)
    ('file_count_with_threats', 'file_score_mean', 'file_score_max'),
    # Severity concentration (3)
    ('severity_ratio_high', 'max_single_points', 'points_concentration'),
    # Package metadata (3)
    ('unpacked_size_bytes', 'dep_count', 'dev_dep_count'),
    # Reputation (1)
    ('reputation_factor',),
    # Enriched registry metadata (9) — Phase 2a
    (
        'package_age_days',
        'weekly_downloads',
        'version_count',
        'author_package_count',
        'has_repository',
        'readme_size',
        'file_count_total',
        'has_tests',
        'threat_density',
    ),
)

# Flatten the groups, preserving their order, into the public feature list.
FEATURE_NAMES = [name for group in _FEATURE_GROUPS for name in group]

# Guard against a group being edited without updating the documented total.
assert len(FEATURE_NAMES) == 87, f"Expected 87 features, got {len(FEATURE_NAMES)}"
|
|
125
|
-
|
|
126
|
-
# Features withheld from the model: metadata / source-identity proxies whose
# values differ between the monitor output (negatives) and the Datadog corpus
# (positives) for reasons unrelated to package behavior.
# See corrected retrain plan for full justification of each exclusion.
EXCLUDED_METADATA = set()

# Score features — a direct label leak created by the labeling pipeline in
# monitor/queue.js: every negative (label='clean') has score<20 by definition
# (0 findings or T3-only), while every positive (Datadog malware corpus) has
# score≥20. Left in, xgboost gets a trivial shortcut — split on 'score' and
# ignore the behavioral signals — and then fails to generalize on high-score
# legitimate packages (playwright-core, @salesforce/cli, webpack, ...) because
# it never had to learn the behavioral signature. Dropping them forces the
# model to learn from type_*/has_*/severity_ratio_*, the actual behavioral
# ground truth. The count_* severity counts stay because they describe threat
# distribution, not aggregated risk.
EXCLUDED_METADATA.update((
    'score',
    'max_file_score',
    'package_score',
    'global_risk_score',
))

# npm registry metadata — always 0 in Datadog positives (not fetched) but
# 8-13% non-zero in monitor negatives → source leak.
EXCLUDED_METADATA.update((
    'package_age_days',
    'weekly_downloads',
    'version_count',
    'author_package_count',
    'has_repository',
    'readme_size',
))

# Derived from the corrupted npm metadata above (age_days, version_count,
# downloads). Currently zero-variance (always 1.0), but it would become a leak
# as soon as future records carry actual computed values.
EXCLUDED_METADATA.add('reputation_factor')

# Package-level metadata that does not come from the behavioral scan —
# 88-95% non-zero in negatives, 0% in positives → massive source proxy.
EXCLUDED_METADATA.update(('unpacked_size_bytes', 'file_count_total'))

# 13% non-zero in negatives, 0% in positives → source proxy.
EXCLUDED_METADATA.add('has_tests')
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
# --- Data loading ---
|
|
159
|
-
|
|
160
|
-
def load_jsonl(filepath: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are ignored. A line is skipped, with a warning on stderr, when
    it is not valid JSON or when it decodes to something other than a JSON
    object (for example ``null``, a number or an array). Callers treat every
    returned record as a dict, so letting a non-object through would only
    postpone the failure to the first ``record.get(...)`` call.

    Args:
        filepath: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            # Keep the try body minimal: only the decode can raise here.
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f" [WARN] Skipping malformed line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            if not isinstance(record, dict):
                print(f" [WARN] Skipping non-object line {line_num} in {filepath}",
                      file=sys.stderr)
                continue
            records.append(record)
    return records
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
def load_and_prepare(args) -> tuple:
    """
    Step 1: Load the two JSONL files and keep the records usable for training.

    Negatives are read from ``args.negatives`` and filtered down to the labels
    with verified ground truth; positives are read from ``args.positives`` and
    kept as-is. Progress and label distributions are printed to stdout.

    Exits the process with status 1 (after an error on stderr) when either
    class ends up with fewer than MIN_SAMPLES records.

    Returns: (negatives, positives) — two lists of record dicts. Binary labels
    and the feature matrix are built later, in align_features().
    """
    print("=" * 60)
    print("[Step 1/8] Loading JSONL data...")
    print("=" * 60)

    # Load negatives (clean/fp → 0)
    neg_records = load_jsonl(args.negatives)
    print(f" Negatives file: {len(neg_records)} total records")

    # Filter negatives: accept three label classes, all with verified ground truth.
    #   'clean'          — scanner found 0 findings or T3-only (passive signals).
    #                      Labelled by monitor/queue.js, low-score by construction.
    #   'fp'             — manually reviewed false positive. Requires manualReview=true
    #                      flag in jsonl-writer.js (defense-in-depth vs C1 contamination).
    #   'curated_benign' — hand-curated popular legitimate packages scanned via
    #                      scripts/scan-benign-training.js. These are the ONLY source
    #                      of high-score negatives: playwright-core, webpack, next,
    #                      @salesforce/cli, etc. trip behavioral heuristics while
    #                      being verifiably benign. Without them the model has no
    #                      high-score negatives and cannot generalize to complex
    #                      legitimate tooling.
    # Explicitly EXCLUDED: 'suspect', 'unconfirmed', 'ml_clean', 'llm_benign',
    # 'likely_benign', 'removed_unlabeled' — these are either uncertain or were
    # auto-labelled by sandbox-clean heuristics (see C1 remediation: 8176 records
    # contaminated before the manualReview gate was added).
    neg_label_counts = {}
    for r in neg_records:
        lbl = r.get('label', 'unknown')
        neg_label_counts[lbl] = neg_label_counts.get(lbl, 0) + 1
    print(f" Negative label distribution: {neg_label_counts}")

    VALID_NEGATIVE_LABELS = ('clean', 'fp', 'curated_benign')
    negatives = [r for r in neg_records if r.get('label') in VALID_NEGATIVE_LABELS]

    # The per-label totals are already in the histogram built above, so read
    # them from there instead of re-scanning the records once per label.
    n_clean = neg_label_counts.get('clean', 0)
    n_fp = neg_label_counts.get('fp', 0)
    n_curated = neg_label_counts.get('curated_benign', 0)
    n_unconfirmed = neg_label_counts.get('unconfirmed', 0)
    print(f" Kept {len(negatives)} negatives "
          f"(clean={n_clean}, fp={n_fp}, curated_benign={n_curated})")
    if n_unconfirmed > 0:
        print(f" Excluded {n_unconfirmed} 'unconfirmed' records (not manually reviewed)")

    # Load positives (malicious → 1)
    pos_records = load_jsonl(args.positives)
    print(f" Positives file: {len(pos_records)} total records")

    pos_label_counts = {}
    for r in pos_records:
        lbl = r.get('label', 'unknown')
        pos_label_counts[lbl] = pos_label_counts.get(lbl, 0) + 1
    print(f" Positive label distribution: {pos_label_counts}")

    # No label filter here: every record in the positives file is treated as
    # malicious; the distribution above is printed for information only.
    positives = pos_records

    if len(negatives) < MIN_SAMPLES:
        print(f"ERROR: Need >= {MIN_SAMPLES} negatives, got {len(negatives)}",
              file=sys.stderr)
        sys.exit(1)
    if len(positives) < MIN_SAMPLES:
        print(f"ERROR: Need >= {MIN_SAMPLES} positives, got {len(positives)}",
              file=sys.stderr)
        sys.exit(1)

    # Safe division: positives holds at least MIN_SAMPLES records at this point.
    ratio = len(negatives) / len(positives)
    print(f"\n Negatives: {len(negatives)}")
    print(f" Positives: {len(positives)}")
    print(f" Ratio (neg/pos): {ratio:.2f}")

    return negatives, positives
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
def align_features(negatives: list, positives: list) -> tuple:
    """
    Step 2: Project every record onto the hardcoded FEATURE_NAMES columns.

    Negatives receive label 0 and positives label 1, negatives first. A
    feature that is absent from a record, or present with value None, becomes
    0.0; every other value is passed through float(). Per-class coverage
    (how many feature slots were actually populated) is printed.

    Returns: (X: pd.DataFrame, y: np.ndarray, stats: dict)
    """
    print("\n" + "=" * 60)
    print("[Step 2/8] Aligning 87 features...")
    print("=" * 60)

    # Pair each record with its binary target, negatives ahead of positives.
    labelled = [(rec, 0) for rec in negatives] + [(rec, 1) for rec in positives]

    rows = []
    targets = []
    # coverage[target] == [populated feature slots, empty feature slots]
    coverage = {0: [0, 0], 1: [0, 0]}

    for rec, target in labelled:
        values = []
        for feat in FEATURE_NAMES:
            raw = rec.get(feat, 0)
            values.append(float(0 if raw is None else raw))
        rows.append(values)
        targets.append(target)

        # A slot counts as populated only when the key exists and is not None.
        populated = sum(1 for feat in FEATURE_NAMES if rec.get(feat) is not None)
        coverage[target][0] += populated
        coverage[target][1] += len(FEATURE_NAMES) - populated

    X = pd.DataFrame(rows, columns=FEATURE_NAMES)
    y = np.array(targets, dtype=int)

    n_neg = int((y == 0).sum())
    n_pos = int((y == 1).sum())

    neg_present, neg_missing = coverage[0]
    pos_present, pos_missing = coverage[1]

    print(f" Feature matrix: {X.shape[0]} samples x {X.shape[1]} features")
    print(f" Negatives: {neg_present} present, {neg_missing} missing "
          f"({neg_present / max(neg_present + neg_missing, 1) * 100:.1f}% coverage)")
    print(f" Positives: {pos_present} present, {pos_missing} missing "
          f"({pos_present / max(pos_present + pos_missing, 1) * 100:.1f}% coverage)")

    stats = {
        'n_total': len(X),
        'n_neg': n_neg,
        'n_pos': n_pos,
        'n_features': len(FEATURE_NAMES),
    }

    return X, y, stats
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
def filter_leaky_features(X: pd.DataFrame, y: np.ndarray,
                          min_coverage: float = 0.001) -> tuple:
    """
    Step 2b: Drop features that carry no signal or only reveal the data source.

    For each column of X the share of non-zero values is measured among the
    negatives (y == 0), among the positives (y == 1) and over both together.
    A column is dropped only when it is:
      - DEAD: non-zero in fewer than `min_coverage` of all samples, or
      - LEAK: non-zero in >= 99% of one class and in fewer than `min_coverage`
              of the other (a proxy for the data source, not a malware signal).

    A column that is 0% in negatives but common in positives is KEPT — that is
    discriminative rather than leaky (e.g. count_critical and the type_*
    features are legitimately 0 in clean packages).

    A per-feature table and a summary are printed to stdout.

    Returns: (X_filtered, active_features)
    """
    print("\n" + "=" * 60)
    print("[Step 2b/8] Filtering dead/leaky features...")
    print("=" * 60)

    is_neg = y == 0
    is_pos = y == 1
    neg_total = int(is_neg.sum())
    pos_total = int(is_pos.sum())
    grand_total = neg_total + pos_total

    # Work from the columns X actually has: metadata columns may already have
    # been removed by Step 2a before this function is called.
    columns = list(X.columns)
    kept = []
    dropped = []

    print(f"\n {'Feature':<40s} {'Neg%':>6s} {'Pos%':>6s} {'All%':>6s} {'Status'}")
    print(f" {'-' * 40} {'-' * 6} {'-' * 6} {'-' * 6} {'-' * 8}")

    for col in columns:
        nonzero = (X[col] != 0).to_numpy()
        # max(..., 1) keeps the ratios defined when a class is empty.
        neg_frac = float(nonzero[is_neg].sum()) / max(neg_total, 1)
        pos_frac = float(nonzero[is_pos].sum()) / max(pos_total, 1)
        all_frac = float(nonzero.sum()) / max(grand_total, 1)

        one_sided = (
            (neg_frac >= 0.99 and pos_frac < min_coverage)
            or (pos_frac >= 0.99 and neg_frac < min_coverage)
        )
        if all_frac < min_coverage:
            # Near-zero across ALL samples — no signal at all.
            verdict = 'DEAD'
        elif one_sided:
            # Saturated in one class, absent in the other — source proxy.
            verdict = 'LEAK'
        else:
            verdict = 'KEEP'

        (kept if verdict == 'KEEP' else dropped).append(col)

        print(f" {col:<40s} {neg_frac * 100:5.1f}% {pos_frac * 100:5.1f}% "
              f"{all_frac * 100:5.1f}% {verdict}")

    print(f"\n Retained: {len(kept)}/{len(columns)} features")
    if dropped:
        print(f" Excluded ({len(dropped)}): {', '.join(dropped)}")

    return X[kept], kept
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
def source_discrimination_diagnostic(X: pd.DataFrame, y: np.ndarray,
                                     active_features: list):
    """
    Step 2c: Source discrimination diagnostic (LOG-ONLY, non-blocking).

    Trains a small throwaway classifier on `active_features`, ranks the
    features by mean absolute SHAP value on a held-out 30% split, and prints
    the top 10 with a note on each. Nothing is returned and nothing is
    enforced.

    DESIGN NOTE: this cannot act as a hard gate while source and class are
    perfectly confounded (all negatives = monitor, all positives = Datadog).
    Under that confound, legitimate behavioral features (score,
    count_critical, type_*) dominate the discriminator simply because malware
    genuinely behaves differently from clean packages — that is signal, not
    leak.

    A true source discrimination test would need either:
      (a) positives re-scanned through our own pipeline, or
      (b) negatives and positives drawn from the SAME source.

    The diagnostic is still useful for spotting NON-BEHAVIORAL features among
    the top discriminators: if metadata features (unpacked_size_bytes,
    file_count_total, etc.) show up even though Step 2a excluded them,
    something is wrong.

    The real validation happens in shadow deployment on live production data.
    """
    print("\n" + "=" * 60)
    print("[Step 2c/8] Source discrimination diagnostic (log-only)...")
    print("=" * 60)
    for note in (
        " NOTE: source=Datadog correlates 100% with label=malicious.",
        " This diagnostic checks for non-behavioral features in the",
        " top discriminators, NOT for overall accuracy (which will",
        " always be high due to the source/label confound).",
    ):
        print(note)

    # 70/30 split with its own seed so it does not mirror the main split.
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X[active_features], y, test_size=0.3, stratify=y, random_state=99
    )

    # Deliberately shallow probe: depth 3, 50 rounds, no class weighting.
    probe_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 3,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'seed': 99,
        'verbosity': 0,
    }

    fit_matrix = xgb.DMatrix(X_fit, label=y_fit, feature_names=active_features)
    eval_matrix = xgb.DMatrix(X_eval, label=y_eval, feature_names=active_features)

    probe = xgb.train(probe_params, fit_matrix, num_boost_round=50)
    hard_preds = (probe.predict(eval_matrix) >= 0.5).astype(int)
    accuracy = float((hard_preds == y_eval).mean())

    p = precision_score(y_eval, hard_preds, zero_division=0)
    r = recall_score(y_eval, hard_preds, zero_division=0)

    print(f"\n Discrimination accuracy: {accuracy:.3f} (P={p:.3f} R={r:.3f})")
    print(" (Expected to be high due to source/label confound)")

    # SHAP analysis — the diagnostic value lies in WHICH features dominate.
    shap_matrix = shap.TreeExplainer(probe).shap_values(X_eval)
    mean_abs_shap = np.abs(shap_matrix).mean(axis=0)
    ranking = sorted(zip(active_features, mean_abs_shap),
                     key=lambda pair: pair[1], reverse=True)

    # Aggregate behavioral features that are expected to rank highly.
    expected_behavioral = {
        'score', 'global_risk_score', 'max_file_score', 'package_score',
        'count_total', 'count_critical', 'count_high', 'count_medium',
        'count_low', 'distinct_threat_types', 'severity_ratio_high',
        'max_single_points', 'points_concentration', 'file_count_with_threats',
        'file_score_mean', 'file_score_max', 'threat_density',
    }
    # Metadata features that should NOT show up (sanity check on Step 2a).
    metadata_watchlist = {
        'unpacked_size_bytes', 'file_count_total', 'has_tests',
        'dep_count', 'dev_dep_count', 'reputation_factor',
        'package_age_days', 'weekly_downloads', 'version_count',
        'author_package_count', 'has_repository', 'readme_size',
    }

    print("\n Top 10 features driving discrimination:")
    leak_found = False
    for rank, (name, val) in enumerate(ranking[:10], start=1):
        if name in metadata_watchlist:
            flag = " *** LEAK — should have been excluded in Step 2a!"
            leak_found = True
        elif name in expected_behavioral:
            flag = " (expected — behavioral)"
        elif name.startswith(('type_', 'has_')):
            flag = " (behavioral signal)"
        else:
            flag = ""
        print(f" {rank:2d}. {name:40s} {val:.6f}{flag}")

    if not leak_found:
        print("\n [OK] Top discriminators are all behavioral features.")
        print(" No metadata/source-proxy leak detected.")
    else:
        print("\n [WARNING] Non-behavioral features found in top discriminators!")
        print(" Check EXCLUDED_METADATA — some metadata features leaked through.")
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
def split_data(X: pd.DataFrame, y: np.ndarray) -> tuple:
    """
    Step 3: Hold out 20% of the samples as a test set.

    The split is stratified on `y` and uses random_state=42. The size and
    class balance of both partitions are printed.

    Returns: (X_train, X_test, y_train, y_test)
    """
    rule = "=" * 60
    print("\n" + rule)
    print("[Step 3/8] Stratified train/test split (80/20, seed=42)...")
    print(rule)

    parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = parts

    train_neg = int((y_train == 0).sum())
    train_pos = int((y_train == 1).sum())
    test_neg = int((y_test == 0).sum())
    test_pos = int((y_test == 1).sum())

    print(f" Train: {len(X_train)} ({train_neg} neg, "
          f"{train_pos} pos)")
    print(f" Test: {len(X_test)} ({test_neg} neg, "
          f"{test_pos} pos)")

    return X_train, X_test, y_train, y_test
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
def train_preliminary_and_shap(X_train: pd.DataFrame, y_train: np.ndarray,
                               scale_pos_weight: float,
                               active_features: list,
                               top_k: int = 40) -> list:
    """
    Step 4: Rank features with a preliminary model and keep the best `top_k`.

    A 100-round booster is trained on `active_features` using XGB_PARAMS plus
    the given `scale_pos_weight`. Features are then ordered by mean absolute
    SHAP value over the training rows; the top 20 are printed, along with up
    to 10 of the names that did not make the cut.

    Returns: list of selected feature names, most important first
    """
    print("\n" + "=" * 60)
    print(f"[Step 4/8] Preliminary training + SHAP (top {top_k} from {len(active_features)} features)...")
    print("=" * 60)

    train_view = X_train[active_features]

    booster_params = dict(XGB_PARAMS)
    booster_params['scale_pos_weight'] = scale_pos_weight

    train_matrix = xgb.DMatrix(train_view, label=y_train,
                               feature_names=active_features)
    prelim = xgb.train(booster_params, train_matrix, num_boost_round=100)

    # SHAP: one attribution per (row, feature); average magnitude per feature.
    attributions = shap.TreeExplainer(prelim).shap_values(train_view)
    mean_abs_shap = np.abs(attributions).mean(axis=0)
    ranking = sorted(zip(active_features, mean_abs_shap),
                     key=lambda pair: pair[1], reverse=True)
    ranked_names = [name for name, _ in ranking]

    print("\n Top 20 features by SHAP importance:")
    for rank, (name, val) in enumerate(ranking[:20], start=1):
        print(f" {rank:2d}. {name:40s} {val:.6f}")

    selected = ranked_names[:top_k]

    # Report what fell below the cut-off (names truncated after ten).
    dropped = ranked_names[top_k:]
    if dropped:
        suffix = " ..." if len(dropped) > 10 else ""
        print(f"\n Dropped {len(dropped)} features: {', '.join(dropped[:10])}"
              + suffix)

    return selected
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
def cross_validate(X_train: pd.DataFrame, y_train: np.ndarray,
                   selected_features: list,
                   scale_pos_weight: float,
                   min_recall: float = 0.939) -> dict:
    """
    Step 5: Stratified K-fold CV (N_FOLDS folds) on the selected features.

    Each fold trains a booster with XGB_PARAMS plus `scale_pos_weight`, using
    the fold's validation part for early stopping, and stores its
    out-of-fold probabilities. The decision threshold is then chosen on the
    pooled out-of-fold predictions: among thresholds 0.10..0.90 (step 0.01)
    whose recall is at least `min_recall`, the one with the highest precision
    wins (the lowest such threshold on ties). If none qualifies, a warning is
    printed and 0.5 is used.

    Args:
        X_train: Training features; must contain `selected_features`.
        y_train: Binary labels aligned with the rows of `X_train`.
        selected_features: Columns to train on.
        scale_pos_weight: Positive-class weight passed to XGBoost.
        min_recall: Recall floor for threshold selection (default 0.939,
            the value this step previously hard-coded).

    Returns: dict with threshold, precision, recall, f1, fold_metrics and
    confusion_matrix (pooled out-of-fold, at the chosen threshold)
    """
    recall_floor_pct = f"{min_recall * 100:.1f}%"

    print("\n" + "=" * 60)
    print(f"[Step 5/8] {N_FOLDS}-fold stratified CV ({len(selected_features)} features)...")
    print("=" * 60)

    X_sel = X_train[selected_features]
    params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    fold_metrics = []
    # Out-of-fold buffers: every training row is predicted exactly once, by
    # the model of the fold in which it was held out.
    all_probs = np.zeros(len(y_train))
    all_labels = np.zeros(len(y_train))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_sel, y_train)):
        X_tr = X_sel.iloc[train_idx]
        X_va = X_sel.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_va = y_train[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=selected_features)
        dval = xgb.DMatrix(X_va, label=y_va, feature_names=selected_features)

        model = xgb.train(
            params, dtrain, num_boost_round=N_ESTIMATORS,
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )

        probs = model.predict(dval)
        all_probs[val_idx] = probs
        all_labels[val_idx] = y_va

        # Per-fold numbers are reported at the neutral 0.5 cut-off.
        preds = (probs >= 0.5).astype(int)
        p = precision_score(y_va, preds, zero_division=0)
        r = recall_score(y_va, preds, zero_division=0)
        f1 = f1_score(y_va, preds, zero_division=0)
        fold_metrics.append({'precision': p, 'recall': r, 'f1': f1})
        print(f" Fold {fold + 1}: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    # Optimize threshold: maximize precision while keeping recall >= min_recall.
    print(f"\n Optimizing threshold (recall >= {recall_floor_pct})...")
    best_threshold = 0.5
    best_precision = 0.0

    for t in np.arange(0.10, 0.91, 0.01):
        preds = (all_probs >= t).astype(int)
        r = recall_score(all_labels, preds, zero_division=0)
        p = precision_score(all_labels, preds, zero_division=0)
        if r >= min_recall and p > best_precision:
            best_precision = p
            best_threshold = float(t)

    # best_precision is still 0.0 only when no threshold met the recall floor.
    if best_precision == 0.0:
        print(f" [WARN] No threshold achieves recall >= {recall_floor_pct}")
        print(f" Using default threshold=0.5")
        best_threshold = 0.5

    final_preds = (all_probs >= best_threshold).astype(int)

    final_p = precision_score(all_labels, final_preds, zero_division=0)
    final_r = recall_score(all_labels, final_preds, zero_division=0)
    final_f1 = f1_score(all_labels, final_preds, zero_division=0)
    cm = confusion_matrix(all_labels, final_preds)

    print(f"\n Optimal threshold: {best_threshold:.2f}")
    print(f" CV metrics: P={final_p:.3f} R={final_r:.3f} F1={final_f1:.3f}")
    print(f" Confusion matrix:")
    print(f" TN={cm[0][0]} FP={cm[0][1]}")
    print(f" FN={cm[1][0]} TP={cm[1][1]}")

    return {
        'threshold': round(best_threshold, 3),
        'precision': round(float(final_p), 4),
        'recall': round(float(final_r), 4),
        'f1': round(float(final_f1), 4),
        'fold_metrics': fold_metrics,
        'confusion_matrix': cm.tolist()
    }
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
def train_final_model(X_train: pd.DataFrame, y_train: np.ndarray,
                      selected_features: list,
                      scale_pos_weight: float) -> xgb.Booster:
    """
    Step 6: Fit the final booster on the selected feature columns.

    A stratified 10% slice of the training data (fixed seed 42) is held back
    and used only as the early-stopping evaluation set; the booster itself is
    fitted on the remaining 90%.

    Args:
        X_train: Training feature frame; must contain every selected column.
        y_train: Labels passed alongside ``X_train`` to the split.
        selected_features: Column names to train on, also used as the
            DMatrix feature names.
        scale_pos_weight: Value stored under ``'scale_pos_weight'`` on top of
            the module-level ``XGB_PARAMS``.

    Returns:
        The booster returned by ``xgb.train``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(f"[Step 6/8] Training final model ({len(selected_features)} features)...")
    print(banner)

    # Copy the shared defaults so the module-level dict is never mutated.
    booster_params = dict(XGB_PARAMS)
    booster_params['scale_pos_weight'] = scale_pos_weight

    feature_frame = X_train[selected_features]

    # Internal 90/10 split: the 10% part only drives early stopping.
    fit_X, stop_X, fit_y, stop_y = train_test_split(
        feature_frame, y_train, test_size=0.1, stratify=y_train, random_state=42
    )

    fit_matrix = xgb.DMatrix(fit_X, label=fit_y, feature_names=selected_features)
    stop_matrix = xgb.DMatrix(stop_X, label=stop_y, feature_names=selected_features)

    model = xgb.train(
        booster_params,
        fit_matrix,
        num_boost_round=N_ESTIMATORS,
        evals=[(stop_matrix, 'early_stop')],
        early_stopping_rounds=20,
        verbose_eval=False,
    )

    # Fall back to the configured round count when the booster does not
    # expose a best_iteration attribute.
    best_round = getattr(model, 'best_iteration', N_ESTIMATORS)
    print(f" Best iteration: {best_round}")

    return model
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
def evaluate_holdout(model: xgb.Booster, X_test: pd.DataFrame,
                     y_test: np.ndarray, selected_features: list,
                     threshold: float) -> dict:
    """
    Step 7: Evaluate on holdout test set.

    Scores the holdout rows with ``model``, binarises the scores at
    ``threshold`` and prints precision, recall, F1, the confusion matrix,
    pass/warn lines for the recall (93.9%) and precision (95%) targets, and
    the 20 features with the highest gain.

    Args:
        model: Trained booster used for prediction and gain importance.
        X_test: Holdout feature frame; must contain every selected column.
        y_test: Holdout labels.
        selected_features: Column names to score on, also used as the
            DMatrix feature names.
        threshold: Scores ``>= threshold`` are predicted as class 1.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` (rounded to 4 decimals),
        ``confusion_matrix`` (nested list) and the integer counts ``tp``,
        ``fp``, ``fn``, ``tn``.
    """
    print("\n" + "=" * 60)
    print(f"[Step 7/8] Holdout evaluation (threshold={threshold:.3f})...")
    print("=" * 60)

    X_sel = X_test[selected_features]
    dtest = xgb.DMatrix(X_sel, label=y_test, feature_names=selected_features)
    probs = model.predict(dtest)

    preds = (probs >= threshold).astype(int)
    p = precision_score(y_test, preds, zero_division=0)
    r = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)

    # Pin the label set so the matrix is always 2x2. Without it, a holdout in
    # which only one class appears (in both truth and predictions) yields a
    # 1x1 matrix and the four-way unpacking below raises ValueError.
    # NOTE(review): assumes labels are encoded 0 = negative, 1 = positive,
    # matching the 0/1 predictions built above - confirm against the loader.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])

    tn, fp_count, fn, tp = cm.ravel()

    print(f" Precision: {p:.3f}")
    print(f" Recall: {r:.3f}")
    print(f" F1: {f1:.3f}")
    print(f" Confusion matrix:")
    print(f" TN={tn} FP={fp_count}")
    print(f" FN={fn} TP={tp}")

    # Hard verification (reported only; nothing is raised on a miss)
    if r < 0.939:
        print(f"\n [WARNING] Recall {r:.3f} < 93.9% target!")
    else:
        print(f"\n [PASS] Recall >= 93.9%")

    if p < 0.95:
        print(f" [WARNING] Precision {p:.3f} < 95% target!")
    else:
        print(f" [PASS] Precision >= 95%")

    # Feature importance (gain-based), highest gain first
    importance = model.get_score(importance_type='gain')
    sorted_imp = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print(f"\n Top 20 features (gain-based):")
    for i, (name, val) in enumerate(sorted_imp[:20]):
        print(f" {i + 1:2d}. {name:40s} {val:.4f}")

    return {
        'precision': round(float(p), 4),
        'recall': round(float(r), 4),
        'f1': round(float(f1), 4),
        'confusion_matrix': cm.tolist(),
        'tp': int(tp), 'fp': int(fp_count),
        'fn': int(fn), 'tn': int(tn)
    }
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
def convert_tree(tree_json: dict, nodes: list, feature_map: dict) -> int:
    """
    Flatten one XGBoost JSON tree node (and its subtree) into ``nodes``.

    Nodes are appended in pre-order: the current node first, then the whole
    "yes" subtree, then the "no" subtree. Each entry is a dict with keys
    ``f`` (feature index, -1 for a leaf or an unmapped split name),
    ``t`` (split threshold), ``y`` / ``n`` (indices of the yes / no children
    inside ``nodes``) and ``v`` (leaf value, 0 for split nodes). Thresholds
    and leaf values are rounded to 6 decimals.

    Args:
        tree_json: Parsed JSON dict for the node to convert.
        nodes: Output list, extended in place.
        feature_map: Mapping from split feature name to feature index.

    Returns:
        The index in ``nodes`` at which this node was stored.
    """
    slot = len(nodes)
    nodes.append(None)  # reserve this node's slot before visiting descendants

    if 'leaf' in tree_json:
        nodes[slot] = {
            'f': -1,
            't': 0,
            'y': 0,
            'n': 0,
            'v': round(tree_json['leaf'], 6)
        }
        return slot

    feature_idx = feature_map.get(tree_json.get('split', ''), -1)
    split_value = tree_json.get('split_condition', 0)
    kids = tree_json.get('children', [])
    yes_id = tree_json.get('yes', 0)
    no_id = tree_json.get('no', 0)

    # Match children by node id; a child matching the "yes" id is never
    # considered for "no", and a later match overrides an earlier one.
    yes_branch = no_branch = None
    for kid in kids:
        kid_id = kid.get('nodeid')
        if kid_id == yes_id:
            yes_branch = kid
        elif kid_id == no_id:
            no_branch = kid

    # Positional fallback when the ids did not identify a branch.
    if yes_branch is None and kids:
        yes_branch = kids[0]
    if no_branch is None and len(kids) >= 2:
        no_branch = kids[1]

    # A missing branch points back at this node itself.
    yes_slot = slot if not yes_branch else convert_tree(yes_branch, nodes, feature_map)
    no_slot = slot if not no_branch else convert_tree(no_branch, nodes, feature_map)

    nodes[slot] = {
        'f': feature_idx,
        't': round(split_value, 6),
        'y': yes_slot,
        'n': no_slot,
        'v': 0
    }
    return slot
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
def export_model_trees_js(model: xgb.Booster, selected_features: list,
                          threshold: float, output_path: str,
                          cv_metrics: dict, holdout_metrics: dict):
    """
    Step 8: Export model directly to model-trees.js.

    Dumps every tree of ``model`` as JSON, flattens each one with
    ``convert_tree`` and writes a CommonJS module (``module.exports = {...}``)
    holding the format version, feature list, decision threshold and trees.
    A header comment in the file records the tree/feature counts and the CV
    and holdout precision/recall/F1.

    Args:
        model: Trained booster to export.
        selected_features: Feature names; the position in this list is the
            feature index written into the flattened trees.
        threshold: Decision threshold stored in the exported model.
        output_path: Destination file. Missing parent directories are
            created so a finished training run is not lost at the last step.
        cv_metrics: Dict providing ``precision``, ``recall`` and ``f1``.
        holdout_metrics: Dict providing ``precision``, ``recall`` and ``f1``.

    Raises:
        OSError: If the destination cannot be created or written.
    """
    print("\n" + "=" * 60)
    print(f"[Step 8/8] Exporting to {output_path}...")
    print("=" * 60)

    # Get tree dump as JSON (one JSON string per tree)
    trees_dump = model.get_dump(dump_format='json')
    feature_map = {name: idx for idx, name in enumerate(selected_features)}

    # Convert each tree to flat array format
    js_trees = []
    for tree_str in trees_dump:
        nodes = []
        convert_tree(json.loads(tree_str), nodes, feature_map)
        js_trees.append(nodes)
    total_nodes = sum(len(nodes) for nodes in js_trees)

    # Build JS model object
    js_model = {
        'version': 1,
        'features': selected_features,
        'threshold': threshold,
        'trees': js_trees
    }

    # Assemble the JS module text: header comment, then the compact payload
    js_content = "".join([
        "'use strict';\n\n",
        "/**\n",
        " * XGBoost model trees — auto-generated by src/ml/train-xgboost.py\n",
        f" * {len(js_trees)} trees, {len(selected_features)} features, threshold={threshold}\n",
        f" * CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}\n",
        f" * Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}\n",
        " * DO NOT EDIT MANUALLY\n",
        " */\n\n",
        f"module.exports = {json.dumps(js_model, separators=(',', ':'))};\n",
    ])

    # Create the destination directory first; otherwise open() raises
    # FileNotFoundError when it does not exist yet.
    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)

    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(js_content)

    size_kb = out_file.stat().st_size / 1024
    print(f" Trees: {len(js_trees)}")
    print(f" Total nodes: {total_nodes}")
    print(f" Features: {len(selected_features)}")
    print(f" Threshold: {threshold:.3f}")
    print(f" File size: {size_kb:.1f} KB")
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
def main():
    """
    Command-line entry point: run the training pipeline end to end.

    Parses the CLI options, exits with status 1 if either input file is
    missing, then calls the pipeline steps in order: load, align features,
    drop excluded metadata columns, optional leaky-feature filter, optional
    source-discrimination diagnostic, train/test split, SHAP-based feature
    selection, cross-validation, final training, holdout evaluation and JS
    export. It finishes by printing a summary and any holdout warnings.
    """
    parser = argparse.ArgumentParser(
        description='Train MUAD\'DIB XGBoost classifier (dual-source JSONL)')
    parser.add_argument(
        '--negatives', required=True,
        help='Path to negatives JSONL (clean/fp labels)')
    parser.add_argument(
        '--positives', required=True,
        help='Path to positives JSONL (malicious labels)')
    parser.add_argument(
        '--output', default='src/ml/model-trees-shadow.js',
        help='Output JS file path (default: src/ml/model-trees-shadow.js)')
    parser.add_argument(
        '--top-features', type=int, default=50,
        help='Number of top SHAP features to select (default: 50)')
    parser.add_argument(
        '--common-only', action=argparse.BooleanOptionalAction, default=True,
        help='Only use features with >=1%% non-zero coverage in BOTH sources (default: on)')
    parser.add_argument(
        '--skip-gate', action='store_true',
        help='Skip source discrimination gate (dangerous — use only for debugging)')
    args = parser.parse_args()

    # Fail fast on missing inputs, negatives first.
    if not Path(args.negatives).exists():
        print(f"ERROR: Negatives file not found: {args.negatives}", file=sys.stderr)
        sys.exit(1)
    if not Path(args.positives).exists():
        print(f"ERROR: Positives file not found: {args.positives}", file=sys.stderr)
        sys.exit(1)

    # Step 1: load both sources.
    negatives, positives = load_and_prepare(args)

    # Step 2: build the aligned feature matrix and label vector.
    X, y, source_stats = align_features(negatives, positives)

    # Step 2a: drop known metadata/source-proxy columns before the leak
    # filter. They differ between sources for non-behavioral reasons and
    # would let the model learn source identity instead of malicious behavior.
    metadata_cols = [name for name in FEATURE_NAMES if name in EXCLUDED_METADATA]
    remaining_features = [name for name in FEATURE_NAMES if name not in EXCLUDED_METADATA]
    X = X.drop(columns=metadata_cols, errors='ignore')
    print(f"\n [Step 2a] Excluded {len(metadata_cols)} metadata features: "
          f"{', '.join(metadata_cols)}")
    print(f" Remaining: {len(remaining_features)} features")

    # Step 2b: filter dead/leaky features among the remaining behavioral ones.
    if args.common_only:
        X, active_features = filter_leaky_features(X, y)
    else:
        active_features = list(remaining_features)

    # Step 2c: source discrimination diagnostic (log-only, not a hard gate).
    # Source label is 100% confounded with class label (all positives =
    # Datadog, all negatives = monitor), so behavioral features will always
    # dominate the discriminator; the diagnostic checks that no METADATA
    # features leaked through Step 2a.
    if args.skip_gate:
        print("\n [Step 2c] Source discrimination diagnostic SKIPPED (--skip-gate)")
    else:
        source_discrimination_diagnostic(X, y, active_features)

    # Class imbalance weight (guarded against a zero positive count).
    n_neg = source_stats['n_neg']
    n_pos = source_stats['n_pos']
    scale_pos_weight = n_neg / max(n_pos, 1)
    print(f"\n scale_pos_weight: {scale_pos_weight:.2f}")

    # Step 3: train/test split.
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Step 4: preliminary model + SHAP feature selection.
    selected = train_preliminary_and_shap(
        X_train, y_train, scale_pos_weight, active_features,
        top_k=args.top_features)

    # Step 5: cross-validation (also yields the decision threshold).
    cv_metrics = cross_validate(X_train, y_train, selected, scale_pos_weight)
    threshold = cv_metrics['threshold']

    # Step 6: final model.
    booster = train_final_model(X_train, y_train, selected, scale_pos_weight)

    # Step 7: holdout evaluation at the CV threshold.
    holdout_metrics = evaluate_holdout(booster, X_test, y_test, selected, threshold)

    # Step 8: export.
    export_model_trees_js(
        booster, selected, threshold, args.output, cv_metrics, holdout_metrics)

    # Summary
    banner = "=" * 60
    print("\n" + banner)
    print("TRAINING COMPLETE")
    print(banner)
    print(f" Samples: {n_neg} negatives + {n_pos} positives = {n_neg + n_pos}")
    print(f" Features: {len(selected)} selected (from {len(active_features)} active / "
          f"{len(FEATURE_NAMES)} total, {len(EXCLUDED_METADATA)} metadata excluded)")
    print(f" Threshold: {cv_metrics['threshold']:.3f}")
    print(f" CV: P={cv_metrics['precision']:.3f} R={cv_metrics['recall']:.3f} F1={cv_metrics['f1']:.3f}")
    print(f" Holdout: P={holdout_metrics['precision']:.3f} R={holdout_metrics['recall']:.3f} F1={holdout_metrics['f1']:.3f}")
    print(f" Output: {args.output}")

    # Warnings
    if holdout_metrics['recall'] < 0.939:
        print(f"\n [WARNING] Holdout recall {holdout_metrics['recall']:.3f} < 93.9% target")
    if holdout_metrics['precision'] < 0.95:
        print(f" [WARNING] Holdout precision {holdout_metrics['precision']:.3f} < 95% target")
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
if __name__ == '__main__':
|
|
957
|
-
main()
|