muaddib-scanner 2.11.76 → 2.11.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-commit +18 -0
- package/README.md +15 -6
- package/bin/muaddib.js +18 -4
- package/package.json +1 -2
- package/{self-scan-v2.11.76.json → self-scan-v2.11.78.json} +1 -1
- package/src/commands/interactive.js +5 -6
- package/src/commands/safe-install.js +19 -19
- package/src/ioc/scraper.js +46 -10
- package/src/monitor/daemon.js +39 -28
- package/src/monitor/ingestion.js +32 -2
- package/src/monitor/queue.js +84 -21
- package/src/monitor/scan-queue.js +68 -1
- package/src/monitor/state.js +24 -1
- package/src/monitor/webhook.js +32 -11
- package/src/output/formatter.js +3 -4
- package/src/pipeline/executor.js +9 -1
- package/src/runtime/daemon.js +27 -28
- package/src/runtime/watch.js +7 -7
- package/src/sandbox/index.js +11 -9
- package/src/scanner/temporal-analysis.js +8 -0
- package/src/scanner/temporal-ast-diff.js +5 -0
- package/src/utils.js +60 -1
- package/.dockerignore +0 -7
- package/.env.example +0 -43
- package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
- package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
- package/ml-retrain/auto-labeler/labeler.py +0 -256
- package/ml-retrain/auto-labeler/npm_checker.py +0 -228
- package/ml-retrain/auto-labeler/ossf_index.py +0 -178
- package/ml-retrain/auto-labeler/requirements.txt +0 -1
- package/ml-retrain/confusion-matrix.png +0 -0
- package/ml-retrain/model-trees-retrained.js +0 -12
- package/ml-retrain/retrain-report.json +0 -225
- package/ml-retrain/retrain.py +0 -974
- package/sbom.json +0 -0
- package/src/ml/train-bundler-detector.py +0 -725
- package/src/ml/train-xgboost.py +0 -957
- package/tools/export-model-js.py +0 -160
- package/tools/requirements-ml.txt +0 -5
- package/tools/train-classifier.py +0 -333
package/tools/export-model-js.py
DELETED
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Export XGBoost model JSON to JavaScript module for MUAD'DIB runtime.
|
|
4
|
-
|
|
5
|
-
Converts the tree dump from train-classifier.py into a compact JS format
|
|
6
|
-
that can be traversed by src/ml/classifier.js without any Python dependency.
|
|
7
|
-
|
|
8
|
-
Usage:
|
|
9
|
-
python tools/export-model-js.py model.json [--output src/ml/model-trees.js]
|
|
10
|
-
|
|
11
|
-
Output format:
|
|
12
|
-
module.exports = {
|
|
13
|
-
version: 1,
|
|
14
|
-
features: ['score', 'count_total', ...],
|
|
15
|
-
threshold: 0.45,
|
|
16
|
-
trees: [
|
|
17
|
-
[{f: 0, t: 25.5, y: 1, n: 2, v: 0}, ...], // tree 0
|
|
18
|
-
[{f: -1, t: 0, y: 0, n: 0, v: 0.123}, ...], // tree 1 (leaf)
|
|
19
|
-
...
|
|
20
|
-
]
|
|
21
|
-
};
|
|
22
|
-
|
|
23
|
-
Node format:
|
|
24
|
-
f: feature index (-1 for leaf nodes)
|
|
25
|
-
t: split threshold (0 for leaves)
|
|
26
|
-
y: yes (left) child index
|
|
27
|
-
n: no (right) child index
|
|
28
|
-
v: leaf value (0 for internal nodes)
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
import argparse
|
|
32
|
-
import json
|
|
33
|
-
import sys
|
|
34
|
-
from pathlib import Path
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def convert_tree(tree_json: dict, nodes: list, feature_map: dict) -> int:
    """
    Recursively convert an XGBoost tree JSON node to flat array format.

    The node is appended to ``nodes`` (pre-order: node, yes-subtree,
    no-subtree) and its index in that list is returned.

    Args:
        tree_json: One node of an XGBoost JSON tree dump. Leaves carry a
            ``leaf`` key; internal nodes carry ``split``, ``split_condition``,
            ``yes``, ``no`` and ``children``.
        nodes: Output list, mutated in place.
        feature_map: Feature name -> feature index.

    Returns:
        Index of this node inside ``nodes``.

    Raises:
        ValueError: If an internal node splits on a feature that is not in
            ``feature_map`` or has fewer than two children. Either case would
            otherwise produce a flat tree that cannot be evaluated correctly.
    """
    idx = len(nodes)
    nodes.append(None)  # Reserve the slot so children get higher indices

    if 'leaf' in tree_json:
        nodes[idx] = {
            'f': -1,
            't': 0,
            'y': 0,
            'n': 0,
            'v': round(tree_json['leaf'], 6)
        }
        return idx

    # Internal node
    split_feature = tree_json.get('split', '')
    if split_feature not in feature_map:
        # -1 is the leaf sentinel; emitting it for an internal node would
        # turn the whole subtree into a zero-valued leaf without any error.
        raise ValueError(
            f"Split feature {split_feature!r} is not in the model feature list")
    feature_idx = feature_map[split_feature]
    threshold = tree_json.get('split_condition', 0)

    children = tree_json.get('children', [])
    if len(children) < 2:
        # A missing branch used to point back at this node, which makes any
        # traversal of the exported tree loop forever.
        raise ValueError(
            f"Internal node on {split_feature!r} has {len(children)} "
            f"child(ren), expected 2")

    # Match children to the yes/no branches by nodeid.
    child_ids = [child.get('nodeid') for child in children]
    yes_id = tree_json.get('yes')
    no_id = tree_json.get('no')
    yes_pos = child_ids.index(yes_id) if yes_id is not None and yes_id in child_ids else None
    no_pos = child_ids.index(no_id) if no_id is not None and no_id in child_ids else None
    if no_pos is not None and no_pos == yes_pos:
        no_pos = None  # Both ids resolved to the same child; re-pick below

    # Fallback: when a branch was not matched by nodeid, use child order,
    # but never hand the same child to both branches.
    if yes_pos is None:
        yes_pos = 1 if no_pos == 0 else 0
    if no_pos is None:
        no_pos = 0 if yes_pos == 1 else 1

    # Recurse (yes-branch first so the flat layout is stable)
    yes_idx = convert_tree(children[yes_pos], nodes, feature_map)
    no_idx = convert_tree(children[no_pos], nodes, feature_map)

    nodes[idx] = {
        'f': feature_idx,
        't': round(threshold, 6),
        'y': yes_idx,
        'n': no_idx,
        'v': 0
    }

    return idx
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def export_to_js(model_path: str, output_path: str):
    """
    Read a model JSON file and write it out as a CommonJS module.

    The input must provide ``features``, ``threshold`` and ``trees_raw``.
    Every raw tree is flattened with convert_tree() and the result is
    serialized compactly behind a ``module.exports =`` assignment. A short
    summary (tree/feature counts, file size, threshold, total node count)
    is printed to stdout.
    """
    with open(model_path, 'r', encoding='utf-8') as src:
        model = json.load(src)

    features = model['features']
    threshold = model['threshold']
    trees_raw = model['trees_raw']

    # Feature name -> position in the feature list
    feature_map = {}
    for position, name in enumerate(features):
        feature_map[name] = position

    # Flatten every tree into its own node list
    js_trees = []
    for raw_tree in trees_raw:
        flat_nodes = []
        convert_tree(raw_tree, flat_nodes, feature_map)
        js_trees.append(flat_nodes)

    # Compact separators keep the generated module small
    payload = json.dumps(
        {
            'version': 1,
            'features': features,
            'threshold': threshold,
            'trees': js_trees
        },
        separators=(',', ':'),
    )

    js_content = ''.join([
        "'use strict';\n\n",
        "/**\n",
        " * XGBoost model trees — auto-generated by tools/export-model-js.py\n",
        f" * {len(js_trees)} trees, {len(features)} features, threshold={threshold}\n",
        " * DO NOT EDIT MANUALLY\n",
        " */\n\n",
        f"module.exports = {payload};\n",
    ])

    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.write(js_content)

    size_kb = Path(output_path).stat().st_size / 1024
    print(f"Exported {len(js_trees)} trees, {len(features)} features to {output_path} ({size_kb:.1f} KB)")
    print(f"Threshold: {threshold}")

    # Report the total number of flattened nodes as a quick sanity figure
    total_nodes = sum(len(flat) for flat in js_trees)
    print(f"Total nodes: {total_nodes}")
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
def main():
    """Command-line entry point: parse arguments, then export the model.

    Exits with status 1 (message on stderr) when the model file is missing.
    """
    cli = argparse.ArgumentParser(
        description='Export XGBoost model to JavaScript module')
    cli.add_argument('model', help='Path to model.json from train-classifier.py')
    cli.add_argument('--output', default='src/ml/model-trees.js',
                     help='Output JS file path')
    args = cli.parse_args()

    model_file = Path(args.model)
    if not model_file.exists():
        print(f"ERROR: Model file not found: {args.model}", file=sys.stderr)
        sys.exit(1)

    export_to_js(args.model, args.output)
    print("\nDone! The model is ready for use in src/ml/classifier.js")


if __name__ == '__main__':
    main()
|
|
package/tools/train-classifier.py
DELETED
|
@@ -1,333 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
MUAD'DIB ML Classifier Training Pipeline
|
|
4
|
-
|
|
5
|
-
Trains a binary XGBoost classifier to distinguish true positives from false
|
|
6
|
-
positives in the T1 zone (score 20-34). Designed to be run offline — no
|
|
7
|
-
Python dependency in production.
|
|
8
|
-
|
|
9
|
-
Usage:
|
|
10
|
-
python tools/train-classifier.py [--data data/ml-training.jsonl] [--output model.json]
|
|
11
|
-
|
|
12
|
-
Label strategy:
|
|
13
|
-
- Positives: Datadog ground-truth malware corpus (scanned with muaddib)
|
|
14
|
-
- Negatives: monitor label='clean' packages (0 findings = truly benign)
|
|
15
|
-
- EXCLUDED: 'suspect' (unverified), 'fp' (auto-labeled, biased)
|
|
16
|
-
|
|
17
|
-
Output:
|
|
18
|
-
- model.json: XGBoost tree dump + feature list + threshold
|
|
19
|
-
- Use tools/export-model-js.py to convert to src/ml/model-trees.js
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
import argparse
|
|
23
|
-
import json
|
|
24
|
-
import sys
|
|
25
|
-
from pathlib import Path
|
|
26
|
-
|
|
27
|
-
import numpy as np
|
|
28
|
-
import pandas as pd
|
|
29
|
-
import shap
|
|
30
|
-
from sklearn.model_selection import StratifiedKFold
|
|
31
|
-
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
|
|
32
|
-
import xgboost as xgb
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# --- Constants ---

# Feature columns to EXCLUDE (identity/metadata, not features).
# prepare_data() drops these, plus any column starting with '_', when it
# builds the feature matrix.
IDENTITY_COLS = {'name', 'version', 'ecosystem', 'timestamp', 'label', 'tier'}

# Minimum samples required for training.
# main() compares this against the number of records loaded from the JSONL
# file, i.e. before any label filtering.
MIN_SAMPLES = 100

# XGBoost hyperparameters (tuned for supply-chain threat detection).
# 'scale_pos_weight' is not listed here; callers merge it in per run.
XGB_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'seed': 42,
    'verbosity': 0,
}

# Boosting rounds used for the CV folds and for the final model
N_ESTIMATORS = 200
# Number of StratifiedKFold splits used by cross_validate()
N_FOLDS = 5
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def load_jsonl(filepath: str) -> pd.DataFrame:
    """Load JSONL file into DataFrame.

    Blank lines are ignored. Lines that are not valid JSON, or that are valid
    JSON but not an object (arrays, numbers, ``null``), are skipped with a
    warning on stderr so a single bad line cannot corrupt the frame.

    The file is decoded as ``utf-8-sig``: a leading byte-order mark is
    stripped, so the first record of a BOM-prefixed file is no longer lost to
    a JSON parse failure. Files without a BOM decode exactly as plain UTF-8.

    Args:
        filepath: Path to the JSONL file.

    Returns:
        DataFrame with one row per JSON object read.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f" [WARN] Skipping malformed line {line_num}", file=sys.stderr)
                continue
            if not isinstance(record, dict):
                print(f" [WARN] Skipping non-object line {line_num}", file=sys.stderr)
                continue
            records.append(record)
    return pd.DataFrame(records)
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def prepare_data(df: pd.DataFrame) -> tuple:
    """
    Prepare training data from monitor JSONL.

    Keeps rows labelled 'confirmed' (target 1) and 'clean' (target 0); every
    other label is dropped. Clean rows are narrowed to 20 <= score < 35 when
    at least 50 such rows exist, otherwise all clean rows are used.

    Args:
        df: Records as loaded from the JSONL file. Must contain the columns
            'label' and 'score'.

    Returns: (X, y, feature_names, stats_dict)
        X is a float DataFrame of the alphabetically sorted feature columns
        with missing values replaced by 0; y is the 0/1 target Series.

    Raises:
        KeyError: If 'label' or 'score' is missing from ``df``.
    """
    print(f"\n[1/5] Loading data: {len(df)} total records")

    # Fail with an explicit message rather than a bare pandas KeyError.
    missing_cols = [col for col in ('label', 'score') if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"Training data is missing required column(s): {', '.join(missing_cols)}")

    # Show label distribution
    label_counts = df['label'].value_counts()
    print(f" Label distribution:")
    for label, count in label_counts.items():
        print(f" {label}: {count}")

    # Filter to usable labels only
    # Positives: 'confirmed'
    # Negatives: 'clean'
    # Excluded: everything else (e.g. 'suspect', 'fp')
    positives = df[df['label'] == 'confirmed'].copy()
    negatives = df[df['label'] == 'clean'].copy()

    # For negatives in T1 zone training: filter to score 20-34
    # This focuses the model on the decision boundary
    # NOTE(review): positives are not score-filtered, and 'score' is not in
    # IDENTITY_COLS, so it stays a feature — confirm this asymmetry is
    # intended and does not let 'score' act as a label proxy.
    negatives_t1 = negatives[(negatives['score'] >= 20) & (negatives['score'] < 35)]

    print(f"\n Training set:")
    print(f" Positives (confirmed): {len(positives)}")
    print(f" Negatives (clean): {len(negatives)} total, {len(negatives_t1)} in T1 zone")

    # If not enough T1 negatives, use all clean negatives
    if len(negatives_t1) < 50:
        print(f" [INFO] Not enough T1 negatives ({len(negatives_t1)}), using all clean samples")
        neg_sample = negatives
    else:
        neg_sample = negatives_t1

    # Combine
    combined = pd.concat([positives, neg_sample], ignore_index=True)
    combined['_target'] = (combined['label'] == 'confirmed').astype(int)

    # Extract feature columns (sorted so the column order is deterministic)
    feature_cols = sorted(
        col for col in combined.columns
        if col not in IDENTITY_COLS and not col.startswith('_')
    )

    X = combined[feature_cols].fillna(0).astype(float)
    y = combined['_target']

    stats = {
        'total_records': len(df),
        'positives': len(positives),
        'negatives_total': len(negatives),
        'negatives_t1': len(negatives_t1),
        'negatives_used': len(neg_sample),
        'features': len(feature_cols),
        'class_balance': f"{len(positives)}:{len(neg_sample)}"
    }

    return X, y, feature_cols, stats
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def select_features_shap(model, X: pd.DataFrame, feature_names: list,
                         top_k: int = 40) -> list:
    """
    Rank features by mean absolute SHAP value and keep the top_k strongest.

    Prints the 20 highest-ranked features and returns the selected feature
    names in descending order of importance.
    """
    print(f"\n[3/5] SHAP feature selection (top {top_k})...")
    shap_values = shap.TreeExplainer(model).shap_values(X)

    # One importance figure per feature: average |SHAP| over all rows
    mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

    ranking = list(zip(feature_names, mean_abs_shap))
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    print(f"\n Top 20 features by SHAP importance:")
    for i, (name, val) in enumerate(ranking[:20]):
        print(f" {i + 1:2d}. {name:40s} {val:.4f}")

    return [name for name, _ in ranking[:top_k]]
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def cross_validate(X: pd.DataFrame, y: pd.Series, feature_names: list,
                   scale_pos_weight: float, min_recall: float = 0.939) -> dict:
    """
    Stratified k-fold CV followed by a precision@recall threshold search.

    Each fold trains with early stopping against its own validation split;
    the out-of-fold probabilities are pooled and thresholds from 0.10 to 0.89
    are swept to find the one with the highest precision whose recall is at
    least ``min_recall``.

    Args:
        X: Feature matrix.
        y: 0/1 target aligned with ``X``.
        feature_names: Column names passed to the DMatrix.
        scale_pos_weight: Class-imbalance weight merged into XGB_PARAMS.
        min_recall: Recall floor for the threshold search (default 93.9%).

    Returns:
        Dict with 'threshold', 'precision', 'recall', 'fold_metrics'
        (per-fold values at threshold 0.5) and 'confusion_matrix'.
        If no swept threshold reaches ``min_recall`` the threshold stays at
        0.5 and a warning is written to stderr.
    """
    print(f"\n[4/5] 5-fold stratified cross-validation...")

    params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    fold_metrics = []
    all_probs = np.zeros(len(y))
    all_labels = np.zeros(len(y))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
        dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)

        # NOTE(review): early stopping monitors the same split the fold is
        # scored on, so the fold metrics are likely optimistic — confirm
        # that is acceptable.
        model = xgb.train(
            params, dtrain, num_boost_round=N_ESTIMATORS,
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )

        probs = model.predict(dval)
        all_probs[val_idx] = probs
        all_labels[val_idx] = y_val.values

        # Default threshold 0.5
        preds = (probs >= 0.5).astype(int)
        p = precision_score(y_val, preds, zero_division=0)
        r = recall_score(y_val, preds, zero_division=0)
        f1 = f1_score(y_val, preds, zero_division=0)
        fold_metrics.append({'precision': p, 'recall': r, 'f1': f1})
        print(f" Fold {fold + 1}: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    # Find optimal threshold: maximize precision while keeping recall >= min_recall
    thresholds = np.arange(0.1, 0.9, 0.01)
    best_threshold = 0.5
    best_precision = 0
    recall_target_met = False

    for t in thresholds:
        preds = (all_probs >= t).astype(int)
        r = recall_score(all_labels, preds, zero_division=0)
        if r < min_recall:
            continue
        recall_target_met = True
        p = precision_score(all_labels, preds, zero_division=0)
        # Strict '>' keeps the lowest threshold among precision ties
        if p > best_precision:
            best_precision = p
            best_threshold = t

    if not recall_target_met:
        # Without this the 0.5 fallback is reported as "optimal" even though
        # it does not satisfy the recall constraint.
        print(f" [WARN] No threshold reaches recall >= {min_recall:.3f}; "
              f"falling back to {best_threshold:.2f}", file=sys.stderr)

    final_preds = (all_probs >= best_threshold).astype(int)
    final_p = precision_score(all_labels, final_preds, zero_division=0)
    final_r = recall_score(all_labels, final_preds, zero_division=0)
    cm = confusion_matrix(all_labels, final_preds)

    print(f"\n Optimal threshold: {best_threshold:.2f}")
    print(f" Final metrics: P={final_p:.3f} R={final_r:.3f}")
    print(f" Confusion matrix:\n {cm}")

    return {
        'threshold': round(float(best_threshold), 3),
        'precision': round(float(final_p), 4),
        'recall': round(float(final_r), 4),
        'fold_metrics': fold_metrics,
        'confusion_matrix': cm.tolist()
    }
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def train_final_model(X: pd.DataFrame, y: pd.Series, feature_names: list,
                      scale_pos_weight: float) -> xgb.Booster:
    """Fit the production booster on the complete data set.

    Uses XGB_PARAMS plus the given scale_pos_weight and runs the full
    N_ESTIMATORS boosting rounds (no early stopping).
    """
    print(f"\n[5/5] Training final model on all data...")
    booster_params = dict(XGB_PARAMS)
    booster_params['scale_pos_weight'] = scale_pos_weight
    full_matrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
    return xgb.train(booster_params, full_matrix, num_boost_round=N_ESTIMATORS)
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
def export_model_json(model: xgb.Booster, feature_names: list,
                      threshold: float, output_path: str, cv_metrics: dict):
    """Write the booster's tree dump and its metadata to a JSON file.

    The file holds the feature list, decision threshold, tree count, the CV
    precision/recall/threshold and every tree parsed from the booster's JSON
    dump. A short summary is printed once the file is written.
    """
    trees_dump = model.get_dump(dump_format='json')

    # Only these three CV figures are persisted
    cv_summary = {
        'precision': cv_metrics['precision'],
        'recall': cv_metrics['recall'],
        'threshold': cv_metrics['threshold']
    }

    model_data = {
        'version': 1,
        'algorithm': 'xgboost',
        'features': feature_names,
        'threshold': threshold,
        'n_trees': len(trees_dump),
        'cv_metrics': cv_summary,
        # get_dump() yields one JSON string per tree; store them parsed
        'trees_raw': list(map(json.loads, trees_dump))
    }

    target = Path(output_path)
    with target.open('w', encoding='utf-8') as fh:
        json.dump(model_data, fh, indent=2)

    size_mb = target.stat().st_size / (1024 * 1024)
    print(f"\n Model exported to {output_path} ({size_mb:.1f} MB)")
    print(f" {len(trees_dump)} trees, {len(feature_names)} features, threshold={threshold:.3f}")
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
def main():
    """Command-line entry point for the training pipeline.

    Steps: load the JSONL data, build the training matrix, fit a preliminary
    model for SHAP feature selection, cross-validate on the selected
    features, train the final model and export it as JSON.

    Exits with status 1 (message on stderr) when the data file is missing,
    when fewer than MIN_SAMPLES records were loaded, or when either class
    has fewer than N_FOLDS samples. The last check is needed because
    stratified k-fold CV requires at least one member of each class per fold.
    """
    parser = argparse.ArgumentParser(description='Train MUAD\'DIB ML classifier')
    parser.add_argument('--data', default='data/ml-training.jsonl',
                        help='Path to JSONL training data')
    parser.add_argument('--output', default='model.json',
                        help='Path for model JSON output')
    parser.add_argument('--top-features', type=int, default=40,
                        help='Number of top SHAP features to select')
    args = parser.parse_args()

    if args.top_features < 1:
        # Zero or negative would select no features and fail later in training
        parser.error('--top-features must be at least 1')

    # Load data
    if not Path(args.data).exists():
        print(f"ERROR: Training data not found: {args.data}", file=sys.stderr)
        sys.exit(1)

    df = load_jsonl(args.data)
    if len(df) < MIN_SAMPLES:
        print(f"ERROR: Need at least {MIN_SAMPLES} samples, got {len(df)}", file=sys.stderr)
        sys.exit(1)

    # Prepare data
    X, y, feature_names, stats = prepare_data(df)
    print(f"\n[2/5] Training with {stats['features']} features, "
          f"balance {stats['class_balance']}")

    # Class imbalance weight
    n_pos = int(y.sum())
    n_neg = len(y) - n_pos
    if min(n_pos, n_neg) < N_FOLDS:
        # Fail early with a clear message instead of deep inside the CV split
        print(f"ERROR: Need at least {N_FOLDS} samples per class, "
              f"got {n_pos} confirmed / {n_neg} clean", file=sys.stderr)
        sys.exit(1)
    scale_pos_weight = n_neg / max(n_pos, 1)
    print(f" scale_pos_weight: {scale_pos_weight:.2f}")

    # Phase 1: Train preliminary model for SHAP feature selection
    prelim_params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    dtrain = xgb.DMatrix(X, label=y, feature_names=feature_names)
    prelim_model = xgb.train(prelim_params, dtrain, num_boost_round=100)

    # SHAP feature selection
    selected_features = select_features_shap(
        prelim_model, X, feature_names, top_k=args.top_features)

    # Retrain with selected features
    X_selected = X[selected_features]

    # Cross-validate
    cv_metrics = cross_validate(X_selected, y, selected_features, scale_pos_weight)

    # Train final model
    final_model = train_final_model(X_selected, y, selected_features, scale_pos_weight)

    # Export
    export_model_json(final_model, selected_features, cv_metrics['threshold'],
                      args.output, cv_metrics)

    print(f"\n{'=' * 60}")
    print(f"Training complete!")
    print(f" Samples: {stats['positives']} malicious + {stats['negatives_used']} clean")
    print(f" Features: {len(selected_features)} (from {stats['features']} total)")
    print(f" Precision: {cv_metrics['precision']:.1%}")
    print(f" Recall: {cv_metrics['recall']:.1%}")
    print(f" Threshold: {cv_metrics['threshold']:.3f}")
    print(f"\nNext: python tools/export-model-js.py {args.output}")


if __name__ == '__main__':
    main()
|