muaddib-scanner 2.11.76 → 2.11.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/.githooks/pre-commit +18 -0
  2. package/README.md +15 -6
  3. package/bin/muaddib.js +18 -4
  4. package/package.json +1 -2
  5. package/{self-scan-v2.11.76.json → self-scan-v2.11.78.json} +1 -1
  6. package/src/commands/interactive.js +5 -6
  7. package/src/commands/safe-install.js +19 -19
  8. package/src/ioc/scraper.js +46 -10
  9. package/src/monitor/daemon.js +39 -28
  10. package/src/monitor/ingestion.js +32 -2
  11. package/src/monitor/queue.js +84 -21
  12. package/src/monitor/scan-queue.js +68 -1
  13. package/src/monitor/state.js +24 -1
  14. package/src/monitor/webhook.js +32 -11
  15. package/src/output/formatter.js +3 -4
  16. package/src/pipeline/executor.js +9 -1
  17. package/src/runtime/daemon.js +27 -28
  18. package/src/runtime/watch.js +7 -7
  19. package/src/sandbox/index.js +11 -9
  20. package/src/scanner/temporal-analysis.js +8 -0
  21. package/src/scanner/temporal-ast-diff.js +5 -0
  22. package/src/utils.js +60 -1
  23. package/.dockerignore +0 -7
  24. package/.env.example +0 -43
  25. package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
  26. package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
  27. package/ml-retrain/auto-labeler/labeler.py +0 -256
  28. package/ml-retrain/auto-labeler/npm_checker.py +0 -228
  29. package/ml-retrain/auto-labeler/ossf_index.py +0 -178
  30. package/ml-retrain/auto-labeler/requirements.txt +0 -1
  31. package/ml-retrain/confusion-matrix.png +0 -0
  32. package/ml-retrain/model-trees-retrained.js +0 -12
  33. package/ml-retrain/retrain-report.json +0 -225
  34. package/ml-retrain/retrain.py +0 -974
  35. package/sbom.json +0 -0
  36. package/src/ml/train-bundler-detector.py +0 -725
  37. package/src/ml/train-xgboost.py +0 -957
  38. package/tools/export-model-js.py +0 -160
  39. package/tools/requirements-ml.txt +0 -5
  40. package/tools/train-classifier.py +0 -333
@@ -1,160 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Export XGBoost model JSON to JavaScript module for MUAD'DIB runtime.
4
-
5
- Converts the tree dump from train-classifier.py into a compact JS format
6
- that can be traversed by src/ml/classifier.js without any Python dependency.
7
-
8
- Usage:
9
- python tools/export-model-js.py model.json [--output src/ml/model-trees.js]
10
-
11
- Output format:
12
- module.exports = {
13
- version: 1,
14
- features: ['score', 'count_total', ...],
15
- threshold: 0.45,
16
- trees: [
17
- [{f: 0, t: 25.5, y: 1, n: 2, v: 0}, ...], // tree 0
18
- [{f: -1, t: 0, y: 0, n: 0, v: 0.123}, ...], // tree 1 (leaf)
19
- ...
20
- ]
21
- };
22
-
23
- Node format:
24
- f: feature index (-1 for leaf nodes)
25
- t: split threshold (0 for leaves)
26
- y: yes (left) child index
27
- n: no (right) child index
28
- v: leaf value (0 for internal nodes)
29
- """
30
-
31
- import argparse
32
- import json
33
- import sys
34
- from pathlib import Path
35
-
36
-
37
def convert_tree(tree_json: dict, nodes: list, feature_map: dict) -> int:
    """
    Recursively flatten one XGBoost JSON tree node into ``nodes``.

    The node reserves its slot before recursing, so a parent always sits at
    a lower index than its descendants, and the whole "yes" subtree is laid
    out before the "no" subtree.

    Args:
        tree_json: A node of the XGBoost JSON dump. Leaves carry ``leaf``;
            internal nodes carry ``split``, ``split_condition``, ``yes``,
            ``no`` and ``children``.
        nodes: Output list of flat node dicts; appended to in place.
        feature_map: Mapping of feature name -> feature index.

    Returns:
        The index of this node in ``nodes``.

    Raises:
        ValueError: If an internal node splits on a feature that is not in
            ``feature_map`` or has fewer than two children. Emitting such a
            node would produce ``f: -1`` (the leaf sentinel) or a child
            index pointing back at the node itself.
    """
    idx = len(nodes)
    nodes.append(None)  # Reserve the slot so children land after the parent

    if 'leaf' in tree_json:
        nodes[idx] = {
            'f': -1,
            't': 0,
            'y': 0,
            'n': 0,
            'v': round(tree_json['leaf'], 6),
        }
        return idx

    # Internal node
    split_feature = tree_json.get('split', '')
    if split_feature not in feature_map:
        raise ValueError(
            f"Unknown split feature {split_feature!r} "
            f"at node {tree_json.get('nodeid')}")
    feature_idx = feature_map[split_feature]
    threshold = tree_json.get('split_condition', 0)

    children = tree_json.get('children', [])
    if len(children) < 2:
        raise ValueError(
            f"Internal node {tree_json.get('nodeid')} has "
            f"{len(children)} children, expected 2")

    # Match the yes/no branches by nodeid
    yes_child = tree_json.get('yes', 0)
    no_child = tree_json.get('no', 0)
    yes_tree = None
    no_tree = None
    for child in children:
        if child.get('nodeid') == yes_child:
            yes_tree = child
        elif child.get('nodeid') == no_child:
            no_tree = child

    # Fallback: if a branch was not matched by nodeid, use positional order,
    # but never hand the same child to both branches.
    if yes_tree is None:
        yes_tree = children[1] if children[0] is no_tree else children[0]
    if no_tree is None:
        no_tree = children[0] if children[1] is yes_tree else children[1]

    yes_idx = convert_tree(yes_tree, nodes, feature_map)
    no_idx = convert_tree(no_tree, nodes, feature_map)

    nodes[idx] = {
        'f': feature_idx,
        't': round(threshold, 6),
        'y': yes_idx,
        'n': no_idx,
        'v': 0,
    }
    return idx
93
-
94
-
95
def export_to_js(model_path: str, output_path: str):
    """
    Convert a model JSON file into a JavaScript module.

    Reads ``features``, ``threshold`` and ``trees_raw`` from the model JSON,
    flattens every raw tree with ``convert_tree`` and writes a CommonJS
    module (``module.exports = {...}``) to ``output_path``. Missing parent
    directories of ``output_path`` are created.

    Args:
        model_path: Path to the model JSON file.
        output_path: Path of the JS file to write.

    Raises:
        KeyError: If the model JSON lacks ``features``, ``threshold`` or
            ``trees_raw``.
    """
    with open(model_path, 'r', encoding='utf-8') as f:
        model = json.load(f)

    features = model['features']
    threshold = model['threshold']
    trees_raw = model['trees_raw']

    # Build feature name -> index mapping
    feature_map = {name: idx for idx, name in enumerate(features)}

    # Convert each tree into its own flat node list
    js_trees = []
    for tree_json in trees_raw:
        nodes = []
        convert_tree(tree_json, nodes, feature_map)
        js_trees.append(nodes)

    js_model = {
        'version': 1,
        'features': features,
        'threshold': threshold,
        'trees': js_trees,
    }

    # Header comment followed by the model as compact (no-whitespace) JSON
    js_content = "".join([
        "'use strict';\n\n",
        "/**\n",
        " * XGBoost model trees — auto-generated by tools/export-model-js.py\n",
        f" * {len(js_trees)} trees, {len(features)} features, threshold={threshold}\n",
        " * DO NOT EDIT MANUALLY\n",
        " */\n\n",
        f"module.exports = {json.dumps(js_model, separators=(',', ':'))};\n",
    ])

    # The default output lives in a nested directory; make sure it exists
    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(js_content)

    size_kb = out_file.stat().st_size / 1024
    print(f"Exported {len(js_trees)} trees, {len(features)} features to {output_path} ({size_kb:.1f} KB)")
    print(f"Threshold: {threshold}")

    # Summary only: total flattened node count across all trees
    total_nodes = sum(len(t) for t in js_trees)
    print(f"Total nodes: {total_nodes}")
141
-
142
-
143
def main():
    """
    Command-line entry point for the model exporter.

    Parses the positional model path and the optional ``--output`` path,
    exits with status 1 when the model file does not exist, and otherwise
    hands both paths to ``export_to_js``.
    """
    cli = argparse.ArgumentParser(
        description='Export XGBoost model to JavaScript module')
    cli.add_argument('model', help='Path to model.json from train-classifier.py')
    cli.add_argument('--output', default='src/ml/model-trees.js',
                     help='Output JS file path')
    opts = cli.parse_args()

    model_file = Path(opts.model)
    if not model_file.exists():
        print(f"ERROR: Model file not found: {opts.model}", file=sys.stderr)
        sys.exit(1)

    export_to_js(opts.model, opts.output)
    print("\nDone! The model is ready for use in src/ml/classifier.js")
157
-
158
-
159
- if __name__ == '__main__':
160
- main()
@@ -1,5 +0,0 @@
1
- xgboost>=2.0
2
- scikit-learn>=1.4
3
- shap>=0.44
4
- pandas>=2.2
5
- numpy>=1.26
@@ -1,333 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- MUAD'DIB ML Classifier Training Pipeline
4
-
5
- Trains a binary XGBoost classifier to distinguish true positives from false
6
- positives in the T1 zone (score 20-34). Designed to be run offline — no
7
- Python dependency in production.
8
-
9
- Usage:
10
- python tools/train-classifier.py [--data data/ml-training.jsonl] [--output model.json]
11
-
12
- Label strategy:
13
- - Positives: Datadog ground-truth malware corpus (scanned with muaddib)
14
- - Negatives: monitor label='clean' packages (0 findings = truly benign)
15
- - EXCLUDED: 'suspect' (unverified), 'fp' (auto-labeled, biased)
16
-
17
- Output:
18
- - model.json: XGBoost tree dump + feature list + threshold
19
- - Use tools/export-model-js.py to convert to src/ml/model-trees.js
20
- """
21
-
22
- import argparse
23
- import json
24
- import sys
25
- from pathlib import Path
26
-
27
- import numpy as np
28
- import pandas as pd
29
- import shap
30
- from sklearn.model_selection import StratifiedKFold
31
- from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
32
- import xgboost as xgb
33
-
34
-
35
- # --- Constants ---
36
-
37
# Identity/metadata columns: present in the records but never used as features
IDENTITY_COLS = {
    'name',
    'version',
    'ecosystem',
    'timestamp',
    'label',
    'tier',
}

# Smallest number of records main() accepts before it refuses to train
MIN_SAMPLES = 100

# XGBoost hyperparameters (tuned for supply-chain threat detection)
XGB_PARAMS = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    seed=42,
    verbosity=0,
)

# Boosting rounds and number of cross-validation folds
N_ESTIMATORS = 200
N_FOLDS = 5
61
-
62
-
63
def load_jsonl(filepath: str) -> pd.DataFrame:
    """
    Read a JSON Lines file into a DataFrame.

    Blank lines are ignored. A line that is not valid JSON is skipped with a
    warning on stderr that names its 1-based line number.
    """
    rows = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for lineno, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    rows.append(json.loads(text))
                except json.JSONDecodeError:
                    print(f" [WARN] Skipping malformed line {lineno}", file=sys.stderr)
    return pd.DataFrame(rows)
76
-
77
-
78
def prepare_data(df: pd.DataFrame) -> tuple:
    """
    Turn monitor records into a feature matrix and a binary target.

    Rows labelled 'confirmed' become positives and rows labelled 'clean'
    become negatives; every other label is dropped. Negatives are narrowed
    to scores in [20, 35) unless fewer than 50 such rows exist, in which
    case all clean rows are used. Feature columns are every column that is
    neither in IDENTITY_COLS nor prefixed with an underscore, sorted by
    name, with missing values filled with 0 and cast to float.

    Returns: (X, y, feature_names, stats_dict)
    """
    print(f"\n[1/5] Loading data: {len(df)} total records")

    # Report how many records carry each label
    print(" Label distribution:")
    for name, n in df['label'].value_counts().items():
        print(f" {name}: {n}")

    # Keep only the usable labels
    positives = df[df['label'] == 'confirmed'].copy()
    negatives = df[df['label'] == 'clean'].copy()

    # Negatives inside the T1 score band focus the model on the boundary
    in_t1_band = negatives['score'].ge(20) & negatives['score'].lt(35)
    negatives_t1 = negatives[in_t1_band]

    print("\n Training set:")
    print(f" Positives (confirmed): {len(positives)}")
    print(f" Negatives (clean): {len(negatives)} total, {len(negatives_t1)} in T1 zone")

    # Too few T1 negatives: fall back to every clean sample
    if len(negatives_t1) >= 50:
        neg_sample = negatives_t1
    else:
        print(f" [INFO] Not enough T1 negatives ({len(negatives_t1)}), using all clean samples")
        neg_sample = negatives

    combined = pd.concat([positives, neg_sample], ignore_index=True)
    combined['_target'] = combined['label'].eq('confirmed').astype(int)

    # '_target' is excluded by the underscore-prefix test
    feature_cols = sorted(
        col for col in combined.columns
        if not (col in IDENTITY_COLS or col.startswith('_'))
    )

    X = combined[feature_cols].fillna(0).astype(float)
    y = combined['_target']

    n_pos = len(positives)
    n_neg_used = len(neg_sample)
    stats = {
        'total_records': len(df),
        'positives': n_pos,
        'negatives_total': len(negatives),
        'negatives_t1': len(negatives_t1),
        'negatives_used': n_neg_used,
        'features': len(feature_cols),
        'class_balance': f"{n_pos}:{n_neg_used}",
    }

    return X, y, feature_cols, stats
138
-
139
-
140
def select_features_shap(model, X: pd.DataFrame, feature_names: list,
                         top_k: int = 40) -> list:
    """
    Rank features by mean absolute SHAP value and keep the best ``top_k``.

    Prints the 20 highest-ranked features and returns the names of the
    ``top_k`` highest-ranked ones, most important first.
    """
    print(f"\n[3/5] SHAP feature selection (top {top_k})...")
    shap_values = shap.TreeExplainer(model).shap_values(X)

    # Average |SHAP| over rows gives one importance score per feature
    per_feature = np.abs(shap_values).mean(axis=0)
    ranked = sorted(zip(feature_names, per_feature),
                    key=lambda pair: pair[1], reverse=True)

    print("\n Top 20 features by SHAP importance:")
    for rank, (name, val) in enumerate(ranked[:20], start=1):
        print(f" {rank:2d}. {name:40s} {val:.4f}")

    return [name for name, _ in ranked[:top_k]]
160
-
161
-
162
def cross_validate(X: pd.DataFrame, y: pd.Series, feature_names: list,
                   scale_pos_weight: float, min_recall: float = 0.939) -> dict:
    """
    Stratified N_FOLDS-fold CV with precision@recall>=min_recall optimization.

    Each fold trains a booster on the training split and scores the held-out
    split. The out-of-fold probabilities are then pooled to pick the decision
    threshold with the highest precision among thresholds in [0.1, 0.9) whose
    recall is at least ``min_recall``.

    Args:
        X: Feature matrix.
        y: Binary target aligned with ``X``.
        feature_names: Column names passed to the DMatrix.
        scale_pos_weight: Class-imbalance weight merged into XGB_PARAMS.
        min_recall: Recall floor a threshold must satisfy (default 0.939).

    Returns:
        Dict with ``threshold``, ``precision``, ``recall``, ``fold_metrics``
        and ``confusion_matrix``. If no threshold satisfies ``min_recall``,
        the threshold stays at 0.5 and a warning is written to stderr.
    """
    print(f"\n[4/5] 5-fold stratified cross-validation...")

    params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    fold_metrics = []
    all_probs = np.zeros(len(y))
    all_labels = np.zeros(len(y))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
        dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)

        # NOTE(review): the validation fold drives early stopping and is also
        # the fold being scored, so these metrics may be optimistic.
        model = xgb.train(
            params, dtrain, num_boost_round=N_ESTIMATORS,
            evals=[(dval, 'val')], verbose_eval=False,
            early_stopping_rounds=20
        )

        probs = model.predict(dval)
        all_probs[val_idx] = probs
        all_labels[val_idx] = y_val.values

        # Per-fold metrics use the default threshold 0.5
        preds = (probs >= 0.5).astype(int)
        p = precision_score(y_val, preds, zero_division=0)
        r = recall_score(y_val, preds, zero_division=0)
        f1 = f1_score(y_val, preds, zero_division=0)
        fold_metrics.append({'precision': p, 'recall': r, 'f1': f1})
        print(f" Fold {fold + 1}: P={p:.3f} R={r:.3f} F1={f1:.3f}")

    # Find optimal threshold: maximize precision while keeping recall >= min_recall
    thresholds = np.arange(0.1, 0.9, 0.01)
    best_threshold = 0.5
    best_precision = 0
    recall_target_met = False

    for t in thresholds:
        preds = (all_probs >= t).astype(int)
        r = recall_score(all_labels, preds, zero_division=0)
        if r < min_recall:
            continue
        recall_target_met = True
        p = precision_score(all_labels, preds, zero_division=0)
        if p > best_precision:
            best_precision = p
            best_threshold = t

    if not recall_target_met:
        # Without this, the 0.5 fallback would be reported as "optimal" even
        # though it does not honour the recall floor.
        print(f" [WARN] No threshold reaches recall >= {min_recall:.1%}; "
              f"falling back to {best_threshold:.2f}", file=sys.stderr)

    final_preds = (all_probs >= best_threshold).astype(int)
    final_p = precision_score(all_labels, final_preds, zero_division=0)
    final_r = recall_score(all_labels, final_preds, zero_division=0)
    cm = confusion_matrix(all_labels, final_preds)

    print(f"\n Optimal threshold: {best_threshold:.2f}")
    print(f" Final metrics: P={final_p:.3f} R={final_r:.3f}")
    print(f" Confusion matrix:\n {cm}")

    return {
        'threshold': round(float(best_threshold), 3),
        'precision': round(float(final_p), 4),
        'recall': round(float(final_r), 4),
        'fold_metrics': fold_metrics,
        'confusion_matrix': cm.tolist()
    }
230
-
231
-
232
def train_final_model(X: pd.DataFrame, y: pd.Series, feature_names: list,
                      scale_pos_weight: float) -> xgb.Booster:
    """
    Fit the final booster on the full dataset.

    Uses XGB_PARAMS plus the given ``scale_pos_weight`` and runs for
    N_ESTIMATORS boosting rounds with no validation set.
    """
    print(f"\n[5/5] Training final model on all data...")
    booster_params = dict(XGB_PARAMS, scale_pos_weight=scale_pos_weight)
    full_matrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
    return xgb.train(booster_params, full_matrix, num_boost_round=N_ESTIMATORS)
240
-
241
-
242
def export_model_json(model: xgb.Booster, feature_names: list,
                      threshold: float, output_path: str, cv_metrics: dict):
    """
    Write the trained model to ``output_path`` as a JSON tree dump.

    The file holds the feature list, the decision threshold, the tree count,
    the precision/recall/threshold entries of ``cv_metrics`` and every tree
    of the booster's JSON dump, parsed into objects under ``trees_raw``.
    """
    trees_dump = model.get_dump(dump_format='json')

    cv_summary = {
        key: cv_metrics[key] for key in ('precision', 'recall', 'threshold')
    }
    parsed_trees = [json.loads(tree_text) for tree_text in trees_dump]

    model_data = {
        'version': 1,
        'algorithm': 'xgboost',
        'features': feature_names,
        'threshold': threshold,
        'n_trees': len(trees_dump),
        'cv_metrics': cv_summary,
        'trees_raw': parsed_trees,
    }

    with open(output_path, 'w', encoding='utf-8') as out:
        json.dump(model_data, out, indent=2)

    bytes_written = Path(output_path).stat().st_size
    size_mb = bytes_written / (1024 * 1024)
    print(f"\n Model exported to {output_path} ({size_mb:.1f} MB)")
    print(f" {len(trees_dump)} trees, {len(feature_names)} features, threshold={threshold:.3f}")
267
-
268
-
269
def main():
    """
    Command-line entry point for the training pipeline.

    Loads the JSONL training data, prepares features, selects the top SHAP
    features with a preliminary model, cross-validates on the selected
    features, trains the final model and exports it as JSON.

    Exits with status 1 when the data file is missing, when it holds fewer
    than MIN_SAMPLES records, or when either class has fewer than N_FOLDS
    samples after label filtering.
    """
    parser = argparse.ArgumentParser(description='Train MUAD\'DIB ML classifier')
    parser.add_argument('--data', default='data/ml-training.jsonl',
                        help='Path to JSONL training data')
    parser.add_argument('--output', default='model.json',
                        help='Path for model JSON output')
    parser.add_argument('--top-features', type=int, default=40,
                        help='Number of top SHAP features to select')
    args = parser.parse_args()

    # Load data
    if not Path(args.data).exists():
        print(f"ERROR: Training data not found: {args.data}", file=sys.stderr)
        sys.exit(1)

    df = load_jsonl(args.data)
    if len(df) < MIN_SAMPLES:
        print(f"ERROR: Need at least {MIN_SAMPLES} samples, got {len(df)}", file=sys.stderr)
        sys.exit(1)

    # Prepare data
    X, y, feature_names, stats = prepare_data(df)
    print(f"\n[2/5] Training with {stats['features']} features, "
          f"balance {stats['class_balance']}")

    # The MIN_SAMPLES check counts every record, including labels that
    # prepare_data drops, so verify the usable classes separately: each
    # class needs at least N_FOLDS rows for every stratified fold to
    # contain both classes.
    n_pos = int(y.sum())
    n_neg = len(y) - n_pos
    if min(n_pos, n_neg) < N_FOLDS:
        print(f"ERROR: Need at least {N_FOLDS} samples per class, "
              f"got {n_pos} confirmed and {n_neg} clean", file=sys.stderr)
        sys.exit(1)

    # Class imbalance weight
    scale_pos_weight = n_neg / max(n_pos, 1)
    print(f" scale_pos_weight: {scale_pos_weight:.2f}")

    # Phase 1: Train preliminary model for SHAP feature selection
    # NOTE(review): selection sees every row, including rows later used as
    # validation folds, so the CV metrics below may be optimistic.
    prelim_params = {**XGB_PARAMS, 'scale_pos_weight': scale_pos_weight}
    dtrain = xgb.DMatrix(X, label=y, feature_names=feature_names)
    prelim_model = xgb.train(prelim_params, dtrain, num_boost_round=100)

    # SHAP feature selection
    selected_features = select_features_shap(
        prelim_model, X, feature_names, top_k=args.top_features)

    # Retrain with selected features
    X_selected = X[selected_features]

    # Cross-validate
    cv_metrics = cross_validate(X_selected, y, selected_features, scale_pos_weight)

    # Train final model
    final_model = train_final_model(X_selected, y, selected_features, scale_pos_weight)

    # Export
    export_model_json(final_model, selected_features, cv_metrics['threshold'],
                      args.output, cv_metrics)

    print(f"\n{'=' * 60}")
    print(f"Training complete!")
    print(f" Samples: {stats['positives']} malicious + {stats['negatives_used']} clean")
    print(f" Features: {len(selected_features)} (from {stats['features']} total)")
    print(f" Precision: {cv_metrics['precision']:.1%}")
    print(f" Recall: {cv_metrics['recall']:.1%}")
    print(f" Threshold: {cv_metrics['threshold']:.3f}")
    print(f"\nNext: python tools/export-model-js.py {args.output}")
330
-
331
-
332
- if __name__ == '__main__':
333
- main()