muaddib-scanner 2.10.68 → 2.10.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/ml/classifier.js +18 -19
- package/src/ml/train-xgboost.py +66 -40
- package/src/monitor/queue.js +18 -0
package/package.json
CHANGED
package/src/ml/classifier.js
CHANGED
|
@@ -126,16 +126,20 @@ function resetShadowModel() {
|
|
|
126
126
|
}
|
|
127
127
|
|
|
128
128
|
/**
|
|
129
|
-
* Run shadow model prediction and log
|
|
130
|
-
*
|
|
129
|
+
* Run shadow model prediction and log result.
|
|
130
|
+
* NEVER affects the actual classification decision — log-only.
|
|
131
|
+
*
|
|
132
|
+
* Runs independently of the main model's guard rails so that shadow
|
|
133
|
+
* predictions are logged for ALL packages with score >= 20, not just
|
|
134
|
+
* T1 zone. This provides validation data across the full score range
|
|
135
|
+
* before the shadow model is promoted to production.
|
|
131
136
|
*
|
|
132
137
|
* @param {Object} result - scan result
|
|
133
138
|
* @param {Object} meta - enriched metadata
|
|
134
|
-
* @param {string} mainPrediction - the main model's prediction
|
|
135
|
-
* @param {number} mainProbability - the main model's probability
|
|
136
139
|
* @param {string} packageName - for logging
|
|
140
|
+
* @param {number} score - risk score (for log context)
|
|
137
141
|
*/
|
|
138
|
-
function
|
|
142
|
+
function runShadowPrediction(result, meta, packageName, score) {
|
|
139
143
|
const shadow = loadShadowModel();
|
|
140
144
|
if (!shadow) return;
|
|
141
145
|
|
|
@@ -151,20 +155,21 @@ function runShadowComparison(result, meta, mainPrediction, mainProbability, pack
|
|
|
151
155
|
}
|
|
152
156
|
|
|
153
157
|
const shadowProb = sigmoid(margin);
|
|
158
|
+
const roundedP = Math.round(shadowProb * 1000) / 1000;
|
|
154
159
|
const shadowPred = shadowProb >= shadow.threshold ? 'malicious' : 'clean';
|
|
155
160
|
|
|
156
161
|
_shadowStats.total++;
|
|
157
|
-
if (shadowPred ===
|
|
158
|
-
_shadowStats.agree++;
|
|
159
|
-
} else {
|
|
162
|
+
if (shadowPred === 'malicious') {
|
|
160
163
|
_shadowStats.disagree++;
|
|
161
|
-
console.log(`[ML-SHADOW]
|
|
164
|
+
console.log(`[ML-SHADOW] ${packageName} → ${shadowPred} (p=${roundedP}, score=${score}) [${_shadowStats.disagree}/${_shadowStats.total} flagged]`);
|
|
165
|
+
} else {
|
|
166
|
+
_shadowStats.agree++;
|
|
162
167
|
}
|
|
163
168
|
|
|
164
169
|
// Periodic summary every 100 classifications
|
|
165
170
|
if (_shadowStats.total % 100 === 0) {
|
|
166
|
-
const
|
|
167
|
-
console.log(`[ML-SHADOW] Stats: ${_shadowStats.total} total, ${
|
|
171
|
+
const flagRate = ((_shadowStats.disagree / _shadowStats.total) * 100).toFixed(1);
|
|
172
|
+
console.log(`[ML-SHADOW] Stats: ${_shadowStats.total} total, ${_shadowStats.disagree} flagged (${flagRate}%), ${_shadowStats.agree} clean`);
|
|
168
173
|
}
|
|
169
174
|
}
|
|
170
175
|
|
|
@@ -371,13 +376,6 @@ function classifyPackage(result, meta) {
|
|
|
371
376
|
|
|
372
377
|
const roundedProb = Math.round(probability * 1000) / 1000;
|
|
373
378
|
|
|
374
|
-
// Shadow model comparison (log-only, never affects decision)
|
|
375
|
-
if (isShadowModelAvailable()) {
|
|
376
|
-
const pkgName = (result && result.summary && result.summary.packageName) ||
|
|
377
|
-
(meta && meta.name) || 'unknown';
|
|
378
|
-
runShadowComparison(result, meta, prediction, roundedProb, pkgName);
|
|
379
|
-
}
|
|
380
|
-
|
|
381
379
|
return {
|
|
382
380
|
prediction,
|
|
383
381
|
probability: roundedProb,
|
|
@@ -401,9 +399,10 @@ module.exports = {
|
|
|
401
399
|
loadBundlerModel,
|
|
402
400
|
predictBundler,
|
|
403
401
|
buildBundlerFeatureVector,
|
|
404
|
-
// Shadow model (ML1 v2, log-only
|
|
402
|
+
// Shadow model (ML1 v2, log-only prediction)
|
|
405
403
|
isShadowModelAvailable,
|
|
406
404
|
resetShadowModel,
|
|
407
405
|
loadShadowModel,
|
|
406
|
+
runShadowPrediction,
|
|
408
407
|
getShadowStats
|
|
409
408
|
};
|
package/src/ml/train-xgboost.py
CHANGED
|
@@ -359,23 +359,36 @@ def filter_leaky_features(X: pd.DataFrame, y: np.ndarray,
|
|
|
359
359
|
return X_filtered, retained
|
|
360
360
|
|
|
361
361
|
|
|
362
|
-
def
|
|
363
|
-
|
|
364
|
-
max_accuracy: float = 0.65) -> bool:
|
|
362
|
+
def source_discrimination_diagnostic(X: pd.DataFrame, y: np.ndarray,
|
|
363
|
+
active_features: list):
|
|
365
364
|
"""
|
|
366
|
-
Step 2c:
|
|
367
|
-
|
|
365
|
+
Step 2c: Source discrimination diagnostic (LOG-ONLY, non-blocking).
|
|
366
|
+
|
|
367
|
+
DESIGN NOTE: This test cannot function as a hard gate when source labels
|
|
368
|
+
are perfectly confounded with class labels (all negatives = monitor,
|
|
369
|
+
all positives = Datadog). In that case, legitimate behavioral features
|
|
370
|
+
(score, count_critical, type_*) will dominate the discriminator because
|
|
371
|
+
malware genuinely behaves differently from clean packages — this is
|
|
372
|
+
signal, not leak.
|
|
368
373
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
374
|
+
A true source discrimination test would require either:
|
|
375
|
+
(a) positives re-scanned through our own pipeline, or
|
|
376
|
+
(b) negatives and positives from the SAME source.
|
|
372
377
|
|
|
373
|
-
|
|
374
|
-
|
|
378
|
+
This diagnostic still serves a purpose: it flags NON-BEHAVIORAL features
|
|
379
|
+
that shouldn't appear in the top discriminators. If metadata features
|
|
380
|
+
(unpacked_size_bytes, file_count_total, etc.) appear despite being
|
|
381
|
+
excluded in Step 2a, something is wrong.
|
|
382
|
+
|
|
383
|
+
The real validation happens in shadow deployment on live production data.
|
|
375
384
|
"""
|
|
376
385
|
print("\n" + "=" * 60)
|
|
377
|
-
print(
|
|
386
|
+
print("[Step 2c/8] Source discrimination diagnostic (log-only)...")
|
|
378
387
|
print("=" * 60)
|
|
388
|
+
print(" NOTE: source=Datadog correlates 100% with label=malicious.")
|
|
389
|
+
print(" This diagnostic checks for non-behavioral features in the")
|
|
390
|
+
print(" top discriminators, NOT for overall accuracy (which will")
|
|
391
|
+
print(" always be high due to the source/label confound).")
|
|
379
392
|
|
|
380
393
|
X_active = X[active_features]
|
|
381
394
|
|
|
@@ -385,7 +398,6 @@ def source_discrimination_gate(X: pd.DataFrame, y: np.ndarray,
|
|
|
385
398
|
)
|
|
386
399
|
|
|
387
400
|
# Shallow model — depth=3, 50 rounds, no class weighting
|
|
388
|
-
# (we want to detect ANY discriminability, not optimize for one class)
|
|
389
401
|
params = {
|
|
390
402
|
'objective': 'binary:logistic',
|
|
391
403
|
'eval_metric': 'logloss',
|
|
@@ -407,35 +419,52 @@ def source_discrimination_gate(X: pd.DataFrame, y: np.ndarray,
|
|
|
407
419
|
p = precision_score(y_te, preds, zero_division=0)
|
|
408
420
|
r = recall_score(y_te, preds, zero_division=0)
|
|
409
421
|
|
|
410
|
-
print(f" Discrimination accuracy: {accuracy:.3f} (P={p:.3f} R={r:.3f})")
|
|
422
|
+
print(f"\n Discrimination accuracy: {accuracy:.3f} (P={p:.3f} R={r:.3f})")
|
|
423
|
+
print(f" (Expected to be high due to source/label confound)")
|
|
411
424
|
|
|
412
|
-
# SHAP analysis
|
|
425
|
+
# SHAP analysis — the diagnostic value is in WHICH features dominate
|
|
413
426
|
explainer = shap.TreeExplainer(model)
|
|
414
427
|
shap_values = explainer.shap_values(X_te)
|
|
415
428
|
mean_abs_shap = np.abs(shap_values).mean(axis=0)
|
|
416
429
|
importance = sorted(zip(active_features, mean_abs_shap),
|
|
417
430
|
key=lambda x: x[1], reverse=True)
|
|
418
431
|
|
|
419
|
-
|
|
432
|
+
# Known behavioral features that SHOULD dominate (malware scores higher)
|
|
433
|
+
EXPECTED_BEHAVIORAL = {
|
|
434
|
+
'score', 'global_risk_score', 'max_file_score', 'package_score',
|
|
435
|
+
'count_total', 'count_critical', 'count_high', 'count_medium',
|
|
436
|
+
'count_low', 'distinct_threat_types', 'severity_ratio_high',
|
|
437
|
+
'max_single_points', 'points_concentration', 'file_count_with_threats',
|
|
438
|
+
'file_score_mean', 'file_score_max', 'threat_density',
|
|
439
|
+
}
|
|
440
|
+
# Features that should NOT appear (already excluded, but sanity check)
|
|
441
|
+
EXCLUDED_CHECK = {
|
|
442
|
+
'unpacked_size_bytes', 'file_count_total', 'has_tests',
|
|
443
|
+
'dep_count', 'dev_dep_count', 'reputation_factor',
|
|
444
|
+
'package_age_days', 'weekly_downloads', 'version_count',
|
|
445
|
+
'author_package_count', 'has_repository', 'readme_size',
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
print(f"\n Top 10 features driving discrimination:")
|
|
449
|
+
has_leak = False
|
|
420
450
|
for i, (name, val) in enumerate(importance[:10]):
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
flag = "
|
|
451
|
+
if name in EXCLUDED_CHECK:
|
|
452
|
+
flag = " *** LEAK — should have been excluded in Step 2a!"
|
|
453
|
+
has_leak = True
|
|
454
|
+
elif name in EXPECTED_BEHAVIORAL:
|
|
455
|
+
flag = " (expected — behavioral)"
|
|
456
|
+
elif name.startswith('type_') or name.startswith('has_'):
|
|
457
|
+
flag = " (behavioral signal)"
|
|
458
|
+
else:
|
|
459
|
+
flag = ""
|
|
426
460
|
print(f" {i + 1:2d}. {name:40s} {val:.6f}{flag}")
|
|
427
461
|
|
|
428
|
-
if
|
|
429
|
-
print(f"\n [
|
|
430
|
-
print(f"
|
|
431
|
-
return True
|
|
462
|
+
if has_leak:
|
|
463
|
+
print(f"\n [WARNING] Non-behavioral features found in top discriminators!")
|
|
464
|
+
print(f" Check EXCLUDED_METADATA — some metadata features leaked through.")
|
|
432
465
|
else:
|
|
433
|
-
print(f"\n [
|
|
434
|
-
print(f"
|
|
435
|
-
print(f" Offending features (exclude and re-run):")
|
|
436
|
-
for name, val in importance[:5]:
|
|
437
|
-
print(f" - {name} (SHAP={val:.6f})")
|
|
438
|
-
return False
|
|
466
|
+
print(f"\n [OK] Top discriminators are all behavioral features.")
|
|
467
|
+
print(f" No metadata/source-proxy leak detected.")
|
|
439
468
|
|
|
440
469
|
|
|
441
470
|
def split_data(X: pd.DataFrame, y: np.ndarray) -> tuple:
|
|
@@ -836,18 +865,15 @@ def main():
|
|
|
836
865
|
else:
|
|
837
866
|
active_features = list(remaining_features)
|
|
838
867
|
|
|
839
|
-
# Step 2c: Source discrimination
|
|
868
|
+
# Step 2c: Source discrimination diagnostic (log-only).
|
|
869
|
+
# NOT a hard gate — source label is 100% confounded with class label
|
|
870
|
+
# (all positives = Datadog, all negatives = monitor), so behavioral
|
|
871
|
+
# features will always dominate the discriminator. The diagnostic
|
|
872
|
+
# checks that no METADATA features leaked through Step 2a.
|
|
840
873
|
if not args.skip_gate:
|
|
841
|
-
|
|
842
|
-
if not gate_pass:
|
|
843
|
-
print("\n" + "=" * 60)
|
|
844
|
-
print("ABORTED: Source discrimination gate failed.")
|
|
845
|
-
print("The retained features still encode source identity.")
|
|
846
|
-
print("Add offending features to EXCLUDED_METADATA and re-run.")
|
|
847
|
-
print("=" * 60)
|
|
848
|
-
sys.exit(1)
|
|
874
|
+
source_discrimination_diagnostic(X, y, active_features)
|
|
849
875
|
else:
|
|
850
|
-
print("\n [Step 2c] Source discrimination
|
|
876
|
+
print("\n [Step 2c] Source discrimination diagnostic SKIPPED (--skip-gate)")
|
|
851
877
|
|
|
852
878
|
# Class imbalance weight
|
|
853
879
|
n_neg = stats['n_neg']
|
package/src/monitor/queue.js
CHANGED
|
@@ -647,6 +647,24 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
647
647
|
}
|
|
648
648
|
}
|
|
649
649
|
|
|
650
|
+
// Shadow model: log-only prediction for ALL score >= 20 npm packages.
|
|
651
|
+
// Runs independently of classifyPackage — no effect on mlResult, webhooks,
|
|
652
|
+
// or any decisions. Collects shadow validation data for the retrained model.
|
|
653
|
+
if (riskScore >= 20 && ecosystem === 'npm') {
|
|
654
|
+
try {
|
|
655
|
+
const { isShadowModelAvailable, runShadowPrediction } = require('../ml/classifier.js');
|
|
656
|
+
if (isShadowModelAvailable()) {
|
|
657
|
+
const shadowMeta = { npmRegistryMeta, fileCountTotal, hasTests, unpackedSize: meta.unpackedSize, registryMeta: meta };
|
|
658
|
+
runShadowPrediction(result, shadowMeta, `${name}@${version}`, riskScore);
|
|
659
|
+
}
|
|
660
|
+
} catch (err) {
|
|
661
|
+
// Non-fatal: shadow failure must never block the pipeline
|
|
662
|
+
if (err.code !== 'MODULE_NOT_FOUND') {
|
|
663
|
+
console.error(`[ML-SHADOW] Error for ${name}@${version}: ${err.message}`);
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
}
|
|
667
|
+
|
|
650
668
|
stats.suspect++;
|
|
651
669
|
|
|
652
670
|
// Fire-and-forget tarball archiving — never blocks the pipeline
|