muaddib-scanner 2.11.76 → 2.11.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-commit +18 -0
- package/README.md +15 -6
- package/package.json +1 -2
- package/{self-scan-v2.11.76.json → self-scan-v2.11.77.json} +1 -1
- package/src/commands/safe-install.js +8 -3
- package/src/monitor/daemon.js +34 -22
- package/src/monitor/ingestion.js +32 -2
- package/src/monitor/queue.js +84 -21
- package/src/monitor/scan-queue.js +68 -1
- package/src/monitor/state.js +24 -1
- package/src/monitor/webhook.js +32 -11
- package/src/scanner/temporal-analysis.js +8 -0
- package/src/scanner/temporal-ast-diff.js +5 -0
- package/.dockerignore +0 -7
- package/.env.example +0 -43
- package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
- package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
- package/ml-retrain/auto-labeler/labeler.py +0 -256
- package/ml-retrain/auto-labeler/npm_checker.py +0 -228
- package/ml-retrain/auto-labeler/ossf_index.py +0 -178
- package/ml-retrain/auto-labeler/requirements.txt +0 -1
- package/ml-retrain/confusion-matrix.png +0 -0
- package/ml-retrain/model-trees-retrained.js +0 -12
- package/ml-retrain/retrain-report.json +0 -225
- package/ml-retrain/retrain.py +0 -974
- package/sbom.json +0 -0
- package/src/ml/train-bundler-detector.py +0 -725
- package/src/ml/train-xgboost.py +0 -957
- package/tools/export-model-js.py +0 -160
- package/tools/requirements-ml.txt +0 -5
- package/tools/train-classifier.py +0 -333
package/src/monitor/webhook.js
CHANGED
|
@@ -16,6 +16,7 @@ const {
|
|
|
16
16
|
DAILY_REPORTS_LOG_DIR,
|
|
17
17
|
getParisDateString,
|
|
18
18
|
getParisHour,
|
|
19
|
+
DAILY_REPORT_HOUR,
|
|
19
20
|
loadScanStats,
|
|
20
21
|
loadDetections,
|
|
21
22
|
saveLastDailyReportDate,
|
|
@@ -60,7 +61,8 @@ const HIGH_INTENT_TYPES = new Set([
|
|
|
60
61
|
'remote_code_load', 'obfuscation_detected'
|
|
61
62
|
]);
|
|
62
63
|
|
|
63
|
-
|
|
64
|
+
// DAILY_REPORT_HOUR (=8) is imported from state.js (single source of truth) and
|
|
65
|
+
// re-exported below for back-compat (monitor.js / tests import it via webhook).
|
|
64
66
|
|
|
65
67
|
// --- Webhook alerting ---
|
|
66
68
|
|
|
@@ -1152,6 +1154,14 @@ function buildDailyReportEmbed(stats, dailyAlerts, ledgerRollup) {
|
|
|
1152
1154
|
* @param {Map} downloadsCache - In-memory downloads cache (will be cleared)
|
|
1153
1155
|
*/
|
|
1154
1156
|
async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCache) {
|
|
1157
|
+
// Dead-zone guard (defense in depth): never send or stamp before the 08:00 Paris window.
|
|
1158
|
+
// The scheduled gate (isDailyReportDue) already excludes 00:00–07:59, but an ungated /
|
|
1159
|
+
// manual / test caller firing at e.g. 00:43 would otherwise write-ahead the NEW day's date
|
|
1160
|
+
// (below) and suppress that day's real report. This makes the early stamp impossible.
|
|
1161
|
+
if (getParisHour() < DAILY_REPORT_HOUR) {
|
|
1162
|
+
console.log(`[MONITOR] Daily report suppressed: before ${DAILY_REPORT_HOUR}:00 Paris (hour=${getParisHour()})`);
|
|
1163
|
+
return;
|
|
1164
|
+
}
|
|
1155
1165
|
// Crash-safe headline: a restart-storm around report time can zero the in-memory
|
|
1156
1166
|
// counter (the monitor OOM-restarts ~10×/day). Floor scanned/clean/suspect at the
|
|
1157
1167
|
// durable scan-stats delta so we never publish "5" when ~44k were really scanned.
|
|
@@ -1171,6 +1181,10 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
|
|
|
1171
1181
|
// Persist the monotonic scan-stats counter as the baseline for the NEXT report's
|
|
1172
1182
|
// delta. Written before the (now last) webhook so a mid-send kill can't double-count.
|
|
1173
1183
|
saveLastDailyReportDate(today, captureScanStatsBaseline());
|
|
1184
|
+
// Observability: the success path previously logged nothing, which made the late-fire bug
|
|
1185
|
+
// invisible in the journal. Log the stamped date + the actual Paris hour (an on-time 08:00
|
|
1186
|
+
// fire vs a catch-up at hour 14 are now distinguishable) + the headline count.
|
|
1187
|
+
console.log(`[MONITOR] Daily report firing for ${today} (hour=${getParisHour()} Paris, scanned=${stats.scanned})`);
|
|
1174
1188
|
|
|
1175
1189
|
// Phase 0b: compute the ledger rollup ONCE so the embed shows exactly the numbers
|
|
1176
1190
|
// we persist (no double-scan, no drift between Discord and the on-disk metrics).
|
|
@@ -1365,16 +1379,23 @@ async function sendReportNow(stats) {
|
|
|
1365
1379
|
return { sent: false, message: `Webhook failed: ${err.message}` };
|
|
1366
1380
|
}
|
|
1367
1381
|
|
|
1368
|
-
// Update lastDailyReportDate on disk
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1382
|
+
// Update lastDailyReportDate on disk — but ONLY at/after 08:00 Paris. A manual report run
|
|
1383
|
+
// before 08:00 is a deliberate operator override (we still SEND it), but it must NOT stamp
|
|
1384
|
+
// today's date: hasReportBeenSentToday() keys off the Paris calendar date, so an early
|
|
1385
|
+
// stamp would suppress that day's scheduled 08:00 report (the exact failure we're fixing).
|
|
1386
|
+
if (getParisHour() >= DAILY_REPORT_HOUR) {
|
|
1387
|
+
const today = getParisDateString();
|
|
1388
|
+
const stateRaw = loadStateRaw();
|
|
1389
|
+
const state = {
|
|
1390
|
+
npmLastPackage: stateRaw.npmLastPackage || '',
|
|
1391
|
+
pypiLastPackage: stateRaw.pypiLastPackage || ''
|
|
1392
|
+
};
|
|
1393
|
+
stats.lastDailyReportDate = today;
|
|
1394
|
+
saveState(state, stats);
|
|
1395
|
+
saveLastDailyReportDate(today);
|
|
1396
|
+
} else {
|
|
1397
|
+
console.log(`[MONITOR] Manual report sent; not stamping (before ${DAILY_REPORT_HOUR}:00 Paris — the scheduled report will still fire today)`);
|
|
1398
|
+
}
|
|
1378
1399
|
|
|
1379
1400
|
return { sent: true, message: 'Daily report sent' };
|
|
1380
1401
|
}
|
|
@@ -121,6 +121,14 @@ function _fetchPackageMetadataHttp(packageName) {
|
|
|
121
121
|
return;
|
|
122
122
|
}
|
|
123
123
|
|
|
124
|
+
if (res.statusCode === 429) {
|
|
125
|
+
res.resume();
|
|
126
|
+
// Coordinated backoff on the shared registry limiter — the temporal scanners must
|
|
127
|
+
// signal 429 like the metadata path, not hammer through a rate limit (CLAUDE.md storm).
|
|
128
|
+
try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
|
|
129
|
+
reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}`));
|
|
130
|
+
return;
|
|
131
|
+
}
|
|
124
132
|
if (res.statusCode < 200 || res.statusCode >= 300) {
|
|
125
133
|
res.resume();
|
|
126
134
|
reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}`));
|
|
@@ -71,6 +71,11 @@ function _fetchVersionMetadataHttp(packageName, version) {
|
|
|
71
71
|
res.resume();
|
|
72
72
|
return reject(new Error(`Version ${version} not found for package ${packageName}`));
|
|
73
73
|
}
|
|
74
|
+
if (res.statusCode === 429) {
|
|
75
|
+
res.resume();
|
|
76
|
+
try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
|
|
77
|
+
return reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}@${version}`));
|
|
78
|
+
}
|
|
74
79
|
if (res.statusCode < 200 || res.statusCode >= 300) {
|
|
75
80
|
res.resume();
|
|
76
81
|
return reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}@${version}`));
|
package/.dockerignore
DELETED
package/.env.example
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
# MUAD'DIB environment variables — template
|
|
2
|
-
# Copy to .env (local dev) or /opt/muaddib/.env (VPS) and fill in real values.
|
|
3
|
-
# .env files are gitignored. NEVER commit a real token.
|
|
4
|
-
|
|
5
|
-
# ----------------------------------------------------------------------------
|
|
6
|
-
# Threat-feed API tokens (all OPTIONAL — scrapers degrade gracefully if absent)
|
|
7
|
-
# ----------------------------------------------------------------------------
|
|
8
|
-
|
|
9
|
-
# OpenSourceMalware.com — community-verified threat intel
|
|
10
|
-
# Free tier: 60 req/min, /query-latest gives 100 most recent threats per ecosystem.
|
|
11
|
-
# Sign up + generate at: https://opensourcemalware.com/auth → profile → API Tokens
|
|
12
|
-
# Format: osm_<random-32+chars>
|
|
13
|
-
# Used by: src/ioc/scraper.js → scrapeOSMQueryLatest()
|
|
14
|
-
OSM_API_TOKEN=
|
|
15
|
-
|
|
16
|
-
# ----------------------------------------------------------------------------
|
|
17
|
-
# Webhook destinations (optional — monitor alerts)
|
|
18
|
-
# ----------------------------------------------------------------------------
|
|
19
|
-
|
|
20
|
-
# Discord webhook for monitor alerts (P1/P2/P3 triage)
|
|
21
|
-
# DISCORD_WEBHOOK_URL=
|
|
22
|
-
|
|
23
|
-
# ----------------------------------------------------------------------------
|
|
24
|
-
# FPR plan gates — DEFAULT ON since v2.11.9 (no need to set these unless opting OUT)
|
|
25
|
-
# ----------------------------------------------------------------------------
|
|
26
|
-
# Measured impact on the v2.11.4 evaluation corpus (1054 packages):
|
|
27
|
-
# FPR curated 15.6% -> 9.36% (-6.24 pp), FPR random 7.0% -> 2.0% (-5.00 pp).
|
|
28
|
-
# TPR@3 / TPR@20 / ADR strictly unchanged.
|
|
29
|
-
#
|
|
30
|
-
# Opt-OUT individual gates (uncomment + set to 0):
|
|
31
|
-
# MUADDIB_FN_REACHABILITY=0 # function-level reachability gating
|
|
32
|
-
# MUADDIB_DECAY=0 # group score decay on bundled outputs
|
|
33
|
-
# MUADDIB_MATURE_CAP=0 # cap mature, well-trafficked packages at MEDIUM
|
|
34
|
-
# MUADDIB_METADATA_FACTOR=0 # registry signals -> reputation multiplier
|
|
35
|
-
# MUADDIB_DELTA_MODE=0 # delta scoring against prior versions
|
|
36
|
-
#
|
|
37
|
-
# Skip ALL network fetches (npm registry packument + GitHub Releases IOC
|
|
38
|
-
# bootstrap) in one shot. Disables MATURE_CAP + METADATA_FACTOR + DELTA_MODE
|
|
39
|
-
# at the per-scan level AND the first-run IOC database download. Useful for:
|
|
40
|
-
# - air-gap / offline CI environments
|
|
41
|
-
# - test runners (set automatically by tests/run-tests.js)
|
|
42
|
-
# - perf-critical batch scans where you've pre-warmed the IOC cache
|
|
43
|
-
# MUADDIB_NO_REGISTRY_FETCH=1
|
|
@@ -1,312 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
MUAD'DIB Auto-Labeling Pipeline
|
|
4
|
-
|
|
5
|
-
Correlates muaddib suspects with external signals (OSSF, GHSA, npm status)
|
|
6
|
-
to produce ground truth labels for ML training.
|
|
7
|
-
|
|
8
|
-
Usage:
|
|
9
|
-
python auto_labeler.py --full # Run all steps
|
|
10
|
-
python auto_labeler.py --step ossf # Run individual step
|
|
11
|
-
python auto_labeler.py --step npm
|
|
12
|
-
python auto_labeler.py --step ghsa
|
|
13
|
-
python auto_labeler.py --step label
|
|
14
|
-
python auto_labeler.py --update # Cron mode: re-check pending/unconfirmed
|
|
15
|
-
|
|
16
|
-
Environment:
|
|
17
|
-
GITHUB_TOKEN Optional, for higher GHSA API rate limits
|
|
18
|
-
MUADDIB_DATA Override data directory (default: /opt/muaddib/data)
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
import argparse
|
|
22
|
-
import json
|
|
23
|
-
import logging
|
|
24
|
-
import os
|
|
25
|
-
import sys
|
|
26
|
-
from datetime import datetime, timezone
|
|
27
|
-
from pathlib import Path
|
|
28
|
-
|
|
29
|
-
import ossf_index
|
|
30
|
-
import ghsa_checker
|
|
31
|
-
import npm_checker
|
|
32
|
-
import labeler
|
|
33
|
-
|
|
34
|
-
# ── Paths ──
|
|
35
|
-
MUADDIB_DATA = Path(os.environ.get("MUADDIB_DATA", "/opt/muaddib/data"))
|
|
36
|
-
MUADDIB_ALERTS = Path(os.environ.get("MUADDIB_ALERTS", "/opt/muaddib/logs/alerts"))
|
|
37
|
-
BASE_DIR = Path(__file__).parent
|
|
38
|
-
CACHE_DIR = BASE_DIR / "data"
|
|
39
|
-
OSSF_REPO_DIR = CACHE_DIR / "ossf-malicious-packages"
|
|
40
|
-
OUTPUT_PATH = MUADDIB_DATA / "auto-labels.json"
|
|
41
|
-
|
|
42
|
-
log = logging.getLogger("auto-labeler")
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def setup_logging(verbose=False):
|
|
46
|
-
level = logging.DEBUG if verbose else logging.INFO
|
|
47
|
-
fmt = "%(asctime)s [%(name)s] %(levelname)s %(message)s"
|
|
48
|
-
logging.basicConfig(level=level, format=fmt, datefmt="%Y-%m-%d %H:%M:%S")
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def load_detections():
|
|
52
|
-
"""Load detections.json from muaddib data directory."""
|
|
53
|
-
path = MUADDIB_DATA / "detections.json"
|
|
54
|
-
if not path.is_file():
|
|
55
|
-
log.error("detections.json not found at %s", path)
|
|
56
|
-
sys.exit(1)
|
|
57
|
-
|
|
58
|
-
with open(path, "r", encoding="utf-8") as f:
|
|
59
|
-
data = json.load(f)
|
|
60
|
-
|
|
61
|
-
detections = data.get("detections", [])
|
|
62
|
-
npm_count = sum(1 for d in detections if d.get("ecosystem") == "npm")
|
|
63
|
-
log.info("Loaded %d detections (%d npm)", len(detections), npm_count)
|
|
64
|
-
return detections
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def load_alert_scores():
|
|
68
|
-
"""Load risk scores and tiers from individual alert files.
|
|
69
|
-
|
|
70
|
-
Scans logs/alerts/ for JSON files and extracts score + tier info.
|
|
71
|
-
Returns dict keyed by "name@version".
|
|
72
|
-
"""
|
|
73
|
-
scores = {}
|
|
74
|
-
|
|
75
|
-
# Try cached scores first
|
|
76
|
-
cache_path = CACHE_DIR / "alert-scores-cache.json"
|
|
77
|
-
if cache_path.is_file():
|
|
78
|
-
try:
|
|
79
|
-
with open(cache_path, "r", encoding="utf-8") as f:
|
|
80
|
-
cached = json.load(f)
|
|
81
|
-
if cached.get("count", 0) > 0:
|
|
82
|
-
log.info("Loaded %d cached alert scores", cached["count"])
|
|
83
|
-
return cached.get("scores", {})
|
|
84
|
-
except (json.JSONDecodeError, OSError):
|
|
85
|
-
pass
|
|
86
|
-
|
|
87
|
-
if not MUADDIB_ALERTS.is_dir():
|
|
88
|
-
log.warning("Alerts directory not found at %s — scores will be estimated from severity",
|
|
89
|
-
MUADDIB_ALERTS)
|
|
90
|
-
return scores
|
|
91
|
-
|
|
92
|
-
alert_files = list(MUADDIB_ALERTS.glob("*.json"))
|
|
93
|
-
log.info("Scanning %d alert files for scores...", len(alert_files))
|
|
94
|
-
|
|
95
|
-
for filepath in alert_files:
|
|
96
|
-
try:
|
|
97
|
-
with open(filepath, "r", encoding="utf-8") as f:
|
|
98
|
-
alert = json.load(f)
|
|
99
|
-
|
|
100
|
-
target = alert.get("target", "")
|
|
101
|
-
summary = alert.get("summary", {})
|
|
102
|
-
score = summary.get("riskScore", summary.get("globalRiskScore", 0))
|
|
103
|
-
|
|
104
|
-
# Parse target: "npm/package-name@version" or "pypi/package@version"
|
|
105
|
-
if "/" in target and "@" in target:
|
|
106
|
-
eco_pkg = target.split("/", 1)
|
|
107
|
-
if len(eco_pkg) == 2:
|
|
108
|
-
pkg_ver = eco_pkg[1]
|
|
109
|
-
# Determine tier from priority
|
|
110
|
-
priority = alert.get("priority", {})
|
|
111
|
-
tier = ""
|
|
112
|
-
p_level = priority.get("level", "")
|
|
113
|
-
if p_level == "P1":
|
|
114
|
-
tier = "T1a"
|
|
115
|
-
elif p_level == "P2":
|
|
116
|
-
tier = "T1b"
|
|
117
|
-
elif p_level == "P3":
|
|
118
|
-
tier = "T2"
|
|
119
|
-
|
|
120
|
-
scores[pkg_ver] = {"score": score, "tier": tier}
|
|
121
|
-
|
|
122
|
-
except (json.JSONDecodeError, OSError):
|
|
123
|
-
continue
|
|
124
|
-
|
|
125
|
-
# Cache for next run
|
|
126
|
-
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
|
127
|
-
with open(cache_path, "w", encoding="utf-8") as f:
|
|
128
|
-
json.dump({"count": len(scores), "built_at": datetime.now(timezone.utc).isoformat(),
|
|
129
|
-
"scores": scores}, f)
|
|
130
|
-
|
|
131
|
-
log.info("Extracted scores from %d alerts", len(scores))
|
|
132
|
-
return scores
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
# ── Steps ──
|
|
136
|
-
|
|
137
|
-
def step_ossf():
|
|
138
|
-
"""Step 1: Index OSSF malicious-packages."""
|
|
139
|
-
log.info("=== Step 1: OSSF Index ===")
|
|
140
|
-
ossf_index.clone_or_update(OSSF_REPO_DIR)
|
|
141
|
-
index = ossf_index.build_index(OSSF_REPO_DIR, CACHE_DIR)
|
|
142
|
-
return index
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
def step_ghsa():
|
|
146
|
-
"""Step 3: Index GitHub Advisory Database."""
|
|
147
|
-
log.info("=== Step 3: GHSA Index ===")
|
|
148
|
-
# Try cache first
|
|
149
|
-
index = ghsa_checker.load_cached_index(CACHE_DIR)
|
|
150
|
-
if index is not None:
|
|
151
|
-
return index
|
|
152
|
-
return ghsa_checker.build_index(CACHE_DIR)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def step_npm(detections):
|
|
156
|
-
"""Step 2: Check npm status for suspects."""
|
|
157
|
-
log.info("=== Step 2: npm Status Check ===")
|
|
158
|
-
return npm_checker.check_suspects(detections, CACHE_DIR)
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def step_label(detections, o_index, g_index, npm_status, alert_scores):
|
|
162
|
-
"""Step 4: Generate labels."""
|
|
163
|
-
log.info("=== Step 4: Generate Labels ===")
|
|
164
|
-
|
|
165
|
-
labels = labeler.label_suspects(detections, o_index, g_index, npm_status, alert_scores)
|
|
166
|
-
missed = labeler.find_missed(o_index, g_index, detections)
|
|
167
|
-
summary = labeler.export_labels(labels, missed, OUTPUT_PATH)
|
|
168
|
-
|
|
169
|
-
return summary
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
# ── Modes ──
|
|
173
|
-
|
|
174
|
-
def run_full():
|
|
175
|
-
"""Run all steps sequentially."""
|
|
176
|
-
log.info("Starting full auto-labeling pipeline")
|
|
177
|
-
start = datetime.now()
|
|
178
|
-
|
|
179
|
-
detections = load_detections()
|
|
180
|
-
alert_scores = load_alert_scores()
|
|
181
|
-
|
|
182
|
-
# Steps 1+3 don't depend on detections — could be parallel but keep it simple
|
|
183
|
-
o_index = step_ossf()
|
|
184
|
-
g_index = step_ghsa()
|
|
185
|
-
npm_status = step_npm(detections)
|
|
186
|
-
|
|
187
|
-
summary = step_label(detections, o_index, g_index, npm_status, alert_scores)
|
|
188
|
-
|
|
189
|
-
elapsed = (datetime.now() - start).total_seconds()
|
|
190
|
-
log.info("Pipeline complete in %.1fs — %s", elapsed, summary)
|
|
191
|
-
return summary
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
def run_update():
|
|
195
|
-
"""Cron mode: re-check pending/unconfirmed labels against fresh external data."""
|
|
196
|
-
log.info("Starting update (cron mode)")
|
|
197
|
-
|
|
198
|
-
# Refresh external indices
|
|
199
|
-
ossf_index.clone_or_update(OSSF_REPO_DIR)
|
|
200
|
-
o_index = ossf_index.build_index(OSSF_REPO_DIR, CACHE_DIR)
|
|
201
|
-
g_index = ghsa_checker.build_index(CACHE_DIR)
|
|
202
|
-
|
|
203
|
-
# Load existing labels
|
|
204
|
-
if not OUTPUT_PATH.is_file():
|
|
205
|
-
log.error("No existing auto-labels.json — run --full first")
|
|
206
|
-
sys.exit(1)
|
|
207
|
-
|
|
208
|
-
with open(OUTPUT_PATH, "r", encoding="utf-8") as f:
|
|
209
|
-
existing = json.load(f)
|
|
210
|
-
|
|
211
|
-
existing_labels = existing.get("labels", {})
|
|
212
|
-
detections = load_detections()
|
|
213
|
-
alert_scores = load_alert_scores()
|
|
214
|
-
|
|
215
|
-
# Find labels that need re-evaluation
|
|
216
|
-
to_recheck = []
|
|
217
|
-
for key, entry in existing_labels.items():
|
|
218
|
-
lbl = entry.get("auto_label")
|
|
219
|
-
if lbl in ("pending", "unconfirmed", "likely_malicious"):
|
|
220
|
-
# Re-extract detection info
|
|
221
|
-
for det in detections:
|
|
222
|
-
if f"{det['package']}@{det['version']}" == key:
|
|
223
|
-
to_recheck.append(det)
|
|
224
|
-
break
|
|
225
|
-
|
|
226
|
-
if not to_recheck:
|
|
227
|
-
log.info("No pending/unconfirmed labels to re-check")
|
|
228
|
-
return
|
|
229
|
-
|
|
230
|
-
log.info("Re-checking %d labels (pending/unconfirmed/likely_malicious)", len(to_recheck))
|
|
231
|
-
|
|
232
|
-
# Re-check npm status for these specific packages
|
|
233
|
-
npm_status = npm_checker.check_suspects(to_recheck, CACHE_DIR)
|
|
234
|
-
|
|
235
|
-
# Re-label
|
|
236
|
-
updated = labeler.label_suspects(to_recheck, o_index, g_index, npm_status, alert_scores)
|
|
237
|
-
|
|
238
|
-
# Merge updates into existing labels
|
|
239
|
-
changes = 0
|
|
240
|
-
for key, new_entry in updated.items():
|
|
241
|
-
old = existing_labels.get(key, {})
|
|
242
|
-
if old.get("auto_label") != new_entry.get("auto_label"):
|
|
243
|
-
log.info("RELABEL: %s — %s → %s",
|
|
244
|
-
key, old.get("auto_label"), new_entry.get("auto_label"))
|
|
245
|
-
changes += 1
|
|
246
|
-
existing_labels[key] = new_entry
|
|
247
|
-
|
|
248
|
-
# Also refresh missed detection
|
|
249
|
-
missed = labeler.find_missed(o_index, g_index, detections)
|
|
250
|
-
for name, info in missed.items():
|
|
251
|
-
mk = f"{name}@*"
|
|
252
|
-
if mk not in existing_labels:
|
|
253
|
-
existing_labels[mk] = info
|
|
254
|
-
changes += 1
|
|
255
|
-
|
|
256
|
-
# Re-export
|
|
257
|
-
labeler.export_labels(
|
|
258
|
-
{k: v for k, v in existing_labels.items() if v.get("auto_label") != "missed"},
|
|
259
|
-
{k.replace("@*", ""): v for k, v in existing_labels.items() if v.get("auto_label") == "missed"},
|
|
260
|
-
OUTPUT_PATH,
|
|
261
|
-
)
|
|
262
|
-
|
|
263
|
-
log.info("Update complete: %d labels changed", changes)
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
def main():
|
|
267
|
-
parser = argparse.ArgumentParser(description="MUAD'DIB Auto-Labeling Pipeline")
|
|
268
|
-
group = parser.add_mutually_exclusive_group(required=True)
|
|
269
|
-
group.add_argument("--full", action="store_true", help="Run all steps")
|
|
270
|
-
group.add_argument("--step", choices=["ossf", "ghsa", "npm", "label"],
|
|
271
|
-
help="Run individual step")
|
|
272
|
-
group.add_argument("--update", action="store_true",
|
|
273
|
-
help="Cron: re-check pending/unconfirmed")
|
|
274
|
-
parser.add_argument("-v", "--verbose", action="store_true", help="Debug logging")
|
|
275
|
-
parser.add_argument("--data-dir", help="Override MUADDIB_DATA path")
|
|
276
|
-
parser.add_argument("--alerts-dir", help="Override MUADDIB_ALERTS path")
|
|
277
|
-
args = parser.parse_args()
|
|
278
|
-
|
|
279
|
-
setup_logging(args.verbose)
|
|
280
|
-
|
|
281
|
-
if args.data_dir:
|
|
282
|
-
global MUADDIB_DATA, OUTPUT_PATH
|
|
283
|
-
MUADDIB_DATA = Path(args.data_dir)
|
|
284
|
-
OUTPUT_PATH = MUADDIB_DATA / "auto-labels.json"
|
|
285
|
-
if args.alerts_dir:
|
|
286
|
-
global MUADDIB_ALERTS
|
|
287
|
-
MUADDIB_ALERTS = Path(args.alerts_dir)
|
|
288
|
-
|
|
289
|
-
if args.full:
|
|
290
|
-
run_full()
|
|
291
|
-
elif args.update:
|
|
292
|
-
run_update()
|
|
293
|
-
elif args.step == "ossf":
|
|
294
|
-
step_ossf()
|
|
295
|
-
elif args.step == "ghsa":
|
|
296
|
-
step_ghsa()
|
|
297
|
-
elif args.step == "npm":
|
|
298
|
-
step_npm(load_detections())
|
|
299
|
-
elif args.step == "label":
|
|
300
|
-
detections = load_detections()
|
|
301
|
-
alert_scores = load_alert_scores()
|
|
302
|
-
o_index = ossf_index.load_cached_index(CACHE_DIR)
|
|
303
|
-
g_index = ghsa_checker.load_cached_index(CACHE_DIR)
|
|
304
|
-
npm_status = npm_checker._load_cache(CACHE_DIR / npm_checker.CACHE_FILENAME)
|
|
305
|
-
if o_index is None or g_index is None:
|
|
306
|
-
log.error("Run --step ossf and --step ghsa first (or use --full)")
|
|
307
|
-
sys.exit(1)
|
|
308
|
-
step_label(detections, o_index, g_index, npm_status, alert_scores)
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
if __name__ == "__main__":
|
|
312
|
-
main()
|
|
@@ -1,169 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
GitHub Advisory Database checker.
|
|
3
|
-
|
|
4
|
-
Fetches all npm malware advisories from the GitHub Advisory Database API.
|
|
5
|
-
Supports optional GITHUB_TOKEN env var for higher rate limits.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import json
|
|
9
|
-
import logging
|
|
10
|
-
import os
|
|
11
|
-
import time
|
|
12
|
-
from datetime import datetime
|
|
13
|
-
from pathlib import Path
|
|
14
|
-
|
|
15
|
-
import requests
|
|
16
|
-
|
|
17
|
-
log = logging.getLogger("auto-labeler.ghsa")
|
|
18
|
-
|
|
19
|
-
GHSA_API = "https://api.github.com/advisories"
|
|
20
|
-
INDEX_FILENAME = "ghsa-index.json"
|
|
21
|
-
# Cache validity: 12 hours
|
|
22
|
-
CACHE_TTL_SECONDS = 12 * 3600
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def _get_headers():
|
|
26
|
-
token = os.environ.get("GITHUB_TOKEN")
|
|
27
|
-
headers = {"Accept": "application/vnd.github+json"}
|
|
28
|
-
if token:
|
|
29
|
-
headers["Authorization"] = f"Bearer {token}"
|
|
30
|
-
log.info("Using GITHUB_TOKEN for GHSA API (5000 req/h)")
|
|
31
|
-
else:
|
|
32
|
-
log.info("No GITHUB_TOKEN — GHSA API limited to 60 req/h")
|
|
33
|
-
return headers
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def fetch_malware_advisories():
|
|
37
|
-
"""Fetch all npm malware advisories from GHSA. Returns list of advisories."""
|
|
38
|
-
headers = _get_headers()
|
|
39
|
-
advisories = []
|
|
40
|
-
page = 1
|
|
41
|
-
per_page = 100
|
|
42
|
-
|
|
43
|
-
while True:
|
|
44
|
-
params = {
|
|
45
|
-
"type": "malware",
|
|
46
|
-
"ecosystem": "npm",
|
|
47
|
-
"per_page": per_page,
|
|
48
|
-
"page": page,
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
for attempt in range(3):
|
|
52
|
-
try:
|
|
53
|
-
resp = requests.get(GHSA_API, headers=headers, params=params, timeout=30)
|
|
54
|
-
|
|
55
|
-
if resp.status_code == 403:
|
|
56
|
-
# Rate limited
|
|
57
|
-
retry_after = int(resp.headers.get("Retry-After", 60))
|
|
58
|
-
log.warning("GHSA rate limited, waiting %ds", retry_after)
|
|
59
|
-
time.sleep(retry_after)
|
|
60
|
-
continue
|
|
61
|
-
|
|
62
|
-
resp.raise_for_status()
|
|
63
|
-
break
|
|
64
|
-
except requests.RequestException as e:
|
|
65
|
-
wait = 2 ** attempt * 5
|
|
66
|
-
log.warning("GHSA request failed (attempt %d): %s — retrying in %ds",
|
|
67
|
-
attempt + 1, e, wait)
|
|
68
|
-
time.sleep(wait)
|
|
69
|
-
else:
|
|
70
|
-
log.error("GHSA fetch failed after 3 attempts on page %d", page)
|
|
71
|
-
break
|
|
72
|
-
|
|
73
|
-
batch = resp.json()
|
|
74
|
-
if not batch:
|
|
75
|
-
break
|
|
76
|
-
|
|
77
|
-
advisories.extend(batch)
|
|
78
|
-
log.info("GHSA page %d: %d advisories (total: %d)", page, len(batch), len(advisories))
|
|
79
|
-
|
|
80
|
-
if len(batch) < per_page:
|
|
81
|
-
break
|
|
82
|
-
page += 1
|
|
83
|
-
time.sleep(1) # Courtesy delay
|
|
84
|
-
|
|
85
|
-
return advisories
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def build_index(cache_dir):
|
|
89
|
-
"""Build GHSA index from API. Returns dict keyed by package name."""
|
|
90
|
-
advisories = fetch_malware_advisories()
|
|
91
|
-
index = {}
|
|
92
|
-
|
|
93
|
-
for adv in advisories:
|
|
94
|
-
ghsa_id = adv.get("ghsa_id", "")
|
|
95
|
-
published = adv.get("published_at", "")
|
|
96
|
-
summary = adv.get("summary", "")
|
|
97
|
-
withdrawn = adv.get("withdrawn_at")
|
|
98
|
-
|
|
99
|
-
# Skip withdrawn advisories
|
|
100
|
-
if withdrawn:
|
|
101
|
-
continue
|
|
102
|
-
|
|
103
|
-
for vuln in adv.get("vulnerabilities", []):
|
|
104
|
-
pkg = vuln.get("package", {})
|
|
105
|
-
ecosystem = pkg.get("ecosystem", "").lower()
|
|
106
|
-
name = pkg.get("name", "")
|
|
107
|
-
|
|
108
|
-
if ecosystem != "npm" or not name:
|
|
109
|
-
continue
|
|
110
|
-
|
|
111
|
-
version_range = vuln.get("vulnerable_version_range", "")
|
|
112
|
-
|
|
113
|
-
entry = {
|
|
114
|
-
"source": "ghsa",
|
|
115
|
-
"ghsa_id": ghsa_id,
|
|
116
|
-
"date": published,
|
|
117
|
-
"summary": summary[:200],
|
|
118
|
-
"version_range": version_range,
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
# Index by package name (version matching is approximate for GHSA)
|
|
122
|
-
if name not in index:
|
|
123
|
-
index[name] = []
|
|
124
|
-
index[name].append(entry)
|
|
125
|
-
|
|
126
|
-
log.info("GHSA index: %d packages from %d advisories", len(index), len(advisories))
|
|
127
|
-
|
|
128
|
-
# Cache to disk
|
|
129
|
-
cache_dir = Path(cache_dir)
|
|
130
|
-
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
131
|
-
cache_path = cache_dir / INDEX_FILENAME
|
|
132
|
-
with open(cache_path, "w", encoding="utf-8") as f:
|
|
133
|
-
json.dump({"built_at": datetime.utcnow().isoformat() + "Z",
|
|
134
|
-
"count": len(index),
|
|
135
|
-
"index": index}, f)
|
|
136
|
-
log.info("GHSA index cached to %s", cache_path)
|
|
137
|
-
|
|
138
|
-
return index
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def load_cached_index(cache_dir):
|
|
142
|
-
"""Load index from cache if fresh enough."""
|
|
143
|
-
cache_path = Path(cache_dir) / INDEX_FILENAME
|
|
144
|
-
if not cache_path.is_file():
|
|
145
|
-
return None
|
|
146
|
-
try:
|
|
147
|
-
stat = cache_path.stat()
|
|
148
|
-
age = time.time() - stat.st_mtime
|
|
149
|
-
if age > CACHE_TTL_SECONDS:
|
|
150
|
-
log.info("GHSA cache expired (%.1fh old)", age / 3600)
|
|
151
|
-
return None
|
|
152
|
-
|
|
153
|
-
with open(cache_path, "r", encoding="utf-8") as f:
|
|
154
|
-
data = json.load(f)
|
|
155
|
-
log.info("Loaded cached GHSA index (%d packages, built %s)",
|
|
156
|
-
data.get("count", 0), data.get("built_at", "?"))
|
|
157
|
-
return data.get("index", {})
|
|
158
|
-
except (json.JSONDecodeError, OSError) as e:
|
|
159
|
-
log.warning("Failed to load GHSA cache: %s", e)
|
|
160
|
-
return None
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def lookup(index, name):
|
|
164
|
-
"""Check if a package name is in the GHSA index.
|
|
165
|
-
|
|
166
|
-
Returns the list of advisory entries or None.
|
|
167
|
-
"""
|
|
168
|
-
entries = index.get(name)
|
|
169
|
-
return entries if entries else None
|