muaddib-scanner 2.10.49 → 2.10.50

This diff shows the content of publicly available package versions as published to their respective public registries; it is provided for informational purposes only.
@@ -0,0 +1,312 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ MUAD'DIB Auto-Labeling Pipeline
4
+
5
+ Correlates muaddib suspects with external signals (OSSF, GHSA, npm status)
6
+ to produce ground truth labels for ML training.
7
+
8
+ Usage:
9
+ python auto_labeler.py --full # Run all steps
10
+ python auto_labeler.py --step ossf # Run individual step
11
+ python auto_labeler.py --step npm
12
+ python auto_labeler.py --step ghsa
13
+ python auto_labeler.py --step label
14
+ python auto_labeler.py --update # Cron mode: re-check pending/unconfirmed
15
+
16
+ Environment:
17
+ GITHUB_TOKEN Optional, for higher GHSA API rate limits
18
+ MUADDIB_DATA Override data directory (default: /opt/muaddib/data)
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import logging
24
+ import os
25
+ import sys
26
+ from datetime import datetime, timezone
27
+ from pathlib import Path
28
+
29
+ import ossf_index
30
+ import ghsa_checker
31
+ import npm_checker
32
+ import labeler
33
+
34
+ # ── Paths ──
35
+ MUADDIB_DATA = Path(os.environ.get("MUADDIB_DATA", "/opt/muaddib/data"))
36
+ MUADDIB_ALERTS = Path(os.environ.get("MUADDIB_ALERTS", "/opt/muaddib/logs/alerts"))
37
+ BASE_DIR = Path(__file__).parent
38
+ CACHE_DIR = BASE_DIR / "data"
39
+ OSSF_REPO_DIR = CACHE_DIR / "ossf-malicious-packages"
40
+ OUTPUT_PATH = MUADDIB_DATA / "auto-labels.json"
41
+
42
+ log = logging.getLogger("auto-labeler")
43
+
44
+
45
+ def setup_logging(verbose=False):
46
+ level = logging.DEBUG if verbose else logging.INFO
47
+ fmt = "%(asctime)s [%(name)s] %(levelname)s %(message)s"
48
+ logging.basicConfig(level=level, format=fmt, datefmt="%Y-%m-%d %H:%M:%S")
49
+
50
+
51
+ def load_detections():
52
+ """Load detections.json from muaddib data directory."""
53
+ path = MUADDIB_DATA / "detections.json"
54
+ if not path.is_file():
55
+ log.error("detections.json not found at %s", path)
56
+ sys.exit(1)
57
+
58
+ with open(path, "r", encoding="utf-8") as f:
59
+ data = json.load(f)
60
+
61
+ detections = data.get("detections", [])
62
+ npm_count = sum(1 for d in detections if d.get("ecosystem") == "npm")
63
+ log.info("Loaded %d detections (%d npm)", len(detections), npm_count)
64
+ return detections
65
+
66
+
67
+ def load_alert_scores():
68
+ """Load risk scores and tiers from individual alert files.
69
+
70
+ Scans logs/alerts/ for JSON files and extracts score + tier info.
71
+ Returns dict keyed by "name@version".
72
+ """
73
+ scores = {}
74
+
75
+ # Try cached scores first
76
+ cache_path = CACHE_DIR / "alert-scores-cache.json"
77
+ if cache_path.is_file():
78
+ try:
79
+ with open(cache_path, "r", encoding="utf-8") as f:
80
+ cached = json.load(f)
81
+ if cached.get("count", 0) > 0:
82
+ log.info("Loaded %d cached alert scores", cached["count"])
83
+ return cached.get("scores", {})
84
+ except (json.JSONDecodeError, OSError):
85
+ pass
86
+
87
+ if not MUADDIB_ALERTS.is_dir():
88
+ log.warning("Alerts directory not found at %s — scores will be estimated from severity",
89
+ MUADDIB_ALERTS)
90
+ return scores
91
+
92
+ alert_files = list(MUADDIB_ALERTS.glob("*.json"))
93
+ log.info("Scanning %d alert files for scores...", len(alert_files))
94
+
95
+ for filepath in alert_files:
96
+ try:
97
+ with open(filepath, "r", encoding="utf-8") as f:
98
+ alert = json.load(f)
99
+
100
+ target = alert.get("target", "")
101
+ summary = alert.get("summary", {})
102
+ score = summary.get("riskScore", summary.get("globalRiskScore", 0))
103
+
104
+ # Parse target: "npm/package-name@version" or "pypi/package@version"
105
+ if "/" in target and "@" in target:
106
+ eco_pkg = target.split("/", 1)
107
+ if len(eco_pkg) == 2:
108
+ pkg_ver = eco_pkg[1]
109
+ # Determine tier from priority
110
+ priority = alert.get("priority", {})
111
+ tier = ""
112
+ p_level = priority.get("level", "")
113
+ if p_level == "P1":
114
+ tier = "T1a"
115
+ elif p_level == "P2":
116
+ tier = "T1b"
117
+ elif p_level == "P3":
118
+ tier = "T2"
119
+
120
+ scores[pkg_ver] = {"score": score, "tier": tier}
121
+
122
+ except (json.JSONDecodeError, OSError):
123
+ continue
124
+
125
+ # Cache for next run
126
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
127
+ with open(cache_path, "w", encoding="utf-8") as f:
128
+ json.dump({"count": len(scores), "built_at": datetime.now(timezone.utc).isoformat(),
129
+ "scores": scores}, f)
130
+
131
+ log.info("Extracted scores from %d alerts", len(scores))
132
+ return scores
133
+
134
+
135
+ # ── Steps ──
136
+
137
+ def step_ossf():
138
+ """Step 1: Index OSSF malicious-packages."""
139
+ log.info("=== Step 1: OSSF Index ===")
140
+ ossf_index.clone_or_update(OSSF_REPO_DIR)
141
+ index = ossf_index.build_index(OSSF_REPO_DIR, CACHE_DIR)
142
+ return index
143
+
144
+
145
+ def step_ghsa():
146
+ """Step 3: Index GitHub Advisory Database."""
147
+ log.info("=== Step 3: GHSA Index ===")
148
+ # Try cache first
149
+ index = ghsa_checker.load_cached_index(CACHE_DIR)
150
+ if index is not None:
151
+ return index
152
+ return ghsa_checker.build_index(CACHE_DIR)
153
+
154
+
155
+ def step_npm(detections):
156
+ """Step 2: Check npm status for suspects."""
157
+ log.info("=== Step 2: npm Status Check ===")
158
+ return npm_checker.check_suspects(detections, CACHE_DIR)
159
+
160
+
161
+ def step_label(detections, o_index, g_index, npm_status, alert_scores):
162
+ """Step 4: Generate labels."""
163
+ log.info("=== Step 4: Generate Labels ===")
164
+
165
+ labels = labeler.label_suspects(detections, o_index, g_index, npm_status, alert_scores)
166
+ missed = labeler.find_missed(o_index, g_index, detections)
167
+ summary = labeler.export_labels(labels, missed, OUTPUT_PATH)
168
+
169
+ return summary
170
+
171
+
172
+ # ── Modes ──
173
+
174
+ def run_full():
175
+ """Run all steps sequentially."""
176
+ log.info("Starting full auto-labeling pipeline")
177
+ start = datetime.now()
178
+
179
+ detections = load_detections()
180
+ alert_scores = load_alert_scores()
181
+
182
+ # Steps 1+3 don't depend on detections — could be parallel but keep it simple
183
+ o_index = step_ossf()
184
+ g_index = step_ghsa()
185
+ npm_status = step_npm(detections)
186
+
187
+ summary = step_label(detections, o_index, g_index, npm_status, alert_scores)
188
+
189
+ elapsed = (datetime.now() - start).total_seconds()
190
+ log.info("Pipeline complete in %.1fs — %s", elapsed, summary)
191
+ return summary
192
+
193
+
194
+ def run_update():
195
+ """Cron mode: re-check pending/unconfirmed labels against fresh external data."""
196
+ log.info("Starting update (cron mode)")
197
+
198
+ # Refresh external indices
199
+ ossf_index.clone_or_update(OSSF_REPO_DIR)
200
+ o_index = ossf_index.build_index(OSSF_REPO_DIR, CACHE_DIR)
201
+ g_index = ghsa_checker.build_index(CACHE_DIR)
202
+
203
+ # Load existing labels
204
+ if not OUTPUT_PATH.is_file():
205
+ log.error("No existing auto-labels.json — run --full first")
206
+ sys.exit(1)
207
+
208
+ with open(OUTPUT_PATH, "r", encoding="utf-8") as f:
209
+ existing = json.load(f)
210
+
211
+ existing_labels = existing.get("labels", {})
212
+ detections = load_detections()
213
+ alert_scores = load_alert_scores()
214
+
215
+ # Find labels that need re-evaluation
216
+ to_recheck = []
217
+ for key, entry in existing_labels.items():
218
+ lbl = entry.get("auto_label")
219
+ if lbl in ("pending", "unconfirmed", "likely_malicious"):
220
+ # Find the matching detection record for this label key
221
+ for det in detections:
222
+ if f"{det['package']}@{det['version']}" == key:
223
+ to_recheck.append(det)
224
+ break
225
+
226
+ if not to_recheck:
227
+ log.info("No pending/unconfirmed labels to re-check")
228
+ return
229
+
230
+ log.info("Re-checking %d labels (pending/unconfirmed/likely_malicious)", len(to_recheck))
231
+
232
+ # Re-check npm status for these specific packages
233
+ npm_status = npm_checker.check_suspects(to_recheck, CACHE_DIR)
234
+
235
+ # Re-label
236
+ updated = labeler.label_suspects(to_recheck, o_index, g_index, npm_status, alert_scores)
237
+
238
+ # Merge updates into existing labels
239
+ changes = 0
240
+ for key, new_entry in updated.items():
241
+ old = existing_labels.get(key, {})
242
+ if old.get("auto_label") != new_entry.get("auto_label"):
243
+ log.info("RELABEL: %s — %s → %s",
244
+ key, old.get("auto_label"), new_entry.get("auto_label"))
245
+ changes += 1
246
+ existing_labels[key] = new_entry
247
+
248
+ # Also refresh missed detections
249
+ missed = labeler.find_missed(o_index, g_index, detections)
250
+ for name, info in missed.items():
251
+ mk = f"{name}@*"
252
+ if mk not in existing_labels:
253
+ existing_labels[mk] = info
254
+ changes += 1
255
+
256
+ # Re-export
257
+ labeler.export_labels(
258
+ {k: v for k, v in existing_labels.items() if v.get("auto_label") != "missed"},
259
+ {k.replace("@*", ""): v for k, v in existing_labels.items() if v.get("auto_label") == "missed"},
260
+ OUTPUT_PATH,
261
+ )
262
+
263
+ log.info("Update complete: %d labels changed", changes)
264
+
265
+
266
+ def main():
267
+ parser = argparse.ArgumentParser(description="MUAD'DIB Auto-Labeling Pipeline")
268
+ group = parser.add_mutually_exclusive_group(required=True)
269
+ group.add_argument("--full", action="store_true", help="Run all steps")
270
+ group.add_argument("--step", choices=["ossf", "ghsa", "npm", "label"],
271
+ help="Run individual step")
272
+ group.add_argument("--update", action="store_true",
273
+ help="Cron: re-check pending/unconfirmed")
274
+ parser.add_argument("-v", "--verbose", action="store_true", help="Debug logging")
275
+ parser.add_argument("--data-dir", help="Override MUADDIB_DATA path")
276
+ parser.add_argument("--alerts-dir", help="Override MUADDIB_ALERTS path")
277
+ args = parser.parse_args()
278
+
279
+ setup_logging(args.verbose)
280
+
281
+ if args.data_dir:
282
+ global MUADDIB_DATA, OUTPUT_PATH
283
+ MUADDIB_DATA = Path(args.data_dir)
284
+ OUTPUT_PATH = MUADDIB_DATA / "auto-labels.json"
285
+ if args.alerts_dir:
286
+ global MUADDIB_ALERTS
287
+ MUADDIB_ALERTS = Path(args.alerts_dir)
288
+
289
+ if args.full:
290
+ run_full()
291
+ elif args.update:
292
+ run_update()
293
+ elif args.step == "ossf":
294
+ step_ossf()
295
+ elif args.step == "ghsa":
296
+ step_ghsa()
297
+ elif args.step == "npm":
298
+ step_npm(load_detections())
299
+ elif args.step == "label":
300
+ detections = load_detections()
301
+ alert_scores = load_alert_scores()
302
+ o_index = ossf_index.load_cached_index(CACHE_DIR)
303
+ g_index = ghsa_checker.load_cached_index(CACHE_DIR)
304
+ npm_status = npm_checker._load_cache(CACHE_DIR / npm_checker.CACHE_FILENAME)
305
+ if o_index is None or g_index is None:
306
+ log.error("Run --step ossf and --step ghsa first (or use --full)")
307
+ sys.exit(1)
308
+ step_label(detections, o_index, g_index, npm_status, alert_scores)
309
+
310
+
311
+ if __name__ == "__main__":
312
+ main()
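
The hunk above is the pipeline entry point (auto_labeler.py, per its usage docstring). Its main input, detections.json, is not part of this diff; the following is a minimal sketch of the shape one entry would need, inferred from the fields read by load_detections(), npm_checker.check_suspects() and labeler.label_suspects(). All names and values below are placeholders, not actual package data.

    # Hypothetical detections.json content, inferred from the fields the
    # pipeline reads; every value here is illustrative only.
    example_detections = {
        "detections": [
            {
                "package": "example-pkg",               # hypothetical package name
                "version": "1.0.0",
                "ecosystem": "npm",
                "first_seen_at": "2024-01-01T00:00:00Z",
                "severity": "HIGH",
                "findings": ["install-script", "obfuscated-code"],
            }
        ]
    }
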
@@ -0,0 +1,169 @@
1
+ """
2
+ GitHub Advisory Database checker.
3
+
4
+ Fetches all npm malware advisories from the GitHub Advisory Database API.
5
+ Supports optional GITHUB_TOKEN env var for higher rate limits.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import os
11
+ import time
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+
15
+ import requests
16
+
17
+ log = logging.getLogger("auto-labeler.ghsa")
18
+
19
+ GHSA_API = "https://api.github.com/advisories"
20
+ INDEX_FILENAME = "ghsa-index.json"
21
+ # Cache validity: 12 hours
22
+ CACHE_TTL_SECONDS = 12 * 3600
23
+
24
+
25
+ def _get_headers():
26
+ token = os.environ.get("GITHUB_TOKEN")
27
+ headers = {"Accept": "application/vnd.github+json"}
28
+ if token:
29
+ headers["Authorization"] = f"Bearer {token}"
30
+ log.info("Using GITHUB_TOKEN for GHSA API (5000 req/h)")
31
+ else:
32
+ log.info("No GITHUB_TOKEN — GHSA API limited to 60 req/h")
33
+ return headers
34
+
35
+
36
+ def fetch_malware_advisories():
37
+ """Fetch all npm malware advisories from GHSA. Returns list of advisories."""
38
+ headers = _get_headers()
39
+ advisories = []
40
+ page = 1
41
+ per_page = 100
42
+
43
+ while True:
44
+ params = {
45
+ "type": "malware",
46
+ "ecosystem": "npm",
47
+ "per_page": per_page,
48
+ "page": page,
49
+ }
50
+
51
+ for attempt in range(3):
52
+ try:
53
+ resp = requests.get(GHSA_API, headers=headers, params=params, timeout=30)
54
+
55
+ if resp.status_code == 403:
56
+ # Rate limited
57
+ retry_after = int(resp.headers.get("Retry-After", 60))
58
+ log.warning("GHSA rate limited, waiting %ds", retry_after)
59
+ time.sleep(retry_after)
60
+ continue
61
+
62
+ resp.raise_for_status()
63
+ break
64
+ except requests.RequestException as e:
65
+ wait = 2 ** attempt * 5
66
+ log.warning("GHSA request failed (attempt %d): %s — retrying in %ds",
67
+ attempt + 1, e, wait)
68
+ time.sleep(wait)
69
+ else:
70
+ log.error("GHSA fetch failed after 3 attempts on page %d", page)
71
+ break
72
+
73
+ batch = resp.json()
74
+ if not batch:
75
+ break
76
+
77
+ advisories.extend(batch)
78
+ log.info("GHSA page %d: %d advisories (total: %d)", page, len(batch), len(advisories))
79
+
80
+ if len(batch) < per_page:
81
+ break
82
+ page += 1
83
+ time.sleep(1) # Courtesy delay
84
+
85
+ return advisories
86
+
87
+
88
+ def build_index(cache_dir):
89
+ """Build GHSA index from API. Returns dict keyed by package name."""
90
+ advisories = fetch_malware_advisories()
91
+ index = {}
92
+
93
+ for adv in advisories:
94
+ ghsa_id = adv.get("ghsa_id", "")
95
+ published = adv.get("published_at", "")
96
+ summary = adv.get("summary", "")
97
+ withdrawn = adv.get("withdrawn_at")
98
+
99
+ # Skip withdrawn advisories
100
+ if withdrawn:
101
+ continue
102
+
103
+ for vuln in adv.get("vulnerabilities", []):
104
+ pkg = vuln.get("package", {})
105
+ ecosystem = pkg.get("ecosystem", "").lower()
106
+ name = pkg.get("name", "")
107
+
108
+ if ecosystem != "npm" or not name:
109
+ continue
110
+
111
+ version_range = vuln.get("vulnerable_version_range", "")
112
+
113
+ entry = {
114
+ "source": "ghsa",
115
+ "ghsa_id": ghsa_id,
116
+ "date": published,
117
+ "summary": summary[:200],
118
+ "version_range": version_range,
119
+ }
120
+
121
+ # Index by package name (version matching is approximate for GHSA)
122
+ if name not in index:
123
+ index[name] = []
124
+ index[name].append(entry)
125
+
126
+ log.info("GHSA index: %d packages from %d advisories", len(index), len(advisories))
127
+
128
+ # Cache to disk
129
+ cache_dir = Path(cache_dir)
130
+ cache_dir.mkdir(parents=True, exist_ok=True)
131
+ cache_path = cache_dir / INDEX_FILENAME
132
+ with open(cache_path, "w", encoding="utf-8") as f:
133
+ json.dump({"built_at": datetime.utcnow().isoformat() + "Z",
134
+ "count": len(index),
135
+ "index": index}, f)
136
+ log.info("GHSA index cached to %s", cache_path)
137
+
138
+ return index
139
+
140
+
141
+ def load_cached_index(cache_dir):
142
+ """Load index from cache if fresh enough."""
143
+ cache_path = Path(cache_dir) / INDEX_FILENAME
144
+ if not cache_path.is_file():
145
+ return None
146
+ try:
147
+ stat = cache_path.stat()
148
+ age = time.time() - stat.st_mtime
149
+ if age > CACHE_TTL_SECONDS:
150
+ log.info("GHSA cache expired (%.1fh old)", age / 3600)
151
+ return None
152
+
153
+ with open(cache_path, "r", encoding="utf-8") as f:
154
+ data = json.load(f)
155
+ log.info("Loaded cached GHSA index (%d packages, built %s)",
156
+ data.get("count", 0), data.get("built_at", "?"))
157
+ return data.get("index", {})
158
+ except (json.JSONDecodeError, OSError) as e:
159
+ log.warning("Failed to load GHSA cache: %s", e)
160
+ return None
161
+
162
+
163
+ def lookup(index, name):
164
+ """Check if a package name is in the GHSA index.
165
+
166
+ Returns the list of advisory entries or None.
167
+ """
168
+ entries = index.get(name)
169
+ return entries if entries else None
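
The module above (ghsa_checker.py, per the pipeline's imports) pages through GitHub's global security advisories endpoint filtered to npm malware. Below is a minimal single-page sketch of the same request, assuming the requests package is installed; GITHUB_TOKEN is optional, as in the module itself.

    import os
    import requests

    # One page of the query that fetch_malware_advisories() paginates through.
    headers = {"Accept": "application/vnd.github+json"}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"   # raises the rate limit

    resp = requests.get(
        "https://api.github.com/advisories",
        headers=headers,
        params={"type": "malware", "ecosystem": "npm", "per_page": 100, "page": 1},
        timeout=30,
    )
    resp.raise_for_status()
    for adv in resp.json():
        print(adv.get("ghsa_id"), adv.get("summary", "")[:60])
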
@@ -0,0 +1,249 @@
1
+ """
2
+ Label generation engine.
3
+
4
+ Correlates signals from OSSF, GHSA, and npm status to produce labels.
5
+
6
+ Label tiers (by confidence):
7
+ - confirmed_malicious: authoritative source (ossf/ghsa) OR npm takedown pattern
8
+ - likely_malicious: npm_removed + high muaddib score, but no authoritative confirmation
9
+ - unconfirmed: suspect in muaddib, still on npm, no external signal, >7 days old
10
+ - pending: suspect in muaddib, still on npm, no external signal, <7 days old
11
+ - missed: clean in muaddib BUT flagged by ossf/ghsa (false negative)
12
+ """
13
+
14
+ import json
15
+ import logging
16
+ from datetime import datetime, timezone
17
+ from pathlib import Path
18
+
19
+ from ossf_index import lookup as ossf_lookup
20
+ from ghsa_checker import lookup as ghsa_lookup
21
+ from npm_checker import is_quick_takedown
22
+
23
+ log = logging.getLogger("auto-labeler.labeler")
24
+
25
+ # Thresholds
26
+ SCORE_THRESHOLD_CONFIRMED = 50 # Minimum muaddib score for npm_removed → confirmed
27
+ PENDING_DAYS = 7 # Days before pending → unconfirmed
28
+
29
+
30
+ def _parse_iso(s):
31
+ """Parse ISO 8601 date string to datetime."""
32
+ if not s:
33
+ return None
34
+ try:
35
+ return datetime.fromisoformat(s.replace("Z", "+00:00"))
36
+ except (ValueError, TypeError):
37
+ return None
38
+
39
+
40
+ def _days_since(iso_str):
41
+ """Days elapsed since the given ISO date string."""
42
+ dt = _parse_iso(iso_str)
43
+ if not dt:
44
+ return None
45
+ delta = datetime.now(timezone.utc) - dt
46
+ return delta.total_seconds() / 86400
47
+
48
+
49
+ def _severity_to_score_estimate(severity):
50
+ """Rough score estimate from severity when exact score is unavailable."""
51
+ return {"CRITICAL": 70, "HIGH": 40, "MEDIUM": 15, "LOW": 5}.get(severity, 0)
52
+
53
+
54
+ def label_suspects(detections, ossf_index, ghsa_index, npm_status, alert_scores):
55
+ """Generate labels for all suspect detections.
56
+
57
+ Args:
58
+ detections: list of detection dicts from detections.json
59
+ ossf_index: dict from ossf_index.build_index()
60
+ ghsa_index: dict from ghsa_checker.build_index()
61
+ npm_status: dict from npm_checker.check_suspects()
62
+ alert_scores: dict keyed by "name@version" with {"score": N, "tier": "T1a"} from alerts
63
+
64
+ Returns:
65
+ dict keyed by "name@version" with label info
66
+ """
67
+ labels = {}
68
+ stats = {"confirmed_malicious": 0, "likely_malicious": 0,
69
+ "unconfirmed": 0, "pending": 0}
70
+
71
+ for det in detections:
72
+ name = det["package"]
73
+ version = det["version"]
74
+ ecosystem = det.get("ecosystem", "npm")
75
+ key = f"{name}@{version}"
76
+ detection_date = det.get("first_seen_at", "")
77
+ severity = det.get("severity", "UNKNOWN")
78
+ findings = det.get("findings", [])
79
+
80
+ # Skip non-npm for now (OSSF/GHSA npm-focused)
81
+ if ecosystem != "npm":
82
+ continue
83
+
84
+ # Gather signals
85
+ signals = []
86
+
87
+ # Signal 1: OSSF
88
+ ossf_hit = ossf_lookup(ossf_index, name, version)
89
+ if ossf_hit:
90
+ signals.append("ossf")
91
+
92
+ # Signal 2: GHSA
93
+ ghsa_hit = ghsa_lookup(ghsa_index, name)
94
+ if ghsa_hit:
95
+ signals.append("ghsa")
96
+
97
+ # Signal 3: npm status
98
+ npm_result = npm_status.get(key, {})
99
+ npm_removed = npm_result.get("status") == "npm_removed"
100
+ if npm_removed:
101
+ signals.append("npm_removed")
102
+
103
+ # Get score from alerts or estimate from severity
104
+ score_info = alert_scores.get(key, {})
105
+ score = score_info.get("score", _severity_to_score_estimate(severity))
106
+ tier = score_info.get("tier", "")
107
+
108
+ # Determine label
109
+ label = _classify(signals, npm_result, detection_date, score)
110
+ stats[label] += 1
111
+
112
+ labels[key] = {
113
+ "muaddib_label": "suspect",
114
+ "auto_label": label,
115
+ "signals": signals,
116
+ "muaddib_score": score,
117
+ "muaddib_tier": tier,
118
+ "muaddib_severity": severity,
119
+ "muaddib_findings": findings,
120
+ "detection_date": detection_date,
121
+ "label_date": datetime.now(timezone.utc).isoformat(),
122
+ "npm_status": npm_result.get("status", "unknown"),
123
+ "npm_publish_date": npm_result.get("publish_date"),
124
+ }
125
+
126
+ if ossf_hit:
127
+ labels[key]["ossf_id"] = ossf_hit.get("osv_id")
128
+ if ghsa_hit:
129
+ labels[key]["ghsa_id"] = ghsa_hit[0].get("ghsa_id")
130
+
131
+ log.debug("LABEL %s → %s (signals=%s, score=%d)", key, label, signals, score)
132
+
133
+ log.info("Suspect labels: %s", stats)
134
+ return labels
135
+
136
+
137
+ def _classify(signals, npm_result, detection_date, score):
138
+ """Core classification logic."""
139
+ has_authoritative = "ossf" in signals or "ghsa" in signals
140
+ npm_removed = "npm_removed" in signals
141
+
142
+ # Tier 1: Authoritative source confirms malicious
143
+ if has_authoritative:
144
+ return "confirmed_malicious"
145
+
146
+ # Tier 2: npm takedown pattern (removed + high score + quick removal)
147
+ if npm_removed and score >= SCORE_THRESHOLD_CONFIRMED:
148
+ if is_quick_takedown(npm_result, detection_date, threshold_hours=72):
149
+ return "confirmed_malicious"
150
+
151
+ # Tier 3: npm removed but doesn't meet confirmation criteria
152
+ if npm_removed:
153
+ return "likely_malicious"
154
+
155
+ # Tier 4: Still on npm, no external signal
156
+ days = _days_since(detection_date)
157
+ if days is not None and days > PENDING_DAYS:
158
+ return "unconfirmed"
159
+
160
+ return "pending"
161
+
162
+
163
+ def find_missed(ossf_index, ghsa_index, detections):
164
+ """Find packages in OSSF/GHSA that muaddib did NOT detect (false negatives).
165
+
166
+ Returns dict keyed by package name with miss details.
167
+ """
168
+ # Build set of all detected package names
169
+ detected_names = set()
170
+ for det in detections:
171
+ if det.get("ecosystem") == "npm":
172
+ detected_names.add(det["package"])
173
+
174
+ missed = {}
175
+
176
+ # Check OSSF index
177
+ ossf_packages = set()
178
+ for key in ossf_index:
179
+ name = key.rsplit("@", 1)[0]
180
+ ossf_packages.add(name)
181
+
182
+ for name in ossf_packages:
183
+ if name not in detected_names:
184
+ missed[name] = {
185
+ "auto_label": "missed",
186
+ "muaddib_label": "clean",
187
+ "signals": ["ossf"],
188
+ "source_detail": "In ossf/malicious-packages but not in muaddib detections",
189
+ "label_date": datetime.now(timezone.utc).isoformat(),
190
+ }
191
+
192
+ # Check GHSA index
193
+ for name, entries in ghsa_index.items():
194
+ if name not in detected_names:
195
+ existing = missed.get(name)
196
+ if existing:
197
+ existing["signals"].append("ghsa")
198
+ else:
199
+ missed[name] = {
200
+ "auto_label": "missed",
201
+ "muaddib_label": "clean",
202
+ "signals": ["ghsa"],
203
+ "ghsa_id": entries[0].get("ghsa_id") if entries else None,
204
+ "source_detail": "In GHSA malware advisories but not in muaddib detections",
205
+ "label_date": datetime.now(timezone.utc).isoformat(),
206
+ }
207
+
208
+ log.info("Missed packages (false negatives): %d", len(missed))
209
+ if missed:
210
+ # Log the first 20 as these are critical for improving the scanner
211
+ for name in list(missed.keys())[:20]:
212
+ m = missed[name]
213
+ log.warning("MISSED: %s (signals=%s)", name, m["signals"])
214
+
215
+ return missed
216
+
217
+
218
+ def export_labels(labels, missed, output_path):
219
+ """Export all labels to auto-labels.json."""
220
+ output_path = Path(output_path)
221
+ output_path.parent.mkdir(parents=True, exist_ok=True)
222
+
223
+ # Merge suspects and missed into one output
224
+ all_labels = dict(labels)
225
+ for name, info in missed.items():
226
+ all_labels[f"{name}@*"] = info
227
+
228
+ # Generate summary
229
+ summary = {"confirmed_malicious": 0, "likely_malicious": 0,
230
+ "unconfirmed": 0, "pending": 0, "missed": 0}
231
+ for entry in all_labels.values():
232
+ lbl = entry.get("auto_label", "unknown")
233
+ if lbl in summary:
234
+ summary[lbl] += 1
235
+
236
+ output = {
237
+ "generated_at": datetime.now(timezone.utc).isoformat(),
238
+ "summary": summary,
239
+ "total": len(all_labels),
240
+ "labels": all_labels,
241
+ }
242
+
243
+ with open(output_path, "w", encoding="utf-8") as f:
244
+ json.dump(output, f, indent=2)
245
+
246
+ log.info("Exported %d labels to %s", len(all_labels), output_path)
247
+ log.info("Summary: %s", summary)
248
+
249
+ return summary
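
The hunk above is the labeling module (labeler.py, per the pipeline's imports). A few worked examples of how its _classify() tiers map signal combinations to labels follow; this is a sketch that assumes labeler is importable, and the inputs are hypothetical.

    from datetime import datetime, timedelta, timezone

    from labeler import _classify

    today = datetime.now(timezone.utc).isoformat()
    old = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat()

    # An authoritative OSSF/GHSA hit always confirms, regardless of score or age.
    assert _classify(["ossf"], {}, old, 0) == "confirmed_malicious"
    # npm removal with a low score stays "likely" (takedown-pattern check not reached).
    assert _classify(["npm_removed"], {"status": "npm_removed"}, today, 10) == "likely_malicious"
    # No external signal: recent detections stay pending, older ones age out.
    assert _classify([], {}, today, 60) == "pending"
    assert _classify([], {}, old, 60) == "unconfirmed"
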
@@ -0,0 +1,228 @@
1
+ """
2
+ npm registry status checker.
3
+
4
+ For each suspect package, checks if the package/version still exists on npm.
5
+ Extracts publish timing for temporal correlation (quick takedown = strong signal).
6
+ Rate-limited to 50 requests/minute with exponential backoff.
7
+ Resumable: saves progress to npm-status-cache.json.
8
+ """
9
+
10
+ import json
11
+ import logging
12
+ import time
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+
16
+ import requests
17
+
18
+ log = logging.getLogger("auto-labeler.npm")
19
+
20
+ NPM_REGISTRY = "https://registry.npmjs.org"
21
+ RATE_LIMIT = 50 # requests per minute
22
+ RATE_WINDOW = 60 # seconds
23
+ CACHE_FILENAME = "npm-status-cache.json"
24
+ # Don't re-check packages checked within this window
25
+ RECHECK_INTERVAL_SECONDS = 24 * 3600 # 24h
26
+
27
+
28
+ def _rate_limiter():
29
+ """Generator-based rate limiter. Call next() before each request."""
30
+ timestamps = []
31
+ while True:
32
+ now = time.time()
33
+ # Purge timestamps older than the window
34
+ timestamps = [t for t in timestamps if now - t < RATE_WINDOW]
35
+ if len(timestamps) >= RATE_LIMIT:
36
+ sleep_time = timestamps[0] + RATE_WINDOW - now + 0.1
37
+ log.debug("Rate limit reached, sleeping %.1fs", sleep_time)
38
+ time.sleep(sleep_time)
39
+ now = time.time()
40
+ timestamps = [t for t in timestamps if now - t < RATE_WINDOW]
41
+ timestamps.append(now)
42
+ yield
43
+
44
+
45
+ def _fetch_package_info(session, name, limiter):
46
+ """Fetch package metadata from npm. Returns (status, info) tuple."""
47
+ next(limiter)
48
+
49
+ url = f"{NPM_REGISTRY}/{name}"
50
+ for attempt in range(3):
51
+ try:
52
+ resp = session.get(url, timeout=15)
53
+
54
+ if resp.status_code == 404:
55
+ return "npm_removed", {"reason": "package_404"}
56
+
57
+ if resp.status_code == 429:
58
+ retry_after = int(resp.headers.get("Retry-After", 30))
59
+ log.warning("npm 429 for %s, waiting %ds", name, retry_after)
60
+ time.sleep(retry_after)
61
+ continue
62
+
63
+ resp.raise_for_status()
64
+ return "npm_available", resp.json()
65
+
66
+ except requests.RequestException as e:
67
+ wait = 2 ** attempt * 3
68
+ log.warning("npm fetch failed for %s (attempt %d): %s",
69
+ name, attempt + 1, e)
70
+ time.sleep(wait)
71
+
72
+ return "npm_error", {"reason": "fetch_failed_after_retries"}
73
+
74
+
75
+ def check_suspects(suspects, cache_dir):
76
+ """Check npm status for each suspect. Returns dict of results.
77
+
78
+ Args:
79
+ suspects: list of dicts with 'package', 'version', 'ecosystem' keys
80
+ cache_dir: path to cache directory
81
+
82
+ Returns:
83
+ dict keyed by "name@version" with status info
84
+ """
85
+ cache_dir = Path(cache_dir)
86
+ cache_dir.mkdir(parents=True, exist_ok=True)
87
+ cache_path = cache_dir / CACHE_FILENAME
88
+
89
+ # Load existing cache for resumability
90
+ cache = _load_cache(cache_path)
91
+
92
+ # Deduplicate suspects by name@version, npm only
93
+ unique = {}
94
+ for s in suspects:
95
+ if s.get("ecosystem") != "npm":
96
+ continue
97
+ key = f"{s['package']}@{s['version']}"
98
+ if key not in unique:
99
+ unique[key] = s
100
+
101
+ # Filter out recently checked
102
+ now = time.time()
103
+ to_check = {}
104
+ for key, s in unique.items():
105
+ cached = cache.get(key)
106
+ if cached and (now - cached.get("checked_at", 0)) < RECHECK_INTERVAL_SECONDS:
107
+ continue
108
+ to_check[key] = s
109
+
110
+ log.info("npm check: %d unique suspects, %d already cached, %d to check",
111
+ len(unique), len(unique) - len(to_check), len(to_check))
112
+
113
+ if not to_check:
114
+ return cache
115
+
116
+ session = requests.Session()
117
+ session.headers.update({"Accept": "application/json"})
118
+ limiter = _rate_limiter()
119
+
120
+ checked = 0
121
+ # Group by package name to avoid redundant fetches
122
+ by_name = {}
123
+ for key, s in to_check.items():
124
+ name = s["package"]
125
+ if name not in by_name:
126
+ by_name[name] = []
127
+ by_name[name].append((key, s))
128
+
129
+ total_packages = len(by_name)
130
+
131
+ for i, (name, entries) in enumerate(by_name.items()):
132
+ status, info = _fetch_package_info(session, name, limiter)
133
+
134
+ if i > 0 and i % 100 == 0:
135
+ log.info("npm check progress: %d/%d packages (%.0f%%)",
136
+ i, total_packages, i / total_packages * 100)
137
+ _save_cache(cache, cache_path)
138
+
139
+ for key, s in entries:
140
+ version = s["version"]
141
+ result = {
142
+ "status": status,
143
+ "checked_at": now,
144
+ }
145
+
146
+ if status == "npm_available" and isinstance(info, dict):
147
+ versions = info.get("versions", {})
148
+ time_info = info.get("time", {})
149
+
150
+ if version not in versions:
151
+ result["status"] = "npm_removed"
152
+ result["reason"] = "version_removed"
153
+ else:
154
+ result["reason"] = "available"
155
+
156
+ # Extract timing for temporal correlation
157
+ publish_time = time_info.get(version)
158
+ if publish_time:
159
+ result["publish_date"] = publish_time
160
+
161
+ # Overall last-modified timestamp for the package (not tied to this version)
162
+ modified = time_info.get("modified")
163
+ if modified:
164
+ result["last_modified"] = modified
165
+
166
+ elif status == "npm_removed":
167
+ result["reason"] = "package_404"
168
+
169
+ cache[key] = result
170
+ checked += 1
171
+
172
+ _save_cache(cache, cache_path)
173
+ log.info("npm check complete: %d packages checked, %d total cached",
174
+ checked, len(cache))
175
+
176
+ return cache
177
+
178
+
179
+ def _load_cache(cache_path):
180
+ """Load npm status cache from disk."""
181
+ if not cache_path.is_file():
182
+ return {}
183
+ try:
184
+ with open(cache_path, "r", encoding="utf-8") as f:
185
+ data = json.load(f)
186
+ if isinstance(data, dict) and "results" in data:
187
+ return data["results"]
188
+ return {}
189
+ except (json.JSONDecodeError, OSError):
190
+ return {}
191
+
192
+
193
+ def _save_cache(cache, cache_path):
194
+ """Save npm status cache to disk."""
195
+ try:
196
+ with open(cache_path, "w", encoding="utf-8") as f:
197
+ json.dump({
198
+ "saved_at": datetime.utcnow().isoformat() + "Z",
199
+ "count": len(cache),
200
+ "results": cache,
201
+ }, f)
202
+ except OSError as e:
203
+ log.error("Failed to save npm cache: %s", e)
204
+
205
+
206
+ def is_quick_takedown(result, detection_date_str, threshold_hours=72):
207
+ """Check if a package was removed quickly after publish (npm security takedown pattern).
208
+
209
+ Returns True if the package was removed AND was published recently
210
+ relative to the detection date (within threshold_hours).
211
+ """
212
+ if result.get("status") != "npm_removed":
213
+ return False
214
+
215
+ publish_date = result.get("publish_date")
216
+ if not publish_date:
217
+ return False
218
+
219
+ try:
220
+ publish_dt = datetime.fromisoformat(publish_date.replace("Z", "+00:00"))
221
+ detection_dt = datetime.fromisoformat(detection_date_str.replace("Z", "+00:00"))
222
+ delta_hours = (detection_dt - publish_dt).total_seconds() / 3600
223
+
224
+ # Package was detected within threshold_hours of publish
225
+ # AND has since been removed → strong takedown signal
226
+ return 0 <= delta_hours <= threshold_hours
227
+ except (ValueError, TypeError):
228
+ return False
@@ -0,0 +1,178 @@
1
+ """
2
+ OSSF malicious-packages indexer.
3
+
4
+ Clones (or updates) the ossf/malicious-packages repo with sparse checkout
5
+ limited to osv/malicious/npm/, then parses all OSV JSON files into an index.
6
+ Skips osv/withdrawn/ (retracted false positives).
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import os
12
+ import subprocess
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+
16
+ log = logging.getLogger("auto-labeler.ossf")
17
+
18
+ OSSF_REPO_URL = "https://github.com/ossf/malicious-packages.git"
19
+ OSSF_SPARSE_PATH = "osv/malicious/npm"
20
+
21
+ INDEX_FILENAME = "ossf-index.json"
22
+
23
+
24
+ def _run_git(args, cwd=None):
25
+ """Run a git command, raise on failure."""
26
+ result = subprocess.run(
27
+ ["git"] + args,
28
+ cwd=cwd,
29
+ capture_output=True,
30
+ text=True,
31
+ timeout=300,
32
+ )
33
+ if result.returncode != 0:
34
+ raise RuntimeError(f"git {' '.join(args)} failed: {result.stderr.strip()}")
35
+ return result.stdout.strip()
36
+
37
+
38
+ def clone_or_update(repo_dir):
39
+ """Clone with sparse checkout or git pull if already present."""
40
+ repo_dir = Path(repo_dir)
41
+
42
+ if (repo_dir / ".git").is_dir():
43
+ log.info("OSSF repo exists at %s — pulling latest", repo_dir)
44
+ _run_git(["pull", "--ff-only"], cwd=repo_dir)
45
+ return
46
+
47
+ log.info("Cloning OSSF repo (sparse, depth=1) to %s", repo_dir)
48
+ repo_dir.mkdir(parents=True, exist_ok=True)
49
+
50
+ _run_git(["clone", "--depth", "1", "--filter=blob:none",
51
+ "--sparse", OSSF_REPO_URL, str(repo_dir)])
52
+ _run_git(["sparse-checkout", "set", OSSF_SPARSE_PATH], cwd=repo_dir)
53
+ log.info("OSSF clone complete (sparse: %s)", OSSF_SPARSE_PATH)
54
+
55
+
56
+ def _parse_osv_file(filepath):
57
+ """Parse a single OSV JSON file and yield (key, entry) tuples."""
58
+ try:
59
+ with open(filepath, "r", encoding="utf-8") as f:
60
+ data = json.load(f)
61
+ except (json.JSONDecodeError, OSError) as e:
62
+ log.warning("Skipping invalid OSV file %s: %s", filepath, e)
63
+ return
64
+
65
+ osv_id = data.get("id", "")
66
+ published = data.get("published", "")
67
+ summary = data.get("summary", "")
68
+
69
+ # Extract attack type from database_specific if available
70
+ attack_type = None
71
+ db_specific = data.get("database_specific", {})
72
+ origins = db_specific.get("malicious-packages-origins", [])
73
+ if origins:
74
+ attack_type = origins[0].get("reason", None)
75
+
76
+ for affected in data.get("affected", []):
77
+ pkg = affected.get("package", {})
78
+ ecosystem = pkg.get("ecosystem", "").lower()
79
+ name = pkg.get("name", "")
80
+
81
+ if ecosystem != "npm" or not name:
82
+ continue
83
+
84
+ # Collect explicit versions
85
+ versions = affected.get("versions", [])
86
+
87
+ # Also extract versions from ranges
88
+ for rng in affected.get("ranges", []):
89
+ events = rng.get("events", [])
90
+ for event in events:
91
+ if "introduced" in event and event["introduced"] != "0":
92
+ versions.append(event["introduced"])
93
+
94
+ entry = {
95
+ "source": "ossf",
96
+ "osv_id": osv_id,
97
+ "date": published,
98
+ "summary": summary[:200],
99
+ "attack_type": attack_type,
100
+ }
101
+
102
+ if versions:
103
+ for ver in set(versions):
104
+ yield f"{name}@{ver}", entry
105
+ else:
106
+ # No specific versions — all versions affected
107
+ yield f"{name}@*", entry
108
+
109
+
110
+ def build_index(repo_dir, cache_dir):
111
+ """Build OSSF index from the cloned repo. Returns the index dict."""
112
+ repo_dir = Path(repo_dir)
113
+ cache_dir = Path(cache_dir)
114
+ osv_dir = repo_dir / "osv" / "malicious" / "npm"
115
+
116
+ if not osv_dir.is_dir():
117
+ log.error("OSSF osv/malicious/npm/ not found at %s", osv_dir)
118
+ return {}
119
+
120
+ index = {}
121
+ file_count = 0
122
+ entry_count = 0
123
+
124
+ for root, _dirs, files in os.walk(osv_dir):
125
+ # Skip withdrawn reports
126
+ if "withdrawn" in Path(root).parts:
127
+ continue
128
+
129
+ for fname in files:
130
+ if not fname.endswith(".json"):
131
+ continue
132
+
133
+ filepath = os.path.join(root, fname)
134
+ file_count += 1
135
+
136
+ for key, entry in _parse_osv_file(filepath):
137
+ index[key] = entry
138
+ entry_count += 1
139
+
140
+ log.info("OSSF index: %d entries from %d files", entry_count, file_count)
141
+
142
+ # Cache to disk
143
+ cache_dir.mkdir(parents=True, exist_ok=True)
144
+ cache_path = cache_dir / INDEX_FILENAME
145
+ with open(cache_path, "w", encoding="utf-8") as f:
146
+ json.dump({"built_at": datetime.utcnow().isoformat() + "Z",
147
+ "count": len(index),
148
+ "index": index}, f)
149
+ log.info("OSSF index cached to %s", cache_path)
150
+
151
+ return index
152
+
153
+
154
+ def load_cached_index(cache_dir):
155
+ """Load index from cache if available."""
156
+ cache_path = Path(cache_dir) / INDEX_FILENAME
157
+ if not cache_path.is_file():
158
+ return None
159
+ try:
160
+ with open(cache_path, "r", encoding="utf-8") as f:
161
+ data = json.load(f)
162
+ log.info("Loaded cached OSSF index (%d entries, built %s)",
163
+ data.get("count", 0), data.get("built_at", "?"))
164
+ return data.get("index", {})
165
+ except (json.JSONDecodeError, OSError) as e:
166
+ log.warning("Failed to load OSSF cache: %s", e)
167
+ return None
168
+
169
+
170
+ def lookup(index, name, version):
171
+ """Check if a package@version is in the OSSF index.
172
+
173
+ Returns the entry dict or None. Checks both exact version and wildcard.
174
+ """
175
+ exact = index.get(f"{name}@{version}")
176
+ if exact:
177
+ return exact
178
+ return index.get(f"{name}@*")
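
The indexer above (ossf_index.py) flattens every OSV report into name@version keys. The following sketch shows the record shape it reads and the keys it yields, assuming the module is importable; the record itself is hypothetical and covers only the fields _parse_osv_file() actually accesses.

    import json
    import tempfile

    from ossf_index import _parse_osv_file

    # Hypothetical OSV record; values are illustrative only.
    osv_record = {
        "id": "MAL-0000-example",
        "published": "2024-01-01T00:00:00Z",
        "summary": "Malicious code in example-pkg (npm)",
        "database_specific": {"malicious-packages-origins": [{"reason": "malware"}]},
        "affected": [{
            "package": {"ecosystem": "npm", "name": "example-pkg"},
            "versions": ["1.2.3"],
        }],
    }

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(osv_record, f)

    for key, entry in _parse_osv_file(f.name):
        print(key, entry["osv_id"])   # -> example-pkg@1.2.3 MAL-0000-example
    # A record with no explicit versions would yield "example-pkg@*" instead.
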
@@ -0,0 +1 @@
1
+ requests>=2.28.0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.10.49",
3
+ "version": "2.10.50",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {