muaddib-scanner 2.11.76 → 2.11.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/.githooks/pre-commit +18 -0
  2. package/README.md +15 -6
  3. package/bin/muaddib.js +18 -4
  4. package/package.json +1 -2
  5. package/{self-scan-v2.11.76.json → self-scan-v2.11.78.json} +1 -1
  6. package/src/commands/interactive.js +5 -6
  7. package/src/commands/safe-install.js +19 -19
  8. package/src/ioc/scraper.js +46 -10
  9. package/src/monitor/daemon.js +39 -28
  10. package/src/monitor/ingestion.js +32 -2
  11. package/src/monitor/queue.js +84 -21
  12. package/src/monitor/scan-queue.js +68 -1
  13. package/src/monitor/state.js +24 -1
  14. package/src/monitor/webhook.js +32 -11
  15. package/src/output/formatter.js +3 -4
  16. package/src/pipeline/executor.js +9 -1
  17. package/src/runtime/daemon.js +27 -28
  18. package/src/runtime/watch.js +7 -7
  19. package/src/sandbox/index.js +11 -9
  20. package/src/scanner/temporal-analysis.js +8 -0
  21. package/src/scanner/temporal-ast-diff.js +5 -0
  22. package/src/utils.js +60 -1
  23. package/.dockerignore +0 -7
  24. package/.env.example +0 -43
  25. package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
  26. package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
  27. package/ml-retrain/auto-labeler/labeler.py +0 -256
  28. package/ml-retrain/auto-labeler/npm_checker.py +0 -228
  29. package/ml-retrain/auto-labeler/ossf_index.py +0 -178
  30. package/ml-retrain/auto-labeler/requirements.txt +0 -1
  31. package/ml-retrain/confusion-matrix.png +0 -0
  32. package/ml-retrain/model-trees-retrained.js +0 -12
  33. package/ml-retrain/retrain-report.json +0 -225
  34. package/ml-retrain/retrain.py +0 -974
  35. package/sbom.json +0 -0
  36. package/src/ml/train-bundler-detector.py +0 -725
  37. package/src/ml/train-xgboost.py +0 -957
  38. package/tools/export-model-js.py +0 -160
  39. package/tools/requirements-ml.txt +0 -5
  40. package/tools/train-classifier.py +0 -333
@@ -121,6 +121,14 @@ function _fetchPackageMetadataHttp(packageName) {
121
121
  return;
122
122
  }
123
123
 
124
+ if (res.statusCode === 429) {
125
+ res.resume();
126
+ // Coordinated backoff on the shared registry limiter — the temporal scanners must
127
+ // signal 429 like the metadata path, not hammer through a rate limit (CLAUDE.md storm).
128
+ try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
129
+ reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}`));
130
+ return;
131
+ }
124
132
  if (res.statusCode < 200 || res.statusCode >= 300) {
125
133
  res.resume();
126
134
  reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}`));
@@ -71,6 +71,11 @@ function _fetchVersionMetadataHttp(packageName, version) {
71
71
  res.resume();
72
72
  return reject(new Error(`Version ${version} not found for package ${packageName}`));
73
73
  }
74
+ if (res.statusCode === 429) {
75
+ res.resume();
76
+ try { require('../shared/http-limiter.js').signal429(); } catch { /* limiter best-effort */ }
77
+ return reject(new Error(`Registry rate limited (HTTP 429) for ${packageName}@${version}`));
78
+ }
74
79
  if (res.statusCode < 200 || res.statusCode >= 300) {
75
80
  res.resume();
76
81
  return reject(new Error(`Registry returned HTTP ${res.statusCode} for ${packageName}@${version}`));
package/src/utils.js CHANGED
@@ -392,6 +392,62 @@ function debugLog(...args) {
392
392
  if (process.env.MUADDIB_DEBUG) console.error('[DEBUG]', ...args);
393
393
  }
394
394
 
395
// eslint-disable-next-line no-control-regex -- ESC (\x1b) is required to strip ANSI color sequences before measuring visible width
const _ANSI_RE = /\x1b\[[0-9;]*m/g;

/**
 * Draws an aligned box-drawing banner around the given content line(s).
 * Width auto-fits the widest VISIBLE line (ANSI color sequences are stripped
 * before measuring), so the right border is always straight. Returns the
 * banner as a string with no leading or trailing blank lines — callers add
 * spacing as needed.
 *
 * Entries that contain embedded line breaks ("\n" or "\r\n") are split into
 * separate rows; otherwise the break would land inside the box and tear the
 * right border. A trailing line break therefore yields an empty row.
 *
 * @param {string[]|string} lines - content line(s); non-string entries are stringified
 * @returns {string} the banner, rows joined with "\n"
 */
function banner(lines) {
  const PAD = 1; // spaces of padding on each side
  const visibleLen = (s) => s.replace(_ANSI_RE, '').length;

  // Normalize to a flat list of single-row strings before measuring anything.
  const rows = (Array.isArray(lines) ? lines : [lines])
    .flatMap((entry) => String(entry).split(/\r?\n/));

  const widest = rows.reduce((max, row) => Math.max(max, visibleLen(row)), 0);
  const inner = widest + PAD * 2;

  const top = '╔' + '═'.repeat(inner) + '╗';
  const bottom = '╚' + '═'.repeat(inner) + '╝';
  const body = rows.map((row) => {
    // Pad by visible width, not raw length, so colored rows line up with plain ones.
    const trailing = inner - PAD - visibleLen(row);
    return '║' + ' '.repeat(PAD) + row + ' '.repeat(Math.max(0, trailing)) + '║';
  });

  return [top, ...body, bottom].join('\n');
}
421
+
422
/**
 * Tells whether an error stands for the user cancelling an interactive prompt
 * (Ctrl-C inside an @inquirer/prompts prompt). Such an error is recognised
 * either by its name, `ExitPromptError`, or by a message mentioning SIGINT /
 * "force closed the prompt" (matched case-insensitively). Callers use this to
 * exit quietly (code 130) rather than print an [ERROR] line or a stack trace.
 *
 * @param {*} err - the caught value; falsy values are never a cancellation
 * @returns {boolean} true when the error looks like a prompt cancellation
 */
function isPromptCancellation(err) {
  if (!err) {
    return false;
  }
  const cancelMessage = /SIGINT|force closed the prompt/i;
  // Name check first; the message is only consulted when the name does not match.
  return err.name === 'ExitPromptError' || cancelMessage.test(err.message || '');
}
436
+
437
/**
 * Builds a fixed-width, 20-cell bar of filled (█) and empty (░) cells for a
 * risk score on a 0–100 scale; every 5 points fill one cell. Scores outside
 * the range are clamped, and anything that is not a finite number (undefined,
 * NaN, Infinity, strings) is treated as 0, so String.prototype.repeat is never
 * handed a negative or non-finite count.
 *
 * @param {number} score - risk score, nominally 0–100
 * @returns {string} a 20-character bar
 */
function renderScoreBar(score) {
  const CELLS = 20;
  let clamped = 0;
  if (Number.isFinite(score)) {
    clamped = Math.min(100, Math.max(0, score));
  }
  const filledCells = Math.floor(clamped / 5);
  return `${'█'.repeat(filledCells)}${'░'.repeat(CELLS - filledCells)}`;
}
450
+
395
451
  module.exports = {
396
452
  EXCLUDED_DIRS,
397
453
  MAX_SCAN_FILES,
@@ -409,5 +465,8 @@ module.exports = {
409
465
  getExtraExcludes,
410
466
  forEachSafeFile,
411
467
  listInstalledPackages,
412
- debugLog
468
+ debugLog,
469
+ banner,
470
+ isPromptCancellation,
471
+ renderScoreBar
413
472
  };
package/.dockerignore DELETED
@@ -1,7 +0,0 @@
1
- node_modules
2
- .git
3
- datasets
4
- tests
5
- metrics
6
- .muaddib-cache
7
- *.md
package/.env.example DELETED
@@ -1,43 +0,0 @@
1
- # MUAD'DIB environment variables — template
2
- # Copy to .env (local dev) or /opt/muaddib/.env (VPS) and fill in real values.
3
- # .env files are gitignored. NEVER commit a real token.
4
-
5
- # ----------------------------------------------------------------------------
6
- # Threat-feed API tokens (all OPTIONAL — scrapers degrade gracefully if absent)
7
- # ----------------------------------------------------------------------------
8
-
9
- # OpenSourceMalware.com — community-verified threat intel
10
- # Free tier: 60 req/min, /query-latest gives 100 most recent threats per ecosystem.
11
- # Sign up + generate at: https://opensourcemalware.com/auth → profile → API Tokens
12
- # Format: osm_<random-32+chars>
13
- # Used by: src/ioc/scraper.js → scrapeOSMQueryLatest()
14
- OSM_API_TOKEN=
15
-
16
- # ----------------------------------------------------------------------------
17
- # Webhook destinations (optional — monitor alerts)
18
- # ----------------------------------------------------------------------------
19
-
20
- # Discord webhook for monitor alerts (P1/P2/P3 triage)
21
- # DISCORD_WEBHOOK_URL=
22
-
23
- # ----------------------------------------------------------------------------
24
- # FPR plan gates — DEFAULT ON since v2.11.9 (no need to set these unless opting OUT)
25
- # ----------------------------------------------------------------------------
26
- # Measured impact on the v2.11.4 evaluation corpus (1054 packages):
27
- # FPR curated 15.6% -> 9.36% (-6.24 pp), FPR random 7.0% -> 2.0% (-5.00 pp).
28
- # TPR@3 / TPR@20 / ADR strictly unchanged.
29
- #
30
- # Opt-OUT individual gates (uncomment + set to 0):
31
- # MUADDIB_FN_REACHABILITY=0 # function-level reachability gating
32
- # MUADDIB_DECAY=0 # group score decay on bundled outputs
33
- # MUADDIB_MATURE_CAP=0 # cap mature, well-trafficked packages at MEDIUM
34
- # MUADDIB_METADATA_FACTOR=0 # registry signals -> reputation multiplier
35
- # MUADDIB_DELTA_MODE=0 # delta scoring against prior versions
36
- #
37
- # Skip ALL network fetches (npm registry packument + GitHub Releases IOC
38
- # bootstrap) in one shot. Disables MATURE_CAP + METADATA_FACTOR + DELTA_MODE
39
- # at the per-scan level AND the first-run IOC database download. Useful for:
40
- # - air-gap / offline CI environments
41
- # - test runners (set automatically by tests/run-tests.js)
42
- # - perf-critical batch scans where you've pre-warmed the IOC cache
43
- # MUADDIB_NO_REGISTRY_FETCH=1
@@ -1,312 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- MUAD'DIB Auto-Labeling Pipeline
4
-
5
- Correlates muaddib suspects with external signals (OSSF, GHSA, npm status)
6
- to produce ground truth labels for ML training.
7
-
8
- Usage:
9
- python auto_labeler.py --full # Run all steps
10
- python auto_labeler.py --step ossf # Run individual step
11
- python auto_labeler.py --step npm
12
- python auto_labeler.py --step ghsa
13
- python auto_labeler.py --step label
14
- python auto_labeler.py --update # Cron mode: re-check pending/unconfirmed
15
-
16
- Environment:
17
- GITHUB_TOKEN Optional, for higher GHSA API rate limits
18
- MUADDIB_DATA Override data directory (default: /opt/muaddib/data)
19
- """
20
-
21
- import argparse
22
- import json
23
- import logging
24
- import os
25
- import sys
26
- from datetime import datetime, timezone
27
- from pathlib import Path
28
-
29
- import ossf_index
30
- import ghsa_checker
31
- import npm_checker
32
- import labeler
33
-
34
- # ── Paths ──
35
- MUADDIB_DATA = Path(os.environ.get("MUADDIB_DATA", "/opt/muaddib/data"))
36
- MUADDIB_ALERTS = Path(os.environ.get("MUADDIB_ALERTS", "/opt/muaddib/logs/alerts"))
37
- BASE_DIR = Path(__file__).parent
38
- CACHE_DIR = BASE_DIR / "data"
39
- OSSF_REPO_DIR = CACHE_DIR / "ossf-malicious-packages"
40
- OUTPUT_PATH = MUADDIB_DATA / "auto-labels.json"
41
-
42
- log = logging.getLogger("auto-labeler")
43
-
44
-
45
- def setup_logging(verbose=False):
46
- level = logging.DEBUG if verbose else logging.INFO
47
- fmt = "%(asctime)s [%(name)s] %(levelname)s %(message)s"
48
- logging.basicConfig(level=level, format=fmt, datefmt="%Y-%m-%d %H:%M:%S")
49
-
50
-
51
- def load_detections():
52
- """Load detections.json from muaddib data directory."""
53
- path = MUADDIB_DATA / "detections.json"
54
- if not path.is_file():
55
- log.error("detections.json not found at %s", path)
56
- sys.exit(1)
57
-
58
- with open(path, "r", encoding="utf-8") as f:
59
- data = json.load(f)
60
-
61
- detections = data.get("detections", [])
62
- npm_count = sum(1 for d in detections if d.get("ecosystem") == "npm")
63
- log.info("Loaded %d detections (%d npm)", len(detections), npm_count)
64
- return detections
65
-
66
-
67
- def load_alert_scores():
68
- """Load risk scores and tiers from individual alert files.
69
-
70
- Scans logs/alerts/ for JSON files and extracts score + tier info.
71
- Returns dict keyed by "name@version".
72
- """
73
- scores = {}
74
-
75
- # Try cached scores first
76
- cache_path = CACHE_DIR / "alert-scores-cache.json"
77
- if cache_path.is_file():
78
- try:
79
- with open(cache_path, "r", encoding="utf-8") as f:
80
- cached = json.load(f)
81
- if cached.get("count", 0) > 0:
82
- log.info("Loaded %d cached alert scores", cached["count"])
83
- return cached.get("scores", {})
84
- except (json.JSONDecodeError, OSError):
85
- pass
86
-
87
- if not MUADDIB_ALERTS.is_dir():
88
- log.warning("Alerts directory not found at %s — scores will be estimated from severity",
89
- MUADDIB_ALERTS)
90
- return scores
91
-
92
- alert_files = list(MUADDIB_ALERTS.glob("*.json"))
93
- log.info("Scanning %d alert files for scores...", len(alert_files))
94
-
95
- for filepath in alert_files:
96
- try:
97
- with open(filepath, "r", encoding="utf-8") as f:
98
- alert = json.load(f)
99
-
100
- target = alert.get("target", "")
101
- summary = alert.get("summary", {})
102
- score = summary.get("riskScore", summary.get("globalRiskScore", 0))
103
-
104
- # Parse target: "npm/package-name@version" or "pypi/package@version"
105
- if "/" in target and "@" in target:
106
- eco_pkg = target.split("/", 1)
107
- if len(eco_pkg) == 2:
108
- pkg_ver = eco_pkg[1]
109
- # Determine tier from priority
110
- priority = alert.get("priority", {})
111
- tier = ""
112
- p_level = priority.get("level", "")
113
- if p_level == "P1":
114
- tier = "T1a"
115
- elif p_level == "P2":
116
- tier = "T1b"
117
- elif p_level == "P3":
118
- tier = "T2"
119
-
120
- scores[pkg_ver] = {"score": score, "tier": tier}
121
-
122
- except (json.JSONDecodeError, OSError):
123
- continue
124
-
125
- # Cache for next run
126
- CACHE_DIR.mkdir(parents=True, exist_ok=True)
127
- with open(cache_path, "w", encoding="utf-8") as f:
128
- json.dump({"count": len(scores), "built_at": datetime.now(timezone.utc).isoformat(),
129
- "scores": scores}, f)
130
-
131
- log.info("Extracted scores from %d alerts", len(scores))
132
- return scores
133
-
134
-
135
- # ── Steps ──
136
-
137
- def step_ossf():
138
- """Step 1: Index OSSF malicious-packages."""
139
- log.info("=== Step 1: OSSF Index ===")
140
- ossf_index.clone_or_update(OSSF_REPO_DIR)
141
- index = ossf_index.build_index(OSSF_REPO_DIR, CACHE_DIR)
142
- return index
143
-
144
-
145
- def step_ghsa():
146
- """Step 3: Index GitHub Advisory Database."""
147
- log.info("=== Step 3: GHSA Index ===")
148
- # Try cache first
149
- index = ghsa_checker.load_cached_index(CACHE_DIR)
150
- if index is not None:
151
- return index
152
- return ghsa_checker.build_index(CACHE_DIR)
153
-
154
-
155
- def step_npm(detections):
156
- """Step 2: Check npm status for suspects."""
157
- log.info("=== Step 2: npm Status Check ===")
158
- return npm_checker.check_suspects(detections, CACHE_DIR)
159
-
160
-
161
- def step_label(detections, o_index, g_index, npm_status, alert_scores):
162
- """Step 4: Generate labels."""
163
- log.info("=== Step 4: Generate Labels ===")
164
-
165
- labels = labeler.label_suspects(detections, o_index, g_index, npm_status, alert_scores)
166
- missed = labeler.find_missed(o_index, g_index, detections)
167
- summary = labeler.export_labels(labels, missed, OUTPUT_PATH)
168
-
169
- return summary
170
-
171
-
172
- # ── Modes ──
173
-
174
- def run_full():
175
- """Run all steps sequentially."""
176
- log.info("Starting full auto-labeling pipeline")
177
- start = datetime.now()
178
-
179
- detections = load_detections()
180
- alert_scores = load_alert_scores()
181
-
182
- # Steps 1+3 don't depend on detections — could be parallel but keep it simple
183
- o_index = step_ossf()
184
- g_index = step_ghsa()
185
- npm_status = step_npm(detections)
186
-
187
- summary = step_label(detections, o_index, g_index, npm_status, alert_scores)
188
-
189
- elapsed = (datetime.now() - start).total_seconds()
190
- log.info("Pipeline complete in %.1fs — %s", elapsed, summary)
191
- return summary
192
-
193
-
194
- def run_update():
195
- """Cron mode: re-check pending/unconfirmed labels against fresh external data."""
196
- log.info("Starting update (cron mode)")
197
-
198
- # Refresh external indices
199
- ossf_index.clone_or_update(OSSF_REPO_DIR)
200
- o_index = ossf_index.build_index(OSSF_REPO_DIR, CACHE_DIR)
201
- g_index = ghsa_checker.build_index(CACHE_DIR)
202
-
203
- # Load existing labels
204
- if not OUTPUT_PATH.is_file():
205
- log.error("No existing auto-labels.json — run --full first")
206
- sys.exit(1)
207
-
208
- with open(OUTPUT_PATH, "r", encoding="utf-8") as f:
209
- existing = json.load(f)
210
-
211
- existing_labels = existing.get("labels", {})
212
- detections = load_detections()
213
- alert_scores = load_alert_scores()
214
-
215
- # Find labels that need re-evaluation
216
- to_recheck = []
217
- for key, entry in existing_labels.items():
218
- lbl = entry.get("auto_label")
219
- if lbl in ("pending", "unconfirmed", "likely_malicious"):
220
- # Re-extract detection info
221
- for det in detections:
222
- if f"{det['package']}@{det['version']}" == key:
223
- to_recheck.append(det)
224
- break
225
-
226
- if not to_recheck:
227
- log.info("No pending/unconfirmed labels to re-check")
228
- return
229
-
230
- log.info("Re-checking %d labels (pending/unconfirmed/likely_malicious)", len(to_recheck))
231
-
232
- # Re-check npm status for these specific packages
233
- npm_status = npm_checker.check_suspects(to_recheck, CACHE_DIR)
234
-
235
- # Re-label
236
- updated = labeler.label_suspects(to_recheck, o_index, g_index, npm_status, alert_scores)
237
-
238
- # Merge updates into existing labels
239
- changes = 0
240
- for key, new_entry in updated.items():
241
- old = existing_labels.get(key, {})
242
- if old.get("auto_label") != new_entry.get("auto_label"):
243
- log.info("RELABEL: %s — %s → %s",
244
- key, old.get("auto_label"), new_entry.get("auto_label"))
245
- changes += 1
246
- existing_labels[key] = new_entry
247
-
248
- # Also refresh missed detection
249
- missed = labeler.find_missed(o_index, g_index, detections)
250
- for name, info in missed.items():
251
- mk = f"{name}@*"
252
- if mk not in existing_labels:
253
- existing_labels[mk] = info
254
- changes += 1
255
-
256
- # Re-export
257
- labeler.export_labels(
258
- {k: v for k, v in existing_labels.items() if v.get("auto_label") != "missed"},
259
- {k.replace("@*", ""): v for k, v in existing_labels.items() if v.get("auto_label") == "missed"},
260
- OUTPUT_PATH,
261
- )
262
-
263
- log.info("Update complete: %d labels changed", changes)
264
-
265
-
266
- def main():
267
- parser = argparse.ArgumentParser(description="MUAD'DIB Auto-Labeling Pipeline")
268
- group = parser.add_mutually_exclusive_group(required=True)
269
- group.add_argument("--full", action="store_true", help="Run all steps")
270
- group.add_argument("--step", choices=["ossf", "ghsa", "npm", "label"],
271
- help="Run individual step")
272
- group.add_argument("--update", action="store_true",
273
- help="Cron: re-check pending/unconfirmed")
274
- parser.add_argument("-v", "--verbose", action="store_true", help="Debug logging")
275
- parser.add_argument("--data-dir", help="Override MUADDIB_DATA path")
276
- parser.add_argument("--alerts-dir", help="Override MUADDIB_ALERTS path")
277
- args = parser.parse_args()
278
-
279
- setup_logging(args.verbose)
280
-
281
- if args.data_dir:
282
- global MUADDIB_DATA, OUTPUT_PATH
283
- MUADDIB_DATA = Path(args.data_dir)
284
- OUTPUT_PATH = MUADDIB_DATA / "auto-labels.json"
285
- if args.alerts_dir:
286
- global MUADDIB_ALERTS
287
- MUADDIB_ALERTS = Path(args.alerts_dir)
288
-
289
- if args.full:
290
- run_full()
291
- elif args.update:
292
- run_update()
293
- elif args.step == "ossf":
294
- step_ossf()
295
- elif args.step == "ghsa":
296
- step_ghsa()
297
- elif args.step == "npm":
298
- step_npm(load_detections())
299
- elif args.step == "label":
300
- detections = load_detections()
301
- alert_scores = load_alert_scores()
302
- o_index = ossf_index.load_cached_index(CACHE_DIR)
303
- g_index = ghsa_checker.load_cached_index(CACHE_DIR)
304
- npm_status = npm_checker._load_cache(CACHE_DIR / npm_checker.CACHE_FILENAME)
305
- if o_index is None or g_index is None:
306
- log.error("Run --step ossf and --step ghsa first (or use --full)")
307
- sys.exit(1)
308
- step_label(detections, o_index, g_index, npm_status, alert_scores)
309
-
310
-
311
- if __name__ == "__main__":
312
- main()
@@ -1,169 +0,0 @@
1
- """
2
- GitHub Advisory Database checker.
3
-
4
- Fetches all npm malware advisories from the GitHub Advisory Database API.
5
- Supports optional GITHUB_TOKEN env var for higher rate limits.
6
- """
7
-
8
- import json
9
- import logging
10
- import os
11
- import time
12
- from datetime import datetime
13
- from pathlib import Path
14
-
15
- import requests
16
-
17
- log = logging.getLogger("auto-labeler.ghsa")
18
-
19
- GHSA_API = "https://api.github.com/advisories"
20
- INDEX_FILENAME = "ghsa-index.json"
21
- # Cache validity: 12 hours
22
- CACHE_TTL_SECONDS = 12 * 3600
23
-
24
-
25
- def _get_headers():
26
- token = os.environ.get("GITHUB_TOKEN")
27
- headers = {"Accept": "application/vnd.github+json"}
28
- if token:
29
- headers["Authorization"] = f"Bearer {token}"
30
- log.info("Using GITHUB_TOKEN for GHSA API (5000 req/h)")
31
- else:
32
- log.info("No GITHUB_TOKEN — GHSA API limited to 60 req/h")
33
- return headers
34
-
35
-
36
- def fetch_malware_advisories():
37
- """Fetch all npm malware advisories from GHSA. Returns list of advisories."""
38
- headers = _get_headers()
39
- advisories = []
40
- page = 1
41
- per_page = 100
42
-
43
- while True:
44
- params = {
45
- "type": "malware",
46
- "ecosystem": "npm",
47
- "per_page": per_page,
48
- "page": page,
49
- }
50
-
51
- for attempt in range(3):
52
- try:
53
- resp = requests.get(GHSA_API, headers=headers, params=params, timeout=30)
54
-
55
- if resp.status_code == 403:
56
- # Rate limited
57
- retry_after = int(resp.headers.get("Retry-After", 60))
58
- log.warning("GHSA rate limited, waiting %ds", retry_after)
59
- time.sleep(retry_after)
60
- continue
61
-
62
- resp.raise_for_status()
63
- break
64
- except requests.RequestException as e:
65
- wait = 2 ** attempt * 5
66
- log.warning("GHSA request failed (attempt %d): %s — retrying in %ds",
67
- attempt + 1, e, wait)
68
- time.sleep(wait)
69
- else:
70
- log.error("GHSA fetch failed after 3 attempts on page %d", page)
71
- break
72
-
73
- batch = resp.json()
74
- if not batch:
75
- break
76
-
77
- advisories.extend(batch)
78
- log.info("GHSA page %d: %d advisories (total: %d)", page, len(batch), len(advisories))
79
-
80
- if len(batch) < per_page:
81
- break
82
- page += 1
83
- time.sleep(1) # Courtesy delay
84
-
85
- return advisories
86
-
87
-
88
- def build_index(cache_dir):
89
- """Build GHSA index from API. Returns dict keyed by package name."""
90
- advisories = fetch_malware_advisories()
91
- index = {}
92
-
93
- for adv in advisories:
94
- ghsa_id = adv.get("ghsa_id", "")
95
- published = adv.get("published_at", "")
96
- summary = adv.get("summary", "")
97
- withdrawn = adv.get("withdrawn_at")
98
-
99
- # Skip withdrawn advisories
100
- if withdrawn:
101
- continue
102
-
103
- for vuln in adv.get("vulnerabilities", []):
104
- pkg = vuln.get("package", {})
105
- ecosystem = pkg.get("ecosystem", "").lower()
106
- name = pkg.get("name", "")
107
-
108
- if ecosystem != "npm" or not name:
109
- continue
110
-
111
- version_range = vuln.get("vulnerable_version_range", "")
112
-
113
- entry = {
114
- "source": "ghsa",
115
- "ghsa_id": ghsa_id,
116
- "date": published,
117
- "summary": summary[:200],
118
- "version_range": version_range,
119
- }
120
-
121
- # Index by package name (version matching is approximate for GHSA)
122
- if name not in index:
123
- index[name] = []
124
- index[name].append(entry)
125
-
126
- log.info("GHSA index: %d packages from %d advisories", len(index), len(advisories))
127
-
128
- # Cache to disk
129
- cache_dir = Path(cache_dir)
130
- cache_dir.mkdir(parents=True, exist_ok=True)
131
- cache_path = cache_dir / INDEX_FILENAME
132
- with open(cache_path, "w", encoding="utf-8") as f:
133
- json.dump({"built_at": datetime.utcnow().isoformat() + "Z",
134
- "count": len(index),
135
- "index": index}, f)
136
- log.info("GHSA index cached to %s", cache_path)
137
-
138
- return index
139
-
140
-
141
- def load_cached_index(cache_dir):
142
- """Load index from cache if fresh enough."""
143
- cache_path = Path(cache_dir) / INDEX_FILENAME
144
- if not cache_path.is_file():
145
- return None
146
- try:
147
- stat = cache_path.stat()
148
- age = time.time() - stat.st_mtime
149
- if age > CACHE_TTL_SECONDS:
150
- log.info("GHSA cache expired (%.1fh old)", age / 3600)
151
- return None
152
-
153
- with open(cache_path, "r", encoding="utf-8") as f:
154
- data = json.load(f)
155
- log.info("Loaded cached GHSA index (%d packages, built %s)",
156
- data.get("count", 0), data.get("built_at", "?"))
157
- return data.get("index", {})
158
- except (json.JSONDecodeError, OSError) as e:
159
- log.warning("Failed to load GHSA cache: %s", e)
160
- return None
161
-
162
-
163
- def lookup(index, name):
164
- """Check if a package name is in the GHSA index.
165
-
166
- Returns the list of advisory entries or None.
167
- """
168
- entries = index.get(name)
169
- return entries if entries else None