npm - muaddib-scanner - Versions diffs - 2.11.76 → 2.11.77 - Mend

muaddib-scanner 2.11.76 → 2.11.77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/.githooks/pre-commit +18 -0
package/README.md +15 -6
package/package.json +1 -2
package/{self-scan-v2.11.76.json → self-scan-v2.11.77.json} +1 -1
package/src/commands/safe-install.js +8 -3
package/src/monitor/daemon.js +34 -22
package/src/monitor/ingestion.js +32 -2
package/src/monitor/queue.js +84 -21
package/src/monitor/scan-queue.js +68 -1
package/src/monitor/state.js +24 -1
package/src/monitor/webhook.js +32 -11
package/src/scanner/temporal-analysis.js +8 -0
package/src/scanner/temporal-ast-diff.js +5 -0
package/.dockerignore +0 -7
package/.env.example +0 -43
package/ml-retrain/auto-labeler/auto_labeler.py +0 -312
package/ml-retrain/auto-labeler/ghsa_checker.py +0 -169
package/ml-retrain/auto-labeler/labeler.py +0 -256
package/ml-retrain/auto-labeler/npm_checker.py +0 -228
package/ml-retrain/auto-labeler/ossf_index.py +0 -178
package/ml-retrain/auto-labeler/requirements.txt +0 -1
package/ml-retrain/confusion-matrix.png +0 -0
package/ml-retrain/model-trees-retrained.js +0 -12
package/ml-retrain/retrain-report.json +0 -225
package/ml-retrain/retrain.py +0 -974
package/sbom.json +0 -0
package/src/ml/train-bundler-detector.py +0 -725
package/src/ml/train-xgboost.py +0 -957
package/tools/export-model-js.py +0 -160
package/tools/requirements-ml.txt +0 -5
package/tools/train-classifier.py +0 -333

package/ml-retrain/auto-labeler/labeler.py DELETED Viewed

@@ -1,256 +0,0 @@
-"""
-Label generation engine.
-Correlates signals from OSSF, GHSA, and npm status to produce labels.
-Label tiers (by confidence):
-- confirmed_malicious: authoritative source (ossf/ghsa) OR npm takedown pattern
-- likely_malicious:     npm_removed + high muaddib score, but no authoritative confirmation
-- unconfirmed:          suspect in muaddib, still on npm, no external signal, >7 days old
-- pending:              suspect in muaddib, still on npm, no external signal, <7 days old
-- missed:               clean in muaddib BUT flagged by ossf/ghsa (false negative)
-"""
-import json
-import logging
-from datetime import datetime, timezone
-from pathlib import Path
-from ossf_index import lookup as ossf_lookup
-from ghsa_checker import lookup as ghsa_lookup
-from npm_checker import is_quick_takedown
-log = logging.getLogger("auto-labeler.labeler")
-# Thresholds
-SCORE_THRESHOLD_CONFIRMED = 50  # Minimum muaddib score for npm_removed → confirmed
-PENDING_DAYS = 7  # Days before pending → unconfirmed
-MAX_CONFIRMATION_AGE_DAYS = 30  # npm_removed only confirms if detection is recent
def _parse_iso(s):
    """Parse an ISO 8601 date string into a timezone-aware datetime.

    Args:
        s: ISO 8601 date string; a "Z" suffix is accepted as UTC.

    Returns:
        A timezone-aware ``datetime``, or ``None`` when ``s`` is empty, is
        not a string, or cannot be parsed. A value without a UTC offset is
        assumed to be UTC.
    """
    if not s or not isinstance(s, str):
        return None
    try:
        # fromisoformat() only understands a literal "Z" from Python 3.11 on,
        # so rewrite it as an explicit offset first.
        parsed = datetime.fromisoformat(s.replace("Z", "+00:00"))
    except (ValueError, TypeError):
        return None
    if parsed.tzinfo is None:
        # The result is subtracted from an aware "now" elsewhere; a naive
        # value would raise TypeError there, so pin it to UTC.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
def _days_since(iso_str):
    """Return the age, in fractional days, of an ISO 8601 timestamp.

    Args:
        iso_str: ISO 8601 date string, parsed with ``_parse_iso``.

    Returns:
        Days elapsed between the timestamp and the current UTC time as a
        float (negative when the timestamp lies in the future), or ``None``
        when the string is empty or cannot be parsed.
    """
    dt = _parse_iso(iso_str)
    if dt is None:
        return None
    if dt.tzinfo is None:
        # A timestamp without an offset cannot be subtracted from the aware
        # "now" below (TypeError); treat it as UTC instead of crashing.
        dt = dt.replace(tzinfo=timezone.utc)
    delta = datetime.now(timezone.utc) - dt
    return delta.total_seconds() / 86400
def _severity_to_score_estimate(severity):
    """Map a severity label to a coarse stand-in score.

    Used when no exact score is available. Any label other than CRITICAL,
    HIGH, MEDIUM or LOW maps to 0.
    """
    estimates = {"CRITICAL": 70, "HIGH": 40, "MEDIUM": 15, "LOW": 5}
    try:
        return estimates[severity]
    except KeyError:
        return 0
def label_suspects(detections, ossf_index, ghsa_index, npm_status, alert_scores):
    """Generate labels for all suspect detections.

    Only npm detections are labelled; a detection without an "ecosystem"
    key is treated as npm.

    Args:
        detections: list of detection dicts from detections.json
        ossf_index: dict from ossf_index.build_index()
        ghsa_index: dict from ghsa_checker.build_index()
        npm_status: dict from npm_checker.check_suspects()
        alert_scores: dict keyed by "name@version" with
            {"score": N, "tier": "T1a"} from alerts

    Returns:
        dict keyed by "name@version" with label info
    """
    labels = {}
    stats = {"confirmed_malicious": 0, "likely_malicious": 0,
             "unconfirmed": 0, "pending": 0}
    for det in detections:
        # Skip non-npm for now (OSSF/GHSA npm-focused)
        if det.get("ecosystem", "npm") != "npm":
            continue
        name = det["package"]
        version = det["version"]
        key = f"{name}@{version}"
        detection_date = det.get("first_seen_at", "")
        severity = det.get("severity", "UNKNOWN")
        findings = det.get("findings", [])

        # Gather signals
        signals = []
        # Signal 1: OSSF
        ossf_hit = ossf_lookup(ossf_index, name, version)
        if ossf_hit:
            signals.append("ossf")
        # Signal 2: GHSA
        ghsa_hit = ghsa_lookup(ghsa_index, name)
        if ghsa_hit:
            signals.append("ghsa")
        # Signal 3: npm status ("or {}" tolerates a null cache entry)
        npm_result = npm_status.get(key) or {}
        if npm_result.get("status") == "npm_removed":
            signals.append("npm_removed")

        # Get score from alerts or estimate from severity. An explicit
        # null score must also fall back: _classify() compares the score
        # numerically and would raise TypeError on None.
        score_info = alert_scores.get(key) or {}
        score = score_info.get("score")
        if score is None:
            score = _severity_to_score_estimate(severity)
        tier = score_info.get("tier", "")

        # Determine label
        label = _classify(signals, npm_result, detection_date, score)
        stats[label] += 1
        labels[key] = {
            "muaddib_label": "suspect",
            "auto_label": label,
            "signals": signals,
            "muaddib_score": score,
            "muaddib_tier": tier,
            "muaddib_severity": severity,
            "muaddib_findings": findings,
            "detection_date": detection_date,
            "label_date": datetime.now(timezone.utc).isoformat(),
            "npm_status": npm_result.get("status", "unknown"),
            "npm_publish_date": npm_result.get("publish_date"),
        }
        if ossf_hit:
            labels[key]["ossf_id"] = ossf_hit.get("osv_id")
        if ghsa_hit:
            labels[key]["ghsa_id"] = ghsa_hit[0].get("ghsa_id")
        log.debug("LABEL %s → %s (signals=%s, score=%d)", key, label, signals, score)
    log.info("Suspect labels: %s", stats)
    return labels
def _classify(signals, npm_result, detection_date, score):
    """Turn the gathered signals into one label.

    Returns "confirmed_malicious", "likely_malicious", "unconfirmed" or
    "pending"; the tiers below are tried in order.
    """
    # Tier 1: an authoritative source (OSSF or GHSA) lists the package.
    if "ossf" in signals or "ghsa" in signals:
        return "confirmed_malicious"

    if "npm_removed" in signals:
        # Tier 2: npm takedown pattern (removed + high score + quick removal).
        # Temporal leakage guard (ANSSI audit M5): a removal only confirms a
        # recent detection, because a package removed long after detection
        # may have been removed for unrelated reasons. is_quick_takedown
        # additionally validates detection_date against publish_date.
        if score >= SCORE_THRESHOLD_CONFIRMED:
            age_days = _days_since(detection_date)
            is_recent = age_days is not None and age_days <= MAX_CONFIRMATION_AGE_DAYS
            if is_recent and is_quick_takedown(npm_result, detection_date, threshold_hours=72):
                return "confirmed_malicious"
        # Tier 3: removed, but the confirmation criteria are not met.
        return "likely_malicious"

    # Tier 4: still on npm and no external signal; the age decides.
    age_days = _days_since(detection_date)
    if age_days is not None and age_days > PENDING_DAYS:
        return "unconfirmed"
    return "pending"
def find_missed(ossf_index, ghsa_index, detections):
    """Find packages in OSSF/GHSA that muaddib did NOT detect (false negatives).

    Args:
        ossf_index: dict keyed by "name@version" or "name@*".
        ghsa_index: dict keyed by package name; each value is a list of
            advisory dicts.
        detections: list of detection dicts with a "package" key.

    Returns:
        dict keyed by package name with miss details. A package present in
        both sources carries both signals and its "ghsa_id".
    """
    # A detection without an "ecosystem" key counts as npm, the same default
    # label_suspects() applies; otherwise a package labelled as a suspect
    # would also be reported here as missed.
    detected_names = {
        det["package"]
        for det in detections
        if det.get("ecosystem", "npm") == "npm"
    }
    label_date = datetime.now(timezone.utc).isoformat()
    missed = {}

    # Check OSSF index. Keys end in "@<version>", so split on the last "@"
    # to keep the leading "@" of scoped package names.
    ossf_packages = {key.rsplit("@", 1)[0] for key in ossf_index}
    for name in sorted(ossf_packages):  # sorted: stable output and log order
        if name not in detected_names:
            missed[name] = {
                "auto_label": "missed",
                "muaddib_label": "clean",
                "signals": ["ossf"],
                "source_detail": "In ossf/malicious-packages but not in muaddib detections",
                "label_date": label_date,
            }

    # Check GHSA index
    for name, entries in ghsa_index.items():
        if name in detected_names:
            continue
        ghsa_id = entries[0].get("ghsa_id") if entries else None
        existing = missed.get(name)
        if existing:
            existing["signals"].append("ghsa")
            # Keep the advisory id even when OSSF already listed the package.
            existing["ghsa_id"] = ghsa_id
        else:
            missed[name] = {
                "auto_label": "missed",
                "muaddib_label": "clean",
                "signals": ["ghsa"],
                "ghsa_id": ghsa_id,
                "source_detail": "In GHSA malware advisories but not in muaddib detections",
                "label_date": label_date,
            }

    log.info("Missed packages (false negatives): %d", len(missed))
    # Log the first 20 as these are critical for improving the scanner
    for name, info in list(missed.items())[:20]:
        log.warning("MISSED: %s (signals=%s)", name, info["signals"])
    return missed
def export_labels(labels, missed, output_path):
    """Write suspect and missed labels to one JSON file (auto-labels.json).

    Missed packages are stored under the key "<name>@*". Parent directories
    of ``output_path`` are created when needed.

    Returns:
        dict counting the entries per known label.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Suspects first, then the missed packages under their wildcard key.
    combined = {
        **labels,
        **{f"{name}@*": info for name, info in missed.items()},
    }

    # Count per label; labels outside this fixed set are not counted.
    summary = dict.fromkeys(
        ("confirmed_malicious", "likely_malicious",
         "unconfirmed", "pending", "missed"),
        0,
    )
    for entry in combined.values():
        tag = entry.get("auto_label", "unknown")
        if tag in summary:
            summary[tag] += 1

    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "summary": summary,
        "total": len(combined),
        "labels": combined,
    }
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2)

    log.info("Exported %d labels to %s", len(combined), destination)
    log.info("Summary: %s", summary)
    return summary

package/ml-retrain/auto-labeler/npm_checker.py DELETED Viewed

@@ -1,228 +0,0 @@
-"""
-npm registry status checker.
-For each suspect package, checks if the package/version still exists on npm.
-Extracts publish timing for temporal correlation (quick takedown = strong signal).
-Rate-limited to 50 requests/minute with exponential backoff.
-Resumable: saves progress to npm-status-cache.json.
-"""
-import json
-import logging
-import time
-from datetime import datetime
-from pathlib import Path
-import requests
-log = logging.getLogger("auto-labeler.npm")
-NPM_REGISTRY = "https://registry.npmjs.org"
-RATE_LIMIT = 50  # requests per minute
-RATE_WINDOW = 60  # seconds
-CACHE_FILENAME = "npm-status-cache.json"
-# Don't re-check packages checked within this window
-RECHECK_INTERVAL_SECONDS = 24 * 3600  # 24h
def _rate_limiter():
    """Generator-based rate limiter. Call next() before each request.

    Each ``next()`` returns once another request fits within RATE_LIMIT
    requests per RATE_WINDOW seconds, sleeping first when the window is full.
    """
    # Monotonic clock: a wall-clock step (NTP sync, manual change) must not
    # stretch the sleep or let requests slip past the window.
    clock = time.monotonic
    timestamps = []  # send times inside the current window, oldest first
    while True:
        now = clock()
        # Purge timestamps older than the window
        timestamps = [t for t in timestamps if now - t < RATE_WINDOW]
        if len(timestamps) >= RATE_LIMIT:
            # Wait until the oldest request leaves the window (plus margin).
            sleep_time = timestamps[0] + RATE_WINDOW - now + 0.1
            log.debug("Rate limit reached, sleeping %.1fs", sleep_time)
            time.sleep(sleep_time)
            now = clock()
            timestamps = [t for t in timestamps if now - t < RATE_WINDOW]
        timestamps.append(now)
        yield
def _fetch_package_info(session, name, limiter):
    """Fetch package metadata from npm.

    Args:
        session: object with a ``get(url, timeout=...)`` method, e.g. a
            ``requests.Session``.
        name: npm package name.
        limiter: generator from ``_rate_limiter()``; advanced once per call
            (retries inside this call are not rate-limited again).

    Returns:
        ``(status, info)``: "npm_available" with the decoded registry
        document, "npm_removed" on HTTP 404, or "npm_error" after three
        unsuccessful attempts.
    """
    max_attempts = 3
    next(limiter)
    url = f"{NPM_REGISTRY}/{name}"
    for attempt in range(max_attempts):
        try:
            resp = session.get(url, timeout=15)
            if resp.status_code == 404:
                return "npm_removed", {"reason": "package_404"}
            if resp.status_code == 429:
                retry_after = _retry_after_seconds(resp.headers.get("Retry-After"))
                log.warning("npm 429 for %s, waiting %ds", name, retry_after)
                time.sleep(retry_after)
                continue
            resp.raise_for_status()
            return "npm_available", resp.json()
        except requests.RequestException as e:
            log.warning("npm fetch failed for %s (attempt %d): %s",
                        name, attempt + 1, e)
            # Exponential backoff between attempts; nothing follows the
            # last attempt, so do not sleep after it.
            if attempt + 1 < max_attempts:
                time.sleep(2 ** attempt * 3)
    return "npm_error", {"reason": "fetch_failed_after_retries"}


def _retry_after_seconds(value, default=30):
    """Convert a Retry-After header value to a non-negative number of seconds.

    Falls back to ``default`` when the header is missing or is not an
    integer (the header may also carry an HTTP date, which is not parsed).
    """
    try:
        seconds = int(value)
    except (TypeError, ValueError):
        return default
    # time.sleep() rejects negative values.
    return max(seconds, 0)
def check_suspects(suspects, cache_dir):
    """Check npm status for each suspect. Returns dict of results.

    Results are merged into the on-disk cache, which is saved every 100
    packages and at the end so an interrupted run can resume.

    Args:
        suspects: list of dicts with 'package', 'version', 'ecosystem' keys;
            a suspect without 'ecosystem' is treated as npm
        cache_dir: path to cache directory

    Returns:
        dict keyed by "name@version" with status info (the whole cache,
        including entries from earlier runs)
    """
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / CACHE_FILENAME
    # Load existing cache for resumability
    cache = _load_cache(cache_path)

    # Deduplicate suspects by name@version, npm only. A missing ecosystem
    # defaults to npm, the same default the labeler applies to detections.
    unique = {}
    for s in suspects:
        if s.get("ecosystem", "npm") != "npm":
            continue
        unique.setdefault(f"{s['package']}@{s['version']}", s)

    # Filter out recently checked. A cached "npm_error" is never fresh:
    # it records a failed fetch, not the package's state, so retry it.
    now = time.time()
    to_check = {}
    for key, s in unique.items():
        cached = cache.get(key)
        fresh = (
            cached
            and cached.get("status") != "npm_error"
            and (now - cached.get("checked_at", 0)) < RECHECK_INTERVAL_SECONDS
        )
        if not fresh:
            to_check[key] = s
    log.info("npm check: %d unique suspects, %d already cached, %d to check",
             len(unique), len(unique) - len(to_check), len(to_check))
    if not to_check:
        return cache

    # Group by package name to avoid redundant fetches
    by_name = {}
    for key, s in to_check.items():
        by_name.setdefault(s["package"], []).append((key, s))
    total_packages = len(by_name)

    limiter = _rate_limiter()
    checked = 0
    # Context manager so the session is closed when the run ends.
    with requests.Session() as session:
        session.headers.update({"Accept": "application/json"})
        for i, (name, entries) in enumerate(by_name.items()):
            status, info = _fetch_package_info(session, name, limiter)
            if i > 0 and i % 100 == 0:
                log.info("npm check progress: %d/%d packages (%.0f%%)",
                         i, total_packages, i / total_packages * 100)
                _save_cache(cache, cache_path)
            for key, s in entries:
                version = s["version"]
                result = {
                    "status": status,
                    "checked_at": now,
                }
                if status == "npm_available" and isinstance(info, dict):
                    versions = info.get("versions", {})
                    time_info = info.get("time", {})
                    # The package exists, but this version may be gone.
                    if version not in versions:
                        result["status"] = "npm_removed"
                        result["reason"] = "version_removed"
                    else:
                        result["reason"] = "available"
                    # Extract timing for temporal correlation
                    publish_time = time_info.get(version)
                    if publish_time:
                        result["publish_date"] = publish_time
                    # Registry "modified" timestamp of the package document
                    modified = time_info.get("modified")
                    if modified:
                        result["last_modified"] = modified
                elif status == "npm_removed":
                    result["reason"] = "package_404"
                cache[key] = result
                checked += 1
    _save_cache(cache, cache_path)
    log.info("npm check complete: %d packages checked, %d total cached",
             checked, len(cache))
    return cache
def _load_cache(cache_path):
    """Load npm status cache from disk.

    Args:
        cache_path: ``pathlib.Path`` of the cache file.

    Returns:
        The dict stored under "results", or ``{}`` when the file is missing,
        unreadable, not valid UTF-8 JSON, or does not have that shape.
    """
    if not cache_path.is_file():
        return {}
    try:
        with open(cache_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}
    results = data.get("results") if isinstance(data, dict) else None
    # Callers use the cache as a dict; discard any other payload.
    return results if isinstance(results, dict) else {}
def _save_cache(cache, cache_path):
    """Save npm status cache to disk.

    The cache is written to a sibling "<name>.tmp" file and then moved over
    ``cache_path``, so an interrupted save does not leave a truncated cache
    behind. An ``OSError`` is logged, not raised.

    Args:
        cache: dict of results, stored under the "results" key.
        cache_path: destination file path.
    """
    import os
    from datetime import timezone

    cache_path = Path(cache_path)
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    # Same "...Z" text datetime.utcnow() produced, without the deprecated call.
    saved_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({
                "saved_at": saved_at,
                "count": len(cache),
                "results": cache,
            }, f)
        os.replace(tmp_path, cache_path)
    except OSError as e:
        log.error("Failed to save npm cache: %s", e)
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup; the temp file may never have been created.
            pass
def is_quick_takedown(result, detection_date_str, threshold_hours=72):
    """Check if a package was removed quickly after publish (npm security takedown pattern).

    Args:
        result: npm status dict with "status" and, optionally, "publish_date".
        detection_date_str: ISO 8601 detection timestamp.
        threshold_hours: maximum publish-to-detection gap, in hours.

    Returns:
        True if the package was removed AND was detected between 0 and
        ``threshold_hours`` hours after it was published. False otherwise,
        including when either date is missing or unparseable.
    """
    from datetime import timezone

    if result.get("status") != "npm_removed":
        return False
    publish_date = result.get("publish_date")
    if not publish_date or not detection_date_str:
        return False
    try:
        publish_dt = datetime.fromisoformat(publish_date.replace("Z", "+00:00"))
        detection_dt = datetime.fromisoformat(detection_date_str.replace("Z", "+00:00"))
    except (AttributeError, ValueError, TypeError):
        # AttributeError: a non-string date has no .replace()
        return False
    # A naive and an aware datetime cannot be subtracted; assume UTC for a
    # timestamp without an offset.
    if publish_dt.tzinfo is None:
        publish_dt = publish_dt.replace(tzinfo=timezone.utc)
    if detection_dt.tzinfo is None:
        detection_dt = detection_dt.replace(tzinfo=timezone.utc)
    delta_hours = (detection_dt - publish_dt).total_seconds() / 3600
    # Package was detected within threshold_hours of publish
    # AND has since been removed → strong takedown signal
    return 0 <= delta_hours <= threshold_hours

package/ml-retrain/auto-labeler/ossf_index.py DELETED Viewed

@@ -1,178 +0,0 @@
-"""
-OSSF malicious-packages indexer.
-Clones (or updates) the ossf/malicious-packages repo with sparse checkout
-limited to osv/malicious/npm/, then parses all OSV JSON files into an index.
-Skips osv/withdrawn/ (retracted false positives).
-"""
-import json
-import logging
-import os
-import subprocess
-from datetime import datetime
-from pathlib import Path
-log = logging.getLogger("auto-labeler.ossf")
-OSSF_REPO_URL = "https://github.com/ossf/malicious-packages.git"
-OSSF_SPARSE_PATH = "osv/malicious/npm"
-INDEX_FILENAME = "ossf-index.json"
def _run_git(args, cwd=None):
    """Execute ``git`` with ``args`` and return its stripped standard output.

    Args:
        args: list of git arguments, without the leading "git".
        cwd: working directory for the command, or None for the current one.

    Raises:
        RuntimeError: git exited with a non-zero status; the message
            includes git's stderr.
    """
    command = ["git"] + args
    completed = subprocess.run(
        command,
        cwd=cwd,
        capture_output=True,
        text=True,
        timeout=300,  # seconds
    )
    if completed.returncode == 0:
        return completed.stdout.strip()
    raise RuntimeError(f"git {' '.join(args)} failed: {completed.stderr.strip()}")
def clone_or_update(repo_dir):
    """Make ``repo_dir`` hold an up-to-date sparse checkout of the OSSF repo.

    An existing checkout (a ".git" directory is present) is fast-forwarded
    with ``git pull``; otherwise the repo is cloned shallowly with sparse
    checkout limited to OSSF_SPARSE_PATH.
    """
    checkout = Path(repo_dir)
    if not (checkout / ".git").is_dir():
        log.info("Cloning OSSF repo (sparse, depth=1) to %s", checkout)
        checkout.mkdir(parents=True, exist_ok=True)
        clone_args = ["clone", "--depth", "1", "--filter=blob:none",
                      "--sparse", OSSF_REPO_URL, str(checkout)]
        _run_git(clone_args)
        _run_git(["sparse-checkout", "set", OSSF_SPARSE_PATH], cwd=checkout)
        log.info("OSSF clone complete (sparse: %s)", OSSF_SPARSE_PATH)
        return
    log.info("OSSF repo exists at %s — pulling latest", checkout)
    _run_git(["pull", "--ff-only"], cwd=checkout)
def _parse_osv_file(filepath):
    """Parse a single OSV JSON file and yield (key, entry) tuples.

    Only npm packages are yielded. The key is "name@version" for every
    version listed explicitly or named by a non-"0" "introduced" range
    event, or "name@*" when no version is given. Files that cannot be read
    or decoded, or whose top-level value is not an object, are logged and
    skipped. Fields set to JSON null are treated as absent.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Skipping invalid OSV file %s: %s", filepath, e)
        return
    if not isinstance(data, dict):
        log.warning("Skipping invalid OSV file %s: %s", filepath,
                    "top-level JSON value is not an object")
        return

    osv_id = data.get("id", "")
    published = data.get("published", "")
    # "or" fallbacks: an explicit null must behave like a missing key.
    summary = data.get("summary") or ""
    # Extract attack type from database_specific if available
    attack_type = None
    db_specific = data.get("database_specific") or {}
    origins = db_specific.get("malicious-packages-origins") or []
    if origins:
        attack_type = origins[0].get("reason", None)

    for affected in data.get("affected") or []:
        pkg = affected.get("package") or {}
        ecosystem = (pkg.get("ecosystem") or "").lower()
        name = pkg.get("name") or ""
        if ecosystem != "npm" or not name:
            continue
        # Copy so the parsed document is not modified while collecting.
        versions = list(affected.get("versions") or [])
        # Also extract versions from ranges
        for rng in affected.get("ranges") or []:
            for event in rng.get("events") or []:
                if "introduced" in event and event["introduced"] != "0":
                    versions.append(event["introduced"])
        entry = {
            "source": "ossf",
            "osv_id": osv_id,
            "date": published,
            "summary": summary[:200],
            "attack_type": attack_type,
        }
        if versions:
            # dict.fromkeys de-duplicates while keeping a stable order.
            for ver in dict.fromkeys(versions):
                yield f"{name}@{ver}", entry
        else:
            # No specific versions — all versions affected
            yield f"{name}@*", entry
def build_index(repo_dir, cache_dir):
    """Build OSSF index from the cloned repo. Returns the index dict.

    Walks ``<repo_dir>/osv/malicious/npm`` for ".json" files, parses each
    with ``_parse_osv_file`` and writes the result to
    ``<cache_dir>/INDEX_FILENAME``.

    Args:
        repo_dir: root of the cloned OSSF repository.
        cache_dir: directory for the cached index (created if missing).

    Returns:
        dict keyed by "name@version" / "name@*"; ``{}`` when the npm OSV
        directory does not exist.
    """
    from datetime import timezone

    repo_dir = Path(repo_dir)
    cache_dir = Path(cache_dir)
    osv_dir = repo_dir / "osv" / "malicious" / "npm"
    if not osv_dir.is_dir():
        log.error("OSSF osv/malicious/npm/ not found at %s", osv_dir)
        return {}
    index = {}
    file_count = 0
    entry_count = 0
    for root, dirs, files in os.walk(osv_dir):
        # Skip withdrawn reports by pruning such directories below osv_dir.
        # Testing every component of the absolute path would skip the whole
        # tree if the repo itself lived under a directory named "withdrawn".
        # Sorting makes the walk, and so which duplicate key wins, stable.
        dirs[:] = sorted(d for d in dirs if d != "withdrawn")
        for fname in sorted(files):
            if not fname.endswith(".json"):
                continue
            filepath = os.path.join(root, fname)
            file_count += 1
            for key, entry in _parse_osv_file(filepath):
                index[key] = entry
                entry_count += 1
    log.info("OSSF index: %d entries from %d files", entry_count, file_count)
    # Cache to disk
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / INDEX_FILENAME
    # Same "...Z" text datetime.utcnow() produced, without the deprecated call.
    built_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump({"built_at": built_at,
                   "count": len(index),
                   "index": index}, f)
    log.info("OSSF index cached to %s", cache_path)
    return index
def load_cached_index(cache_dir):
    """Load index from cache if available.

    Args:
        cache_dir: directory that holds INDEX_FILENAME.

    Returns:
        The dict stored under "index" (``{}`` if that key is absent), or
        ``None`` when the cache file is missing, unreadable, not valid
        UTF-8 JSON, or not a JSON object.
    """
    cache_path = Path(cache_dir) / INDEX_FILENAME
    if not cache_path.is_file():
        return None
    try:
        with open(cache_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Failed to load OSSF cache: %s", e)
        return None
    if not isinstance(data, dict):
        # A list or scalar payload has no .get(); report it as a bad cache.
        log.warning("Failed to load OSSF cache: %s", "unexpected JSON structure")
        return None
    log.info("Loaded cached OSSF index (%d entries, built %s)",
             data.get("count", 0), data.get("built_at", "?"))
    return data.get("index", {})
def lookup(index, name, version):
    """Find the OSSF entry for ``name`` at ``version``.

    The exact "name@version" key is tried first; when it is absent (or its
    entry is empty) the all-versions key "name@*" is used. Returns the entry
    dict, or None when neither key matches.
    """
    return index.get(f"{name}@{version}") or index.get(f"{name}@*")

package/ml-retrain/auto-labeler/requirements.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- requests>=2.28.0

package/ml-retrain/confusion-matrix.png DELETED Viewed

Binary file