slopguard-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- slopguard/__init__.py +7 -0
- slopguard/__main__.py +13 -0
- slopguard/cli.py +321 -0
- slopguard/config.py +139 -0
- slopguard/data/__init__.py +40 -0
- slopguard/data/hallucinations_seed.json +5603 -0
- slopguard/data/popular_packages.json +2007 -0
- slopguard/models.py +133 -0
- slopguard/parsers/__init__.py +9 -0
- slopguard/parsers/base.py +28 -0
- slopguard/parsers/npm.py +146 -0
- slopguard/parsers/python.py +269 -0
- slopguard/registry/__init__.py +14 -0
- slopguard/registry/base.py +107 -0
- slopguard/registry/npm.py +78 -0
- slopguard/registry/pypi.py +99 -0
- slopguard/report/__init__.py +8 -0
- slopguard/report/json.py +17 -0
- slopguard/report/terminal.py +87 -0
- slopguard/scoring/__init__.py +7 -0
- slopguard/scoring/engine.py +235 -0
- slopguard/scoring/signals.py +183 -0
- slopguard/update.py +15 -0
- slopguard_cli-0.1.0.dist-info/METADATA +197 -0
- slopguard_cli-0.1.0.dist-info/RECORD +28 -0
- slopguard_cli-0.1.0.dist-info/WHEEL +4 -0
- slopguard_cli-0.1.0.dist-info/entry_points.txt +2 -0
- slopguard_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Registry client base + shared metadata type."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RegistryError(Exception):
    """Signals a registry probe failure that the caller needs to be told about."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class PackageMetadata:
    """Immutable slice of registry metadata that the scoring engine works from.

    Only ``name`` and ``exists`` are required. Every other field defaults to
    ``None`` (or, for ``extras``, to a fresh empty dict) when it is not supplied.
    """

    # Package identity and whether the registry knows it.
    name: str
    exists: bool

    # Release timestamps.
    first_release: datetime | None = None
    latest_release: datetime | None = None

    # Publisher details.
    publisher: str | None = None
    publisher_created: datetime | None = None
    publisher_package_count: int | None = None

    # Recent download count.
    downloads_recent: int | None = None

    # Free-form string pairs; default_factory gives each instance its own dict.
    extras: dict[str, str] = field(default_factory=dict)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class RegistryClient(ABC):
    """Async registry client. Implementations cache responses for the lifetime of one scan.

    Subclasses provide :meth:`_fetch_uncached`; callers go through :meth:`fetch`,
    which memoises results per package name. The instance can be used as an
    async context manager, in which case :meth:`aclose` runs on exit.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        timeout: float = 5.0,
        max_retries: int = 2,
    ) -> None:
        """Create a client.

        Args:
            client: HTTP client to send requests with. When omitted, a new
                ``httpx.AsyncClient`` is created here and closed by
                :meth:`aclose`; a caller-supplied client is never closed.
            timeout: Per-request timeout in seconds.
            max_retries: Number of retries after the first attempt. Negative
                values are treated as zero so at least one attempt is made.
        """
        # Ownership and the fallback are both decided by ``is None`` so they
        # cannot disagree (a truthiness test could create a client that is
        # then never closed).
        self._owned_client = client is None
        if client is None:
            client = httpx.AsyncClient(
                timeout=timeout,
                headers={
                    "User-Agent": "slopguard/0.1 (+https://github.com/hariomunknownslab/slopguard)"
                },
            )
        self._client = client
        self._timeout = timeout
        self._max_retries = max(0, max_retries)
        self._cache: dict[str, PackageMetadata] = {}
        self._cache_lock = asyncio.Lock()

    async def __aenter__(self) -> RegistryClient:
        return self

    async def __aexit__(self, *_: object) -> None:
        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying HTTP client, but only if this instance created it."""
        if self._owned_client:
            await self._client.aclose()

    async def fetch(self, name: str) -> PackageMetadata:
        """Fetch metadata for ``name``. Returns ``exists=False`` on 404.

        Results (including ``exists=False`` ones) are cached per name, so a
        second call for the same name does not hit the registry again.

        Raises :class:`RegistryError` on any error the caller cannot interpret as
        existence / non-existence (network failures, persistent 5xx, etc.).
        """
        async with self._cache_lock:
            cached = self._cache.get(name)
            if cached is not None:
                return cached

        # The lock is released during the network call, so concurrent calls for
        # the same uncached name may each probe the registry; the last result
        # stored wins.
        meta = await self._fetch_uncached(name)
        async with self._cache_lock:
            self._cache[name] = meta
        return meta

    @abstractmethod
    async def _fetch_uncached(self, name: str) -> PackageMetadata: ...

    @staticmethod
    def _parse_retry_after(value: str | None, default: float = 1.0) -> float:
        """Turn a ``Retry-After`` header value into a delay in seconds.

        Only the delay-seconds form is understood. A missing or empty header,
        the HTTP-date form, or a NaN / negative number all yield ``default``
        instead of raising.
        """
        if not value:
            return default
        try:
            delay = float(value)
        except (TypeError, ValueError):
            return default
        # ``not delay >= 0`` is true for NaN as well as for negative numbers.
        if not delay >= 0.0:
            return default
        return delay

    async def _request_with_retry(self, method: str, url: str) -> httpx.Response:
        """Send a request, retrying transient failures.

        Transport errors / timeouts, HTTP 429 and HTTP 5xx are retried up to
        ``max_retries`` times. Any other response (including 404) is returned
        to the caller unchanged.

        Raises:
            RegistryError: when the final attempt still fails.
        """
        last_exc: Exception | None = None
        for attempt in range(self._max_retries + 1):
            final_attempt = attempt >= self._max_retries
            # Exponential backoff starting at 0.2s, capped at 2s.
            backoff = min(2**attempt * 0.2, 2.0)
            try:
                response = await self._client.request(method, url, timeout=self._timeout)
            except (httpx.TimeoutException, httpx.TransportError) as exc:
                last_exc = exc
                if final_attempt:
                    raise RegistryError(f"network error fetching {url}: {exc}") from exc
                await asyncio.sleep(backoff)
                continue
            if response.status_code == 429:
                if final_attempt:
                    raise RegistryError(f"rate limited by {url}")
                retry_after = self._parse_retry_after(response.headers.get("Retry-After"))
                # Never wait more than 5s, whatever the server asks for.
                await asyncio.sleep(min(retry_after, 5.0))
                continue
            if response.status_code >= 500:
                if final_attempt:
                    raise RegistryError(f"server error {response.status_code} at {url}")
                await asyncio.sleep(backoff)
                continue
            return response
        # Should be unreachable; the loop either returns or raises above.
        raise RegistryError(f"unexpected retry exhaustion fetching {url}: {last_exc}")
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""npm registry client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import quote
|
|
8
|
+
|
|
9
|
+
from slopguard.registry.base import PackageMetadata, RegistryClient, RegistryError
|
|
10
|
+
|
|
11
|
+
NPM_REGISTRY = "https://registry.npmjs.org"
|
|
12
|
+
NPM_DOWNLOADS = "https://api.npmjs.org/downloads/point/last-month"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _parse_iso(value: Any) -> datetime | None:
|
|
16
|
+
if not isinstance(value, str):
|
|
17
|
+
return None
|
|
18
|
+
# npm timestamps end with 'Z' — Python <3.11 chokes on it, 3.11+ handles via fromisoformat
|
|
19
|
+
try:
|
|
20
|
+
if value.endswith("Z"):
|
|
21
|
+
return datetime.fromisoformat(value.replace("Z", "+00:00"))
|
|
22
|
+
return datetime.fromisoformat(value)
|
|
23
|
+
except ValueError:
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class NpmRegistryClient(RegistryClient):
    """Registry client that looks packages up on ``registry.npmjs.org``."""

    async def _fetch_uncached(self, name: str) -> PackageMetadata:
        """Query the npm registry document for ``name`` and map it to metadata.

        A 404 becomes ``exists=False``. Any other non-200 status, or a body
        that is not valid JSON, raises :class:`RegistryError`.
        """
        # '@' and '/' stay unescaped in the URL path.
        url = f"{NPM_REGISTRY}/{quote(name, safe='@/')}"
        response = await self._request_with_retry("GET", url)
        status = response.status_code
        if status == 404:
            return PackageMetadata(name=name, exists=False)
        if status != 200:
            raise RegistryError(f"unexpected status {response.status_code} from {url}")
        try:
            data: dict[str, Any] = response.json()
        except ValueError as exc:
            raise RegistryError(f"invalid JSON from {url}: {exc}") from exc

        # "time" is only trusted when it is a mapping; otherwise both
        # timestamps end up as None.
        time_block = data.get("time")
        if not isinstance(time_block, dict):
            time_block = {}
        created_at = _parse_iso(time_block.get("created"))
        modified_at = _parse_iso(time_block.get("modified"))

        # The first listed maintainer with a string name is the publisher.
        maintainers = data.get("maintainers")
        lead = maintainers[0] if isinstance(maintainers, list) and maintainers else None
        lead_name = lead.get("name") if isinstance(lead, dict) else None
        publisher: str | None = lead_name if isinstance(lead_name, str) else None

        download_count = await self._fetch_downloads(name)

        return PackageMetadata(
            name=name,
            exists=True,
            first_release=created_at,
            latest_release=modified_at,
            publisher=publisher,
            downloads_recent=download_count,
        )

    async def _fetch_downloads(self, name: str) -> int | None:
        """Best-effort last-month download count; ``None`` whenever it cannot be read."""
        url = f"{NPM_DOWNLOADS}/{quote(name, safe='@/')}"
        try:
            response = await self._request_with_retry("GET", url)
        except RegistryError:
            # Download stats are optional, so a failed probe is not an error.
            return None
        if response.status_code == 200:
            try:
                body = response.json()
            except ValueError:
                return None
            if isinstance(body, dict):
                count = body.get("downloads")
                if isinstance(count, int):
                    return count
        return None
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""PyPI registry client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import quote
|
|
8
|
+
|
|
9
|
+
from slopguard.registry.base import PackageMetadata, RegistryClient, RegistryError
|
|
10
|
+
|
|
11
|
+
PYPI_JSON = "https://pypi.org/pypi"
|
|
12
|
+
PYPISTATS_RECENT = "https://pypistats.org/api/packages"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _parse_iso(value: Any) -> datetime | None:
|
|
16
|
+
if not isinstance(value, str):
|
|
17
|
+
return None
|
|
18
|
+
try:
|
|
19
|
+
if value.endswith("Z"):
|
|
20
|
+
return datetime.fromisoformat(value.replace("Z", "+00:00"))
|
|
21
|
+
return datetime.fromisoformat(value)
|
|
22
|
+
except ValueError:
|
|
23
|
+
return None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PypiRegistryClient(RegistryClient):
    """Registry client that reads ``pypi.org/pypi/<name>/json`` for package metadata."""

    async def _fetch_uncached(self, name: str) -> PackageMetadata:
        """Query the PyPI JSON document for ``name`` and map it to metadata.

        A 404 becomes ``exists=False``. Any other non-200 status, or a body
        that is not valid JSON, raises :class:`RegistryError`.
        """
        url = f"{PYPI_JSON}/{quote(name, safe='')}/json"
        response = await self._request_with_retry("GET", url)
        status = response.status_code
        if status == 404:
            return PackageMetadata(name=name, exists=False)
        if status != 200:
            raise RegistryError(f"unexpected status {response.status_code} from {url}")
        try:
            data: dict[str, Any] = response.json()
        except ValueError as exc:
            raise RegistryError(f"invalid JSON from {url}: {exc}") from exc

        # Sections that are not mappings are treated as empty.
        info = data.get("info")
        if not isinstance(info, dict):
            info = {}
        releases = data.get("releases")
        if not isinstance(releases, dict):
            releases = {}

        # Gather the upload time of every file of every release; the earliest
        # and latest of those bound the package's release history.
        upload_times: list[datetime] = []
        for files in releases.values():
            if isinstance(files, list):
                for entry in files:
                    if isinstance(entry, dict):
                        raw = entry.get("upload_time_iso_8601") or entry.get("upload_time")
                        stamp = _parse_iso(raw)
                        if stamp is not None:
                            upload_times.append(stamp)
        first_release: datetime | None = min(upload_times) if upload_times else None
        latest_release: datetime | None = max(upload_times) if upload_times else None

        # Author wins over maintainer; empty strings count as missing.
        publisher: str | None = None
        for candidate in (info.get("author"), info.get("maintainer")):
            if isinstance(candidate, str) and candidate:
                publisher = candidate
                break

        download_count = await self._fetch_downloads(name)

        return PackageMetadata(
            name=name,
            exists=True,
            first_release=first_release,
            latest_release=latest_release,
            publisher=publisher,
            downloads_recent=download_count,
        )

    async def _fetch_downloads(self, name: str) -> int | None:
        """Best-effort ``last_week`` download count; ``None`` whenever it cannot be read."""
        url = f"{PYPISTATS_RECENT}/{quote(name, safe='')}/recent"
        try:
            response = await self._request_with_retry("GET", url)
        except RegistryError:
            # Download stats are optional, so a failed probe is not an error.
            return None
        if response.status_code != 200:
            return None
        try:
            body = response.json()
        except ValueError:
            return None
        stats = body.get("data") if isinstance(body, dict) else None
        last_week = stats.get("last_week") if isinstance(stats, dict) else None
        return last_week if isinstance(last_week, int) else None
|
slopguard/report/json.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""JSON report writer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from slopguard.models import ScanReport
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def write_json_report(report: ScanReport, *, path: Path | None = None) -> str:
    """Render ``report`` as indented JSON text and return it.

    The text is two-space indented, keeps the key order produced by
    ``report.model_dump(mode="json")`` and ends with a newline. When ``path``
    is given, the same text is also written there as UTF-8.
    """
    serialised = json.dumps(report.model_dump(mode="json"), indent=2, sort_keys=False)
    document = "".join((serialised, "\n"))
    if path is None:
        return document
    path.write_text(document, encoding="utf-8")
    return document
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Rich terminal rendering."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.table import Table
|
|
7
|
+
|
|
8
|
+
from slopguard.models import Finding, RiskTier, ScanReport
|
|
9
|
+
|
|
10
|
+
# One row per risk tier: (tier, Rich style for the label, label text).
_TIER_PRESENTATION = (
    (RiskTier.HALLUCINATED, "bold red", "HALLUCIN."),
    (RiskTier.SUSPICIOUS, "bold yellow", "SUSPICIOUS"),
    (RiskTier.CLEAN, "green", "CLEAN"),
    (RiskTier.ERROR, "magenta", "ERROR"),
)

# Rich style used to colour each tier's label.
_TIER_STYLE = {tier: style for tier, style, _ in _TIER_PRESENTATION}

# Text shown for each tier.
_TIER_LABEL = {tier: label for tier, _, label in _TIER_PRESENTATION}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def render_terminal_report(
    report: ScanReport,
    *,
    console: Console | None = None,
    show_clean: bool = True,
    duration_seconds: float | None = None,
) -> None:
    """Print a human-readable scan report to a Rich console.

    Output order: header, detected manifests (if any), the scanned-dependency
    count, a findings table (or an "all clean" line when no row qualifies),
    a per-tier summary and the exit code.

    Args:
        report: The scan result to render.
        console: Console to print to; a default ``Console()`` is used when omitted.
        show_clean: When false, findings in the CLEAN tier are left out of the table.
        duration_seconds: Scan duration to mention next to the dependency count.
    """
    # Paths, package names and error text come from the scanned project and
    # from exceptions. They are escaped so that a '[' in them is printed
    # literally instead of being parsed as Rich markup.
    from rich.markup import escape

    console = console or Console()
    console.print(
        f"[bold]SlopGuard v{report.slopguard_version}[/bold] — scanning "
        f"{escape(str(report.path))}"
    )
    console.print()
    if report.manifests:
        console.print("Detected manifests:")
        for m in report.manifests:
            console.print(
                f" • {escape(str(m.path))} "
                f"([cyan]{m.ecosystem.value}[/cyan], {m.dependency_count} deps)"
            )
        console.print()

    total = report.summary.total
    if duration_seconds is not None:
        console.print(f"Scanned {total} dependencies in {duration_seconds:.1f}s.\n")
    else:
        console.print(f"Scanned {total} dependencies.\n")

    table = Table(show_header=True, header_style="bold")
    table.add_column("Package", no_wrap=False, max_width=40)
    table.add_column("Risk", no_wrap=True)
    table.add_column("Reason", overflow="fold")
    rows_added = 0
    for finding in report.findings:
        if not show_clean and finding.risk is RiskTier.CLEAN:
            continue
        style = _TIER_STYLE[finding.risk]
        label = _TIER_LABEL[finding.risk]
        reason = _summarise_reason(finding)
        table.add_row(escape(finding.name), f"[{style}]{label}[/{style}]", escape(reason))
        rows_added += 1
    if rows_added == 0:
        console.print("[green]All dependencies clean.[/green]")
    else:
        console.print(table)

    console.print()
    console.print(
        f"Summary: [bold red]{report.summary.hallucinated}[/bold red] hallucinated, "
        f"[bold yellow]{report.summary.suspicious}[/bold yellow] suspicious, "
        f"[green]{report.summary.clean}[/green] clean, "
        f"[magenta]{report.summary.errors}[/magenta] error(s)."
    )
    console.print(f"Exit code: [bold]{report.exit_code}[/bold]")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _summarise_reason(finding: Finding) -> str:
|
|
78
|
+
if finding.error:
|
|
79
|
+
return finding.error
|
|
80
|
+
if not finding.signals:
|
|
81
|
+
return "No signals."
|
|
82
|
+
# Top contributor first, then up to one additional.
|
|
83
|
+
primary = max(finding.signals, key=lambda s: s.weight)
|
|
84
|
+
extras = [s for s in finding.signals if s is not primary][:1]
|
|
85
|
+
parts = [primary.detail]
|
|
86
|
+
parts.extend(s.detail for s in extras)
|
|
87
|
+
return " ".join(parts)
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""Scoring engine — combines signals into a per-dependency risk score."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from slopguard.data import hallucination_index, popular_set
|
|
9
|
+
from slopguard.models import Dependency, DependencySource, Ecosystem, Finding, RiskTier, Signal
|
|
10
|
+
from slopguard.registry.base import PackageMetadata, RegistryClient, RegistryError
|
|
11
|
+
from slopguard.scoring import signals
|
|
12
|
+
|
|
13
|
+
DEFAULT_SUSPICIOUS_MIN = 0.4
|
|
14
|
+
DEFAULT_HALLUCINATED_MIN = 0.85
|
|
15
|
+
|
|
16
|
+
_REMEDIATION_HALLUCINATED = (
|
|
17
|
+
"Remove this dependency. Verify the package your AI suggested actually exists "
|
|
18
|
+
"at the source you trust."
|
|
19
|
+
)
|
|
20
|
+
_REMEDIATION_SUSPICIOUS = (
|
|
21
|
+
"Inspect this dependency before installing — recent publication, low downloads, "
|
|
22
|
+
"or close to a popular package name. If your AI suggested it, double-check the spelling."
|
|
23
|
+
)
|
|
24
|
+
_REMEDIATION_ERROR = (
|
|
25
|
+
"SlopGuard could not score this dependency. Re-run with network access, or skip "
|
|
26
|
+
"with --no-network plus an ignore rule."
|
|
27
|
+
)
|
|
28
|
+
_REMEDIATION_CLEAN = "No action required."
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True)
class ScoringConfig:
    """Immutable scoring settings: the two tier thresholds and a verbosity flag."""

    # Lowest score classified as SUSPICIOUS.
    suspicious_min: float = DEFAULT_SUSPICIOUS_MIN

    # Lowest score classified as HALLUCINATED.
    hallucinated_min: float = DEFAULT_HALLUCINATED_MIN

    # Verbose mode flag; off by default.
    verbose: bool = False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ScoringEngine:
    """Scores a list of dependencies against a registry client."""

    def __init__(
        self,
        *,
        npm_client: RegistryClient | None = None,
        pypi_client: RegistryClient | None = None,
        config: ScoringConfig | None = None,
        no_network: bool = False,
        concurrency: int = 16,
    ) -> None:
        """Set up the engine.

        Args:
            npm_client: Registry client used for npm dependencies, if any.
            pypi_client: Registry client used for every non-npm dependency, if any.
            config: Thresholds and verbosity; defaults to ``ScoringConfig()``.
            no_network: When true, no registry is contacted.
            concurrency: Maximum number of dependencies scored at the same time.
        """
        self._npm = npm_client
        self._pypi = pypi_client
        self._config = config or ScoringConfig()
        self._no_network = no_network
        self._semaphore = asyncio.Semaphore(concurrency)
        self._db = hallucination_index()
        self._popular_npm = popular_set(Ecosystem.NPM)
        self._popular_pypi = popular_set(Ecosystem.PYPI)

    async def score_all(self, deps: list[Dependency]) -> list[Finding]:
        """Score every dependency concurrently; results are in the same order as ``deps``."""
        tasks = [self._score_one(d) for d in deps]
        return await asyncio.gather(*tasks)

    async def _score_one(self, dep: Dependency) -> Finding:
        """Score one dependency while holding a slot of the concurrency semaphore."""
        async with self._semaphore:
            return await self._score_one_inner(dep)

    async def _score_one_inner(self, dep: Dependency) -> Finding:
        """Run the scoring pipeline for a single dependency.

        Never raises for registry problems: they are reported as an ERROR-tier
        finding, except when the hallucination DB already produced a signal,
        in which case that verdict is kept.
        """
        # Local file / link refs: scope says ignore. Git URLs: clean with a note.
        if dep.source in (DependencySource.FILE, DependencySource.LINK):
            return self._finding(dep, RiskTier.CLEAN, 0.0, [], _REMEDIATION_CLEAN)
        if dep.source is DependencySource.GIT:
            return self._finding(
                dep,
                RiskTier.CLEAN,
                0.0,
                [
                    Signal(
                        type="git_dependency",
                        weight=0.0,
                        detail="Git URL dependency — out of scope for v0.1 scoring.",
                    )
                ],
                _REMEDIATION_CLEAN,
            )

        # Scoped npm (@org/pkg) bypasses the registry probe unless the DB has it. Spec §9 edge cases.
        if dep.ecosystem is Ecosystem.NPM and dep.scoped:
            entry = self._db.get((dep.ecosystem, dep.name.lower()))
            if entry is None:
                return self._finding(dep, RiskTier.CLEAN, 0.0, [], _REMEDIATION_CLEAN)
            sig = signals.hallucination_db_hit(dep, entry)
            return self._finalize(dep, [sig] if sig else [])

        collected: list[Signal] = []

        # 1) DB hit short-circuits.
        entry = self._db.get((dep.ecosystem, dep.name.lower()))
        db_signal = signals.hallucination_db_hit(dep, entry)
        db_hit = db_signal is not None
        if db_signal is not None:
            collected.append(db_signal)
            # Only probe further if verbose — for the JSON report's completeness.
            if not (self._config.verbose and not self._no_network):
                return self._finalize(dep, collected)

        # 2) --no-network: only DB + name pattern.
        if self._no_network:
            pattern_sig = signals.name_pattern_suspicious(dep)
            if pattern_sig is not None:
                collected.append(pattern_sig)
            return self._finalize(dep, collected)

        # 3) Registry probe.
        client = self._client_for(dep.ecosystem)
        if client is None:
            return self._probe_failed(
                dep,
                collected,
                f"no registry client configured for {dep.ecosystem.value}",
                db_hit=db_hit,
            )

        try:
            meta = await client.fetch(dep.name)
        except RegistryError as exc:
            return self._probe_failed(dep, collected, str(exc), db_hit=db_hit)
        except Exception as exc:
            # Boundary handler: one broken probe must not abort the whole
            # gather() in score_all, so it is reported on the finding instead.
            return self._probe_failed(
                dep, collected, f"{type(exc).__name__}: {exc}", db_hit=db_hit
            )

        # 4) 404 → registry_not_found and stop. Spec §9 step 3.
        nf = signals.registry_not_found(meta)
        if nf is not None:
            collected.append(nf)
            # Levenshtein is still informative when the package doesn't exist.
            lev = signals.levenshtein_typo(dep, self._popular_for(dep.ecosystem))
            if lev is not None:
                collected.append(lev)
            pat = signals.name_pattern_suspicious(dep)
            if pat is not None:
                collected.append(pat)
            return self._finalize(dep, collected)

        # 5) Full signal sweep.
        collected.extend(self._collect_metadata_signals(dep, meta))
        return self._finalize(dep, collected)

    def _probe_failed(
        self,
        dep: Dependency,
        sigs: list[Signal],
        error: str,
        *,
        db_hit: bool,
    ) -> Finding:
        """Build the finding for a registry probe that could not be completed.

        With a hallucination-DB hit the probe was only extra detail for
        verbose mode, so the finding is scored from the signals already
        collected, exactly as in non-verbose mode. Without one there is
        nothing to score, and the dependency is reported as ERROR with
        ``error`` attached.
        """
        if db_hit:
            return self._finalize(dep, sigs)
        return self._finding(dep, RiskTier.ERROR, 0.0, sigs, _REMEDIATION_ERROR, error=error)

    def _collect_metadata_signals(self, dep: Dependency, meta: PackageMetadata) -> list[Signal]:
        """Evaluate every metadata- and name-based signal and return those that fired."""
        out: list[Signal] = []
        # Of the two recency signals at most one is kept; "very recent" wins.
        very_recent, recent = signals.recently_published(meta)
        if very_recent is not None:
            out.append(very_recent)
        elif recent is not None:
            out.append(recent)
        low = signals.low_downloads(meta)
        if low is not None:
            out.append(low)
        np_sig = signals.new_publisher(meta)
        if np_sig is not None:
            out.append(np_sig)
        solo = signals.single_release_new_account(meta)
        if solo is not None:
            out.append(solo)
        lev = signals.levenshtein_typo(dep, self._popular_for(dep.ecosystem))
        if lev is not None:
            out.append(lev)
        pat = signals.name_pattern_suspicious(dep)
        if pat is not None:
            out.append(pat)
        return out

    def _finalize(self, dep: Dependency, sigs: list[Signal]) -> Finding:
        """Sum the signal weights (capped at 1.0) and build the matching finding."""
        score = min(1.0, sum(s.weight for s in sigs))
        tier = self._tier_for(score)
        remediation = self._remediation_for(tier)
        return self._finding(dep, tier, score, sigs, remediation)

    def _tier_for(self, score: float) -> RiskTier:
        """Map a score to a tier using the configured (inclusive) minimums."""
        if score >= self._config.hallucinated_min:
            return RiskTier.HALLUCINATED
        if score >= self._config.suspicious_min:
            return RiskTier.SUSPICIOUS
        return RiskTier.CLEAN

    @staticmethod
    def _remediation_for(tier: RiskTier) -> str:
        """Return the remediation text for ``tier``."""
        if tier is RiskTier.HALLUCINATED:
            return _REMEDIATION_HALLUCINATED
        if tier is RiskTier.SUSPICIOUS:
            return _REMEDIATION_SUSPICIOUS
        if tier is RiskTier.ERROR:
            return _REMEDIATION_ERROR
        return _REMEDIATION_CLEAN

    @staticmethod
    def _finding(
        dep: Dependency,
        tier: RiskTier,
        score: float,
        sigs: list[Signal],
        remediation: str,
        *,
        error: str | None = None,
    ) -> Finding:
        """Assemble a ``Finding`` for ``dep``; the score is rounded to four decimals."""
        return Finding(
            name=dep.name,
            version=dep.version,
            ecosystem=dep.ecosystem,
            manifest=dep.manifest,
            risk=tier,
            score=round(score, 4),
            signals=sigs,
            remediation=remediation,
            error=error,
        )

    def _client_for(self, ecosystem: Ecosystem) -> RegistryClient | None:
        """Pick the registry client for ``ecosystem`` (npm client for npm, PyPI client otherwise)."""
        return self._npm if ecosystem is Ecosystem.NPM else self._pypi

    def _popular_for(self, ecosystem: Ecosystem) -> frozenset[str]:
        """Pick the popular-package set for ``ecosystem`` (npm set for npm, PyPI set otherwise)."""
        return self._popular_npm if ecosystem is Ecosystem.NPM else self._popular_pypi
|