pkgwhy 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. pkgwhy/__init__.py +3 -0
  2. pkgwhy/__main__.py +6 -0
  3. pkgwhy/agent/__init__.py +2 -0
  4. pkgwhy/agent/judge.py +93 -0
  5. pkgwhy/cli.py +676 -0
  6. pkgwhy/core/__init__.py +2 -0
  7. pkgwhy/core/constants.py +13 -0
  8. pkgwhy/core/models.py +608 -0
  9. pkgwhy/dependencies/__init__.py +2 -0
  10. pkgwhy/dependencies/graph.py +68 -0
  11. pkgwhy/dependencies/reason.py +79 -0
  12. pkgwhy/dynamic/__init__.py +2 -0
  13. pkgwhy/dynamic/analysis.py +156 -0
  14. pkgwhy/explanations/__init__.py +2 -0
  15. pkgwhy/explanations/explain.py +47 -0
  16. pkgwhy/explanations/local_db.py +52 -0
  17. pkgwhy/imports/__init__.py +2 -0
  18. pkgwhy/imports/scanner.py +43 -0
  19. pkgwhy/inspection/__init__.py +2 -0
  20. pkgwhy/inspection/files.py +540 -0
  21. pkgwhy/inspection/python_static.py +323 -0
  22. pkgwhy/inspection/size.py +58 -0
  23. pkgwhy/inspection/text_patterns.py +135 -0
  24. pkgwhy/manifests/__init__.py +2 -0
  25. pkgwhy/manifests/lockfiles.py +51 -0
  26. pkgwhy/manifests/pyproject.py +37 -0
  27. pkgwhy/manifests/requirements.py +27 -0
  28. pkgwhy/metadata/__init__.py +2 -0
  29. pkgwhy/metadata/installed.py +83 -0
  30. pkgwhy/metadata/pypi.py +199 -0
  31. pkgwhy/policy/__init__.py +1 -0
  32. pkgwhy/policy/agent_policy.py +114 -0
  33. pkgwhy/policy/audit_log.py +60 -0
  34. pkgwhy/policy/tool_execution.py +76 -0
  35. pkgwhy/provenance/__init__.py +2 -0
  36. pkgwhy/provenance/installed.py +45 -0
  37. pkgwhy/registry/__init__.py +2 -0
  38. pkgwhy/registry/local.py +178 -0
  39. pkgwhy/registry/manifest.py +78 -0
  40. pkgwhy/registry/publish.py +142 -0
  41. pkgwhy/registry/run.py +148 -0
  42. pkgwhy/registry/tools.py +121 -0
  43. pkgwhy/reports/__init__.py +2 -0
  44. pkgwhy/reports/audit.py +81 -0
  45. pkgwhy/risk/__init__.py +5 -0
  46. pkgwhy/risk/rules.py +372 -0
  47. pkgwhy/risk/scoring.py +231 -0
  48. pkgwhy/typosquat/__init__.py +2 -0
  49. pkgwhy/typosquat/detector.py +182 -0
  50. pkgwhy/typosquat/popular_packages.py +34 -0
  51. pkgwhy/vulnerabilities/__init__.py +2 -0
  52. pkgwhy/vulnerabilities/matching.py +122 -0
  53. pkgwhy/vulnerabilities/osv.py +330 -0
  54. pkgwhy-1.0.0.dist-info/METADATA +688 -0
  55. pkgwhy-1.0.0.dist-info/RECORD +58 -0
  56. pkgwhy-1.0.0.dist-info/WHEEL +4 -0
  57. pkgwhy-1.0.0.dist-info/entry_points.txt +2 -0
  58. pkgwhy-1.0.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,2 @@
1
+ """Typosquatting similarity detection."""
2
+
@@ -0,0 +1,182 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from pkgwhy.core.models import TyposquatCandidate
6
+ from pkgwhy.metadata.installed import normalize_package_name
7
+ from pkgwhy.typosquat.popular_packages import KNOWN_LEGITIMATE_FAMILIES, POPULAR_PACKAGE_REFERENCES
8
+
9
# Translation table that folds visually confusable characters onto the ASCII
# letter they imitate.  Names are lowercased (see ``_compact``) before this
# table is applied, so every uppercase lookalike needs its lowercase form
# listed as well; the uppercase Greek keys are kept for callers that translate
# text that has not been lowercased.
HOMOGLYPH_TRANSLATION = str.maketrans(
    {
        # ASCII digits and symbols commonly substituted for letters.
        "0": "o",
        "1": "l",
        "3": "e",
        "4": "a",
        "5": "s",
        "7": "t",
        "@": "a",
        "$": "s",
        # Cyrillic lowercase lookalikes.
        "\u0430": "a",
        "\u0435": "e",
        "\u043e": "o",
        "\u0440": "p",
        "\u0441": "c",
        "\u0445": "x",
        "\u0443": "y",
        # Greek uppercase lookalikes.
        "\u0391": "a",
        "\u0395": "e",
        "\u039f": "o",
        "\u03a1": "p",
        "\u03a7": "x",
        # Greek lowercase lookalikes (what the uppercase forms become after
        # ``str.lower()``).
        "\u03b1": "a",
        "\u03b5": "e",
        "\u03bf": "o",
        "\u03c1": "p",
        "\u03c7": "x",
    }
)
34
+
35
+
36
def detect_typosquat(package_name: str) -> TyposquatCandidate | None:
    """Find the most similar popular-package lookalike for ``package_name``.

    Returns ``None`` when the normalized name is an allow-listed family
    package or when no reference produces a signal.  When several references
    tie on similarity, the first one encountered wins.
    """
    normalized = normalize_package_name(package_name)
    if normalized in KNOWN_LEGITIMATE_FAMILIES:
        return None

    def _signals():
        for target, references in POPULAR_PACKAGE_REFERENCES.items():
            # The target itself and "<target>-x" / "x-<target>" names are not
            # compared against that target.
            is_target = normalized == normalize_package_name(target)
            if is_target or _is_known_family_package(normalized, target):
                continue
            for reference in references:
                found = _compare_to_reference(package_name, normalized, target, reference)
                if found is not None:
                    yield found

    # ``max`` keeps the first maximal element, matching a strict ">" scan.
    return max(_signals(), key=lambda found: found.similarity, default=None)
56
+
57
+
58
def detect_typosquats(package_names: list[str]) -> list[TyposquatCandidate]:
    """Run ``detect_typosquat`` on every name and return the hits, strongest first.

    Ordering: higher similarity, then more signals, then package name.
    """
    hits = [
        candidate
        for name in package_names
        if (candidate := detect_typosquat(name)) is not None
    ]
    hits.sort(key=lambda item: (-item.similarity, -len(item.signals), item.package))
    return hits
64
+
65
+
66
def _compare_to_reference(
    package_name: str,
    normalized: str,
    target: str,
    reference: str,
) -> TyposquatCandidate | None:
    """Compare one package name against one popular-package reference.

    Args:
        package_name: The name as supplied by the caller (reported verbatim).
        normalized: ``package_name`` after ``normalize_package_name``.
        target: The popular package the reference belongs to.
        reference: The concrete name or alias to compare against.

    Returns:
        A ``TyposquatCandidate`` when at least one similarity signal fires,
        otherwise ``None``.
    """
    reference_normalized = normalize_package_name(reference)
    # A name that *is* the reference (e.g. the alias "sklearn" or "bs4") is not
    # a typo of it; without this guard it would be reported with edit distance 0.
    if normalized == reference_normalized:
        return None

    candidate_compact = _compact(normalized)
    reference_compact = _compact(reference_normalized)
    candidate_homoglyph = _homoglyph_normalize(candidate_compact)
    reference_homoglyph = _homoglyph_normalize(reference_compact)

    signals: list[str] = []
    evidence: list[str] = []
    distance = _levenshtein(candidate_compact, reference_compact)
    max_length = max(len(candidate_compact), len(reference_compact), 1)
    similarity = 1 - (distance / max_length)

    if distance <= 2 and similarity >= 0.72:
        signals.append("edit_distance")
        evidence.append(f"Edit distance to popular package reference '{reference}' is {distance}.")
    if _is_adjacent_transposition(candidate_compact, reference_compact):
        signals.append("adjacent_transposition")
        evidence.append(f"Name appears to transpose adjacent characters from '{reference}'.")
        # Adjacent swaps are common typos even when edit-distance similarity underweights them.
        similarity = max(similarity, 0.92)
    if _is_single_missing_or_extra_character(candidate_compact, reference_compact):
        signals.append("missing_or_extra_character")
        evidence.append(f"Name differs from '{reference}' by one missing or extra character.")
        # Single missing/extra characters are strong typo signals for short package names.
        similarity = max(similarity, 0.88)
    if candidate_homoglyph == reference_homoglyph and candidate_compact != reference_compact:
        signals.append("homoglyph_or_lookalike")
        evidence.append(f"Name normalizes to the same lookalike form as '{reference}'.")
        # Lookalike normalization catches visual similarity that plain edit distance can miss.
        similarity = max(similarity, 0.95)

    if not signals:
        return None

    return TyposquatCandidate(
        package=package_name,
        normalized_package=normalized,
        possible_target=target,
        matched_reference=reference,
        similarity=round(similarity, 3),
        signals=sorted(set(signals)),
        is_possible_typosquat=True,
        recommendation="Possible typosquatting risk. Review package identity, source, maintainer, and purpose before use.",
        evidence=evidence,
    )
117
+
118
+
119
def _is_known_family_package(normalized: str, target: str) -> bool:
    """Return True for names shaped like ``<target>-something`` or ``something-<target>``."""
    base = normalize_package_name(target)
    as_prefix = f"{base}-"
    as_suffix = f"-{base}"
    if normalized.startswith(as_prefix):
        return True
    return normalized.endswith(as_suffix)
125
+
126
+
127
+ def _compact(value: str) -> str:
128
+ return re.sub(r"[-_.]+", "", value.lower())
129
+
130
+
131
def _homoglyph_normalize(value: str) -> str:
    """Map lookalike characters in ``value`` through ``HOMOGLYPH_TRANSLATION``."""
    folded = str.translate(value, HOMOGLYPH_TRANSLATION)
    return folded
133
+
134
+
135
+ def _is_adjacent_transposition(candidate: str, reference: str) -> bool:
136
+ if len(candidate) != len(reference) or candidate == reference:
137
+ return False
138
+ differences = [index for index, (left, right) in enumerate(zip(candidate, reference, strict=True)) if left != right]
139
+ if len(differences) != 2:
140
+ return False
141
+ first, second = differences
142
+ return second == first + 1 and candidate[first] == reference[second] and candidate[second] == reference[first]
143
+
144
+
145
+ def _is_single_missing_or_extra_character(candidate: str, reference: str) -> bool:
146
+ if abs(len(candidate) - len(reference)) != 1:
147
+ return False
148
+ shorter, longer = sorted((candidate, reference), key=len)
149
+ index = 0
150
+ skipped = False
151
+ for char in longer:
152
+ if index < len(shorter) and shorter[index] == char:
153
+ index += 1
154
+ elif skipped:
155
+ return False
156
+ else:
157
+ skipped = True
158
+ return True
159
+
160
+
161
+ def _levenshtein(left: str, right: str) -> int:
162
+ if left == right:
163
+ return 0
164
+ if not left:
165
+ return len(right)
166
+ if not right:
167
+ return len(left)
168
+
169
+ previous = list(range(len(right) + 1))
170
+ for left_index, left_char in enumerate(left, start=1):
171
+ current = [left_index]
172
+ for right_index, right_char in enumerate(right, start=1):
173
+ substitution_cost = 0 if left_char == right_char else 1
174
+ current.append(
175
+ min(
176
+ previous[right_index] + 1,
177
+ current[right_index - 1] + 1,
178
+ previous[right_index - 1] + substitution_cost,
179
+ )
180
+ )
181
+ previous = current
182
+ return previous[-1]
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
# Curated subset of commonly typosquatted PyPI targets, not an exhaustive index.
# Each target maps to the names it is compared under: the target name itself
# first, followed by any well-known aliases.
POPULAR_PACKAGE_REFERENCES: dict[str, tuple[str, ...]] = {
    target: (target, *aliases)
    for target, aliases in (
        ("requests", ()),
        ("numpy", ()),
        ("pandas", ()),
        ("django", ()),
        ("flask", ()),
        ("fastapi", ()),
        ("scikit-learn", ("sklearn",)),
        ("scipy", ()),
        ("matplotlib", ()),
        ("pytest", ()),
        ("setuptools", ()),
        ("pip", ()),
        ("wheel", ()),
        ("typer", ()),
        ("click", ()),
        ("rich", ()),
        ("pydantic", ()),
        ("httpx", ()),
        ("cryptography", ()),
        ("sqlalchemy", ()),
        ("beautifulsoup4", ("bs4",)),
        ("pillow", ("pil",)),
    )
}

# Names that are never reported as typosquats.
KNOWN_LEGITIMATE_FAMILIES: set[str] = {
    "types-requests",
    "pandas-stubs",
    "pytest-cov",
    "django-debug-toolbar",
}
@@ -0,0 +1,2 @@
1
+ """Known-vulnerability parsing and matching boundaries."""
2
+
@@ -0,0 +1,122 @@
1
+ from __future__ import annotations
2
+
3
+ from packaging.utils import canonicalize_name
4
+ from packaging.version import InvalidVersion, Version
5
+
6
+ from pkgwhy.core.models import Confidence, VulnerabilityMatch, VulnerabilityRecord, VulnerabilityRange
7
+
8
+
9
def match_vulnerabilities(package: str, version: str | None, records: list[VulnerabilityRecord]) -> list[VulnerabilityMatch]:
    """Match one package/version against advisory records.

    Without a version nothing is matched.  Only the first match per
    vulnerability id is kept; results are ordered by vulnerability id.
    """
    if version is None:
        return []
    first_match_by_id: dict[str, VulnerabilityMatch] = {}
    for record in records:
        found = match_vulnerability(package, version, record)
        if found is None or found.vulnerability_id in first_match_by_id:
            continue
        first_match_by_id[found.vulnerability_id] = found
    return [first_match_by_id[key] for key in sorted(first_match_by_id)]
19
+
20
+
21
def match_vulnerability(package: str, version: str, record: VulnerabilityRecord) -> VulnerabilityMatch | None:
    """Match a single advisory record against ``package`` at ``version``.

    The record must name the same package (compared via ``canonicalize_name``).
    An explicit affected-version listing is checked first, then the affected
    ranges in order; the first hit produces the match.  Returns ``None`` when
    nothing applies.
    """
    if canonicalize_name(record.package_name) != canonicalize_name(package):
        return None

    if _version_in_list(version, record.affected_versions):
        listed = f"Version {version} is explicitly listed as affected by {record.id}."
        return _build_match(record, package, version, [listed])

    hit = next(
        (candidate for candidate in record.affected_ranges if _version_in_range(version, candidate)),
        None,
    )
    if hit is None:
        return None

    ranged = (
        f"Version {version} matched affected range "
        f"introduced={hit.introduced or 'unknown'} "
        f"fixed={hit.fixed or 'none'} "
        f"last_affected={hit.last_affected or 'none'} "
        f"limit={hit.limit or 'none'}."
    )
    return _build_match(record, package, version, [ranged])
42
+
43
+
44
def _version_in_range(version: str, affected_range: VulnerabilityRange) -> bool:
    """Return True when ``version`` lies inside ``affected_range``.

    A range with no bounds at all, a range type other than ECOSYSTEM/PYPI
    (case-insensitive; a missing type is accepted), or any version string
    that fails to parse yields False.
    """
    introduced = affected_range.introduced
    fixed = affected_range.fixed
    last_affected = affected_range.last_affected
    limit = affected_range.limit
    if not (introduced or fixed or last_affected or limit):
        return False

    declared_type = affected_range.range_type
    if declared_type is not None and declared_type.upper() not in {"ECOSYSTEM", "PYPI"}:
        return False

    try:
        current = Version(version)
        # An absent, empty or "0" lower bound means "from the first release".
        if introduced not in {None, "", "0"} and current < Version(introduced):
            return False
        if fixed and current >= Version(fixed):
            return False
        if last_affected and current > Version(last_affected):
            return False
        if limit and current >= Version(limit):
            return False
    except InvalidVersion:
        return False
    return True
87
+
88
+
89
+ def _version_in_list(version: str, candidates: list[str]) -> bool:
90
+ if version in candidates:
91
+ return True
92
+ try:
93
+ parsed_version = Version(version)
94
+ except InvalidVersion:
95
+ return False
96
+ for candidate in candidates:
97
+ try:
98
+ if parsed_version == Version(candidate):
99
+ return True
100
+ except InvalidVersion:
101
+ continue
102
+ return False
103
+
104
+
105
def _build_match(record: VulnerabilityRecord, package: str, version: str, evidence: list[str]) -> VulnerabilityMatch:
    """Assemble a ``VulnerabilityMatch`` from ``record``.

    The advisory source (and URL, when present) is appended to the caller's
    ``evidence`` list in place before it is attached to the match.
    """
    provenance = [f"Advisory source: {record.source}."]
    if record.source_url:
        provenance.append(f"Advisory URL: {record.source_url}.")
    evidence.extend(provenance)

    fields = {
        "vulnerability_id": record.id,
        "package": package,
        "version": version,
        "aliases": record.aliases,
        "summary": record.summary,
        "severity": record.severity,
        "fixed_versions": record.fixed_versions,
        "references": record.references,
        "source": record.source,
        "source_url": record.source_url,
        "confidence": Confidence.MEDIUM,
        "evidence": evidence,
    }
    return VulnerabilityMatch(**fields)
@@ -0,0 +1,330 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from datetime import UTC, datetime
5
+ import hashlib
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from tempfile import NamedTemporaryFile
10
+ from typing import Any
11
+ from urllib import error, request
12
+
13
+ from pkgwhy.core.models import VulnerabilityRange, VulnerabilityRecord
14
+ from pkgwhy.metadata.installed import normalize_package_name
15
+
16
# Endpoint that receives the POSTed package/version query.
OSV_QUERY_URL: str = "https://api.osv.dev/v1/query"
# Source label stamped on parsed records and on cache documents.
OSV_SOURCE: str = "OSV.dev"
# Environment variable that overrides the cache root directory.
OSV_CACHE_ENV: str = "PKGWHY_CACHE_HOME"
19
+
20
+
21
@dataclass(frozen=True)
class OSVLookupResult:
    """Outcome of a cache-aware OSV lookup, including how fresh the data is."""

    # Advisory records parsed from the live or cached payload.
    records: list[VulnerabilityRecord]
    # Freshness label set by the lookup ("fresh", "stale_cache" or "unavailable").
    cache_status: str
    # Cache file associated with this lookup, if any.
    cache_path: Path | None = None
    # Human-readable caveats (lookup failures, stale data, cache write errors).
    warnings: tuple[str, ...] = ()
29
+
30
+
31
class OSVClientError(RuntimeError):
    """Signals that advisory data could not be retrieved from OSV.dev."""

    pass
33
+
34
+
35
def load_osv_records(path: Path, package_name: str | None = None) -> list[VulnerabilityRecord]:
    """Load OSV-like JSON from a local file without network access.

    Args:
        path: File containing an OSV response, a single vulnerability, or a
            list of vulnerabilities.
        package_name: Fallback package name passed through to the parser.

    Returns:
        The parsed vulnerability records.

    Raises:
        ValueError: If the file cannot be read, is not valid UTF-8, or is not
            valid JSON.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    # UnicodeDecodeError is listed explicitly so a badly encoded file gets the
    # same wrapped message as an unreadable or malformed one.
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise ValueError(f"Could not read vulnerability data from {path}: {exc}") from exc
    return parse_osv_payload(payload, package_name=package_name)
42
+
43
+
44
def parse_osv_payload(payload: dict[str, Any] | list[Any], package_name: str | None = None) -> list[VulnerabilityRecord]:
    """Parse a minimal OSV response or vulnerability list into internal records.

    Accepted shapes:
      * a query response: ``{"vulns": [...]}``
      * a single vulnerability: ``{"id": "...", ...}``
      * a bare list of vulnerabilities

    Any other JSON value (``null``, a number, a string, a dict without
    ``vulns``/``id``) yields an empty list instead of raising, because the
    payload may come from an arbitrary local file or a remote response.

    Args:
        payload: Decoded JSON value.
        package_name: Fallback package name for entries that do not name one.
    """
    if isinstance(payload, dict):
        if isinstance(payload.get("vulns"), list):
            vulnerabilities = payload["vulns"]
        elif isinstance(payload.get("id"), str):
            vulnerabilities = [payload]
        else:
            vulnerabilities = []
    elif isinstance(payload, list):
        vulnerabilities = payload
    else:
        # Scalars and null are valid JSON but carry no advisories.
        vulnerabilities = []

    records: list[VulnerabilityRecord] = []
    for item in vulnerabilities:
        if not isinstance(item, dict):
            continue
        records.extend(_records_from_vulnerability(item, package_name=package_name))
    return records
62
+
63
+
64
def query_osv(package_name: str, version: str | None, *, timeout_seconds: float = 10.0) -> list[VulnerabilityRecord]:
    """Fetch advisories for a package from OSV.dev and parse them.

    This performs a network request; it is only made when a caller invokes
    this function explicitly.
    """
    return parse_osv_payload(
        _fetch_osv_payload(package_name, version, timeout_seconds=timeout_seconds),
        package_name=package_name,
    )
68
+
69
+
70
def query_osv_cached(
    package_name: str,
    version: str | None,
    *,
    timeout_seconds: float = 10.0,
    cache_dir: Path | None = None,
) -> OSVLookupResult:
    """Look up advisories online, falling back to a cached response on failure.

    On a successful fetch the payload is written to the cache and the result
    is marked ``fresh``.  When the fetch fails, a matching cached payload is
    returned as ``stale_cache``; with no usable cache the result is
    ``unavailable`` with no records.  Failures are reported via ``warnings``.
    """
    cache_path = _cache_path(cache_dir or default_osv_cache_dir(), package_name, version)

    try:
        payload = _fetch_osv_payload(package_name, version, timeout_seconds=timeout_seconds)
    except OSVClientError as exc:
        stale_payload = _read_cached_payload(cache_path, package_name, version)
        outage_note = (
            f"OSV.dev lookup unavailable for {package_name} {version or 'unknown-version'}: {exc}. "
            "Missing vulnerability matches are not proof of safety."
        )
        if stale_payload is not None:
            return OSVLookupResult(
                records=parse_osv_payload(stale_payload, package_name=package_name),
                cache_status="stale_cache",
                cache_path=cache_path,
                warnings=(
                    outage_note,
                    "Using cached OSV.dev response. Cached advisory data may be stale.",
                ),
            )
        return OSVLookupResult(
            records=[],
            cache_status="unavailable",
            cache_path=cache_path,
            warnings=(outage_note,),
        )

    # Cache first, then parse: a cache-write problem only adds a warning.
    cache_warnings = _write_cached_payload(cache_path, package_name, version, payload)
    fresh_records = parse_osv_payload(payload, package_name=package_name)
    return OSVLookupResult(
        records=fresh_records,
        cache_status="fresh",
        cache_path=cache_path,
        warnings=tuple(cache_warnings),
    )
108
+
109
+
110
def default_osv_cache_dir() -> Path:
    """Compute the default OSV cache directory; nothing is created on disk.

    The root is the directory named by the ``OSV_CACHE_ENV`` environment
    variable when set and non-empty, otherwise ``~/.cache/pkgwhy``.
    """
    override = os.environ.get(OSV_CACHE_ENV)
    if override:
        base = Path(override).expanduser()
    else:
        base = Path.home() / ".cache" / "pkgwhy"
    return base.joinpath("vulnerabilities", "osv")
115
+
116
+
117
def _fetch_osv_payload(package_name: str, version: str | None, *, timeout_seconds: float) -> dict[str, Any]:
    """POST a PyPI package query to OSV.dev and return the decoded JSON body.

    Raises:
        OSVClientError: On any network, HTTP, decoding or JSON failure.
    """
    body: dict[str, Any] = {
        "package": {"name": package_name, "ecosystem": "PyPI"},
        # The version key is only sent when a version is known.
        **({"version": version} if version is not None else {}),
    }
    outgoing = request.Request(
        OSV_QUERY_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        method="POST",
    )
    try:
        with request.urlopen(outgoing, timeout=timeout_seconds) as response:
            raw = response.read()
        return json.loads(raw.decode("utf-8"))
    except (OSError, error.HTTPError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise OSVClientError(f"OSV.dev query failed for {package_name}: {exc}") from exc
133
+
134
+
135
def _cache_path(cache_dir: Path, package_name: str, version: str | None) -> Path:
    """Return the cache file path for one package/version lookup.

    The file name is ``<name>-<digest>.json`` where the digest is the first 16
    hex characters of SHA-256 over the normalized name and the version (or
    ``"no-version"``).

    Package names can originate from manifests and command-line input, so the
    name part is reduced to characters that cannot introduce a path separator;
    otherwise a name containing ``/`` or ``\\`` could place the cache file
    outside ``cache_dir``.
    """
    normalized = normalize_package_name(package_name)
    version_part = version or "no-version"
    digest = hashlib.sha256(f"{normalized}\0{version_part}".encode("utf-8")).hexdigest()[:16]
    # NOTE(review): presumably normalize_package_name already yields only
    # letters, digits and "-" for valid names, in which case existing cache
    # file names are unchanged — confirm against that helper.
    safe_name = "".join(
        char if char.isalnum() or char in "-_." else "_"
        for char in normalized
    )
    return cache_dir / f"{safe_name}-{digest}.json"
140
+
141
+
142
def _write_cached_payload(cache_path: Path, package_name: str, version: str | None, payload: dict[str, Any]) -> list[str]:
    """Atomically write an OSV payload to the cache (best effort).

    The document is written to a temporary file in the target directory,
    fsynced, and then moved over ``cache_path`` with ``os.replace``.

    Returns:
        An empty list on success, or a single warning string when the cache
        could not be written.  OS errors are never raised to the caller.
    """
    document = {
        "schema_version": "pkgwhy.osv_cache.v1",
        "source": OSV_SOURCE,
        "package": package_name,
        "version": version,
        "fetched_at": datetime.now(UTC).isoformat(),
        "payload": payload,
    }
    tmp_path: Path | None = None
    try:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(document, indent=2, sort_keys=True)
        with NamedTemporaryFile("w", dir=cache_path.parent, encoding="utf-8", delete=False) as tmp_file:
            # Remember the name before writing: the file is created with
            # delete=False, so a failed write/flush/fsync must still be able
            # to clean it up in the handler below.
            tmp_path = Path(tmp_file.name)
            tmp_file.write(serialized)
            tmp_file.flush()
            os.fsync(tmp_file.fileno())
        os.replace(tmp_path, cache_path)
    except OSError:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # Cleanup is best effort; the warning below is still returned.
                pass
        return ["Could not write OSV.dev cache."]
    return []
169
+
170
+
171
+ def _read_cached_payload(cache_path: Path, package_name: str, version: str | None) -> dict[str, Any] | None:
172
+ try:
173
+ document = json.loads(cache_path.read_text(encoding="utf-8"))
174
+ except (OSError, json.JSONDecodeError):
175
+ return None
176
+ if not isinstance(document, dict):
177
+ return None
178
+ if document.get("schema_version") != "pkgwhy.osv_cache.v1":
179
+ return None
180
+ if document.get("source") != OSV_SOURCE:
181
+ return None
182
+ cached_package = document.get("package")
183
+ if not isinstance(cached_package, str):
184
+ return None
185
+ if normalize_package_name(cached_package) != normalize_package_name(package_name):
186
+ return None
187
+ if document.get("version") != version:
188
+ return None
189
+ payload = document.get("payload")
190
+ return payload if isinstance(payload, dict) else None
191
+
192
+
193
def _records_from_vulnerability(item: dict[str, Any], package_name: str | None) -> list[VulnerabilityRecord]:
    """Turn one OSV vulnerability object into one record per affected entry.

    Entries are skipped when they are not dicts, when no package name can be
    determined (neither in the entry nor via ``package_name``), or when they
    declare an ecosystem other than PyPI/Python.  A vulnerability without a
    non-empty string ``id`` produces no records.
    """
    advisory_id = item.get("id")
    if not (isinstance(advisory_id, str) and advisory_id):
        return []

    raw_affected = item.get("affected")
    entries = raw_affected if isinstance(raw_affected, list) else []

    collected: list[VulnerabilityRecord] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        declared = entry.get("package")
        if not isinstance(declared, dict):
            declared = {}
        # Fall back to the caller-supplied name when the entry names none.
        name = _string_or_none(declared.get("name")) or package_name
        if not name:
            continue
        ecosystem = _string_or_none(declared.get("ecosystem"))
        if ecosystem and ecosystem.lower() not in {"pypi", "python"}:
            continue
        raw_ranges = entry.get("ranges")
        record = VulnerabilityRecord(
            id=advisory_id,
            aliases=_string_list(item.get("aliases")),
            package_name=normalize_package_name(name),
            ecosystem=ecosystem,
            summary=_string_or_none(item.get("summary")),
            details=_string_or_none(item.get("details")),
            severity=_parse_severity(item.get("severity")),
            affected_ranges=_parse_ranges(raw_ranges),
            affected_versions=_string_list(entry.get("versions")),
            fixed_versions=_fixed_versions_from_ranges(raw_ranges),
            references=_parse_references(item.get("references")),
            source=OSV_SOURCE,
            source_url=f"https://osv.dev/vulnerability/{advisory_id}",
        )
        collected.append(record)
    return collected
232
+
233
+
234
def _parse_ranges(value: Any) -> list[VulnerabilityRange]:
    """Convert OSV ``ranges`` entries into ``VulnerabilityRange`` objects.

    Events are walked in the order given.  An ``introduced`` event opens a
    range; the next ``fixed``, ``last_affected`` or ``limit`` event closes it.
    An ``introduced`` that is never closed becomes an open-ended range.
    Non-list input and malformed entries are ignored.

    A ``limit`` event that arrives when no range is open is ignored: it is an
    upper cap, not a fix marker, and emitting it as a range with no
    ``introduced`` bound would make the matcher treat every version below the
    limit as affected.
    """
    if not isinstance(value, list):
        return []
    ranges: list[VulnerabilityRange] = []
    for item in value:
        if not isinstance(item, dict):
            continue
        events = item.get("events")
        if not isinstance(events, list):
            continue
        range_type = _string_or_none(item.get("type"))
        introduced: str | None = None
        for event in events:
            if not isinstance(event, dict):
                continue
            if "introduced" in event:
                introduced = _string_or_none(event.get("introduced"))
            elif "fixed" in event:
                ranges.append(
                    VulnerabilityRange(
                        introduced=introduced,
                        fixed=_string_or_none(event.get("fixed")),
                        range_type=range_type,
                    )
                )
                introduced = None
            elif "last_affected" in event:
                ranges.append(
                    VulnerabilityRange(
                        introduced=introduced,
                        last_affected=_string_or_none(event.get("last_affected")),
                        range_type=range_type,
                    )
                )
                introduced = None
            elif "limit" in event:
                # Only cap a range that is actually open.
                if introduced is not None:
                    ranges.append(
                        VulnerabilityRange(
                            introduced=introduced,
                            limit=_string_or_none(event.get("limit")),
                            range_type=range_type,
                        )
                    )
                introduced = None
        if introduced is not None:
            ranges.append(VulnerabilityRange(introduced=introduced, range_type=range_type))
    return ranges
282
+
283
+
284
+ def _fixed_versions_from_ranges(value: Any) -> list[str]:
285
+ fixed: list[str] = []
286
+ if not isinstance(value, list):
287
+ return fixed
288
+ for item in value:
289
+ if not isinstance(item, dict):
290
+ continue
291
+ events = item.get("events")
292
+ if not isinstance(events, list):
293
+ continue
294
+ for event in events:
295
+ if isinstance(event, dict) and isinstance(event.get("fixed"), str):
296
+ fixed.append(event["fixed"])
297
+ return sorted(set(fixed))
298
+
299
+
300
+ def _parse_references(value: Any) -> list[str]:
301
+ if not isinstance(value, list):
302
+ return []
303
+ refs: list[str] = []
304
+ for item in value:
305
+ if isinstance(item, dict) and isinstance(item.get("url"), str):
306
+ refs.append(item["url"])
307
+ return refs
308
+
309
+
310
+ def _parse_severity(value: Any) -> list[str]:
311
+ if not isinstance(value, list):
312
+ return []
313
+ severities: list[str] = []
314
+ for item in value:
315
+ if not isinstance(item, dict):
316
+ continue
317
+ score = item.get("score")
318
+ if isinstance(score, str):
319
+ severities.append(score)
320
+ return severities
321
+
322
+
323
+ def _string_list(value: Any) -> list[str]:
324
+ if not isinstance(value, list):
325
+ return []
326
+ return [item for item in value if isinstance(item, str)]
327
+
328
+
329
+ def _string_or_none(value: Any) -> str | None:
330
+ return value if isinstance(value, str) and value else None