pkgwhy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pkgwhy/__init__.py +3 -0
- pkgwhy/__main__.py +6 -0
- pkgwhy/agent/__init__.py +2 -0
- pkgwhy/agent/judge.py +93 -0
- pkgwhy/cli.py +676 -0
- pkgwhy/core/__init__.py +2 -0
- pkgwhy/core/constants.py +13 -0
- pkgwhy/core/models.py +608 -0
- pkgwhy/dependencies/__init__.py +2 -0
- pkgwhy/dependencies/graph.py +68 -0
- pkgwhy/dependencies/reason.py +79 -0
- pkgwhy/dynamic/__init__.py +2 -0
- pkgwhy/dynamic/analysis.py +156 -0
- pkgwhy/explanations/__init__.py +2 -0
- pkgwhy/explanations/explain.py +47 -0
- pkgwhy/explanations/local_db.py +52 -0
- pkgwhy/imports/__init__.py +2 -0
- pkgwhy/imports/scanner.py +43 -0
- pkgwhy/inspection/__init__.py +2 -0
- pkgwhy/inspection/files.py +540 -0
- pkgwhy/inspection/python_static.py +323 -0
- pkgwhy/inspection/size.py +58 -0
- pkgwhy/inspection/text_patterns.py +135 -0
- pkgwhy/manifests/__init__.py +2 -0
- pkgwhy/manifests/lockfiles.py +51 -0
- pkgwhy/manifests/pyproject.py +37 -0
- pkgwhy/manifests/requirements.py +27 -0
- pkgwhy/metadata/__init__.py +2 -0
- pkgwhy/metadata/installed.py +83 -0
- pkgwhy/metadata/pypi.py +199 -0
- pkgwhy/policy/__init__.py +1 -0
- pkgwhy/policy/agent_policy.py +114 -0
- pkgwhy/policy/audit_log.py +60 -0
- pkgwhy/policy/tool_execution.py +76 -0
- pkgwhy/provenance/__init__.py +2 -0
- pkgwhy/provenance/installed.py +45 -0
- pkgwhy/registry/__init__.py +2 -0
- pkgwhy/registry/local.py +178 -0
- pkgwhy/registry/manifest.py +78 -0
- pkgwhy/registry/publish.py +142 -0
- pkgwhy/registry/run.py +148 -0
- pkgwhy/registry/tools.py +121 -0
- pkgwhy/reports/__init__.py +2 -0
- pkgwhy/reports/audit.py +81 -0
- pkgwhy/risk/__init__.py +5 -0
- pkgwhy/risk/rules.py +372 -0
- pkgwhy/risk/scoring.py +231 -0
- pkgwhy/typosquat/__init__.py +2 -0
- pkgwhy/typosquat/detector.py +182 -0
- pkgwhy/typosquat/popular_packages.py +34 -0
- pkgwhy/vulnerabilities/__init__.py +2 -0
- pkgwhy/vulnerabilities/matching.py +122 -0
- pkgwhy/vulnerabilities/osv.py +330 -0
- pkgwhy-1.0.0.dist-info/METADATA +688 -0
- pkgwhy-1.0.0.dist-info/RECORD +58 -0
- pkgwhy-1.0.0.dist-info/WHEEL +4 -0
- pkgwhy-1.0.0.dist-info/entry_points.txt +2 -0
- pkgwhy-1.0.0.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from pkgwhy.core.models import TyposquatCandidate
|
|
6
|
+
from pkgwhy.metadata.installed import normalize_package_name
|
|
7
|
+
from pkgwhy.typosquat.popular_packages import KNOWN_LEGITIMATE_FAMILIES, POPULAR_PACKAGE_REFERENCES
|
|
8
|
+
|
|
9
|
+
# Translation table that folds lookalike characters onto the ASCII letter they
# imitate. It is applied to names that `_compact` has already lowercased, so
# every script needs its *lowercase* code points listed here: a capital-only
# entry can never match after lowercasing.
HOMOGLYPH_TRANSLATION = str.maketrans(
    {
        # Digits and symbols commonly used as letter substitutes.
        "0": "o",
        "1": "l",
        "3": "e",
        "4": "a",
        "5": "s",
        "7": "t",
        "@": "a",
        "$": "s",
        # Cyrillic small letters.
        "\u0430": "a",
        "\u0435": "e",
        "\u043e": "o",
        "\u0440": "p",
        "\u0441": "c",
        "\u0445": "x",
        "\u0443": "y",
        # Greek capitals (only reachable when translating un-lowercased text).
        "\u0391": "a",
        "\u0395": "e",
        "\u039f": "o",
        "\u03a1": "p",
        "\u03a7": "x",
        # Greek small letters: what the capitals above become after str.lower().
        "\u03b1": "a",
        "\u03b5": "e",
        "\u03bf": "o",
        "\u03c1": "p",
        "\u03c7": "x",
    }
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def detect_typosquat(package_name: str) -> TyposquatCandidate | None:
    """Find the closest popular-package lookalike for one package name.

    Names listed in ``KNOWN_LEGITIMATE_FAMILIES`` are never flagged. For every
    other name, each popular target is skipped when the name *is* that target or
    belongs to its ``<target>-*`` / ``*-<target>`` family; otherwise the name is
    compared against each of the target's reference spellings.

    Returns:
        The candidate with the highest similarity (the first one found wins a
        tie), or ``None`` when nothing looks suspicious.
    """
    normalized = normalize_package_name(package_name)
    if normalized in KNOWN_LEGITIMATE_FAMILIES:
        return None

    strongest: TyposquatCandidate | None = None
    for target, references in POPULAR_PACKAGE_REFERENCES.items():
        is_target_itself = normalized == normalize_package_name(target)
        if is_target_itself or _is_known_family_package(normalized, target):
            continue
        for reference in references:
            found = _compare_to_reference(package_name, normalized, target, reference)
            if found is None:
                continue
            # Strict ">" keeps the earliest candidate when similarities tie.
            if strongest is None or found.similarity > strongest.similarity:
                strongest = found
    return strongest
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def detect_typosquats(package_names: list[str]) -> list[TyposquatCandidate]:
    """Run ``detect_typosquat`` over many names and rank the hits.

    Names without a signal are dropped. The rest are ordered by descending
    similarity, then by descending number of signals, then by package name.
    """
    flagged = [hit for hit in map(detect_typosquat, package_names) if hit is not None]
    flagged.sort(key=lambda hit: (-hit.similarity, -len(hit.signals), hit.package))
    return flagged
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _compare_to_reference(
    package_name: str,
    normalized: str,
    target: str,
    reference: str,
) -> TyposquatCandidate | None:
    """Score one package name against a single reference spelling of ``target``.

    Both names are reduced to their separator-free lowercase form and checked
    with four independent heuristics (edit distance, adjacent transposition,
    one missing/extra character, lookalike characters).

    Returns:
        ``None`` when no heuristic fires; otherwise a candidate whose similarity
        is the plain edit-distance ratio raised to the floor of every heuristic
        that matched, rounded to three decimals.
    """
    ours = _compact(normalized)
    theirs = _compact(normalize_package_name(reference))

    distance = _levenshtein(ours, theirs)
    edit_ratio = 1 - (distance / max(len(ours), len(theirs), 1))

    # One (signal, evidence, similarity floor) triple per heuristic that fired,
    # kept in the order the evidence is reported.
    findings: list[tuple[str, str, float]] = []

    # NOTE(review): identical compact forms give distance 0 and so also count
    # as an edit-distance hit -- confirm this is intended for exact alias names.
    if distance <= 2 and edit_ratio >= 0.72:
        findings.append(
            (
                "edit_distance",
                f"Edit distance to popular package reference '{reference}' is {distance}.",
                edit_ratio,
            )
        )
    if _is_adjacent_transposition(ours, theirs):
        # Adjacent swaps are common typos even when the edit ratio underweights them.
        findings.append(
            (
                "adjacent_transposition",
                f"Name appears to transpose adjacent characters from '{reference}'.",
                0.92,
            )
        )
    if _is_single_missing_or_extra_character(ours, theirs):
        # A single dropped/added character is a strong typo signal for short names.
        findings.append(
            (
                "missing_or_extra_character",
                f"Name differs from '{reference}' by one missing or extra character.",
                0.88,
            )
        )
    if ours != theirs and _homoglyph_normalize(ours) == _homoglyph_normalize(theirs):
        # Lookalike folding catches visual similarity that edit distance can miss.
        findings.append(
            (
                "homoglyph_or_lookalike",
                f"Name normalizes to the same lookalike form as '{reference}'.",
                0.95,
            )
        )

    if not findings:
        return None

    similarity = max(edit_ratio, *(floor for _, _, floor in findings))
    return TyposquatCandidate(
        package=package_name,
        normalized_package=normalized,
        possible_target=target,
        matched_reference=reference,
        similarity=round(similarity, 3),
        signals=sorted({signal for signal, _, _ in findings}),
        is_possible_typosquat=True,
        recommendation="Possible typosquatting risk. Review package identity, source, maintainer, and purpose before use.",
        evidence=[text for _, text, _ in findings],
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _is_known_family_package(normalized: str, target: str) -> bool:
    """Tell whether ``normalized`` is a ``<target>-...`` or ``...-<target>`` name.

    Such prefixed/suffixed names are treated as members of the target's package
    family rather than as misspellings of it.
    """
    family = normalize_package_name(target)
    if normalized.startswith(family + "-"):
        return True
    return normalized.endswith("-" + family)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _compact(value: str) -> str:
|
|
128
|
+
return re.sub(r"[-_.]+", "", value.lower())
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _homoglyph_normalize(value: str) -> str:
    """Replace lookalike characters in ``value`` using ``HOMOGLYPH_TRANSLATION``."""
    folded = str.translate(value, HOMOGLYPH_TRANSLATION)
    return folded
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _is_adjacent_transposition(candidate: str, reference: str) -> bool:
|
|
136
|
+
if len(candidate) != len(reference) or candidate == reference:
|
|
137
|
+
return False
|
|
138
|
+
differences = [index for index, (left, right) in enumerate(zip(candidate, reference, strict=True)) if left != right]
|
|
139
|
+
if len(differences) != 2:
|
|
140
|
+
return False
|
|
141
|
+
first, second = differences
|
|
142
|
+
return second == first + 1 and candidate[first] == reference[second] and candidate[second] == reference[first]
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _is_single_missing_or_extra_character(candidate: str, reference: str) -> bool:
|
|
146
|
+
if abs(len(candidate) - len(reference)) != 1:
|
|
147
|
+
return False
|
|
148
|
+
shorter, longer = sorted((candidate, reference), key=len)
|
|
149
|
+
index = 0
|
|
150
|
+
skipped = False
|
|
151
|
+
for char in longer:
|
|
152
|
+
if index < len(shorter) and shorter[index] == char:
|
|
153
|
+
index += 1
|
|
154
|
+
elif skipped:
|
|
155
|
+
return False
|
|
156
|
+
else:
|
|
157
|
+
skipped = True
|
|
158
|
+
return True
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _levenshtein(left: str, right: str) -> int:
|
|
162
|
+
if left == right:
|
|
163
|
+
return 0
|
|
164
|
+
if not left:
|
|
165
|
+
return len(right)
|
|
166
|
+
if not right:
|
|
167
|
+
return len(left)
|
|
168
|
+
|
|
169
|
+
previous = list(range(len(right) + 1))
|
|
170
|
+
for left_index, left_char in enumerate(left, start=1):
|
|
171
|
+
current = [left_index]
|
|
172
|
+
for right_index, right_char in enumerate(right, start=1):
|
|
173
|
+
substitution_cost = 0 if left_char == right_char else 1
|
|
174
|
+
current.append(
|
|
175
|
+
min(
|
|
176
|
+
previous[right_index] + 1,
|
|
177
|
+
current[right_index - 1] + 1,
|
|
178
|
+
previous[right_index - 1] + substitution_cost,
|
|
179
|
+
)
|
|
180
|
+
)
|
|
181
|
+
previous = current
|
|
182
|
+
return previous[-1]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# Hand-picked PyPI projects that typosquatters commonly imitate; this is a
# curated sample, not a complete index. Each key is the project name and the
# value lists every spelling to compare against: the name itself first, then
# any extra reference spellings.
POPULAR_PACKAGE_REFERENCES: dict[str, tuple[str, ...]] = {
    project: (project, *extra_references)
    for project, extra_references in (
        ("requests", ()),
        ("numpy", ()),
        ("pandas", ()),
        ("django", ()),
        ("flask", ()),
        ("fastapi", ()),
        ("scikit-learn", ("sklearn",)),
        ("scipy", ()),
        ("matplotlib", ()),
        ("pytest", ()),
        ("setuptools", ()),
        ("pip", ()),
        ("wheel", ()),
        ("typer", ()),
        ("click", ()),
        ("rich", ()),
        ("pydantic", ()),
        ("httpx", ()),
        ("cryptography", ()),
        ("sqlalchemy", ()),
        ("beautifulsoup4", ("bs4",)),
        ("pillow", ("pil",)),
    )
}

# Exact names that are exempt from typosquat detection altogether.
KNOWN_LEGITIMATE_FAMILIES: set[str] = {
    "types-requests",
    "pandas-stubs",
    "pytest-cov",
    "django-debug-toolbar",
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from packaging.utils import canonicalize_name
|
|
4
|
+
from packaging.version import InvalidVersion, Version
|
|
5
|
+
|
|
6
|
+
from pkgwhy.core.models import Confidence, VulnerabilityMatch, VulnerabilityRecord, VulnerabilityRange
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def match_vulnerabilities(package: str, version: str | None, records: list[VulnerabilityRecord]) -> list[VulnerabilityMatch]:
    """Collect the advisories in ``records`` that affect ``package`` at ``version``.

    Without a version nothing can be matched, so the result is empty. When
    several records share a vulnerability id only the first match is kept.
    The result is ordered by vulnerability id.
    """
    if version is None:
        return []

    by_id: dict[str, VulnerabilityMatch] = {}
    for record in records:
        hit = match_vulnerability(package, version, record)
        if hit is None or hit.vulnerability_id in by_id:
            continue
        by_id[hit.vulnerability_id] = hit
    return [by_id[vulnerability_id] for vulnerability_id in sorted(by_id)]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def match_vulnerability(package: str, version: str, record: VulnerabilityRecord) -> VulnerabilityMatch | None:
    """Match a single advisory record against one package/version pair.

    The record must name the same project (compared after canonicalization).
    An explicit entry in the record's affected-versions list wins; otherwise
    the first affected range containing the version is used.

    Returns:
        A match carrying the evidence for the hit, or ``None`` when the record
        does not apply.
    """
    same_project = canonicalize_name(record.package_name) == canonicalize_name(package)
    if not same_project:
        return None

    if _version_in_list(version, record.affected_versions):
        listed = f"Version {version} is explicitly listed as affected by {record.id}."
        return _build_match(record, package, version, [listed])

    affected_range = next(
        (candidate for candidate in record.affected_ranges if _version_in_range(version, candidate)),
        None,
    )
    if affected_range is None:
        return None

    in_range = (
        f"Version {version} matched affected range "
        f"introduced={affected_range.introduced or 'unknown'} "
        f"fixed={affected_range.fixed or 'none'} "
        f"last_affected={affected_range.last_affected or 'none'} "
        f"limit={affected_range.limit or 'none'}."
    )
    return _build_match(record, package, version, [in_range])
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _version_in_range(version: str, affected_range: VulnerabilityRange) -> bool:
    """Decide whether ``version`` lies inside ``affected_range``.

    The answer is deliberately ``False`` whenever the range cannot be evaluated
    with confidence: a range with no bounds at all, a range type other than
    ECOSYSTEM/PYPI (or unset), or any version/bound that fails to parse.
    """
    introduced = affected_range.introduced
    fixed = affected_range.fixed
    last_affected = affected_range.last_affected
    limit = affected_range.limit
    if not (introduced or fixed or last_affected or limit):
        return False

    kind = affected_range.range_type
    if kind is not None and kind.upper() not in {"ECOSYSTEM", "PYPI"}:
        return False

    try:
        current = Version(version)
        # Each pair is (raw bound, test that is True when `current` falls
        # outside the range on that side). "0" as `introduced` means "from the
        # very first release", so it imposes no lower bound.
        outside_checks = (
            (None if introduced in {None, "", "0"} else introduced, lambda bound: current < bound),
            (fixed, lambda bound: current >= bound),
            (last_affected, lambda bound: current > bound),
            (limit, lambda bound: current >= bound),
        )
        for raw_bound, is_outside in outside_checks:
            if raw_bound and is_outside(Version(raw_bound)):
                return False
    except InvalidVersion:
        return False
    return True
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _version_in_list(version: str, candidates: list[str]) -> bool:
    """Check whether ``version`` appears in ``candidates``.

    An exact string match is tried first; failing that, versions are compared
    after parsing so equivalent spellings still match. Candidates that do not
    parse are ignored, and an unparsable ``version`` matches nothing.
    """
    if version in candidates:
        return True
    try:
        wanted = Version(version)
    except InvalidVersion:
        return False

    def _same_release(raw: str) -> bool:
        # Unparsable advisory entries simply never match.
        try:
            return wanted == Version(raw)
        except InvalidVersion:
            return False

    return any(_same_release(raw) for raw in candidates)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _build_match(record: VulnerabilityRecord, package: str, version: str, evidence: list[str]) -> VulnerabilityMatch:
    """Assemble the match object for ``record``.

    The advisory source (and its URL, when present) is appended to the
    caller's ``evidence`` list in place before it is attached to the match.
    Confidence is always reported as medium.
    """
    provenance = [f"Advisory source: {record.source}."]
    if record.source_url:
        provenance.append(f"Advisory URL: {record.source_url}.")
    evidence.extend(provenance)

    return VulnerabilityMatch(
        vulnerability_id=record.id,
        package=package,
        version=version,
        summary=record.summary,
        severity=record.severity,
        aliases=record.aliases,
        references=record.references,
        fixed_versions=record.fixed_versions,
        source=record.source,
        source_url=record.source_url,
        evidence=evidence,
        confidence=Confidence.MEDIUM,
    )
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from tempfile import NamedTemporaryFile
|
|
10
|
+
from typing import Any
|
|
11
|
+
from urllib import error, request
|
|
12
|
+
|
|
13
|
+
from pkgwhy.core.models import VulnerabilityRange, VulnerabilityRecord
|
|
14
|
+
from pkgwhy.metadata.installed import normalize_package_name
|
|
15
|
+
|
|
16
|
+
# Endpoint that _fetch_osv_payload POSTs its JSON package/version query to.
OSV_QUERY_URL = "https://api.osv.dev/v1/query"
|
|
17
|
+
# Source label stamped on parsed records and on cache documents; cache files
# carrying a different source are rejected by _read_cached_payload.
OSV_SOURCE = "OSV.dev"
|
|
18
|
+
# Name of the environment variable that, when set, replaces ~/.cache/pkgwhy as
# the cache root in default_osv_cache_dir().
OSV_CACHE_ENV = "PKGWHY_CACHE_HOME"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class OSVLookupResult:
    """Outcome of a cache-aware OSV lookup, tagged with how fresh the data is."""

    # Advisory records parsed from the live response or from the cached copy.
    records: list[VulnerabilityRecord]
    # One of "fresh", "stale_cache" or "unavailable", as set by query_osv_cached.
    cache_status: str
    # Cache file that was written or consulted for this lookup, when known.
    cache_path: Path | None = None
    # Human-readable caveats (lookup failures, stale data, cache write errors).
    warnings: tuple[str, ...] = ()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class OSVClientError(RuntimeError):
    """Signals that an OSV.dev query failed and no advisory data was retrieved."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def load_osv_records(path: Path, package_name: str | None = None) -> list[VulnerabilityRecord]:
    """Load OSV-like JSON from a local file without network access.

    Args:
        path: File containing an OSV response, a single vulnerability, or a
            list of vulnerabilities, encoded as UTF-8 JSON.
        package_name: Fallback package name for affected entries that do not
            name a package themselves.

    Returns:
        The advisory records parsed from the file.

    Raises:
        ValueError: If the file cannot be read, is not valid UTF-8, or is not
            valid JSON.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    # UnicodeDecodeError is listed explicitly: it is neither an OSError nor a
    # JSONDecodeError, so a non-UTF-8 file would otherwise escape unwrapped.
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise ValueError(f"Could not read vulnerability data from {path}: {exc}") from exc
    return parse_osv_payload(payload, package_name=package_name)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def parse_osv_payload(payload: dict[str, Any] | list[Any], package_name: str | None = None) -> list[VulnerabilityRecord]:
    """Parse a minimal OSV response or vulnerability list into internal records.

    Accepted shapes:
        * a query response: ``{"vulns": [...]}``;
        * a single vulnerability: a dict with a string ``"id"``;
        * a bare list of vulnerability dicts.

    Any other payload (an unrecognised dict, or a JSON scalar/``null``) yields
    an empty list instead of failing, and non-dict list items are skipped.

    Args:
        payload: Decoded JSON document.
        package_name: Fallback package name for affected entries that do not
            name a package themselves.
    """
    if isinstance(payload, dict):
        vulns = payload.get("vulns")
        if isinstance(vulns, list):
            vulnerabilities = vulns
        elif isinstance(payload.get("id"), str):
            vulnerabilities = [payload]
        else:
            vulnerabilities = []
    elif isinstance(payload, (list, tuple)):
        vulnerabilities = payload
    else:
        # Valid JSON may also decode to None, a number or a string; none of
        # those can carry vulnerabilities, and iterating them would crash.
        vulnerabilities = []

    records: list[VulnerabilityRecord] = []
    for item in vulnerabilities:
        if isinstance(item, dict):
            records.extend(_records_from_vulnerability(item, package_name=package_name))
    return records
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def query_osv(package_name: str, version: str | None, *, timeout_seconds: float = 10.0) -> list[VulnerabilityRecord]:
    """Fetch advisories for a package from OSV.dev and parse them into records.

    This always performs a network request; deciding whether network access is
    permitted is the caller's responsibility.
    """
    return parse_osv_payload(
        _fetch_osv_payload(package_name, version, timeout_seconds=timeout_seconds),
        package_name=package_name,
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def query_osv_cached(
    package_name: str,
    version: str | None,
    *,
    timeout_seconds: float = 10.0,
    cache_dir: Path | None = None,
) -> OSVLookupResult:
    """Query OSV.dev, falling back to a previously cached response on failure.

    Outcomes, reported through ``cache_status``:
        * ``"fresh"``: the live query succeeded; the response is written to the
          cache (write problems surface as warnings only).
        * ``"stale_cache"``: the live query failed but a matching cached
          response was found and is used, with a staleness warning.
        * ``"unavailable"``: the live query failed and no usable cache exists;
          the record list is empty.
    """
    cache_path = _cache_path(cache_dir or default_osv_cache_dir(), package_name, version)

    try:
        payload = _fetch_osv_payload(package_name, version, timeout_seconds=timeout_seconds)
    except OSVClientError as exc:
        warning = (
            f"OSV.dev lookup unavailable for {package_name} {version or 'unknown-version'}: {exc}. "
            "Missing vulnerability matches are not proof of safety."
        )
        cached = _read_cached_payload(cache_path, package_name, version)
        if cached is not None:
            return OSVLookupResult(
                records=parse_osv_payload(cached, package_name=package_name),
                cache_status="stale_cache",
                cache_path=cache_path,
                warnings=(
                    warning,
                    "Using cached OSV.dev response. Cached advisory data may be stale.",
                ),
            )
        return OSVLookupResult(
            records=[],
            cache_status="unavailable",
            cache_path=cache_path,
            warnings=(warning,),
        )
    else:
        # Persist first, then parse: the cache keeps the raw response.
        write_warnings = _write_cached_payload(cache_path, package_name, version, payload)
        return OSVLookupResult(
            records=parse_osv_payload(payload, package_name=package_name),
            cache_status="fresh",
            cache_path=cache_path,
            warnings=tuple(write_warnings),
        )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def default_osv_cache_dir() -> Path:
    """Compute the default OSV cache directory; nothing is created on disk.

    The root is the value of the ``OSV_CACHE_ENV`` environment variable (with
    ``~`` expanded) when set and non-empty, otherwise ``~/.cache/pkgwhy``.
    """
    override = os.environ.get(OSV_CACHE_ENV)
    if override:
        base = Path(override).expanduser()
    else:
        base = Path.home() / ".cache" / "pkgwhy"
    return base.joinpath("vulnerabilities", "osv")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _fetch_osv_payload(package_name: str, version: str | None, *, timeout_seconds: float) -> dict[str, Any]:
    """POST a PyPI package query to OSV.dev and return the decoded JSON body.

    Args:
        package_name: Package to look up in the PyPI ecosystem.
        version: Specific version to query; omitted from the request when None.
        timeout_seconds: Timeout passed to ``urlopen``.

    Raises:
        OSVClientError: On any transport, HTTP-protocol, decoding or JSON
            error, so callers have a single failure type to handle.
    """
    # Imported locally: the module header does not import http.client.
    from http.client import HTTPException

    query: dict[str, Any] = {"package": {"name": package_name, "ecosystem": "PyPI"}}
    if version is not None:
        query["version"] = version
    data = json.dumps(query).encode("utf-8")
    req = request.Request(
        OSV_QUERY_URL,
        data=data,
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        method="POST",
    )
    try:
        with request.urlopen(req, timeout=timeout_seconds) as response:
            return json.loads(response.read().decode("utf-8"))
    # HTTPException (e.g. IncompleteRead, BadStatusLine) is not an OSError, so
    # it must be listed explicitly or a truncated/garbled response would
    # escape as a non-OSVClientError.
    except (OSError, error.HTTPError, HTTPException, UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise OSVClientError(f"OSV.dev query failed for {package_name}: {exc}") from exc
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _cache_path(cache_dir: Path, package_name: str, version: str | None) -> Path:
    """Build the cache file path for one package/version lookup.

    The file name is the normalized package name followed by the first 16 hex
    digits of a SHA-256 over the name and version, so each version gets its own
    file. A missing or empty version is keyed as ``"no-version"``.
    """
    name = normalize_package_name(package_name)
    version_label = version or "no-version"
    key = "\0".join((name, version_label))
    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
    return cache_dir / f"{name}-{digest}.json"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _write_cached_payload(cache_path: Path, package_name: str, version: str | None, payload: dict[str, Any]) -> list[str]:
    """Atomically store an OSV response in the cache.

    The response is wrapped in a small envelope (schema version, source,
    package, version, fetch time), written to a temporary file in the target
    directory, flushed to disk, and then moved over ``cache_path``.

    Returns:
        An empty list on success, or a single warning message when the cache
        could not be written. Failures never raise: caching is best-effort.
    """
    document = {
        "schema_version": "pkgwhy.osv_cache.v1",
        "source": OSV_SOURCE,
        "package": package_name,
        "version": version,
        "fetched_at": datetime.now(UTC).isoformat(),
        "payload": payload,
    }
    tmp_path: Path | None = None
    try:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(document, indent=2, sort_keys=True)
        with NamedTemporaryFile("w", dir=cache_path.parent, encoding="utf-8", delete=False) as tmp_file:
            # Remember the path before writing: the file is created with
            # delete=False, so a failed write/fsync must still be cleaned up.
            tmp_path = Path(tmp_file.name)
            tmp_file.write(serialized)
            tmp_file.flush()
            os.fsync(tmp_file.fileno())
        os.replace(tmp_path, cache_path)
    except OSError:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; the write failure is already reported below.
                pass
        return ["Could not write OSV.dev cache."]
    return []
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _read_cached_payload(cache_path: Path, package_name: str, version: str | None) -> dict[str, Any] | None:
|
|
172
|
+
try:
|
|
173
|
+
document = json.loads(cache_path.read_text(encoding="utf-8"))
|
|
174
|
+
except (OSError, json.JSONDecodeError):
|
|
175
|
+
return None
|
|
176
|
+
if not isinstance(document, dict):
|
|
177
|
+
return None
|
|
178
|
+
if document.get("schema_version") != "pkgwhy.osv_cache.v1":
|
|
179
|
+
return None
|
|
180
|
+
if document.get("source") != OSV_SOURCE:
|
|
181
|
+
return None
|
|
182
|
+
cached_package = document.get("package")
|
|
183
|
+
if not isinstance(cached_package, str):
|
|
184
|
+
return None
|
|
185
|
+
if normalize_package_name(cached_package) != normalize_package_name(package_name):
|
|
186
|
+
return None
|
|
187
|
+
if document.get("version") != version:
|
|
188
|
+
return None
|
|
189
|
+
payload = document.get("payload")
|
|
190
|
+
return payload if isinstance(payload, dict) else None
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _records_from_vulnerability(item: dict[str, Any], package_name: str | None) -> list[VulnerabilityRecord]:
    """Expand one OSV vulnerability dict into one record per affected package entry.

    A vulnerability without a non-empty string ``"id"`` produces nothing.
    Affected entries are skipped when they are not dicts, when neither the
    entry nor ``package_name`` supplies a package name, or when they declare an
    ecosystem other than PyPI/Python.
    """
    vulnerability_id = item.get("id")
    if not (isinstance(vulnerability_id, str) and vulnerability_id):
        return []

    raw_affected = item.get("affected")
    entries = raw_affected if isinstance(raw_affected, list) else []

    parsed: list[VulnerabilityRecord] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        package_info = entry.get("package")
        if not isinstance(package_info, dict):
            package_info = {}

        # Prefer the name in the advisory; fall back to the queried package.
        affected_name = _string_or_none(package_info.get("name")) or package_name
        if not affected_name:
            continue

        ecosystem = _string_or_none(package_info.get("ecosystem"))
        if ecosystem is not None and ecosystem.lower() not in {"pypi", "python"}:
            continue

        range_specs = entry.get("ranges")
        record = VulnerabilityRecord(
            id=vulnerability_id,
            aliases=_string_list(item.get("aliases")),
            package_name=normalize_package_name(affected_name),
            ecosystem=ecosystem,
            summary=_string_or_none(item.get("summary")),
            details=_string_or_none(item.get("details")),
            severity=_parse_severity(item.get("severity")),
            affected_ranges=_parse_ranges(range_specs),
            affected_versions=_string_list(entry.get("versions")),
            fixed_versions=_fixed_versions_from_ranges(range_specs),
            references=_parse_references(item.get("references")),
            source=OSV_SOURCE,
            source_url=f"https://osv.dev/vulnerability/{vulnerability_id}",
        )
        parsed.append(record)
    return parsed
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _parse_ranges(value: Any) -> list[VulnerabilityRange]:
    """Convert OSV ``ranges`` entries into flat ``VulnerabilityRange`` objects.

    Events are walked in order. An ``introduced`` event is remembered as the
    pending lower bound; each ``fixed``, ``last_affected`` or ``limit`` event
    closes a range using that pending bound (which may be unset) and clears it.
    A lower bound still pending after the last event becomes an open-ended
    range. Malformed entries (non-dict items, non-list events) are skipped.
    """
    if not isinstance(value, list):
        return []

    parsed: list[VulnerabilityRange] = []
    for spec in value:
        if not isinstance(spec, dict):
            continue
        events = spec.get("events")
        if not isinstance(events, list):
            continue

        range_type = _string_or_none(spec.get("type"))
        pending: str | None = None
        for event in events:
            if not isinstance(event, dict):
                continue
            if "introduced" in event:
                pending = _string_or_none(event.get("introduced"))
            elif "fixed" in event:
                parsed.append(
                    VulnerabilityRange(
                        introduced=pending,
                        fixed=_string_or_none(event.get("fixed")),
                        range_type=range_type,
                    )
                )
                pending = None
            else:
                # Both keys are checked independently, in this order: an event
                # carrying both yields two ranges, the second with no lower bound.
                for bound in ("last_affected", "limit"):
                    if bound not in event:
                        continue
                    parsed.append(
                        VulnerabilityRange(
                            introduced=pending,
                            range_type=range_type,
                            **{bound: _string_or_none(event.get(bound))},
                        )
                    )
                    pending = None
        if pending is not None:
            parsed.append(VulnerabilityRange(introduced=pending, range_type=range_type))
    return parsed
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _fixed_versions_from_ranges(value: Any) -> list[str]:
|
|
285
|
+
fixed: list[str] = []
|
|
286
|
+
if not isinstance(value, list):
|
|
287
|
+
return fixed
|
|
288
|
+
for item in value:
|
|
289
|
+
if not isinstance(item, dict):
|
|
290
|
+
continue
|
|
291
|
+
events = item.get("events")
|
|
292
|
+
if not isinstance(events, list):
|
|
293
|
+
continue
|
|
294
|
+
for event in events:
|
|
295
|
+
if isinstance(event, dict) and isinstance(event.get("fixed"), str):
|
|
296
|
+
fixed.append(event["fixed"])
|
|
297
|
+
return sorted(set(fixed))
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _parse_references(value: Any) -> list[str]:
|
|
301
|
+
if not isinstance(value, list):
|
|
302
|
+
return []
|
|
303
|
+
refs: list[str] = []
|
|
304
|
+
for item in value:
|
|
305
|
+
if isinstance(item, dict) and isinstance(item.get("url"), str):
|
|
306
|
+
refs.append(item["url"])
|
|
307
|
+
return refs
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _parse_severity(value: Any) -> list[str]:
|
|
311
|
+
if not isinstance(value, list):
|
|
312
|
+
return []
|
|
313
|
+
severities: list[str] = []
|
|
314
|
+
for item in value:
|
|
315
|
+
if not isinstance(item, dict):
|
|
316
|
+
continue
|
|
317
|
+
score = item.get("score")
|
|
318
|
+
if isinstance(score, str):
|
|
319
|
+
severities.append(score)
|
|
320
|
+
return severities
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _string_list(value: Any) -> list[str]:
|
|
324
|
+
if not isinstance(value, list):
|
|
325
|
+
return []
|
|
326
|
+
return [item for item in value if isinstance(item, str)]
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _string_or_none(value: Any) -> str | None:
|
|
330
|
+
return value if isinstance(value, str) and value else None
|