pkgwhy 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. pkgwhy/__init__.py +3 -0
  2. pkgwhy/__main__.py +6 -0
  3. pkgwhy/agent/__init__.py +2 -0
  4. pkgwhy/agent/judge.py +93 -0
  5. pkgwhy/cli.py +676 -0
  6. pkgwhy/core/__init__.py +2 -0
  7. pkgwhy/core/constants.py +13 -0
  8. pkgwhy/core/models.py +608 -0
  9. pkgwhy/dependencies/__init__.py +2 -0
  10. pkgwhy/dependencies/graph.py +68 -0
  11. pkgwhy/dependencies/reason.py +79 -0
  12. pkgwhy/dynamic/__init__.py +2 -0
  13. pkgwhy/dynamic/analysis.py +156 -0
  14. pkgwhy/explanations/__init__.py +2 -0
  15. pkgwhy/explanations/explain.py +47 -0
  16. pkgwhy/explanations/local_db.py +52 -0
  17. pkgwhy/imports/__init__.py +2 -0
  18. pkgwhy/imports/scanner.py +43 -0
  19. pkgwhy/inspection/__init__.py +2 -0
  20. pkgwhy/inspection/files.py +540 -0
  21. pkgwhy/inspection/python_static.py +323 -0
  22. pkgwhy/inspection/size.py +58 -0
  23. pkgwhy/inspection/text_patterns.py +135 -0
  24. pkgwhy/manifests/__init__.py +2 -0
  25. pkgwhy/manifests/lockfiles.py +51 -0
  26. pkgwhy/manifests/pyproject.py +37 -0
  27. pkgwhy/manifests/requirements.py +27 -0
  28. pkgwhy/metadata/__init__.py +2 -0
  29. pkgwhy/metadata/installed.py +83 -0
  30. pkgwhy/metadata/pypi.py +199 -0
  31. pkgwhy/policy/__init__.py +1 -0
  32. pkgwhy/policy/agent_policy.py +114 -0
  33. pkgwhy/policy/audit_log.py +60 -0
  34. pkgwhy/policy/tool_execution.py +76 -0
  35. pkgwhy/provenance/__init__.py +2 -0
  36. pkgwhy/provenance/installed.py +45 -0
  37. pkgwhy/registry/__init__.py +2 -0
  38. pkgwhy/registry/local.py +178 -0
  39. pkgwhy/registry/manifest.py +78 -0
  40. pkgwhy/registry/publish.py +142 -0
  41. pkgwhy/registry/run.py +148 -0
  42. pkgwhy/registry/tools.py +121 -0
  43. pkgwhy/reports/__init__.py +2 -0
  44. pkgwhy/reports/audit.py +81 -0
  45. pkgwhy/risk/__init__.py +5 -0
  46. pkgwhy/risk/rules.py +372 -0
  47. pkgwhy/risk/scoring.py +231 -0
  48. pkgwhy/typosquat/__init__.py +2 -0
  49. pkgwhy/typosquat/detector.py +182 -0
  50. pkgwhy/typosquat/popular_packages.py +34 -0
  51. pkgwhy/vulnerabilities/__init__.py +2 -0
  52. pkgwhy/vulnerabilities/matching.py +122 -0
  53. pkgwhy/vulnerabilities/osv.py +330 -0
  54. pkgwhy-1.0.0.dist-info/METADATA +688 -0
  55. pkgwhy-1.0.0.dist-info/RECORD +58 -0
  56. pkgwhy-1.0.0.dist-info/WHEEL +4 -0
  57. pkgwhy-1.0.0.dist-info/entry_points.txt +2 -0
  58. pkgwhy-1.0.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,323 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import re
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ from pkgwhy.core.models import PythonStaticAnalysis
9
+ from pkgwhy.risk.rules import make_rule_evidence
10
+
11
+ MAX_STATIC_FILES = 200
12
+ MAX_STATIC_FILE_BYTES = 1_000_000
13
+
14
+ IMPORT_CAPABILITIES = {
15
+ "os": "Filesystem access signals",
16
+ "pathlib": "Filesystem access signals",
17
+ "shutil": "Filesystem access signals",
18
+ "glob": "Filesystem access signals",
19
+ "socket": "Network access signals",
20
+ "ssl": "Network access signals",
21
+ "urllib": "Network access signals",
22
+ "http.client": "Network access signals",
23
+ "requests": "Network access signals",
24
+ "httpx": "Network access signals",
25
+ "subprocess": "Subprocess or shell execution signals",
26
+ "pty": "Subprocess or shell execution signals",
27
+ "pickle": "Deserialisation risk signals",
28
+ "marshal": "Deserialisation risk signals",
29
+ "shelve": "Deserialisation risk signals",
30
+ "dill": "Deserialisation risk signals",
31
+ "cloudpickle": "Deserialisation risk signals",
32
+ "base64": "Encoded payload handling signals",
33
+ "zlib": "Encoded payload handling signals",
34
+ "importlib": "Dynamic import signals",
35
+ }
36
+
37
+ CALL_CAPABILITIES = {
38
+ "eval": "Dynamic code execution signals",
39
+ "exec": "Dynamic code execution signals",
40
+ "compile": "Dynamic code execution signals",
41
+ "__import__": "Dynamic import signals",
42
+ "importlib.import_module": "Dynamic import signals",
43
+ "subprocess.run": "Subprocess or shell execution signals",
44
+ "subprocess.Popen": "Subprocess or shell execution signals",
45
+ "subprocess.call": "Subprocess or shell execution signals",
46
+ "subprocess.check_call": "Subprocess or shell execution signals",
47
+ "subprocess.check_output": "Subprocess or shell execution signals",
48
+ "os.system": "Subprocess or shell execution signals",
49
+ "os.popen": "Subprocess or shell execution signals",
50
+ "os.execv": "Subprocess or shell execution signals",
51
+ "os.execve": "Subprocess or shell execution signals",
52
+ "os.spawnv": "Subprocess or shell execution signals",
53
+ "os.getenv": "Environment variable access signals",
54
+ "pickle.load": "Deserialisation risk signals",
55
+ "pickle.loads": "Deserialisation risk signals",
56
+ "marshal.loads": "Deserialisation risk signals",
57
+ "dill.loads": "Deserialisation risk signals",
58
+ "cloudpickle.loads": "Deserialisation risk signals",
59
+ "base64.b64decode": "Encoded payload handling signals",
60
+ "zlib.decompress": "Encoded payload handling signals",
61
+ "yaml.load": "Deserialisation risk signals",
62
+ }
63
+
64
+ CALL_RULE_IDS = {
65
+ "eval": "PKGWHY-PY-001",
66
+ "exec": "PKGWHY-PY-001",
67
+ "compile": "PKGWHY-PY-001",
68
+ "__import__": "PKGWHY-PY-002",
69
+ "importlib.import_module": "PKGWHY-PY-002",
70
+ "pickle.load": "PKGWHY-PY-003",
71
+ "pickle.loads": "PKGWHY-PY-003",
72
+ "marshal.loads": "PKGWHY-PY-003",
73
+ "dill.loads": "PKGWHY-PY-003",
74
+ "cloudpickle.loads": "PKGWHY-PY-003",
75
+ "base64.b64decode": "PKGWHY-PY-004",
76
+ "zlib.decompress": "PKGWHY-PY-004",
77
+ "subprocess.run": "PKGWHY-PY-005",
78
+ "subprocess.Popen": "PKGWHY-PY-005",
79
+ "subprocess.call": "PKGWHY-PY-005",
80
+ "subprocess.check_call": "PKGWHY-PY-005",
81
+ "subprocess.check_output": "PKGWHY-PY-005",
82
+ "os.system": "PKGWHY-PY-005",
83
+ "os.popen": "PKGWHY-PY-005",
84
+ "os.execv": "PKGWHY-PY-005",
85
+ "os.execve": "PKGWHY-PY-005",
86
+ "os.spawnv": "PKGWHY-PY-005",
87
+ "os.getenv": "PKGWHY-PY-006",
88
+ "yaml.load": "PKGWHY-PY-008",
89
+ }
90
+
91
+ CREDENTIAL_PATTERNS = tuple(
92
+ re.compile(pattern)
93
+ for pattern in (
94
+ r"(^|[^a-z0-9])api[_-]?key([^a-z0-9]|$)",
95
+ r"(^|[^a-z0-9])token([^a-z0-9]|$)",
96
+ r"(^|[^a-z0-9])secret([^a-z0-9]|$)",
97
+ r"(^|[^a-z0-9])password([^a-z0-9]|$)",
98
+ r"(^|[^a-z0-9])credential([^a-z0-9]|$)",
99
+ )
100
+ )
101
+
102
+ PACKAGE_MANAGER_PATTERN = re.compile(
103
+ r"(\bpython\s+-m\s+pip\b|\bpip3?\s+(install|uninstall|remove)|\buv\s+(add|remove|sync|pip)|\bpoetry\s+(add|remove|install))"
104
+ )
105
+ LARGE_ENCODED_LITERAL_PATTERN = re.compile(r"^[A-Za-z0-9+/]{120,}={0,2}$")
106
+ PYTHON_OBFUSCATION_BOOTSTRAP_PATTERN = re.compile(
107
+ r"(__pyarmor__|pyarmor_runtime|pytransform|__armor_enter__|__armor_exit__)",
108
+ re.IGNORECASE,
109
+ )
110
+
111
+
112
+ @dataclass(frozen=True)
113
+ class PythonSignal:
114
+ capability: str
115
+ detail: str
116
+ rule_id: str | None = None
117
+ line_number: int | None = None
118
+ symbol: str | None = None
119
+
120
+
121
def analyze_python_files(paths: list[Path]) -> PythonStaticAnalysis:
    """Statically scan Python sources and collect capability signals.

    Only paths whose suffix is ``.py`` are considered, and at most
    ``MAX_STATIC_FILES`` of them. Files larger than ``MAX_STATIC_FILE_BYTES``
    and files that cannot be read or parsed are skipped with a warning; a
    single bad file never aborts the scan.

    Args:
        paths: Candidate file paths; non-``.py`` entries are ignored.

    Returns:
        A ``PythonStaticAnalysis`` with the sorted capability names, the
        collected warnings, at most 100 evidence strings, at most 100
        rule-evidence records and the number of files parsed successfully.
    """
    capabilities: set[str] = set()
    warnings: list[str] = []
    evidence: list[str] = []
    rule_evidence = []
    files_scanned = 0

    python_paths = [item for item in paths if item.suffix == ".py"]
    for path in python_paths[:MAX_STATIC_FILES]:
        try:
            if path.stat().st_size > MAX_STATIC_FILE_BYTES:
                warnings.append(f"Skipped large Python file during static scan: {path.name}")
                continue
            source = path.read_text(encoding="utf-8")
            tree = ast.parse(source, filename=str(path))
        except (OSError, SyntaxError, ValueError, RecursionError) as exc:
            # ValueError covers UnicodeDecodeError as well as the "source code
            # string cannot contain null bytes" error some interpreters raise;
            # RecursionError is raised for pathologically nested source. The
            # files being scanned are untrusted, so none of these may escape.
            warnings.append(f"Could not statically parse Python file {path.name}: {exc.__class__.__name__}")
            continue

        files_scanned += 1
        for signal in _capabilities_from_tree(tree):
            capabilities.add(signal.capability)
            # Include the line number in the location only when the node had one.
            location = f"{path.name}:{signal.line_number}" if signal.line_number else path.name
            evidence.append(f"{signal.capability}: {location} references {signal.detail}")
            if signal.rule_id:
                rule_evidence.append(
                    make_rule_evidence(
                        signal.rule_id,
                        message=f"{signal.capability}: {signal.detail}.",
                        evidence=[f"{location} references {signal.detail}."],
                        file_path=path.name,
                        line_number=signal.line_number,
                        symbol=signal.symbol or signal.detail,
                    )
                )

    return PythonStaticAnalysis(
        detected_capabilities=sorted(capabilities),
        warnings=warnings,
        # Both evidence lists are capped to keep the report bounded.
        evidence=evidence[:100],
        rule_evidence=rule_evidence[:100],
        files_scanned=files_scanned,
    )
164
+
165
+
166
def _capabilities_from_tree(tree: ast.AST) -> list[PythonSignal]:
    """Walk ``tree`` and return the de-duplicated signals it contains.

    Signals come from imports, calls, ``os.environ`` attribute access, string
    constants and identifiers. The result preserves first-seen order.
    """
    detected: dict[tuple[str, str, int | None], PythonSignal] = {}

    def record(node, capability, detail, rule_id=None, symbol=None):
        # Every signal takes its line number from the node that produced it;
        # the symbol defaults to the detail text.
        _add_signal(
            detected,
            PythonSignal(
                capability=capability,
                detail=detail,
                rule_id=rule_id,
                line_number=getattr(node, "lineno", None),
                symbol=detail if symbol is None else symbol,
            ),
        )

    for node in ast.walk(tree):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            for imported_name in _imported_names(node):
                import_capability = _capability_for_import(imported_name)
                if import_capability:
                    record(node, import_capability, imported_name)
            continue

        if isinstance(node, ast.Call):
            call_name = _call_name(node.func)
            call_capability = CALL_CAPABILITIES.get(call_name)
            if call_capability:
                record(node, call_capability, call_name, rule_id=CALL_RULE_IDS.get(call_name))
            # A call can be both a subprocess signal and a package-manager signal.
            if _is_package_manager_call(call_name, node):
                record(
                    node,
                    "Package manager manipulation signals",
                    call_name,
                    rule_id="PKGWHY-PY-007",
                )
            continue

        if isinstance(node, ast.Attribute):
            attr_name = _call_name(node)
            if attr_name == "os.environ":
                record(
                    node,
                    "Environment variable access signals",
                    attr_name,
                    rule_id="PKGWHY-PY-006",
                )
            continue

        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            text = node.value
            lowered = text.lower()
            if any(pattern.search(lowered) for pattern in CREDENTIAL_PATTERNS):
                record(
                    node,
                    "Credential or token access patterns",
                    "string literal containing credential-like token",
                    rule_id="PKGWHY-PY-006",
                    symbol="credential-like string literal",
                )
            if LARGE_ENCODED_LITERAL_PATTERN.fullmatch(text.strip()):
                record(
                    node,
                    "Encoded payload handling signals",
                    "large encoded-looking string literal",
                    rule_id="PKGWHY-PY-004",
                )
            if PYTHON_OBFUSCATION_BOOTSTRAP_PATTERN.search(text):
                record(
                    node,
                    "Python obfuscation signals",
                    "obfuscation-bootstrap string literal",
                    rule_id="PKGWHY-PY-009",
                )
            continue

        if isinstance(node, ast.Name) and PYTHON_OBFUSCATION_BOOTSTRAP_PATTERN.search(node.id):
            record(node, "Python obfuscation signals", node.id, rule_id="PKGWHY-PY-009")

    return list(detected.values())
267
+
268
+
269
+ def _imported_names(node: ast.Import | ast.ImportFrom) -> list[str]:
270
+ if isinstance(node, ast.Import):
271
+ return [alias.name for alias in node.names]
272
+ if node.module is None:
273
+ return []
274
+ return [node.module]
275
+
276
+
277
def _capability_for_import(imported_name: str) -> str | None:
    """Look up the capability label for an imported module name.

    The full dotted name is tried first, then its top-level package; ``None``
    is returned when neither appears in ``IMPORT_CAPABILITIES``.
    """
    if imported_name in IMPORT_CAPABILITIES:
        return IMPORT_CAPABILITIES[imported_name]
    top_level = imported_name.split(".", 1)[0]
    return IMPORT_CAPABILITIES.get(top_level)
285
+
286
+
287
+ def _call_name(node: ast.AST) -> str:
288
+ if isinstance(node, ast.Name):
289
+ return node.id
290
+ if isinstance(node, ast.Attribute):
291
+ base = _call_name(node.value)
292
+ return f"{base}.{node.attr}" if base else node.attr
293
+ return ""
294
+
295
+
296
+ def _add_signal(detected: dict[tuple[str, str, int | None], PythonSignal], signal: PythonSignal) -> None:
297
+ detected.setdefault((signal.capability, signal.detail, signal.line_number), signal)
298
+
299
+
300
def _is_package_manager_call(call_name: str, node: ast.Call) -> bool:
    """Tell whether a call invokes a package manager.

    ``pip.main`` always qualifies. Process-spawning calls qualify when their
    literal string arguments, joined with spaces, match
    ``PACKAGE_MANAGER_PATTERN``.
    """
    if call_name == "pip.main":
        return True

    process_launchers = (
        "subprocess.run",
        "subprocess.Popen",
        "subprocess.call",
        "subprocess.check_call",
        "subprocess.check_output",
        "os.system",
    )
    if call_name in process_launchers:
        command_text = " ".join(_literal_text_args(node))
        return PACKAGE_MANAGER_PATTERN.search(command_text) is not None
    return False
314
+
315
+
316
+ def _literal_text_args(node: ast.Call) -> list[str]:
317
+ values: list[str] = []
318
+ for arg in node.args:
319
+ if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
320
+ values.append(arg.value)
321
+ elif isinstance(arg, (ast.List, ast.Tuple)):
322
+ values.extend(item.value for item in arg.elts if isinstance(item, ast.Constant) and isinstance(item.value, str))
323
+ return values
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ import heapq
4
+ from importlib.metadata import Distribution
5
+ from pathlib import Path
6
+
7
+ from pkgwhy.core.models import LargestFile, PackageSize
8
+
9
+ NATIVE_SUFFIXES = {".so", ".pyd", ".dll", ".dylib", ".a", ".lib", ".exe", ".wasm"}
10
+ JAVASCRIPT_SUFFIXES = {".js", ".mjs", ".cjs"}
11
+
12
+
13
def measure_distribution_size(dist: Distribution | None) -> PackageSize:
    """Measure the on-disk footprint of an installed distribution.

    Each recorded file that resolves to a regular file is sized and sorted
    into a Python, native-binary, JavaScript or "other" bucket by lower-cased
    suffix. Entries that cannot be located or stat'ed are skipped.

    Args:
        dist: The distribution to measure, or ``None``.

    Returns:
        A default ``PackageSize`` when ``dist`` or its file list is missing;
        otherwise one populated with the byte totals, the file count and the
        five largest files.
    """
    if dist is None or dist.files is None:
        return PackageSize()

    bytes_by_kind = {"python": 0, "native": 0, "javascript": 0, "other": 0}
    measured: list[LargestFile] = []

    for entry in dist.files:
        try:
            located = Path(dist.locate_file(entry))
        except (OSError, ValueError):
            continue
        if not located.is_file():
            continue
        try:
            size = located.stat().st_size
        except OSError:
            continue

        extension = located.suffix.lower()
        if extension == ".py":
            kind = "python"
        elif extension in NATIVE_SUFFIXES:
            kind = "native"
        elif extension in JAVASCRIPT_SUFFIXES:
            kind = "javascript"
        else:
            kind = "other"
        bytes_by_kind[kind] += size
        # The recorded path is the distribution-relative entry, not the resolved path.
        measured.append(LargestFile(path=str(entry), size_bytes=size))

    return PackageSize(
        total_bytes=sum(bytes_by_kind.values()),
        python_bytes=bytes_by_kind["python"],
        native_binary_bytes=bytes_by_kind["native"],
        javascript_bytes=bytes_by_kind["javascript"],
        other_bytes=bytes_by_kind["other"],
        file_count=len(measured),
        largest_files=heapq.nlargest(5, measured, key=lambda item: item.size_bytes),
    )
@@ -0,0 +1,135 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from urllib.parse import urlparse
6
+
7
+ from pkgwhy.core.models import FileStaticAnalysis
8
+ from pkgwhy.risk.rules import make_rule_evidence
9
+
10
+ MAX_TEXT_PATTERN_BYTES = 500_000
11
+ TEXT_PATTERN_SUFFIXES = {
12
+ ".cfg",
13
+ ".ini",
14
+ ".js",
15
+ ".json",
16
+ ".md",
17
+ ".py",
18
+ ".toml",
19
+ ".txt",
20
+ ".yaml",
21
+ ".yml",
22
+ }
23
+
24
+ URL_PATTERN = re.compile(r"https?://[^\s'\"<>)\]}]+", re.IGNORECASE)
25
+ CREDENTIAL_ASSIGNMENT_PATTERN = re.compile(
26
+ r"(?<![A-Za-z0-9_])['\"]?"
27
+ r"(?P<name>(?:[A-Za-z_][A-Za-z0-9]*[_-])*(?:api[_-]?key|token|secret|password|credential)"
28
+ r"(?:[_-](?:value|id|key|token|secret|password|credential))*)"
29
+ r"['\"]?(?![A-Za-z0-9_])"
30
+ r"\s*(?:=|:\s*(?:[A-Za-z_][A-Za-z0-9_\[\], .|]*\s*=)?)\s*"
31
+ r"(?P<quote>['\"])"
32
+ r"(?P<value>[A-Za-z0-9_\-./+=]{8,})"
33
+ r"(?P=quote)",
34
+ re.IGNORECASE,
35
+ )
36
+
37
+
38
def analyze_text_patterns(path: Path) -> FileStaticAnalysis:
    """Extract conservative URL/domain and credential-like text signals.

    The file is read through ``_read_small_text``; when that yields ``None``
    an empty ``FileStaticAnalysis`` is returned. Otherwise each line is
    searched for ``http``/``https`` URLs and for credential-like assignments.
    Credential values are never copied into the result, only the matched name.

    Args:
        path: File to inspect.

    Returns:
        A ``FileStaticAnalysis`` with the sorted capabilities, evidence
        strings, rule-evidence records, and the de-duplicated URL, domain and
        credential references in first-seen order.
    """

    source = _read_small_text(path)
    if source is None:
        return FileStaticAnalysis()

    url_references: list[str] = []
    domain_references: list[str] = []
    credential_references: list[str] = []
    capabilities: set[str] = set()
    evidence: list[str] = []
    rule_evidence = []

    for line_number, line in enumerate(source.splitlines(), start=1):
        for match in URL_PATTERN.finditer(line):
            raw_url = match.group(0)
            try:
                sanitized_url = _sanitize_url(raw_url)
                domain = _domain_from_url(raw_url)
            except ValueError:
                # URL_PATTERN stops at "]", so a bracketed IPv6 URL such as
                # "http://[::1]:80" is captured as "http://[::1", which
                # urlparse rejects with ValueError. Scanned text is untrusted:
                # skip the fragment instead of aborting the whole file.
                continue
            if not sanitized_url or not domain:
                continue
            capabilities.add("URL or domain references")
            _append_unique(url_references, sanitized_url)
            _append_unique(domain_references, domain)
            evidence.append(f"URL/domain reference in {path.name}:{line_number}: {domain}.")
            rule_evidence.append(
                make_rule_evidence(
                    "PKGWHY-NET-001",
                    message=f"Source text references URL/domain {domain}.",
                    evidence=[f"{path.name}:{line_number} references domain {domain}."],
                    file_path=path.name,
                    line_number=line_number,
                    symbol=domain,
                )
            )

        for match in CREDENTIAL_ASSIGNMENT_PATTERN.finditer(line):
            credential_name = match.group("name")
            capabilities.add("Credential or token access patterns")
            # Only the name is reported; the matched value is deliberately masked.
            reference = f"{path.name}:{line_number}:{credential_name}=(masked)"
            _append_unique(credential_references, reference)
            evidence.append(f"Credential-like assignment in {path.name}:{line_number}: {credential_name}=(masked).")
            rule_evidence.append(
                make_rule_evidence(
                    "PKGWHY-CRED-001",
                    message=f"Credential-like assignment references {credential_name}; value masked.",
                    evidence=[f"{path.name}:{line_number} contains {credential_name}=(masked)."],
                    file_path=path.name,
                    line_number=line_number,
                    symbol=credential_name,
                )
            )

    return FileStaticAnalysis(
        detected_capabilities=sorted(capabilities),
        evidence=evidence,
        rule_evidence=rule_evidence,
        url_references=url_references,
        domain_references=domain_references,
        credential_references=credential_references,
    )
98
+
99
+
100
def is_text_pattern_candidate(path: Path) -> bool:
    """Return True when ``path`` should be scanned for text patterns.

    A file qualifies when its lower-cased suffix is in
    ``TEXT_PATTERN_SUFFIXES`` or its name is one of the well-known packaging
    files.
    """
    if path.suffix.lower() in TEXT_PATTERN_SUFFIXES:
        return True
    return path.name in {"setup.py", "setup.cfg", "pyproject.toml"}
102
+
103
+
104
def _read_small_text(path: Path) -> str | None:
    """Read ``path`` as UTF-8 text, or return ``None`` when it should be skipped.

    ``None`` is returned for files larger than ``MAX_TEXT_PATTERN_BYTES``, for
    files that cannot be stat'ed or read, and for files that are not valid
    UTF-8.
    """
    try:
        too_large = path.stat().st_size > MAX_TEXT_PATTERN_BYTES
        return None if too_large else path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None
111
+
112
+
113
+ def _sanitize_url(raw_url: str) -> str | None:
114
+ cleaned = raw_url.rstrip(".,;:")
115
+ parsed = urlparse(cleaned)
116
+ if parsed.scheme not in {"http", "https"} or not parsed.netloc:
117
+ return None
118
+ host = parsed.hostname
119
+ if not host:
120
+ return None
121
+ path = "/..." if parsed.path and parsed.path != "/" else ""
122
+ return f"{parsed.scheme}://{host.lower()}{path}"
123
+
124
+
125
+ def _domain_from_url(raw_url: str) -> str | None:
126
+ parsed = urlparse(raw_url.rstrip(".,;:"))
127
+ if parsed.scheme not in {"http", "https"}:
128
+ return None
129
+ host = parsed.hostname
130
+ return host.lower() if host else None
131
+
132
+
133
+ def _append_unique(values: list[str], value: str) -> None:
134
+ if value not in values:
135
+ values.append(value)
@@ -0,0 +1,2 @@
1
+ """Project manifest parsers."""
2
+
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import tomllib
5
+ from pathlib import Path
6
+
7
+ from pkgwhy.metadata.installed import normalize_package_name
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def read_uv_lock_dependencies(path: Path) -> set[str]:
    """Return the package names that ``_read_toml_lock_packages`` finds in a ``uv.lock`` file."""
    package_names = _read_toml_lock_packages(path)
    return package_names
14
+
15
+
16
def read_poetry_lock_dependencies(path: Path) -> set[str]:
    """Return the package names that ``_read_toml_lock_packages`` finds in a ``poetry.lock`` file."""
    package_names = _read_toml_lock_packages(path)
    return package_names
18
+
19
+
20
def read_lockfile_dependencies(project_root: Path) -> dict[str, set[str]]:
    """Read the supported lockfiles directly under ``project_root``.

    Returns:
        A mapping from lockfile name (``"uv.lock"``, ``"poetry.lock"``) to the
        package names it lists. Lockfiles that yield no names are left out.
    """
    parsed = {
        "uv.lock": read_uv_lock_dependencies(project_root / "uv.lock"),
        "poetry.lock": read_poetry_lock_dependencies(project_root / "poetry.lock"),
    }
    return {filename: names for filename, names in parsed.items() if names}
31
+
32
+
33
def _read_toml_lock_packages(path: Path) -> set[str]:
    """Collect normalized package names from a TOML lockfile's ``package`` array.

    Reading is best-effort: a missing file, an unreadable file, a file that
    is not valid UTF-8 or not valid TOML all produce an empty set. Parse
    failures are logged at debug level.

    Args:
        path: Lockfile to read.

    Returns:
        The names of every ``package`` entry that is a table with a string
        ``name``, passed through ``normalize_package_name``.
    """
    if not path.exists():
        return set()
    try:
        data = tomllib.loads(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, tomllib.TOMLDecodeError) as exc:
        # read_text raises UnicodeDecodeError for non-UTF-8 bytes; without
        # catching it a corrupt lockfile would crash the caller.
        logger.debug("Unable to parse lockfile %s: %s", path, exc)
        return set()

    packages = data.get("package", [])
    if not isinstance(packages, list):
        return set()

    names: set[str] = set()
    for package in packages:
        if not isinstance(package, dict):
            continue
        name = package.get("name")
        if isinstance(name, str):
            names.add(normalize_package_name(name))
    return names
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ import tomllib
4
+ from pathlib import Path
5
+
6
+ from packaging.requirements import InvalidRequirement, Requirement
7
+ from packaging.utils import canonicalize_name
8
+
9
+
10
+ def read_pyproject_dependencies(path: Path) -> set[str]:
11
+ if not path.exists():
12
+ return set()
13
+ try:
14
+ data = tomllib.loads(path.read_text(encoding="utf-8"))
15
+ except (OSError, tomllib.TOMLDecodeError):
16
+ return set()
17
+
18
+ names: set[str] = set()
19
+ project = data.get("project", {})
20
+ for dependency in project.get("dependencies", []):
21
+ _add_requirement_name(names, dependency)
22
+
23
+ optional = project.get("optional-dependencies", {})
24
+ if isinstance(optional, dict):
25
+ for dependencies in optional.values():
26
+ if not isinstance(dependencies, list):
27
+ continue
28
+ for dependency in dependencies:
29
+ _add_requirement_name(names, dependency)
30
+ return names
31
+
32
+
33
+ def _add_requirement_name(names: set[str], value: str) -> None:
34
+ try:
35
+ names.add(canonicalize_name(Requirement(value).name))
36
+ except InvalidRequirement:
37
+ return
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from packaging.requirements import InvalidRequirement, Requirement
6
+ from packaging.utils import canonicalize_name
7
+
8
+
9
def read_requirements_dependencies(path: Path) -> set[str]:
    """Return the canonical names of packages listed in a requirements file.

    Comments are stripped at the first ``#``. Blank lines, option lines
    (starting with ``-``) and direct URL / VCS references are skipped, as are
    lines that do not parse as a requirement. Reading is best-effort: a
    missing, unreadable or non-UTF-8 file yields an empty set.

    Args:
        path: Path to the requirements file.

    Returns:
        The set of canonicalized requirement names.
    """
    if not path.exists():
        return set()
    names: set[str] = set()
    try:
        lines = path.read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeDecodeError):
        # read_text raises UnicodeDecodeError for non-UTF-8 bytes; treat an
        # undecodable file like an unreadable one instead of crashing.
        return names

    skipped_prefixes = ("-", "http:", "https:", "git+", "svn+", "hg+", "bzr+", "file://")
    for line in lines:
        cleaned = line.split("#", 1)[0].strip()
        if not cleaned or cleaned.startswith(skipped_prefixes):
            continue
        try:
            names.add(canonicalize_name(Requirement(cleaned).name))
        except InvalidRequirement:
            continue
    return names
@@ -0,0 +1,2 @@
1
+ """Installed package metadata readers."""
2
+