ca9 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ca9/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.1"
File without changes
@@ -0,0 +1,182 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import importlib.metadata
5
+ import re
6
+ from pathlib import Path
7
+
8
# Distribution names whose import name cannot be derived by swapping "-" for
# "_". Keys are lowercase with "-" as the only separator; values are stored
# lowercase as well.
PYPI_TO_IMPORT: dict[str, str] = {
    "beautifulsoup4": "bs4",
    "dateutil": "dateutil",
    "django-rest-framework": "rest_framework",
    "djangorestframework": "rest_framework",
    "elasticsearch-dsl": "elasticsearch_dsl",
    "google-api-python-client": "googleapiclient",
    "google-auth": "google.auth",
    "google-cloud-storage": "google.cloud.storage",
    "jinja2": "jinja2",
    "msgpack-python": "msgpack",
    "opencv-python": "cv2",
    "opencv-python-headless": "cv2",
    "pillow": "pil",
    "protobuf": "google.protobuf",
    "pyasn1": "pyasn1",
    "pycryptodome": "crypto",
    "pyjwt": "jwt",
    "pymongo": "pymongo",
    "pyopenssl": "openssl",
    "python-dateutil": "dateutil",
    "python-dotenv": "dotenv",
    "python-jose": "jose",
    "python-multipart": "multipart",
    "pyyaml": "yaml",
    "scikit-learn": "sklearn",
    "sentry-sdk": "sentry_sdk",
    "setuptools": "setuptools",
    "typing-extensions": "typing_extensions",
    "websocket-client": "websocket",
}


def pypi_to_import_name(package_name: str) -> str:
    """Translate a distribution name into its (lowercase) import name.

    The name is looked up in ``PYPI_TO_IMPORT`` first as-is (lowercased) and
    then with runs of ``-``, ``_`` and ``.`` collapsed to a single ``-``, so
    spellings such as ``python_dateutil`` or ``Typing.Extensions`` hit the
    same table entry as their canonical form.

    Args:
        package_name: Distribution name in any case / separator spelling.

    Returns:
        The mapped import name, or the lowercased name with ``-`` replaced
        by ``_`` when the table has no entry.
    """
    lower = package_name.lower()
    if lower in PYPI_TO_IMPORT:
        return PYPI_TO_IMPORT[lower]

    # Table keys only use "-"; equivalent separator spellings must not miss.
    normalized = re.sub(r"[-_.]+", "-", lower)
    if normalized in PYPI_TO_IMPORT:
        return PYPI_TO_IMPORT[normalized]

    # Unknown package: keep the historical fallback (dots are left untouched).
    return lower.replace("-", "_")
46
+
47
+
48
def collect_imports_from_source(source: str) -> set[str]:
    """Collect the absolute module names imported by a piece of Python source.

    For ``import a.b`` the name ``a.b`` is recorded. For ``from a import b``
    both ``a`` and ``a.b`` are recorded (``*`` is never appended). Relative
    imports (``from .x import y``) are skipped: they name modules of the
    scanned project itself, and recording them as top-level names would make
    a local ``yaml.py`` look like the third-party ``yaml`` package.

    Args:
        source: Python source text.

    Returns:
        The set of dotted names; empty when the source cannot be parsed.
    """
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        # ValueError covers sources with embedded NUL bytes on interpreters
        # that do not report them as SyntaxError.
        return set()

    imports: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            imports.update(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module and not node.level:
            imports.add(node.module)
            imports.update(
                f"{node.module}.{alias.name}"
                for alias in node.names
                if alias.name != "*"
            )
    return imports
65
+
66
+
67
# Directory names whose contents are never scanned for imports.
_EXCLUDED_DIRS = {
    ".venv",
    "venv",
    ".env",
    "env",
    "node_modules",
    ".git",
    "__pycache__",
    ".tox",
    ".nox",
    ".eggs",
    ".mypy_cache",
    "site-packages",
    "dist-packages",
}


def collect_imports_from_repo(repo_path: Path) -> set[str]:
    """Gather the imports of every ``*.py`` file below *repo_path*.

    A file is skipped when any of its ancestor directories (relative to
    *repo_path*) is named in ``_EXCLUDED_DIRS`` or when it cannot be read.
    Files are decoded as UTF-8 with undecodable bytes dropped.

    Args:
        repo_path: Root directory of the repository to scan.

    Returns:
        The union of ``collect_imports_from_source`` over all scanned files.
    """
    found: set[str] = set()
    for candidate in repo_path.rglob("*.py"):
        ancestor_names = {
            parent.name for parent in candidate.relative_to(repo_path).parents
        }
        if not ancestor_names.isdisjoint(_EXCLUDED_DIRS):
            continue
        try:
            text = candidate.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        found |= collect_imports_from_source(text)
    return found
95
+
96
+
97
def is_package_imported(package_name: str, repo_imports: set[str]) -> bool:
    """Tell whether any repository import refers to *package_name*.

    The package's import name is compared case-insensitively against every
    entry of *repo_imports*. An entry counts when it equals the import name,
    lies below it (``pkg.sub`` for ``pkg``), or is a dotted ancestor of it
    (``google`` for ``google.auth``).

    Args:
        package_name: Distribution name of the package.
        repo_imports: Dotted module names imported by the repository.

    Returns:
        True on the first matching import, otherwise False.
    """
    target = pypi_to_import_name(package_name).lower()
    below_target = target + "."

    for raw in repo_imports:
        candidate = raw.lower()
        if (
            candidate == target
            or candidate.startswith(below_target)
            or target.startswith(candidate + ".")
        ):
            return True
    return False
109
+
110
+
111
def is_submodule_imported(
    submodule_paths: tuple[str, ...],
    repo_imports: set[str],
) -> tuple[bool, str | None]:
    """Find a repository import that touches one of the given submodules.

    Comparison is case-insensitive. For each submodule path, in order, an
    import matches when it equals the path, lies below it, or — provided the
    import itself is dotted — is an ancestor of it. A bare top-level import
    such as ``django`` therefore does not match ``django.contrib.admin``.

    Args:
        submodule_paths: Dotted submodule paths to look for.
        repo_imports: Dotted module names imported by the repository.

    Returns:
        ``(True, import_as_written)`` for the first match, else
        ``(False, None)``.
    """
    pairs = [(written, written.lower()) for written in repo_imports]

    for submod in submodule_paths:
        wanted = submod.lower()
        below_wanted = wanted + "."
        for written, lowered in pairs:
            same_or_child = lowered == wanted or lowered.startswith(below_wanted)
            dotted_ancestor = "." in lowered and wanted.startswith(lowered + ".")
            if same_or_child or dotted_ancestor:
                return True, written
    return False, None
126
+
127
+
128
# Leading distribution name of a requirement string: starts and ends with an
# alphanumeric, with ".", "_" and "-" allowed in between.
_REQ_NAME_RE = re.compile(r"^([A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?)")


def _parse_requirement_name(req_str: str) -> str | None:
    """Extract the distribution name at the start of a requirement string.

    Args:
        req_str: Requirement text such as ``"requests>=2.0"``; surrounding
            whitespace is ignored.

    Returns:
        The leading name, or None when the string does not start with one.
    """
    found = _REQ_NAME_RE.match(req_str.strip())
    if found is None:
        return None
    return found.group(1)
134
+
135
+
136
def _get_direct_deps(package_name: str) -> list[str]:
    """List the non-extra requirements declared by an installed distribution.

    Requirement strings containing an ``extra ==`` / ``extra==`` marker are
    ignored, as are strings from which no name can be parsed.

    Args:
        package_name: Name of the installed distribution.

    Returns:
        Requirement names in declaration order; empty when the distribution
        is not installed or declares no requirements.
    """
    try:
        requirements = importlib.metadata.requires(package_name)
    except importlib.metadata.PackageNotFoundError:
        return []
    if requirements is None:
        return []

    names = []
    for requirement in requirements:
        if any(marker in requirement for marker in ("extra ==", "extra==")):
            continue
        parsed = _parse_requirement_name(requirement)
        if parsed:
            names.append(parsed)
    return names
151
+
152
+
153
def resolve_transitive_deps(repo_imports: set[str]) -> dict[str, str]:
    """Map indirectly required distributions to the imported root needing them.

    Every installed distribution that ``is_package_imported`` reports as
    imported is a root. The requirement graph below each root is walked
    depth-first; each requirement that is not itself a root is recorded
    under the first root that reaches it.

    Args:
        repo_imports: Dotted module names imported by the repository.

    Returns:
        ``{lowercased requirement name: root distribution name}``. Empty when
        enumerating the installed distributions raises.
    """
    try:
        directly_imported: set[str] = {
            name
            for dist in importlib.metadata.distributions()
            if (name := dist.metadata["Name"])
            and is_package_imported(name, repo_imports)
        }
    except Exception:
        return {}

    direct_lower = {root.lower() for root in directly_imported}
    transitive: dict[str, str] = {}
    seen: set[str] = set()

    def _descend(current: str, root: str) -> None:
        # Each distribution is expanded at most once across all roots.
        marker = current.lower()
        if marker in seen:
            return
        seen.add(marker)
        for requirement in _get_direct_deps(current):
            requirement_key = requirement.lower()
            is_new = requirement_key not in transitive
            if is_new and requirement_key not in direct_lower:
                transitive[requirement_key] = root
            _descend(requirement, root)

    for root in directly_imported:
        _descend(root, root)

    return transitive
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from ca9.analysis.ast_scanner import pypi_to_import_name
7
+
8
+
9
def load_coverage(coverage_path: Path) -> dict:
    """Load a JSON coverage report from disk.

    The file is read as bytes and handed to ``json.loads``, which detects
    UTF-8/16/32 itself (including a leading UTF-8 BOM), so the result does
    not depend on the platform's default text encoding.

    Args:
        coverage_path: Path of the JSON report.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(coverage_path.read_bytes())
11
+
12
+
13
def get_covered_files(coverage_data: dict) -> dict[str, list[int]]:
    """Reduce a coverage report to the files that had lines executed.

    Args:
        coverage_data: Decoded report; per-file entries are read from its
            ``"files"`` mapping and their ``"executed_lines"`` value.

    Returns:
        ``{file path: executed line list}`` for every file whose
        ``"executed_lines"`` is present and non-empty, in report order.
    """
    per_file = coverage_data.get("files", {})
    return {
        path: lines
        for path, info in per_file.items()
        if (lines := info.get("executed_lines", []))
    }
21
+
22
+
23
def is_package_executed(
    package_name: str,
    covered_files: dict[str, list[int]],
) -> tuple[bool, list[str]]:
    """Find covered files that belong to the given package.

    The package's import name is turned into a path fragment (dots become
    slashes). A covered path — lowercased, with backslashes turned into
    slashes — matches when it contains the fragment directly below
    ``site-packages/`` or ends with the fragment as a module file or as a
    package ``__init__.py``.

    Args:
        package_name: Distribution name of the package.
        covered_files: Covered file paths mapped to their executed lines.

    Returns:
        ``(any match?, matching paths as given in covered_files)``.
    """
    fragment = pypi_to_import_name(package_name).replace(".", "/")
    infix_markers = (
        f"site-packages/{fragment}/",
        f"site-packages/{fragment}.py",
    )
    suffix_markers = (
        f"/{fragment}/__init__.py",
        f"/{fragment}.py",
    )

    hits: list[str] = []
    for path in covered_files:
        posix_lower = path.replace("\\", "/").lower()
        if any(marker in posix_lower for marker in infix_markers):
            hits.append(path)
        elif posix_lower.endswith(suffix_markers):
            hits.append(path)

    return bool(hits), hits
43
+
44
+
45
def is_submodule_executed(
    submodule_paths: tuple[str, ...],
    file_hints: tuple[str, ...],
    covered_files: dict[str, list[int]],
) -> tuple[bool, list[str]]:
    """Find covered files that belong to any of the given submodules.

    Each covered path is lowercased and its backslashes are turned into
    slashes. It matches when it contains a submodule's path fragment as a
    directory (``/pkg/sub/``) or module file (``/pkg/sub.py``), or, failing
    that, when it ends with ``/<hint>`` for one of *file_hints*.

    Submodule paths are lowercased before comparison so that mixed-case
    paths (for example ones quoted from advisory text) still match the
    lowercased file paths.

    Args:
        submodule_paths: Dotted submodule paths.
        file_hints: File basenames to fall back on.
        covered_files: Covered file paths mapped to their executed lines.

    Returns:
        ``(any match?, matching paths as given in covered_files)``; each
        path appears at most once.
    """
    fragments = [submod.replace(".", "/").lower() for submod in submodule_paths]
    hint_suffixes = tuple(f"/{hint.lower()}" for hint in file_hints)

    matching_files: list[str] = []
    for filepath in covered_files:
        normalized = filepath.replace("\\", "/").lower()
        by_fragment = any(
            f"/{fragment}/" in normalized or f"/{fragment}.py" in normalized
            for fragment in fragments
        )
        # str.endswith(()) is False, so an empty hint tuple never matches.
        if by_fragment or normalized.endswith(hint_suffixes):
            matching_files.append(filepath)

    return bool(matching_files), matching_files
@@ -0,0 +1,371 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ import urllib.error
6
+ import urllib.request
7
+
8
+ from ca9.models import AffectedComponent, Vulnerability
9
+
10
# Hand-maintained rules, keyed by lowercase package name. Each rule is
# (pattern searched in the advisory text, affected submodule paths, file
# hints). Rules are tried in order and the first matching one wins, so a more
# specific pattern must come before any broader pattern that also matches it
# (e.g. "admindocs" before "admin").
_CURATED: dict[str, list[tuple[re.Pattern[str], tuple[str, ...], tuple[str, ...]]]] = {
    "django": [
        (re.compile(r"admindocs", re.I), ("django.contrib.admindocs",), ()),
        (re.compile(r"admin(?:docs)?", re.I), ("django.contrib.admin",), ()),
        (re.compile(r"(?:session|SESSION)", re.I), ("django.contrib.sessions",), ()),
        (
            re.compile(r"(?:auth(?:entication)?|password|login|logout)", re.I),
            ("django.contrib.auth",),
            (),
        ),
        (re.compile(r"QuerySet|aggregat|\.db\.models", re.I), ("django.db.models",), ()),
        (re.compile(r"Truncat|utils\.text", re.I), ("django.utils.text",), ()),
        (re.compile(r"utils\.encoding", re.I), ("django.utils.encoding",), ()),
        (re.compile(r"multipart|MultiPartParser", re.I), ("django.http.multipartparser",), ()),
        (re.compile(r"(?:template|Template)", re.I), ("django.template",), ()),
        (re.compile(r"(?:GIS|Geo|GDAL|GeoJSON)", re.I), ("django.contrib.gis",), ()),
        (re.compile(r"(?:syndication|feed)", re.I), ("django.contrib.syndication",), ()),
        (re.compile(r"validators?\.URL|URLValidator", re.I), ("django.core.validators",), ()),
        (
            re.compile(r"FileUpload|UploadedFile|InMemoryUploadedFile", re.I),
            ("django.core.files",),
            (),
        ),
        (re.compile(r"(?:cache|caching)", re.I), ("django.core.cache",), ()),
    ],
    "werkzeug": [
        (re.compile(r"debug|Debug", re.I), ("werkzeug.debug",), ("debugger.py",)),
        (re.compile(r"formparser|FormDataParser|multipart", re.I), ("werkzeug.formparser",), ()),
        (re.compile(r"safe_join|utils", re.I), ("werkzeug.utils",), ()),
    ],
    "jinja2": [
        (re.compile(r"sandbox|Sandbox", re.I), ("jinja2.sandbox",), ("sandbox.py",)),
        (re.compile(r"xmlattr|filters", re.I), ("jinja2.filters",), ()),
    ],
    "pyyaml": [
        (re.compile(r"yaml\.load|unsafe_load|FullLoader|UnsafeLoader", re.I), ("yaml",), ()),
    ],
    "urllib3": [
        (re.compile(r"CRLF|header.inject", re.I), ("urllib3",), ()),
        (re.compile(r"proxy|CONNECT", re.I), ("urllib3",), ()),
    ],
}
52
+
53
# GitHub commit URL; group 1 is "owner/repo", group 2 the (abbreviated) SHA.
# Owner and repository are limited to the characters GitHub allows in names,
# so nothing else can leak from advisory text into the API URL built below.
_GITHUB_COMMIT_RE = re.compile(
    r"https://github\.com/([A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+)/commit/([0-9a-f]{7,40})"
)


def _fetch_commit_files(owner_repo: str, sha: str) -> list[str]:
    """Fetch the paths of the files changed by a GitHub commit.

    This is best effort: any network, HTTP, protocol or decoding problem, or
    a response that is not shaped as expected, yields an empty list instead
    of an exception.

    Args:
        owner_repo: Repository in ``owner/name`` form.
        sha: Commit SHA.

    Returns:
        The ``filename`` of every entry in the response's ``files`` list.
    """
    import http.client

    url = f"https://api.github.com/repos/{owner_repo}/commits/{sha}"
    req = urllib.request.Request(
        url,
        headers={
            "Accept": "application/json",
            "User-Agent": "ca9-scanner",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read().decode())
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError/timeouts; ValueError covers
        # JSONDecodeError and UnicodeDecodeError; HTTPException covers
        # protocol-level failures such as InvalidURL or IncompleteRead.
        return []

    if not isinstance(data, dict):
        return []
    files = data.get("files")
    if not isinstance(files, list):
        return []
    return [f["filename"] for f in files if isinstance(f, dict) and "filename" in f]
72
+
73
+
74
def _file_paths_to_submodules(
    file_paths: list[str],
    import_name: str,
) -> list[str]:
    """Turn repository file paths into dotted module paths of a package.

    Only ``.py`` files are considered. Test files are dropped: basenames
    starting with ``test_``, ``conftest.py``, and anything below a directory
    named ``tests`` or ``test``. The package is located by matching whole
    path segments case-insensitively, so ``lib/pyyaml/x.py`` is not mistaken
    for a file of the ``yaml`` package. A trailing ``__init__`` is removed,
    and a lone ``<package>.py`` file maps to *import_name* itself.

    Args:
        file_paths: ``/``-separated paths relative to the repository root.
        import_name: Dotted import name of the package.

    Returns:
        Sorted, de-duplicated dotted paths, keeping the case used in the
        file paths.
    """
    prefix_parts = import_name.lower().split(".")
    depth = len(prefix_parts)
    # Segment form of "<package>.py" for single-module distributions.
    module_file = prefix_parts[:-1] + [prefix_parts[-1] + ".py"]
    submodules: set[str] = set()

    for fp in file_paths:
        if not fp.endswith(".py"):
            continue

        parts = fp.split("/")
        lowered = [part.lower() for part in parts]

        basename = parts[-1]
        if basename.startswith("test_") or basename == "conftest.py":
            continue
        if any(segment in ("tests", "test") for segment in lowered[:-1]):
            continue

        # First position where the package directory occurs with at least
        # one more path segment after it.
        start = next(
            (
                i
                for i in range(len(parts) - depth)
                if lowered[i : i + depth] == prefix_parts
            ),
            None,
        )
        if start is None:
            if lowered[-depth:] == module_file:
                submodules.add(import_name)
            continue

        rel_parts = parts[start:]
        rel_parts[-1] = rel_parts[-1][: -len(".py")]
        if rel_parts[-1] == "__init__":
            rel_parts.pop()
        submodules.add(".".join(rel_parts))

    return sorted(submodules)
109
+
110
+
111
def _match_commits(
    vuln: Vulnerability,
) -> AffectedComponent | None:
    """Derive the affected component from fix commits referenced by *vuln*.

    Every reference matching ``_GITHUB_COMMIT_RE`` is resolved to its changed
    files via ``_fetch_commit_files``. Those files are mapped to submodules
    of the vulnerable package and their basenames are kept as file hints.

    ``__init__.py`` is never used as a file hint: hints are matched against
    covered files by basename alone, so it would match the ``__init__.py``
    of every package. The package it belongs to is already represented in
    the submodule paths.

    Args:
        vuln: Vulnerability providing ``references`` and ``package_name``.

    Returns:
        A high-confidence component when at least one submodule was found,
        otherwise None.
    """
    if not vuln.references:
        return None

    from ca9.analysis.ast_scanner import pypi_to_import_name

    import_name = pypi_to_import_name(vuln.package_name)

    all_submodules: set[str] = set()
    file_hints: set[str] = set()

    for ref in vuln.references:
        m = _GITHUB_COMMIT_RE.search(ref)
        if not m:
            continue

        changed_files = _fetch_commit_files(m.group(1), m.group(2))
        if not changed_files:
            continue

        all_submodules.update(_file_paths_to_submodules(changed_files, import_name))

        for fp in changed_files:
            if not fp.endswith(".py"):
                continue
            basename = fp.rsplit("/", 1)[-1]
            if basename.startswith("test_"):
                continue
            if basename in ("conftest.py", "__init__.py"):
                continue
            file_hints.add(basename)

    if not all_submodules:
        return None

    return AffectedComponent(
        package_import_name=import_name,
        submodule_paths=tuple(sorted(all_submodules)),
        file_hints=tuple(sorted(file_hints)),
        confidence="high",
        extraction_source="commit_analysis",
    )
153
+
154
+
155
+ _DOTTED_PATH_RE = re.compile(r"`([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)+)`")
156
+
157
+
158
def _match_curated(
    package_name: str,
    text: str,
) -> AffectedComponent | None:
    """Match advisory text against the curated rules for a package.

    The rules registered in ``_CURATED`` under the lowercased package name
    are tried in order; the first one whose pattern is found in *text*
    determines the result.

    Args:
        package_name: Distribution name of the vulnerable package.
        text: Advisory text to search.

    Returns:
        A high-confidence component for the first matching rule, or None
        when the package has no rules or none of them match.
    """
    key = package_name.lower()
    rules = _CURATED.get(key)
    if not rules:
        return None

    from ca9.analysis.ast_scanner import pypi_to_import_name

    import_name = pypi_to_import_name(package_name)

    winner = next((rule for rule in rules if rule[0].search(text)), None)
    if winner is None:
        return None

    pattern, submodules, hints = winner
    return AffectedComponent(
        package_import_name=import_name,
        submodule_paths=submodules,
        file_hints=hints,
        confidence="high",
        extraction_source=f"curated:{key}:{pattern.pattern}",
    )
182
+
183
+
184
def _extract_from_text(
    package_name: str,
    text: str,
) -> AffectedComponent | None:
    """Pick submodule paths of the package out of advisory text.

    Candidates are whatever ``_DOTTED_PATH_RE`` finds in *text*; those that
    start (case-insensitively) with the package's import name followed by a
    dot are kept.

    Args:
        package_name: Distribution name of the vulnerable package.
        text: Advisory text to search.

    Returns:
        A medium-confidence component with the sorted, de-duplicated paths,
        or None when no candidate belongs to the package.
    """
    from ca9.analysis.ast_scanner import pypi_to_import_name

    import_name = pypi_to_import_name(package_name)
    dotted_prefix = import_name.lower() + "."

    own_paths = {
        candidate
        for candidate in _DOTTED_PATH_RE.findall(text)
        if candidate.lower().startswith(dotted_prefix)
    }
    if not own_paths:
        return None

    return AffectedComponent(
        package_import_name=import_name,
        submodule_paths=tuple(sorted(own_paths)),
        confidence="medium",
        extraction_source="regex:dotted_path",
    )
209
+
210
+
211
+ _CLASS_NAME_RE = re.compile(r"\b([A-Z][a-z]+(?:[A-Z][a-z0-9]+)+)\b")
212
+ _GENERIC_NAMES = frozenset(
213
+ {
214
+ "JavaScript",
215
+ "TypeError",
216
+ "ValueError",
217
+ "KeyError",
218
+ "IndexError",
219
+ "RuntimeError",
220
+ "ImportError",
221
+ "AttributeError",
222
+ "HttpResponse",
223
+ "ContentType",
224
+ "StackOverflow",
225
+ "GitHub",
226
+ "PullRequest",
227
+ "ChangeLog",
228
+ "ReadOnly",
229
+ "ReleaseNotes",
230
+ }
231
+ )
232
+
233
+
234
def _find_package_source_dir(package_name: str) -> str | None:
    """Locate the installed source of a package's top-level module.

    Args:
        package_name: Distribution name of the package.

    Returns:
        The package directory when the top-level module is a package (its
        origin ends with ``__init__.py``), otherwise the module's origin as
        reported by its spec. None when the module cannot be found, has no
        origin, or the lookup itself fails.
    """
    import importlib.util
    import os

    from ca9.analysis.ast_scanner import pypi_to_import_name

    top_level = pypi_to_import_name(package_name).split(".")[0]

    try:
        spec = importlib.util.find_spec(top_level)
    except (ImportError, ValueError):
        # find_spec raises ValueError for an empty name or for a loaded
        # module whose __spec__ is None.
        return None
    if spec is None or spec.origin is None:
        return None

    origin = spec.origin
    if origin.endswith("__init__.py"):
        # os.path handles both "/" and the platform's own separator.
        return os.path.dirname(origin) or None
    return origin
251
+
252
+
253
def _source_defines(source: str, filename: str, name: str) -> bool:
    """Return True if *source* parses and defines a class or function *name*."""
    import ast

    try:
        tree = ast.parse(source, filename=filename)
    except (SyntaxError, ValueError):
        # ValueError covers sources with embedded NUL bytes on interpreters
        # that do not report them as SyntaxError.
        return False
    return any(
        isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef))
        and node.name == name
        for node in ast.walk(tree)
    )


def _scan_package_for_name(
    source_dir: str,
    class_name: str,
    import_name: str,
) -> str | None:
    """Find the module of a package that defines a class or function.

    When *source_dir* ends with ``.py`` it is treated as a single module and
    only that file is inspected. Otherwise the directory tree is walked in
    sorted order (so the result does not depend on directory listing order)
    and the first ``.py`` file defining *class_name* wins.

    Args:
        source_dir: Package directory or single module file.
        class_name: Name of the class / function / async function.
        import_name: Dotted import name that *source_dir* corresponds to.

    Returns:
        The dotted module path (*import_name* itself for a single-file
        module or the package's own ``__init__.py``), or None when no file
        defines the name. Unreadable and unparsable files are skipped.
    """
    import os

    if source_dir.endswith(".py"):
        try:
            with open(source_dir, encoding="utf-8", errors="replace") as f:
                source = f.read()
        except OSError:
            return None
        if _source_defines(source, source_dir, class_name):
            return import_name
        return None

    for dirpath, dirnames, filenames in os.walk(source_dir):
        dirnames.sort()  # in-place: fixes the order os.walk descends in
        for fname in sorted(filenames):
            if not fname.endswith(".py"):
                continue
            fpath = os.path.join(dirpath, fname)
            try:
                with open(fpath, encoding="utf-8", errors="replace") as f:
                    source = f.read()
            except OSError:
                continue

            # Cheap textual pre-filter before paying for a full parse.
            if class_name not in source:
                continue
            if not _source_defines(source, fpath, class_name):
                continue

            rel = os.path.relpath(fpath, source_dir)[: -len(".py")]
            parts = rel.split(os.sep)
            if parts[-1] == "__init__":
                # A package's __init__ is addressed by the package itself,
                # including the top-level one (parts becomes empty).
                parts.pop()
            return ".".join([import_name, *parts])

    return None
310
+
311
+
312
def _resolve_class_names(
    package_name: str,
    text: str,
) -> AffectedComponent | None:
    """Resolve CamelCase names from advisory text to modules of the package.

    Names found by ``_CLASS_NAME_RE`` (minus ``_GENERIC_NAMES``) are looked
    up in the installed source of the package via
    ``_scan_package_for_name``.

    Args:
        package_name: Distribution name of the vulnerable package.
        text: Advisory text to search.

    Returns:
        A medium-confidence component listing the sorted, de-duplicated
        modules that define any of the names; None when there are no
        candidate names, the package source cannot be located, or nothing
        resolves.
    """
    from ca9.analysis.ast_scanner import pypi_to_import_name

    import_name = pypi_to_import_name(package_name)

    candidates = set(_CLASS_NAME_RE.findall(text)) - _GENERIC_NAMES
    if not candidates:
        return None

    source_dir = _find_package_source_dir(package_name)
    if source_dir is None:
        return None

    resolved = {
        location
        for name in candidates
        if (location := _scan_package_for_name(source_dir, name, import_name))
    }
    if not resolved:
        return None

    return AffectedComponent(
        package_import_name=import_name,
        submodule_paths=tuple(sorted(resolved)),
        confidence="medium",
        extraction_source="class_name_resolution",
    )
343
+
344
+
345
def extract_affected_component(vuln: Vulnerability) -> AffectedComponent:
    """Determine which part of a package a vulnerability affects.

    Strategies are tried in this order and the first one yielding a result
    wins: referenced fix commits, curated rules, dotted paths in the
    advisory text, class-name resolution. The advisory text is the title
    and description joined by a space.

    Args:
        vuln: The vulnerability to analyse.

    Returns:
        The component found by the first successful strategy, or a
        low-confidence ``"fallback"`` component naming only the package.
    """
    from ca9.analysis.ast_scanner import pypi_to_import_name

    text = f"{vuln.title} {vuln.description}"

    # Wrapped in callables so each strategy only runs if the earlier ones
    # came back empty.
    strategies = (
        lambda: _match_commits(vuln),
        lambda: _match_curated(vuln.package_name, text),
        lambda: _extract_from_text(vuln.package_name, text),
        lambda: _resolve_class_names(vuln.package_name, text),
    )
    for attempt in strategies:
        component = attempt()
        if component is not None:
            return component

    return AffectedComponent(
        package_import_name=pypi_to_import_name(vuln.package_name),
        confidence="low",
        extraction_source="fallback",
    )