proofctl 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proofctl/__init__.py +0 -0
- proofctl/baseline.py +63 -0
- proofctl/checkers/__init__.py +0 -0
- proofctl/checkers/base.py +61 -0
- proofctl/checkers/dockerfile.py +452 -0
- proofctl/checkers/hcl_utils.py +180 -0
- proofctl/checkers/imports.py +486 -0
- proofctl/checkers/leakage.py +228 -0
- proofctl/checkers/llm_integration.py +370 -0
- proofctl/checkers/placeholders.py +261 -0
- proofctl/checkers/quality.py +539 -0
- proofctl/checkers/security.py +831 -0
- proofctl/checkers/terraform.py +1979 -0
- proofctl/checkers/terragrunt.py +206 -0
- proofctl/checkers/variants.py +166 -0
- proofctl/checkers/yaml_checker.py +1220 -0
- proofctl/cli.py +246 -0
- proofctl/config.py +92 -0
- proofctl/engine.py +403 -0
- proofctl/fixer.py +138 -0
- proofctl/models.py +44 -0
- proofctl/reporters/__init__.py +0 -0
- proofctl/reporters/html_reporter.py +191 -0
- proofctl/reporters/json_reporter.py +22 -0
- proofctl/reporters/terminal.py +110 -0
- proofctl-0.1.0.dist-info/METADATA +563 -0
- proofctl-0.1.0.dist-info/RECORD +31 -0
- proofctl-0.1.0.dist-info/WHEEL +5 -0
- proofctl-0.1.0.dist-info/entry_points.txt +2 -0
- proofctl-0.1.0.dist-info/licenses/LICENSE +21 -0
- proofctl-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
|
|
6
|
+
# Matches the header of a block: keyword ("label1")? ("label2")? {
|
|
7
|
+
_BLOCK_HEADER_RE = re.compile(
|
|
8
|
+
r'^\s*(\w+)' # kind
|
|
9
|
+
r'(?:\s+"([^"]*)")?' # optional label1
|
|
10
|
+
r'(?:\s+"([^"]*)")?' # optional label2
|
|
11
|
+
r'\s*\{' # opening brace (rest of line may follow)
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
_HEREDOC_START_RE = re.compile(r'<<[-~]?(\w+)\s*$')
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class HclBlock:
    """One HCL block together with the raw source lines it spans.

    ``raw_lines[0]`` is the header line and the last entry is the line that
    closes the block, so ``start_line + i`` is the absolute line number of
    ``raw_lines[i]``.
    """

    kind: str
    label1: str
    label2: str
    start_line: int  # 1-based, line of the block header
    end_line: int  # 1-based, line of the closing brace
    raw_lines: list[str] = field(repr=False)

    # ── Attribute helpers ────────────────────────────────────────────────────

    def attr(self, name: str) -> tuple[str, int] | None:
        """Return ``(raw_value_str, abs_lineno)`` for the first ``name = …`` line.

        Every line of the block is searched, including lines that belong to
        nested blocks.  A trailing ``# comment`` is dropped from the value, but
        a ``#`` inside a double-quoted string is kept (``color = "#fff"``).
        Comparison (``name == x``) and arrow (``name => x``) lines are not
        treated as assignments.  Returns None when no line matches.
        """
        pat = re.compile(
            rf'^\s*{re.escape(name)}\s*=(?![=>])\s*'
            # Value: complete quoted strings are consumed whole so that a '#'
            # inside them cannot be mistaken for the start of a comment.
            r'((?:"(?:\\.|[^"\\])*"|[^#])*?)'
            r'\s*(?:#.*)?$'
        )
        for i, line in enumerate(self.raw_lines):
            m = pat.match(line)
            if m:
                return m.group(1).strip(), self.start_line + i
        return None

    def has_attr(self, name: str) -> bool:
        """True when :meth:`attr` finds an assignment to *name*."""
        return self.attr(name) is not None

    def attr_value(self, name: str) -> str | None:
        """Raw value text of the first assignment to *name*, or None."""
        r = self.attr(name)
        return r[0] if r else None

    def attr_line(self, name: str) -> int | None:
        """Absolute 1-based line of the first assignment to *name*, or None."""
        r = self.attr(name)
        return r[1] if r else None

    # ── Nested block helpers ─────────────────────────────────────────────────

    def nested(self, kind: str) -> list[HclBlock]:
        """Directly nested blocks of the given kind.

        Re-parses this block's own lines with the header and closing lines
        skipped, keeping absolute line numbers.
        """
        inner = _parse(self.raw_lines, base=self.start_line, skip_outer=True)
        return [b for b in inner if b.kind == kind]

    def has_nested(self, kind: str) -> bool:
        """True when at least one directly nested block has the given kind."""
        return bool(self.nested(kind))

    # ── Raw text helpers ─────────────────────────────────────────────────────

    def any_line_matches(self, pattern: str, flags: int = 0) -> int | None:
        """Return abs 1-based lineno of first line where *pattern* is found, or None."""
        pat = re.compile(pattern, flags)
        for i, line in enumerate(self.raw_lines):
            if pat.search(line):
                return self.start_line + i
        return None

    def contains(self, pattern: str, flags: int = 0) -> bool:
        """True when *pattern* is found on any line of the block."""
        return self.any_line_matches(pattern, flags) is not None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ── Parser ───────────────────────────────────────────────────────────────────
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def parse_blocks(source: str) -> list[HclBlock]:
    """Return the top-level HCL blocks found in *source* as a flat list.

    The text is split into lines and handed to ``_parse`` with line numbering
    starting at 1.
    """
    source_lines = source.splitlines()
    return _parse(source_lines, base=1, skip_outer=False)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _strip_comment(line: str) -> str:
|
|
81
|
+
in_str = False
|
|
82
|
+
i = 0
|
|
83
|
+
while i < len(line):
|
|
84
|
+
c = line[i]
|
|
85
|
+
if c == '"' and (i == 0 or line[i - 1] != '\\'):
|
|
86
|
+
in_str = not in_str
|
|
87
|
+
if not in_str:
|
|
88
|
+
if c == '#':
|
|
89
|
+
return line[:i]
|
|
90
|
+
if c == '/' and i + 1 < len(line) and line[i + 1] == '/':
|
|
91
|
+
return line[:i]
|
|
92
|
+
i += 1
|
|
93
|
+
return line
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _brace_delta(line: str) -> int:
    """Net brace count of *line*: number of ``{`` minus number of ``}``.

    The comment part is removed with ``_strip_comment`` first, and braces
    inside double-quoted strings are not counted.
    """
    code = _strip_comment(line)
    length = len(code)
    balance = 0
    quoted = False
    pos = 0
    while pos < length:
        ch = code[pos]
        if quoted and ch == "\\" and pos + 1 < length:
            # Escaped character inside a string (e.g. \"): step over both.
            pos += 2
            continue
        if ch == '"':
            quoted = not quoted
        elif not quoted:
            if ch == "{":
                balance += 1
            elif ch == "}":
                balance -= 1
        pos += 1
    return balance
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _parse(lines: list[str], base: int, skip_outer: bool) -> list[HclBlock]:
    """
    Parse HCL blocks from *lines* and return the outermost ones in order.

    base: 1-based absolute line number of lines[0].
    skip_outer: True when lines is the raw_lines of a parent block — we skip
    the first and last lines (header / closing brace).

    Heredoc bodies are skipped so braces inside them do not affect nesting.
    A block opened and closed on one line (``lifecycle { x = true }``) is
    reported as a block whose start and end line are the same.  Brace-opening
    lines that do not look like a block header (for example ``x = {``) are
    tracked for nesting but produce no block.
    """
    blocks: list[HclBlock] = []
    depth = 0
    block_start_idx: int | None = None
    block_kind = block_label1 = block_label2 = ""
    heredoc_end: str | None = None

    start = 1 if skip_outer else 0
    end = len(lines) - (1 if skip_outer else 0)

    for i in range(start, end):
        line = lines[i]
        lineno = base + i

        # ── heredoc passthrough ──
        if heredoc_end:
            if line.strip() == heredoc_end or line.strip().rstrip("-~") == heredoc_end:
                heredoc_end = None
            continue
        hm = _HEREDOC_START_RE.search(line)
        if hm:
            heredoc_end = hm.group(1)

        delta = _brace_delta(line)

        if depth == 0:
            if delta < 0:
                # Stray closing brace outside any tracked block: ignore.
                continue
            m = _BLOCK_HEADER_RE.match(line)
            if delta == 0:
                # A header match guarantees a real "{" on this line; a net
                # delta of zero means it was closed again on the same line.
                if m:
                    blocks.append(HclBlock(
                        kind=m.group(1),
                        label1=m.group(2) or "",
                        label2=m.group(3) or "",
                        start_line=lineno, end_line=lineno,
                        raw_lines=lines[i:i + 1],
                    ))
                continue
            # delta > 0: something opens here and stays open.
            if m:
                block_kind = m.group(1)
                block_label1 = m.group(2) or ""
                block_label2 = m.group(3) or ""
                block_start_idx = i
            depth = delta
            continue

        depth += delta
        if depth <= 0:
            depth = 0
            if block_start_idx is not None:
                blocks.append(HclBlock(
                    kind=block_kind, label1=block_label1, label2=block_label2,
                    start_line=base + block_start_idx,
                    end_line=lineno,
                    raw_lines=lines[block_start_idx:i + 1],
                ))
            block_start_idx = None

    return blocks
|
|
@@ -0,0 +1,486 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
import time
|
|
10
|
+
import urllib.error
|
|
11
|
+
import urllib.request
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from ..models import Finding, Severity
|
|
15
|
+
from .base import FileChecker, DirectoryChecker
|
|
16
|
+
|
|
17
|
+
_CACHE_DIR = Path.home() / ".cache" / "proofctl" / "pypi"
|
|
18
|
+
_CACHE_TTL = 86_400 # 24 hours
|
|
19
|
+
_PYPI_TIMEOUT = 3 # seconds
|
|
20
|
+
|
|
21
|
+
# Packages known to be frequent AI hallucination targets
|
|
22
|
+
_HIGH_RISK_NAMES: frozenset[str] = frozenset({
|
|
23
|
+
"boto4", "aiohttp-extras", "langchain-enhanced", "openai-utils",
|
|
24
|
+
"anthropic-client", "fastapi-helpers", "requests-async",
|
|
25
|
+
"sqlalchemy-utils-extended", "pydantic-extras", "flask-utils",
|
|
26
|
+
"django-extras", "numpy-utils", "pandas-extras", "torch-utils",
|
|
27
|
+
"tensorflow-extras", "scikit-learn-utils", "celery-extras",
|
|
28
|
+
})
|
|
29
|
+
|
|
30
|
+
# requirements.txt line: optional extras, then version specifier, then env markers / comments
|
|
31
|
+
_REQ_LINE_RE = re.compile(
|
|
32
|
+
r"^\s*([A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?)" # PEP 508 name
|
|
33
|
+
r"(\[.*?\])?" # optional extras
|
|
34
|
+
r"\s*([~<>=!][^\s;#]*)?" # optional version specifier
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _parse_requirements(text: str) -> list[tuple[str, str | None, int]]:
|
|
39
|
+
"""Parse requirements.txt; returns (name, version_spec_or_None, lineno)."""
|
|
40
|
+
results = []
|
|
41
|
+
for lineno, line in enumerate(text.splitlines(), start=1):
|
|
42
|
+
stripped = line.strip()
|
|
43
|
+
if not stripped or stripped.startswith("#") or stripped.startswith("-"):
|
|
44
|
+
continue
|
|
45
|
+
m = _REQ_LINE_RE.match(stripped)
|
|
46
|
+
if m:
|
|
47
|
+
results.append((m.group(1), m.group(4) or None, lineno))
|
|
48
|
+
return results
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _parse_pyproject_deps(text: str) -> list[tuple[str, str | None, int]]:
|
|
52
|
+
"""Lightweight parser for [project] dependencies in pyproject.toml."""
|
|
53
|
+
results = []
|
|
54
|
+
in_deps = False
|
|
55
|
+
for lineno, line in enumerate(text.splitlines(), start=1):
|
|
56
|
+
stripped = line.strip()
|
|
57
|
+
if re.match(r"^dependencies\s*=\s*\[", stripped):
|
|
58
|
+
in_deps = True
|
|
59
|
+
continue
|
|
60
|
+
if in_deps:
|
|
61
|
+
if stripped.startswith("]"):
|
|
62
|
+
in_deps = False
|
|
63
|
+
continue
|
|
64
|
+
# Quoted dep string: "boto4>=1.0" or 'boto4'
|
|
65
|
+
m = re.match(r'["\']([A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?)(\[.*?\])?([^"\']*)["\']', stripped)
|
|
66
|
+
if m:
|
|
67
|
+
name = m.group(1)
|
|
68
|
+
spec = m.group(4).strip() or None
|
|
69
|
+
results.append((name, spec, lineno))
|
|
70
|
+
return results
|
|
71
|
+
|
|
72
|
+
# mypy error patterns indicating phantom method calls
|
|
73
|
+
_MYPY_ATTR_RE = re.compile(
|
|
74
|
+
r'^(.+):(\d+):\s+error:\s+(?:Item\s+"[^"]+"\s+of\s+)?"([^"]+)"\s+has\s+no\s+attribute\s+"([^"]+)"'
|
|
75
|
+
r'|^(.+):(\d+):\s+error:\s+Module\s+"[^"]+"\s+has\s+no\s+attribute\s+"([^"]+)"'
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _normalize_pkg_name(name: str) -> str:
|
|
80
|
+
return re.sub(r"[-_.]+", "-", name).lower()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ── I-002 pattern matching ────────────────────────────────────────────────────
|
|
84
|
+
|
|
85
|
+
# Popular packages whose canonical PyPI name is the short form (no python- prefix).
|
|
86
|
+
# Only include packages where the base name alone is the correct PyPI identifier
|
|
87
|
+
# so that legitimate python-X packages (python-dateutil, python-dotenv, etc.)
|
|
88
|
+
# are NOT flagged — their base names (dateutil, dotenv) are absent from this set.
|
|
89
|
+
_POPULAR_PACKAGES: frozenset[str] = frozenset({
|
|
90
|
+
# HTTP / web clients
|
|
91
|
+
"requests", "httpx", "aiohttp", "urllib3",
|
|
92
|
+
# Web frameworks
|
|
93
|
+
"flask", "django", "fastapi", "starlette", "tornado", "sanic",
|
|
94
|
+
# ASGI / WSGI servers
|
|
95
|
+
"uvicorn", "gunicorn",
|
|
96
|
+
# Data science
|
|
97
|
+
"numpy", "pandas", "scipy", "matplotlib", "seaborn", "plotly",
|
|
98
|
+
# ML / AI — highest hallucination density
|
|
99
|
+
"torch", "tensorflow", "keras", "transformers", "openai", "anthropic",
|
|
100
|
+
"langchain", "cohere", "datasets",
|
|
101
|
+
# Databases / ORMs
|
|
102
|
+
"sqlalchemy", "sqlalchemy-utils", "alembic",
|
|
103
|
+
"pymongo", "redis", "psycopg2", "asyncpg", "motor", "pymysql",
|
|
104
|
+
"elasticsearch",
|
|
105
|
+
# Cloud
|
|
106
|
+
"boto3",
|
|
107
|
+
# Task queues
|
|
108
|
+
"celery", "rq",
|
|
109
|
+
# Validation / config
|
|
110
|
+
"pydantic", "attrs", "marshmallow",
|
|
111
|
+
# CLI / UI
|
|
112
|
+
"click", "typer", "rich",
|
|
113
|
+
# Auth / crypto
|
|
114
|
+
"cryptography", "passlib", "bcrypt", "pyjwt", "authlib",
|
|
115
|
+
# Testing
|
|
116
|
+
"pytest", "hypothesis", "faker", "factory-boy",
|
|
117
|
+
# Tooling
|
|
118
|
+
"mypy", "ruff", "black", "isort", "bandit",
|
|
119
|
+
# Serialization
|
|
120
|
+
"orjson", "ujson",
|
|
121
|
+
# Misc popular
|
|
122
|
+
"pillow", "loguru", "tenacity", "jinja2", "pyyaml", "paramiko",
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
# Suffixes that carry no semantic meaning and that AI appends to real package names.
|
|
126
|
+
# Kept intentionally tight: words like "login", "cors", "mock", "toolbelt" are
|
|
127
|
+
# used in legitimate packages (flask-login, flask-cors, pytest-mock, requests-toolbelt)
|
|
128
|
+
# and must NOT appear here.
|
|
129
|
+
_SUSPICIOUS_SUFFIXES: frozenset[str] = frozenset({
|
|
130
|
+
"utils", "extras", "helpers", "enhanced", "extended",
|
|
131
|
+
"new", "plus", "pro",
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _suspicious_variant_of(name: str) -> str | None:
    """Return the popular package that *name* looks like a made-up variant of.

    The name is normalised first.  None is returned when the name is itself a
    popular package or matches none of these shapes:

    1. ``python-{X}``
    2. ``{X}-python`` / ``{X}-py``
    3. ``{X}-{suffix}`` with suffix in ``_SUSPICIOUS_SUFFIXES``
    4. ``{X}{N}`` where ``{X}{N-1}`` is popular (e.g. boto4 → boto3)

    In every case ``X`` (or the predecessor name) must be in ``_POPULAR_PACKAGES``.
    """
    normalized = _normalize_pkg_name(name)

    # A popular package is never a variant of something else.
    if normalized in _POPULAR_PACKAGES:
        return None

    # Shape 1.  python-dateutil / python-dotenv pass untouched because
    # 'dateutil' and 'dotenv' are not popular-package entries.
    prefix = "python-"
    if normalized.startswith(prefix):
        candidate = normalized[len(prefix):]
        if candidate in _POPULAR_PACKAGES:
            return candidate

    # Shape 2.
    for tail in ("-python", "-py"):
        if normalized.endswith(tail):
            candidate = normalized[:-len(tail)]
            if candidate in _POPULAR_PACKAGES:
                return candidate

    # Shape 3.  The split is at the LAST hyphen, so a multi-hyphen name such as
    # langchain-community-extras yields ("langchain-community", "extras").
    head, hyphen, last = normalized.rpartition("-")
    if hyphen and last in _SUSPICIOUS_SUFFIXES and head in _POPULAR_PACKAGES:
        return head

    # Shape 4.  Flag only when the predecessor is explicitly listed, which
    # avoids false positives on arbitrary numeric suffixes.
    numbered = re.match(r"^([a-z][a-z0-9-]*)(\d+)$", normalized)
    if numbered:
        stem = numbered.group(1)
        number = int(numbered.group(2))
        predecessor = f"{stem}{number - 1}"
        if predecessor in _POPULAR_PACKAGES:
            return predecessor

    return None
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# Static fallback list, in normalised form: confirmed hallucination targets
# that the structural patterns above do not catch (this lets the pattern
# rules stay tight).
_HIGH_RISK_DEPS: frozenset[str] = frozenset(map(_normalize_pkg_name, _HIGH_RISK_NAMES))
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _pypi_exists(name: str) -> bool | None:
    """Returns True if the package exists on PyPI, False if not, None when unknown.

    A cached answer younger than ``_CACHE_TTL`` seconds is returned without
    any network access.  Otherwise the package's ``/simple/`` page is
    requested: a successful response means True, HTTP 404 means False.  Any
    other outcome (other HTTP status, timeout, no network) returns None and is
    not cached, so a temporary failure is never remembered as "exists".

    The cache is best-effort: unreadable, malformed or unwritable cache files
    are ignored rather than raised.
    """
    normalized = _normalize_pkg_name(name)
    cache_file = _CACHE_DIR / f"{normalized}.json"

    try:
        data = json.loads(cache_file.read_text(encoding="utf-8"))
        if isinstance(data["exists"], bool) and time.time() - data["ts"] < _CACHE_TTL:
            return data["exists"]
    except (OSError, ValueError, KeyError, TypeError):
        # Missing, unreadable or malformed entry: fall through to a live lookup.
        pass

    try:
        req = urllib.request.Request(
            f"https://pypi.org/simple/{normalized}/",
            headers={"User-Agent": "proofctl/0.1 (AI slop linter; security research)"},
        )
        # The context manager closes the response; only the status matters.
        with urllib.request.urlopen(req, timeout=_PYPI_TIMEOUT):
            exists = True
    except urllib.error.HTTPError as e:
        code = e.code
        e.close()
        if code != 404:
            return None  # server-side trouble says nothing about existence
        exists = False
    except Exception:
        return None  # network unavailable — skip, don't false-positive

    try:
        _CACHE_DIR.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(
            json.dumps({"exists": exists, "ts": time.time()}), encoding="utf-8"
        )
    except OSError:
        pass  # a read-only or missing cache location must not abort the check
    return exists
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _stdlib_names() -> frozenset[str]:
|
|
223
|
+
if hasattr(sys, "stdlib_module_names"):
|
|
224
|
+
return frozenset(sys.stdlib_module_names)
|
|
225
|
+
# Fallback for Python < 3.10: common stdlib modules
|
|
226
|
+
return frozenset({
|
|
227
|
+
"abc", "ast", "asyncio", "builtins", "collections", "contextlib",
|
|
228
|
+
"copy", "dataclasses", "datetime", "enum", "fnmatch", "functools",
|
|
229
|
+
"hashlib", "http", "importlib", "inspect", "io", "itertools", "json",
|
|
230
|
+
"logging", "math", "operator", "os", "pathlib", "pickle", "platform",
|
|
231
|
+
"queue", "random", "re", "shutil", "signal", "socket", "sqlite3",
|
|
232
|
+
"string", "struct", "subprocess", "sys", "tempfile", "textwrap",
|
|
233
|
+
"threading", "time", "traceback", "types", "typing", "unittest",
|
|
234
|
+
"urllib", "uuid", "warnings", "weakref", "xml", "zipfile",
|
|
235
|
+
"_thread", "__future__",
|
|
236
|
+
})
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _local_packages(root: Path) -> frozenset[str]:
|
|
240
|
+
"""Top-level importable names within the scanned project."""
|
|
241
|
+
names: set[str] = set()
|
|
242
|
+
try:
|
|
243
|
+
children = list(root.iterdir())
|
|
244
|
+
except (OSError, PermissionError):
|
|
245
|
+
return frozenset()
|
|
246
|
+
for child in children:
|
|
247
|
+
if child.is_dir() and (child / "__init__.py").exists():
|
|
248
|
+
names.add(child.name)
|
|
249
|
+
elif child.suffix == ".py" and child.name != "__init__.py":
|
|
250
|
+
names.add(child.stem)
|
|
251
|
+
# Also check src/ layout
|
|
252
|
+
src = root / "src"
|
|
253
|
+
if src.is_dir():
|
|
254
|
+
for child in src.iterdir():
|
|
255
|
+
if child.is_dir() and (child / "__init__.py").exists():
|
|
256
|
+
names.add(child.name)
|
|
257
|
+
elif child.suffix == ".py":
|
|
258
|
+
names.add(child.stem)
|
|
259
|
+
return frozenset(names)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _extract_imports(tree: ast.Module) -> list[tuple[str, int, int]]:
|
|
263
|
+
"""Returns (top-level package name, lineno, col_offset) for every import."""
|
|
264
|
+
imports = []
|
|
265
|
+
for node in ast.walk(tree):
|
|
266
|
+
if isinstance(node, ast.Import):
|
|
267
|
+
for alias in node.names:
|
|
268
|
+
pkg = alias.name.split(".")[0]
|
|
269
|
+
imports.append((pkg, node.lineno, node.col_offset))
|
|
270
|
+
elif isinstance(node, ast.ImportFrom):
|
|
271
|
+
if node.module and node.level == 0: # skip relative imports
|
|
272
|
+
pkg = node.module.split(".")[0]
|
|
273
|
+
imports.append((pkg, node.lineno, node.col_offset))
|
|
274
|
+
return imports
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
class ImportChecker(FileChecker):
    """PROOFCTL-I-001: report imports whose top-level package is not found on PyPI.

    Standard-library modules, packages found inside the project, and names
    starting with one of ``local_namespaces`` are never looked up.
    """

    def __init__(
        self,
        local_namespaces: list[str] | None = None,
        extra_indexes: list[str] | None = None,
        check_pypi: bool = True,
    ) -> None:
        self._local_namespaces = tuple(local_namespaces or [])
        # Kept so the setting is not silently lost; lookups in this class
        # only ever query pypi.org.
        self._extra_indexes = tuple(extra_indexes or [])
        self._check_pypi = check_pypi
        self._stdlib = _stdlib_names()
        # Local package names per starting directory, so files that share a
        # directory do not each repeat the project-root walk and scan.
        self._local_by_dir: dict[Path, frozenset[str]] = {}

    def check(self, path: Path, source: str, tree: ast.Module | None) -> list[Finding]:
        """Return I-001 findings for one file; nothing is reported without a parsed tree."""
        findings: list[Finding] = []
        if tree is not None:
            findings.extend(self._i001(path, tree))
        return findings

    @staticmethod
    def _project_root(directory: Path) -> Path:
        """Nearest ancestor (at most 8 levels) holding pyproject.toml or setup.py, else *directory*."""
        candidate = directory
        for _ in range(8):  # max 8 levels up
            if (candidate / "pyproject.toml").exists() or (candidate / "setup.py").exists():
                return candidate
            if candidate.parent == candidate:
                break  # reached the filesystem root
            candidate = candidate.parent
        return directory

    def _local_for(self, directory: Path) -> frozenset[str]:
        """Local package names for files living in *directory* (computed once per directory)."""
        cached = self._local_by_dir.get(directory)
        if cached is None:
            cached = _local_packages(self._project_root(directory))
            self._local_by_dir[directory] = cached
        return cached

    def _i001(self, path: Path, tree: ast.Module) -> list[Finding]:
        """Look up each distinct non-stdlib, non-local import and report the missing ones.

        A package is reported only when the lookup answers False; an unknown
        result (None) produces no finding.
        """
        if not self._check_pypi:
            # With lookups disabled nothing can be reported, so skip the
            # filesystem work as well.
            return []

        local = self._local_for(path.parent)
        findings = []
        seen: set[str] = set()

        for pkg, lineno, col in _extract_imports(tree):
            if pkg in seen:
                continue
            seen.add(pkg)

            if pkg in self._stdlib or pkg in local:
                continue
            if self._local_namespaces and pkg.startswith(self._local_namespaces):
                continue

            if _pypi_exists(pkg) is False:
                findings.append(Finding(
                    file=str(path),
                    line=lineno,
                    col=col,
                    rule_id="PROOFCTL-I-001",
                    rule_name="Hallucinated import",
                    severity=Severity.ERROR,
                    message=f"Package '{pkg}' not found on PyPI — possible AI hallucination",
                    hint=(
                        f"Verify '{pkg}' exists and is spelled correctly. "
                        "If it's a private package, add it to local_namespaces in .proofctl.yaml."
                    ),
                    authority="Slopsquatting research — AI-hallucinated package names",
                ))

        return findings
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
class DependencyChecker(DirectoryChecker):
    """PROOFCTL-I-002: Hallucination-prone package name in project dependency files."""

    def __init__(self, extra_high_risk: list[str] | None = None) -> None:
        # Caller-supplied names are normalised and added to the built-in list.
        additions = frozenset(map(_normalize_pkg_name, extra_high_risk or []))
        self._high_risk = _HIGH_RISK_DEPS | additions

    def check(self, root: Path, py_files: list[Path]) -> list[Finding]:
        """Check every dependency file found under *root*; *py_files* is not used."""
        collected = []
        for dep_file in self._find_dep_files(root):
            collected += self._check_dep_file(dep_file)
        return collected

    def _find_dep_files(self, root: Path) -> list[Path]:
        """requirements*.txt, requirements/*.txt and pyproject.toml directly under *root*."""
        found: list[Path] = [
            *root.glob("requirements*.txt"),
            *root.glob("requirements/*.txt"),
        ]
        manifest = root / "pyproject.toml"
        if manifest.exists():
            found.append(manifest)
        return found

    def _check_dep_file(self, path: Path) -> list[Finding]:
        """Return one finding per dependency in *path* with a high-risk or variant-looking name.

        Dependencies whose spec contains ``==`` are reported as WARNING,
        all others as ERROR.  An unreadable file yields no findings.
        """
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return []

        parse = _parse_pyproject_deps if path.suffix == ".toml" else _parse_requirements

        findings = []
        for name, spec, lineno in parse(text):
            if _normalize_pkg_name(name) in self._high_risk:
                # Static list: confirmed hallucination targets that don't fit patterns.
                reason = "matches known AI-hallucination-prone package list"
            else:
                # Structural pattern matching: broader coverage, no network needed.
                base_pkg = _suspicious_variant_of(name)
                if base_pkg is None:
                    continue
                reason = f"looks like a hallucinated variant of '{base_pkg}'"

            is_pinned = spec is not None and "==" in spec
            if is_pinned:
                sev, pin_note = Severity.WARNING, " (pinned)"
            else:
                sev, pin_note = Severity.ERROR, " — unpinned, squatting risk"

            findings.append(Finding(
                file=str(path),
                line=lineno,
                col=0,
                rule_id="PROOFCTL-I-002",
                rule_name="High-risk dependency name",
                severity=sev,
                message=f"'{name}' {reason}" + pin_note,
                hint=(
                    f"Verify '{name}' is the intended package. "
                    "AI models commonly hallucinate plausible-sounding package names."
                ),
                authority="Slopsquatting research — AI-hallucinated package names",
            ))
        return findings
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class MethodChecker(DirectoryChecker):
    """PROOFCTL-M-001: Phantom method calls detected by a single mypy invocation.

    mypy is started once with every file on its command line, so its startup
    and import-resolution cost is paid once rather than once per file.
    """

    def check(self, root: Path, py_files: list[Path]) -> list[Finding]:
        """Run mypy over *py_files*; returns nothing when mypy is not on PATH or the list is empty."""
        if py_files and shutil.which("mypy"):
            return self._run_mypy(py_files)
        return []

    def _run_mypy(self, py_files: list[Path]) -> list[Finding]:
        """Turn mypy's "has no attribute" errors for the scanned files into findings.

        A timeout (300 s) or a failure to start mypy yields no findings.
        """
        command = [
            "mypy",
            "--ignore-missing-imports",
            "--no-error-summary",
            "--no-pretty",
        ]
        command.extend(str(p) for p in py_files)
        try:
            result = subprocess.run(
                command,
                capture_output=True,
                text=True,
                timeout=300,
            )
        except (subprocess.TimeoutExpired, OSError):
            return []

        # Resolved paths of the scanned files, for the scope test below.
        scoped = {str(p.resolve()) for p in py_files}

        findings = []
        for line in result.stdout.splitlines():
            m = _MYPY_ATTR_RE.match(line)
            if m is None:
                continue

            if m.group(1):
                file_path, lineno_str, obj_type, attr = m.group(1, 2, 3, 4)
                msg = f"'{obj_type}' has no attribute '{attr}'"
            else:
                file_path, lineno_str, attr = m.group(5, 6, 7)
                msg = f"Module has no attribute '{attr}'"

            # Skip findings for files outside the scan scope (e.g. installed libs).
            try:
                in_scope = str(Path(file_path).resolve()) in scoped
            except OSError:
                in_scope = False
            if not in_scope:
                continue

            try:
                lineno = int(lineno_str)
            except ValueError:
                lineno = None

            findings.append(Finding(
                file=file_path,
                line=lineno,
                col=None,
                rule_id="PROOFCTL-M-001",
                rule_name="Phantom method call",
                severity=Severity.ERROR,
                message=msg,
                hint="Check the library's documentation for the correct method name.",
                authority="mypy attr-defined — AI calls methods that don't exist",
            ))

        return findings
|