vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Dirty baseline pre-launch gate.
|
|
2
|
+
|
|
3
|
+
Phase C7 — closes the regression where Rubik's working copy carried 17+
|
|
4
|
+
untracked paths (``.codex_*backup/`` artefacts, ``.scratch/``, foreign
|
|
5
|
+
report files, half-finished scripts) that polluted the executor's
|
|
6
|
+
context AND broke commit-delta reconciliation. The auto-commit step
|
|
7
|
+
could not tell which paths belonged to "this task" vs "leftover from a
|
|
8
|
+
previous run", so it returned ``status=committed_reported_unreconciled``
|
|
9
|
+
even on otherwise successful sessions.
|
|
10
|
+
|
|
11
|
+
The fix introduces an opt-in pre-launch gate:
|
|
12
|
+
|
|
13
|
+
.cortex/dirty_baseline_policy.json
|
|
14
|
+
{
|
|
15
|
+
"enabled": true,
|
|
16
|
+
"whitelist": [
|
|
17
|
+
".cortex/", ".vscode/", "*.lock", ".scratch/"
|
|
18
|
+
],
|
|
19
|
+
"max_files": 5,
|
|
20
|
+
"action": "warn" # "block" once policy hardens
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
The check runs ``git status --porcelain``, filters paths against the
|
|
24
|
+
whitelist, and returns a structured verdict the orchestrator wires
|
|
25
|
+
into pre-launch decisions.
|
|
26
|
+
|
|
27
|
+
Architecture:
|
|
28
|
+
|
|
29
|
+
- This module is a pure helper. It performs no I/O beyond the single
|
|
30
|
+
``git status`` invocation and a JSON read; callers decide how to
|
|
31
|
+
surface the verdict (banner, hard-fail, log entry).
|
|
32
|
+
- The whitelist supports glob-style patterns (``*.lock``) and prefix
|
|
33
|
+
matches (``.cortex/`` matches any path under that directory).
|
|
34
|
+
- ``max_files`` is the hard ceiling for the **non-whitelisted** count.
|
|
35
|
+
Whitelisted paths never count toward the limit, regardless of
|
|
36
|
+
quantity.
|
|
37
|
+
- ``action="warn"`` returns ``DirtyBaselineVerdict(blocking=False)``
|
|
38
|
+
with the dirty paths populated so the UI can show a banner; the
|
|
39
|
+
caller proceeds with the launch. ``action="block"`` returns
|
|
40
|
+
``blocking=True`` and the caller refuses the launch.
|
|
41
|
+
- Defaults match the plan: ``warn`` for the first 7 days post-rollout,
|
|
42
|
+
then a follow-up flips the default to ``block``. The policy file
|
|
43
|
+
is the source of truth — defaults only apply when the file is
|
|
44
|
+
absent, unreadable, or not a valid JSON object.
|
|
45
|
+
"""
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
import fnmatch
|
|
49
|
+
import json
|
|
50
|
+
import logging
|
|
51
|
+
import subprocess
|
|
52
|
+
from dataclasses import dataclass, field
|
|
53
|
+
from pathlib import Path
|
|
54
|
+
from typing import Any, Iterable
|
|
55
|
+
|
|
56
|
+
_log = logging.getLogger(__name__)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Defaults — apply when no .cortex/dirty_baseline_policy.json is present.
# ``_load_policy`` also falls back to them when the file cannot be read or
# parsed, and per key when ``whitelist`` / ``action`` hold an invalid value.
# The whitelist covers the standard Vigil + IDE state that should never
# count toward the dirty-files budget.
# Entries ending in "/" are treated as directory prefixes by
# ``_is_whitelisted``; every entry is also tried as an ``fnmatch`` glob.
_DEFAULT_WHITELIST: tuple[str, ...] = (
    ".cortex/",
    ".vscode/",
    "*.lock",
    "__pycache__/",
)
# Ceiling for the number of non-whitelisted dirty paths.
_DEFAULT_MAX_FILES = 5
_DEFAULT_ACTION = "warn"  # one of "warn" | "block"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass(frozen=True)
class DirtyBaselineVerdict:
    """Immutable result of one dirty-baseline evaluation.

    Attributes:
        ok: ``True`` when the non-whitelisted count is within budget, or
            when the check was skipped (disabled policy / git not usable).
            ``False`` when the budget is exceeded.
        action: Why the verdict landed where it did: ``"clean"`` (within
            budget), ``"warn"`` or ``"block"`` (budget exceeded, value
            taken from the policy), ``"disabled"``, or
            ``"git_unavailable"``.
        blocking: ``True`` only when the budget is exceeded and the policy
            action is ``"block"``. Callers gate the launch on this field.
        count: Number of dirty paths left after whitelist filtering.
        max_files: The configured ceiling the count was compared against.
        dirty_paths: At most 50 of the non-whitelisted paths, intended for
            display in a UI banner.
        whitelisted_count: How many paths the whitelist absorbed; useful
            to confirm the policy is doing what it claims.
    """

    ok: bool
    action: str
    blocking: bool
    count: int
    max_files: int
    dirty_paths: tuple[str, ...] = field(default_factory=tuple)
    whitelisted_count: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly mapping of the verdict (paths as a list)."""
        payload: dict[str, Any] = {
            name: getattr(self, name)
            for name in ("ok", "action", "blocking", "count", "max_files")
        }
        # Tuples are converted so the payload serialises as a JSON array.
        payload["dirty_paths"] = list(self.dirty_paths)
        payload["whitelisted_count"] = self.whitelisted_count
        return payload
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _load_policy(project_dir: Path) -> dict[str, Any]:
    """Load the dirty-baseline policy for ``project_dir``.

    Reads ``<project_dir>/.cortex/dirty_baseline_policy.json``. When the file
    is absent, unreadable, not valid JSON, or not a JSON object, the built-in
    defaults are returned (a warning is logged for every case except
    "absent"). Individual keys with unusable values fall back to their
    default instead of aborting the whole load.

    Args:
        project_dir: Project root that may contain the ``.cortex`` directory.

    Returns:
        A dict with exactly the keys ``enabled`` (bool), ``whitelist``
        (list of str), ``max_files`` (int) and ``action`` (``"warn"`` or
        ``"block"``).
    """
    defaults: dict[str, Any] = {
        "enabled": True,
        "whitelist": list(_DEFAULT_WHITELIST),
        "max_files": _DEFAULT_MAX_FILES,
        "action": _DEFAULT_ACTION,
    }
    policy_path = project_dir / ".cortex" / "dirty_baseline_policy.json"
    if not policy_path.exists():
        return defaults
    try:
        raw = json.loads(policy_path.read_text(encoding="utf-8"))
        if not isinstance(raw, dict):
            raise ValueError("dirty_baseline_policy.json must be a JSON object")
    except (OSError, json.JSONDecodeError, ValueError) as exc:
        _log.warning(
            "dirty_baseline_check: policy load failed, using defaults: %s", exc,
        )
        return defaults

    whitelist = raw.get("whitelist")
    if not isinstance(whitelist, list):
        whitelist = defaults["whitelist"]

    action = str(raw.get("action") or _DEFAULT_ACTION).strip().lower()
    if action not in ("warn", "block"):
        action = _DEFAULT_ACTION

    # A hand-edited policy may carry ``null``, a word, or ``Infinity`` here;
    # the gate must degrade to the default ceiling rather than crash.
    try:
        max_files = int(raw.get("max_files", _DEFAULT_MAX_FILES))
    except (TypeError, ValueError, OverflowError) as exc:
        _log.warning(
            "dirty_baseline_check: invalid max_files, using default: %s", exc,
        )
        max_files = _DEFAULT_MAX_FILES

    return {
        "enabled": bool(raw.get("enabled", True)),
        "whitelist": [str(item) for item in whitelist],
        "max_files": max_files,
        "action": action,
    }
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _is_whitelisted(path: str, whitelist: Iterable[str]) -> bool:
|
|
152
|
+
"""True if ``path`` matches any entry in the whitelist.
|
|
153
|
+
|
|
154
|
+
Matching rules (in order):
|
|
155
|
+
1. Glob (``fnmatch``) — handles ``*.lock`` and similar.
|
|
156
|
+
2. Directory prefix — entries ending in ``/`` match any path
|
|
157
|
+
under that directory (``.cortex/`` matches ``.cortex/x.json``
|
|
158
|
+
AND ``.cortex/sub/y.json``).
|
|
159
|
+
3. Exact equality fallback.
|
|
160
|
+
"""
|
|
161
|
+
p = path.replace("\\", "/").lstrip("./")
|
|
162
|
+
for pattern in whitelist:
|
|
163
|
+
pat = pattern.replace("\\", "/").lstrip("./")
|
|
164
|
+
if fnmatch.fnmatch(p, pat):
|
|
165
|
+
return True
|
|
166
|
+
if pat.endswith("/") and (p == pat.rstrip("/") or p.startswith(pat)):
|
|
167
|
+
return True
|
|
168
|
+
if p == pat:
|
|
169
|
+
return True
|
|
170
|
+
return False
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _parse_porcelain(output: str) -> list[str]:
|
|
174
|
+
"""Extract paths from ``git status --porcelain`` output.
|
|
175
|
+
|
|
176
|
+
Each line is ``XY <path>`` or ``XY <orig> -> <new>`` for renames.
|
|
177
|
+
We capture the new path in renames and the only path otherwise.
|
|
178
|
+
Empty lines (no dirty state) yield an empty list.
|
|
179
|
+
"""
|
|
180
|
+
paths: list[str] = []
|
|
181
|
+
for raw_line in output.splitlines():
|
|
182
|
+
line = raw_line.rstrip()
|
|
183
|
+
if len(line) < 4:
|
|
184
|
+
continue
|
|
185
|
+
# Strip the 2-char status + 1 space prefix.
|
|
186
|
+
rest = line[3:]
|
|
187
|
+
if " -> " in rest:
|
|
188
|
+
_orig, _, new = rest.partition(" -> ")
|
|
189
|
+
paths.append(new.strip().strip('"'))
|
|
190
|
+
else:
|
|
191
|
+
paths.append(rest.strip().strip('"'))
|
|
192
|
+
return paths
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def check_dirty_baseline(
    project_dir: Path,
    *,
    git_status_output: str | None = None,
) -> DirtyBaselineVerdict:
    """Evaluate the dirty-baseline gate for ``project_dir``.

    Args:
        project_dir: Directory in which ``git status`` is executed and under
            which the ``.cortex`` policy file is looked up.
        git_status_output: Pre-captured ``git status --porcelain`` text. When
            ``None`` the function runs ``git`` itself; passing a string lets
            callers (e.g. tests) supply deterministic input.

    Returns:
        A ``DirtyBaselineVerdict``. A disabled policy yields
        ``action="disabled"``; a git invocation that fails to start, times
        out, or exits non-zero yields ``action="git_unavailable"``. Both are
        reported with ``ok=True`` and ``blocking=False`` so the launch is not
        held up. Otherwise the non-whitelisted paths are counted against
        ``max_files``.
    """
    root = Path(project_dir)
    policy = _load_policy(root)

    def _skipped(reason: str) -> DirtyBaselineVerdict:
        # Shared shape for every "check did not run" outcome.
        return DirtyBaselineVerdict(
            ok=True, action=reason, blocking=False,
            count=0, max_files=policy["max_files"],
        )

    if not policy.get("enabled", True):
        return _skipped("disabled")

    porcelain = git_status_output
    if porcelain is None:
        try:
            proc = subprocess.run(
                ["git", "status", "--porcelain"],
                cwd=str(root),
                capture_output=True,
                text=True,
                timeout=30,
                check=False,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            _log.warning("dirty_baseline_check: git invocation failed: %s", exc)
            return _skipped("git_unavailable")
        if proc.returncode != 0:
            _log.warning(
                "dirty_baseline_check: git status rc=%s stderr=%s",
                proc.returncode, (proc.stderr or "")[:200],
            )
            return _skipped("git_unavailable")
        porcelain = proc.stdout or ""

    reported = _parse_porcelain(porcelain)
    allowed = policy["whitelist"]
    offenders = [entry for entry in reported if not _is_whitelisted(entry, allowed)]

    ceiling = int(policy["max_files"])
    configured_action = str(policy["action"])
    exceeded = len(offenders) > ceiling

    return DirtyBaselineVerdict(
        ok=not exceeded,
        action=configured_action if exceeded else "clean",
        blocking=exceeded and configured_action == "block",
        count=len(offenders),
        max_files=ceiling,
        # Cap the sample so the verdict stays small.
        dirty_paths=tuple(offenders[:50]),
        whitelisted_count=len(reported) - len(offenders),
    )
|
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
from vigil_forensic._shared import EvidenceReference, GateCategory, GateImpact, GateSeverity, RepairKind
|
|
7
|
+
from vigil_forensic.gate_models import PostExecGateContext
|
|
8
|
+
from ..source_analysis import is_source_file, is_test_file
|
|
9
|
+
from .common import build_check_result, build_finding, extract_python_functions, hash_normalized_code, hash_text_block, iter_touched_snapshots
|
|
10
|
+
import logging
|
|
11
|
+
_log = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# Bare identifier lines (import continuation: "GateFinding,") — exclude from text block windows
_IMPORT_CONTINUATION_RE = re.compile(r"^[A-Za-z_]\w*,?$")

# Pure parameter-declaration / call-argument continuation lines, e.g.
#   ``timeout: float = 0.05,``  ``poll_interval=poll_interval,``  ``arg0=None,``
# A long signature or call mirrored across sync/async APIs is structure, not
# copy-pasted logic — exclude these lines from the text-block window so the
# detector does not fire on shared parameter lists.
# NOTE(review): the trailing comma is optional and both the annotation and the
# default are optional, so this also matches comma-free plain assignments such
# as ``x = 1`` — confirm that excluding those lines is intended.
_PARAM_DECL_RE = re.compile(
    r"^\*{0,2}[A-Za-z_]\w*\s*(?::\s*[^=]+?)?(?:=\s*[^,]+?)?,?$"
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _string_literal_lines(text: str) -> frozenset[int]:
|
|
27
|
+
"""Return the set of 1-based line numbers that lie inside a string literal.
|
|
28
|
+
|
|
29
|
+
Covers docstrings and any multi-line string constant. Used to exclude
|
|
30
|
+
docstring / long-string content from the text-block duplication window:
|
|
31
|
+
shared docstrings (e.g. identical ``:param`` blocks on sync/async API
|
|
32
|
+
mirrors) are documentation, not duplicated CODE.
|
|
33
|
+
|
|
34
|
+
Python only. Returns an empty set on SyntaxError (fail-open) — non-Python
|
|
35
|
+
files keep their previous behavior.
|
|
36
|
+
"""
|
|
37
|
+
try:
|
|
38
|
+
tree = ast.parse(text)
|
|
39
|
+
except (SyntaxError, ValueError):
|
|
40
|
+
return frozenset()
|
|
41
|
+
lines: set[int] = set()
|
|
42
|
+
for node in ast.walk(tree):
|
|
43
|
+
if isinstance(node, ast.Constant) and isinstance(node.value, str):
|
|
44
|
+
start = getattr(node, "lineno", None)
|
|
45
|
+
end = getattr(node, "end_lineno", None) or start
|
|
46
|
+
if start is None:
|
|
47
|
+
continue
|
|
48
|
+
lines.update(range(start, end + 1))
|
|
49
|
+
return frozenset(lines)
|
|
50
|
+
|
|
51
|
+
_MAX_DUPLICATION_CHECK_FILES = 200  # Prevent rglob on massive projects

# Function names skipped by ``run_duplication_checks`` when hashing function
# bodies, for both touched files and the files they are compared against.
_BOILERPLATE_FUNCTION_NAMES = frozenset({
    "to_dict", "from_dict", "from_mapping", "__init__", "__repr__", "__str__",
    "__eq__", "__hash__", "__post_init__", "_now", "to_json", "from_json",
})
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _extract_snippets(path: str, text: str) -> list[tuple[str, int, int, str]]:
    """Collect ``(name, start, end, snippet)`` for each function-like region.

    Files identified as Python are delegated to ``extract_python_functions``.
    Every other file goes through ``extract_functions``; when that reports no
    regions the result is an empty list. For those regions the snippet is the
    text of lines ``start..end`` (1-based, inclusive) joined with newlines.
    """
    from ..source_analysis import extract_functions, get_language_id

    if get_language_id(path) == "python":
        return extract_python_functions(text)

    regions = extract_functions(path, text)
    if not regions:
        return []

    source_lines = text.splitlines()
    snippets: list[tuple[str, int, int, str]] = []
    for region in regions:
        first, last = region.start_line, region.end_line
        # start_line is 1-based, hence the -1 for the slice start.
        body = "\n".join(source_lines[first - 1:last])
        snippets.append((region.name, first, last, body))
    return snippets
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def run_duplication_checks(ctx: PostExecGateContext):
|
|
80
|
+
findings = []
|
|
81
|
+
profile = ctx.repo_profile
|
|
82
|
+
touched_hashes: dict[str, list[tuple[str, str]]] = {}
|
|
83
|
+
for snapshot in iter_touched_snapshots(ctx):
|
|
84
|
+
if not snapshot.exists or not is_source_file(snapshot.path):
|
|
85
|
+
continue
|
|
86
|
+
if is_test_file(snapshot.path):
|
|
87
|
+
continue
|
|
88
|
+
for func_name, start, end, snippet in _extract_snippets(snapshot.path, snapshot.text):
|
|
89
|
+
if end - start < 3:
|
|
90
|
+
continue
|
|
91
|
+
if func_name in _BOILERPLATE_FUNCTION_NAMES:
|
|
92
|
+
continue
|
|
93
|
+
touched_hashes.setdefault(hash_normalized_code(snippet), []).append((snapshot.path, func_name))
|
|
94
|
+
# cross_touched_duplicate is designed for incremental AI edits (2-20 files).
|
|
95
|
+
# In a full-scan (self-audit), touched_files == all source files, so every
|
|
96
|
+
# pair of structurally-similar helpers cross-matches — meaningless noise.
|
|
97
|
+
#
|
|
98
|
+
# Full-scan detection: explicit flag OR (large touched set that equals all
|
|
99
|
+
# known snapshots — at least 10 files so genuine 2-20 file incremental
|
|
100
|
+
# changes are never mis-classified as full scans).
|
|
101
|
+
_MIN_FULL_SCAN_FILES = 10
|
|
102
|
+
# cross_touched_duplicate is for INCREMENTAL diffs (2-20 touched files). A
|
|
103
|
+
# full scan clears the touched hashes so it does not DOUBLE-COUNT
|
|
104
|
+
# duplicate_scan (C45), which already covers the duplication. Full scan =
|
|
105
|
+
# explicit ctx.is_full_scan flag (standalone self-audit sets it), OR the
|
|
106
|
+
# heuristic "touched covers all snapshots and >= _MIN_FULL_SCAN_FILES".
|
|
107
|
+
# Incremental scans (no flag, small touched set) keep cross_touched active.
|
|
108
|
+
_is_full_scan = getattr(ctx, "is_full_scan", False) or (
|
|
109
|
+
len(ctx.touched_files) >= _MIN_FULL_SCAN_FILES
|
|
110
|
+
and set(ctx.touched_files) >= set(ctx.file_snapshots.keys())
|
|
111
|
+
)
|
|
112
|
+
if _is_full_scan:
|
|
113
|
+
touched_hashes.clear()
|
|
114
|
+
|
|
115
|
+
seen_pairs: set[tuple[str, str, str, str]] = set()
|
|
116
|
+
file_count = 0
|
|
117
|
+
for path in (ctx.project_dir.rglob("*") if touched_hashes else ()):
|
|
118
|
+
if file_count > _MAX_DUPLICATION_CHECK_FILES:
|
|
119
|
+
_log.warning(
|
|
120
|
+
"duplication_checks: rglob limit exceeded (%d > %d), stopping early",
|
|
121
|
+
file_count, _MAX_DUPLICATION_CHECK_FILES,
|
|
122
|
+
)
|
|
123
|
+
break
|
|
124
|
+
file_count += 1
|
|
125
|
+
if not path.is_file():
|
|
126
|
+
continue
|
|
127
|
+
repo_path = str(path.relative_to(ctx.project_dir)).replace("\\", "/")
|
|
128
|
+
if not is_source_file(repo_path):
|
|
129
|
+
continue
|
|
130
|
+
if profile and profile.is_generated_or_vendored(repo_path):
|
|
131
|
+
continue
|
|
132
|
+
if repo_path in ctx.touched_files:
|
|
133
|
+
continue
|
|
134
|
+
if is_test_file(repo_path):
|
|
135
|
+
continue
|
|
136
|
+
text = path.read_text(encoding="utf-8", errors="replace")
|
|
137
|
+
for func_name, start, end, snippet in _extract_snippets(repo_path, text):
|
|
138
|
+
if end - start < 3:
|
|
139
|
+
continue
|
|
140
|
+
if func_name in _BOILERPLATE_FUNCTION_NAMES:
|
|
141
|
+
continue
|
|
142
|
+
hashed = hash_normalized_code(snippet)
|
|
143
|
+
matches = touched_hashes.get(hashed) or []
|
|
144
|
+
if not matches:
|
|
145
|
+
continue
|
|
146
|
+
for match_path, match_name in matches:
|
|
147
|
+
pair = (match_path, match_name, repo_path, func_name)
|
|
148
|
+
if pair in seen_pairs:
|
|
149
|
+
continue
|
|
150
|
+
seen_pairs.add(pair)
|
|
151
|
+
findings.append(
|
|
152
|
+
build_finding(
|
|
153
|
+
check_id="duplication.normalized_function",
|
|
154
|
+
category=GateCategory.DUPLICATION,
|
|
155
|
+
title="Touched code duplicates existing logic under a different location",
|
|
156
|
+
severity=GateSeverity.HIGH,
|
|
157
|
+
impact=GateImpact.REVISE,
|
|
158
|
+
summary=f"{match_path}::{match_name} is near-duplicate of {repo_path}::{func_name}.",
|
|
159
|
+
recommendation=(
|
|
160
|
+
"Remove the duplicate and import the canonical implementation directly. "
|
|
161
|
+
"If both locations need minor variations, add a parameter to the canonical function "
|
|
162
|
+
"rather than forking a copy."
|
|
163
|
+
),
|
|
164
|
+
evidence=[
|
|
165
|
+
EvidenceReference(kind="file", path=match_path, detail=match_name),
|
|
166
|
+
EvidenceReference(kind="file", path=repo_path, detail=func_name),
|
|
167
|
+
],
|
|
168
|
+
repair_kind=RepairKind.REMOVE_DUPLICATE.value,
|
|
169
|
+
executor_action=f"Remove {match_path}::{match_name} — near-duplicate of {repo_path}::{func_name}; import canonical instead",
|
|
170
|
+
proof_required="duplicate removed; original passes existing tests",
|
|
171
|
+
allowlist_allowed=False,
|
|
172
|
+
)
|
|
173
|
+
)
|
|
174
|
+
for locations in touched_hashes.values():
|
|
175
|
+
if len(locations) < 2:
|
|
176
|
+
continue
|
|
177
|
+
for i in range(len(locations)):
|
|
178
|
+
for j in range(i + 1, len(locations)):
|
|
179
|
+
path_a, name_a = locations[i]
|
|
180
|
+
path_b, name_b = locations[j]
|
|
181
|
+
pair = (path_a, name_a, path_b, name_b)
|
|
182
|
+
if pair in seen_pairs:
|
|
183
|
+
continue
|
|
184
|
+
seen_pairs.add(pair)
|
|
185
|
+
findings.append(
|
|
186
|
+
build_finding(
|
|
187
|
+
check_id="duplication.cross_touched_duplicate",
|
|
188
|
+
category=GateCategory.DUPLICATION,
|
|
189
|
+
title="Two newly-touched files contain duplicate function implementations",
|
|
190
|
+
severity=GateSeverity.HIGH,
|
|
191
|
+
impact=GateImpact.REVISE,
|
|
192
|
+
summary=f"{path_a}::{name_a} is a near-duplicate of {path_b}::{name_b} -- both were touched in this change.",
|
|
193
|
+
recommendation=(
|
|
194
|
+
"Consolidate into one canonical implementation. "
|
|
195
|
+
"Move it to `<package>/shared.py` or `<package>/utils.py` if both files are in the same package, "
|
|
196
|
+
"or to a common cross-package helper module. "
|
|
197
|
+
"Replace both copies with an import."
|
|
198
|
+
),
|
|
199
|
+
evidence=[
|
|
200
|
+
EvidenceReference(kind="file", path=path_a, detail=name_a),
|
|
201
|
+
EvidenceReference(kind="file", path=path_b, detail=name_b),
|
|
202
|
+
],
|
|
203
|
+
repair_kind=RepairKind.EXTRACT_SHARED.value,
|
|
204
|
+
executor_action=f"Extract shared impl from {path_a}::{name_a} and {path_b}::{name_b} into canonical module; replace both with import",
|
|
205
|
+
proof_required="one canonical impl; both callers import it; tests pass",
|
|
206
|
+
allowlist_allowed=False,
|
|
207
|
+
)
|
|
208
|
+
)
|
|
209
|
+
# ── Phase 2: Universal text block duplication ──
|
|
210
|
+
# Catches repeated HTML/CSS/JS/config blocks across files — not just Python functions.
|
|
211
|
+
# Works by hashing sliding windows of N consecutive non-empty lines.
|
|
212
|
+
_BLOCK_MIN_LINES = 12
|
|
213
|
+
_BLOCK_IGNORE_PREFIXES = ("#", "//", "/*", "*", "import ", "from ", '"""', "'''", "assert ", "return ")
|
|
214
|
+
_SKIP_DIRS = (".vendor", "node_modules", "migrations", "__generated__", "__pycache__", "gate_checks")
|
|
215
|
+
block_hashes: dict[str, list[tuple[str, int]]] = {} # hash -> [(file, start_line), ...]
|
|
216
|
+
|
|
217
|
+
_MAX_TEXT_BLOCK_FINDINGS = 50
|
|
218
|
+
_text_block_count = 0
|
|
219
|
+
|
|
220
|
+
for snapshot in iter_touched_snapshots(ctx):
|
|
221
|
+
if not snapshot.exists or not snapshot.text:
|
|
222
|
+
continue
|
|
223
|
+
# Skip test files — they naturally have repetitive assertion patterns
|
|
224
|
+
norm_path = snapshot.path.replace("\\", "/")
|
|
225
|
+
if norm_path.split("/")[-1].startswith("test_"):
|
|
226
|
+
continue
|
|
227
|
+
# Skip vendored/generated directories
|
|
228
|
+
if any(f"/{d}/" in f"/{norm_path}/" for d in _SKIP_DIRS):
|
|
229
|
+
continue
|
|
230
|
+
lines = snapshot.text.splitlines()
|
|
231
|
+
# Lines that sit inside a string literal (docstrings, long multi-line
|
|
232
|
+
# strings). Shared docstrings / :param blocks across sync/async API
|
|
233
|
+
# mirrors are documentation, not duplicated CODE — exclude them.
|
|
234
|
+
docstring_lines = (
|
|
235
|
+
_string_literal_lines(snapshot.text)
|
|
236
|
+
if is_source_file(snapshot.path) and snapshot.path.endswith(".py")
|
|
237
|
+
else frozenset()
|
|
238
|
+
)
|
|
239
|
+
# Filter to meaningful lines (skip empty, comments, imports, docstrings,
|
|
240
|
+
# and pure parameter-declaration / argument-continuation lines).
|
|
241
|
+
meaningful: list[tuple[int, str]] = []
|
|
242
|
+
for i, line in enumerate(lines):
|
|
243
|
+
line_no = i + 1
|
|
244
|
+
if line_no in docstring_lines:
|
|
245
|
+
continue
|
|
246
|
+
stripped = line.strip()
|
|
247
|
+
if not stripped:
|
|
248
|
+
continue
|
|
249
|
+
if any(stripped.startswith(p) for p in _BLOCK_IGNORE_PREFIXES):
|
|
250
|
+
continue
|
|
251
|
+
if _IMPORT_CONTINUATION_RE.match(stripped):
|
|
252
|
+
continue
|
|
253
|
+
if _PARAM_DECL_RE.match(stripped):
|
|
254
|
+
continue
|
|
255
|
+
meaningful.append((line_no, stripped))
|
|
256
|
+
|
|
257
|
+
# Sliding window of _BLOCK_MIN_LINES
|
|
258
|
+
for idx in range(len(meaningful) - _BLOCK_MIN_LINES + 1):
|
|
259
|
+
window = meaningful[idx:idx + _BLOCK_MIN_LINES]
|
|
260
|
+
block_text = "\n".join(text for _, text in window)
|
|
261
|
+
block_hash = hash_text_block(block_text)
|
|
262
|
+
start_line = window[0][0]
|
|
263
|
+
entries = block_hashes.setdefault(block_hash, [])
|
|
264
|
+
# Don't add overlapping blocks from the same file
|
|
265
|
+
if entries and entries[-1][0] == snapshot.path and abs(entries[-1][1] - start_line) < _BLOCK_MIN_LINES:
|
|
266
|
+
continue
|
|
267
|
+
entries.append((snapshot.path, start_line))
|
|
268
|
+
|
|
269
|
+
# ── Collapse per-line-window inflation ──
|
|
270
|
+
# A single duplicated REGION of N lines produces N-_BLOCK_MIN_LINES+1
|
|
271
|
+
# distinct window hashes, all sharing the same set of files at adjacent
|
|
272
|
+
# start lines. Emitting one finding per hash inflated the count (~13
|
|
273
|
+
# findings for one shared block on filelock). Group the duplicate hashes by
|
|
274
|
+
# the set of files involved, then merge windows whose start lines are within
|
|
275
|
+
# _BLOCK_MIN_LINES of each other (contiguous/overlapping = same region) so
|
|
276
|
+
# each duplicated region yields exactly ONE finding.
|
|
277
|
+
#
|
|
278
|
+
# group key = frozenset of files; value = {file: [start_line, ...]}
|
|
279
|
+
region_groups: dict[frozenset, dict[str, list[int]]] = {}
|
|
280
|
+
for block_hash, locations in block_hashes.items():
|
|
281
|
+
if len(locations) < 2:
|
|
282
|
+
continue
|
|
283
|
+
files_in_hash = frozenset(path for path, _ in locations)
|
|
284
|
+
if len(files_in_hash) >= 2:
|
|
285
|
+
key = files_in_hash
|
|
286
|
+
else:
|
|
287
|
+
# Intra-file: only meaningful if the SAME block repeats at >=2 spots.
|
|
288
|
+
only_file = next(iter(files_in_hash))
|
|
289
|
+
if len({ln for _, ln in locations}) < 2:
|
|
290
|
+
continue
|
|
291
|
+
key = files_in_hash
|
|
292
|
+
bucket = region_groups.setdefault(key, {})
|
|
293
|
+
for path, ln in locations:
|
|
294
|
+
bucket.setdefault(path, []).append(ln)
|
|
295
|
+
|
|
296
|
+
def _merge_starts(starts: list[int]) -> list[int]:
    """Collapse overlapping window start lines into one anchor per region.

    Start lines are de-duplicated and sorted. A start line belongs to the
    current region when it lies fewer than ``_BLOCK_MIN_LINES`` lines after
    the *previous* start line; otherwise it opens a new region.

    The comparison is chained against the previous start line rather than
    against the region's anchor. Comparing against the anchor split any
    contiguous run spanning ``_BLOCK_MIN_LINES`` or more start lines into
    several regions (e.g. starts 1..13 with a minimum of 6 gave
    ``[1, 7, 13]``), so one duplicated region was still reported more than
    once.

    Args:
        starts: Window start line numbers recorded for a single file.

    Returns:
        The first start line of each contiguous region, in ascending order.
        An empty list when ``starts`` is empty.
    """
    if not starts:
        return []
    ordered = sorted(set(starts))
    regions = [ordered[0]]
    previous = ordered[0]
    for ln in ordered[1:]:
        # A gap of at least _BLOCK_MIN_LINES from the previous window start
        # means this start is not chained to the current run of windows, so
        # it anchors a new region.
        if ln - previous >= _BLOCK_MIN_LINES:
            regions.append(ln)
        previous = ln
    return regions
|
|
307
|
+
|
|
308
|
+
for files_key, per_file in region_groups.items():
|
|
309
|
+
if _text_block_count >= _MAX_TEXT_BLOCK_FINDINGS:
|
|
310
|
+
break
|
|
311
|
+
unique_files = sorted(per_file.keys())
|
|
312
|
+
# Region anchor = first start line in each file (after merging).
|
|
313
|
+
merged_per_file = {f: _merge_starts(per_file[f]) for f in unique_files}
|
|
314
|
+
|
|
315
|
+
if len(unique_files) >= 2:
|
|
316
|
+
# One cross-file finding per duplicated region. The number of regions
|
|
317
|
+
# is the max region count across the involved files.
|
|
318
|
+
region_count = max(len(v) for v in merged_per_file.values()) or 1
|
|
319
|
+
for region_idx in range(region_count):
|
|
320
|
+
if _text_block_count >= _MAX_TEXT_BLOCK_FINDINGS:
|
|
321
|
+
break
|
|
322
|
+
file_lines_map = {
|
|
323
|
+
f: merged_per_file[f][min(region_idx, len(merged_per_file[f]) - 1)]
|
|
324
|
+
for f in unique_files
|
|
325
|
+
}
|
|
326
|
+
_text_block_count += 1
|
|
327
|
+
file_list = ", ".join(f"{f} (line {file_lines_map[f]})" for f in unique_files[:5])
|
|
328
|
+
suffix = f" and {len(unique_files) - 5} more" if len(unique_files) > 5 else ""
|
|
329
|
+
findings.append(
|
|
330
|
+
build_finding(
|
|
331
|
+
check_id="duplication.text_block",
|
|
332
|
+
category=GateCategory.DUPLICATION,
|
|
333
|
+
title="Repeated text block across files",
|
|
334
|
+
severity=GateSeverity.MEDIUM,
|
|
335
|
+
impact=GateImpact.REVISE,
|
|
336
|
+
summary=(
|
|
337
|
+
f"Duplicated {_BLOCK_MIN_LINES}+ line block found in {len(unique_files)} files: "
|
|
338
|
+
f"{file_list}{suffix}"
|
|
339
|
+
),
|
|
340
|
+
recommendation=(
|
|
341
|
+
"Extract the repeated block into a shared function, template, or constant. "
|
|
342
|
+
"If the files belong to the same package — add a `shared.py` or `utils.py` module there. "
|
|
343
|
+
"If they span multiple packages — move the helper to the nearest common ancestor package."
|
|
344
|
+
),
|
|
345
|
+
evidence=[
|
|
346
|
+
EvidenceReference(kind="file", path=f, detail=f"line:{file_lines_map[f]}")
|
|
347
|
+
for f in unique_files[:5]
|
|
348
|
+
],
|
|
349
|
+
repair_kind=RepairKind.EXTRACT_SHARED.value,
|
|
350
|
+
executor_action=f"Extract duplicated block from {unique_files[0]} et al. into shared helper; replace each occurrence with a call",
|
|
351
|
+
proof_required="no repeated block; tests pass",
|
|
352
|
+
)
|
|
353
|
+
)
|
|
354
|
+
else:
|
|
355
|
+
# Intra-file duplication (same block repeated within one file).
|
|
356
|
+
file_path = unique_files[0]
|
|
357
|
+
file_lines = merged_per_file[file_path]
|
|
358
|
+
if len(file_lines) >= 2:
|
|
359
|
+
_text_block_count += 1
|
|
360
|
+
findings.append(
|
|
361
|
+
build_finding(
|
|
362
|
+
check_id="duplication.text_block_intra",
|
|
363
|
+
category=GateCategory.DUPLICATION,
|
|
364
|
+
title="Repeated text block within same file",
|
|
365
|
+
severity=GateSeverity.MEDIUM,
|
|
366
|
+
impact=GateImpact.REVISE,
|
|
367
|
+
summary=(
|
|
368
|
+
f"{file_path} has {len(file_lines)} copies of a {_BLOCK_MIN_LINES}+ line block "
|
|
369
|
+
f"(lines {', '.join(str(l) for l in file_lines[:5])}). "
|
|
370
|
+
f"Extract to a shared helper."
|
|
371
|
+
),
|
|
372
|
+
recommendation=(
|
|
373
|
+
"Extract the repeated block into a private helper function within the same file "
|
|
374
|
+
"and call it from each location. "
|
|
375
|
+
"If the same logic is needed elsewhere — move the helper to `<package>/shared.py`."
|
|
376
|
+
),
|
|
377
|
+
evidence=[
|
|
378
|
+
EvidenceReference(kind="file", path=file_path, detail=f"line:{ln}")
|
|
379
|
+
for ln in file_lines[:3]
|
|
380
|
+
],
|
|
381
|
+
repair_kind=RepairKind.EXTRACT_SHARED.value,
|
|
382
|
+
executor_action=f"Extract repeated block in {file_path} (lines {', '.join(str(l) for l in file_lines[:3])}) into private helper; replace each copy with a call",
|
|
383
|
+
proof_required="no repeated block; tests pass",
|
|
384
|
+
)
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
return build_check_result(check_id="duplication", category=GateCategory.DUPLICATION, findings=findings)
|