vigil-codeintel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
  2. vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
  3. vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
  4. vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
  5. vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
  7. vigil_forensic/__init__.py +224 -0
  8. vigil_forensic/_git_utils.py +178 -0
  9. vigil_forensic/_shared.py +510 -0
  10. vigil_forensic/_stubs.py +156 -0
  11. vigil_forensic/gate_checks/__init__.py +1 -0
  12. vigil_forensic/gate_checks/_ast_helpers.py +629 -0
  13. vigil_forensic/gate_checks/_deployment_detector.py +573 -0
  14. vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
  15. vigil_forensic/gate_checks/authority_checks.py +95 -0
  16. vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
  17. vigil_forensic/gate_checks/broad_except_checks.py +301 -0
  18. vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
  19. vigil_forensic/gate_checks/common.py +253 -0
  20. vigil_forensic/gate_checks/config_safety_checks.py +704 -0
  21. vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
  22. vigil_forensic/gate_checks/conflict_checks.py +193 -0
  23. vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
  24. vigil_forensic/gate_checks/context_health_checks.py +289 -0
  25. vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
  26. vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
  27. vigil_forensic/gate_checks/duplication_checks.py +387 -0
  28. vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
  29. vigil_forensic/gate_checks/empty_output_checks.py +87 -0
  30. vigil_forensic/gate_checks/encoding_checks.py +847 -0
  31. vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
  32. vigil_forensic/gate_checks/fallback_checks.py +41 -0
  33. vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
  34. vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
  35. vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
  36. vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
  37. vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
  38. vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
  39. vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
  40. vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
  41. vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
  42. vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
  43. vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
  44. vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
  45. vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
  46. vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
  47. vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
  48. vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
  49. vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
  50. vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
  51. vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
  52. vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
  53. vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
  54. vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
  55. vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
  56. vigil_forensic/gate_checks/hallucination_checks.py +566 -0
  57. vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
  58. vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
  59. vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
  60. vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
  61. vigil_forensic/gate_checks/ml_checks.py +318 -0
  62. vigil_forensic/gate_checks/performance_checks.py +106 -0
  63. vigil_forensic/gate_checks/project_specific_runner.py +691 -0
  64. vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
  65. vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
  66. vigil_forensic/gate_checks/reliability_checks.py +389 -0
  67. vigil_forensic/gate_checks/reporting_checks.py +55 -0
  68. vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
  69. vigil_forensic/gate_checks/security_injection_checks.py +332 -0
  70. vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
  71. vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
  72. vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
  73. vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
  74. vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
  75. vigil_forensic/gate_checks/test_quality_checks.py +946 -0
  76. vigil_forensic/gate_checks/testing_checks.py +149 -0
  77. vigil_forensic/gate_checks/toctou_checks.py +367 -0
  78. vigil_forensic/gate_checks/type_checking_checks.py +316 -0
  79. vigil_forensic/gate_models.py +392 -0
  80. vigil_forensic/gate_packs/__init__.py +1 -0
  81. vigil_forensic/gate_packs/universal.py +179 -0
  82. vigil_forensic/gate_profile.json +31 -0
  83. vigil_forensic/gate_registry.py +21 -0
  84. vigil_forensic/language_profiles.py +219 -0
  85. vigil_forensic/meta_findings.py +207 -0
  86. vigil_forensic/self_audit.py +725 -0
  87. vigil_forensic/source_analysis.py +175 -0
  88. vigil_mapper/__init__.py +103 -0
  89. vigil_mapper/_ast_helpers_minimal.py +229 -0
  90. vigil_mapper/_extract_imports_impl.py +123 -0
  91. vigil_mapper/_file_count_guard.py +129 -0
  92. vigil_mapper/_git_utils.py +178 -0
  93. vigil_mapper/_runtime_ast.py +438 -0
  94. vigil_mapper/_runtime_dispatch.py +137 -0
  95. vigil_mapper/_seed_helpers.py +82 -0
  96. vigil_mapper/authority_builder.py +1102 -0
  97. vigil_mapper/cli_entry.py +731 -0
  98. vigil_mapper/conflict_builder.py +818 -0
  99. vigil_mapper/data_contract_builder.py +446 -0
  100. vigil_mapper/findings_builder.py +716 -0
  101. vigil_mapper/fingerprint.py +53 -0
  102. vigil_mapper/hotspot_builder.py +539 -0
  103. vigil_mapper/map_common.py +449 -0
  104. vigil_mapper/map_errors.py +55 -0
  105. vigil_mapper/map_models.py +431 -0
  106. vigil_mapper/map_models_ext.py +206 -0
  107. vigil_mapper/map_models_findings.py +130 -0
  108. vigil_mapper/map_storage.py +455 -0
  109. vigil_mapper/parse_cache.py +795 -0
  110. vigil_mapper/refactor_boundary_builder.py +266 -0
  111. vigil_mapper/runtime_builder.py +527 -0
  112. vigil_mapper/runtime_tracer.py +243 -0
  113. vigil_mapper/runtime_tracer_entry.py +199 -0
  114. vigil_mapper/semantic_diff.py +71 -0
  115. vigil_mapper/source_adapters/__init__.py +109 -0
  116. vigil_mapper/source_adapters/_base.py +264 -0
  117. vigil_mapper/source_adapters/_ir.py +156 -0
  118. vigil_mapper/source_adapters/_lexer.py +309 -0
  119. vigil_mapper/source_adapters/_patterns.py +212 -0
  120. vigil_mapper/source_adapters/_treesitter.py +182 -0
  121. vigil_mapper/source_adapters/go.py +553 -0
  122. vigil_mapper/source_adapters/java.py +541 -0
  123. vigil_mapper/source_adapters/javascript.py +626 -0
  124. vigil_mapper/source_adapters/python.py +325 -0
  125. vigil_mapper/source_adapters/typescript.py +749 -0
  126. vigil_mapper/structural_builder.py +586 -0
  127. vigil_mcp/__init__.py +1 -0
  128. vigil_mcp/_jobs.py +587 -0
  129. vigil_mcp/_paths.py +93 -0
  130. vigil_mcp/forensic_server.py +419 -0
  131. vigil_mcp/map_server.py +452 -0
@@ -0,0 +1,274 @@
1
+ """Dirty baseline pre-launch gate.
2
+
3
+ Phase C7 — closes the regression where Rubik's working copy carried 17+
4
+ untracked paths (``.codex_*backup/`` artefacts, ``.scratch/``, foreign
5
+ report files, half-finished scripts) that polluted the executor's
6
+ context AND broke commit-delta reconciliation. The auto-commit step
7
+ could not tell which paths belonged to "this task" vs "leftover from a
8
+ previous run", so it returned ``status=committed_reported_unreconciled``
9
+ even on otherwise successful sessions.
10
+
11
+ The fix introduces an opt-in pre-launch gate:
12
+
13
+ .cortex/dirty_baseline_policy.json
14
+ {
15
+ "enabled": true,
16
+ "whitelist": [
17
+ ".cortex/", ".vscode/", "*.lock", ".scratch/"
18
+ ],
19
+ "max_files": 5,
20
+ "action": "warn" # "block" once policy hardens
21
+ }
22
+
23
+ The check runs ``git status --porcelain``, filters paths against the
24
+ whitelist, and returns a structured verdict the orchestrator wires
25
+ into pre-launch decisions.
26
+
27
+ Architecture:
28
+
29
+ - This module is a pure helper. It performs no I/O beyond the single
30
+ ``git status`` invocation and a JSON read; callers decide how to
31
+ surface the verdict (banner, hard-fail, log entry).
32
+ - The whitelist supports glob-style patterns (``*.lock``) and prefix
33
+ matches (``.cortex/`` matches any path under that directory).
34
+ - ``max_files`` is the hard ceiling for the **non-whitelisted** count.
35
+ Whitelisted paths never count toward the limit, regardless of
36
+ quantity.
37
+ - ``action="warn"`` returns ``DirtyBaselineVerdict(blocking=False)``
38
+ with the dirty paths populated so the UI can show a banner; the
39
+ caller proceeds with the launch. ``action="block"`` returns
40
+ ``blocking=True`` and the caller refuses the launch.
41
+ - Defaults match the plan: ``warn`` for the first 7 days post-rollout,
42
+ then a follow-up flips the default to ``block``. The policy file
43
+ is the source of truth — defaults only apply when the file is
44
+ absent.
45
+ """
46
+ from __future__ import annotations
47
+
48
+ import fnmatch
49
+ import json
50
+ import logging
51
+ import subprocess
52
+ from dataclasses import dataclass, field
53
+ from pathlib import Path
54
+ from typing import Any, Iterable
55
+
56
# Module-level logger; used for policy-load and git-invocation warnings.
_log = logging.getLogger(__name__)


# Defaults — apply when no .cortex/dirty_baseline_policy.json is present.
# The whitelist covers the standard Vigil + IDE state that should never
# count toward the dirty-files budget.
# Entries ending in "/" are directory prefixes; entries containing "*" are
# fnmatch globs (see _is_whitelisted for the matching rules).
_DEFAULT_WHITELIST: tuple[str, ...] = (
    ".cortex/",
    ".vscode/",
    "*.lock",
    "__pycache__/",
)
# Ceiling for the number of NON-whitelisted dirty paths; the verdict is
# "over budget" only when the count is strictly greater than this value.
_DEFAULT_MAX_FILES = 5
_DEFAULT_ACTION = "warn"  # one of "warn" | "block"
70
+
71
+
72
@dataclass(frozen=True)
class DirtyBaselineVerdict:
    """Immutable result of one dirty-baseline evaluation.

    Attributes:
        ok: ``True`` when the working copy is clean, within budget, or the
            check did not run (disabled / git unavailable). ``False`` when
            the non-whitelisted count exceeds the budget.
        action: Why the verdict landed this way. Values produced in this
            module: ``"clean"``, ``"warn"``, ``"block"``, ``"disabled"``
            and ``"git_unavailable"``.
        blocking: ``True`` only when the policy action is ``"block"`` and
            the count exceeds the budget. Callers gate the launch on this.
        count: Number of dirty paths left after whitelist filtering.
        max_files: Configured ceiling for ``count``.
        dirty_paths: Up to 50 representative non-whitelisted paths, for
            display in the UI banner.
        whitelisted_count: Number of paths absorbed by the whitelist
            (diagnostic only).
    """

    ok: bool
    action: str
    blocking: bool
    count: int
    max_files: int
    dirty_paths: tuple[str, ...] = field(default_factory=tuple)
    whitelisted_count: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the verdict as a plain dict, with ``dirty_paths`` as a list."""
        payload: dict[str, Any] = {}
        for name in ("ok", "action", "blocking", "count", "max_files"):
            payload[name] = getattr(self, name)
        # Tuples are converted so the payload holds a mutable, list-typed value.
        payload["dirty_paths"] = list(self.dirty_paths)
        payload["whitelisted_count"] = self.whitelisted_count
        return payload
112
+
113
+
114
def _load_policy(project_dir: Path) -> dict[str, Any]:
    """Load ``.cortex/dirty_baseline_policy.json`` from ``project_dir``.

    The function never raises for a missing, unreadable or malformed policy:
    every failure degrades to the module defaults so a broken policy file
    cannot abort the pre-launch gate.

    Args:
        project_dir: Project root that may contain ``.cortex/``.

    Returns:
        A fresh dict with the keys ``enabled`` (bool), ``whitelist``
        (list of str), ``max_files`` (non-negative int) and ``action``
        (``"warn"`` or ``"block"``).
    """

    def _defaults() -> dict[str, Any]:
        # Built on every call so callers can mutate the result safely.
        return {
            "enabled": True,
            "whitelist": list(_DEFAULT_WHITELIST),
            "max_files": _DEFAULT_MAX_FILES,
            "action": _DEFAULT_ACTION,
        }

    policy_path = project_dir / ".cortex" / "dirty_baseline_policy.json"
    try:
        raw = json.loads(policy_path.read_text(encoding="utf-8"))
    except (FileNotFoundError, NotADirectoryError):
        # No policy file: the documented "defaults apply" case, not an error.
        return _defaults()
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        _log.warning(
            "dirty_baseline_check: policy load failed, using defaults: %s", exc,
        )
        return _defaults()
    if not isinstance(raw, dict):
        _log.warning(
            "dirty_baseline_check: policy load failed, using defaults: %s",
            "dirty_baseline_policy.json must be a JSON object",
        )
        return _defaults()

    whitelist = raw.get("whitelist")
    if not isinstance(whitelist, list):
        whitelist = list(_DEFAULT_WHITELIST)

    action = str(raw.get("action") or _DEFAULT_ACTION).strip().lower()
    if action not in ("warn", "block"):
        action = _DEFAULT_ACTION

    # A null, non-numeric or infinite "max_files" used to raise out of this
    # function; a negative one made even a clean tree "over budget".
    raw_max = raw.get("max_files", _DEFAULT_MAX_FILES)
    try:
        max_files = int(raw_max)
    except (TypeError, ValueError, OverflowError):
        max_files = -1
    if max_files < 0:
        _log.warning(
            "dirty_baseline_check: invalid max_files %r, using default %d",
            raw_max, _DEFAULT_MAX_FILES,
        )
        max_files = _DEFAULT_MAX_FILES

    return {
        "enabled": bool(raw.get("enabled", True)),
        "whitelist": [str(item) for item in whitelist],
        "max_files": max_files,
        "action": action,
    }
149
+
150
+
151
+ def _is_whitelisted(path: str, whitelist: Iterable[str]) -> bool:
152
+ """True if ``path`` matches any entry in the whitelist.
153
+
154
+ Matching rules (in order):
155
+ 1. Glob (``fnmatch``) — handles ``*.lock`` and similar.
156
+ 2. Directory prefix — entries ending in ``/`` match any path
157
+ under that directory (``.cortex/`` matches ``.cortex/x.json``
158
+ AND ``.cortex/sub/y.json``).
159
+ 3. Exact equality fallback.
160
+ """
161
+ p = path.replace("\\", "/").lstrip("./")
162
+ for pattern in whitelist:
163
+ pat = pattern.replace("\\", "/").lstrip("./")
164
+ if fnmatch.fnmatch(p, pat):
165
+ return True
166
+ if pat.endswith("/") and (p == pat.rstrip("/") or p.startswith(pat)):
167
+ return True
168
+ if p == pat:
169
+ return True
170
+ return False
171
+
172
+
173
+ def _parse_porcelain(output: str) -> list[str]:
174
+ """Extract paths from ``git status --porcelain`` output.
175
+
176
+ Each line is ``XY <path>`` or ``XY <orig> -> <new>`` for renames.
177
+ We capture the new path in renames and the only path otherwise.
178
+ Empty lines (no dirty state) yield an empty list.
179
+ """
180
+ paths: list[str] = []
181
+ for raw_line in output.splitlines():
182
+ line = raw_line.rstrip()
183
+ if len(line) < 4:
184
+ continue
185
+ # Strip the 2-char status + 1 space prefix.
186
+ rest = line[3:]
187
+ if " -> " in rest:
188
+ _orig, _, new = rest.partition(" -> ")
189
+ paths.append(new.strip().strip('"'))
190
+ else:
191
+ paths.append(rest.strip().strip('"'))
192
+ return paths
193
+
194
+
195
def check_dirty_baseline(
    project_dir: Path,
    *,
    git_status_output: str | None = None,
) -> DirtyBaselineVerdict:
    """Evaluate the dirty-baseline policy for ``project_dir``.

    Args:
        project_dir: Project root; used as the working directory for git
            and as the base for locating the policy file.
        git_status_output: Pre-captured ``git status --porcelain`` text.
            When ``None`` the function runs ``git`` itself; passing a
            string skips the subprocess entirely.

    Returns:
        A ``DirtyBaselineVerdict``. Git failures (cannot start, timeout,
        non-zero exit) do not raise: they produce ``ok=True`` with
        ``action="git_unavailable"`` so the launch is not blocked. A
        disabled policy yields ``action="disabled"``; a tree within budget
        yields ``action="clean"``.
    """
    root = Path(project_dir)
    policy = _load_policy(root)

    def _passthrough(reason: str) -> DirtyBaselineVerdict:
        # Shared shape for every "the check did not evaluate" outcome.
        return DirtyBaselineVerdict(
            ok=True, action=reason, blocking=False,
            count=0, max_files=policy["max_files"],
        )

    if not policy.get("enabled", True):
        return _passthrough("disabled")

    porcelain = git_status_output
    if porcelain is None:
        try:
            proc = subprocess.run(
                ["git", "status", "--porcelain"],
                cwd=str(root),
                capture_output=True,
                text=True,
                timeout=30,
                check=False,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            _log.warning("dirty_baseline_check: git invocation failed: %s", exc)
            return _passthrough("git_unavailable")
        if proc.returncode != 0:
            _log.warning(
                "dirty_baseline_check: git status rc=%s stderr=%s",
                proc.returncode, (proc.stderr or "")[:200],
            )
            return _passthrough("git_unavailable")
        porcelain = proc.stdout or ""

    # Split reported paths into whitelisted (ignored) and offending ones.
    allowed = policy["whitelist"]
    offending: list[str] = []
    absorbed = 0
    for candidate in _parse_porcelain(porcelain):
        if _is_whitelisted(candidate, allowed):
            absorbed += 1
            continue
        offending.append(candidate)

    ceiling = int(policy["max_files"])
    configured_action = str(policy["action"])
    exceeded = len(offending) > ceiling
    if exceeded:
        reported_action = configured_action
    else:
        reported_action = "clean"

    return DirtyBaselineVerdict(
        ok=not exceeded,
        action=reported_action,
        blocking=exceeded and configured_action == "block",
        count=len(offending),
        max_files=ceiling,
        dirty_paths=tuple(offending[:50]),
        whitelisted_count=absorbed,
    )
@@ -0,0 +1,387 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import re
5
+
6
+ from vigil_forensic._shared import EvidenceReference, GateCategory, GateImpact, GateSeverity, RepairKind
7
+ from vigil_forensic.gate_models import PostExecGateContext
8
+ from ..source_analysis import is_source_file, is_test_file
9
+ from .common import build_check_result, build_finding, extract_python_functions, hash_normalized_code, hash_text_block, iter_touched_snapshots
10
+ import logging
11
# Module-level logger for this check.
_log = logging.getLogger(__name__)

# Bare identifier lines (import continuation: "GateFinding,") — exclude from text block windows
# Matches one identifier, optionally followed by a single trailing comma.
_IMPORT_CONTINUATION_RE = re.compile(r"^[A-Za-z_]\w*,?$")

# Pure parameter-declaration / call-argument continuation lines, e.g.
# ``timeout: float = 0.05,`` ``poll_interval=poll_interval,`` ``arg0=None,``
# A long signature or call mirrored across sync/async APIs is structure, not
# copy-pasted logic — exclude these lines from the text-block window so the
# detector does not fire on shared parameter lists.
# Shape: optional ``*``/``**``, an identifier, optional ``: annotation``,
# optional ``= value`` (value without commas), optional trailing comma.
# NOTE(review): the trailing comma is optional, so this also matches a plain
# single-name assignment whose value contains no comma (``x = compute(a)``)
# and any bare identifier — confirm that excluding those lines is intended.
_PARAM_DECL_RE = re.compile(
    r"^\*{0,2}[A-Za-z_]\w*\s*(?::\s*[^=]+?)?(?:=\s*[^,]+?)?,?$"
)
24
+
25
+
26
+ def _string_literal_lines(text: str) -> frozenset[int]:
27
+ """Return the set of 1-based line numbers that lie inside a string literal.
28
+
29
+ Covers docstrings and any multi-line string constant. Used to exclude
30
+ docstring / long-string content from the text-block duplication window:
31
+ shared docstrings (e.g. identical ``:param`` blocks on sync/async API
32
+ mirrors) are documentation, not duplicated CODE.
33
+
34
+ Python only. Returns an empty set on SyntaxError (fail-open) — non-Python
35
+ files keep their previous behavior.
36
+ """
37
+ try:
38
+ tree = ast.parse(text)
39
+ except (SyntaxError, ValueError):
40
+ return frozenset()
41
+ lines: set[int] = set()
42
+ for node in ast.walk(tree):
43
+ if isinstance(node, ast.Constant) and isinstance(node.value, str):
44
+ start = getattr(node, "lineno", None)
45
+ end = getattr(node, "end_lineno", None) or start
46
+ if start is None:
47
+ continue
48
+ lines.update(range(start, end + 1))
49
+ return frozenset(lines)
50
+
51
# Cap on the project walk in run_duplication_checks. The counter there is
# incremented for every ``rglob`` entry (directories and non-source files
# included), so this bounds entries visited, not source files parsed.
_MAX_DUPLICATION_CHECK_FILES = 200  # Prevent rglob on massive projects

# Function names skipped by the function-level duplicate detection:
# conversion helpers and dunder methods listed here are never hashed.
_BOILERPLATE_FUNCTION_NAMES = frozenset({
    "to_dict", "from_dict", "from_mapping", "__init__", "__repr__", "__str__",
    "__eq__", "__hash__", "__post_init__", "_now", "to_json", "from_json",
})
57
+
58
+
59
def _extract_snippets(path: str, text: str) -> list[tuple[str, int, int, str]]:
    """Collect ``(name, start, end, snippet)`` tuples for function-like regions.

    Files whose language id is ``"python"`` are delegated to
    ``extract_python_functions``. Every other file goes through
    ``extract_functions``; each reported region is sliced out of ``text``
    using its 1-based, inclusive line span. An empty list is returned when
    that extractor reports nothing.
    """
    from ..source_analysis import extract_functions, get_language_id

    if get_language_id(path) == "python":
        return extract_python_functions(text)

    regions = extract_functions(path, text)
    if not regions:
        return []

    source_lines = text.splitlines()
    snippets: list[tuple[str, int, int, str]] = []
    for region in regions:
        first, last = region.start_line, region.end_line
        body = "\n".join(source_lines[first - 1:last])
        snippets.append((region.name, first, last, body))
    return snippets
77
+
78
+
79
def run_duplication_checks(ctx: PostExecGateContext):
    """Detect duplicated code introduced or touched by the current change.

    Three passes feed one ``duplication`` check result:

    1. ``duplication.normalized_function`` — a function in a touched file
       hashes (after normalisation) to the same value as a function in an
       untouched project file.
    2. ``duplication.cross_touched_duplicate`` — two functions from the
       touched set hash to the same value.
    3. ``duplication.text_block`` / ``duplication.text_block_intra`` — a
       window of ``_BLOCK_MIN_LINES`` consecutive meaningful lines repeats
       across, or within, touched files.

    Passes 1 and 2 are skipped for full scans (see the comment at the
    ``_is_full_scan`` computation). Project files that cannot be read are
    logged and skipped rather than aborting the whole check.

    Args:
        ctx: Gate context. Reads ``repo_profile``, ``touched_files``,
            ``file_snapshots``, ``project_dir`` and, when present,
            ``is_full_scan``.

    Returns:
        The result of ``build_check_result`` for check id ``duplication``
        carrying every finding collected.
    """
    findings = []
    profile = ctx.repo_profile

    # ── Phase 1a: hash every non-trivial function in the touched files ──
    # hash -> [(path, function_name), ...]
    touched_hashes: dict[str, list[tuple[str, str]]] = {}
    for snapshot in iter_touched_snapshots(ctx):
        if not snapshot.exists or not is_source_file(snapshot.path):
            continue
        if is_test_file(snapshot.path):
            continue
        for func_name, start, end, snippet in _extract_snippets(snapshot.path, snapshot.text):
            if end - start < 3:
                continue
            if func_name in _BOILERPLATE_FUNCTION_NAMES:
                continue
            touched_hashes.setdefault(hash_normalized_code(snippet), []).append((snapshot.path, func_name))

    # cross_touched_duplicate is for INCREMENTAL diffs (2-20 touched files). A
    # full scan clears the touched hashes so it does not DOUBLE-COUNT
    # duplicate_scan (C45), which already covers the duplication. Full scan =
    # explicit ctx.is_full_scan flag (standalone self-audit sets it), OR the
    # heuristic "touched covers all snapshots and >= _MIN_FULL_SCAN_FILES".
    # The 10-file floor keeps genuine small incremental changes from being
    # mis-classified as full scans.
    _MIN_FULL_SCAN_FILES = 10
    _is_full_scan = getattr(ctx, "is_full_scan", False) or (
        len(ctx.touched_files) >= _MIN_FULL_SCAN_FILES
        and set(ctx.touched_files) >= set(ctx.file_snapshots.keys())
    )
    if _is_full_scan:
        touched_hashes.clear()

    # ── Phase 1b: compare touched functions against the rest of the project ──
    seen_pairs: set[tuple[str, str, str, str]] = set()
    file_count = 0
    # The walk is skipped entirely when there is nothing to compare against.
    for path in (ctx.project_dir.rglob("*") if touched_hashes else ()):
        if file_count > _MAX_DUPLICATION_CHECK_FILES:
            _log.warning(
                "duplication_checks: rglob limit exceeded (%d > %d), stopping early",
                file_count, _MAX_DUPLICATION_CHECK_FILES,
            )
            break
        file_count += 1
        if not path.is_file():
            continue
        repo_path = str(path.relative_to(ctx.project_dir)).replace("\\", "/")
        if not is_source_file(repo_path):
            continue
        if profile and profile.is_generated_or_vendored(repo_path):
            continue
        if repo_path in ctx.touched_files:
            continue
        if is_test_file(repo_path):
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            # One unreadable file (permissions, vanished mid-walk) must not
            # abort the whole gate.
            _log.warning("duplication_checks: cannot read %s, skipping: %s", repo_path, exc)
            continue
        for func_name, start, end, snippet in _extract_snippets(repo_path, text):
            if end - start < 3:
                continue
            if func_name in _BOILERPLATE_FUNCTION_NAMES:
                continue
            matches = touched_hashes.get(hash_normalized_code(snippet)) or []
            for match_path, match_name in matches:
                pair = (match_path, match_name, repo_path, func_name)
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                findings.append(
                    build_finding(
                        check_id="duplication.normalized_function",
                        category=GateCategory.DUPLICATION,
                        title="Touched code duplicates existing logic under a different location",
                        severity=GateSeverity.HIGH,
                        impact=GateImpact.REVISE,
                        summary=f"{match_path}::{match_name} is near-duplicate of {repo_path}::{func_name}.",
                        recommendation=(
                            "Remove the duplicate and import the canonical implementation directly. "
                            "If both locations need minor variations, add a parameter to the canonical function "
                            "rather than forking a copy."
                        ),
                        evidence=[
                            EvidenceReference(kind="file", path=match_path, detail=match_name),
                            EvidenceReference(kind="file", path=repo_path, detail=func_name),
                        ],
                        repair_kind=RepairKind.REMOVE_DUPLICATE.value,
                        executor_action=f"Remove {match_path}::{match_name} — near-duplicate of {repo_path}::{func_name}; import canonical instead",
                        proof_required="duplicate removed; original passes existing tests",
                        allowlist_allowed=False,
                    )
                )

    # ── Phase 1c: duplicates among the touched functions themselves ──
    for locations in touched_hashes.values():
        if len(locations) < 2:
            continue
        for first_idx, (path_a, name_a) in enumerate(locations):
            for path_b, name_b in locations[first_idx + 1:]:
                pair = (path_a, name_a, path_b, name_b)
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                findings.append(
                    build_finding(
                        check_id="duplication.cross_touched_duplicate",
                        category=GateCategory.DUPLICATION,
                        title="Two newly-touched files contain duplicate function implementations",
                        severity=GateSeverity.HIGH,
                        impact=GateImpact.REVISE,
                        summary=f"{path_a}::{name_a} is a near-duplicate of {path_b}::{name_b} -- both were touched in this change.",
                        recommendation=(
                            "Consolidate into one canonical implementation. "
                            "Move it to `<package>/shared.py` or `<package>/utils.py` if both files are in the same package, "
                            "or to a common cross-package helper module. "
                            "Replace both copies with an import."
                        ),
                        evidence=[
                            EvidenceReference(kind="file", path=path_a, detail=name_a),
                            EvidenceReference(kind="file", path=path_b, detail=name_b),
                        ],
                        repair_kind=RepairKind.EXTRACT_SHARED.value,
                        executor_action=f"Extract shared impl from {path_a}::{name_a} and {path_b}::{name_b} into canonical module; replace both with import",
                        proof_required="one canonical impl; both callers import it; tests pass",
                        allowlist_allowed=False,
                    )
                )

    # ── Phase 2: Universal text block duplication ──
    # Catches repeated HTML/CSS/JS/config blocks across files — not just Python functions.
    # Works by hashing sliding windows of N consecutive non-empty lines.
    _BLOCK_MIN_LINES = 12
    _BLOCK_IGNORE_PREFIXES = ("#", "//", "/*", "*", "import ", "from ", '"""', "'''", "assert ", "return ")
    _SKIP_DIRS = (".vendor", "node_modules", "migrations", "__generated__", "__pycache__", "gate_checks")
    block_hashes: dict[str, list[tuple[str, int]]] = {}  # hash -> [(file, start_line), ...]

    _MAX_TEXT_BLOCK_FINDINGS = 50
    _text_block_count = 0

    for snapshot in iter_touched_snapshots(ctx):
        if not snapshot.exists or not snapshot.text:
            continue
        # Skip test files — they naturally have repetitive assertion patterns
        norm_path = snapshot.path.replace("\\", "/")
        if norm_path.split("/")[-1].startswith("test_"):
            continue
        # Skip vendored/generated directories
        if any(f"/{d}/" in f"/{norm_path}/" for d in _SKIP_DIRS):
            continue
        lines = snapshot.text.splitlines()
        # Lines that sit inside a string literal (docstrings, long multi-line
        # strings). Shared docstrings / :param blocks across sync/async API
        # mirrors are documentation, not duplicated CODE — exclude them.
        docstring_lines = (
            _string_literal_lines(snapshot.text)
            if is_source_file(snapshot.path) and snapshot.path.endswith(".py")
            else frozenset()
        )
        # Filter to meaningful lines (skip empty, comments, imports, docstrings,
        # and pure parameter-declaration / argument-continuation lines).
        meaningful: list[tuple[int, str]] = []
        for line_no, line in enumerate(lines, start=1):
            if line_no in docstring_lines:
                continue
            stripped = line.strip()
            if not stripped:
                continue
            if stripped.startswith(_BLOCK_IGNORE_PREFIXES):
                continue
            if _IMPORT_CONTINUATION_RE.match(stripped):
                continue
            if _PARAM_DECL_RE.match(stripped):
                continue
            meaningful.append((line_no, stripped))

        # Sliding window of _BLOCK_MIN_LINES
        for idx in range(len(meaningful) - _BLOCK_MIN_LINES + 1):
            window = meaningful[idx:idx + _BLOCK_MIN_LINES]
            block_text = "\n".join(content for _, content in window)
            start_line = window[0][0]
            entries = block_hashes.setdefault(hash_text_block(block_text), [])
            # Don't add overlapping blocks from the same file
            if entries and entries[-1][0] == snapshot.path and abs(entries[-1][1] - start_line) < _BLOCK_MIN_LINES:
                continue
            entries.append((snapshot.path, start_line))

    # ── Collapse per-line-window inflation ──
    # A single duplicated REGION of N lines produces N-_BLOCK_MIN_LINES+1
    # distinct window hashes, all sharing the same set of files at adjacent
    # start lines. Emitting one finding per hash inflated the count (~13
    # findings for one shared block on filelock). Group the duplicate hashes by
    # the set of files involved, then merge windows whose start lines are within
    # _BLOCK_MIN_LINES of each other (contiguous/overlapping = same region) so
    # each duplicated region yields exactly ONE finding.
    #
    # group key = frozenset of files; value = {file: [start_line, ...]}
    region_groups: dict[frozenset, dict[str, list[int]]] = {}
    for locations in block_hashes.values():
        if len(locations) < 2:
            continue
        files_in_hash = frozenset(path for path, _ in locations)
        # Intra-file: only meaningful if the SAME block repeats at >=2 spots.
        if len(files_in_hash) < 2 and len({ln for _, ln in locations}) < 2:
            continue
        bucket = region_groups.setdefault(files_in_hash, {})
        for path, ln in locations:
            bucket.setdefault(path, []).append(ln)

    def _merge_starts(starts: list[int]) -> list[int]:
        """Collapse start lines within _BLOCK_MIN_LINES of each other into the
        first line of each contiguous region."""
        if not starts:
            return []
        ordered = sorted(set(starts))
        regions = [ordered[0]]
        for ln in ordered[1:]:
            if ln - regions[-1] >= _BLOCK_MIN_LINES:
                regions.append(ln)
        return regions

    for per_file in region_groups.values():
        if _text_block_count >= _MAX_TEXT_BLOCK_FINDINGS:
            break
        unique_files = sorted(per_file.keys())
        # Region anchor = first start line in each file (after merging).
        merged_per_file = {f: _merge_starts(per_file[f]) for f in unique_files}

        if len(unique_files) >= 2:
            # One cross-file finding per duplicated region. The number of regions
            # is the max region count across the involved files.
            region_count = max(len(v) for v in merged_per_file.values()) or 1
            for region_idx in range(region_count):
                if _text_block_count >= _MAX_TEXT_BLOCK_FINDINGS:
                    break
                # Files with fewer regions reuse their last region's anchor.
                file_lines_map = {
                    f: merged_per_file[f][min(region_idx, len(merged_per_file[f]) - 1)]
                    for f in unique_files
                }
                _text_block_count += 1
                file_list = ", ".join(f"{f} (line {file_lines_map[f]})" for f in unique_files[:5])
                suffix = f" and {len(unique_files) - 5} more" if len(unique_files) > 5 else ""
                findings.append(
                    build_finding(
                        check_id="duplication.text_block",
                        category=GateCategory.DUPLICATION,
                        title="Repeated text block across files",
                        severity=GateSeverity.MEDIUM,
                        impact=GateImpact.REVISE,
                        summary=(
                            f"Duplicated {_BLOCK_MIN_LINES}+ line block found in {len(unique_files)} files: "
                            f"{file_list}{suffix}"
                        ),
                        recommendation=(
                            "Extract the repeated block into a shared function, template, or constant. "
                            "If the files belong to the same package — add a `shared.py` or `utils.py` module there. "
                            "If they span multiple packages — move the helper to the nearest common ancestor package."
                        ),
                        evidence=[
                            EvidenceReference(kind="file", path=f, detail=f"line:{file_lines_map[f]}")
                            for f in unique_files[:5]
                        ],
                        repair_kind=RepairKind.EXTRACT_SHARED.value,
                        executor_action=f"Extract duplicated block from {unique_files[0]} et al. into shared helper; replace each occurrence with a call",
                        proof_required="no repeated block; tests pass",
                    )
                )
        else:
            # Intra-file duplication (same block repeated within one file).
            file_path = unique_files[0]
            file_lines = merged_per_file[file_path]
            if len(file_lines) >= 2:
                _text_block_count += 1
                summary_lines = ", ".join(str(ln) for ln in file_lines[:5])
                action_lines = ", ".join(str(ln) for ln in file_lines[:3])
                findings.append(
                    build_finding(
                        check_id="duplication.text_block_intra",
                        category=GateCategory.DUPLICATION,
                        title="Repeated text block within same file",
                        severity=GateSeverity.MEDIUM,
                        impact=GateImpact.REVISE,
                        summary=(
                            f"{file_path} has {len(file_lines)} copies of a {_BLOCK_MIN_LINES}+ line block "
                            f"(lines {summary_lines}). "
                            f"Extract to a shared helper."
                        ),
                        recommendation=(
                            "Extract the repeated block into a private helper function within the same file "
                            "and call it from each location. "
                            "If the same logic is needed elsewhere — move the helper to `<package>/shared.py`."
                        ),
                        evidence=[
                            EvidenceReference(kind="file", path=file_path, detail=f"line:{ln}")
                            for ln in file_lines[:3]
                        ],
                        repair_kind=RepairKind.EXTRACT_SHARED.value,
                        executor_action=f"Extract repeated block in {file_path} (lines {action_lines}) into private helper; replace each copy with a call",
                        proof_required="no repeated block; tests pass",
                    )
                )

    return build_check_result(check_id="duplication", category=GateCategory.DUPLICATION, findings=findings)