vigil-codeintel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
  2. vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
  3. vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
  4. vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
  5. vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
  7. vigil_forensic/__init__.py +224 -0
  8. vigil_forensic/_git_utils.py +178 -0
  9. vigil_forensic/_shared.py +510 -0
  10. vigil_forensic/_stubs.py +156 -0
  11. vigil_forensic/gate_checks/__init__.py +1 -0
  12. vigil_forensic/gate_checks/_ast_helpers.py +629 -0
  13. vigil_forensic/gate_checks/_deployment_detector.py +573 -0
  14. vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
  15. vigil_forensic/gate_checks/authority_checks.py +95 -0
  16. vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
  17. vigil_forensic/gate_checks/broad_except_checks.py +301 -0
  18. vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
  19. vigil_forensic/gate_checks/common.py +253 -0
  20. vigil_forensic/gate_checks/config_safety_checks.py +704 -0
  21. vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
  22. vigil_forensic/gate_checks/conflict_checks.py +193 -0
  23. vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
  24. vigil_forensic/gate_checks/context_health_checks.py +289 -0
  25. vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
  26. vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
  27. vigil_forensic/gate_checks/duplication_checks.py +387 -0
  28. vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
  29. vigil_forensic/gate_checks/empty_output_checks.py +87 -0
  30. vigil_forensic/gate_checks/encoding_checks.py +847 -0
  31. vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
  32. vigil_forensic/gate_checks/fallback_checks.py +41 -0
  33. vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
  34. vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
  35. vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
  36. vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
  37. vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
  38. vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
  39. vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
  40. vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
  41. vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
  42. vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
  43. vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
  44. vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
  45. vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
  46. vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
  47. vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
  48. vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
  49. vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
  50. vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
  51. vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
  52. vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
  53. vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
  54. vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
  55. vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
  56. vigil_forensic/gate_checks/hallucination_checks.py +566 -0
  57. vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
  58. vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
  59. vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
  60. vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
  61. vigil_forensic/gate_checks/ml_checks.py +318 -0
  62. vigil_forensic/gate_checks/performance_checks.py +106 -0
  63. vigil_forensic/gate_checks/project_specific_runner.py +691 -0
  64. vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
  65. vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
  66. vigil_forensic/gate_checks/reliability_checks.py +389 -0
  67. vigil_forensic/gate_checks/reporting_checks.py +55 -0
  68. vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
  69. vigil_forensic/gate_checks/security_injection_checks.py +332 -0
  70. vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
  71. vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
  72. vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
  73. vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
  74. vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
  75. vigil_forensic/gate_checks/test_quality_checks.py +946 -0
  76. vigil_forensic/gate_checks/testing_checks.py +149 -0
  77. vigil_forensic/gate_checks/toctou_checks.py +367 -0
  78. vigil_forensic/gate_checks/type_checking_checks.py +316 -0
  79. vigil_forensic/gate_models.py +392 -0
  80. vigil_forensic/gate_packs/__init__.py +1 -0
  81. vigil_forensic/gate_packs/universal.py +179 -0
  82. vigil_forensic/gate_profile.json +31 -0
  83. vigil_forensic/gate_registry.py +21 -0
  84. vigil_forensic/language_profiles.py +219 -0
  85. vigil_forensic/meta_findings.py +207 -0
  86. vigil_forensic/self_audit.py +725 -0
  87. vigil_forensic/source_analysis.py +175 -0
  88. vigil_mapper/__init__.py +103 -0
  89. vigil_mapper/_ast_helpers_minimal.py +229 -0
  90. vigil_mapper/_extract_imports_impl.py +123 -0
  91. vigil_mapper/_file_count_guard.py +129 -0
  92. vigil_mapper/_git_utils.py +178 -0
  93. vigil_mapper/_runtime_ast.py +438 -0
  94. vigil_mapper/_runtime_dispatch.py +137 -0
  95. vigil_mapper/_seed_helpers.py +82 -0
  96. vigil_mapper/authority_builder.py +1102 -0
  97. vigil_mapper/cli_entry.py +731 -0
  98. vigil_mapper/conflict_builder.py +818 -0
  99. vigil_mapper/data_contract_builder.py +446 -0
  100. vigil_mapper/findings_builder.py +716 -0
  101. vigil_mapper/fingerprint.py +53 -0
  102. vigil_mapper/hotspot_builder.py +539 -0
  103. vigil_mapper/map_common.py +449 -0
  104. vigil_mapper/map_errors.py +55 -0
  105. vigil_mapper/map_models.py +431 -0
  106. vigil_mapper/map_models_ext.py +206 -0
  107. vigil_mapper/map_models_findings.py +130 -0
  108. vigil_mapper/map_storage.py +455 -0
  109. vigil_mapper/parse_cache.py +795 -0
  110. vigil_mapper/refactor_boundary_builder.py +266 -0
  111. vigil_mapper/runtime_builder.py +527 -0
  112. vigil_mapper/runtime_tracer.py +243 -0
  113. vigil_mapper/runtime_tracer_entry.py +199 -0
  114. vigil_mapper/semantic_diff.py +71 -0
  115. vigil_mapper/source_adapters/__init__.py +109 -0
  116. vigil_mapper/source_adapters/_base.py +264 -0
  117. vigil_mapper/source_adapters/_ir.py +156 -0
  118. vigil_mapper/source_adapters/_lexer.py +309 -0
  119. vigil_mapper/source_adapters/_patterns.py +212 -0
  120. vigil_mapper/source_adapters/_treesitter.py +182 -0
  121. vigil_mapper/source_adapters/go.py +553 -0
  122. vigil_mapper/source_adapters/java.py +541 -0
  123. vigil_mapper/source_adapters/javascript.py +626 -0
  124. vigil_mapper/source_adapters/python.py +325 -0
  125. vigil_mapper/source_adapters/typescript.py +749 -0
  126. vigil_mapper/structural_builder.py +586 -0
  127. vigil_mcp/__init__.py +1 -0
  128. vigil_mcp/_jobs.py +587 -0
  129. vigil_mcp/_paths.py +93 -0
  130. vigil_mcp/forensic_server.py +419 -0
  131. vigil_mcp/map_server.py +452 -0
@@ -0,0 +1,763 @@
1
+ """Data handling, time, duplication, and dependency quality. Clusters 44-50.
2
+
3
+ Clusters:
4
+ 44 - Naive Timezone Usage
5
+ 45 - Intra-File Near-Duplicate Code
6
+ 46 - Missing Null/None Check at API Boundary
7
+ 47 - String Concatenation for Paths
8
+ 48 - Log Without Error Context
9
+ 49 - Secrets in Test Files
10
+ 50 - Unpinned Dependencies
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from .core import detect_language
15
+ from ...gate_models import (
16
+ EvidenceReference,
17
+ GateCategory,
18
+ GateFinding,
19
+ GateImpact,
20
+ GateSeverity,
21
+ RepairKind,
22
+ )
23
+ from ..common import build_finding
24
+ import logging
25
+ _log = logging.getLogger(__name__)
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Cluster 44: Naive Timezone Usage
30
+ # ---------------------------------------------------------------------------
31
+
32
+
33
def assess_naive_timezone(
    file_path: str,
    content: str,
) -> list[GateFinding]:
    """Cluster 44: flag clock/date calls that ignore timezone or locale.

    Only Python, JavaScript and TypeScript files are scanned, and files whose
    basename starts with ``test_`` or ``conftest`` are exempt.  The scan is
    line based: a line that starts with the language's comment marker is
    ignored, every other line is matched against each rule in turn.

    Args:
        file_path: Path of the file being assessed (used for language
            detection and for labelling findings).
        content: Full text of the file.

    Returns:
        One finding per rule hit.  Scanning stops after the first line that
        brings the total to 10 or more findings.
    """
    import re

    if not content.strip():
        return []

    language = detect_language(file_path)
    if language not in ("python", "javascript", "typescript"):
        return []

    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    if basename.startswith(("test_", "conftest")):
        return []

    # Each rule: (pattern, detail head, detail tail, severity, impact,
    #             recommendation, executor-action prefix).
    if language == "python":
        comment_marker = "#"
        rules = (
            (
                r'datetime\.now\s*\(\s*\)',
                "datetime.now() without timezone",
                "use datetime.now(tz=timezone.utc)",
                GateSeverity.MEDIUM,
                GateImpact.REVISE,
                "Use `datetime.now(tz=timezone.utc)` for timezone-aware datetimes.",
                "Fix naive datetime",
            ),
            (
                r'datetime\.utcnow\s*\(',
                "datetime.utcnow() is deprecated",
                "use datetime.now(tz=timezone.utc)",
                GateSeverity.MEDIUM,
                GateImpact.REVISE,
                "Replace `datetime.utcnow()` with `datetime.now(tz=timezone.utc)`.",
                "Fix deprecated utcnow()",
            ),
            (
                r'time\.localtime\s*\(\s*\)',
                "time.localtime() without timezone",
                "use time.gmtime() or datetime",
                GateSeverity.LOW,
                GateImpact.WARN,
                "Use `time.gmtime()` or `datetime.now(tz=timezone.utc)` instead.",
                "Fix localtime()",
            ),
        )
    else:
        comment_marker = "//"
        rules = (
            (
                r'\.toLocaleDateString\s*\(\s*\)',
                "toLocaleDateString() without locale",
                "specify locale explicitly",
                GateSeverity.LOW,
                GateImpact.WARN,
                "Pass explicit locale: `.toLocaleDateString('en-US', { timeZone: 'UTC' })`.",
                "Fix toLocaleDateString()",
            ),
        )

    findings: list[GateFinding] = []
    for line_no, raw_line in enumerate(content.splitlines(), 1):
        text = raw_line.strip()
        if text.startswith(comment_marker):
            continue
        for pattern, head, tail, severity, impact, advice, action in rules:
            if not re.search(pattern, text):
                continue
            detail = f"{head} (line {line_no}) -- {tail}"
            findings.append(build_finding(
                check_id="timezone_scan",
                category=GateCategory.RUNTIME_BEHAVIOR,
                title=f"[naive_timezone] {file_path}:{line_no}",
                severity=severity,
                impact=impact,
                summary=detail,
                recommendation=advice,
                evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
                repair_kind=RepairKind.FIX_CONTRACT.value,
                executor_action=f"{action} at {file_path}:{line_no}",
            ))
        # The cap is checked once per line, after all rules have run.
        if len(findings) >= 10:
            break

    return findings
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Cluster 45: Intra-File Near-Duplicate Code
130
+ # ---------------------------------------------------------------------------
131
+
132
+
133
+ import re as _re
134
+
135
+ # FP-round2-D (2026-06-28): signature / typing scaffolding line shapes that must
136
+ # NOT count as meaningful duplicate lines. These repeat by language requirement
137
+ # (typing overloads) or API symmetry and are not refactorable logic:
138
+ # * decorator lines: ``@t.overload`` / ``@property`` / ``@staticmethod``
139
+ # * def openers / closers: ``def f(`` / ``async def f(`` / ``): ...`` / ``) -> X:``
140
+ # * bare parameter declarations inside a multi-line signature:
141
+ # ``name`` | ``name,`` | ``name=default,`` | ``name: type,`` | ``*args,``
142
+ # where the value is a simple literal/identifier (NOT a function call, so a
143
+ # real statement like ``record = build_record(...)`` is never skipped).
144
+ # * lone ellipsis stub bodies: ``...``
145
+ _SCAFFOLD_PARAM_RE = _re.compile(
146
+ r"^\*{0,2}[A-Za-z_]\w*" # name, *args, **kwargs
147
+ r"(?:\s*:\s*[^=(]+?)?" # optional annotation (no call parens)
148
+ r"(?:\s*=\s*[^(]+?)?" # optional simple default (no call parens)
149
+ r",?$" # optional trailing comma
150
+ )
151
+
152
+
153
+ def _is_signature_scaffolding(s: str) -> bool:
154
+ """True if normalized line *s* is signature / typing scaffolding."""
155
+ if s == "..." or s.endswith("): ...") or s == "): ..." or s.endswith(") -> ..."):
156
+ return True
157
+ if s.startswith("@"): # decorator
158
+ return True
159
+ if s.startswith("def ") or s.startswith("async def "):
160
+ # ``def f(`` opener (possibly with the full single-line signature). A
161
+ # single-line def with a body on the same line is rare; treat the
162
+ # ``def`` header as scaffolding either way.
163
+ return True
164
+ if s in (")", "):", "->", ") ->"):
165
+ return True
166
+ # Closer with return type only: ``) -> SomeType:`` (no other statement).
167
+ if s.startswith(")") and s.endswith(":"):
168
+ return True
169
+ # Bare parameter declaration line inside a multi-line signature.
170
+ if _SCAFFOLD_PARAM_RE.match(s) and "(" not in s:
171
+ return True
172
+ return False
173
+
174
+
175
def assess_near_duplicate_code(
    file_path: str,
    content: str,
) -> list[GateFinding]:
    """Cluster 45: Detect near-duplicate code blocks within the same file.

    Lines are stripped, whitespace-collapsed and filtered: blank lines,
    comment lines, lone structural tokens (``{``, ``pass``, ``else:`` ...) and
    signature scaffolding are dropped.  Every window of ``BLOCK_SIZE``
    consecutive surviving lines is fingerprinted; a window whose fingerprint
    was first seen at least ``BLOCK_SIZE`` source lines earlier forms a
    duplicate pair with that FIRST occurrence.

    A duplicated region of N meaningful lines spans N-BLOCK_SIZE+1 overlapping
    windows, so window pairs are merged into one region per contiguous run and
    each region is reported once, e.g. "lines 118-121 <-> 201-204".

    Pairs are grouped by their offset (``dup_idx - orig_idx``) before merging.
    Ordering them by ``(orig_idx, dup_idx)`` alone interleaves the pairs of a
    block that is copied more than once (A<->B, A<->C, A<->B, ...); no two
    consecutive pairs then advance together, every window becomes its own
    region, and all of them fall below ``MIN_DUP_REGION_LINES`` and vanish.

    Args:
        file_path: Path of the file being assessed (used for language
            detection and for labelling findings).
        content: Full text of the file.

    Returns:
        At most 10 findings, ordered by the position of the original block.
        Empty for blank content, for data/markup languages, for files shorter
        than 10 lines, or when no region reaches ``MIN_DUP_REGION_LINES``.
    """
    if not content.strip():
        return []

    lang = detect_language(file_path)
    if lang in ("json", "yaml", "toml", "markdown", "restructuredtext", "sql"):
        return []

    lines = content.splitlines()
    if len(lines) < 10:
        return []

    BLOCK_SIZE = 4
    # Minimum number of meaningful (normalized, non-scaffolding) lines a merged
    # region must span.  With BLOCK_SIZE == 4 this means at least two
    # consecutive matching windows; single 4-line mirrors are dropped.
    MIN_DUP_REGION_LINES = 5
    trivial_lines = ("}", "{", "pass", "return", "break", "continue", "else:", "try:", "finally:")

    # (whitespace-collapsed text, 1-based source line number)
    normalized: list[tuple[str, int]] = []
    for line_no, line in enumerate(lines, 1):
        s = line.strip()
        if not s or s.startswith(("#", "//", "*")):
            continue
        if s in trivial_lines:
            continue
        # Overload stubs and parameter-list mirrors repeat by necessity and
        # must not accumulate as "meaningful" duplicate lines.
        if _is_signature_scaffolding(s):
            continue
        normalized.append((" ".join(s.split()), line_no))

    if len(normalized) < BLOCK_SIZE * 2:
        return []

    # Pass 1: pair every repeated window with the first window that produced
    # the same fingerprint.  Indices refer to ``normalized``.
    seen: dict[str, tuple[int, int]] = {}  # fingerprint -> (norm_idx, line_no)
    raw_pairs: list[tuple[int, int]] = []  # (orig_norm_idx, dup_norm_idx)
    for idx in range(len(normalized) - BLOCK_SIZE + 1):
        fp = "\n".join(text for text, _ in normalized[idx:idx + BLOCK_SIZE])
        first_line = normalized[idx][1]
        if fp not in seen:
            seen[fp] = (idx, first_line)
            continue
        orig_idx, orig_line = seen[fp]
        # Windows closer than BLOCK_SIZE source lines overlap themselves.
        if abs(first_line - orig_line) >= BLOCK_SIZE:
            raw_pairs.append((orig_idx, idx))

    if not raw_pairs:
        return []

    # Pass 2: merge pairs into runs.  Two pairs belong to the same duplicated
    # block when both indices advance by exactly one step together, i.e. they
    # share the same offset and have consecutive original indices.  Sorting by
    # (offset, orig_idx) places such pairs next to each other even when the
    # same original block has several copies.
    raw_pairs.sort(key=lambda pair: (pair[1] - pair[0], pair[0]))
    runs: list[list[int]] = []  # [orig_first_idx, dup_first_idx, window_count]
    for orig_idx, dup_idx in raw_pairs:
        if runs:
            run_orig, run_dup, run_len = runs[-1]
            if orig_idx == run_orig + run_len and dup_idx == run_dup + run_len:
                runs[-1][2] = run_len + 1
                continue
        runs.append([orig_idx, dup_idx, 1])

    # Convert runs to source-line spans.  A run of ``count`` windows covers
    # exactly ``count + BLOCK_SIZE - 1`` consecutive normalized lines.
    # (orig_start, orig_end, dup_start, dup_end, meaningful_line_count)
    regions: list[tuple[int, int, int, int, int]] = []
    for run_orig, run_dup, count in runs:
        span = count + BLOCK_SIZE - 1
        regions.append((
            normalized[run_orig][1],
            normalized[run_orig + span - 1][1],
            normalized[run_dup][1],
            normalized[run_dup + span - 1][1],
            span,
        ))
    # Report in file order of the original block, then of the duplicate.
    regions.sort(key=lambda region: (region[0], region[2]))

    findings: list[GateFinding] = []
    for orig_start, orig_end, dup_start, dup_end, meaningful in regions:
        if meaningful < MIN_DUP_REGION_LINES:
            continue
        # Raw source span; may include blank/comment lines inside the block.
        n_lines = orig_end - orig_start + 1
        detail = (
            f"Near-duplicate block at lines {orig_start}-{orig_end} <-> "
            f"{dup_start}-{dup_end} ({n_lines} lines)"
        )
        findings.append(build_finding(
            check_id="duplicate_scan",
            category=GateCategory.DRIFT,
            title=f"[near_duplicate_code] {file_path}:{dup_start}",
            severity=GateSeverity.LOW,
            impact=GateImpact.WARN,
            summary=detail,
            recommendation="Extract the duplicate block into a shared function.",
            evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
            repair_kind=RepairKind.REMOVE_DUPLICATE.value,
            executor_action=f"Deduplicate code block at {file_path}:{dup_start}",
        ))
        if len(findings) >= 10:
            break

    return findings
316
+
317
+
318
+ # ---------------------------------------------------------------------------
319
+ # Cluster 46: Missing Null/None Check at API Boundary
320
+ # ---------------------------------------------------------------------------
321
+
322
+
323
def assess_missing_null_check(
    file_path: str,
    content: str,
) -> list[GateFinding]:
    """Cluster 46: flag unchecked key/field access on request or JSON payloads.

    Python lines are checked for ``request.json[...]``, ``request.form[...]``
    and ``json.loads(...)[...]``.  JavaScript/TypeScript lines are checked for
    ``req.body.<field>`` (unless the line uses ``?.`` or is an ``if (...)``
    that mentions ``req.body``) and for ``JSON.parse(...).<field>``.  Lines
    starting with the language's comment marker are ignored, and files whose
    basename starts with ``test_`` or ``conftest`` are exempt.

    Args:
        file_path: Path of the file being assessed.
        content: Full text of the file.

    Returns:
        One finding per hit.  Scanning stops after the first line that brings
        the total to 10 or more findings.
    """
    import re

    if not content.strip():
        return []

    language = detect_language(file_path)
    if language not in ("python", "javascript", "typescript"):
        return []

    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    if basename.startswith(("test_", "conftest")):
        return []

    findings: list[GateFinding] = []

    def _report(line_no: int, detail: str, advice: str, action: str) -> None:
        # Every finding in this cluster shares severity, impact and repair kind.
        findings.append(build_finding(
            check_id="null_check_scan",
            category=GateCategory.RUNTIME_BEHAVIOR,
            title=f"[missing_null_check] {file_path}:{line_no}",
            severity=GateSeverity.MEDIUM,
            impact=GateImpact.REVISE,
            summary=detail,
            recommendation=advice,
            evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
            repair_kind=RepairKind.ADD_BOUNDARY_CHECK.value,
            executor_action=f"{action} at {file_path}:{line_no}",
        ))

    # (pattern, detail text before the line suffix, recommendation, action)
    python_rules = (
        (
            r'request\.json\s*\[',
            "request.json[key] without .get() -- KeyError if missing",
            "Use `request.json.get('key')` with a default value.",
            "Fix missing null check",
        ),
        (
            r'request\.form\s*\[',
            "request.form[key] without .get() -- KeyError if missing",
            "Use `request.form.get('key')` with a default value.",
            "Fix missing null check",
        ),
        (
            r'json\.loads\s*\([^)]+\)\s*\[',
            "json.loads()[key] -- chain of failure points",
            "Assign json.loads() to a variable and use .get() for key access.",
            "Fix chained failure points",
        ),
    )

    is_python = language == "python"
    comment_marker = "#" if is_python else "//"

    for line_no, raw_line in enumerate(content.splitlines(), 1):
        text = raw_line.strip()
        if text.startswith(comment_marker):
            continue

        if is_python:
            for pattern, what, advice, action in python_rules:
                if re.search(pattern, text):
                    _report(line_no, f"{what} (line {line_no})", advice, action)
        else:
            unguarded_body_access = (
                re.search(r'req\.body\.\w+', text)
                and "?." not in text
                and not re.search(r'if\s*\(.*req\.body', text)
            )
            if unguarded_body_access:
                _report(
                    line_no,
                    f"req.body.field without null check (line {line_no}) -- use ?. or validate first",
                    "Use optional chaining (`?.`) or validate `req.body` before accessing fields.",
                    "Fix missing null check",
                )
            if re.search(r'JSON\.parse\s*\([^)]+\)\.\w+', text):
                _report(
                    line_no,
                    f"JSON.parse().field -- chain of failure points (line {line_no})",
                    "Assign JSON.parse() to a variable and use optional chaining for field access.",
                    "Fix chained failure points",
                )

        if len(findings) >= 10:
            break

    return findings
431
+
432
+
433
+ # ---------------------------------------------------------------------------
434
+ # Cluster 47: String Concatenation for Paths
435
+ # ---------------------------------------------------------------------------
436
+
437
+
438
def assess_path_concatenation(
    file_path: str,
    content: str,
) -> list[GateFinding]:
    """Cluster 47: flag file paths assembled with ``+`` or f-strings.

    A line is reported when it concatenates an identifier with a quoted
    single ``/`` or ``\\`` AND mentions a path-like word (path, dir, file,
    folder, name, root, base).  In Python files a line is also reported when
    an f-string contains ``{...}/{...}``.  Lines starting with ``#`` or
    ``//`` and lines containing an ``http://`` / ``https://`` URL are
    ignored; files whose basename starts with ``test_`` or ``conftest`` are
    exempt.

    Args:
        file_path: Path of the file being assessed.
        content: Full text of the file.

    Returns:
        One finding per hit.  Scanning stops after the first line that brings
        the total to 10 or more findings.
    """
    import re

    if not content.strip():
        return []

    language = detect_language(file_path)
    if language not in ("python", "javascript", "typescript"):
        return []

    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    if basename.startswith(("test_", "conftest")):
        return []

    path_hints = ("path", "dir", "file", "folder", "name", "root", "base")
    findings: list[GateFinding] = []

    def _report(line_no: int, detail: str, advice: str, action: str) -> None:
        findings.append(build_finding(
            check_id="path_concat_scan",
            category=GateCategory.CONTRACT,
            title=f"[path_concatenation] {file_path}:{line_no}",
            severity=GateSeverity.LOW,
            impact=GateImpact.WARN,
            summary=detail,
            recommendation=advice,
            evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
            repair_kind=RepairKind.FIX_CONTRACT.value,
            executor_action=f"{action} at {file_path}:{line_no}",
        ))

    for line_no, raw_line in enumerate(content.splitlines(), 1):
        text = raw_line.strip()
        if text.startswith(("#", "//")):
            continue
        # URLs legitimately contain '/' separators.
        if "http://" in text or "https://" in text:
            continue

        if re.search(r'\w+\s*\+\s*["\'][/\\]["\']', text):
            lowered = text.lower()
            if any(hint in lowered for hint in path_hints):
                _report(
                    line_no,
                    f"String concat for path building (line {line_no}) -- use os.path.join / Path",
                    "Use `os.path.join()` or `pathlib.Path` instead of string concatenation.",
                    "Fix path concatenation",
                )

        if language == "python" and re.search(r'f["\'][^"\']*\{[^}]+\}/\{[^}]+\}', text):
            _report(
                line_no,
                f"f-string path building (line {line_no}) -- use os.path.join / Path",
                "Use `os.path.join()` or `pathlib.Path` instead of f-string path building.",
                "Fix f-string path building",
            )

        if len(findings) >= 10:
            break

    return findings
498
+
499
+
500
+ # ---------------------------------------------------------------------------
501
+ # Cluster 48: Log Without Error Context
502
+ # ---------------------------------------------------------------------------
503
+
504
+
505
def assess_log_without_context(
    file_path: str,
    content: str,
) -> list[GateFinding]:
    """Cluster 48: Detect error logging without exception context.

    The scan is line based and tracks handler blocks by indentation: a handler
    starts on an ``except`` / ``catch`` line and ends at the first non-blank
    line indented no deeper than that line (a lone ``}`` does not end a JS
    catch block).

    Python: inside an ``except`` block, a ``logger`` / ``logging`` / ``log``
    call whose method name ends in ``error``, ``exception`` or ``critical`` is
    reported unless the line mentions the bound exception name, ``exc_info``,
    ``traceback`` or ``.exception(``.

    JavaScript / TypeScript: inside a ``catch (name)`` block, a
    ``console.error(...)`` call is reported when ``name`` does not appear as a
    whole word in the text following ``console.error(``.  This is a textual
    check, so a name that only occurs inside a string literal still counts.

    Java files pass the language gate but no Java rule is implemented here, so
    they always yield an empty list.

    Args:
        file_path: Path of the file being assessed.
        content: Full text of the file.

    Returns:
        One finding per offending log line; scanning stops once 10 findings
        have been collected.
    """
    import re

    if not content.strip():
        return []

    lang = detect_language(file_path)
    if lang not in ("python", "javascript", "typescript", "java"):
        return []

    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    if basename.startswith(("test_", "conftest")):
        return []

    findings: list[GateFinding] = []
    lines = content.splitlines()

    def _report(line_no: int, detail: str, advice: str, action: str) -> None:
        findings.append(build_finding(
            check_id="log_context_scan",
            category=GateCategory.REPORTING,
            title=f"[log_without_context] {file_path}:{line_no}",
            severity=GateSeverity.LOW,
            impact=GateImpact.WARN,
            summary=detail,
            recommendation=advice,
            evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
            repair_kind=RepairKind.ADD_PROOF.value,
            executor_action=f"{action} at {file_path}:{line_no}",
        ))

    if lang == "python":
        # Accepts bare ``except:``, ``except*``, dotted names and tuples of
        # exception types, not just a single identifier.  Group 1 is the
        # ``as`` name when present.
        except_re = re.compile(r'^except\b(?:\s*\*)?[^:]*?(?:\s+as\s+(\w+))?\s*:')
        # ``logger`` / ``logging`` anywhere, or ``log`` not preceded by a
        # letter (so ``log.`` and ``_log.`` match but ``catalog.`` does not).
        log_call_re = re.compile(
            r'(?:logger|logging|(?<![A-Za-z])log)\.\w*(?:error|exception|critical)\s*\('
        )
        context_markers = ("exc_info", "traceback", ".exception(")

        in_except = False
        except_var = None
        except_indent = 0

        for i, line in enumerate(lines, 1):
            stripped = line.strip()
            indent = len(line) - len(line.lstrip())

            m = except_re.match(stripped)
            if m:
                in_except = True
                except_var = m.group(1)
                except_indent = indent
                continue

            if not in_except:
                continue

            # A non-blank line at or left of the ``except`` ends the handler.
            if indent <= except_indent and stripped:
                in_except = False
                except_var = None
                continue

            if not log_call_re.search(stripped):
                continue

            has_context = any(marker in stripped for marker in context_markers)
            if not has_context and except_var:
                has_context = re.search(rf'\b{re.escape(except_var)}\b', stripped) is not None

            if not has_context:
                _report(
                    i,
                    f"logger.error() in except block without exception context (line {i})",
                    "Use `logger.exception()` or pass `exc_info=True` to include traceback.",
                    "Add exception context to log",
                )
                if len(findings) >= 10:
                    break

    elif lang in ("javascript", "typescript"):
        # ``search`` (not ``match``) so the usual ``} catch (err) {`` layout is
        # recognised; the lookbehind keeps ``promise.catch(handler)`` out.  An
        # optional TypeScript annotation (``catch (e: unknown)``) is allowed.
        catch_re = re.compile(r'(?<![.\w])catch\s*\(\s*(\w+)\s*(?::[^)]*)?\)')
        console_error_re = re.compile(r'console\.error\s*\(')

        in_catch = False
        catch_var = None
        catch_indent = 0

        for i, line in enumerate(lines, 1):
            stripped = line.strip()
            indent = len(line) - len(line.lstrip())

            m = catch_re.search(stripped)
            if m:
                in_catch = True
                catch_var = m.group(1)
                catch_indent = indent
                continue

            if not in_catch:
                continue

            if indent <= catch_indent and stripped and stripped != "}":
                in_catch = False
                catch_var = None
                continue

            call = console_error_re.search(stripped)
            if not call or not catch_var:
                continue

            # Look only at what follows ``console.error(``: the call name
            # itself contains "e", "err" and "error", so a whole-line
            # substring test can never fail for those common names.
            arguments = stripped[call.end():]
            if re.search(rf'\b{re.escape(catch_var)}\b', arguments):
                continue

            _report(
                i,
                f"console.error() in catch block without error object (line {i})",
                f"Pass the error object to console.error: `console.error('message', {catch_var})`.",
                "Add error context to log",
            )
            if len(findings) >= 10:
                break

    return findings
614
+
615
+
616
+ # ---------------------------------------------------------------------------
617
+ # Cluster 49: Secrets in Test Files
618
+ # ---------------------------------------------------------------------------
619
+
620
+
621
def assess_test_secrets(
    file_path: str,
    content: str,
) -> list[GateFinding]:
    """Cluster 49: Flag credential-shaped strings found in test files.

    Only files whose basename starts with ``test_`` or ``conftest``, or
    contains ``_test.``, are scanned.  Each non-comment line is checked
    against a fixed list of token patterns; the first pattern that matches
    produces one HIGH-severity finding for that line, unless the line also
    contains an obvious placeholder word (case-insensitive).

    Args:
        file_path: Path of the file being assessed; ``/`` and ``\\`` are both
            accepted as separators.
        content: Full text of the file.

    Returns:
        At most 10 findings; an empty list for blank content or files that
        are not recognised as tests.
    """
    import re

    if not content.strip():
        return []

    # Normalise Windows separators so the last path component is the basename.
    file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    is_test_file = file_name.startswith(("test_", "conftest")) or "_test." in file_name
    if not is_test_file:
        return []

    # Order matters: the first matching detector supplies the description.
    detectors = tuple(
        (re.compile(regex), label)
        for regex, label in (
            (r'(?:sk|pk)[-_](?:live|test)[-_][a-zA-Z0-9]{20,}', "Stripe-like API key"),
            (r'ghp_[a-zA-Z0-9]{36,}', "GitHub personal access token"),
            (r'gho_[a-zA-Z0-9]{36,}', "GitHub OAuth token"),
            (r'AKIA[A-Z0-9]{16}', "AWS access key ID"),
            (r'eyJ[a-zA-Z0-9_-]{20,}\.[a-zA-Z0-9_-]{20,}\.[a-zA-Z0-9_-]{20,}', "JWT token"),
            (r'xox[bpsar]-[a-zA-Z0-9-]{20,}', "Slack token"),
            (r'sk-[a-zA-Z0-9]{40,}', "OpenAI API key"),
            (r'AIza[a-zA-Z0-9_-]{35}', "Google API key"),
        )
    )
    placeholder_markers = ("placeholder", "example", "fake", "mock", "dummy", "xxx", "test_key")

    results: list[GateFinding] = []

    for line_no, raw_line in enumerate(content.splitlines(), 1):
        text = raw_line.strip()
        # Lines that are entirely a `#` or `//` comment are not inspected.
        if text.startswith(("#", "//")):
            continue

        label = next((name for matcher, name in detectors if matcher.search(text)), None)
        if label is None:
            continue

        # The placeholder test looks at the whole line, not just the token.
        lowered = text.lower()
        if any(marker in lowered for marker in placeholder_markers):
            continue

        detail = f"Possible {label} in test file (line {line_no})"
        results.append(build_finding(
            check_id="test_secret_scan",
            category=GateCategory.TRUTH_BOUNDARY,
            title=f"[test_secrets] {file_path}:{line_no}",
            severity=GateSeverity.HIGH,
            impact=GateImpact.REVISE,
            summary=detail,
            recommendation="Replace real secrets with obviously fake placeholders (e.g. 'fake-key-xxx').",
            evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
            repair_kind=RepairKind.REPLACE_WITH_FAIL_LOUD.value,
            executor_action=f"Remove secret from test file at {file_path}:{line_no}",
        ))
        # Cap the report so one noisy file cannot flood the gate output.
        if len(results) >= 10:
            break

    return results
674
+
675
+
676
+ # ---------------------------------------------------------------------------
677
+ # Cluster 50: Unpinned Dependencies
678
+ # ---------------------------------------------------------------------------
679
+
680
+
681
def assess_unpinned_dependencies(
    file_path: str,
    content: str,
) -> list[GateFinding]:
    """Cluster 50: Detect unpinned dependency versions.

    Two manifest kinds are recognised by basename:

    * ``requirements*.txt`` -- a line consisting only of a package name is
      reported at MEDIUM severity; a line that uses a range or exclusion
      operator (``<``, ``<=``, ``>``, ``>=``, ``~=``, ``!=``) and contains no
      ``==`` is reported at LOW severity.  Blank lines, ``#`` comments and
      lines starting with ``-`` (pip options) are ignored.
    * ``package.json`` -- entries of ``dependencies`` / ``devDependencies``
      whose version string starts with ``^`` or ``~`` or equals ``*`` are
      reported at LOW severity.

    Args:
        file_path: Path of the manifest; ``/`` and ``\\`` are both accepted
            as separators.
        content: Full text of the manifest.

    Returns:
        At most 10 findings.  An empty list is returned for blank content,
        for any other file name, and for a ``package.json`` that is not valid
        JSON or whose top-level value is not an object.
    """
    import json as json_mod
    import re

    max_findings = 10

    if not content.strip():
        return []

    # Normalise Windows separators so the last path component is the basename.
    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]

    findings: list[GateFinding] = []

    if basename.startswith("requirements") and basename.endswith(".txt"):
        for i, line in enumerate(content.splitlines(), 1):
            stripped = line.strip()
            # Skip blanks, comments and pip options such as -r / -e / --hash.
            if not stripped or stripped.startswith(("#", "-")):
                continue
            if re.match(r'^[a-zA-Z][a-zA-Z0-9._-]*\s*$', stripped):
                detail = f"Unpinned dependency: '{stripped}' -- add ==X.Y.Z"
                findings.append(build_finding(
                    check_id="unpinned_dep_scan",
                    category=GateCategory.CONTRACT,
                    title=f"[unpinned_dependencies] {file_path}:{i}",
                    severity=GateSeverity.MEDIUM,
                    impact=GateImpact.REVISE,
                    summary=detail,
                    recommendation=f"Pin the dependency with an exact version: `{stripped}==X.Y.Z`.",
                    evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
                    repair_kind=RepairKind.FIX_CONTRACT.value,
                    executor_action=f"Pin dependency '{stripped}' at {file_path}:{i}",
                ))
            # `[<>]` covers both the inclusive (>=, <=) and the exclusive
            # (>, <) ordered comparisons; the latter used to slip through.
            elif re.search(r'[<>]|~=|!=', stripped) and '==' not in stripped:
                detail = f"Loosely pinned: '{stripped}' -- prefer exact ==X.Y.Z"
                findings.append(build_finding(
                    check_id="unpinned_dep_scan",
                    category=GateCategory.CONTRACT,
                    title=f"[unpinned_dependencies] {file_path}:{i}",
                    severity=GateSeverity.LOW,
                    impact=GateImpact.WARN,
                    summary=detail,
                    recommendation="Use exact version pinning (`==X.Y.Z`) for reproducible builds.",
                    evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
                    repair_kind=RepairKind.FIX_CONTRACT.value,
                    executor_action=f"Pin dependency at {file_path}:{i}",
                ))
            if len(findings) >= max_findings:
                break
        return findings

    if basename == "package.json":
        try:
            pkg = json_mod.loads(content)
        except (json_mod.JSONDecodeError, ValueError):
            return []
        # Valid JSON is not necessarily an object (`[]`, `null`, `42`, ...);
        # without this guard `pkg.get` raises AttributeError.
        if not isinstance(pkg, dict):
            return []
        for section in ("dependencies", "devDependencies"):
            deps = pkg.get(section, {})
            if not isinstance(deps, dict):
                continue
            for name, version in deps.items():
                if not isinstance(version, str):
                    continue
                if version.startswith(("^", "~")) or version == "*":
                    detail = f"Loosely pinned '{name}': '{version}' in {section}"
                    findings.append(build_finding(
                        check_id="unpinned_dep_scan",
                        category=GateCategory.CONTRACT,
                        title=f"[unpinned_dependencies] {file_path}:{section}:{name}",
                        severity=GateSeverity.LOW,
                        impact=GateImpact.WARN,
                        summary=detail,
                        recommendation=f"Use exact version pinning for '{name}' in {section}.",
                        evidence=(EvidenceReference(kind="probe", path=file_path, detail=detail, ok=False),),
                        repair_kind=RepairKind.FIX_CONTRACT.value,
                        executor_action=f"Pin '{name}' in {section} at {file_path}",
                    ))
                    # Return rather than break so the cap holds across both
                    # sections, matching the requirements branch.
                    if len(findings) >= max_findings:
                        return findings
        return findings

    # Not a manifest this check understands.
    return []