codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,557 @@
1
+ """Code quality analyzers — dead code, duplicate logic, complexity, security.
2
+
3
+ All analyzers operate on parsed ``Symbol`` lists and raw file content,
4
+ returning structured reports that are both human-readable (via Rich) and
5
+ machine-parsable (``to_dict()`` → JSON).
6
+
7
+ Uses `radon <https://radon.readthedocs.io/>`_ for AST-based cyclomatic
8
+ complexity analysis on Python files, with a regex fallback for other languages.
9
+ Optionally integrates `bandit <https://bandit.readthedocs.io/>`_ for Python
10
+ security linting.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import math
16
+ import re
17
+ from dataclasses import dataclass, field
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ from semantic_code_intelligence.parsing.parser import Symbol, parse_file
22
+ from semantic_code_intelligence.context.engine import CallGraph, ContextBuilder
23
+ from semantic_code_intelligence.llm.safety import SafetyValidator, SafetyReport
24
+ from semantic_code_intelligence.utils.logging import get_logger
25
+
26
+ logger = get_logger("ci.quality")
27
+
28
+ # ── Radon (optional — used for Python AST-based complexity) ──────────
29
+
30
+ try:
31
+ from radon.complexity import cc_visit
32
+ from radon.metrics import mi_visit
33
+
34
+ _HAS_RADON = True
35
+ except ImportError: # pragma: no cover
36
+ _HAS_RADON = False
37
+
38
+ # ── Cyclomatic complexity ────────────────────────────────────────────
39
+
40
+ # Decision keywords/patterns — used as fallback for non-Python files.
41
+ _DECISION_PATTERNS: list[re.Pattern[str]] = [
42
+ re.compile(r"\bif\b"),
43
+ re.compile(r"\belif\b"),
44
+ re.compile(r"\belse\s+if\b"),
45
+ re.compile(r"\bfor\b"),
46
+ re.compile(r"\bwhile\b"),
47
+ re.compile(r"\bcatch\b"),
48
+ re.compile(r"\bexcept\b"),
49
+ re.compile(r"\bcase\b"),
50
+ re.compile(r"\b\?\?"), # null coalescing
51
+ re.compile(r"\?\s*\."), # optional chaining counts mildly
52
+ re.compile(r"\band\b"),
53
+ re.compile(r"\bor\b"),
54
+ re.compile(r"&&"),
55
+ re.compile(r"\|\|"),
56
+ ]
57
+
58
+
59
@dataclass
class ComplexityResult:
    """Cyclomatic complexity score recorded for one symbol."""

    symbol_name: str
    file_path: str
    start_line: int
    end_line: int
    complexity: int
    rating: str  # "low", "moderate", "high", "very_high"

    def to_dict(self) -> dict[str, Any]:
        """Return the measurement as a plain dictionary, one key per field."""
        keys = (
            "symbol_name",
            "file_path",
            "start_line",
            "end_line",
            "complexity",
            "rating",
        )
        return {key: getattr(self, key) for key in keys}
79
+
80
+
81
+ def _rate_complexity(score: int) -> str:
82
+ if score <= 5:
83
+ return "low"
84
+ if score <= 10:
85
+ return "moderate"
86
+ if score <= 20:
87
+ return "high"
88
+ return "very_high"
89
+
90
+
91
def _compute_complexity_regex(symbol: Symbol) -> ComplexityResult:
    """Estimate complexity by pattern counting (fallback for non-Python files).

    The score starts at 1 for the base path and grows by one for every
    decision pattern found on a line; a pattern is counted at most once per
    line.  Blank lines and lines starting with ``#`` or ``//`` are ignored.
    """
    source = symbol.body or ""
    code_lines = [
        text
        for text in (raw.strip() for raw in source.splitlines())
        if text and not text.startswith("#") and not text.startswith("//")
    ]
    total = 1 + sum(
        1
        for text in code_lines
        for matcher in _DECISION_PATTERNS
        if matcher.search(text)
    )

    return ComplexityResult(
        symbol_name=symbol.name,
        file_path=symbol.file_path,
        start_line=symbol.start_line,
        end_line=symbol.end_line,
        complexity=total,
        rating=_rate_complexity(total),
    )
111
+
112
+
113
def _compute_complexity_radon(symbol: Symbol) -> ComplexityResult:
    """AST-based complexity via radon for Python symbols.

    The symbol body is handed to radon's ``cc_visit``.  A body that does not
    start with a definition (or with a decorator introducing one) is wrapped
    in a temporary function so that it parses as a complete unit.  If the
    snippet still cannot be parsed, the regex estimate is returned instead.
    """
    body = symbol.body or ""

    # Symbols store the body content, which may or may not include the
    # ``def`` line.  A leading ``@`` can only be a decorator, i.e. the body is
    # already a complete definition; wrapping it would nest the real function
    # inside ``_wrapper`` and radon would then report the wrapper's score.
    code = body
    if not body.lstrip().startswith(("def ", "class ", "async def ", "@")):
        # Indent all body lines and wrap in a temporary function
        indented = "\n".join("    " + ln for ln in body.splitlines())
        code = f"def _wrapper():\n{indented}\n"

    try:
        results = cc_visit(code)
    except SyntaxError:
        # If radon can't parse the snippet, fall back to regex
        return _compute_complexity_regex(symbol)

    # Use the highest score among the blocks radon reports (usually a single
    # function); an empty result counts as the base complexity of 1.
    score = max((r.complexity for r in results), default=1)

    return ComplexityResult(
        symbol_name=symbol.name,
        file_path=symbol.file_path,
        start_line=symbol.start_line,
        end_line=symbol.end_line,
        complexity=score,
        rating=_rate_complexity(score),
    )
142
+
143
+
144
def compute_complexity(symbol: Symbol) -> ComplexityResult:
    """Compute cyclomatic complexity for a single symbol.

    Symbols from ``.py`` files are analysed with radon when it is installed;
    everything else goes through the regex-based counter.
    """
    use_radon = _HAS_RADON and symbol.file_path.endswith(".py")
    if not use_radon:
        return _compute_complexity_regex(symbol)
    return _compute_complexity_radon(symbol)
153
+
154
+
155
def analyze_complexity(
    symbols: list[Symbol],
    *,
    threshold: int = 10,
) -> list[ComplexityResult]:
    """Return results for functions/methods scoring at least *threshold*.

    Symbols of any other kind are ignored.  The returned list is ordered from
    the most complex symbol to the least complex one.
    """
    measured = (
        compute_complexity(sym)
        for sym in symbols
        if sym.kind in ("function", "method")
    )
    flagged = [outcome for outcome in measured if outcome.complexity >= threshold]
    return sorted(flagged, key=lambda outcome: outcome.complexity, reverse=True)
170
+
171
+
172
+ # ── Dead code detection ──────────────────────────────────────────────
173
+
174
@dataclass
class DeadCodeResult:
    """Record of one symbol that appears to have no references."""

    symbol_name: str
    kind: str
    file_path: str
    start_line: int

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a plain dictionary, one key per field."""
        keys = ("symbol_name", "kind", "file_path", "start_line")
        return {key: getattr(self, key) for key in keys}
190
+
191
+
192
+ # Names that are conventionally always reachable.
193
+ _ENTRY_NAMES: set[str] = {
194
+ "main", "__init__", "__new__", "__del__", "__str__", "__repr__",
195
+ "__enter__", "__exit__", "__call__", "__getattr__", "__setattr__",
196
+ "__getitem__", "__setitem__", "__len__", "__iter__", "__next__",
197
+ "__eq__", "__hash__", "__lt__", "__le__", "__gt__", "__ge__",
198
+ "__add__", "__sub__", "__mul__", "__truediv__",
199
+ "setUp", "tearDown", "setUpClass", "tearDownClass",
200
+ "setup_method", "teardown_method",
201
+ "create_plugin", # CodexA plugin factory
202
+ }
203
+
204
+ # Prefix patterns that indicate entry points / framework hooks.
205
+ _ENTRY_PREFIXES: tuple[str, ...] = (
206
+ "test_", "Test", # pytest
207
+ )
208
+
209
+
210
def detect_dead_code(
    symbols: list[Symbol],
    call_graph: CallGraph | None = None,
) -> list[DeadCodeResult]:
    """Detect functions, methods and classes that are never referenced.

    A symbol is kept out of the result when any of the following holds:

    * its name is in ``_ENTRY_NAMES`` or starts with one of ``_ENTRY_PREFIXES``;
    * its name both starts and ends with an underscore;
    * it is the callee of an edge in *call_graph*;
    * its name occurs as a whole word in the symbol bodies more often than in
      its own body, i.e. some other body mentions it.

    Args:
        symbols: Symbols to inspect; only kinds ``function``, ``method`` and
            ``class`` are candidates.
        call_graph: Optional call graph whose edge callees count as references.

    Returns:
        One ``DeadCodeResult`` per unreferenced symbol, in input order.
    """
    # Function-scope import so the block does not rely on a new file-level import.
    from collections import Counter

    if not symbols:
        return []

    # Build reference set from call graph edges.  A qualified callee such as
    # ``path:name`` is reduced to the part after the last colon.
    referenced: set[str] = set()
    if call_graph:
        for edge in call_graph.edges:
            referenced.add(edge.callee.rsplit(":", 1)[-1])

    # Tokenise every body once.  For a name made only of word characters,
    # ``\bname\b`` matches exactly the ``\w+`` tokens equal to that name, so a
    # single counter replaces one full-text regex scan per symbol.
    bodies = [s.body for s in symbols if s.body]
    word_counts = Counter(
        word for body in bodies for word in re.findall(r"\w+", body)
    )

    def _mentions(name: str, text: str) -> int:
        """Count whole-word occurrences of *name* in *text*."""
        return len(re.findall(rf"\b{re.escape(name)}\b", text))

    results: list[DeadCodeResult] = []
    for sym in symbols:
        if sym.kind not in ("function", "method", "class"):
            continue
        if sym.name in _ENTRY_NAMES:
            continue
        if sym.name.startswith(_ENTRY_PREFIXES):
            continue
        # Names wrapped in underscores (dunder-style hooks and similar) are
        # assumed to be invoked implicitly.
        if sym.name.startswith("_") and sym.name.endswith("_"):
            continue

        # Check call graph
        if sym.name in referenced:
            continue

        # Heuristic: the name is mentioned in some body other than its own.
        if re.fullmatch(r"\w+", sym.name):
            total = word_counts[sym.name]
        else:
            # Names with non-word characters cannot be looked up by token.
            total = sum(_mentions(sym.name, body) for body in bodies)
        own = _mentions(sym.name, sym.body) if sym.body else 0
        if total > own:
            continue

        results.append(DeadCodeResult(
            symbol_name=sym.name,
            kind=sym.kind,
            file_path=sym.file_path,
            start_line=sym.start_line,
        ))

    return results
265
+
266
+
267
+ # ── Duplicate logic detection ────────────────────────────────────────
268
+
269
@dataclass
class DuplicateResult:
    """Two symbols whose bodies were found to be similar."""

    symbol_a: str
    file_a: str
    line_a: int
    symbol_b: str
    file_b: str
    line_b: int
    similarity: float  # 0.0 – 1.0

    def to_dict(self) -> dict[str, Any]:
        """Return the pair as a dictionary; similarity is rounded to 3 decimals."""
        plain_keys = ("symbol_a", "file_a", "line_a", "symbol_b", "file_b", "line_b")
        payload: dict[str, Any] = {key: getattr(self, key) for key in plain_keys}
        payload["similarity"] = round(self.similarity, 3)
        return payload
291
+
292
+
293
+ def _normalize_body(body: str) -> str:
294
+ """Strip whitespace and comments for comparison."""
295
+ lines: list[str] = []
296
+ for line in body.splitlines():
297
+ stripped = line.strip()
298
+ if not stripped or stripped.startswith("#") or stripped.startswith("//"):
299
+ continue
300
+ lines.append(stripped)
301
+ return "\n".join(lines)
302
+
303
+
304
+ def _trigram_set(text: str) -> set[str]:
305
+ """Return the set of 3-character shingles in *text*."""
306
+ if len(text) < 3:
307
+ return {text} if text else set()
308
+ return {text[i : i + 3] for i in range(len(text) - 2)}
309
+
310
+
311
+ def _jaccard(a: set[str], b: set[str]) -> float:
312
+ if not a and not b:
313
+ return 1.0
314
+ if not a or not b:
315
+ return 0.0
316
+ return len(a & b) / len(a | b)
317
+
318
+
319
def detect_duplicates(
    symbols: list[Symbol],
    *,
    threshold: float = 0.75,
    min_lines: int = 4,
) -> list[DuplicateResult]:
    """Detect similar function/method bodies using trigram Jaccard similarity.

    Args:
        symbols: Symbols to compare; only kinds ``function`` and ``method``
            take part.
        threshold: Minimum similarity (inclusive) for a pair to be reported.
        min_lines: Minimum number of lines the normalised body must have.

    Returns:
        One ``DuplicateResult`` per similar pair, most similar first.
    """
    # Pre-filter: keep each eligible symbol with the trigram set of its
    # normalised body (the normalised text itself is not needed afterwards).
    candidates: list[tuple[Symbol, set[str]]] = []
    for sym in symbols:
        if sym.kind not in ("function", "method"):
            continue
        norm = _normalize_body(sym.body or "")
        if norm.count("\n") + 1 < min_lines:
            continue
        candidates.append((sym, _trigram_set(norm)))

    results: list[DuplicateResult] = []
    # A symbol is identified by file, name AND start line.  Without the line,
    # two same-named symbols in one file (e.g. ``__init__`` of two classes)
    # share an identity and pairs involving the second one are dropped.
    seen: set[tuple[tuple[str, str, int], tuple[str, str, int]]] = set()

    for i, (sym_a, tri_a) in enumerate(candidates):
        id_a = (sym_a.file_path, sym_a.name, sym_a.start_line)
        for j in range(i + 1, len(candidates)):
            sym_b, tri_b = candidates[j]
            id_b = (sym_b.file_path, sym_b.name, sym_b.start_line)
            if id_a == id_b:
                # The same symbol listed twice is not a duplicate of itself.
                continue
            key = (id_a, id_b)
            if key in seen:
                continue
            sim = _jaccard(tri_a, tri_b)
            if sim >= threshold:
                seen.add(key)
                results.append(DuplicateResult(
                    symbol_a=sym_a.name,
                    file_a=sym_a.file_path,
                    line_a=sym_a.start_line,
                    symbol_b=sym_b.name,
                    file_b=sym_b.file_path,
                    line_b=sym_b.start_line,
                    similarity=sim,
                ))

    results.sort(key=lambda r: r.similarity, reverse=True)
    return results
364
+
365
+
366
+ # ── Bandit security linting (optional) ───────────────────────────────
367
+
368
+ try:
369
+ from bandit.core import manager as _bandit_manager
370
+ from bandit.core import config as _bandit_config
371
+
372
+ _HAS_BANDIT = True
373
+ except ImportError: # pragma: no cover
374
+ _HAS_BANDIT = False
375
+
376
+
377
@dataclass
class BanditIssue:
    """One security finding reported by Bandit."""

    test_id: str
    severity: str  # LOW / MEDIUM / HIGH
    confidence: str  # LOW / MEDIUM / HIGH
    text: str
    file_path: str
    line: int

    def to_dict(self) -> dict[str, Any]:
        """Return the finding as a plain dictionary, one key per field."""
        keys = ("test_id", "severity", "confidence", "text", "file_path", "line")
        return {key: getattr(self, key) for key in keys}
397
+
398
+
399
def run_bandit_scan(file_paths: list[str]) -> list[BanditIssue]:
    """Scan the ``.py`` entries of *file_paths* with Bandit.

    The result is empty when Bandit is not installed, when no path ends in
    ``.py``, or when the scan raises; a failure is logged at debug level.
    """
    if not _HAS_BANDIT:
        return []

    targets = [path for path in file_paths if path.endswith(".py")]
    if not targets:
        return []

    findings: list[BanditIssue] = []
    try:
        settings = _bandit_config.BanditConfig()
        manager = _bandit_manager.BanditManager(settings, "file")
        manager.discover_files(targets)
        manager.run_tests()
        for issue in manager.get_issue_list():
            findings.append(
                BanditIssue(
                    test_id=issue.test_id,
                    severity=str(issue.severity).upper(),
                    confidence=str(issue.confidence).upper(),
                    text=issue.text,
                    file_path=issue.fname,
                    line=issue.lineno,
                )
            )
    except Exception as exc:
        # Best effort: a broken scan yields no findings rather than an error.
        logger.debug("Bandit scan failed: %s", exc)
        return []
    return findings
431
+
432
+
433
+ # ── Quality report (aggregate) ───────────────────────────────────────
434
+
435
@dataclass
class QualityReport:
    """Combined output of all quality analyzers for a project or file set."""

    files_analyzed: int = 0
    symbol_count: int = 0
    complexity_issues: list[ComplexityResult] = field(default_factory=list)
    dead_code: list[DeadCodeResult] = field(default_factory=list)
    duplicates: list[DuplicateResult] = field(default_factory=list)
    safety: SafetyReport | None = None
    bandit_issues: list[BanditIssue] = field(default_factory=list)
    maintainability_index: float | None = None

    @property
    def issue_count(self) -> int:
        """Total number of findings across all analyzers.

        Safety issues are included only when a safety report is attached.
        """
        safety_hits = len(self.safety.issues) if self.safety else 0
        return (
            len(self.complexity_issues)
            + len(self.dead_code)
            + len(self.duplicates)
            + len(self.bandit_issues)
            + safety_hits
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the report as nested plain dictionaries and lists.

        The maintainability index is rounded to two decimals when present.
        """
        index = self.maintainability_index
        rounded_index = None if index is None else round(index, 2)
        safety_payload = self.safety.to_dict() if self.safety else None
        return {
            "files_analyzed": self.files_analyzed,
            "symbol_count": self.symbol_count,
            "issue_count": self.issue_count,
            "complexity_issues": [item.to_dict() for item in self.complexity_issues],
            "dead_code": [item.to_dict() for item in self.dead_code],
            "duplicates": [item.to_dict() for item in self.duplicates],
            "safety": safety_payload,
            "bandit_issues": [item.to_dict() for item in self.bandit_issues],
            "maintainability_index": rounded_index,
        }
468
+
469
+
470
def analyze_project(
    project_root: Path,
    *,
    file_paths: list[str] | None = None,
    complexity_threshold: int = 10,
    duplicate_threshold: float = 0.75,
    run_safety: bool = True,
) -> QualityReport:
    """Run all quality analyzers on a project or a subset of files.

    Args:
        project_root: Repository root directory.
        file_paths: Optional specific file list. If *None* (or empty), the
            whole project is walked for files with a supported extension.
        complexity_threshold: Minimum complexity score to report.
        duplicate_threshold: Minimum Jaccard similarity to report as duplicate.
        run_safety: Whether to run the safety validator.

    Returns:
        Aggregated ``QualityReport``. ``files_analyzed`` counts every selected
        file, including files that were skipped because indexing or reading
        them failed.
    """
    builder = ContextBuilder()
    all_symbols: list[Symbol] = []
    # File texts in processing order; joined once for the safety validator
    # instead of growing one string with ``+=`` inside the loop.
    code_chunks: list[str] = []
    # Text of every file read successfully, reused for the maintainability
    # index so Python files are not read from disk a second time.
    contents: dict[str, str] = {}

    if file_paths:
        files = [str(Path(f).resolve()) for f in file_paths]
    else:
        # Walk project for supported files
        from semantic_code_intelligence.parsing.parser import EXTENSION_TO_LANGUAGE

        # Hidden entries (including ``.codexa``) are covered by the
        # leading-dot test below.
        skipped_dirs = ("__pycache__", "node_modules")
        files = []
        for f in project_root.rglob("*"):
            if f.is_file() and f.suffix in EXTENSION_TO_LANGUAGE:
                parts = f.relative_to(project_root).parts
                if any(p.startswith(".") or p in skipped_dirs for p in parts):
                    continue
                files.append(str(f))

    for fpath in files:
        try:
            all_symbols.extend(builder.index_file(fpath))
            content = Path(fpath).read_text(encoding="utf-8", errors="replace")
        except Exception as exc:
            # Best effort: one unreadable or unparsable file must not abort the run.
            logger.debug("Skipping %s: %s", fpath, exc)
            continue
        code_chunks.append(content)
        contents[fpath] = content

    # Call graph for dead-code analysis
    call_graph = CallGraph()
    call_graph.build(all_symbols)

    report = QualityReport(
        files_analyzed=len(files),
        symbol_count=len(all_symbols),
    )

    report.complexity_issues = analyze_complexity(
        all_symbols, threshold=complexity_threshold
    )
    report.dead_code = detect_dead_code(all_symbols, call_graph=call_graph)
    report.duplicates = detect_duplicates(
        all_symbols, threshold=duplicate_threshold
    )

    # Bandit security scan (Python files only)
    report.bandit_issues = run_bandit_scan(files)

    # Maintainability index (average across Python files)
    if _HAS_RADON:
        mi_scores: list[float] = []
        for fpath in files:
            if not fpath.endswith(".py"):
                continue
            try:
                code = contents.get(fpath)
                if code is None:
                    # Not cached (indexing failed earlier); try the file itself.
                    code = Path(fpath).read_text(encoding="utf-8", errors="replace")
                score = mi_visit(code, True)
            except Exception as exc:
                logger.debug(
                    "Maintainability index unavailable for %s: %s", fpath, exc
                )
                continue
            if isinstance(score, (int, float)):
                mi_scores.append(float(score))
        if mi_scores:
            report.maintainability_index = sum(mi_scores) / len(mi_scores)

    if run_safety:
        validator = SafetyValidator()
        # Each file's text is followed by a newline, as a separator.
        report.safety = validator.validate(
            "".join(f"{chunk}\n" for chunk in code_chunks)
        )

    return report