codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,438 @@
1
+ """Reasoning engine — orchestrates context gathering and LLM interaction.
2
+
3
+ Provides the core logic behind ``codexa ask``, ``codexa review``, and
4
+ ``codexa refactor`` by combining:
5
+ 1. Semantic search results
6
+ 2. Parsed symbol / context data
7
+ 3. LLM conversations
8
+
9
+ Each public method returns structured data suitable for CLI display and
10
+ machine consumption.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ from dataclasses import dataclass, field
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from semantic_code_intelligence.analysis.ai_features import (
21
+ explain_symbol,
22
+ generate_ai_context,
23
+ summarize_repository,
24
+ )
25
+ from semantic_code_intelligence.context.engine import ContextBuilder
26
+ from semantic_code_intelligence.llm.provider import (
27
+ LLMMessage,
28
+ LLMProvider,
29
+ LLMResponse,
30
+ MessageRole,
31
+ )
32
+ from semantic_code_intelligence.services.search_service import search_codebase
33
+ from semantic_code_intelligence.utils.logging import get_logger
34
+
35
+ logger = get_logger("llm.reasoning")
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Data types
40
+ # ---------------------------------------------------------------------------
41
+
42
@dataclass
class AskResult:
    """Outcome of a single ``ask`` call."""

    # The question exactly as it was passed to ``ask``.
    question: str
    # Text of the model's reply.
    answer: str
    # Snippet dicts that were placed in the prompt.
    context_snippets: list[dict[str, Any]] = field(default_factory=list)
    # Provider response object; ``None`` when no LLM call was made.
    llm_response: LLMResponse | None = None
    # Free-form metadata describing how the answer was produced.
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the result.

        ``usage`` is taken from the attached LLM response, or is an empty
        dict when no (truthy) response is attached.
        """
        response = self.llm_response
        if response:
            usage = response.usage
        else:
            usage = {}
        return {
            "question": self.question,
            "answer": self.answer,
            "context_snippets": self.context_snippets,
            "usage": usage,
            "explainability": self.explainability,
        }
60
+
61
+
62
@dataclass
class ReviewResult:
    """Outcome of a single ``review`` call."""

    # Path of the reviewed file, as given by the caller.
    file_path: str
    # Issue dicts reported for the file.
    issues: list[dict[str, Any]] = field(default_factory=list)
    # Overall review summary text.
    summary: str = ""
    # Provider response object; ``None`` when no LLM call was made.
    llm_response: LLMResponse | None = None
    # Free-form metadata describing how the review was produced.
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the result.

        ``usage`` is taken from the attached LLM response, or is an empty
        dict when no (truthy) response is attached.
        """
        response = self.llm_response
        if response:
            usage = response.usage
        else:
            usage = {}
        return {
            "file_path": self.file_path,
            "issues": self.issues,
            "summary": self.summary,
            "usage": usage,
            "explainability": self.explainability,
        }
80
+
81
+
82
@dataclass
class RefactorResult:
    """Outcome of a single ``refactor`` call."""

    # Path of the file that was submitted for refactoring.
    file_path: str
    # Source text before refactoring.
    original_code: str = ""
    # Source text proposed by the model ("" when none was extracted).
    refactored_code: str = ""
    # Description of what was changed and why.
    explanation: str = ""
    # Provider response object; ``None`` when no LLM call was made.
    llm_response: LLMResponse | None = None
    # Free-form metadata describing how the result was produced.
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the result.

        ``usage`` is taken from the attached LLM response, or is an empty
        dict when no (truthy) response is attached.
        """
        response = self.llm_response
        if response:
            usage = response.usage
        else:
            usage = {}
        return {
            "file_path": self.file_path,
            "original_code": self.original_code,
            "refactored_code": self.refactored_code,
            "explanation": self.explanation,
            "usage": usage,
            "explainability": self.explainability,
        }
102
+
103
+
104
@dataclass
class SuggestResult:
    """Outcome of a single ``suggest`` call."""

    # Symbol, file, or topic the suggestions were requested for.
    target: str
    # Suggestion dicts produced for the target.
    suggestions: list[dict[str, Any]] = field(default_factory=list)
    # Provider response object; ``None`` when no LLM call was made.
    llm_response: LLMResponse | None = None
    # Free-form metadata describing how the suggestions were produced.
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the result.

        ``usage`` is taken from the attached LLM response, or is an empty
        dict when no (truthy) response is attached.
        """
        response = self.llm_response
        if response:
            usage = response.usage
        else:
            usage = {}
        return {
            "target": self.target,
            "suggestions": self.suggestions,
            "usage": usage,
            "explainability": self.explainability,
        }
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Reasoning Engine
124
+ # ---------------------------------------------------------------------------
125
+
126
class ReasoningEngine:
    """High-level reasoning engine that combines semantic context with LLM.

    This is the central orchestrator for all AI workflows in CodexA: it
    gathers search snippets, file contents and symbol summaries, builds the
    prompts, calls the provider and converts the replies into result objects.
    """

    # Default character budget for search snippets placed in a prompt.
    DEFAULT_MAX_CONTEXT_CHARS = 6000
    # Maximum number of source characters embedded in a review/refactor prompt.
    DEFAULT_MAX_FILE_CHARS = 8000

    def __init__(
        self,
        provider: LLMProvider,
        project_root: Path,
        *,
        builder: ContextBuilder | None = None,
        max_context_chars: int | None = None,
    ) -> None:
        """Create an engine bound to one provider and one project root.

        Args:
            provider: LLM backend whose ``chat`` method is called.
            project_root: Root directory of the project; stored resolved.
            builder: Context builder to use; a new ``ContextBuilder`` is
                created when omitted.
            max_context_chars: Snippet character budget; a falsy value
                selects ``DEFAULT_MAX_CONTEXT_CHARS``.
        """
        self._provider = provider
        self._root = project_root.resolve()
        self._builder = builder or ContextBuilder()
        self._indexed = False
        self._max_ctx = max_context_chars or self.DEFAULT_MAX_CONTEXT_CHARS

    def _ensure_indexed(self) -> None:
        """Lazy-index the project for symbol/context lookups.

        Runs at most once per engine. A file that fails to index is logged
        and skipped so it does not prevent the remaining files from being
        indexed.
        """
        if self._indexed:
            return
        # Function-local imports: these are only needed on first use.
        from semantic_code_intelligence.config.settings import load_config
        from semantic_code_intelligence.indexing.scanner import scan_repository

        config = load_config(self._root)
        for sf in scan_repository(self._root, config.index):
            full_path = str(self._root / sf.relative_path)
            try:
                self._builder.index_file(full_path)
            except Exception:
                # Best effort: keep going, but record why the file was skipped.
                logger.debug("Failed to index %s", full_path, exc_info=True)
        self._indexed = True

    # --- gather context helpers ---

    def _search_context(self, query: str, top_k: int = 5) -> list[dict[str, Any]]:
        """Run semantic search and return snippet dicts.

        Returns an empty list when the search raises, so the workflows can
        continue without search context.
        """
        try:
            results = search_codebase(query, self._root, top_k=top_k, threshold=0.2)
            return [r.to_dict() for r in results]
        except Exception:
            logger.debug(
                "Semantic search unavailable, proceeding without search context.",
                exc_info=True,
            )
            return []

    def _file_context(self, file_path: str) -> str:
        """Read a file's content, returning the text.

        Returns ``""`` when the file cannot be read. Undecodable bytes are
        replaced rather than raising.
        """
        try:
            return Path(file_path).read_text(encoding="utf-8", errors="replace")
        except (OSError, ValueError):
            # OSError already covers PermissionError / FileNotFoundError;
            # ValueError covers paths containing an embedded NUL byte.
            return ""

    def _symbol_context(self, symbol_name: str) -> str:
        """Build a text summary of a symbol for use in prompts.

        At most the first three matching symbols are explained; returns
        ``""`` when the builder finds no match.
        """
        self._ensure_indexed()
        matches = self._builder.find_symbol(symbol_name)
        if not matches:
            return ""
        return "\n\n".join(
            explain_symbol(sym, self._builder).render() for sym in matches[:3]
        )

    @staticmethod
    def _snippet_text(snippet: dict[str, Any]) -> str:
        """Return a snippet's text from ``"content"`` or, failing that, ``"chunk"``.

        A missing or ``None`` value yields ``""`` so callers can always call
        ``len`` / ``lower`` on the result.
        """
        text = snippet.get("content", snippet.get("chunk", ""))
        return "" if text is None else str(text)

    @staticmethod
    def _parse_json(text: Any) -> Any:
        """Parse an LLM reply as JSON, tolerating a surrounding Markdown fence.

        The raw text is tried first. If that fails and the reply starts with
        a code fence, the opening fence line and a trailing fence are removed
        and the remainder is tried.

        Returns:
            The decoded JSON value, or ``None`` when nothing could be decoded
            (a literal JSON ``null`` is therefore indistinguishable from a
            failure; no caller treats ``null`` as a usable payload).
        """
        if not isinstance(text, str):
            return None
        stripped = text.strip()
        candidates = [stripped]
        if stripped.startswith("```"):
            # Drop the opening fence line (with or without a language tag).
            inner = stripped.partition("\n")[2].rstrip()
            if inner.endswith("```"):
                inner = inner[:-3]
            candidates.append(inner)
        for candidate in candidates:
            try:
                return json.loads(candidate)
            except (json.JSONDecodeError, TypeError):
                continue
        return None

    # --- context pruning & scoring (Phase 12) ---

    @staticmethod
    def _score_snippet(snippet: dict[str, Any], query_lower: str) -> float:
        """Compute a priority score for a context snippet.

        Combines the semantic search score with a keyword-overlap bonus of
        up to 0.1: the fraction of query words longer than two characters
        that occur in the snippet text.

        Args:
            snippet: Snippet dict; a missing or ``None`` score counts as 0.
            query_lower: The query, already lower-cased by the caller.

        Returns:
            The score rounded to four decimal places.
        """
        base = float(snippet.get("score") or 0.0)
        content = ReasoningEngine._snippet_text(snippet).lower()
        # Keyword overlap bonus: fraction of query words found in snippet
        words = [w for w in query_lower.split() if len(w) > 2]
        if words:
            found = sum(1 for w in words if w in content)
            base += 0.1 * (found / len(words))
        return round(base, 4)

    def _prune_context(
        self,
        snippets: list[dict[str, Any]],
        query: str,
        max_chars: int | None = None,
    ) -> list[dict[str, Any]]:
        """Score, rank, and prune snippets to stay within token budget.

        Snippets are ordered by priority score (descending) and taken until
        adding the next one would exceed the budget. The best snippet is
        always kept, even when it alone exceeds the budget.

        Args:
            snippets: Candidate snippet dicts; they are not modified.
            query: Query used for the keyword-overlap bonus.
            max_chars: Character budget; a falsy value selects the engine's
                configured budget.

        Returns:
            Copies of the kept snippets, each with an added
            ``"priority_score"`` entry.
        """
        limit = max_chars or self._max_ctx
        query_lower = query.lower()
        scored = [(self._score_snippet(s, query_lower), s) for s in snippets]
        scored.sort(key=lambda t: t[0], reverse=True)

        kept: list[dict[str, Any]] = []
        total = 0
        for score, snip in scored:
            size = len(self._snippet_text(snip))
            if total + size > limit and kept:
                break
            # Copy so the caller's dicts are not mutated as a side effect.
            kept.append({**snip, "priority_score": score})
            total += size
        return kept

    # --- public AI workflows ---

    def ask(self, question: str, *, top_k: int = 5) -> AskResult:
        """Answer a natural-language question about the codebase.

        Gathers semantic search results and repo context, then asks the LLM.

        Args:
            question: The question to answer.
            top_k: Number of search results to request before pruning.

        Returns:
            An ``AskResult`` with the answer, the pruned snippets and
            pruning statistics in ``explainability``.
        """
        raw_snippets = self._search_context(question, top_k=top_k)
        snippets = self._prune_context(raw_snippets, question)
        self._ensure_indexed()
        repo_summary = summarize_repository(self._builder).render()

        # Build prompt
        context_text = "".join(
            f"\n--- {snip.get('file_path', '?')} "
            f"(score: {float(snip.get('score') or 0.0):.2f}) ---\n"
            f"{self._snippet_text(snip)}\n"
            for snip in snippets
        )

        system = (
            "You are CodexA, an AI coding assistant. Answer questions about the "
            "user's codebase using the provided context. Be concise, accurate, "
            "and cite file paths when relevant."
        )
        user_msg = (
            f"Repository summary:\n{repo_summary}\n\n"
            f"Relevant code snippets:{context_text}\n\n"
            f"Question: {question}"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        return AskResult(
            question=question,
            answer=resp.content,
            context_snippets=snippets,
            llm_response=resp,
            explainability={
                "snippets_before_pruning": len(raw_snippets),
                "snippets_after_pruning": len(snippets),
                # Count the same text that was put in the prompt, including
                # snippets that carry it under "chunk".
                "context_chars": sum(len(self._snippet_text(s)) for s in snippets),
                "method": "semantic_search+pruning",
            },
        )

    def review(self, file_path: str) -> ReviewResult:
        """Review a file for potential issues, bugs, and improvements.

        Only the first ``DEFAULT_MAX_FILE_CHARS`` characters of the file and
        its first 30 symbols are sent to the LLM.

        Returns:
            A ``ReviewResult``. When the reply is not a JSON object, the raw
            reply text is used as the summary and ``issues`` is empty. When
            the file is missing or empty, no LLM call is made.
        """
        content = self._file_context(file_path)
        if not content:
            return ReviewResult(file_path=file_path, summary="File not found or empty.")

        self._ensure_indexed()
        symbols = self._builder.get_symbols(file_path)
        if not symbols:
            # _ensure_indexed registers files by absolute path under the
            # project root, so retry with the resolved path when the
            # caller-supplied spelling (e.g. a relative path) finds nothing.
            resolved = str(Path(file_path).resolve())
            if resolved != file_path:
                symbols = self._builder.get_symbols(resolved)
        symbol_info = "\n".join(
            f" - {s.kind} '{s.name}' (L{s.start_line}-{s.end_line})"
            for s in symbols[:30]
        )

        system = (
            "You are CodexA, a code reviewer. Analyze the provided file and "
            "return a JSON object with keys: 'issues' (list of objects with "
            "'severity', 'line', 'message', 'suggestion') and 'summary' (string). "
            "Severities: 'error', 'warning', 'info'. Be precise and actionable."
        )
        user_msg = (
            f"File: {file_path}\n\n"
            f"Symbols:\n{symbol_info}\n\n"
            f"Source code:\n```\n{content[:self.DEFAULT_MAX_FILE_CHARS]}\n```"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        # Parse JSON from response — fallback to raw text
        issues: list[dict[str, Any]] = []
        summary = resp.content
        parsed = self._parse_json(resp.content)
        if isinstance(parsed, dict):
            raw_issues = parsed.get("issues", [])
            if isinstance(raw_issues, list):
                # Keep only well-formed entries; consumers expect dicts.
                issues = [item for item in raw_issues if isinstance(item, dict)]
            raw_summary = parsed.get("summary")
            if isinstance(raw_summary, str):
                summary = raw_summary

        return ReviewResult(
            file_path=file_path,
            issues=issues,
            summary=summary,
            llm_response=resp,
        )

    def refactor(
        self,
        file_path: str,
        instruction: str = "Improve code quality, readability, and performance.",
    ) -> RefactorResult:
        """Suggest refactored code for a file based on an instruction.

        Only the first ``DEFAULT_MAX_FILE_CHARS`` characters of the file are
        sent to the LLM; ``original_code`` in the result holds the full text.

        Returns:
            A ``RefactorResult``. When the reply is not a JSON object, the
            raw reply text is used as the explanation and
            ``refactored_code`` is empty. When the file is missing or empty,
            no LLM call is made.
        """
        content = self._file_context(file_path)
        if not content:
            return RefactorResult(file_path=file_path, explanation="File not found or empty.")

        system = (
            "You are CodexA, a code refactoring assistant. Given the source code "
            "and an instruction, return a JSON object with 'refactored_code' "
            "(the improved code as a string) and 'explanation' (what you changed "
            "and why). Do NOT include markdown fences inside the JSON values."
        )
        user_msg = (
            f"File: {file_path}\n"
            f"Instruction: {instruction}\n\n"
            f"Source code:\n```\n{content[:self.DEFAULT_MAX_FILE_CHARS]}\n```"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        refactored = ""
        explanation = resp.content
        parsed = self._parse_json(resp.content)
        if isinstance(parsed, dict):
            raw_code = parsed.get("refactored_code", "")
            if isinstance(raw_code, str):
                refactored = raw_code
            raw_explanation = parsed.get("explanation")
            if isinstance(raw_explanation, str):
                explanation = raw_explanation

        return RefactorResult(
            file_path=file_path,
            original_code=content,
            refactored_code=refactored,
            explanation=explanation,
            llm_response=resp,
        )

    def suggest(self, target: str, *, top_k: int = 5) -> SuggestResult:
        """Generate intelligent suggestions for a symbol, file, or topic.

        Combines semantic search snippets and symbol summaries with LLM
        reasoning to produce actionable suggestions with "why" reasoning.

        Args:
            target: Symbol name, file, or topic to get suggestions for.
            top_k: Number of search results to include as related code.

        Returns:
            A ``SuggestResult``. When no structured suggestions can be
            extracted from the reply, a single "Raw response" suggestion
            carrying the reply text is returned instead.
        """
        self._ensure_indexed()
        snippets = self._search_context(target, top_k=top_k)
        sym_context = self._symbol_context(target)

        system = (
            "You are CodexA, an intelligent code suggestion engine. Given context "
            "about a codebase element, provide suggestions for improvements, fixes, "
            "or optimizations. Return a JSON object with 'suggestions' — a list of "
            "objects each having 'title', 'description', 'reason', and 'priority' "
            "(high/medium/low)."
        )
        context_text = "".join(
            f"\n--- {snip.get('file_path', '?')} ---\n"
            f"{self._snippet_text(snip)}\n"
            for snip in snippets
        )

        user_msg = (
            f"Target: {target}\n\n"
            f"Symbol info:\n{sym_context}\n\n"
            f"Related code:{context_text}"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        parsed = self._parse_json(resp.content)
        if isinstance(parsed, dict):
            raw_items = parsed.get("suggestions", [])
        elif isinstance(parsed, list):
            raw_items = parsed
        else:
            raw_items = None

        suggestions: list[dict[str, Any]] = []
        usable = False
        if isinstance(raw_items, list):
            suggestions = [item for item in raw_items if isinstance(item, dict)]
            # An explicitly empty list is a valid "no suggestions" answer.
            usable = bool(suggestions) or not raw_items
        if not usable:
            # Keep the reply visible instead of silently returning nothing.
            suggestions = [{"title": "Raw response", "description": resp.content, "reason": "", "priority": "medium"}]

        return SuggestResult(
            target=target,
            suggestions=suggestions,
            llm_response=resp,
        )
@@ -0,0 +1,110 @@
1
+ """Safety validator — checks LLM outputs before they are applied.
2
+
3
+ Provides basic guardrails to ensure LLM-generated code does not contain
4
+ obviously dangerous patterns. This is not a comprehensive security tool;
5
+ it is a first line of defence within the coding assistant workflow.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+ from semantic_code_intelligence.utils.logging import get_logger
15
+
16
+ logger = get_logger("llm.safety")
17
+
18
# Patterns that should never appear in AI-generated code destined for execution.
#
# Each entry is a ``(regex, description)`` pair. ``SafetyValidator.validate``
# applies every regex to one line of code at a time with ``re.IGNORECASE``, so
# a pattern cannot match across line breaks and letter case is ignored.
_DANGEROUS_PATTERNS: list[tuple[str, str]] = [
    # Command execution
    (r"\bos\.system\s*\(", "os.system() call — use subprocess with shell=False instead"),
    (r"subprocess\..*shell\s*=\s*True", "subprocess with shell=True — potential command injection"),
    (r"\brm\s+-rf\s+/", "Destructive rm -rf / command"),
    # Dynamic code execution
    (r"\beval\s*\(", "eval() call — avoid dynamic code execution"),
    (r"\bexec\s*\(", "exec() call — avoid dynamic code execution"),
    (r"\b__import__\s*\(", "Dynamic __import__() — use explicit imports"),
    # SQL injection risk
    (r"DROP\s+TABLE|DROP\s+DATABASE", "SQL DROP statement — potential data loss"),
    (r"TRUNCATE\s+TABLE", "SQL TRUNCATE statement — potential data loss"),
    # Path traversal
    (r"\.\./\.\./", "Path traversal pattern — potential directory escape"),
    # Hardcoded secrets (Phase 12)
    # Matches an assignment of a quoted literal of at least 8 characters to a
    # name ending in password / secret / api_key / token.
    (r"""(?:password|secret|api_key|token)\s*=\s*["'][^"']{8,}["']""",
     "Hardcoded secret — use environment variables or a secrets manager"),
    # XSS risk (Phase 12)
    (r"innerHTML\s*=", "innerHTML assignment — potential XSS vulnerability"),
    (r"document\.write\s*\(", "document.write() — potential XSS vulnerability"),
    # Insecure crypto (Phase 12)
    (r"\bMD5\s*\(|\bmd5\s*\(", "MD5 hash — use SHA-256 or stronger for security"),
    (r"\bSHA1\s*\(|\bsha1\s*\(", "SHA-1 hash — use SHA-256 or stronger for security"),
    # Insecure network (Phase 12)
    # The negative lookahead exempts URLs whose host starts with localhost or
    # 127.0.0.1.
    (r"http://(?!localhost|127\.0\.0\.1)", "Insecure HTTP URL — use HTTPS instead"),
    (r"verify\s*=\s*False", "SSL verification disabled — potential MITM vulnerability"),
]
46
+
47
+
48
@dataclass
class SafetyIssue:
    """One dangerous-pattern match found in LLM output."""

    # Regex source that matched.
    pattern: str
    # Human-readable explanation of the risk.
    description: str
    # 1-based line number of the match; 0 when not set.
    line_number: int = 0
    # Severity label; defaults to "warning".
    severity: str = "warning"

    def to_dict(self) -> dict[str, Any]:
        """Return the issue as a plain dict with one entry per field."""
        payload: dict[str, Any] = {}
        payload["pattern"] = self.pattern
        payload["description"] = self.description
        payload["line_number"] = self.line_number
        payload["severity"] = self.severity
        return payload
64
+
65
+
66
@dataclass
class SafetyReport:
    """Outcome of one safety validation pass."""

    # True when the pass found nothing.
    safe: bool = True
    # Issues found, in the order they were recorded.
    issues: list[SafetyIssue] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dict.

        Includes the ``safe`` flag, the number of issues, and each issue
        converted through its own ``to_dict``.
        """
        serialized = []
        for issue in self.issues:
            serialized.append(issue.to_dict())
        return {
            "safe": self.safe,
            "issue_count": len(self.issues),
            "issues": serialized,
        }
79
+
80
+
81
class SafetyValidator:
    """Validates LLM-generated code for known dangerous patterns."""

    def __init__(self, extra_patterns: list[tuple[str, str]] | None = None) -> None:
        """Set up the pattern list.

        Args:
            extra_patterns: Additional ``(regex, description)`` pairs checked
                after the built-in ones.
        """
        # Copy so that extending never mutates the module-level table.
        self._patterns = list(_DANGEROUS_PATTERNS)
        if extra_patterns:
            self._patterns.extend(extra_patterns)

    def validate(self, code: str) -> SafetyReport:
        """Scan ``code`` for dangerous patterns.

        Every pattern is matched case-insensitively against each line on its
        own, so a construct split across several lines is not detected.
        Issues are reported in line order, then pattern order.

        Returns:
            A SafetyReport. If any issues are found, ``safe`` is False.

        Raises:
            re.error: If one of the configured patterns is not a valid regex.
        """
        # Compile once per call instead of once per (line, pattern) pair.
        # Built from the current ``_patterns`` so later changes to that list
        # are still honoured.
        compiled = [
            (re.compile(pattern, re.IGNORECASE), pattern, description)
            for pattern, description in self._patterns
        ]

        issues: list[SafetyIssue] = []
        for line_no, line in enumerate(code.splitlines(), start=1):
            for regex, pattern, description in compiled:
                if regex.search(line):
                    issues.append(
                        SafetyIssue(
                            pattern=pattern,
                            description=description,
                            line_number=line_no,
                        )
                    )

        return SafetyReport(safe=not issues, issues=issues)

    def is_safe(self, code: str) -> bool:
        """Quick boolean check — True if no safety issues found."""
        return self.validate(code).safe