codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,272 @@
1
+ """Cross-repo refactoring suggestions.
2
+
3
+ Compares symbols across workspace repositories to find:
4
+ - Duplicate or near-duplicate logic that could be shared
5
+ - Inconsistent API patterns across repos
6
+ - Refactoring opportunities informed by cross-repo dependency analysis
7
+
8
+ Uses the Workspace multi-repo search and per-repo ContextBuilder to
9
+ gather symbols, then asks the LLM for actionable suggestions.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from semantic_code_intelligence.context.engine import ContextBuilder
19
+ from semantic_code_intelligence.llm.provider import LLMMessage, LLMProvider, MessageRole
20
+ from semantic_code_intelligence.utils.logging import get_logger
21
+
22
+ logger = get_logger("llm.cross_refactor")
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Data types
27
+ # ---------------------------------------------------------------------------
28
+
29
@dataclass
class CrossRepoMatch:
    """One pair of look-alike symbols that live in two different repositories.

    The ``*_a`` fields describe the first symbol of the pair and the ``*_b``
    fields the second. ``similarity_note`` is free-form text explaining why
    the two symbols were paired.
    """

    repo_a: str
    symbol_a: str
    file_a: str
    repo_b: str
    symbol_b: str
    file_b: str
    similarity_note: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the match as a plain dict keyed by field name."""
        field_names = (
            "repo_a",
            "symbol_a",
            "file_a",
            "repo_b",
            "symbol_b",
            "file_b",
            "similarity_note",
        )
        return {field_name: getattr(self, field_name) for field_name in field_names}
51
+
52
+
53
@dataclass
class CrossRefactorResult:
    """Outcome of one cross-repository refactoring analysis run."""

    repos_analyzed: list[str] = field(default_factory=list)
    total_symbols: int = 0
    matches: list[CrossRepoMatch] = field(default_factory=list)
    suggestions: list[dict[str, Any]] = field(default_factory=list)
    llm_used: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict; each match is serialised via its own ``to_dict``."""
        serialised_matches: list[dict[str, Any]] = []
        for match in self.matches:
            serialised_matches.append(match.to_dict())

        payload: dict[str, Any] = {}
        payload["repos_analyzed"] = self.repos_analyzed
        payload["total_symbols"] = self.total_symbols
        payload["matches"] = serialised_matches
        payload["suggestions"] = self.suggestions
        payload["llm_used"] = self.llm_used
        return payload
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Cross-repo symbol collection
75
+ # ---------------------------------------------------------------------------
76
+
77
def _collect_repo_symbols(
    repo_name: str,
    repo_path: Path,
) -> list[dict[str, Any]]:
    """Index every scanned file of one repo and flatten its symbols to dicts.

    Args:
        repo_name: Label stored under the ``"repo"`` key of every returned dict.
        repo_path: Root directory of the repository to scan.

    Returns:
        One dict per function, method or class symbol with the keys
        ``repo``, ``name``, ``kind``, ``file``, ``lines`` and ``body``
        (the body is cut to its first 600 characters).
    """
    builder = ContextBuilder()
    from semantic_code_intelligence.config.settings import load_config
    from semantic_code_intelligence.indexing.scanner import scan_repository

    settings = load_config(repo_path)
    for scanned in scan_repository(repo_path, settings.index):
        try:
            builder.index_file(str(repo_path / scanned.relative_path))
        except Exception:
            # Best effort: a file that cannot be indexed is skipped, not fatal.
            logger.debug("Skip unindexable file: %s", scanned.relative_path)

    wanted_kinds = ("function", "method", "class")
    return [
        {
            "repo": repo_name,
            "name": symbol.name,
            "kind": symbol.kind,
            "file": symbol.file_path,
            "lines": symbol.end_line - symbol.start_line + 1,
            "body": symbol.body[:600] if symbol.body else "",
        }
        for symbol in builder.get_all_symbols()
        if symbol.kind in wanted_kinds
    ]
106
+
107
+
108
+ # ---------------------------------------------------------------------------
109
+ # Duplicate detection across repos
110
+ # ---------------------------------------------------------------------------
111
+
112
def _find_cross_duplicates(
    repo_symbols: dict[str, list[dict[str, Any]]],
    threshold: float = 0.70,
    min_lines: int = 4,
) -> list[CrossRepoMatch]:
    """Find near-duplicate symbols across different repos via trigram Jaccard.

    Args:
        repo_symbols: Mapping of repo name to that repo's symbol dicts. Each
            dict must provide the ``"name"``, ``"file"``, ``"lines"`` and
            ``"body"`` keys.
        threshold: Minimum Jaccard similarity for two symbols to be reported.
        min_lines: Symbols spanning fewer lines than this are ignored.

    Returns:
        Matches ordered from most to least similar. Pairs with the same
        score keep the order in which they were discovered.
    """
    from semantic_code_intelligence.ci.quality import _normalize_body, _trigram_set, _jaccard

    # Build (repo, sym_dict, trigrams) for each candidate.
    candidates: list[tuple[str, dict[str, Any], set[str]]] = []
    for repo_name, syms in repo_symbols.items():
        for sym in syms:
            if sym["lines"] < min_lines or not sym["body"].strip():
                continue
            tris = _trigram_set(_normalize_body(sym["body"]))
            if tris:
                candidates.append((repo_name, sym, tris))

    # Keep the numeric score beside each match so ranking does not depend on
    # how the human-readable note happens to be formatted.
    scored: list[tuple[float, CrossRepoMatch]] = []
    seen: set[tuple[str, str]] = set()

    for i, (repo_a, sym_a, tris_a) in enumerate(candidates):
        for j in range(i + 1, len(candidates)):
            repo_b, sym_b, tris_b = candidates[j]
            if repo_a == repo_b:
                continue  # Only cross-repo pairs are of interest.
            sim = _jaccard(tris_a, tris_b)
            if sim < threshold:
                continue
            # One report per unordered pair of "repo:symbol" labels.
            first, second = sorted(
                [f"{repo_a}:{sym_a['name']}", f"{repo_b}:{sym_b['name']}"]
            )
            pair_key = (first, second)
            if pair_key in seen:
                continue
            seen.add(pair_key)
            scored.append((
                sim,
                CrossRepoMatch(
                    repo_a=repo_a,
                    symbol_a=sym_a["name"],
                    file_a=sym_a["file"],
                    repo_b=repo_b,
                    symbol_b=sym_b["name"],
                    file_b=sym_b["file"],
                    similarity_note=f"Jaccard similarity: {sim:.2f}",
                ),
            ))

    # list.sort is stable, also with reverse=True, so ties keep discovery order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [match for _, match in scored]
158
+
159
+
160
+ # ---------------------------------------------------------------------------
161
+ # LLM-powered suggestion generation
162
+ # ---------------------------------------------------------------------------
163
+
164
def _strip_json_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line, which may carry a language tag such as "json".
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines.pop()
    return "\n".join(lines).strip()


def _generate_suggestions(
    matches: list[CrossRepoMatch],
    repo_symbols: dict[str, list[dict[str, Any]]],
    provider: LLMProvider,
) -> list[dict[str, Any]]:
    """Ask the LLM for refactoring suggestions based on cross-repo matches.

    Args:
        matches: Cross-repo matches; only the first ten are sent to the LLM.
        repo_symbols: Mapping of repo name to its symbols, used for a
            per-repo symbol count in the prompt.
        provider: LLM provider whose ``chat`` method is called once.

    Returns:
        An empty list when there are no matches. Otherwise the suggestion
        dicts parsed from the reply, which may be a bare JSON list or an
        object with a ``"suggestions"`` list, optionally wrapped in a
        Markdown code fence. If the reply cannot be interpreted that way, a
        single fallback suggestion carrying the raw reply is returned.
    """
    if not matches:
        return []

    import json

    # Build a concise summary for the LLM.
    match_text = "".join(
        f"- {m.repo_a}/{m.symbol_a} ({m.file_a}) ↔ "
        f"{m.repo_b}/{m.symbol_b} ({m.file_b}) — {m.similarity_note}\n"
        for m in matches[:10]
    )
    repo_summary = "".join(
        f"- {repo}: {len(syms)} symbols\n" for repo, syms in repo_symbols.items()
    )

    system = (
        "You are CodexA, a cross-repository refactoring advisor. Given a list of "
        "similar symbols found across different repositories, suggest refactoring "
        "opportunities. Return a JSON list of objects with keys: 'title', "
        "'description', 'affected_repos', 'priority' (high/medium/low)."
    )
    user_msg = (
        f"Repositories:\n{repo_summary}\n"
        f"Similar symbols across repos:\n{match_text}\n"
        "Suggest refactoring strategies (e.g., extract shared lib, unify APIs)."
    )

    messages = [
        LLMMessage(role=MessageRole.SYSTEM, content=system),
        LLMMessage(role=MessageRole.USER, content=user_msg),
    ]
    resp = provider.chat(messages)
    raw = resp.content

    try:
        parsed = json.loads(_strip_json_fence(raw) if isinstance(raw, str) else raw)
    except (json.JSONDecodeError, TypeError):
        parsed = None

    if isinstance(parsed, dict):
        parsed = parsed.get("suggestions")
    if isinstance(parsed, list):
        # Keep only well-formed entries so the declared return type holds.
        suggestions: list[dict[str, Any]] = [
            item for item in parsed if isinstance(item, dict)
        ]
        # An empty list is a valid reply. A non-empty list with no usable
        # entries falls through to the raw fallback instead.
        if suggestions or not parsed:
            return suggestions

    return [{"title": "See raw analysis", "description": raw, "priority": "medium"}]
217
+
218
+
219
+ # ---------------------------------------------------------------------------
220
+ # Public API
221
+ # ---------------------------------------------------------------------------
222
+
223
def analyze_cross_repo(
    workspace_root: Path,
    *,
    provider: LLMProvider | None = None,
    threshold: float = 0.70,
    repos: list[str] | None = None,
) -> CrossRefactorResult:
    """Analyse a workspace for cross-repo refactoring opportunities.

    Args:
        workspace_root: Workspace root containing ``.codexa/workspace.json``.
        provider: Optional LLM provider for generating suggestions.
        threshold: Jaccard similarity threshold for duplicate detection.
        repos: Restrict to these repo names. None (or an empty list) means
            all registered repos.

    Returns:
        CrossRefactorResult with matches and optional LLM suggestions. An
        empty result is returned when loading the workspace raises
        ``FileNotFoundError``. ``repos_analyzed`` lists only the repos that
        were found in the workspace and indexed.
    """
    from semantic_code_intelligence.workspace import Workspace

    try:
        ws = Workspace.load(workspace_root)
    except FileNotFoundError:
        return CrossRefactorResult()

    targets = repos or [r.name for r in ws.repos]
    repo_symbols: dict[str, list[dict[str, Any]]] = {}

    for rname in targets:
        if rname in repo_symbols:
            continue  # Name listed twice: do not index the same repo again.
        entry = ws.get_repo(rname)
        if entry is None:
            logger.warning("Skipping unknown workspace repo: %s", rname)
            continue
        repo_symbols[rname] = _collect_repo_symbols(rname, Path(entry.path))

    total_symbols = sum(len(s) for s in repo_symbols.values())
    matches = _find_cross_duplicates(repo_symbols, threshold=threshold)

    suggestions: list[dict[str, Any]] = []
    llm_used = False
    if provider and matches:
        suggestions = _generate_suggestions(matches, repo_symbols, provider)
        llm_used = True

    return CrossRefactorResult(
        # Report what was actually indexed, not what was merely requested.
        repos_analyzed=list(repo_symbols),
        total_symbols=total_symbols,
        matches=matches,
        suggestions=suggestions,
        llm_used=llm_used,
    )
@@ -0,0 +1,274 @@
1
+ """Autonomous investigation chains — multi-step code exploration.
2
+
3
+ An **InvestigationChain** drives an iterative loop:
4
+
5
+ 1. Formulate a search query from the user's question.
6
+ 2. Gather context (semantic search, symbol lookup, dependency analysis).
7
+ 3. Ask the LLM to evaluate what was found and decide the next action.
8
+ 4. Repeat until the LLM signals "conclude" or a step limit is reached.
9
+
10
+ Each step is recorded as a ``ReasoningStep`` in ``SessionMemory`` so
11
+ the chain is fully transparent and reproducible.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import uuid
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from semantic_code_intelligence.context.engine import ContextBuilder
23
+ from semantic_code_intelligence.context.memory import ReasoningStep, SessionMemory
24
+ from semantic_code_intelligence.llm.provider import LLMMessage, LLMProvider, MessageRole
25
+ from semantic_code_intelligence.services.search_service import search_codebase
26
+ from semantic_code_intelligence.utils.logging import get_logger
27
+
28
+ logger = get_logger("llm.investigation")
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Data types
33
+ # ---------------------------------------------------------------------------
34
+
35
@dataclass
class InvestigationResult:
    """What an investigation chain produced: the answer plus its step trail."""

    question: str
    conclusion: str
    steps: list[dict[str, Any]] = field(default_factory=list)
    chain_id: str = ""
    total_steps: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict keyed by field name."""
        field_names = ("question", "conclusion", "steps", "chain_id", "total_steps")
        return {field_name: getattr(self, field_name) for field_name in field_names}
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Action helpers
57
+ # ---------------------------------------------------------------------------
58
+
59
def _action_search(query: str, project_root: Path, top_k: int = 5) -> str:
    """Run semantic search and return a text summary.

    Args:
        query: Search query text.
        project_root: Project root passed to ``search_codebase``.
        top_k: Maximum number of results to request.

    Returns:
        One section per hit (file path, score and up to 500 characters of
        content) joined by ``---`` separator lines; ``"No results found."``
        when the search returns nothing; ``"Semantic search unavailable."``
        when anything in the search or formatting raises.
    """
    try:
        results = search_codebase(query, project_root, top_k=top_k, threshold=0.2)
        if not results:
            return "No results found."
        parts: list[str] = []
        for r in results:
            d = r.to_dict()
            parts.append(
                f"[{d.get('file_path', '?')}] (score {d.get('score', 0):.2f})\n"
                f"{d.get('content', d.get('chunk', ''))[:500]}"
            )
        return "\n---\n".join(parts)
    except Exception:
        # Deliberate best effort: the planner gets a readable fallback, but the
        # cause is logged so a broken index or a bug is not hidden.
        logger.debug("Semantic search failed for query %r", query, exc_info=True)
        return "Semantic search unavailable."
75
+
76
+
77
def _action_analyze(symbol_name: str, builder: ContextBuilder) -> str:
    """Look up a symbol and return its context.

    Args:
        symbol_name: Name passed to ``builder.find_symbol``.
        builder: Context builder that has already indexed the project.

    Returns:
        A description of up to three matching symbols (kind, name, file,
        line range and the first 400 characters of the body) joined by
        ``---`` separator lines, or a "not found" message.
    """
    matches = builder.find_symbol(symbol_name)
    if not matches:
        return f"Symbol '{symbol_name}' not found."
    parts: list[str] = []
    for s in matches[:3]:
        # A symbol may have no body; treat that as empty text rather than
        # failing on the slice.
        body = s.body or ""
        parts.append(
            f"{s.kind} {s.name} in {s.file_path} (L{s.start_line}-{s.end_line})\n"
            f"{body[:400]}"
        )
    return "\n---\n".join(parts)
89
+
90
+
91
def _action_deps(file_path: str, builder: ContextBuilder, project_root: Path) -> str:
    """Summarise the imports of one file.

    Args:
        file_path: Path of the file; a relative path is resolved against
            ``project_root``.
        builder: Accepted for signature parity with the other actions; this
            function does not use it.
        project_root: Base directory for relative paths.

    Returns:
        A comma-separated dependency list with line numbers, a "no
        dependencies" message, or a "file not found" message.
    """
    from semantic_code_intelligence.context.engine import DependencyMap

    dep_map = DependencyMap()
    given = Path(file_path)
    target = given if given.is_absolute() else project_root / file_path
    if not target.exists():
        return f"File not found: {file_path}"

    rendered = [f"{dep.import_text} (L{dep.line})" for dep in dep_map.add_file(str(target))]
    if rendered:
        return "Dependencies: " + ", ".join(rendered)
    return "No dependencies found."
102
+
103
+
104
+ # ---------------------------------------------------------------------------
105
+ # Investigation chain
106
+ # ---------------------------------------------------------------------------
107
+
108
# System prompt for the planner LLM: it defines the JSON reply contract
# ("thought", "action", "action_input") and the four allowed actions.
_PLANNER_SYSTEM = (
    "You are CodexA, an autonomous code investigation agent. Your task is to\n"
    "answer the user's question by systematically exploring the codebase.\n"
    "\n"
    "On each turn you will receive context gathered from the previous action.\n"
    "Respond with a JSON object with exactly these keys:\n"
    '- "thought": one sentence about what you learned and what to do next\n'
    '- "action": one of "search", "analyze", "deps", "conclude"\n'
    '- "action_input": the argument for the action (search query, symbol name, file path, or final answer)\n'
    "\n"
    'When you have enough information, use action "conclude" and put your\n'
    'final answer in "action_input".\n'
)
121
+
122
+
123
class InvestigationChain:
    """Drives an iterative search-analyze-conclude loop via LLM.

    On each step the planner LLM is asked for a JSON plan, the requested
    action is executed, and its output is fed back to the planner. The loop
    ends when the planner answers with the ``conclude`` action or when
    ``max_steps`` planner turns have been used.
    """

    def __init__(
        self,
        provider: LLMProvider,
        project_root: Path,
        *,
        builder: ContextBuilder | None = None,
        memory: SessionMemory | None = None,
        max_steps: int = 6,
    ) -> None:
        """Create a chain.

        Args:
            provider: LLM provider used for planning and for the conclusion.
            project_root: Project directory; stored resolved.
            builder: Context builder to reuse; a new one is created if None.
            memory: Session memory to record steps in; a new one is created
                if None.
            max_steps: Maximum number of planner turns before a conclusion
                is forced.
        """
        self._provider = provider
        self._root = project_root.resolve()
        # Compare with None explicitly so a caller-supplied object is never
        # swapped for a fresh one just because it evaluates as falsy.
        self._builder = builder if builder is not None else ContextBuilder()
        self._memory = memory if memory is not None else SessionMemory()
        self._max_steps = max_steps
        self._indexed = False

    def _ensure_indexed(self) -> None:
        """Index the project into the builder once; later calls are no-ops."""
        if self._indexed:
            return
        from semantic_code_intelligence.config.settings import load_config
        from semantic_code_intelligence.indexing.scanner import scan_repository

        config = load_config(self._root)
        for sf in scan_repository(self._root, config.index):
            try:
                self._builder.index_file(str(self._root / sf.relative_path))
            except Exception:
                # Best effort: one unreadable file must not abort indexing.
                logger.debug("Skip unindexable file: %s", sf.relative_path)
        self._indexed = True

    def _run_action(self, action: str, action_input: str) -> str:
        """Execute an action and return its text output.

        An action other than ``search``, ``analyze`` or ``deps`` returns
        ``action_input`` unchanged.
        """
        if action == "search":
            return _action_search(action_input, self._root)
        elif action == "analyze":
            self._ensure_indexed()
            return _action_analyze(action_input, self._builder)
        elif action == "deps":
            self._ensure_indexed()
            return _action_deps(action_input, self._builder, self._root)
        return action_input

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return *text* without a surrounding Markdown code fence, if it has one."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return text
        # Drop the opening fence line, which may carry a language tag.
        lines = stripped.splitlines()[1:]
        if lines and lines[-1].strip() == "```":
            lines.pop()
        return "\n".join(lines)

    def _parse_plan(self, text: str) -> dict[str, str]:
        """Parse the LLM planner response into {thought, action, action_input}.

        The text is tried as JSON as-is and then, if it is wrapped in a
        Markdown code fence, with the fence removed. The action name is
        stripped and lower-cased. Anything that is not a JSON object with an
        ``"action"`` key is treated as the final conclusion.
        """
        candidates: list[Any] = [text]
        if isinstance(text, str):
            unfenced = self._strip_code_fence(text)
            if unfenced != text:
                candidates.append(unfenced)

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except (json.JSONDecodeError, TypeError):
                continue
            if isinstance(parsed, dict) and "action" in parsed:
                return {
                    "thought": str(parsed.get("thought", "")),
                    "action": str(parsed.get("action", "conclude")).strip().lower(),
                    "action_input": str(parsed.get("action_input", text)),
                }
        # Fallback — treat entire response as conclusion
        return {"thought": "", "action": "conclude", "action_input": text}

    def _conclude_streaming(self, messages: list[LLMMessage]) -> str:
        """Stream the final conclusion tokens to stdout, return accumulated text."""
        import sys

        from semantic_code_intelligence.llm.streaming import stream_chat

        tokens: list[str] = []
        for event in stream_chat(self._provider, messages):
            if event.kind == "token":
                tokens.append(event.content)
                sys.stdout.write(event.content)
                sys.stdout.flush()
        sys.stdout.write("\n")
        return "".join(tokens)

    def investigate(self, question: str, *, stream_conclusion: bool = False) -> InvestigationResult:
        """Run a full investigation loop and return the result.

        Args:
            question: The question to investigate.
            stream_conclusion: If True and the step limit is reached, the
                forced conclusion is produced via ``stream_chat`` and
                printed to stdout token by token. A conclusion the planner
                reaches on its own is not streamed.

        Returns:
            InvestigationResult holding the conclusion and the recorded
            steps. A forced conclusion is written to session memory but is
            not added to ``steps``.
        """
        chain_id = uuid.uuid4().hex[:10]
        self._memory.start_chain(chain_id)

        messages: list[LLMMessage] = [
            LLMMessage(role=MessageRole.SYSTEM, content=_PLANNER_SYSTEM),
            LLMMessage(role=MessageRole.USER, content=f"Question: {question}"),
        ]

        steps: list[dict[str, Any]] = []
        conclusion = ""

        for step_num in range(1, self._max_steps + 1):
            # Ask the planner
            resp = self._provider.chat(messages)
            plan = self._parse_plan(resp.content)

            action = plan["action"]
            action_input = plan["action_input"]
            thought = plan["thought"]

            if action == "conclude":
                conclusion = action_input
                self._memory.add_step(chain_id, "conclude", question, conclusion)
                steps.append({
                    "step": step_num,
                    "action": "conclude",
                    "thought": thought,
                    "output": conclusion,
                })
                break

            # Execute the action
            output = self._run_action(action, action_input)
            self._memory.add_step(chain_id, action, action_input, output)
            steps.append({
                "step": step_num,
                "action": action,
                "action_input": action_input,
                "thought": thought,
                "output": output[:500],
            })

            # Feed result back to planner
            messages.append(LLMMessage(role=MessageRole.ASSISTANT, content=resp.content))
            messages.append(LLMMessage(
                role=MessageRole.USER,
                content=f"Action result:\n{output[:2000]}",
            ))
        else:
            # Exhausted steps — ask for final conclusion
            messages.append(LLMMessage(
                role=MessageRole.USER,
                content="You have reached the step limit. Please provide your best conclusion now.",
            ))
            if stream_conclusion:
                conclusion = self._conclude_streaming(messages)
            else:
                resp = self._provider.chat(messages)
                conclusion = resp.content
            self._memory.add_step(chain_id, "conclude", "forced", conclusion)

        return InvestigationResult(
            question=question,
            conclusion=conclusion,
            steps=steps,
            chain_id=chain_id,
            total_steps=len(steps),
        )
@@ -0,0 +1,77 @@
1
+ """Mock LLM provider — deterministic responses for testing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from semantic_code_intelligence.llm.provider import (
8
+ LLMMessage,
9
+ LLMProvider,
10
+ LLMResponse,
11
+ )
12
+
13
+
14
class MockProvider(LLMProvider):
    """A mock LLM provider that returns configurable responses.

    Useful for unit tests and offline development without a live LLM.
    Responses come from a FIFO queue filled via ``enqueue_response``; when
    the queue is empty the default response is returned. Every call is
    recorded and can be inspected through ``call_history``.
    """

    def __init__(
        self,
        default_response: str = "This is a mock LLM response.",
        model: str = "mock-model",
    ) -> None:
        """Create a mock provider.

        Args:
            default_response: Text returned when no response is queued.
            model: Model name reported in every ``LLMResponse``.
        """
        self._default_response = default_response
        self._model = model
        self._call_history: list[dict[str, Any]] = []
        self._response_queue: list[str] = []

    @property
    def name(self) -> str:
        """Provider identifier; always ``"mock"``."""
        return "mock"

    @property
    def call_history(self) -> list[dict[str, Any]]:
        """Return a log of all calls made to this mock (a shallow copy)."""
        return list(self._call_history)

    def enqueue_response(self, response: str) -> None:
        """Enqueue a custom response. FIFO — next call pops from front."""
        self._response_queue.append(response)

    def _next_response(self) -> str:
        """Pop the oldest queued response, or fall back to the default."""
        if self._response_queue:
            return self._response_queue.pop(0)
        return self._default_response

    @staticmethod
    def _mock_usage(prompt_chars: int, completion_chars: int) -> dict[str, int]:
        """Estimate token usage from character counts at four characters per token."""
        return {
            "prompt_tokens": prompt_chars // 4,
            "completion_tokens": completion_chars // 4,
            "total_tokens": (prompt_chars + completion_chars) // 4,
        }

    def complete(self, prompt: str, **kwargs: Any) -> LLMResponse:
        """Record the call and return the next response for a plain prompt."""
        content = self._next_response()
        self._call_history.append({
            "method": "complete",
            "prompt": prompt,
            "kwargs": kwargs,
            "response": content,
        })
        return LLMResponse(
            content=content,
            model=self._model,
            provider=self.name,
            usage=self._mock_usage(len(prompt), len(content)),
        )

    def chat(self, messages: list[LLMMessage], **kwargs: Any) -> LLMResponse:
        """Record the call and return the next response for a chat exchange."""
        content = self._next_response()
        self._call_history.append({
            "method": "chat",
            "messages": [m.to_dict() for m in messages],
            "kwargs": kwargs,
            "response": content,
        })
        # Prompt tokens cover the incoming messages only. The completion is
        # counted separately, as in complete().
        prompt_chars = sum(len(m.content) for m in messages)
        return LLMResponse(
            content=content,
            model=self._model,
            provider=self.name,
            usage=self._mock_usage(prompt_chars, len(content)),
        )