codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""Cross-repo refactoring suggestions.
|
|
2
|
+
|
|
3
|
+
Compares symbols across workspace repositories to find:
|
|
4
|
+
- Duplicate or near-duplicate logic that could be shared
|
|
5
|
+
- Inconsistent API patterns across repos
|
|
6
|
+
- Refactoring opportunities informed by cross-repo dependency analysis
|
|
7
|
+
|
|
8
|
+
Uses the Workspace multi-repo search and per-repo ContextBuilder to
|
|
9
|
+
gather symbols, then asks the LLM for actionable suggestions.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from semantic_code_intelligence.context.engine import ContextBuilder
|
|
19
|
+
from semantic_code_intelligence.llm.provider import LLMMessage, LLMProvider, MessageRole
|
|
20
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
21
|
+
|
|
22
|
+
logger = get_logger("llm.cross_refactor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Data types
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
@dataclass
class CrossRepoMatch:
    """Pairing of two look-alike symbols that live in different repositories."""

    # First side of the pair: repository name, symbol name, and file.
    repo_a: str
    symbol_a: str
    file_a: str
    # Second side of the pair.
    repo_b: str
    symbol_b: str
    file_b: str
    # Free-form text describing how the two symbols were matched.
    similarity_note: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the match as a plain dict keyed by field name."""
        field_names = (
            "repo_a",
            "symbol_a",
            "file_a",
            "repo_b",
            "symbol_b",
            "file_b",
            "similarity_note",
        )
        return {name: getattr(self, name) for name in field_names}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class CrossRefactorResult:
    """Outcome of one cross-repository refactoring analysis run."""

    # Names of the repositories covered by this result.
    repos_analyzed: list[str] = field(default_factory=list)
    # Number of symbols collected across all repositories.
    total_symbols: int = 0
    # Similar-symbol pairs found across repositories.
    matches: list[CrossRepoMatch] = field(default_factory=list)
    # Suggestion dicts produced by the LLM (empty when no LLM was used).
    suggestions: list[dict[str, Any]] = field(default_factory=list)
    # True when an LLM provider generated ``suggestions``.
    llm_used: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form of the result; each match is converted via its ``to_dict``."""
        serialized_matches = [pair.to_dict() for pair in self.matches]
        payload: dict[str, Any] = {
            "repos_analyzed": self.repos_analyzed,
            "total_symbols": self.total_symbols,
            "matches": serialized_matches,
            "suggestions": self.suggestions,
            "llm_used": self.llm_used,
        }
        return payload
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Cross-repo symbol collection
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
def _collect_repo_symbols(
    repo_name: str,
    repo_path: Path,
) -> list[dict[str, Any]]:
    """Index one repository and flatten its functions, methods, and classes into dicts.

    Args:
        repo_name: Label stored under the ``"repo"`` key of every returned dict.
        repo_path: Root directory of the repository to scan.

    Returns:
        One dict per function/method/class symbol with keys ``repo``, ``name``,
        ``kind``, ``file``, ``lines`` (inclusive line span) and ``body``
        (first 600 characters, or ``""`` when the symbol has no body).
    """
    builder = ContextBuilder()
    from semantic_code_intelligence.config.settings import load_config
    from semantic_code_intelligence.indexing.scanner import scan_repository

    settings = load_config(repo_path)
    for scanned in scan_repository(repo_path, settings.index):
        try:
            builder.index_file(str(repo_path / scanned.relative_path))
        except Exception:
            # Best effort: a file that cannot be indexed is skipped, not fatal.
            logger.debug("Skip unindexable file: %s", scanned.relative_path)

    wanted_kinds = ("function", "method", "class")
    return [
        {
            "repo": repo_name,
            "name": sym.name,
            "kind": sym.kind,
            "file": sym.file_path,
            "lines": sym.end_line - sym.start_line + 1,
            "body": sym.body[:600] if sym.body else "",
        }
        for sym in builder.get_all_symbols()
        if sym.kind in wanted_kinds
    ]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
# Duplicate detection across repos
|
|
110
|
+
# ---------------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
def _find_cross_duplicates(
|
|
113
|
+
repo_symbols: dict[str, list[dict[str, Any]]],
|
|
114
|
+
threshold: float = 0.70,
|
|
115
|
+
min_lines: int = 4,
|
|
116
|
+
) -> list[CrossRepoMatch]:
|
|
117
|
+
"""Find near-duplicate symbols across different repos via trigram Jaccard."""
|
|
118
|
+
from semantic_code_intelligence.ci.quality import _normalize_body, _trigram_set, _jaccard
|
|
119
|
+
|
|
120
|
+
# Build (repo, sym_dict, trigrams) for each candidate
|
|
121
|
+
candidates: list[tuple[str, dict[str, Any], set[str]]] = []
|
|
122
|
+
for repo_name, syms in repo_symbols.items():
|
|
123
|
+
for s in syms:
|
|
124
|
+
if s["lines"] < min_lines or not s["body"].strip():
|
|
125
|
+
continue
|
|
126
|
+
norm = _normalize_body(s["body"])
|
|
127
|
+
tris = _trigram_set(norm)
|
|
128
|
+
if tris:
|
|
129
|
+
candidates.append((repo_name, s, tris))
|
|
130
|
+
|
|
131
|
+
matches: list[CrossRepoMatch] = []
|
|
132
|
+
seen: set[tuple[str, str]] = set()
|
|
133
|
+
|
|
134
|
+
for i, (repo_a, sym_a, tris_a) in enumerate(candidates):
|
|
135
|
+
for j in range(i + 1, len(candidates)):
|
|
136
|
+
repo_b, sym_b, tris_b = candidates[j]
|
|
137
|
+
if repo_a == repo_b:
|
|
138
|
+
continue # Only cross-repo
|
|
139
|
+
sim = _jaccard(tris_a, tris_b)
|
|
140
|
+
if sim >= threshold:
|
|
141
|
+
_sorted = sorted([f"{repo_a}:{sym_a['name']}", f"{repo_b}:{sym_b['name']}"])
|
|
142
|
+
pair_key = (_sorted[0], _sorted[1])
|
|
143
|
+
if pair_key in seen:
|
|
144
|
+
continue
|
|
145
|
+
seen.add(pair_key)
|
|
146
|
+
matches.append(CrossRepoMatch(
|
|
147
|
+
repo_a=repo_a,
|
|
148
|
+
symbol_a=sym_a["name"],
|
|
149
|
+
file_a=sym_a["file"],
|
|
150
|
+
repo_b=repo_b,
|
|
151
|
+
symbol_b=sym_b["name"],
|
|
152
|
+
file_b=sym_b["file"],
|
|
153
|
+
similarity_note=f"Jaccard similarity: {sim:.2f}",
|
|
154
|
+
))
|
|
155
|
+
|
|
156
|
+
matches.sort(key=lambda m: m.similarity_note, reverse=True)
|
|
157
|
+
return matches
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
# LLM-powered suggestion generation
|
|
162
|
+
# ---------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
def _generate_suggestions(
|
|
165
|
+
matches: list[CrossRepoMatch],
|
|
166
|
+
repo_symbols: dict[str, list[dict[str, Any]]],
|
|
167
|
+
provider: LLMProvider,
|
|
168
|
+
) -> list[dict[str, Any]]:
|
|
169
|
+
"""Ask the LLM for refactoring suggestions based on cross-repo matches."""
|
|
170
|
+
if not matches:
|
|
171
|
+
return []
|
|
172
|
+
|
|
173
|
+
# Build a concise summary for the LLM
|
|
174
|
+
match_text = ""
|
|
175
|
+
for m in matches[:10]:
|
|
176
|
+
match_text += (
|
|
177
|
+
f"- {m.repo_a}/{m.symbol_a} ({m.file_a}) ↔ "
|
|
178
|
+
f"{m.repo_b}/{m.symbol_b} ({m.file_b}) — {m.similarity_note}\n"
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
repo_summary = ""
|
|
182
|
+
for repo, syms in repo_symbols.items():
|
|
183
|
+
repo_summary += f"- {repo}: {len(syms)} symbols\n"
|
|
184
|
+
|
|
185
|
+
system = (
|
|
186
|
+
"You are CodexA, a cross-repository refactoring advisor. Given a list of "
|
|
187
|
+
"similar symbols found across different repositories, suggest refactoring "
|
|
188
|
+
"opportunities. Return a JSON list of objects with keys: 'title', "
|
|
189
|
+
"'description', 'affected_repos', 'priority' (high/medium/low)."
|
|
190
|
+
)
|
|
191
|
+
user_msg = (
|
|
192
|
+
f"Repositories:\n{repo_summary}\n"
|
|
193
|
+
f"Similar symbols across repos:\n{match_text}\n"
|
|
194
|
+
"Suggest refactoring strategies (e.g., extract shared lib, unify APIs)."
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
import json
|
|
198
|
+
|
|
199
|
+
messages = [
|
|
200
|
+
LLMMessage(role=MessageRole.SYSTEM, content=system),
|
|
201
|
+
LLMMessage(role=MessageRole.USER, content=user_msg),
|
|
202
|
+
]
|
|
203
|
+
resp = provider.chat(messages)
|
|
204
|
+
|
|
205
|
+
try:
|
|
206
|
+
parsed = json.loads(resp.content)
|
|
207
|
+
if isinstance(parsed, list):
|
|
208
|
+
result: list[dict[str, Any]] = parsed
|
|
209
|
+
return result
|
|
210
|
+
if isinstance(parsed, dict) and "suggestions" in parsed:
|
|
211
|
+
suggestions: list[dict[str, Any]] = parsed["suggestions"]
|
|
212
|
+
return suggestions
|
|
213
|
+
except (json.JSONDecodeError, TypeError):
|
|
214
|
+
pass
|
|
215
|
+
|
|
216
|
+
return [{"title": "See raw analysis", "description": resp.content, "priority": "medium"}]
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# ---------------------------------------------------------------------------
|
|
220
|
+
# Public API
|
|
221
|
+
# ---------------------------------------------------------------------------
|
|
222
|
+
|
|
223
|
+
def analyze_cross_repo(
    workspace_root: Path,
    *,
    provider: LLMProvider | None = None,
    threshold: float = 0.70,
    repos: list[str] | None = None,
) -> CrossRefactorResult:
    """Analyse a workspace for cross-repo refactoring opportunities.

    Args:
        workspace_root: Workspace root containing ``.codexa/workspace.json``.
        provider: Optional LLM provider for generating suggestions.
        threshold: Jaccard similarity threshold for duplicate detection.
        repos: Restrict to these repo names. None = all registered repos.

    Returns:
        CrossRefactorResult with matches and optional LLM suggestions. An
        empty result is returned when loading the workspace raises
        ``FileNotFoundError``. ``repos_analyzed`` lists only the repos that
        were actually found in the workspace and scanned.
    """
    from semantic_code_intelligence.workspace import Workspace

    try:
        ws = Workspace.load(workspace_root)
    except FileNotFoundError:
        return CrossRefactorResult()

    targets = repos or [r.name for r in ws.repos]
    repo_symbols: dict[str, list[dict[str, Any]]] = {}

    for rname in targets:
        entry = ws.get_repo(rname)
        if entry is None:
            # Unknown names are skipped and must not be reported as analysed.
            logger.debug("Skipping repo not registered in workspace: %s", rname)
            continue
        repo_symbols[rname] = _collect_repo_symbols(rname, Path(entry.path))

    total_symbols = sum(len(s) for s in repo_symbols.values())
    matches = _find_cross_duplicates(repo_symbols, threshold=threshold)

    suggestions: list[dict[str, Any]] = []
    llm_used = False
    if provider and matches:
        suggestions = _generate_suggestions(matches, repo_symbols, provider)
        llm_used = True

    return CrossRefactorResult(
        repos_analyzed=list(repo_symbols),
        total_symbols=total_symbols,
        matches=matches,
        suggestions=suggestions,
        llm_used=llm_used,
    )
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Autonomous investigation chains — multi-step code exploration.
|
|
2
|
+
|
|
3
|
+
An **InvestigationChain** drives an iterative loop:
|
|
4
|
+
|
|
5
|
+
1. Formulate a search query from the user's question.
|
|
6
|
+
2. Gather context (semantic search, symbol lookup, dependency analysis).
|
|
7
|
+
3. Ask the LLM to evaluate what was found and decide the next action.
|
|
8
|
+
4. Repeat until the LLM signals "conclude" or a step limit is reached.
|
|
9
|
+
|
|
10
|
+
Each step is recorded as a ``ReasoningStep`` in ``SessionMemory`` so
|
|
11
|
+
the chain is fully transparent and reproducible.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import uuid
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from semantic_code_intelligence.context.engine import ContextBuilder
|
|
23
|
+
from semantic_code_intelligence.context.memory import ReasoningStep, SessionMemory
|
|
24
|
+
from semantic_code_intelligence.llm.provider import LLMMessage, LLMProvider, MessageRole
|
|
25
|
+
from semantic_code_intelligence.services.search_service import search_codebase
|
|
26
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
27
|
+
|
|
28
|
+
logger = get_logger("llm.investigation")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Data types
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
@dataclass
class InvestigationResult:
    """Outcome produced at the end of an investigation chain."""

    # The question that was investigated.
    question: str
    # Final answer text.
    conclusion: str
    # One dict per executed step.
    steps: list[dict[str, Any]] = field(default_factory=list)
    # Identifier of the chain this result belongs to.
    chain_id: str = ""
    # Step count recorded for the chain.
    total_steps: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict keyed by field name."""
        exported = ("question", "conclusion", "steps", "chain_id", "total_steps")
        return {key: getattr(self, key) for key in exported}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# Action helpers
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
def _action_search(query: str, project_root: Path, top_k: int = 5) -> str:
    """Run semantic search and return a text summary.

    Args:
        query: Search query text.
        project_root: Project root passed to ``search_codebase``.
        top_k: Maximum number of results requested.

    Returns:
        One section per result (file path, score, first 500 characters of
        content) joined by ``---`` separators; ``"No results found."`` when
        the search returns nothing; ``"Semantic search unavailable."`` when
        anything in the search or formatting raises.
    """
    try:
        results = search_codebase(query, project_root, top_k=top_k, threshold=0.2)
        if not results:
            return "No results found."
        parts: list[str] = []
        for r in results:
            d = r.to_dict()
            # A present-but-None content value must not abort the summary.
            snippet = d.get('content', d.get('chunk', '')) or ""
            parts.append(
                f"[{d.get('file_path', '?')}] (score {d.get('score', 0):.2f})\n"
                f"{snippet[:500]}"
            )
        return "\n---\n".join(parts)
    except Exception:
        # Deliberately best-effort, but keep the cause discoverable in logs.
        logger.debug("Semantic search failed for query %r", query, exc_info=True)
        return "Semantic search unavailable."
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _action_analyze(symbol_name: str, builder: ContextBuilder) -> str:
|
|
78
|
+
"""Look up a symbol and return its context."""
|
|
79
|
+
matches = builder.find_symbol(symbol_name)
|
|
80
|
+
if not matches:
|
|
81
|
+
return f"Symbol '{symbol_name}' not found."
|
|
82
|
+
parts: list[str] = []
|
|
83
|
+
for s in matches[:3]:
|
|
84
|
+
parts.append(
|
|
85
|
+
f"{s.kind} {s.name} in {s.file_path} (L{s.start_line}-{s.end_line})\n"
|
|
86
|
+
f"{s.body[:400]}"
|
|
87
|
+
)
|
|
88
|
+
return "\n---\n".join(parts)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _action_deps(file_path: str, builder: ContextBuilder, project_root: Path) -> str:
    """Summarise the dependencies recorded for one file.

    Args:
        file_path: Absolute path, or a path relative to ``project_root``.
        builder: Accepted for signature parity with the other actions; this
            function does not read it.
        project_root: Base directory used to resolve a relative ``file_path``.

    Returns:
        A comma-separated ``Dependencies: ...`` line, ``"No dependencies
        found."`` when the file yields none, or a ``File not found`` message.
    """
    from semantic_code_intelligence.context.engine import DependencyMap

    dep_map = DependencyMap()
    candidate = Path(file_path)
    if not candidate.is_absolute():
        candidate = project_root / file_path

    # NOTE(review): ``exists()`` is also true for directories — confirm that
    # ``add_file`` tolerates a directory path.
    if not candidate.exists():
        return f"File not found: {file_path}"

    rendered = [f"{dep.import_text} (L{dep.line})" for dep in dep_map.add_file(str(candidate))]
    if rendered:
        return "Dependencies: " + ", ".join(rendered)
    return "No dependencies found."
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
# Investigation chain
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
|
|
108
|
+
_PLANNER_SYSTEM = """\
|
|
109
|
+
You are CodexA, an autonomous code investigation agent. Your task is to
|
|
110
|
+
answer the user's question by systematically exploring the codebase.
|
|
111
|
+
|
|
112
|
+
On each turn you will receive context gathered from the previous action.
|
|
113
|
+
Respond with a JSON object with exactly these keys:
|
|
114
|
+
- "thought": one sentence about what you learned and what to do next
|
|
115
|
+
- "action": one of "search", "analyze", "deps", "conclude"
|
|
116
|
+
- "action_input": the argument for the action (search query, symbol name, file path, or final answer)
|
|
117
|
+
|
|
118
|
+
When you have enough information, use action "conclude" and put your
|
|
119
|
+
final answer in "action_input".
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class InvestigationChain:
|
|
124
|
+
"""Drives an iterative search-analyze-conclude loop via LLM."""
|
|
125
|
+
|
|
126
|
+
def __init__(
|
|
127
|
+
self,
|
|
128
|
+
provider: LLMProvider,
|
|
129
|
+
project_root: Path,
|
|
130
|
+
*,
|
|
131
|
+
builder: ContextBuilder | None = None,
|
|
132
|
+
memory: SessionMemory | None = None,
|
|
133
|
+
max_steps: int = 6,
|
|
134
|
+
) -> None:
|
|
135
|
+
self._provider = provider
|
|
136
|
+
self._root = project_root.resolve()
|
|
137
|
+
self._builder = builder or ContextBuilder()
|
|
138
|
+
self._memory = memory or SessionMemory()
|
|
139
|
+
self._max_steps = max_steps
|
|
140
|
+
self._indexed = False
|
|
141
|
+
|
|
142
|
+
def _ensure_indexed(self) -> None:
|
|
143
|
+
if self._indexed:
|
|
144
|
+
return
|
|
145
|
+
from semantic_code_intelligence.config.settings import load_config
|
|
146
|
+
from semantic_code_intelligence.indexing.scanner import scan_repository
|
|
147
|
+
|
|
148
|
+
config = load_config(self._root)
|
|
149
|
+
for sf in scan_repository(self._root, config.index):
|
|
150
|
+
try:
|
|
151
|
+
self._builder.index_file(str(self._root / sf.relative_path))
|
|
152
|
+
except Exception:
|
|
153
|
+
logger.debug("Skip unindexable file: %s", sf.relative_path)
|
|
154
|
+
self._indexed = True
|
|
155
|
+
|
|
156
|
+
def _run_action(self, action: str, action_input: str) -> str:
|
|
157
|
+
"""Execute an action and return its text output."""
|
|
158
|
+
if action == "search":
|
|
159
|
+
return _action_search(action_input, self._root)
|
|
160
|
+
elif action == "analyze":
|
|
161
|
+
self._ensure_indexed()
|
|
162
|
+
return _action_analyze(action_input, self._builder)
|
|
163
|
+
elif action == "deps":
|
|
164
|
+
self._ensure_indexed()
|
|
165
|
+
return _action_deps(action_input, self._builder, self._root)
|
|
166
|
+
return action_input
|
|
167
|
+
|
|
168
|
+
def _parse_plan(self, text: str) -> dict[str, str]:
|
|
169
|
+
"""Parse the LLM planner response into {thought, action, action_input}."""
|
|
170
|
+
# Try JSON first
|
|
171
|
+
try:
|
|
172
|
+
parsed = json.loads(text)
|
|
173
|
+
if isinstance(parsed, dict) and "action" in parsed:
|
|
174
|
+
return {
|
|
175
|
+
"thought": str(parsed.get("thought", "")),
|
|
176
|
+
"action": str(parsed.get("action", "conclude")),
|
|
177
|
+
"action_input": str(parsed.get("action_input", text)),
|
|
178
|
+
}
|
|
179
|
+
except (json.JSONDecodeError, TypeError):
|
|
180
|
+
pass
|
|
181
|
+
# Fallback — treat entire response as conclusion
|
|
182
|
+
return {"thought": "", "action": "conclude", "action_input": text}
|
|
183
|
+
|
|
184
|
+
def _conclude_streaming(self, messages: list[LLMMessage]) -> str:
    """Echo streamed conclusion tokens to stdout and return them joined together.

    Only events whose ``kind`` is ``"token"`` are written and collected. A
    trailing newline is written to stdout after the stream ends; it is not
    part of the returned text.
    """
    from semantic_code_intelligence.llm.streaming import stream_chat
    import sys

    pieces: list[str] = []
    for event in stream_chat(self._provider, messages):
        if event.kind != "token":
            continue
        pieces.append(event.content)
        sys.stdout.write(event.content)
        # Flush per token so output appears incrementally.
        sys.stdout.flush()
    sys.stdout.write("\n")
    return "".join(pieces)
|
|
198
|
+
|
|
199
|
+
def investigate(self, question: str, *, stream_conclusion: bool = False) -> InvestigationResult:
    """Drive the plan/act loop for ``question`` and package the outcome.

    Each iteration asks the provider for a plan, runs the chosen action and
    feeds a truncated copy of its output back into the conversation. The
    loop stops when the planner picks ``conclude`` or after
    ``self._max_steps`` iterations; in the latter case one more request
    asks for a final conclusion.

    Args:
        question: The question to investigate.
        stream_conclusion: Consulted only when the step limit is reached.
            If True, the forced conclusion comes from
            ``_conclude_streaming``; otherwise from a regular ``chat`` call.

    Returns:
        An ``InvestigationResult`` holding the question, the conclusion, the
        recorded steps, the chain id and the number of recorded steps. A
        forced conclusion is recorded in memory but not in ``steps``.
    """
    chain_id = uuid.uuid4().hex[:10]
    self._memory.start_chain(chain_id)

    messages: list[LLMMessage] = [
        LLMMessage(role=MessageRole.SYSTEM, content=_PLANNER_SYSTEM),
        LLMMessage(role=MessageRole.USER, content=f"Question: {question}"),
    ]
    steps: list[dict[str, Any]] = []
    conclusion = ""

    for step_num in range(1, self._max_steps + 1):
        # Ask the planner what to do next.
        reply = self._provider.chat(messages)
        plan = self._parse_plan(reply.content)
        thought = plan["thought"]
        action = plan["action"]
        action_input = plan["action_input"]

        if action == "conclude":
            conclusion = action_input
            self._memory.add_step(chain_id, "conclude", question, conclusion)
            steps.append(dict(
                step=step_num,
                action="conclude",
                thought=thought,
                output=conclusion,
            ))
            break

        # Run the requested action and record it.
        output = self._run_action(action, action_input)
        self._memory.add_step(chain_id, action, action_input, output)
        steps.append(dict(
            step=step_num,
            action=action,
            action_input=action_input,
            thought=thought,
            output=output[:500],
        ))

        # Show the planner its own reply plus the (truncated) action result.
        messages.append(LLMMessage(role=MessageRole.ASSISTANT, content=reply.content))
        messages.append(LLMMessage(
            role=MessageRole.USER,
            content=f"Action result:\n{output[:2000]}",
        ))
    else:
        # No ``conclude`` within the step budget: request a final answer.
        messages.append(LLMMessage(
            role=MessageRole.USER,
            content="You have reached the step limit. Please provide your best conclusion now.",
        ))
        conclusion = (
            self._conclude_streaming(messages)
            if stream_conclusion
            else self._provider.chat(messages).content
        )
        self._memory.add_step(chain_id, "conclude", "forced", conclusion)

    return InvestigationResult(
        question=question,
        conclusion=conclusion,
        steps=steps,
        chain_id=chain_id,
        total_steps=len(steps),
    )
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Mock LLM provider — deterministic responses for testing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from semantic_code_intelligence.llm.provider import (
|
|
8
|
+
LLMMessage,
|
|
9
|
+
LLMProvider,
|
|
10
|
+
LLMResponse,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MockProvider(LLMProvider):
    """A mock LLM provider that returns configurable responses.

    Useful for unit tests and offline development without a live LLM.
    Responses come from a FIFO queue filled via ``enqueue_response``; when
    the queue is empty the ``default_response`` is returned. Every call is
    recorded and exposed through ``call_history``.
    """

    def __init__(
        self,
        default_response: str = "This is a mock LLM response.",
        model: str = "mock-model",
    ) -> None:
        """Create the mock.

        Args:
            default_response: Text returned when no response is queued.
            model: Model name reported in every ``LLMResponse``.
        """
        self._default_response = default_response
        self._model = model
        self._call_history: list[dict[str, Any]] = []
        self._response_queue: list[str] = []

    @property
    def name(self) -> str:
        """Provider name reported in responses (always ``"mock"``)."""
        return "mock"

    @property
    def call_history(self) -> list[dict[str, Any]]:
        """Return a log of all calls made to this mock."""
        # Return a shallow copy so callers cannot mutate the internal log.
        return list(self._call_history)

    def enqueue_response(self, response: str) -> None:
        """Enqueue a custom response. FIFO — next call pops from front."""
        self._response_queue.append(response)

    def _next_response(self) -> str:
        """Pop the oldest queued response, or return the default if none is queued."""
        if self._response_queue:
            return self._response_queue.pop(0)
        return self._default_response

    @staticmethod
    def _estimate_usage(prompt_chars: int, completion_chars: int) -> dict[str, int]:
        """Build a usage dict from character counts (characters // 4 per field)."""
        return {
            "prompt_tokens": prompt_chars // 4,
            "completion_tokens": completion_chars // 4,
            "total_tokens": (prompt_chars + completion_chars) // 4,
        }

    def complete(self, prompt: str, **kwargs: Any) -> LLMResponse:
        """Return the next response for a plain prompt and record the call.

        Args:
            prompt: Prompt text; used only for logging and usage estimation.
            **kwargs: Recorded in the call history, otherwise ignored.
        """
        content = self._next_response()
        self._call_history.append({
            "method": "complete",
            "prompt": prompt,
            "kwargs": kwargs,
            "response": content,
        })
        return LLMResponse(
            content=content,
            model=self._model,
            provider=self.name,
            usage=self._estimate_usage(len(prompt), len(content)),
        )

    def chat(self, messages: list[LLMMessage], **kwargs: Any) -> LLMResponse:
        """Return the next response for a chat transcript and record the call.

        Args:
            messages: Conversation so far; serialised with ``to_dict`` for
                the call history and measured for usage estimation.
            **kwargs: Recorded in the call history, otherwise ignored.
        """
        content = self._next_response()
        self._call_history.append({
            "method": "chat",
            "messages": [m.to_dict() for m in messages],
            "kwargs": kwargs,
            "response": content,
        })
        # Prompt size covers the input messages only; the completion is
        # counted separately so prompt_tokens is not inflated by the reply.
        prompt_chars = sum(len(m.content) for m in messages)
        return LLMResponse(
            content=content,
            model=self._model,
            provider=self.name,
            usage=self._estimate_usage(prompt_chars, len(content)),
        )
|