codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
"""Reasoning engine — orchestrates context gathering and LLM interaction.
|
|
2
|
+
|
|
3
|
+
Provides the core logic behind ``codexa ask``, ``codexa review``, and
|
|
4
|
+
``codexa refactor`` by combining:
|
|
5
|
+
1. Semantic search results
|
|
6
|
+
2. Parsed symbol / context data
|
|
7
|
+
3. LLM conversations
|
|
8
|
+
|
|
9
|
+
Each public method returns structured data suitable for CLI display and
|
|
10
|
+
machine consumption.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from semantic_code_intelligence.analysis.ai_features import (
|
|
21
|
+
explain_symbol,
|
|
22
|
+
generate_ai_context,
|
|
23
|
+
summarize_repository,
|
|
24
|
+
)
|
|
25
|
+
from semantic_code_intelligence.context.engine import ContextBuilder
|
|
26
|
+
from semantic_code_intelligence.llm.provider import (
|
|
27
|
+
LLMMessage,
|
|
28
|
+
LLMProvider,
|
|
29
|
+
LLMResponse,
|
|
30
|
+
MessageRole,
|
|
31
|
+
)
|
|
32
|
+
from semantic_code_intelligence.services.search_service import search_codebase
|
|
33
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
34
|
+
|
|
35
|
+
logger = get_logger("llm.reasoning")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Data types
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
@dataclass
class AskResult:
    """Outcome of a single ``ask`` query.

    Holds the question, the answer text, the context snippets attached to
    the result, the raw LLM response (if any) and explainability metadata.
    """

    question: str
    answer: str
    context_snippets: list[dict[str, Any]] = field(default_factory=list)
    llm_response: LLMResponse | None = None
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return this result as a plain dictionary.

        ``usage`` comes from the attached LLM response and is an empty dict
        when no (truthy) response is attached.
        """
        response = self.llm_response
        usage = response.usage if response else {}
        return dict(
            question=self.question,
            answer=self.answer,
            context_snippets=self.context_snippets,
            usage=usage,
            explainability=self.explainability,
        )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class ReviewResult:
    """Outcome of a ``review`` run over one file.

    Holds the reviewed path, the list of reported issues, a summary string,
    the raw LLM response (if any) and explainability metadata.
    """

    file_path: str
    issues: list[dict[str, Any]] = field(default_factory=list)
    summary: str = ""
    llm_response: LLMResponse | None = None
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return this result as a plain dictionary.

        ``usage`` comes from the attached LLM response and is an empty dict
        when no (truthy) response is attached.
        """
        response = self.llm_response
        usage = response.usage if response else {}
        return dict(
            file_path=self.file_path,
            issues=self.issues,
            summary=self.summary,
            usage=usage,
            explainability=self.explainability,
        )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
class RefactorResult:
    """Outcome of a ``refactor`` run over one file.

    Holds the file path, the original and refactored source text, the
    explanation, the raw LLM response (if any) and explainability metadata.
    """

    file_path: str
    original_code: str = ""
    refactored_code: str = ""
    explanation: str = ""
    llm_response: LLMResponse | None = None
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return this result as a plain dictionary.

        ``usage`` comes from the attached LLM response and is an empty dict
        when no (truthy) response is attached.
        """
        response = self.llm_response
        usage = response.usage if response else {}
        return dict(
            file_path=self.file_path,
            original_code=self.original_code,
            refactored_code=self.refactored_code,
            explanation=self.explanation,
            usage=usage,
            explainability=self.explainability,
        )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class SuggestResult:
    """Outcome of a ``suggest`` run for a target.

    Holds the target string, the list of suggestions, the raw LLM response
    (if any) and explainability metadata.
    """

    target: str
    suggestions: list[dict[str, Any]] = field(default_factory=list)
    llm_response: LLMResponse | None = None
    explainability: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return this result as a plain dictionary.

        ``usage`` comes from the attached LLM response and is an empty dict
        when no (truthy) response is attached.
        """
        response = self.llm_response
        usage = response.usage if response else {}
        return dict(
            target=self.target,
            suggestions=self.suggestions,
            usage=usage,
            explainability=self.explainability,
        )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# Reasoning Engine
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
class ReasoningEngine:
    """High-level reasoning engine that combines semantic context with an LLM.

    This is the central orchestrator for all AI workflows in CodexA. Each
    public workflow (``ask``, ``review``, ``refactor``, ``suggest``) gathers
    context, sends one system + user message pair to the provider via
    ``provider.chat`` and wraps the reply in a result dataclass.
    """

    # Default character budget for search snippets placed in an ``ask`` prompt.
    DEFAULT_MAX_CONTEXT_CHARS = 6000
    # Maximum number of source characters embedded in review/refactor prompts;
    # longer files are truncated to this prefix.
    DEFAULT_MAX_FILE_CHARS = 8000

    def __init__(
        self,
        provider: LLMProvider,
        project_root: Path,
        *,
        builder: ContextBuilder | None = None,
        max_context_chars: int | None = None,
    ) -> None:
        """Create an engine bound to *provider* and *project_root*.

        Args:
            provider: LLM backend; only its ``chat(messages)`` method is used.
            project_root: Repository root; stored resolved.
            builder: Optional pre-built context builder. A new
                ``ContextBuilder`` is created only when this is ``None``.
            max_context_chars: Snippet budget for ``ask``; ``None`` or ``0``
                selects ``DEFAULT_MAX_CONTEXT_CHARS``.
        """
        self._provider = provider
        self._root = project_root.resolve()
        # ``is None`` (not truthiness) so a supplied builder is never
        # discarded just because it happens to evaluate as falsy.
        self._builder = ContextBuilder() if builder is None else builder
        self._indexed = False
        self._max_ctx = max_context_chars or self.DEFAULT_MAX_CONTEXT_CHARS

    def _ensure_indexed(self) -> None:
        """Lazy-index the project for symbol/context lookups (runs once)."""
        if self._indexed:
            return
        from semantic_code_intelligence.config.settings import load_config
        from semantic_code_intelligence.indexing.scanner import scan_repository

        config = load_config(self._root)
        for sf in scan_repository(self._root, config.index):
            full_path = str(self._root / sf.relative_path)
            try:
                self._builder.index_file(full_path)
            except Exception:
                # Deliberately best-effort: one file failing to index must
                # not abort indexing of the remaining files.
                logger.debug("Failed to index %s", full_path)
        self._indexed = True

    # --- gather context helpers ---

    def _search_context(self, query: str, top_k: int = 5) -> list[dict[str, Any]]:
        """Run semantic search and return snippet dicts.

        Any failure of the search backend is logged at debug level and
        yields an empty list so the workflows can proceed without context.
        """
        try:
            results = search_codebase(query, self._root, top_k=top_k, threshold=0.2)
            return [r.to_dict() for r in results]
        except Exception:
            logger.debug("Semantic search unavailable, proceeding without search context.")
            return []

    def _file_context(self, file_path: str) -> str:
        """Read a file as UTF-8 text; return ``""`` if it cannot be read."""
        try:
            return Path(file_path).read_text(encoding="utf-8", errors="replace")
        except OSError:  # PermissionError is already a subclass of OSError
            return ""

    def _symbol_context(self, symbol_name: str) -> str:
        """Build a text summary of a symbol for use in prompts.

        Renders explanations for at most the first three matches and joins
        them with blank lines; returns ``""`` when nothing matches.
        """
        self._ensure_indexed()
        matches = self._builder.find_symbol(symbol_name)
        if not matches:
            return ""
        return "\n\n".join(
            explain_symbol(sym, self._builder).render() for sym in matches[:3]
        )

    @staticmethod
    def _snippet_text(snippet: dict[str, Any]) -> str:
        """Return a snippet's text from ``content``, falling back to ``chunk``.

        A missing or ``None`` ``content`` falls back to ``chunk``; any
        non-string value is treated as empty text.
        """
        text = snippet.get("content")
        if text is None:
            text = snippet.get("chunk")
        return text if isinstance(text, str) else ""

    @staticmethod
    def _load_json(text: str) -> Any:
        """Parse *text* as JSON, tolerating a surrounding Markdown code fence.

        The raw text is tried first. Only if that fails and the text starts
        with a ``` fence is the fence (and optional language tag) stripped
        and the parse retried.

        Raises:
            json.JSONDecodeError: If neither attempt yields valid JSON.
            TypeError: If *text* is not a type ``json.loads`` accepts.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            stripped = text.strip()
            if not stripped.startswith("```"):
                raise
            # Drop the opening fence line, then a trailing closing fence.
            _, _, body = stripped.partition("\n")
            body = body.rstrip()
            if body.endswith("```"):
                body = body[:-3]
            return json.loads(body)

    # --- context pruning & scoring (Phase 12) ---

    @staticmethod
    def _score_snippet(snippet: dict[str, Any], query_lower: str) -> float:
        """Compute a priority score for a context snippet.

        Combines the snippet's ``score`` (missing/``None`` counts as 0) with
        a keyword-overlap bonus of up to 0.1: the fraction of query words
        longer than two characters that occur in the lower-cased text.
        The result is rounded to four decimals.
        """
        base = float(snippet.get("score") or 0.0)
        content = ReasoningEngine._snippet_text(snippet).lower()
        words = [w for w in query_lower.split() if len(w) > 2]
        if words:
            found = sum(1 for w in words if w in content)
            base += 0.1 * (found / len(words))
        return round(base, 4)

    def _prune_context(
        self,
        snippets: list[dict[str, Any]],
        query: str,
        max_chars: int | None = None,
    ) -> list[dict[str, Any]]:
        """Score, rank, and prune snippets to stay within the character budget.

        Returns copies of the kept snippets, sorted by priority score
        (descending) and each annotated with ``priority_score``. Selection
        stops at the first snippet that would exceed the budget; the
        top-ranked snippet is always kept, even if it alone exceeds it.
        The input dicts are not modified.
        """
        limit = max_chars or self._max_ctx
        query_lower = query.lower()
        scored = [(self._score_snippet(s, query_lower), s) for s in snippets]
        scored.sort(key=lambda t: t[0], reverse=True)

        kept: list[dict[str, Any]] = []
        total = 0
        for score, snip in scored:
            size = len(self._snippet_text(snip))
            if total + size > limit and kept:
                break
            # Annotate a shallow copy so the caller's dicts stay untouched.
            kept.append({**snip, "priority_score": score})
            total += size
        return kept

    # --- public AI workflows ---

    def ask(self, question: str, *, top_k: int = 5) -> AskResult:
        """Answer a natural-language question about the codebase.

        Gathers semantic search results (pruned to the context budget) and a
        repository summary, then asks the LLM. ``explainability`` records
        the snippet counts before/after pruning and the context size.
        """
        raw_snippets = self._search_context(question, top_k=top_k)
        snippets = self._prune_context(raw_snippets, question)
        self._ensure_indexed()
        repo_summary = summarize_repository(self._builder).render()

        # Build prompt
        context_text = "".join(
            f"\n--- {snip.get('file_path', '?')} "
            f"(score: {snip.get('score', 0):.2f}) ---\n"
            f"{self._snippet_text(snip)}\n"
            for snip in snippets
        )

        system = (
            "You are CodexA, an AI coding assistant. Answer questions about the "
            "user's codebase using the provided context. Be concise, accurate, "
            "and cite file paths when relevant."
        )
        user_msg = (
            f"Repository summary:\n{repo_summary}\n\n"
            f"Relevant code snippets:{context_text}\n\n"
            f"Question: {question}"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        return AskResult(
            question=question,
            answer=resp.content,
            context_snippets=snippets,
            llm_response=resp,
            explainability={
                "snippets_before_pruning": len(raw_snippets),
                "snippets_after_pruning": len(snippets),
                # Same content/chunk fallback as pruning and the prompt use,
                # so the reported size matches what was actually sent.
                "context_chars": sum(len(self._snippet_text(s)) for s in snippets),
                "method": "semantic_search+pruning",
            },
        )

    def review(self, file_path: str) -> ReviewResult:
        """Review a file for potential issues, bugs, and improvements.

        Returns a result with only a summary when the file cannot be read or
        is empty. Otherwise the LLM reply is parsed as JSON (optionally
        fenced); if it is not a JSON object, the raw reply becomes the
        summary and ``issues`` stays empty. At most
        ``DEFAULT_MAX_FILE_CHARS`` characters of source are sent.
        """
        content = self._file_context(file_path)
        if not content:
            return ReviewResult(file_path=file_path, summary="File not found or empty.")

        self._ensure_indexed()
        symbols = self._builder.get_symbols(file_path)
        symbol_info = "\n".join(
            f"  - {s.kind} '{s.name}' (L{s.start_line}-{s.end_line})"
            for s in symbols[:30]
        )

        system = (
            "You are CodexA, a code reviewer. Analyze the provided file and "
            "return a JSON object with keys: 'issues' (list of objects with "
            "'severity', 'line', 'message', 'suggestion') and 'summary' (string). "
            "Severities: 'error', 'warning', 'info'. Be precise and actionable."
        )
        user_msg = (
            f"File: {file_path}\n\n"
            f"Symbols:\n{symbol_info}\n\n"
            f"Source code:\n```\n{content[:self.DEFAULT_MAX_FILE_CHARS]}\n```"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        # Parse JSON from response — fallback to raw text
        issues: list[dict[str, Any]] = []
        summary = resp.content
        try:
            parsed = self._load_json(resp.content)
        except (json.JSONDecodeError, TypeError):
            parsed = None
        if isinstance(parsed, dict):
            # Model output is untrusted: accept fields only if correctly typed.
            raw_issues = parsed.get("issues", [])
            if isinstance(raw_issues, list):
                issues = raw_issues
            raw_summary = parsed.get("summary", resp.content)
            if isinstance(raw_summary, str):
                summary = raw_summary

        return ReviewResult(
            file_path=file_path,
            issues=issues,
            summary=summary,
            llm_response=resp,
        )

    def refactor(
        self,
        file_path: str,
        instruction: str = "Improve code quality, readability, and performance.",
    ) -> RefactorResult:
        """Suggest refactored code for a file based on an instruction.

        Returns a result with only an explanation when the file cannot be
        read or is empty. Otherwise the LLM reply is parsed as JSON
        (optionally fenced); if it is not a JSON object, the raw reply
        becomes the explanation and ``refactored_code`` stays empty. At most
        ``DEFAULT_MAX_FILE_CHARS`` characters of source are sent, while
        ``original_code`` holds the full file text.
        """
        content = self._file_context(file_path)
        if not content:
            return RefactorResult(file_path=file_path, explanation="File not found or empty.")

        system = (
            "You are CodexA, a code refactoring assistant. Given the source code "
            "and an instruction, return a JSON object with 'refactored_code' "
            "(the improved code as a string) and 'explanation' (what you changed "
            "and why). Do NOT include markdown fences inside the JSON values."
        )
        user_msg = (
            f"File: {file_path}\n"
            f"Instruction: {instruction}\n\n"
            f"Source code:\n```\n{content[:self.DEFAULT_MAX_FILE_CHARS]}\n```"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        refactored = ""
        explanation = resp.content
        try:
            parsed = self._load_json(resp.content)
        except (json.JSONDecodeError, TypeError):
            parsed = None
        if isinstance(parsed, dict):
            # Model output is untrusted: accept fields only if correctly typed.
            raw_code = parsed.get("refactored_code", "")
            if isinstance(raw_code, str):
                refactored = raw_code
            raw_explanation = parsed.get("explanation", resp.content)
            if isinstance(raw_explanation, str):
                explanation = raw_explanation

        return RefactorResult(
            file_path=file_path,
            original_code=content,
            refactored_code=refactored,
            explanation=explanation,
            llm_response=resp,
        )

    def suggest(self, target: str, *, top_k: int = 5) -> SuggestResult:
        """Generate intelligent suggestions for a symbol, file, or topic.

        Sends symbol explanations and semantic search snippets for *target*
        to the LLM. The reply is parsed as JSON (optionally fenced): an
        object contributes its ``suggestions`` list, a top-level list is
        used directly, and unparseable text is wrapped in a single
        "Raw response" suggestion.
        """
        self._ensure_indexed()
        snippets = self._search_context(target, top_k=top_k)
        sym_context = self._symbol_context(target)

        system = (
            "You are CodexA, an intelligent code suggestion engine. Given context "
            "about a codebase element, provide suggestions for improvements, fixes, "
            "or optimizations. Return a JSON object with 'suggestions' — a list of "
            "objects each having 'title', 'description', 'reason', and 'priority' "
            "(high/medium/low)."
        )
        context_text = "".join(
            f"\n--- {snip.get('file_path', '?')} ---\n"
            f"{self._snippet_text(snip)}\n"
            for snip in snippets
        )

        user_msg = (
            f"Target: {target}\n\n"
            f"Symbol info:\n{sym_context}\n\n"
            f"Related code:{context_text}"
        )

        messages = [
            LLMMessage(role=MessageRole.SYSTEM, content=system),
            LLMMessage(role=MessageRole.USER, content=user_msg),
        ]
        resp = self._provider.chat(messages)

        suggestions: list[dict[str, Any]] = []
        try:
            parsed = self._load_json(resp.content)
        except (json.JSONDecodeError, TypeError):
            suggestions = [{"title": "Raw response", "description": resp.content, "reason": "", "priority": "medium"}]
        else:
            candidate = parsed.get("suggestions", []) if isinstance(parsed, dict) else parsed
            # Anything that is not a list (wrong type, JSON scalar) is ignored.
            if isinstance(candidate, list):
                suggestions = candidate

        return SuggestResult(
            target=target,
            suggestions=suggestions,
            llm_response=resp,
        )
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Safety validator — checks LLM outputs before they are applied.
|
|
2
|
+
|
|
3
|
+
Provides basic guardrails to ensure LLM-generated code does not contain
|
|
4
|
+
obviously dangerous patterns. This is not a comprehensive security tool;
|
|
5
|
+
it is a first line of defence within the coding assistant workflow.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger("llm.safety")
|
|
17
|
+
|
|
18
|
+
# (regex, human-readable description) pairs for constructs that should never
# appear in AI-generated code destined for execution.
_DANGEROUS_PATTERNS: list[tuple[str, str]] = [
    # --- Command execution ---
    (
        r"\bos\.system\s*\(",
        "os.system() call — use subprocess with shell=False instead",
    ),
    (
        r"subprocess\..*shell\s*=\s*True",
        "subprocess with shell=True — potential command injection",
    ),
    (
        r"\brm\s+-rf\s+/",
        "Destructive rm -rf / command",
    ),
    # --- Dynamic code execution ---
    (
        r"\beval\s*\(",
        "eval() call — avoid dynamic code execution",
    ),
    (
        r"\bexec\s*\(",
        "exec() call — avoid dynamic code execution",
    ),
    (
        r"\b__import__\s*\(",
        "Dynamic __import__() — use explicit imports",
    ),
    # --- Destructive SQL statements ---
    (
        r"DROP\s+TABLE|DROP\s+DATABASE",
        "SQL DROP statement — potential data loss",
    ),
    (
        r"TRUNCATE\s+TABLE",
        "SQL TRUNCATE statement — potential data loss",
    ),
    # --- Path traversal ---
    (
        r"\.\./\.\./",
        "Path traversal pattern — potential directory escape",
    ),
    # --- Hardcoded secrets (Phase 12) ---
    (
        r"""(?:password|secret|api_key|token)\s*=\s*["'][^"']{8,}["']""",
        "Hardcoded secret — use environment variables or a secrets manager",
    ),
    # --- XSS risk (Phase 12) ---
    (
        r"innerHTML\s*=",
        "innerHTML assignment — potential XSS vulnerability",
    ),
    (
        r"document\.write\s*\(",
        "document.write() — potential XSS vulnerability",
    ),
    # --- Insecure crypto (Phase 12) ---
    (
        r"\bMD5\s*\(|\bmd5\s*\(",
        "MD5 hash — use SHA-256 or stronger for security",
    ),
    (
        r"\bSHA1\s*\(|\bsha1\s*\(",
        "SHA-1 hash — use SHA-256 or stronger for security",
    ),
    # --- Insecure network (Phase 12) ---
    (
        r"http://(?!localhost|127\.0\.0\.1)",
        "Insecure HTTP URL — use HTTPS instead",
    ),
    (
        r"verify\s*=\s*False",
        "SSL verification disabled — potential MITM vulnerability",
    ),
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class SafetyIssue:
    """One dangerous-pattern hit found in LLM output.

    Records the pattern and its description, the line number of the hit
    (default 0) and a severity label (default ``"warning"``).
    """

    pattern: str
    description: str
    line_number: int = 0
    severity: str = "warning"

    def to_dict(self) -> dict[str, Any]:
        """Return this issue as a plain dictionary."""
        return dict(
            pattern=self.pattern,
            description=self.description,
            line_number=self.line_number,
            severity=self.severity,
        )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class SafetyReport:
    """Outcome of one safety validation pass.

    ``safe`` defaults to ``True`` and ``issues`` to an empty list.
    """

    safe: bool = True
    issues: list[SafetyIssue] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dictionary.

        Includes the ``safe`` flag, the number of issues and each issue
        serialised through its own ``to_dict``.
        """
        serialised = [issue.to_dict() for issue in self.issues]
        return dict(
            safe=self.safe,
            issue_count=len(self.issues),
            issues=serialised,
        )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class SafetyValidator:
    """Scans LLM-generated code for a list of known dangerous patterns."""

    def __init__(self, extra_patterns: list[tuple[str, str]] | None = None) -> None:
        """Start from the built-in patterns and append any *extra_patterns*.

        Each pattern is a ``(regex, description)`` pair.
        """
        self._patterns = [*_DANGEROUS_PATTERNS, *(extra_patterns or [])]

    def validate(self, code: str) -> SafetyReport:
        """Check every line of ``code`` against every pattern.

        Matching is case-insensitive and done one line at a time. One issue
        is recorded per matching (line, pattern) pair, with 1-based line
        numbers. The returned report's ``safe`` flag is True only when no
        issue was recorded.
        """
        found = [
            SafetyIssue(pattern=regex, description=reason, line_number=idx)
            for idx, text in enumerate(code.splitlines(), start=1)
            for regex, reason in self._patterns
            if re.search(regex, text, re.IGNORECASE)
        ]
        return SafetyReport(safe=not found, issues=found)

    def is_safe(self, code: str) -> bool:
        """Return True when ``validate`` reports no issues for ``code``."""
        report = self.validate(code)
        return report.safe
|