cognify_code-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_code_assistant/__init__.py +14 -0
- ai_code_assistant/agent/__init__.py +63 -0
- ai_code_assistant/agent/code_agent.py +461 -0
- ai_code_assistant/agent/code_generator.py +388 -0
- ai_code_assistant/agent/code_reviewer.py +365 -0
- ai_code_assistant/agent/diff_engine.py +308 -0
- ai_code_assistant/agent/file_manager.py +300 -0
- ai_code_assistant/agent/intent_classifier.py +284 -0
- ai_code_assistant/chat/__init__.py +11 -0
- ai_code_assistant/chat/agent_session.py +156 -0
- ai_code_assistant/chat/session.py +165 -0
- ai_code_assistant/cli.py +1571 -0
- ai_code_assistant/config.py +149 -0
- ai_code_assistant/editor/__init__.py +8 -0
- ai_code_assistant/editor/diff_handler.py +270 -0
- ai_code_assistant/editor/file_editor.py +350 -0
- ai_code_assistant/editor/prompts.py +146 -0
- ai_code_assistant/generator/__init__.py +7 -0
- ai_code_assistant/generator/code_gen.py +265 -0
- ai_code_assistant/generator/prompts.py +114 -0
- ai_code_assistant/git/__init__.py +6 -0
- ai_code_assistant/git/commit_generator.py +130 -0
- ai_code_assistant/git/manager.py +203 -0
- ai_code_assistant/llm.py +111 -0
- ai_code_assistant/providers/__init__.py +23 -0
- ai_code_assistant/providers/base.py +124 -0
- ai_code_assistant/providers/cerebras.py +97 -0
- ai_code_assistant/providers/factory.py +148 -0
- ai_code_assistant/providers/google.py +103 -0
- ai_code_assistant/providers/groq.py +111 -0
- ai_code_assistant/providers/ollama.py +86 -0
- ai_code_assistant/providers/openai.py +114 -0
- ai_code_assistant/providers/openrouter.py +130 -0
- ai_code_assistant/py.typed +0 -0
- ai_code_assistant/refactor/__init__.py +20 -0
- ai_code_assistant/refactor/analyzer.py +189 -0
- ai_code_assistant/refactor/change_plan.py +172 -0
- ai_code_assistant/refactor/multi_file_editor.py +346 -0
- ai_code_assistant/refactor/prompts.py +175 -0
- ai_code_assistant/retrieval/__init__.py +19 -0
- ai_code_assistant/retrieval/chunker.py +215 -0
- ai_code_assistant/retrieval/indexer.py +236 -0
- ai_code_assistant/retrieval/search.py +239 -0
- ai_code_assistant/reviewer/__init__.py +7 -0
- ai_code_assistant/reviewer/analyzer.py +278 -0
- ai_code_assistant/reviewer/prompts.py +113 -0
- ai_code_assistant/utils/__init__.py +18 -0
- ai_code_assistant/utils/file_handler.py +155 -0
- ai_code_assistant/utils/formatters.py +259 -0
- cognify_code-0.2.0.dist-info/METADATA +383 -0
- cognify_code-0.2.0.dist-info/RECORD +55 -0
- cognify_code-0.2.0.dist-info/WHEEL +5 -0
- cognify_code-0.2.0.dist-info/entry_points.txt +3 -0
- cognify_code-0.2.0.dist-info/licenses/LICENSE +22 -0
- cognify_code-0.2.0.dist-info/top_level.txt +1 -0
ai_code_assistant/retrieval/chunker.py
@@ -0,0 +1,215 @@
"""
Code chunking strategies for semantic indexing.

Chunks code into meaningful segments for better retrieval.
"""

import re
from dataclasses import dataclass, field
from typing import List, Optional
from pathlib import Path


@dataclass
class CodeChunk:
    """Represents a chunk of code with metadata."""
    content: str
    file_path: str
    start_line: int
    end_line: int
    chunk_type: str  # function, class, module, block
    name: Optional[str] = None
    language: Optional[str] = None

    @property
    def id(self) -> str:
        """Generate unique ID for this chunk."""
        return f"{self.file_path}:{self.start_line}-{self.end_line}"

    def to_dict(self) -> dict:
        """Convert to dictionary for storage."""
        return {
            "content": self.content,
            "file_path": self.file_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "chunk_type": self.chunk_type,
            "name": self.name or "",
            "language": self.language or "",
        }


@dataclass
class CodeChunker:
    """Splits code files into semantic chunks."""

    chunk_size: int = 50  # Max lines per chunk
    chunk_overlap: int = 10  # Overlap between chunks

    # Language detection by extension
    LANGUAGE_MAP: dict = field(default_factory=lambda: {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".jsx": "javascript",
        ".tsx": "typescript",
        ".java": "java",
        ".go": "go",
        ".rs": "rust",
        ".rb": "ruby",
        ".php": "php",
        ".c": "c",
        ".cpp": "cpp",
        ".h": "c",
        ".hpp": "cpp",
        ".cs": "csharp",
        ".swift": "swift",
        ".kt": "kotlin",
        ".scala": "scala",
        ".sh": "bash",
        ".yaml": "yaml",
        ".yml": "yaml",
        ".json": "json",
        ".md": "markdown",
        ".sql": "sql",
    })

    # Patterns for detecting code boundaries
    PYTHON_PATTERNS = {
        "class": r"^class\s+(\w+)",
        "function": r"^(?:async\s+)?def\s+(\w+)",
    }

    def detect_language(self, file_path: str) -> Optional[str]:
        """Detect programming language from file extension."""
        ext = Path(file_path).suffix.lower()
        return self.LANGUAGE_MAP.get(ext)

    def chunk_file(self, file_path: str, content: str) -> List[CodeChunk]:
        """Split a file into semantic chunks."""
        language = self.detect_language(file_path)
        lines = content.split("\n")

        if language == "python":
            return self._chunk_python(file_path, lines, language)
        else:
            return self._chunk_generic(file_path, lines, language)

    def _chunk_python(self, file_path: str, lines: List[str], language: str) -> List[CodeChunk]:
        """Chunk Python code by class/function boundaries."""
        chunks = []
        current_chunk_start = 0
        current_chunk_lines = []
        current_name = None
        current_type = "module"

        for i, line in enumerate(lines):
            # Check for class definition
            class_match = re.match(self.PYTHON_PATTERNS["class"], line)
            func_match = re.match(self.PYTHON_PATTERNS["function"], line)

            if class_match or func_match:
                # Save previous chunk if exists
                if current_chunk_lines:
                    chunks.append(CodeChunk(
                        content="\n".join(current_chunk_lines),
                        file_path=file_path,
                        start_line=current_chunk_start + 1,
                        end_line=i,
                        chunk_type=current_type,
                        name=current_name,
                        language=language,
                    ))

                # Start new chunk
                current_chunk_start = i
                current_chunk_lines = [line]
                if class_match:
                    current_name = class_match.group(1)
                    current_type = "class"
                else:
                    current_name = func_match.group(1)
                    current_type = "function"
            else:
                current_chunk_lines.append(line)

                # Check if chunk is too large
                if len(current_chunk_lines) >= self.chunk_size:
                    chunks.append(CodeChunk(
                        content="\n".join(current_chunk_lines),
                        file_path=file_path,
                        start_line=current_chunk_start + 1,
                        end_line=i + 1,
                        chunk_type=current_type,
                        name=current_name,
                        language=language,
                    ))
                    # Start new chunk with overlap
                    overlap_start = max(0, len(current_chunk_lines) - self.chunk_overlap)
                    current_chunk_lines = current_chunk_lines[overlap_start:]
                    current_chunk_start = i - len(current_chunk_lines) + 1

        # Add final chunk
        if current_chunk_lines:
            chunks.append(CodeChunk(
                content="\n".join(current_chunk_lines),
                file_path=file_path,
                start_line=current_chunk_start + 1,
                end_line=len(lines),
                chunk_type=current_type,
                name=current_name,
                language=language,
            ))

        return chunks

    def _chunk_generic(self, file_path: str, lines: List[str], language: Optional[str]) -> List[CodeChunk]:
        """Chunk code using sliding window approach."""
        chunks = []
        total_lines = len(lines)

        if total_lines == 0:
            return chunks

        # If file is small enough, return as single chunk
        if total_lines <= self.chunk_size:
            chunks.append(CodeChunk(
                content="\n".join(lines),
                file_path=file_path,
                start_line=1,
                end_line=total_lines,
                chunk_type="module",
                name=Path(file_path).stem,
                language=language,
            ))
            return chunks

        # Use sliding window
        start = 0
        while start < total_lines:
            end = min(start + self.chunk_size, total_lines)
            chunk_lines = lines[start:end]

            chunks.append(CodeChunk(
                content="\n".join(chunk_lines),
                file_path=file_path,
                start_line=start + 1,
                end_line=end,
                chunk_type="block",
                name=f"{Path(file_path).stem}:{start+1}-{end}",
                language=language,
            ))

            # Move window with overlap
            start += self.chunk_size - self.chunk_overlap

            # Avoid tiny final chunks
            if total_lines - start < self.chunk_overlap:
                break

        return chunks

    def chunk_text(self, text: str, file_path: str = "unknown") -> List[CodeChunk]:
        """Convenience method to chunk text directly."""
        return self.chunk_file(file_path, text)
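For orientation, a minimal usage sketch of the chunker above, assuming the module path ai_code_assistant/retrieval/chunker.py from the file list; the sample source string and file name are hypothetical:

# Minimal usage sketch for CodeChunker (import path taken from the file list;
# the sample source below is a hypothetical illustration, not from the package).
from ai_code_assistant.retrieval.chunker import CodeChunker

sample = """\
import math

def area(radius):
    return math.pi * radius ** 2

class Circle:
    def __init__(self, radius):
        self.radius = radius
"""

chunker = CodeChunker(chunk_size=50, chunk_overlap=10)
chunks = chunker.chunk_file("circle.py", sample)

for chunk in chunks:
    # Each chunk carries its location, boundary type, and detected language.
    print(chunk.id, chunk.chunk_type, chunk.name, chunk.language)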
ai_code_assistant/retrieval/indexer.py
@@ -0,0 +1,236 @@
"""
Codebase indexer using ChromaDB and sentence-transformers.

Indexes code files for semantic search.
"""

import os
import hashlib
from pathlib import Path
from typing import List, Optional, Set
from dataclasses import dataclass, field

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from .chunker import CodeChunker, CodeChunk


@dataclass
class IndexStats:
    """Statistics about the index."""
    total_files: int = 0
    total_chunks: int = 0
    indexed_files: List[str] = field(default_factory=list)
    skipped_files: List[str] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)


@dataclass
class IndexConfig:
    """Configuration for the indexer."""
    # Embedding model (runs locally)
    embedding_model: str = "all-MiniLM-L6-v2"
    # ChromaDB persistence directory
    persist_directory: str = ".ai-assistant-index"
    # Collection name
    collection_name: str = "codebase"
    # File extensions to index
    extensions: Set[str] = field(default_factory=lambda: {
        ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rs",
        ".rb", ".php", ".c", ".cpp", ".h", ".hpp", ".cs", ".swift",
        ".kt", ".scala", ".sh", ".yaml", ".yml", ".json", ".md", ".sql",
        ".html", ".css", ".scss", ".vue", ".svelte",
    })
    # Directories to ignore
    ignore_dirs: Set[str] = field(default_factory=lambda: {
        ".git", ".svn", ".hg", "node_modules", "__pycache__", ".pytest_cache",
        ".mypy_cache", ".tox", ".venv", "venv", "env", ".env", "dist", "build",
        "target", ".idea", ".vscode", "coverage", ".coverage", "htmlcov",
        ".eggs", "*.egg-info", ".ai-assistant-index",
    })
    # Max file size in bytes (skip large files)
    max_file_size: int = 1024 * 1024  # 1MB


class CodebaseIndexer:
    """Indexes codebase into ChromaDB for semantic search."""

    def __init__(self, config: Optional[IndexConfig] = None, root_path: Optional[str] = None):
        """Initialize the indexer.

        Args:
            config: Index configuration
            root_path: Root directory of the codebase
        """
        self.config = config or IndexConfig()
        self.root_path = Path(root_path) if root_path else Path.cwd()
        self.chunker = CodeChunker()

        # Initialize embedding model (lazy load)
        self._embedder: Optional[SentenceTransformer] = None

        # Initialize ChromaDB
        persist_path = self.root_path / self.config.persist_directory
        self._client = chromadb.PersistentClient(
            path=str(persist_path),
            settings=Settings(anonymized_telemetry=False),
        )
        self._collection = self._client.get_or_create_collection(
            name=self.config.collection_name,
            metadata={"description": "Codebase index for semantic search"},
        )

    @property
    def embedder(self) -> SentenceTransformer:
        """Lazy load the embedding model."""
        if self._embedder is None:
            self._embedder = SentenceTransformer(self.config.embedding_model)
        return self._embedder

    def _should_index_file(self, file_path: Path) -> bool:
        """Check if a file should be indexed."""
        # Check extension
        if file_path.suffix.lower() not in self.config.extensions:
            return False

        # Check if in ignored directory
        for part in file_path.parts:
            if part in self.config.ignore_dirs:
                return False
        # Handle glob patterns like *.egg-info
        for pattern in self.config.ignore_dirs:
            if "*" in pattern and file_path.match(pattern):
                return False

        # Check file size
        try:
            if file_path.stat().st_size > self.config.max_file_size:
                return False
        except OSError:
            return False

        return True

    def _get_file_hash(self, content: str) -> str:
        """Get hash of file content for change detection."""
        return hashlib.md5(content.encode()).hexdigest()

    def _read_file(self, file_path: Path) -> Optional[str]:
        """Safely read a file."""
        try:
            return file_path.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            return None

    def index_file(self, file_path: Path, stats: IndexStats) -> bool:
        """Index a single file.

        Returns True if file was indexed, False if skipped.
        """
        content = self._read_file(file_path)
        if content is None:
            stats.errors.append(f"Could not read: {file_path}")
            return False

        # Get relative path for storage
        try:
            rel_path = str(file_path.relative_to(self.root_path))
        except ValueError:
            rel_path = str(file_path)

        # Chunk the file
        chunks = self.chunker.chunk_file(rel_path, content)
        if not chunks:
            stats.skipped_files.append(rel_path)
            return False

        # Generate embeddings
        texts = [chunk.content for chunk in chunks]
        embeddings = self.embedder.encode(texts, show_progress_bar=False).tolist()

        # Prepare data for ChromaDB
        ids = [chunk.id for chunk in chunks]
        documents = texts
        metadatas = [chunk.to_dict() for chunk in chunks]

        # Delete old chunks for this file (if re-indexing)
        try:
            existing = self._collection.get(where={"file_path": rel_path})
            if existing["ids"]:
                self._collection.delete(ids=existing["ids"])
        except Exception:
            pass  # Collection might be empty

        # Add to collection
        self._collection.add(
            ids=ids,
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas,
        )

        stats.indexed_files.append(rel_path)
        stats.total_chunks += len(chunks)
        return True

    def index_directory(self, directory: Optional[Path] = None, verbose: bool = True) -> IndexStats:
        """Index all files in a directory recursively.

        Args:
            directory: Directory to index (defaults to root_path)
            verbose: Print progress

        Returns:
            IndexStats with results
        """
        directory = directory or self.root_path
        stats = IndexStats()

        # Find all files to index
        files_to_index = []
        for file_path in directory.rglob("*"):
            if file_path.is_file() and self._should_index_file(file_path):
                files_to_index.append(file_path)

        stats.total_files = len(files_to_index)

        if verbose:
            print(f"Found {len(files_to_index)} files to index...")

        # Index each file
        for i, file_path in enumerate(files_to_index):
            if verbose and (i + 1) % 10 == 0:
                print(f"  Indexed {i + 1}/{len(files_to_index)} files...")

            try:
                self.index_file(file_path, stats)
            except Exception as e:
                stats.errors.append(f"{file_path}: {str(e)}")

        if verbose:
            print(f"\n✓ Indexed {len(stats.indexed_files)} files ({stats.total_chunks} chunks)")
            if stats.errors:
                print(f"⚠ {len(stats.errors)} errors occurred")

        return stats

    def clear_index(self) -> None:
        """Clear the entire index."""
        self._client.delete_collection(self.config.collection_name)
        self._collection = self._client.create_collection(
            name=self.config.collection_name,
            metadata={"description": "Codebase index for semantic search"},
        )

    def get_stats(self) -> dict:
        """Get statistics about the current index."""
        count = self._collection.count()
        return {
            "total_chunks": count,
            "collection_name": self.config.collection_name,
            "embedding_model": self.config.embedding_model,
            "root_path": str(self.root_path),
        }
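A minimal indexing sketch against the class above, assuming the module path ai_code_assistant/retrieval/indexer.py from the file list; the ./my-project path is hypothetical:

# Minimal usage sketch for CodebaseIndexer (import path taken from the file list;
# "./my-project" is a hypothetical repository path).
from ai_code_assistant.retrieval.indexer import CodebaseIndexer, IndexConfig

config = IndexConfig(persist_directory=".ai-assistant-index")
indexer = CodebaseIndexer(config=config, root_path="./my-project")

# Walk the tree, chunk each eligible file, embed locally, and store in ChromaDB.
stats = indexer.index_directory(verbose=True)
print(f"Indexed {len(stats.indexed_files)} files, {stats.total_chunks} chunks")
print(indexer.get_stats())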
ai_code_assistant/retrieval/search.py
@@ -0,0 +1,239 @@
"""
Semantic search over indexed codebase.

Provides natural language search capabilities.
"""

from dataclasses import dataclass, field
from typing import List, Optional
from pathlib import Path

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from .indexer import IndexConfig


@dataclass
class SearchResult:
    """A single search result."""
    content: str
    file_path: str
    start_line: int
    end_line: int
    chunk_type: str
    name: str
    language: str
    score: float  # Similarity score (0-1, higher is better)

    def __str__(self) -> str:
        """Format result for display."""
        return (
            f"📄 {self.file_path}:{self.start_line}-{self.end_line} "
            f"({self.chunk_type}: {self.name}) [score: {self.score:.3f}]"
        )

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "content": self.content,
            "file_path": self.file_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "chunk_type": self.chunk_type,
            "name": self.name,
            "language": self.language,
            "score": self.score,
        }


@dataclass
class SearchResponse:
    """Response from a search query."""
    query: str
    results: List[SearchResult] = field(default_factory=list)
    total_results: int = 0

    @property
    def has_results(self) -> bool:
        return len(self.results) > 0

    def format_for_llm(self, max_results: int = 5) -> str:
        """Format results as context for LLM prompts."""
        if not self.results:
            return "No relevant code found."

        parts = [f"Found {len(self.results)} relevant code sections:\n"]

        for i, result in enumerate(self.results[:max_results], 1):
            parts.append(f"\n--- Result {i}: {result.file_path}:{result.start_line}-{result.end_line} ---")
            parts.append(f"Type: {result.chunk_type} | Name: {result.name}")
            parts.append(f"```{result.language or ''}")
            parts.append(result.content)
            parts.append("```\n")

        return "\n".join(parts)


class CodebaseSearch:
    """Semantic search over indexed codebase."""

    def __init__(self, config: Optional[IndexConfig] = None, root_path: Optional[str] = None):
        """Initialize search.

        Args:
            config: Index configuration (must match indexer config)
            root_path: Root directory of the codebase
        """
        self.config = config or IndexConfig()
        self.root_path = Path(root_path) if root_path else Path.cwd()

        # Initialize embedding model (lazy load)
        self._embedder: Optional[SentenceTransformer] = None

        # Connect to ChromaDB
        persist_path = self.root_path / self.config.persist_directory
        if not persist_path.exists():
            raise FileNotFoundError(
                f"Index not found at {persist_path}. Run 'ai-assist index' first."
            )

        self._client = chromadb.PersistentClient(
            path=str(persist_path),
            settings=Settings(anonymized_telemetry=False),
        )

        try:
            self._collection = self._client.get_collection(self.config.collection_name)
        except Exception:
            raise FileNotFoundError(
                f"Collection '{self.config.collection_name}' not found. Run 'ai-assist index' first."
            )

    @property
    def embedder(self) -> SentenceTransformer:
        """Lazy load the embedding model."""
        if self._embedder is None:
            self._embedder = SentenceTransformer(self.config.embedding_model)
        return self._embedder

    def search(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.0,
        file_filter: Optional[str] = None,
        language_filter: Optional[str] = None,
    ) -> SearchResponse:
        """Search for relevant code.

        Args:
            query: Natural language search query
            top_k: Maximum number of results
            min_score: Minimum similarity score (0-1)
            file_filter: Filter by file path (substring match)
            language_filter: Filter by programming language

        Returns:
            SearchResponse with results
        """
        # Generate query embedding
        query_embedding = self.embedder.encode(query, show_progress_bar=False).tolist()

        # Build where clause for filtering
        where = None
        if language_filter:
            where = {"language": language_filter}

        # Search ChromaDB
        results = self._collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            where=where,
            include=["documents", "metadatas", "distances"],
        )

        # Process results
        search_results = []

        if results["documents"] and results["documents"][0]:
            documents = results["documents"][0]
            metadatas = results["metadatas"][0] if results["metadatas"] else [{}] * len(documents)
            distances = results["distances"][0] if results["distances"] else [0] * len(documents)

            for doc, meta, distance in zip(documents, metadatas, distances):
                # Convert distance to similarity score (ChromaDB returns L2 distance)
                # Lower distance = more similar, so we convert
                score = 1 / (1 + distance)

                # Apply minimum score filter
                if score < min_score:
                    continue

                # Apply file filter
                file_path = meta.get("file_path", "")
                if file_filter and file_filter.lower() not in file_path.lower():
                    continue

                search_results.append(SearchResult(
                    content=doc,
                    file_path=file_path,
                    start_line=int(meta.get("start_line", 0)),
                    end_line=int(meta.get("end_line", 0)),
                    chunk_type=meta.get("chunk_type", "unknown"),
                    name=meta.get("name", ""),
                    language=meta.get("language", ""),
                    score=score,
                ))

        return SearchResponse(
            query=query,
            results=search_results,
            total_results=len(search_results),
        )

    def search_similar(self, code: str, top_k: int = 5) -> SearchResponse:
        """Find code similar to the given code snippet.

        Args:
            code: Code snippet to find similar code for
            top_k: Maximum number of results

        Returns:
            SearchResponse with similar code
        """
        return self.search(query=code, top_k=top_k)

    def get_file_context(self, file_path: str) -> List[SearchResult]:
        """Get all indexed chunks for a specific file.

        Args:
            file_path: Path to the file

        Returns:
            List of SearchResult for the file
        """
        results = self._collection.get(
            where={"file_path": file_path},
            include=["documents", "metadatas"],
        )

        search_results = []
        if results["documents"]:
            for doc, meta in zip(results["documents"], results["metadatas"]):
                search_results.append(SearchResult(
                    content=doc,
                    file_path=meta.get("file_path", ""),
                    start_line=int(meta.get("start_line", 0)),
                    end_line=int(meta.get("end_line", 0)),
                    chunk_type=meta.get("chunk_type", "unknown"),
                    name=meta.get("name", ""),
                    language=meta.get("language", ""),
                    score=1.0,  # Not a similarity search
                ))

        # Sort by line number
        search_results.sort(key=lambda x: x.start_line)
        return search_results
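A minimal search sketch against the class above, assuming an index was already built by CodebaseIndexer and the module path ai_code_assistant/retrieval/search.py from the file list; the query string and project path are hypothetical:

# Minimal usage sketch for CodebaseSearch (import path taken from the file list;
# assumes an existing index under "./my-project/.ai-assistant-index").
from ai_code_assistant.retrieval.search import CodebaseSearch

search = CodebaseSearch(root_path="./my-project")  # raises FileNotFoundError if no index exists
response = search.search(
    "where are file chunks embedded?",  # hypothetical natural-language query
    top_k=5,
    language_filter="python",
)

for result in response.results:
    print(result)  # "📄 path:start-end (type: name) [score: ...]"

# format_for_llm() packs the top results into a prompt-ready context string.
print(response.format_for_llm(max_results=3))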
ai_code_assistant/reviewer/__init__.py
@@ -0,0 +1,7 @@
"""Code review module for AI Code Assistant."""

from ai_code_assistant.reviewer.analyzer import CodeAnalyzer, ReviewResult, ReviewIssue
from ai_code_assistant.reviewer.prompts import REVIEW_PROMPTS

__all__ = ["CodeAnalyzer", "ReviewResult", "ReviewIssue", "REVIEW_PROMPTS"]