ai-coding-assistant 0.5.0 (ai_coding_assistant-0.5.0-py3-none-any.whl)
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects changes between versions as they appear in the public registry.
- ai_coding_assistant-0.5.0.dist-info/METADATA +226 -0
- ai_coding_assistant-0.5.0.dist-info/RECORD +89 -0
- ai_coding_assistant-0.5.0.dist-info/WHEEL +4 -0
- ai_coding_assistant-0.5.0.dist-info/entry_points.txt +3 -0
- ai_coding_assistant-0.5.0.dist-info/licenses/LICENSE +21 -0
- coding_assistant/__init__.py +3 -0
- coding_assistant/__main__.py +19 -0
- coding_assistant/cli/__init__.py +1 -0
- coding_assistant/cli/app.py +158 -0
- coding_assistant/cli/commands/__init__.py +19 -0
- coding_assistant/cli/commands/ask.py +178 -0
- coding_assistant/cli/commands/config.py +438 -0
- coding_assistant/cli/commands/diagram.py +267 -0
- coding_assistant/cli/commands/document.py +410 -0
- coding_assistant/cli/commands/explain.py +192 -0
- coding_assistant/cli/commands/fix.py +249 -0
- coding_assistant/cli/commands/index.py +162 -0
- coding_assistant/cli/commands/refactor.py +245 -0
- coding_assistant/cli/commands/search.py +182 -0
- coding_assistant/cli/commands/serve_docs.py +128 -0
- coding_assistant/cli/repl.py +381 -0
- coding_assistant/cli/theme.py +90 -0
- coding_assistant/codebase/__init__.py +1 -0
- coding_assistant/codebase/crawler.py +93 -0
- coding_assistant/codebase/parser.py +266 -0
- coding_assistant/config/__init__.py +25 -0
- coding_assistant/config/config_manager.py +615 -0
- coding_assistant/config/settings.py +82 -0
- coding_assistant/context/__init__.py +19 -0
- coding_assistant/context/chunker.py +443 -0
- coding_assistant/context/enhanced_retriever.py +322 -0
- coding_assistant/context/hybrid_search.py +311 -0
- coding_assistant/context/ranker.py +355 -0
- coding_assistant/context/retriever.py +119 -0
- coding_assistant/context/window.py +362 -0
- coding_assistant/documentation/__init__.py +23 -0
- coding_assistant/documentation/agents/__init__.py +27 -0
- coding_assistant/documentation/agents/coordinator.py +510 -0
- coding_assistant/documentation/agents/module_documenter.py +111 -0
- coding_assistant/documentation/agents/synthesizer.py +139 -0
- coding_assistant/documentation/agents/task_delegator.py +100 -0
- coding_assistant/documentation/decomposition/__init__.py +21 -0
- coding_assistant/documentation/decomposition/context_preserver.py +477 -0
- coding_assistant/documentation/decomposition/module_detector.py +302 -0
- coding_assistant/documentation/decomposition/partitioner.py +621 -0
- coding_assistant/documentation/generators/__init__.py +14 -0
- coding_assistant/documentation/generators/dataflow_generator.py +440 -0
- coding_assistant/documentation/generators/diagram_generator.py +511 -0
- coding_assistant/documentation/graph/__init__.py +13 -0
- coding_assistant/documentation/graph/dependency_builder.py +468 -0
- coding_assistant/documentation/graph/module_analyzer.py +475 -0
- coding_assistant/documentation/writers/__init__.py +11 -0
- coding_assistant/documentation/writers/markdown_writer.py +322 -0
- coding_assistant/embeddings/__init__.py +0 -0
- coding_assistant/embeddings/generator.py +89 -0
- coding_assistant/embeddings/store.py +187 -0
- coding_assistant/exceptions/__init__.py +50 -0
- coding_assistant/exceptions/base.py +110 -0
- coding_assistant/exceptions/llm.py +249 -0
- coding_assistant/exceptions/recovery.py +263 -0
- coding_assistant/exceptions/storage.py +213 -0
- coding_assistant/exceptions/validation.py +230 -0
- coding_assistant/llm/__init__.py +1 -0
- coding_assistant/llm/client.py +277 -0
- coding_assistant/llm/gemini_client.py +181 -0
- coding_assistant/llm/groq_client.py +160 -0
- coding_assistant/llm/prompts.py +98 -0
- coding_assistant/llm/together_client.py +160 -0
- coding_assistant/operations/__init__.py +13 -0
- coding_assistant/operations/differ.py +369 -0
- coding_assistant/operations/generator.py +347 -0
- coding_assistant/operations/linter.py +430 -0
- coding_assistant/operations/validator.py +406 -0
- coding_assistant/storage/__init__.py +9 -0
- coding_assistant/storage/database.py +363 -0
- coding_assistant/storage/session.py +231 -0
- coding_assistant/utils/__init__.py +31 -0
- coding_assistant/utils/cache.py +477 -0
- coding_assistant/utils/hardware.py +132 -0
- coding_assistant/utils/keystore.py +206 -0
- coding_assistant/utils/logger.py +32 -0
- coding_assistant/utils/progress.py +311 -0
- coding_assistant/validation/__init__.py +13 -0
- coding_assistant/validation/files.py +305 -0
- coding_assistant/validation/inputs.py +335 -0
- coding_assistant/validation/params.py +280 -0
- coding_assistant/validation/sanitizers.py +243 -0
- coding_assistant/vcs/__init__.py +5 -0
- coding_assistant/vcs/git.py +269 -0
coding_assistant/context/ranker.py
@@ -0,0 +1,355 @@
"""Context ranking with language-aware scoring."""

import math
from typing import List, Dict, Optional
from datetime import datetime, timedelta
from pathlib import Path


class ContextRanker:
    """
    Rank retrieved code chunks based on multiple factors.

    Scoring factors:
    - Semantic similarity (40%)
    - File proximity (20%)
    - Dependency distance (20%)
    - Recency (10%)
    - Code popularity/references (10%)
    """

    def __init__(self, dependency_graph=None):
        """
        Initialize the context ranker.

        Args:
            dependency_graph: Optional dependency graph for file relationships
        """
        self.dependency_graph = dependency_graph

        # Scoring weights
        self.weights = {
            'similarity': 0.4,
            'file_proximity': 0.2,
            'dependency': 0.2,
            'recency': 0.1,
            'popularity': 0.1
        }

    def rank(self, chunks: List[Dict], query: str,
             current_file: Optional[str] = None,
             language: Optional[str] = None) -> List[Dict]:
        """
        Rank chunks based on relevance to query and context.

        Args:
            chunks: List of chunks with metadata (must have 'id', 'similarity', etc.)
            query: The user's query
            current_file: Current file being edited (for proximity bonus)
            language: Programming language (for language-specific heuristics)

        Returns:
            Ranked list of chunks with scores
        """
        scored_chunks = []

        for chunk in chunks:
            score = self._calculate_score(chunk, query, current_file, language)
            chunk['rank_score'] = score
            scored_chunks.append(chunk)

        # Sort by rank score (descending)
        ranked = sorted(scored_chunks, key=lambda x: x['rank_score'], reverse=True)

        return ranked

    def _calculate_score(self, chunk: Dict, query: str,
                         current_file: Optional[str],
                         language: Optional[str]) -> float:
        """Calculate overall relevance score for a chunk."""
        score = 0.0

        # 1. Semantic similarity (from vector/hybrid search)
        similarity = chunk.get('similarity', chunk.get('vector_score', 0.0))
        score += similarity * self.weights['similarity']

        # 2. File proximity
        if current_file and 'file_path' in chunk:
            proximity_score = self._file_proximity_score(chunk['file_path'], current_file)
            score += proximity_score * self.weights['file_proximity']

        # 3. Dependency distance
        if current_file and self.dependency_graph and 'file_path' in chunk:
            dep_score = self._dependency_score(chunk['file_path'], current_file)
            score += dep_score * self.weights['dependency']

        # 4. Recency
        if 'last_modified' in chunk:
            recency_score = self._recency_score(chunk['last_modified'])
            score += recency_score * self.weights['recency']

        # 5. Popularity
        if 'reference_count' in chunk:
            popularity_score = self._popularity_score(chunk['reference_count'])
            score += popularity_score * self.weights['popularity']

        # 6. Language-specific boosts
        if language:
            lang_boost = self._language_specific_boost(chunk, query, language)
            score += lang_boost

        return score

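    # Worked example with hypothetical numbers: a chunk with similarity 0.9 that
    # sits in the same directory as the current file scores
    # 0.9 * 0.4 + 0.8 * 0.2 = 0.52 before the dependency, recency, and
    # popularity terms are added.
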
    def _file_proximity_score(self, file_path: str, current_file: str) -> float:
        """
        Score based on how close two files are in the directory structure.

        Returns:
            Score between 0 and 1
        """
        if file_path == current_file:
            return 1.0

        # Convert to Path objects
        path1 = Path(file_path)
        path2 = Path(current_file)

        # Same directory: high score
        if path1.parent == path2.parent:
            return 0.8

        # Check how many directory levels apart
        try:
            # Get relative path
            rel_path = path1.relative_to(path2.parent)
            levels = len(rel_path.parts) - 1
            # Closer = higher score
            return max(0.0, 1.0 - (levels * 0.2))
        except ValueError:
            # Not in the same tree; fall back to counting common leading directories
            common_parts = 0
            for p1, p2 in zip(path1.parts, path2.parts):
                if p1 == p2:
                    common_parts += 1
                else:
                    break

            # More common directories = higher score
            return min(1.0, common_parts * 0.15)

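    # Worked example with hypothetical paths: "src/a/sub/x.py" relative to
    # "src/a/y.py" is one level down, scoring 1.0 - 1 * 0.2 = 0.8; for unrelated
    # trees such as "src/a/x.py" vs. "lib/b/y.py" the fallback counts shared
    # leading components (here none), scoring 0.15 per shared directory.
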
    def _dependency_score(self, file_path: str, current_file: str) -> float:
        """
        Score based on dependency distance in the import graph.

        Returns:
            Score between 0 and 1
        """
        if not self.dependency_graph:
            return 0.0

        try:
            # Get shortest-path distance in the dependency graph
            distance = self.dependency_graph.get_distance(current_file, file_path)

            if distance == 0:
                return 1.0
            elif distance == 1:
                # Direct dependency
                return 0.8
            elif distance == 2:
                # Second-degree dependency
                return 0.5
            else:
                # Further away
                return max(0.0, 1.0 - (distance * 0.2))
        except Exception:
            # Missing nodes or unreachable files contribute nothing
            return 0.0

    def _recency_score(self, last_modified: datetime) -> float:
        """
        Score based on how recently the file was modified.

        Returns:
            Score between 0 and 1
        """
        if isinstance(last_modified, str):
            try:
                last_modified = datetime.fromisoformat(last_modified)
            except ValueError:
                return 0.5  # Default if the timestamp can't be parsed

        now = datetime.now()
        age = now - last_modified

        # Files modified in the last day: 1.0
        if age < timedelta(days=1):
            return 1.0
        # Last week: 0.8
        elif age < timedelta(weeks=1):
            return 0.8
        # Last month: 0.5
        elif age < timedelta(days=30):
            return 0.5
        # Last 3 months: 0.3
        elif age < timedelta(days=90):
            return 0.3
        # Older: decay linearly over a year
        else:
            days_old = age.days
            return max(0.0, 1.0 - (days_old / 365))

    def _popularity_score(self, reference_count: int) -> float:
        """
        Score based on how many times the code is referenced.

        Returns:
            Score between 0 and 1
        """
        # Logarithmic scaling:
        # 0 refs: 0.0, 1 ref: ~0.1, 10 refs: ~0.35, 100 refs: ~0.67, 1000 refs: 1.0
        if reference_count == 0:
            return 0.0

        return min(1.0, math.log10(reference_count + 1) / 3)

    def _language_specific_boost(self, chunk: Dict, query: str, language: str) -> float:
        """
        Apply language-specific heuristics for ranking.

        Returns:
            Boost score (can be positive or negative, typically -0.1 to 0.2)
        """
        boost = 0.0
        file_path = chunk.get('file_path', '').lower()
        query_lower = query.lower()

        if language == 'python':
            # Boost test files if the query mentions "test"
            if 'test' in query_lower and 'test_' in file_path:
                boost += 0.15

            # Boost __init__.py for module/package questions
            if '__init__.py' in file_path:
                if any(word in query_lower for word in ['module', 'package', 'import']):
                    boost += 0.1

            # Boost setup.py and pyproject.toml for dependency questions
            if 'setup.py' in file_path or 'pyproject.toml' in file_path:
                if any(word in query_lower for word in ['dependency', 'install', 'package']):
                    boost += 0.15

            # Boost main.py and app.py for entry-point questions
            if 'main.py' in file_path or 'app.py' in file_path:
                if any(word in query_lower for word in ['start', 'entry', 'run', 'main']):
                    boost += 0.1

        elif language in ('javascript', 'typescript'):
            # Boost index.js, main.js, app.js for entry points
            if any(name in file_path for name in ['index.js', 'main.js', 'app.js', 'index.ts', 'main.ts']):
                if any(word in query_lower for word in ['start', 'entry', 'run', 'main']):
                    boost += 0.1

            # Boost .tsx/.jsx for component questions
            if file_path.endswith(('.tsx', '.jsx')):
                if any(word in query_lower for word in ['component', 'render', 'ui', 'view']):
                    boost += 0.15

            # Boost package.json for dependency questions
            if 'package.json' in file_path:
                if any(word in query_lower for word in ['dependency', 'install', 'package', 'npm']):
                    boost += 0.15

            # Boost test files
            if any(pattern in file_path for pattern in ['.test.', '.spec.', '__tests__']):
                if 'test' in query_lower:
                    boost += 0.15

            # Boost config files for configuration questions
            if any(pattern in file_path for pattern in ['config.', '.config.', 'webpack', 'vite']):
                if any(word in query_lower for word in ['config', 'setup', 'build']):
                    boost += 0.1

        # Generic boosts (language-agnostic)

        # Boost README files for overview questions
        if 'readme' in file_path:
            if any(word in query_lower for word in ['what', 'overview', 'about', 'intro']):
                boost += 0.15

        # Boost documentation files
        if any(pattern in file_path for pattern in ['docs/', 'documentation/']):
            if any(word in query_lower for word in ['how', 'guide', 'tutorial', 'example']):
                boost += 0.1

        # Penalize non-code files for code-specific questions
        if any(word in query_lower for word in ['function', 'class', 'implement', 'code']):
            if file_path.endswith(('.md', '.txt', '.json', '.yaml', '.yml')):
                boost -= 0.1

        return boost

    def update_weights(self, **kwargs):
        """
        Update scoring weights.

        Args:
            **kwargs: New weights (similarity, file_proximity, dependency, recency, popularity)
        """
        for key, value in kwargs.items():
            if key in self.weights:
                self.weights[key] = value

        # Normalize weights to sum to 1.0
        total = sum(self.weights.values())
        if total > 0:
            for key in self.weights:
                self.weights[key] /= total

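    # Example with hypothetical values: update_weights(similarity=0.6) first sets
    # the similarity weight to 0.6, bringing the sum to 1.2; normalization then
    # divides every weight by 1.2, leaving similarity at 0.5 and scaling the
    # remaining weights down proportionally.
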
    def get_weights(self) -> Dict[str, float]:
        """Get current scoring weights."""
        return self.weights.copy()

    def explain_ranking(self, chunk: Dict, query: str,
                        current_file: Optional[str] = None,
                        language: Optional[str] = None) -> Dict:
        """
        Explain the ranking score for a specific chunk.

        Returns:
            Dict with a breakdown of score components
        """
        similarity = chunk.get('similarity', chunk.get('vector_score', 0.0))

        explanation = {
            'total_score': chunk.get('rank_score', 0.0),
            'components': {
                'similarity': similarity * self.weights['similarity'],
                'file_proximity': 0.0,
                'dependency': 0.0,
                'recency': 0.0,
                'popularity': 0.0,
                'language_boost': 0.0
            }
        }

        if current_file and 'file_path' in chunk:
            prox = self._file_proximity_score(chunk['file_path'], current_file)
            explanation['components']['file_proximity'] = prox * self.weights['file_proximity']

        if current_file and self.dependency_graph and 'file_path' in chunk:
            dep = self._dependency_score(chunk['file_path'], current_file)
            explanation['components']['dependency'] = dep * self.weights['dependency']

        if 'last_modified' in chunk:
            rec = self._recency_score(chunk['last_modified'])
            explanation['components']['recency'] = rec * self.weights['recency']

        if 'reference_count' in chunk:
            pop = self._popularity_score(chunk['reference_count'])
            explanation['components']['popularity'] = pop * self.weights['popularity']

        if language:
            boost = self._language_specific_boost(chunk, query, language)
            explanation['components']['language_boost'] = boost

        return explanation

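A minimal usage sketch of the ranker above; the chunk dictionaries and their field values are hypothetical stand-ins for what the retrieval pipeline would supply:

    from coding_assistant.context.ranker import ContextRanker

    ranker = ContextRanker()  # no dependency graph, so the dependency term stays 0
    chunks = [
        {'id': 'a', 'similarity': 0.92, 'file_path': 'src/app/models.py'},
        {'id': 'b', 'similarity': 0.75, 'file_path': 'src/app/views.py', 'reference_count': 40},
    ]
    ranked = ranker.rank(chunks, query="where is the User model defined?",
                         current_file='src/app/views.py', language='python')
    for chunk in ranked:
        print(chunk['id'], round(chunk['rank_score'], 3))
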
coding_assistant/context/retriever.py
@@ -0,0 +1,119 @@
"""Semantic retriever for code search using embeddings."""
from pathlib import Path
from typing import List, Dict, Optional

from coding_assistant.codebase.crawler import CodebaseCrawler
from coding_assistant.codebase.parser import CodeParser
from coding_assistant.embeddings.generator import EmbeddingGenerator
from coding_assistant.embeddings.store import VectorStore


class SemanticRetriever:
    """Retrieve relevant code using semantic search."""

    def __init__(self, project_path: Path):
        """
        Initialize the semantic retriever.

        Args:
            project_path: Path to the project root
        """
        self.project_path = Path(project_path).resolve()
        self.crawler = CodebaseCrawler(self.project_path)
        self.parser = CodeParser()
        self.embedder = EmbeddingGenerator()
        self.store = VectorStore(persist_dir=self.project_path / ".coding_assistant" / "chroma_db")

    def clear_index(self):
        """Clear the existing index."""
        self.store.clear()

    def index_codebase(self, max_files: int = 100):
        """
        Index the codebase for semantic search.

        Args:
            max_files: Maximum number of files to index
        """
        # Scan files
        files = self.crawler.scan(max_files=max_files)

        all_chunks = []

        # Parse each file
        for file_info in files:
            try:
                content = self.crawler.read_file(file_info['path'])

                # Only parse Python files for now (the parser supports Python)
                if file_info['extension'] == '.py':
                    parsed = self.parser.parse_file(file_info['path'], content)
                    all_chunks.extend(parsed['chunks'])
                else:
                    # For non-Python files, create a simple file-level chunk
                    all_chunks.append({
                        'type': 'file',
                        'file_path': file_info['path'],
                        'content': content[:5000],  # Limit content size
                        'start_line': 0,
                        'end_line': len(content.split('\n'))
                    })
            except Exception as e:
                # Skip files that can't be read or parsed
                if getattr(self, '_verbose', False):
                    print(f"Warning: Could not parse {file_info['path']}: {e}")
                continue

        if not all_chunks:
            print("No chunks to index")
            return

        # Generate embeddings
        embedded_chunks = self.embedder.embed_code_chunks(all_chunks)

        # Store in vector database
        self.store.add_chunks(embedded_chunks)

    def get_stats(self) -> Dict:
        """
        Get statistics about the indexed codebase.

        Returns:
            Dictionary with stats including total_chunks and embedding_dimension
        """
        return {
            'total_chunks': self.store.count(),
            'embedding_dimension': self.embedder.dimension
        }

    def retrieve(self, query: str, k: int = 5) -> List[Dict]:
        """
        Retrieve relevant code chunks for a query.

        Args:
            query: User query/question
            k: Number of results to return

        Returns:
            List of relevant chunks with metadata
        """
        # Generate the query embedding
        query_embedding = self.embedder.generate_embedding(query)

        # Search the vector store
        results = self.store.search(query_embedding, n_results=k)

        # Format results to match the expected structure
        formatted_results = []
        for result in results:
            metadata = result['metadata']
            formatted_results.append({
                'path': metadata['file_path'],
                'type': metadata['type'],
                'similarity': result['similarity'],
                'start_line': metadata['start_line'],
                'end_line': metadata['end_line'],
                'content': result['content']
            })

        return formatted_results
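
A minimal end-to-end sketch of the retriever above, assuming the package is installed; the project path and query are hypothetical:

    from pathlib import Path
    from coding_assistant.context.retriever import SemanticRetriever

    retriever = SemanticRetriever(Path('~/projects/demo').expanduser())
    retriever.clear_index()                 # rebuild the index from scratch
    retriever.index_codebase(max_files=50)
    print(retriever.get_stats())            # e.g. {'total_chunks': ..., 'embedding_dimension': ...}

    for hit in retriever.retrieve('where is authentication handled?', k=3):
        print(f"{hit['path']}:{hit['start_line']}-{hit['end_line']} ({hit['similarity']:.2f})")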