ai_coding_assistant-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_coding_assistant-0.5.0.dist-info/METADATA +226 -0
- ai_coding_assistant-0.5.0.dist-info/RECORD +89 -0
- ai_coding_assistant-0.5.0.dist-info/WHEEL +4 -0
- ai_coding_assistant-0.5.0.dist-info/entry_points.txt +3 -0
- ai_coding_assistant-0.5.0.dist-info/licenses/LICENSE +21 -0
- coding_assistant/__init__.py +3 -0
- coding_assistant/__main__.py +19 -0
- coding_assistant/cli/__init__.py +1 -0
- coding_assistant/cli/app.py +158 -0
- coding_assistant/cli/commands/__init__.py +19 -0
- coding_assistant/cli/commands/ask.py +178 -0
- coding_assistant/cli/commands/config.py +438 -0
- coding_assistant/cli/commands/diagram.py +267 -0
- coding_assistant/cli/commands/document.py +410 -0
- coding_assistant/cli/commands/explain.py +192 -0
- coding_assistant/cli/commands/fix.py +249 -0
- coding_assistant/cli/commands/index.py +162 -0
- coding_assistant/cli/commands/refactor.py +245 -0
- coding_assistant/cli/commands/search.py +182 -0
- coding_assistant/cli/commands/serve_docs.py +128 -0
- coding_assistant/cli/repl.py +381 -0
- coding_assistant/cli/theme.py +90 -0
- coding_assistant/codebase/__init__.py +1 -0
- coding_assistant/codebase/crawler.py +93 -0
- coding_assistant/codebase/parser.py +266 -0
- coding_assistant/config/__init__.py +25 -0
- coding_assistant/config/config_manager.py +615 -0
- coding_assistant/config/settings.py +82 -0
- coding_assistant/context/__init__.py +19 -0
- coding_assistant/context/chunker.py +443 -0
- coding_assistant/context/enhanced_retriever.py +322 -0
- coding_assistant/context/hybrid_search.py +311 -0
- coding_assistant/context/ranker.py +355 -0
- coding_assistant/context/retriever.py +119 -0
- coding_assistant/context/window.py +362 -0
- coding_assistant/documentation/__init__.py +23 -0
- coding_assistant/documentation/agents/__init__.py +27 -0
- coding_assistant/documentation/agents/coordinator.py +510 -0
- coding_assistant/documentation/agents/module_documenter.py +111 -0
- coding_assistant/documentation/agents/synthesizer.py +139 -0
- coding_assistant/documentation/agents/task_delegator.py +100 -0
- coding_assistant/documentation/decomposition/__init__.py +21 -0
- coding_assistant/documentation/decomposition/context_preserver.py +477 -0
- coding_assistant/documentation/decomposition/module_detector.py +302 -0
- coding_assistant/documentation/decomposition/partitioner.py +621 -0
- coding_assistant/documentation/generators/__init__.py +14 -0
- coding_assistant/documentation/generators/dataflow_generator.py +440 -0
- coding_assistant/documentation/generators/diagram_generator.py +511 -0
- coding_assistant/documentation/graph/__init__.py +13 -0
- coding_assistant/documentation/graph/dependency_builder.py +468 -0
- coding_assistant/documentation/graph/module_analyzer.py +475 -0
- coding_assistant/documentation/writers/__init__.py +11 -0
- coding_assistant/documentation/writers/markdown_writer.py +322 -0
- coding_assistant/embeddings/__init__.py +0 -0
- coding_assistant/embeddings/generator.py +89 -0
- coding_assistant/embeddings/store.py +187 -0
- coding_assistant/exceptions/__init__.py +50 -0
- coding_assistant/exceptions/base.py +110 -0
- coding_assistant/exceptions/llm.py +249 -0
- coding_assistant/exceptions/recovery.py +263 -0
- coding_assistant/exceptions/storage.py +213 -0
- coding_assistant/exceptions/validation.py +230 -0
- coding_assistant/llm/__init__.py +1 -0
- coding_assistant/llm/client.py +277 -0
- coding_assistant/llm/gemini_client.py +181 -0
- coding_assistant/llm/groq_client.py +160 -0
- coding_assistant/llm/prompts.py +98 -0
- coding_assistant/llm/together_client.py +160 -0
- coding_assistant/operations/__init__.py +13 -0
- coding_assistant/operations/differ.py +369 -0
- coding_assistant/operations/generator.py +347 -0
- coding_assistant/operations/linter.py +430 -0
- coding_assistant/operations/validator.py +406 -0
- coding_assistant/storage/__init__.py +9 -0
- coding_assistant/storage/database.py +363 -0
- coding_assistant/storage/session.py +231 -0
- coding_assistant/utils/__init__.py +31 -0
- coding_assistant/utils/cache.py +477 -0
- coding_assistant/utils/hardware.py +132 -0
- coding_assistant/utils/keystore.py +206 -0
- coding_assistant/utils/logger.py +32 -0
- coding_assistant/utils/progress.py +311 -0
- coding_assistant/validation/__init__.py +13 -0
- coding_assistant/validation/files.py +305 -0
- coding_assistant/validation/inputs.py +335 -0
- coding_assistant/validation/params.py +280 -0
- coding_assistant/validation/sanitizers.py +243 -0
- coding_assistant/vcs/__init__.py +5 -0
- coding_assistant/vcs/git.py +269 -0
coding_assistant/context/enhanced_retriever.py
@@ -0,0 +1,322 @@
"""Enhanced semantic retriever with hybrid search and context ranking."""

from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime

from coding_assistant.codebase.crawler import CodebaseCrawler
from coding_assistant.codebase.parser import CodeParser
from coding_assistant.embeddings.generator import EmbeddingGenerator
from coding_assistant.embeddings.store import VectorStore
from coding_assistant.context.chunker import SmartChunker
from coding_assistant.context.hybrid_search import HybridSearch
from coding_assistant.context.ranker import ContextRanker
from coding_assistant.vcs.git import GitIntegration


class EnhancedSemanticRetriever:
    """
    Enhanced retriever with hybrid search and intelligent ranking.

    Combines:
    - Vector similarity search
    - BM25 keyword search
    - Smart code chunking
    - Context-aware ranking
    """

    def __init__(self, project_path: Path, dependency_graph=None):
        """
        Initialize the enhanced retriever.

        Args:
            project_path: Path to the project root
            dependency_graph: Optional dependency graph for file relationships
        """
        self.project_path = Path(project_path).resolve()
        self.crawler = CodebaseCrawler(self.project_path)
        self.parser = CodeParser()
        self.embedder = EmbeddingGenerator()

        # Vector store
        persist_dir = self.project_path / ".coding_assistant" / "chroma_db"
        self.store = VectorStore(persist_dir=persist_dir)

        # Smart chunker
        self.chunker = SmartChunker(max_chunk_tokens=500)

        # Hybrid search (will be initialized after indexing)
        self.hybrid_search = None

        # Context ranker
        self.ranker = ContextRanker(dependency_graph=dependency_graph)

        # Git integration (optional - won't fail if not a git repo)
        try:
            self.git = GitIntegration(str(self.project_path))
        except Exception:
            self.git = None

        # Cache for indexed chunks
        self._indexed_chunks = []

    def clear_index(self):
        """Clear the existing index."""
        self.store.clear()
        self._indexed_chunks = []
        self.hybrid_search = None

    def index_codebase(self, max_files: int = 100, force_reindex: bool = False):
        """
        Index the codebase with smart chunking.

        Args:
            max_files: Maximum number of files to index
            force_reindex: Force re-indexing even if already indexed
        """
        if self._indexed_chunks and not force_reindex:
            print("Codebase already indexed. Use force_reindex=True to re-index.")
            return

        print("Indexing codebase with smart chunking...")

        # Scan files
        files = self.crawler.scan(max_files=max_files)

        all_chunks = []

        # Parse and chunk each file
        for file_info in files:
            try:
                content = self.crawler.read_file(file_info['path'])
                language = self._detect_language(file_info['extension'])

                if language in ('python', 'javascript', 'typescript', 'jsx', 'tsx'):
                    # Use smart chunking for supported languages
                    chunks = self.chunker.chunk_code(
                        content,
                        file_info['path'],
                        language
                    )

                    # Convert to dict format
                    for chunk in chunks:
                        all_chunks.append({
                            'type': chunk.type,
                            'file_path': chunk.file_path,
                            'content': chunk.content,
                            'start_line': chunk.start_line,
                            'end_line': chunk.end_line,
                            'language': chunk.language,
                            'name': chunk.name,
                            'docstring': chunk.docstring,
                            'last_modified': datetime.fromtimestamp(
                                Path(chunk.file_path).stat().st_mtime
                            ).isoformat()
                        })
                else:
                    # For other files, create simple file-level chunk
                    all_chunks.append({
                        'type': 'file',
                        'file_path': file_info['path'],
                        'content': content[:5000],
                        'start_line': 0,
                        'end_line': len(content.split('\n')),
                        'language': language,
                        'name': Path(file_info['path']).name,
                        'last_modified': datetime.fromtimestamp(
                            Path(file_info['path']).stat().st_mtime
                        ).isoformat()
                    })

            except Exception as e:
                print(f"Warning: Could not process {file_info['path']}: {e}")
                continue

        if not all_chunks:
            print("No chunks to index")
            return

        print(f"Generating embeddings for {len(all_chunks)} chunks...")

        # Generate embeddings
        embedded_chunks = self.embedder.embed_code_chunks(all_chunks)

        # Store in vector database
        print("Storing in vector database...")
        self.store.add_chunks(embedded_chunks)

        # Cache chunks for BM25
        self._indexed_chunks = embedded_chunks

        # Initialize hybrid search
        print("Building BM25 index...")
        self.hybrid_search = HybridSearch(self.store, self.embedder)
        self.hybrid_search.index_corpus(embedded_chunks)

        print(f"✓ Indexed {len(all_chunks)} code chunks")

    def retrieve(self, query: str, k: int = 5,
                 current_file: Optional[str] = None,
                 language: Optional[str] = None,
                 use_hybrid: bool = True,
                 use_ranking: bool = True) -> List[Dict]:
        """
        Retrieve relevant code chunks with hybrid search and ranking.

        Args:
            query: User query/question
            k: Number of results to return
            current_file: Current file for proximity scoring
            language: Programming language for language-aware ranking
            use_hybrid: Whether to use hybrid search (vector + BM25)
            use_ranking: Whether to apply context ranking

        Returns:
            List of relevant chunks with metadata
        """
        if use_hybrid and self.hybrid_search:
            # Hybrid search (vector + BM25)
            results = self.hybrid_search.search(query, n_results=k * 2)

            # Convert hybrid results to full chunk data
            enriched_results = []
            for result in results:
                chunk_id = result['id']
                # Find full chunk data
                for chunk in self._indexed_chunks:
                    if chunk.get('id') == chunk_id:
                        chunk_copy = chunk.copy()
                        chunk_copy['similarity'] = result.get('vector_score', 0.0)
                        chunk_copy['bm25_score'] = result.get('bm25_score', 0.0)
                        chunk_copy['rrf_score'] = result.get('rrf_score', 0.0)
                        enriched_results.append(chunk_copy)
                        break
        else:
            # Fallback to vector-only search
            query_embedding = self.embedder.generate_embedding(query)
            results = self.store.search(query_embedding, n_results=k * 2)
            enriched_results = results

        if not enriched_results:
            return []

        # Apply context ranking
        if use_ranking:
            ranked_results = self.ranker.rank(
                enriched_results,
                query,
                current_file=current_file,
                language=language
            )
        else:
            ranked_results = enriched_results

        # Format results
        formatted_results = []
        for result in ranked_results[:k]:
            metadata = result.get('metadata', {})
            formatted_results.append({
                'path': result.get('file_path', metadata.get('file_path', 'unknown')),
                'type': result.get('type', metadata.get('type', 'file')),
                'name': result.get('name', metadata.get('name', '')),
                'similarity': result.get('similarity', 0.0),
                'rank_score': result.get('rank_score', result.get('similarity', 0.0)),
                'start_line': result.get('start_line', metadata.get('start_line', 0)),
                'end_line': result.get('end_line', metadata.get('end_line', 0)),
                'content': result.get('content', ''),
                'language': result.get('language', metadata.get('language', 'unknown')),
                'bm25_score': result.get('bm25_score', 0.0),
                'rrf_score': result.get('rrf_score', 0.0)
            })

        return formatted_results

    def get_stats(self) -> Dict:
        """
        Get statistics about the indexed codebase.

        Returns:
            Dictionary with stats
        """
        stats = {
            'total_chunks': self.store.count(),
            'embedding_dimension': self.embedder.dimension,
            'indexed_chunks': len(self._indexed_chunks),
            'hybrid_search_enabled': self.hybrid_search is not None
        }

        if self.hybrid_search:
            stats.update(self.hybrid_search.get_stats())

        return stats

    def explain_retrieval(self, query: str, chunk_id: str) -> Dict:
        """
        Explain why a specific chunk was retrieved for a query.

        Args:
            query: The search query
            chunk_id: ID of the chunk to explain

        Returns:
            Dict with retrieval explanation
        """
        explanation = {}

        if self.hybrid_search:
            explanation['hybrid_search'] = self.hybrid_search.explain_ranking(query, chunk_id)

        # Find the chunk
        for chunk in self._indexed_chunks:
            if chunk.get('id') == chunk_id:
                if self.ranker:
                    explanation['ranking'] = self.ranker.explain_ranking(chunk, query)
                break

        return explanation

    def _detect_language(self, extension: str) -> str:
        """Detect language from file extension."""
        extension = extension.lower()
        language_map = {
            '.py': 'python',
            '.js': 'javascript',
            '.jsx': 'jsx',
            '.ts': 'typescript',
            '.tsx': 'tsx',
            '.mjs': 'javascript',
            '.cjs': 'javascript',
        }
        return language_map.get(extension, 'text')

    def update_chunk_metadata(self, file_path: str, **metadata):
        """
        Update metadata for chunks from a specific file.

        Useful for updating reference counts, popularity, etc.
        """
        for chunk in self._indexed_chunks:
            if chunk.get('file_path') == file_path:
                chunk.update(metadata)

    def configure_ranker(self, **weights):
        """
        Configure context ranker weights.

        Args:
            **weights: New weights (similarity, file_proximity, etc.)
        """
        self.ranker.update_weights(**weights)

    def configure_hybrid_search(self, vector_weight: float = 0.5,
                                keyword_weight: float = 0.5):
        """
        Configure hybrid search weights.

        Args:
            vector_weight: Weight for vector search
            keyword_weight: Weight for keyword search
        """
        if self.hybrid_search:
            self.hybrid_search.update_weights(vector_weight, keyword_weight)
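
Taken together, index_codebase() builds the chunk, embedding, and BM25 indexes, and retrieve() queries them. A minimal usage sketch (not part of the package diff), assuming the wheel and its dependencies are installed; the project path and current_file below are hypothetical:

from pathlib import Path
from coding_assistant.context.enhanced_retriever import EnhancedSemanticRetriever

retriever = EnhancedSemanticRetriever(Path("./my_project"))  # hypothetical project
retriever.index_codebase(max_files=50)

# Hybrid retrieval with ranking; current_file biases proximity scoring
results = retriever.retrieve(
    "where are query embeddings generated?",
    k=3,
    current_file="my_project/app.py",  # hypothetical file
    language="python",
)
for r in results:
    print(f"{r['path']}:{r['start_line']}-{r['end_line']} rank={r['rank_score']:.3f}")

print(retriever.get_stats())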
coding_assistant/context/hybrid_search.py
@@ -0,0 +1,311 @@
"""Hybrid search combining vector similarity and keyword search (BM25)."""

from typing import List, Dict, Optional, Tuple
from rank_bm25 import BM25Okapi
import re


class HybridSearch:
    """
    Hybrid search that combines vector similarity with BM25 keyword search.

    Uses Reciprocal Rank Fusion (RRF) to combine results from both methods.
    """

    RRF_CONSTANT = 60  # Standard RRF constant

    def __init__(self, vector_store, embedding_generator):
        """
        Initialize hybrid search.

        Args:
            vector_store: VectorStore instance for semantic search
            embedding_generator: EmbeddingGenerator for query embeddings
        """
        self.vector_store = vector_store
        self.embedding_generator = embedding_generator

        # Default fusion weights; changed via update_weights()
        self.vector_weight = 0.5
        self.keyword_weight = 0.5

        # BM25 index (will be built from corpus)
        self.bm25 = None
        self.corpus = []
        self.chunk_ids = []

    def index_corpus(self, chunks: List[Dict]):
        """
        Build BM25 index from code chunks.

        Args:
            chunks: List of code chunks with 'id' and 'content' keys
        """
        self.corpus = []
        self.chunk_ids = []

        for chunk in chunks:
            content = chunk.get('content', '')
            self.corpus.append(content)
            self.chunk_ids.append(chunk.get('id'))

        # Tokenize corpus for BM25
        tokenized_corpus = [self._tokenize(doc) for doc in self.corpus]

        # Build BM25 index
        self.bm25 = BM25Okapi(tokenized_corpus)

    def search(self, query: str, n_results: int = 10,
               vector_weight: Optional[float] = None,
               keyword_weight: Optional[float] = None) -> List[Dict]:
        """
        Hybrid search combining vector and keyword search.

        Args:
            query: Search query
            n_results: Number of results to return
            vector_weight: Weight for vector search (0-1); defaults to the
                stored weight set via update_weights()
            keyword_weight: Weight for keyword search (0-1); defaults to the
                stored weight set via update_weights()

        Returns:
            List of ranked results
        """
        # Fall back to the stored weights so update_weights() takes effect
        if vector_weight is None:
            vector_weight = self.vector_weight
        if keyword_weight is None:
            keyword_weight = self.keyword_weight

        if not self.bm25 or not self.corpus:
            # Fallback to vector-only search
            return self._vector_only_search(query, n_results)

        # 1. Vector similarity search
        vector_results = self._vector_search(query, n_results * 2)

        # 2. BM25 keyword search
        bm25_results = self._bm25_search(query, n_results * 2)

        # 3. Combine using Reciprocal Rank Fusion
        combined = self._reciprocal_rank_fusion(
            vector_results,
            bm25_results,
            vector_weight,
            keyword_weight
        )

        # Return top N results
        return combined[:n_results]

    def _vector_search(self, query: str, k: int) -> List[Tuple[str, float]]:
        """
        Perform vector similarity search.

        Returns:
            List of (chunk_id, score) tuples
        """
        # Generate query embedding
        query_embedding = self.embedding_generator.generate_embedding(query)

        # Search vector store
        results = self.vector_store.search(query_embedding, n_results=k)

        # Extract chunk IDs and scores (similarity)
        return [(r['id'], r['similarity']) for r in results]

    def _bm25_search(self, query: str, k: int) -> List[Tuple[str, float]]:
        """
        Perform BM25 keyword search.

        Returns:
            List of (chunk_id, score) tuples
        """
        if not self.bm25:
            return []

        # Tokenize query
        tokenized_query = self._tokenize(query)

        # Get BM25 scores
        scores = self.bm25.get_scores(tokenized_query)

        # Sort by score and get top K
        scored_docs = [(self.chunk_ids[i], scores[i]) for i in range(len(scores))]
        scored_docs.sort(key=lambda x: x[1], reverse=True)

        return scored_docs[:k]

    def _reciprocal_rank_fusion(
        self,
        vector_results: List[Tuple[str, float]],
        bm25_results: List[Tuple[str, float]],
        vector_weight: float = 0.5,
        keyword_weight: float = 0.5
    ) -> List[Dict]:
        """
        Combine results using Reciprocal Rank Fusion.

        RRF formula: score = Σ weight / (rank + constant)

        Args:
            vector_results: Results from vector search
            bm25_results: Results from BM25 search
            vector_weight: Weight for vector results
            keyword_weight: Weight for keyword results

        Returns:
            Combined and ranked results
        """
        rrf_scores = {}

        # Add scores from vector search
        for rank, (chunk_id, score) in enumerate(vector_results):
            rrf_score = vector_weight / (rank + self.RRF_CONSTANT)
            if chunk_id in rrf_scores:
                rrf_scores[chunk_id]['rrf_score'] += rrf_score
                rrf_scores[chunk_id]['vector_score'] = score
            else:
                rrf_scores[chunk_id] = {
                    'id': chunk_id,
                    'rrf_score': rrf_score,
                    'vector_score': score,
                    'bm25_score': 0.0,
                    'vector_rank': rank + 1,
                    'bm25_rank': None
                }

        # Add scores from BM25 search
        for rank, (chunk_id, score) in enumerate(bm25_results):
            rrf_score = keyword_weight / (rank + self.RRF_CONSTANT)
            if chunk_id in rrf_scores:
                rrf_scores[chunk_id]['rrf_score'] += rrf_score
                rrf_scores[chunk_id]['bm25_score'] = score
                rrf_scores[chunk_id]['bm25_rank'] = rank + 1
            else:
                rrf_scores[chunk_id] = {
                    'id': chunk_id,
                    'rrf_score': rrf_score,
                    'vector_score': 0.0,
                    'bm25_score': score,
                    'vector_rank': None,
                    'bm25_rank': rank + 1
                }

        # Sort by RRF score
        ranked = sorted(
            rrf_scores.values(),
            key=lambda x: x['rrf_score'],
            reverse=True
        )

        return ranked

    def _vector_only_search(self, query: str, k: int) -> List[Dict]:
        """Fallback to vector-only search."""
        query_embedding = self.embedding_generator.generate_embedding(query)
        results = self.vector_store.search(query_embedding, n_results=k)

        return [{
            'id': r['id'],
            'rrf_score': r['similarity'],
            'vector_score': r['similarity'],
            'bm25_score': 0.0,
            'vector_rank': i + 1,
            'bm25_rank': None
        } for i, r in enumerate(results)]

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize text for BM25 search.

        Uses code-aware tokenization that preserves identifiers and keywords.
        """
        # Split on non-alphanumeric characters but keep underscores
        # This preserves Python/JS identifiers like my_function_name
        tokens = re.findall(r'\b\w+\b', text)

        # Split camelCase before lowercasing, since the split relies on the
        # uppercase boundaries: myFunctionName -> my, function, name
        expanded_tokens = []
        for token in tokens:
            camel_split = re.sub('([A-Z][a-z]+)', r' \1', token)
            camel_split = re.sub('([A-Z]+)', r' \1', camel_split)
            expanded_tokens.extend(part.lower() for part in camel_split.split())

        # Remove very short tokens (< 2 chars) and common code words
        stop_words = {'if', 'for', 'while', 'do', 'is', 'to', 'in', 'of', 'and', 'or', 'not'}
        filtered = [t for t in expanded_tokens if len(t) >= 2 and t not in stop_words]

        return filtered

    def get_stats(self) -> Dict:
        """Get statistics about the search index."""
        return {
            'corpus_size': len(self.corpus),
            'bm25_indexed': self.bm25 is not None,
            'vector_store_count': self.vector_store.count() if self.vector_store else 0
        }

    def update_weights(self, vector_weight: float, keyword_weight: float):
        """
        Update the weights for vector vs keyword search.

        Args:
            vector_weight: Weight for vector search (0-1)
            keyword_weight: Weight for keyword search (0-1)
        """
        # Normalize weights to sum to 1
        total = vector_weight + keyword_weight
        if total > 0:
            self.vector_weight = vector_weight / total
            self.keyword_weight = keyword_weight / total
        else:
            self.vector_weight = 0.5
            self.keyword_weight = 0.5

    def explain_ranking(self, query: str, chunk_id: str) -> Dict:
        """
        Explain why a specific chunk was ranked for a query.

        Args:
            query: The search query
            chunk_id: ID of the chunk to explain

        Returns:
            Dict with ranking explanation
        """
        # Get vector score
        query_embedding = self.embedding_generator.generate_embedding(query)
        vector_results = self.vector_store.search(query_embedding, n_results=100)
        vector_rank = None
        vector_score = 0.0

        for i, result in enumerate(vector_results):
            if result['id'] == chunk_id:
                vector_rank = i + 1
                vector_score = result['similarity']
                break

        # Get BM25 score
        tokenized_query = self._tokenize(query)
        bm25_scores = self.bm25.get_scores(tokenized_query) if self.bm25 else []

        bm25_rank = None
        bm25_score = 0.0

        try:
            idx = self.chunk_ids.index(chunk_id)
            bm25_score = bm25_scores[idx]

            # Find rank
            sorted_scores = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)
            for rank, (i, score) in enumerate(sorted_scores):
                if i == idx:
                    bm25_rank = rank + 1
                    break
        except (ValueError, IndexError):
            pass

        return {
            'chunk_id': chunk_id,
            'query': query,
            'vector_rank': vector_rank,
            'vector_score': vector_score,
            'bm25_rank': bm25_rank,
            'bm25_score': bm25_score,
            'appears_in_vector': vector_rank is not None,
            'appears_in_bm25': bm25_rank is not None
        }
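
To make the RRF formula in _reciprocal_rank_fusion() concrete, here is a self-contained sketch with made-up scores, using the same 0.5/0.5 weights and RRF_CONSTANT = 60 as the class. Note that a chunk appearing in both result lists can win even though it tops neither:

RRF_CONSTANT = 60

vector_results = [("chunk_a", 0.91), ("chunk_b", 0.84)]  # (id, similarity), best first
bm25_results = [("chunk_c", 7.2), ("chunk_b", 5.1)]      # (id, BM25 score), best first

scores = {}
for rank, (chunk_id, _) in enumerate(vector_results):
    scores[chunk_id] = scores.get(chunk_id, 0.0) + 0.5 / (rank + RRF_CONSTANT)
for rank, (chunk_id, _) in enumerate(bm25_results):
    scores[chunk_id] = scores.get(chunk_id, 0.0) + 0.5 / (rank + RRF_CONSTANT)

for chunk_id, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
    print(chunk_id, round(score, 5))
# chunk_b 0.01639  (rank 2 in both lists, yet it wins)
# chunk_a 0.00833
# chunk_c 0.00833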
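
Similarly, a standalone trace of the corrected _tokenize() logic shows what BM25 actually indexes for a line of code (the snippet below is illustrative):

import re

def tokenize(text):
    tokens = re.findall(r'\b\w+\b', text)
    expanded = []
    for token in tokens:
        # Split camelCase while case information is still present
        split = re.sub('([A-Z][a-z]+)', r' \1', token)
        split = re.sub('([A-Z]+)', r' \1', split)
        expanded.extend(part.lower() for part in split.split())
    stop_words = {'if', 'for', 'while', 'do', 'is', 'to', 'in', 'of', 'and', 'or', 'not'}
    return [t for t in expanded if len(t) >= 2 and t not in stop_words]

print(tokenize("def getUserName(user_id): return cache[user_id]"))
# -> ['def', 'get', 'user', 'name', 'user_id', 'return', 'cache', 'user_id']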