ai_coding_assistant-0.5.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their public registries.
Files changed (89)
  1. ai_coding_assistant-0.5.0.dist-info/METADATA +226 -0
  2. ai_coding_assistant-0.5.0.dist-info/RECORD +89 -0
  3. ai_coding_assistant-0.5.0.dist-info/WHEEL +4 -0
  4. ai_coding_assistant-0.5.0.dist-info/entry_points.txt +3 -0
  5. ai_coding_assistant-0.5.0.dist-info/licenses/LICENSE +21 -0
  6. coding_assistant/__init__.py +3 -0
  7. coding_assistant/__main__.py +19 -0
  8. coding_assistant/cli/__init__.py +1 -0
  9. coding_assistant/cli/app.py +158 -0
  10. coding_assistant/cli/commands/__init__.py +19 -0
  11. coding_assistant/cli/commands/ask.py +178 -0
  12. coding_assistant/cli/commands/config.py +438 -0
  13. coding_assistant/cli/commands/diagram.py +267 -0
  14. coding_assistant/cli/commands/document.py +410 -0
  15. coding_assistant/cli/commands/explain.py +192 -0
  16. coding_assistant/cli/commands/fix.py +249 -0
  17. coding_assistant/cli/commands/index.py +162 -0
  18. coding_assistant/cli/commands/refactor.py +245 -0
  19. coding_assistant/cli/commands/search.py +182 -0
  20. coding_assistant/cli/commands/serve_docs.py +128 -0
  21. coding_assistant/cli/repl.py +381 -0
  22. coding_assistant/cli/theme.py +90 -0
  23. coding_assistant/codebase/__init__.py +1 -0
  24. coding_assistant/codebase/crawler.py +93 -0
  25. coding_assistant/codebase/parser.py +266 -0
  26. coding_assistant/config/__init__.py +25 -0
  27. coding_assistant/config/config_manager.py +615 -0
  28. coding_assistant/config/settings.py +82 -0
  29. coding_assistant/context/__init__.py +19 -0
  30. coding_assistant/context/chunker.py +443 -0
  31. coding_assistant/context/enhanced_retriever.py +322 -0
  32. coding_assistant/context/hybrid_search.py +311 -0
  33. coding_assistant/context/ranker.py +355 -0
  34. coding_assistant/context/retriever.py +119 -0
  35. coding_assistant/context/window.py +362 -0
  36. coding_assistant/documentation/__init__.py +23 -0
  37. coding_assistant/documentation/agents/__init__.py +27 -0
  38. coding_assistant/documentation/agents/coordinator.py +510 -0
  39. coding_assistant/documentation/agents/module_documenter.py +111 -0
  40. coding_assistant/documentation/agents/synthesizer.py +139 -0
  41. coding_assistant/documentation/agents/task_delegator.py +100 -0
  42. coding_assistant/documentation/decomposition/__init__.py +21 -0
  43. coding_assistant/documentation/decomposition/context_preserver.py +477 -0
  44. coding_assistant/documentation/decomposition/module_detector.py +302 -0
  45. coding_assistant/documentation/decomposition/partitioner.py +621 -0
  46. coding_assistant/documentation/generators/__init__.py +14 -0
  47. coding_assistant/documentation/generators/dataflow_generator.py +440 -0
  48. coding_assistant/documentation/generators/diagram_generator.py +511 -0
  49. coding_assistant/documentation/graph/__init__.py +13 -0
  50. coding_assistant/documentation/graph/dependency_builder.py +468 -0
  51. coding_assistant/documentation/graph/module_analyzer.py +475 -0
  52. coding_assistant/documentation/writers/__init__.py +11 -0
  53. coding_assistant/documentation/writers/markdown_writer.py +322 -0
  54. coding_assistant/embeddings/__init__.py +0 -0
  55. coding_assistant/embeddings/generator.py +89 -0
  56. coding_assistant/embeddings/store.py +187 -0
  57. coding_assistant/exceptions/__init__.py +50 -0
  58. coding_assistant/exceptions/base.py +110 -0
  59. coding_assistant/exceptions/llm.py +249 -0
  60. coding_assistant/exceptions/recovery.py +263 -0
  61. coding_assistant/exceptions/storage.py +213 -0
  62. coding_assistant/exceptions/validation.py +230 -0
  63. coding_assistant/llm/__init__.py +1 -0
  64. coding_assistant/llm/client.py +277 -0
  65. coding_assistant/llm/gemini_client.py +181 -0
  66. coding_assistant/llm/groq_client.py +160 -0
  67. coding_assistant/llm/prompts.py +98 -0
  68. coding_assistant/llm/together_client.py +160 -0
  69. coding_assistant/operations/__init__.py +13 -0
  70. coding_assistant/operations/differ.py +369 -0
  71. coding_assistant/operations/generator.py +347 -0
  72. coding_assistant/operations/linter.py +430 -0
  73. coding_assistant/operations/validator.py +406 -0
  74. coding_assistant/storage/__init__.py +9 -0
  75. coding_assistant/storage/database.py +363 -0
  76. coding_assistant/storage/session.py +231 -0
  77. coding_assistant/utils/__init__.py +31 -0
  78. coding_assistant/utils/cache.py +477 -0
  79. coding_assistant/utils/hardware.py +132 -0
  80. coding_assistant/utils/keystore.py +206 -0
  81. coding_assistant/utils/logger.py +32 -0
  82. coding_assistant/utils/progress.py +311 -0
  83. coding_assistant/validation/__init__.py +13 -0
  84. coding_assistant/validation/files.py +305 -0
  85. coding_assistant/validation/inputs.py +335 -0
  86. coding_assistant/validation/params.py +280 -0
  87. coding_assistant/validation/sanitizers.py +243 -0
  88. coding_assistant/vcs/__init__.py +5 -0
  89. coding_assistant/vcs/git.py +269 -0
coding_assistant/context/enhanced_retriever.py
@@ -0,0 +1,322 @@
+"""Enhanced semantic retriever with hybrid search and context ranking."""
+
+from pathlib import Path
+from typing import List, Dict, Optional
+from datetime import datetime
+
+from coding_assistant.codebase.crawler import CodebaseCrawler
+from coding_assistant.codebase.parser import CodeParser
+from coding_assistant.embeddings.generator import EmbeddingGenerator
+from coding_assistant.embeddings.store import VectorStore
+from coding_assistant.context.chunker import SmartChunker
+from coding_assistant.context.hybrid_search import HybridSearch
+from coding_assistant.context.ranker import ContextRanker
+from coding_assistant.vcs.git import GitIntegration
+
+
+class EnhancedSemanticRetriever:
+    """
+    Enhanced retriever with hybrid search and intelligent ranking.
+
+    Combines:
+    - Vector similarity search
+    - BM25 keyword search
+    - Smart code chunking
+    - Context-aware ranking
+    """
+
+    def __init__(self, project_path: Path, dependency_graph=None):
+        """
+        Initialize the enhanced retriever.
+
+        Args:
+            project_path: Path to the project root
+            dependency_graph: Optional dependency graph for file relationships
+        """
+        self.project_path = Path(project_path).resolve()
+        self.crawler = CodebaseCrawler(self.project_path)
+        self.parser = CodeParser()
+        self.embedder = EmbeddingGenerator()
+
+        # Vector store
+        persist_dir = self.project_path / ".coding_assistant" / "chroma_db"
+        self.store = VectorStore(persist_dir=persist_dir)
+
+        # Smart chunker
+        self.chunker = SmartChunker(max_chunk_tokens=500)
+
+        # Hybrid search (will be initialized after indexing)
+        self.hybrid_search = None
+
+        # Context ranker
+        self.ranker = ContextRanker(dependency_graph=dependency_graph)
+
+        # Git integration (optional - won't fail if not a git repo)
+        try:
+            self.git = GitIntegration(str(self.project_path))
+        except Exception:
+            self.git = None
+
+        # Cache for indexed chunks
+        self._indexed_chunks = []
+
+    def clear_index(self):
+        """Clear the existing index."""
+        self.store.clear()
+        self._indexed_chunks = []
+        self.hybrid_search = None
+
+    def index_codebase(self, max_files: int = 100, force_reindex: bool = False):
+        """
+        Index the codebase with smart chunking.
+
+        Args:
+            max_files: Maximum number of files to index
+            force_reindex: Force re-indexing even if already indexed
+        """
+        if self._indexed_chunks and not force_reindex:
+            print("Codebase already indexed. Use force_reindex=True to re-index.")
+            return
+
+        print("Indexing codebase with smart chunking...")
+
+        # Scan files
+        files = self.crawler.scan(max_files=max_files)
+
+        all_chunks = []
+
+        # Parse and chunk each file
+        for file_info in files:
+            try:
+                content = self.crawler.read_file(file_info['path'])
+                language = self._detect_language(file_info['extension'])
+
+                if language in ('python', 'javascript', 'typescript', 'jsx', 'tsx'):
+                    # Use smart chunking for supported languages
+                    chunks = self.chunker.chunk_code(
+                        content,
+                        file_info['path'],
+                        language
+                    )
+
+                    # Convert to dict format
+                    for chunk in chunks:
+                        all_chunks.append({
+                            'type': chunk.type,
+                            'file_path': chunk.file_path,
+                            'content': chunk.content,
+                            'start_line': chunk.start_line,
+                            'end_line': chunk.end_line,
+                            'language': chunk.language,
+                            'name': chunk.name,
+                            'docstring': chunk.docstring,
+                            'last_modified': datetime.fromtimestamp(
+                                Path(chunk.file_path).stat().st_mtime
+                            ).isoformat()
+                        })
+                else:
+                    # For other files, create a simple file-level chunk
+                    all_chunks.append({
+                        'type': 'file',
+                        'file_path': file_info['path'],
+                        'content': content[:5000],
+                        'start_line': 0,
+                        'end_line': len(content.split('\n')),
+                        'language': language,
+                        'name': Path(file_info['path']).name,
+                        'last_modified': datetime.fromtimestamp(
+                            Path(file_info['path']).stat().st_mtime
+                        ).isoformat()
+                    })
+
+            except Exception as e:
+                print(f"Warning: Could not process {file_info['path']}: {e}")
+                continue
+
+        if not all_chunks:
+            print("No chunks to index")
+            return
+
+        print(f"Generating embeddings for {len(all_chunks)} chunks...")
+
+        # Generate embeddings
+        embedded_chunks = self.embedder.embed_code_chunks(all_chunks)
+
+        # Store in vector database
+        print("Storing in vector database...")
+        self.store.add_chunks(embedded_chunks)
+
+        # Cache chunks for BM25
+        self._indexed_chunks = embedded_chunks
+
+        # Initialize hybrid search
+        print("Building BM25 index...")
+        self.hybrid_search = HybridSearch(self.store, self.embedder)
+        self.hybrid_search.index_corpus(embedded_chunks)
+
+        print(f"✓ Indexed {len(all_chunks)} code chunks")
+
+    def retrieve(self, query: str, k: int = 5,
+                 current_file: Optional[str] = None,
+                 language: Optional[str] = None,
+                 use_hybrid: bool = True,
+                 use_ranking: bool = True) -> List[Dict]:
+        """
+        Retrieve relevant code chunks with hybrid search and ranking.
+
+        Args:
+            query: User query/question
+            k: Number of results to return
+            current_file: Current file for proximity scoring
+            language: Programming language for language-aware ranking
+            use_hybrid: Whether to use hybrid search (vector + BM25)
+            use_ranking: Whether to apply context ranking
+
+        Returns:
+            List of relevant chunks with metadata
+        """
+        if use_hybrid and self.hybrid_search:
+            # Hybrid search (vector + BM25)
+            results = self.hybrid_search.search(query, n_results=k * 2)
+
+            # Convert hybrid results to full chunk data
+            enriched_results = []
+            for result in results:
+                chunk_id = result['id']
+                # Find full chunk data
+                for chunk in self._indexed_chunks:
+                    if chunk.get('id') == chunk_id:
+                        chunk_copy = chunk.copy()
+                        chunk_copy['similarity'] = result.get('vector_score', 0.0)
+                        chunk_copy['bm25_score'] = result.get('bm25_score', 0.0)
+                        chunk_copy['rrf_score'] = result.get('rrf_score', 0.0)
+                        enriched_results.append(chunk_copy)
+                        break
+        else:
+            # Fallback to vector-only search
+            query_embedding = self.embedder.generate_embedding(query)
+            results = self.store.search(query_embedding, n_results=k * 2)
+            enriched_results = results
+
+        if not enriched_results:
+            return []
+
+        # Apply context ranking
+        if use_ranking:
+            ranked_results = self.ranker.rank(
+                enriched_results,
+                query,
+                current_file=current_file,
+                language=language
+            )
+        else:
+            ranked_results = enriched_results
+
+        # Format results
+        formatted_results = []
+        for result in ranked_results[:k]:
+            metadata = result.get('metadata', {})
+            formatted_results.append({
+                'path': result.get('file_path', metadata.get('file_path', 'unknown')),
+                'type': result.get('type', metadata.get('type', 'file')),
+                'name': result.get('name', metadata.get('name', '')),
+                'similarity': result.get('similarity', 0.0),
+                'rank_score': result.get('rank_score', result.get('similarity', 0.0)),
+                'start_line': result.get('start_line', metadata.get('start_line', 0)),
+                'end_line': result.get('end_line', metadata.get('end_line', 0)),
+                'content': result.get('content', ''),
+                'language': result.get('language', metadata.get('language', 'unknown')),
+                'bm25_score': result.get('bm25_score', 0.0),
+                'rrf_score': result.get('rrf_score', 0.0)
+            })
+
+        return formatted_results
+
+    def get_stats(self) -> Dict:
+        """
+        Get statistics about the indexed codebase.
+
+        Returns:
+            Dictionary with stats
+        """
+        stats = {
+            'total_chunks': self.store.count(),
+            'embedding_dimension': self.embedder.dimension,
+            'indexed_chunks': len(self._indexed_chunks),
+            'hybrid_search_enabled': self.hybrid_search is not None
+        }
+
+        if self.hybrid_search:
+            stats.update(self.hybrid_search.get_stats())
+
+        return stats
+
+    def explain_retrieval(self, query: str, chunk_id: str) -> Dict:
+        """
+        Explain why a specific chunk was retrieved for a query.
+
+        Args:
+            query: The search query
+            chunk_id: ID of the chunk to explain
+
+        Returns:
+            Dict with retrieval explanation
+        """
+        explanation = {}
+
+        if self.hybrid_search:
+            explanation['hybrid_search'] = self.hybrid_search.explain_ranking(query, chunk_id)
+
+        # Find the chunk
+        for chunk in self._indexed_chunks:
+            if chunk.get('id') == chunk_id:
+                if self.ranker:
+                    explanation['ranking'] = self.ranker.explain_ranking(chunk, query)
+                break
+
+        return explanation
+
+    def _detect_language(self, extension: str) -> str:
+        """Detect language from file extension."""
+        extension = extension.lower()
+        language_map = {
+            '.py': 'python',
+            '.js': 'javascript',
+            '.jsx': 'jsx',
+            '.ts': 'typescript',
+            '.tsx': 'tsx',
+            '.mjs': 'javascript',
+            '.cjs': 'javascript',
+        }
+        return language_map.get(extension, 'text')
+
+    def update_chunk_metadata(self, file_path: str, **metadata):
+        """
+        Update metadata for chunks from a specific file.
+
+        Useful for updating reference counts, popularity, etc.
+        """
+        for chunk in self._indexed_chunks:
+            if chunk.get('file_path') == file_path:
+                chunk.update(metadata)
+
+    def configure_ranker(self, **weights):
+        """
+        Configure context ranker weights.

+        Args:
+            **weights: New weights (similarity, file_proximity, etc.)
+        """
+        self.ranker.update_weights(**weights)
+
+    def configure_hybrid_search(self, vector_weight: float = 0.5,
+                                keyword_weight: float = 0.5):
+        """
+        Configure hybrid search weights.
+
+        Args:
+            vector_weight: Weight for vector search
+            keyword_weight: Weight for keyword search
+        """
+        if self.hybrid_search:
+            self.hybrid_search.update_weights(vector_weight, keyword_weight)
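
The two files in this listing work together: EnhancedSemanticRetriever owns indexing and result formatting, and delegates fused scoring to HybridSearch (next hunk). A minimal usage sketch of the call sequence, assuming the package is installed and a project directory is on disk; the project path, query, and current_file below are illustrative placeholders, not values from the package:

    from pathlib import Path
    from coding_assistant.context.enhanced_retriever import EnhancedSemanticRetriever

    # Hypothetical project checkout; substitute a real path
    retriever = EnhancedSemanticRetriever(Path("./my_project"))

    # One-time indexing: scans files, chunks them, embeds them, and builds the BM25 index
    retriever.index_codebase(max_files=100)

    # Hybrid retrieval with context ranking; current_file biases proximity scoring
    results = retriever.retrieve(
        "where are embeddings generated?",
        k=5,
        current_file="my_project/app.py",
    )
    for r in results:
        print(f"{r['path']}:{r['start_line']}-{r['end_line']}  score={r['rank_score']:.3f}")
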
coding_assistant/context/hybrid_search.py
@@ -0,0 +1,311 @@
+"""Hybrid search combining vector similarity and keyword search (BM25)."""
+
+from typing import List, Dict, Optional, Tuple
+from rank_bm25 import BM25Okapi
+import re
+
+
+class HybridSearch:
+    """
+    Hybrid search that combines vector similarity with BM25 keyword search.
+
+    Uses Reciprocal Rank Fusion (RRF) to combine results from both methods.
+    """
+
+    RRF_CONSTANT = 60  # Standard RRF constant
+
+    def __init__(self, vector_store, embedding_generator):
+        """
+        Initialize hybrid search.
+
+        Args:
+            vector_store: VectorStore instance for semantic search
+            embedding_generator: EmbeddingGenerator for query embeddings
+        """
+        self.vector_store = vector_store
+        self.embedding_generator = embedding_generator
+
+        # Default fusion weights, adjustable via update_weights()
+        self.vector_weight = 0.5
+        self.keyword_weight = 0.5
+
+        # BM25 index (will be built from corpus)
+        self.bm25 = None
+        self.corpus = []
+        self.chunk_ids = []
+
+    def index_corpus(self, chunks: List[Dict]):
+        """
+        Build BM25 index from code chunks.
+
+        Args:
+            chunks: List of code chunks with 'id' and 'content' keys
+        """
+        self.corpus = []
+        self.chunk_ids = []
+
+        for chunk in chunks:
+            content = chunk.get('content', '')
+            self.corpus.append(content)
+            self.chunk_ids.append(chunk.get('id'))
+
+        # Tokenize corpus for BM25
+        tokenized_corpus = [self._tokenize(doc) for doc in self.corpus]
+
+        # Build BM25 index
+        self.bm25 = BM25Okapi(tokenized_corpus)
+
+    def search(self, query: str, n_results: int = 10,
+               vector_weight: Optional[float] = None,
+               keyword_weight: Optional[float] = None) -> List[Dict]:
+        """
+        Hybrid search combining vector and keyword search.
+
+        Args:
+            query: Search query
+            n_results: Number of results to return
+            vector_weight: Weight for vector search (0-1); defaults to the configured weight
+            keyword_weight: Weight for keyword search (0-1); defaults to the configured weight
+
+        Returns:
+            List of ranked results
+        """
+        # Fall back to the weights configured via update_weights() when none are given
+        if vector_weight is None:
+            vector_weight = self.vector_weight
+        if keyword_weight is None:
+            keyword_weight = self.keyword_weight
+
+        if not self.bm25 or not self.corpus:
+            # Fallback to vector-only search
+            return self._vector_only_search(query, n_results)
+
+        # 1. Vector similarity search
+        vector_results = self._vector_search(query, n_results * 2)
+
+        # 2. BM25 keyword search
+        bm25_results = self._bm25_search(query, n_results * 2)
+
+        # 3. Combine using Reciprocal Rank Fusion
+        combined = self._reciprocal_rank_fusion(
+            vector_results,
+            bm25_results,
+            vector_weight,
+            keyword_weight
+        )
+
+        # Return top N results
+        return combined[:n_results]
+
+    def _vector_search(self, query: str, k: int) -> List[Tuple[str, float]]:
+        """
+        Perform vector similarity search.
+
+        Returns:
+            List of (chunk_id, score) tuples
+        """
+        # Generate query embedding
+        query_embedding = self.embedding_generator.generate_embedding(query)
+
+        # Search vector store
+        results = self.vector_store.search(query_embedding, n_results=k)
+
+        # Extract chunk IDs and scores (similarity)
+        return [(r['id'], r['similarity']) for r in results]
+
+    def _bm25_search(self, query: str, k: int) -> List[Tuple[str, float]]:
+        """
+        Perform BM25 keyword search.
+
+        Returns:
+            List of (chunk_id, score) tuples
+        """
+        if not self.bm25:
+            return []
+
+        # Tokenize query
+        tokenized_query = self._tokenize(query)
+
+        # Get BM25 scores
+        scores = self.bm25.get_scores(tokenized_query)
+
+        # Sort by score and get top K
+        scored_docs = [(self.chunk_ids[i], scores[i]) for i in range(len(scores))]
+        scored_docs.sort(key=lambda x: x[1], reverse=True)
+
+        return scored_docs[:k]
+
+    def _reciprocal_rank_fusion(
+        self,
+        vector_results: List[Tuple[str, float]],
+        bm25_results: List[Tuple[str, float]],
+        vector_weight: float = 0.5,
+        keyword_weight: float = 0.5
+    ) -> List[Dict]:
+        """
+        Combine results using Reciprocal Rank Fusion.
+
+        RRF formula: score = Σ weight / (rank + constant)
+
+        Args:
+            vector_results: Results from vector search
+            bm25_results: Results from BM25 search
+            vector_weight: Weight for vector results
+            keyword_weight: Weight for keyword results
+
+        Returns:
+            Combined and ranked results
+        """
+        rrf_scores = {}
+
+        # Add scores from vector search
+        for rank, (chunk_id, score) in enumerate(vector_results):
+            rrf_score = vector_weight / (rank + self.RRF_CONSTANT)
+            if chunk_id in rrf_scores:
+                rrf_scores[chunk_id]['rrf_score'] += rrf_score
+                rrf_scores[chunk_id]['vector_score'] = score
+            else:
+                rrf_scores[chunk_id] = {
+                    'id': chunk_id,
+                    'rrf_score': rrf_score,
+                    'vector_score': score,
+                    'bm25_score': 0.0,
+                    'vector_rank': rank + 1,
+                    'bm25_rank': None
+                }
+
+        # Add scores from BM25 search
+        for rank, (chunk_id, score) in enumerate(bm25_results):
+            rrf_score = keyword_weight / (rank + self.RRF_CONSTANT)
+            if chunk_id in rrf_scores:
+                rrf_scores[chunk_id]['rrf_score'] += rrf_score
+                rrf_scores[chunk_id]['bm25_score'] = score
+                rrf_scores[chunk_id]['bm25_rank'] = rank + 1
+            else:
+                rrf_scores[chunk_id] = {
+                    'id': chunk_id,
+                    'rrf_score': rrf_score,
+                    'vector_score': 0.0,
+                    'bm25_score': score,
+                    'vector_rank': None,
+                    'bm25_rank': rank + 1
+                }
+
+        # Sort by RRF score
+        ranked = sorted(
+            rrf_scores.values(),
+            key=lambda x: x['rrf_score'],
+            reverse=True
+        )
+
+        return ranked
+
+    def _vector_only_search(self, query: str, k: int) -> List[Dict]:
+        """Fallback to vector-only search."""
+        query_embedding = self.embedding_generator.generate_embedding(query)
+        results = self.vector_store.search(query_embedding, n_results=k)
+
+        return [{
+            'id': r['id'],
+            'rrf_score': r['similarity'],
+            'vector_score': r['similarity'],
+            'bm25_score': 0.0,
+            'vector_rank': i + 1,
+            'bm25_rank': None
+        } for i, r in enumerate(results)]
+
+    def _tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize text for BM25 search.
+
+        Uses code-aware tokenization that preserves identifiers and keywords.
+        """
+        # Split on non-alphanumeric characters but keep underscores.
+        # This preserves Python/JS identifiers like my_function_name
+        tokens = re.findall(r'\b\w+\b', text)
+
+        # Split camelCase into separate tokens before lowercasing, while the
+        # case boundaries are still visible: myFunctionName -> my, function, name
+        expanded_tokens = []
+        for token in tokens:
+            camel_split = re.sub(r'([A-Z][a-z]+)', r' \1', token)
+            camel_split = re.sub(r'([A-Z]+)', r' \1', camel_split)
+            expanded_tokens.extend(part.lower() for part in camel_split.split())
+
+        # Remove very short tokens (< 2 chars) and common code words
+        stop_words = {'if', 'for', 'while', 'do', 'is', 'to', 'in', 'of', 'and', 'or', 'not'}
+        filtered = [t for t in expanded_tokens if len(t) >= 2 and t not in stop_words]
+
+        return filtered
+
+    def get_stats(self) -> Dict:
+        """Get statistics about the search index."""
+        return {
+            'corpus_size': len(self.corpus),
+            'bm25_indexed': self.bm25 is not None,
+            'vector_store_count': self.vector_store.count() if self.vector_store else 0
+        }
+
+    def update_weights(self, vector_weight: float, keyword_weight: float):
+        """
+        Update the weights for vector vs keyword search.
+
+        Args:
+            vector_weight: Weight for vector search (0-1)
+            keyword_weight: Weight for keyword search (0-1)
+        """
+        # Normalize weights to sum to 1
+        total = vector_weight + keyword_weight
+        if total > 0:
+            self.vector_weight = vector_weight / total
+            self.keyword_weight = keyword_weight / total
+        else:
+            self.vector_weight = 0.5
+            self.keyword_weight = 0.5
+
+    def explain_ranking(self, query: str, chunk_id: str) -> Dict:
+        """
+        Explain why a specific chunk was ranked for a query.
+
+        Args:
+            query: The search query
+            chunk_id: ID of the chunk to explain
+
+        Returns:
+            Dict with ranking explanation
+        """
+        # Get vector score
+        query_embedding = self.embedding_generator.generate_embedding(query)
+        vector_results = self.vector_store.search(query_embedding, n_results=100)
+        vector_rank = None
+        vector_score = 0.0
+
+        for i, result in enumerate(vector_results):
+            if result['id'] == chunk_id:
+                vector_rank = i + 1
+                vector_score = result['similarity']
+                break
+
+        # Get BM25 score
+        tokenized_query = self._tokenize(query)
+        bm25_scores = self.bm25.get_scores(tokenized_query) if self.bm25 else []
+
+        bm25_rank = None
+        bm25_score = 0.0
+
+        try:
+            idx = self.chunk_ids.index(chunk_id)
+            bm25_score = bm25_scores[idx]
+
+            # Find rank
+            sorted_scores = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)
+            for rank, (i, score) in enumerate(sorted_scores):
+                if i == idx:
+                    bm25_rank = rank + 1
+                    break
+        except (ValueError, IndexError):
+            pass
+
+        return {
+            'chunk_id': chunk_id,
+            'query': query,
+            'vector_rank': vector_rank,
+            'vector_score': vector_score,
+            'bm25_rank': bm25_rank,
+            'bm25_score': bm25_score,
+            'appears_in_vector': vector_rank is not None,
+            'appears_in_bm25': bm25_rank is not None
+        }
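
To make the fusion concrete: _reciprocal_rank_fusion scores each chunk as Σ weight / (rank + RRF_CONSTANT) over the result lists it appears in, with ranks starting at 0 and RRF_CONSTANT = 60. With the default 0.5/0.5 weights, a chunk ranked 1st by vector search and 3rd by BM25 gets 0.5/60 + 0.5/62 ≈ 0.0164, while one ranked 1st by BM25 alone gets 0.5/60 ≈ 0.0083, so agreement between the two searches roughly doubles a chunk's fused score. A small sketch exercising just the fusion and tokenization logic; the chunk IDs and raw scores are made up for illustration, and the stores can be None because neither method touches them:

    from coding_assistant.context.hybrid_search import HybridSearch

    hs = HybridSearch(vector_store=None, embedding_generator=None)

    # (chunk_id, raw score) pairs, best-first; "a" appears in both lists
    vector_results = [("a", 0.91), ("c", 0.80)]
    bm25_results = [("b", 7.2), ("c", 6.1), ("a", 5.4)]

    for r in hs._reciprocal_rank_fusion(vector_results, bm25_results, 0.5, 0.5):
        print(r['id'], round(r['rrf_score'], 5), r['vector_rank'], r['bm25_rank'])
    # a: 0.5/60 + 0.5/62 ≈ 0.01640 (vector rank 1, BM25 rank 3)
    # c: 0.5/61 + 0.5/61 ≈ 0.01639 (rank 2 in both)
    # b: 0.5/60          ≈ 0.00833 (BM25 only)

    # Code-aware tokenization: camelCase is split, snake_case identifiers survive intact
    print(hs._tokenize("EmbeddingGenerator.generate_embedding"))
    # ['embedding', 'generator', 'generate_embedding']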