mcp-code-indexer 4.2.15__py3-none-any.whl → 4.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/database.py +251 -85
  2. mcp_code_indexer/database/models.py +66 -24
  3. mcp_code_indexer/database/retry_executor.py +15 -5
  4. mcp_code_indexer/file_scanner.py +107 -12
  5. mcp_code_indexer/main.py +43 -30
  6. mcp_code_indexer/server/mcp_server.py +191 -1
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  9. mcp_code_indexer/vector_mode/config.py +113 -45
  10. mcp_code_indexer/vector_mode/const.py +24 -0
  11. mcp_code_indexer/vector_mode/daemon.py +860 -98
  12. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  13. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  14. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  15. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  16. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  17. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  18. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  19. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  20. mcp_code_indexer/vector_mode/types.py +46 -0
  21. mcp_code_indexer/vector_mode/utils.py +50 -0
  22. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
  23. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
  24. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
  25. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
  26. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py (new file)
@@ -0,0 +1,459 @@
+"""
+Vector Mode Tools Service for similarity search operations.
+
+Orchestrates ASTChunker, EmbeddingService, and VectorStorageService to provide
+find_similar_code functionality for both code snippets and file sections.
+"""
+
+import asyncio
+import logging
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+
+from turbopuffer.types import Row
+
+from .. import is_vector_mode_available
+from ..chunking.ast_chunker import ASTChunker, CodeChunk
+from ..config import VectorConfig, load_vector_config
+from ..const import MODEL_DIMENSIONS
+from ..providers.voyage_client import VoyageClient
+from ..providers.turbopuffer_client import TurbopufferClient
+from .embedding_service import EmbeddingService
+from .vector_storage_service import VectorStorageService
+
+logger = logging.getLogger(__name__)
+
+
+class VectorModeToolsService:
+    """
+    Service for vector-based code similarity search operations.
+
+    Provides find_similar_code functionality that handles both code snippet
+    and file section inputs, orchestrating chunking, embedding, and search.
+    """
+
+    def __init__(self):
+        """Initialize VectorModeToolsService and set up vector mode dependencies."""
+        self.ast_chunker: Optional[ASTChunker] = None
+        self.embedding_service: Optional[EmbeddingService] = None
+        self.vector_storage_service: Optional[VectorStorageService] = None
+        self.config: Optional[VectorConfig] = None
+        self._initialized = False
+
+    def _ensure_initialized(self) -> None:
+        """Ensure vector mode services are initialized."""
+        if self._initialized:
+            return
+
+        # Check if vector mode dependencies are available
+        if not is_vector_mode_available():
+            raise RuntimeError("Vector mode dependencies are not available")
+
+        # Load vector configuration
+        self.config = load_vector_config()
+
+        # Initialize clients
+        voyage_client = VoyageClient(
+            api_key=self.config.voyage_api_key, model=self.config.embedding_model
+        )
+
+        turbopuffer_client = TurbopufferClient(
+            api_key=self.config.turbopuffer_api_key,
+            region=self.config.turbopuffer_region,
+        )
+
+        # Get embedding dimension for the model
+        embedding_dimension = MODEL_DIMENSIONS.get(self.config.embedding_model, 1024)
+
+        # Initialize services
+        self.ast_chunker = ASTChunker()
+        self.embedding_service = EmbeddingService(voyage_client, self.config)
+        self.vector_storage_service = VectorStorageService(
+            turbopuffer_client, embedding_dimension, self.config
+        )
+
+        self._initialized = True
+
+        logger.info(
+            "Vector mode services initialized",
+            extra={
+                "structured_data": {
+                    "embedding_model": self.config.embedding_model,
+                    "embedding_dimension": embedding_dimension,
+                    "batch_size": self.config.batch_size,
+                }
+            },
+        )
+
+    async def find_similar_code(
+        self,
+        project_name: str,
+        folder_path: str,
+        code_snippet: Optional[str] = None,
+        file_path: Optional[str] = None,
+        line_start: Optional[int] = None,
+        line_end: Optional[int] = None,
+        similarity_threshold: Optional[float] = None,
+        max_results: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """
+        Find code similar to a given snippet or file section.
+
+        Args:
+            project_name: Name of the project to search in
+            folder_path: Root folder path of the project
+            code_snippet: Direct code snippet to search for (mutually exclusive with file_path)
+            file_path: Path to file containing code to search for (requires line_start/line_end)
+            line_start: Starting line number for file section (1-indexed)
+            line_end: Ending line number for file section (1-indexed)
+            similarity_threshold: Minimum similarity score (defaults to config value)
+            max_results: Maximum number of results (defaults to config value)
+
+        Returns:
+            Dictionary containing search results and metadata
+
+        Raises:
+            ValueError: If input validation fails
+            RuntimeError: If search operations fail
+        """
+        # Validate mutually exclusive inputs
+        if code_snippet and file_path:
+            raise ValueError("Cannot specify both code_snippet and file_path")
+
+        if not code_snippet and not file_path:
+            raise ValueError(
+                "Must specify either code_snippet or file_path with line range"
+            )
+
+        if file_path and (line_start is None or line_end is None):
+            raise ValueError("file_path requires both line_start and line_end")
+
+        # Ensure services are initialized
+        self._ensure_initialized()
+
+        # Use config defaults if not specified
+        similarity_threshold = similarity_threshold or self.config.similarity_threshold
+        max_results = max_results or self.config.max_search_results
+
+        logger.info(
+            "Starting code similarity search",
+            extra={
+                "structured_data": {
+                    "project_name": project_name,
+                    "has_code_snippet": bool(code_snippet),
+                    "file_path": file_path,
+                    "line_range": f"{line_start}-{line_end}" if line_start else None,
+                    "similarity_threshold": similarity_threshold,
+                    "max_results": max_results,
+                }
+            },
+        )
+
+        try:
+            # Get query code content
+            if code_snippet:
+                query_code = code_snippet
+                query_source = "code_snippet"
+            else:
+                query_code = await self._read_file_section(
+                    folder_path, file_path, line_start, line_end
+                )
+                query_source = f"{file_path}:{line_start}-{line_end}"
+
+            # Chunk the query code
+            query_chunks = await self._chunk_code(query_code, query_source)
+
+            if not query_chunks:
+                logger.warning(
+                    "No chunks generated from query code",
+                    extra={"structured_data": {"query_source": query_source}},
+                )
+                return {
+                    "results": [],
+                    "total_results": 0,
+                    "query_info": {
+                        "source": query_source,
+                        "chunks_generated": 0,
+                        "similarity_threshold": similarity_threshold,
+                    },
+                    "message": "No valid code chunks could be generated from the input",
+                }
+
+            # Generate embeddings for query chunks
+            query_embeddings = (
+                await self.embedding_service.generate_embeddings_for_chunks(
+                    query_chunks, project_name, Path(query_source)
+                )
+            )
+
+            # Search for similar code using each query chunk
+            # Get more results per chunk to allow for deduplication and filtering
+            results_per_chunk = min(
+                max_results * 2, 50
+            )  # Cap at 50 per chunk to avoid excessive results
+
+            all_results: List[Row] = []
+            for i, (chunk, embedding) in enumerate(zip(query_chunks, query_embeddings)):
+                logger.debug(
+                    f"Searching with query chunk {i+1}/{len(query_chunks)}",
+                    extra={
+                        "structured_data": {
+                            "chunk_type": chunk.chunk_type.value,
+                            "chunk_name": chunk.name,
+                        }
+                    },
+                )
+
+                chunk_results = await self.vector_storage_service.search_similar_chunks(
+                    query_embedding=embedding,
+                    project_name=project_name,
+                    top_k=results_per_chunk,
+                )
+
+                if chunk_results:
+                    all_results.extend(chunk_results)
+
+            # Filter by similarity threshold and deduplicate
+            filtered_results = self._filter_and_deduplicate_results(
+                all_results, similarity_threshold, max_results
+            )
+
+            logger.info(
+                "Code similarity search completed",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "query_chunks": len(query_chunks),
+                        "raw_results": len(all_results),
+                        "filtered_results": len(filtered_results),
+                        "similarity_threshold": similarity_threshold,
+                    }
+                },
+            )
+
+            return {
+                "results": filtered_results,
+                "total_results": len(filtered_results),
+                "query_info": {
+                    "source": query_source,
+                    "chunks_generated": len(query_chunks),
+                    "similarity_threshold": similarity_threshold,
+                    "max_results": max_results,
+                },
+            }
+
+        except Exception as e:
+            logger.error(
+                "Code similarity search failed",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "query_source": (
+                            query_source if "query_source" in locals() else "unknown"
+                        ),
+                        "error": str(e),
+                    }
+                },
+                exc_info=True,
+            )
+            raise RuntimeError(f"Similarity search failed: {e}") from e
+
+    async def _read_file_section(
+        self, folder_path: str, file_path: str, line_start: int, line_end: int
+    ) -> str:
+        """
+        Read a specific section of a file.
+
+        Args:
+            folder_path: Root folder path
+            file_path: Relative path to the file
+            line_start: Starting line number (1-indexed)
+            line_end: Ending line number (1-indexed, inclusive)
+
+        Returns:
+            Content of the specified file section
+
+        Raises:
+            ValueError: If file path is invalid or lines are out of range
+            RuntimeError: If file cannot be read
+        """
+        try:
+            # Resolve file path safely
+            folder_path_obj = Path(folder_path).expanduser().resolve()
+            file_path_obj = folder_path_obj / file_path
+
+            # Security check: ensure file is within project folder
+            if not str(file_path_obj.resolve()).startswith(str(folder_path_obj)):
+                raise ValueError(f"File path {file_path} is outside project folder")
+
+            if not file_path_obj.exists():
+                raise ValueError(f"File not found: {file_path}")
+
+            # Read file content
+            content = file_path_obj.read_text(encoding="utf-8", errors="replace")
+            lines = content.splitlines()
+
+            # Validate line range
+            if line_start < 1 or line_end < 1:
+                raise ValueError("Line numbers must be >= 1")
+
+            if line_start > len(lines):
+                raise ValueError(
+                    f"line_start {line_start} exceeds file length {len(lines)}"
+                )
+
+            # Clamp line_end to file length
+            actual_line_end = min(line_end, len(lines))
+            if line_end > len(lines):
+                logger.warning(
+                    f"line_end {line_end} exceeds file length {len(lines)}, clamping to {actual_line_end}"
+                )
+
+            # Extract section (convert to 0-based indexing)
+            section_lines = lines[line_start - 1 : actual_line_end]
+            return "\n".join(section_lines)
+
+        except ValueError:
+            raise
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to read file section {file_path}:{line_start}-{line_end}: {e}"
+            ) from e
+
+    async def _chunk_code(
+        self, code_content: str, source_identifier: str
+    ) -> List[CodeChunk]:
+        """
+        Chunk code content using AST-based analysis.
+
+        Args:
+            code_content: Code to chunk
+            source_identifier: Identifier for logging (e.g., filename or "code_snippet")
+
+        Returns:
+            List of code chunks
+
+        Raises:
+            RuntimeError: If chunking fails
+        """
+        try:
+            # Attempt to detect language from source or content
+            language = "python"  # Default fallback
+            # TODO: remove. This is handled by the language handler in ASTChunker
+            if "." in source_identifier:
+                ext = Path(source_identifier).suffix.lower()
+                if ext in [".py"]:
+                    language = "python"
+                elif ext in [".js", ".ts"]:
+                    language = "javascript"
+                elif ext in [".java"]:
+                    language = "java"
+                elif ext in [".cpp", ".cc", ".cxx"]:
+                    language = "cpp"
+                elif ext in [".c"]:
+                    language = "c"
+
+            # Run in executor to avoid blocking the event loop (CPU-bound work)
+            loop = asyncio.get_running_loop()
+
+            def do_chunk() -> List[CodeChunk]:
+                return self.ast_chunker.chunk_content(
+                    content=code_content,
+                    file_path=source_identifier,
+                    language=language,
+                )
+
+            chunks = await loop.run_in_executor(None, do_chunk)
+
+            logger.debug(
+                f"Generated {len(chunks)} chunks from {source_identifier}",
+                extra={
+                    "structured_data": {
+                        "source": source_identifier,
+                        "language": language,
+                        "chunk_count": len(chunks),
+                    }
+                },
+            )
+
+            return chunks
+
+        except Exception as e:
+            logger.error(
+                f"Failed to chunk code from {source_identifier}: {e}",
+                extra={"structured_data": {"source": source_identifier}},
+                exc_info=True,
+            )
+            raise RuntimeError(f"Code chunking failed: {e}") from e
+
+    def _filter_and_deduplicate_results(
+        self,
+        results: List[Row],
+        similarity_threshold: float,
+        max_results: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Filter results by similarity threshold and remove duplicates.
+
+        Args:
+            results: Raw search results (turbopuffer Row objects)
+            similarity_threshold: Minimum similarity score
+            max_results: Maximum number of results to return
+
+        Returns:
+            Filtered and deduplicated results as dictionaries
+        """
+        processed_results = []
+
+        for row in results:
+            if row is None:
+                continue
+
+            # Extract similarity score (turbopuffer uses $dist for distance)
+            # Lower distance = higher similarity, so convert: similarity = 1 - distance
+            distance = getattr(row, "$dist", 1.0)
+            similarity = 1.0 - distance
+
+            # Filter by similarity threshold
+            if similarity < similarity_threshold:
+                continue
+
+            # Convert to result dictionary
+            file_path = getattr(row, "file_path", "")
+            file_name = Path(file_path).name if file_path else ""
+
+            result_dict = {
+                "file_name": file_name,
+                "start_line": getattr(row, "start_line", 0),
+                "end_line": getattr(row, "end_line", 0),
+                "score": similarity,
+                "content": getattr(row, "content", ""),
+                "metadata": {
+                    "file_path": file_path,
+                    "content_hash": getattr(row, "content_hash", ""),
+                    "chunk_type": getattr(row, "chunk_type", ""),
+                },
+            }
+            processed_results.append(result_dict)
+
+        # Sort by similarity score (descending)
+        processed_results.sort(key=lambda x: x["score"], reverse=True)
+
+        # Deduplicate by file_path + content hash to avoid duplicate chunks
+        seen = set()
+        deduplicated = []
+
+        for result in processed_results:
+            metadata = result["metadata"]
+            file_path = metadata["file_path"]
+            content_hash = metadata["content_hash"]
+            dedup_key = f"{file_path}:{content_hash}"
+
+            if dedup_key not in seen:
+                seen.add(dedup_key)
+                deduplicated.append(result)
+
+            if len(deduplicated) >= max_results:
+                break
+
+        return deduplicated
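
For orientation, a minimal usage sketch of the new service (illustrative only, not part of the diff). It assumes load_vector_config() can resolve the Voyage and Turbopuffer credentials referenced above into voyage_api_key and turbopuffer_api_key; the project name and paths below are hypothetical placeholders:

import asyncio

from mcp_code_indexer.vector_mode.services.vector_mode_tools_service import (
    VectorModeToolsService,
)


async def main() -> None:
    service = VectorModeToolsService()  # dependencies are initialized lazily on first call

    # Variant 1: search with a direct snippet.
    by_snippet = await service.find_similar_code(
        project_name="example-project",       # hypothetical project name
        folder_path="~/src/example-project",  # hypothetical project root
        code_snippet="def load_config(path):\n    ...",
        similarity_threshold=0.7,
        max_results=10,
    )

    # Variant 2: search with a file section (1-indexed, inclusive line range).
    by_section = await service.find_similar_code(
        project_name="example-project",
        folder_path="~/src/example-project",
        file_path="src/config.py",  # hypothetical path, relative to folder_path
        line_start=10,
        line_end=42,
    )

    for result in by_snippet["results"] + by_section["results"]:
        print(result["score"], result["file_name"], result["start_line"])


asyncio.run(main())

Because _filter_and_deduplicate_results converts turbopuffer's $dist via similarity = 1 - distance, a similarity_threshold of 0.7 admits only rows whose reported distance is at most 0.3.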