mcp-vector-search 0.12.6 (mcp_vector_search-0.12.6-py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. mcp_vector_search/__init__.py +10 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/auto_index.py +397 -0
  5. mcp_vector_search/cli/commands/config.py +393 -0
  6. mcp_vector_search/cli/commands/demo.py +358 -0
  7. mcp_vector_search/cli/commands/index.py +744 -0
  8. mcp_vector_search/cli/commands/init.py +645 -0
  9. mcp_vector_search/cli/commands/install.py +675 -0
  10. mcp_vector_search/cli/commands/install_old.py +696 -0
  11. mcp_vector_search/cli/commands/mcp.py +1182 -0
  12. mcp_vector_search/cli/commands/reset.py +393 -0
  13. mcp_vector_search/cli/commands/search.py +773 -0
  14. mcp_vector_search/cli/commands/status.py +549 -0
  15. mcp_vector_search/cli/commands/uninstall.py +485 -0
  16. mcp_vector_search/cli/commands/visualize.py +1467 -0
  17. mcp_vector_search/cli/commands/watch.py +287 -0
  18. mcp_vector_search/cli/didyoumean.py +500 -0
  19. mcp_vector_search/cli/export.py +320 -0
  20. mcp_vector_search/cli/history.py +295 -0
  21. mcp_vector_search/cli/interactive.py +342 -0
  22. mcp_vector_search/cli/main.py +461 -0
  23. mcp_vector_search/cli/output.py +412 -0
  24. mcp_vector_search/cli/suggestions.py +375 -0
  25. mcp_vector_search/config/__init__.py +1 -0
  26. mcp_vector_search/config/constants.py +24 -0
  27. mcp_vector_search/config/defaults.py +200 -0
  28. mcp_vector_search/config/settings.py +134 -0
  29. mcp_vector_search/core/__init__.py +1 -0
  30. mcp_vector_search/core/auto_indexer.py +298 -0
  31. mcp_vector_search/core/connection_pool.py +360 -0
  32. mcp_vector_search/core/database.py +1214 -0
  33. mcp_vector_search/core/directory_index.py +318 -0
  34. mcp_vector_search/core/embeddings.py +294 -0
  35. mcp_vector_search/core/exceptions.py +89 -0
  36. mcp_vector_search/core/factory.py +318 -0
  37. mcp_vector_search/core/git_hooks.py +345 -0
  38. mcp_vector_search/core/indexer.py +1002 -0
  39. mcp_vector_search/core/models.py +294 -0
  40. mcp_vector_search/core/project.py +333 -0
  41. mcp_vector_search/core/scheduler.py +330 -0
  42. mcp_vector_search/core/search.py +952 -0
  43. mcp_vector_search/core/watcher.py +322 -0
  44. mcp_vector_search/mcp/__init__.py +5 -0
  45. mcp_vector_search/mcp/__main__.py +25 -0
  46. mcp_vector_search/mcp/server.py +733 -0
  47. mcp_vector_search/parsers/__init__.py +8 -0
  48. mcp_vector_search/parsers/base.py +296 -0
  49. mcp_vector_search/parsers/dart.py +605 -0
  50. mcp_vector_search/parsers/html.py +413 -0
  51. mcp_vector_search/parsers/javascript.py +643 -0
  52. mcp_vector_search/parsers/php.py +694 -0
  53. mcp_vector_search/parsers/python.py +502 -0
  54. mcp_vector_search/parsers/registry.py +223 -0
  55. mcp_vector_search/parsers/ruby.py +678 -0
  56. mcp_vector_search/parsers/text.py +186 -0
  57. mcp_vector_search/parsers/utils.py +265 -0
  58. mcp_vector_search/py.typed +1 -0
  59. mcp_vector_search/utils/__init__.py +40 -0
  60. mcp_vector_search/utils/gitignore.py +250 -0
  61. mcp_vector_search/utils/monorepo.py +277 -0
  62. mcp_vector_search/utils/timing.py +334 -0
  63. mcp_vector_search/utils/version.py +47 -0
  64. mcp_vector_search-0.12.6.dist-info/METADATA +754 -0
  65. mcp_vector_search-0.12.6.dist-info/RECORD +68 -0
  66. mcp_vector_search-0.12.6.dist-info/WHEEL +4 -0
  67. mcp_vector_search-0.12.6.dist-info/entry_points.txt +2 -0
  68. mcp_vector_search-0.12.6.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1214 @@
1
+ """Database abstraction and ChromaDB implementation for MCP Vector Search."""
2
+
3
+ import asyncio
4
+ import json
5
+ import shutil
6
+ from abc import ABC, abstractmethod
7
+ from pathlib import Path
8
+ from typing import Any, Protocol, runtime_checkable
9
+
10
+ from loguru import logger
11
+
12
+ from .connection_pool import ChromaConnectionPool
13
+ from .exceptions import (
14
+ DatabaseError,
15
+ DatabaseInitializationError,
16
+ DatabaseNotInitializedError,
17
+ DocumentAdditionError,
18
+ IndexCorruptionError,
19
+ SearchError,
20
+ )
21
+ from .models import CodeChunk, IndexStats, SearchResult
22
+
23
+
24
+ @runtime_checkable
25
+ class EmbeddingFunction(Protocol):
26
+ """Protocol for embedding functions."""
27
+
28
+ def __call__(self, texts: list[str]) -> list[list[float]]:
29
+ """Generate embeddings for input texts."""
30
+ ...
31
+
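Any callable that maps a list of strings to a list of float vectors satisfies this protocol. As a minimal illustration only (not part of the package, and assuming the sentence-transformers library), a conforming implementation could look like:

# Illustrative sketch of an EmbeddingFunction implementation; not part of this package.
from sentence_transformers import SentenceTransformer  # assumed dependency for this example

class ExampleEmbedding:
    """Wraps a SentenceTransformer model behind the EmbeddingFunction protocol."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        self._model = SentenceTransformer(model_name)

    def __call__(self, texts: list[str]) -> list[list[float]]:
        # encode() returns a numpy array; convert to plain nested lists of floats
        return self._model.encode(texts).tolist()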
32
+
33
+ class VectorDatabase(ABC):
34
+ """Abstract interface for vector database operations."""
35
+
36
+ @abstractmethod
37
+ async def initialize(self) -> None:
38
+ """Initialize the database connection and collections."""
39
+ ...
40
+
41
+ @abstractmethod
42
+ async def close(self) -> None:
43
+ """Close database connections and cleanup resources."""
44
+ ...
45
+
46
+ @abstractmethod
47
+ async def add_chunks(self, chunks: list[CodeChunk]) -> None:
48
+ """Add code chunks to the database.
49
+
50
+ Args:
51
+ chunks: List of code chunks to add
52
+ """
53
+ ...
54
+
55
+ @abstractmethod
56
+ async def search(
57
+ self,
58
+ query: str,
59
+ limit: int = 10,
60
+ filters: dict[str, Any] | None = None,
61
+ similarity_threshold: float = 0.7,
62
+ ) -> list[SearchResult]:
63
+ """Search for similar code chunks.
64
+
65
+ Args:
66
+ query: Search query
67
+ limit: Maximum number of results
68
+ filters: Optional filters to apply
69
+ similarity_threshold: Minimum similarity score
70
+
71
+ Returns:
72
+ List of search results
73
+ """
74
+ ...
75
+
76
+ @abstractmethod
77
+ async def delete_by_file(self, file_path: Path) -> int:
78
+ """Delete all chunks for a specific file.
79
+
80
+ Args:
81
+ file_path: Path to the file
82
+
83
+ Returns:
84
+ Number of deleted chunks
85
+ """
86
+ ...
87
+
88
+ @abstractmethod
89
+ async def get_stats(self) -> IndexStats:
90
+ """Get database statistics.
91
+
92
+ Returns:
93
+ Index statistics
94
+ """
95
+ ...
96
+
97
+ @abstractmethod
98
+ async def reset(self) -> None:
99
+ """Reset the database (delete all data)."""
100
+ ...
101
+
102
+ @abstractmethod
103
+ async def get_all_chunks(self) -> list[CodeChunk]:
104
+ """Get all chunks from the database.
105
+
106
+ Returns:
107
+ List of all code chunks with metadata
108
+ """
109
+ ...
110
+
111
+ @abstractmethod
112
+ async def health_check(self) -> bool:
113
+ """Check database health and integrity.
114
+
115
+ Returns:
116
+ True if database is healthy, False otherwise
117
+ """
118
+ ...
119
+
120
+ async def __aenter__(self) -> "VectorDatabase":
121
+ """Async context manager entry."""
122
+ await self.initialize()
123
+ return self
124
+
125
+ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
126
+ """Async context manager exit."""
127
+ await self.close()
128
+
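Because `__aenter__` and `__aexit__` delegate to `initialize()` and `close()`, any concrete implementation can be driven with `async with`. A usage sketch (illustrative only; `embed_fn` is assumed to be an EmbeddingFunction, and ChromaVectorDatabase is the concrete class defined below):

import asyncio
from pathlib import Path

async def demo(embed_fn: EmbeddingFunction) -> None:
    # initialize() runs on entry, close() on exit, even if the body raises
    async with ChromaVectorDatabase(Path(".index"), embed_fn) as db:
        results = await db.search("load configuration from disk", limit=5)
        for r in results:
            print(r.file_path, r.start_line, f"{r.similarity_score:.2f}")

# asyncio.run(demo(embed_fn))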
129
+
130
+ class ChromaVectorDatabase(VectorDatabase):
131
+ """ChromaDB implementation of vector database."""
132
+
133
+ def __init__(
134
+ self,
135
+ persist_directory: Path,
136
+ embedding_function: EmbeddingFunction,
137
+ collection_name: str = "code_search",
138
+ ) -> None:
139
+ """Initialize ChromaDB vector database.
140
+
141
+ Args:
142
+ persist_directory: Directory to persist database
143
+ embedding_function: Function to generate embeddings
144
+ collection_name: Name of the collection
145
+ """
146
+ self.persist_directory = persist_directory
147
+ self.embedding_function = embedding_function
148
+ self.collection_name = collection_name
149
+ self._client = None
150
+ self._collection = None
151
+
152
+ async def initialize(self) -> None:
153
+ """Initialize ChromaDB client and collection with corruption recovery."""
154
+ try:
155
+ import chromadb
156
+
157
+ # Ensure directory exists
158
+ self.persist_directory.mkdir(parents=True, exist_ok=True)
159
+
160
+ # Check for corruption before initializing
161
+ await self._detect_and_recover_corruption()
162
+
163
+ # Create client with new API
164
+ self._client = chromadb.PersistentClient(
165
+ path=str(self.persist_directory),
166
+ settings=chromadb.Settings(
167
+ anonymized_telemetry=False,
168
+ allow_reset=True,
169
+ ),
170
+ )
171
+
172
+ # Create or get collection
173
+ self._collection = self._client.get_or_create_collection(
174
+ name=self.collection_name,
175
+ embedding_function=self.embedding_function,
176
+ metadata={
177
+ "description": "Semantic code search collection",
178
+ },
179
+ )
180
+
181
+ logger.debug(f"ChromaDB initialized at {self.persist_directory}")
182
+
183
+ except Exception as e:
184
+ # Check if this is a corruption error
185
+ error_msg = str(e).lower()
186
+ if any(
187
+ indicator in error_msg
188
+ for indicator in [
189
+ "pickle",
190
+ "unpickling",
191
+ "eof",
192
+ "ran out of input",
193
+ "hnsw",
194
+ "index",
195
+ "deserialize",
196
+ "corrupt",
197
+ ]
198
+ ):
199
+ logger.warning(f"Detected index corruption: {e}")
200
+ # Try to recover
201
+ await self._recover_from_corruption()
202
+ # Retry initialization
203
+ await self.initialize()
204
+ else:
205
+ logger.error(f"Failed to initialize ChromaDB: {e}")
206
+ raise DatabaseInitializationError(
207
+ f"ChromaDB initialization failed: {e}"
208
+ ) from e
209
+
210
+ async def remove_file_chunks(self, file_path: str) -> int:
211
+ """Remove all chunks for a specific file.
212
+
213
+ Args:
214
+ file_path: Relative path to the file
215
+
216
+ Returns:
217
+ Number of chunks removed
218
+ """
219
+ if not self._collection:
220
+ raise DatabaseNotInitializedError("Database not initialized")
221
+
222
+ try:
223
+ # Get all chunks for this file
224
+ results = self._collection.get(where={"file_path": file_path})
225
+
226
+ if not results["ids"]:
227
+ return 0
228
+
229
+ # Delete the chunks
230
+ self._collection.delete(ids=results["ids"])
231
+
232
+ removed_count = len(results["ids"])
233
+ logger.debug(f"Removed {removed_count} chunks for file: {file_path}")
234
+ return removed_count
235
+
236
+ except Exception as e:
237
+ logger.error(f"Failed to remove chunks for file {file_path}: {e}")
238
+ return 0
239
+
240
+ async def close(self) -> None:
241
+ """Close database connections."""
242
+ if self._client:
243
+ # ChromaDB doesn't require explicit closing
244
+ self._client = None
245
+ self._collection = None
246
+ logger.debug("ChromaDB connections closed")
247
+
248
+ async def add_chunks(self, chunks: list[CodeChunk]) -> None:
249
+ """Add code chunks to the database."""
250
+ if not self._collection:
251
+ raise DatabaseNotInitializedError("Database not initialized")
252
+
253
+ if not chunks:
254
+ return
255
+
256
+ try:
257
+ documents = []
258
+ metadatas = []
259
+ ids = []
260
+
261
+ for chunk in chunks:
262
+ # Create searchable text
263
+ searchable_text = self._create_searchable_text(chunk)
264
+ documents.append(searchable_text)
265
+
266
+ # Create metadata
267
+ metadata = {
268
+ "file_path": str(chunk.file_path),
269
+ "start_line": chunk.start_line,
270
+ "end_line": chunk.end_line,
271
+ "language": chunk.language,
272
+ "chunk_type": chunk.chunk_type,
273
+ "function_name": chunk.function_name or "",
274
+ "class_name": chunk.class_name or "",
275
+ "docstring": chunk.docstring or "",
276
+ "complexity_score": chunk.complexity_score,
277
+ # Hierarchy fields (convert lists to JSON strings for ChromaDB)
278
+ "chunk_id": chunk.chunk_id or "",
279
+ "parent_chunk_id": chunk.parent_chunk_id or "",
280
+ "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
281
+ "chunk_depth": chunk.chunk_depth,
282
+ # Additional metadata (convert lists/dicts to JSON strings)
283
+ "decorators": json.dumps(chunk.decorators or []),
284
+ "parameters": json.dumps(chunk.parameters or []),
285
+ "return_type": chunk.return_type or "",
286
+ "type_annotations": json.dumps(chunk.type_annotations or {}),
287
+ # Monorepo support
288
+ "subproject_name": chunk.subproject_name or "",
289
+ "subproject_path": chunk.subproject_path or "",
290
+ }
291
+ metadatas.append(metadata)
292
+
293
+ # Use chunk ID
294
+ ids.append(chunk.id)
295
+
296
+ # Add to collection
297
+ self._collection.add(
298
+ documents=documents,
299
+ metadatas=metadatas,
300
+ ids=ids,
301
+ )
302
+
303
+ logger.debug(f"Added {len(chunks)} chunks to database")
304
+
305
+ except Exception as e:
306
+ logger.error(f"Failed to add chunks: {e}")
307
+ raise DocumentAdditionError(f"Failed to add chunks: {e}") from e
308
+
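ChromaDB metadata values must be scalar types (str, int, float, bool), which is why the list- and dict-valued fields above are stored with `json.dumps` and decoded again with `json.loads` in `get_all_chunks()`. A minimal sketch of that round-trip (values are illustrative):

import json

stored = json.dumps(["staticmethod", "lru_cache"])  # what add_chunks() writes for "decorators"
decorators = json.loads(stored)                      # what get_all_chunks() reads back
assert decorators == ["staticmethod", "lru_cache"]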
309
+ async def search(
310
+ self,
311
+ query: str,
312
+ limit: int = 10,
313
+ filters: dict[str, Any] | None = None,
314
+ similarity_threshold: float = 0.7,
315
+ ) -> list[SearchResult]:
316
+ """Search for similar code chunks."""
317
+ if not self._collection:
318
+ raise DatabaseNotInitializedError("Database not initialized")
319
+
320
+ try:
321
+ # Build where clause
322
+ where_clause = self._build_where_clause(filters) if filters else None
323
+
324
+ # Perform search
325
+ results = self._collection.query(
326
+ query_texts=[query],
327
+ n_results=limit,
328
+ where=where_clause,
329
+ include=["documents", "metadatas", "distances"],
330
+ )
331
+
332
+ # Process results
333
+ search_results = []
334
+
335
+ if results["documents"] and results["documents"][0]:
336
+ for i, (doc, metadata, distance) in enumerate(
337
+ zip(
338
+ results["documents"][0],
339
+ results["metadatas"][0],
340
+ results["distances"][0],
341
+ strict=False,
342
+ )
343
+ ):
344
+ # Convert distance to similarity (ChromaDB uses cosine distance)
345
+ # For cosine distance, use a more permissive conversion that handles distances > 1.0
346
+ # Convert to a 0-1 similarity score where lower distances = higher similarity
347
+ similarity = max(0.0, 1.0 / (1.0 + distance))
348
+
349
+ if similarity >= similarity_threshold:
350
+ result = SearchResult(
351
+ content=doc,
352
+ file_path=Path(metadata["file_path"]),
353
+ start_line=metadata["start_line"],
354
+ end_line=metadata["end_line"],
355
+ language=metadata["language"],
356
+ similarity_score=similarity,
357
+ rank=i + 1,
358
+ chunk_type=metadata.get("chunk_type", "code"),
359
+ function_name=metadata.get("function_name") or None,
360
+ class_name=metadata.get("class_name") or None,
361
+ )
362
+ search_results.append(result)
363
+
364
+ logger.debug(f"Found {len(search_results)} results for query: {query}")
365
+ return search_results
366
+
367
+ except Exception as e:
368
+ logger.error(f"Search failed: {e}")
369
+ raise SearchError(f"Search failed: {e}") from e
370
+
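For reference, the `1 / (1 + distance)` conversion used above yields the following values, so the default threshold of 0.7 only admits results whose distance is below roughly 0.43:

# distance 0.0 -> similarity 1.00
# distance 0.5 -> similarity 0.67
# distance 1.0 -> similarity 0.50
# distance 3.0 -> similarity 0.25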
371
+ async def delete_by_file(self, file_path: Path) -> int:
372
+ """Delete all chunks for a specific file."""
373
+ if not self._collection:
374
+ raise DatabaseNotInitializedError("Database not initialized")
375
+
376
+ try:
377
+ # Get all chunks for this file
378
+ results = self._collection.get(
379
+ where={"file_path": str(file_path)},
380
+ include=["metadatas"],
381
+ )
382
+
383
+ if results["ids"]:
384
+ self._collection.delete(ids=results["ids"])
385
+ count = len(results["ids"])
386
+ logger.debug(f"Deleted {count} chunks for {file_path}")
387
+ return count
388
+
389
+ return 0
390
+
391
+ except Exception as e:
392
+ logger.error(f"Failed to delete chunks for {file_path}: {e}")
393
+ raise DatabaseError(f"Failed to delete chunks: {e}") from e
394
+
395
+ async def get_stats(self) -> IndexStats:
396
+ """Get database statistics with optimized chunked queries."""
397
+ if not self._collection:
398
+ raise DatabaseNotInitializedError("Database not initialized")
399
+
400
+ try:
401
+ # Get total count (fast operation)
402
+ count = self._collection.count()
403
+
404
+ if count == 0:
405
+ return IndexStats(
406
+ total_files=0,
407
+ total_chunks=0,
408
+ languages={},
409
+ file_types={},
410
+ index_size_mb=0.0,
411
+ last_updated="N/A",
412
+ embedding_model="unknown",
413
+ )
414
+
415
+ # Process in chunks to avoid loading everything at once
416
+ batch_size_limit = 1000
417
+
418
+ files = set()
419
+ language_counts: dict[str, int] = {}
420
+ file_type_counts: dict[str, int] = {}
421
+
422
+ offset = 0
423
+ while offset < count:
424
+ # Fetch batch
425
+ batch_size = min(batch_size_limit, count - offset)
426
+ logger.debug(
427
+ f"Processing database stats: batch {offset // batch_size_limit + 1}, "
428
+ f"{offset}-{offset + batch_size} of {count} chunks"
429
+ )
430
+
431
+ results = self._collection.get(
432
+ include=["metadatas"],
433
+ limit=batch_size,
434
+ offset=offset,
435
+ )
436
+
437
+ # Process batch metadata
438
+ for metadata in results.get("metadatas", []):
439
+ # Language stats
440
+ lang = metadata.get("language", "unknown")
441
+ language_counts[lang] = language_counts.get(lang, 0) + 1
442
+
443
+ # File stats
444
+ file_path = metadata.get("file_path", "")
445
+ if file_path:
446
+ files.add(file_path)
447
+ ext = Path(file_path).suffix or "no_extension"
448
+ file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
449
+
450
+ offset += batch_size
451
+
452
+ # Yield to event loop periodically to prevent blocking
453
+ await asyncio.sleep(0)
454
+
455
+ # Estimate index size (rough approximation: ~1KB per chunk)
456
+ index_size_mb = count * 0.001
457
+
458
+ return IndexStats(
459
+ total_files=len(files),
460
+ total_chunks=count,
461
+ languages=language_counts,
462
+ file_types=file_type_counts,
463
+ index_size_mb=index_size_mb,
464
+ last_updated="unknown",
465
+ embedding_model="unknown",
466
+ )
467
+
468
+ except Exception as e:
469
+ logger.error(f"Failed to get database statistics: {e}")
470
+ # Return empty stats instead of raising
471
+ return IndexStats(
472
+ total_files=0,
473
+ total_chunks=0,
474
+ languages={},
475
+ file_types={},
476
+ index_size_mb=0.0,
477
+ last_updated="error",
478
+ embedding_model="unknown",
479
+ )
480
+
481
+ async def reset(self) -> None:
482
+ """Reset the database."""
483
+ if self._client:
484
+ try:
485
+ self._client.reset()
486
+ # Recreate collection
487
+ await self.initialize()
488
+ logger.info("Database reset successfully")
489
+ except Exception as e:
490
+ logger.error(f"Failed to reset database: {e}")
491
+ raise DatabaseError(f"Failed to reset database: {e}") from e
492
+
493
+ async def get_all_chunks(self) -> list[CodeChunk]:
494
+ """Get all chunks from the database.
495
+
496
+ Returns:
497
+ List of all code chunks with metadata
498
+ """
499
+ if not self._collection:
500
+ raise DatabaseNotInitializedError("Database not initialized")
501
+
502
+ try:
503
+ # Get all documents from collection
504
+ results = self._collection.get(include=["metadatas", "documents"])
505
+
506
+ chunks = []
507
+ if results and results.get("ids"):
508
+ for i, _chunk_id in enumerate(results["ids"]):
509
+ metadata = results["metadatas"][i]
510
+ content = results["documents"][i]
511
+
512
+ # Parse JSON strings back to lists/dicts
513
+ child_chunk_ids = metadata.get("child_chunk_ids", "[]")
514
+ if isinstance(child_chunk_ids, str):
515
+ child_chunk_ids = json.loads(child_chunk_ids)
516
+
517
+ decorators = metadata.get("decorators", "[]")
518
+ if isinstance(decorators, str):
519
+ decorators = json.loads(decorators)
520
+
521
+ parameters = metadata.get("parameters", "[]")
522
+ if isinstance(parameters, str):
523
+ parameters = json.loads(parameters)
524
+
525
+ type_annotations = metadata.get("type_annotations", "{}")
526
+ if isinstance(type_annotations, str):
527
+ type_annotations = json.loads(type_annotations)
528
+
529
+ chunk = CodeChunk(
530
+ content=content,
531
+ file_path=Path(metadata["file_path"]),
532
+ start_line=metadata["start_line"],
533
+ end_line=metadata["end_line"],
534
+ language=metadata["language"],
535
+ chunk_type=metadata.get("chunk_type", "code"),
536
+ function_name=metadata.get("function_name"),
537
+ class_name=metadata.get("class_name"),
538
+ docstring=metadata.get("docstring"),
539
+ imports=metadata.get("imports", []),
540
+ complexity_score=metadata.get("complexity_score", 0.0),
541
+ chunk_id=metadata.get("chunk_id"),
542
+ parent_chunk_id=metadata.get("parent_chunk_id"),
543
+ child_chunk_ids=child_chunk_ids,
544
+ chunk_depth=metadata.get("chunk_depth", 0),
545
+ decorators=decorators,
546
+ parameters=parameters,
547
+ return_type=metadata.get("return_type"),
548
+ type_annotations=type_annotations,
549
+ subproject_name=metadata.get("subproject_name"),
550
+ subproject_path=metadata.get("subproject_path"),
551
+ )
552
+ chunks.append(chunk)
553
+
554
+ logger.debug(f"Retrieved {len(chunks)} chunks from database")
555
+ return chunks
556
+
557
+ except Exception as e:
558
+ logger.error(f"Failed to get all chunks: {e}")
559
+ raise DatabaseError(f"Failed to get all chunks: {e}") from e
560
+
561
+ def _create_searchable_text(self, chunk: CodeChunk) -> str:
562
+ """Create optimized searchable text from code chunk."""
563
+ parts = [chunk.content]
564
+
565
+ # Add contextual information
566
+ if chunk.function_name:
567
+ parts.append(f"Function: {chunk.function_name}")
568
+
569
+ if chunk.class_name:
570
+ parts.append(f"Class: {chunk.class_name}")
571
+
572
+ if chunk.docstring:
573
+ parts.append(f"Documentation: {chunk.docstring}")
574
+
575
+ # Add language and file context
576
+ parts.append(f"Language: {chunk.language}")
577
+ parts.append(f"File: {chunk.file_path.name}")
578
+
579
+ return "\n".join(parts)
580
+
581
+ def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
582
+ """Build ChromaDB where clause from filters."""
583
+ where = {}
584
+
585
+ for key, value in filters.items():
586
+ if isinstance(value, list):
587
+ where[key] = {"$in": value}
588
+ elif isinstance(value, str) and value.startswith("!"):
589
+ where[key] = {"$ne": value[1:]}
590
+ else:
591
+ where[key] = value
592
+
593
+ return where
594
+
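An illustrative translation performed by `_build_where_clause` (filter values are hypothetical):

filters = {"language": ["python", "ruby"], "chunk_type": "!comment", "file_path": "src/app.py"}
# becomes:
# {
#     "language": {"$in": ["python", "ruby"]},
#     "chunk_type": {"$ne": "comment"},
#     "file_path": "src/app.py",
# }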
595
+ async def _detect_and_recover_corruption(self) -> None:
596
+ """Detect and recover from index corruption proactively."""
597
+ # Check for common corruption indicators in ChromaDB files
598
+ chroma_db_path = self.persist_directory / "chroma.sqlite3"
599
+
600
+ # If database doesn't exist yet, nothing to check
601
+ if not chroma_db_path.exists():
602
+ return
603
+
604
+ # Check for HNSW index files that might be corrupted
605
+ # Note: a legacy "chroma-collections.parquet" file may also exist here; it is not inspected.
606
+ index_path = self.persist_directory / "index"
607
+
608
+ if index_path.exists():
609
+ # Look for pickle files in the index
610
+ pickle_files = list(index_path.glob("**/*.pkl"))
611
+ pickle_files.extend(list(index_path.glob("**/*.pickle")))
612
+
613
+ for pickle_file in pickle_files:
614
+ try:
615
+ # Try to read the pickle file to detect corruption
616
+ import pickle
617
+
618
+ with open(pickle_file, "rb") as f:
619
+ pickle.load(f)
620
+ except Exception as e:  # includes EOFError and pickle.UnpicklingError from corrupted files
621
+ logger.warning(
622
+ f"Corrupted index file detected: {pickle_file} - {e}"
623
+ )
624
+ await self._recover_from_corruption()
625
+ return
626
+
627
+ async def _recover_from_corruption(self) -> None:
628
+ """Recover from index corruption by rebuilding the index."""
629
+ logger.info("Attempting to recover from index corruption...")
630
+
631
+ # Create backup directory
632
+ backup_dir = (
633
+ self.persist_directory.parent / f"{self.persist_directory.name}_backup"
634
+ )
635
+ backup_dir.mkdir(exist_ok=True)
636
+
637
+ # Backup current state (in case we need it)
638
+ import time
639
+
640
+ timestamp = int(time.time())
641
+ backup_path = backup_dir / f"backup_{timestamp}"
642
+
643
+ if self.persist_directory.exists():
644
+ try:
645
+ shutil.copytree(self.persist_directory, backup_path)
646
+ logger.info(f"Created backup at {backup_path}")
647
+ except Exception as e:
648
+ logger.warning(f"Could not create backup: {e}")
649
+
650
+ # Clear the corrupted index
651
+ if self.persist_directory.exists():
652
+ try:
653
+ shutil.rmtree(self.persist_directory)
654
+ logger.info(f"Cleared corrupted index at {self.persist_directory}")
655
+ except Exception as e:
656
+ logger.error(f"Failed to clear corrupted index: {e}")
657
+ raise IndexCorruptionError(
658
+ f"Could not clear corrupted index: {e}"
659
+ ) from e
660
+
661
+ # Recreate the directory
662
+ self.persist_directory.mkdir(parents=True, exist_ok=True)
663
+ logger.info("Index directory recreated. Please re-index your codebase.")
664
+
665
+ async def health_check(self) -> bool:
666
+ """Check database health and integrity.
667
+
668
+ Returns:
669
+ True if database is healthy, False otherwise
670
+ """
671
+ try:
672
+ # First check if client is initialized
673
+ if not self._client or not self._collection:
674
+ logger.warning("Database not initialized")
675
+ return False
676
+
677
+ # Try a simple operation to test the connection
678
+ try:
679
+ # Attempt to get count - this will fail if index is corrupted
680
+ count = self._collection.count()
681
+ logger.debug(f"Health check passed: {count} chunks in database")
682
+
683
+ # Try a minimal query to ensure search works
684
+ self._collection.query(
685
+ query_texts=["test"], n_results=1, include=["metadatas"]
686
+ )
687
+
688
+ return True
689
+
690
+ except Exception as e:
691
+ error_msg = str(e).lower()
692
+ if any(
693
+ indicator in error_msg
694
+ for indicator in [
695
+ "pickle",
696
+ "unpickling",
697
+ "eof",
698
+ "ran out of input",
699
+ "hnsw",
700
+ "index",
701
+ "deserialize",
702
+ "corrupt",
703
+ ]
704
+ ):
705
+ logger.error(f"Index corruption detected during health check: {e}")
706
+ return False
707
+ else:
708
+ # Some other error
709
+ logger.warning(f"Health check failed: {e}")
710
+ return False
711
+
712
+ except Exception as e:
713
+ logger.error(f"Health check error: {e}")
714
+ return False
715
+
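A small usage sketch (illustrative only, not part of the package): if a health check fails, the database can be closed and re-initialized, since `initialize()` retries after clearing a corrupted index:

async def ensure_healthy(db: ChromaVectorDatabase) -> bool:
    if await db.health_check():
        return True
    # A failed check may indicate corruption; re-initialization goes through
    # the corruption detection and recovery path defined above.
    await db.close()
    await db.initialize()
    return await db.health_check()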
716
+
717
+ class PooledChromaVectorDatabase(VectorDatabase):
718
+ """ChromaDB implementation with connection pooling for improved performance."""
719
+
720
+ def __init__(
721
+ self,
722
+ persist_directory: Path,
723
+ embedding_function: EmbeddingFunction,
724
+ collection_name: str = "code_search",
725
+ max_connections: int = 10,
726
+ min_connections: int = 2,
727
+ max_idle_time: float = 300.0,
728
+ max_connection_age: float = 3600.0,
729
+ ) -> None:
730
+ """Initialize pooled ChromaDB vector database.
731
+
732
+ Args:
733
+ persist_directory: Directory to persist database
734
+ embedding_function: Function to generate embeddings
735
+ collection_name: Name of the collection
736
+ max_connections: Maximum number of connections in pool
737
+ min_connections: Minimum number of connections to maintain
738
+ max_idle_time: Maximum time a connection can be idle (seconds)
739
+ max_connection_age: Maximum age of a connection (seconds)
740
+ """
741
+ self.persist_directory = persist_directory
742
+ self.embedding_function = embedding_function
743
+ self.collection_name = collection_name
744
+
745
+ self._pool = ChromaConnectionPool(
746
+ persist_directory=persist_directory,
747
+ embedding_function=embedding_function,
748
+ collection_name=collection_name,
749
+ max_connections=max_connections,
750
+ min_connections=min_connections,
751
+ max_idle_time=max_idle_time,
752
+ max_connection_age=max_connection_age,
753
+ )
754
+
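A construction sketch for the pooled variant (all values are illustrative; `embed_fn` is assumed to satisfy EmbeddingFunction and Path comes from pathlib):

db = PooledChromaVectorDatabase(
    persist_directory=Path(".index"),
    embedding_function=embed_fn,
    collection_name="code_search",
    max_connections=4,          # maximum connections kept in the pool
    min_connections=1,          # minimum connections maintained
    max_idle_time=120.0,        # maximum idle time per connection (seconds)
    max_connection_age=1800.0,  # maximum age per connection (seconds)
)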
755
+ async def initialize(self) -> None:
756
+ """Initialize the connection pool."""
757
+ await self._pool.initialize()
758
+ logger.debug(f"Pooled ChromaDB initialized at {self.persist_directory}")
759
+
760
+ async def close(self) -> None:
761
+ """Close the connection pool."""
762
+ await self._pool.close()
763
+ logger.debug("Pooled ChromaDB connections closed")
764
+
765
+ async def add_chunks(self, chunks: list[CodeChunk]) -> None:
766
+ """Add code chunks to the database using pooled connection."""
767
+ if not chunks:
768
+ return
769
+
770
+ # Ensure pool is initialized
771
+ if not self._pool._initialized:
772
+ await self._pool.initialize()
773
+
774
+ try:
775
+ async with self._pool.get_connection() as conn:
776
+ # Prepare data for ChromaDB
777
+ documents = []
778
+ metadatas = []
779
+ ids = []
780
+
781
+ for chunk in chunks:
782
+ documents.append(chunk.content)
783
+ metadatas.append(
784
+ {
785
+ "file_path": str(chunk.file_path),
786
+ "start_line": chunk.start_line,
787
+ "end_line": chunk.end_line,
788
+ "language": chunk.language,
789
+ "chunk_type": chunk.chunk_type,
790
+ "function_name": chunk.function_name or "",
791
+ "class_name": chunk.class_name or "",
792
+ "docstring": chunk.docstring or "",
793
+ "complexity_score": chunk.complexity_score,
794
+ # Hierarchy fields (convert lists to JSON strings for ChromaDB)
795
+ "chunk_id": chunk.chunk_id or "",
796
+ "parent_chunk_id": chunk.parent_chunk_id or "",
797
+ "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
798
+ "chunk_depth": chunk.chunk_depth,
799
+ # Additional metadata (convert lists/dicts to JSON strings)
800
+ "decorators": json.dumps(chunk.decorators or []),
801
+ "parameters": json.dumps(chunk.parameters or []),
802
+ "return_type": chunk.return_type or "",
803
+ "type_annotations": json.dumps(
804
+ chunk.type_annotations or {}
805
+ ),
806
+ # Monorepo support
807
+ "subproject_name": chunk.subproject_name or "",
808
+ "subproject_path": chunk.subproject_path or "",
809
+ }
810
+ )
811
+ ids.append(chunk.id)
812
+
813
+ # Add to collection
814
+ conn.collection.add(documents=documents, metadatas=metadatas, ids=ids)
815
+
816
+ logger.debug(f"Added {len(chunks)} chunks to database")
817
+
818
+ except Exception as e:
819
+ logger.error(f"Failed to add chunks: {e}")
820
+ raise DocumentAdditionError(f"Failed to add chunks: {e}") from e
821
+
822
+ async def search(
823
+ self,
824
+ query: str,
825
+ limit: int = 10,
826
+ filters: dict[str, Any] | None = None,
827
+ similarity_threshold: float = 0.7,
828
+ ) -> list[SearchResult]:
829
+ """Search for similar code chunks using pooled connection."""
830
+ # Ensure pool is initialized
831
+ if not self._pool._initialized:
832
+ await self._pool.initialize()
833
+
834
+ try:
835
+ async with self._pool.get_connection() as conn:
836
+ # Build where clause
837
+ where_clause = self._build_where_clause(filters) if filters else None
838
+
839
+ # Perform search
840
+ results = conn.collection.query(
841
+ query_texts=[query],
842
+ n_results=limit,
843
+ where=where_clause,
844
+ include=["documents", "metadatas", "distances"],
845
+ )
846
+
847
+ # Process results
848
+ search_results = []
849
+
850
+ if results["documents"] and results["documents"][0]:
851
+ for i, (doc, metadata, distance) in enumerate(
852
+ zip(
853
+ results["documents"][0],
854
+ results["metadatas"][0],
855
+ results["distances"][0],
856
+ strict=False,
857
+ )
858
+ ):
859
+ # Convert distance to similarity (ChromaDB uses cosine distance)
860
+ # For cosine distance, use a more permissive conversion that handles distances > 1.0
861
+ # Convert to a 0-1 similarity score where lower distances = higher similarity
862
+ similarity = max(0.0, 1.0 / (1.0 + distance))
863
+
864
+ if similarity >= similarity_threshold:
865
+ result = SearchResult(
866
+ content=doc,
867
+ file_path=Path(metadata["file_path"]),
868
+ start_line=metadata["start_line"],
869
+ end_line=metadata["end_line"],
870
+ language=metadata["language"],
871
+ similarity_score=similarity,
872
+ rank=i + 1,
873
+ chunk_type=metadata.get("chunk_type", "code"),
874
+ function_name=metadata.get("function_name") or None,
875
+ class_name=metadata.get("class_name") or None,
876
+ )
877
+ search_results.append(result)
878
+
879
+ logger.debug(f"Found {len(search_results)} results for query: {query}")
880
+ return search_results
881
+
882
+ except Exception as e:
883
+ logger.error(f"Search failed: {e}")
884
+ raise SearchError(f"Search failed: {e}") from e
885
+
886
+ async def delete_by_file(self, file_path: Path) -> int:
887
+ """Delete all chunks for a specific file using pooled connection."""
888
+ try:
889
+ async with self._pool.get_connection() as conn:
890
+ # Get all chunks for this file
891
+ results = conn.collection.get(
892
+ where={"file_path": str(file_path)}, include=["metadatas"]
893
+ )
894
+
895
+ if not results["ids"]:
896
+ return 0
897
+
898
+ # Delete the chunks
899
+ conn.collection.delete(ids=results["ids"])
900
+
901
+ deleted_count = len(results["ids"])
902
+ logger.debug(f"Deleted {deleted_count} chunks for file: {file_path}")
903
+ return deleted_count
904
+
905
+ except Exception as e:
906
+ logger.error(f"Failed to delete chunks for file {file_path}: {e}")
907
+ raise DatabaseError(f"Failed to delete chunks: {e}") from e
908
+
909
+ async def get_stats(self) -> IndexStats:
910
+ """Get database statistics with connection pooling and chunked queries."""
911
+ try:
912
+ async with self._pool.get_connection() as conn:
913
+ # Get total count (fast operation)
914
+ count = conn.collection.count()
915
+
916
+ if count == 0:
917
+ return IndexStats(
918
+ total_files=0,
919
+ total_chunks=0,
920
+ languages={},
921
+ file_types={},
922
+ index_size_mb=0.0,
923
+ last_updated="N/A",
924
+ embedding_model="unknown",
925
+ )
926
+
927
+ # Process in chunks to avoid loading everything at once
928
+ batch_size_limit = 1000
929
+
930
+ files = set()
931
+ language_counts: dict[str, int] = {}
932
+ file_type_counts: dict[str, int] = {}
933
+
934
+ offset = 0
935
+ while offset < count:
936
+ # Fetch batch
937
+ batch_size = min(batch_size_limit, count - offset)
938
+ logger.debug(
939
+ f"Processing database stats: batch {offset // batch_size_limit + 1}, "
940
+ f"{offset}-{offset + batch_size} of {count} chunks"
941
+ )
942
+
943
+ results = conn.collection.get(
944
+ include=["metadatas"],
945
+ limit=batch_size,
946
+ offset=offset,
947
+ )
948
+
949
+ # Process batch metadata
950
+ for metadata in results.get("metadatas", []):
951
+ # Language stats
952
+ lang = metadata.get("language", "unknown")
953
+ language_counts[lang] = language_counts.get(lang, 0) + 1
954
+
955
+ # File stats
956
+ file_path = metadata.get("file_path", "")
957
+ if file_path:
958
+ files.add(file_path)
959
+ ext = Path(file_path).suffix or "no_extension"
960
+ file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
961
+
962
+ offset += batch_size
963
+
964
+ # Yield to event loop periodically to prevent blocking
965
+ await asyncio.sleep(0)
966
+
967
+ # Estimate index size (rough approximation: ~1KB per chunk)
968
+ index_size_mb = count * 0.001
969
+
970
+ return IndexStats(
971
+ total_files=len(files),
972
+ total_chunks=count,
973
+ languages=language_counts,
974
+ file_types=file_type_counts,
975
+ index_size_mb=index_size_mb,
976
+ last_updated="unknown",
977
+ embedding_model="unknown",
978
+ )
979
+
980
+ except Exception as e:
981
+ logger.error(f"Failed to get database statistics: {e}")
982
+ # Return empty stats instead of raising
983
+ return IndexStats(
984
+ total_files=0,
985
+ total_chunks=0,
986
+ languages={},
987
+ file_types={},
988
+ index_size_mb=0.0,
989
+ last_updated="error",
990
+ embedding_model="unknown",
991
+ )
992
+
993
+ async def remove_file_chunks(self, file_path: str) -> int:
994
+ """Remove all chunks for a specific file using pooled connection."""
995
+ try:
996
+ async with self._pool.get_connection() as conn:
997
+ # Get all chunks for this file
998
+ results = conn.collection.get(where={"file_path": file_path})
999
+
1000
+ if not results["ids"]:
1001
+ return 0
1002
+
1003
+ # Delete the chunks
1004
+ conn.collection.delete(ids=results["ids"])
1005
+
1006
+ return len(results["ids"])
1007
+
1008
+ except Exception as e:
1009
+ logger.error(f"Failed to remove chunks for file {file_path}: {e}")
1010
+ return 0
1011
+
1012
+ async def reset(self) -> None:
1013
+ """Reset the database using pooled connection."""
1014
+ try:
1015
+ async with self._pool.get_connection() as conn:
1016
+ conn.client.reset()
1017
+ # Reinitialize the pool after reset
1018
+ await self._pool.close()
1019
+ await self._pool.initialize()
1020
+ logger.info("Database reset successfully")
1021
+ except Exception as e:
1022
+ logger.error(f"Failed to reset database: {e}")
1023
+ raise DatabaseError(f"Failed to reset database: {e}") from e
1024
+
1025
+ async def get_all_chunks(self) -> list[CodeChunk]:
1026
+ """Get all chunks from the database using pooled connection.
1027
+
1028
+ Returns:
1029
+ List of all code chunks with metadata
1030
+ """
1031
+ try:
1032
+ async with self._pool.get_connection() as conn:
1033
+ # Get all documents from collection
1034
+ results = conn.collection.get(include=["metadatas", "documents"])
1035
+
1036
+ chunks = []
1037
+ if results and results.get("ids"):
1038
+ for i, _chunk_id in enumerate(results["ids"]):
1039
+ metadata = results["metadatas"][i]
1040
+ content = results["documents"][i]
1041
+
1042
+ # Parse JSON strings back to lists/dicts
1043
+ child_chunk_ids = metadata.get("child_chunk_ids", "[]")
1044
+ if isinstance(child_chunk_ids, str):
1045
+ child_chunk_ids = json.loads(child_chunk_ids)
1046
+
1047
+ decorators = metadata.get("decorators", "[]")
1048
+ if isinstance(decorators, str):
1049
+ decorators = json.loads(decorators)
1050
+
1051
+ parameters = metadata.get("parameters", "[]")
1052
+ if isinstance(parameters, str):
1053
+ parameters = json.loads(parameters)
1054
+
1055
+ type_annotations = metadata.get("type_annotations", "{}")
1056
+ if isinstance(type_annotations, str):
1057
+ type_annotations = json.loads(type_annotations)
1058
+
1059
+ chunk = CodeChunk(
1060
+ content=content,
1061
+ file_path=Path(metadata["file_path"]),
1062
+ start_line=metadata["start_line"],
1063
+ end_line=metadata["end_line"],
1064
+ language=metadata["language"],
1065
+ chunk_type=metadata.get("chunk_type", "code"),
1066
+ function_name=metadata.get("function_name"),
1067
+ class_name=metadata.get("class_name"),
1068
+ docstring=metadata.get("docstring"),
1069
+ imports=metadata.get("imports", []),
1070
+ complexity_score=metadata.get("complexity_score", 0.0),
1071
+ chunk_id=metadata.get("chunk_id"),
1072
+ parent_chunk_id=metadata.get("parent_chunk_id"),
1073
+ child_chunk_ids=child_chunk_ids,
1074
+ chunk_depth=metadata.get("chunk_depth", 0),
1075
+ decorators=decorators,
1076
+ parameters=parameters,
1077
+ return_type=metadata.get("return_type"),
1078
+ type_annotations=type_annotations,
1079
+ subproject_name=metadata.get("subproject_name"),
1080
+ subproject_path=metadata.get("subproject_path"),
1081
+ )
1082
+ chunks.append(chunk)
1083
+
1084
+ logger.debug(f"Retrieved {len(chunks)} chunks from database")
1085
+ return chunks
1086
+
1087
+ except Exception as e:
1088
+ logger.error(f"Failed to get all chunks: {e}")
1089
+ raise DatabaseError(f"Failed to get all chunks: {e}") from e
1090
+
1091
+ def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any] | None:
1092
+ """Build ChromaDB where clause from filters."""
1093
+ if not filters:
1094
+ return None
1095
+
1096
+ conditions = []
1097
+
1098
+ for key, value in filters.items():
1099
+ if key == "language" and value:
1100
+ conditions.append({"language": {"$eq": value}})
1101
+ elif key == "file_path" and value:
1102
+ if isinstance(value, list):
1103
+ conditions.append({"file_path": {"$in": [str(p) for p in value]}})
1104
+ else:
1105
+ conditions.append({"file_path": {"$eq": str(value)}})
1106
+ elif key == "chunk_type" and value:
1107
+ conditions.append({"chunk_type": {"$eq": value}})
1108
+
1109
+ if not conditions:
1110
+ return None
1111
+ elif len(conditions) > 1:
1112
+ return {"$and": conditions}
1113
+ else:
1114
+ return conditions[0]
1115
+
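The pooled variant builds its where clause differently, combining the recognized filter keys with `$and`. An illustrative translation (values are hypothetical):

filters = {"language": "python", "chunk_type": "function"}
# becomes:
# {"$and": [{"language": {"$eq": "python"}}, {"chunk_type": {"$eq": "function"}}]}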
1116
+ def get_pool_stats(self) -> dict[str, Any]:
1117
+ """Get connection pool statistics."""
1118
+ return self._pool.get_stats()
1119
+
1120
+ async def health_check(self) -> bool:
1121
+ """Perform a health check on the database and connection pool."""
1122
+ try:
1123
+ # Check pool health
1124
+ pool_healthy = await self._pool.health_check()
1125
+ if not pool_healthy:
1126
+ return False
1127
+
1128
+ # Try a simple query to verify database integrity
1129
+ try:
1130
+ async with self._pool.get_connection() as conn:
1131
+ # Test basic operations
1132
+ conn.collection.count()
1133
+ conn.collection.query(
1134
+ query_texts=["test"], n_results=1, include=["metadatas"]
1135
+ )
1136
+ return True
1137
+ except Exception as e:
1138
+ error_msg = str(e).lower()
1139
+ if any(
1140
+ indicator in error_msg
1141
+ for indicator in [
1142
+ "pickle",
1143
+ "unpickling",
1144
+ "eof",
1145
+ "ran out of input",
1146
+ "hnsw",
1147
+ "index",
1148
+ "deserialize",
1149
+ "corrupt",
1150
+ ]
1151
+ ):
1152
+ logger.error(f"Index corruption detected: {e}")
1153
+ # Attempt recovery
1154
+ await self._recover_from_corruption()
1155
+ return False
1156
+ else:
1157
+ logger.warning(f"Health check failed: {e}")
1158
+ return False
1159
+ except Exception as e:
1160
+ logger.error(f"Health check error: {e}")
1161
+ return False
1162
+
1163
+ async def _recover_from_corruption(self) -> None:
1164
+ """Recover from index corruption by rebuilding the index."""
1165
+ logger.info("Attempting to recover from index corruption...")
1166
+
1167
+ # Close the pool first
1168
+ await self._pool.close()
1169
+
1170
+ # Create backup directory
1171
+ backup_dir = (
1172
+ self.persist_directory.parent / f"{self.persist_directory.name}_backup"
1173
+ )
1174
+ backup_dir.mkdir(exist_ok=True)
1175
+
1176
+ # Backup current state
1177
+ import time
1178
+
1179
+ timestamp = int(time.time())
1180
+ backup_path = backup_dir / f"backup_{timestamp}"
1181
+
1182
+ if self.persist_directory.exists():
1183
+ try:
1184
+ shutil.copytree(self.persist_directory, backup_path)
1185
+ logger.info(f"Created backup at {backup_path}")
1186
+ except Exception as e:
1187
+ logger.warning(f"Could not create backup: {e}")
1188
+
1189
+ # Clear the corrupted index
1190
+ if self.persist_directory.exists():
1191
+ try:
1192
+ shutil.rmtree(self.persist_directory)
1193
+ logger.info(f"Cleared corrupted index at {self.persist_directory}")
1194
+ except Exception as e:
1195
+ logger.error(f"Failed to clear corrupted index: {e}")
1196
+ raise IndexCorruptionError(
1197
+ f"Could not clear corrupted index: {e}"
1198
+ ) from e
1199
+
1200
+ # Recreate the directory
1201
+ self.persist_directory.mkdir(parents=True, exist_ok=True)
1202
+
1203
+ # Reinitialize the pool
1204
+ await self._pool.initialize()
1205
+ logger.info("Index recovered. Please re-index your codebase.")
1206
+
1207
+ async def __aenter__(self):
1208
+ """Async context manager entry."""
1209
+ await self.initialize()
1210
+ return self
1211
+
1212
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
1213
+ """Async context manager exit."""
1214
+ await self.close()