mcp-vector-search 0.15.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mcp-vector-search might be problematic.

Files changed (86)
  1. mcp_vector_search/__init__.py +10 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/auto_index.py +397 -0
  5. mcp_vector_search/cli/commands/chat.py +534 -0
  6. mcp_vector_search/cli/commands/config.py +393 -0
  7. mcp_vector_search/cli/commands/demo.py +358 -0
  8. mcp_vector_search/cli/commands/index.py +762 -0
  9. mcp_vector_search/cli/commands/init.py +658 -0
  10. mcp_vector_search/cli/commands/install.py +869 -0
  11. mcp_vector_search/cli/commands/install_old.py +700 -0
  12. mcp_vector_search/cli/commands/mcp.py +1254 -0
  13. mcp_vector_search/cli/commands/reset.py +393 -0
  14. mcp_vector_search/cli/commands/search.py +796 -0
  15. mcp_vector_search/cli/commands/setup.py +1133 -0
  16. mcp_vector_search/cli/commands/status.py +584 -0
  17. mcp_vector_search/cli/commands/uninstall.py +404 -0
  18. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  19. mcp_vector_search/cli/commands/visualize/cli.py +265 -0
  20. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  21. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  22. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
  23. mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
  24. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  25. mcp_vector_search/cli/commands/visualize/server.py +201 -0
  26. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  27. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  28. mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
  29. mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
  30. mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
  31. mcp_vector_search/cli/commands/visualize.py.original +2536 -0
  32. mcp_vector_search/cli/commands/watch.py +287 -0
  33. mcp_vector_search/cli/didyoumean.py +520 -0
  34. mcp_vector_search/cli/export.py +320 -0
  35. mcp_vector_search/cli/history.py +295 -0
  36. mcp_vector_search/cli/interactive.py +342 -0
  37. mcp_vector_search/cli/main.py +484 -0
  38. mcp_vector_search/cli/output.py +414 -0
  39. mcp_vector_search/cli/suggestions.py +375 -0
  40. mcp_vector_search/config/__init__.py +1 -0
  41. mcp_vector_search/config/constants.py +24 -0
  42. mcp_vector_search/config/defaults.py +200 -0
  43. mcp_vector_search/config/settings.py +146 -0
  44. mcp_vector_search/core/__init__.py +1 -0
  45. mcp_vector_search/core/auto_indexer.py +298 -0
  46. mcp_vector_search/core/config_utils.py +394 -0
  47. mcp_vector_search/core/connection_pool.py +360 -0
  48. mcp_vector_search/core/database.py +1237 -0
  49. mcp_vector_search/core/directory_index.py +318 -0
  50. mcp_vector_search/core/embeddings.py +294 -0
  51. mcp_vector_search/core/exceptions.py +89 -0
  52. mcp_vector_search/core/factory.py +318 -0
  53. mcp_vector_search/core/git_hooks.py +345 -0
  54. mcp_vector_search/core/indexer.py +1002 -0
  55. mcp_vector_search/core/llm_client.py +453 -0
  56. mcp_vector_search/core/models.py +294 -0
  57. mcp_vector_search/core/project.py +350 -0
  58. mcp_vector_search/core/scheduler.py +330 -0
  59. mcp_vector_search/core/search.py +952 -0
  60. mcp_vector_search/core/watcher.py +322 -0
  61. mcp_vector_search/mcp/__init__.py +5 -0
  62. mcp_vector_search/mcp/__main__.py +25 -0
  63. mcp_vector_search/mcp/server.py +752 -0
  64. mcp_vector_search/parsers/__init__.py +8 -0
  65. mcp_vector_search/parsers/base.py +296 -0
  66. mcp_vector_search/parsers/dart.py +605 -0
  67. mcp_vector_search/parsers/html.py +413 -0
  68. mcp_vector_search/parsers/javascript.py +643 -0
  69. mcp_vector_search/parsers/php.py +694 -0
  70. mcp_vector_search/parsers/python.py +502 -0
  71. mcp_vector_search/parsers/registry.py +223 -0
  72. mcp_vector_search/parsers/ruby.py +678 -0
  73. mcp_vector_search/parsers/text.py +186 -0
  74. mcp_vector_search/parsers/utils.py +265 -0
  75. mcp_vector_search/py.typed +1 -0
  76. mcp_vector_search/utils/__init__.py +42 -0
  77. mcp_vector_search/utils/gitignore.py +250 -0
  78. mcp_vector_search/utils/gitignore_updater.py +212 -0
  79. mcp_vector_search/utils/monorepo.py +339 -0
  80. mcp_vector_search/utils/timing.py +338 -0
  81. mcp_vector_search/utils/version.py +47 -0
  82. mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
  83. mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
  84. mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
  85. mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
  86. mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/database.py
@@ -0,0 +1,1237 @@
+"""Database abstraction and ChromaDB implementation for MCP Vector Search."""
+
+import asyncio
+import json
+import shutil
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Protocol, runtime_checkable
+
+from loguru import logger
+
+from .connection_pool import ChromaConnectionPool
+from .exceptions import (
+    DatabaseError,
+    DatabaseInitializationError,
+    DatabaseNotInitializedError,
+    DocumentAdditionError,
+    IndexCorruptionError,
+    SearchError,
+)
+from .models import CodeChunk, IndexStats, SearchResult
+
+
+@runtime_checkable
+class EmbeddingFunction(Protocol):
+    """Protocol for embedding functions."""
+
+    def __call__(self, texts: list[str]) -> list[list[float]]:
+        """Generate embeddings for input texts."""
+        ...
+
+
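Any callable that maps a list of strings to a list of float vectors satisfies this protocol. A minimal conforming sketch, assuming the sentence-transformers package and its all-MiniLM-L6-v2 model (neither is named in this file):

from sentence_transformers import SentenceTransformer

class LocalEmbedder:
    """Illustrative EmbeddingFunction-compatible callable (not part of the package)."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        self._model = SentenceTransformer(model_name)

    def __call__(self, texts: list[str]) -> list[list[float]]:
        # encode() returns a numpy array; convert to plain Python lists
        return self._model.encode(texts).tolist()

Because the protocol is runtime_checkable and LocalEmbedder defines __call__ with the matching shape, isinstance(LocalEmbedder(), EmbeddingFunction) holds.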
33
+class VectorDatabase(ABC):
+    """Abstract interface for vector database operations."""
+
+    @abstractmethod
+    async def initialize(self) -> None:
+        """Initialize the database connection and collections."""
+        ...
+
+    @abstractmethod
+    async def close(self) -> None:
+        """Close database connections and cleanup resources."""
+        ...
+
+    @abstractmethod
+    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
+        """Add code chunks to the database.
+
+        Args:
+            chunks: List of code chunks to add
+        """
+        ...
+
+    @abstractmethod
+    async def search(
+        self,
+        query: str,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+        similarity_threshold: float = 0.7,
+    ) -> list[SearchResult]:
+        """Search for similar code chunks.
+
+        Args:
+            query: Search query
+            limit: Maximum number of results
+            filters: Optional filters to apply
+            similarity_threshold: Minimum similarity score
+
+        Returns:
+            List of search results
+        """
+        ...
+
+    @abstractmethod
+    async def delete_by_file(self, file_path: Path) -> int:
+        """Delete all chunks for a specific file.
+
+        Args:
+            file_path: Path to the file
+
+        Returns:
+            Number of deleted chunks
+        """
+        ...
+
+    @abstractmethod
+    async def get_stats(self) -> IndexStats:
+        """Get database statistics.
+
+        Returns:
+            Index statistics
+        """
+        ...
+
+    @abstractmethod
+    async def reset(self) -> None:
+        """Reset the database (delete all data)."""
+        ...
+
+    @abstractmethod
+    async def get_all_chunks(self) -> list[CodeChunk]:
+        """Get all chunks from the database.
+
+        Returns:
+            List of all code chunks with metadata
+        """
+        ...
+
+    @abstractmethod
+    async def health_check(self) -> bool:
+        """Check database health and integrity.
+
+        Returns:
+            True if database is healthy, False otherwise
+        """
+        ...
+
+    async def __aenter__(self) -> "VectorDatabase":
+        """Async context manager entry."""
+        await self.initialize()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Async context manager exit."""
+        await self.close()
+
+
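Because __aenter__ and __aexit__ are inherited from this base class, any concrete implementation can be driven with async with. A hypothetical usage sketch; the directory, query, and the LocalEmbedder helper sketched above are illustrative, not taken from this file:

import asyncio
from pathlib import Path

async def demo() -> None:
    db = ChromaVectorDatabase(
        persist_directory=Path(".mcp-index"),   # illustrative location
        embedding_function=LocalEmbedder(),     # any EmbeddingFunction-compatible callable
    )
    async with db:  # initialize() on entry, close() on exit
        hits = await db.search("parse configuration files", limit=5)
        for hit in hits:
            print(hit.file_path, round(hit.similarity_score, 3))

asyncio.run(demo())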
130
+class ChromaVectorDatabase(VectorDatabase):
+    """ChromaDB implementation of vector database."""
+
+    def __init__(
+        self,
+        persist_directory: Path,
+        embedding_function: EmbeddingFunction,
+        collection_name: str = "code_search",
+    ) -> None:
+        """Initialize ChromaDB vector database.
+
+        Args:
+            persist_directory: Directory to persist database
+            embedding_function: Function to generate embeddings
+            collection_name: Name of the collection
+        """
+        self.persist_directory = persist_directory
+        self.embedding_function = embedding_function
+        self.collection_name = collection_name
+        self._client = None
+        self._collection = None
+
+    async def initialize(self) -> None:
+        """Initialize ChromaDB client and collection with corruption recovery."""
+        try:
+            import chromadb
+
+            # Ensure directory exists
+            self.persist_directory.mkdir(parents=True, exist_ok=True)
+
+            # Check for corruption before initializing
+            await self._detect_and_recover_corruption()
+
+            # Create client with new API
+            self._client = chromadb.PersistentClient(
+                path=str(self.persist_directory),
+                settings=chromadb.Settings(
+                    anonymized_telemetry=False,
+                    allow_reset=True,
+                ),
+            )
+
+            # Create or get collection
+            self._collection = self._client.get_or_create_collection(
+                name=self.collection_name,
+                embedding_function=self.embedding_function,
+                metadata={
+                    "description": "Semantic code search collection",
+                },
+            )
+
+            logger.debug(f"ChromaDB initialized at {self.persist_directory}")
+
+        except Exception as e:
+            # Check if this is a corruption error
+            error_msg = str(e).lower()
+            if any(
+                indicator in error_msg
+                for indicator in [
+                    "pickle",
+                    "unpickling",
+                    "eof",
+                    "ran out of input",
+                    "hnsw",
+                    "index",
+                    "deserialize",
+                    "corrupt",
+                ]
+            ):
+                logger.warning(f"Detected index corruption: {e}")
+                # Try to recover
+                await self._recover_from_corruption()
+                # Retry initialization
+                await self.initialize()
+            else:
+                logger.error(f"Failed to initialize ChromaDB: {e}")
+                raise DatabaseInitializationError(
+                    f"ChromaDB initialization failed: {e}"
+                ) from e
+
210
+    async def remove_file_chunks(self, file_path: str) -> int:
+        """Remove all chunks for a specific file.
+
+        Args:
+            file_path: Relative path to the file
+
+        Returns:
+            Number of chunks removed
+        """
+        if not self._collection:
+            raise DatabaseNotInitializedError("Database not initialized")
+
+        try:
+            # Get all chunks for this file
+            results = self._collection.get(where={"file_path": file_path})
+
+            if not results["ids"]:
+                return 0
+
+            # Delete the chunks
+            self._collection.delete(ids=results["ids"])
+
+            removed_count = len(results["ids"])
+            logger.debug(f"Removed {removed_count} chunks for file: {file_path}")
+            return removed_count
+
+        except Exception as e:
+            logger.error(f"Failed to remove chunks for file {file_path}: {e}")
+            return 0
+
+    async def close(self) -> None:
+        """Close database connections."""
+        if self._client:
+            # ChromaDB doesn't require explicit closing
+            self._client = None
+            self._collection = None
+            logger.debug("ChromaDB connections closed")
+
+    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
+        """Add code chunks to the database."""
+        if not self._collection:
+            raise DatabaseNotInitializedError("Database not initialized")
+
+        if not chunks:
+            return
+
+        try:
+            documents = []
+            metadatas = []
+            ids = []
+
+            for chunk in chunks:
+                # Debug: Check first chunk content
+                if len(documents) == 0:
+                    import sys
+
+                    has_meta = "Language:" in chunk.content and "File:" in chunk.content
+                    print("\n[DATABASE] First chunk content check:", file=sys.stderr)
+                    print(f" Type: {chunk.chunk_type}", file=sys.stderr)
+                    print(f" File: {chunk.file_path.name}", file=sys.stderr)
+                    print(
+                        f" Has metadata IN chunk.content: {has_meta}", file=sys.stderr
+                    )
+                    print(
+                        f" Last 100 chars: {repr(chunk.content[-100:])}",
+                        file=sys.stderr,
+                    )
+
+                # Store original content directly in documents (no metadata appended)
+                # The embedding will be created from the original content
+                documents.append(chunk.content)
+
+                # Create metadata (searchable fields as metadata, not appended to content)
+                metadata = {
+                    "file_path": str(chunk.file_path),
+                    "start_line": chunk.start_line,
+                    "end_line": chunk.end_line,
+                    "language": chunk.language,
+                    "chunk_type": chunk.chunk_type,
+                    "function_name": chunk.function_name or "",
+                    "class_name": chunk.class_name or "",
+                    "docstring": chunk.docstring or "",
+                    "complexity_score": chunk.complexity_score,
+                    # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                    "chunk_id": chunk.chunk_id or "",
+                    "parent_chunk_id": chunk.parent_chunk_id or "",
+                    "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                    "chunk_depth": chunk.chunk_depth,
+                    # Additional metadata (convert lists/dicts to JSON strings)
+                    "decorators": json.dumps(chunk.decorators or []),
+                    "parameters": json.dumps(chunk.parameters or []),
+                    "return_type": chunk.return_type or "",
+                    "type_annotations": json.dumps(chunk.type_annotations or {}),
+                    # Monorepo support
+                    "subproject_name": chunk.subproject_name or "",
+                    "subproject_path": chunk.subproject_path or "",
+                }
+                metadatas.append(metadata)
+
+                # Use chunk ID
+                ids.append(chunk.id)
+
+            # Add to collection
+            self._collection.add(
+                documents=documents,
+                metadatas=metadatas,
+                ids=ids,
+            )
+
+            logger.debug(f"Added {len(chunks)} chunks to database")
+
+        except Exception as e:
+            logger.error(f"Failed to add chunks: {e}")
+            raise DocumentAdditionError(f"Failed to add chunks: {e}") from e
+
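ChromaDB metadata values must be scalars (strings, numbers, booleans), which is why the list- and dict-valued fields above go through json.dumps on write and json.loads in get_all_chunks below. The round trip in isolation, as a quick sketch:

import json

child_ids = ["chunk-2", "chunk-3"]
stored = json.dumps(child_ids)           # '["chunk-2", "chunk-3"]' -- a plain string
assert json.loads(stored) == child_ids   # recovered unchanged on read
assert json.dumps(None or []) == "[]"    # the `or []` above guards None-valued fields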
325
+    async def search(
+        self,
+        query: str,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+        similarity_threshold: float = 0.7,
+    ) -> list[SearchResult]:
+        """Search for similar code chunks."""
+        if not self._collection:
+            raise DatabaseNotInitializedError("Database not initialized")
+
+        try:
+            # Build where clause
+            where_clause = self._build_where_clause(filters) if filters else None
+
+            # Perform search
+            results = self._collection.query(
+                query_texts=[query],
+                n_results=limit,
+                where=where_clause,
+                include=["documents", "metadatas", "distances"],
+            )
+
+            # Process results
+            search_results = []
+
+            if results["documents"] and results["documents"][0]:
+                for i, (doc, metadata, distance) in enumerate(
+                    zip(
+                        results["documents"][0],
+                        results["metadatas"][0],
+                        results["distances"][0],
+                        strict=False,
+                    )
+                ):
+                    # Convert distance to similarity (ChromaDB uses cosine distance)
+                    # For cosine distance, use a more permissive conversion that handles distances > 1.0
+                    # Convert to a 0-1 similarity score where lower distances = higher similarity
+                    similarity = max(0.0, 1.0 / (1.0 + distance))
+
+                    if similarity >= similarity_threshold:
+                        # Document contains the original content (no metadata appended)
+                        result = SearchResult(
+                            content=doc,
+                            file_path=Path(metadata["file_path"]),
+                            start_line=metadata["start_line"],
+                            end_line=metadata["end_line"],
+                            language=metadata["language"],
+                            similarity_score=similarity,
+                            rank=i + 1,
+                            chunk_type=metadata.get("chunk_type", "code"),
+                            function_name=metadata.get("function_name") or None,
+                            class_name=metadata.get("class_name") or None,
+                        )
+                        search_results.append(result)
+
+            logger.debug(f"Found {len(search_results)} results for query: {query}")
+            return search_results
+
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            raise SearchError(f"Search failed: {e}") from e
+
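The 1.0 / (1.0 + distance) conversion keeps similarity in (0, 1] even for cosine distances above 1.0. Worked values, using the same formula as search():

def to_similarity(distance: float) -> float:
    # same mapping as search() above
    return max(0.0, 1.0 / (1.0 + distance))

assert to_similarity(0.0) == 1.0    # identical vectors
assert to_similarity(1.0) == 0.5
assert to_similarity(3.0) == 0.25   # still defined for distances > 1.0
# With the default similarity_threshold of 0.7, a hit needs
# 1 / (1 + d) >= 0.7, i.e. distance d <= 3/7 (about 0.43).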
388
+    async def delete_by_file(self, file_path: Path) -> int:
+        """Delete all chunks for a specific file."""
+        if not self._collection:
+            raise DatabaseNotInitializedError("Database not initialized")
+
+        try:
+            # Get all chunks for this file
+            results = self._collection.get(
+                where={"file_path": str(file_path)},
+                include=["metadatas"],
+            )
+
+            if results["ids"]:
+                self._collection.delete(ids=results["ids"])
+                count = len(results["ids"])
+                logger.debug(f"Deleted {count} chunks for {file_path}")
+                return count
+
+            return 0
+
+        except Exception as e:
+            logger.error(f"Failed to delete chunks for {file_path}: {e}")
+            raise DatabaseError(f"Failed to delete chunks: {e}") from e
+
+    async def get_stats(self) -> IndexStats:
+        """Get database statistics with optimized chunked queries."""
+        if not self._collection:
+            raise DatabaseNotInitializedError("Database not initialized")
+
+        try:
+            # Get total count (fast operation)
+            count = self._collection.count()
+
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )
+
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000
+
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}
+
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )
+
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )
+
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001
+
+            return IndexStats(
+                total_files=len(files),
+                total_chunks=count,
+                languages=language_counts,
+                file_types=file_type_counts,
+                index_size_mb=index_size_mb,
+                last_updated="unknown",
+                embedding_model="unknown",
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )
+
+    async def reset(self) -> None:
+        """Reset the database."""
+        if self._client:
+            try:
+                self._client.reset()
+                # Recreate collection
+                await self.initialize()
+                logger.info("Database reset successfully")
+            except Exception as e:
+                logger.error(f"Failed to reset database: {e}")
+                raise DatabaseError(f"Failed to reset database: {e}") from e
+
+    async def get_all_chunks(self) -> list[CodeChunk]:
+        """Get all chunks from the database.
+
+        Returns:
+            List of all code chunks with metadata
+        """
+        if not self._collection:
+            raise DatabaseNotInitializedError("Database not initialized")
+
+        try:
+            # Get all documents from collection
+            results = self._collection.get(include=["metadatas", "documents"])
+
+            chunks = []
+            if results and results.get("ids"):
+                for i, _chunk_id in enumerate(results["ids"]):
+                    metadata = results["metadatas"][i]
+                    # Document now contains the original content (no metadata appended)
+                    content = results["documents"][i]
+
+                    # Parse JSON strings back to lists/dicts
+                    child_chunk_ids = metadata.get("child_chunk_ids", "[]")
+                    if isinstance(child_chunk_ids, str):
+                        child_chunk_ids = json.loads(child_chunk_ids)
+
+                    decorators = metadata.get("decorators", "[]")
+                    if isinstance(decorators, str):
+                        decorators = json.loads(decorators)
+
+                    parameters = metadata.get("parameters", "[]")
+                    if isinstance(parameters, str):
+                        parameters = json.loads(parameters)
+
+                    type_annotations = metadata.get("type_annotations", "{}")
+                    if isinstance(type_annotations, str):
+                        type_annotations = json.loads(type_annotations)
+
+                    chunk = CodeChunk(
+                        content=content,
+                        file_path=Path(metadata["file_path"]),
+                        start_line=metadata["start_line"],
+                        end_line=metadata["end_line"],
+                        language=metadata["language"],
+                        chunk_type=metadata.get("chunk_type", "code"),
+                        function_name=metadata.get("function_name"),
+                        class_name=metadata.get("class_name"),
+                        docstring=metadata.get("docstring"),
+                        imports=metadata.get("imports", []),
+                        complexity_score=metadata.get("complexity_score", 0.0),
+                        chunk_id=metadata.get("chunk_id"),
+                        parent_chunk_id=metadata.get("parent_chunk_id"),
+                        child_chunk_ids=child_chunk_ids,
+                        chunk_depth=metadata.get("chunk_depth", 0),
+                        decorators=decorators,
+                        parameters=parameters,
+                        return_type=metadata.get("return_type"),
+                        type_annotations=type_annotations,
+                        subproject_name=metadata.get("subproject_name"),
+                        subproject_path=metadata.get("subproject_path"),
+                    )
+                    chunks.append(chunk)
+
+            logger.debug(f"Retrieved {len(chunks)} chunks from database")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Failed to get all chunks: {e}")
+            raise DatabaseError(f"Failed to get all chunks: {e}") from e
+
+    def _create_searchable_text(self, chunk: CodeChunk) -> str:
+        """Create optimized searchable text from code chunk."""
+        import sys
+
+        print("WARNING: _create_searchable_text IS BEING CALLED!", file=sys.stderr)
+        parts = [chunk.content]
+
+        # Add contextual information
+        if chunk.function_name:
+            parts.append(f"Function: {chunk.function_name}")
+
+        if chunk.class_name:
+            parts.append(f"Class: {chunk.class_name}")
+
+        if chunk.docstring:
+            parts.append(f"Documentation: {chunk.docstring}")
+
+        # Add language and file context
+        parts.append(f"Language: {chunk.language}")
+        parts.append(f"File: {chunk.file_path.name}")
+
+        return "\n".join(parts)
+
+    def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
+        """Build ChromaDB where clause from filters."""
+        where = {}
+
+        for key, value in filters.items():
+            if isinstance(value, list):
+                where[key] = {"$in": value}
+            elif isinstance(value, str) and value.startswith("!"):
+                where[key] = {"$ne": value[1:]}
+            else:
+                where[key] = value
+
+        return where
+
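Given those three rules, a filters dict maps onto ChromaDB operators as follows. An illustrative sketch (the method never touches instance state, so the constructor arguments here are placeholders):

from pathlib import Path

db = ChromaVectorDatabase(Path("/tmp/idx"), embedding_function=None)  # state unused here
where = db._build_where_clause(
    {
        "language": ["python", "ruby"],  # list value        -> $in
        "chunk_type": "!comment",        # "!"-prefixed str  -> $ne (prefix stripped)
        "file_path": "src/app.py",       # plain value       -> exact match
    }
)
assert where == {
    "language": {"$in": ["python", "ruby"]},
    "chunk_type": {"$ne": "comment"},
    "file_path": "src/app.py",
}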
616
+    async def _detect_and_recover_corruption(self) -> None:
+        """Detect and recover from index corruption proactively."""
+        # Check for common corruption indicators in ChromaDB files
+        chroma_db_path = self.persist_directory / "chroma.sqlite3"
+
+        # If database doesn't exist yet, nothing to check
+        if not chroma_db_path.exists():
+            return
+
+        # Check for HNSW index files that might be corrupted
+        index_path = self.persist_directory / "index"
+
+        if index_path.exists():
+            # Look for pickle files in the index
+            pickle_files = list(index_path.glob("**/*.pkl"))
+            pickle_files.extend(list(index_path.glob("**/*.pickle")))
+
+            for pickle_file in pickle_files:
+                try:
+                    # Try to read the pickle file to detect corruption
+                    import pickle  # nosec B403 # Trusted internal index files only
+
+                    with open(pickle_file, "rb") as f:
+                        pickle.load(f)  # nosec B301 # Trusted internal index files only
+                except Exception as e:  # covers EOFError and pickle.UnpicklingError
+                    logger.warning(
+                        f"Corrupted index file detected: {pickle_file} - {e}"
+                    )
+                    await self._recover_from_corruption()
+                    return
+
+    async def _recover_from_corruption(self) -> None:
+        """Recover from index corruption by rebuilding the index."""
+        logger.info("Attempting to recover from index corruption...")
+
+        # Create backup directory
+        backup_dir = (
+            self.persist_directory.parent / f"{self.persist_directory.name}_backup"
+        )
+        backup_dir.mkdir(exist_ok=True)
+
+        # Backup current state (in case we need it)
+        import time
+
+        timestamp = int(time.time())
+        backup_path = backup_dir / f"backup_{timestamp}"
+
+        if self.persist_directory.exists():
+            try:
+                shutil.copytree(self.persist_directory, backup_path)
+                logger.info(f"Created backup at {backup_path}")
+            except Exception as e:
+                logger.warning(f"Could not create backup: {e}")
+
+        # Clear the corrupted index
+        if self.persist_directory.exists():
+            try:
+                shutil.rmtree(self.persist_directory)
+                logger.info(f"Cleared corrupted index at {self.persist_directory}")
+            except Exception as e:
+                logger.error(f"Failed to clear corrupted index: {e}")
+                raise IndexCorruptionError(
+                    f"Could not clear corrupted index: {e}"
+                ) from e
+
+        # Recreate the directory
+        self.persist_directory.mkdir(parents=True, exist_ok=True)
+        logger.info("Index directory recreated. Please re-index your codebase.")
+
+    async def health_check(self) -> bool:
+        """Check database health and integrity.
+
+        Returns:
+            True if database is healthy, False otherwise
+        """
+        try:
+            # First check if client is initialized
+            if not self._client or not self._collection:
+                logger.warning("Database not initialized")
+                return False
+
+            # Try a simple operation to test the connection
+            try:
+                # Attempt to get count - this will fail if index is corrupted
+                count = self._collection.count()
+                logger.debug(f"Health check passed: {count} chunks in database")
+
+                # Try a minimal query to ensure search works
+                self._collection.query(
+                    query_texts=["test"], n_results=1, include=["metadatas"]
+                )
+
+                return True
+
+            except Exception as e:
+                error_msg = str(e).lower()
+                if any(
+                    indicator in error_msg
+                    for indicator in [
+                        "pickle",
+                        "unpickling",
+                        "eof",
+                        "ran out of input",
+                        "hnsw",
+                        "index",
+                        "deserialize",
+                        "corrupt",
+                    ]
+                ):
+                    logger.error(f"Index corruption detected during health check: {e}")
+                    return False
+                else:
+                    # Some other error
+                    logger.warning(f"Health check failed: {e}")
+                    return False
+
+        except Exception as e:
+            logger.error(f"Health check error: {e}")
+            return False
+
+
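The corruption checks in initialize() and health_check() are substring matches over the lowered exception text, so a broad token such as "index" also matches unrelated errors. The heuristic in isolation, as a condensed sketch (the names here are illustrative, not from this file):

CORRUPTION_INDICATORS = (
    "pickle", "unpickling", "eof", "ran out of input",
    "hnsw", "index", "deserialize", "corrupt",
)

def looks_corrupted(exc: Exception) -> bool:
    # condensed form of the inline any(...) checks above
    msg = str(exc).lower()
    return any(token in msg for token in CORRUPTION_INDICATORS)

assert looks_corrupted(EOFError("Ran out of input"))
assert looks_corrupted(ValueError("list index out of range"))  # false positive via "index"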
738
+class PooledChromaVectorDatabase(VectorDatabase):
+    """ChromaDB implementation with connection pooling for improved performance."""
+
+    def __init__(
+        self,
+        persist_directory: Path,
+        embedding_function: EmbeddingFunction,
+        collection_name: str = "code_search",
+        max_connections: int = 10,
+        min_connections: int = 2,
+        max_idle_time: float = 300.0,
+        max_connection_age: float = 3600.0,
+    ) -> None:
+        """Initialize pooled ChromaDB vector database.
+
+        Args:
+            persist_directory: Directory to persist database
+            embedding_function: Function to generate embeddings
+            collection_name: Name of the collection
+            max_connections: Maximum number of connections in pool
+            min_connections: Minimum number of connections to maintain
+            max_idle_time: Maximum time a connection can be idle (seconds)
+            max_connection_age: Maximum age of a connection (seconds)
+        """
+        self.persist_directory = persist_directory
+        self.embedding_function = embedding_function
+        self.collection_name = collection_name
+
+        self._pool = ChromaConnectionPool(
+            persist_directory=persist_directory,
+            embedding_function=embedding_function,
+            collection_name=collection_name,
+            max_connections=max_connections,
+            min_connections=min_connections,
+            max_idle_time=max_idle_time,
+            max_connection_age=max_connection_age,
+        )
+
+    async def initialize(self) -> None:
+        """Initialize the connection pool."""
+        await self._pool.initialize()
+        logger.debug(f"Pooled ChromaDB initialized at {self.persist_directory}")
+
+    async def close(self) -> None:
+        """Close the connection pool."""
+        await self._pool.close()
+        logger.debug("Pooled ChromaDB connections closed")
+
+    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
+        """Add code chunks to the database using pooled connection."""
+        if not chunks:
+            return
+
+        # Ensure pool is initialized
+        if not self._pool._initialized:
+            await self._pool.initialize()
+
+        try:
+            async with self._pool.get_connection() as conn:
+                # Prepare data for ChromaDB
+                documents = []
+                metadatas = []
+                ids = []
+
+                for chunk in chunks:
+                    # Store original content in documents (no metadata appended)
+                    documents.append(chunk.content)
+                    metadatas.append(
+                        {
+                            "file_path": str(chunk.file_path),
+                            "start_line": chunk.start_line,
+                            "end_line": chunk.end_line,
+                            "language": chunk.language,
+                            "chunk_type": chunk.chunk_type,
+                            "function_name": chunk.function_name or "",
+                            "class_name": chunk.class_name or "",
+                            "docstring": chunk.docstring or "",
+                            "complexity_score": chunk.complexity_score,
+                            # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                            "chunk_id": chunk.chunk_id or "",
+                            "parent_chunk_id": chunk.parent_chunk_id or "",
+                            "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                            "chunk_depth": chunk.chunk_depth,
+                            # Additional metadata (convert lists/dicts to JSON strings)
+                            "decorators": json.dumps(chunk.decorators or []),
+                            "parameters": json.dumps(chunk.parameters or []),
+                            "return_type": chunk.return_type or "",
+                            "type_annotations": json.dumps(
+                                chunk.type_annotations or {}
+                            ),
+                            # Monorepo support
+                            "subproject_name": chunk.subproject_name or "",
+                            "subproject_path": chunk.subproject_path or "",
+                        }
+                    )
+                    ids.append(chunk.id)
+
+                # Add to collection
+                conn.collection.add(documents=documents, metadatas=metadatas, ids=ids)
+
+            logger.debug(f"Added {len(chunks)} chunks to database")
+
+        except Exception as e:
+            logger.error(f"Failed to add chunks: {e}")
+            raise DocumentAdditionError(f"Failed to add chunks: {e}") from e
+
+    async def search(
+        self,
+        query: str,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+        similarity_threshold: float = 0.7,
+    ) -> list[SearchResult]:
+        """Search for similar code chunks using pooled connection."""
+        # Ensure pool is initialized
+        if not self._pool._initialized:
+            await self._pool.initialize()
+
+        try:
+            async with self._pool.get_connection() as conn:
+                # Build where clause
+                where_clause = self._build_where_clause(filters) if filters else None
+
+                # Perform search
+                results = conn.collection.query(
+                    query_texts=[query],
+                    n_results=limit,
+                    where=where_clause,
+                    include=["documents", "metadatas", "distances"],
+                )
+
+                # Process results
+                search_results = []
+
+                if results["documents"] and results["documents"][0]:
+                    for i, (doc, metadata, distance) in enumerate(
+                        zip(
+                            results["documents"][0],
+                            results["metadatas"][0],
+                            results["distances"][0],
+                            strict=False,
+                        )
+                    ):
+                        # Convert distance to similarity (ChromaDB uses cosine distance)
+                        # For cosine distance, use a more permissive conversion that handles distances > 1.0
+                        # Convert to a 0-1 similarity score where lower distances = higher similarity
+                        similarity = max(0.0, 1.0 / (1.0 + distance))
+
+                        if similarity >= similarity_threshold:
+                            # Document contains the original content (no metadata appended)
+                            result = SearchResult(
+                                content=doc,
+                                file_path=Path(metadata["file_path"]),
+                                start_line=metadata["start_line"],
+                                end_line=metadata["end_line"],
+                                language=metadata["language"],
+                                similarity_score=similarity,
+                                rank=i + 1,
+                                chunk_type=metadata.get("chunk_type", "code"),
+                                function_name=metadata.get("function_name") or None,
+                                class_name=metadata.get("class_name") or None,
+                            )
+                            search_results.append(result)
+
+                logger.debug(f"Found {len(search_results)} results for query: {query}")
+                return search_results
+
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            raise SearchError(f"Search failed: {e}") from e
+
+    async def delete_by_file(self, file_path: Path) -> int:
+        """Delete all chunks for a specific file using pooled connection."""
+        try:
+            async with self._pool.get_connection() as conn:
+                # Get all chunks for this file
+                results = conn.collection.get(
+                    where={"file_path": str(file_path)}, include=["metadatas"]
+                )
+
+                if not results["ids"]:
+                    return 0
+
+                # Delete the chunks
+                conn.collection.delete(ids=results["ids"])
+
+                deleted_count = len(results["ids"])
+                logger.debug(f"Deleted {deleted_count} chunks for file: {file_path}")
+                return deleted_count
+
+        except Exception as e:
+            logger.error(f"Failed to delete chunks for file {file_path}: {e}")
+            raise DatabaseError(f"Failed to delete chunks: {e}") from e
+
+    async def get_stats(self) -> IndexStats:
+        """Get database statistics with connection pooling and chunked queries."""
+        try:
+            async with self._pool.get_connection() as conn:
+                # Get total count (fast operation)
+                count = conn.collection.count()
+
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000
+
+                files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )
+
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )
+
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)
+
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001
+
+                return IndexStats(
+                    total_files=len(files),
+                    total_chunks=count,
+                    languages=language_counts,
+                    file_types=file_type_counts,
+                    index_size_mb=index_size_mb,
+                    last_updated="unknown",
+                    embedding_model="unknown",
+                )
+
+        except Exception as e:
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )
+
+    async def remove_file_chunks(self, file_path: str) -> int:
+        """Remove all chunks for a specific file using pooled connection."""
+        try:
+            async with self._pool.get_connection() as conn:
+                # Get all chunks for this file
+                results = conn.collection.get(where={"file_path": file_path})
+
+                if not results["ids"]:
+                    return 0
+
+                # Delete the chunks
+                conn.collection.delete(ids=results["ids"])
+
+                return len(results["ids"])
+
+        except Exception as e:
+            logger.error(f"Failed to remove chunks for file {file_path}: {e}")
+            return 0
+
+    async def reset(self) -> None:
+        """Reset the database using pooled connection."""
+        try:
+            async with self._pool.get_connection() as conn:
+                conn.client.reset()
+            # Reinitialize the pool after reset
+            await self._pool.close()
+            await self._pool.initialize()
+            logger.info("Database reset successfully")
+        except Exception as e:
+            logger.error(f"Failed to reset database: {e}")
+            raise DatabaseError(f"Failed to reset database: {e}") from e
+
+    async def get_all_chunks(self) -> list[CodeChunk]:
+        """Get all chunks from the database using pooled connection.
+
+        Returns:
+            List of all code chunks with metadata
+        """
+        try:
+            async with self._pool.get_connection() as conn:
+                # Get all documents from collection
+                results = conn.collection.get(include=["metadatas", "documents"])
+
+                chunks = []
+                if results and results.get("ids"):
+                    for i, _chunk_id in enumerate(results["ids"]):
+                        metadata = results["metadatas"][i]
+                        content = results["documents"][i]
+
+                        # Parse JSON strings back to lists/dicts
+                        child_chunk_ids = metadata.get("child_chunk_ids", "[]")
+                        if isinstance(child_chunk_ids, str):
+                            child_chunk_ids = json.loads(child_chunk_ids)
+
+                        decorators = metadata.get("decorators", "[]")
+                        if isinstance(decorators, str):
+                            decorators = json.loads(decorators)
+
+                        parameters = metadata.get("parameters", "[]")
+                        if isinstance(parameters, str):
+                            parameters = json.loads(parameters)
+
+                        type_annotations = metadata.get("type_annotations", "{}")
+                        if isinstance(type_annotations, str):
+                            type_annotations = json.loads(type_annotations)
+
+                        chunk = CodeChunk(
+                            content=content,
+                            file_path=Path(metadata["file_path"]),
+                            start_line=metadata["start_line"],
+                            end_line=metadata["end_line"],
+                            language=metadata["language"],
+                            chunk_type=metadata.get("chunk_type", "code"),
+                            function_name=metadata.get("function_name"),
+                            class_name=metadata.get("class_name"),
+                            docstring=metadata.get("docstring"),
+                            imports=metadata.get("imports", []),
+                            complexity_score=metadata.get("complexity_score", 0.0),
+                            chunk_id=metadata.get("chunk_id"),
+                            parent_chunk_id=metadata.get("parent_chunk_id"),
+                            child_chunk_ids=child_chunk_ids,
+                            chunk_depth=metadata.get("chunk_depth", 0),
+                            decorators=decorators,
+                            parameters=parameters,
+                            return_type=metadata.get("return_type"),
+                            type_annotations=type_annotations,
+                            subproject_name=metadata.get("subproject_name"),
+                            subproject_path=metadata.get("subproject_path"),
+                        )
+                        chunks.append(chunk)
+
+                logger.debug(f"Retrieved {len(chunks)} chunks from database")
+                return chunks
+
+        except Exception as e:
+            logger.error(f"Failed to get all chunks: {e}")
+            raise DatabaseError(f"Failed to get all chunks: {e}") from e
+
+    def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any] | None:
+        """Build ChromaDB where clause from filters."""
+        if not filters:
+            return None
+
+        conditions = []
+
+        for key, value in filters.items():
+            if key == "language" and value:
+                conditions.append({"language": {"$eq": value}})
+            elif key == "file_path" and value:
+                if isinstance(value, list):
+                    conditions.append({"file_path": {"$in": [str(p) for p in value]}})
+                else:
+                    conditions.append({"file_path": {"$eq": str(value)}})
+            elif key == "chunk_type" and value:
+                conditions.append({"chunk_type": {"$eq": value}})
+
+        if not conditions:
+            return None
+        elif len(conditions) > 1:
+            return {"$and": conditions}
+        else:
+            return conditions[0]
+
+    def get_pool_stats(self) -> dict[str, Any]:
+        """Get connection pool statistics."""
+        return self._pool.get_stats()
+
+    async def health_check(self) -> bool:
+        """Perform a health check on the database and connection pool."""
+        try:
+            # Check pool health
+            pool_healthy = await self._pool.health_check()
+            if not pool_healthy:
+                return False
+
+            # Try a simple query to verify database integrity
+            try:
+                async with self._pool.get_connection() as conn:
+                    # Test basic operations
+                    conn.collection.count()
+                    conn.collection.query(
+                        query_texts=["test"], n_results=1, include=["metadatas"]
+                    )
+                    return True
+            except Exception as e:
+                error_msg = str(e).lower()
+                if any(
+                    indicator in error_msg
+                    for indicator in [
+                        "pickle",
+                        "unpickling",
+                        "eof",
+                        "ran out of input",
+                        "hnsw",
+                        "index",
+                        "deserialize",
+                        "corrupt",
+                    ]
+                ):
+                    logger.error(f"Index corruption detected: {e}")
+                    # Attempt recovery
+                    await self._recover_from_corruption()
+                    return False
+                else:
+                    logger.warning(f"Health check failed: {e}")
+                    return False
+        except Exception as e:
+            logger.error(f"Health check error: {e}")
+            return False
+
+    async def _recover_from_corruption(self) -> None:
+        """Recover from index corruption by rebuilding the index."""
+        logger.info("Attempting to recover from index corruption...")
+
+        # Close the pool first
+        await self._pool.close()
+
+        # Create backup directory
+        backup_dir = (
+            self.persist_directory.parent / f"{self.persist_directory.name}_backup"
+        )
+        backup_dir.mkdir(exist_ok=True)
+
+        # Backup current state
+        import time
+
+        timestamp = int(time.time())
+        backup_path = backup_dir / f"backup_{timestamp}"
+
+        if self.persist_directory.exists():
+            try:
+                shutil.copytree(self.persist_directory, backup_path)
+                logger.info(f"Created backup at {backup_path}")
+            except Exception as e:
+                logger.warning(f"Could not create backup: {e}")
+
+        # Clear the corrupted index
+        if self.persist_directory.exists():
+            try:
+                shutil.rmtree(self.persist_directory)
+                logger.info(f"Cleared corrupted index at {self.persist_directory}")
+            except Exception as e:
+                logger.error(f"Failed to clear corrupted index: {e}")
+                raise IndexCorruptionError(
+                    f"Could not clear corrupted index: {e}"
+                ) from e
+
+        # Recreate the directory
+        self.persist_directory.mkdir(parents=True, exist_ok=True)
+
+        # Reinitialize the pool
+        await self._pool.initialize()
+        logger.info("Index recovered. Please re-index your codebase.")
+
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self.initialize()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.close()
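
A usage sketch for the pooled variant, mirroring the earlier example; the directory, pool sizes, and the LocalEmbedder helper are illustrative assumptions, not values from this file:

import asyncio
from pathlib import Path

async def main() -> None:
    db = PooledChromaVectorDatabase(
        persist_directory=Path(".mcp-index"),  # illustrative location
        embedding_function=LocalEmbedder(),    # from the earlier sketch
        max_connections=4,
        min_connections=1,
    )
    async with db:  # initialize() builds the pool; close() drains it
        healthy = await db.health_check()
        stats = await db.get_stats()
        print(healthy, stats.total_chunks, db.get_pool_stats())

asyncio.run(main())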