mcp-vector-search 0.12.6__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. mcp_vector_search/__init__.py +2 -2
  2. mcp_vector_search/analysis/__init__.py +64 -0
  3. mcp_vector_search/analysis/collectors/__init__.py +39 -0
  4. mcp_vector_search/analysis/collectors/base.py +164 -0
  5. mcp_vector_search/analysis/collectors/complexity.py +743 -0
  6. mcp_vector_search/analysis/metrics.py +341 -0
  7. mcp_vector_search/analysis/reporters/__init__.py +5 -0
  8. mcp_vector_search/analysis/reporters/console.py +222 -0
  9. mcp_vector_search/cli/commands/analyze.py +408 -0
  10. mcp_vector_search/cli/commands/chat.py +1262 -0
  11. mcp_vector_search/cli/commands/index.py +21 -3
  12. mcp_vector_search/cli/commands/init.py +13 -0
  13. mcp_vector_search/cli/commands/install.py +597 -335
  14. mcp_vector_search/cli/commands/install_old.py +8 -4
  15. mcp_vector_search/cli/commands/mcp.py +78 -6
  16. mcp_vector_search/cli/commands/reset.py +68 -26
  17. mcp_vector_search/cli/commands/search.py +30 -7
  18. mcp_vector_search/cli/commands/setup.py +1133 -0
  19. mcp_vector_search/cli/commands/status.py +37 -2
  20. mcp_vector_search/cli/commands/uninstall.py +276 -357
  21. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  22. mcp_vector_search/cli/commands/visualize/cli.py +276 -0
  23. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  24. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  25. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
  26. mcp_vector_search/cli/commands/visualize/graph_builder.py +714 -0
  27. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  28. mcp_vector_search/cli/commands/visualize/server.py +311 -0
  29. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  30. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  31. mcp_vector_search/cli/commands/visualize/templates/base.py +180 -0
  32. mcp_vector_search/cli/commands/visualize/templates/scripts.py +2507 -0
  33. mcp_vector_search/cli/commands/visualize/templates/styles.py +1313 -0
  34. mcp_vector_search/cli/commands/visualize.py.original +2536 -0
  35. mcp_vector_search/cli/didyoumean.py +22 -2
  36. mcp_vector_search/cli/main.py +115 -159
  37. mcp_vector_search/cli/output.py +24 -8
  38. mcp_vector_search/config/__init__.py +4 -0
  39. mcp_vector_search/config/default_thresholds.yaml +52 -0
  40. mcp_vector_search/config/settings.py +12 -0
  41. mcp_vector_search/config/thresholds.py +185 -0
  42. mcp_vector_search/core/auto_indexer.py +3 -3
  43. mcp_vector_search/core/boilerplate.py +186 -0
  44. mcp_vector_search/core/config_utils.py +394 -0
  45. mcp_vector_search/core/database.py +369 -94
  46. mcp_vector_search/core/exceptions.py +11 -0
  47. mcp_vector_search/core/git_hooks.py +4 -4
  48. mcp_vector_search/core/indexer.py +221 -4
  49. mcp_vector_search/core/llm_client.py +751 -0
  50. mcp_vector_search/core/models.py +3 -0
  51. mcp_vector_search/core/project.py +17 -0
  52. mcp_vector_search/core/scheduler.py +11 -11
  53. mcp_vector_search/core/search.py +179 -29
  54. mcp_vector_search/mcp/server.py +24 -5
  55. mcp_vector_search/utils/__init__.py +2 -0
  56. mcp_vector_search/utils/gitignore_updater.py +212 -0
  57. mcp_vector_search/utils/monorepo.py +66 -4
  58. mcp_vector_search/utils/timing.py +10 -6
  59. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/METADATA +182 -52
  60. mcp_vector_search-1.0.3.dist-info/RECORD +97 -0
  61. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/WHEEL +1 -1
  62. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/entry_points.txt +1 -0
  63. mcp_vector_search/cli/commands/visualize.py +0 -1467
  64. mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
  65. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py
@@ -44,11 +44,14 @@ class VectorDatabase(ABC):
         ...
 
     @abstractmethod
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
-        """Add code chunks to the database.
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
 
         Args:
             chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
         """
         ...
 
@@ -148,6 +151,7 @@ class ChromaVectorDatabase(VectorDatabase):
         self.collection_name = collection_name
         self._client = None
         self._collection = None
+        self._recovery_attempted = False  # Guard against infinite recursion
 
     async def initialize(self) -> None:
         """Initialize ChromaDB client and collection with corruption recovery."""
@@ -157,49 +161,144 @@ class ChromaVectorDatabase(VectorDatabase):
            # Ensure directory exists
            self.persist_directory.mkdir(parents=True, exist_ok=True)
 
-           # Check for corruption before initializing
+           # LAYER 1: Check for corruption before initializing (SQLite + HNSW checks)
            await self._detect_and_recover_corruption()
 
-           # Create client with new API
-           self._client = chromadb.PersistentClient(
-               path=str(self.persist_directory),
-               settings=chromadb.Settings(
-                   anonymized_telemetry=False,
-                   allow_reset=True,
-               ),
-           )
+           # LAYER 2: Wrap ChromaDB initialization with Rust panic detection
+           try:
+               # Create client with new API
+               self._client = chromadb.PersistentClient(
+                   path=str(self.persist_directory),
+                   settings=chromadb.Settings(
+                       anonymized_telemetry=False,
+                       allow_reset=True,
+                   ),
+               )
 
-           # Create or get collection
-           self._collection = self._client.get_or_create_collection(
-               name=self.collection_name,
-               embedding_function=self.embedding_function,
-               metadata={
-                   "description": "Semantic code search collection",
-               },
-           )
+               # Create or get collection
+               self._collection = self._client.get_or_create_collection(
+                   name=self.collection_name,
+                   embedding_function=self.embedding_function,
+                   metadata={
+                       "description": "Semantic code search collection",
+                   },
+               )
+
+               # Reset recovery flag on successful initialization
+               self._recovery_attempted = False
+
+               logger.debug(f"ChromaDB initialized at {self.persist_directory}")
+
+           except BaseException as init_error:
+               # Re-raise system exceptions we should never catch
+               if isinstance(
+                   init_error, (KeyboardInterrupt, SystemExit, GeneratorExit)
+               ):
+                   raise
+
+               # LAYER 2: Detect Rust panic patterns during initialization
+               error_msg = str(init_error).lower()
+
+               # Rust panic patterns (common ChromaDB Rust panics)
+               rust_panic_patterns = [
+                   "range start index",
+                   "out of range",
+                   "panic",
+                   "thread panicked",
+                   "slice of length",
+                   "index out of bounds",
+               ]
+
+               if any(pattern in error_msg for pattern in rust_panic_patterns):
+                   logger.warning(
+                       f"Rust panic detected during ChromaDB initialization: {init_error}"
+                   )
+                   logger.info(
+                       "Attempting automatic recovery from database corruption..."
+                   )
+                   await self._recover_from_corruption()
+
+                   # Retry initialization ONCE after recovery
+                   try:
+                       logger.info(
+                           "Retrying ChromaDB initialization after recovery..."
+                       )
+                       self._client = chromadb.PersistentClient(
+                           path=str(self.persist_directory),
+                           settings=chromadb.Settings(
+                               anonymized_telemetry=False,
+                               allow_reset=True,
+                           ),
+                       )
+
+                       self._collection = self._client.get_or_create_collection(
+                           name=self.collection_name,
+                           embedding_function=self.embedding_function,
+                           metadata={
+                               "description": "Semantic code search collection",
+                           },
+                       )
+
+                       logger.info("ChromaDB successfully initialized after recovery")
 
-           logger.debug(f"ChromaDB initialized at {self.persist_directory}")
+                   except BaseException as retry_error:
+                       # Re-raise system exceptions
+                       if isinstance(
+                           retry_error, (KeyboardInterrupt, SystemExit, GeneratorExit)
+                       ):
+                           raise
 
+                       logger.error(
+                           f"Failed to recover from database corruption: {retry_error}"
+                       )
+                       # Mark recovery as attempted to prevent infinite loops
+                       self._recovery_attempted = True
+                       raise DatabaseError(
+                           f"Failed to recover from database corruption. "
+                           f"Please run 'mcp-vector-search reset index' to clear the database. "
+                           f"Error: {retry_error}"
+                       ) from retry_error
+               else:
+                   # Not a Rust panic, re-raise original exception
+                   raise
+
+       except (DatabaseError, DatabaseInitializationError):
+           # Re-raise our own errors without re-processing
+           raise
        except Exception as e:
-           # Check if this is a corruption error
+           # Check if this is a corruption error (legacy detection for backward compatibility)
            error_msg = str(e).lower()
-           if any(
-               indicator in error_msg
-               for indicator in [
-                   "pickle",
-                   "unpickling",
-                   "eof",
-                   "ran out of input",
-                   "hnsw",
-                   "index",
-                   "deserialize",
-                   "corrupt",
-               ]
-           ):
+           corruption_indicators = [
+               "pickle",
+               "unpickling",
+               "eof",
+               "ran out of input",
+               "hnsw",
+               "index",
+               "deserialize",
+               "corrupt",
+               "file is not a database",  # SQLite corruption
+               "database error",  # ChromaDB database errors
+           ]
+
+           if any(indicator in error_msg for indicator in corruption_indicators):
+               # Prevent infinite recursion - only attempt recovery once
+               if self._recovery_attempted:
+                   logger.error(
+                       f"Recovery already attempted but corruption persists: {e}"
+                   )
+                   raise DatabaseInitializationError(
+                       f"Failed to recover from database corruption. "
+                       f"Please run 'mcp-vector-search reset index' to clear and rebuild the database. Error: {e}"
+                   ) from e
+
                logger.warning(f"Detected index corruption: {e}")
+               self._recovery_attempted = True
+
                # Try to recover
                await self._recover_from_corruption()
-               # Retry initialization
+
+               # Retry initialization ONE TIME
               await self.initialize()
           else:
               logger.error(f"Failed to initialize ChromaDB: {e}")
@@ -245,8 +344,16 @@ class ChromaVectorDatabase(VectorDatabase):
         self._collection = None
         logger.debug("ChromaDB connections closed")
 
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
-        """Add code chunks to the database."""
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+                Example: {"chunk_id_1": {"cognitive_complexity": 5, ...}, ...}
+        """
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
 
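Per the docstring, `metrics` is keyed by chunk ID and each value is a flat metadata dict. A hedged usage sketch: `db` and `chunk` are assumed to be an initialized database and a parsed CodeChunk from the package, and field names beyond `cognitive_complexity` are assumptions drawn from the `_build_where_clause` docstring later in this diff:

```python
# Sketch only: `db` and `chunk` come from the package and are assumed here.
metrics = {
    chunk.chunk_id: {
        "cognitive_complexity": 5,  # example value from the docstring above
        "complexity_grade": "B",    # field name taken from _build_where_clause docs
    }
}

await db.add_chunks([chunk], metrics=metrics)  # new 1.0.3 signature
await db.add_chunks([chunk])                   # metrics defaults to None, so old call sites still work
```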
@@ -259,11 +366,27 @@ class ChromaVectorDatabase(VectorDatabase):
 
         ids = []
         for chunk in chunks:
-            # Create searchable text
-            searchable_text = self._create_searchable_text(chunk)
-            documents.append(searchable_text)
+            # Debug: Check first chunk content
+            if len(documents) == 0:
+                import sys
+
+                has_meta = "Language:" in chunk.content and "File:" in chunk.content
+                print("\n[DATABASE] First chunk content check:", file=sys.stderr)
+                print(f"  Type: {chunk.chunk_type}", file=sys.stderr)
+                print(f"  File: {chunk.file_path.name}", file=sys.stderr)
+                print(
+                    f"  Has metadata IN chunk.content: {has_meta}", file=sys.stderr
+                )
+                print(
+                    f"  Last 100 chars: {repr(chunk.content[-100:])}",
+                    file=sys.stderr,
+                )
 
-            # Create metadata
+            # Store original content directly in documents (no metadata appended)
+            # The embedding will be created from the original content
+            documents.append(chunk.content)
+
+            # Create metadata (searchable fields as metadata, not appended to content)
             metadata = {
                 "file_path": str(chunk.file_path),
                 "start_line": chunk.start_line,
@@ -288,6 +411,12 @@ class ChromaVectorDatabase(VectorDatabase):
                 "subproject_name": chunk.subproject_name or "",
                 "subproject_path": chunk.subproject_path or "",
             }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
             metadatas.append(metadata)
 
             # Use chunk ID
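Surrounding context in this method (visible in full in the pooled variant later in this diff) serializes list and dict fields such as `child_chunk_ids` with `json.dumps`, because ChromaDB metadata values must be scalars; the read path reverses this. A minimal round-trip sketch:

```python
import json

child_chunk_ids = ["chunk_a", "chunk_b"]

# Write path: lists become JSON strings so ChromaDB will accept them as metadata
stored = {"child_chunk_ids": json.dumps(child_chunk_ids)}

# Read path: parse the JSON string back into a list
restored = json.loads(stored["child_chunk_ids"])
assert restored == child_chunk_ids
```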
@@ -347,6 +476,7 @@ class ChromaVectorDatabase(VectorDatabase):
             similarity = max(0.0, 1.0 / (1.0 + distance))
 
             if similarity >= similarity_threshold:
+                # Document contains the original content (no metadata appended)
                 result = SearchResult(
                     content=doc,
                     file_path=Path(metadata["file_path"]),
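The distance-to-similarity mapping in this hunk is worth a worked check: it sends distance 0 to similarity 1.0 and decays toward 0 as distance grows, so a threshold of 0.5 admits anything within distance 1.0:

```python
def to_similarity(distance: float) -> float:
    # Same conversion as in the search path above: monotone decreasing, in [0, 1]
    return max(0.0, 1.0 / (1.0 + distance))


assert to_similarity(0.0) == 1.0  # identical vectors
assert to_similarity(1.0) == 0.5  # threshold 0.5 <=> distance <= 1.0
assert to_similarity(9.0) == 0.1  # distant vectors approach 0
```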
@@ -507,6 +637,7 @@ class ChromaVectorDatabase(VectorDatabase):
         if results and results.get("ids"):
             for i, _chunk_id in enumerate(results["ids"]):
                 metadata = results["metadatas"][i]
+                # Document now contains the original content (no metadata appended)
                 content = results["documents"][i]
 
                 # Parse JSON strings back to lists/dicts
@@ -560,6 +691,9 @@ class ChromaVectorDatabase(VectorDatabase):
 
     def _create_searchable_text(self, chunk: CodeChunk) -> str:
         """Create optimized searchable text from code chunk."""
+        import sys
+
+        print("WARNING: _create_searchable_text IS BEING CALLED!", file=sys.stderr)
         parts = [chunk.content]
 
         # Add contextual information
@@ -579,7 +713,24 @@ class ChromaVectorDatabase(VectorDatabase):
         return "\n".join(parts)
 
     def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
-        """Build ChromaDB where clause from filters."""
+        """Build ChromaDB where clause from filters.
+
+        Supports filtering by:
+        - language, file_path, chunk_type (standard fields)
+        - complexity_grade (A, B, C, D, F)
+        - smell_count (0, >0)
+        - cognitive_complexity (range queries using $and)
+
+        Args:
+            filters: Dictionary of filter criteria
+
+        Returns:
+            ChromaDB where clause
+        """
+        # If filters already contain ChromaDB operators ($and, $or), pass through
+        if "$and" in filters or "$or" in filters:
+            return filters
+
         where = {}
 
         for key, value in filters.items():
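Combined with the operator handling added in the next hunk, the translation rules are: lists become `$in`, a leading `!` becomes `$ne`, dicts pass through as operator queries, and anything else is an equality match. Illustrative inputs and the where clauses those rules should produce:

```python
# Each pair shows a filters dict and the expected where clause.
examples = [
    ({"language": ["python", "rust"]},
     {"language": {"$in": ["python", "rust"]}}),  # list -> $in
    ({"chunk_type": "!comment"},
     {"chunk_type": {"$ne": "comment"}}),         # "!" prefix -> $ne
    ({"cognitive_complexity": {"$gte": 10}},
     {"cognitive_complexity": {"$gte": 10}}),     # operator dict passes through
    ({"complexity_grade": "A"},
     {"complexity_grade": "A"}),                  # plain value -> equality
]

# Filters already built with ChromaDB combinators short-circuit the translation:
range_query = {
    "$and": [
        {"cognitive_complexity": {"$gte": 5}},
        {"cognitive_complexity": {"$lte": 15}},
    ]
}  # returned as-is
```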
@@ -587,46 +738,140 @@ class ChromaVectorDatabase(VectorDatabase):
                 where[key] = {"$in": value}
             elif isinstance(value, str) and value.startswith("!"):
                 where[key] = {"$ne": value[1:]}
+            elif isinstance(value, dict):
+                # Support operator queries like {"$gte": 10}
+                where[key] = value
             else:
                 where[key] = value
 
         return where
 
     async def _detect_and_recover_corruption(self) -> None:
-        """Detect and recover from index corruption proactively."""
-        # Check for common corruption indicators in ChromaDB files
+        """Detect and recover from index corruption proactively.
+
+        This method checks for:
+        1. SQLite database corruption (LAYER 1: Pre-initialization check)
+        2. HNSW pickle file corruption
+        3. Metadata/data inconsistencies
+        4. File size anomalies
+        """
+        # LAYER 1: Check SQLite database integrity FIRST (before ChromaDB initialization)
        chroma_db_path = self.persist_directory / "chroma.sqlite3"
 
        # If database doesn't exist yet, nothing to check
        if not chroma_db_path.exists():
            return
 
+        # SQLite integrity check - catches corruption BEFORE Rust panic
+        try:
+            import sqlite3
+
+            logger.debug("Running SQLite integrity check...")
+            conn = sqlite3.connect(str(chroma_db_path))
+            cursor = conn.execute("PRAGMA quick_check")
+            result = cursor.fetchone()[0]
+            conn.close()
+
+            if result != "ok":
+                logger.warning(f"SQLite database corruption detected: {result}")
+                logger.info("Initiating automatic recovery from database corruption...")
+                await self._recover_from_corruption()
+                return
+
+            logger.debug("SQLite integrity check passed")
+
+        except sqlite3.Error as e:
+            logger.warning(f"SQLite database error during integrity check: {e}")
+            logger.info("Initiating automatic recovery from database corruption...")
+            await self._recover_from_corruption()
+            return
+
        # Check for HNSW index files that might be corrupted
-        self.persist_directory / "chroma-collections.parquet"
        index_path = self.persist_directory / "index"
 
        if index_path.exists():
-            # Look for pickle files in the index
+            # Look for pickle files in the index (HNSW metadata)
            pickle_files = list(index_path.glob("**/*.pkl"))
            pickle_files.extend(list(index_path.glob("**/*.pickle")))
+            pickle_files.extend(list(index_path.glob("**/*.bin")))  # Binary HNSW files
+
+            logger.debug(
+                f"Checking {len(pickle_files)} HNSW index files for corruption..."
+            )
 
            for pickle_file in pickle_files:
                try:
-                    # Try to read the pickle file to detect corruption
-                    import pickle
-
-                    with open(pickle_file, "rb") as f:
-                        pickle.load(f)
-                except (EOFError, pickle.UnpicklingError, Exception) as e:
-                    logger.warning(
-                        f"Corrupted index file detected: {pickle_file} - {e}"
-                    )
+                    # Check file size - suspiciously small files might be corrupted
+                    file_size = pickle_file.stat().st_size
+                    if file_size == 0:
+                        logger.warning(
+                            f"Empty HNSW index file detected: {pickle_file} (0 bytes)"
+                        )
+                        await self._recover_from_corruption()
+                        return
+
+                    # Only validate pickle files (not binary .bin files)
+                    if pickle_file.suffix in (".pkl", ".pickle"):
+                        # Try to read the pickle file to detect corruption
+                        import pickle  # nosec B403 # Trusted internal index files only
+
+                        with open(pickle_file, "rb") as f:
+                            data = pickle.load(f)  # nosec B301 # Trusted internal index files only
+
+                        # Additional validation: check if data structure is valid
+                        if data is None:
+                            logger.warning(
+                                f"HNSW index file contains None data: {pickle_file}"
+                            )
+                            await self._recover_from_corruption()
+                            return
+
+                        # Check for metadata consistency (if it's a dict)
+                        if isinstance(data, dict):
+                            # Look for known metadata keys that should exist
+                            if "space" in data and "dim" in data:
+                                # Validate dimensions are reasonable
+                                if data.get("dim", 0) <= 0:
+                                    logger.warning(
+                                        f"Invalid dimensions in HNSW index: {pickle_file} (dim={data.get('dim')})"
+                                    )
+                                    await self._recover_from_corruption()
+                                    return
+
+                except (EOFError, pickle.UnpicklingError) as e:
+                    logger.warning(f"Pickle corruption detected in {pickle_file}: {e}")
                    await self._recover_from_corruption()
                    return
+                except Exception as e:
+                    # Check if this is a Rust panic pattern
+                    error_msg = str(e).lower()
+                    if "range start index" in error_msg and "out of range" in error_msg:
+                        logger.warning(
+                            f"Rust panic pattern detected in {pickle_file}: {e}"
+                        )
+                        await self._recover_from_corruption()
+                        return
+                    else:
+                        logger.warning(
+                            f"Error reading HNSW index file {pickle_file}: {e}"
+                        )
+                        # Continue checking other files before deciding to recover
+                        continue
+
+            logger.debug("HNSW index files validation passed")
 
    async def _recover_from_corruption(self) -> None:
-        """Recover from index corruption by rebuilding the index."""
-        logger.info("Attempting to recover from index corruption...")
+        """Recover from index corruption by rebuilding the index.
+
+        This method:
+        1. Creates a timestamped backup of the corrupted index
+        2. Clears the corrupted index directory
+        3. Recreates the directory structure
+        4. Logs detailed recovery steps and instructions
+        """
+        logger.warning("=" * 80)
+        logger.warning("INDEX CORRUPTION DETECTED - Initiating recovery...")
+        logger.warning("=" * 80)
 
        # Create backup directory
        backup_dir = (
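The LAYER 1 check leans entirely on SQLite's built-in `PRAGMA quick_check`, which returns the single row `('ok',)` on a healthy database. A standalone sketch of the same probe, with an example file path:

```python
import sqlite3


def sqlite_is_healthy(db_path: str) -> bool:
    # PRAGMA quick_check returns "ok" on a healthy database; any other result,
    # or an sqlite3.Error raised along the way, signals corruption.
    try:
        conn = sqlite3.connect(db_path)
        try:
            result = conn.execute("PRAGMA quick_check").fetchone()[0]
        finally:
            conn.close()
        return result == "ok"
    except sqlite3.Error:
        return False


print(sqlite_is_healthy(".mcp-vector-search/chroma.sqlite3"))  # example path
```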
@@ -634,7 +879,7 @@ class ChromaVectorDatabase(VectorDatabase):
        )
        backup_dir.mkdir(exist_ok=True)
 
-        # Backup current state (in case we need it)
+        # Backup current state (in case we need it for debugging)
        import time
 
        timestamp = int(time.time())
@@ -643,24 +888,41 @@ class ChromaVectorDatabase(VectorDatabase):
        if self.persist_directory.exists():
            try:
                shutil.copytree(self.persist_directory, backup_path)
-                logger.info(f"Created backup at {backup_path}")
+                logger.info(f"Created backup at {backup_path}")
            except Exception as e:
-                logger.warning(f"Could not create backup: {e}")
+                logger.warning(f"Could not create backup: {e}")
 
        # Clear the corrupted index
        if self.persist_directory.exists():
            try:
+                # Log what we're about to delete
+                total_size = sum(
+                    f.stat().st_size
+                    for f in self.persist_directory.rglob("*")
+                    if f.is_file()
+                )
+                logger.info(
+                    f"Clearing corrupted index ({total_size / 1024 / 1024:.2f} MB)..."
+                )
+
                shutil.rmtree(self.persist_directory)
-                logger.info(f"Cleared corrupted index at {self.persist_directory}")
+                logger.info(f"Cleared corrupted index at {self.persist_directory}")
            except Exception as e:
-                logger.error(f"Failed to clear corrupted index: {e}")
+                logger.error(f"Failed to clear corrupted index: {e}")
                raise IndexCorruptionError(
-                    f"Could not clear corrupted index: {e}"
+                    f"Could not clear corrupted index: {e}. "
+                    f"Please manually delete {self.persist_directory} and try again."
                ) from e
 
        # Recreate the directory
        self.persist_directory.mkdir(parents=True, exist_ok=True)
-        logger.info("Index directory recreated. Please re-index your codebase.")
+        logger.info("Index directory recreated")
+
+        logger.warning("=" * 80)
+        logger.warning("RECOVERY COMPLETE - Next steps:")
+        logger.warning("  1. Run 'mcp-vector-search index' to rebuild the index")
+        logger.warning(f"  2. Backup saved to: {backup_path}")
+        logger.warning("=" * 80)
 
    async def health_check(self) -> bool:
        """Check database health and integrity.
@@ -762,8 +1024,15 @@ class PooledChromaVectorDatabase(VectorDatabase):
        await self._pool.close()
        logger.debug("Pooled ChromaDB connections closed")
 
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
-        """Add code chunks to the database using pooled connection."""
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database using pooled connection with optional metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+        """
        if not chunks:
            return
 
@@ -779,35 +1048,40 @@ class PooledChromaVectorDatabase(VectorDatabase):
        ids = []
 
        for chunk in chunks:
+            # Store original content in documents (no metadata appended)
            documents.append(chunk.content)
-            metadatas.append(
-                {
-                    "file_path": str(chunk.file_path),
-                    "start_line": chunk.start_line,
-                    "end_line": chunk.end_line,
-                    "language": chunk.language,
-                    "chunk_type": chunk.chunk_type,
-                    "function_name": chunk.function_name or "",
-                    "class_name": chunk.class_name or "",
-                    "docstring": chunk.docstring or "",
-                    "complexity_score": chunk.complexity_score,
-                    # Hierarchy fields (convert lists to JSON strings for ChromaDB)
-                    "chunk_id": chunk.chunk_id or "",
-                    "parent_chunk_id": chunk.parent_chunk_id or "",
-                    "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
-                    "chunk_depth": chunk.chunk_depth,
-                    # Additional metadata (convert lists/dicts to JSON strings)
-                    "decorators": json.dumps(chunk.decorators or []),
-                    "parameters": json.dumps(chunk.parameters or []),
-                    "return_type": chunk.return_type or "",
-                    "type_annotations": json.dumps(
-                        chunk.type_annotations or {}
-                    ),
-                    # Monorepo support
-                    "subproject_name": chunk.subproject_name or "",
-                    "subproject_path": chunk.subproject_path or "",
-                }
-            )
+
+            metadata = {
+                "file_path": str(chunk.file_path),
+                "start_line": chunk.start_line,
+                "end_line": chunk.end_line,
+                "language": chunk.language,
+                "chunk_type": chunk.chunk_type,
+                "function_name": chunk.function_name or "",
+                "class_name": chunk.class_name or "",
+                "docstring": chunk.docstring or "",
+                "complexity_score": chunk.complexity_score,
+                # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                "chunk_id": chunk.chunk_id or "",
+                "parent_chunk_id": chunk.parent_chunk_id or "",
+                "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                "chunk_depth": chunk.chunk_depth,
+                # Additional metadata (convert lists/dicts to JSON strings)
+                "decorators": json.dumps(chunk.decorators or []),
+                "parameters": json.dumps(chunk.parameters or []),
+                "return_type": chunk.return_type or "",
+                "type_annotations": json.dumps(chunk.type_annotations or {}),
+                # Monorepo support
+                "subproject_name": chunk.subproject_name or "",
+                "subproject_path": chunk.subproject_path or "",
+            }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
+            metadatas.append(metadata)
            ids.append(chunk.id)
 
        # Add to collection
@@ -862,6 +1136,7 @@ class PooledChromaVectorDatabase(VectorDatabase):
            similarity = max(0.0, 1.0 / (1.0 + distance))
 
            if similarity >= similarity_threshold:
+                # Document contains the original content (no metadata appended)
                result = SearchResult(
                    content=doc,
                    file_path=Path(metadata["file_path"]),
mcp_vector_search/core/exceptions.py
@@ -53,6 +53,17 @@ class IndexCorruptionError(DatabaseError):
    pass
 
 
+class RustPanicError(DatabaseError):
+    """ChromaDB Rust bindings panic detected.
+
+    This error occurs when ChromaDB's Rust bindings encounter
+    HNSW index metadata inconsistencies, typically manifesting as:
+    'range start index X out of range for slice of length Y'
+    """
+
+    pass
+
+
 class ParsingError(MCPVectorSearchError):
    """Code parsing errors."""
 
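Since `RustPanicError` subclasses `DatabaseError`, existing `except DatabaseError` handlers keep working, and callers who care can match it specifically by ordering the more specific clause first. A hedged sketch; the `initialize` call mirrors the database.py hunks above, and the surrounding setup is illustrative:

```python
# Sketch only: assumes the import path matches the package layout in this diff.
from mcp_vector_search.core.exceptions import DatabaseError, RustPanicError


async def init_with_reporting(db) -> None:
    try:
        await db.initialize()
    except RustPanicError as e:
        # Most specific first: HNSW metadata inconsistency in the Rust bindings
        print(f"Rust panic, index likely corrupted: {e}")
    except DatabaseError as e:
        # Catch-all for the rest of the hierarchy, RustPanicError included
        print(f"Database error: {e}")
```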