mcp-vector-search 0.12.6__py3-none-any.whl → 1.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. mcp_vector_search/__init__.py +3 -3
  2. mcp_vector_search/analysis/__init__.py +111 -0
  3. mcp_vector_search/analysis/baseline/__init__.py +68 -0
  4. mcp_vector_search/analysis/baseline/comparator.py +462 -0
  5. mcp_vector_search/analysis/baseline/manager.py +621 -0
  6. mcp_vector_search/analysis/collectors/__init__.py +74 -0
  7. mcp_vector_search/analysis/collectors/base.py +164 -0
  8. mcp_vector_search/analysis/collectors/cohesion.py +463 -0
  9. mcp_vector_search/analysis/collectors/complexity.py +743 -0
  10. mcp_vector_search/analysis/collectors/coupling.py +1162 -0
  11. mcp_vector_search/analysis/collectors/halstead.py +514 -0
  12. mcp_vector_search/analysis/collectors/smells.py +325 -0
  13. mcp_vector_search/analysis/debt.py +516 -0
  14. mcp_vector_search/analysis/interpretation.py +685 -0
  15. mcp_vector_search/analysis/metrics.py +414 -0
  16. mcp_vector_search/analysis/reporters/__init__.py +7 -0
  17. mcp_vector_search/analysis/reporters/console.py +646 -0
  18. mcp_vector_search/analysis/reporters/markdown.py +480 -0
  19. mcp_vector_search/analysis/reporters/sarif.py +377 -0
  20. mcp_vector_search/analysis/storage/__init__.py +93 -0
  21. mcp_vector_search/analysis/storage/metrics_store.py +762 -0
  22. mcp_vector_search/analysis/storage/schema.py +245 -0
  23. mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
  24. mcp_vector_search/analysis/trends.py +308 -0
  25. mcp_vector_search/analysis/visualizer/__init__.py +90 -0
  26. mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
  27. mcp_vector_search/analysis/visualizer/exporter.py +484 -0
  28. mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
  29. mcp_vector_search/analysis/visualizer/schemas.py +525 -0
  30. mcp_vector_search/cli/commands/analyze.py +1062 -0
  31. mcp_vector_search/cli/commands/chat.py +1455 -0
  32. mcp_vector_search/cli/commands/index.py +621 -5
  33. mcp_vector_search/cli/commands/index_background.py +467 -0
  34. mcp_vector_search/cli/commands/init.py +13 -0
  35. mcp_vector_search/cli/commands/install.py +597 -335
  36. mcp_vector_search/cli/commands/install_old.py +8 -4
  37. mcp_vector_search/cli/commands/mcp.py +78 -6
  38. mcp_vector_search/cli/commands/reset.py +68 -26
  39. mcp_vector_search/cli/commands/search.py +224 -8
  40. mcp_vector_search/cli/commands/setup.py +1184 -0
  41. mcp_vector_search/cli/commands/status.py +339 -5
  42. mcp_vector_search/cli/commands/uninstall.py +276 -357
  43. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  44. mcp_vector_search/cli/commands/visualize/cli.py +292 -0
  45. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  46. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  47. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +33 -0
  48. mcp_vector_search/cli/commands/visualize/graph_builder.py +647 -0
  49. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  50. mcp_vector_search/cli/commands/visualize/server.py +600 -0
  51. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  52. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  53. mcp_vector_search/cli/commands/visualize/templates/base.py +234 -0
  54. mcp_vector_search/cli/commands/visualize/templates/scripts.py +4542 -0
  55. mcp_vector_search/cli/commands/visualize/templates/styles.py +2522 -0
  56. mcp_vector_search/cli/didyoumean.py +27 -2
  57. mcp_vector_search/cli/main.py +127 -160
  58. mcp_vector_search/cli/output.py +158 -13
  59. mcp_vector_search/config/__init__.py +4 -0
  60. mcp_vector_search/config/default_thresholds.yaml +52 -0
  61. mcp_vector_search/config/settings.py +12 -0
  62. mcp_vector_search/config/thresholds.py +273 -0
  63. mcp_vector_search/core/__init__.py +16 -0
  64. mcp_vector_search/core/auto_indexer.py +3 -3
  65. mcp_vector_search/core/boilerplate.py +186 -0
  66. mcp_vector_search/core/config_utils.py +394 -0
  67. mcp_vector_search/core/database.py +406 -94
  68. mcp_vector_search/core/embeddings.py +24 -0
  69. mcp_vector_search/core/exceptions.py +11 -0
  70. mcp_vector_search/core/git.py +380 -0
  71. mcp_vector_search/core/git_hooks.py +4 -4
  72. mcp_vector_search/core/indexer.py +632 -54
  73. mcp_vector_search/core/llm_client.py +756 -0
  74. mcp_vector_search/core/models.py +91 -1
  75. mcp_vector_search/core/project.py +17 -0
  76. mcp_vector_search/core/relationships.py +473 -0
  77. mcp_vector_search/core/scheduler.py +11 -11
  78. mcp_vector_search/core/search.py +179 -29
  79. mcp_vector_search/mcp/server.py +819 -9
  80. mcp_vector_search/parsers/python.py +285 -5
  81. mcp_vector_search/utils/__init__.py +2 -0
  82. mcp_vector_search/utils/gitignore.py +0 -3
  83. mcp_vector_search/utils/gitignore_updater.py +212 -0
  84. mcp_vector_search/utils/monorepo.py +66 -4
  85. mcp_vector_search/utils/timing.py +10 -6
  86. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +184 -53
  87. mcp_vector_search-1.1.22.dist-info/RECORD +120 -0
  88. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +1 -1
  89. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +1 -0
  90. mcp_vector_search/cli/commands/visualize.py +0 -1467
  91. mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
  92. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py
@@ -44,11 +44,14 @@ class VectorDatabase(ABC):
         ...
 
     @abstractmethod
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
-        """Add code chunks to the database.
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
 
         Args:
             chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
         """
         ...
 
@@ -148,6 +151,7 @@ class ChromaVectorDatabase(VectorDatabase):
         self.collection_name = collection_name
         self._client = None
         self._collection = None
+        self._recovery_attempted = False  # Guard against infinite recursion
 
     async def initialize(self) -> None:
         """Initialize ChromaDB client and collection with corruption recovery."""
@@ -157,49 +161,144 @@ class ChromaVectorDatabase(VectorDatabase):
             # Ensure directory exists
             self.persist_directory.mkdir(parents=True, exist_ok=True)
 
-            # Check for corruption before initializing
+            # LAYER 1: Check for corruption before initializing (SQLite + HNSW checks)
             await self._detect_and_recover_corruption()
 
-            # Create client with new API
-            self._client = chromadb.PersistentClient(
-                path=str(self.persist_directory),
-                settings=chromadb.Settings(
-                    anonymized_telemetry=False,
-                    allow_reset=True,
-                ),
-            )
+            # LAYER 2: Wrap ChromaDB initialization with Rust panic detection
+            try:
+                # Create client with new API
+                self._client = chromadb.PersistentClient(
+                    path=str(self.persist_directory),
+                    settings=chromadb.Settings(
+                        anonymized_telemetry=False,
+                        allow_reset=True,
+                    ),
+                )
 
-            # Create or get collection
-            self._collection = self._client.get_or_create_collection(
-                name=self.collection_name,
-                embedding_function=self.embedding_function,
-                metadata={
-                    "description": "Semantic code search collection",
-                },
-            )
+                # Create or get collection
+                self._collection = self._client.get_or_create_collection(
+                    name=self.collection_name,
+                    embedding_function=self.embedding_function,
+                    metadata={
+                        "description": "Semantic code search collection",
+                    },
+                )
+
+                # Reset recovery flag on successful initialization
+                self._recovery_attempted = False
+
+                logger.debug(f"ChromaDB initialized at {self.persist_directory}")
+
+            except BaseException as init_error:
+                # Re-raise system exceptions we should never catch
+                if isinstance(
+                    init_error, KeyboardInterrupt | SystemExit | GeneratorExit
+                ):
+                    raise
+
+                # LAYER 2: Detect Rust panic patterns during initialization
+                error_msg = str(init_error).lower()
+
+                # Rust panic patterns (common ChromaDB Rust panics)
+                rust_panic_patterns = [
+                    "range start index",
+                    "out of range",
+                    "panic",
+                    "thread panicked",
+                    "slice of length",
+                    "index out of bounds",
+                ]
+
+                if any(pattern in error_msg for pattern in rust_panic_patterns):
+                    logger.warning(
+                        f"Rust panic detected during ChromaDB initialization: {init_error}"
+                    )
+                    logger.info(
+                        "Attempting automatic recovery from database corruption..."
+                    )
+                    await self._recover_from_corruption()
+
+                    # Retry initialization ONCE after recovery
+                    try:
+                        logger.info(
+                            "Retrying ChromaDB initialization after recovery..."
+                        )
+                        self._client = chromadb.PersistentClient(
+                            path=str(self.persist_directory),
+                            settings=chromadb.Settings(
+                                anonymized_telemetry=False,
+                                allow_reset=True,
+                            ),
+                        )
+
+                        self._collection = self._client.get_or_create_collection(
+                            name=self.collection_name,
+                            embedding_function=self.embedding_function,
+                            metadata={
+                                "description": "Semantic code search collection",
+                            },
+                        )
+
+                        logger.info("ChromaDB successfully initialized after recovery")
 
-            logger.debug(f"ChromaDB initialized at {self.persist_directory}")
+                    except BaseException as retry_error:
+                        # Re-raise system exceptions
+                        if isinstance(
+                            retry_error, KeyboardInterrupt | SystemExit | GeneratorExit
+                        ):
+                            raise
 
+                        logger.error(
+                            f"Failed to recover from database corruption: {retry_error}"
+                        )
+                        # Mark recovery as attempted to prevent infinite loops
+                        self._recovery_attempted = True
+                        raise DatabaseError(
+                            f"Failed to recover from database corruption. "
+                            f"Please run 'mcp-vector-search reset index' to clear the database. "
+                            f"Error: {retry_error}"
+                        ) from retry_error
+                else:
+                    # Not a Rust panic, re-raise original exception
+                    raise
+
+        except (DatabaseError, DatabaseInitializationError):
+            # Re-raise our own errors without re-processing
+            raise
         except Exception as e:
-            # Check if this is a corruption error
+            # Check if this is a corruption error (legacy detection for backward compatibility)
            error_msg = str(e).lower()
-            if any(
-                indicator in error_msg
-                for indicator in [
-                    "pickle",
-                    "unpickling",
-                    "eof",
-                    "ran out of input",
-                    "hnsw",
-                    "index",
-                    "deserialize",
-                    "corrupt",
-                ]
-            ):
+            corruption_indicators = [
+                "pickle",
+                "unpickling",
+                "eof",
+                "ran out of input",
+                "hnsw",
+                "index",
+                "deserialize",
+                "corrupt",
+                "file is not a database",  # SQLite corruption
+                "database error",  # ChromaDB database errors
+            ]
+
+            if any(indicator in error_msg for indicator in corruption_indicators):
+                # Prevent infinite recursion - only attempt recovery once
+                if self._recovery_attempted:
+                    logger.error(
+                        f"Recovery already attempted but corruption persists: {e}"
+                    )
+                    raise DatabaseInitializationError(
+                        f"Failed to recover from database corruption. "
+                        f"Please run 'mcp-vector-search reset index' to clear and rebuild the database. Error: {e}"
+                    ) from e
+
                 logger.warning(f"Detected index corruption: {e}")
+                self._recovery_attempted = True
+
                 # Try to recover
                 await self._recover_from_corruption()
-                # Retry initialization
+
+                # Retry initialization ONE TIME
                 await self.initialize()
             else:
                 logger.error(f"Failed to initialize ChromaDB: {e}")
@@ -245,8 +344,16 @@ class ChromaVectorDatabase(VectorDatabase):
         self._collection = None
         logger.debug("ChromaDB connections closed")
 
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
-        """Add code chunks to the database."""
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+                Example: {"chunk_id_1": {"cognitive_complexity": 5, ...}, ...}
+        """
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
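For orientation, the optional metrics parameter changes how indexing code calls this method. A minimal hedged sketch of a caller, assuming a database instance and a list of parsed chunks are already in hand (the metric keys mirror the ones used elsewhere in this diff; the concrete values are illustrative):

    # Hypothetical caller, awaited from async code: map chunk IDs to metric dicts.
    metrics = {
        chunk.chunk_id: {"cognitive_complexity": 5, "smell_count": 0}  # illustrative values
        for chunk in chunks
        if chunk.chunk_id
    }
    await database.add_chunks(chunks, metrics=metrics)

Chunks without an entry in the mapping are simply stored without the extra metric fields, since the merge below only updates metadata when the chunk ID is present in the dict.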
 
@@ -259,11 +366,27 @@ class ChromaVectorDatabase(VectorDatabase):
         ids = []
 
         for chunk in chunks:
-            # Create searchable text
-            searchable_text = self._create_searchable_text(chunk)
-            documents.append(searchable_text)
+            # Debug: Check first chunk content
+            if len(documents) == 0:
+                import sys
+
+                has_meta = "Language:" in chunk.content and "File:" in chunk.content
+                print("\n[DATABASE] First chunk content check:", file=sys.stderr)
+                print(f" Type: {chunk.chunk_type}", file=sys.stderr)
+                print(f" File: {chunk.file_path.name}", file=sys.stderr)
+                print(
+                    f" Has metadata IN chunk.content: {has_meta}", file=sys.stderr
+                )
+                print(
+                    f" Last 100 chars: {repr(chunk.content[-100:])}",
+                    file=sys.stderr,
+                )
 
-            # Create metadata
+            # Store original content directly in documents (no metadata appended)
+            # The embedding will be created from the original content
+            documents.append(chunk.content)
+
+            # Create metadata (searchable fields as metadata, not appended to content)
             metadata = {
                 "file_path": str(chunk.file_path),
                 "start_line": chunk.start_line,
@@ -288,6 +411,12 @@ class ChromaVectorDatabase(VectorDatabase):
                 "subproject_name": chunk.subproject_name or "",
                 "subproject_path": chunk.subproject_path or "",
             }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
             metadatas.append(metadata)
 
             # Use chunk ID
@@ -347,6 +476,34 @@ class ChromaVectorDatabase(VectorDatabase):
                 similarity = max(0.0, 1.0 / (1.0 + distance))
 
                 if similarity >= similarity_threshold:
+                    # Document contains the original content (no metadata appended)
+                    # Parse code smells from JSON if present
+                    code_smells = []
+                    if "code_smells" in metadata:
+                        try:
+                            code_smells = json.loads(metadata["code_smells"])
+                        except (json.JSONDecodeError, TypeError):
+                            code_smells = []
+
+                    # Calculate quality score from metrics (0-100 scale)
+                    quality_score = None
+                    if (
+                        "cognitive_complexity" in metadata
+                        and "smell_count" in metadata
+                    ):
+                        # Simple quality score: penalize complexity and smells
+                        complexity = metadata["cognitive_complexity"]
+                        smells = metadata["smell_count"]
+
+                        # Start with 100, penalize for complexity and smells
+                        score = 100
+                        # Complexity penalty: -2 points per complexity unit
+                        score -= min(50, complexity * 2)
+                        # Smell penalty: -10 points per smell
+                        score -= min(30, smells * 10)
+
+                        quality_score = max(0, score)
+
                     result = SearchResult(
                         content=doc,
                         file_path=Path(metadata["file_path"]),
@@ -358,6 +515,16 @@ class ChromaVectorDatabase(VectorDatabase):
                         chunk_type=metadata.get("chunk_type", "code"),
                         function_name=metadata.get("function_name") or None,
                         class_name=metadata.get("class_name") or None,
+                        # Quality metrics from structural analysis
+                        cognitive_complexity=metadata.get("cognitive_complexity"),
+                        cyclomatic_complexity=metadata.get("cyclomatic_complexity"),
+                        max_nesting_depth=metadata.get("max_nesting_depth"),
+                        parameter_count=metadata.get("parameter_count"),
+                        lines_of_code=metadata.get("lines_of_code"),
+                        complexity_grade=metadata.get("complexity_grade"),
+                        code_smells=code_smells,
+                        smell_count=metadata.get("smell_count"),
+                        quality_score=quality_score,
                     )
                     search_results.append(result)
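To make the scoring heuristic above concrete, here is the same arithmetic as a standalone sketch; the function name is ours, not an API exposed by the package:

    # Restatement of the quality-score heuristic added in the hunk above.
    def quality_score(cognitive_complexity: int, smell_count: int) -> int:
        score = 100
        score -= min(50, cognitive_complexity * 2)  # complexity penalty, capped at 50
        score -= min(30, smell_count * 10)          # smell penalty, capped at 30
        return max(0, score)

    # Example: cognitive complexity 12 and 2 smells -> 100 - 24 - 20 = 56
    assert quality_score(12, 2) == 56

Because both penalties are capped, a chunk that carries these metrics can never score below 20.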
 
@@ -507,6 +674,7 @@ class ChromaVectorDatabase(VectorDatabase):
         if results and results.get("ids"):
             for i, _chunk_id in enumerate(results["ids"]):
                 metadata = results["metadatas"][i]
+                # Document now contains the original content (no metadata appended)
                 content = results["documents"][i]
 
                 # Parse JSON strings back to lists/dicts
@@ -560,6 +728,9 @@ class ChromaVectorDatabase(VectorDatabase):
 
     def _create_searchable_text(self, chunk: CodeChunk) -> str:
         """Create optimized searchable text from code chunk."""
+        import sys
+
+        print("WARNING: _create_searchable_text IS BEING CALLED!", file=sys.stderr)
         parts = [chunk.content]
 
         # Add contextual information
@@ -579,7 +750,24 @@ class ChromaVectorDatabase(VectorDatabase):
         return "\n".join(parts)
 
     def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
-        """Build ChromaDB where clause from filters."""
+        """Build ChromaDB where clause from filters.
+
+        Supports filtering by:
+        - language, file_path, chunk_type (standard fields)
+        - complexity_grade (A, B, C, D, F)
+        - smell_count (0, >0)
+        - cognitive_complexity (range queries using $and)
+
+        Args:
+            filters: Dictionary of filter criteria
+
+        Returns:
+            ChromaDB where clause
+        """
+        # If filters already contain ChromaDB operators ($and, $or), pass through
+        if "$and" in filters or "$or" in filters:
+            return filters
+
         where = {}
 
         for key, value in filters.items():
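A hedged illustration of the filter shapes this docstring describes; the db instance and the concrete values are assumptions for the example, and the dict-operator branch it relies on is added in the next hunk:

    # Plain equality and "!"-prefixed negation (existing behaviour)
    db._build_where_clause({"language": "python", "chunk_type": "!comment"})
    # -> {"language": "python", "chunk_type": {"$ne": "comment"}}

    # Operator query on a metric field (handled by the new isinstance(value, dict) branch)
    db._build_where_clause({"cognitive_complexity": {"$gte": 10}})

    # Pre-built ChromaDB clauses with $and/$or pass through unchanged
    db._build_where_clause({"$and": [{"complexity_grade": "D"}, {"smell_count": {"$gt": 0}}]})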
@@ -587,46 +775,140 @@ class ChromaVectorDatabase(VectorDatabase):
                 where[key] = {"$in": value}
             elif isinstance(value, str) and value.startswith("!"):
                 where[key] = {"$ne": value[1:]}
+            elif isinstance(value, dict):
+                # Support operator queries like {"$gte": 10}
+                where[key] = value
             else:
                 where[key] = value
 
         return where
 
     async def _detect_and_recover_corruption(self) -> None:
-        """Detect and recover from index corruption proactively."""
-        # Check for common corruption indicators in ChromaDB files
+        """Detect and recover from index corruption proactively.
+
+        This method checks for:
+        1. SQLite database corruption (LAYER 1: Pre-initialization check)
+        2. HNSW pickle file corruption
+        3. Metadata/data inconsistencies
+        4. File size anomalies
+        """
+        # LAYER 1: Check SQLite database integrity FIRST (before ChromaDB initialization)
         chroma_db_path = self.persist_directory / "chroma.sqlite3"
 
         # If database doesn't exist yet, nothing to check
         if not chroma_db_path.exists():
             return
 
+        # SQLite integrity check - catches corruption BEFORE Rust panic
+        try:
+            import sqlite3
+
+            logger.debug("Running SQLite integrity check...")
+            conn = sqlite3.connect(str(chroma_db_path))
+            cursor = conn.execute("PRAGMA quick_check")
+            result = cursor.fetchone()[0]
+            conn.close()
+
+            if result != "ok":
+                logger.warning(f"SQLite database corruption detected: {result}")
+                logger.info("Initiating automatic recovery from database corruption...")
+                await self._recover_from_corruption()
+                return
+
+            logger.debug("SQLite integrity check passed")
+
+        except sqlite3.Error as e:
+            logger.warning(f"SQLite database error during integrity check: {e}")
+            logger.info("Initiating automatic recovery from database corruption...")
+            await self._recover_from_corruption()
+            return
+
         # Check for HNSW index files that might be corrupted
-        self.persist_directory / "chroma-collections.parquet"
         index_path = self.persist_directory / "index"
 
         if index_path.exists():
-            # Look for pickle files in the index
+            # Look for pickle files in the index (HNSW metadata)
             pickle_files = list(index_path.glob("**/*.pkl"))
             pickle_files.extend(list(index_path.glob("**/*.pickle")))
+            pickle_files.extend(list(index_path.glob("**/*.bin")))  # Binary HNSW files
+
+            logger.debug(
+                f"Checking {len(pickle_files)} HNSW index files for corruption..."
+            )
 
             for pickle_file in pickle_files:
                 try:
-                    # Try to read the pickle file to detect corruption
-                    import pickle
-
-                    with open(pickle_file, "rb") as f:
-                        pickle.load(f)
-                except (EOFError, pickle.UnpicklingError, Exception) as e:
-                    logger.warning(
-                        f"Corrupted index file detected: {pickle_file} - {e}"
-                    )
+                    # Check file size - suspiciously small files might be corrupted
+                    file_size = pickle_file.stat().st_size
+                    if file_size == 0:
+                        logger.warning(
+                            f"Empty HNSW index file detected: {pickle_file} (0 bytes)"
+                        )
+                        await self._recover_from_corruption()
+                        return
+
+                    # Only validate pickle files (not binary .bin files)
+                    if pickle_file.suffix in (".pkl", ".pickle"):
+                        # Try to read the pickle file to detect corruption
+                        import pickle  # nosec B403 # Trusted internal index files only
+
+                        with open(pickle_file, "rb") as f:
+                            data = pickle.load(f)  # nosec B301 # Trusted internal index files only
+
+                        # Additional validation: check if data structure is valid
+                        if data is None:
+                            logger.warning(
+                                f"HNSW index file contains None data: {pickle_file}"
+                            )
+                            await self._recover_from_corruption()
+                            return
+
+                        # Check for metadata consistency (if it's a dict)
+                        if isinstance(data, dict):
+                            # Look for known metadata keys that should exist
+                            if "space" in data and "dim" in data:
+                                # Validate dimensions are reasonable
+                                if data.get("dim", 0) <= 0:
+                                    logger.warning(
+                                        f"Invalid dimensions in HNSW index: {pickle_file} (dim={data.get('dim')})"
+                                    )
+                                    await self._recover_from_corruption()
+                                    return
+
+                except (EOFError, pickle.UnpicklingError) as e:
+                    logger.warning(f"Pickle corruption detected in {pickle_file}: {e}")
                     await self._recover_from_corruption()
                     return
+                except Exception as e:
+                    # Check if this is a Rust panic pattern
+                    error_msg = str(e).lower()
+                    if "range start index" in error_msg and "out of range" in error_msg:
+                        logger.warning(
+                            f"Rust panic pattern detected in {pickle_file}: {e}"
+                        )
+                        await self._recover_from_corruption()
+                        return
+                    else:
+                        logger.warning(
+                            f"Error reading HNSW index file {pickle_file}: {e}"
+                        )
+                        # Continue checking other files before deciding to recover
+                        continue
+
+            logger.debug("HNSW index files validation passed")
 
     async def _recover_from_corruption(self) -> None:
-        """Recover from index corruption by rebuilding the index."""
-        logger.info("Attempting to recover from index corruption...")
+        """Recover from index corruption by rebuilding the index.
+
+        This method:
+        1. Creates a timestamped backup of the corrupted index
+        2. Clears the corrupted index directory
+        3. Recreates the directory structure
+        4. Logs detailed recovery steps and instructions
+        """
+        logger.warning("=" * 80)
+        logger.warning("INDEX CORRUPTION DETECTED - Initiating recovery...")
+        logger.warning("=" * 80)
 
         # Create backup directory
         backup_dir = (
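The LAYER 1 pre-check in the hunk above uses only standard-library calls, so the same probe can be run by hand against a suspect index. A minimal sketch, assuming the index lives under whatever persist_directory a given project uses (the path below is illustrative):

    import sqlite3
    from pathlib import Path

    db_path = Path("path/to/persist_directory") / "chroma.sqlite3"  # illustrative location
    if db_path.exists():
        conn = sqlite3.connect(str(db_path))
        status = conn.execute("PRAGMA quick_check").fetchone()[0]  # "ok" when healthy
        conn.close()
        print("integrity ok" if status == "ok" else f"corruption reported: {status}")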
@@ -634,7 +916,7 @@ class ChromaVectorDatabase(VectorDatabase):
         )
         backup_dir.mkdir(exist_ok=True)
 
-        # Backup current state (in case we need it)
+        # Backup current state (in case we need it for debugging)
         import time
 
         timestamp = int(time.time())
@@ -643,24 +925,41 @@ class ChromaVectorDatabase(VectorDatabase):
         if self.persist_directory.exists():
             try:
                 shutil.copytree(self.persist_directory, backup_path)
-                logger.info(f"Created backup at {backup_path}")
+                logger.info(f"Created backup at {backup_path}")
             except Exception as e:
-                logger.warning(f"Could not create backup: {e}")
+                logger.warning(f"Could not create backup: {e}")
 
         # Clear the corrupted index
         if self.persist_directory.exists():
             try:
+                # Log what we're about to delete
+                total_size = sum(
+                    f.stat().st_size
+                    for f in self.persist_directory.rglob("*")
+                    if f.is_file()
+                )
+                logger.info(
+                    f"Clearing corrupted index ({total_size / 1024 / 1024:.2f} MB)..."
+                )
+
                 shutil.rmtree(self.persist_directory)
-                logger.info(f"Cleared corrupted index at {self.persist_directory}")
+                logger.info(f"Cleared corrupted index at {self.persist_directory}")
             except Exception as e:
-                logger.error(f"Failed to clear corrupted index: {e}")
+                logger.error(f"Failed to clear corrupted index: {e}")
                 raise IndexCorruptionError(
-                    f"Could not clear corrupted index: {e}"
+                    f"Could not clear corrupted index: {e}. "
+                    f"Please manually delete {self.persist_directory} and try again."
                 ) from e
 
         # Recreate the directory
         self.persist_directory.mkdir(parents=True, exist_ok=True)
-        logger.info("Index directory recreated. Please re-index your codebase.")
+        logger.info("Index directory recreated")
+
+        logger.warning("=" * 80)
+        logger.warning("RECOVERY COMPLETE - Next steps:")
+        logger.warning(" 1. Run 'mcp-vector-search index' to rebuild the index")
+        logger.warning(f" 2. Backup saved to: {backup_path}")
+        logger.warning("=" * 80)
 
     async def health_check(self) -> bool:
         """Check database health and integrity.
@@ -762,8 +1061,15 @@ class PooledChromaVectorDatabase(VectorDatabase):
         await self._pool.close()
         logger.debug("Pooled ChromaDB connections closed")
 
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
-        """Add code chunks to the database using pooled connection."""
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database using pooled connection with optional metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+        """
         if not chunks:
             return
 
@@ -779,35 +1085,40 @@ class PooledChromaVectorDatabase(VectorDatabase):
         ids = []
 
         for chunk in chunks:
+            # Store original content in documents (no metadata appended)
             documents.append(chunk.content)
-            metadatas.append(
-                {
-                    "file_path": str(chunk.file_path),
-                    "start_line": chunk.start_line,
-                    "end_line": chunk.end_line,
-                    "language": chunk.language,
-                    "chunk_type": chunk.chunk_type,
-                    "function_name": chunk.function_name or "",
-                    "class_name": chunk.class_name or "",
-                    "docstring": chunk.docstring or "",
-                    "complexity_score": chunk.complexity_score,
-                    # Hierarchy fields (convert lists to JSON strings for ChromaDB)
-                    "chunk_id": chunk.chunk_id or "",
-                    "parent_chunk_id": chunk.parent_chunk_id or "",
-                    "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
-                    "chunk_depth": chunk.chunk_depth,
-                    # Additional metadata (convert lists/dicts to JSON strings)
-                    "decorators": json.dumps(chunk.decorators or []),
-                    "parameters": json.dumps(chunk.parameters or []),
-                    "return_type": chunk.return_type or "",
-                    "type_annotations": json.dumps(
-                        chunk.type_annotations or {}
-                    ),
-                    # Monorepo support
-                    "subproject_name": chunk.subproject_name or "",
-                    "subproject_path": chunk.subproject_path or "",
-                }
-            )
+
+            metadata = {
+                "file_path": str(chunk.file_path),
+                "start_line": chunk.start_line,
+                "end_line": chunk.end_line,
+                "language": chunk.language,
+                "chunk_type": chunk.chunk_type,
+                "function_name": chunk.function_name or "",
+                "class_name": chunk.class_name or "",
+                "docstring": chunk.docstring or "",
+                "complexity_score": chunk.complexity_score,
+                # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                "chunk_id": chunk.chunk_id or "",
+                "parent_chunk_id": chunk.parent_chunk_id or "",
+                "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                "chunk_depth": chunk.chunk_depth,
+                # Additional metadata (convert lists/dicts to JSON strings)
+                "decorators": json.dumps(chunk.decorators or []),
+                "parameters": json.dumps(chunk.parameters or []),
+                "return_type": chunk.return_type or "",
+                "type_annotations": json.dumps(chunk.type_annotations or {}),
+                # Monorepo support
+                "subproject_name": chunk.subproject_name or "",
+                "subproject_path": chunk.subproject_path or "",
+            }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
+            metadatas.append(metadata)
             ids.append(chunk.id)
 
         # Add to collection
@@ -862,6 +1173,7 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 similarity = max(0.0, 1.0 / (1.0 + distance))
 
                 if similarity >= similarity_threshold:
+                    # Document contains the original content (no metadata appended)
                     result = SearchResult(
                         content=doc,
                         file_path=Path(metadata["file_path"]),