mcp-vector-search 0.12.6__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/analysis/__init__.py +64 -0
- mcp_vector_search/analysis/collectors/__init__.py +39 -0
- mcp_vector_search/analysis/collectors/base.py +164 -0
- mcp_vector_search/analysis/collectors/complexity.py +743 -0
- mcp_vector_search/analysis/metrics.py +341 -0
- mcp_vector_search/analysis/reporters/__init__.py +5 -0
- mcp_vector_search/analysis/reporters/console.py +222 -0
- mcp_vector_search/cli/commands/analyze.py +408 -0
- mcp_vector_search/cli/commands/chat.py +1262 -0
- mcp_vector_search/cli/commands/index.py +21 -3
- mcp_vector_search/cli/commands/init.py +13 -0
- mcp_vector_search/cli/commands/install.py +597 -335
- mcp_vector_search/cli/commands/install_old.py +8 -4
- mcp_vector_search/cli/commands/mcp.py +78 -6
- mcp_vector_search/cli/commands/reset.py +68 -26
- mcp_vector_search/cli/commands/search.py +30 -7
- mcp_vector_search/cli/commands/setup.py +1133 -0
- mcp_vector_search/cli/commands/status.py +37 -2
- mcp_vector_search/cli/commands/uninstall.py +276 -357
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +276 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +714 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +311 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +180 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +2507 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +1313 -0
- mcp_vector_search/cli/commands/visualize.py.original +2536 -0
- mcp_vector_search/cli/didyoumean.py +22 -2
- mcp_vector_search/cli/main.py +115 -159
- mcp_vector_search/cli/output.py +24 -8
- mcp_vector_search/config/__init__.py +4 -0
- mcp_vector_search/config/default_thresholds.yaml +52 -0
- mcp_vector_search/config/settings.py +12 -0
- mcp_vector_search/config/thresholds.py +185 -0
- mcp_vector_search/core/auto_indexer.py +3 -3
- mcp_vector_search/core/boilerplate.py +186 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/database.py +369 -94
- mcp_vector_search/core/exceptions.py +11 -0
- mcp_vector_search/core/git_hooks.py +4 -4
- mcp_vector_search/core/indexer.py +221 -4
- mcp_vector_search/core/llm_client.py +751 -0
- mcp_vector_search/core/models.py +3 -0
- mcp_vector_search/core/project.py +17 -0
- mcp_vector_search/core/scheduler.py +11 -11
- mcp_vector_search/core/search.py +179 -29
- mcp_vector_search/mcp/server.py +24 -5
- mcp_vector_search/utils/__init__.py +2 -0
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +66 -4
- mcp_vector_search/utils/timing.py +10 -6
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/METADATA +182 -52
- mcp_vector_search-1.0.3.dist-info/RECORD +97 -0
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/WHEEL +1 -1
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/entry_points.txt +1 -0
- mcp_vector_search/cli/commands/visualize.py +0 -1467
- mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.0.3.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py

```diff
@@ -44,11 +44,14 @@ class VectorDatabase(ABC):
         ...
 
     @abstractmethod
-    async def add_chunks(
-
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
 
         Args:
             chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
         """
         ...
 
```
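The `metrics` parameter is optional, so the interface change is backward compatible: pre-1.0 callers keep passing only `chunks`, while the indexer can now attach per-chunk structural metrics. A minimal sketch of the new call shape, assuming a hypothetical `collect()` metrics helper (only the `add_chunks` signature and the `to_metadata()` dict shape come from this diff):

```python
from typing import Any


# Hypothetical glue code: `db` is any VectorDatabase implementation from this
# package, `chunks` are CodeChunk objects, and `collect` stands in for the
# metrics collector added under mcp_vector_search/analysis/.
async def index_with_metrics(db: Any, chunks: list[Any], collect: Any) -> None:
    metrics = {
        chunk.chunk_id: collect(chunk).to_metadata()  # flat dict, e.g. {"cognitive_complexity": 5}
        for chunk in chunks
        if chunk.chunk_id
    }
    await db.add_chunks(chunks, metrics=metrics)  # old call sites: await db.add_chunks(chunks)
```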
```diff
@@ -148,6 +151,7 @@ class ChromaVectorDatabase(VectorDatabase):
         self.collection_name = collection_name
         self._client = None
         self._collection = None
+        self._recovery_attempted = False  # Guard against infinite recursion
 
     async def initialize(self) -> None:
         """Initialize ChromaDB client and collection with corruption recovery."""
```
```diff
@@ -157,49 +161,144 @@ class ChromaVectorDatabase(VectorDatabase):
         # Ensure directory exists
         self.persist_directory.mkdir(parents=True, exist_ok=True)
 
-        # Check for corruption before initializing
+        # LAYER 1: Check for corruption before initializing (SQLite + HNSW checks)
         await self._detect_and_recover_corruption()
 
-        #
-
-
-
-
-
-
-
+        # LAYER 2: Wrap ChromaDB initialization with Rust panic detection
+        try:
+            # Create client with new API
+            self._client = chromadb.PersistentClient(
+                path=str(self.persist_directory),
+                settings=chromadb.Settings(
+                    anonymized_telemetry=False,
+                    allow_reset=True,
+                ),
+            )
 
-
-
-
-
-
-
-
-
+            # Create or get collection
+            self._collection = self._client.get_or_create_collection(
+                name=self.collection_name,
+                embedding_function=self.embedding_function,
+                metadata={
+                    "description": "Semantic code search collection",
+                },
+            )
+
+            # Reset recovery flag on successful initialization
+            self._recovery_attempted = False
+
+            logger.debug(f"ChromaDB initialized at {self.persist_directory}")
+
+        except BaseException as init_error:
+            # Re-raise system exceptions we should never catch
+            if isinstance(
+                init_error, (KeyboardInterrupt, SystemExit, GeneratorExit)
+            ):
+                raise
+
+            # LAYER 2: Detect Rust panic patterns during initialization
+            error_msg = str(init_error).lower()
+
+            # Rust panic patterns (common ChromaDB Rust panics)
+            rust_panic_patterns = [
+                "range start index",
+                "out of range",
+                "panic",
+                "thread panicked",
+                "slice of length",
+                "index out of bounds",
+            ]
+
+            if any(pattern in error_msg for pattern in rust_panic_patterns):
+                logger.warning(
+                    f"Rust panic detected during ChromaDB initialization: {init_error}"
+                )
+                logger.info(
+                    "Attempting automatic recovery from database corruption..."
+                )
+                await self._recover_from_corruption()
+
+                # Retry initialization ONCE after recovery
+                try:
+                    logger.info(
+                        "Retrying ChromaDB initialization after recovery..."
+                    )
+                    self._client = chromadb.PersistentClient(
+                        path=str(self.persist_directory),
+                        settings=chromadb.Settings(
+                            anonymized_telemetry=False,
+                            allow_reset=True,
+                        ),
+                    )
+
+                    self._collection = self._client.get_or_create_collection(
+                        name=self.collection_name,
+                        embedding_function=self.embedding_function,
+                        metadata={
+                            "description": "Semantic code search collection",
+                        },
+                    )
+
+                    logger.info("ChromaDB successfully initialized after recovery")
 
-
+                except BaseException as retry_error:
+                    # Re-raise system exceptions
+                    if isinstance(
+                        retry_error, (KeyboardInterrupt, SystemExit, GeneratorExit)
+                    ):
+                        raise
 
+                    logger.error(
+                        f"Failed to recover from database corruption: {retry_error}"
+                    )
+                    # Mark recovery as attempted to prevent infinite loops
+                    self._recovery_attempted = True
+                    raise DatabaseError(
+                        f"Failed to recover from database corruption. "
+                        f"Please run 'mcp-vector-search reset index' to clear the database. "
+                        f"Error: {retry_error}"
+                    ) from retry_error
+            else:
+                # Not a Rust panic, re-raise original exception
+                raise
+
+        except (DatabaseError, DatabaseInitializationError):
+            # Re-raise our own errors without re-processing
+            raise
         except Exception as e:
-            # Check if this is a corruption error
+            # Check if this is a corruption error (legacy detection for backward compatibility)
             error_msg = str(e).lower()
-
-
-
-
-
-
-
-
-
-
-
-
-
+            corruption_indicators = [
+                "pickle",
+                "unpickling",
+                "eof",
+                "ran out of input",
+                "hnsw",
+                "index",
+                "deserialize",
+                "corrupt",
+                "file is not a database",  # SQLite corruption
+                "database error",  # ChromaDB database errors
+            ]
+
+            if any(indicator in error_msg for indicator in corruption_indicators):
+                # Prevent infinite recursion - only attempt recovery once
+                if self._recovery_attempted:
+                    logger.error(
+                        f"Recovery already attempted but corruption persists: {e}"
+                    )
+                    raise DatabaseInitializationError(
+                        f"Failed to recover from database corruption. "
+                        f"Please run 'mcp-vector-search reset index' to clear and rebuild the database. Error: {e}"
+                    ) from e
+
                 logger.warning(f"Detected index corruption: {e}")
+                self._recovery_attempted = True
+
                 # Try to recover
                 await self._recover_from_corruption()
-
+
+                # Retry initialization ONE TIME
                 await self.initialize()
             else:
                 logger.error(f"Failed to initialize ChromaDB: {e}")
```
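ChromaDB's Rust panics surface in Python as ordinary exceptions with distinctive messages, so the new handler classifies them by substring matching before deciding whether to recover; recovery is attempted once, and the `_recovery_attempted` flag turns a second failure into a hard error instead of a retry loop. The same classification logic as a standalone sketch (patterns copied from the hunk above):

```python
RUST_PANIC_PATTERNS = [
    "range start index",
    "out of range",
    "panic",
    "thread panicked",
    "slice of length",
    "index out of bounds",
]


def looks_like_rust_panic(exc: BaseException) -> bool:
    """True if the exception message matches a known ChromaDB Rust panic."""
    msg = str(exc).lower()
    return any(pattern in msg for pattern in RUST_PANIC_PATTERNS)


# looks_like_rust_panic(RuntimeError(
#     "range start index 8 out of range for slice of length 4"))  -> True
```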
```diff
@@ -245,8 +344,16 @@ class ChromaVectorDatabase(VectorDatabase):
         self._collection = None
         logger.debug("ChromaDB connections closed")
 
-    async def add_chunks(
-
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+                Example: {"chunk_id_1": {"cognitive_complexity": 5, ...}, ...}
+        """
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
 
```
```diff
@@ -259,11 +366,27 @@ class ChromaVectorDatabase(VectorDatabase):
         ids = []
 
         for chunk in chunks:
-            #
-
-
+            # Debug: Check first chunk content
+            if len(documents) == 0:
+                import sys
+
+                has_meta = "Language:" in chunk.content and "File:" in chunk.content
+                print("\n[DATABASE] First chunk content check:", file=sys.stderr)
+                print(f" Type: {chunk.chunk_type}", file=sys.stderr)
+                print(f" File: {chunk.file_path.name}", file=sys.stderr)
+                print(
+                    f" Has metadata IN chunk.content: {has_meta}", file=sys.stderr
+                )
+                print(
+                    f" Last 100 chars: {repr(chunk.content[-100:])}",
+                    file=sys.stderr,
+                )
 
-            #
+            # Store original content directly in documents (no metadata appended)
+            # The embedding will be created from the original content
+            documents.append(chunk.content)
+
+            # Create metadata (searchable fields as metadata, not appended to content)
             metadata = {
                 "file_path": str(chunk.file_path),
                 "start_line": chunk.start_line,
```
```diff
@@ -288,6 +411,12 @@ class ChromaVectorDatabase(VectorDatabase):
                 "subproject_name": chunk.subproject_name or "",
                 "subproject_path": chunk.subproject_path or "",
             }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
             metadatas.append(metadata)
 
             # Use chunk ID
```
```diff
@@ -347,6 +476,7 @@ class ChromaVectorDatabase(VectorDatabase):
                 similarity = max(0.0, 1.0 / (1.0 + distance))
 
                 if similarity >= similarity_threshold:
+                    # Document contains the original content (no metadata appended)
                     result = SearchResult(
                         content=doc,
                         file_path=Path(metadata["file_path"]),
```
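The `1.0 / (1.0 + distance)` conversion maps a non-negative distance onto a (0, 1] similarity score: distance 0 gives 1.0, distance 1 gives 0.5, and larger distances decay toward 0, which is then cut off by `similarity_threshold`. A quick check of the formula used in the hunk above:

```python
def similarity(distance: float) -> float:
    # Same conversion as the search path; max() guards against
    # pathological negative distances.
    return max(0.0, 1.0 / (1.0 + distance))


assert similarity(0.0) == 1.0
assert similarity(1.0) == 0.5
assert abs(similarity(3.0) - 0.25) < 1e-9
```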
```diff
@@ -507,6 +637,7 @@ class ChromaVectorDatabase(VectorDatabase):
         if results and results.get("ids"):
             for i, _chunk_id in enumerate(results["ids"]):
                 metadata = results["metadatas"][i]
+                # Document now contains the original content (no metadata appended)
                 content = results["documents"][i]
 
                 # Parse JSON strings back to lists/dicts
```
```diff
@@ -560,6 +691,9 @@ class ChromaVectorDatabase(VectorDatabase):
 
     def _create_searchable_text(self, chunk: CodeChunk) -> str:
         """Create optimized searchable text from code chunk."""
+        import sys
+
+        print("WARNING: _create_searchable_text IS BEING CALLED!", file=sys.stderr)
         parts = [chunk.content]
 
         # Add contextual information
```
```diff
@@ -579,7 +713,24 @@ class ChromaVectorDatabase(VectorDatabase):
         return "\n".join(parts)
 
     def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
-        """Build ChromaDB where clause from filters.
+        """Build ChromaDB where clause from filters.
+
+        Supports filtering by:
+        - language, file_path, chunk_type (standard fields)
+        - complexity_grade (A, B, C, D, F)
+        - smell_count (0, >0)
+        - cognitive_complexity (range queries using $and)
+
+        Args:
+            filters: Dictionary of filter criteria
+
+        Returns:
+            ChromaDB where clause
+        """
+        # If filters already contain ChromaDB operators ($and, $or), pass through
+        if "$and" in filters or "$or" in filters:
+            return filters
+
         where = {}
 
         for key, value in filters.items():
```
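Together with the operator-dict branch added in the next hunk, `_build_where_clause` accepts four filter shapes. A sketch of inputs and the clauses they produce under the rules shown in this diff:

```python
# Plain equality
{"language": "python"}                  # -> {"language": "python"}

# A list becomes an $in membership test
{"language": ["python", "go"]}          # -> {"language": {"$in": ["python", "go"]}}

# A leading "!" becomes $ne
{"chunk_type": "!comment"}              # -> {"chunk_type": {"$ne": "comment"}}

# Operator dicts are passed through per key, and pre-built
# $and/$or clauses bypass translation entirely
{"cognitive_complexity": {"$gte": 10}}
{"$and": [{"language": "python"}, {"complexity_grade": "F"}]}
```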
```diff
@@ -587,46 +738,140 @@ class ChromaVectorDatabase(VectorDatabase):
                 where[key] = {"$in": value}
             elif isinstance(value, str) and value.startswith("!"):
                 where[key] = {"$ne": value[1:]}
+            elif isinstance(value, dict):
+                # Support operator queries like {"$gte": 10}
+                where[key] = value
             else:
                 where[key] = value
 
         return where
 
     async def _detect_and_recover_corruption(self) -> None:
-        """Detect and recover from index corruption proactively.
-
+        """Detect and recover from index corruption proactively.
+
+        This method checks for:
+        1. SQLite database corruption (LAYER 1: Pre-initialization check)
+        2. HNSW pickle file corruption
+        3. Metadata/data inconsistencies
+        4. File size anomalies
+        """
+        # LAYER 1: Check SQLite database integrity FIRST (before ChromaDB initialization)
         chroma_db_path = self.persist_directory / "chroma.sqlite3"
 
         # If database doesn't exist yet, nothing to check
         if not chroma_db_path.exists():
             return
 
+        # SQLite integrity check - catches corruption BEFORE Rust panic
+        try:
+            import sqlite3
+
+            logger.debug("Running SQLite integrity check...")
+            conn = sqlite3.connect(str(chroma_db_path))
+            cursor = conn.execute("PRAGMA quick_check")
+            result = cursor.fetchone()[0]
+            conn.close()
+
+            if result != "ok":
+                logger.warning(f"SQLite database corruption detected: {result}")
+                logger.info("Initiating automatic recovery from database corruption...")
+                await self._recover_from_corruption()
+                return
+
+            logger.debug("SQLite integrity check passed")
+
+        except sqlite3.Error as e:
+            logger.warning(f"SQLite database error during integrity check: {e}")
+            logger.info("Initiating automatic recovery from database corruption...")
+            await self._recover_from_corruption()
+            return
+
         # Check for HNSW index files that might be corrupted
-        self.persist_directory / "chroma-collections.parquet"
         index_path = self.persist_directory / "index"
 
         if index_path.exists():
-            # Look for pickle files in the index
+            # Look for pickle files in the index (HNSW metadata)
             pickle_files = list(index_path.glob("**/*.pkl"))
             pickle_files.extend(list(index_path.glob("**/*.pickle")))
+            pickle_files.extend(list(index_path.glob("**/*.bin")))  # Binary HNSW files
+
+            logger.debug(
+                f"Checking {len(pickle_files)} HNSW index files for corruption..."
+            )
 
             for pickle_file in pickle_files:
                 try:
-                    #
-
-
-
-
-
-
-
-
+                    # Check file size - suspiciously small files might be corrupted
+                    file_size = pickle_file.stat().st_size
+                    if file_size == 0:
+                        logger.warning(
+                            f"Empty HNSW index file detected: {pickle_file} (0 bytes)"
+                        )
+                        await self._recover_from_corruption()
+                        return
+
+                    # Only validate pickle files (not binary .bin files)
+                    if pickle_file.suffix in (".pkl", ".pickle"):
+                        # Try to read the pickle file to detect corruption
+                        import pickle  # nosec B403 # Trusted internal index files only
+
+                        with open(pickle_file, "rb") as f:
+                            data = pickle.load(f)  # nosec B301 # Trusted internal index files only
+
+                        # Additional validation: check if data structure is valid
+                        if data is None:
+                            logger.warning(
+                                f"HNSW index file contains None data: {pickle_file}"
+                            )
+                            await self._recover_from_corruption()
+                            return
+
+                        # Check for metadata consistency (if it's a dict)
+                        if isinstance(data, dict):
+                            # Look for known metadata keys that should exist
+                            if "space" in data and "dim" in data:
+                                # Validate dimensions are reasonable
+                                if data.get("dim", 0) <= 0:
+                                    logger.warning(
+                                        f"Invalid dimensions in HNSW index: {pickle_file} (dim={data.get('dim')})"
+                                    )
+                                    await self._recover_from_corruption()
+                                    return
+
+                except (EOFError, pickle.UnpicklingError) as e:
+                    logger.warning(f"Pickle corruption detected in {pickle_file}: {e}")
                     await self._recover_from_corruption()
                     return
+                except Exception as e:
+                    # Check if this is a Rust panic pattern
+                    error_msg = str(e).lower()
+                    if "range start index" in error_msg and "out of range" in error_msg:
+                        logger.warning(
+                            f"Rust panic pattern detected in {pickle_file}: {e}"
+                        )
+                        await self._recover_from_corruption()
+                        return
+                    else:
+                        logger.warning(
+                            f"Error reading HNSW index file {pickle_file}: {e}"
+                        )
+                        # Continue checking other files before deciding to recover
+                        continue
+
+            logger.debug("HNSW index files validation passed")
 
     async def _recover_from_corruption(self) -> None:
-        """Recover from index corruption by rebuilding the index.
-
+        """Recover from index corruption by rebuilding the index.
+
+        This method:
+        1. Creates a timestamped backup of the corrupted index
+        2. Clears the corrupted index directory
+        3. Recreates the directory structure
+        4. Logs detailed recovery steps and instructions
+        """
+        logger.warning("=" * 80)
+        logger.warning("INDEX CORRUPTION DETECTED - Initiating recovery...")
+        logger.warning("=" * 80)
 
         # Create backup directory
         backup_dir = (
```
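The Layer-1 probe relies on SQLite's built-in `PRAGMA quick_check`, which can be run against the `chroma.sqlite3` file without going through ChromaDB at all, so corruption is caught before the Rust bindings ever touch the file. A minimal standalone version of the same check:

```python
import sqlite3
from pathlib import Path


def sqlite_is_healthy(db_path: Path) -> bool:
    """Run PRAGMA quick_check; True only if SQLite reports 'ok'."""
    try:
        conn = sqlite3.connect(str(db_path))
        try:
            result = conn.execute("PRAGMA quick_check").fetchone()[0]
        finally:
            conn.close()
        return result == "ok"
    except sqlite3.Error:
        # Failing to even run the pragma is treated as corruption upstream.
        return False


# e.g. sqlite_is_healthy(persist_directory / "chroma.sqlite3")
```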
```diff
@@ -634,7 +879,7 @@ class ChromaVectorDatabase(VectorDatabase):
         )
         backup_dir.mkdir(exist_ok=True)
 
-        # Backup current state (in case we need it)
+        # Backup current state (in case we need it for debugging)
         import time
 
         timestamp = int(time.time())
```
```diff
@@ -643,24 +888,41 @@ class ChromaVectorDatabase(VectorDatabase):
         if self.persist_directory.exists():
             try:
                 shutil.copytree(self.persist_directory, backup_path)
-                logger.info(f"Created backup at {backup_path}")
+                logger.info(f"✓ Created backup at {backup_path}")
             except Exception as e:
-                logger.warning(f"Could not create backup: {e}")
+                logger.warning(f"⚠ Could not create backup: {e}")
 
         # Clear the corrupted index
         if self.persist_directory.exists():
             try:
+                # Log what we're about to delete
+                total_size = sum(
+                    f.stat().st_size
+                    for f in self.persist_directory.rglob("*")
+                    if f.is_file()
+                )
+                logger.info(
+                    f"Clearing corrupted index ({total_size / 1024 / 1024:.2f} MB)..."
+                )
+
                 shutil.rmtree(self.persist_directory)
-                logger.info(f"Cleared corrupted index at {self.persist_directory}")
+                logger.info(f"✓ Cleared corrupted index at {self.persist_directory}")
             except Exception as e:
-                logger.error(f"Failed to clear corrupted index: {e}")
+                logger.error(f"✗ Failed to clear corrupted index: {e}")
                 raise IndexCorruptionError(
-                    f"Could not clear corrupted index: {e}"
+                    f"Could not clear corrupted index: {e}. "
+                    f"Please manually delete {self.persist_directory} and try again."
                 ) from e
 
         # Recreate the directory
         self.persist_directory.mkdir(parents=True, exist_ok=True)
-        logger.info("Index directory recreated")
+        logger.info("✓ Index directory recreated")
+
+        logger.warning("=" * 80)
+        logger.warning("RECOVERY COMPLETE - Next steps:")
+        logger.warning("  1. Run 'mcp-vector-search index' to rebuild the index")
+        logger.warning(f"  2. Backup saved to: {backup_path}")
+        logger.warning("=" * 80)
 
     async def health_check(self) -> bool:
         """Check database health and integrity.
```
```diff
@@ -762,8 +1024,15 @@ class PooledChromaVectorDatabase(VectorDatabase):
         await self._pool.close()
         logger.debug("Pooled ChromaDB connections closed")
 
-    async def add_chunks(
-
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database using pooled connection with optional metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+        """
         if not chunks:
             return
 
```
```diff
@@ -779,35 +1048,40 @@ class PooledChromaVectorDatabase(VectorDatabase):
         ids = []
 
         for chunk in chunks:
+            # Store original content in documents (no metadata appended)
             documents.append(chunk.content)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            metadata = {
+                "file_path": str(chunk.file_path),
+                "start_line": chunk.start_line,
+                "end_line": chunk.end_line,
+                "language": chunk.language,
+                "chunk_type": chunk.chunk_type,
+                "function_name": chunk.function_name or "",
+                "class_name": chunk.class_name or "",
+                "docstring": chunk.docstring or "",
+                "complexity_score": chunk.complexity_score,
+                # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                "chunk_id": chunk.chunk_id or "",
+                "parent_chunk_id": chunk.parent_chunk_id or "",
+                "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                "chunk_depth": chunk.chunk_depth,
+                # Additional metadata (convert lists/dicts to JSON strings)
+                "decorators": json.dumps(chunk.decorators or []),
+                "parameters": json.dumps(chunk.parameters or []),
+                "return_type": chunk.return_type or "",
+                "type_annotations": json.dumps(chunk.type_annotations or {}),
+                # Monorepo support
+                "subproject_name": chunk.subproject_name or "",
+                "subproject_path": chunk.subproject_path or "",
+            }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
+            metadatas.append(metadata)
             ids.append(chunk.id)
 
         # Add to collection
```
```diff
@@ -862,6 +1136,7 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 similarity = max(0.0, 1.0 / (1.0 + distance))
 
                 if similarity >= similarity_threshold:
+                    # Document contains the original content (no metadata appended)
                     result = SearchResult(
                         content=doc,
                         file_path=Path(metadata["file_path"]),
```
mcp_vector_search/core/exceptions.py

```diff
@@ -53,6 +53,17 @@ class IndexCorruptionError(DatabaseError):
     pass
 
 
+class RustPanicError(DatabaseError):
+    """ChromaDB Rust bindings panic detected.
+
+    This error occurs when ChromaDB's Rust bindings encounter
+    HNSW index metadata inconsistencies, typically manifesting as:
+    'range start index X out of range for slice of length Y'
+    """
+
+    pass
+
+
 class ParsingError(MCPVectorSearchError):
     """Code parsing errors."""
 
```
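Note that the recovery paths above still raise `DatabaseError` and `DatabaseInitializationError`; in the hunks shown here, `RustPanicError` is defined but never raised. A hedged sketch of how the new class could slot into the detection path, reusing the `looks_like_rust_panic` helper sketched after the initialize() hunk:

```python
# Illustrative only: nothing in this diff wires RustPanicError into
# initialize(); that path raises DatabaseError instead.
def reraise_classified(exc: Exception) -> None:
    if looks_like_rust_panic(exc):
        raise RustPanicError(f"ChromaDB Rust panic: {exc}") from exc
    raise exc
```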