mcp-vector-search: mcp_vector_search-0.4.13-py3-none-any.whl → mcp_vector_search-0.5.0-py3-none-any.whl
This diff compares the contents of two package versions as released to a supported public registry. It is provided for informational purposes only.
Potentially problematic release: this version of mcp-vector-search has been flagged for review.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/index.py +73 -31
- mcp_vector_search/cli/commands/init.py +189 -113
- mcp_vector_search/cli/commands/install.py +525 -113
- mcp_vector_search/cli/commands/mcp.py +201 -151
- mcp_vector_search/cli/commands/reset.py +41 -41
- mcp_vector_search/cli/commands/search.py +73 -14
- mcp_vector_search/cli/commands/status.py +51 -17
- mcp_vector_search/cli/didyoumean.py +254 -246
- mcp_vector_search/cli/main.py +171 -52
- mcp_vector_search/cli/output.py +152 -0
- mcp_vector_search/cli/suggestions.py +246 -197
- mcp_vector_search/core/database.py +81 -49
- mcp_vector_search/core/indexer.py +10 -4
- mcp_vector_search/core/search.py +17 -6
- mcp_vector_search/mcp/__main__.py +1 -1
- mcp_vector_search/mcp/server.py +211 -203
- mcp_vector_search/parsers/__init__.py +6 -0
- mcp_vector_search/parsers/dart.py +605 -0
- mcp_vector_search/parsers/php.py +694 -0
- mcp_vector_search/parsers/registry.py +16 -1
- mcp_vector_search/parsers/ruby.py +678 -0
- mcp_vector_search/parsers/text.py +31 -25
- mcp_vector_search/utils/gitignore.py +72 -71
- {mcp_vector_search-0.4.13.dist-info → mcp_vector_search-0.5.0.dist-info}/METADATA +59 -2
- {mcp_vector_search-0.4.13.dist-info → mcp_vector_search-0.5.0.dist-info}/RECORD +29 -26
- {mcp_vector_search-0.4.13.dist-info → mcp_vector_search-0.5.0.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.4.13.dist-info → mcp_vector_search-0.5.0.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.4.13.dist-info → mcp_vector_search-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py CHANGED
@@ -100,7 +100,7 @@ class VectorDatabase(ABC):
     @abstractmethod
     async def health_check(self) -> bool:
         """Check database health and integrity.
-
+
         Returns:
             True if database is healthy, False otherwise
         """
@@ -145,7 +145,7 @@ class ChromaVectorDatabase(VectorDatabase):
 
         # Ensure directory exists
         self.persist_directory.mkdir(parents=True, exist_ok=True)
-
+
         # Check for corruption before initializing
         await self._detect_and_recover_corruption()
 
@@ -172,10 +172,19 @@ class ChromaVectorDatabase(VectorDatabase):
         except Exception as e:
             # Check if this is a corruption error
             error_msg = str(e).lower()
-            if any(
-                indicator in error_msg
-                for indicator in ["pickle", "unpickling", "eof", "ran out of input",
-                                  "hnsw", "index", "deserialize", "corrupt"]):
+            if any(
+                indicator in error_msg
+                for indicator in [
+                    "pickle",
+                    "unpickling",
+                    "eof",
+                    "ran out of input",
+                    "hnsw",
+                    "index",
+                    "deserialize",
+                    "corrupt",
+                ]
+            ):
                 logger.warning(f"Detected index corruption: {e}")
                 # Try to recover
                 await self._recover_from_corruption()
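The eight-entry indicator list introduced here is duplicated verbatim at four call sites in this release: the initialize and health_check paths of ChromaVectorDatabase, PooledChromaVectorDatabase.health_check, and the search path in core/search.py. A module-level predicate along the following lines would remove the duplication; this helper is a sketch, not part of the package.

# Hypothetical helper, not in 0.5.0: a single predicate for the corruption
# scan that this release repeats at four call sites.
_CORRUPTION_INDICATORS = (
    "pickle", "unpickling", "eof", "ran out of input",
    "hnsw", "index", "deserialize", "corrupt",
)

def is_corruption_error(error: Exception) -> bool:
    """True when an error message looks like ChromaDB index corruption."""
    message = str(error).lower()
    return any(indicator in message for indicator in _CORRUPTION_INDICATORS)

Broad substrings such as "index" and "eof" will also match many unrelated errors, so a positive result is best treated as a hint to attempt recovery rather than proof of corruption.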
@@ -468,51 +477,57 @@ class ChromaVectorDatabase(VectorDatabase):
         """Detect and recover from index corruption proactively."""
         # Check for common corruption indicators in ChromaDB files
         chroma_db_path = self.persist_directory / "chroma.sqlite3"
-
+
         # If database doesn't exist yet, nothing to check
         if not chroma_db_path.exists():
             return
-
+
         # Check for HNSW index files that might be corrupted
         collection_path = self.persist_directory / "chroma-collections.parquet"
         index_path = self.persist_directory / "index"
-
+
         if index_path.exists():
             # Look for pickle files in the index
             pickle_files = list(index_path.glob("**/*.pkl"))
             pickle_files.extend(list(index_path.glob("**/*.pickle")))
-
+
             for pickle_file in pickle_files:
                 try:
                     # Try to read the pickle file to detect corruption
                     import pickle
-                    with open(pickle_file, "rb") as f:
+
+                    with open(pickle_file, "rb") as f:
                         pickle.load(f)
                 except (EOFError, pickle.UnpicklingError, Exception) as e:
-                    logger.warning(f"Corrupted index file detected: {pickle_file} - {e}")
+                    logger.warning(
+                        f"Corrupted index file detected: {pickle_file} - {e}"
+                    )
                     await self._recover_from_corruption()
                     return
 
     async def _recover_from_corruption(self) -> None:
         """Recover from index corruption by rebuilding the index."""
         logger.info("Attempting to recover from index corruption...")
-
+
         # Create backup directory
-        backup_dir = self.persist_directory.parent / f"{self.persist_directory.name}_backup"
+        backup_dir = (
+            self.persist_directory.parent / f"{self.persist_directory.name}_backup"
+        )
         backup_dir.mkdir(exist_ok=True)
-
+
         # Backup current state (in case we need it)
         import time
+
         timestamp = int(time.time())
         backup_path = backup_dir / f"backup_{timestamp}"
-
+
         if self.persist_directory.exists():
             try:
                 shutil.copytree(self.persist_directory, backup_path)
                 logger.info(f"Created backup at {backup_path}")
             except Exception as e:
                 logger.warning(f"Could not create backup: {e}")
-
+
         # Clear the corrupted index
         if self.persist_directory.exists():
             try:
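The detection pass reduces to attempting to unpickle every .pkl/.pickle file under the index directory; a truncated or corrupt segment raises during load. (The except (EOFError, pickle.UnpicklingError, Exception) clause is effectively except Exception, since Exception subsumes the other two.) A standalone sketch of the probe, with an illustrative function name:

import pickle
from pathlib import Path

def index_looks_corrupt(index_path: Path) -> bool:
    """Probe pickle index segments the way _detect_and_recover_corruption does."""
    candidates = [*index_path.glob("**/*.pkl"), *index_path.glob("**/*.pickle")]
    for pickle_file in candidates:
        try:
            with open(pickle_file, "rb") as f:
                pickle.load(f)  # raises EOFError/UnpicklingError on truncation
        except Exception:
            return True
    return False

Since unpickling executes arbitrary code, this probe is only safe on files the process itself wrote.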
@@ -523,14 +538,14 @@ class ChromaVectorDatabase(VectorDatabase):
                 raise IndexCorruptionError(
                     f"Could not clear corrupted index: {e}"
                 ) from e
-
+
         # Recreate the directory
         self.persist_directory.mkdir(parents=True, exist_ok=True)
         logger.info("Index directory recreated. Please re-index your codebase.")
 
     async def health_check(self) -> bool:
         """Check database health and integrity.
-
+
         Returns:
             True if database is healthy, False otherwise
         """
@@ -539,35 +554,42 @@ class ChromaVectorDatabase(VectorDatabase):
             if not self._client or not self._collection:
                 logger.warning("Database not initialized")
                 return False
-
+
             # Try a simple operation to test the connection
             try:
                 # Attempt to get count - this will fail if index is corrupted
                 count = self._collection.count()
                 logger.debug(f"Health check passed: {count} chunks in database")
-
+
                 # Try a minimal query to ensure search works
                 self._collection.query(
-                    query_texts=["test"],
-                    n_results=1,
-                    include=["metadatas"]
+                    query_texts=["test"], n_results=1, include=["metadatas"]
                 )
-
+
                 return True
-
+
             except Exception as e:
                 error_msg = str(e).lower()
-                if any(
-                    indicator in error_msg
-                    for indicator in ["pickle", "unpickling", "eof", "ran out of input",
-                                      "hnsw", "index", "deserialize", "corrupt"]):
+                if any(
+                    indicator in error_msg
+                    for indicator in [
+                        "pickle",
+                        "unpickling",
+                        "eof",
+                        "ran out of input",
+                        "hnsw",
+                        "index",
+                        "deserialize",
+                        "corrupt",
+                    ]
+                ):
                     logger.error(f"Index corruption detected during health check: {e}")
                     return False
                 else:
                     # Some other error
                     logger.warning(f"Health check failed: {e}")
                     return False
-
+
         except Exception as e:
             logger.error(f"Health check error: {e}")
             return False
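Both health_check implementations use the same two-step probe: count() exercises the SQLite-backed metadata store (the chroma.sqlite3 file checked above), and a one-result query forces the HNSW index to load. A minimal sketch against an already-open chromadb collection, with setup elided:

def probe_collection(collection) -> bool:
    """Mirror the health probe added in this release: a count, then a minimal query."""
    try:
        collection.count()  # fails if the metadata store is unreadable
        collection.query(query_texts=["test"], n_results=1, include=["metadatas"])
        return True
    except Exception:
        return False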
@@ -793,7 +815,7 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
                 last_updated="unknown",  # ChromaDB doesn't track this
-                embedding_model="unknown"  # TODO: Track this in metadata
+                embedding_model="unknown",  # TODO: Track this in metadata
             )
 
         except Exception as e:
@@ -868,24 +890,31 @@ class PooledChromaVectorDatabase(VectorDatabase):
             pool_healthy = await self._pool.health_check()
             if not pool_healthy:
                 return False
-
+
             # Try a simple query to verify database integrity
             try:
                 async with self._pool.get_connection() as conn:
                     # Test basic operations
                     conn.collection.count()
                     conn.collection.query(
-                        query_texts=["test"],
-                        n_results=1,
-                        include=["metadatas"]
+                        query_texts=["test"], n_results=1, include=["metadatas"]
                     )
                 return True
             except Exception as e:
                 error_msg = str(e).lower()
-                if any(
-                    indicator in error_msg
-                    for indicator in ["pickle", "unpickling", "eof", "ran out of input",
-                                      "hnsw", "index", "deserialize", "corrupt"]):
+                if any(
+                    indicator in error_msg
+                    for indicator in [
+                        "pickle",
+                        "unpickling",
+                        "eof",
+                        "ran out of input",
+                        "hnsw",
+                        "index",
+                        "deserialize",
+                        "corrupt",
+                    ]
+                ):
                     logger.error(f"Index corruption detected: {e}")
                     # Attempt recovery
                     await self._recover_from_corruption()
@@ -896,30 +925,33 @@ class PooledChromaVectorDatabase(VectorDatabase):
         except Exception as e:
             logger.error(f"Health check error: {e}")
             return False
-
+
     async def _recover_from_corruption(self) -> None:
         """Recover from index corruption by rebuilding the index."""
         logger.info("Attempting to recover from index corruption...")
-
+
         # Close the pool first
         await self._pool.close()
-
+
         # Create backup directory
-        backup_dir = self.persist_directory.parent / f"{self.persist_directory.name}_backup"
+        backup_dir = (
+            self.persist_directory.parent / f"{self.persist_directory.name}_backup"
+        )
         backup_dir.mkdir(exist_ok=True)
-
+
         # Backup current state
         import time
+
         timestamp = int(time.time())
         backup_path = backup_dir / f"backup_{timestamp}"
-
+
         if self.persist_directory.exists():
             try:
                 shutil.copytree(self.persist_directory, backup_path)
                 logger.info(f"Created backup at {backup_path}")
             except Exception as e:
                 logger.warning(f"Could not create backup: {e}")
-
+
         # Clear the corrupted index
         if self.persist_directory.exists():
             try:
@@ -930,10 +962,10 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 raise IndexCorruptionError(
                     f"Could not clear corrupted index: {e}"
                 ) from e
-
+
         # Recreate the directory
         self.persist_directory.mkdir(parents=True, exist_ok=True)
-
+
         # Reinitialize the pool
         await self._pool.initialize()
         logger.info("Index recovered. Please re-index your codebase.")
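Both _recover_from_corruption implementations share one sequence: copy the persist directory to a timestamped sibling backup, clear the corrupt directory, and recreate it empty; the pooled variant additionally closes the pool before and reinitializes it after. A condensed sketch of the shared steps follows; the diff shows the clearing step only partially, so the shutil.rmtree call below is an assumption.

import shutil
import time
from pathlib import Path

def backup_and_clear(persist_directory: Path) -> Path:
    """Timestamped backup, then a fresh, empty index directory."""
    backup_dir = persist_directory.parent / f"{persist_directory.name}_backup"
    backup_dir.mkdir(exist_ok=True)
    backup_path = backup_dir / f"backup_{int(time.time())}"
    if persist_directory.exists():
        shutil.copytree(persist_directory, backup_path)  # keep the corrupt state
        shutil.rmtree(persist_directory)  # assumption: how the index is cleared
    persist_directory.mkdir(parents=True, exist_ok=True)
    return backup_path

After either recovery path, the index is empty and the codebase must be re-indexed.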
mcp_vector_search/core/indexer.py CHANGED
@@ -9,7 +9,7 @@ from loguru import logger
 
 from ..config.defaults import DEFAULT_IGNORE_PATTERNS
 from ..parsers.registry import get_parser_registry
-from ..utils.gitignore import create_gitignore_parser
+from ..utils.gitignore import create_gitignore_parser
 from .database import VectorDatabase
 from .exceptions import ParsingError
 from .models import CodeChunk
@@ -51,7 +51,9 @@ class SemanticIndexer:
         # Initialize gitignore parser
         try:
            self.gitignore_parser = create_gitignore_parser(project_root)
-            logger.debug(f"Loaded {len(self.gitignore_parser.patterns)} gitignore patterns")
+            logger.debug(
+                f"Loaded {len(self.gitignore_parser.patterns)} gitignore patterns"
+            )
         except Exception as e:
             logger.warning(f"Failed to load gitignore patterns: {e}")
             self.gitignore_parser = None
@@ -376,14 +378,18 @@ class SemanticIndexer:
         # Check each part of the path against default ignore patterns
         for part in relative_path.parts:
             if part in self._ignore_patterns:
-                logger.debug(f"Path ignored by default pattern '{part}': {file_path}")
+                logger.debug(
+                    f"Path ignored by default pattern '{part}': {file_path}"
+                )
                 return True
 
         # Check if any parent directory should be ignored
         for parent in relative_path.parents:
             for part in parent.parts:
                 if part in self._ignore_patterns:
-                    logger.debug(f"Path ignored by parent pattern '{part}': {file_path}")
+                    logger.debug(
+                        f"Path ignored by parent pattern '{part}': {file_path}"
+                    )
                     return True
 
         return False
mcp_vector_search/core/search.py CHANGED
@@ -68,7 +68,7 @@ class SemanticSearchEngine:
 
         # Health check before search
         try:
-            if hasattr(self.database, 'health_check'):
+            if hasattr(self.database, "health_check"):
                 is_healthy = await self.database.health_check()
                 if not is_healthy:
                     logger.warning("Database health check failed - attempting recovery")
@@ -118,12 +118,23 @@ class SemanticSearchEngine:
         except Exception as e:
             error_msg = str(e).lower()
             # Check for corruption indicators
-            if any(
-                indicator in error_msg
-                for indicator in ["pickle", "unpickling", "eof", "ran out of input",
-                                  "hnsw", "index", "deserialize", "corrupt"]):
+            if any(
+                indicator in error_msg
+                for indicator in [
+                    "pickle",
+                    "unpickling",
+                    "eof",
+                    "ran out of input",
+                    "hnsw",
+                    "index",
+                    "deserialize",
+                    "corrupt",
+                ]
+            ):
                 logger.error(f"Index corruption detected during search: {e}")
-                logger.info("The index appears to be corrupted. Please run 'mcp-vector-search reset' to clear the index and then 'mcp-vector-search index' to rebuild it.")
+                logger.info(
+                    "The index appears to be corrupted. Please run 'mcp-vector-search reset' to clear the index and then 'mcp-vector-search index' to rebuild it."
+                )
                 raise SearchError(
                     "Index corruption detected. Please run 'mcp-vector-search reset' followed by 'mcp-vector-search index' to rebuild."
                 ) from e
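For callers, the visible contract of this change is that a corrupt index now surfaces as a SearchError carrying rebuild instructions rather than a raw ChromaDB exception. A sketch of consuming that contract; the engine's search method name and the exceptions module path are assumptions based on the surrounding diff:

from mcp_vector_search.core.exceptions import SearchError  # module path assumed

async def search_with_advice(engine, query: str):
    try:
        return await engine.search(query)  # method name assumed
    except SearchError as exc:
        # Corruption path: the message already instructs the user to run
        # 'mcp-vector-search reset' and then 'mcp-vector-search index'.
        print(exc)
        return []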
mcp_vector_search/mcp/__main__.py CHANGED
@@ -11,7 +11,7 @@ def main():
     """Main entry point for the MCP server."""
     # Allow specifying project root as command line argument
     project_root = Path(sys.argv[1]) if len(sys.argv) > 1 else None
-
+
     try:
         asyncio.run(run_mcp_server(project_root))
     except KeyboardInterrupt: