PyPI - mcp-vector-search - Versions diffs - 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl - Mend

mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mcp-vector-search might be problematic. Click here for more details.

Files changed (49) hide show

mcp_vector_search/__init__.py +3 -2
mcp_vector_search/cli/commands/auto_index.py +397 -0
mcp_vector_search/cli/commands/config.py +88 -40
mcp_vector_search/cli/commands/index.py +198 -52
mcp_vector_search/cli/commands/init.py +471 -58
mcp_vector_search/cli/commands/install.py +284 -0
mcp_vector_search/cli/commands/mcp.py +495 -0
mcp_vector_search/cli/commands/search.py +241 -87
mcp_vector_search/cli/commands/status.py +184 -58
mcp_vector_search/cli/commands/watch.py +34 -35
mcp_vector_search/cli/didyoumean.py +184 -0
mcp_vector_search/cli/export.py +320 -0
mcp_vector_search/cli/history.py +292 -0
mcp_vector_search/cli/interactive.py +342 -0
mcp_vector_search/cli/main.py +175 -27
mcp_vector_search/cli/output.py +63 -45
mcp_vector_search/config/defaults.py +50 -36
mcp_vector_search/config/settings.py +49 -35
mcp_vector_search/core/auto_indexer.py +298 -0
mcp_vector_search/core/connection_pool.py +322 -0
mcp_vector_search/core/database.py +335 -25
mcp_vector_search/core/embeddings.py +73 -29
mcp_vector_search/core/exceptions.py +19 -2
mcp_vector_search/core/factory.py +310 -0
mcp_vector_search/core/git_hooks.py +345 -0
mcp_vector_search/core/indexer.py +237 -73
mcp_vector_search/core/models.py +21 -19
mcp_vector_search/core/project.py +73 -58
mcp_vector_search/core/scheduler.py +330 -0
mcp_vector_search/core/search.py +574 -86
mcp_vector_search/core/watcher.py +48 -46
mcp_vector_search/mcp/__init__.py +4 -0
mcp_vector_search/mcp/__main__.py +25 -0
mcp_vector_search/mcp/server.py +701 -0
mcp_vector_search/parsers/base.py +30 -31
mcp_vector_search/parsers/javascript.py +74 -48
mcp_vector_search/parsers/python.py +57 -49
mcp_vector_search/parsers/registry.py +47 -32
mcp_vector_search/parsers/text.py +179 -0
mcp_vector_search/utils/__init__.py +40 -0
mcp_vector_search/utils/gitignore.py +229 -0
mcp_vector_search/utils/timing.py +334 -0
mcp_vector_search/utils/version.py +47 -0
{mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/METADATA +173 -7
mcp_vector_search-0.4.12.dist-info/RECORD +54 -0
mcp_vector_search-0.0.3.dist-info/RECORD +0 -35
{mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/WHEEL +0 -0
{mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/entry_points.txt +0 -0
{mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/licenses/LICENSE +0 -0

mcp_vector_search/core/embeddings.py CHANGED Viewed

@@ -3,7 +3,6 @@
 import hashlib
 import json
 from pathlib import Path
-from typing import Dict, List, Optional
 import aiofiles
 from loguru import logger
@@ -17,7 +16,7 @@ class EmbeddingCache:
     def __init__(self, cache_dir: Path, max_size: int = 1000) -> None:
         """Initialize embedding cache.
         Args:
             cache_dir: Directory to store cached embeddings
             max_size: Maximum number of embeddings to keep in memory
@@ -25,45 +24,51 @@ class EmbeddingCache:
         self.cache_dir = cache_dir
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         self.max_size = max_size
-        self._memory_cache: Dict[str, List[float]] = {}
+        self._memory_cache: dict[str, list[float]] = {}
+        self._access_order: list[str] = []  # For LRU eviction
+        self._cache_hits = 0
+        self._cache_misses = 0
     def _hash_content(self, content: str) -> str:
         """Generate cache key from content."""
         return hashlib.sha256(content.encode()).hexdigest()[:16]
-    async def get_embedding(self, content: str) -> Optional[List[float]]:
+    async def get_embedding(self, content: str) -> list[float] | None:
         """Get cached embedding for content."""
         cache_key = self._hash_content(content)
         # Check memory cache first
         if cache_key in self._memory_cache:
+            self._cache_hits += 1
+            # Move to end for LRU
+            self._access_order.remove(cache_key)
+            self._access_order.append(cache_key)
             return self._memory_cache[cache_key]
         # Check disk cache
         cache_file = self.cache_dir / f"{cache_key}.json"
         if cache_file.exists():
             try:
-                async with aiofiles.open(cache_file, "r") as f:
+                async with aiofiles.open(cache_file) as f:
                     content_str = await f.read()
                     embedding = json.loads(content_str)
-                    # Add to memory cache if space available
-                    if len(self._memory_cache) < self.max_size:
-                        self._memory_cache[cache_key] = embedding
+                    # Add to memory cache with LRU management
+                    self._add_to_memory_cache(cache_key, embedding)
+                    self._cache_hits += 1
                     return embedding
             except Exception as e:
                 logger.warning(f"Failed to load cached embedding: {e}")
+        self._cache_misses += 1
         return None
-    async def store_embedding(self, content: str, embedding: List[float]) -> None:
+    async def store_embedding(self, content: str, embedding: list[float]) -> None:
         """Store embedding in cache."""
         cache_key = self._hash_content(content)
-        # Store in memory cache if space available
-        if len(self._memory_cache) < self.max_size:
-            self._memory_cache[cache_key] = embedding
+        # Store in memory cache with LRU management
+        self._add_to_memory_cache(cache_key, embedding)
         # Store in disk cache
         cache_file = self.cache_dir / f"{cache_key}.json"
@@ -73,17 +78,56 @@ class EmbeddingCache:
         except Exception as e:
             logger.warning(f"Failed to cache embedding: {e}")
+    def _add_to_memory_cache(self, cache_key: str, embedding: list[float]) -> None:
+        """Insert or refresh an embedding in the in-memory LRU cache.
+        ``_access_order`` lists keys from least to most recently used. A key
+        that is already cached is refreshed and moved to the end; a new key
+        evicts from the front of the list until there is room for it.
+        Args:
+            cache_key: Cache key for the embedding
+            embedding: Embedding vector to cache
+        """
+        # A non-positive limit means "no memory tier". Without this guard the
+        # eviction step below would pop from an empty list and raise IndexError.
+        if self.max_size <= 0:
+            return
+        # Already cached: overwrite the value and mark it most recently used
+        if cache_key in self._memory_cache:
+            self._access_order.remove(cache_key)
+            self._access_order.append(cache_key)
+            self._memory_cache[cache_key] = embedding
+            return
+        # Evict least recently used entries until the new one fits. The check
+        # on ``_access_order`` keeps eviction from raising if the two
+        # structures ever disagree.
+        while len(self._memory_cache) >= self.max_size and self._access_order:
+            lru_key = self._access_order.pop(0)
+            self._memory_cache.pop(lru_key, None)
+        # Add new embedding as the most recently used entry
+        self._memory_cache[cache_key] = embedding
+        self._access_order.append(cache_key)
     def clear_memory_cache(self) -> None:
         """Clear the in-memory cache."""
         self._memory_cache.clear()
+        self._access_order.clear()
+    def get_cache_stats(self) -> dict[str, any]:
+        """Get cache performance statistics.
+        Returns:
+            Dictionary with cache statistics
+        """
+        total_requests = self._cache_hits + self._cache_misses
+        hit_rate = self._cache_hits / total_requests if total_requests > 0 else 0.0
+        disk_files = (
+            len(list(self.cache_dir.glob("*.json"))) if self.cache_dir.exists() else 0
+        )
-    def get_cache_stats(self) -> Dict[str, int]:
-        """Get cache statistics."""
-        disk_files = len(list(self.cache_dir.glob("*.json")))
         return {
-            "memory_cached": len(self._memory_cache),
-            "disk_cached": disk_files,
-            "memory_limit": self.max_size,
+            "memory_cache_size": len(self._memory_cache),
+            "memory_cached": len(self._memory_cache),  # Alias for compatibility
+            "max_cache_size": self.max_size,
+            "memory_limit": self.max_size,  # Alias for compatibility
+            "cache_hits": self._cache_hits,
+            "cache_misses": self._cache_misses,
+            "hit_rate": round(hit_rate, 3),
+            "disk_cache_files": disk_files,
+            "disk_cached": disk_files,  # Alias for compatibility
         }
@@ -104,7 +148,7 @@ class CodeBERTEmbeddingFunction:
             logger.error(f"Failed to load embedding model {model_name}: {e}")
             raise EmbeddingError(f"Failed to load embedding model: {e}") from e
-    def __call__(self, input: List[str]) -> List[List[float]]:
+    def __call__(self, input: list[str]) -> list[list[float]]:
         """Generate embeddings for input texts (ChromaDB interface)."""
         try:
             embeddings = self.model.encode(input, convert_to_numpy=True)
@@ -120,11 +164,11 @@ class BatchEmbeddingProcessor:
     def __init__(
         self,
         embedding_function: CodeBERTEmbeddingFunction,
-        cache: Optional[EmbeddingCache] = None,
+        cache: EmbeddingCache | None = None,
         batch_size: int = 32,
     ) -> None:
         """Initialize batch embedding processor.
         Args:
             embedding_function: Function to generate embeddings
             cache: Optional embedding cache
@@ -134,12 +178,12 @@ class BatchEmbeddingProcessor:
         self.cache = cache
         self.batch_size = batch_size
-    async def process_batch(self, contents: List[str]) -> List[List[float]]:
+    async def process_batch(self, contents: list[str]) -> list[list[float]]:
         """Process a batch of content for embeddings.
         Args:
             contents: List of text content to embed
         Returns:
             List of embeddings
         """
@@ -179,7 +223,7 @@ class BatchEmbeddingProcessor:
                 # Cache new embeddings and fill placeholders
                 for i, (content, embedding) in enumerate(
-                    zip(uncached_contents, new_embeddings)
+                    zip(uncached_contents, new_embeddings, strict=False)
                 ):
                     if self.cache:
                         await self.cache.store_embedding(content, embedding)
@@ -191,7 +235,7 @@ class BatchEmbeddingProcessor:
         return embeddings
-    def get_stats(self) -> Dict[str, any]:
+    def get_stats(self) -> dict[str, any]:
         """Get processor statistics."""
         stats = {
             "model_name": self.embedding_function.model_name,
@@ -207,7 +251,7 @@ class BatchEmbeddingProcessor:
 def create_embedding_function(
     model_name: str = "microsoft/codebert-base",
-    cache_dir: Optional[Path] = None,
+    cache_dir: Path | None = None,
     cache_size: int = 1000,
 ):
     """Create embedding function and cache.
@@ -236,7 +280,7 @@ def create_embedding_function(
             model_name=actual_model
         )
-        logger.info(f"Created ChromaDB embedding function with model: {actual_model}")
+        logger.debug(f"Created ChromaDB embedding function with model: {actual_model}")
     except Exception as e:
         logger.warning(f"Failed to create ChromaDB embedding function: {e}")

mcp_vector_search/core/exceptions.py CHANGED Viewed

@@ -1,66 +1,83 @@
 """Custom exception hierarchy for MCP Vector Search."""
-from typing import Any, Dict, Optional
+from typing import Any
 class MCPVectorSearchError(Exception):
     """Base exception for MCP Vector Search."""
-    def __init__(self, message: str, context: Optional[Dict[str, Any]] = None) -> None:
+    def __init__(self, message: str, context: dict[str, Any] | None = None) -> None:
         super().__init__(message)
         self.context = context or {}
 class DatabaseError(MCPVectorSearchError):
     """Database-related errors."""
     pass
 class DatabaseInitializationError(DatabaseError):
     """Database initialization failed."""
     pass
 class DatabaseNotInitializedError(DatabaseError):
     """Operation attempted on uninitialized database."""
+    pass
+class ConnectionPoolError(DatabaseError):
+    """Connection pool operation failed."""
     pass
 class DocumentAdditionError(DatabaseError):
     """Failed to add documents to database."""
     pass
 class SearchError(DatabaseError):
     """Search operation failed."""
     pass
 class ParsingError(MCPVectorSearchError):
     """Code parsing errors."""
     pass
 class EmbeddingError(MCPVectorSearchError):
     """Embedding generation errors."""
     pass
 class ConfigurationError(MCPVectorSearchError):
     """Configuration validation errors."""
     pass
 class ProjectError(MCPVectorSearchError):
     """Project management errors."""
     pass
 class ProjectNotFoundError(ProjectError):
     """Project directory or configuration not found."""
     pass
 class ProjectInitializationError(ProjectError):
     """Failed to initialize project."""
     pass

mcp_vector_search/core/factory.py ADDED Viewed

@@ -0,0 +1,310 @@
+"""Component factory for creating commonly used objects."""
+import functools
+from collections.abc import Callable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, TypeVar
+import typer
+from loguru import logger
+from ..cli.output import print_error
+from ..config.settings import ProjectConfig
+from .auto_indexer import AutoIndexer
+from .database import ChromaVectorDatabase, PooledChromaVectorDatabase, VectorDatabase
+from .embeddings import CodeBERTEmbeddingFunction, create_embedding_function
+from .indexer import SemanticIndexer
+from .project import ProjectManager
+from .search import SemanticSearchEngine
+F = TypeVar("F", bound=Callable[..., Any])
+@dataclass
+class ComponentBundle:
+    """Bundle of commonly used components.
+    ``ComponentFactory.create_standard_components`` returns one of these so a
+    caller receives every object it constructed in a single value.
+    """
+    # Manager the configuration was loaded through
+    project_manager: ProjectManager
+    # Project configuration obtained from the project manager
+    config: ProjectConfig
+    # Vector database the indexer (and optional components) operate on
+    database: VectorDatabase
+    # Indexer constructed over ``database``
+    indexer: SemanticIndexer
+    # Embedding function the database was constructed with
+    embedding_function: CodeBERTEmbeddingFunction
+    # Optional: stays None unless a search engine was requested
+    search_engine: SemanticSearchEngine | None = None
+    # Optional: stays None unless an auto-indexer was requested
+    auto_indexer: AutoIndexer | None = None
+class ComponentFactory:
+    """Stateless collection of constructors for commonly used components.
+    Every method is a ``@staticmethod``; the class only groups them under
+    one name.
+    """
+    @staticmethod
+    def create_project_manager(project_root: Path) -> ProjectManager:
+        """Return a ``ProjectManager`` for ``project_root``."""
+        return ProjectManager(project_root)
+    @staticmethod
+    def load_config(project_root: Path) -> tuple[ProjectManager, ProjectConfig]:
+        """Create a project manager and load its configuration.
+        Returns:
+            ``(project_manager, config)``, where ``config`` is the result of
+            ``project_manager.load_config()``
+        """
+        manager = ComponentFactory.create_project_manager(project_root)
+        return manager, manager.load_config()
+    @staticmethod
+    def create_embedding_function(
+        model_name: str,
+    ) -> tuple[CodeBERTEmbeddingFunction, Any]:
+        """Delegate to the module-level ``create_embedding_function``.
+        Inside a method body the bare name resolves to the function imported
+        at module level, not to this static method.
+        """
+        return create_embedding_function(model_name)
+    @staticmethod
+    def create_database(
+        config: ProjectConfig,
+        embedding_function: CodeBERTEmbeddingFunction,
+        use_pooling: bool = False,
+        **pool_kwargs,
+    ) -> VectorDatabase:
+        """Create the vector database, pooled or plain.
+        Args:
+            config: Project configuration; ``config.index_path`` is used as
+                the persist directory
+            embedding_function: Embedding function handed to the database
+            use_pooling: Select ``PooledChromaVectorDatabase`` when true
+            **pool_kwargs: Forwarded to the pooled database only; ignored
+                when ``use_pooling`` is false
+        """
+        # Arguments both database classes receive
+        shared_args = {
+            "persist_directory": config.index_path,
+            "embedding_function": embedding_function,
+            "collection_name": "code_search",
+        }
+        if not use_pooling:
+            return ChromaVectorDatabase(**shared_args)
+        return PooledChromaVectorDatabase(**shared_args, **pool_kwargs)
+    @staticmethod
+    def create_indexer(
+        database: VectorDatabase, project_root: Path, config: ProjectConfig
+    ) -> SemanticIndexer:
+        """Create a semantic indexer using ``config.file_extensions``."""
+        extensions = config.file_extensions
+        return SemanticIndexer(
+            database=database,
+            project_root=project_root,
+            file_extensions=extensions,
+        )
+    @staticmethod
+    def create_search_engine(
+        database: VectorDatabase,
+        project_root: Path,
+        similarity_threshold: float = 0.7,
+        auto_indexer: AutoIndexer | None = None,
+        enable_auto_reindex: bool = True,
+    ) -> SemanticSearchEngine:
+        """Create a semantic search engine; all arguments are passed through."""
+        engine_args = {
+            "database": database,
+            "project_root": project_root,
+            "similarity_threshold": similarity_threshold,
+            "auto_indexer": auto_indexer,
+            "enable_auto_reindex": enable_auto_reindex,
+        }
+        return SemanticSearchEngine(**engine_args)
+    @staticmethod
+    def create_auto_indexer(
+        indexer: SemanticIndexer,
+        database: VectorDatabase,
+        auto_reindex_threshold: int = 5,
+        staleness_threshold: float = 300.0,
+    ) -> AutoIndexer:
+        """Create an auto-indexer; all arguments are passed through."""
+        indexer_args = {
+            "indexer": indexer,
+            "database": database,
+            "auto_reindex_threshold": auto_reindex_threshold,
+            "staleness_threshold": staleness_threshold,
+        }
+        return AutoIndexer(**indexer_args)
+    @staticmethod
+    async def create_standard_components(
+        project_root: Path,
+        use_pooling: bool = False,
+        include_search_engine: bool = False,
+        include_auto_indexer: bool = False,
+        similarity_threshold: float = 0.7,
+        auto_reindex_threshold: int = 5,
+        **pool_kwargs,
+    ) -> ComponentBundle:
+        """Build the standard component set used by CLI commands.
+        Declared ``async``, so callers must await it even though the body
+        itself awaits nothing.
+        Args:
+            project_root: Project root directory
+            use_pooling: Whether to use connection pooling
+            include_search_engine: Whether to create search engine
+            include_auto_indexer: Whether to create auto-indexer
+            similarity_threshold: Default similarity threshold for search
+            auto_reindex_threshold: Max files to auto-reindex
+            **pool_kwargs: Additional arguments for connection pool
+        Returns:
+            ComponentBundle with requested components
+        """
+        # Required components, each built from the previous ones
+        manager, cfg = ComponentFactory.load_config(project_root)
+        embed_fn, _ = ComponentFactory.create_embedding_function(cfg.embedding_model)
+        db = ComponentFactory.create_database(
+            config=cfg,
+            embedding_function=embed_fn,
+            use_pooling=use_pooling,
+            **pool_kwargs,
+        )
+        code_indexer = ComponentFactory.create_indexer(
+            database=db,
+            project_root=project_root,
+            config=cfg,
+        )
+        # Optional components; the auto-indexer comes first because the
+        # search engine receives it
+        background_indexer = (
+            ComponentFactory.create_auto_indexer(
+                indexer=code_indexer,
+                database=db,
+                auto_reindex_threshold=auto_reindex_threshold,
+            )
+            if include_auto_indexer
+            else None
+        )
+        engine = (
+            ComponentFactory.create_search_engine(
+                database=db,
+                project_root=project_root,
+                similarity_threshold=similarity_threshold,
+                auto_indexer=background_indexer,
+                enable_auto_reindex=include_auto_indexer,
+            )
+            if include_search_engine
+            else None
+        )
+        return ComponentBundle(
+            project_manager=manager,
+            config=cfg,
+            database=db,
+            indexer=code_indexer,
+            embedding_function=embed_fn,
+            search_engine=engine,
+            auto_indexer=background_indexer,
+        )
+class DatabaseContext:
+    """Async context manager that initializes a database on entry and closes it on exit."""
+    def __init__(self, database: VectorDatabase):
+        """Store the database to manage.
+        Args:
+            database: Vector database instance
+        """
+        self.database = database
+    async def __aenter__(self) -> VectorDatabase:
+        """Await ``database.initialize()`` and return the database."""
+        db = self.database
+        await db.initialize()
+        return db
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Await ``database.close()``; exceptions from the body are not suppressed."""
+        db = self.database
+        await db.close()
+def handle_cli_errors(operation_name: str) -> Callable[[F], F]:
+    """Decorator for consistent CLI error handling.
+    The decorated callable's exceptions are logged, printed with
+    ``print_error`` and converted into ``typer.Exit(1)``. Coroutine functions
+    receive an async wrapper so errors raised while awaiting are handled too;
+    plain functions receive a sync wrapper. ``typer.Exit`` and ``typer.Abort``
+    raised by the wrapped callable pass through unchanged, so a deliberate
+    exit is not reported as a failure.
+    Args:
+        operation_name: Name of the operation for error messages
+    Returns:
+        Decorator function
+    """
+    # Function-scope import: keeps this block independent of the module's
+    # import list.
+    import inspect
+    def decorator(func: F) -> F:
+        def fail(error: Exception):
+            """Report ``error`` and exit with status 1 (never returns)."""
+            logger.error(f"{operation_name} failed: {error}")
+            print_error(f"{operation_name} failed: {error}")
+            raise typer.Exit(1) from error
+        # ``await`` is a keyword and never appears in ``__code__.co_names``,
+        # so ask ``inspect`` whether ``func`` is a coroutine function.
+        if inspect.iscoroutinefunction(func):
+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                try:
+                    return await func(*args, **kwargs)
+                except (typer.Exit, typer.Abort):
+                    # Intentional exit/abort requested by the command itself
+                    raise
+                except Exception as e:
+                    fail(e)
+            return async_wrapper
+        @functools.wraps(func)
+        def sync_wrapper(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except (typer.Exit, typer.Abort):
+                # Intentional exit/abort requested by the command itself
+                raise
+            except Exception as e:
+                fail(e)
+        return sync_wrapper
+    return decorator
+class ConfigurationService:
+    """Lazily loads, caches and saves the configuration of one project."""
+    def __init__(self, project_root: Path):
+        """Remember the project root; nothing is loaded yet.
+        Args:
+            project_root: Project root directory
+        """
+        self.project_root = project_root
+        # Both are created on first access through the properties below
+        self._project_manager: ProjectManager | None = None
+        self._config: ProjectConfig | None = None
+    @property
+    def project_manager(self) -> ProjectManager:
+        """Project manager, created on first access and then reused."""
+        manager = self._project_manager
+        if manager is None:
+            manager = self._project_manager = ProjectManager(self.project_root)
+        return manager
+    @property
+    def config(self) -> ProjectConfig:
+        """Project configuration, loaded on first access and then cached."""
+        cached = self._config
+        if cached is None:
+            cached = self._config = self.project_manager.load_config()
+        return cached
+    def ensure_initialized(self) -> bool:
+        """Check that the project is initialized, printing an error if not.
+        Returns:
+            True if project is initialized, False otherwise
+        """
+        if self.project_manager.is_initialized():
+            return True
+        print_error("Project not initialized. Run 'mcp-vector-search init' first.")
+        return False
+    def reload_config(self) -> None:
+        """Drop the cached configuration; the next ``config`` access reloads it."""
+        self._config = None
+    def save_config(self, config: ProjectConfig) -> None:
+        """Save ``config`` through the project manager and cache it.
+        Args:
+            config: Configuration to save
+        """
+        manager = self.project_manager
+        manager.save_config(config)
+        self._config = config

mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl

Potentially problematic release.

mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl