mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +3 -2
- mcp_vector_search/cli/commands/auto_index.py +397 -0
- mcp_vector_search/cli/commands/config.py +88 -40
- mcp_vector_search/cli/commands/index.py +198 -52
- mcp_vector_search/cli/commands/init.py +471 -58
- mcp_vector_search/cli/commands/install.py +284 -0
- mcp_vector_search/cli/commands/mcp.py +495 -0
- mcp_vector_search/cli/commands/search.py +241 -87
- mcp_vector_search/cli/commands/status.py +184 -58
- mcp_vector_search/cli/commands/watch.py +34 -35
- mcp_vector_search/cli/didyoumean.py +184 -0
- mcp_vector_search/cli/export.py +320 -0
- mcp_vector_search/cli/history.py +292 -0
- mcp_vector_search/cli/interactive.py +342 -0
- mcp_vector_search/cli/main.py +175 -27
- mcp_vector_search/cli/output.py +63 -45
- mcp_vector_search/config/defaults.py +50 -36
- mcp_vector_search/config/settings.py +49 -35
- mcp_vector_search/core/auto_indexer.py +298 -0
- mcp_vector_search/core/connection_pool.py +322 -0
- mcp_vector_search/core/database.py +335 -25
- mcp_vector_search/core/embeddings.py +73 -29
- mcp_vector_search/core/exceptions.py +19 -2
- mcp_vector_search/core/factory.py +310 -0
- mcp_vector_search/core/git_hooks.py +345 -0
- mcp_vector_search/core/indexer.py +237 -73
- mcp_vector_search/core/models.py +21 -19
- mcp_vector_search/core/project.py +73 -58
- mcp_vector_search/core/scheduler.py +330 -0
- mcp_vector_search/core/search.py +574 -86
- mcp_vector_search/core/watcher.py +48 -46
- mcp_vector_search/mcp/__init__.py +4 -0
- mcp_vector_search/mcp/__main__.py +25 -0
- mcp_vector_search/mcp/server.py +701 -0
- mcp_vector_search/parsers/base.py +30 -31
- mcp_vector_search/parsers/javascript.py +74 -48
- mcp_vector_search/parsers/python.py +57 -49
- mcp_vector_search/parsers/registry.py +47 -32
- mcp_vector_search/parsers/text.py +179 -0
- mcp_vector_search/utils/__init__.py +40 -0
- mcp_vector_search/utils/gitignore.py +229 -0
- mcp_vector_search/utils/timing.py +334 -0
- mcp_vector_search/utils/version.py +47 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/METADATA +173 -7
- mcp_vector_search-0.4.12.dist-info/RECORD +54 -0
- mcp_vector_search-0.0.3.dist-info/RECORD +0 -35
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/indexer.py
CHANGED
@@ -1,13 +1,15 @@
 """Semantic indexer for MCP Vector Search."""

 import asyncio
+import json
+import os
 from pathlib import Path
-from typing import List, Optional, Set

 from loguru import logger

 from ..config.defaults import DEFAULT_IGNORE_PATTERNS
 from ..parsers.registry import get_parser_registry
+from ..utils.gitignore import create_gitignore_parser, GitignoreParser
 from .database import VectorDatabase
 from .exceptions import ParsingError
 from .models import CodeChunk
@@ -20,20 +22,39 @@ class SemanticIndexer:
         self,
         database: VectorDatabase,
         project_root: Path,
-        file_extensions:
+        file_extensions: list[str],
+        max_workers: int | None = None,
+        batch_size: int = 10,
     ) -> None:
         """Initialize semantic indexer.
-
+
         Args:
             database: Vector database instance
             project_root: Project root directory
             file_extensions: File extensions to index
+            max_workers: Maximum number of worker threads for parallel processing
+            batch_size: Number of files to process in each batch
         """
         self.database = database
         self.project_root = project_root
-        self.file_extensions =
+        self.file_extensions = {ext.lower() for ext in file_extensions}
         self.parser_registry = get_parser_registry()
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+        self.max_workers = max_workers or min(
+            4, (asyncio.get_event_loop().get_debug() and 1) or 4
+        )
+        self.batch_size = batch_size
+        self._index_metadata_file = (
+            project_root / ".mcp-vector-search" / "index_metadata.json"
+        )
+
+        # Initialize gitignore parser
+        try:
+            self.gitignore_parser = create_gitignore_parser(project_root)
+            logger.debug(f"Loaded {len(self.gitignore_parser.patterns)} gitignore patterns")
+        except Exception as e:
+            logger.warning(f"Failed to load gitignore patterns: {e}")
+            self.gitignore_parser = None

     async def index_project(
         self,
@@ -41,60 +62,191 @@ class SemanticIndexer:
         show_progress: bool = True,
     ) -> int:
         """Index all files in the project.
-
+
         Args:
             force_reindex: Whether to reindex existing files
             show_progress: Whether to show progress information
-
+
         Returns:
             Number of files indexed
         """
         logger.info(f"Starting indexing of project: {self.project_root}")
-
+
         # Find all indexable files
-
-
-        if not
+        all_files = self._find_indexable_files()
+
+        if not all_files:
             logger.warning("No indexable files found")
             return 0
-
-
-
-
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        if not files_to_index:
+            logger.info("All files are up to date")
+            return 0
+
+        # Index files in parallel batches
         indexed_count = 0
         failed_count = 0
-
-
-
-
-
-
-
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            if show_progress:
+                logger.info(
+                    f"Processing batch {i // self.batch_size + 1}/{(len(files_to_index) + self.batch_size - 1) // self.batch_size} ({len(batch)} files)"
+                )
+
+            # Process batch in parallel
+            batch_results = await self._process_file_batch(batch, force_reindex)
+
+            # Count results
+            for success in batch_results:
                 if success:
                     indexed_count += 1
                 else:
                     failed_count += 1
-
-
-
-
+
+        # Update metadata for successfully indexed files
+        if indexed_count > 0:
+            for file_path in files_to_index:
+                try:
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+                except OSError:
+                    pass  # File might have been deleted during indexing
+
+            self._save_index_metadata(metadata)
+
         logger.info(
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
-
+
         return indexed_count

+    async def _process_file_batch(
+        self, file_paths: list[Path], force_reindex: bool = False
+    ) -> list[bool]:
+        """Process a batch of files in parallel.
+
+        Args:
+            file_paths: List of file paths to process
+            force_reindex: Whether to force reindexing
+
+        Returns:
+            List of success flags for each file
+        """
+        # Create tasks for parallel processing
+        tasks = []
+        for file_path in file_paths:
+            task = asyncio.create_task(self._index_file_safe(file_path, force_reindex))
+            tasks.append(task)
+
+        # Wait for all tasks to complete
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Convert results to success flags
+        success_flags = []
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"Failed to index {file_paths[i]}: {result}")
+                success_flags.append(False)
+            else:
+                success_flags.append(result)
+
+        return success_flags
+
+    def _load_index_metadata(self) -> dict[str, float]:
+        """Load file modification times from metadata file.
+
+        Returns:
+            Dictionary mapping file paths to modification times
+        """
+        if not self._index_metadata_file.exists():
+            return {}
+
+        try:
+            with open(self._index_metadata_file) as f:
+                return json.load(f)
+        except Exception as e:
+            logger.warning(f"Failed to load index metadata: {e}")
+            return {}
+
+    def _save_index_metadata(self, metadata: dict[str, float]) -> None:
+        """Save file modification times to metadata file.
+
+        Args:
+            metadata: Dictionary mapping file paths to modification times
+        """
+        try:
+            # Ensure directory exists
+            self._index_metadata_file.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(self._index_metadata_file, "w") as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            logger.warning(f"Failed to save index metadata: {e}")
+
+    def _needs_reindexing(self, file_path: Path, metadata: dict[str, float]) -> bool:
+        """Check if a file needs reindexing based on modification time.
+
+        Args:
+            file_path: Path to the file
+            metadata: Current metadata dictionary
+
+        Returns:
+            True if file needs reindexing
+        """
+        try:
+            current_mtime = os.path.getmtime(file_path)
+            stored_mtime = metadata.get(str(file_path), 0)
+            return current_mtime > stored_mtime
+        except OSError:
+            # File doesn't exist or can't be accessed
+            return False
+
+    async def _index_file_safe(
+        self, file_path: Path, force_reindex: bool = False
+    ) -> bool:
+        """Safely index a single file with error handling.
+
+        Args:
+            file_path: Path to the file to index
+            force_reindex: Whether to force reindexing
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            return await self.index_file(file_path, force_reindex)
+        except Exception as e:
+            logger.error(f"Error indexing {file_path}: {e}")
+            return False
+
     async def index_file(
         self,
         file_path: Path,
         force_reindex: bool = False,
     ) -> bool:
         """Index a single file.
-
+
         Args:
             file_path: Path to the file to index
             force_reindex: Whether to reindex if already indexed
-
+
         Returns:
             True if file was successfully indexed
         """
@@ -102,34 +254,39 @@ class SemanticIndexer:
             # Check if file should be indexed
             if not self._should_index_file(file_path):
                 return False
-
-            #
-
-
-
+
+            # Always remove existing chunks when reindexing a file
+            # This prevents duplicate chunks and ensures consistency
+            await self.database.delete_by_file(file_path)
+
             # Parse file into chunks
             chunks = await self._parse_file(file_path)
-
+
             if not chunks:
                 logger.debug(f"No chunks extracted from {file_path}")
                 return True  # Not an error, just empty file
-
+
             # Add chunks to database
             await self.database.add_chunks(chunks)
-
+
+            # Update metadata after successful indexing
+            metadata = self._load_index_metadata()
+            metadata[str(file_path)] = os.path.getmtime(file_path)
+            self._save_index_metadata(metadata)
+
             logger.debug(f"Indexed {len(chunks)} chunks from {file_path}")
             return True
-
+
         except Exception as e:
             logger.error(f"Failed to index file {file_path}: {e}")
             raise ParsingError(f"Failed to index file {file_path}: {e}") from e

     async def reindex_file(self, file_path: Path) -> bool:
         """Reindex a single file (removes existing chunks first).
-
+
         Args:
             file_path: Path to the file to reindex
-
+
         Returns:
             True if file was successfully reindexed
         """
@@ -137,10 +294,10 @@ class SemanticIndexer:

     async def remove_file(self, file_path: Path) -> int:
         """Remove all chunks for a file from the index.
-
+
         Args:
             file_path: Path to the file to remove
-
+
         Returns:
             Number of chunks removed
         """
@@ -152,41 +309,41 @@ class SemanticIndexer:
             logger.error(f"Failed to remove file {file_path}: {e}")
             return 0

-    def _find_indexable_files(self) ->
+    def _find_indexable_files(self) -> list[Path]:
         """Find all files that should be indexed.
-
+
         Returns:
             List of file paths to index
         """
         indexable_files = []
-
+
         for file_path in self.project_root.rglob("*"):
             if self._should_index_file(file_path):
                 indexable_files.append(file_path)
-
+
         return sorted(indexable_files)

     def _should_index_file(self, file_path: Path) -> bool:
         """Check if a file should be indexed.
-
+
         Args:
             file_path: Path to check
-
+
         Returns:
             True if file should be indexed
         """
         # Must be a file
         if not file_path.is_file():
             return False
-
+
         # Check file extension
         if file_path.suffix.lower() not in self.file_extensions:
             return False
-
+
         # Check if path should be ignored
         if self._should_ignore_path(file_path):
             return False
-
+
         # Check file size (skip very large files)
         try:
             file_size = file_path.stat().st_size
@@ -195,67 +352,74 @@ class SemanticIndexer:
                 return False
         except OSError:
             return False
-
+
         return True

     def _should_ignore_path(self, file_path: Path) -> bool:
         """Check if a path should be ignored.
-
+
         Args:
             file_path: Path to check
-
+
         Returns:
             True if path should be ignored
         """
         try:
+            # First check gitignore rules if available
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path):
+                logger.debug(f"Path ignored by .gitignore: {file_path}")
+                return True
+
             # Get relative path from project root
             relative_path = file_path.relative_to(self.project_root)
-
-            # Check each part of the path
+
+            # Check each part of the path against default ignore patterns
             for part in relative_path.parts:
                 if part in self._ignore_patterns:
+                    logger.debug(f"Path ignored by default pattern '{part}': {file_path}")
                     return True
-
+
             # Check if any parent directory should be ignored
             for parent in relative_path.parents:
                 for part in parent.parts:
                     if part in self._ignore_patterns:
+                        logger.debug(f"Path ignored by parent pattern '{part}': {file_path}")
                         return True
-
+
             return False
-
+
         except ValueError:
             # Path is not relative to project root
             return True

-    async def _parse_file(self, file_path: Path) ->
+    async def _parse_file(self, file_path: Path) -> list[CodeChunk]:
         """Parse a file into code chunks.
-
+
         Args:
             file_path: Path to the file to parse
-
+
         Returns:
             List of code chunks
         """
         try:
             # Get appropriate parser
             parser = self.parser_registry.get_parser_for_file(file_path)
-
+
             # Parse file
             chunks = await parser.parse_file(file_path)
-
+
             # Filter out empty chunks
             valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
-
+
             return valid_chunks
-
+
         except Exception as e:
             logger.error(f"Failed to parse file {file_path}: {e}")
             raise ParsingError(f"Failed to parse file {file_path}: {e}") from e

     def add_ignore_pattern(self, pattern: str) -> None:
         """Add a pattern to ignore during indexing.
-
+
         Args:
             pattern: Pattern to ignore (directory or file name)
         """
@@ -263,15 +427,15 @@ class SemanticIndexer:

     def remove_ignore_pattern(self, pattern: str) -> None:
         """Remove an ignore pattern.
-
+
         Args:
             pattern: Pattern to remove
         """
         self._ignore_patterns.discard(pattern)

-    def get_ignore_patterns(self) ->
+    def get_ignore_patterns(self) -> set[str]:
         """Get current ignore patterns.
-
+
         Returns:
             Set of ignore patterns
         """
@@ -279,17 +443,17 @@ class SemanticIndexer:

     async def get_indexing_stats(self) -> dict:
         """Get statistics about the indexing process.
-
+
         Returns:
             Dictionary with indexing statistics
         """
         try:
             # Get database stats
             db_stats = await self.database.get_stats()
-
+
             # Count indexable files
             indexable_files = self._find_indexable_files()
-
+
             return {
                 "total_indexable_files": len(indexable_files),
                 "indexed_files": db_stats.total_files,
@@ -299,7 +463,7 @@ class SemanticIndexer:
                 "ignore_patterns": list(self._ignore_patterns),
                 "parser_info": self.parser_registry.get_parser_info(),
             }
-
+
         except Exception as e:
             logger.error(f"Failed to get indexing stats: {e}")
             return {
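
Taken together, the hunks above rework SemanticIndexer.index_project() from a single sequential pass into an incremental, batched pipeline: files whose modification time matches the record in .mcp-vector-search/index_metadata.json are skipped, and the remaining files are indexed in parallel batches via asyncio.gather(). A minimal driver sketch follows. It is illustrative only: the import path is inferred from the changed-file list, the database placeholder stands in for whatever concrete VectorDatabase implementation the project builds elsewhere (the new factory and connection-pool modules are not shown in this diff), and the extension list, worker count, and batch size are example values rather than package defaults.

import asyncio
from pathlib import Path

# Module path assumed from the changed-file list above; not shown verbatim in this diff.
from mcp_vector_search.core.indexer import SemanticIndexer


async def main() -> None:
    # Replace with a real VectorDatabase implementation; its construction
    # (embedding model, connection pool, etc.) is outside this sketch.
    database = ...

    indexer = SemanticIndexer(
        database=database,
        project_root=Path("."),
        file_extensions=[".py", ".js"],  # illustrative; extensions are lower-cased internally
        max_workers=4,                   # optional, per the new constructor signature
        batch_size=10,                   # files per parallel batch
    )

    # First run indexes everything; later runs only touch files whose mtime
    # is newer than the value stored in index_metadata.json.
    indexed = await indexer.index_project(force_reindex=False, show_progress=True)
    print(f"Indexed {indexed} files")


asyncio.run(main())
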
mcp_vector_search/core/models.py
CHANGED
@@ -2,7 +2,7 @@

 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any

 from pydantic import BaseModel, Field

@@ -17,12 +17,12 @@ class CodeChunk:
     end_line: int
     language: str
     chunk_type: str = "code"  # code, function, class, comment, docstring
-    function_name:
-    class_name:
-    docstring:
-    imports:
+    function_name: str | None = None
+    class_name: str | None = None
+    docstring: str | None = None
+    imports: list[str] = None
     complexity_score: float = 0.0
-
+
     def __post_init__(self) -> None:
         """Initialize default values."""
         if self.imports is None:
@@ -38,7 +38,7 @@ class CodeChunk:
         """Get the number of lines in this chunk."""
         return self.end_line - self.start_line + 1

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for storage."""
         return {
             "content": self.content,
@@ -55,7 +55,7 @@ class CodeChunk:
         }

     @classmethod
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]) -> "CodeChunk":
         """Create from dictionary."""
         return cls(
             content=data["content"],
@@ -83,11 +83,13 @@ class SearchResult(BaseModel):
     similarity_score: float = Field(..., description="Similarity score (0.0 to 1.0)")
     rank: int = Field(..., description="Result rank in search results")
     chunk_type: str = Field(default="code", description="Type of code chunk")
-    function_name:
-
-
-
-
+    function_name: str | None = Field(
+        default=None, description="Function name if applicable"
+    )
+    class_name: str | None = Field(default=None, description="Class name if applicable")
+    context_before: list[str] = Field(default=[], description="Lines before the match")
+    context_after: list[str] = Field(default=[], description="Lines after the match")
+    highlights: list[str] = Field(default=[], description="Highlighted terms")

     class Config:
         arbitrary_types_allowed = True
@@ -102,7 +104,7 @@ class SearchResult(BaseModel):
         """Get a human-readable location string."""
         return f"{self.file_path}:{self.start_line}-{self.end_line}"

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for serialization."""
         return {
             "content": self.content,
@@ -128,13 +130,13 @@ class IndexStats(BaseModel):

     total_files: int = Field(..., description="Total number of indexed files")
     total_chunks: int = Field(..., description="Total number of code chunks")
-    languages:
-    file_types:
+    languages: dict[str, int] = Field(..., description="Language distribution")
+    file_types: dict[str, int] = Field(..., description="File type distribution")
     index_size_mb: float = Field(..., description="Index size in megabytes")
     last_updated: str = Field(..., description="Last update timestamp")
     embedding_model: str = Field(..., description="Embedding model used")

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for serialization."""
         return {
             "total_files": self.total_files,
@@ -155,13 +157,13 @@ class ProjectInfo(BaseModel):
     config_path: Path = Field(..., description="Configuration file path")
     index_path: Path = Field(..., description="Index directory path")
     is_initialized: bool = Field(..., description="Whether project is initialized")
-    languages:
+    languages: list[str] = Field(default=[], description="Detected languages")
     file_count: int = Field(default=0, description="Number of indexable files")

     class Config:
         arbitrary_types_allowed = True

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for serialization."""
         return {
             "name": self.name,
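
One detail in the CodeChunk hunk above is easy to misread: imports is retyped as list[str] but still defaults to None and is normalized in __post_init__. Dataclasses reject mutable defaults, so either this None-then-normalize idiom or a default_factory is needed; the standalone sketch below (not code from this package) shows both forms.

from dataclasses import dataclass, field


@dataclass
class WithNoneDefault:
    # Same idiom as CodeChunk.imports: express the mutable default as None,
    # then replace it with a fresh list after construction.
    imports: list[str] | None = None

    def __post_init__(self) -> None:
        if self.imports is None:
            self.imports = []


@dataclass
class WithDefaultFactory:
    # Equivalent alternative: default_factory builds a new list per instance,
    # so the annotation can stay non-optional.
    imports: list[str] = field(default_factory=list)


a, b = WithNoneDefault(), WithNoneDefault()
a.imports.append("os")
assert b.imports == []  # instances do not share a list

assert WithDefaultFactory().imports == []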