mcp-vector-search 0.0.3 (mcp_vector_search-0.0.3-py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of mcp-vector-search might be problematic.

Files changed (35):
  1. mcp_vector_search/__init__.py +9 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/config.py +303 -0
  5. mcp_vector_search/cli/commands/index.py +304 -0
  6. mcp_vector_search/cli/commands/init.py +212 -0
  7. mcp_vector_search/cli/commands/search.py +395 -0
  8. mcp_vector_search/cli/commands/status.py +340 -0
  9. mcp_vector_search/cli/commands/watch.py +288 -0
  10. mcp_vector_search/cli/main.py +117 -0
  11. mcp_vector_search/cli/output.py +242 -0
  12. mcp_vector_search/config/__init__.py +1 -0
  13. mcp_vector_search/config/defaults.py +175 -0
  14. mcp_vector_search/config/settings.py +108 -0
  15. mcp_vector_search/core/__init__.py +1 -0
  16. mcp_vector_search/core/database.py +431 -0
  17. mcp_vector_search/core/embeddings.py +250 -0
  18. mcp_vector_search/core/exceptions.py +66 -0
  19. mcp_vector_search/core/indexer.py +310 -0
  20. mcp_vector_search/core/models.py +174 -0
  21. mcp_vector_search/core/project.py +304 -0
  22. mcp_vector_search/core/search.py +324 -0
  23. mcp_vector_search/core/watcher.py +320 -0
  24. mcp_vector_search/mcp/__init__.py +1 -0
  25. mcp_vector_search/parsers/__init__.py +1 -0
  26. mcp_vector_search/parsers/base.py +180 -0
  27. mcp_vector_search/parsers/javascript.py +238 -0
  28. mcp_vector_search/parsers/python.py +407 -0
  29. mcp_vector_search/parsers/registry.py +187 -0
  30. mcp_vector_search/py.typed +1 -0
  31. mcp_vector_search-0.0.3.dist-info/METADATA +333 -0
  32. mcp_vector_search-0.0.3.dist-info/RECORD +35 -0
  33. mcp_vector_search-0.0.3.dist-info/WHEEL +4 -0
  34. mcp_vector_search-0.0.3.dist-info/entry_points.txt +2 -0
  35. mcp_vector_search-0.0.3.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/exceptions.py
@@ -0,0 +1,66 @@
+ """Custom exception hierarchy for MCP Vector Search."""
+
+ from typing import Any, Dict, Optional
+
+
+ class MCPVectorSearchError(Exception):
+     """Base exception for MCP Vector Search."""
+
+     def __init__(self, message: str, context: Optional[Dict[str, Any]] = None) -> None:
+         super().__init__(message)
+         self.context = context or {}
+
+
+ class DatabaseError(MCPVectorSearchError):
+     """Database-related errors."""
+     pass
+
+
+ class DatabaseInitializationError(DatabaseError):
+     """Database initialization failed."""
+     pass
+
+
+ class DatabaseNotInitializedError(DatabaseError):
+     """Operation attempted on uninitialized database."""
+     pass
+
+
+ class DocumentAdditionError(DatabaseError):
+     """Failed to add documents to database."""
+     pass
+
+
+ class SearchError(DatabaseError):
+     """Search operation failed."""
+     pass
+
+
+ class ParsingError(MCPVectorSearchError):
+     """Code parsing errors."""
+     pass
+
+
+ class EmbeddingError(MCPVectorSearchError):
+     """Embedding generation errors."""
+     pass
+
+
+ class ConfigurationError(MCPVectorSearchError):
+     """Configuration validation errors."""
+     pass
+
+
+ class ProjectError(MCPVectorSearchError):
+     """Project management errors."""
+     pass
+
+
+ class ProjectNotFoundError(ProjectError):
+     """Project directory or configuration not found."""
+     pass
+
+
+ class ProjectInitializationError(ProjectError):
+     """Failed to initialize project."""
+     pass
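
The hierarchy above nests domain errors under a single MCPVectorSearchError root, and every error carries an optional context dict for structured detail. A minimal usage sketch, assuming the wheel is installed under the module layout listed above; the file name and chunk count are made-up values for illustration:

from mcp_vector_search.core.exceptions import (
    DatabaseError,
    DocumentAdditionError,
)

try:
    # Hypothetical failure while writing chunks to the vector store.
    raise DocumentAdditionError(
        "failed to add 3 chunks",
        context={"file": "src/app.py", "chunk_count": 3},
    )
except DatabaseError as e:
    # DocumentAdditionError is a DatabaseError, which is an
    # MCPVectorSearchError, so callers can catch at any granularity.
    print(f"database failure: {e} (context={e.context})")
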
mcp_vector_search/core/indexer.py
@@ -0,0 +1,310 @@
+ """Semantic indexer for MCP Vector Search."""
+
+ import asyncio
+ from pathlib import Path
+ from typing import List, Optional, Set
+
+ from loguru import logger
+
+ from ..config.defaults import DEFAULT_IGNORE_PATTERNS
+ from ..parsers.registry import get_parser_registry
+ from .database import VectorDatabase
+ from .exceptions import ParsingError
+ from .models import CodeChunk
+
+
+ class SemanticIndexer:
+     """Semantic indexer for parsing and indexing code files."""
+
+     def __init__(
+         self,
+         database: VectorDatabase,
+         project_root: Path,
+         file_extensions: List[str],
+     ) -> None:
+         """Initialize semantic indexer.
+
+         Args:
+             database: Vector database instance
+             project_root: Project root directory
+             file_extensions: File extensions to index
+         """
+         self.database = database
+         self.project_root = project_root
+         self.file_extensions = set(ext.lower() for ext in file_extensions)
+         self.parser_registry = get_parser_registry()
+         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+
+     async def index_project(
+         self,
+         force_reindex: bool = False,
+         show_progress: bool = True,
+     ) -> int:
+         """Index all files in the project.
+
+         Args:
+             force_reindex: Whether to reindex existing files
+             show_progress: Whether to show progress information
+
+         Returns:
+             Number of files indexed
+         """
+         logger.info(f"Starting indexing of project: {self.project_root}")
+
+         # Find all indexable files
+         files_to_index = self._find_indexable_files()
+
+         if not files_to_index:
+             logger.warning("No indexable files found")
+             return 0
+
+         logger.info(f"Found {len(files_to_index)} files to index")
+
+         # Index files
+         indexed_count = 0
+         failed_count = 0
+
+         for i, file_path in enumerate(files_to_index):
+             if show_progress and (i + 1) % 10 == 0:
+                 logger.info(f"Indexing progress: {i + 1}/{len(files_to_index)}")
+
+             try:
+                 success = await self.index_file(file_path, force_reindex)
+                 if success:
+                     indexed_count += 1
+                 else:
+                     failed_count += 1
+             except Exception as e:
+                 logger.error(f"Failed to index {file_path}: {e}")
+                 failed_count += 1
+
+         logger.info(
+             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
+         )
+
+         return indexed_count
+
+     async def index_file(
+         self,
+         file_path: Path,
+         force_reindex: bool = False,
+     ) -> bool:
+         """Index a single file.
+
+         Args:
+             file_path: Path to the file to index
+             force_reindex: Whether to reindex if already indexed
+
+         Returns:
+             True if file was successfully indexed
+         """
+         try:
+             # Check if file should be indexed
+             if not self._should_index_file(file_path):
+                 return False
+
+             # Remove existing chunks for this file if reindexing
+             if force_reindex:
+                 await self.database.delete_by_file(file_path)
+
+             # Parse file into chunks
+             chunks = await self._parse_file(file_path)
+
+             if not chunks:
+                 logger.debug(f"No chunks extracted from {file_path}")
+                 return True  # Not an error, just empty file
+
+             # Add chunks to database
+             await self.database.add_chunks(chunks)
+
+             logger.debug(f"Indexed {len(chunks)} chunks from {file_path}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to index file {file_path}: {e}")
+             raise ParsingError(f"Failed to index file {file_path}: {e}") from e
+
+     async def reindex_file(self, file_path: Path) -> bool:
+         """Reindex a single file (removes existing chunks first).
+
+         Args:
+             file_path: Path to the file to reindex
+
+         Returns:
+             True if file was successfully reindexed
+         """
+         return await self.index_file(file_path, force_reindex=True)
+
+     async def remove_file(self, file_path: Path) -> int:
+         """Remove all chunks for a file from the index.
+
+         Args:
+             file_path: Path to the file to remove
+
+         Returns:
+             Number of chunks removed
+         """
+         try:
+             count = await self.database.delete_by_file(file_path)
+             logger.debug(f"Removed {count} chunks for {file_path}")
+             return count
+         except Exception as e:
+             logger.error(f"Failed to remove file {file_path}: {e}")
+             return 0
+
+     def _find_indexable_files(self) -> List[Path]:
+         """Find all files that should be indexed.
+
+         Returns:
+             List of file paths to index
+         """
+         indexable_files = []
+
+         for file_path in self.project_root.rglob("*"):
+             if self._should_index_file(file_path):
+                 indexable_files.append(file_path)
+
+         return sorted(indexable_files)
+
+     def _should_index_file(self, file_path: Path) -> bool:
+         """Check if a file should be indexed.
+
+         Args:
+             file_path: Path to check
+
+         Returns:
+             True if file should be indexed
+         """
+         # Must be a file
+         if not file_path.is_file():
+             return False
+
+         # Check file extension
+         if file_path.suffix.lower() not in self.file_extensions:
+             return False
+
+         # Check if path should be ignored
+         if self._should_ignore_path(file_path):
+             return False
+
+         # Check file size (skip very large files)
+         try:
+             file_size = file_path.stat().st_size
+             if file_size > 10 * 1024 * 1024:  # 10MB limit
+                 logger.warning(f"Skipping large file: {file_path} ({file_size} bytes)")
+                 return False
+         except OSError:
+             return False
+
+         return True
+
+     def _should_ignore_path(self, file_path: Path) -> bool:
+         """Check if a path should be ignored.
+
+         Args:
+             file_path: Path to check
+
+         Returns:
+             True if path should be ignored
+         """
+         try:
+             # Get relative path from project root
+             relative_path = file_path.relative_to(self.project_root)
+
+             # Check each part of the path
+             for part in relative_path.parts:
+                 if part in self._ignore_patterns:
+                     return True
+
+             # Check if any parent directory should be ignored
+             for parent in relative_path.parents:
+                 for part in parent.parts:
+                     if part in self._ignore_patterns:
+                         return True
+
+             return False
+
+         except ValueError:
+             # Path is not relative to project root
+             return True
+
+     async def _parse_file(self, file_path: Path) -> List[CodeChunk]:
+         """Parse a file into code chunks.
+
+         Args:
+             file_path: Path to the file to parse
+
+         Returns:
+             List of code chunks
+         """
+         try:
+             # Get appropriate parser
+             parser = self.parser_registry.get_parser_for_file(file_path)
+
+             # Parse file
+             chunks = await parser.parse_file(file_path)
+
+             # Filter out empty chunks
+             valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
+
+             return valid_chunks
+
+         except Exception as e:
+             logger.error(f"Failed to parse file {file_path}: {e}")
+             raise ParsingError(f"Failed to parse file {file_path}: {e}") from e
+
+     def add_ignore_pattern(self, pattern: str) -> None:
+         """Add a pattern to ignore during indexing.
+
+         Args:
+             pattern: Pattern to ignore (directory or file name)
+         """
+         self._ignore_patterns.add(pattern)
+
+     def remove_ignore_pattern(self, pattern: str) -> None:
+         """Remove an ignore pattern.
+
+         Args:
+             pattern: Pattern to remove
+         """
+         self._ignore_patterns.discard(pattern)
+
+     def get_ignore_patterns(self) -> Set[str]:
+         """Get current ignore patterns.
+
+         Returns:
+             Set of ignore patterns
+         """
+         return self._ignore_patterns.copy()
+
+     async def get_indexing_stats(self) -> dict:
+         """Get statistics about the indexing process.
+
+         Returns:
+             Dictionary with indexing statistics
+         """
+         try:
+             # Get database stats
+             db_stats = await self.database.get_stats()
+
+             # Count indexable files
+             indexable_files = self._find_indexable_files()
+
+             return {
+                 "total_indexable_files": len(indexable_files),
+                 "indexed_files": db_stats.total_files,
+                 "total_chunks": db_stats.total_chunks,
+                 "languages": db_stats.languages,
+                 "file_extensions": list(self.file_extensions),
+                 "ignore_patterns": list(self._ignore_patterns),
+                 "parser_info": self.parser_registry.get_parser_info(),
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to get indexing stats: {e}")
+             return {
+                 "error": str(e),
+                 "total_indexable_files": 0,
+                 "indexed_files": 0,
+                 "total_chunks": 0,
+             }
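
Driving the indexer requires a VectorDatabase from core/database.py, which this page lists but does not reproduce. A hedged sketch using a hypothetical in-memory stand-in that implements only the database methods the indexer awaits here (add_chunks, plus delete_by_file for reindexing):

import asyncio
from pathlib import Path
from typing import List

from mcp_vector_search.core.indexer import SemanticIndexer
from mcp_vector_search.core.models import CodeChunk


class InMemoryDatabase:
    """Hypothetical stand-in for VectorDatabase; not part of the package."""

    def __init__(self) -> None:
        self.chunks: List[CodeChunk] = []

    async def add_chunks(self, chunks: List[CodeChunk]) -> None:
        self.chunks.extend(chunks)

    async def delete_by_file(self, file_path: Path) -> int:
        before = len(self.chunks)
        self.chunks = [c for c in self.chunks if c.file_path != file_path]
        return before - len(self.chunks)


async def main() -> None:
    indexer = SemanticIndexer(
        database=InMemoryDatabase(),  # duck-typed; the real VectorDatabase goes here
        project_root=Path("."),
        file_extensions=[".py"],
    )
    indexer.add_ignore_pattern("tests")  # skip any path segment named "tests"
    indexed = await indexer.index_project()
    print(f"indexed {indexed} files")


asyncio.run(main())

Note that index_project parses files through the package's real parser registry, so the sketch assumes the wheel and its parsers are installed.
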
mcp_vector_search/core/models.py
@@ -0,0 +1,174 @@
+ """Data models for MCP Vector Search."""
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from pydantic import BaseModel, Field
+
+
+ @dataclass
+ class CodeChunk:
+     """Represents a chunk of code with metadata."""
+
+     content: str
+     file_path: Path
+     start_line: int
+     end_line: int
+     language: str
+     chunk_type: str = "code"  # code, function, class, comment, docstring
+     function_name: Optional[str] = None
+     class_name: Optional[str] = None
+     docstring: Optional[str] = None
+     imports: Optional[List[str]] = None
+     complexity_score: float = 0.0
+
+     def __post_init__(self) -> None:
+         """Initialize default values."""
+         if self.imports is None:
+             self.imports = []
+
+     @property
+     def id(self) -> str:
+         """Generate unique ID for this chunk."""
+         return f"{self.file_path}:{self.start_line}:{self.end_line}"
+
+     @property
+     def line_count(self) -> int:
+         """Get the number of lines in this chunk."""
+         return self.end_line - self.start_line + 1
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for storage."""
+         return {
+             "content": self.content,
+             "file_path": str(self.file_path),
+             "start_line": self.start_line,
+             "end_line": self.end_line,
+             "language": self.language,
+             "chunk_type": self.chunk_type,
+             "function_name": self.function_name,
+             "class_name": self.class_name,
+             "docstring": self.docstring,
+             "imports": self.imports,
+             "complexity_score": self.complexity_score,
+         }
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "CodeChunk":
+         """Create from dictionary."""
+         return cls(
+             content=data["content"],
+             file_path=Path(data["file_path"]),
+             start_line=data["start_line"],
+             end_line=data["end_line"],
+             language=data["language"],
+             chunk_type=data.get("chunk_type", "code"),
+             function_name=data.get("function_name"),
+             class_name=data.get("class_name"),
+             docstring=data.get("docstring"),
+             imports=data.get("imports", []),
+             complexity_score=data.get("complexity_score", 0.0),
+         )
+
+
+ class SearchResult(BaseModel):
+     """Represents a search result with metadata."""
+
+     content: str = Field(..., description="The matched code content")
+     file_path: Path = Field(..., description="Path to the source file")
+     start_line: int = Field(..., description="Starting line number")
+     end_line: int = Field(..., description="Ending line number")
+     language: str = Field(..., description="Programming language")
+     similarity_score: float = Field(..., description="Similarity score (0.0 to 1.0)")
+     rank: int = Field(..., description="Result rank in search results")
+     chunk_type: str = Field(default="code", description="Type of code chunk")
+     function_name: Optional[str] = Field(default=None, description="Function name if applicable")
+     class_name: Optional[str] = Field(default=None, description="Class name if applicable")
+     context_before: List[str] = Field(default_factory=list, description="Lines before the match")
+     context_after: List[str] = Field(default_factory=list, description="Lines after the match")
+     highlights: List[str] = Field(default_factory=list, description="Highlighted terms")
+
+     class Config:
+         arbitrary_types_allowed = True
+
+     @property
+     def line_count(self) -> int:
+         """Get the number of lines in this result."""
+         return self.end_line - self.start_line + 1
+
+     @property
+     def location(self) -> str:
+         """Get a human-readable location string."""
+         return f"{self.file_path}:{self.start_line}-{self.end_line}"
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "content": self.content,
+             "file_path": str(self.file_path),
+             "start_line": self.start_line,
+             "end_line": self.end_line,
+             "language": self.language,
+             "similarity_score": self.similarity_score,
+             "rank": self.rank,
+             "chunk_type": self.chunk_type,
+             "function_name": self.function_name,
+             "class_name": self.class_name,
+             "context_before": self.context_before,
+             "context_after": self.context_after,
+             "highlights": self.highlights,
+             "location": self.location,
+             "line_count": self.line_count,
+         }
+
+
+ class IndexStats(BaseModel):
+     """Statistics about the search index."""
+
+     total_files: int = Field(..., description="Total number of indexed files")
+     total_chunks: int = Field(..., description="Total number of code chunks")
+     languages: Dict[str, int] = Field(..., description="Language distribution")
+     file_types: Dict[str, int] = Field(..., description="File type distribution")
+     index_size_mb: float = Field(..., description="Index size in megabytes")
+     last_updated: str = Field(..., description="Last update timestamp")
+     embedding_model: str = Field(..., description="Embedding model used")
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "total_files": self.total_files,
+             "total_chunks": self.total_chunks,
+             "languages": self.languages,
+             "file_types": self.file_types,
+             "index_size_mb": self.index_size_mb,
+             "last_updated": self.last_updated,
+             "embedding_model": self.embedding_model,
+         }
+
+
+ class ProjectInfo(BaseModel):
+     """Information about a project."""
+
+     name: str = Field(..., description="Project name")
+     root_path: Path = Field(..., description="Project root directory")
+     config_path: Path = Field(..., description="Configuration file path")
+     index_path: Path = Field(..., description="Index directory path")
+     is_initialized: bool = Field(..., description="Whether project is initialized")
+     languages: List[str] = Field(default_factory=list, description="Detected languages")
+     file_count: int = Field(default=0, description="Number of indexable files")
+
+     class Config:
+         arbitrary_types_allowed = True
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "name": self.name,
+             "root_path": str(self.root_path),
+             "config_path": str(self.config_path),
+             "index_path": str(self.index_path),
+             "is_initialized": self.is_initialized,
+             "languages": self.languages,
+             "file_count": self.file_count,
+         }
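
CodeChunk is the storage-facing model: its id is derived from the file path and line span, and to_dict/from_dict round-trip it for persistence. A small self-contained sketch (all values illustrative):

from pathlib import Path

from mcp_vector_search.core.models import CodeChunk

chunk = CodeChunk(
    content="def greet():\n    return 'hi'\n",
    file_path=Path("src/app.py"),
    start_line=10,
    end_line=11,
    language="python",
    chunk_type="function",
    function_name="greet",
)

print(chunk.id)               # "src/app.py:10:11" on POSIX (path:start:end)
assert chunk.line_count == 2  # end_line - start_line + 1
assert chunk.imports == []    # __post_init__ replaces the None default

# Round trip through the storage format.
restored = CodeChunk.from_dict(chunk.to_dict())
assert restored.id == chunk.id
assert restored.to_dict() == chunk.to_dict()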